import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split, KFold, RepeatedKFold, \
    GridSearchCV, ParameterGrid, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real
from IPython import display
#Libraries for visualizing trees
from sklearn.tree import export_graphviz, export_text
from six import StringIO
from IPython.display import Image
import pydotplus
import time as tm
5 Regression trees
Read section 8.1.1 of the book before using these notes.
Note that in this course, the lecture notes alone are not sufficient; you must read the book for a thorough understanding. The lecture notes only implement the concepts of the book on a dataset; they do not explain the concepts in detail.
#Using the same datasets as used for linear regression in STAT303-2,
#so that we can compare the non-linear models with linear regression
trainf = pd.read_csv('./Datasets/Car_features_train.csv')
trainp = pd.read_csv('./Datasets/Car_prices_train.csv')
testf = pd.read_csv('./Datasets/Car_features_test.csv')
testp = pd.read_csv('./Datasets/Car_prices_test.csv')
train = pd.merge(trainf, trainp)
test = pd.merge(testf, testp)
train.head()
| | carID | brand | model | year | transmission | mileage | fuelType | tax | mpg | engineSize | price |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18473 | bmw | 6 Series | 2020 | Semi-Auto | 11 | Diesel | 145 | 53.3282 | 3.0 | 37980 |
| 1 | 15064 | bmw | 6 Series | 2019 | Semi-Auto | 10813 | Diesel | 145 | 53.0430 | 3.0 | 33980 |
| 2 | 18268 | bmw | 6 Series | 2020 | Semi-Auto | 6 | Diesel | 145 | 53.4379 | 3.0 | 36850 |
| 3 | 18480 | bmw | 6 Series | 2017 | Semi-Auto | 18895 | Diesel | 145 | 51.5140 | 3.0 | 25998 |
| 4 | 18492 | bmw | 6 Series | 2015 | Automatic | 62953 | Diesel | 160 | 51.4903 | 3.0 | 18990 |
5.1 Building a regression tree
Develop a regression tree to predict car price based on mileage
X = train['mileage']
y = train['price']
#Defining the object to build a regression tree
model = DecisionTreeRegressor(random_state=1, max_depth=3)

#Fitting the regression tree to the data
model.fit(X.values.reshape(-1,1), y)
DecisionTreeRegressor(max_depth=3, random_state=1)
#Visualizing the regression tree
dot_data = StringIO()
export_graphviz(model, out_file=dot_data,
                filled=True, rounded=True,
                feature_names=['mileage'], precision=0)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('car_price_tree.png')
Image(graph.create_png())
#prediction on test data
pred = model.predict(test[['mileage']].values)
#RMSE on test data
np.sqrt(mean_squared_error(test.price, pred))
13764.798425410803
#Visualizing the model fit
Xtest = np.linspace(min(X), max(X), 100)
pred_test = model.predict(Xtest.reshape(-1,1))
sns.scatterplot(x = 'mileage', y = 'price', data = train, color = 'orange')
sns.lineplot(x = Xtest, y = pred_test, color = 'blue');
All cars falling within the same terminal node have the same predicted price, which appears as the flat line segments in the model curve above.
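Since this tree was grown with max_depth = 3, it has at most \(2^3 = 8\) leaves, so the fitted curve can take at most 8 distinct values. A quick check, using the objects defined above:

#Number of terminal nodes, and the distinct predicted prices over the mileage grid
print(model.get_n_leaves())
print(np.unique(pred_test))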
Develop a regression tree to predict car price based on mileage, mpg, engineSize, and year.
X = train[['mileage','mpg','year','engineSize']]
model = DecisionTreeRegressor(random_state=1, max_depth=3)
model.fit(X, y)

dot_data = StringIO()
export_graphviz(model, out_file=dot_data,
                filled=True, rounded=True,
                feature_names=['mileage','mpg','year','engineSize'], precision=0)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('car_price_tree.png')
Image(graph.create_png())
The model can also be visualized in text format, as shown below.
print(export_text(model))
|--- feature_3 <= 2.75
| |--- feature_2 <= 2018.50
| | |--- feature_3 <= 1.75
| | | |--- value: [9912.24]
| | |--- feature_3 > 1.75
| | | |--- value: [16599.03]
| |--- feature_2 > 2018.50
| | |--- feature_3 <= 1.90
| | | |--- value: [19363.81]
| | |--- feature_3 > 1.90
| | | |--- value: [31919.42]
|--- feature_3 > 2.75
| |--- feature_2 <= 2017.50
| | |--- feature_0 <= 53289.00
| | | |--- value: [31004.63]
| | |--- feature_0 > 53289.00
| | | |--- value: [15255.91]
| |--- feature_2 > 2017.50
| | |--- feature_1 <= 21.79
| | | |--- value: [122080.00]
| | |--- feature_1 > 21.79
| | | |--- value: [49350.79]
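The rules above refer to predictors by their position in X (feature_0 = mileage, feature_1 = mpg, feature_2 = year, feature_3 = engineSize). Passing the column names to export_text makes the rules easier to read, e.g.:

#Printing the rules with predictor names instead of feature_0, feature_1, ...
print(export_text(model, feature_names = ['mileage', 'mpg', 'year', 'engineSize']))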
5.2 Optimizing parameters to improve the regression tree
Let us find the optimal depth of the tree and the number of terminal nodes (leaves) by cross validation.
5.2.1 Range of hyperparameter values
First, we’ll find the maximum possible values of the tree depth and the number of leaves (by fitting an unrestricted tree), and then search for the optimal values within that range.
model = DecisionTreeRegressor(random_state=1)
model.fit(X, y)
print("Maximum tree depth =", model.get_depth())
print("Maximum leaves =", model.get_n_leaves())
Maximum tree depth = 29
Maximum leaves = 4845
5.2.2 Cross validation: Coarse grid
We’ll use the sklearn
function GridSearchCV
to find the optimal hyperparameter values over a grid of possible values. By default, GridSearchCV
returns the optimal hyperparameter values based on the coefficient of determination \(R^2\). However, the scoring
argument of the function can be used to find the optimal parameters based on several different criteria as mentioned in the scoring-parameter documentation.
#Finding cross-validation error for trees
parameters = {'max_depth':range(2,30,3), 'max_leaf_nodes':range(2,4900,100)}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state=1), parameters, n_jobs=-1, verbose=1, cv=cv)
model.fit(X, y)
print(model.best_score_, model.best_params_)
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
0.8433100904754441 {'max_depth': 11, 'max_leaf_nodes': 302}
Let us find the optimal hyperparameters based on root mean squared error (RMSE), instead of \(R^2\). Let us compute \(R^2\) as well during cross validation, as we can compute multiple performance metrics using the scoring
argument. However, when computing multiple performance metrics, we will need to specify the performance metric used to find the optimal hyperparameters with the refit
argument.
#Finding cross-validation error for trees
parameters = {'max_depth':range(2,30,3), 'max_leaf_nodes':range(2,4900,100)}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state=1), parameters, n_jobs=-1, verbose=1, cv=cv,
                     scoring = ['neg_root_mean_squared_error', 'r2'], refit = 'neg_root_mean_squared_error')
model.fit(X, y)
print(model.best_score_, model.best_params_)
Fitting 5 folds for each of 490 candidates, totalling 2450 fits
-6475.329183576911 {'max_depth': 11, 'max_leaf_nodes': 302}
Note that as the GridSearchCV function maximizes the performance metric to find the optimal hyperparameters, we maximize the negative root mean squared error (neg_root_mean_squared_error), and the function returns the optimal (i.e., largest) negative root mean squared error.
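In other words, negating best_score_ recovers the \(K\)-fold RMSE of the best model (about 6475 for the fit above):

#K-fold RMSE of the best model is the negative of the (maximized) score
print(-model.best_score_)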
Let us visualize the root mean squared error as a function of the hyperparameter values. We’ll use the cross-validation results stored in the cv_results_ attribute of the fitted GridSearchCV object.
#Detailed results of k-fold cross validation
cv_results = pd.DataFrame(model.cv_results_)
cv_results.head()
| | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_max_depth | param_max_leaf_nodes | params | split0_test_neg_root_mean_squared_error | split1_test_neg_root_mean_squared_error | split2_test_neg_root_mean_squared_error | ... | std_test_neg_root_mean_squared_error | rank_test_neg_root_mean_squared_error | split0_test_r2 | split1_test_r2 | split2_test_r2 | split3_test_r2 | split4_test_r2 | mean_test_r2 | std_test_r2 | rank_test_r2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.010178 | 7.531409e-04 | 0.003791 | 0.000415 | 2 | 2 | {'max_depth': 2, 'max_leaf_nodes': 2} | -13729.979521 | -13508.807583 | -12792.941600 | ... | 390.725290 | 481 | 0.314894 | 0.329197 | 0.394282 | 0.361007 | 0.382504 | 0.356377 | 0.030333 | 481 |
1 | 0.009574 | 1.758238e-03 | 0.003782 | 0.000396 | 2 | 102 | {'max_depth': 2, 'max_leaf_nodes': 102} | -10586.885662 | -11230.674720 | -10682.195189 | ... | 321.837965 | 433 | 0.592662 | 0.536369 | 0.577671 | 0.568705 | 0.612407 | 0.577563 | 0.025368 | 433 |
2 | 0.009774 | 7.458305e-04 | 0.003590 | 0.000488 | 2 | 202 | {'max_depth': 2, 'max_leaf_nodes': 202} | -10586.885662 | -11230.674720 | -10682.195189 | ... | 321.837965 | 433 | 0.592662 | 0.536369 | 0.577671 | 0.568705 | 0.612407 | 0.577563 | 0.025368 | 433 |
3 | 0.009568 | 4.953541e-04 | 0.003391 | 0.000489 | 2 | 302 | {'max_depth': 2, 'max_leaf_nodes': 302} | -10586.885662 | -11230.674720 | -10682.195189 | ... | 321.837965 | 433 | 0.592662 | 0.536369 | 0.577671 | 0.568705 | 0.612407 | 0.577563 | 0.025368 | 433 |
4 | 0.008976 | 6.843901e-07 | 0.003192 | 0.000399 | 2 | 402 | {'max_depth': 2, 'max_leaf_nodes': 402} | -10586.885662 | -11230.674720 | -10682.195189 | ... | 321.837965 | 433 | 0.592662 | 0.536369 | 0.577671 | 0.568705 | 0.612407 | 0.577563 | 0.025368 | 433 |
5 rows × 23 columns
fig, axes = plt.subplots(1, 2, figsize=(14,5))
plt.subplots_adjust(wspace=0.2)
axes[0].plot(cv_results.param_max_depth, (-cv_results.mean_test_neg_root_mean_squared_error), 'o')
axes[0].set_ylim([6200, 7500])
axes[0].set_xlabel('Depth')
axes[0].set_ylabel('K-fold RMSE')
axes[1].plot(cv_results.param_max_leaf_nodes, (-cv_results.mean_test_neg_root_mean_squared_error), 'o')
axes[1].set_ylim([6200, 7500])
axes[1].set_xlabel('Leaves')
axes[1].set_ylabel('K-fold RMSE');
We observe that we get the lowest \(K\)-fold RMSE for a depth of around 8-14 and fewer than about 1000 leaves. So, we should do a finer search in that region to obtain more precise hyperparameter values.
5.2.3 Cross validation: Finer grid
#Finding cross-validation error for trees
start_time = tm.time()
parameters = {'max_depth':range(8,15), 'max_leaf_nodes':range(2,1000)}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state=1), parameters, n_jobs=-1, verbose=1, cv=cv,
                     scoring = 'neg_root_mean_squared_error')
model.fit(X, y)
print(model.best_score_, model.best_params_)
print("Time taken =", round((tm.time() - start_time)/60), "minutes")
Fitting 5 folds for each of 6986 candidates, totalling 34930 fits
-6414.468922119372 {'max_depth': 10, 'max_leaf_nodes': 262}
Time taken = 2 minutes
From the above cross-validation, the optimal hyperparameter values are max_depth = 10 and max_leaf_nodes = 262. Note that the cross-validation RMSE with the finer grid is only slightly lower than with the coarse grid. However, depending on the dataset, a finer grid may provide more benefit.
#Developing the tree based on optimal hyperparameters found by cross-validation
model = DecisionTreeRegressor(random_state=1, max_depth=10, max_leaf_nodes=262)
model.fit(X, y)
DecisionTreeRegressor(max_depth=10, max_leaf_nodes=262, random_state=1)
#RMSE on test data
Xtest = test[['mileage','mpg','year','engineSize']]
np.sqrt(mean_squared_error(test.price, model.predict(Xtest)))
6921.0404660552895
The RMSE for the decision tree is lower than that of linear regression models with these four predictors. This is probably because car price has a highly non-linear association with the predictors.
Note that we may also use RandomizedSearchCV() or BayesSearchCV() to optimize the hyperparameters.
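For instance, a randomized search over the same ranges might look like the sketch below; the number of sampled candidates (n_iter = 100) and the name rs_model are arbitrary choices for illustration, not part of the original analysis.

#A sketch of randomized search over the same hyperparameter ranges;
#n_iter = 100 is an arbitrary choice for illustration
rs_model = RandomizedSearchCV(DecisionTreeRegressor(random_state = 1),
                              param_distributions = {'max_depth': list(range(2, 30)),
                                                     'max_leaf_nodes': list(range(2, 4900))},
                              n_iter = 100, cv = cv, n_jobs = -1, random_state = 1,
                              scoring = 'neg_root_mean_squared_error')
rs_model.fit(X, y)
print(rs_model.best_score_, rs_model.best_params_)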
Predictor importance: The importance of a predictor is computed as the (normalized) total reduction of the splitting criterion (SSE in the case of regression trees) brought about by that predictor.
Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values) Source: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor.feature_importances_
Why?
Because high-cardinality predictors tend to cause overfitting. When a predictor has high cardinality, the leaf nodes end up containing small groups of observations, and the model “learns” individual observations instead of the general trend. The higher the cardinality of a predictor, the more prone the model is to overfitting.
model.feature_importances_
array([0.04490344, 0.15882336, 0.29739951, 0.49887369])
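The importances are easier to interpret when paired with the predictor names, e.g.:

#Pairing the importances with the predictor names and sorting them
pd.Series(model.feature_importances_, index = X.columns).sort_values(ascending = False)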
Engine size is the most important predictor, followed by year, which is followed by mpg, and mileage is the least important predictor.
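As a quick, optional illustration of the cardinality warning above (not part of the original analysis): if we append a pure-noise column with an essentially unique value in every row, an unrestricted tree will typically still assign it a nonzero impurity-based importance, because the tree eventually uses that column to memorize individual observations.

#Hypothetical illustration: impurity-based importance of a pure-noise, high-cardinality column
rng = np.random.default_rng(1)
X_noise = X.copy()
X_noise['noise'] = rng.normal(size = len(X_noise))   #essentially a unique value per row
tree_noise = DecisionTreeRegressor(random_state = 1).fit(X_noise, y)
print(pd.Series(tree_noise.feature_importances_, index = X_noise.columns))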
5.3 Cost complexity pruning
While optimizing the hyperparameters above, we searched within ranges that we thought were reasonable. In doing so, we restricted ourselves to a subset of the possible subtrees of the unpruned tree. Thus, we could have missed the optimal tree (or the best model).
With cost complexity pruning, we first develop an unpruned tree without any restrictions. Then, using cross validation, we find the optimal value of the tuning parameter \(\alpha\). All the non-terminal nodes for which \(\alpha_{eff}\) is smaller than the optimal \(\alpha\) will be pruned. Check out the link below to understand this better.
Check out a detailed explanation of how cost complexity pruning is implemented in sklearn at: https://scikit-learn.org/stable/modules/tree.html#minimal-cost-complexity-pruning
Here are some informative visualizations that will help you understand what is happening in cost complexity pruning: https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
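For reference, the quantity minimized in cost complexity pruning (as defined in the sklearn documentation linked above) is the cost complexity of a subtree \(T\):

\[R_\alpha(T) = R(T) + \alpha|\widetilde{T}|,\]

where \(R(T)\) is the total impurity of the terminal nodes (the SSE for regression trees), \(|\widetilde{T}|\) is the number of terminal nodes, and \(\alpha \ge 0\) is the tuning parameter. The effective \(\alpha\) of an internal node \(t\) is \(\alpha_{eff}(t) = \big(R(t) - R(T_t)\big)/\big(|T_t| - 1\big)\), where \(T_t\) is the subtree rooted at \(t\) and \(|T_t|\) its number of terminal nodes; the nodes with the smallest effective \(\alpha\) are pruned first.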
model = DecisionTreeRegressor(random_state = 1)  #model without any restrictions
path = model.cost_complexity_pruning_path(X, y)  #Compute the pruning path during minimal cost complexity pruning
alphas = path['ccp_alphas']
len(alphas)
4126
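The pruning path also returns the total impurity of the leaves at each effective \(\alpha\) (the impurities key of path). A quick plot, similar to the sklearn example linked above, can serve as a sanity check (a sketch, not in the original notes):

#Total impurity of the leaves of the pruned tree vs effective alpha
plt.plot(path['ccp_alphas'], path['impurities'], marker = 'o', drawstyle = 'steps-post')
plt.xscale('log')
plt.xlabel('effective alpha')
plt.ylabel('total impurity of leaves');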
start_time = tm.time()
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
tree = GridSearchCV(DecisionTreeRegressor(random_state=1), param_grid = {'ccp_alpha':alphas},
                    scoring = 'neg_mean_squared_error', n_jobs=-1, verbose=1, cv=cv)
tree.fit(X, y)
print(tree.best_score_, tree.best_params_)
print("Time taken =", round((tm.time() - start_time)/60), "minutes")
Fitting 5 folds for each of 4126 candidates, totalling 20630 fits
-44150619.209031895 {'ccp_alpha': 143722.94076639024}
Time taken = 2 minutes
The code took 2 minutes to run on a dataset of about 5000 observations and 4 predictors.
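If the run time is a concern, one optional tweak (not used in these notes) is to cross validate over a subsample of the candidate alphas, since neighboring values of ccp_alpha typically produce very similar trees. A sketch:

#Optional speed-up (a sketch): search over every 10th candidate alpha
tree_coarse = GridSearchCV(DecisionTreeRegressor(random_state=1),
                           param_grid = {'ccp_alpha': alphas[::10]},
                           scoring = 'neg_mean_squared_error', n_jobs=-1, cv=cv)
tree_coarse.fit(X, y)
print(tree_coarse.best_score_, tree_coarse.best_params_)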
model = DecisionTreeRegressor(ccp_alpha=143722.94076639024, random_state=1)
model.fit(X, y)
pred = model.predict(Xtest)
np.sqrt(mean_squared_error(test.price, pred))
7306.592294294368
The RMSE for the decision tree with cost complexity pruning is lower than that of the linear regression and spline regression models (including MARS) with these four predictors. However, it is higher than the RMSE obtained by tuning the tree hyperparameters with grid search (shown previously). Cost complexity pruning starts from a completely unpruned tree, unlike the grid search over hyperparameters such as max_depth and max_leaf_nodes, and thus may seem to be the more comprehensive approach. However, each approach may consider trees that the other does not, so either one may produce the more accurate model. Depending on the grid of parameters chosen for cross validation, the grid search method may be more or less comprehensive than cost complexity pruning.
gridcv_results = pd.DataFrame(tree.cv_results_)
cv_error = -gridcv_results['mean_test_score']
#Visualizing the 5-fold cross validation error vs alpha
plt.plot(alphas, cv_error)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('K-fold MSE');
#Zooming in on the above visualization to see the alpha at which the 5-fold cross validation error is minimized
plt.plot(alphas[0:4093], cv_error[0:4093])
plt.xlabel('alpha')
plt.ylabel('K-fold MSE');
5.3.1 Depth vs alpha; Node counts vs alpha
stime = tm.time()   #using the 'time' module imported as 'tm' above
trees = []
for i in alphas:
    tree = DecisionTreeRegressor(ccp_alpha=i, random_state=1)
    tree.fit(X, train['price'])
    trees.append(tree)
print(tm.time() - stime)
268.10325384140015
This code takes about 4.5 minutes to run.
node_counts = [clf.tree_.node_count for clf in trees]
depth = [clf.tree_.max_depth for clf in trees]
fig, ax = plt.subplots(1, 2, figsize=(10,6))
ax[0].plot(alphas[0:4093], node_counts[0:4093], marker="o", drawstyle="steps-post")  #Plotting the zoomed-in plot (ignoring very high alphas), otherwise it is hard to see the trend
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(alphas[0:4093], depth[0:4093], marker="o", drawstyle="steps-post")  #Plotting the zoomed-in plot (ignoring very high alphas), otherwise it is hard to see the trend
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
#fig.tight_layout()
Text(0.5, 1.0, 'Depth vs alpha')
5.3.2 Train and test accuracies (R-squared) vs alpha
train_scores = [clf.score(X, y) for clf in trees]
test_scores = [clf.score(Xtest, test.price) for clf in trees]
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(alphas[0:4093], train_scores[0:4093], marker="o", label="train", drawstyle="steps-post")  #Plotting the zoomed-in plot (ignoring very high alphas), otherwise it is hard to see the trend
ax.plot(alphas[0:4093], test_scores[0:4093], marker="o", label="test", drawstyle="steps-post")  #Plotting the zoomed-in plot (ignoring very high alphas), otherwise it is hard to see the trend
ax.legend()
plt.show()
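If we wanted to read off the \(\alpha\) with the highest test R-squared from these arrays (for inspection only; hyperparameters should be tuned with cross validation, not with the test data), a couple of lines suffice:

#For inspection only: alpha with the highest test R-squared
best_idx = int(np.argmax(test_scores))
print(alphas[best_idx], test_scores[best_idx])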