import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV, ParameterGrid, \
RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor,BaggingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_convergence, plot_histogram, plot_objective
from IPython import display
import itertools as it
from sklearn.preprocessing import StandardScaler
#Libraries for visualizing trees
from sklearn.tree import export_graphviz, export_text
from six import StringIO
from IPython.display import Image
import pydotplus
import time as time
import warnings
7 Bagging
Read section 8.2.1 of the book before using these notes.
Note that in this course, the lecture notes are not sufficient; you must read the book for a better understanding. The lecture notes only implement the concepts of the book on a dataset; they do not explain the concepts elaborately.
#Using the same datasets as in linear regression in STAT303-2,
#so that we can compare the non-linear models with linear regression
trainf = pd.read_csv('./Datasets/Car_features_train.csv')
trainp = pd.read_csv('./Datasets/Car_prices_train.csv')
testf = pd.read_csv('./Datasets/Car_features_test.csv')
testp = pd.read_csv('./Datasets/Car_prices_test.csv')
train = pd.merge(trainf,trainp)
test = pd.merge(testf,testp)
train.head()
| | carID | brand | model | year | transmission | mileage | fuelType | tax | mpg | engineSize | price |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18473 | bmw | 6 Series | 2020 | Semi-Auto | 11 | Diesel | 145 | 53.3282 | 3.0 | 37980 |
| 1 | 15064 | bmw | 6 Series | 2019 | Semi-Auto | 10813 | Diesel | 145 | 53.0430 | 3.0 | 33980 |
| 2 | 18268 | bmw | 6 Series | 2020 | Semi-Auto | 6 | Diesel | 145 | 53.4379 | 3.0 | 36850 |
| 3 | 18480 | bmw | 6 Series | 2017 | Semi-Auto | 18895 | Diesel | 145 | 51.5140 | 3.0 | 25998 |
| 4 | 18492 | bmw | 6 Series | 2015 | Automatic | 62953 | Diesel | 160 | 51.4903 | 3.0 | 18990 |
X = train[['mileage','mpg','year','engineSize']]
Xtest = test[['mileage','mpg','year','engineSize']]
y = train['price']
ytest = test['price']
7.1 Bagging regression trees
Bag regression trees to develop a model to predict car price using the predictors mileage, mpg, year, and engineSize.
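Before calling the scikit-learn implementation, it may help to see what bagging actually does. The following minimal sketch (ours, not from the book) manually draws bootstrap samples, fits a tree on each, and averages the predictions; BaggingRegressor automates exactly this loop (along with parallelization and out-of-bag bookkeeping).
#Minimal sketch of bagging by hand: bootstrap, fit a tree on each resample, average
rng = np.random.default_rng(1)
preds = []
for b in range(10):
    idx = rng.integers(0, X.shape[0], X.shape[0]) #Sample n rows with replacement
    tree = DecisionTreeRegressor().fit(X.iloc[idx], y.iloc[idx])
    preds.append(tree.predict(Xtest))
manual_pred = np.mean(preds, axis=0) #The bagged prediction is the average over trees
np.sqrt(mean_squared_error(ytest, manual_pred))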
#Bagging the results of 10 decision trees to predict car price
model = BaggingRegressor(estimator=DecisionTreeRegressor(), n_estimators=10, random_state=1,
                         n_jobs=-1).fit(X, y)
np.sqrt(mean_squared_error(test.price, model.predict(Xtest)))
5752.0779571060875
Averaging the predictions of 10 trees has reduced the RMSE considerably: the RMSE for a single tree model with optimized parameters was around 7000.
7.1.1 Model accuracy vs number of trees
How does the model accuracy vary with the number of trees?
As we increase the number of trees, the variance of the averaged prediction tends to decrease, leading to more accurate predictions.
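We can get a rough feel for this variance reduction from the 10-tree model fitted above: its estimators_ attribute holds the individual fitted trees, so for a single test observation we can compare the spread of the individual-tree predictions with their running average over more and more trees. (This is an illustrative sketch, not part of the original analysis.)
#Sketch: individual-tree predictions vs their running average for one test car
single_preds = np.array([tree.predict(Xtest.values[:1])[0] for tree in model.estimators_])
print("Std of the 10 individual tree predictions:", single_preds.std())
#The running average fluctuates less and less as more trees are included
running_avg = np.cumsum(single_preds)/np.arange(1, len(single_preds)+1)
print("Prediction averaged over the first 1,...,10 trees:", np.round(running_avg))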
#Finding model accuracy vs number of trees
"ignore")
warnings.filterwarnings(={};test_rsquared={};oob_rmse={};test_rmse = {}
oob_rsquaredfor i in np.linspace(10,400,40,dtype=int):
= BaggingRegressor(estimator=DecisionTreeRegressor(), n_estimators=i, random_state=1,
model =-1,oob_score=True).fit(X, y)
n_jobs=model.oob_score_ #Returns the out-of_bag R-squared of the model
oob_rsquared[i]=model.score(Xtest,ytest) #Returns the test R-squared of the model
test_rsquared[i]=np.sqrt(mean_squared_error(model.oob_prediction_,y))
oob_rmse[i]=np.sqrt(mean_squared_error(model.predict(Xtest),ytest))
test_rmse[i]
warnings.resetwarnings()
# The hidden warning is: "Some inputs do not have OOB scores. This probably means too few
# estimators were used to compute any reliable oob estimates." This warning appears
# when the number of estimators is small. In that case, some observations may be used
# by all the estimators, so their OOB score can't be computed.
As we are bagging only 10 trees in the first iteration, some of the observations are selected in every bootstrapped sample, and thus don't have an out-of-bag error, which produces the warning. For every observation to have an out-of-bag error, the number of trees must be sufficiently large.
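A quick back-of-the-envelope calculation (a sketch, not from the book) shows why: a given observation is left out of one bootstrap sample with probability (1 - 1/n)^n, which is roughly e^(-1) ≈ 0.368 for large n. The chance that it appears in all B samples, and hence has no out-of-bag prediction, is therefore about 0.632^B, which is non-negligible for B = 10 trees.
#Probability that an observation has no OOB prediction, as a function of the number of trees
n = X.shape[0]
p_in_bag = 1 - (1 - 1/n)**n #P(observation appears in a given bootstrap sample), ~0.632
for B in [10, 50, 150]:
    print(B, "trees:", p_in_bag**B)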
Let us visualize the out-of-bag (OOB) R-squared and R-squared on test data vs the number of trees.
plt.rcParams.update({'font.size': 15})
plt.figure(figsize=(8, 6), dpi=80)
plt.plot(oob_rsquared.keys(),oob_rsquared.values(),label = 'Out of bag R-squared')
plt.plot(oob_rsquared.keys(),oob_rsquared.values(),'o',color = 'blue')
plt.plot(test_rsquared.keys(),test_rsquared.values(), label = 'Test data R-squared')
plt.xlabel('Number of trees')
plt.ylabel('Rsquared')
plt.legend();
The out-of-bag R-squared initially increases, and then stabilizes after a certain number of trees (around 150 in this case). Note that increasing the number of trees further will not lead to overfitting; however, it will increase the computation time. Thus, we don't need to grow more trees once the R-squared stabilizes.
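To get a sense of the computational cost, we can time the fits for a few ensemble sizes (a rough sketch; the exact timings will depend on the machine):
#Rough timing of the fit for an increasing number of trees
for n_trees in [50, 150, 500]:
    start = time.time()
    BaggingRegressor(estimator=DecisionTreeRegressor(), n_estimators=n_trees,
                     random_state=1, n_jobs=-1).fit(X, y)
    print(n_trees, "trees: %.1f seconds" % (time.time() - start))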
#Visualizing out-of-bag RMSE and test data RMSE
plt.rcParams.update({'font.size': 15})
plt.figure(figsize=(8, 6), dpi=80)
plt.plot(oob_rmse.keys(),oob_rmse.values(),label = 'Out of bag RMSE')
plt.plot(oob_rmse.keys(),oob_rmse.values(),'o',color = 'blue')
plt.plot(test_rmse.keys(),test_rmse.values(), label = 'Test data RMSE')
plt.xlabel('Number of trees')
plt.ylabel('RMSE')
plt.legend()
A similar trend can be seen by plotting the out-of-bag RMSE and the test RMSE. Note that, on a fixed dataset, RMSE decreases as R-squared increases (they are monotonically related), so we only need to visualize one of the two to find the optimal number of trees.
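More precisely, on a fixed dataset, RMSE = sd(y)*sqrt(1 - R-squared), where sd(y) is the population standard deviation of the response. A quick numerical check of this identity on the test data, using the model from the last iteration of the loop above:
#Check: test RMSE equals sd(ytest)*sqrt(1 - test R-squared)
r2 = model.score(Xtest, ytest)
print(ytest.std(ddof=0)*np.sqrt(1 - r2))
print(np.sqrt(mean_squared_error(ytest, model.predict(Xtest))))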
#Bagging with 150 trees
model = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=150, random_state=1,
                         oob_score=True,n_jobs=-1).fit(X, y)
#OOB R-squared
model.oob_score_
0.897561533100511
#RMSE on test data
pred = model.predict(Xtest)
np.sqrt(mean_squared_error(test.price, pred))
5673.756466489405
7.1.2 Optimizing bagging hyperparameters using grid search
More parameters of a bagged regression tree model can be optimized using the typical approach of k-fold cross validation over a grid of parameter values.
Note that we don't need to tune the number of trees in bagging: the higher the number of trees, the lower the expected MSE. So, we will tune all the other hyperparameters for a fixed number of trees. Once we have obtained the optimal hyperparameter values, we'll keep increasing the number of trees until the gains are negligible.
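One convenient way to grow the ensemble after tuning is scikit-learn's warm_start option, which keeps the already-fitted trees and fits only the additional ones. The sketch below is ours (note that oob_score cannot be combined with warm_start in BaggingRegressor, so we monitor the test RMSE instead):
#Sketch: growing the ensemble incrementally with warm_start
model = BaggingRegressor(estimator=DecisionTreeRegressor(), random_state=1,
                         n_jobs=-1, warm_start=True)
for n_trees in [100, 200, 400]:
    model.n_estimators = n_trees #Only the additional trees are fit
    model.fit(X, y)
    print(n_trees, "trees, test RMSE:", np.sqrt(mean_squared_error(ytest, model.predict(Xtest))))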
n_samples = train.shape[0]
n_features = train.shape[1]

params = {'base_estimator': [DecisionTreeRegressor(random_state = 1),LinearRegression()], #Comparing bagging with a linear regression model as well
          'n_estimators': [100],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}
cv = KFold(n_splits=5,shuffle=True,random_state=1)
bagging_regressor_grid = GridSearchCV(BaggingRegressor(random_state=1, n_jobs=-1),
                                      param_grid=params, cv=cv, n_jobs=-1, verbose=1)
bagging_regressor_grid.fit(X, y)
print('Train R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(X, y))
print('Test R^2 Score : %.3f'%bagging_regressor_grid.best_estimator_.score(Xtest, ytest))
print('Best R^2 Score Through Grid Search : %.3f'%bagging_regressor_grid.best_score_)
print('Best Parameters : ',bagging_regressor_grid.best_params_)
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Train R^2 Score : 0.986
Test R^2 Score : 0.882
Best R^2 Score Through Grid Search : 0.892
Best Parameters : {'base_estimator': DecisionTreeRegressor(random_state=1), 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100}
You may use the object bagging_regressor_grid to directly make predictions.
np.sqrt(mean_squared_error(test.price, bagging_regressor_grid.predict(Xtest)))
5708.308794847089
Note that once the model has been tuned and the optimal hyperparameters identified, we can keep increasing the number of trees until it ceases to benefit.
#Model with optimal hyperparameters and increased number of trees
model = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=500, random_state=1,
                         oob_score=True,n_jobs=-1,bootstrap_features=False,bootstrap=True,
                         max_features=1.0,max_samples=1.0).fit(X, y)
#RMSE on test data
np.sqrt(mean_squared_error(test.price, model.predict(Xtest)))
5624.685464926517
7.2 Bagging for classification
Bag classification tree models to predict if a person has diabetes.
train = pd.read_csv('./Datasets/diabetes_train.csv')
test = pd.read_csv('./Datasets/diabetes_test.csv')
X = train.drop(columns = 'Outcome')
Xtest = test.drop(columns = 'Outcome')
y = train['Outcome']
ytest = test['Outcome']
#Bagging the results of 150 decision trees to predict diabetes
model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=150, random_state=1,
                          n_jobs=-1).fit(X, y)
# Performance metrics computation for the optimum decision threshold probability
desired_threshold = 0.23

y_pred_prob = model.predict_proba(Xtest)[:,1]

# Classifying observations in the positive class (y = 1) if the predicted probability is greater
# than the desired decision threshold probability
y_pred = y_pred_prob > desired_threshold
y_pred = y_pred.astype(int)

#Computing the accuracy
print("Accuracy: ",accuracy_score(y_pred, ytest)*100)
#Computing the ROC-AUC
fpr, tpr, auc_thresholds = roc_curve(ytest, y_pred_prob)
print("ROC-AUC: ",auc(fpr, tpr))# AUC of ROC
#Computing the precision and recall
print("Precision: ", precision_score(ytest, y_pred))
print("Recall: ", recall_score(ytest, y_pred))
#Confusion matrix
cm = pd.DataFrame(confusion_matrix(ytest, y_pred),
                  columns=['Predicted 0', 'Predicted 1'], index = ['Actual 0', 'Actual 1'])
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g');
Accuracy: 76.62337662337663
ROC-AUC: 0.8766084963863917
Precision: 0.6404494382022472
Recall: 0.9344262295081968
As a result of bagging, we obtain a model (with a threshold probability cutoff of 0.23) that performs better on test data in terms of almost all the metrics - accuracy, precision (comparable performance), recall, and ROC-AUC - as compared to the single tree classification model (with a threshold probability cutoff of 0.23). Note that we have not yet tuned the model using GridSearchCV here; tuning is shown towards the end of this chapter.
7.2.1 Model accuracy vs number of trees
#Finding model accuracy vs number of trees
oob_accuracy={};test_accuracy={};oob_rmse={};test_rmse = {}
for i in np.linspace(10,400,40,dtype=int):
    model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=i, random_state=1,
                              n_jobs=-1,oob_score=True).fit(X, y)
    oob_accuracy[i]=model.oob_score_ #Returns the out-of-bag accuracy of the model
    test_accuracy[i]=model.score(Xtest,ytest) #Returns the test accuracy of the model
C:\Users\akl0407\Anaconda3\lib\site-packages\sklearn\ensemble\_bagging.py:640: UserWarning: Some inputs do not have OOB scores. This probably means too few estimators were used to compute any reliable oob estimates.
warn("Some inputs do not have OOB scores. "
C:\Users\akl0407\Anaconda3\lib\site-packages\sklearn\ensemble\_bagging.py:644: RuntimeWarning: invalid value encountered in true_divide
oob_decision_function = (predictions /
plt.rcParams.update({'font.size': 15})
plt.figure(figsize=(8, 6), dpi=80)
plt.plot(oob_accuracy.keys(),oob_accuracy.values(),label = 'Out of bag accuracy')
plt.plot(oob_accuracy.keys(),oob_accuracy.values(),'o',color = 'blue')
plt.plot(test_accuracy.keys(),test_accuracy.values(), label = 'Test data accuracy')
plt.xlabel('Number of trees')
plt.ylabel('Accuracy')
plt.legend()
#ROC curve on training data
ypred = model.predict_proba(X)[:, 1]
fpr, tpr, auc_thresholds = roc_curve(y, ypred)
print(auc(fpr, tpr))# AUC of ROC

def plot_roc_curve(fpr, tpr, label=None):
    plt.figure(figsize=(8,8))
    plt.title('ROC Curve')
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.005, 1, 0, 1.005])
    plt.xticks(np.arange(0,1, 0.05), rotation=90)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")

fpr, tpr, auc_thresholds = roc_curve(y, ypred)
plot_roc_curve(fpr, tpr)
1.0
Note that there is perfect separation on the train data, as ROC-AUC = 1. This shows that the model is probably overfitting. However, it also shows that, despite the reduced variance (as compared to a single tree), the bagged tree model is flexible enough to perfectly separate the classes on the train data.
#ROC curve on test data
ypred = model.predict_proba(Xtest)[:, 1]
fpr, tpr, auc_thresholds = roc_curve(ytest, ypred)
print("ROC-AUC = ",auc(fpr, tpr))# AUC of ROC

def plot_roc_curve(fpr, tpr, label=None):
    plt.figure(figsize=(8,8))
    plt.title('ROC Curve')
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.005, 1, 0, 1.005])
    plt.xticks(np.arange(0,1, 0.05), rotation=90)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")

fpr, tpr, auc_thresholds = roc_curve(ytest, ypred)
plot_roc_curve(fpr, tpr)
ROC-AUC = 0.8781949585757096
7.2.2 Optimizing bagging hyperparameters using grid search
More parameters of a bagged classification tree model can be optimized using the typical approach of k-fold cross validation over a grid of parameter values.
n_samples = train.shape[0]
n_features = train.shape[1]

params = {'base_estimator': [DecisionTreeClassifier(random_state = 1),LogisticRegression()], #Comparing bagging with a logistic regression model as well
          'n_estimators': [150,200,250],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}
cv = KFold(n_splits=5,shuffle=True,random_state=1)
bagging_classifier_grid = GridSearchCV(BaggingClassifier(random_state=1, n_jobs=-1),
                                       param_grid=params, cv=cv, n_jobs=-1, verbose=1,
                                       scoring = ['precision', 'recall'], refit='recall')
bagging_classifier_grid.fit(X, y)
print('Train accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X, y))
print('Test accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(Xtest, ytest))
print('Best recall Through Grid Search : %.3f'%bagging_classifier_grid.best_score_)
print('Best Parameters : ',bagging_classifier_grid.best_params_)
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Train accuracy : 1.000
Test accuracy : 0.786
Best recall Through Grid Search : 0.573
Best Parameters : {'base_estimator': DecisionTreeClassifier(random_state=1), 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 200}
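Since refit='recall', the best score reported above is the best cross-validated recall, and the best parameters maximize recall. To inspect both metrics for every candidate, we can look at cv_results_ (a sketch; with a list of scorers, scikit-learn stores the metrics under mean_test_<scorer> columns):
#Cross-validated precision and recall for each candidate in the grid
results = pd.DataFrame(bagging_classifier_grid.cv_results_)
results[['param_base_estimator', 'param_n_estimators',
         'mean_test_precision', 'mean_test_recall']].head()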
7.2.3 Tuning the decision threshold probability
We’ll find a decision threshold probability that balances recall with precision.
model = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1), n_estimators=200,
                          random_state=1,max_features=1.0, oob_score=True,
                          max_samples=1.0,n_jobs=-1,bootstrap=True,bootstrap_features=False).fit(X, y)
As the model is overfitting on the train data, it will not be a good idea to tune the decision threshold probability based on the precision-recall curve on train data, as shown in the figure below.
ypred = model.predict_proba(X)[:,1]
p, r, thresholds = precision_recall_curve(y, ypred)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.figure(figsize=(8, 8))
    plt.title("Precision and Recall Scores as a function of the decision threshold")
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
    plt.plot(thresholds, recalls[:-1], "o", color = 'green')
    plt.ylabel("Score")
    plt.xlabel("Decision Threshold")
    plt.legend(loc='best')

plot_precision_recall_vs_threshold(p, r, thresholds)
Instead, we should make the precision-recall curve using the out-of-bag predictions, as shown below. The attribute oob_decision_function_ provides the out-of-bag predicted class probabilities.
ypred = model.oob_decision_function_[:,1]
p, r, thresholds = precision_recall_curve(y, ypred)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.figure(figsize=(8, 8))
    plt.title("Precision and Recall Scores as a function of the decision threshold")
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.plot(thresholds, precisions[:-1], "o", color = 'blue')
    plt.plot(thresholds, recalls[:-1], "o", color = 'green')
    plt.ylabel("Score")
    plt.xlabel("Decision Threshold")
    plt.legend(loc='best')

plot_precision_recall_vs_threshold(p, r, thresholds)
# Thresholds with precision and recall
all_thresholds = np.concatenate([thresholds.reshape(-1,1), p[:-1].reshape(-1,1), r[:-1].reshape(-1,1)], axis = 1)
recall_more_than_80 = all_thresholds[all_thresholds[:,2]>0.8,:]
# As the values in 'recall_more_than_80' are arranged in decreasing order of recall and increasing threshold,
# the last value will provide the maximum threshold probability for the recall to be more than 80%
# We wish to find the maximum threshold probability to obtain the maximum possible precision
recall_more_than_80[recall_more_than_80.shape[0]-1]
array([0.2804878 , 0.53205128, 0.80193237])
Suppose we wish to have at least 80% recall with the highest possible precision. Then, based on the precision-recall curve, we should use a decision threshold probability of 0.28.
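As an aside, if the model were fit without bootstrapping (so that out-of-bag predictions are unavailable), out-of-fold probabilities from cross_val_predict could be used for the same purpose. A sketch (cross_val_predict is imported here since it is not among the imports above):
#Alternative sketch: out-of-fold probabilities instead of OOB probabilities
from sklearn.model_selection import cross_val_predict
oof_prob = cross_val_predict(model, X, y, cv=5, method='predict_proba', n_jobs=-1)[:,1]
p_oof, r_oof, th_oof = precision_recall_curve(y, oof_prob)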
# Performance metrics computation for the optimum decision threshold probability
desired_threshold = 0.28

y_pred_prob = model.predict_proba(Xtest)[:,1]

# Classifying observations in the positive class (y = 1) if the predicted probability is greater
# than the desired decision threshold probability
y_pred = y_pred_prob > desired_threshold
y_pred = y_pred.astype(int)

#Computing the accuracy
print("Accuracy: ",accuracy_score(y_pred, ytest)*100)
#Computing the ROC-AUC
fpr, tpr, auc_thresholds = roc_curve(ytest, y_pred_prob)
print("ROC-AUC: ",auc(fpr, tpr))# AUC of ROC
#Computing the precision and recall
print("Precision: ", precision_score(ytest, y_pred))
print("Recall: ", recall_score(ytest, y_pred))
#Confusion matrix
cm = pd.DataFrame(confusion_matrix(ytest, y_pred),
                  columns=['Predicted 0', 'Predicted 1'], index = ['Actual 0', 'Actual 1'])
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g');
Accuracy: 79.22077922077922
ROC-AUC: 0.8802221047065044
Precision: 0.6705882352941176
Recall: 0.9344262295081968
Note that this model performs better than both the untuned bagged model above and the single tree classification model, as expected.