import time as tm
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import uniform
from sklearn.metrics import accuracy_score, recall_score, mean_squared_error
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    cross_validate,
    GridSearchCV,
    RandomizedSearchCV,
    KFold,
    StratifiedKFold,
    RepeatedKFold,
    RepeatedStratifiedKFold,
)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.plots import plot_objective
from skopt.space import Real, Categorical, Integer
Appendix B — Parallel processing bonus Q
# Using the same datasets as used for linear regression in STAT303-2,
# so that we can compare the non-linear models with linear regression.
trainf = pd.read_csv('./Datasets/Car_features_train.csv')
trainp = pd.read_csv('./Datasets/Car_prices_train.csv')
testf = pd.read_csv('./Datasets/Car_features_test.csv')
testp = pd.read_csv('./Datasets/Car_prices_test.csv')

# Join each features table with its matching prices table. No `on=` is given,
# so pandas merges on the columns the two frames have in common.
train = pd.merge(trainf, trainp)
test = pd.merge(testf, testp)

# Preview of the merged training data (displayed when run as a notebook cell).
train.head()

# Columns used as model inputs; 'price' is the response.
predictors = ['mpg', 'engineSize', 'year', 'mileage']
X_train = train[predictors]
y_train = train['price']
X_test = test[predictors]
y_test = test['price']
# Scale: fit the scaler on the training predictors only, then apply the same
# fitted transformation to both the training and the test predictors.
sc = StandardScaler()
sc.fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)
Case 1: No parallelization
# Case 1 benchmark: n_jobs is not passed to cross_val_score() or to
# KNeighborsRegressor(), so both are left at their default setting.
# Each of the 50 repetitions times a full sweep over K = 1..19 with 5-fold CV.
time_taken_case1 = []
for i in range(50):
    start_time = tm.time()
    Ks = range(1, 20)
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    cross_val_error = []
    for k in Ks:
        # The scorer returns negated RMSE; flip the sign to store a positive error.
        cross_val_error.append(
            -cross_val_score(
                KNeighborsRegressor(n_neighbors=k),
                X_train_scaled, y_train, cv=kfold,
                scoring="neg_root_mean_squared_error",
            ).mean()
        )
    # Wall-clock seconds taken by this repetition.
    time_taken_case1.append(tm.time() - start_time)
Case 2: Parallelization in cross_val_score()
# Case 2 benchmark: n_jobs=-1 is passed to cross_val_score() only, so the
# parallelism is requested at the cross-validation level.
# Each of the 50 repetitions times a full sweep over K = 1..19 with 5-fold CV.
time_taken_case2 = []
for i in range(50):
    start_time = tm.time()
    Ks = range(1, 20)
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    cross_val_error = []
    for k in Ks:
        # The scorer returns negated RMSE; flip the sign to store a positive error.
        cross_val_error.append(
            -cross_val_score(
                KNeighborsRegressor(n_neighbors=k),
                X_train_scaled, y_train, cv=kfold, n_jobs=-1,
                scoring="neg_root_mean_squared_error",
            ).mean()
        )
    # Wall-clock seconds taken by this repetition.
    time_taken_case2.append(tm.time() - start_time)
Case 3: Parallelization in KNeighborsRegressor()
# Case 3 benchmark: n_jobs=-1 is passed to KNeighborsRegressor() only, so the
# parallelism is requested inside the estimator while cross_val_score() is
# left at its default setting.
# Each of the 50 repetitions times a full sweep over K = 1..19 with 5-fold CV.
time_taken_case3 = []
for i in range(50):
    start_time = tm.time()
    Ks = range(1, 20)
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    cross_val_error = []
    for k in Ks:
        # The scorer returns negated RMSE; flip the sign to store a positive error.
        cross_val_error.append(
            -cross_val_score(
                KNeighborsRegressor(n_neighbors=k, n_jobs=-1),
                X_train_scaled, y_train, cv=kfold,
                scoring="neg_root_mean_squared_error",
            ).mean()
        )
    # Wall-clock seconds taken by this repetition.
    time_taken_case3.append(tm.time() - start_time)
Case 4: Nested parallelization: Both cross_val_score() and KNeighborsRegressor()
# Case 4 benchmark (nested): n_jobs=-1 is passed to both cross_val_score()
# and KNeighborsRegressor().
# Each of the 50 repetitions times a full sweep over K = 1..19 with 5-fold CV.
time_taken_case4 = []
for i in range(50):
    start_time = tm.time()
    Ks = range(1, 20)
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    cross_val_error = []
    for k in Ks:
        # The scorer returns negated RMSE; flip the sign to store a positive error.
        cross_val_error.append(
            -cross_val_score(
                KNeighborsRegressor(n_neighbors=k, n_jobs=-1),
                X_train_scaled, y_train, cv=kfold, n_jobs=-1,
                scoring="neg_root_mean_squared_error",
            ).mean()
        )
    # Wall-clock seconds taken by this repetition.
    time_taken_case4.append(tm.time() - start_time)
# Compare the per-repetition timings of the four cases, one box per case.
sns.boxplot([time_taken_case1, time_taken_case2, time_taken_case3, time_taken_case4])
# Relabel the box positions 0..3 with the case names.
plt.xticks([0, 1, 2, 3], ['Case 1', 'Case 2', 'Case 3', 'Case 4']);
plt.ylabel('Time');
Q1
Case 1 uses no parallelization. Why does Case 3, which parallelizes KNeighborsRegressor(), take more time than Case 1?
Q2
If nested parallelization is worse than parallelization, why is case 4 with nested parallelization taking less time than case 3 with parallelization of KNeighborsRegressor()?
Q3
If nested parallelization is worse than no parallelization, why is case 4 with nested parallelization taking less time than case 1 with no parallelization?
Q4
If nested parallelization is the best scenario, why is Case 4 with nested parallelization taking more time than Case 2 with parallelization in cross_val_score()?