It’s been a while since I had a look at the UCI Machine Learning Repository. So, let’s have a look at a recently uploaded dataset: the QSAR aquatic toxicity dataset, which originates from work by Cassotti et al. (2014).
Dataset exploration
The dataset describes the aquatic toxicity of chemicals towards Daphnia magna; each chemical is encoded by eight QSAR (Quantitative Structure-Activity Relationships) molecular descriptors, and the target is the LC50 toxicity value. I’m not going into details here, just read the paper!
Okay, let’s see what we’ve got here:
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, median_absolute_error
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from skgarden.mondrian import MondrianForestRegressor
import xgboost
columns = ["TPSA","SAacc", "H-050", "MLOGP", "RDCHI", "GATS1p", "nN", "C-040", "LC50"]
inputData = pd.read_csv("./data/qsar_aquatic_toxicity.csv",delimiter=";",names=columns)
display(inputData.sample(10))
display(inputData.describe())
| | TPSA | SAacc | H-050 | MLOGP | RDCHI | GATS1p | nN | C-040 | LC50 |
|---|---|---|---|---|---|---|---|---|---|
| 336 | 77.76 | 153.195 | 3 | 0.559 | 2.137 | 0.767 | 0 | 0 | 3.669 |
| 457 | 47.36 | 56.055 | 0 | 3.373 | 3.471 | 0.992 | 3 | 0 | 4.943 |
| 216 | 0.00 | 0.000 | 0 | 2.604 | 1.475 | 0.505 | 0 | 0 | 3.460 |
| 340 | 54.57 | 41.028 | 0 | 4.789 | 3.580 | 1.505 | 0 | 0 | 5.512 |
| 512 | 32.67 | 45.055 | 0 | 3.355 | 3.097 | 0.889 | 2 | 1 | 4.821 |
| 156 | 69.48 | 142.262 | 0 | 4.096 | 3.884 | 1.169 | 2 | 1 | 5.742 |
| 253 | 20.23 | 42.683 | 1 | 2.461 | 1.975 | 0.737 | 0 | 0 | 5.691 |
| 508 | 45.82 | 50.747 | 0 | 3.110 | 2.154 | 0.478 | 1 | 0 | 4.251 |
| 14 | 92.14 | 107.097 | 1 | 4.275 | 4.174 | 1.174 | 1 | 1 | 4.112 |
| 2 | 9.23 | 11.000 | 0 | 5.799 | 2.930 | 0.486 | 0 | 0 | 7.019 |
| | TPSA | SAacc | H-050 | MLOGP | RDCHI | GATS1p | nN | C-040 | LC50 |
|---|---|---|---|---|---|---|---|---|---|
| count | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 | 546.000000 |
| mean | 48.472930 | 58.869018 | 0.937729 | 2.313493 | 2.492299 | 1.046264 | 1.003663 | 0.353480 | 4.658421 |
| std | 46.763983 | 68.166554 | 1.618632 | 1.741797 | 0.811004 | 0.403677 | 1.397240 | 0.806827 | 1.665215 |
| min | 0.000000 | 0.000000 | 0.000000 | -6.446000 | 1.000000 | 0.281000 | 0.000000 | 0.000000 | 0.122000 |
| 25% | 15.790000 | 11.000000 | 0.000000 | 1.232500 | 1.975000 | 0.737000 | 0.000000 | 0.000000 | 3.601500 |
| 50% | 40.460000 | 42.683000 | 0.000000 | 2.273500 | 2.344000 | 1.020500 | 1.000000 | 0.000000 | 4.516000 |
| 75% | 70.022500 | 77.492750 | 1.000000 | 3.392750 | 2.911000 | 1.266500 | 2.000000 | 0.000000 | 5.607500 |
| max | 347.320000 | 571.952000 | 18.000000 | 9.148000 | 6.439000 | 2.500000 | 11.000000 | 11.000000 | 10.047000 |
Let’s summarize:
- a small dataset (546 samples)
- 8 features (molecular descriptors)
- 1 target variable (LC50)
Let’s have a look at some dataset properties:
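For example, the pairwise Pearson correlations between the descriptors and the LC50 target, as a minimal sketch built on the seaborn import from above:
# minimal sketch: pairwise Pearson correlations of all descriptors and LC50
plt.figure(figsize=(8, 6))
sns.heatmap(inputData.corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Pairwise correlations (descriptors and LC50)")
plt.tight_layout()
plt.show()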
Brute force approach
Let’s throw some algorithms at it and see how it goes ;)
y = inputData["LC50"].copy(deep=True)
X = inputData.copy(deep=True)
X.drop(["LC50"], inplace=True, axis=1)
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)
# NOTE: the split below uses the unscaled X, so X_scaled is never used and all
# models are trained on the raw feature values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
datasets = {}
datasets[0] = {'X_train': X_train,
'X_test' : X_test,
'y_train': y_train,
'y_test' : y_test}
def train_test_linear_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset_id):
linear_regression = LinearRegression()
grid_parameters_linear_regression = {'fit_intercept' : [False, True]}
start_time = time.time()
grid_obj = GridSearchCV(linear_regression,
param_grid=grid_parameters_linear_regression,
cv=cv_count,
n_jobs=-1,
scoring=scorer,
verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)
training_time = time.time() - start_time
best_estimator = grid_fit.best_estimator_
infStartTime = time.time()
prediction = best_estimator.predict(X_test)
prediction_time = time.time() - infStartTime
r2 = r2_score(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
medae = median_absolute_error(y_true=y_test, y_pred=prediction)
# metrics on the true target scale: R2 is scale-invariant, while MSE/MAE would
# change if the target had been scaled; y is left unscaled here, so the
# true-scale values are identical to the ones above
mae_true_scale = mae
medae_true_scale = medae
mse_true_scale = mse
return {'Regression type' : 'Linear Regression',
'model' : grid_fit,
'Predictions' : prediction,
'R2' : r2,
'MSE' : mse,
'MAE' : mae,
'MSE_true_scale' : mse_true_scale,
'RMSE_true_scale' : np.sqrt(mse_true_scale),
'MAE_true_scale' : mae_true_scale,
'MedAE_true_scale' : medae_true_scale,
'Training time' : training_time,
'Prediction time' : prediction_time,
'dataset' : dataset_id}
def train_test_bRidge_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset_id):
bRidge_regression = BayesianRidge()
grid_parameters_BayesianRidge_regression = {'fit_intercept' : [False, True],
'n_iter':[300,1000,5000]}
start_time = time.time()
grid_obj = GridSearchCV(bRidge_regression,
param_grid=grid_parameters_BayesianRidge_regression,
cv=cv_count,
n_jobs=-1,
scoring=scorer,
verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)
training_time = time.time() - start_time
best_estimator = grid_fit.best_estimator_
infStartTime = time.time()
prediction = best_estimator.predict(X_test)
prediction_time = time.time() - infStartTime
r2 = r2_score(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
medae = median_absolute_error(y_true=y_test, y_pred=prediction)
# y is unscaled, so the true-scale metrics equal the ones above
mae_true_scale = mae
medae_true_scale = medae
mse_true_scale = mse
return {'Regression type' : 'Bayesian Ridge Regression',
'model' : grid_fit,
'Predictions' : prediction,
'R2' : r2,
'MSE' : mse,
'MAE' : mae,
'MSE_true_scale' : mse_true_scale,
'RMSE_true_scale' : np.sqrt(mse_true_scale),
'MAE_true_scale' : mae_true_scale,
'MedAE_true_scale' : medae_true_scale,
'Training time' : training_time,
'Prediction time' : prediction_time,
'dataset' : dataset_id}
def train_test_decision_tree_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset_id):
decision_tree_regression = DecisionTreeRegressor(random_state=42)
grid_parameters_decision_tree_regression = {'max_depth' : [None, 3,5,7,9,10,11]}
start_time = time.time()
grid_obj = GridSearchCV(decision_tree_regression,
param_grid=grid_parameters_decision_tree_regression,
cv=cv_count,
n_jobs=-1,
scoring=scorer,
verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)
training_time = time.time() - start_time
best_estimator = grid_fit.best_estimator_
infStartTime = time.time()
prediction = best_estimator.predict(X_test)
prediction_time = time.time() - infStartTime
r2 = r2_score(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
medae = median_absolute_error(y_true=y_test, y_pred=prediction)
# y is unscaled, so the true-scale metrics equal the ones above
mae_true_scale = mae
medae_true_scale = medae
mse_true_scale = mse
return {'Regression type' : 'Decision Tree Regression',
'model' : grid_fit,
'Predictions' : prediction,
'R2' : r2,
'MSE' : mse,
'MAE' : mae,
'MSE_true_scale' : mse_true_scale,
'RMSE_true_scale' : np.sqrt(mse_true_scale),
'MAE_true_scale' : mae_true_scale,
'MedAE_true_scale' : medae_true_scale,
'Training time' : training_time,
'Prediction time' : prediction_time,
'dataset' : dataset_id}
def train_test_knn_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset_id):
knn_regression = KNeighborsRegressor()
grid_parameters_knn_regression = {'n_neighbors' : [1,2,3],
'weights': ['uniform', 'distance'],
'algorithm': ['ball_tree', 'kd_tree'],
'leaf_size': [30,90,100,110],
'p': [1,2]}
start_time = time.time()
grid_obj = GridSearchCV(knn_regression,
param_grid=grid_parameters_knn_regression,
cv=cv_count,
n_jobs=-1,
scoring=scorer,
verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)
training_time = time.time() - start_time
best_estimator = grid_fit.best_estimator_
infStartTime = time.time()
prediction = best_estimator.predict(X_test)
prediction_time = time.time() - infStartTime
r2 = r2_score(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
medae = median_absolute_error(y_true=y_test, y_pred=prediction)
# y is unscaled, so the true-scale metrics equal the ones above
mae_true_scale = mae
medae_true_scale = medae
mse_true_scale = mse
return {'Regression type' : 'KNN Regression',
'model' : grid_fit,
'Predictions' : prediction,
'R2' : r2,
'MSE' : mse,
'MAE' : mae,
'MSE_true_scale' : mse_true_scale,
'RMSE_true_scale' : np.sqrt(mse_true_scale),
'MAE_true_scale' : mae_true_scale,
'MedAE_true_scale' : medae_true_scale,
'Training time' : training_time,
'Prediction time' : prediction_time,
'dataset' : dataset_id}
def train_test_SVR_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset_id):
SVR_regression = LinearSVR()
grid_parameters_SVR_regression = {'C' : [1, 10, 50],
'epsilon' : [0.01, 0.1],
'fit_intercept' : [False, True]}
start_time = time.time()
grid_obj = GridSearchCV(SVR_regression,
param_grid=grid_parameters_SVR_regression,
cv=cv_count,
n_jobs=-1,
scoring=scorer,
verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)
training_time = time.time() - start_time
best_estimator = grid_fit.best_estimator_
infStartTime = time.time()
prediction = best_estimator.predict(X_test)
prediction_time = time.time() - infStartTime
r2 = r2_score(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
medae = median_absolute_error(y_true=y_test, y_pred=prediction)
# y is unscaled, so the true-scale metrics equal the ones above
mae_true_scale = mae
medae_true_scale = medae
mse_true_scale = mse
return {'Regression type' : 'Linear SVM Regression',
'model' : grid_fit,
'Predictions' : prediction,
'R2' : r2,
'MSE' : mse,
'MAE' : mae,
'MSE_true_scale' : mse_true_scale,
'RMSE_true_scale' : np.sqrt(mse_true_scale),
'MAE_true_scale' : mae_true_scale,
'MedAE_true_scale' : medae_true_scale,
'Training time' : training_time,
'Prediction time' : prediction_time,
'dataset' : dataset_id}
def train_test_random_forest_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset_id):
random_forest_regression = RandomForestRegressor(random_state=42)
grid_parameters_random_forest_regression = {'n_estimators' : [3,5,10,15,18],
'max_depth' : [None, 2,3,5,7,9]}
start_time = time.time()
grid_obj = GridSearchCV(random_forest_regression,
param_grid=grid_parameters_random_forest_regression,
cv=cv_count,
n_jobs=-1,
scoring=scorer,
verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)
training_time = time.time() - start_time
best_estimator = grid_fit.best_estimator_
infStartTime = time.time()
prediction = best_estimator.predict(X_test)
prediction_time = time.time() - infStartTime
r2 = r2_score(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
medae = median_absolute_error(y_true=y_test, y_pred=prediction)
# y is unscaled, so the true-scale metrics equal the ones above
mae_true_scale = mae
medae_true_scale = medae
mse_true_scale = mse
return {'Regression type' : 'Random Forest Regression',
'model' : grid_fit,
'Predictions' : prediction,
'R2' : r2,
'MSE' : mse,
'MAE' : mae,
'MSE_true_scale' : mse_true_scale,
'RMSE_true_scale' : np.sqrt(mse_true_scale),
'MAE_true_scale' : mae_true_scale,
'MedAE_true_scale' : medae_true_scale,
'Training time' : training_time,
'Prediction time' : prediction_time,
'dataset' : dataset_id}
def train_test_mondrian_forest_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset_id):
mondrian_forest_regression = MondrianForestRegressor(random_state=42)
grid_parameters_mondrian_forest_regression = {'n_estimators' : [3,5,10,15,18,100],
'max_depth' : [None, 2,3,5,7,9,25,50]}
start_time = time.time()
grid_obj = GridSearchCV(mondrian_forest_regression,
param_grid=grid_parameters_mondrian_forest_regression,
cv=cv_count,
n_jobs=-1,
scoring=scorer,
verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)
training_time = time.time() - start_time
best_estimator = grid_fit.best_estimator_
infStartTime = time.time()
prediction = best_estimator.predict(X_test)
prediction_time = time.time() - infStartTime
r2 = r2_score(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
medae = median_absolute_error(y_true=y_test, y_pred=prediction)
# y is unscaled, so the true-scale metrics equal the ones above
mae_true_scale = mae
medae_true_scale = medae
mse_true_scale = mse
return {'Regression type' : 'Mondrian Forest Regression',
'model' : grid_fit,
'Predictions' : prediction,
'R2' : r2,
'MSE' : mse,
'MAE' : mae,
'MSE_true_scale' : mse_true_scale,
'RMSE_true_scale' : np.sqrt(mse_true_scale),
'MAE_true_scale' : mae_true_scale,
'MedAE_true_scale' : medae_true_scale,
'Training time' : training_time,
'Prediction time' : prediction_time,
'dataset' : dataset_id}
def xgboost_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset_id):
x_gradient_boosting_regression = xgboost.XGBRegressor(random_state=42)
grid_parameters_x_gradient_boosting_regression = {'n_estimators' : [3,5,10,15,50,60,80,100,200,300],
'max_depth' : [1,2, 3,5,7,9,10,11,15],
'learning_rate' :[ 0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.8, 1.0],
}
start_time = time.time()
grid_obj = GridSearchCV(x_gradient_boosting_regression,
param_grid=grid_parameters_x_gradient_boosting_regression,
cv=cv_count,
n_jobs=-1,
scoring=scorer,
verbose=2)
grid_fit = grid_obj.fit(X_train, y_train)
training_time = time.time() - start_time
best_estimator = grid_fit.best_estimator_
infStartTime = time.time()
prediction = best_estimator.predict(X_test)
prediction_time = time.time() - infStartTime
r2 = r2_score(y_true=y_test, y_pred=prediction)
mse = mean_squared_error(y_true=y_test, y_pred=prediction)
mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
medae = median_absolute_error(y_true=y_test, y_pred=prediction)
# y is unscaled, so the true-scale metrics equal the ones above
mae_true_scale = mae
medae_true_scale = medae
mse_true_scale = mse
return {'Regression type' : 'XGBoost Regression',
'model' : grid_fit,
'Predictions' : prediction,
'R2' : r2,
'MSE' : mse,
'MAE' : mae,
'MSE_true_scale' : mse_true_scale,
'RMSE_true_scale' : np.sqrt(mse_true_scale),
'MAE_true_scale' : mae_true_scale,
'MedAE_true_scale' : medae_true_scale,
'Training time' : training_time,
'Prediction time' : prediction_time,
'dataset' : dataset_id}
# scoring metric used by all grid searches below
scorer = 'neg_mean_squared_error'
results = {}
counter = 0
cv_count = 5
for dataset in [0]:
X_train, X_test, y_train, y_test = datasets[dataset]['X_train'], datasets[dataset]['X_test'], datasets[dataset]['y_train'], datasets[dataset]['y_test']
results[counter] = train_test_linear_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset)
print("Linear Regression completed")
counter += 1
results[counter] = train_test_bRidge_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset)
print("Bayesian Ridge Regression completed")
counter += 1
results[counter] = train_test_decision_tree_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset)
print("Decision Trees completed")
counter += 1
results[counter] = train_test_knn_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset)
print("KNN completed")
counter += 1
results[counter] = train_test_SVR_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset)
print("SVR completed")
counter += 1
results[counter] = train_test_random_forest_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset)
print("Random Forest completed")
counter += 1
results[counter] = train_test_mondrian_forest_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset)
print("Mondrian Forest completed")
counter += 1
results[counter] = xgboost_regression(X_train,
X_test,
y_train,
y_test,
cv_count,
scorer,
dataset)
print("XGBoost completed")
counter += 1
results_df = pd.DataFrame.from_dict(results, orient='index')
display(results_df)
results_df.to_csv('results_df_manual.csv')
pickle.dump(results, open('results_manual.p', 'wb'))
| | Regression type | model | Predictions | R2 | MSE | MAE | MSE_true_scale | RMSE_true_scale | MAE_true_scale | MedAE_true_scale | Training time | Prediction time | dataset |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Linear Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [3.5338844628594983, 4.458763905734787, 4.2549... | 0.499440 | 1.502224 | 0.930824 | 1.502224 | 1.225652 | 0.930824 | 0.734009 | 1.573259 | 0.001276 | 0 |
| 1 | Bayesian Ridge Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [3.5632073495510084, 4.361513694406263, 4.2435... | 0.500691 | 1.498469 | 0.926011 | 1.498469 | 1.224120 | 0.926011 | 0.728115 | 0.111000 | 0.000795 | 0 |
| 2 | Decision Tree Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [4.0135, 3.54264864864865, 4.0135, 3.747043478... | 0.392786 | 1.822302 | 1.009686 | 1.822302 | 1.349927 | 1.009686 | 0.861600 | 0.104871 | 0.001365 | 0 |
| 3 | KNN Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [3.510695042287087, 5.552267935344628, 3.45457... | 0.478214 | 1.565925 | 0.916530 | 1.565925 | 1.251369 | 0.916530 | 0.661797 | 0.582334 | 0.002165 | 0 |
| 4 | Linear SVM Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [6.219684172226497, 4.92922723439761, 7.043169... | -0.436243 | 4.310286 | 1.564261 | 4.310286 | 2.076123 | 1.564261 | 1.235107 | 0.365867 | 0.000695 | 0 |
| 5 | Random Forest Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [4.202572017409114, 4.127909161387632, 3.34896... | 0.457783 | 1.627240 | 0.945932 | 1.627240 | 1.275633 | 0.945932 | 0.770660 | 0.841694 | 0.001862 | 0 |
| 6 | Mondrian Forest Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [4.313382453918457, 5.6062146914005275, 4.0430... | 0.469886 | 1.590917 | 0.922933 | 1.590917 | 1.261316 | 0.922933 | 0.652625 | 1.849978 | 0.023086 | 0 |
| 7 | XGBoost Regression | GridSearchCV(cv=5, error_score='raise-deprecat... | [4.666296, 4.5902467, 3.343176, 3.8965771, 4.3... | 0.483594 | 1.549778 | 0.905769 | 1.549778 | 1.244901 | 0.905769 | 0.768517 | 34.654719 | 0.001308 | 0 |
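To rank the models more conveniently, the summary could be sorted by test-set R2 (a convenience view, not part of the run above):
# rank models by test-set R2, showing only the most interesting columns
display(results_df.sort_values(by='R2', ascending=False)[['Regression type', 'R2', 'RMSE_true_scale', 'Training time']])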
Well, that doesn’t look so good. The baseline result in Cassotti et al. (2014) is “R2 and Q2cv equal to 0.78, Q2ext equal to 0.7”. However, without a threshold on the Mahalanobis distance, their baseline kNN result drops to R2 = 0.6 with Q2cv = Q2ext = 0.43. That is pretty similar to our results here, since Q2 is essentially R2 evaluated under a different protocol: as a cross-validation score (Q2cv) or as the score on the external test set (Q2ext). In general, we could also question whether R2 is the right metric here. Furthermore, the paper lacks many interesting details on how the data was split.
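Since Q2 is just R2 under a different evaluation protocol, comparable numbers are easy to compute; a minimal sketch with scikit-learn, using linear regression as a stand-in model:
from sklearn.model_selection import cross_val_score
# Q2cv analogue: R2 estimated by 5-fold cross-validation on the training data
q2_cv = cross_val_score(LinearRegression(), X_train, y_train, cv=5, scoring='r2').mean()
# Q2ext analogue: R2 on the held-out external test set
q2_ext = r2_score(y_test, LinearRegression().fit(X_train, y_train).predict(X_test))
print("Q2cv:", q2_cv, "Q2ext:", q2_ext)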
TPOT
So, let’s see if TPOT comes up with something better.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics
from tpot import TPOTRegressor
columns = ["TPSA","SAacc", "H-050", "MLOGP", "RDCHI", "GATS1p", "nN", "C-040", "LC50"]
input_data = pd.read_csv("./data/qsar_aquatic_toxicity.csv",delimiter=";",names=columns)
# split data into X and y
y = input_data["LC50"].copy(deep=True)
X = input_data.copy(deep=True)
X.drop(["LC50"], inplace=True, axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.25,
shuffle=True,
random_state=42)
tpot = TPOTRegressor(max_time_mins=60,
                     verbosity=2,
                     n_jobs=-1)  # no random_state set, so results will vary between runs
tpot.fit(X_train,y_train)
tpot.export('QSAR_aquatic_toxicity.py')
y_predictions = tpot.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_predictions)
mae = sklearn.metrics.mean_absolute_error(y_test, y_predictions)
mse = sklearn.metrics.mean_squared_error(y_test, y_predictions)
rmse = np.sqrt(mse)
print("R2 score:", r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
R2 score: 0.4802354355830183
MAE: 0.9276848574311194
MSE: 1.5598575121614906
RMSE: 1.2489425575908168
Well, same shit, different approach ;).
TPOT’s final pipeline is:
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Binarizer, MaxAbsScaler
from tpot.builtins import OneHotEncoder, StackingEstimator
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy
# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
train_test_split(features, tpot_data['target'].values, random_state=None)
# Average CV score on the training set was:-0.9705966623633835
exported_pipeline = make_pipeline(
make_union(
make_pipeline(
make_union(
FunctionTransformer(copy),
FastICA(tol=0.35000000000000003)
),
MaxAbsScaler()
),
make_union(
FunctionTransformer(copy),
Binarizer(threshold=0.9500000000000001)
)
),
OneHotEncoder(minimum_fraction=0.05, sparse=False, threshold=10),
XGBRegressor(learning_rate=0.1, max_depth=9, min_child_weight=2, n_estimators=100, nthread=1, subsample=0.8500000000000001)
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
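Note that the exported script keeps TPOT’s generic placeholders ('PATH/TO/DATA/FILE', 'target'). A sketch of how its loading block could be adapted to this dataset, assuming the same file layout as above:
# name the LC50 column 'target' so the exported pipeline's drop('target', axis=1) works
columns = ["TPSA", "SAacc", "H-050", "MLOGP", "RDCHI", "GATS1p", "nN", "C-040", "target"]
tpot_data = pd.read_csv("./data/qsar_aquatic_toxicity.csv", sep=";", names=columns, dtype=np.float64)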