Revisiting Machine Learning Datasets

It’s been a while since I had a look at the UCI Machine Learning Repository. So, let’s have a look at a recently uploaded dataset, the QSAR aquatic toxicity which originates from work by Cassotti et al. (2014).

Contents

Dataset exploration
Brute force approach
TPOT

Dataset exploration

The dataset describes aquatic toxicity of substances for Daphnia magna using QSAR (Quantitative Structure-Activity Relationships) to describe these chemicals. I’m not going into details here, just read the paper!

Okay, let’s see what we got here:

import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error, median_absolute_error
import sklearn.metrics
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from skgarden.mondrian import MondrianForestRegressor
import xgboost

columns = ["TPSA","SAacc", "H-050", "MLOGP", "RDCHI", "GATS1p", "nN", "C-040", "LC50"]
inputData = pd.read_csv("./data/qsar_aquatic_toxicity.csv",delimiter=";",names=columns)

display(inputData.sample(10))
display(inputData.describe())

	TPSA	SAacc	H-050	MLOGP	RDCHI	GATS1p	nN	C-040	LC50
336	77.76	153.195	3	0.559	2.137	0.767	0	0	3.669
457	47.36	56.055	0	3.373	3.471	0.992	3	0	4.943
216	0.00	0.000	0	2.604	1.475	0.505	0	0	3.460
340	54.57	41.028	0	4.789	3.580	1.505	0	0	5.512
512	32.67	45.055	0	3.355	3.097	0.889	2	1	4.821
156	69.48	142.262	0	4.096	3.884	1.169	2	1	5.742
253	20.23	42.683	1	2.461	1.975	0.737	0	0	5.691
508	45.82	50.747	0	3.110	2.154	0.478	1	0	4.251
14	92.14	107.097	1	4.275	4.174	1.174	1	1	4.112
2	9.23	11.000	0	5.799	2.930	0.486	0	0	7.019

	TPSA	SAacc	H-050	MLOGP	RDCHI	GATS1p	nN	C-040	LC50
count	546.000000	546.000000	546.000000	546.000000	546.000000	546.000000	546.000000	546.000000	546.000000
mean	48.472930	58.869018	0.937729	2.313493	2.492299	1.046264	1.003663	0.353480	4.658421
std	46.763983	68.166554	1.618632	1.741797	0.811004	0.403677	1.397240	0.806827	1.665215
min	0.000000	0.000000	0.000000	-6.446000	1.000000	0.281000	0.000000	0.000000	0.122000
25%	15.790000	11.000000	0.000000	1.232500	1.975000	0.737000	0.000000	0.000000	3.601500
50%	40.460000	42.683000	0.000000	2.273500	2.344000	1.020500	1.000000	0.000000	4.516000
75%	70.022500	77.492750	1.000000	3.392750	2.911000	1.266500	2.000000	0.000000	5.607500
max	347.320000	571.952000	18.000000	9.148000	6.439000	2.500000	11.000000	11.000000	10.047000

Let’s summarize:

small dataset
8 features
1 target variable

Let’s have a look at some dataset properties:

Brute force approach

Let’s see throw some algorithms at it and see how it ends ;)

y = inputData["LC50"].copy(deep=True)
X = inputData.copy(deep=True)
X.drop(["LC50"], inplace=True, axis=1)
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)
datasets = {}
datasets[0] = {'X_train': X_train, 
               'X_test' : X_test,
               'y_train': y_train, 
               'y_test' : y_test}


def train_test_linear_regression(X_train,
                                 X_test,
                                 y_train,
                                 y_test,
                                 cv_count,
                                 scorer,
                                 dataset_id):
    linear_regression = LinearRegression()
    grid_parameters_linear_regression = {'fit_intercept' : [False, True]}
    start_time = time.time()
    grid_obj = GridSearchCV(linear_regression,
                            param_grid=grid_parameters_linear_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime 
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Linear Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_bRidge_regression(X_train,
                                 X_test,
                                 y_train,
                                 y_test,
                                 cv_count,
                                 scorer,
                                 dataset_id):
    bRidge_regression = BayesianRidge()
    grid_parameters_BayesianRidge_regression = {'fit_intercept' : [False, True], 
                                                'n_iter':[300,1000,5000]}
    start_time = time.time()
    grid_obj = GridSearchCV(bRidge_regression,
                            param_grid=grid_parameters_BayesianRidge_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    

    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Bayesian Ridge Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_decision_tree_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    decision_tree_regression = DecisionTreeRegressor(random_state=42)
    grid_parameters_decision_tree_regression = {'max_depth' : [None, 3,5,7,9,10,11]}
    start_time = time.time()
    grid_obj = GridSearchCV(decision_tree_regression,
                            param_grid=grid_parameters_decision_tree_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    

    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Decision Tree Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_knn_regression(X_train,
                              X_test,
                              y_train,
                              y_test,
                              cv_count,
                              scorer,
                              dataset_id):
    knn_regression = KNeighborsRegressor()
    grid_parameters_knn_regression = {'n_neighbors' : [1,2,3],
                                      'weights': ['uniform', 'distance'],
                                      'algorithm': ['ball_tree', 'kd_tree'],
                                      'leaf_size': [30,90,100,110],
                                      'p': [1,2]}
    start_time = time.time()
    grid_obj = GridSearchCV(knn_regression,
                            param_grid=grid_parameters_knn_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'KNN Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_SVR_regression(X_train,
                              X_test,
                              y_train,
                              y_test,
                              cv_count,
                              scorer,
                              dataset_id):
    SVR_regression = LinearSVR()
    grid_parameters_SVR_regression = {'C' : [1, 10, 50],
                                     'epsilon' : [0.01, 0.1],
                                     'fit_intercept' : [False, True]}
    start_time = time.time()
    grid_obj = GridSearchCV(SVR_regression,
                            param_grid=grid_parameters_SVR_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStarTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStarTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Linear SVM Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_random_forest_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    random_forest_regression = RandomForestRegressor(random_state=42)
    grid_parameters_random_forest_regression = {'n_estimators' : [3,5,10,15,18],
                                     'max_depth' : [None, 2,3,5,7,9]}
    start_time = time.time()
    grid_obj = GridSearchCV(random_forest_regression,
                            param_grid=grid_parameters_random_forest_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Random Forest Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,            
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_mondrian_forest_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    mondrian_forest_regression = MondrianForestRegressor(random_state=42)
    grid_parameters_mondrian_forest_regression = {'n_estimators' : [3,5,10,15,18,100],
                                     'max_depth' : [None, 2,3,5,7,9,25,50]}
    start_time = time.time()
    grid_obj = GridSearchCV(mondrian_forest_regression,
                            param_grid=grid_parameters_mondrian_forest_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Mondrian Forest Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,            
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def xgboost_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    x_gradient_boosting_regression = xgboost.XGBRegressor(random_state=42)
    grid_parameters_x_gradient_boosting_regression = {'n_estimators' : [3,5,10,15,50,60,80,100,200,300],
                                                    'max_depth' : [1,2, 3,5,7,9,10,11,15],
                                                    'learning_rate' :[ 0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.8, 1.0],
                                                     }
    start_time = time.time()
    grid_obj = GridSearchCV(x_gradient_boosting_regression,
                            param_grid=grid_parameters_x_gradient_boosting_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'XGBoost Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

# make scorer
scorer = 'neg_mean_squared_error'
results = {}
counter = 0
cv_count = 5

for dataset in [0]:
    X_train, X_test, y_train, y_test = datasets[dataset]['X_train'], datasets[dataset]['X_test'], datasets[dataset]['y_train'], datasets[dataset]['y_test']
    results[counter] = train_test_linear_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Linear Regression completed")
    counter += 1
    results[counter] = train_test_bRidge_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Bayesian Ridge Regression completed")
    counter += 1
    results[counter] = train_test_decision_tree_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Decision Trees completed")
    counter += 1
    results[counter] = train_test_knn_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("KNN completed")
    counter += 1
    results[counter] = train_test_SVR_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("SVR completed")
    counter += 1
    results[counter] = train_test_random_forest_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Random Forest completed")
    counter += 1
    results[counter] = train_test_mondrian_forest_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Mondrian Forest completed")
    counter += 1
    results[counter] = xgboost_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("XGBoost completed")
    counter += 1

results_df = pd.DataFrame.from_dict(results, orient='index')
display(results_df)
results_df.to_csv('results_df_manual.csv')
pickle.dump(results, open('results_manual.p', 'wb'))


  
    
      
      Regression type
      model
      Predictions
      R2
      MSE
      MAE
      MSE_true_scale
      RMSE_true_scale
      MAE_true_scale
      MedAE_true_scale
      Training time
      Prediction time
      dataset
    
  
  
    
      0
      Linear Regression
      GridSearchCV(cv=5, error_score='raise-deprecat...
      [3.5338844628594983, 4.458763905734787, 4.2549...
      0.499440
      1.502224
      0.930824
      1.502224
      1.225652
      0.930824
      0.734009
      1.573259
      0.001276
      0
    
    
      1
      Bayesian Ridge Regression
      GridSearchCV(cv=5, error_score='raise-deprecat...
      [3.5632073495510084, 4.361513694406263, 4.2435...
      0.500691
      1.498469
      0.926011
      1.498469
      1.224120
      0.926011
      0.728115
      0.111000
      0.000795
      0
    
    
      2
      Decision Tree Regression
      GridSearchCV(cv=5, error_score='raise-deprecat...
      [4.0135, 3.54264864864865, 4.0135, 3.747043478...
      0.392786
      1.822302
      1.009686
      1.822302
      1.349927
      1.009686
      0.861600
      0.104871
      0.001365
      0
    
    
      3
      KNN Regression
      GridSearchCV(cv=5, error_score='raise-deprecat...
      [3.510695042287087, 5.552267935344628, 3.45457...
      0.478214
      1.565925
      0.916530
      1.565925
      1.251369
      0.916530
      0.661797
      0.582334
      0.002165
      0
    
    
      4
      Linear SVM Regression
      GridSearchCV(cv=5, error_score='raise-deprecat...
      [6.219684172226497, 4.92922723439761, 7.043169...
      -0.436243
      4.310286
      1.564261
      4.310286
      2.076123
      1.564261
      1.235107
      0.365867
      0.000695
      0
    
    
      5
      Random Forest Regression
      GridSearchCV(cv=5, error_score='raise-deprecat...
      [4.202572017409114, 4.127909161387632, 3.34896...
      0.457783
      1.627240
      0.945932
      1.627240
      1.275633
      0.945932
      0.770660
      0.841694
      0.001862
      0
    
    
      6
      Mondrian Forest Regression
      GridSearchCV(cv=5, error_score='raise-deprecat...
      [4.313382453918457, 5.6062146914005275, 4.0430...
      0.469886
      1.590917
      0.922933
      1.590917
      1.261316
      0.922933
      0.652625
      1.849978
      0.023086
      0
    
    
      7
      XGBoost Regression
      GridSearchCV(cv=5, error_score='raise-deprecat...
      [4.666296, 4.5902467, 3.343176, 3.8965771, 4.3...
      0.483594
      1.549778
      0.905769
      1.549778
      1.244901
      0.905769
      0.768517
      34.654719
      0.001308
      0

	Regression type	model	Predictions	R2	MSE	MAE	MSE_true_scale	RMSE_true_scale	MAE_true_scale	MedAE_true_scale	Training time	Prediction time
0	Linear Regression	GridSearchCV(cv=5, error_score='raise-deprecat...	[3.5338844628594983, 4.458763905734787, 4.2549...	0.499440	1.502224	0.930824	1.502224	1.225652	0.930824	0.734009	1.573259	0.001276
1	Bayesian Ridge Regression	GridSearchCV(cv=5, error_score='raise-deprecat...	[3.5632073495510084, 4.361513694406263, 4.2435...	0.500691	1.498469	0.926011	1.498469	1.224120	0.926011	0.728115	0.111000	0.000795
2	Decision Tree Regression	GridSearchCV(cv=5, error_score='raise-deprecat...	[4.0135, 3.54264864864865, 4.0135, 3.747043478...	0.392786	1.822302	1.009686	1.822302	1.349927	1.009686	0.861600	0.104871	0.001365
3	KNN Regression	GridSearchCV(cv=5, error_score='raise-deprecat...	[3.510695042287087, 5.552267935344628, 3.45457...	0.478214	1.565925	0.916530	1.565925	1.251369	0.916530	0.661797	0.582334	0.002165
4	Linear SVM Regression	GridSearchCV(cv=5, error_score='raise-deprecat...	[6.219684172226497, 4.92922723439761, 7.043169...	-0.436243	4.310286	1.564261	4.310286	2.076123	1.564261	1.235107	0.365867	0.000695
5	Random Forest Regression	GridSearchCV(cv=5, error_score='raise-deprecat...	[4.202572017409114, 4.127909161387632, 3.34896...	0.457783	1.627240	0.945932	1.627240	1.275633	0.945932	0.770660	0.841694	0.001862
6	Mondrian Forest Regression	GridSearchCV(cv=5, error_score='raise-deprecat...	[4.313382453918457, 5.6062146914005275, 4.0430...	0.469886	1.590917	0.922933	1.590917	1.261316	0.922933	0.652625	1.849978	0.023086
7	XGBoost Regression	GridSearchCV(cv=5, error_score='raise-deprecat...	[4.666296, 4.5902467, 3.343176, 3.8965771, 4.3...	0.483594	1.549778	0.905769	1.549778	1.244901	0.905769	0.768517	34.654719	0.001308

Well, that doesn’t look so good. The baseline result in Cassotti et al. (2014) is “R2 and Q2cv equal to 0.78, Q2ext equal to 0.7”. However, without a threshold using the Mahalanobis distance, the baseline kNN result is R2 =0.6, Q2cv and Q2_ext = 0.43. This result is pretty similar to our result here because the Q2 value is basically the R2 but as cross-validation score (Q2cv) and as score on the test set (Q2ext). Well, in general we could also question if R2 is the correct metric here. Furthermore, the paper lacks of many interesting details how data was split.

TPOT

So, let’s see if TPOT comes up with something better.

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error
import sklearn.metrics
from tpot import TPOTRegressor

columns = ["TPSA","SAacc", "H-050", "MLOGP", "RDCHI", "GATS1p", "nN", "C-040", "LC50"]
input_data = pd.read_csv("./data/qsar_aquatic_toxicity.csv",delimiter=";",names=columns)
# split data into X and y
y = input_data["LC50"].copy(deep=True)
X = input_data.copy(deep=True)
X.drop(["LC50"], inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                     y,
                                                     test_size=0.25,
                                                     shuffle=True,
                                                     random_state=42)
tpot = TPOTRegressor(max_time_mins=60,
                     verbosity=2,
                     n_jobs=-1)
tpot.fit(X_train,y_train)
tpot.export('QSAR_aquatic_toxicity.py')
y_predictions = tpot.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_predictions)
mae = sklearn.metrics.mean_absolute_error(y_test, y_predictions)
mse = sklearn.metrics.mean_squared_error(y_test, y_predictions)
rmse = np.sqrt(mse)
print("R2 score:", r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)

R2 score: 0.4802354355830183
MAE: 0.9276848574311194
MSE: 1.5598575121614906
RMSE: 1.2489425575908168

Well, same shit, different approach ;).

TPOT’s final pipeline is:

import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Binarizer, MaxAbsScaler
from tpot.builtins import OneHotEncoder, StackingEstimator
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-0.9705966623633835
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            make_union(
                FunctionTransformer(copy),
                FastICA(tol=0.35000000000000003)
            ),
            MaxAbsScaler()
        ),
        make_union(
            FunctionTransformer(copy),
            Binarizer(threshold=0.9500000000000001)
        )
    ),
    OneHotEncoder(minimum_fraction=0.05, sparse=False, threshold=10),
    XGBRegressor(learning_rate=0.1, max_depth=9, min_child_weight=2, n_estimators=100, nthread=1, subsample=0.8500000000000001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)