It’s been a while since I had a look at the UCI Machine Learning Repository. So, let’s have a look at a recently uploaded dataset, the QSAR aquatic toxicity which originates from work by Cassotti et al. (2014).


Contents


Dataset exploration

The dataset describes aquatic toxicity of substances for Daphnia magna using QSAR (Quantitative Structure-Activity Relationships) to describe these chemicals. I’m not going into details here, just read the paper!

Okay, let’s see what we got here:

import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error, median_absolute_error
import sklearn.metrics
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from skgarden.mondrian import MondrianForestRegressor
import xgboost

columns = ["TPSA","SAacc", "H-050", "MLOGP", "RDCHI", "GATS1p", "nN", "C-040", "LC50"]
inputData = pd.read_csv("./data/qsar_aquatic_toxicity.csv",delimiter=";",names=columns)

display(inputData.sample(10))
display(inputData.describe())
TPSA SAacc H-050 MLOGP RDCHI GATS1p nN C-040 LC50
336 77.76 153.195 3 0.559 2.137 0.767 0 0 3.669
457 47.36 56.055 0 3.373 3.471 0.992 3 0 4.943
216 0.00 0.000 0 2.604 1.475 0.505 0 0 3.460
340 54.57 41.028 0 4.789 3.580 1.505 0 0 5.512
512 32.67 45.055 0 3.355 3.097 0.889 2 1 4.821
156 69.48 142.262 0 4.096 3.884 1.169 2 1 5.742
253 20.23 42.683 1 2.461 1.975 0.737 0 0 5.691
508 45.82 50.747 0 3.110 2.154 0.478 1 0 4.251
14 92.14 107.097 1 4.275 4.174 1.174 1 1 4.112
2 9.23 11.000 0 5.799 2.930 0.486 0 0 7.019
TPSA SAacc H-050 MLOGP RDCHI GATS1p nN C-040 LC50
count 546.000000 546.000000 546.000000 546.000000 546.000000 546.000000 546.000000 546.000000 546.000000
mean 48.472930 58.869018 0.937729 2.313493 2.492299 1.046264 1.003663 0.353480 4.658421
std 46.763983 68.166554 1.618632 1.741797 0.811004 0.403677 1.397240 0.806827 1.665215
min 0.000000 0.000000 0.000000 -6.446000 1.000000 0.281000 0.000000 0.000000 0.122000
25% 15.790000 11.000000 0.000000 1.232500 1.975000 0.737000 0.000000 0.000000 3.601500
50% 40.460000 42.683000 0.000000 2.273500 2.344000 1.020500 1.000000 0.000000 4.516000
75% 70.022500 77.492750 1.000000 3.392750 2.911000 1.266500 2.000000 0.000000 5.607500
max 347.320000 571.952000 18.000000 9.148000 6.439000 2.500000 11.000000 11.000000 10.047000

Let’s summarize:

  • small dataset
  • 8 features
  • 1 target variable

Let’s have a look at some dataset properties:

Brute force approach

Let’s see throw some algorithms at it and see how it ends ;)

y = inputData["LC50"].copy(deep=True)
X = inputData.copy(deep=True)
X.drop(["LC50"], inplace=True, axis=1)
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)
datasets = {}
datasets[0] = {'X_train': X_train, 
               'X_test' : X_test,
               'y_train': y_train, 
               'y_test' : y_test}


def train_test_linear_regression(X_train,
                                 X_test,
                                 y_train,
                                 y_test,
                                 cv_count,
                                 scorer,
                                 dataset_id):
    linear_regression = LinearRegression()
    grid_parameters_linear_regression = {'fit_intercept' : [False, True]}
    start_time = time.time()
    grid_obj = GridSearchCV(linear_regression,
                            param_grid=grid_parameters_linear_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime 
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Linear Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_bRidge_regression(X_train,
                                 X_test,
                                 y_train,
                                 y_test,
                                 cv_count,
                                 scorer,
                                 dataset_id):
    bRidge_regression = BayesianRidge()
    grid_parameters_BayesianRidge_regression = {'fit_intercept' : [False, True], 
                                                'n_iter':[300,1000,5000]}
    start_time = time.time()
    grid_obj = GridSearchCV(bRidge_regression,
                            param_grid=grid_parameters_BayesianRidge_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    

    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Bayesian Ridge Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_decision_tree_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    decision_tree_regression = DecisionTreeRegressor(random_state=42)
    grid_parameters_decision_tree_regression = {'max_depth' : [None, 3,5,7,9,10,11]}
    start_time = time.time()
    grid_obj = GridSearchCV(decision_tree_regression,
                            param_grid=grid_parameters_decision_tree_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    

    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Decision Tree Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_knn_regression(X_train,
                              X_test,
                              y_train,
                              y_test,
                              cv_count,
                              scorer,
                              dataset_id):
    knn_regression = KNeighborsRegressor()
    grid_parameters_knn_regression = {'n_neighbors' : [1,2,3],
                                      'weights': ['uniform', 'distance'],
                                      'algorithm': ['ball_tree', 'kd_tree'],
                                      'leaf_size': [30,90,100,110],
                                      'p': [1,2]}
    start_time = time.time()
    grid_obj = GridSearchCV(knn_regression,
                            param_grid=grid_parameters_knn_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'KNN Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def train_test_SVR_regression(X_train,
                              X_test,
                              y_train,
                              y_test,
                              cv_count,
                              scorer,
                              dataset_id):
    SVR_regression = LinearSVR()
    grid_parameters_SVR_regression = {'C' : [1, 10, 50],
                                     'epsilon' : [0.01, 0.1],
                                     'fit_intercept' : [False, True]}
    start_time = time.time()
    grid_obj = GridSearchCV(SVR_regression,
                            param_grid=grid_parameters_SVR_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStarTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStarTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Linear SVM Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_random_forest_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    random_forest_regression = RandomForestRegressor(random_state=42)
    grid_parameters_random_forest_regression = {'n_estimators' : [3,5,10,15,18],
                                     'max_depth' : [None, 2,3,5,7,9]}
    start_time = time.time()
    grid_obj = GridSearchCV(random_forest_regression,
                            param_grid=grid_parameters_random_forest_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Random Forest Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,            
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}


def train_test_mondrian_forest_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    mondrian_forest_regression = MondrianForestRegressor(random_state=42)
    grid_parameters_mondrian_forest_regression = {'n_estimators' : [3,5,10,15,18,100],
                                     'max_depth' : [None, 2,3,5,7,9,25,50]}
    start_time = time.time()
    grid_obj = GridSearchCV(mondrian_forest_regression,
                            param_grid=grid_parameters_mondrian_forest_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'Mondrian Forest Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,            
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

def xgboost_regression(X_train,
                                        X_test,
                                        y_train,
                                        y_test,
                                        cv_count,
                                        scorer,
                                        dataset_id):
    x_gradient_boosting_regression = xgboost.XGBRegressor(random_state=42)
    grid_parameters_x_gradient_boosting_regression = {'n_estimators' : [3,5,10,15,50,60,80,100,200,300],
                                                    'max_depth' : [1,2, 3,5,7,9,10,11,15],
                                                    'learning_rate' :[ 0.0001, 0.001, 0.01, 0.1, 0.15, 0.2, 0.8, 1.0],
                                                     }
    start_time = time.time()
    grid_obj = GridSearchCV(x_gradient_boosting_regression,
                            param_grid=grid_parameters_x_gradient_boosting_regression,
                            cv=cv_count,
                            n_jobs=-1,
                            scoring=scorer,
                            verbose=2)
    grid_fit = grid_obj.fit(X_train, y_train)
    training_time = time.time() - start_time
    best_linear_regression = grid_fit.best_estimator_
    infStartTime = time.time()
    prediction = best_linear_regression.predict(X_test)
    prediction_time = time.time() - infStartTime
    r2 = r2_score(y_true=y_test, y_pred=prediction)
    mse = mean_squared_error(y_true=y_test, y_pred=prediction)
    mae = mean_absolute_error(y_true=y_test, y_pred=prediction)
    medae = median_absolute_error(y_true=y_test, y_pred=prediction)
    
    # metrics for true values
    # r2 remains unchanged, mse, mea will change and cannot be scaled
    # because there is some physical meaning behind it
    if 1==1:
        prediction_true_scale = prediction
        prediction = prediction
        y_test_true_scale = y_test
        mae_true_scale = mean_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        medae_true_scale = median_absolute_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)
        mse_true_scale = mean_squared_error(y_true=y_test_true_scale, y_pred=prediction_true_scale)

    return {'Regression type' : 'XGBoost Regression',
            'model' : grid_fit,
            'Predictions' : prediction,
            'R2' : r2,
            'MSE' : mse,
            'MAE' : mae,
            'MSE_true_scale' : mse_true_scale,
            'RMSE_true_scale' : np.sqrt(mse_true_scale),
            'MAE_true_scale' : mae_true_scale,
            'MedAE_true_scale' : medae_true_scale,
            'Training time' : training_time,
            'Prediction time' : prediction_time,
            'dataset' : dataset_id}

# make scorer
scorer = 'neg_mean_squared_error'
results = {}
counter = 0
cv_count = 5

for dataset in [0]:
    X_train, X_test, y_train, y_test = datasets[dataset]['X_train'], datasets[dataset]['X_test'], datasets[dataset]['y_train'], datasets[dataset]['y_test']
    results[counter] = train_test_linear_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Linear Regression completed")
    counter += 1
    results[counter] = train_test_bRidge_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Bayesian Ridge Regression completed")
    counter += 1
    results[counter] = train_test_decision_tree_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Decision Trees completed")
    counter += 1
    results[counter] = train_test_knn_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("KNN completed")
    counter += 1
    results[counter] = train_test_SVR_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("SVR completed")
    counter += 1
    results[counter] = train_test_random_forest_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Random Forest completed")
    counter += 1
    results[counter] = train_test_mondrian_forest_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("Mondrian Forest completed")
    counter += 1
    results[counter] = xgboost_regression(X_train,
                                                    X_test,
                                                    y_train,
                                                    y_test,
                                                    cv_count,
                                                    scorer,
                                                    dataset)
    print("XGBoost completed")
    counter += 1

results_df = pd.DataFrame.from_dict(results, orient='index')
display(results_df)
results_df.to_csv('results_df_manual.csv')
pickle.dump(results, open('results_manual.p', 'wb'))
Regression type model Predictions R2 MSE MAE MSE_true_scale RMSE_true_scale MAE_true_scale MedAE_true_scale Training time Prediction time dataset
0 Linear Regression GridSearchCV(cv=5, error_score='raise-deprecat... [3.5338844628594983, 4.458763905734787, 4.2549... 0.499440 1.502224 0.930824 1.502224 1.225652 0.930824 0.734009 1.573259 0.001276 0
1 Bayesian Ridge Regression GridSearchCV(cv=5, error_score='raise-deprecat... [3.5632073495510084, 4.361513694406263, 4.2435... 0.500691 1.498469 0.926011 1.498469 1.224120 0.926011 0.728115 0.111000 0.000795 0
2 Decision Tree Regression GridSearchCV(cv=5, error_score='raise-deprecat... [4.0135, 3.54264864864865, 4.0135, 3.747043478... 0.392786 1.822302 1.009686 1.822302 1.349927 1.009686 0.861600 0.104871 0.001365 0
3 KNN Regression GridSearchCV(cv=5, error_score='raise-deprecat... [3.510695042287087, 5.552267935344628, 3.45457... 0.478214 1.565925 0.916530 1.565925 1.251369 0.916530 0.661797 0.582334 0.002165 0
4 Linear SVM Regression GridSearchCV(cv=5, error_score='raise-deprecat... [6.219684172226497, 4.92922723439761, 7.043169... -0.436243 4.310286 1.564261 4.310286 2.076123 1.564261 1.235107 0.365867 0.000695 0
5 Random Forest Regression GridSearchCV(cv=5, error_score='raise-deprecat... [4.202572017409114, 4.127909161387632, 3.34896... 0.457783 1.627240 0.945932 1.627240 1.275633 0.945932 0.770660 0.841694 0.001862 0
6 Mondrian Forest Regression GridSearchCV(cv=5, error_score='raise-deprecat... [4.313382453918457, 5.6062146914005275, 4.0430... 0.469886 1.590917 0.922933 1.590917 1.261316 0.922933 0.652625 1.849978 0.023086 0
7 XGBoost Regression GridSearchCV(cv=5, error_score='raise-deprecat... [4.666296, 4.5902467, 3.343176, 3.8965771, 4.3... 0.483594 1.549778 0.905769 1.549778 1.244901 0.905769 0.768517 34.654719 0.001308 0

Well, that doesn’t look so good. The baseline result in Cassotti et al. (2014) is “R2 and Q2cv equal to 0.78, Q2ext equal to 0.7”. However, without a threshold using the Mahalanobis distance, the baseline kNN result is R2 =0.6, Q2cv and Q2_ext = 0.43. This result is pretty similar to our result here because the Q2 value is basically the R2 but as cross-validation score (Q2cv) and as score on the test set (Q2ext). Well, in general we could also question if R2 is the correct metric here. Furthermore, the paper lacks of many interesting details how data was split.

TPOT

So, let’s see if TPOT comes up with something better.

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error, mean_absolute_error
import sklearn.metrics
from tpot import TPOTRegressor

columns = ["TPSA","SAacc", "H-050", "MLOGP", "RDCHI", "GATS1p", "nN", "C-040", "LC50"]
input_data = pd.read_csv("./data/qsar_aquatic_toxicity.csv",delimiter=";",names=columns)
# split data into X and y
y = input_data["LC50"].copy(deep=True)
X = input_data.copy(deep=True)
X.drop(["LC50"], inplace=True, axis=1)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                     y,
                                                     test_size=0.25,
                                                     shuffle=True,
                                                     random_state=42)
tpot = TPOTRegressor(max_time_mins=60,
                     verbosity=2,
                     n_jobs=-1)
tpot.fit(X_train,y_train)
tpot.export('QSAR_aquatic_toxicity.py')
y_predictions = tpot.predict(X_test)
r2 = sklearn.metrics.r2_score(y_test, y_predictions)
mae = sklearn.metrics.mean_absolute_error(y_test, y_predictions)
mse = sklearn.metrics.mean_squared_error(y_test, y_predictions)
rmse = np.sqrt(mse)
print("R2 score:", r2)
print("MAE:", mae)
print("MSE:", mse)
print("RMSE:", rmse)
R2 score: 0.4802354355830183
MAE: 0.9276848574311194
MSE: 1.5598575121614906
RMSE: 1.2489425575908168

Well, same shit, different approach ;).

TPOT’s final pipeline is:

import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import Binarizer, MaxAbsScaler
from tpot.builtins import OneHotEncoder, StackingEstimator
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-0.9705966623633835
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            make_union(
                FunctionTransformer(copy),
                FastICA(tol=0.35000000000000003)
            ),
            MaxAbsScaler()
        ),
        make_union(
            FunctionTransformer(copy),
            Binarizer(threshold=0.9500000000000001)
        )
    ),
    OneHotEncoder(minimum_fraction=0.05, sparse=False, threshold=10),
    XGBRegressor(learning_rate=0.1, max_depth=9, min_child_weight=2, n_estimators=100, nthread=1, subsample=0.8500000000000001)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)