Today, as part of my “Exploring Less Known Datasets for Machine Learning” series, we will have a look at a dataset on critical temperatures for superconductivity. The dataset is hosted on the UCI Machine Learning Repository and originates from the Japanese SuperCon database. It was analyzed in this publication by K. Hamidieh (2018), who reports a baseline out-of-sample RMSE of about 9.5 K. Let’s see if we can beat that without much effort.
The defining property of superconductors is that they only superconduct below a critical temperature: once they warm up above it, they lose their superconductivity. Let’s see how well we can predict this temperature from a few extracted features.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the training data
filepath_input_data = "./data/train.csv"
input_data_df = pd.read_csv(filepath_input_data)

# First look: first and last rows plus summary statistics
display(input_data_df.head(3))
display(input_data_df.tail(3))
input_data_df.describe()
| | number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | ... | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | critical_temp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | 88.944468 | 57.862692 | 66.361592 | 36.116612 | 1.181795 | 1.062396 | 122.90607 | 31.794921 | 51.968828 | ... | 2.257143 | 2.213364 | 2.219783 | 1.368922 | 1.066221 | 1 | 1.085714 | 0.433013 | 0.437059 | 29.0 |
| 1 | 5 | 92.729214 | 58.518416 | 73.132787 | 36.396602 | 1.449309 | 1.057755 | 122.90607 | 36.161939 | 47.094633 | ... | 2.257143 | 1.888175 | 2.210679 | 1.557113 | 1.047221 | 2 | 1.128571 | 0.632456 | 0.468606 | 26.0 |
| 2 | 4 | 88.944468 | 57.885242 | 66.361592 | 36.122509 | 1.181795 | 0.975980 | 122.90607 | 35.741099 | 51.968828 | ... | 2.271429 | 2.213364 | 2.232679 | 1.368922 | 1.029175 | 1 | 1.114286 | 0.433013 | 0.444697 | 19.0 |
| | number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | ... | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | critical_temp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21260 | 2 | 99.663190 | 95.609104 | 99.433882 | 95.464320 | 0.690847 | 0.530198 | 13.51362 | 53.041104 | 6.756810 | ... | 4.80 | 4.472136 | 4.781762 | 0.686962 | 0.450561 | 1 | 3.20 | 0.500000 | 0.400000 | 1.98 |
| 21261 | 2 | 99.663190 | 97.095602 | 99.433882 | 96.901083 | 0.690847 | 0.640883 | 13.51362 | 31.115202 | 6.756810 | ... | 4.69 | 4.472136 | 4.665819 | 0.686962 | 0.577601 | 1 | 2.21 | 0.500000 | 0.462493 | 1.84 |
| 21262 | 3 | 87.468333 | 86.858500 | 82.555758 | 80.458722 | 1.041270 | 0.895229 | 71.75500 | 43.144000 | 29.905282 | ... | 4.50 | 4.762203 | 4.242641 | 1.054920 | 0.970116 | 3 | 1.80 | 1.414214 | 1.500000 | 12.80 |
| | number_of_elements | mean_atomic_mass | wtd_mean_atomic_mass | gmean_atomic_mass | wtd_gmean_atomic_mass | entropy_atomic_mass | wtd_entropy_atomic_mass | range_atomic_mass | wtd_range_atomic_mass | std_atomic_mass | ... | wtd_mean_Valence | gmean_Valence | wtd_gmean_Valence | entropy_Valence | wtd_entropy_Valence | range_Valence | wtd_range_Valence | std_Valence | wtd_std_Valence | critical_temp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | ... | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 | 21263.000000 |
| mean | 4.115224 | 87.557631 | 72.988310 | 71.290627 | 58.539916 | 1.165608 | 1.063884 | 115.601251 | 33.225218 | 44.391893 | ... | 3.153127 | 3.056536 | 3.055885 | 1.295682 | 1.052841 | 2.041010 | 1.483007 | 0.839342 | 0.673987 | 34.421219 |
| std | 1.439295 | 29.676497 | 33.490406 | 31.030272 | 36.651067 | 0.364930 | 0.401423 | 54.626887 | 26.967752 | 20.035430 | ... | 1.191249 | 1.046257 | 1.174815 | 0.393155 | 0.380291 | 1.242345 | 0.978176 | 0.484676 | 0.455580 | 34.254362 |
| min | 1.000000 | 6.941000 | 6.423452 | 5.320573 | 1.960849 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000210 |
| 25% | 3.000000 | 72.458076 | 52.143839 | 58.041225 | 35.248990 | 0.966676 | 0.775363 | 78.512902 | 16.824174 | 32.890369 | ... | 2.116732 | 2.279705 | 2.091251 | 1.060857 | 0.775678 | 1.000000 | 0.921454 | 0.451754 | 0.306892 | 5.365000 |
| 50% | 4.000000 | 84.922750 | 60.696571 | 66.361592 | 39.918385 | 1.199541 | 1.146783 | 122.906070 | 26.636008 | 45.123500 | ... | 2.618182 | 2.615321 | 2.434057 | 1.368922 | 1.166532 | 2.000000 | 1.063077 | 0.800000 | 0.500000 | 20.000000 |
| 75% | 5.000000 | 100.404410 | 86.103540 | 78.116681 | 73.113234 | 1.444537 | 1.359418 | 154.119320 | 38.356908 | 59.322812 | ... | 4.026201 | 3.727919 | 3.914868 | 1.589027 | 1.330801 | 3.000000 | 1.918400 | 1.200000 | 1.020436 | 63.000000 |
| max | 9.000000 | 208.980400 | 208.980400 | 208.980400 | 208.980400 | 1.983797 | 1.958203 | 207.972460 | 205.589910 | 101.019700 | ... | 7.000000 | 7.000000 | 7.000000 | 2.141963 | 1.949739 | 6.000000 | 6.992200 | 3.000000 | 3.000000 | 185.000000 |
Well, visualizations are always nicer :)
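The summary statistics above already hint that the target is strongly right-skewed (mean ≈ 34.4 K vs. median ≈ 20 K). As a minimal sketch, reusing the `input_data_df` loaded above, a histogram makes that obvious (the bin count is an arbitrary choice):

```python
# Histogram of the target variable
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(input_data_df["critical_temp"], bins=100)
ax.set_xlabel("Critical temperature [K]")
ax.set_ylabel("Number of materials")
ax.set_title("Distribution of critical temperatures")
plt.tight_layout()
plt.show()
```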
Next, we have to scale the input data and throw some machine learning algorithms at it (train-test split: 0.75/0.25, with 5-fold cross-validation).
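The exact model zoo from my run isn’t reproduced here, but a minimal sketch of such a pipeline could look like the following. The specifics (StandardScaler as the scaler, the two tree ensembles, and hyperparameters such as `n_estimators=100` and `random_state=42`) are illustrative assumptions, not the tuned setup:

```python
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Separate features and target
X = input_data_df.drop(columns=["critical_temp"]).values
y = input_data_df["critical_temp"].values

# 0.75/0.25 train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit the scaler on the training data only, to avoid information leakage
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
}

# 5-fold cross-validation on the training set, scored by RMSE
for name, model in models.items():
    t_start = time.time()
    scores = cross_val_score(
        model, X_train_scaled, y_train, cv=5,
        scoring="neg_root_mean_squared_error",
    )
    print(f"{name}: RMSE = {-scores.mean():.2f} +/- {scores.std():.2f} K "
          f"({time.time() - t_start:.1f} s)")
```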
The most interesting result is that both Random Forest and XGBoost outperform the baseline a bit. With more careful hyperparameter tuning, and perhaps different pre-processing, this could probably be improved further. The performance of the neural networks is bad. I didn’t spend any time on designing them; I just used one NN with 2 dense layers and one with 5. But there is one thing I want to point out here: regression metrics, especially R2, MAE and RMSE, cannot properly tell whether an error distribution is symmetric; a strongly one-sided (biased) error distribution can produce exactly the same scores as an unbiased one.
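To make that concrete, here is a toy example (my own construction, not part of the original analysis): a perfectly symmetric error distribution and a one-sided one of the same magnitude yield identical MAE, RMSE, and even R2, so none of these metrics can reveal the systematic bias:

```python
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_true = np.linspace(1.0, 100.0, 1000)        # toy "critical temperatures"
errors_symmetric = np.tile([-5.0, 5.0], 500)  # over- and under-predict equally often
errors_one_sided = np.full(1000, 5.0)         # always predict 5 K too high

for name, errors in [("symmetric", errors_symmetric), ("one-sided", errors_one_sided)]:
    y_pred = y_true + errors
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    r2 = r2_score(y_true, y_pred)
    print(f"{name:>9}: MAE = {mae:.2f} K, RMSE = {rmse:.2f} K, R2 = {r2:.3f}")

# Both lines report MAE = 5.00 K, RMSE = 5.00 K and the same R2:
# the metrics are blind to the shape of the error distribution.
```

So whenever these scores look fine, it is still worth plotting the residuals to check for a hidden bias.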