diff --git a/README.md b/README.md index 9f4ff2b..6cc6e15 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,69 @@ -# pyQSARplus -Library of tools for the analysis of QSAR/QSPR datasets and models. +# qsarify -# What is included? ------------------ +qsarify is a library of tools for the analysis of QSAR/QSPR datasets and models. This library is intended to be used to produce models which relate a set of calculated chemical descriptors to a given numeric endpoint. Many great tools will take the geometry or string data of a given chemical and compute **descriptors**, which are numeric measures of the properties of these, but you can generate some of these with another one of my scripts, [Free Descriptors](https://github.com/StephenSzwiec/free_descriptors). -- Data preprocessing: `data_tools` +# Dependencies + +- Python 3 +- [numpy](https://numpy.org/) +- [pandas](https://pandas.pydata.org/) +- [scikit-learn](https://scikit-learn.org) +- [matplotlib](https://matplotlib.org) + + +# Installation + +`pip install qsarify` + +# What is included right now? + +- Data preprocessing tools: `data_tools` - Dimensionality reduction via clustering: `clustering` +- Feature selection: + - Single threaded: `feature_selection_single` + - Multi-threaded: `feature_selection_multi` +- Model Export and Visualization: `model_export` +- Cross Valiidation: `cross_validation` + +# How to use + +The best way to learn how to use this library is to look at the example notebook in the `examples` folder. This notebook will walk you through the workflow of using this library to build a QSAR model. 
+ +# Future Plans + +- Massively parallel feature selection methods: + - CUDA acceleration + - MPI acceleration +- Include Shannon Entropy as a dimensionality reduction metric in clustering +- Embedded kernel methods +- More visualization tools +- More cross validation tools +- Feature selection tools for categorical data + +# Contributing + + +If you would like to contribute to this project, please feel free to fork this repository and submit a pull request. Otherwise, you may also submit an issue. I will try to respond to issues as quickly as possible. + +# License + + +This project is licensed under the GNU GPLv3 license. See the LICENSE file for more details. + +# Citation + +If you use this library in your work, please cite it as follows: + +Szwiec, Stephen. (2023). qsarify: A high performance library for QSAR model development. + +BibTex: +``` +@misc{szwiec2023qsarify, + author = {Szwiec, Stephen}, + title = {qsarify: A high performance library for QSAR model development}, + year = {2023}, + publisher = {GitHub}, + journal = {GitHub repository}, + howpublished = {\url{https://github.com/stephenszwiec/qsarify}}, + } +``` diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index 4ba45b3..0000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,1780 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "2a56c28f-2cc5-4baa-b30b-4e85453affcb", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# basic toolkit for the workflow\n", - "import pandas as pd\n", - "import data_tools as dt\n", - "import clustering as cl\n", - "import feature_selection_single as fss\n", - "import feature_selection_multi as fsm\n", - "import cross_validation as cv\n", - "import export_model as em" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "70cd653c-efdc-405b-989a-a50d40f5b9a5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# in this example, the last column is the response variable 
(log10 of LD50)\n", - "# we use pandas to manipulate the data\n", - "dfx = pd.DataFrame(pd.read_csv('28BenzeneDescriptors.csv')).iloc[:,:-1]\n", - "dfy = pd.DataFrame(pd.read_csv('28BenzeneDescriptors.csv')).iloc[:,-1]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9992350f-1431-49d3-9f57-1cef849eb4cc", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | No. | \n", - "AMW | \n", - "Sp | \n", - "Mv | \n", - "Me | \n", - "Ms | \n", - "nBM | \n", - "ARR | \n", - "RBN | \n", - "RBF | \n", - "... | \n", - "PCWTe | \n", - "LDI | \n", - "Hy | \n", - "AMR | \n", - "MLOGP | \n", - "MLOGP2 | \n", - "ALOGP | \n", - "GVWAI-80 | \n", - "Infective-80 | \n", - "BLTD48 | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", - "1 | \n", - "6.510 | \n", - "8.284 | \n", - "0.649 | \n", - "0.971 | \n", - "2.000 | \n", - "6 | \n", - "1.000 | \n", - "0 | \n", - "0.000 | \n", - "... | \n", - "10.100 | \n", - "0.062 | \n", - "-0.921 | \n", - "26.058 | \n", - "2.255 | \n", - "5.085 | \n", - "2.047 | \n", - "0 | \n", - "0 | \n", - "-3.46 | \n", - "
1 | \n", - "2 | \n", - "6.143 | \n", - "10.045 | \n", - "0.626 | \n", - "0.969 | \n", - "1.952 | \n", - "6 | \n", - "0.857 | \n", - "1 | \n", - "0.067 | \n", - "... | \n", - "11.371 | \n", - "0.057 | \n", - "-0.936 | \n", - "31.099 | \n", - "2.608 | \n", - "6.802 | \n", - "2.514 | \n", - "0 | \n", - "0 | \n", - "-3.80 | \n", - "
2 | \n", - "3 | \n", - "8.794 | \n", - "9.438 | \n", - "0.658 | \n", - "1.037 | \n", - "3.074 | \n", - "8 | \n", - "0.889 | \n", - "1 | \n", - "0.071 | \n", - "... | \n", - "2.209 | \n", - "0.144 | \n", - "-0.636 | \n", - "33.383 | \n", - "1.797 | \n", - "3.229 | \n", - "2.000 | \n", - "0 | \n", - "0 | \n", - "-3.03 | \n", - "
3 | \n", - "4 | \n", - "8.068 | \n", - "11.199 | \n", - "0.636 | \n", - "1.024 | \n", - "2.933 | \n", - "8 | \n", - "0.800 | \n", - "2 | \n", - "0.118 | \n", - "... | \n", - "2.441 | \n", - "0.130 | \n", - "-0.672 | \n", - "38.424 | \n", - "2.150 | \n", - "4.623 | \n", - "2.467 | \n", - "0 | \n", - "0 | \n", - "-3.36 | \n", - "
4 | \n", - "5 | \n", - "8.068 | \n", - "11.199 | \n", - "0.636 | \n", - "1.024 | \n", - "2.933 | \n", - "8 | \n", - "0.800 | \n", - "2 | \n", - "0.118 | \n", - "... | \n", - "2.313 | \n", - "0.123 | \n", - "-0.672 | \n", - "38.424 | \n", - "2.150 | \n", - "4.623 | \n", - "2.467 | \n", - "0 | \n", - "0 | \n", - "-3.36 | \n", - "
5 rows × 676 columns
\n", - "A high performance library for QSAR model development
+ + + +One function to perform data pre-processing
+Reduce the time it takes to develop a QSAR model using intelligent feature selection
+Built-in functions for model validation and visualization
+
+#-*- coding: utf-8 -*-
+# Author: Stephen Szwiec
+# Date: 2023-02-19
+# Description: Classification Scoring Module
+#
+#Copyright (C) 2023 Stephen Szwiec
+#
+#This file is part of qsarify.
+#
+#This program is free software: you can redistribute it and/or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation, either version 3 of the License, or
+#(at your option) any later version.
+#
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#GNU General Public License for more details.
+#
+#You should have received a copy of the GNU General Public License
+#along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Classification Scoring Module
+
+This module provides summary information about Classification
+"""
+
+import numpy as np
+from sklearn.metrics import accuracy_score
+
+
+[docs]
class ClassifierScore :
    """
    Provides summary information about Classification

    Parameters
    ----------
    y_data : pandas DataFrame , shape = (n_samples, 1)
        => observed class labels; every row is read through its first element
    pred_y : array-like , shape = (n_samples,)
        => predicted Y values as result of classification

    Sub functions
    -------
    score (self)
    tf_table(self)
    """

    def __init__ (self,y_data,pred_y) :
        """
        Initializes the classifier score object

        Stores the inputs and flattens `y_data` into `real_y`, a plain list
        holding the first element of every row.
        """
        self.y_data = y_data
        self.pred_y = pred_y
        # first element of each row, i.e. the observed label of each sample
        self.real_y = [row[0] for row in np.array(self.y_data)]

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or NaN when the denominator is 0."""
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    def score (self) :
        """
        Print the sample count, the number of wrong predictions and the accuracy

        Returns
        -------
        None
        """
        # positional view of the predictions, independent of any pandas index
        predictions = np.asarray(self.pred_y).ravel()
        total = len(self.real_y)
        wrong = sum(1 for real, guess in zip(self.real_y, predictions) if real != guess)
        print('Number of all :',total) #all data
        print('Number of worng :', wrong)
        print('AccuracyScore :',accuracy_score(self.real_y, self.pred_y))

    def tf_table(self) :
        """
        Print the confusion-matrix counts together with Precision & Recall

        Labels are expected to be 0 (negative) and 1 (positive).
        Precision or Recall is reported as nan when its denominator is zero
        (no predicted positives / no real positives) instead of raising.

        Returns
        -------
        None
        """
        predictions = np.asarray(self.pred_y).ravel()
        one = 0           # samples whose real label is 1
        zero = 0          # samples whose real label is 0
        realone = 0       # real 1 predicted as something else -> false negatives
        realzero = 0      # real 0 predicted as something else -> false positives
        for real, guess in zip(self.real_y, predictions):
            if real == 0 :
                zero += 1
            if real == 1 :
                one += 1
            if real != guess :
                if real == 0 :
                    realzero += 1
                if real == 1 :
                    realone += 1
        true_pos = one - realone
        true_neg = zero - realzero
        # Print the results
        print('Number of 1 :',one)
        print('Number of 0 :',zero)
        print('True Positive(real 1 but pred 1) :',true_pos) #TP
        print('True Negative(real 0 but pred 0) :',true_neg) #TN
        print('False Positive(real 0 but pred 1) :',realzero) #FP
        print('False Negative(real 1 but pred 0) :',realone) #FN
        print('Precision', self._safe_ratio(true_pos, true_pos + realzero)) # TP / TP+FP
        print('Recall', self._safe_ratio(true_pos, true_pos + realone)) # TP / TP+FN
+
+
+
+#-*- coding: utf-8 -*-
+# Author: Stephen Szwiec
+# Date: 2023-02-19
+# Description: Clustering Module
+#
+#Copyright (C) 2023 Stephen Szwiec
+#
+#This file is part of qsarify.
+#
+#This program is free software: you can redistribute it and/or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation, either version 3 of the License, or
+#(at your option) any later version.
+#
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#GNU General Public License for more details.
+#
+#You should have received a copy of the GNU General Public License
+#along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+
+"""
+Clustering Module
+
+This module contains functions for clustering features based on hierarchical clustering method
+and calculating the cophenetic correlation coefficient of linkages. The cophenetic correlation
+coefficient is a measure of the correlation between the distance of observations in feature space
+and the distance of observations in cluster space. The cophenetic correlation coefficient is
+calculated for each linkage method and the method with the highest cophenetic correlation
+coefficient is used to cluster the features. The cophenetic correlation coefficient is calculated
+using the scipy.cluster.hierarchy.cophenet function.
+
+"""
+
+import numpy as np
+import pandas as pd
+from pandas import DataFrame, Series
+import matplotlib.pyplot as plt
+from scipy.spatial.distance import pdist, squareform
+from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cophenet
+
+
+[docs]
def cophenetic(X_data):
    """
    Print the cophenetic correlation coefficient of three linkage methods

    The absolute Pearson correlation matrix of the features is clustered with
    average, complete and single linkage (euclidean metric) and the cophenetic
    correlation coefficient of every linkage is printed.

    Features whose correlations are entirely NaN (for example constant
    columns) are removed from both the rows and the columns of the matrix
    before clustering.

    Parameters
    ----------
    X_data : pandas DataFrame, shape = (n_samples, m_features)

    Returns
    -------
    None
    """
    # constant features make corrcoef divide by zero; the resulting NaNs are
    # filtered out below, so the floating point warnings are silenced here
    with np.errstate(divide='ignore', invalid='ignore'):
        distance = np.atleast_2d(abs(np.corrcoef(X_data, rowvar=False)))
    # A degenerate feature poisons its whole row AND its whole column, so it has
    # to be removed symmetrically. Dropping every row containing any NaN would
    # discard all features as soon as a single one is degenerate.
    keep = ~np.isnan(distance).all(axis=1)
    distance = distance[np.ix_(keep, keep)]
    # the reference distances are identical for all three linkages
    pairwise = pdist(distance)
    reports = (
        ("cophenetic correlation average linkage: ", 'average'),
        ("cophenetic correlation complete linkage: ", 'complete'),
        ("cophenetic correlation single linkage: ", 'single'),
    )
    for label, link_method in reports:
        Z = linkage(distance, method=link_method, metric='euclidean')
        coefficient, _ = cophenet(Z, pairwise)
        print(label, coefficient)
+
+
+
+[docs]
class featureCluster:
    """
    Make cluster of features based on hierarchical clustering method

    Parameters
    ----------
    X_data : pandas DataFrame, shape = (n_samples, n_features)
    method : str, label of the similarity measure, default = 'corr'
        Only used to pick the x-axis label in cluster_dist ('info' selects
        the mutual information label); the matrix itself is always the
        absolute Pearson correlation.
    link : str, kind of linkage method, default = 'average', 'complete', 'single'
    cut_d : int, depth in cluster(dendrogram), default = 3

    Sub functions
    -------------
    set_cluster(self)
    cluster_dist(self)
    """

    def __init__(self, X_data, method='corr', link='average', cut_d=3):
        """
        Initializes cluster object and computes the absolute correlation
        matrix of the features (stored in `xcorr`)

        Parameters
        ----------
        X_data : pandas DataFrame, shape = (n_samples, n_features)
        method : str, label of the similarity measure, default = 'corr'
        link : str, kind of linkage method, default = 'average', 'complete', 'single'
        cut_d : int, depth in cluster(dendrogram), default = 3
            This is a tunable parameter for clustering
        """
        self.method = method
        self.cluster_info = []
        self.assignments = np.array([])
        self.cluster_output = DataFrame()
        self.cludict = {}
        self.X_data = X_data
        self.link = link
        self.cut_d = cut_d
        self.xcorr = pd.DataFrame(abs(np.corrcoef(self.X_data, rowvar=False)), columns=X_data.columns, index=X_data.columns)

    def set_cluster(self, verbose=False, graph=False):
        """
        Make cluster of features based on hierarchical clustering method

        Every call recomputes the clustering from scratch; `cluster_info`
        is rebuilt rather than extended.

        Parameters
        ----------
        verbose : bool, print cluster information, default = False
        graph : bool, show dendrogram, default = False

        Returns
        -------
        cludict : dict, cluster information of features as a dictionary
        """
        Z = linkage(self.xcorr, method=self.link, metric='euclidean')
        self.assignments = fcluster(Z, self.cut_d, criterion='distance')
        self.cluster_output = DataFrame({'Feature':list(self.X_data.columns.values), 'cluster':self.assignments})
        nc = list(self.cluster_output.cluster.values)
        name = list(self.cluster_output.Feature.values)
        # map every feature name to its cluster number
        self.cludict = dict(zip(name, nc))
        # start from an empty list so that repeated calls do not pile up
        # duplicated cluster entries
        self.cluster_info = []
        # cluster numbers run from 1 to max(nc); entry t-1 lists the features of cluster t
        for t in range(1, max(nc)+1):
            self.cluster_info.append( [k for k, v in self.cludict.items() if v == t] )
            if verbose:
                print('\n','\x1b[1;46m'+'Cluster'+'\x1b[0m',t,self.cluster_info[t-1],)
        if graph:
            plt.figure(figsize=(25, 40))
            plt.title('Hierarchical Clustering Dendrogram')
            plt.xlabel('sample index')
            plt.ylabel('distance')
            dendrogram(Z, color_threshold=self.cut_d, above_threshold_color='k', no_labels=True, leaf_label_func=None, show_contracted=True, orientation='left')
            plt.show()
        return self.cludict

    def cluster_dist(self):
        """
        Show a histogram of the within-cluster correlation summary

        Clusters the features first when set_cluster has not been called yet.
        Clusters holding a single feature are skipped.

        Returns
        -------
        None
        """
        # have we actually clustered? If not, please do so first:
        if self.assignments.size == 0:
            self.set_cluster()
        nc = list(self.cluster_output.cluster.values)
        cluster = [[k for k, value in self.cludict.items() if value == t] for t in range(1, max(nc)+1)]
        # one summary value per cluster with more than one member
        # NOTE(review): the mean off-diagonal correlation would be
        # (sum - n) / (n**2 - n); this expression uses n/2 instead.
        # Kept unchanged - confirm the intended formula.
        dist_box = [ (np.array([self.xcorr.loc[i,i]]).sum() - len(i)/2)/(len(i)**2 - len(i)/2) for i in cluster if len(i) > 1]
        plt.hist(dist_box)
        plt.ylabel("Frequency")
        if self.method == 'info':
            plt.xlabel("Shannon mutual information of each cluster")
        else:
            plt.xlabel("Correlation coefficient of each cluster")
        plt.show()
+
+
+
+#-*- coding: utf-8 -*-
+# Author: Stephen Szwiec
+# Date: 2023-02-19
+# Description: Data Preprocessing Module
+#
+#Copyright (C) 2023 Stephen Szwiec
+#
+#This file is part of qsarify.
+#
+#This program is free software: you can redistribute it and/or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation, either version 3 of the License, or
+#(at your option) any later version.
+#
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#GNU General Public License for more details.
+#
+#You should have received a copy of the GNU General Public License
+#along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#
+
+"""
+Data Preprocessing Module
+
+This module contains functions for data preprocessing, including:
+ - removing features with 'NaN' as value
+ - removing features with constant values
+ - removing features with low variance
+ - removing features with 'NaN' as value when calculating correlation coefficients
+ - generating a sequential train-test split by sorting the data by response variable
+ - generating a random train-test split
+ - scaling data
+
+The main function of this module is `clean_data`, which performs all of the above functions.
+
+"""
+
+
+
+import numpy as np
+from numpy import ndarray
+import pandas as pd
+from pandas import DataFrame, Series
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import MinMaxScaler
+
+
+[docs]
def rm_nan(X_data):
    """
    Drop every feature that contains at least one missing value

    Parameters
    ----------
    X_data : pandas DataFrame , shape = (n_samples, n_features)

    Returns
    -------
    New DataFrame without the affected columns; X_data itself is not modified
    """
    # boolean flag per column: True when the column holds a NaN
    has_missing = X_data.isnull().any()
    doomed = X_data.columns[has_missing]
    return X_data.drop(doomed, axis=1)
+
+
+
+[docs]
def rm_constant(X_data):
    """
    Drop every feature whose standard deviation is exactly zero

    Parameters
    ----------
    X_data : pandas DataFrame , shape = (n_samples, n_features)

    Returns
    -------
    New DataFrame without the constant columns; X_data itself is not modified
    """
    # boolean flag per column: True when the column never varies
    is_flat = X_data.std() == 0
    flat_columns = X_data.columns[is_flat]
    return X_data.drop(flat_columns, axis=1)
+
+
+
+[docs]
def rm_lowVar(X_data, cutoff=0.9):
    """
    Remove features with low variance

    A feature is removed when its variance is strictly below `cutoff`;
    features with a variance of at least `cutoff` are kept.

    Parameters
    ----------
    X_data : pandas DataFrame , shape = (n_samples, n_features)
    cutoff : float, default = 0.9
        minimum variance a feature needs in order to be kept

    Returns
    -------
    New DataFrame holding only the features with variance >= cutoff
    """
    # flag the LOW variance columns; the previous comparison (>=) flagged the
    # high variance columns and therefore removed exactly the wrong features
    low_variance = X_data.var() < cutoff
    return X_data.drop(X_data.columns[low_variance], axis=1)
+
+
+
+[docs]
def rm_nanCorr(X_data):
    """
    Remove features whose correlation coefficients cannot be calculated

    A feature is removed when its correlation with every feature (itself
    included) is NaN, which is what happens for a constant column.

    Parameters
    ----------
    X_data : pandas DataFrame , shape = (n_samples, n_features)

    Returns
    -------
    New DataFrame without the affected columns; X_data itself is not modified
    """
    # the NaNs produced by degenerate features are handled below, so the
    # floating point warnings raised while computing them are silenced
    with np.errstate(divide='ignore', invalid='ignore'):
        corr = np.atleast_2d(np.corrcoef(X_data, rowvar=False))
    corr_mtx = pd.DataFrame(corr, columns=X_data.columns, index=X_data.columns)
    # One degenerate feature puts a NaN into the column of EVERY other feature,
    # so testing with any() would throw the whole data set away. Only a column
    # that is NaN from top to bottom belongs to a degenerate feature.
    degenerate = corr_mtx.isnull().all()
    return X_data.drop(X_data.columns[degenerate], axis=1)
+
+
+
+
+[docs]
def sorted_split(X_data, y_data, test_size=0.2):
    """
    Generate a sequential train-test split by sorting the data by response variable

    The samples are ordered by their response value and every n-th sample,
    with n = int(1 / test_size), becomes a test sample, so the test set spans
    the whole range of the response.

    Parameters
    ----------
    X_data : pandas DataFrame , shape = (n_samples, m_features)
    y_data : pandas Series , shape = (n_samples,), or single column
             pandas DataFrame , shape = (n_samples, 1); must share the index of X_data
    test_size : float, default = 0.2, fraction in the interval (0, 1]

    Returns
    -------
    X_train : pandas DataFrame
    X_test : pandas DataFrame
    y_train : same type as y_data
    y_test : same type as y_data

    Raises
    ------
    ValueError when test_size is outside of (0, 1]
    """
    if not 0 < test_size <= 1:
        raise ValueError("test_size must be in the interval (0, 1]")
    # every n-th row is a test row, computed from test_size as a fraction
    n = int(1 / test_size)
    # Sort the response directly instead of looking it up by name in a joined
    # frame: that lookup breaks for unnamed Series and for DataFrames.
    if isinstance(y_data, pd.DataFrame):
        response = y_data.iloc[:, 0]
    else:
        response = y_data
    ordered_idx = response.sort_values().index
    test_idx = ordered_idx[::n]
    train_idx = ordered_idx.difference(test_idx)
    # return train and test data
    return X_data.loc[train_idx], X_data.loc[test_idx], y_data.loc[train_idx], y_data.loc[test_idx]
+
+
+
+[docs]
def random_split(X_data, y_data, test_size=0.2, random_state=None):
    """
    Generate a random train-test split

    Parameters
    ----------
    X_data : pandas DataFrame , shape = (n_samples, m_features)
    y_data : pandas DataFrame or Series sharing the index of X_data
    test_size : float, default = 0.2, fraction of the samples used for testing
    random_state : int or None, optional, default = None
        seed for a reproducible split; when None the global numpy random
        state is used

    Returns
    -------
    X_train : pandas DataFrame
    X_test : pandas DataFrame
    y_train : same type as y_data
    y_test : same type as y_data
    """
    n_test = int(len(X_data) * test_size)
    # draw the index labels of the test rows without replacement
    if random_state is None:
        test_idx = np.random.choice(X_data.index, size=n_test, replace=False)
    else:
        rng = np.random.default_rng(random_state)
        test_idx = rng.choice(X_data.index.to_numpy(), size=n_test, replace=False)
    # everything that was not drawn is a train row
    train_idx = X_data.index.difference(test_idx)
    # return train and test data
    return X_data.loc[train_idx], X_data.loc[test_idx], y_data.loc[train_idx], y_data.loc[test_idx]
+
+
+
+[docs]
def scale_data(X_train, X_test):
    """
    Scale the data using the training data; apply the same transformation to the test data

    A MinMaxScaler is fitted on X_train only and then applied to both sets.
    The returned frames keep the column names and the row index of their
    inputs, so they stay aligned with the matching response data.

    Parameters
    ----------
    X_train : pandas DataFrame , shape = (n_samples, m_features)
    X_test : pandas DataFrame , shape = (p_samples, m_features)

    Returns
    -------
    X_train_scaled : pandas DataFrame , shape = (n_samples, m_features)
    X_test_scaled : pandas DataFrame , shape = (p_samples, m_features)
    """
    # fit on the training data only, so nothing about the test data leaks in
    scaler = MinMaxScaler()
    # Pass the original index along: the scaler returns a bare array and a
    # fresh RangeIndex would no longer match the index of y_train / y_test.
    X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=list(X_train.columns.values), index=X_train.index)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=list(X_test.columns.values), index=X_test.index)
    return X_train_scaled, X_test_scaled
+
+
+
+[docs]
def clean_data(X_data, y_data, split='sorted', test_size=0.2, cutoff=None, plot=False):
    """
    Run the complete cleaning pipeline in a single call
    Optionally, plot the correlation matrix of the cleaned features

    Steps: drop constant columns, drop columns with NaN values, drop columns
    with NaN correlation coefficients, optionally filter by variance, split
    into train and test data and scale both with the training data.

    Parameters
    ----------
    X_data : pandas DataFrame, shape = (n_samples, n_features)
    y_data : response data, forwarded unchanged to the split function
    split : string, optional, 'random' for a random split, anything else
            selects the sorted split
    test_size : float, optional, default = 0.2
    cutoff : float, optional, forwarded to rm_lowVar; the variance filter is
             skipped when cutoff is None (or any other falsy value)
    plot : boolean, optional, default = False

    Returns
    -------
    X_train : pandas DataFrame , shape = (n_samples, m_features)
    X_test : pandas DataFrame , shape = (p_samples, m_features)
    y_train : train part of y_data
    y_test : test part of y_data
    """
    # work on a copy; constant -> NaN -> NaN-correlation filters, in that order
    cleaned = rm_nanCorr(rm_nan(rm_constant(X_data.copy())))
    if cutoff:
        cleaned = rm_lowVar(cleaned, cutoff)
    # choose the split strategy, 'sorted' being the fallback
    if split == 'random':
        parts = random_split(cleaned, y_data, test_size)
    else:
        parts = sorted_split(cleaned, y_data, test_size)
    X_train, X_test, y_train, y_test = parts
    # scale both feature sets with the statistics of the training data
    X_train, X_test = scale_data(X_train, X_test)
    if plot:
        plt.matshow(cleaned.corr())
        plt.set_cmap('seismic')
        # show legend for the matrix
        plt.colorbar()
        plt.show()
    return X_train, X_test, y_train, y_test
+
+
+#-*- coding: utf-8 -*-
+# Author: Stephen Szwiec
+# Date: 2023-02-19
+# Description: Multi-Processing Feature Selection Module
+#
+#Copyright (C) 2023 Stephen Szwiec
+#
+#This file is part of qsarify.
+#
+#This program is free software: you can redistribute it and/or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation, either version 3 of the License, or
+#(at your option) any later version.
+#
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#GNU General Public License for more details.
+#
+#You should have received a copy of the GNU General Public License
+#along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+
+"""
+Multi-Processing Feature Selection Module
+
+This module contains the functions for performing feature selection using
+the clustering module's output as a guide for feature selection, and implements
+a genetic algorithm for feature selection using reflection.
+
+"""
+
+import datetime
+import random
+import numpy as np
+from sklearn import linear_model as lm
+from sklearn.svm import SVC
+import itertools
+import multiprocessing as mp
+
+"""
+Reflector class for the evolve function; allows for the use of a pool of workers.
+"""
+
+[docs]
class Evolution:
    """
    Reflector class for the evolve function; allows for the use of a pool of workers.

    An instance bundles the evolution function with the learning algorithm so
    that the pair can be handed to a worker pool as one callable.

    Parameters
    ----------
    evolve : callable with the signature of Evolution.evolve
    estimator : optional, learning algorithm offering fit(x, y) and score(x, y);
        a LinearRegression is created when omitted
    """

    def __init__(self, evolve, estimator=None):
        """
        Initializes the evolution class with the learning algorithm to be used
        """
        self.e_mlr = lm.LinearRegression() if estimator is None else estimator
        self.evolve = evolve

    def __call__(self, i, cluster_info, cluster, X_data, y_data):
        """
        Function call for the evolution function; forwards all arguments
        together with the stored learning algorithm
        """
        return self.evolve(i, cluster_info, cluster, X_data, y_data, self.e_mlr)

    @staticmethod
    def evolve(i, cluster_info, cluster, X_data, y_data, e_mlr):
        """
        Evolution of descriptors for learning algorithm, implemented as a function map

        One randomly chosen descriptor of the model is replaced by a random
        descriptor of a cluster that is not yet represented in the model, and
        the mutated model is fitted and scored on the given data.

        Parameters
        ----------
        i: list, [score, descriptor set] entry of the scoring bank
        cluster_info: dict, descriptor cluster information (name -> cluster number, 1-based)
        cluster: list, descriptor names of every cluster, ordered by cluster number
        X_data: DataFrame, descriptor data
        y_data: DataFrame, target data
        e_mlr: learning algorithm offering fit(x, y) and score(x, y)

        Returns
        -------
        list, [score, sorted descriptor set] of the mutated model
        """
        # Get the descriptors in the model
        descriptors = i[1]
        # Get the (0-based) groups of descriptors in model
        group_n = {cluster_info[x]-1 for x in descriptors}
        # randomly select one descriptor to remove
        sw_index = random.randrange(0, len(descriptors))
        # Groups that are not part of the model yet. Drawing from an explicit
        # candidate list replaces the retry loop, which never terminated when
        # every cluster was already represented in the model.
        candidates = [g for g in range(len(cluster)) if g not in group_n]
        if not candidates:
            # nothing unused is left: fall back to drawing from any cluster
            candidates = list(range(len(cluster)))
        sw_group = random.choice(candidates)
        # swap the indexed descriptor with one randomly chosen from the new group
        b_set = list(descriptors)
        b_set[sw_index] = random.choice(cluster[sw_group])
        b_set.sort()
        x = X_data[b_set].values
        y = y_data.values.ravel()
        score = e_mlr.fit(x, y).score(x, y)
        return [score, b_set]
+
+
+
+
+[docs]
def selection(X_data, y_data, cluster_info, model="regression", learning=500000, bank=200, component=4, interval=1000, cores=(mp.cpu_count()*2)-1):
    """
    Forward feature selection using cophenetically correlated data on multiple cores

    A bank of random models is created, scored and then evolved for
    `learning` rounds; in every round each model of the bank is mutated in a
    worker process and the bank is refilled with the best scoring models.

    Parameters
    ----------
    X_data : pandas DataFrame , shape = (n_samples, n_features)
    y_data : pandas DataFrame , shape = (n_samples,)
    cluster_info : dictionary returned by clustering.featureCluster.set_cluster()
    model : default="regression", otherwise "classification"
    learning : default=500000, number of evolution rounds
    bank : default=200, number of models kept in the bank
    component : default=4, maximum number of features of an initial model
    interval : optional, default=1000, print current scoring and selected features
                every interval
    cores: optional, default=(mp.cpu_count()*2)-1, number of processes to be used
                for multiprocessing; default is twice the number of cores minus 1, which
                is assuming you have SMT, HT, or something similar) If you have a large
                number of cores, you may want to set this to a lower number to avoid
                memory issues.

    Returns
    -------
    list, result of selected best feature set
    """
    now = datetime.datetime.now()
    print("Start time: ", now.strftime('%H:%M:%S'))

    if model == "regression":
        print('\x1b[1;42m','Regression','\x1b[0m')
        y_mlr = lm.LinearRegression()
        e_mlr = lm.LinearRegression()
    else:
        print('\x1b[1;42m','Classification','\x1b[0m')
        y_mlr = SVC(kernel='rbf', C=1, gamma=0.1, random_state=0)
        e_mlr = SVC(kernel='rbf', C=1, gamma=0.1, random_state=0)

    # a list of numbered clusters (0-based positions into `cluster`)
    nc = list(cluster_info.values())
    num_clusters = list(range(max(nc)))

    # extract information from dictionary by inversion
    inv_cluster_info = dict()
    for k, v in cluster_info.items():
        inv_cluster_info.setdefault(v, list()).append(k)

    # an ordered list of features in each cluster
    cluster = list(dict(sorted(inv_cluster_info.items())).values())

    # fill the iteration bank with random models holding 1 to `component` features
    # Models are compared as tuples so that duplicates are really detected
    # (a list never equals a tuple), and a set keeps a feature from showing up
    # twice inside the same model.
    seen_models = set()
    model_bank = []
    for _ in range(bank):
        size = random.randint(1, component)
        picked = {random.choice(cluster[random.choice(num_clusters)]) for _ in range(size)}
        candidate = tuple(sorted(picked))
        if candidate not in seen_models:
            seen_models.add(candidate)
            model_bank.append(list(candidate))

    # score each set of features, saving each score and the corresponding feature set
    y_flat = y_data.values.ravel()
    scoring_bank = []
    for names in model_bank:
        x = np.array(X_data.loc[:, names])
        scoring_bank.append([y_mlr.fit(x, y_flat).score(x, y_flat), list(names)])

    # create a reflection of the evolution function
    evolver = Evolution(Evolution.evolve)
    # evolve with the estimator chosen above, so that classification runs do
    # not silently fall back to the evolver's own default estimator
    evolver.e_mlr = e_mlr

    with mp.Pool(processes = cores) as pool:
        # perform main learning loop
        for n in range(learning):
            # initialize best score to the worst possible score
            best_score = -float("inf")
            # Evolve the bank of models and allow those surpassing the best score to replace the worst models up to the bank size
            results = pool.starmap(evolver, [(i, cluster_info, cluster, X_data, y_data) for i in scoring_bank])
            # keep only the results that raise the running maximum of this round
            rank_filter = [x for x in results if (best_score := max(best_score, x[0])) == x[0]]
            scoring_bank = sorted(itertools.chain(scoring_bank, rank_filter), reverse = True)[:bank]
            if n % interval == 0 and n != 0:
                tt = datetime.datetime.now()
                print(n, '=>', tt.strftime('%H:%M:%S'), scoring_bank[0])

    # print output and return best model found during training
    print("Best score: ", scoring_bank[0])
    # the best model is a list of feature names; look up the cluster of each
    clulog = [cluster_info[y] for y in scoring_bank[0][1]]
    print("Model's cluster info", clulog)
    fi = datetime.datetime.now()
    fiTime = fi.strftime('%H:%M:%S')
    print("Finish Time : ", fiTime)
    return scoring_bank[0][1]
+
+
+#-*- coding: utf-8 -*-
+# Author: Stephen Szwiec
+# Date: 2023-02-19
+# Description: Single-Threaded Feature Selection Module
+#
+#Copyright (C) 2023 Stephen Szwiec
+#
+#This file is part of qsarify.
+#
+#This program is free software: you can redistribute it and/or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation, either version 3 of the License, or
+#(at your option) any later version.
+#
+#This program is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#GNU General Public License for more details.
+#
+#You should have received a copy of the GNU General Public License
+#along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+Single-Threaded Feature Selection Module
+
+This module contains the single-threaded version of the feature selection algorithm,
+which is a genetic algorithm that uses a linear regression model to score each set of features,
+using the output of clustering to ensure that the features are not redundant.
+
+"""
+import datetime
+import random
+import numpy as np
+import pandas as pd
+import sklearn.linear_model as lm
+import itertools
+
+
+[docs]
def _cluster_guided_search(X_data, y_data, cluster_info, component, estimator, learning, bank, interval):
    """
    Genetic search for a feature subset, shared by mlr_selection and rf_selection.

    Parameters
    ----------
    X_data: DataFrame, descriptor data
    y_data: DataFrame, target data
    cluster_info: dict, maps each descriptor name to its cluster id
    component: int, upper bound on the number of features in a model
    estimator: object whose fit(x, y) returns an object with a score(x, y) method
    learning: int, number of generations to run
    bank: int, maximum number of models kept between generations
    interval: int, a progress line is printed every `interval` generations

    Returns
    -------
    list, feature names of the best scoring model in the bank

    Raises
    ------
    ValueError if cluster_info is empty
    """
    if not cluster_info:
        raise ValueError("cluster_info is empty")

    # invert the dictionary: cluster id -> list of member features
    inv_cluster_info = dict()
    for k, v in cluster_info.items():
        inv_cluster_info.setdefault(v, list()).append(k)

    # Clusters are addressed by their position in the sorted list of ids, so
    # the ids do not have to be contiguous or start at 1.
    cluster_ids = sorted(inv_cluster_info)
    cluster = [inv_cluster_info[cid] for cid in cluster_ids]
    group_of = {cid: pos for pos, cid in enumerate(cluster_ids)}
    groups = list(range(len(cluster)))

    y = y_data.values.ravel()

    def score_set(features):
        """Fit the estimator on the given columns and return its score on the same data."""
        x = X_data[features].values
        return estimator.fit(x, y).score(x, y)

    # Fill the initial bank with random models of 1..component features.
    # Each feature comes from a different cluster (non redundant) and each
    # distinct feature set is kept only once.
    seen = set()
    model_bank = []
    for _ in range(bank):
        size = min(random.randint(1, component), len(groups))
        candidate = sorted(random.choice(cluster[g]) for g in random.sample(groups, size))
        key = tuple(candidate)
        if key not in seen:
            seen.add(key)
            model_bank.append(candidate)

    # score each set of features, saving each score and the corresponding feature set
    scoring_bank = [[score_set(features), features] for features in model_bank]

    def evolve(entry):
        """
        Return a mutated copy of a bank entry as [score, features].

        One randomly chosen feature is replaced by a random feature taken from
        a cluster the model does not use yet.

        Parameters
        ----------
        entry: list, [score, feature list] as stored in the bank
        """
        features = entry[1]
        used = [group_of[cluster_info[f]] for f in features]
        sw_index = random.randrange(0, len(features))
        free = [g for g in groups if g not in used]
        # When every cluster is already represented there is no unused cluster
        # to draw from; redraw inside the cluster being vacated instead of
        # looping forever.
        sw_group = random.choice(free) if free else used[sw_index]
        b_set = list(features)
        b_set[sw_index] = random.choice(cluster[sw_group])
        b_set.sort()
        return [score_set(b_set), b_set]

    worst = -float("inf")
    # perform main learning loop
    for n in range(learning):
        # Mutate every model in the bank; offspring with a usable score
        # (not NaN, not -inf) compete with their parents, best `bank` survive.
        offspring = [cand for cand in map(evolve, scoring_bank) if cand[0] > worst]
        scoring_bank = sorted(itertools.chain(scoring_bank, offspring), reverse=True)[:bank]
        if n % interval == 0 and n != 0:
            tt = datetime.datetime.now()
            print(n, '=>', tt.strftime('%H:%M:%S'), scoring_bank[0])

    # print output and return best model found during training
    print("Best score: ", scoring_bank[0][0])
    clulog = [cluster_info[f] for f in scoring_bank[0][1]]
    print("Model's cluster info", clulog)
    fi = datetime.datetime.now()
    fiTime = fi.strftime('%H:%M:%S')
    print("Finish Time : ", fiTime)
    return scoring_bank[0][1]


def mlr_selection(X_data, y_data, cluster_info, component, model="regression", learning=50000, bank=200, interval=1000):
    """
    Performs feature selection using a linear regression model and a genetic algorithm on a single thread.
    This is the vanilla version of the algorithm, which is not parallelized.

    Parameters
    ----------
    X_data: DataFrame, descriptor data
    y_data: DataFrame, target data
    cluster_info: dict, descriptor cluster information
    component: int, maximum number of features to select
    model: str, learning algorithm to use, default = "regression";
        any other value selects an RBF support vector classifier
    learning: int, number of iterations to perform, default = 50000
    bank: int, number of models to keep in the bank, default = 200
    interval: int, number of iterations to perform before printing the current time, default = 1000

    Returns
    -------
    best_model: list, feature names of the best model found
    """

    now = datetime.datetime.now()
    print("Start time: ", now.strftime('%H:%M:%S'))

    if model == "regression":
        print('\x1b[1;42m','Regression','\x1b[0m')
        estimator = lm.LinearRegression()
    else:
        print('\x1b[1;42m','Classification','\x1b[0m')
        # imported here because the module header does not import SVC
        from sklearn.svm import SVC
        estimator = SVC(kernel='rbf', C=1, gamma=0.1, random_state=0)

    return _cluster_guided_search(X_data, y_data, cluster_info, component, estimator, learning, bank, interval)


def rf_selection(X_data, y_data, cluster_info, component, model="regression", learning=50000, bank=200, interval=1000):
    """
    Performs feature selection using a random forest model and a genetic algorithm on a single thread.
    This is the vanilla version of the algorithm, which is not parallelized.

    Parameters
    ----------
    X_data: DataFrame, descriptor data
    y_data: DataFrame, target data
    cluster_info: dict, descriptor cluster information
    component: int, maximum number of features to select
    model: str, learning algorithm to use, default = "regression";
        any other value selects a random forest classifier
    learning: int, number of iterations to perform, default = 50000
    bank: int, number of models to keep in the bank, default = 200
    interval: int, number of iterations to perform before printing the current time, default = 1000

    Returns
    -------
    best_model: list, feature names of the best model found
    """

    now = datetime.datetime.now()
    print("Start time: ", now.strftime('%H:%M:%S'))

    # imported here because the module header does not import the forest estimators
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    if model == "regression":
        print('\x1b[1;42m','Regression','\x1b[0m')
        estimator = RandomForestRegressor(n_estimators=100, max_depth=2, random_state=0)
    else:
        print('\x1b[1;42m','Classification','\x1b[0m')
        estimator = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)

    return _cluster_guided_search(X_data, y_data, cluster_info, component, estimator, learning, bank, interval)
+
+
+# -*- coding: utf-8 -*-
+# Author: Stephen Szwiec
+# Date: 2023-02-19
+# Description: QSAR Scoring Module
+"""
+Copyright (C) 2023 Stephen Szwiec
+
+This file is part of qsarify.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+"""
+import numpy as np
+"""
+Commonly used scoring functions for QSAR models
+"""
+
+
+[docs]
def rmse_score(y_true, y_pred):
    """
    Calculates the RMSE score

    Both inputs are converted to flat float arrays first, so lists, column
    vectors of shape (n_samples, 1) and pandas objects are compared element by
    element instead of being broadcast or index-aligned against each other.

    Parameters
    ----------
    y_true : array-like, n_samples observed values
    y_pred : array-like, n_samples predicted values

    Returns
    -------
    float
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.sqrt(np.mean(np.square(y_true - y_pred)))
+
+
+
+[docs]
def q2_score(y_true, y_pred):
    """
    Calculates the Q2 score

    Q2 = 1 - PRESS / TSS, where PRESS is the sum of squared prediction errors
    and TSS is the sum of squared deviations of y_true from its own mean.
    Inputs are flattened to 1-D float arrays so that a column vector and a
    flat vector are not broadcast against each other.

    Parameters
    ----------
    y_true : array-like, n_samples observed values
    y_pred : array-like, n_samples predicted values

    Returns
    -------
    float
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    press = np.sum(np.square(y_true - y_pred))
    tss = np.sum(np.square(y_true - np.mean(y_true)))
    return 1 - press/tss
+
+
+
+[docs]
def q2f_score(y_true, y_pred, y_mean):
    """
    Calculates the Q2_f1 or Q2_f2 score
    depending on whether the mean is calculated from the training set or the external set

    The score is 1 - PRESS / TSS with TSS taken around the supplied y_mean.
    Inputs are flattened to 1-D float arrays so that a column vector and a
    flat vector are not broadcast against each other.

    Parameters
    ----------
    y_true : array-like, n_samples observed values
    y_pred : array-like, n_samples predicted values
    y_mean : float, mean of the training (for q2f1) or test (for q2f2) set

    Returns
    -------
    float
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    press = np.sum(np.square(y_true - y_pred))
    tss = np.sum(np.square(y_true - y_mean))
    return 1 - press/tss
+
+
+
+[docs]
def q2f3_score(y_true, y_pred, n_train, n_external, y_train=None):
    """
    Calculates the Q2_f3 score

    Q2_f3 = 1 - (PRESS / n_external) / (TSS / n_train), i.e. the ratio of the
    mean squared prediction error to the mean squared deviation of the
    reference responses.

    Parameters
    ----------
    y_true : array-like, n_samples observed values of the external set
    y_pred : array-like, n_samples predicted values of the external set
    n_train : int
        number of training samples
    n_external : int
        number of external samples
    y_train : array-like, optional
        observed training responses. When given, TSS is the sum of squared
        deviations of y_train from its own mean. When omitted, TSS is
        computed from y_true, as this function has always done.

    Returns
    -------
    float
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    reference = y_true if y_train is None else np.asarray(y_train, dtype=float).ravel()
    press = np.sum(np.square(y_true - y_pred))
    tss = np.sum(np.square(reference - np.mean(reference)))
    # both terms are per-sample averages: TSS is divided by n_train, not multiplied
    return 1 - (press / n_external) / (tss / n_train)
+
+
+
+[docs]
def ccc_score(y_true, y_pred):
    """
    Calculates the CCC score

    CCC = 2 * cov(y_true, y_pred) / (var(y_true) + var(y_pred) + (mean_true - mean_pred)**2)

    The covariance and both variances use the same 1/n normalisation, so two
    identical, non-constant vectors score exactly 1.
    Inputs are flattened to 1-D float arrays.

    Parameters
    ----------
    y_true : array-like, n_samples observed values
    y_pred : array-like, n_samples predicted values

    Returns
    -------
    float
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    mean_true = y_true.mean()
    mean_pred = y_pred.mean()
    var_true = y_true.var()
    var_pred = y_pred.var()
    # 1/n covariance, matching the 1/n normalisation of ndarray.var() above
    covar_true_pred = np.mean((y_true - mean_true) * (y_pred - mean_pred))
    return 2 * covar_true_pred / (var_true + var_pred + (mean_true - mean_pred)**2)
+
+
' + + '' + + _("Hide Search Matches") + + "
" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query. + */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/docs/html/man/html/genindex.html b/docs/html/man/html/genindex.html new file mode 100644 index 0000000..8618097 --- /dev/null +++ b/docs/html/man/html/genindex.html @@ -0,0 +1,296 @@ + + + + + ++ | + |
+ | + |
+ |
|
+
|
+ + |
+ | + |
+ | + |
+ |
Evolution
+selection()
Classification Scoring Module
+This module provides summary information about Classification
+Bases: object
Provides summary information about Classification
+y_data (pandas DataFrame , shape = (n_samples,)) –
pred_y (pandas DataFrame , shape = (n_samples,)) –
classification (=> predicted Y values as result of) –
functions (Sub) –
------- –
(self) (score) –
tf_table(self) –
Clustering Module
+This module contains functions for clustering features based on hierarchical clustering method +and calculating the cophenetic correlation coefficient of linkages. The cophenetic correlation +coefficient is a measure of the correlation between the distance of observations in feature space +and the distance of observations in cluster space. The cophenetic correlation coefficient is +calculated for each linkage method and the method with the highest cophenetic correlation +coefficient is used to cluster the features. The cophenetic correlation coefficient is calculated +using the scipy.cluster.hierarchy.cophenet function.
+Calculate the cophenetic correlation coefficient of linkages
+X_data (pandas DataFrame, shape = (n_samples, m_features)) –
method (str, method for linkage generation, default = 'corr' (Pearson correlation)) –
None
+Bases: object
Make cluster of features based on hierarchical clustering method
+X_data (pandas DataFrame, shape = (n_samples, n_features)) –
link (str, kind of linkage method, default = 'average', 'complete', 'single') –
cut_d (int, depth in cluster(dendrogram), default = 3) –
functions (Sub) –
------------- –
set_cluster(self) –
cluster_dist(self) –
Make cluster of features based on hierarchical clustering method
+verbose (bool, print cluster information, default = False) –
graph (bool, show dendrogram, default = False) –
cludict
+dict, cluster information of features as a dictionary
+Data Preprocessing Module
+removing features with ‘NaN’ as value
removing features with constant values
removing features with low variance
removing features with ‘NaN’ as value when calculating correlation coefficients
generating a sequential train-test split by sorting the data by response variable
generating a random train-test split
scaling data
The main function of this module is clean_data, which performs all of the above functions.
+Perform the entire data cleaning process as one function +Optionally, plot the correlation matrix
+X_data (pandas DataFrame, shape = (n_samples, n_features)) –
split (string, optional, 'sorted' or 'random') –
test_size (float, optional, default = 0.2) –
cutoff (float, optional, auto-correlation coefficient below which we keep) –
plot (boolean, optional, default = False) –
X_train (pandas DataFrame , shape = (n_samples, m_features))
X_test (pandas DataFrame , shape = (p_samples, m_features))
y_train (pandas DataFrame , shape = (n_samples, 1))
y_test (pandas DataFrame , shape = (p_samples, 1))
Generate a random train-test split
+X_data (pandas DataFrame , shape = (n_samples, m_features)) –
y_data (pandas DataFrame , shape = (n_samples, 1)) –
test_size (float, default = 0.2) –
Returns –
dataframe (-------give count of NaN in pandas) –
X_train (pandas DataFrame , shape = (n_samples, m_features)) –
X_test (pandas DataFrame , shape = (n_samples, m_features)) –
y_train (pandas DataFrame , shape = (n_samples, 1)) –
y_test (pandas DataFrame , shape = (n_samples, 1)) –
Remove features with constant values
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
+Modified DataFrame
+Remove features with low variance
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
cutoff (float, default = 0.1) –
Modified DataFrame
+Remove features with ‘NaN’ as value
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
+Modified DataFrame
+Remove features with ‘NaN’ as value when calculating correlation coefficients
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
+Modified DataFrame
+Scale the data using the training data; apply the same transformation to the test data
+X_train (pandas DataFrame , shape = (n_samples, m_features)) –
X_test (pandas DataFrame , shape = (p_samples, m_features)) –
X_train_scaled (pandas DataFrame , shape = (n_samples, m_features))
X_test_scaled (pandas DataFrame , shape = (p_samples, m_features))
Generate a sequential train-test split by sorting the data by response variable
+X_data (pandas DataFrame , shape = (n_samples, m_features)) –
y_data (pandas DataFrame , shape = (n_samples, 1)) –
test_size (float, default = 0.2) –
X_train (pandas DataFrame , shape = (n_samples, m_features))
X_test (pandas DataFrame, shape = (n_samples, m_features))
y_train (pandas DataFrame , shape = (n_samples, 1))
y_test (pandas DataFrame , shape = (n_samples, 1))
Multi-Processing Feature Selection Module
+This module contains the functions for performing feature selection using +the clustering module’s output as a guide for feature selection, and implements +a genetic algorithm for feature selection using reflection.
+Bases: object
Initializes the evolution class with the learning algorithm to be used
+ + +Forward feature selection using cophenetically correlated data on multiple cores
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
y_data (pandas DataFrame , shape = (n_samples,)) –
cluster_info (dictionary returned by clustering.featureCluster.set_cluster()) –
model (default="regression", otherwise "classification") –
learning (default=500000, number of overall models to be trained) –
bank (default=200, number of models to be trained in each iteration) –
component (default=4, number of features to be selected) –
interval (optional, default=1000, print current scoring and selected features) – every interval
cores (optional, default=(mp.cpu_count()*2)-1, number of processes to be used) – for multiprocessing; default is twice the number of cores minus 1, which +is assuming you have SMT, HT, or something similar) If you have a large +number of cores, you may want to set this to a lower number to avoid +memory issues.
list, result of selected best feature set
+Single-Threaded Feature Selection Module
+This module contains the single-threaded version of the feature selection algorithm, +which is a genetic algorithm that uses a linear regression model to score each set of features, +using the output of clustering to ensure that the features are not redundant.
+Performs feature selection using a linear regression model and a genetic algorithm on a single thread. +This is the vanilla version of the algorithm, which is not parallelized.
+X_data (DataFrame, descriptor data) –
y_data (DataFrame, target data) –
cluster_info (dict, descriptor cluster information) –
component (int, number of features to select) –
model (str, learning algorithm to use, default = "regression") –
learning (int, number of iterations to perform, default = 50000) –
bank (int, number of models to keep in the bank, default = 200) –
interval (int, number of iterations to perform before printing the current time, default = 1000) –
best_model (list, best model found)
best_score (float, best score found)
Performs feature selection using a random forest model and a genetic algorithm on a single thread. +This is the vanilla version of the algorithm, which is not parallelized.
+X_data (DataFrame, descriptor data) –
y_data (DataFrame, target data) –
cluster_info (dict, descriptor cluster information) –
component (int, number of features to select) –
model (str, learning algorithm to use, default = "regression") –
learning (int, number of iterations to perform, default = 50000) –
bank (int, number of models to keep in the bank, default = 200) –
interval (int, number of iterations to perform before printing the current time, default = 1000) –
best_model (list, best model found)
best_score (float, best score found)
Copyright (C) 2023 Stephen Szwiec
+This file is part of qsarify.
+This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version.
+This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details.
+You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>.
+Calculates the CCC score
+y_true (numpy array, shape (n_samples,)) –
y_pred (numpy array, shape (n_samples,)) –
float
+Calculates the Q2 score
+y_true (numpy array , shape (n_samples,)) –
y_pred (numpy array, shape (n_samples,)) –
float
+Calculates the Q2_f3 score
+y_true (numpy array, shape (n_samples,)) –
y_pred (numpy array, shape (n_samples,)) –
n_external (int) – number of external samples
n_train (int) – number of training samples
float
+Calculates the Q2_f1 or Q2_f2 score +depending on whether the mean is calculated from the training set or the external set
+y_true (numpy array, shape (n_samples,)) –
y_pred (numpy array, shape (n_samples,)) –
y_mean (float, mean of the training (for q2f1) or test (for q2f2) set) –
float
++ q | ||
+ |
+ qsarify | + |
+ |
+ qsarify.classification | + |
+ |
+ qsarify.clustering | + |
+ |
+ qsarify.data_tools | + |
+ |
+ qsarify.feature_selection_multi | + |
+ |
+ qsarify.feature_selection_single | + |
+ |
+ qsarify.qsar_scoring | + |
Classification Scoring Module
+This module provides summary information about Classification
+Bases: object
Provides summary information about Classification
+y_data (pandas DataFrame , shape = (n_samples,)) –
pred_y (pandas DataFrame , shape = (n_samples,)) –
classification (=> predicted Y values as result of) –
functions (Sub) –
------- –
(self) (score) –
tf_table(self) –
Clustering Module
+This module contains functions for clustering features based on hierarchical clustering method +and calculating the cophenetic correlation coefficient of linkages. The cophenetic correlation +coefficient is a measure of the correlation between the distance of observations in feature space +and the distance of observations in cluster space. The cophenetic correlation coefficient is +calculated for each linkage method and the method with the highest cophenetic correlation +coefficient is used to cluster the features. The cophenetic correlation coefficient is calculated +using the scipy.cluster.hierarchy.cophenet function.
+Calculate the cophenetic correlation coefficient of linkages
+X_data (pandas DataFrame, shape = (n_samples, m_features)) –
method (str, method for linkage generation, default = 'corr' (Pearson correlation)) –
None
+Bases: object
Make cluster of features based on hierarchical clustering method
+X_data (pandas DataFrame, shape = (n_samples, n_features)) –
link (str, kind of linkage method, default = 'average', 'complete', 'single') –
cut_d (int, depth in cluster(dendrogram), default = 3) –
functions (Sub) –
------------- –
set_cluster(self) –
cluster_dist(self) –
Make cluster of features based on hierarchical clustering method
+verbose (bool, print cluster information, default = False) –
graph (bool, show dendrogram, default = False) –
cludict
+dict, cluster information of features as a dictionary
+Data Preprocessing Module
+removing features with ‘NaN’ as value
removing features with constant values
removing features with low variance
removing features with ‘NaN’ as value when calculating correlation coefficients
generating a sequential train-test split by sorting the data by response variable
generating a random train-test split
scaling data
The main function of this module is clean_data, which performs all of the above functions.
+Perform the entire data cleaning process as one function +Optionally, plot the correlation matrix
+X_data (pandas DataFrame, shape = (n_samples, n_features)) –
split (string, optional, 'sorted' or 'random') –
test_size (float, optional, default = 0.2) –
cutoff (float, optional, auto-correlation coefficient below which we keep) –
plot (boolean, optional, default = False) –
X_train (pandas DataFrame , shape = (n_samples, m_features))
X_test (pandas DataFrame , shape = (p_samples, m_features))
y_train (pandas DataFrame , shape = (n_samples, 1))
y_test (pandas DataFrame , shape = (p_samples, 1))
Generate a random train-test split
+X_data (pandas DataFrame , shape = (n_samples, m_features)) –
y_data (pandas DataFrame , shape = (n_samples, 1)) –
test_size (float, default = 0.2) –
Returns –
dataframe (-------give count of NaN in pandas) –
X_train (pandas DataFrame , shape = (n_samples, m_features)) –
X_test (pandas DataFrame , shape = (n_samples, m_features)) –
y_train (pandas DataFrame , shape = (n_samples, 1)) –
y_test (pandas DataFrame , shape = (n_samples, 1)) –
Remove features with constant values
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
+Modified DataFrame
+Remove features with low variance
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
cutoff (float, default = 0.1) –
Modified DataFrame
+Remove features with ‘NaN’ as value
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
+Modified DataFrame
+Remove features with ‘NaN’ as value when calculating correlation coefficients
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
+Modified DataFrame
+Scale the data using the training data; apply the same transformation to the test data
+X_train (pandas DataFrame , shape = (n_samples, m_features)) –
X_test (pandas DataFrame , shape = (p_samples, m_features)) –
X_train_scaled (pandas DataFrame , shape = (n_samples, m_features))
X_test_scaled (pandas DataFrame , shape = (p_samples, m_features))
Generate a sequential train-test split by sorting the data by response variable
+X_data (pandas DataFrame , shape = (n_samples, m_features)) –
y_data (pandas DataFrame , shape = (n_samples, 1)) –
test_size (float, default = 0.2) –
X_train (pandas DataFrame , shape = (n_samples, m_features))
X_test (pandas DataFrame, shape = (n_samples, m_features))
y_train (pandas DataFrame , shape = (n_samples, 1))
y_test (pandas DataFrame , shape = (n_samples, 1))
Multi-Processing Feature Selection Module
+This module contains the functions for performing feature selection using +the clustering module’s output as a guide for feature selection, and implements +a genetic algorithm for feature selection using reflection.
+Bases: object
Initializes the evolution class with the learning algorithm to be used
+ + +Forward feature selection using cophenetically correlated data on multiple cores
+X_data (pandas DataFrame , shape = (n_samples, n_features)) –
y_data (pandas DataFrame , shape = (n_samples,)) –
cluster_info (dictionary returned by clustering.featureCluster.set_cluster()) –
model (default="regression", otherwise "classification") –
learning (default=500000, number of overall models to be trained) –
bank (default=200, number of models to be trained in each iteration) –
component (default=4, number of features to be selected) –
interval (optional, default=1000, print current scoring and selected features) – every interval
cores (optional, default=(mp.cpu_count()*2)-1, number of processes to be used) – for multiprocessing; default is twice the number of cores minus 1, which +is assuming you have SMT, HT, or something similar) If you have a large +number of cores, you may want to set this to a lower number to avoid +memory issues.
list, result of selected best feature set
+Single-Threaded Feature Selection Module
+This module contains the single-threaded version of the feature selection algorithm, +which is a genetic algorithm that uses a linear regression model to score each set of features, +using the output of clustering to ensure that the features are not redundant.
+Performs feature selection using a linear regression model and a genetic algorithm on a single thread. +This is the vanilla version of the algorithm, which is not parallelized.
+X_data (DataFrame, descriptor data) –
y_data (DataFrame, target data) –
cluster_info (dict, descriptor cluster information) –
component (int, number of features to select) –
model (str, learning algorithm to use, default = "regression") –
learning (int, number of iterations to perform, default = 50000) –
bank (int, number of models to keep in the bank, default = 200) –
interval (int, number of iterations to perform before printing the current time, default = 1000) –
best_model (list, best model found)
best_score (float, best score found)
Performs feature selection using a random forest model and a genetic algorithm on a single thread. +This is the vanilla version of the algorithm, which is not parallelized.
+X_data (DataFrame, descriptor data) –
y_data (DataFrame, target data) –
cluster_info (dict, descriptor cluster information) –
component (int, number of features to select) –
model (str, learning algorithm to use, default = "regression") –
learning (int, number of iterations to perform, default = 50000) –
bank (int, number of models to keep in the bank, default = 200) –
interval (int, number of iterations to perform before printing the current time, default = 1000) –
best_model (list, best model found)
best_score (float, best score found)
Copyright (C) 2023 Stephen Szwiec
+This file is part of qsarify.
+This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version.
+This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details.
+You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>.
+Calculates the CCC score
+y_true (numpy array, shape (n_samples,)) –
y_pred (numpy array, shape (n_samples,)) –
float
+Calculates the Q2 score
+y_true (numpy array , shape (n_samples,)) –
y_pred (numpy array, shape (n_samples,)) –
float
+Calculates the Q2_f3 score
+y_true (numpy array, shape (n_samples,)) –
y_pred (numpy array, shape (n_samples,)) –
n_external (int) – number of external samples
n_train (int) – number of training samples
float
+Calculates the Q2_f1 or Q2_f2 score +depending on whether the mean is calculated from the training set or the external set
+y_true (numpy array, shape (n_samples,)) –
y_pred (numpy array, shape (n_samples,)) –
y_mean (float, mean of the training (for q2f1) or test (for q2f2) set) –
float
+\n", + " | No. | \n", + "AMW | \n", + "Sp | \n", + "Mv | \n", + "Me | \n", + "Ms | \n", + "nBM | \n", + "ARR | \n", + "RBN | \n", + "RBF | \n", + "... | \n", + "PCWTe | \n", + "LDI | \n", + "Hy | \n", + "AMR | \n", + "MLOGP | \n", + "MLOGP2 | \n", + "ALOGP | \n", + "GVWAI-80 | \n", + "Infective-80 | \n", + "BLTD48 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "6.510 | \n", + "8.284 | \n", + "0.649 | \n", + "0.971 | \n", + "2.000 | \n", + "6 | \n", + "1.000 | \n", + "0 | \n", + "0.000 | \n", + "... | \n", + "10.100 | \n", + "0.062 | \n", + "-0.921 | \n", + "26.058 | \n", + "2.255 | \n", + "5.085 | \n", + "2.047 | \n", + "0 | \n", + "0 | \n", + "-3.46 | \n", + "
1 | \n", + "2 | \n", + "6.143 | \n", + "10.045 | \n", + "0.626 | \n", + "0.969 | \n", + "1.952 | \n", + "6 | \n", + "0.857 | \n", + "1 | \n", + "0.067 | \n", + "... | \n", + "11.371 | \n", + "0.057 | \n", + "-0.936 | \n", + "31.099 | \n", + "2.608 | \n", + "6.802 | \n", + "2.514 | \n", + "0 | \n", + "0 | \n", + "-3.80 | \n", + "
2 | \n", + "3 | \n", + "8.794 | \n", + "9.438 | \n", + "0.658 | \n", + "1.037 | \n", + "3.074 | \n", + "8 | \n", + "0.889 | \n", + "1 | \n", + "0.071 | \n", + "... | \n", + "2.209 | \n", + "0.144 | \n", + "-0.636 | \n", + "33.383 | \n", + "1.797 | \n", + "3.229 | \n", + "2.000 | \n", + "0 | \n", + "0 | \n", + "-3.03 | \n", + "
3 | \n", + "4 | \n", + "8.068 | \n", + "11.199 | \n", + "0.636 | \n", + "1.024 | \n", + "2.933 | \n", + "8 | \n", + "0.800 | \n", + "2 | \n", + "0.118 | \n", + "... | \n", + "2.441 | \n", + "0.130 | \n", + "-0.672 | \n", + "38.424 | \n", + "2.150 | \n", + "4.623 | \n", + "2.467 | \n", + "0 | \n", + "0 | \n", + "-3.36 | \n", + "
4 | \n", + "5 | \n", + "8.068 | \n", + "11.199 | \n", + "0.636 | \n", + "1.024 | \n", + "2.933 | \n", + "8 | \n", + "0.800 | \n", + "2 | \n", + "0.118 | \n", + "... | \n", + "2.313 | \n", + "0.123 | \n", + "-0.672 | \n", + "38.424 | \n", + "2.150 | \n", + "4.623 | \n", + "2.467 | \n", + "0 | \n", + "0 | \n", + "-3.36 | \n", + "
5 rows × 676 columns
\n", + "" + ], + "text/plain": [ + " No. AMW Sp Mv Me Ms nBM ARR RBN RBF ... \\\n", + "0 1 6.510 8.284 0.649 0.971 2.000 6 1.000 0 0.000 ... \n", + "1 2 6.143 10.045 0.626 0.969 1.952 6 0.857 1 0.067 ... \n", + "2 3 8.794 9.438 0.658 1.037 3.074 8 0.889 1 0.071 ... \n", + "3 4 8.068 11.199 0.636 1.024 2.933 8 0.800 2 0.118 ... \n", + "4 5 8.068 11.199 0.636 1.024 2.933 8 0.800 2 0.118 ... \n", + "\n", + " PCWTe LDI Hy AMR MLOGP MLOGP2 ALOGP GVWAI-80 Infective-80 \\\n", + "0 10.100 0.062 -0.921 26.058 2.255 5.085 2.047 0 0 \n", + "1 11.371 0.057 -0.936 31.099 2.608 6.802 2.514 0 0 \n", + "2 2.209 0.144 -0.636 33.383 1.797 3.229 2.000 0 0 \n", + "3 2.441 0.130 -0.672 38.424 2.150 4.623 2.467 0 0 \n", + "4 2.313 0.123 -0.672 38.424 2.150 4.623 2.467 0 0 \n", + "\n", + " BLTD48 \n", + "0 -3.46 \n", + "1 -3.80 \n", + "2 -3.03 \n", + "3 -3.36 \n", + "4 -3.36 \n", + "\n", + "[5 rows x 676 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfx.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "daf17e1a-182a-43c3-9b92-68f76eec6b74", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(28, 676)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfx.shape " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bd7cbef0-6a23-4430-a582-d7c6f9970097", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(22, 549) \n", + " (6, 549)\n" + ] + } + ], + "source": [ + "# this is the basic workflow for the data_tools module\n", + "dfx = dt.rm_nan(dfx)\n", + "dfx = dt.rm_constant(dfx)\n", + "dfx = dt.rm_lowVar(dfx)\n", + "dfx = dt.rm_nanCorr(dfx)\n", + "xtrain, xtest, ytrain, ytest = dt.sorted_split(dfx,dfy,0.2)\n", + "xtrain, xtest = dt.scale_data(xtrain, xtest)\n", + "print( xtrain.shape, '\\n', xtest.shape)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 6, + "id": "09a250cf-a5c9-465c-ae07-d96f210c413f", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "\n", + " | No. | \n", + "AMW | \n", + "Sp | \n", + "Mv | \n", + "Me | \n", + "Ms | \n", + "nBM | \n", + "ARR | \n", + "RBN | \n", + "RBF | \n", + "... | \n", + "PCWTe | \n", + "LDI | \n", + "Hy | \n", + "AMR | \n", + "MLOGP | \n", + "MLOGP2 | \n", + "ALOGP | \n", + "GVWAI-80 | \n", + "Infective-80 | \n", + "BLTD48 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "6.510 | \n", + "8.284 | \n", + "0.649 | \n", + "0.971 | \n", + "2.000 | \n", + "6 | \n", + "1.000 | \n", + "0 | \n", + "0.000 | \n", + "... | \n", + "10.100 | \n", + "0.062 | \n", + "-0.921 | \n", + "26.058 | \n", + "2.255 | \n", + "5.085 | \n", + "2.047 | \n", + "0 | \n", + "0 | \n", + "-3.46 | \n", + "
1 | \n", + "2 | \n", + "6.143 | \n", + "10.045 | \n", + "0.626 | \n", + "0.969 | \n", + "1.952 | \n", + "6 | \n", + "0.857 | \n", + "1 | \n", + "0.067 | \n", + "... | \n", + "11.371 | \n", + "0.057 | \n", + "-0.936 | \n", + "31.099 | \n", + "2.608 | \n", + "6.802 | \n", + "2.514 | \n", + "0 | \n", + "0 | \n", + "-3.80 | \n", + "
2 | \n", + "3 | \n", + "8.794 | \n", + "9.438 | \n", + "0.658 | \n", + "1.037 | \n", + "3.074 | \n", + "8 | \n", + "0.889 | \n", + "1 | \n", + "0.071 | \n", + "... | \n", + "2.209 | \n", + "0.144 | \n", + "-0.636 | \n", + "33.383 | \n", + "1.797 | \n", + "3.229 | \n", + "2.000 | \n", + "0 | \n", + "0 | \n", + "-3.03 | \n", + "
3 | \n", + "4 | \n", + "8.068 | \n", + "11.199 | \n", + "0.636 | \n", + "1.024 | \n", + "2.933 | \n", + "8 | \n", + "0.800 | \n", + "2 | \n", + "0.118 | \n", + "... | \n", + "2.441 | \n", + "0.130 | \n", + "-0.672 | \n", + "38.424 | \n", + "2.150 | \n", + "4.623 | \n", + "2.467 | \n", + "0 | \n", + "0 | \n", + "-3.36 | \n", + "
4 | \n", + "5 | \n", + "8.068 | \n", + "11.199 | \n", + "0.636 | \n", + "1.024 | \n", + "2.933 | \n", + "8 | \n", + "0.800 | \n", + "2 | \n", + "0.118 | \n", + "... | \n", + "2.313 | \n", + "0.123 | \n", + "-0.672 | \n", + "38.424 | \n", + "2.150 | \n", + "4.623 | \n", + "2.467 | \n", + "0 | \n", + "0 | \n", + "-3.36 | \n", + "
5 rows × 676 columns
\n", + "