From 82e5c3b3f47301a30d54fb1685166e3fdeee3041 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 19 Sep 2024 15:46:20 +0200 Subject: [PATCH 01/62] first line --- fsspark/fs/core.py | 4 +- fsspark/fs/fdataframe.py | 433 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 434 insertions(+), 3 deletions(-) create mode 100644 fsspark/fs/fdataframe.py diff --git a/fsspark/fs/core.py b/fsspark/fs/core.py index 8c59fce..1f05008 100644 --- a/fsspark/fs/core.py +++ b/fsspark/fs/core.py @@ -6,8 +6,6 @@ Set, Tuple) -import pyspark.pandas -import pyspark.sql from pyspark.ml.feature import (VectorAssembler, StringIndexer, Binarizer, @@ -22,7 +20,7 @@ rand) logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") -logger = logging.getLogger("FSSPARK") +logger = logging.getLogger("pickfeat") logger.setLevel(logging.INFO) diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py new file mode 100644 index 0000000..ae0aca8 --- /dev/null +++ b/fsspark/fs/fdataframe.py @@ -0,0 +1,433 @@ + + + +class FSDataFrame: + """ + FSDataFrame is a representation of a Spark DataFrame with some functionalities to perform feature selection. + An object from FSDataFrame is basically represented by a Spark DataFrame with samples + as rows and features as columns, with extra distributed indexed pandas series for + features names and samples labels. + + An object of FSDataFrame offers an interface to a Spark DataFrame, a Pandas on Spark DataFrame + (e.g. suitable for visualization) or a Spark DataFrame with features as a Dense column vector (e.g. suitable for + applying most algorithms from Spark MLib API). + + It can also be split in training and testing dataset and filtered by removing selected features (by name or index). + + [...] + + """ + + def __init__( + self, + df: Union[pyspark.sql.DataFrame, pyspark.pandas.DataFrame], + sample_col: str = None, + label_col: str = None, + row_index_col: Optional[str] = '_row_index', + parse_col_names: bool = False, + parse_features: bool = False, + ): + """ + Create an instance of FSDataFrame. + + Expected an input DataFrame with 2+N columns. + After specifying sample id and sample label columns, the remaining N columns will be considered as features. + + :param df: Spark (or Pandas on Spark) DataFrame + :param sample_col: Sample id column name + :param label_col: Sample label column name + :param row_index_col: Optional. Column name of row indices. + :param parse_col_names: Replace dots (.) in column names with underscores. + :param parse_features: Coerce all features to float. + """ + + self.__df = self._convert_psdf_to_sdf(df) + self.__sample_col = sample_col + self.__label_col = label_col + self.__row_index_name = row_index_col + + # check input dataframe + self._check_df() + + # replace dots in column names, if any. + if parse_col_names: + # TODO: Dots in column names are prone to errors, since dots are used to access attributes from DataFrame. + # Should we make this replacement optional? Or print out a warning? + self.__df = self.__df.toDF(*(c.replace('.', '_') for c in self.__df.columns)) + + # If the specified row index column name does not exist, add row index to the dataframe + if self.__row_index_name not in self.__df.columns: + self.__df = self._add_row_index(index_name=self.__row_index_name) + + if parse_features: + # coerce all features to float + non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_name] + feature_cols = [c for c in self.__df.columns if c not in non_features_cols] + self.__df = self.__df.withColumns({c: self.__df[c].cast('float') for c in feature_cols}) + + self.__indexed_features = self._set_indexed_cols() + self.__indexed_instances = self._set_indexed_rows() + + def _check_df(self): + """ + Check if input DataFrame meet the minimal requirements to feed an FS pipeline. + + :return: None + """ + col_names = self.__df.columns + if self.__sample_col not in col_names: + raise DataFormatError(f"Column sample name {self.__sample_col} not found...") + elif self.__label_col not in col_names: + raise DataFormatError(f"Column label name {self.__label_col} not found...") + elif not isinstance(self.__row_index_name, str): + raise DataFormatError("Row index column name must be a valid string...") + else: + pass + + @staticmethod + def _convert_psdf_to_sdf(df: Union[pyspark.pandas.DataFrame, pyspark.sql.DataFrame]) -> pyspark.sql.DataFrame: + """ + Convert Pandas on Spark DataFrame (psdf) to Spark DataFrame (sdf). + + :param df: Spark (or Pandas on Spark) DataFrame + :return: Spark DataFrame + """ + return df.to_spark(index_col=None) if isinstance(df, pyspark.pandas.DataFrame) else df + + def _set_indexed_cols(self) -> pyspark.pandas.series.Series: + """ + Create a distributed indexed Series representing features. + + :return: Pandas on Spark (PoS) Series + """ + # TODO: Check for equivalent to pandas distributed Series in Spark. + non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_name] + features = [f for f in self.__df.columns if f not in non_features_cols] + return Series(features) + + def _set_indexed_rows(self) -> pyspark.pandas.series.Series: + """ + Create a distributed indexed Series representing samples labels. + It will use existing row indices, if any. + + :return: Pandas on Spark (PoS) Series + """ + # TODO: Check for equivalent to pandas distributed Series in Spark. + label = self.__df.select(self.__label_col).collect() + row_index = self.__df.select(self.__row_index_name).collect() + return Series(label, index=row_index) + + def get_features_indexed(self) -> pyspark.pandas.series.Series: + """ + Return features names with indices as a Series. + :return: Indexed Series. + """ + return self.__indexed_features + + def get_sample_label_indexed(self) -> pyspark.pandas.series.Series: + """ + Return sample labels with indices as a Series. + :return: Indexed Series. + """ + return self.__indexed_instances + + def get_features_names(self) -> list: + """ + Get features names from DataFrame. + :return: List of features names + """ + return self.__indexed_features.tolist() + + def get_features_by_index(self, indices: Union[List[int], Set[int]]) -> List[str]: + """ + Get features names by specified index from DataFrame. + + :param: indices: List of feature indexes + :return: List of features names + """ + return self.__indexed_features.loc[indices].tolist() + + def get_sample_label(self) -> list: + """ + Get samples class (label) from DataFrame. + :return: List of sample class labels + """ + return self.__indexed_instances.tolist() + + def get_sdf_vector(self, output_column_vector: str = 'features') -> pyspark.sql.DataFrame: + """ + Return a Spark dataframe with feature columns assembled into a column vector (a.k.a. Dense Vector column). + This format is required as input for multiple algorithms from MLlib API. + + :param: output_column_vector: Name of the output column vector. + :return: Spark DataFrame + """ + + sdf = self.__df + features_cols = self.get_features_names() + sdf_vector = _assemble_column_vector(sdf, + input_feature_cols=features_cols, + output_column_vector=output_column_vector) + + return sdf_vector + + def get_sdf_and_label(self, + output_column_vector: str = 'features') -> Tuple[pyspark.sql.dataframe.DataFrame, str, str]: + """ + Extracts the Spark DataFrame and label column name from FSDataFrame. + + :param: output_column_vector: Name of the output column vector. + :return: A tuple containing the Spark DataFrame and the label column name. + """ + sdf = self.get_sdf_vector(output_column_vector=output_column_vector) + label_col = self.get_label_col_name() + return sdf, label_col, output_column_vector + + def _collect_features_as_array(self) -> np.array: + """ + Collect features from FSDataFrame as an array. + `Warning`: This method will collect the entire DataFrame into the driver. + Uses this method on small datasets only (e.g., after filtering or splitting the data) + + :return: Numpy array + """ + sdf = self.get_sdf().select(*self.get_features_names()) + a = np.array(sdf.collect()) + return a + + def to_psdf(self) -> pyspark.pandas.DataFrame: + """ + Convert Spark DataFrame to Pandas on Spark DataFrame + :return: Pandas on Spark DataFrame + """ + return self.__df.pandas_api() + + def get_sdf(self) -> pyspark.sql.DataFrame: + """ + Return current Spark DataFrame + :return: Spark DataFrame + """ + return self.__df + + def get_sample_col_name(self) -> str: + """ + Return sample id column name. + + :return: Sample id column name. + """ + return self.__sample_col + + def get_label_col_name(self) -> str: + """ + Return sample label column name. + + :return: Sample label column name. + """ + return self.__label_col + + def get_row_index_name(self) -> str: + """ + Return row (instances) id column name. + + :return: Row id column name. + """ + return self.__row_index_name + + def _add_row_index(self, index_name: str = '_row_index') -> pyspark.sql.DataFrame: + """ + Add row indices to DataFrame. + Unique indices of type integer will be added in non-consecutive increasing order. + + :param: index_name: Name of the row index column. + :return: Spark DataFrame with extra column of row indices. + """ + return self.__df.withColumn(index_name, monotonically_increasing_id()) + + def count_features(self) -> int: + """ + Return the number of features. + + :return: Number of features. + """ + return self.get_features_indexed().size + + def count_instances(self) -> int: + """ + Return the number of samples (instances). + + :return: Number of samples. + """ + return self.get_sample_label_indexed().size + + def filter_features(self, features: List[str], keep: bool = True) -> 'FSDataFrame': + """ + Select or drop specified features from DataFrame. + + :param features: List of features names to drop or select from DataFrame + :param keep: If True (default), keep features. Remove otherwise. + + :return: FSDataFrame + """ + + current_features = self.get_features_names() + if len(set(current_features).intersection(features)) == 0: + logger.warning(f"There is no overlap of specified features with the input data frame.\n" + f"Skipping this filter step...") + return self + + count_a = self.count_features() + sdf = self.get_sdf() + + if keep: + sdf = sdf.select( + self.__sample_col, + self.__label_col, + self.__row_index_name, + *features) + else: + sdf = sdf.drop(*features) + + fsdf_filtered = self.update(sdf, self.__sample_col, self.__label_col, self.__row_index_name) + count_b = fsdf_filtered.count_features() + + logger.info(f"{count_b} features out of {count_a} remain after applying this filter...") + + return fsdf_filtered + + def filter_features_by_index(self, feature_indices: Set[int], keep: bool = True) -> 'FSDataFrame': + """ + Select or drop specified features from DataFrame by its indices. + + :param feature_indices: Set of features indices to drop or select from DataFrame + :param keep: If True (default), keep features. Remove otherwise. + + :return: FSDataFrame + """ + feature_names = self.get_features_by_index(feature_indices) + return self.filter_features(feature_names, keep=keep) + + def get_label_strata(self) -> list: + """ + Get strata from a categorical column in DataFrame. + + :return: List of levels for categorical variable. + """ + levels = self.get_sample_label_indexed().unique().tolist() + number_of_lvs = len(levels) + if number_of_lvs > 20: # TODO: Check if this is a right cutoff. + logger.warning(f"Number of observed levels too high: {number_of_lvs}.\n" + f"Should this variable be considered continuous?") + return levels + + def scale_features(self, scaler_method: str = 'standard', **kwargs) -> 'FSDataFrame': + """ + Scales features in DataFrame + + :param scaler_method: One of: min_max, max_abs, standard or robust. + :return: FSDataFrame with scaled features. + """ + + if scaler_method == 'min_max': + scaler = MinMaxScaler(**kwargs) + elif scaler_method == 'max_abs': + scaler = MaxAbsScaler(**kwargs) + elif scaler_method == 'standard': + scaler = StandardScaler(**kwargs) + elif scaler_method == 'robust': + scaler = RobustScaler(**kwargs) + else: + raise ValueError("`scaler_method` must be one of: min_max, max_abs, standard or robust.") + + features_col_vector = '_features' + scaled_features_vector = '_features_scaled' + + sdf = self.get_sdf_vector(output_column_vector=features_col_vector) + + sdf = (scaler + .setInputCol(features_col_vector) + .setOutputCol(scaled_features_vector) + .fit(sdf) + .transform(sdf) + .drop(features_col_vector) + ) + + sdf = _disassemble_column_vector(sdf, + features_cols=self.get_features_names(), + col_vector_name=scaled_features_vector, + drop_col_vector=True) + + return self.update(sdf, + self.__sample_col, + self.__label_col, + self.__row_index_name) + + def split_df(self, + label_type_cat: bool = True, + split_training_factor: float = 0.7) -> Tuple['FSDataFrame', 'FSDataFrame']: + """ + Split DataFrame into training and test dataset. + It will generate a nearly class-balanced training + and testing set for both categorical and continuous label input. + + :param label_type_cat: If True (the default), the input label colum will be processed as categorical. + Otherwise, it will be considered a continuous variable and binarized. + :param split_training_factor: Proportion of the training set. Usually, a value between 0.6 and 0.8. + + :return: Tuple of FSDataFrames. First element is the training set and second element is the testing set. + """ + + row_index_col = self.get_row_index_name() + label_col = self.get_label_col_name() + sdf = self.__df + + # create a temporal indexed categorical variable for sampling and splitting the data set. + tmp_label_col = '_tmp_label_indexed' + if label_type_cat: + sdf = _string_indexer(sdf=sdf, input_col=label_col, output_col=tmp_label_col) + else: + # If the input label is continuous, create a uniform random distribution [0,1] and binarize this variable. + # It will be used then as categorical for sampling the dataframe. + sdf = sdf.withColumn("_tmp_uniform_rand", rand()) + sdf = (_binarizer(sdf, + input_col="_tmp_uniform_rand", + output_col=tmp_label_col, + threshold=0.5, + drop_input_col=True) + ) + + # Get number of levels for categorical variable. + levels = [lv[tmp_label_col] for lv in sdf.select([tmp_label_col]).distinct().collect()] + + # Sampling DataFrame to extract class-balanced training set. + # This will keep similar proportion by stratum in both training and testing set. + fraction_dict = dict(zip(levels, [split_training_factor] * len(levels))) + training_df = sdf.sampleBy(col=sdf[tmp_label_col], fractions=fraction_dict) + + # Filter out the testing set from the input Dataframe. testing_df = input_sdf[-training_df]. + testing_df = sdf.join(training_df, [row_index_col], "leftanti") + + # Drop tmp cols + training_df = training_df.drop(tmp_label_col) + testing_df = testing_df.drop(tmp_label_col) + + return (self.update(training_df, self.__sample_col, self.__label_col, self.__row_index_name), + self.update(testing_df, self.__sample_col, self.__label_col, self.__row_index_name)) + + @classmethod + def update(cls, + df: pyspark.sql.DataFrame, + sample_col: str, + label_col: str, + row_index_col: str): + """ + Create a new instance of FSDataFrame. + + :param df: Spark DataFrame + :param sample_col: Name of sample id column. + :param label_col: Name of sample label column. + :param row_index_col: Name of row (instances) id column. + + :return: FSDataFrame + """ + return cls(df, sample_col, label_col, row_index_col) + From deb2df6a23d563b6a2d3f65d51d6a37671b3d1de Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 19 Sep 2024 17:33:44 +0200 Subject: [PATCH 02/62] first iteration of pandas fdataframe.py --- fsspark/fs/fdataframe.py | 180 +++++++++++++++++++++------------------ 1 file changed, 96 insertions(+), 84 deletions(-) diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index ae0aca8..b56d1b1 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -1,16 +1,25 @@ +import logging +from typing import Optional, Union, List, Set, Tuple +import numpy as np +import pandas as pd +from pandas import DataFrame, Series +from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler +logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") +logger = logging.getLogger("pickfeat") +logger.setLevel(logging.INFO) class FSDataFrame: """ - FSDataFrame is a representation of a Spark DataFrame with some functionalities to perform feature selection. - An object from FSDataFrame is basically represented by a Spark DataFrame with samples + FSDataFrame is a representation of a DataFrame with some functionalities to perform feature selection. + An object from FSDataFrame is basically represented by a DataFrame with samples as rows and features as columns, with extra distributed indexed pandas series for features names and samples labels. - An object of FSDataFrame offers an interface to a Spark DataFrame, a Pandas on Spark DataFrame - (e.g. suitable for visualization) or a Spark DataFrame with features as a Dense column vector (e.g. suitable for - applying most algorithms from Spark MLib API). + An object of FSDataFrame offers an interface to a DataFrame, a Pandas on DataFrame + (e.g. suitable for visualization) or a DataFrame with features as a Dense column vector (e.g. suitable for + applying most algorithms from MLib API). It can also be split in training and testing dataset and filtered by removing selected features (by name or index). @@ -20,7 +29,7 @@ class FSDataFrame: def __init__( self, - df: Union[pyspark.sql.DataFrame, pyspark.pandas.DataFrame], + df: DataFrame, sample_col: str = None, label_col: str = None, row_index_col: Optional[str] = '_row_index', @@ -33,7 +42,7 @@ def __init__( Expected an input DataFrame with 2+N columns. After specifying sample id and sample label columns, the remaining N columns will be considered as features. - :param df: Spark (or Pandas on Spark) DataFrame + :param df: Pandas DataFrame :param sample_col: Sample id column name :param label_col: Sample label column name :param row_index_col: Optional. Column name of row indices. @@ -76,55 +85,45 @@ def _check_df(self): """ col_names = self.__df.columns if self.__sample_col not in col_names: - raise DataFormatError(f"Column sample name {self.__sample_col} not found...") + raise ValueError(f"Column sample name {self.__sample_col} not found...") elif self.__label_col not in col_names: - raise DataFormatError(f"Column label name {self.__label_col} not found...") + raise ValueError(f"Column label name {self.__label_col} not found...") elif not isinstance(self.__row_index_name, str): - raise DataFormatError("Row index column name must be a valid string...") + raise ValueError("Row index column name must be a valid string...") else: pass - @staticmethod - def _convert_psdf_to_sdf(df: Union[pyspark.pandas.DataFrame, pyspark.sql.DataFrame]) -> pyspark.sql.DataFrame: - """ - Convert Pandas on Spark DataFrame (psdf) to Spark DataFrame (sdf). - - :param df: Spark (or Pandas on Spark) DataFrame - :return: Spark DataFrame - """ - return df.to_spark(index_col=None) if isinstance(df, pyspark.pandas.DataFrame) else df - - def _set_indexed_cols(self) -> pyspark.pandas.series.Series: + def _set_indexed_cols(self) -> Series: """ Create a distributed indexed Series representing features. - :return: Pandas on Spark (PoS) Series + :return: Pandas on (PoS) Series """ - # TODO: Check for equivalent to pandas distributed Series in Spark. + # TODO: Check for equivalent to pandas distributed Series in . non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_name] features = [f for f in self.__df.columns if f not in non_features_cols] return Series(features) - def _set_indexed_rows(self) -> pyspark.pandas.series.Series: + def _set_indexed_rows(self) -> Series: """ Create a distributed indexed Series representing samples labels. It will use existing row indices, if any. - :return: Pandas on Spark (PoS) Series + :return: Pandas on (PoS) Series """ - # TODO: Check for equivalent to pandas distributed Series in Spark. + # TODO: Check for equivalent to pandas distributed Series in . label = self.__df.select(self.__label_col).collect() row_index = self.__df.select(self.__row_index_name).collect() return Series(label, index=row_index) - def get_features_indexed(self) -> pyspark.pandas.series.Series: + def get_features_indexed(self) -> Series: """ Return features names with indices as a Series. :return: Indexed Series. """ return self.__indexed_features - def get_sample_label_indexed(self) -> pyspark.pandas.series.Series: + def get_sample_label_indexed(self) -> Series: """ Return sample labels with indices as a Series. :return: Indexed Series. @@ -154,13 +153,13 @@ def get_sample_label(self) -> list: """ return self.__indexed_instances.tolist() - def get_sdf_vector(self, output_column_vector: str = 'features') -> pyspark.sql.DataFrame: + def get_sdf_vector(self, output_column_vector: str = 'features') -> pd.DataFrame: """ - Return a Spark dataframe with feature columns assembled into a column vector (a.k.a. Dense Vector column). + Return a dataframe with feature columns assembled into a column vector (a.k.a. Dense Vector column). This format is required as input for multiple algorithms from MLlib API. :param: output_column_vector: Name of the output column vector. - :return: Spark DataFrame + :return: DataFrame """ sdf = self.__df @@ -172,12 +171,12 @@ def get_sdf_vector(self, output_column_vector: str = 'features') -> pyspark.sql. return sdf_vector def get_sdf_and_label(self, - output_column_vector: str = 'features') -> Tuple[pyspark.sql.dataframe.DataFrame, str, str]: + output_column_vector: str = 'features') -> Tuple[DataFrame, str, str]: """ - Extracts the Spark DataFrame and label column name from FSDataFrame. + Extracts the DataFrame and label column name from FSDataFrame. :param: output_column_vector: Name of the output column vector. - :return: A tuple containing the Spark DataFrame and the label column name. + :return: A tuple containing the DataFrame and the label column name. """ sdf = self.get_sdf_vector(output_column_vector=output_column_vector) label_col = self.get_label_col_name() @@ -195,19 +194,15 @@ def _collect_features_as_array(self) -> np.array: a = np.array(sdf.collect()) return a - def to_psdf(self) -> pyspark.pandas.DataFrame: + def to_psdf(self) -> DataFrame: """ - Convert Spark DataFrame to Pandas on Spark DataFrame - :return: Pandas on Spark DataFrame + Convert DataFrame to Pandas on DataFrame + :return: Pandas on DataFrame """ return self.__df.pandas_api() - def get_sdf(self) -> pyspark.sql.DataFrame: - """ - Return current Spark DataFrame - :return: Spark DataFrame - """ - return self.__df + def get_sdf(self) -> DataFrame: + return self.__df def get_sample_col_name(self) -> str: """ @@ -233,15 +228,17 @@ def get_row_index_name(self) -> str: """ return self.__row_index_name - def _add_row_index(self, index_name: str = '_row_index') -> pyspark.sql.DataFrame: + def _add_row_index(self, index_name: str = '_row_index') -> pd.DataFrame: """ Add row indices to DataFrame. Unique indices of type integer will be added in non-consecutive increasing order. - :param: index_name: Name of the row index column. - :return: Spark DataFrame with extra column of row indices. + :param index_name: Name of the row index column. + :return: DataFrame with extra column of row indices. """ - return self.__df.withColumn(index_name, monotonically_increasing_id()) + # Add a new column with unique row indices using a range + self.__df[index_name] = range(len(self.__df)) + return self.__df def count_features(self) -> int: """ @@ -365,6 +362,7 @@ def split_df(self, label_type_cat: bool = True, split_training_factor: float = 0.7) -> Tuple['FSDataFrame', 'FSDataFrame']: """ + TODO: Split dataframe in training and test dataset, maintaining balance between classes. Split DataFrame into training and test dataset. It will generate a nearly class-balanced training and testing set for both categorical and continuous label input. @@ -376,53 +374,19 @@ def split_df(self, :return: Tuple of FSDataFrames. First element is the training set and second element is the testing set. """ - row_index_col = self.get_row_index_name() - label_col = self.get_label_col_name() - sdf = self.__df - # create a temporal indexed categorical variable for sampling and splitting the data set. - tmp_label_col = '_tmp_label_indexed' - if label_type_cat: - sdf = _string_indexer(sdf=sdf, input_col=label_col, output_col=tmp_label_col) - else: - # If the input label is continuous, create a uniform random distribution [0,1] and binarize this variable. - # It will be used then as categorical for sampling the dataframe. - sdf = sdf.withColumn("_tmp_uniform_rand", rand()) - sdf = (_binarizer(sdf, - input_col="_tmp_uniform_rand", - output_col=tmp_label_col, - threshold=0.5, - drop_input_col=True) - ) - - # Get number of levels for categorical variable. - levels = [lv[tmp_label_col] for lv in sdf.select([tmp_label_col]).distinct().collect()] - - # Sampling DataFrame to extract class-balanced training set. - # This will keep similar proportion by stratum in both training and testing set. - fraction_dict = dict(zip(levels, [split_training_factor] * len(levels))) - training_df = sdf.sampleBy(col=sdf[tmp_label_col], fractions=fraction_dict) - - # Filter out the testing set from the input Dataframe. testing_df = input_sdf[-training_df]. - testing_df = sdf.join(training_df, [row_index_col], "leftanti") - - # Drop tmp cols - training_df = training_df.drop(tmp_label_col) - testing_df = testing_df.drop(tmp_label_col) - - return (self.update(training_df, self.__sample_col, self.__label_col, self.__row_index_name), - self.update(testing_df, self.__sample_col, self.__label_col, self.__row_index_name)) + @classmethod def update(cls, - df: pyspark.sql.DataFrame, + df: DataFrame, sample_col: str, label_col: str, row_index_col: str): """ Create a new instance of FSDataFrame. - :param df: Spark DataFrame + :param df: DataFrame :param sample_col: Name of sample id column. :param label_col: Name of sample label column. :param row_index_col: Name of row (instances) id column. @@ -431,3 +395,51 @@ def update(cls, """ return cls(df, sample_col, label_col, row_index_col) + def _assemble_column_vector(self, + input_feature_cols: List[str], + output_column_vector: str = 'features', + drop_input_cols: bool = True) -> pd.DataFrame: + """ + Assemble features (columns) from DataFrame into a column of type Numpy array. + + :param drop_input_cols: Boolean flag to drop the input feature columns. + :param input_feature_cols: List of feature column names. + :param output_column_vector: Name of the output column that will contain the combined vector. + :param sdf: Pandas DataFrame + + :return: DataFrame with column of type Numpy array. + """ + + # Combine the input columns into a single vector (Numpy array) + self.__df[output_column_vector] = self.__df[input_feature_cols].apply(lambda row: np.array(row), axis=1) + + # Drop input columns if flag is set to True + if drop_input_cols: + return self.__df.drop(columns=input_feature_cols) + else: + return self.__df + +def _disassemble_column_vector(self, + features_cols: List[str], + col_vector_name: str, + drop_col_vector: bool = True) -> pd.DataFrame: + """ + Convert a column of Numpy arrays in DataFrame to individual columns (a.k.a features). + This is the reverse operation of `_assemble_column_vector`. + + :param features_cols: List of new feature column names. + :param col_vector_name: Name of the column that contains the vector (Numpy array). + :param drop_col_vector: Boolean flag to drop the original vector column. + :return: DataFrame with individual feature columns. + """ + + # Unpack the vector (Numpy array) into individual columns + for i, feature in enumerate(features_cols): + self.__df[feature] = self.__df[col_vector_name].apply(lambda x: x[i]) + + # Drop the original vector column if needed + if drop_col_vector: + self.__df = self.__df.drop(columns=[col_vector_name]) + + return self.__df + From 70fec44e8430b4985b8ac6db4502e42b1b6cf25a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 19 Sep 2024 17:39:48 +0200 Subject: [PATCH 03/62] first iteration of pandas fdataframe.py --- fsspark/fs/fdataframe.py | 70 +++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index b56d1b1..5a686de 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -13,13 +13,13 @@ class FSDataFrame: """ FSDataFrame is a representation of a DataFrame with some functionalities to perform feature selection. - An object from FSDataFrame is basically represented by a DataFrame with samples + An object from FSDataFrame is basically represented by a DataFrame with samples as rows and features as columns, with extra distributed indexed pandas series for features names and samples labels. An object of FSDataFrame offers an interface to a DataFrame, a Pandas on DataFrame (e.g. suitable for visualization) or a DataFrame with features as a Dense column vector (e.g. suitable for - applying most algorithms from MLib API). + applying most algorithms from MLib API). It can also be split in training and testing dataset and filtered by removing selected features (by name or index). @@ -80,7 +80,6 @@ def __init__( def _check_df(self): """ Check if input DataFrame meet the minimal requirements to feed an FS pipeline. - :return: None """ col_names = self.__df.columns @@ -96,10 +95,8 @@ def _check_df(self): def _set_indexed_cols(self) -> Series: """ Create a distributed indexed Series representing features. - :return: Pandas on (PoS) Series """ - # TODO: Check for equivalent to pandas distributed Series in . non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_name] features = [f for f in self.__df.columns if f not in non_features_cols] return Series(features) @@ -362,18 +359,45 @@ def split_df(self, label_type_cat: bool = True, split_training_factor: float = 0.7) -> Tuple['FSDataFrame', 'FSDataFrame']: """ - TODO: Split dataframe in training and test dataset, maintaining balance between classes. Split DataFrame into training and test dataset. It will generate a nearly class-balanced training and testing set for both categorical and continuous label input. - :param label_type_cat: If True (the default), the input label colum will be processed as categorical. + :param label_type_cat: If True (the default), the input label column will be processed as categorical. Otherwise, it will be considered a continuous variable and binarized. :param split_training_factor: Proportion of the training set. Usually, a value between 0.6 and 0.8. :return: Tuple of FSDataFrames. First element is the training set and second element is the testing set. """ + label_col = self.get_label_col_name() + df = self.__df.copy() + + # Create a temporary label column for sampling + tmp_label_col = '_tmp_label_indexed' + + if label_type_cat: + # Use factorize to convert categorical labels to integer indices + df[tmp_label_col], _ = pd.factorize(df[label_col]) + else: + # For continuous labels, create a uniform random column and binarize it + df['_tmp_uniform_rand'] = np.random.rand(len(df)) + df[tmp_label_col] = (df['_tmp_uniform_rand'] > 0.5).astype(int) + df = df.drop(columns=['_tmp_uniform_rand']) + + # Perform stratified sampling to get class-balanced training set + train_df = df.groupby(tmp_label_col, group_keys=False).apply(lambda x: x.sample(frac=split_training_factor)) + + # Get the test set by subtracting the training set from the original DataFrame + test_df = df.drop(train_df.index) + + # Drop the temporary label column + train_df = train_df.drop(columns=[tmp_label_col]) + test_df = test_df.drop(columns=[tmp_label_col]) + + # Return the updated DataFrames + return self.update(train_df), self.update(test_df) + @@ -395,29 +419,29 @@ def update(cls, """ return cls(df, sample_col, label_col, row_index_col) - def _assemble_column_vector(self, +def _assemble_column_vector(self, input_feature_cols: List[str], output_column_vector: str = 'features', drop_input_cols: bool = True) -> pd.DataFrame: - """ - Assemble features (columns) from DataFrame into a column of type Numpy array. + """ + Assemble features (columns) from DataFrame into a column of type Numpy array. - :param drop_input_cols: Boolean flag to drop the input feature columns. - :param input_feature_cols: List of feature column names. - :param output_column_vector: Name of the output column that will contain the combined vector. - :param sdf: Pandas DataFrame + :param drop_input_cols: Boolean flag to drop the input feature columns. + :param input_feature_cols: List of feature column names. + :param output_column_vector: Name of the output column that will contain the combined vector. + :param sdf: Pandas DataFrame - :return: DataFrame with column of type Numpy array. - """ + :return: DataFrame with column of type Numpy array. + """ - # Combine the input columns into a single vector (Numpy array) - self.__df[output_column_vector] = self.__df[input_feature_cols].apply(lambda row: np.array(row), axis=1) + # Combine the input columns into a single vector (Numpy array) + self.__df[output_column_vector] = self.__df[input_feature_cols].apply(lambda row: np.array(row), axis=1) - # Drop input columns if flag is set to True - if drop_input_cols: - return self.__df.drop(columns=input_feature_cols) - else: - return self.__df + # Drop input columns if flag is set to True + if drop_input_cols: + return self.__df.drop(columns=input_feature_cols) + else: + return self.__df def _disassemble_column_vector(self, features_cols: List[str], From b99aee04389b38df6759e9a66376a73390acaae2 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 19 Sep 2024 22:57:38 +0200 Subject: [PATCH 04/62] first iteration of pandas fdataframe.py --- environment.yml | 1 - fsspark/fs/fdataframe.py | 41 ++++++++-------- fsspark/tests/test_FSDataFrame.py | 79 ------------------------------- fsspark/tests/test_fsdataframe.py | 27 +++++++++++ 4 files changed, 48 insertions(+), 100 deletions(-) delete mode 100644 fsspark/tests/test_FSDataFrame.py create mode 100644 fsspark/tests/test_fsdataframe.py diff --git a/environment.yml b/environment.yml index 9f48f63..7d4be58 100644 --- a/environment.yml +++ b/environment.yml @@ -10,5 +10,4 @@ dependencies: - pyspark~=3.3.0 - networkx~=2.8.7 - numpy~=1.23.4 - - pandas~=1.5.1 - pyarrow~=8.0.0 diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index 5a686de..3cba18c 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -50,27 +50,25 @@ def __init__( :param parse_features: Coerce all features to float. """ - self.__df = self._convert_psdf_to_sdf(df) self.__sample_col = sample_col self.__label_col = label_col - self.__row_index_name = row_index_col + self.__row_index_col = row_index_col + self.__df = df # check input dataframe self._check_df() # replace dots in column names, if any. if parse_col_names: - # TODO: Dots in column names are prone to errors, since dots are used to access attributes from DataFrame. - # Should we make this replacement optional? Or print out a warning? self.__df = self.__df.toDF(*(c.replace('.', '_') for c in self.__df.columns)) # If the specified row index column name does not exist, add row index to the dataframe - if self.__row_index_name not in self.__df.columns: - self.__df = self._add_row_index(index_name=self.__row_index_name) + if self.__row_index_col not in self.__df.columns: + self.__df = self._add_row_index(index_name=self.__row_index_col) if parse_features: # coerce all features to float - non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_name] + non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_col] feature_cols = [c for c in self.__df.columns if c not in non_features_cols] self.__df = self.__df.withColumns({c: self.__df[c].cast('float') for c in feature_cols}) @@ -87,7 +85,7 @@ def _check_df(self): raise ValueError(f"Column sample name {self.__sample_col} not found...") elif self.__label_col not in col_names: raise ValueError(f"Column label name {self.__label_col} not found...") - elif not isinstance(self.__row_index_name, str): + elif not isinstance(self.__row_index_col, str): raise ValueError("Row index column name must be a valid string...") else: pass @@ -97,21 +95,24 @@ def _set_indexed_cols(self) -> Series: Create a distributed indexed Series representing features. :return: Pandas on (PoS) Series """ - non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_name] + non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_col] features = [f for f in self.__df.columns if f not in non_features_cols] return Series(features) - def _set_indexed_rows(self) -> Series: + def _set_indexed_rows(self) -> pd.Series: """ - Create a distributed indexed Series representing samples labels. - It will use existing row indices, if any. + Create an indexed Series representing sample labels. + It will use existing row indices from the DataFrame. - :return: Pandas on (PoS) Series + :return: Pandas Series """ - # TODO: Check for equivalent to pandas distributed Series in . - label = self.__df.select(self.__label_col).collect() - row_index = self.__df.select(self.__row_index_name).collect() - return Series(label, index=row_index) + + # Extract the label and row index columns from the DataFrame + labels = self.__df[self.__label_col] + row_indices = self.__df[self.__row_index_col] + + # Create a Pandas Series with row_indices as index and labels as values + return pd.Series(data=labels.values, index=row_indices.values) def get_features_indexed(self) -> Series: """ @@ -223,7 +224,7 @@ def get_row_index_name(self) -> str: :return: Row id column name. """ - return self.__row_index_name + return self.__row_index_col def _add_row_index(self, index_name: str = '_row_index') -> pd.DataFrame: """ @@ -276,12 +277,12 @@ def filter_features(self, features: List[str], keep: bool = True) -> 'FSDataFram sdf = sdf.select( self.__sample_col, self.__label_col, - self.__row_index_name, + self.__row_index_col, *features) else: sdf = sdf.drop(*features) - fsdf_filtered = self.update(sdf, self.__sample_col, self.__label_col, self.__row_index_name) + fsdf_filtered = self.update(sdf, self.__sample_col, self.__label_col, self.__row_index_col) count_b = fsdf_filtered.count_features() logger.info(f"{count_b} features out of {count_a} remain after applying this filter...") diff --git a/fsspark/tests/test_FSDataFrame.py b/fsspark/tests/test_FSDataFrame.py deleted file mode 100644 index 2376b99..0000000 --- a/fsspark/tests/test_FSDataFrame.py +++ /dev/null @@ -1,79 +0,0 @@ -import unittest - -from fsspark.config.context import init_spark, stop_spark_session -from fsspark.fs.core import FSDataFrame -from fsspark.utils.datasets import get_tnbc_data_path -from fsspark.utils.io import import_table_as_psdf - - -class FSDataFrameTest(unittest.TestCase): - """ - Define testing methods for FSDataFrame class. - """ - - def setUp(self) -> None: - init_spark(apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True) - - def tearDown(self) -> None: - stop_spark_session() - - @staticmethod - def import_FSDataFrame(): - df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) - fsdf = FSDataFrame(df, sample_col='Sample', label_col='label') - return fsdf - - def test_FSDataFrame(self): - """ - Test FSDataFrame class. - :return: None - """ - - # create object of type FSDataFrame - fsdf = self.import_FSDataFrame() - - self.assertEqual(len(fsdf.get_features_names()), 500) - self.assertEqual(len(fsdf.get_sample_label()), 44) - - def test_get_sdf_vector(self): - """ - Test get_sdf_vector method. - :return: None - """ - - fsdf = self.import_FSDataFrame() - - sdf = fsdf.get_sdf_vector(output_column_vector='features') - sdf.show(5) - self.assertEqual(len(sdf.columns), 4) - - def test_scale_features(self): - """ - Test scale_features method. - :return: None - """ - - fsdf = self.import_FSDataFrame() - fsdf = fsdf.scale_features(scaler_method='min_max') - - fsdf.get_sdf().show(10) - self.assertGreaterEqual(min(fsdf.to_psdf()['tr|E9PBJ4'].to_numpy()), 0.0) - self.assertLessEqual(max(fsdf.to_psdf()['tr|E9PBJ4'].to_numpy()), 1.0) - - def test_get_features_indices(self): - """ - Test get_features_indices method. - :return: None - """ - - fsdf = self.import_FSDataFrame() - feature_indices = fsdf.get_features_indexed() - feature_names = feature_indices.loc[[0, 1, 2, 5]].tolist() - - self.assertTrue(all([x in ['tr|E9PBJ4', 'sp|P07437', 'sp|P68371', 'tr|F8VWX4'] for x in feature_names])) - - -if __name__ == '__main__': - unittest.main() diff --git a/fsspark/tests/test_fsdataframe.py b/fsspark/tests/test_fsdataframe.py new file mode 100644 index 0000000..09fc2ac --- /dev/null +++ b/fsspark/tests/test_fsdataframe.py @@ -0,0 +1,27 @@ +import pytest +import pandas as pd +from fsspark.fs.fdataframe import FSDataFrame + +def test_initializes_fsdataframe(): + + # Create a sample DataFrame + data = { + 'sample_id': [1, 2, 3], + 'label': ['A', 'B', 'C'], + 'feature1': [0.1, 0.2, 0.3], + 'feature2': [1.1, 1.2, 1.3] + } + df = pd.DataFrame(data) + + # Initialize FSDataFrame + fs_df = FSDataFrame( + df=df, + sample_col='sample_id', + label_col='label', + row_index_col='_row_index', + parse_col_names=False, + parse_features=False + ) + + # Assertions to check if the initialization is correct + assert (fs_df.get_sdf(), df) \ No newline at end of file From 64fb1aa643ecd80a9b816269b88522abed883456 Mon Sep 17 00:00:00 2001 From: enriquea Date: Fri, 20 Sep 2024 13:49:07 +0200 Subject: [PATCH 05/62] update fdataframe class --- fsspark/fs/fdataframe.py | 160 +++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 75 deletions(-) diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index 5a686de..53e8327 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -10,6 +10,7 @@ logger = logging.getLogger("pickfeat") logger.setLevel(logging.INFO) + class FSDataFrame: """ FSDataFrame is a representation of a DataFrame with some functionalities to perform feature selection. @@ -50,7 +51,7 @@ def __init__( :param parse_features: Coerce all features to float. """ - self.__df = self._convert_psdf_to_sdf(df) + self.__df = df self.__sample_col = sample_col self.__label_col = label_col self.__row_index_name = row_index_col @@ -106,12 +107,12 @@ def _set_indexed_rows(self) -> Series: Create a distributed indexed Series representing samples labels. It will use existing row indices, if any. - :return: Pandas on (PoS) Series + :return: Pandas Series """ - # TODO: Check for equivalent to pandas distributed Series in . - label = self.__df.select(self.__label_col).collect() - row_index = self.__df.select(self.__row_index_name).collect() - return Series(label, index=row_index) + + label = self.__df[self.__label_col] + row_index = self.__df[self.__row_index_name] + return pd.Series(data=label.values, index=row_index.values) def get_features_indexed(self) -> Series: """ @@ -187,7 +188,7 @@ def _collect_features_as_array(self) -> np.array: :return: Numpy array """ - sdf = self.get_sdf().select(*self.get_features_names()) + sdf = self.get_df().select(*self.get_features_names()) a = np.array(sdf.collect()) return a @@ -198,8 +199,8 @@ def to_psdf(self) -> DataFrame: """ return self.__df.pandas_api() - def get_sdf(self) -> DataFrame: - return self.__df + def get_df(self) -> DataFrame: + return self.__df def get_sample_col_name(self) -> str: """ @@ -234,7 +235,7 @@ def _add_row_index(self, index_name: str = '_row_index') -> pd.DataFrame: :return: DataFrame with extra column of row indices. """ # Add a new column with unique row indices using a range - self.__df[index_name] = range(len(self.__df)) + self.__df[index_name] = list(range(len(self.__df))) return self.__df def count_features(self) -> int: @@ -270,7 +271,7 @@ def filter_features(self, features: List[str], keep: bool = True) -> 'FSDataFram return self count_a = self.count_features() - sdf = self.get_sdf() + sdf = self.get_df() if keep: sdf = sdf.select( @@ -332,23 +333,14 @@ def scale_features(self, scaler_method: str = 'standard', **kwargs) -> 'FSDataFr else: raise ValueError("`scaler_method` must be one of: min_max, max_abs, standard or robust.") - features_col_vector = '_features' - scaled_features_vector = '_features_scaled' + feature_array = self._features_to_array() - sdf = self.get_sdf_vector(output_column_vector=features_col_vector) + feature_array = (scaler + .fit(feature_array) + .transform() + ) - sdf = (scaler - .setInputCol(features_col_vector) - .setOutputCol(scaled_features_vector) - .fit(sdf) - .transform(sdf) - .drop(features_col_vector) - ) - - sdf = _disassemble_column_vector(sdf, - features_cols=self.get_features_names(), - col_vector_name=scaled_features_vector, - drop_col_vector=True) + df_scaled = self._array_to_features(feature_array) return self.update(sdf, self.__sample_col, @@ -398,9 +390,6 @@ def split_df(self, # Return the updated DataFrames return self.update(train_df), self.update(test_df) - - - @classmethod def update(cls, df: DataFrame, @@ -419,51 +408,72 @@ def update(cls, """ return cls(df, sample_col, label_col, row_index_col) -def _assemble_column_vector(self, - input_feature_cols: List[str], - output_column_vector: str = 'features', - drop_input_cols: bool = True) -> pd.DataFrame: - """ - Assemble features (columns) from DataFrame into a column of type Numpy array. - - :param drop_input_cols: Boolean flag to drop the input feature columns. - :param input_feature_cols: List of feature column names. - :param output_column_vector: Name of the output column that will contain the combined vector. - :param sdf: Pandas DataFrame - - :return: DataFrame with column of type Numpy array. - """ - - # Combine the input columns into a single vector (Numpy array) - self.__df[output_column_vector] = self.__df[input_feature_cols].apply(lambda row: np.array(row), axis=1) - - # Drop input columns if flag is set to True - if drop_input_cols: - return self.__df.drop(columns=input_feature_cols) - else: - return self.__df - -def _disassemble_column_vector(self, - features_cols: List[str], - col_vector_name: str, - drop_col_vector: bool = True) -> pd.DataFrame: - """ - Convert a column of Numpy arrays in DataFrame to individual columns (a.k.a features). - This is the reverse operation of `_assemble_column_vector`. - - :param features_cols: List of new feature column names. - :param col_vector_name: Name of the column that contains the vector (Numpy array). - :param drop_col_vector: Boolean flag to drop the original vector column. - :return: DataFrame with individual feature columns. - """ - - # Unpack the vector (Numpy array) into individual columns - for i, feature in enumerate(features_cols): - self.__df[feature] = self.__df[col_vector_name].apply(lambda x: x[i]) - - # Drop the original vector column if needed - if drop_col_vector: - self.__df = self.__df.drop(columns=[col_vector_name]) + def _features_to_array(self) -> np.array: + """ + Collect features from FSDataFrame as an array. + `Warning`: This method will collect the entire DataFrame into the driver. + Uses this method on small datasets only (e.g., after filtering or splitting the data) - return self.__df + :return: Numpy array + """ + sdf = self.get_df().select(*self.get_features_names()) + a = np.array(sdf.collect()) + return a + def _array_to_features(self, a: np.array) -> pd.DataFrame: + """ + Convert a Numpy array to a DataFrame with features as columns. + :param a: Numpy array + :return: Pandas DataFrame + """ + return pd.DataFrame(a, columns=self.get_features_names()) + +# +# def _assemble_column_vector(self, +# input_feature_cols: List[str], +# output_column_vector: str = 'features', +# drop_input_cols: bool = True) -> pd.DataFrame: +# """ +# Assemble features (columns) from DataFrame into a column of type Numpy array. +# +# :param drop_input_cols: Boolean flag to drop the input feature columns. +# :param input_feature_cols: List of feature column names. +# :param output_column_vector: Name of the output column that will contain the combined vector. +# :param sdf: Pandas DataFrame +# +# :return: DataFrame with column of type Numpy array. +# """ +# +# # Combine the input columns into a single vector (Numpy array) +# self.__df[output_column_vector] = self.__df[input_feature_cols].apply(lambda row: np.array(row), axis=1) +# +# # Drop input columns if flag is set to True +# if drop_input_cols: +# return self.__df.drop(columns=input_feature_cols) +# else: +# return self.__df +# +# +# def _disassemble_column_vector(self, +# features_cols: List[str], +# col_vector_name: str, +# drop_col_vector: bool = True) -> pd.DataFrame: +# """ +# Convert a column of Numpy arrays in DataFrame to individual columns (a.k.a features). +# This is the reverse operation of `_assemble_column_vector`. +# +# :param features_cols: List of new feature column names. +# :param col_vector_name: Name of the column that contains the vector (Numpy array). +# :param drop_col_vector: Boolean flag to drop the original vector column. +# :return: DataFrame with individual feature columns. +# """ +# +# # Unpack the vector (Numpy array) into individual columns +# for i, feature in enumerate(features_cols): +# self.__df[feature] = self.__df[col_vector_name].apply(lambda x: x[i]) +# +# # Drop the original vector column if needed +# if drop_col_vector: +# self.__df = self.__df.drop(columns=[col_vector_name]) +# +# return self.__df From cc471f0d24b221a7fdc204560d067e086fcadd02 Mon Sep 17 00:00:00 2001 From: enriquea Date: Fri, 20 Sep 2024 13:49:36 +0200 Subject: [PATCH 06/62] minor refactory --- fsspark/fs/univariate.py | 153 +++++++++++++++++---------------------- requirements.txt | 7 +- 2 files changed, 71 insertions(+), 89 deletions(-) diff --git a/fsspark/fs/univariate.py b/fsspark/fs/univariate.py index 1103762..713cd26 100644 --- a/fsspark/fs/univariate.py +++ b/fsspark/fs/univariate.py @@ -1,133 +1,110 @@ import logging from typing import Dict, List -import pyspark.sql.functions as f -from pyspark.ml.feature import UnivariateFeatureSelector - -from fsspark.fs.constants import ANOVA, UNIVARIATE_CORRELATION, F_REGRESSION, UNIVARIATE_METHODS - -from fsspark.fs.core import FSDataFrame -from fsspark.utils.generic import tag +import pandas as pd +from sklearn.feature_selection import SelectKBest, f_classif, f_regression +from scipy.stats import pearsonr logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") -logger = logging.getLogger("FSSPARK:UNIVARIATE") +logger = logging.getLogger("FS:UNIVARIATE") logger.setLevel(logging.INFO) -@tag("spark implementation") -def compute_univariate_corr(fsdf: FSDataFrame) -> Dict[str, float]: +def compute_univariate_corr(df: pd.DataFrame, features: List[str], label: str) -> Dict[str, float]: """ Compute the correlation coefficient between every column (features) in the input DataFrame and the label (class). - :param fsdf: Input FSDataFrame + :param df: Input DataFrame + :param features: List of feature column names + :param label: Label column name :return: Return dict {feature -> corr} """ - - sdf = fsdf.get_sdf() - features = fsdf.get_features_names() - label = fsdf.get_label_col_name() - - u_corr = sdf.select([f.abs(f.corr(sdf[c], sdf[label])).alias(c) for c in features]) - - return u_corr.first().asDict() + correlations = {feature: abs(df[feature].corr(df[label])) for feature in features} + return correlations -def univariate_correlation_selector(fsdf: FSDataFrame, - corr_threshold: float = 0.3) -> List[str]: +def univariate_correlation_selector(df: pd.DataFrame, features: List[str], label: str, corr_threshold: float = 0.3) -> \ +List[str]: """ - Select features based on its correlation with a label (class), if corr value is less than a specified threshold. - Expected both features and label to be of type numeric. + Select features based on their correlation with a label (class), if the correlation value is less than the specified threshold. - :param fsdf: FSDataFrame - :param corr_threshold: Maximal correlation threshold allowed between feature and label. + :param df: Input DataFrame + :param features: List of feature column names + :param label: Label column name + :param corr_threshold: Maximum allowed correlation threshold - :return: List of selected features names + :return: List of selected feature names """ - d = compute_univariate_corr(fsdf) - selected_features = [k for k in d.keys() if d.get(k) <= corr_threshold] - + correlations = compute_univariate_corr(df, features, label) + selected_features = [feature for feature, corr in correlations.items() if corr <= corr_threshold] return selected_features -@tag("spark implementation") -def univariate_selector(fsdf: FSDataFrame, - label_type: str = 'categorical', - selection_mode: str = 'percentile', - selection_threshold: float = 0.8, - **kwargs) -> List[str]: +def univariate_selector(df: pd.DataFrame, features: List[str], label: str, label_type: str = 'categorical', + selection_mode: str = 'percentile', selection_threshold: float = 0.8) -> List[str]: """ - Wrapper for `UnivariateFeatureSelector`. - See https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.UnivariateFeatureSelector.html - - Only continues features are supported. If label is categorical, ANOVA test is used. If label is of type continues - then an F-regression test is used. + Wrapper for scikit-learn's `SelectKBest` feature selector. + If the label is categorical, ANOVA test is used; if continuous, F-regression test is used. - :param fsdf: Input FSDataFrame - :param label_type: Type of label. Possible values are 'categorical' or 'continuous'. - :param selection_mode: Mode for feature selection. Possible values are 'numTopFeatures' or 'percentile'. - :param selection_threshold: Number of features to select or the percentage of features to select. + :param df: Input DataFrame + :param features: List of feature column names + :param label: Label column name + :param label_type: Type of label ('categorical' or 'continuous') + :param selection_mode: Mode for feature selection ('percentile' or 'k_best') + :param selection_threshold: Number of features to select or the percentage of features - :return: List of selected features names + :return: List of selected feature names """ - vector_col_name = 'features' - sdf = fsdf.get_sdf_vector(output_column_vector=vector_col_name) - label = fsdf.get_label_col_name() + X = df[features].values + y = df[label].values - # set selector if label_type == 'categorical': - # TODO: print msg to logger with the method being used here... - print("ANOVA (F-classification) univariate feature selection") + logger.info("ANOVA (F-classification) univariate feature selection") + selector = SelectKBest(score_func=f_classif) elif label_type == 'continuous': - # TODO: print msg to logger with the method being used here... - print("F-value (F-regression) univariate feature selection") + logger.info("F-value (F-regression) univariate feature selection") + selector = SelectKBest(score_func=f_regression) + else: + raise ValueError("`label_type` must be one of 'categorical' or 'continuous'") + + if selection_mode == 'percentile': + selector.set_params(k='all') # We'll handle the percentile threshold manually + selector.fit(X, y) + scores = selector.scores_ + selected_indices = [i for i, score in enumerate(scores) if score >= selection_threshold * max(scores)] else: - raise ValueError("`label_type` must be one of categorical or continuous") - - selector = UnivariateFeatureSelector(**kwargs) - (selector - .setLabelType(label_type) - .setFeaturesCol(vector_col_name) - .setFeatureType("continuous") - .setOutputCol("selectedFeatures") - .setLabelCol(label) - .setSelectionMode(selection_mode) - .setSelectionThreshold(selection_threshold) - ) - - model = selector.fit(sdf) - selected_features_indices = model.selectedFeatures - selected_features = fsdf.get_features_by_index(selected_features_indices) + selector.set_params(k=int(selection_threshold)) + selector.fit(X, y) + selected_indices = selector.get_support(indices=True) + selected_features = [features[i] for i in selected_indices] return selected_features -@tag("spark implementation") -def univariate_filter(fsdf: FSDataFrame, - univariate_method: str = 'u_corr', - **kwargs) -> FSDataFrame: +def univariate_filter(df: pd.DataFrame, features: List[str], label: str, univariate_method: str = 'u_corr', + **kwargs) -> pd.DataFrame: """ Filter features after applying a univariate feature selector method. - :param fsdf: Input FSDataFrame - :param univariate_method: Univariate selector method. - Possible values are 'u_corr', 'anova' (categorical label) - or 'f_regression' (continuous label). + :param df: Input DataFrame + :param features: List of feature column names + :param label: Label column name + :param univariate_method: Univariate selector method ('u_corr', 'anova', 'f_regression') - :return: Filtered FSDataFrame + :return: Filtered DataFrame with selected features """ - if univariate_method == ANOVA: - selected_features = univariate_selector(fsdf, label_type='categorical', **kwargs) - elif univariate_method == F_REGRESSION: - selected_features = univariate_selector(fsdf, label_type='continuous', **kwargs) - elif univariate_method == UNIVARIATE_CORRELATION: - selected_features = univariate_correlation_selector(fsdf, **kwargs) + if univariate_method == 'anova': + selected_features = univariate_selector(df, features, label, label_type='categorical', **kwargs) + elif univariate_method == 'f_regression': + selected_features = univariate_selector(df, features, label, label_type='continuous', **kwargs) + elif univariate_method == 'u_corr': + selected_features = univariate_correlation_selector(df, features, label, **kwargs) else: - raise ValueError(f"Univariate method {univariate_method} not supported. " - f"Expected one of {UNIVARIATE_METHODS.keys()}") + raise ValueError(f"Univariate method {univariate_method} not supported.") - logger.info(f"Applying univariate filter {univariate_method}.") + logger.info(f"Applying univariate filter using method: {univariate_method}") - return fsdf.filter_features(selected_features, keep=True) + return df[selected_features + [label]] # Return DataFrame with selected features and label column diff --git a/requirements.txt b/requirements.txt index de6efc3..a5aa099 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,9 @@ networkx~=2.8.7 numpy~=1.23.4 setuptools~=65.5.0 pandas~=1.5.1 -pyarrow~=8.0.0 \ No newline at end of file +pyarrow~=8.0.0 +fsspark>=0.0.1 +scikit-learn>=1.2.2 +scipy>=1.9.3 +matplotlib>=3.6.2 +seaborn>=0.12.1 \ No newline at end of file From 471dafa9806fb43ebdb64475d90f2cd1abc1b139 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 20 Sep 2024 13:59:09 +0200 Subject: [PATCH 07/62] first iteration of pandas fdataframe.py --- requirements.txt | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index de6efc3..757aeef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ -pyspark~=3.3.0 -networkx~=2.8.7 -numpy~=1.23.4 -setuptools~=65.5.0 -pandas~=1.5.1 -pyarrow~=8.0.0 \ No newline at end of file +networkx +numpy +setuptools +pandas +pyarrow \ No newline at end of file From 174196a853439d24683e109fd117fb5e12453af0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 20 Sep 2024 14:55:19 +0200 Subject: [PATCH 08/62] first iteration of pandas fdataframe.py --- fsspark/fs/fdataframe.py | 80 +++++++++++-------------------- fsspark/tests/test_fsdataframe.py | 4 +- 2 files changed, 30 insertions(+), 54 deletions(-) diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index 21cc06c..cf590b7 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd from pandas import DataFrame, Series -from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler +from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler, LabelEncoder logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("pickfeat") @@ -34,8 +34,6 @@ def __init__( sample_col: str = None, label_col: str = None, row_index_col: Optional[str] = '_row_index', - parse_col_names: bool = False, - parse_features: bool = False, ): """ Create an instance of FSDataFrame. @@ -47,49 +45,29 @@ def __init__( :param sample_col: Sample id column name :param label_col: Sample label column name :param row_index_col: Optional. Column name of row indices. - :param parse_col_names: Replace dots (.) in column names with underscores. - :param parse_features: Coerce all features to float. """ - self.__sample_col = sample_col - self.__label_col = label_col - self.__row_index_col = row_index_col - self.__df = df - - # check input dataframe - self._check_df() - - # replace dots in column names, if any. - if parse_col_names: - self.__df = self.__df.toDF(*(c.replace('.', '_') for c in self.__df.columns)) - - # If the specified row index column name does not exist, add row index to the dataframe - if self.__row_index_col not in self.__df.columns: - self.__df = self._add_row_index(index_name=self.__row_index_col) - - if parse_features: - # coerce all features to float - non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_col] - feature_cols = [c for c in self.__df.columns if c not in non_features_cols] - self.__df = self.__df.withColumns({c: self.__df[c].cast('float') for c in feature_cols}) - - self.__indexed_features = self._set_indexed_cols() - self.__indexed_instances = self._set_indexed_rows() + if sample_col is None: + self.__sample_col = None + self.__samples = [] + logging.info("No sample column specified.") + else: + self.__sample_col = sample_col + self.__samples = df[sample_col].tolist() + df = df.drop(columns=[sample_col]) - def _check_df(self): - """ - Check if input DataFrame meet the minimal requirements to feed an FS pipeline. - :return: None - """ - col_names = self.__df.columns - if self.__sample_col not in col_names: - raise ValueError(f"Column sample name {self.__sample_col} not found...") - elif self.__label_col not in col_names: - raise ValueError(f"Column label name {self.__label_col} not found...") - elif not isinstance(self.__row_index_col, str): - raise ValueError("Row index column name must be a valid string...") + if label_col is None: + raise ValueError("No label column specified. A class/label column is required.") else: - pass + self.__label_col = label_col + self.__labels = df[label_col].tolist() + label_encoder = LabelEncoder() + self.__labels_matrix = label_encoder.fit_transform(df[label_col]).tolist() + df = df.drop(columns=[label_col]) + + self.__original_features = df.columns.tolist() + numerical_df = df.select_dtypes(include=[np.number]) + self.__matrix = numerical_df.to_numpy(dtype=np.float32) def _set_indexed_cols(self) -> Series: """ @@ -97,7 +75,7 @@ def _set_indexed_cols(self) -> Series: :return: Pandas on (PoS) Series """ non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_col] - features = [f for f in self.__df.columns if f not in non_features_cols] + features = [f for f in self.__matrix.columns if f not in non_features_cols] return Series(features) def _set_indexed_rows(self) -> pd.Series: @@ -109,8 +87,8 @@ def _set_indexed_rows(self) -> pd.Series: """ # Extract the label and row index columns from the DataFrame - labels = self.__df[self.__label_col] - row_indices = self.__df[self.__row_index_col] + labels = self.__matrix[self.__label_col] + row_indices = self.__matrix[self.__row_index_col] # Create a Pandas Series with row_indices as index and labels as values return pd.Series(data=labels.values, index=row_indices.values) @@ -161,7 +139,7 @@ def get_sdf_vector(self, output_column_vector: str = 'features') -> pd.DataFrame :return: DataFrame """ - sdf = self.__df + sdf = self.__matrix features_cols = self.get_features_names() sdf_vector = _assemble_column_vector(sdf, input_feature_cols=features_cols, @@ -198,10 +176,10 @@ def to_psdf(self) -> DataFrame: Convert DataFrame to Pandas on DataFrame :return: Pandas on DataFrame """ - return self.__df.pandas_api() + return self.__matrix.pandas_api() def get_df(self) -> DataFrame: - return self.__df + return self.__matrix def get_sample_col_name(self) -> str: """ @@ -236,8 +214,8 @@ def _add_row_index(self, index_name: str = '_row_index') -> pd.DataFrame: :return: DataFrame with extra column of row indices. """ # Add a new column with unique row indices using a range - self.__df[index_name] = list(range(len(self.__df))) - return self.__df + self.__matrix[index_name] = list(range(len(self.__matrix))) + return self.__matrix def count_features(self) -> int: """ @@ -364,7 +342,7 @@ def split_df(self, """ label_col = self.get_label_col_name() - df = self.__df.copy() + df = self.__matrix.copy() # Create a temporary label column for sampling tmp_label_col = '_tmp_label_indexed' diff --git a/fsspark/tests/test_fsdataframe.py b/fsspark/tests/test_fsdataframe.py index 09fc2ac..5b56a23 100644 --- a/fsspark/tests/test_fsdataframe.py +++ b/fsspark/tests/test_fsdataframe.py @@ -19,9 +19,7 @@ def test_initializes_fsdataframe(): sample_col='sample_id', label_col='label', row_index_col='_row_index', - parse_col_names=False, - parse_features=False ) # Assertions to check if the initialization is correct - assert (fs_df.get_sdf(), df) \ No newline at end of file + assert isinstance(fs_df, FSDataFrame) \ No newline at end of file From fa0d32013978080acea16c4072c68d1d31b53781 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 20 Sep 2024 15:14:25 +0200 Subject: [PATCH 09/62] first iteration of pandas fdataframe.py --- fsspark/fs/fdataframe.py | 370 ++++-------------------------- fsspark/tests/test_fsdataframe.py | 32 ++- 2 files changed, 74 insertions(+), 328 deletions(-) diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index cf590b7..fc58933 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -1,6 +1,7 @@ import logging from typing import Optional, Union, List, Set, Tuple +import numpy import numpy as np import pandas as pd from pandas import DataFrame, Series @@ -33,7 +34,6 @@ def __init__( df: DataFrame, sample_col: str = None, label_col: str = None, - row_index_col: Optional[str] = '_row_index', ): """ Create an instance of FSDataFrame. @@ -44,7 +44,6 @@ def __init__( :param df: Pandas DataFrame :param sample_col: Sample id column name :param label_col: Sample label column name - :param row_index_col: Optional. Column name of row indices. """ if sample_col is None: @@ -68,117 +67,9 @@ def __init__( self.__original_features = df.columns.tolist() numerical_df = df.select_dtypes(include=[np.number]) self.__matrix = numerical_df.to_numpy(dtype=np.float32) + self.__is_scaled = (False, None) - def _set_indexed_cols(self) -> Series: - """ - Create a distributed indexed Series representing features. - :return: Pandas on (PoS) Series - """ - non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_col] - features = [f for f in self.__matrix.columns if f not in non_features_cols] - return Series(features) - - def _set_indexed_rows(self) -> pd.Series: - """ - Create an indexed Series representing sample labels. - It will use existing row indices from the DataFrame. - - :return: Pandas Series - """ - - # Extract the label and row index columns from the DataFrame - labels = self.__matrix[self.__label_col] - row_indices = self.__matrix[self.__row_index_col] - - # Create a Pandas Series with row_indices as index and labels as values - return pd.Series(data=labels.values, index=row_indices.values) - - def get_features_indexed(self) -> Series: - """ - Return features names with indices as a Series. - :return: Indexed Series. - """ - return self.__indexed_features - - def get_sample_label_indexed(self) -> Series: - """ - Return sample labels with indices as a Series. - :return: Indexed Series. - """ - return self.__indexed_instances - - def get_features_names(self) -> list: - """ - Get features names from DataFrame. - :return: List of features names - """ - return self.__indexed_features.tolist() - - def get_features_by_index(self, indices: Union[List[int], Set[int]]) -> List[str]: - """ - Get features names by specified index from DataFrame. - - :param: indices: List of feature indexes - :return: List of features names - """ - return self.__indexed_features.loc[indices].tolist() - - def get_sample_label(self) -> list: - """ - Get samples class (label) from DataFrame. - :return: List of sample class labels - """ - return self.__indexed_instances.tolist() - - def get_sdf_vector(self, output_column_vector: str = 'features') -> pd.DataFrame: - """ - Return a dataframe with feature columns assembled into a column vector (a.k.a. Dense Vector column). - This format is required as input for multiple algorithms from MLlib API. - - :param: output_column_vector: Name of the output column vector. - :return: DataFrame - """ - - sdf = self.__matrix - features_cols = self.get_features_names() - sdf_vector = _assemble_column_vector(sdf, - input_feature_cols=features_cols, - output_column_vector=output_column_vector) - - return sdf_vector - - def get_sdf_and_label(self, - output_column_vector: str = 'features') -> Tuple[DataFrame, str, str]: - """ - Extracts the DataFrame and label column name from FSDataFrame. - - :param: output_column_vector: Name of the output column vector. - :return: A tuple containing the DataFrame and the label column name. - """ - sdf = self.get_sdf_vector(output_column_vector=output_column_vector) - label_col = self.get_label_col_name() - return sdf, label_col, output_column_vector - - def _collect_features_as_array(self) -> np.array: - """ - Collect features from FSDataFrame as an array. - `Warning`: This method will collect the entire DataFrame into the driver. - Uses this method on small datasets only (e.g., after filtering or splitting the data) - - :return: Numpy array - """ - sdf = self.get_df().select(*self.get_features_names()) - a = np.array(sdf.collect()) - return a - - def to_psdf(self) -> DataFrame: - """ - Convert DataFrame to Pandas on DataFrame - :return: Pandas on DataFrame - """ - return self.__matrix.pandas_api() - - def get_df(self) -> DataFrame: + def get_feature_matrix(self) -> numpy.array: return self.__matrix def get_sample_col_name(self) -> str: @@ -197,105 +88,23 @@ def get_label_col_name(self) -> str: """ return self.__label_col - def get_row_index_name(self) -> str: - """ - Return row (instances) id column name. - - :return: Row id column name. - """ - return self.__row_index_col - - def _add_row_index(self, index_name: str = '_row_index') -> pd.DataFrame: - """ - Add row indices to DataFrame. - Unique indices of type integer will be added in non-consecutive increasing order. - - :param index_name: Name of the row index column. - :return: DataFrame with extra column of row indices. - """ - # Add a new column with unique row indices using a range - self.__matrix[index_name] = list(range(len(self.__matrix))) - return self.__matrix - def count_features(self) -> int: """ Return the number of features. - :return: Number of features. """ - return self.get_features_indexed().size + return self.__matrix.shape[1] def count_instances(self) -> int: """ Return the number of samples (instances). - :return: Number of samples. """ - return self.get_sample_label_indexed().size - - def filter_features(self, features: List[str], keep: bool = True) -> 'FSDataFrame': - """ - Select or drop specified features from DataFrame. - - :param features: List of features names to drop or select from DataFrame - :param keep: If True (default), keep features. Remove otherwise. - - :return: FSDataFrame - """ - - current_features = self.get_features_names() - if len(set(current_features).intersection(features)) == 0: - logger.warning(f"There is no overlap of specified features with the input data frame.\n" - f"Skipping this filter step...") - return self - - count_a = self.count_features() - sdf = self.get_df() - - if keep: - sdf = sdf.select( - self.__sample_col, - self.__label_col, - self.__row_index_col, - *features) - else: - sdf = sdf.drop(*features) - - fsdf_filtered = self.update(sdf, self.__sample_col, self.__label_col, self.__row_index_col) - count_b = fsdf_filtered.count_features() - - logger.info(f"{count_b} features out of {count_a} remain after applying this filter...") - - return fsdf_filtered - - def filter_features_by_index(self, feature_indices: Set[int], keep: bool = True) -> 'FSDataFrame': - """ - Select or drop specified features from DataFrame by its indices. - - :param feature_indices: Set of features indices to drop or select from DataFrame - :param keep: If True (default), keep features. Remove otherwise. + return self.__matrix.shape[0] - :return: FSDataFrame + def scale_features(self, scaler_method: str = 'standard', **kwargs) -> bool: """ - feature_names = self.get_features_by_index(feature_indices) - return self.filter_features(feature_names, keep=keep) - - def get_label_strata(self) -> list: - """ - Get strata from a categorical column in DataFrame. - - :return: List of levels for categorical variable. - """ - levels = self.get_sample_label_indexed().unique().tolist() - number_of_lvs = len(levels) - if number_of_lvs > 20: # TODO: Check if this is a right cutoff. - logger.warning(f"Number of observed levels too high: {number_of_lvs}.\n" - f"Should this variable be considered continuous?") - return levels - - def scale_features(self, scaler_method: str = 'standard', **kwargs) -> 'FSDataFrame': - """ - Scales features in DataFrame + Scales features in the SDataFrame using a specified method. :param scaler_method: One of: min_max, max_abs, standard or robust. :return: FSDataFrame with scaled features. @@ -312,19 +121,16 @@ def scale_features(self, scaler_method: str = 'standard', **kwargs) -> 'FSDataFr else: raise ValueError("`scaler_method` must be one of: min_max, max_abs, standard or robust.") - feature_array = self._features_to_array() - - feature_array = (scaler - .fit(feature_array) - .transform() - ) + # TODO: Scale only the features for now, we have to investigate if we scale cateogrical variables + self.__matrix = scaler.fit_transform(self.__matrix) + self.__is_scaled = (True, scaler_method) + return True - df_scaled = self._array_to_features(feature_array) + def is_scaled(self): + return self.__is_scaled[0] - return self.update(sdf, - self.__sample_col, - self.__label_col, - self.__row_index_name) + def get_scaled_method(self): + return self.__is_scaled[1] def split_df(self, label_type_cat: bool = True, @@ -339,120 +145,34 @@ def split_df(self, :param split_training_factor: Proportion of the training set. Usually, a value between 0.6 and 0.8. :return: Tuple of FSDataFrames. First element is the training set and second element is the testing set. - """ - - label_col = self.get_label_col_name() - df = self.__matrix.copy() - - # Create a temporary label column for sampling - tmp_label_col = '_tmp_label_indexed' + //TODO: To be done. + """ + + # label_col = self.get_label_col_name() + # df = self.__matrix.copy() + # + # # Create a temporary label column for sampling + # tmp_label_col = '_tmp_label_indexed' + # + # if label_type_cat: + # # Use factorize to convert categorical labels to integer indices + # df[tmp_label_col], _ = pd.factorize(df[label_col]) + # else: + # # For continuous labels, create a uniform random column and binarize it + # df['_tmp_uniform_rand'] = np.random.rand(len(df)) + # df[tmp_label_col] = (df['_tmp_uniform_rand'] > 0.5).astype(int) + # df = df.drop(columns=['_tmp_uniform_rand']) + # + # # Perform stratified sampling to get class-balanced training set + # train_df = df.groupby(tmp_label_col, group_keys=False).apply(lambda x: x.sample(frac=split_training_factor)) + # + # # Get the test set by subtracting the training set from the original DataFrame + # test_df = df.drop(train_df.index) + # + # # Drop the temporary label column + # train_df = train_df.drop(columns=[tmp_label_col]) + # test_df = test_df.drop(columns=[tmp_label_col]) + # + # # Return the updated DataFrames + # return self.update(train_df), self.update(test_df) - if label_type_cat: - # Use factorize to convert categorical labels to integer indices - df[tmp_label_col], _ = pd.factorize(df[label_col]) - else: - # For continuous labels, create a uniform random column and binarize it - df['_tmp_uniform_rand'] = np.random.rand(len(df)) - df[tmp_label_col] = (df['_tmp_uniform_rand'] > 0.5).astype(int) - df = df.drop(columns=['_tmp_uniform_rand']) - - # Perform stratified sampling to get class-balanced training set - train_df = df.groupby(tmp_label_col, group_keys=False).apply(lambda x: x.sample(frac=split_training_factor)) - - # Get the test set by subtracting the training set from the original DataFrame - test_df = df.drop(train_df.index) - - # Drop the temporary label column - train_df = train_df.drop(columns=[tmp_label_col]) - test_df = test_df.drop(columns=[tmp_label_col]) - - # Return the updated DataFrames - return self.update(train_df), self.update(test_df) - - @classmethod - def update(cls, - df: DataFrame, - sample_col: str, - label_col: str, - row_index_col: str): - """ - Create a new instance of FSDataFrame. - - :param df: DataFrame - :param sample_col: Name of sample id column. - :param label_col: Name of sample label column. - :param row_index_col: Name of row (instances) id column. - - :return: FSDataFrame - """ - return cls(df, sample_col, label_col, row_index_col) - - def _features_to_array(self) -> np.array: - """ - Collect features from FSDataFrame as an array. - `Warning`: This method will collect the entire DataFrame into the driver. - Uses this method on small datasets only (e.g., after filtering or splitting the data) - - :return: Numpy array - """ - sdf = self.get_df().select(*self.get_features_names()) - a = np.array(sdf.collect()) - return a - - def _array_to_features(self, a: np.array) -> pd.DataFrame: - """ - Convert a Numpy array to a DataFrame with features as columns. - :param a: Numpy array - :return: Pandas DataFrame - """ - return pd.DataFrame(a, columns=self.get_features_names()) - -# -# def _assemble_column_vector(self, -# input_feature_cols: List[str], -# output_column_vector: str = 'features', -# drop_input_cols: bool = True) -> pd.DataFrame: -# """ -# Assemble features (columns) from DataFrame into a column of type Numpy array. -# -# :param drop_input_cols: Boolean flag to drop the input feature columns. -# :param input_feature_cols: List of feature column names. -# :param output_column_vector: Name of the output column that will contain the combined vector. -# :param sdf: Pandas DataFrame -# -# :return: DataFrame with column of type Numpy array. -# """ -# -# # Combine the input columns into a single vector (Numpy array) -# self.__df[output_column_vector] = self.__df[input_feature_cols].apply(lambda row: np.array(row), axis=1) -# -# # Drop input columns if flag is set to True -# if drop_input_cols: -# return self.__df.drop(columns=input_feature_cols) -# else: -# return self.__df -# -# -# def _disassemble_column_vector(self, -# features_cols: List[str], -# col_vector_name: str, -# drop_col_vector: bool = True) -> pd.DataFrame: -# """ -# Convert a column of Numpy arrays in DataFrame to individual columns (a.k.a features). -# This is the reverse operation of `_assemble_column_vector`. -# -# :param features_cols: List of new feature column names. -# :param col_vector_name: Name of the column that contains the vector (Numpy array). -# :param drop_col_vector: Boolean flag to drop the original vector column. -# :return: DataFrame with individual feature columns. -# """ -# -# # Unpack the vector (Numpy array) into individual columns -# for i, feature in enumerate(features_cols): -# self.__df[feature] = self.__df[col_vector_name].apply(lambda x: x[i]) -# -# # Drop the original vector column if needed -# if drop_col_vector: -# self.__df = self.__df.drop(columns=[col_vector_name]) -# -# return self.__df diff --git a/fsspark/tests/test_fsdataframe.py b/fsspark/tests/test_fsdataframe.py index 5b56a23..6f10d32 100644 --- a/fsspark/tests/test_fsdataframe.py +++ b/fsspark/tests/test_fsdataframe.py @@ -17,9 +17,35 @@ def test_initializes_fsdataframe(): fs_df = FSDataFrame( df=df, sample_col='sample_id', - label_col='label', - row_index_col='_row_index', + label_col='label' ) # Assertions to check if the initialization is correct - assert isinstance(fs_df, FSDataFrame) \ No newline at end of file + assert isinstance(fs_df, FSDataFrame) + + assert fs_df.get_sample_col_name() == 'sample_id' + +def test_scaler_df(): + + # Create a sample DataFrame + data = { + 'sample_id': [1, 2, 3], + 'label': ['A', 'B', 'C'], + 'feature1': [0.1, 0.2, 0.3], + 'feature2': [1.1, 1.2, 1.3] + } + df = pd.DataFrame(data) + + # Initialize FSDataFrame + fs_df = FSDataFrame( + df=df, + sample_col='sample_id', + label_col='label' + ) + + # Scale the DataFrame + fs_df.scale_features(scaler_method='standard') + + # Assertions to check if the scaling is correct + assert fs_df.is_scaled() == True + assert fs_df.get_scaled_method() == 'standard' \ No newline at end of file From 0a8080bd24a672e596e7414ce0e85078b3c3b25a Mon Sep 17 00:00:00 2001 From: enriquea Date: Fri, 20 Sep 2024 18:14:19 +0200 Subject: [PATCH 10/62] added test univariate corr --- fsspark/tests/test_univariate_methods.py | 44 ++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 fsspark/tests/test_univariate_methods.py diff --git a/fsspark/tests/test_univariate_methods.py b/fsspark/tests/test_univariate_methods.py new file mode 100644 index 0000000..bf3a123 --- /dev/null +++ b/fsspark/tests/test_univariate_methods.py @@ -0,0 +1,44 @@ +import unittest + +import pandas as pd +from fsspark.utils.datasets import get_tnbc_data_path +from fsspark.fs.fdataframe import FSDataFrame + +from fsspark.fs.univariate import univariate_filter + + +class UnivariateMethodsTest(unittest.TestCase): + """ + Define testing methods for FSDataFrame class. + """ + + def setUp(self) -> None: + # import tsv as pandas DataFrame + self.df = pd.read_csv(get_tnbc_data_path(), sep='\t') + + # create FSDataFrame instance + self.fsdf = FSDataFrame(df=self.df, + sample_col='Sample', + label_col='label') + + def tearDown(self) -> None: + pass + + def test_univariate_filter_corr(self): + """ + Test univariate_filter method with 'u_corr' method. + :return: None + """ + + fsdf = self.fsdf + fsdf_filtered = univariate_filter(fsdf, + univariate_method='u_corr', + corr_threshold=0.3) + + self.assertEqual(fsdf.count_features(), 500) + self.assertEqual(fsdf_filtered.count_features(), 211) + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv('filtered_tnbc_data.csv', index=False) + From 8558656d40f9a91e8a4ccadff44d7bd6184e4cda Mon Sep 17 00:00:00 2001 From: enriquea Date: Fri, 20 Sep 2024 18:14:51 +0200 Subject: [PATCH 11/62] refactor univariate methods (corr) --- fsspark/fs/univariate.py | 67 ++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/fsspark/fs/univariate.py b/fsspark/fs/univariate.py index 713cd26..2b83cd0 100644 --- a/fsspark/fs/univariate.py +++ b/fsspark/fs/univariate.py @@ -1,43 +1,48 @@ import logging from typing import Dict, List +import numpy as np import pandas as pd from sklearn.feature_selection import SelectKBest, f_classif, f_regression -from scipy.stats import pearsonr + +from fsspark.fs.fdataframe import FSDataFrame logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("FS:UNIVARIATE") logger.setLevel(logging.INFO) -def compute_univariate_corr(df: pd.DataFrame, features: List[str], label: str) -> Dict[str, float]: +def compute_univariate_corr(df: FSDataFrame) -> Dict[int, float]: """ - Compute the correlation coefficient between every column (features) in the input DataFrame and the label (class). + Compute the correlation coefficient between every column (features) in the input NumPy array and the label (class) + using a dictionary comprehension. - :param df: Input DataFrame - :param features: List of feature column names - :param label: Label column name - - :return: Return dict {feature -> corr} + :param df: Input FSDataFrame + :return: Return dict {feature_index -> corr} """ - correlations = {feature: abs(df[feature].corr(df[label])) for feature in features} - return correlations + + f_matrix = df.get_feature_matrix() # get the feature matrix + labels = df.get_label_vector() # get the label vector + features_index = range(f_matrix.shape[1]) # get the feature index + + return { + f_index: abs(np.corrcoef(f_matrix[:, f_index], labels)[0, 1]) + for f_index in features_index + } -def univariate_correlation_selector(df: pd.DataFrame, features: List[str], label: str, corr_threshold: float = 0.3) -> \ -List[str]: +def univariate_correlation_selector(df: FSDataFrame, corr_threshold: float = 0.3) -> List[int]: """ - Select features based on their correlation with a label (class), if the correlation value is less than the specified threshold. + Select features based on their correlation with a label (class), if the correlation value is less than the specified + threshold. :param df: Input DataFrame - :param features: List of feature column names - :param label: Label column name :param corr_threshold: Maximum allowed correlation threshold - :return: List of selected feature names + :return: List of selected feature indices """ - correlations = compute_univariate_corr(df, features, label) - selected_features = [feature for feature, corr in correlations.items() if corr <= corr_threshold] + correlations = compute_univariate_corr(df) + selected_features = [feature_index for feature_index, corr in correlations.items() if corr <= corr_threshold] return selected_features @@ -83,28 +88,38 @@ def univariate_selector(df: pd.DataFrame, features: List[str], label: str, label return selected_features -def univariate_filter(df: pd.DataFrame, features: List[str], label: str, univariate_method: str = 'u_corr', - **kwargs) -> pd.DataFrame: +def univariate_filter(df: FSDataFrame, + univariate_method: str = 'u_corr', + **kwargs) -> FSDataFrame: """ Filter features after applying a univariate feature selector method. :param df: Input DataFrame - :param features: List of feature column names - :param label: Label column name :param univariate_method: Univariate selector method ('u_corr', 'anova', 'f_regression') :return: Filtered DataFrame with selected features """ + selected_features = [] + if univariate_method == 'anova': - selected_features = univariate_selector(df, features, label, label_type='categorical', **kwargs) + # TODO: Implement ANOVA selector + # selected_features = univariate_selector(df, features, label, label_type='categorical', **kwargs) + pass elif univariate_method == 'f_regression': - selected_features = univariate_selector(df, features, label, label_type='continuous', **kwargs) + # TODO: Implement F-regression selector + # selected_features = univariate_selector(df, features, label, label_type='continuous', **kwargs) + pass elif univariate_method == 'u_corr': - selected_features = univariate_correlation_selector(df, features, label, **kwargs) + selected_features = univariate_correlation_selector(df, **kwargs) else: raise ValueError(f"Univariate method {univariate_method} not supported.") logger.info(f"Applying univariate filter using method: {univariate_method}") - return df[selected_features + [label]] # Return DataFrame with selected features and label column + if len(selected_features) == 0: + logger.warning("No features selected. Returning original DataFrame.") + return df + else: + logger.info(f"Selected {len(selected_features)} features...") + return df.select_features_by_index(selected_features) From d2ca24d0202bdb597eab87e154777bc1b577c555 Mon Sep 17 00:00:00 2001 From: enriquea Date: Fri, 20 Sep 2024 18:15:10 +0200 Subject: [PATCH 12/62] update --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 757aeef..574afb4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ networkx numpy setuptools pandas -pyarrow \ No newline at end of file +scikit-learn \ No newline at end of file From 516b4c671613ae9f15b32666935fdd3b4711b656 Mon Sep 17 00:00:00 2001 From: enriquea Date: Fri, 20 Sep 2024 18:16:13 +0200 Subject: [PATCH 13/62] added methods to select features and update FSDataFrame --- fsspark/fs/fdataframe.py | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index fc58933..f579b8f 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -72,6 +72,9 @@ def __init__( def get_feature_matrix(self) -> numpy.array: return self.__matrix + def get_label_vector(self) -> numpy.array: + return self.__labels_matrix + def get_sample_col_name(self) -> str: """ Return sample id column name. @@ -132,6 +135,56 @@ def is_scaled(self): def get_scaled_method(self): return self.__is_scaled[1] + def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame': + """ + Keep only the specified features (by index) and return an updated instance of FSDataFrame. + + :param feature_indexes: List of feature column indices to keep. + :return: A new FSDataFrame instance with only the selected features. + """ + # Filter the feature matrix to retain only the selected columns (features) + updated_matrix = self.__matrix[:, feature_indexes] + + # Filter the original feature names to retain only the selected ones + updated_features = [self.__original_features[i] for i in feature_indexes] + + # Create a new DataFrame with the retained features and their names + updated_df = pd.DataFrame(updated_matrix, columns=updated_features) + + # Reattach the sample column (if it exists) + if self.__sample_col: + updated_df[self.__sample_col] = self.__samples + + # Reattach the label column + updated_df[self.__label_col] = self.__labels + + # Return a new instance of FSDataFrame with the updated data + return FSDataFrame(updated_df, sample_col=self.__sample_col, label_col=self.__label_col) + + def to_pandas(self) -> DataFrame: + """ + Return the DataFrame representation of the FSDataFrame. + + :return: Pandas DataFrame. + """ + + df = pd.DataFrame() + + # Reattach the sample column (if it exists) + if self.__sample_col: + df[self.__sample_col] = self.__samples + + # Reattach the label column + df[self.__label_col] = self.__labels + + # Create a DataFrame from the feature matrix + df_features = pd.DataFrame(self.__matrix, columns=self.__original_features) + + # Concatenate the features DataFrame + df = pd.concat([df, df_features], axis=1) + + return df + def split_df(self, label_type_cat: bool = True, split_training_factor: float = 0.7) -> Tuple['FSDataFrame', 'FSDataFrame']: From a787707164602cdcac7ed71db3bac5600d660000 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 20 Sep 2024 18:43:17 +0200 Subject: [PATCH 14/62] move from unitests to pytests --- fsspark/fs/fdataframe.py | 4 +- fsspark/tests/generate_big_tests.py | 51 ++++++++++++++++++++++++ fsspark/tests/test_fsdataframe.py | 1 - fsspark/tests/test_univariate_methods.py | 43 ++++++-------------- 4 files changed, 66 insertions(+), 33 deletions(-) create mode 100644 fsspark/tests/generate_big_tests.py diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index f579b8f..399c200 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -1,10 +1,10 @@ import logging -from typing import Optional, Union, List, Set, Tuple +from typing import List, Tuple import numpy import numpy as np import pandas as pd -from pandas import DataFrame, Series +from pandas import DataFrame from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler, LabelEncoder logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") diff --git a/fsspark/tests/generate_big_tests.py b/fsspark/tests/generate_big_tests.py new file mode 100644 index 0000000..eb7d677 --- /dev/null +++ b/fsspark/tests/generate_big_tests.py @@ -0,0 +1,51 @@ +import logging + +import pandas as pd +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq + +def test_generate_big_dataset(): + # Parameters for the dataset + n_samples = 1200 + n_features = 10_000 + chunk_size = 100 # Adjust chunk size for memory efficiency + + # Generate sample IDs and labels + sample_ids = np.arange(1, n_samples + 1) + labels = np.random.choice(['LV', 'RV', 'LA', 'RA'], size=n_samples) + + # Parquet schema definition + schema = pa.schema([pa.field('sample_id', pa.int32()), pa.field('label', pa.string())] + + [pa.field(f'feature{i}', pa.float32()) for i in range(1, n_features + 1)]) + + # Create an empty Parquet file + output_file = 'large_dataset_optimized_samples_{}_features_{}.parquet'.format(n_samples, n_features) + with pq.ParquetWriter(output_file, schema, compression='snappy') as writer: + # Process in chunks to reduce memory usage + for chunk_start in range(0, n_samples, chunk_size): + chunk_end = min(chunk_start + chunk_size, n_samples) + + # Generate chunk of samples and labels + chunk_sample_ids = sample_ids[chunk_start:chunk_end] + chunk_labels = labels[chunk_start:chunk_end] + + # Generate chunk of features + chunk_features = {f'feature{i}': np.random.rand(chunk_end - chunk_start) for i in range(1, n_features + 1)} + + # Create DataFrame chunk + chunk_data = { + 'sample_id': chunk_sample_ids, + 'label': chunk_labels + } + chunk_data.update(chunk_features) + + df_chunk = pd.DataFrame(chunk_data) + + # Convert to PyArrow Table and write chunk to Parquet file + table_chunk = pa.Table.from_pandas(df_chunk, schema=schema) + writer.write_table(table_chunk) + logging.info(f'Processed samples {chunk_start + 1} to {chunk_end}') + + print("Optimized Parquet file created successfully!") + diff --git a/fsspark/tests/test_fsdataframe.py b/fsspark/tests/test_fsdataframe.py index 6f10d32..38f009c 100644 --- a/fsspark/tests/test_fsdataframe.py +++ b/fsspark/tests/test_fsdataframe.py @@ -1,4 +1,3 @@ -import pytest import pandas as pd from fsspark.fs.fdataframe import FSDataFrame diff --git a/fsspark/tests/test_univariate_methods.py b/fsspark/tests/test_univariate_methods.py index bf3a123..f2f2bf8 100644 --- a/fsspark/tests/test_univariate_methods.py +++ b/fsspark/tests/test_univariate_methods.py @@ -1,44 +1,27 @@ -import unittest - import pandas as pd from fsspark.utils.datasets import get_tnbc_data_path from fsspark.fs.fdataframe import FSDataFrame from fsspark.fs.univariate import univariate_filter - -class UnivariateMethodsTest(unittest.TestCase): +def test_univariate_filter_corr(): """ - Define testing methods for FSDataFrame class. + Test univariate_filter method with 'u_corr' method. + :return: None """ - def setUp(self) -> None: - # import tsv as pandas DataFrame - self.df = pd.read_csv(get_tnbc_data_path(), sep='\t') - - # create FSDataFrame instance - self.fsdf = FSDataFrame(df=self.df, - sample_col='Sample', - label_col='label') - - def tearDown(self) -> None: - pass + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep='\t') - def test_univariate_filter_corr(self): - """ - Test univariate_filter method with 'u_corr' method. - :return: None - """ + # create FSDataFrame instance + fs_df = FSDataFrame(df=df,sample_col='Sample',label_col='label') - fsdf = self.fsdf - fsdf_filtered = univariate_filter(fsdf, - univariate_method='u_corr', - corr_threshold=0.3) + fsdf_filtered = univariate_filter(fs_df,univariate_method='u_corr', corr_threshold=0.3) - self.assertEqual(fsdf.count_features(), 500) - self.assertEqual(fsdf_filtered.count_features(), 211) + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 211 - # Export the filtered DataFrame as Pandas DataFrame - df_filtered = fsdf_filtered.to_pandas() - df_filtered.to_csv('filtered_tnbc_data.csv', index=False) + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv('filtered_tnbc_data.csv', index=False) From f75093de00f0e39655bf697f27dd0e0cae11d1e2 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 20 Sep 2024 21:21:32 +0200 Subject: [PATCH 15/62] move from unitests to pytests --- fsspark/tests/test_fsdataframe.py | 55 ++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/fsspark/tests/test_fsdataframe.py b/fsspark/tests/test_fsdataframe.py index 38f009c..726933e 100644 --- a/fsspark/tests/test_fsdataframe.py +++ b/fsspark/tests/test_fsdataframe.py @@ -1,4 +1,9 @@ +import numpy as np import pandas as pd +import matplotlib.pyplot as plt +from memory_profiler import memory_usage +import gc + from fsspark.fs.fdataframe import FSDataFrame def test_initializes_fsdataframe(): @@ -47,4 +52,52 @@ def test_scaler_df(): # Assertions to check if the scaling is correct assert fs_df.is_scaled() == True - assert fs_df.get_scaled_method() == 'standard' \ No newline at end of file + assert fs_df.get_scaled_method() == 'standard' + +def test_memory_fsdataframe(): + def create_test_data(n_samples, n_features): + """Create test data for FSDataFrame.""" + data = np.random.rand(n_samples, n_features) + df = pd.DataFrame(data, columns=[f'feature_{i}' for i in range(n_features)]) + df['sample_id'] = [f'sample_{i}' for i in range(n_samples)] + df['label'] = np.random.choice(['A', 'B'], n_samples) + return df + + def measure_memory_usage(n_samples, n_features): + """Measure memory usage of FSDataFrame for given number of samples and features.""" + df = create_test_data(n_samples, n_features) + mem_usage = memory_usage((FSDataFrame, (df, 'sample_id', 'label')), max_iterations=1)[0] + gc.collect() # Force garbage collection to free memory + return mem_usage + + # Define test cases + feature_sizes = [1000, 5000, 10000, 50000, 100_000, 1_000_000] + sample_sizes = [10, 50, 100, 500, 1000] + + # Measure memory usage for each test case + results = [] + for n_samples in sample_sizes: + for n_features in feature_sizes: + mem_usage = measure_memory_usage(n_samples, n_features) + results.append((n_samples, n_features, mem_usage)) + + # Convert results to DataFrame + results_df = pd.DataFrame(results, columns=['Samples', 'Features', 'Memory (MB)']) + + # Create 2D line plot + plt.figure(figsize=(12, 8)) + + for feature_size in feature_sizes: + data = results_df[results_df['Features'] == feature_size] + plt.plot(data['Samples'], data['Memory (MB)'], marker='o', label=f'{feature_size} Features') + + plt.xlabel('Number of Samples') + plt.ylabel('Memory Usage (MB)') + plt.title('FSDataFrame Memory Usage') + plt.legend() + plt.xscale('log') # Using log scale for x-axis to better visualize the range + plt.tight_layout() + plt.show() + + # Print results table + print(results_df.to_string(index=False)) \ No newline at end of file From f15b4e83720b2e68f2084e3a1c4181d6011fcde0 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 21 Sep 2024 16:06:54 +0100 Subject: [PATCH 16/62] minor changes to store sparse matrices --- fsspark/fs/fdataframe.py | 109 +++++++++++++++++++++++------- fsspark/tests/test_fsdataframe.py | 25 ++++--- requirements.txt | 3 +- 3 files changed, 101 insertions(+), 36 deletions(-) diff --git a/fsspark/fs/fdataframe.py b/fsspark/fs/fdataframe.py index 399c200..fc0effd 100644 --- a/fsspark/fs/fdataframe.py +++ b/fsspark/fs/fdataframe.py @@ -1,10 +1,12 @@ import logging -from typing import List, Tuple +from typing import List, Tuple, Optional import numpy import numpy as np import pandas as pd +import psutil from pandas import DataFrame +from scipy import sparse from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler, LabelEncoder logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") @@ -28,45 +30,97 @@ class FSDataFrame: [...] """ - def __init__( self, - df: DataFrame, - sample_col: str = None, - label_col: str = None, + df: pd.DataFrame, + sample_col: Optional[str] = None, + label_col: Optional[str] = None, + sparse_threshold: float = 0.7, # Threshold for sparsity + memory_threshold: Optional[float] = 0.75 # Proportion of system memory to use for dense arrays ): """ Create an instance of FSDataFrame. - Expected an input DataFrame with 2+N columns. - After specifying sample id and sample label columns, the remaining N columns will be considered as features. + The input DataFrame should contain 2+N columns. After specifying the sample id and label columns, + the remaining N columns will be considered features. The feature columns should contain only numerical data. + The DataFrame is stored in a dense or sparse format based on the sparsity of the data and available memory. - :param df: Pandas DataFrame - :param sample_col: Sample id column name - :param label_col: Sample label column name + :param df: Input Pandas DataFrame + :param sample_col: Column name for sample identifiers (optional) + :param label_col: Column name for labels (required) + :param sparse_threshold: Threshold for sparsity, default is 70%. If the proportion of zero entries + in the feature matrix exceeds this value, the matrix is stored in a sparse format unless memory allows. + :param memory_threshold: Proportion of system memory available to use before deciding on sparse/dense. """ + self.__df = df.copy() + + # Check for necessary columns + columns_to_drop = [] - if sample_col is None: + # Handle sample column + if sample_col: + if sample_col not in df.columns: + raise ValueError(f"Sample column '{sample_col}' not found in DataFrame.") + self.__sample_col = sample_col + self.__samples = df[sample_col].tolist() + columns_to_drop.append(sample_col) + else: self.__sample_col = None self.__samples = [] logging.info("No sample column specified.") - else: - self.__sample_col = sample_col - self.__samples = df[sample_col].tolist() - df = df.drop(columns=[sample_col]) + # Handle label column if label_col is None: - raise ValueError("No label column specified. A class/label column is required.") + raise ValueError("A label column is required but was not specified.") + if label_col not in df.columns: + raise ValueError(f"Label column '{label_col}' not found in DataFrame.") + + self.__label_col = label_col + self.__labels = df[label_col].tolist() + + # Encode labels + label_encoder = LabelEncoder() + self.__labels_matrix = label_encoder.fit_transform(df[label_col]).tolist() + columns_to_drop.append(label_col) + + # Drop both sample and label columns in one step + self.__df = self.__df.drop(columns=columns_to_drop) + + # Extract features + self.__original_features = self.__df.columns.tolist() + + # Ensure only numerical features are retained + numerical_df = self.__df.select_dtypes(include=[np.number]) + if numerical_df.empty: + raise ValueError("No numerical features found in the DataFrame.") + + # Check sparsity + num_elements = numerical_df.size + num_zeros = (numerical_df == 0).sum().sum() + sparsity = num_zeros / num_elements + + dense_matrix_size = numerical_df.memory_usage(deep=True).sum() # In bytes + available_memory = psutil.virtual_memory().available # In bytes + + if sparsity > sparse_threshold: + if dense_matrix_size < memory_threshold * available_memory: + # Use dense matrix if enough memory is available + logging.info(f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. " + f"Using a dense matrix.") + self.__matrix = numerical_df.to_numpy(dtype=np.float32) + self.__is_sparse = False + else: + # Use sparse matrix due to memory constraints + logging.info(f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. " + f"Using a sparse matrix representation.") + self.__matrix = sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32)) + self.__is_sparse = True else: - self.__label_col = label_col - self.__labels = df[label_col].tolist() - label_encoder = LabelEncoder() - self.__labels_matrix = label_encoder.fit_transform(df[label_col]).tolist() - df = df.drop(columns=[label_col]) - - self.__original_features = df.columns.tolist() - numerical_df = df.select_dtypes(include=[np.number]) - self.__matrix = numerical_df.to_numpy(dtype=np.float32) + # Use dense matrix since it's not sparse + logging.info(f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix.") + self.__matrix = numerical_df.to_numpy(dtype=np.float32) + self.__is_sparse = False + self.__is_scaled = (False, None) def get_feature_matrix(self) -> numpy.array: @@ -124,7 +178,7 @@ def scale_features(self, scaler_method: str = 'standard', **kwargs) -> bool: else: raise ValueError("`scaler_method` must be one of: min_max, max_abs, standard or robust.") - # TODO: Scale only the features for now, we have to investigate if we scale cateogrical variables + # TODO: Scale only the features for now, we have to investigate if we scale categorical variables self.__matrix = scaler.fit_transform(self.__matrix) self.__is_scaled = (True, scaler_method) return True @@ -135,6 +189,9 @@ def is_scaled(self): def get_scaled_method(self): return self.__is_scaled[1] + def is_sparse(self): + return self.__is_sparse + def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame': """ Keep only the specified features (by index) and return an updated instance of FSDataFrame. diff --git a/fsspark/tests/test_fsdataframe.py b/fsspark/tests/test_fsdataframe.py index 726933e..7b01311 100644 --- a/fsspark/tests/test_fsdataframe.py +++ b/fsspark/tests/test_fsdataframe.py @@ -55,41 +55,48 @@ def test_scaler_df(): assert fs_df.get_scaled_method() == 'standard' def test_memory_fsdataframe(): - def create_test_data(n_samples, n_features): + def create_test_data(n_samples: int, n_features: int, zero_prob: float = 0.1, nan_prob: float = 0.05): """Create test data for FSDataFrame.""" data = np.random.rand(n_samples, n_features) + data[np.random.rand(n_samples, n_features) < zero_prob] = 0 + data[np.random.rand(n_samples, n_features) < nan_prob] = np.nan + df = pd.DataFrame(data, columns=[f'feature_{i}' for i in range(n_features)]) df['sample_id'] = [f'sample_{i}' for i in range(n_samples)] df['label'] = np.random.choice(['A', 'B'], n_samples) return df - def measure_memory_usage(n_samples, n_features): + def measure_memory_usage(n_samples: int, n_features: int, nan_prob = 0.01) -> float: """Measure memory usage of FSDataFrame for given number of samples and features.""" - df = create_test_data(n_samples, n_features) + df = create_test_data(n_samples, n_features, nan_prob=nan_prob) mem_usage = memory_usage((FSDataFrame, (df, 'sample_id', 'label')), max_iterations=1)[0] gc.collect() # Force garbage collection to free memory return mem_usage # Define test cases feature_sizes = [1000, 5000, 10000, 50000, 100_000, 1_000_000] - sample_sizes = [10, 50, 100, 500, 1000] + sample_sizes = [100, 500, 1000] + nan_prob = [0.05, 0.1, 0.2, 0.5] # Measure memory usage for each test case results = [] for n_samples in sample_sizes: for n_features in feature_sizes: - mem_usage = measure_memory_usage(n_samples, n_features) - results.append((n_samples, n_features, mem_usage)) + for prob in nan_prob: + mem_usage = measure_memory_usage(n_samples, n_features, nan_prob=prob) + results.append((n_samples, n_features, prob, mem_usage)) # Append prob to results # Convert results to DataFrame - results_df = pd.DataFrame(results, columns=['Samples', 'Features', 'Memory (MB)']) + results_df = pd.DataFrame(results, columns=['Samples', 'Features', 'NAN Prob', 'Memory (MB)']) # Create 2D line plot plt.figure(figsize=(12, 8)) for feature_size in feature_sizes: - data = results_df[results_df['Features'] == feature_size] - plt.plot(data['Samples'], data['Memory (MB)'], marker='o', label=f'{feature_size} Features') + for prob in nan_prob: + data = results_df[(results_df['Features'] == feature_size) & (results_df['NAN Prob'] == prob)] + plt.plot(data['Samples'], data['Memory (MB)'], marker='o', + label=f'{feature_size} Features - {prob} NAN Prob') plt.xlabel('Number of Samples') plt.ylabel('Memory Usage (MB)') diff --git a/requirements.txt b/requirements.txt index 574afb4..94ce35f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ networkx numpy setuptools pandas -scikit-learn \ No newline at end of file +scikit-learn +scipy \ No newline at end of file From ea15b18dfb1597117e2951e102a9b6bb2ad272d1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 08:00:05 +0100 Subject: [PATCH 17/62] fsspark -> fslite --- README.md | 28 +++++++++++++++------------- fsspark/tests/test_fsdataframe.py | 5 ++++- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f8d3e5c..a8fa632 100644 --- a/README.md +++ b/README.md @@ -1,41 +1,43 @@ -[![Python application](https://github.com/enriquea/fsspark/actions/workflows/python-app.yml/badge.svg?branch=main)](https://github.com/enriquea/fsspark/actions/workflows/python-app.yml) -[![Python Package using Conda](https://github.com/enriquea/fsspark/actions/workflows/python-package-conda.yml/badge.svg?branch=main)](https://github.com/enriquea/fsspark/actions/workflows/python-package-conda.yml) +[![Python application](https://github.com/enriquea/fslite/actions/workflows/python-app.yml/badge.svg?branch=main)](https://github.com/enriquea/fslite/actions/workflows/python-app.yml) +[![Python Package using Conda](https://github.com/enriquea/fslite/actions/workflows/python-package-conda.yml/badge.svg?branch=main)](https://github.com/enriquea/fslite/actions/workflows/python-package-conda.yml) -# fsspark +# fslite --- -## Feature selection in Spark +### Memory-Efficient, High-Performance Feature Selection Library for Big and Small Datasets ### Description -`fsspark` is a python module to perform feature selection and machine learning based on spark. -Pipelines written using `fsspark` can be divided roughly in four major stages: 1) data pre-processing, 2) univariate +`fslite` is a python module to perform feature selection and machine learning using pre-built FS pipelines. +Pipelines written using `fslite` can be divided roughly in four major stages: 1) data pre-processing, 2) univariate filters, 3) multivariate filters and 4) machine learning wrapped with cross-validation (**Figure 1**). +`fslite` is based on our previous work [feseR](https://github.com/enriquea/feseR); previously implemented in R and caret package; publication can be found [here](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0189875). + ![Feature Selection flowchart](images/fs_workflow.png) -**Figure 1**. Feature selection workflow example implemented in fsspark. +**Figure 1**. Feature selection workflow example implemented in fslite. ### Documentation The package documentation describes the [data structures](docs/README.data.md) and -[features selection methods](docs/README.methods.md) implemented in `fsspark`. +[features selection methods](docs/README.methods.md) implemented in `fslite`. ### Installation - pip ```bash -git clone https://github.com/enriquea/fsspark.git -cd fsspark +git clone https://github.com/bigbio/fslite.git +cd fslite pip install . -r requirements.txt ``` - conda ```bash -git clone https://github.com/enriquea/fsspark.git -cd fsspark +git clone https://github.com/bigbio/fslite.git +cd fslite conda env create -f environment.yml -conda activate fsspark-venv +conda activate fslite-venv pip install . -r requirements.txt ``` diff --git a/fsspark/tests/test_fsdataframe.py b/fsspark/tests/test_fsdataframe.py index 7b01311..b1ab9e9 100644 --- a/fsspark/tests/test_fsdataframe.py +++ b/fsspark/tests/test_fsdataframe.py @@ -107,4 +107,7 @@ def measure_memory_usage(n_samples: int, n_features: int, nan_prob = 0.01) -> fl plt.show() # Print results table - print(results_df.to_string(index=False)) \ No newline at end of file + print(results_df.to_string(index=False)) + + # Initialize FSDataFrame with DataFrame having sparse numerical features and insufficient memory for dense matrix + From a4de03c2ed2114006a912579daf488e2e2cb2f00 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 08:01:15 +0100 Subject: [PATCH 18/62] fsspark -> fslite --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a8fa632..617ce34 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![Python application](https://github.com/enriquea/fslite/actions/workflows/python-app.yml/badge.svg?branch=main)](https://github.com/enriquea/fslite/actions/workflows/python-app.yml) -[![Python Package using Conda](https://github.com/enriquea/fslite/actions/workflows/python-package-conda.yml/badge.svg?branch=main)](https://github.com/enriquea/fslite/actions/workflows/python-package-conda.yml) +[![Python application](https://github.com/bigbio/fslite/actions/workflows/python-app.yml/badge.svg?branch=main)](https://github.com/enriquea/fslite/actions/workflows/python-app.yml) +[![Python Package using Conda](https://github.com/bigbio/fslite/actions/workflows/python-package-conda.yml/badge.svg?branch=main)](https://github.com/bigbio/fslite/actions/workflows/python-package-conda.yml) # fslite From 032a422eb6c905ece64733bce8a7fb31440abb0a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 09:05:41 +0100 Subject: [PATCH 19/62] better structure for methods in constants.py --- fsspark/fs/constants.py | 114 +++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 53 deletions(-) diff --git a/fsspark/fs/constants.py b/fsspark/fs/constants.py index 27eb6e3..1dd7bb3 100644 --- a/fsspark/fs/constants.py +++ b/fsspark/fs/constants.py @@ -1,53 +1,61 @@ -# Define constants for the project - - -# Define univariate feature selection methods constants -ANOVA = 'anova' -UNIVARIATE_CORRELATION = 'u_corr' -F_REGRESSION = 'f_regression' - -# Define dict with univariate feature selection methods and brief description -UNIVARIATE_METHODS = { - ANOVA: 'ANOVA univariate feature selection (F-classification)', - UNIVARIATE_CORRELATION: 'Univariate Correlation', - F_REGRESSION: 'Univariate F-regression' -} - -# Define multivariate feature selection methods constants -MULTIVARIATE_CORRELATION = 'm_corr' -MULTIVARIATE_VARIANCE = 'variance' - -# Define dict with multivariate feature selection methods and brief description -MULTIVARIATE_METHODS = { - MULTIVARIATE_CORRELATION: 'Multivariate Correlation', - MULTIVARIATE_VARIANCE: 'Multivariate Variance' -} - -# Define machine learning wrapper methods constants - -# binary classification -RF_BINARY = 'rf_binary' -LSVC_BINARY = 'lsvc_binary' -FM_BINARY = 'fm_binary' # TODO: implement this method - -# multilabel classification -RF_MULTILABEL = 'rf_multilabel' -LR_MULTILABEL = 'lg_multilabel' # TODO: implement this method - -# regression -RF_REGRESSION = 'rf_regression' -FM_REGRESSION = 'fm_regression' # TODO: implement this method - - -# Define dict with machine learning wrapper methods and brief description -ML_METHODS = { - RF_BINARY: 'Random Forest Binary Classifier', - LSVC_BINARY: 'Linear SVC Binary Classifier', - FM_BINARY: 'Factorization Machine Binary Classifier', - - RF_MULTILABEL: 'Random Forest Multi-label Classifier', - LR_MULTILABEL: 'Logistic Regression Multi-label Classifier', - - RF_REGRESSION: 'Random Forest Regression', - FM_REGRESSION: 'Factorization Machine Regression' -} +""" +This file contains a list of constants used in the feature selection and machine learning methods. +""" + +FS_METHODS = { + 'univariate': { + "title": 'Univariate Feature Selection', + "methods": [ + { + 'name': 'anova', + 'description': 'ANOVA univariate feature selection (F-classification)' + } + ] + }, + 'multivariate': { + "title": 'Multivariate Feature Selection', + "methods": [ + { + 'name': 'm_corr', + 'description': 'Multivariate Correlation' + }, + { + 'name': 'variance', + 'description': 'Multivariate Variance' + } + ] + }, + 'ml': { + "title": 'Machine Learning Wrapper', + "methods": [ + { + 'name': 'rf_binary', + 'description': 'Random Forest Binary Classifier' + }, + { + 'name': 'lsvc_binary', + 'description': 'Linear SVC Binary Classifier' + }, + { + 'name': 'fm_binary', + 'description': 'Factorization Machine Binary Classifier' + }, + { + 'name': 'rf_multilabel', + 'description': 'Random Forest Multi-label Classifier' + }, + { + 'name': 'lg_multilabel', + 'description': 'Logistic Regression Multi-label Classifier' + }, + { + 'name': 'rf_regression', + 'description': 'Random Forest Regression' + }, + { + 'name': 'fm_regression', + 'description': 'Factorization Machine Regression' + } + ] + } +} \ No newline at end of file From c2312c899723e9d7a2f1db249507fdeea57e6de7 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 09:08:03 +0100 Subject: [PATCH 20/62] better structure for methods in constants.py --- fsspark/fs/constants.py | 25 +- fsspark/fs/core.py | 564 ---------------------------------------- 2 files changed, 24 insertions(+), 565 deletions(-) delete mode 100644 fsspark/fs/core.py diff --git a/fsspark/fs/constants.py b/fsspark/fs/constants.py index 1dd7bb3..1134493 100644 --- a/fsspark/fs/constants.py +++ b/fsspark/fs/constants.py @@ -58,4 +58,27 @@ } ] } -} \ No newline at end of file +} + + +def get_fs_methods(): + """ + Get the list of feature selection methods + :return: dict + """ + return FS_METHODS + +def get_fs_method_details(method_name: str): + """ + Get the details of the feature selection method, this function search in all-methods definitions + and get the details of the method with the given name. If the method is not found, it returns None. + The method name is case-insensitive. + :param method_name: str + :return: dict + """ + + for method_type in FS_METHODS: + for method in FS_METHODS[method_type]['methods']: + if method['name'].lower() == method_name.lower(): + return method + return None diff --git a/fsspark/fs/core.py b/fsspark/fs/core.py deleted file mode 100644 index 1f05008..0000000 --- a/fsspark/fs/core.py +++ /dev/null @@ -1,564 +0,0 @@ -import logging -import numpy as np -from typing import (Union, - Optional, - List, - Set, - Tuple) - -from pyspark.ml.feature import (VectorAssembler, - StringIndexer, - Binarizer, - MinMaxScaler, - MaxAbsScaler, - StandardScaler, - RobustScaler) -from pyspark.ml.functions import vector_to_array -from pyspark.pandas.series import Series -from pyspark.sql.functions import (monotonically_increasing_id, - col, - rand) - -logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") -logger = logging.getLogger("pickfeat") -logger.setLevel(logging.INFO) - - -class FSDataFrame: - """ - FSDataFrame is a representation of a Spark DataFrame with some functionalities to perform feature selection. - An object from FSDataFrame is basically represented by a Spark DataFrame with samples - as rows and features as columns, with extra distributed indexed pandas series for - features names and samples labels. - - An object of FSDataFrame offers an interface to a Spark DataFrame, a Pandas on Spark DataFrame - (e.g. suitable for visualization) or a Spark DataFrame with features as a Dense column vector (e.g. suitable for - applying most algorithms from Spark MLib API). - - It can also be split in training and testing dataset and filtered by removing selected features (by name or index). - - [...] - - """ - - def __init__( - self, - df: Union[pyspark.sql.DataFrame, pyspark.pandas.DataFrame], - sample_col: str = None, - label_col: str = None, - row_index_col: Optional[str] = '_row_index', - parse_col_names: bool = False, - parse_features: bool = False, - ): - """ - Create an instance of FSDataFrame. - - Expected an input DataFrame with 2+N columns. - After specifying sample id and sample label columns, the remaining N columns will be considered as features. - - :param df: Spark (or Pandas on Spark) DataFrame - :param sample_col: Sample id column name - :param label_col: Sample label column name - :param row_index_col: Optional. Column name of row indices. - :param parse_col_names: Replace dots (.) in column names with underscores. - :param parse_features: Coerce all features to float. - """ - - self.__df = self._convert_psdf_to_sdf(df) - self.__sample_col = sample_col - self.__label_col = label_col - self.__row_index_name = row_index_col - - # check input dataframe - self._check_df() - - # replace dots in column names, if any. - if parse_col_names: - # TODO: Dots in column names are prone to errors, since dots are used to access attributes from DataFrame. - # Should we make this replacement optional? Or print out a warning? - self.__df = self.__df.toDF(*(c.replace('.', '_') for c in self.__df.columns)) - - # If the specified row index column name does not exist, add row index to the dataframe - if self.__row_index_name not in self.__df.columns: - self.__df = self._add_row_index(index_name=self.__row_index_name) - - if parse_features: - # coerce all features to float - non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_name] - feature_cols = [c for c in self.__df.columns if c not in non_features_cols] - self.__df = self.__df.withColumns({c: self.__df[c].cast('float') for c in feature_cols}) - - self.__indexed_features = self._set_indexed_cols() - self.__indexed_instances = self._set_indexed_rows() - - def _check_df(self): - """ - Check if input DataFrame meet the minimal requirements to feed an FS pipeline. - - :return: None - """ - col_names = self.__df.columns - if self.__sample_col not in col_names: - raise DataFormatError(f"Column sample name {self.__sample_col} not found...") - elif self.__label_col not in col_names: - raise DataFormatError(f"Column label name {self.__label_col} not found...") - elif not isinstance(self.__row_index_name, str): - raise DataFormatError("Row index column name must be a valid string...") - else: - pass - - @staticmethod - def _convert_psdf_to_sdf(df: Union[pyspark.pandas.DataFrame, pyspark.sql.DataFrame]) -> pyspark.sql.DataFrame: - """ - Convert Pandas on Spark DataFrame (psdf) to Spark DataFrame (sdf). - - :param df: Spark (or Pandas on Spark) DataFrame - :return: Spark DataFrame - """ - return df.to_spark(index_col=None) if isinstance(df, pyspark.pandas.DataFrame) else df - - def _set_indexed_cols(self) -> pyspark.pandas.series.Series: - """ - Create a distributed indexed Series representing features. - - :return: Pandas on Spark (PoS) Series - """ - # TODO: Check for equivalent to pandas distributed Series in Spark. - non_features_cols = [self.__sample_col, self.__label_col, self.__row_index_name] - features = [f for f in self.__df.columns if f not in non_features_cols] - return Series(features) - - def _set_indexed_rows(self) -> pyspark.pandas.series.Series: - """ - Create a distributed indexed Series representing samples labels. - It will use existing row indices, if any. - - :return: Pandas on Spark (PoS) Series - """ - # TODO: Check for equivalent to pandas distributed Series in Spark. - label = self.__df.select(self.__label_col).collect() - row_index = self.__df.select(self.__row_index_name).collect() - return Series(label, index=row_index) - - def get_features_indexed(self) -> pyspark.pandas.series.Series: - """ - Return features names with indices as a Series. - :return: Indexed Series. - """ - return self.__indexed_features - - def get_sample_label_indexed(self) -> pyspark.pandas.series.Series: - """ - Return sample labels with indices as a Series. - :return: Indexed Series. - """ - return self.__indexed_instances - - def get_features_names(self) -> list: - """ - Get features names from DataFrame. - :return: List of features names - """ - return self.__indexed_features.tolist() - - def get_features_by_index(self, indices: Union[List[int], Set[int]]) -> List[str]: - """ - Get features names by specified index from DataFrame. - - :param: indices: List of feature indexes - :return: List of features names - """ - return self.__indexed_features.loc[indices].tolist() - - def get_sample_label(self) -> list: - """ - Get samples class (label) from DataFrame. - :return: List of sample class labels - """ - return self.__indexed_instances.tolist() - - def get_sdf_vector(self, output_column_vector: str = 'features') -> pyspark.sql.DataFrame: - """ - Return a Spark dataframe with feature columns assembled into a column vector (a.k.a. Dense Vector column). - This format is required as input for multiple algorithms from MLlib API. - - :param: output_column_vector: Name of the output column vector. - :return: Spark DataFrame - """ - - sdf = self.__df - features_cols = self.get_features_names() - sdf_vector = _assemble_column_vector(sdf, - input_feature_cols=features_cols, - output_column_vector=output_column_vector) - - return sdf_vector - - def get_sdf_and_label(self, - output_column_vector: str = 'features') -> Tuple[pyspark.sql.dataframe.DataFrame, str, str]: - """ - Extracts the Spark DataFrame and label column name from FSDataFrame. - - :param: output_column_vector: Name of the output column vector. - :return: A tuple containing the Spark DataFrame and the label column name. - """ - sdf = self.get_sdf_vector(output_column_vector=output_column_vector) - label_col = self.get_label_col_name() - return sdf, label_col, output_column_vector - - def _collect_features_as_array(self) -> np.array: - """ - Collect features from FSDataFrame as an array. - `Warning`: This method will collect the entire DataFrame into the driver. - Uses this method on small datasets only (e.g., after filtering or splitting the data) - - :return: Numpy array - """ - sdf = self.get_sdf().select(*self.get_features_names()) - a = np.array(sdf.collect()) - return a - - def to_psdf(self) -> pyspark.pandas.DataFrame: - """ - Convert Spark DataFrame to Pandas on Spark DataFrame - :return: Pandas on Spark DataFrame - """ - return self.__df.pandas_api() - - def get_sdf(self) -> pyspark.sql.DataFrame: - """ - Return current Spark DataFrame - :return: Spark DataFrame - """ - return self.__df - - def get_sample_col_name(self) -> str: - """ - Return sample id column name. - - :return: Sample id column name. - """ - return self.__sample_col - - def get_label_col_name(self) -> str: - """ - Return sample label column name. - - :return: Sample label column name. - """ - return self.__label_col - - def get_row_index_name(self) -> str: - """ - Return row (instances) id column name. - - :return: Row id column name. - """ - return self.__row_index_name - - def _add_row_index(self, index_name: str = '_row_index') -> pyspark.sql.DataFrame: - """ - Add row indices to DataFrame. - Unique indices of type integer will be added in non-consecutive increasing order. - - :param: index_name: Name of the row index column. - :return: Spark DataFrame with extra column of row indices. - """ - return self.__df.withColumn(index_name, monotonically_increasing_id()) - - def count_features(self) -> int: - """ - Return the number of features. - - :return: Number of features. - """ - return self.get_features_indexed().size - - def count_instances(self) -> int: - """ - Return the number of samples (instances). - - :return: Number of samples. - """ - return self.get_sample_label_indexed().size - - def filter_features(self, features: List[str], keep: bool = True) -> 'FSDataFrame': - """ - Select or drop specified features from DataFrame. - - :param features: List of features names to drop or select from DataFrame - :param keep: If True (default), keep features. Remove otherwise. - - :return: FSDataFrame - """ - - current_features = self.get_features_names() - if len(set(current_features).intersection(features)) == 0: - logger.warning(f"There is no overlap of specified features with the input data frame.\n" - f"Skipping this filter step...") - return self - - count_a = self.count_features() - sdf = self.get_sdf() - - if keep: - sdf = sdf.select( - self.__sample_col, - self.__label_col, - self.__row_index_name, - *features) - else: - sdf = sdf.drop(*features) - - fsdf_filtered = self.update(sdf, self.__sample_col, self.__label_col, self.__row_index_name) - count_b = fsdf_filtered.count_features() - - logger.info(f"{count_b} features out of {count_a} remain after applying this filter...") - - return fsdf_filtered - - def filter_features_by_index(self, feature_indices: Set[int], keep: bool = True) -> 'FSDataFrame': - """ - Select or drop specified features from DataFrame by its indices. - - :param feature_indices: Set of features indices to drop or select from DataFrame - :param keep: If True (default), keep features. Remove otherwise. - - :return: FSDataFrame - """ - feature_names = self.get_features_by_index(feature_indices) - return self.filter_features(feature_names, keep=keep) - - def get_label_strata(self) -> list: - """ - Get strata from a categorical column in DataFrame. - - :return: List of levels for categorical variable. - """ - levels = self.get_sample_label_indexed().unique().tolist() - number_of_lvs = len(levels) - if number_of_lvs > 20: # TODO: Check if this is a right cutoff. - logger.warning(f"Number of observed levels too high: {number_of_lvs}.\n" - f"Should this variable be considered continuous?") - return levels - - def scale_features(self, scaler_method: str = 'standard', **kwargs) -> 'FSDataFrame': - """ - Scales features in DataFrame - - :param scaler_method: One of: min_max, max_abs, standard or robust. - :return: FSDataFrame with scaled features. - """ - - if scaler_method == 'min_max': - scaler = MinMaxScaler(**kwargs) - elif scaler_method == 'max_abs': - scaler = MaxAbsScaler(**kwargs) - elif scaler_method == 'standard': - scaler = StandardScaler(**kwargs) - elif scaler_method == 'robust': - scaler = RobustScaler(**kwargs) - else: - raise ValueError("`scaler_method` must be one of: min_max, max_abs, standard or robust.") - - features_col_vector = '_features' - scaled_features_vector = '_features_scaled' - - sdf = self.get_sdf_vector(output_column_vector=features_col_vector) - - sdf = (scaler - .setInputCol(features_col_vector) - .setOutputCol(scaled_features_vector) - .fit(sdf) - .transform(sdf) - .drop(features_col_vector) - ) - - sdf = _disassemble_column_vector(sdf, - features_cols=self.get_features_names(), - col_vector_name=scaled_features_vector, - drop_col_vector=True) - - return self.update(sdf, - self.__sample_col, - self.__label_col, - self.__row_index_name) - - def split_df(self, - label_type_cat: bool = True, - split_training_factor: float = 0.7) -> Tuple['FSDataFrame', 'FSDataFrame']: - """ - Split DataFrame into training and test dataset. - It will generate a nearly class-balanced training - and testing set for both categorical and continuous label input. - - :param label_type_cat: If True (the default), the input label colum will be processed as categorical. - Otherwise, it will be considered a continuous variable and binarized. - :param split_training_factor: Proportion of the training set. Usually, a value between 0.6 and 0.8. - - :return: Tuple of FSDataFrames. First element is the training set and second element is the testing set. - """ - - row_index_col = self.get_row_index_name() - label_col = self.get_label_col_name() - sdf = self.__df - - # create a temporal indexed categorical variable for sampling and splitting the data set. - tmp_label_col = '_tmp_label_indexed' - if label_type_cat: - sdf = _string_indexer(sdf=sdf, input_col=label_col, output_col=tmp_label_col) - else: - # If the input label is continuous, create a uniform random distribution [0,1] and binarize this variable. - # It will be used then as categorical for sampling the dataframe. - sdf = sdf.withColumn("_tmp_uniform_rand", rand()) - sdf = (_binarizer(sdf, - input_col="_tmp_uniform_rand", - output_col=tmp_label_col, - threshold=0.5, - drop_input_col=True) - ) - - # Get number of levels for categorical variable. - levels = [lv[tmp_label_col] for lv in sdf.select([tmp_label_col]).distinct().collect()] - - # Sampling DataFrame to extract class-balanced training set. - # This will keep similar proportion by stratum in both training and testing set. - fraction_dict = dict(zip(levels, [split_training_factor] * len(levels))) - training_df = sdf.sampleBy(col=sdf[tmp_label_col], fractions=fraction_dict) - - # Filter out the testing set from the input Dataframe. testing_df = input_sdf[-training_df]. - testing_df = sdf.join(training_df, [row_index_col], "leftanti") - - # Drop tmp cols - training_df = training_df.drop(tmp_label_col) - testing_df = testing_df.drop(tmp_label_col) - - return (self.update(training_df, self.__sample_col, self.__label_col, self.__row_index_name), - self.update(testing_df, self.__sample_col, self.__label_col, self.__row_index_name)) - - @classmethod - def update(cls, - df: pyspark.sql.DataFrame, - sample_col: str, - label_col: str, - row_index_col: str): - """ - Create a new instance of FSDataFrame. - - :param df: Spark DataFrame - :param sample_col: Name of sample id column. - :param label_col: Name of sample label column. - :param row_index_col: Name of row (instances) id column. - - :return: FSDataFrame - """ - return cls(df, sample_col, label_col, row_index_col) - - -def _assemble_column_vector(sdf: pyspark.sql.DataFrame, - input_feature_cols: List[str], - output_column_vector: str = 'features', - drop_input_cols: bool = True) -> pyspark.sql.DataFrame: - """ - Assemble features (columns) from DataFrame into a column of type Dense Vector. - - :param drop_input_cols: - :param sdf: Spark DataFrame - :param input_feature_cols: List of features column names. - :param output_column_vector: Output column of type DenseVector. - - :return: Spark DataFrame - """ - - sdf_vector = (VectorAssembler() - .setInputCols(input_feature_cols) - .setOutputCol(output_column_vector) - .transform(sdf) - ) - - return sdf_vector.drop(*input_feature_cols) if drop_input_cols else sdf_vector - - -def _disassemble_column_vector(sdf: pyspark.sql.DataFrame, - features_cols: List[str], - col_vector_name: str, - drop_col_vector: bool = True) -> pyspark.sql.DataFrame: - """ - Convert a Column Dense Vector in Spark DataFrame to individual columns (a.k.a features). - Basically, revert the operation from `_assemble_column_vector`. - - :param drop_col_vector: - :param sdf: Spark DataFrame - :param features_cols: - :param col_vector_name: - - :return: Spark DataFrame - """ - - sdf = (sdf - .withColumn("_array_col", vector_to_array(sdf[col_vector_name])) - .withColumns({features_cols[i]: col("_array_col")[i] for i in range(len(features_cols))}) - .drop("_array_col") - ) - - return sdf.drop(col_vector_name) if drop_col_vector else sdf - - -def _string_indexer(sdf: pyspark.sql.DataFrame, - input_col: str = None, - output_col: str = "_label_indexed", - drop_input_col: bool = False) -> pyspark.sql.DataFrame: - """ - Wrapper for `pyspark.ml.feature.StringIndexer`. - See https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.StringIndexer.html. - - :param sdf: Spark DataFrame. - :param input_col: Name of the input column to be indexed. - :param output_col: Name of the output column indexed. - :param drop_input_col: Drop input column after indexing. Default: False. - - :return: Spark DataFrame - """ - sdf = (StringIndexer() - .setInputCol(input_col) - .setOutputCol(output_col) - .fit(sdf) - .transform(sdf) - ) - return sdf.drop(input_col) if drop_input_col else sdf - - -def _binarizer(sdf: pyspark.sql.DataFrame, - input_col: str = None, - output_col: str = "_label_binarized", - threshold: float = 0.5, - drop_input_col: bool = False) -> pyspark.sql.DataFrame: - """ - Wrapper for `pyspark.ml.feature.Binarizer`. - See https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.Binarizer.html - - :param sdf: Spark DataFrame. - :param input_col: Name of the numeric input column to be binarized. - :param output_col: Name of the output column binarized. - :param threshold: Threshold used to binarize continuous features. - The features greater than the threshold will be binarized to 1.0. - The features equal to or less than the threshold will be binarized to 0.0 - :param drop_input_col: Drop input column after binarizing. Default: False. - - :return: Spark DataFrame - """ - sdf = (Binarizer() - .setInputCol(input_col) - .setOutputCol(output_col) - .setThreshold(threshold) - .transform(sdf) - ) - - return sdf.drop(input_col) if drop_input_col else sdf - - -class DataFormatError(Exception): - """ - Exception raised for errors in the input/output data format. - """ - pass From 10ee2e8e5f22a51e3b2ae9ed831546b042d1d63b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 09:11:29 +0100 Subject: [PATCH 21/62] fsspark -> fslite --- docs/README.data.md | 22 ++++--- docs/README.methods.md | 2 +- environment.yml | 2 +- {fsspark => fslite}/__init__.py | 0 {fsspark/config => fslite/fs}/__init__.py | 0 {fsspark => fslite}/fs/constants.py | 0 {fsspark => fslite}/fs/fdataframe.py | 0 {fsspark => fslite}/fs/methods.py | 12 ++-- {fsspark => fslite}/fs/ml.py | 18 +++--- {fsspark => fslite}/fs/multivariate.py | 8 +-- {fsspark => fslite}/fs/univariate.py | 2 +- {fsspark => fslite}/fs/utils.py | 4 +- .../pipeline/fs_pipeline_example.py | 12 ++-- {fsspark => fslite}/testdata/TNBC.tsv.gz | Bin {fsspark => fslite}/testdata/TNBC_missing.tsv | 0 {fsspark => fslite}/tests/__init__.py | 0 .../tests/generate_big_tests.py | 0 .../tests/test_data_preprocessing.py | 10 +-- {fsspark => fslite}/tests/test_fs_pipeline.py | 10 +-- {fsspark => fslite}/tests/test_fsdataframe.py | 2 +- .../tests/test_import_export.py | 6 +- {fsspark => fslite}/tests/test_ml_methods.py | 12 ++-- .../tests/test_univariate_methods.py | 6 +- {fsspark/fs => fslite/utils}/__init__.py | 0 {fsspark => fslite}/utils/datasets.py | 0 {fsspark => fslite}/utils/generic.py | 0 {fsspark => fslite}/utils/io.py | 2 +- fsspark/config/context.py | 59 ------------------ fsspark/config/global_settings.py | 22 ------- fsspark/utils/__init__.py | 0 setup.py | 2 +- 31 files changed, 67 insertions(+), 146 deletions(-) rename {fsspark => fslite}/__init__.py (100%) rename {fsspark/config => fslite/fs}/__init__.py (100%) rename {fsspark => fslite}/fs/constants.py (100%) rename {fsspark => fslite}/fs/fdataframe.py (100%) rename {fsspark => fslite}/fs/methods.py (98%) rename {fsspark => fslite}/fs/ml.py (97%) rename {fsspark => fslite}/fs/multivariate.py (95%) rename {fsspark => fslite}/fs/univariate.py (99%) rename {fsspark => fslite}/fs/utils.py (97%) rename {fsspark => fslite}/pipeline/fs_pipeline_example.py (82%) rename {fsspark => fslite}/testdata/TNBC.tsv.gz (100%) rename {fsspark => fslite}/testdata/TNBC_missing.tsv (100%) rename {fsspark => fslite}/tests/__init__.py (100%) rename {fsspark => fslite}/tests/generate_big_tests.py (100%) rename {fsspark => fslite}/tests/test_data_preprocessing.py (86%) rename {fsspark => fslite}/tests/test_fs_pipeline.py (86%) rename {fsspark => fslite}/tests/test_fsdataframe.py (98%) rename {fsspark => fslite}/tests/test_import_export.py (85%) rename {fsspark => fslite}/tests/test_ml_methods.py (95%) rename {fsspark => fslite}/tests/test_univariate_methods.py (81%) rename {fsspark/fs => fslite/utils}/__init__.py (100%) rename {fsspark => fslite}/utils/datasets.py (100%) rename {fsspark => fslite}/utils/generic.py (100%) rename {fsspark => fslite}/utils/io.py (97%) delete mode 100644 fsspark/config/context.py delete mode 100644 fsspark/config/global_settings.py delete mode 100644 fsspark/utils/__init__.py diff --git a/docs/README.data.md b/docs/README.data.md index e812609..bb82603 100644 --- a/docs/README.data.md +++ b/docs/README.data.md @@ -37,19 +37,21 @@ The following is an example of a TSV file with a binary response variable: - `import_table` - Import data from a TSV file into a Spark Data Frame (sdf). ```python -from fsspark.utils.io import import_table -sdf = import_table('data.tsv.bgz', - sep='\t', - n_partitions=5) +from fslite.utils.io import import_table + +sdf = import_table('data.tsv.bgz', + sep='\t', + n_partitions=5) ``` - `import_table_as_psdf` - Import data from a TSV file into a Spark Data Frame (sdf) and convert it into a Pandas on Spark Data Frame (psdf). ```python -from fsspark.utils.io import import_table_as_psdf -psdf = import_table_as_psdf('data.tsv.bgz', - sep='\t', +from fslite.utils.io import import_table_as_psdf + +psdf = import_table_as_psdf('data.tsv.bgz', + sep='\t', n_partitions=5) ``` @@ -73,9 +75,9 @@ contains the response variable. #### How to create a Feature Selection Spark Data Frame (FSDF) ```python -from fsspark.config.context import init_spark, stop_spark_session -from fsspark.fs.core import FSDataFrame -from fsspark.utils.io import import_table_as_psdf +from fslite.config.context import init_spark, stop_spark_session +from fslite.fs.core import FSDataFrame +from fslite.utils.io import import_table_as_psdf # Init spark init_spark() diff --git a/docs/README.methods.md b/docs/README.methods.md index 5fb42cd..2a1149e 100644 --- a/docs/README.methods.md +++ b/docs/README.methods.md @@ -53,4 +53,4 @@ A typical workflow written using `fsspark` can be divided roughly in four major ### 5. Feature selection pipeline example -[FS pipeline example](../fsspark/pipeline/fs_pipeline_example.py) +[FS pipeline example](../fslite/pipeline/fs_pipeline_example.py) diff --git a/environment.yml b/environment.yml index 7d4be58..8fe1a48 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: fsspark-venv +name: fslite-venv channels: - defaults - conda-forge diff --git a/fsspark/__init__.py b/fslite/__init__.py similarity index 100% rename from fsspark/__init__.py rename to fslite/__init__.py diff --git a/fsspark/config/__init__.py b/fslite/fs/__init__.py similarity index 100% rename from fsspark/config/__init__.py rename to fslite/fs/__init__.py diff --git a/fsspark/fs/constants.py b/fslite/fs/constants.py similarity index 100% rename from fsspark/fs/constants.py rename to fslite/fs/constants.py diff --git a/fsspark/fs/fdataframe.py b/fslite/fs/fdataframe.py similarity index 100% rename from fsspark/fs/fdataframe.py rename to fslite/fs/fdataframe.py diff --git a/fsspark/fs/methods.py b/fslite/fs/methods.py similarity index 98% rename from fsspark/fs/methods.py rename to fslite/fs/methods.py index d9ce31b..4ceeb16 100644 --- a/fsspark/fs/methods.py +++ b/fslite/fs/methods.py @@ -1,12 +1,12 @@ from abc import ABC, abstractmethod from typing import List, Type, Union, Tuple, Optional, Dict, Any -from fsspark.fs.constants import (ML_METHODS, UNIVARIATE_METHODS, - MULTIVARIATE_METHODS) -from fsspark.fs.core import FSDataFrame -from fsspark.fs.ml import MLCVModel -from fsspark.fs.multivariate import multivariate_filter -from fsspark.fs.univariate import univariate_filter +from fslite.fs.constants import (ML_METHODS, UNIVARIATE_METHODS, + MULTIVARIATE_METHODS) +from fslite.fs.core import FSDataFrame +from fslite.fs.ml import MLCVModel +from fslite.fs.multivariate import multivariate_filter +from fslite.fs.univariate import univariate_filter class FSMethod(ABC): diff --git a/fsspark/fs/ml.py b/fslite/fs/ml.py similarity index 97% rename from fsspark/fs/ml.py rename to fslite/fs/ml.py index 42b51cc..b6ea39d 100644 --- a/fsspark/fs/ml.py +++ b/fslite/fs/ml.py @@ -20,15 +20,15 @@ from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel, Param -from fsspark.fs.constants import (RF_BINARY, - LSVC_BINARY, - FM_BINARY, - RF_MULTILABEL, - LR_MULTILABEL, - RF_REGRESSION, - FM_REGRESSION, - ML_METHODS) -from fsspark.fs.core import FSDataFrame +from fslite.fs.constants import (RF_BINARY, + LSVC_BINARY, + FM_BINARY, + RF_MULTILABEL, + LR_MULTILABEL, + RF_REGRESSION, + FM_REGRESSION, + ML_METHODS) +from fslite.fs.core import FSDataFrame ESTIMATORS_CLASSES = [RandomForestClassifier, RandomForestRegressionModel, LinearSVC, LogisticRegression] EVALUATORS_CLASSES = [BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator] diff --git a/fsspark/fs/multivariate.py b/fslite/fs/multivariate.py similarity index 95% rename from fsspark/fs/multivariate.py rename to fslite/fs/multivariate.py index f4af43e..ef00526 100644 --- a/fsspark/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -6,11 +6,11 @@ from pyspark.ml.feature import (VarianceThresholdSelector) from pyspark.ml.stat import Correlation -from fsspark.fs.constants import MULTIVARIATE_METHODS, MULTIVARIATE_CORRELATION, MULTIVARIATE_VARIANCE +from fslite.fs.constants import MULTIVARIATE_METHODS, MULTIVARIATE_CORRELATION, MULTIVARIATE_VARIANCE -from fsspark.fs.core import FSDataFrame -from fsspark.fs.utils import find_maximal_independent_set -from fsspark.utils.generic import tag +from fslite.fs.core import FSDataFrame +from fslite.fs.utils import find_maximal_independent_set +from fslite.utils.generic import tag logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("FSSPARK:MULTIVARIATE") diff --git a/fsspark/fs/univariate.py b/fslite/fs/univariate.py similarity index 99% rename from fsspark/fs/univariate.py rename to fslite/fs/univariate.py index 2b83cd0..e769452 100644 --- a/fsspark/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -5,7 +5,7 @@ import pandas as pd from sklearn.feature_selection import SelectKBest, f_classif, f_regression -from fsspark.fs.fdataframe import FSDataFrame +from fslite.fs.fdataframe import FSDataFrame logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("FS:UNIVARIATE") diff --git a/fsspark/fs/utils.py b/fslite/fs/utils.py similarity index 97% rename from fsspark/fs/utils.py rename to fslite/fs/utils.py index 31d4c39..9fc6a70 100644 --- a/fsspark/fs/utils.py +++ b/fslite/fs/utils.py @@ -6,8 +6,8 @@ from networkx.algorithms.mis import maximal_independent_set from pyspark.ml.feature import Imputer -from fsspark.fs.core import FSDataFrame -from fsspark.utils.generic import tag +from fslite.fs.core import FSDataFrame +from fslite.utils.generic import tag logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("FSSPARK:UTILS") diff --git a/fsspark/pipeline/fs_pipeline_example.py b/fslite/pipeline/fs_pipeline_example.py similarity index 82% rename from fsspark/pipeline/fs_pipeline_example.py rename to fslite/pipeline/fs_pipeline_example.py index c7b9f75..3c4c498 100644 --- a/fsspark/pipeline/fs_pipeline_example.py +++ b/fslite/pipeline/fs_pipeline_example.py @@ -1,16 +1,16 @@ """ -Example of a feature selection pipeline implemented in fsspark. +Example of a feature selection pipeline implemented in fslite. After data import and pre-processing, the pipeline applies univariate correlation filter, multivariate correlation filter and Randon Forest classification. """ -from fsspark.config.context import init_spark, stop_spark_session -from fsspark.fs.core import FSDataFrame -from fsspark.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod -from fsspark.utils.datasets import get_tnbc_data_path -from fsspark.utils.io import import_table_as_psdf +from fslite.config.context import init_spark, stop_spark_session +from fslite.fs.core import FSDataFrame +from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod +from fslite.utils.datasets import get_tnbc_data_path +from fslite.utils.io import import_table_as_psdf # Init spark init_spark(apply_pyarrow_settings=True, diff --git a/fsspark/testdata/TNBC.tsv.gz b/fslite/testdata/TNBC.tsv.gz similarity index 100% rename from fsspark/testdata/TNBC.tsv.gz rename to fslite/testdata/TNBC.tsv.gz diff --git a/fsspark/testdata/TNBC_missing.tsv b/fslite/testdata/TNBC_missing.tsv similarity index 100% rename from fsspark/testdata/TNBC_missing.tsv rename to fslite/testdata/TNBC_missing.tsv diff --git a/fsspark/tests/__init__.py b/fslite/tests/__init__.py similarity index 100% rename from fsspark/tests/__init__.py rename to fslite/tests/__init__.py diff --git a/fsspark/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py similarity index 100% rename from fsspark/tests/generate_big_tests.py rename to fslite/tests/generate_big_tests.py diff --git a/fsspark/tests/test_data_preprocessing.py b/fslite/tests/test_data_preprocessing.py similarity index 86% rename from fsspark/tests/test_data_preprocessing.py rename to fslite/tests/test_data_preprocessing.py index 85a6e37..9e35ad7 100644 --- a/fsspark/tests/test_data_preprocessing.py +++ b/fslite/tests/test_data_preprocessing.py @@ -2,11 +2,11 @@ import numpy as np -from fsspark.config.context import init_spark, stop_spark_session -from fsspark.fs.core import FSDataFrame -from fsspark.fs.utils import compute_missingness_rate, remove_features_by_missingness_rate, impute_missing -from fsspark.utils.datasets import get_tnbc_data_missing_values_path -from fsspark.utils.io import import_table_as_psdf +from fslite.config.context import init_spark, stop_spark_session +from fslite.fs.core import FSDataFrame +from fslite.fs.utils import compute_missingness_rate, remove_features_by_missingness_rate, impute_missing +from fslite.utils.datasets import get_tnbc_data_missing_values_path +from fslite.utils.io import import_table_as_psdf class TestDataPreprocessing(unittest.TestCase): diff --git a/fsspark/tests/test_fs_pipeline.py b/fslite/tests/test_fs_pipeline.py similarity index 86% rename from fsspark/tests/test_fs_pipeline.py rename to fslite/tests/test_fs_pipeline.py index a1f6b5e..ca5517e 100644 --- a/fsspark/tests/test_fs_pipeline.py +++ b/fslite/tests/test_fs_pipeline.py @@ -1,10 +1,10 @@ import unittest -from fsspark.config.context import init_spark, stop_spark_session -from fsspark.fs.core import FSDataFrame -from fsspark.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod -from fsspark.utils.datasets import get_tnbc_data_path -from fsspark.utils.io import import_table_as_psdf +from fslite.config.context import init_spark, stop_spark_session +from fslite.fs.core import FSDataFrame +from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod +from fslite.utils.datasets import get_tnbc_data_path +from fslite.utils.io import import_table_as_psdf class FeatureSelectionPipelineTest(unittest.TestCase): diff --git a/fsspark/tests/test_fsdataframe.py b/fslite/tests/test_fsdataframe.py similarity index 98% rename from fsspark/tests/test_fsdataframe.py rename to fslite/tests/test_fsdataframe.py index b1ab9e9..6b67863 100644 --- a/fsspark/tests/test_fsdataframe.py +++ b/fslite/tests/test_fsdataframe.py @@ -4,7 +4,7 @@ from memory_profiler import memory_usage import gc -from fsspark.fs.fdataframe import FSDataFrame +from fslite.fs.fdataframe import FSDataFrame def test_initializes_fsdataframe(): diff --git a/fsspark/tests/test_import_export.py b/fslite/tests/test_import_export.py similarity index 85% rename from fsspark/tests/test_import_export.py rename to fslite/tests/test_import_export.py index 57b0c5b..05d801f 100644 --- a/fsspark/tests/test_import_export.py +++ b/fslite/tests/test_import_export.py @@ -3,9 +3,9 @@ import pyspark import pyspark.pandas as ps -from fsspark.config.context import init_spark, stop_spark_session -from fsspark.utils.datasets import get_tnbc_data_path -from fsspark.utils.io import import_table, import_table_as_psdf +from fslite.config.context import init_spark, stop_spark_session +from fslite.utils.datasets import get_tnbc_data_path +from fslite.utils.io import import_table, import_table_as_psdf class TestImportExport(unittest.TestCase): diff --git a/fsspark/tests/test_ml_methods.py b/fslite/tests/test_ml_methods.py similarity index 95% rename from fsspark/tests/test_ml_methods.py rename to fslite/tests/test_ml_methods.py index 3dc4bda..1afd46f 100644 --- a/fsspark/tests/test_ml_methods.py +++ b/fslite/tests/test_ml_methods.py @@ -5,11 +5,11 @@ from pyspark.ml.evaluation import (BinaryClassificationEvaluator, MulticlassClassificationEvaluator) -from fsspark.config.context import init_spark, stop_spark_session -from fsspark.fs.core import FSDataFrame -from fsspark.fs.ml import MLCVModel -from fsspark.utils.datasets import get_tnbc_data_path -from fsspark.utils.io import import_table_as_psdf +from fslite.config.context import init_spark, stop_spark_session +from fslite.fs.core import FSDataFrame +from fslite.fs.ml import MLCVModel +from fslite.utils.datasets import get_tnbc_data_path +from fslite.utils.io import import_table_as_psdf class MLMethodTest(unittest.TestCase): @@ -145,7 +145,7 @@ def test_multilabel_lr_model(self): assert testing_acc > 0.7 def test_FSMLMethod(self): - from fsspark.fs.methods import FSMLMethod + from fslite.fs.methods import FSMLMethod fsdf = self.import_FSDataFrame() training_data, testing_data = fsdf.split_df(split_training_factor=0.7) diff --git a/fsspark/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py similarity index 81% rename from fsspark/tests/test_univariate_methods.py rename to fslite/tests/test_univariate_methods.py index f2f2bf8..228ca2c 100644 --- a/fsspark/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -1,8 +1,8 @@ import pandas as pd -from fsspark.utils.datasets import get_tnbc_data_path -from fsspark.fs.fdataframe import FSDataFrame +from fslite.utils.datasets import get_tnbc_data_path +from fslite.fs.fdataframe import FSDataFrame -from fsspark.fs.univariate import univariate_filter +from fslite.fs.univariate import univariate_filter def test_univariate_filter_corr(): """ diff --git a/fsspark/fs/__init__.py b/fslite/utils/__init__.py similarity index 100% rename from fsspark/fs/__init__.py rename to fslite/utils/__init__.py diff --git a/fsspark/utils/datasets.py b/fslite/utils/datasets.py similarity index 100% rename from fsspark/utils/datasets.py rename to fslite/utils/datasets.py diff --git a/fsspark/utils/generic.py b/fslite/utils/generic.py similarity index 100% rename from fsspark/utils/generic.py rename to fslite/utils/generic.py diff --git a/fsspark/utils/io.py b/fslite/utils/io.py similarity index 97% rename from fsspark/utils/io.py rename to fslite/utils/io.py index 951c5cb..85adb13 100644 --- a/fsspark/utils/io.py +++ b/fslite/utils/io.py @@ -3,7 +3,7 @@ import pyspark.pandas import pyspark.sql -from fsspark.config.context import PANDAS_ON_SPARK_API_SETTINGS +from fslite.config.context import PANDAS_ON_SPARK_API_SETTINGS warnings.filterwarnings("ignore") diff --git a/fsspark/config/context.py b/fsspark/config/context.py deleted file mode 100644 index 2cc5a50..0000000 --- a/fsspark/config/context.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import pyspark -from pyspark.sql import SparkSession - -from fsspark.config.global_settings import (SPARK_EXTRA_SETTINGS, - PYARROW_SETTINGS, - PANDAS_ON_SPARK_API_SETTINGS) - -os.environ['PYARROW_IGNORE_TIMEZONE'] = "1" - - -# os.environ['JAVA_HOME'] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_162.jdk/Contents/Home" -# os.environ['SPARK_HOME'] = "/usr/local/spark-3.3.0-bin-hadoop3" - -def init_spark(master: str = "local[8]", - apply_pyarrow_settings: bool = True, - apply_extra_spark_settings: bool = True, - apply_pandas_settings: bool = True) -> SparkSession: - """ - Init Spark session. - - :return: Spark session - """ - # stop any current session before starting a new one. - # stop_spark_session() - - # init or get spark session. - spark = (SparkSession.builder - .master(master) - .appName("fs-spark") - ) - - if apply_extra_spark_settings: - # Spark must be configured before starting context. - for k in SPARK_EXTRA_SETTINGS.keys(): - spark = spark.config(k, SPARK_EXTRA_SETTINGS.get(k)) - spark = spark.getOrCreate() - else: - spark = spark.getOrCreate() - - if apply_pyarrow_settings: - [spark.conf.set(k, PYARROW_SETTINGS.get(k)) for k in PYARROW_SETTINGS.keys()] - if apply_pandas_settings: - [spark.conf.set(k, PANDAS_ON_SPARK_API_SETTINGS.get(k)) for k in PANDAS_ON_SPARK_API_SETTINGS.keys()] - - return spark - - -def stop_spark_session() -> None: - """ - If any, stop active Spark Session. - - :return: None - """ - sc = pyspark.sql.SparkSession.getActiveSession() - if sc is not None: - sc.stop() - else: - return None diff --git a/fsspark/config/global_settings.py b/fsspark/config/global_settings.py deleted file mode 100644 index 53675ee..0000000 --- a/fsspark/config/global_settings.py +++ /dev/null @@ -1,22 +0,0 @@ -# Description: Global settings for the fsspark package. -# These settings provide a way to configure the spark session and the spark context to run the fsspark package locally. - -# spark settings to test this module locally. -SPARK_EXTRA_SETTINGS = {'spark.executor.memory': '8g', - 'spark.driver.memory': '16g', - "spark.memory.offHeap.enabled": 'true', - "spark.memory.offHeap.size": '4g', - "spark.sql.pivotMaxValues": '100000', - "spark.network.timeout": '100000', - "spark.sql.session.timeZone": "UTC" - } - -# pyarrow settings to make available columnar data processing -PYARROW_SETTINGS = {"spark.sql.execution.arrow.pyspark.enabled": "true", - "spark.sql.execution.arrow.pyspark.fallback.enabled": "true" - } - -# setting for pandas api on spark (PoS) -PANDAS_ON_SPARK_API_SETTINGS = {"compute.default_index_type": "distributed", - "compute.ordered_head": False, - "display.max_rows": 100} diff --git a/fsspark/utils/__init__.py b/fsspark/utils/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/setup.py b/setup.py index 7beeec5..73c353d 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ long_description = fh.read() setup( - name='fsspark', + name='fslite', version='0.0.1', url='https://github.com/bigbio/fsspark', license='Apache-2.0', From a69ac12cf2a25726d43633fdf89e62952dce3b70 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 09:55:59 +0100 Subject: [PATCH 22/62] Minor changes in constants.py --- fslite/fs/constants.py | 35 +++++++++++++++++++++++-- fslite/fs/fdataframe.py | 1 - fslite/fs/univariate.py | 4 +++ fslite/tests/test_univariate_methods.py | 2 +- 4 files changed, 38 insertions(+), 4 deletions(-) diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index 1134493..8937465 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -1,6 +1,7 @@ """ This file contains a list of constants used in the feature selection and machine learning methods. """ +from typing import Dict, List, Union FS_METHODS = { 'univariate': { @@ -8,7 +9,15 @@ "methods": [ { 'name': 'anova', - 'description': 'ANOVA univariate feature selection (F-classification)' + 'description': 'Univariate ANOVA feature selection (f-classification)' + }, + { + 'name': 'u_corr', + 'description': 'Univariate correlation' + }, + { + 'name': 'f_regression', + 'description': 'Univariate f-regression' } ] }, @@ -68,7 +77,7 @@ def get_fs_methods(): """ return FS_METHODS -def get_fs_method_details(method_name: str): +def get_fs_method_details(method_name: str) -> Union[Dict, None]: """ Get the details of the feature selection method, this function search in all-methods definitions and get the details of the method with the given name. If the method is not found, it returns None. @@ -82,3 +91,25 @@ def get_fs_method_details(method_name: str): if method['name'].lower() == method_name.lower(): return method return None + +def get_fs_univariate_methods() -> List: + """ + Get the list of univariate methods implemented in the library + :return: list + """ + univariate_methods = FS_METHODS['univariate'] + univariate_names = [method["name"] for method in univariate_methods["methods"]] + return univariate_names + +def is_valid_univariate_method(method_name: str) -> bool: + """ + This method check if the given method name is a supported univariate method + :param method_name method name + :return: boolean + """ + for method in FS_METHODS["univariate"]["methods"]: + if method["name"].lower() == method_name: + return True + return False + + diff --git a/fslite/fs/fdataframe.py b/fslite/fs/fdataframe.py index fc0effd..8748c05 100644 --- a/fslite/fs/fdataframe.py +++ b/fslite/fs/fdataframe.py @@ -221,7 +221,6 @@ def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame': def to_pandas(self) -> DataFrame: """ Return the DataFrame representation of the FSDataFrame. - :return: Pandas DataFrame. """ diff --git a/fslite/fs/univariate.py b/fslite/fs/univariate.py index e769452..ddf3ac6 100644 --- a/fslite/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -5,6 +5,7 @@ import pandas as pd from sklearn.feature_selection import SelectKBest, f_classif, f_regression +from fslite.fs.constants import get_fs_univariate_methods, is_valid_univariate_method from fslite.fs.fdataframe import FSDataFrame logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") @@ -100,6 +101,9 @@ def univariate_filter(df: FSDataFrame, :return: Filtered DataFrame with selected features """ + if not is_valid_univariate_method(univariate_method): + raise NotImplementedError("The provided method {} is not implented !! please select one from this list {}".format(univariate_method, get_fs_univariate_methods())) + selected_features = [] if univariate_method == 'anova': diff --git a/fslite/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py index 228ca2c..16ae0a5 100644 --- a/fslite/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -16,7 +16,7 @@ def test_univariate_filter_corr(): # create FSDataFrame instance fs_df = FSDataFrame(df=df,sample_col='Sample',label_col='label') - fsdf_filtered = univariate_filter(fs_df,univariate_method='u_corr', corr_threshold=0.3) + fsdf_filtered = univariate_filter(fs_df, univariate_method='u_corr', corr_threshold=0.3) assert fs_df.count_features() == 500 assert fsdf_filtered.count_features() == 211 From 3f56ded16a32f905aee6c5254c80049a75361371 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 10:11:59 +0100 Subject: [PATCH 23/62] black applied --- fslite/fs/constants.py | 96 +++++------ fslite/fs/fdataframe.py | 74 ++++++--- fslite/fs/methods.py | 129 +++++++++------ fslite/fs/ml.py | 202 ++++++++++++++++-------- fslite/fs/multivariate.py | 83 ++++++---- fslite/fs/univariate.py | 54 ++++--- fslite/fs/utils.py | 9 +- fslite/pipeline/fs_pipeline_example.py | 39 ++--- fslite/tests/generate_big_tests.py | 28 ++-- fslite/tests/test_data_preprocessing.py | 24 +-- fslite/tests/test_fs_pipeline.py | 41 ++--- fslite/tests/test_fsdataframe.py | 85 +++++----- fslite/tests/test_import_export.py | 18 +-- fslite/tests/test_ml_methods.py | 81 +++++----- fslite/tests/test_univariate_methods.py | 12 +- fslite/utils/generic.py | 2 + fslite/utils/io.py | 53 +++---- setup.py | 18 +-- 18 files changed, 603 insertions(+), 445 deletions(-) diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index 8937465..87ab22b 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -1,72 +1,52 @@ """ This file contains a list of constants used in the feature selection and machine learning methods. """ + from typing import Dict, List, Union FS_METHODS = { - 'univariate': { - "title": 'Univariate Feature Selection', + "univariate": { + "title": "Univariate Feature Selection", "methods": [ { - 'name': 'anova', - 'description': 'Univariate ANOVA feature selection (f-classification)' - }, - { - 'name': 'u_corr', - 'description': 'Univariate correlation' + "name": "anova", + "description": "Univariate ANOVA feature selection (f-classification)", }, - { - 'name': 'f_regression', - 'description': 'Univariate f-regression' - } - ] + {"name": "u_corr", "description": "Univariate correlation"}, + {"name": "f_regression", "description": "Univariate f-regression"}, + ], }, - 'multivariate': { - "title": 'Multivariate Feature Selection', + "multivariate": { + "title": "Multivariate Feature Selection", "methods": [ - { - 'name': 'm_corr', - 'description': 'Multivariate Correlation' - }, - { - 'name': 'variance', - 'description': 'Multivariate Variance' - } - ] + {"name": "m_corr", "description": "Multivariate Correlation"}, + {"name": "variance", "description": "Multivariate Variance"}, + ], }, - 'ml': { - "title": 'Machine Learning Wrapper', + "ml": { + "title": "Machine Learning Wrapper", "methods": [ + {"name": "rf_binary", "description": "Random Forest Binary Classifier"}, + {"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"}, { - 'name': 'rf_binary', - 'description': 'Random Forest Binary Classifier' - }, - { - 'name': 'lsvc_binary', - 'description': 'Linear SVC Binary Classifier' + "name": "fm_binary", + "description": "Factorization Machine Binary Classifier", }, { - 'name': 'fm_binary', - 'description': 'Factorization Machine Binary Classifier' + "name": "rf_multilabel", + "description": "Random Forest Multi-label Classifier", }, { - 'name': 'rf_multilabel', - 'description': 'Random Forest Multi-label Classifier' + "name": "lg_multilabel", + "description": "Logistic Regression Multi-label Classifier", }, + {"name": "rf_regression", "description": "Random Forest Regression"}, { - 'name': 'lg_multilabel', - 'description': 'Logistic Regression Multi-label Classifier' + "name": "fm_regression", + "description": "Factorization Machine Regression", }, - { - 'name': 'rf_regression', - 'description': 'Random Forest Regression' - }, - { - 'name': 'fm_regression', - 'description': 'Factorization Machine Regression' - } - ] - } + ], + }, } @@ -77,6 +57,7 @@ def get_fs_methods(): """ return FS_METHODS + def get_fs_method_details(method_name: str) -> Union[Dict, None]: """ Get the details of the feature selection method, this function search in all-methods definitions @@ -87,19 +68,19 @@ def get_fs_method_details(method_name: str) -> Union[Dict, None]: """ for method_type in FS_METHODS: - for method in FS_METHODS[method_type]['methods']: - if method['name'].lower() == method_name.lower(): + for method in FS_METHODS[method_type]["methods"]: + if method["name"].lower() == method_name.lower(): return method return None + def get_fs_univariate_methods() -> List: """ Get the list of univariate methods implemented in the library :return: list """ - univariate_methods = FS_METHODS['univariate'] - univariate_names = [method["name"] for method in univariate_methods["methods"]] - return univariate_names + return get_fs_method_by_class["univariate"] + def is_valid_univariate_method(method_name: str) -> bool: """ @@ -113,3 +94,12 @@ def is_valid_univariate_method(method_name: str) -> bool: return False +def get_fs_method_by_class(fs_class: str) -> List: + """ + Get the FS method supported for a given FS class, for example, univariate + :param fs_class + :return FS List + """ + fs_methods = FS_METHODS[fs_class] + fs_names = [method["name"] for method in fs_methods["methods"]] + return fs_names diff --git a/fslite/fs/fdataframe.py b/fslite/fs/fdataframe.py index 8748c05..3553014 100644 --- a/fslite/fs/fdataframe.py +++ b/fslite/fs/fdataframe.py @@ -7,7 +7,13 @@ import psutil from pandas import DataFrame from scipy import sparse -from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler, LabelEncoder +from sklearn.preprocessing import ( + MinMaxScaler, + MaxAbsScaler, + StandardScaler, + RobustScaler, + LabelEncoder, +) logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("pickfeat") @@ -30,13 +36,16 @@ class FSDataFrame: [...] """ + def __init__( - self, - df: pd.DataFrame, - sample_col: Optional[str] = None, - label_col: Optional[str] = None, - sparse_threshold: float = 0.7, # Threshold for sparsity - memory_threshold: Optional[float] = 0.75 # Proportion of system memory to use for dense arrays + self, + df: pd.DataFrame, + sample_col: Optional[str] = None, + label_col: Optional[str] = None, + sparse_threshold: float = 0.7, # Threshold for sparsity + memory_threshold: Optional[ + float + ] = 0.75, # Proportion of system memory to use for dense arrays ): """ Create an instance of FSDataFrame. @@ -60,7 +69,9 @@ def __init__( # Handle sample column if sample_col: if sample_col not in df.columns: - raise ValueError(f"Sample column '{sample_col}' not found in DataFrame.") + raise ValueError( + f"Sample column '{sample_col}' not found in DataFrame." + ) self.__sample_col = sample_col self.__samples = df[sample_col].tolist() columns_to_drop.append(sample_col) @@ -105,19 +116,27 @@ def __init__( if sparsity > sparse_threshold: if dense_matrix_size < memory_threshold * available_memory: # Use dense matrix if enough memory is available - logging.info(f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. " - f"Using a dense matrix.") + logging.info( + f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. " + f"Using a dense matrix." + ) self.__matrix = numerical_df.to_numpy(dtype=np.float32) self.__is_sparse = False else: # Use sparse matrix due to memory constraints - logging.info(f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. " - f"Using a sparse matrix representation.") - self.__matrix = sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32)) + logging.info( + f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. " + f"Using a sparse matrix representation." + ) + self.__matrix = sparse.csr_matrix( + numerical_df.to_numpy(dtype=np.float32) + ) self.__is_sparse = True else: # Use dense matrix since it's not sparse - logging.info(f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix.") + logging.info( + f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix." + ) self.__matrix = numerical_df.to_numpy(dtype=np.float32) self.__is_sparse = False @@ -159,7 +178,7 @@ def count_instances(self) -> int: """ return self.__matrix.shape[0] - def scale_features(self, scaler_method: str = 'standard', **kwargs) -> bool: + def scale_features(self, scaler_method: str = "standard", **kwargs) -> bool: """ Scales features in the SDataFrame using a specified method. @@ -167,16 +186,18 @@ def scale_features(self, scaler_method: str = 'standard', **kwargs) -> bool: :return: FSDataFrame with scaled features. """ - if scaler_method == 'min_max': + if scaler_method == "min_max": scaler = MinMaxScaler(**kwargs) - elif scaler_method == 'max_abs': + elif scaler_method == "max_abs": scaler = MaxAbsScaler(**kwargs) - elif scaler_method == 'standard': + elif scaler_method == "standard": scaler = StandardScaler(**kwargs) - elif scaler_method == 'robust': + elif scaler_method == "robust": scaler = RobustScaler(**kwargs) else: - raise ValueError("`scaler_method` must be one of: min_max, max_abs, standard or robust.") + raise ValueError( + "`scaler_method` must be one of: min_max, max_abs, standard or robust." + ) # TODO: Scale only the features for now, we have to investigate if we scale categorical variables self.__matrix = scaler.fit_transform(self.__matrix) @@ -192,7 +213,7 @@ def get_scaled_method(self): def is_sparse(self): return self.__is_sparse - def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame': + def select_features_by_index(self, feature_indexes: List[int]) -> "FSDataFrame": """ Keep only the specified features (by index) and return an updated instance of FSDataFrame. @@ -216,7 +237,9 @@ def select_features_by_index(self, feature_indexes: List[int]) -> 'FSDataFrame': updated_df[self.__label_col] = self.__labels # Return a new instance of FSDataFrame with the updated data - return FSDataFrame(updated_df, sample_col=self.__sample_col, label_col=self.__label_col) + return FSDataFrame( + updated_df, sample_col=self.__sample_col, label_col=self.__label_col + ) def to_pandas(self) -> DataFrame: """ @@ -241,9 +264,9 @@ def to_pandas(self) -> DataFrame: return df - def split_df(self, - label_type_cat: bool = True, - split_training_factor: float = 0.7) -> Tuple['FSDataFrame', 'FSDataFrame']: + def split_df( + self, label_type_cat: bool = True, split_training_factor: float = 0.7 + ) -> Tuple["FSDataFrame", "FSDataFrame"]: """ Split DataFrame into training and test dataset. It will generate a nearly class-balanced training @@ -284,4 +307,3 @@ def split_df(self, # # # Return the updated DataFrames # return self.update(train_df), self.update(test_df) - diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index 4ceeb16..7fbb6c6 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -1,8 +1,7 @@ from abc import ABC, abstractmethod from typing import List, Type, Union, Tuple, Optional, Dict, Any -from fslite.fs.constants import (ML_METHODS, UNIVARIATE_METHODS, - MULTIVARIATE_METHODS) +from fslite.fs.constants import ML_METHODS, UNIVARIATE_METHODS, MULTIVARIATE_METHODS from fslite.fs.core import FSDataFrame from fslite.fs.ml import MLCVModel from fslite.fs.multivariate import multivariate_filter @@ -16,12 +15,10 @@ class FSMethod(ABC): valid_methods: Tuple[str] - def __init__(self, - fs_method, - **kwargs): + def __init__(self, fs_method, **kwargs): """ Initialize the feature selection method with the specified parameters. - + :param fs_method: The feature selection method to be used. :param kwargs: Additional keyword arguments for the feature selection method. """ @@ -140,9 +137,7 @@ def select_features(self, fsdf) -> FSDataFrame: The selected features. """ - return univariate_filter( - fsdf, univariate_method=self.fs_method, **self.kwargs - ) + return univariate_filter(fsdf, univariate_method=self.fs_method, **self.kwargs) def __str__(self): return f"FSUnivariate(method={self.fs_method}, kwargs={self.kwargs})" @@ -233,12 +228,14 @@ class FSMLMethod(FSMethod): valid_methods = list(ML_METHODS.keys()) _ml_model: MLCVModel = None - def __init__(self, - fs_method: str, - rfe: bool = False, - rfe_iterations: int = 3, - percent_to_keep: float = 0.90, - **kwargs): + def __init__( + self, + fs_method: str, + rfe: bool = False, + rfe_iterations: int = 3, + percent_to_keep: float = 0.90, + **kwargs, + ): """ Initialize the machine learning feature selection method with the specified parameters. @@ -251,10 +248,14 @@ def __init__(self, self.validate_method(fs_method) # set the estimator, grid and cv parameters (or none if not provided) - self.estimator_params = kwargs.get('estimator_params', None) # estimator parameters - self.evaluator_params = kwargs.get('evaluator_params', None) # evaluator parameters - self.grid_params = kwargs.get('grid_params', None) # grid parameters - self.cv_params = kwargs.get('cv_params', None) # cross-validation parameters + self.estimator_params = kwargs.get( + "estimator_params", None + ) # estimator parameters + self.evaluator_params = kwargs.get( + "evaluator_params", None + ) # evaluator parameters + self.grid_params = kwargs.get("grid_params", None) # grid parameters + self.cv_params = kwargs.get("cv_params", None) # cross-validation parameters # set the machine learning model self._ml_model = self._set_ml_model() @@ -265,7 +266,9 @@ def __init__(self, self.rfe_iterations = rfe_iterations # performance metrics - self.rfe_training_metric: list = [] # performance metrics on training for each rfe iteration + self.rfe_training_metric: list = ( + [] + ) # performance metrics on training for each rfe iteration self.training_metric = None # performance metrics on training (final model) self.testing_metric = None # performance metrics on testing (final model) @@ -310,7 +313,7 @@ def _set_ml_model(self): estimator_params=self.estimator_params, evaluator_params=self.evaluator_params, grid_params=self.grid_params, - cv_params=self.cv_params + cv_params=self.cv_params, ) return self._ml_model @@ -326,8 +329,8 @@ def _fit_and_filter(self, df: FSDataFrame) -> FSDataFrame: # get feature based on the (percentile) threshold provided # expected a dataframe sorted by scores in descending order selected_features = feature_scores.iloc[ - :int(self.percent_to_keep * len(feature_scores)) - ]['feature_index'] + : int(self.percent_to_keep * len(feature_scores)) + ]["feature_index"] return df.filter_features_by_index(selected_features, keep=True) @@ -343,17 +346,23 @@ def select_features(self, fsdf: FSDataFrame) -> FSDataFrame: """ if fsdf is None or fsdf.count_features() == 0 or fsdf.count_instances() == 0: - raise ValueError("The data frame is empty or does not contain any features.") + raise ValueError( + "The data frame is empty or does not contain any features." + ) fsdf = self._fit_and_filter(fsdf) # Recursive feature elimination if self.rfe: for iteration in range(self.rfe_iterations): - print(f"RFE: running {iteration + 1} of {self.rfe_iterations} iterations...") + print( + f"RFE: running {iteration + 1} of {self.rfe_iterations} iterations..." + ) fsdf = self._fit_and_filter(fsdf) # collect the performance metrics on training for every rfe iteration - self.rfe_training_metric.append(self._ml_model.get_eval_metric_on_training()) + self.rfe_training_metric.append( + self._ml_model.get_eval_metric_on_training() + ) # get the final performance metric on training self.training_metric = self._ml_model.get_eval_metric_on_training() @@ -384,7 +393,9 @@ def get_eval_metric_on_training_rfe(self): The evaluation metric on the training data for each RFE iteration. """ if self.rfe_training_metric is None: - raise ValueError("No training metric is available. Run the select_features method first.") + raise ValueError( + "No training metric is available. Run the select_features method first." + ) return self.rfe_training_metric def get_eval_metric_on_training(self): @@ -395,7 +406,9 @@ def get_eval_metric_on_training(self): The evaluation metric on the training data. """ if self.training_metric is None: - raise ValueError("No training metric is available. Run the select_features method first.") + raise ValueError( + "No training metric is available. Run the select_features method first." + ) return self.training_metric def get_eval_metric_on_testing(self, fsdf: FSDataFrame): @@ -410,7 +423,9 @@ def get_eval_metric_on_testing(self, fsdf: FSDataFrame): """ if fsdf is None or fsdf.count_features() == 0 or fsdf.count_instances() == 0: - raise ValueError("The testing data frame is empty or does not contain any features.") + raise ValueError( + "The testing data frame is empty or does not contain any features." + ) # evaluate the model on the testing data eval_metric = self._ml_model.get_eval_metric_on_testing(fsdf) @@ -427,7 +442,9 @@ def get_feature_scores(self): """ if self.feature_scores is None: - raise ValueError("Feature scores are not available. Run the feature selection method first.") + raise ValueError( + "Feature scores are not available. Run the feature selection method first." + ) return self.feature_scores @@ -452,14 +469,18 @@ class FSPipeline: selected_features = fs_pipeline.select_features(fsdf) """ - _valid_methods: List[Type[Union[FSUnivariate, FSMultivariate, FSMLMethod]]] = [FSUnivariate, - FSMultivariate, - FSMLMethod] + _valid_methods: List[Type[Union[FSUnivariate, FSMultivariate, FSMLMethod]]] = [ + FSUnivariate, + FSMultivariate, + FSMLMethod, + ] - def __init__(self, - df_training: FSDataFrame, - df_testing: Optional[FSDataFrame], - fs_stages: List[Union[FSUnivariate, FSMultivariate, FSMLMethod]]): + def __init__( + self, + df_training: FSDataFrame, + df_testing: Optional[FSDataFrame], + fs_stages: List[Union[FSUnivariate, FSMultivariate, FSMLMethod]], + ): """ Initialize the feature selection pipeline with the specified feature selection methods. @@ -482,15 +503,23 @@ def validate_methods(self): """ # check if the pipeline contains at least one feature selection method if len(self.fs_stages) == 0: - raise ValueError("The pipeline must contain at least one feature selection method.") + raise ValueError( + "The pipeline must contain at least one feature selection method." + ) # check if the feature selection methods are valid - if not all(isinstance(method, tuple(self._valid_methods)) for method in self.fs_stages): - raise InvalidMethodError(f"Invalid feature selection method. " - f"Accepted methods are {', '.join([str(m) for m in self._valid_methods])}") + if not all( + isinstance(method, tuple(self._valid_methods)) for method in self.fs_stages + ): + raise InvalidMethodError( + f"Invalid feature selection method. " + f"Accepted methods are {', '.join([str(m) for m in self._valid_methods])}" + ) # check if only one ML method is used in the pipeline - ml_methods = [method for method in self.fs_stages if isinstance(method, FSMLMethod)] + ml_methods = [ + method for method in self.fs_stages if isinstance(method, FSMLMethod) + ] if len(ml_methods) > 1: raise ValueError("Only one ML method is allowed in the pipeline.") @@ -509,7 +538,9 @@ def run(self) -> Dict[str, Any]: self.pipeline_results.update(n_stages=n_stages) for i, method in enumerate(self.fs_stages): - print(f"Running stage {i + 1} of {n_stages} of the feature selection pipeline: {method}") + print( + f"Running stage {i + 1} of {n_stages} of the feature selection pipeline: {method}" + ) if isinstance(method, FSMLMethod): fsdf_tmp = method.select_features(fsdf_tmp) @@ -518,8 +549,12 @@ def run(self) -> Dict[str, Any]: self.pipeline_results.update(rfe_iterations=method.rfe_iterations) self.pipeline_results.update(feature_scores=method.get_feature_scores()) self.pipeline_results.update(eval_metric=method.get_eval_metric_name()) - self.pipeline_results.update(rfe_training_metric=method.get_eval_metric_on_training_rfe()) - self.pipeline_results.update(training_metric=method.get_eval_metric_on_training()) + self.pipeline_results.update( + rfe_training_metric=method.get_eval_metric_on_training_rfe() + ) + self.pipeline_results.update( + training_metric=method.get_eval_metric_on_training() + ) if self.df_testing is not None: @@ -530,7 +565,9 @@ def run(self) -> Dict[str, Any]: else: fsdf_tmp = method.select_features(fsdf_tmp) - self.pipeline_results.update(n_initial_features=self.df_training.count_features()) + self.pipeline_results.update( + n_initial_features=self.df_training.count_features() + ) self.pipeline_results.update(n_selected_features=fsdf_tmp.count_features()) return self.pipeline_results diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index b6ea39d..bf97fae 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -4,34 +4,57 @@ for feature selection (e.g., rank by feature importance) and prediction. """ + import warnings from typing import List, Any, Dict, Optional, Union import pandas as pd from pyspark.ml import Estimator, Model -from pyspark.ml.classification import (RandomForestClassificationModel, - LinearSVCModel, - RandomForestClassifier, - LinearSVC, LogisticRegression, LogisticRegressionModel) -from pyspark.ml.evaluation import (Evaluator, - BinaryClassificationEvaluator, - MulticlassClassificationEvaluator, - RegressionEvaluator) +from pyspark.ml.classification import ( + RandomForestClassificationModel, + LinearSVCModel, + RandomForestClassifier, + LinearSVC, + LogisticRegression, + LogisticRegressionModel, +) +from pyspark.ml.evaluation import ( + Evaluator, + BinaryClassificationEvaluator, + MulticlassClassificationEvaluator, + RegressionEvaluator, +) from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor -from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel, Param - -from fslite.fs.constants import (RF_BINARY, - LSVC_BINARY, - FM_BINARY, - RF_MULTILABEL, - LR_MULTILABEL, - RF_REGRESSION, - FM_REGRESSION, - ML_METHODS) +from pyspark.ml.tuning import ( + CrossValidator, + ParamGridBuilder, + CrossValidatorModel, + Param, +) + +from fslite.fs.constants import ( + RF_BINARY, + LSVC_BINARY, + FM_BINARY, + RF_MULTILABEL, + LR_MULTILABEL, + RF_REGRESSION, + FM_REGRESSION, + ML_METHODS, +) from fslite.fs.core import FSDataFrame -ESTIMATORS_CLASSES = [RandomForestClassifier, RandomForestRegressionModel, LinearSVC, LogisticRegression] -EVALUATORS_CLASSES = [BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator] +ESTIMATORS_CLASSES = [ + RandomForestClassifier, + RandomForestRegressionModel, + LinearSVC, + LogisticRegression, +] +EVALUATORS_CLASSES = [ + BinaryClassificationEvaluator, + MulticlassClassificationEvaluator, + RegressionEvaluator, +] # Define an abstract class that allow to create a factory of models @@ -52,18 +75,24 @@ class MLCVModel: _best_model: Model = None _fsdf: FSDataFrame = None - def __init__(self, - estimator: Union[RandomForestClassifier | - RandomForestRegressionModel | - LinearSVC | - LogisticRegression], - evaluator: Union[BinaryClassificationEvaluator | - MulticlassClassificationEvaluator | - RegressionEvaluator], - estimator_params: Optional[Dict[str, Any]] = None, - evaluator_params: Optional[Dict[str, Any]] = None, - grid_params: Optional[Dict[str, List[Any]]] = None, - cv_params: Optional[Dict[str, Any]] = None): + def __init__( + self, + estimator: Union[ + RandomForestClassifier + | RandomForestRegressionModel + | LinearSVC + | LogisticRegression + ], + evaluator: Union[ + BinaryClassificationEvaluator + | MulticlassClassificationEvaluator + | RegressionEvaluator + ], + estimator_params: Optional[Dict[str, Any]] = None, + evaluator_params: Optional[Dict[str, Any]] = None, + grid_params: Optional[Dict[str, List[Any]]] = None, + cv_params: Optional[Dict[str, Any]] = None, + ): """ Initializes the MLModel with optional estimator, evaluator, and parameter specifications. """ @@ -96,7 +125,9 @@ def _initialize_model(self): # Initialize and set cross-validator parameters self._set_cross_validator() - def _parse_grid_params(self, grid_params: Dict[str, List[Any]]) -> List[Dict[Param, Any]]: + def _parse_grid_params( + self, grid_params: Dict[str, List[Any]] + ) -> List[Dict[Param, Any]]: """ Parse the grid parameters to create a list of dictionaries. @@ -108,10 +139,12 @@ def _parse_grid_params(self, grid_params: Dict[str, List[Any]]) -> List[Dict[Par if hasattr(self.estimator, param): grid = grid.addGrid(getattr(self.estimator, param), values) else: - raise AttributeError(f"{self.estimator.__class__.__name__} does not have attribute {param}") + raise AttributeError( + f"{self.estimator.__class__.__name__} does not have attribute {param}" + ) return grid.build() - def _validate_estimator(self, estimator: Estimator) -> 'MLCVModel': + def _validate_estimator(self, estimator: Estimator) -> "MLCVModel": """ Validate the estimator. @@ -123,7 +156,7 @@ def _validate_estimator(self, estimator: Estimator) -> 'MLCVModel': raise ValueError(f"Estimator must be an instance of {ESTIMATORS_CLASSES}") return self - def _validate_evaluator(self, evaluator: Evaluator) -> 'MLCVModel': + def _validate_evaluator(self, evaluator: Evaluator) -> "MLCVModel": """ Validate the evaluator. @@ -145,7 +178,9 @@ def _validate_estimator_params(self, estimator_params: Dict[str, Any]) -> None: return for param, _ in estimator_params.items(): if not self.estimator.hasParam(param): - raise AttributeError(f"{self.estimator.__class__.__name__} does not have attribute {param}") + raise AttributeError( + f"{self.estimator.__class__.__name__} does not have attribute {param}" + ) def _validate_evaluator_params(self, evaluator_params: Dict[str, Any]) -> None: """ @@ -157,9 +192,11 @@ def _validate_evaluator_params(self, evaluator_params: Dict[str, Any]) -> None: return for param, _ in evaluator_params.items(): if not self.evaluator.hasParam(param): - raise AttributeError(f"{self.evaluator.__class__.__name__} does not have attribute {param}") + raise AttributeError( + f"{self.evaluator.__class__.__name__} does not have attribute {param}" + ) - def _set_evaluator_params(self) -> 'MLCVModel': + def _set_evaluator_params(self) -> "MLCVModel": """ Set evaluator parameters. """ @@ -167,7 +204,7 @@ def _set_evaluator_params(self) -> 'MLCVModel': self.evaluator = self.evaluator.setParams(**self.evaluator_params) return self - def _set_estimator_params(self) -> 'MLCVModel': + def _set_estimator_params(self) -> "MLCVModel": """ Set estimator parameters. """ @@ -175,7 +212,7 @@ def _set_estimator_params(self) -> 'MLCVModel': self.estimator = self.estimator.setParams(**self.estimator_params) return self - def _set_cv_params(self, cv_params: Dict[str, Any]) -> 'MLCVModel': + def _set_cv_params(self, cv_params: Dict[str, Any]) -> "MLCVModel": """ Parse the cross-validator parameters to create an instance of CrossValidator. @@ -187,10 +224,12 @@ def _set_cv_params(self, cv_params: Dict[str, Any]) -> 'MLCVModel': if hasattr(self._cross_validator, param): setattr(self._cross_validator, param, value) else: - raise AttributeError(f"{self._cross_validator.__class__.__name__} does not have attribute {param}") + raise AttributeError( + f"{self._cross_validator.__class__.__name__} does not have attribute {param}" + ) return self - def _set_cross_validator(self) -> 'MLCVModel': + def _set_cross_validator(self) -> "MLCVModel": """ Build the model using the cross-validator. @@ -203,14 +242,16 @@ def _set_cross_validator(self) -> 'MLCVModel': evaluator=self.evaluator, ) if self.cv_params is not None: - self._cross_validator = self._cross_validator.setParams(**self.cv_params) + self._cross_validator = self._cross_validator.setParams( + **self.cv_params + ) return self except Exception as e: print(f"An error occurred while creating the CrossValidator: {str(e)}") # Handle the exception or raise it to be handled by the caller raise - def fit(self, fsdf: FSDataFrame) -> 'MLCVModel': + def fit(self, fsdf: FSDataFrame) -> "MLCVModel": """ Fit the model using the cross-validator. @@ -219,8 +260,14 @@ def fit(self, fsdf: FSDataFrame) -> 'MLCVModel': # Extract the Spark DataFrame and label column name from FSDataFrame self._fsdf = fsdf - if self._cross_validator is None or self.estimator is None or self.evaluator is None: - raise ValueError("Cross-validator, estimator, or evaluator not set properly.") + if ( + self._cross_validator is None + or self.estimator is None + or self.evaluator is None + ): + raise ValueError( + "Cross-validator, estimator, or evaluator not set properly." + ) self._fitted_cv_model = self._cross_validator.fit(self._fsdf.get_sdf_vector()) return self @@ -232,17 +279,21 @@ def _get_best_model(self) -> Model: :return: The best model. """ if self._fitted_cv_model is None: - raise ValueError("CrossValidatorModel not fitted. Use fit() to fit the model.") + raise ValueError( + "CrossValidatorModel not fitted. Use fit() to fit the model." + ) self._best_model = self._fitted_cv_model.bestModel return self._best_model # define a static method that allows to set a ml model based on the model type @staticmethod - def create_model(model_type: str, - estimator_params: Dict[str, Any] = None, - evaluator_params: Dict[str, Any] = None, - grid_params: Dict[str, List[Any]] = None, - cv_params: Dict[str, Any] = None) -> 'MLCVModel': + def create_model( + model_type: str, + estimator_params: Dict[str, Any] = None, + evaluator_params: Dict[str, Any] = None, + grid_params: Dict[str, List[Any]] = None, + cv_params: Dict[str, Any] = None, + ) -> "MLCVModel": """ Set a machine learning model based on the model type. @@ -270,8 +321,10 @@ def create_model(model_type: str, estimator = RandomForestRegressor() evaluator = RegressionEvaluator() else: - raise ValueError(f"Unsupported model type: {model_type}." - f"Supported model types are: {list(ML_METHODS.keys())}") + raise ValueError( + f"Unsupported model type: {model_type}." + f"Supported model types are: {list(ML_METHODS.keys())}" + ) ml_method = MLCVModel( estimator=estimator, @@ -279,7 +332,7 @@ def create_model(model_type: str, estimator_params=estimator_params, evaluator_params=evaluator_params, grid_params=grid_params, - cv_params=cv_params + cv_params=cv_params, ) return ml_method @@ -301,18 +354,22 @@ def get_feature_scores(self) -> pd.DataFrame: # raise exception if the model is not none if best_model is None: - raise ValueError("No ML model have been fitted. Use fit() to fit the model.") + raise ValueError( + "No ML model have been fitted. Use fit() to fit the model." + ) - df_features = pd.DataFrame(indexed_features.to_numpy(), - columns=["features"]) + df_features = pd.DataFrame(indexed_features.to_numpy(), columns=["features"]) - if isinstance(best_model, (RandomForestClassificationModel, RandomForestRegressionModel)): + if isinstance( + best_model, (RandomForestClassificationModel, RandomForestRegressionModel) + ): df_scores = pd.DataFrame( - data=best_model.featureImportances.toArray(), - columns=["scores"] + data=best_model.featureImportances.toArray(), columns=["scores"] ) - df_scores = df_scores.reset_index(level=0).rename(columns={"index": "feature_index"}) + df_scores = df_scores.reset_index(level=0).rename( + columns={"index": "feature_index"} + ) # merge the feature scores with the feature names df = df_features.merge( @@ -323,14 +380,16 @@ def get_feature_scores(self) -> pd.DataFrame: df = df.sort_values(by="scores", ascending=False) # add feature percentile rank to the features_scores dataframe - df['percentile_rank'] = df['scores'].rank(pct=True) + df["percentile_rank"] = df["scores"].rank(pct=True) return df else: - raise ValueError("Unsupported model type. " - "Only RandomForestClassificationModel, " - "RandomForestRegressionModel, and LinearSVCModel are supported.") + raise ValueError( + "Unsupported model type. " + "Only RandomForestClassificationModel, " + "RandomForestRegressionModel, and LinearSVCModel are supported." + ) def get_eval_metric_on_training(self) -> float: """ @@ -347,7 +406,9 @@ def get_eval_metric_on_training(self) -> float: # get the eval metric name from the evaluator eval_metric_name = self.get_eval_metric_name() - if isinstance(best_model, (RandomForestClassificationModel, LogisticRegressionModel)): + if isinstance( + best_model, (RandomForestClassificationModel, LogisticRegressionModel) + ): metric_value = getattr(best_model.summary, eval_metric_name) elif isinstance(best_model, LinearSVCModel): @@ -378,7 +439,10 @@ def get_eval_metric_on_testing(self, test_data: FSDataFrame) -> float: # predict the test data predictions = None - if isinstance(best_model, (RandomForestClassificationModel, LinearSVCModel, LogisticRegressionModel)): + if isinstance( + best_model, + (RandomForestClassificationModel, LinearSVCModel, LogisticRegressionModel), + ): predictions = best_model.transform(test_data.get_sdf_vector()) metric_value = None diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index ef00526..3db3414 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -3,10 +3,14 @@ import numpy as np import pyspark -from pyspark.ml.feature import (VarianceThresholdSelector) +from pyspark.ml.feature import VarianceThresholdSelector from pyspark.ml.stat import Correlation -from fslite.fs.constants import MULTIVARIATE_METHODS, MULTIVARIATE_CORRELATION, MULTIVARIATE_VARIANCE +from fslite.fs.constants import ( + MULTIVARIATE_METHODS, + MULTIVARIATE_CORRELATION, + MULTIVARIATE_VARIANCE, +) from fslite.fs.core import FSDataFrame from fslite.fs.utils import find_maximal_independent_set @@ -18,9 +22,11 @@ @tag("experimental") -def _compute_correlation_matrix(sdf: pyspark.sql.DataFrame, - features_col: str = 'features', - corr_method: str = "pearson") -> np.ndarray: +def _compute_correlation_matrix( + sdf: pyspark.sql.DataFrame, + features_col: str = "features", + corr_method: str = "pearson", +) -> np.ndarray: """ Compute features Matrix Correlation. @@ -31,23 +37,23 @@ def _compute_correlation_matrix(sdf: pyspark.sql.DataFrame, :return: Numpy array. """ - logger.warning("Warning: Computed matrix correlation will be collected into the drive with this implementation.\n" - "This may cause memory issues. Use it preferably with small datasets.") + logger.warning( + "Warning: Computed matrix correlation will be collected into the drive with this implementation.\n" + "This may cause memory issues. Use it preferably with small datasets." + ) logger.info(f"Computing correlation matrix using {corr_method} method.") - mcorr = (Correlation - .corr(sdf, features_col, corr_method) - .collect()[0][0] - .toArray() - ) + mcorr = Correlation.corr(sdf, features_col, corr_method).collect()[0][0].toArray() return mcorr @tag("experimental") -def multivariate_correlation_selector(fsdf: FSDataFrame, - strict: bool = True, - corr_threshold: float = 0.75, - corr_method: str = "pearson") -> List[str]: +def multivariate_correlation_selector( + fsdf: FSDataFrame, + strict: bool = True, + corr_threshold: float = 0.75, + corr_method: str = "pearson", +) -> List[str]: """ Compute the correlation matrix (Pearson) among input features and select those below a specified threshold. @@ -61,17 +67,21 @@ def multivariate_correlation_selector(fsdf: FSDataFrame, :return: List of selected features names """ - colum_vector_features = 'features' + colum_vector_features = "features" sdf = fsdf.get_sdf_vector(output_column_vector=colum_vector_features) # compute correlation matrix - mcorr = _compute_correlation_matrix(sdf, - features_col=colum_vector_features, - corr_method=corr_method) + mcorr = _compute_correlation_matrix( + sdf, features_col=colum_vector_features, corr_method=corr_method + ) mcorr = np.abs(mcorr) # get absolute correlation value - combs_above_cutoff = np.triu(mcorr, k=1) > corr_threshold # create bool matrix that meet criteria - correlated_col_index = tuple(np.column_stack(np.where(combs_above_cutoff))) # get correlated pairs cols index + combs_above_cutoff = ( + np.triu(mcorr, k=1) > corr_threshold + ) # create bool matrix that meet criteria + correlated_col_index = tuple( + np.column_stack(np.where(combs_above_cutoff)) + ) # get correlated pairs cols index index_to_remove = set() if strict: @@ -94,8 +104,9 @@ def multivariate_correlation_selector(fsdf: FSDataFrame, @tag("spark implementation") -def multivariate_variance_selector(fsdf: FSDataFrame, - variance_threshold: float = 0.0) -> List[str]: +def multivariate_variance_selector( + fsdf: FSDataFrame, variance_threshold: float = 0.0 +) -> List[str]: """ Select features after removing low-variance ones (e.g., features with quasi-constant value across samples). @@ -105,15 +116,15 @@ def multivariate_variance_selector(fsdf: FSDataFrame, :return: List of selected features names """ - colum_vector_features = 'features' + colum_vector_features = "features" sdf = fsdf.get_sdf_vector(output_column_vector=colum_vector_features) selector = VarianceThresholdSelector() - (selector - .setFeaturesCol(colum_vector_features) - .setOutputCol("selectedFeatures") - .setVarianceThreshold(variance_threshold) - ) + ( + selector.setFeaturesCol(colum_vector_features) + .setOutputCol("selectedFeatures") + .setVarianceThreshold(variance_threshold) + ) model = selector.fit(sdf) selected_features_indices = set(model.selectedFeatures) @@ -122,9 +133,9 @@ def multivariate_variance_selector(fsdf: FSDataFrame, return selected_features -def multivariate_filter(fsdf: FSDataFrame, - multivariate_method: str = 'm_corr', - **kwargs) -> FSDataFrame: +def multivariate_filter( + fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs +) -> FSDataFrame: """ Filter features after applying a multivariate feature selector method. @@ -139,8 +150,10 @@ def multivariate_filter(fsdf: FSDataFrame, elif multivariate_method == MULTIVARIATE_VARIANCE: selected_features = multivariate_variance_selector(fsdf, **kwargs) else: - raise ValueError(f"Invalid multivariate method: {multivariate_method}. " - f"Choose one of {MULTIVARIATE_METHODS.keys()}.") + raise ValueError( + f"Invalid multivariate method: {multivariate_method}. " + f"Choose one of {MULTIVARIATE_METHODS.keys()}." + ) logger.info(f"Applying multivariate filter {multivariate_method}.") diff --git a/fslite/fs/univariate.py b/fslite/fs/univariate.py index ddf3ac6..ee53b22 100644 --- a/fslite/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -32,7 +32,9 @@ def compute_univariate_corr(df: FSDataFrame) -> Dict[int, float]: } -def univariate_correlation_selector(df: FSDataFrame, corr_threshold: float = 0.3) -> List[int]: +def univariate_correlation_selector( + df: FSDataFrame, corr_threshold: float = 0.3 +) -> List[int]: """ Select features based on their correlation with a label (class), if the correlation value is less than the specified threshold. @@ -43,12 +45,22 @@ def univariate_correlation_selector(df: FSDataFrame, corr_threshold: float = 0.3 :return: List of selected feature indices """ correlations = compute_univariate_corr(df) - selected_features = [feature_index for feature_index, corr in correlations.items() if corr <= corr_threshold] + selected_features = [ + feature_index + for feature_index, corr in correlations.items() + if corr <= corr_threshold + ] return selected_features -def univariate_selector(df: pd.DataFrame, features: List[str], label: str, label_type: str = 'categorical', - selection_mode: str = 'percentile', selection_threshold: float = 0.8) -> List[str]: +def univariate_selector( + df: pd.DataFrame, + features: List[str], + label: str, + label_type: str = "categorical", + selection_mode: str = "percentile", + selection_threshold: float = 0.8, +) -> List[str]: """ Wrapper for scikit-learn's `SelectKBest` feature selector. If the label is categorical, ANOVA test is used; if continuous, F-regression test is used. @@ -66,20 +78,24 @@ def univariate_selector(df: pd.DataFrame, features: List[str], label: str, label X = df[features].values y = df[label].values - if label_type == 'categorical': + if label_type == "categorical": logger.info("ANOVA (F-classification) univariate feature selection") selector = SelectKBest(score_func=f_classif) - elif label_type == 'continuous': + elif label_type == "continuous": logger.info("F-value (F-regression) univariate feature selection") selector = SelectKBest(score_func=f_regression) else: raise ValueError("`label_type` must be one of 'categorical' or 'continuous'") - if selection_mode == 'percentile': - selector.set_params(k='all') # We'll handle the percentile threshold manually + if selection_mode == "percentile": + selector.set_params(k="all") # We'll handle the percentile threshold manually selector.fit(X, y) scores = selector.scores_ - selected_indices = [i for i, score in enumerate(scores) if score >= selection_threshold * max(scores)] + selected_indices = [ + i + for i, score in enumerate(scores) + if score >= selection_threshold * max(scores) + ] else: selector.set_params(k=int(selection_threshold)) selector.fit(X, y) @@ -89,9 +105,9 @@ def univariate_selector(df: pd.DataFrame, features: List[str], label: str, label return selected_features -def univariate_filter(df: FSDataFrame, - univariate_method: str = 'u_corr', - **kwargs) -> FSDataFrame: +def univariate_filter( + df: FSDataFrame, univariate_method: str = "u_corr", **kwargs +) -> FSDataFrame: """ Filter features after applying a univariate feature selector method. @@ -102,22 +118,24 @@ def univariate_filter(df: FSDataFrame, """ if not is_valid_univariate_method(univariate_method): - raise NotImplementedError("The provided method {} is not implented !! please select one from this list {}".format(univariate_method, get_fs_univariate_methods())) + raise NotImplementedError( + "The provided method {} is not implemented !! please select one from this list {}".format( + univariate_method, get_fs_univariate_methods() + ) + ) selected_features = [] - if univariate_method == 'anova': + if univariate_method == "anova": # TODO: Implement ANOVA selector # selected_features = univariate_selector(df, features, label, label_type='categorical', **kwargs) pass - elif univariate_method == 'f_regression': + elif univariate_method == "f_regression": # TODO: Implement F-regression selector # selected_features = univariate_selector(df, features, label, label_type='continuous', **kwargs) pass - elif univariate_method == 'u_corr': + elif univariate_method == "u_corr": selected_features = univariate_correlation_selector(df, **kwargs) - else: - raise ValueError(f"Univariate method {univariate_method} not supported.") logger.info(f"Applying univariate filter using method: {univariate_method}") diff --git a/fslite/fs/utils.py b/fslite/fs/utils.py index 9fc6a70..80ca059 100644 --- a/fslite/fs/utils.py +++ b/fslite/fs/utils.py @@ -30,7 +30,8 @@ def compute_missingness_rate(fsdf: FSDataFrame) -> Dict[str, float]: missing_rates = sdf.select( [ ( - f.sum(f.when(f.isnan(sdf[c]) | f.isnull(sdf[c]), 1).otherwise(0)) / n_instances + f.sum(f.when(f.isnan(sdf[c]) | f.isnull(sdf[c]), 1).otherwise(0)) + / n_instances ).alias(c) for c in features ] @@ -40,7 +41,7 @@ def compute_missingness_rate(fsdf: FSDataFrame) -> Dict[str, float]: def remove_features_by_missingness_rate( - fsdf: FSDataFrame, threshold: float = 0.15 + fsdf: FSDataFrame, threshold: float = 0.15 ) -> FSDataFrame: """ Remove features from FSDataFrame with missingness rate higher or equal than a specified threshold. @@ -107,7 +108,9 @@ def find_maximal_independent_set(pairs: Tuple[int], keep: bool = True) -> Set[in :return: Set of indices (maximal independent set or remaining indices). """ - logger.warning("This method is experimental and have been not extensively tested...") + logger.warning( + "This method is experimental and have been not extensively tested..." + ) graph = nx.Graph() graph.add_edges_from(pairs) diff --git a/fslite/pipeline/fs_pipeline_example.py b/fslite/pipeline/fs_pipeline_example.py index 3c4c498..32159e9 100644 --- a/fslite/pipeline/fs_pipeline_example.py +++ b/fslite/pipeline/fs_pipeline_example.py @@ -13,13 +13,15 @@ from fslite.utils.io import import_table_as_psdf # Init spark -init_spark(apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True) +init_spark( + apply_pyarrow_settings=True, + apply_extra_spark_settings=True, + apply_pandas_settings=True, +) # 1. Import data df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) -fsdf = FSDataFrame(df, sample_col='Sample', label_col='label') +fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") # 2. Split data training_data, testing_data = fsdf.split_df(split_training_factor=0.6) @@ -27,33 +29,32 @@ # 3. Set feature selection methods # create a Univariate object univariate = FSUnivariate( - fs_method='anova', - selection_mode='percentile', - selection_threshold=0.8) + fs_method="anova", selection_mode="percentile", selection_threshold=0.8 +) # create a Multivariate object multivariate = FSMultivariate( - fs_method='m_corr', - corr_threshold=0.75, - corr_method="pearson" + fs_method="m_corr", corr_threshold=0.75, corr_method="pearson" ) # create a MLMethod object rf_classifier = FSMLMethod( - fs_method='rf_multilabel', + fs_method="rf_multilabel", rfe=True, rfe_iterations=2, percent_to_keep=0.9, - estimator_params={'labelCol': 'label'}, - evaluator_params={'metricName': 'accuracy'}, - grid_params={'numTrees': [10, 15], 'maxDepth': [5, 10]}, - cv_params={'parallelism': 2, 'numFolds': 5} + estimator_params={"labelCol": "label"}, + evaluator_params={"metricName": "accuracy"}, + grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]}, + cv_params={"parallelism": 2, "numFolds": 5}, ) # 4. Create a pipeline object -fs_pipeline = FSPipeline(df_training=training_data, - df_testing=testing_data, - fs_stages=[univariate, multivariate, rf_classifier]) +fs_pipeline = FSPipeline( + df_training=training_data, + df_testing=testing_data, + fs_stages=[univariate, multivariate, rf_classifier], +) # 5. Run the pipeline results = fs_pipeline.run() @@ -61,7 +62,7 @@ # Print results print(f"Accuracy on training: {results['training_metric']}") print(f"Accuracy on testing: {results['testing_metric']}") -print(results.get('feature_scores')) +print(results.get("feature_scores")) stop_spark_session() diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py index eb7d677..ccc6f19 100644 --- a/fslite/tests/generate_big_tests.py +++ b/fslite/tests/generate_big_tests.py @@ -5,6 +5,7 @@ import pyarrow as pa import pyarrow.parquet as pq + def test_generate_big_dataset(): # Parameters for the dataset n_samples = 1200 @@ -13,15 +14,19 @@ def test_generate_big_dataset(): # Generate sample IDs and labels sample_ids = np.arange(1, n_samples + 1) - labels = np.random.choice(['LV', 'RV', 'LA', 'RA'], size=n_samples) + labels = np.random.choice(["LV", "RV", "LA", "RA"], size=n_samples) # Parquet schema definition - schema = pa.schema([pa.field('sample_id', pa.int32()), pa.field('label', pa.string())] + - [pa.field(f'feature{i}', pa.float32()) for i in range(1, n_features + 1)]) + schema = pa.schema( + [pa.field("sample_id", pa.int32()), pa.field("label", pa.string())] + + [pa.field(f"feature{i}", pa.float32()) for i in range(1, n_features + 1)] + ) # Create an empty Parquet file - output_file = 'large_dataset_optimized_samples_{}_features_{}.parquet'.format(n_samples, n_features) - with pq.ParquetWriter(output_file, schema, compression='snappy') as writer: + output_file = "large_dataset_optimized_samples_{}_features_{}.parquet".format( + n_samples, n_features + ) + with pq.ParquetWriter(output_file, schema, compression="snappy") as writer: # Process in chunks to reduce memory usage for chunk_start in range(0, n_samples, chunk_size): chunk_end = min(chunk_start + chunk_size, n_samples) @@ -31,13 +36,13 @@ def test_generate_big_dataset(): chunk_labels = labels[chunk_start:chunk_end] # Generate chunk of features - chunk_features = {f'feature{i}': np.random.rand(chunk_end - chunk_start) for i in range(1, n_features + 1)} + chunk_features = { + f"feature{i}": np.random.rand(chunk_end - chunk_start) + for i in range(1, n_features + 1) + } # Create DataFrame chunk - chunk_data = { - 'sample_id': chunk_sample_ids, - 'label': chunk_labels - } + chunk_data = {"sample_id": chunk_sample_ids, "label": chunk_labels} chunk_data.update(chunk_features) df_chunk = pd.DataFrame(chunk_data) @@ -45,7 +50,6 @@ def test_generate_big_dataset(): # Convert to PyArrow Table and write chunk to Parquet file table_chunk = pa.Table.from_pandas(df_chunk, schema=schema) writer.write_table(table_chunk) - logging.info(f'Processed samples {chunk_start + 1} to {chunk_end}') + logging.info(f"Processed samples {chunk_start + 1} to {chunk_end}") print("Optimized Parquet file created successfully!") - diff --git a/fslite/tests/test_data_preprocessing.py b/fslite/tests/test_data_preprocessing.py index 9e35ad7..a247491 100644 --- a/fslite/tests/test_data_preprocessing.py +++ b/fslite/tests/test_data_preprocessing.py @@ -4,7 +4,11 @@ from fslite.config.context import init_spark, stop_spark_session from fslite.fs.core import FSDataFrame -from fslite.fs.utils import compute_missingness_rate, remove_features_by_missingness_rate, impute_missing +from fslite.fs.utils import ( + compute_missingness_rate, + remove_features_by_missingness_rate, + impute_missing, +) from fslite.utils.datasets import get_tnbc_data_missing_values_path from fslite.utils.io import import_table_as_psdf @@ -16,9 +20,11 @@ class TestDataPreprocessing(unittest.TestCase): """ def setUp(self) -> None: - init_spark(apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True) + init_spark( + apply_pyarrow_settings=True, + apply_extra_spark_settings=True, + apply_pandas_settings=True, + ) def tearDown(self) -> None: stop_spark_session() @@ -32,7 +38,7 @@ def import_FSDataFrame() -> FSDataFrame: :return: """ df = import_table_as_psdf(get_tnbc_data_missing_values_path(), n_partitions=5) - fsdf = FSDataFrame(df, sample_col='Sample', label_col='label') + fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") return fsdf def test_compute_missingness_rate(self): @@ -43,8 +49,8 @@ def test_compute_missingness_rate(self): fsdf = self.import_FSDataFrame() features_missing_rates = compute_missingness_rate(fsdf) - self.assertEqual(features_missing_rates.get('tr|E9PBJ4'), 0.0) - self.assertAlmostEqual(features_missing_rates.get('sp|P07437'), 0.295, places=2) + self.assertEqual(features_missing_rates.get("tr|E9PBJ4"), 0.0) + self.assertAlmostEqual(features_missing_rates.get("sp|P07437"), 0.295, places=2) def test_filter_by_missingness_rate(self): """ @@ -66,7 +72,7 @@ def test_impute_missing(self): """ fsdf = self.import_FSDataFrame() - fsdf = impute_missing(fsdf, strategy='mean') + fsdf = impute_missing(fsdf, strategy="mean") # Collect features as array array = fsdf._collect_features_as_array() @@ -75,5 +81,5 @@ def test_impute_missing(self): self.assertFalse(np.isnan(array).any()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/fslite/tests/test_fs_pipeline.py b/fslite/tests/test_fs_pipeline.py index ca5517e..6b8176e 100644 --- a/fslite/tests/test_fs_pipeline.py +++ b/fslite/tests/test_fs_pipeline.py @@ -10,9 +10,11 @@ class FeatureSelectionPipelineTest(unittest.TestCase): def setUp(self) -> None: - init_spark(apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True) + init_spark( + apply_pyarrow_settings=True, + apply_extra_spark_settings=True, + apply_pandas_settings=True, + ) def tearDown(self) -> None: stop_spark_session() @@ -20,7 +22,7 @@ def tearDown(self) -> None: @staticmethod def import_FSDataFrame(): df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) - fsdf = FSDataFrame(df, sample_col='Sample', label_col='label') + fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") return fsdf def test_feature_selection_pipeline(self): @@ -30,33 +32,32 @@ def test_feature_selection_pipeline(self): # create a Univariate object univariate = FSUnivariate( - fs_method='anova', - selection_mode='percentile', - selection_threshold=0.8) + fs_method="anova", selection_mode="percentile", selection_threshold=0.8 + ) # create a Multivariate object multivariate = FSMultivariate( - fs_method='m_corr', - corr_threshold=0.75, - corr_method="pearson" + fs_method="m_corr", corr_threshold=0.75, corr_method="pearson" ) # create a MLMethod object rf_classifier = FSMLMethod( - fs_method='rf_multilabel', + fs_method="rf_multilabel", rfe=True, rfe_iterations=2, percent_to_keep=0.9, - estimator_params={'labelCol': 'label'}, - evaluator_params={'metricName': 'accuracy'}, - grid_params={'numTrees': [10, 15], 'maxDepth': [5, 10]}, - cv_params={'parallelism': 2, 'numFolds': 5} + estimator_params={"labelCol": "label"}, + evaluator_params={"metricName": "accuracy"}, + grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]}, + cv_params={"parallelism": 2, "numFolds": 5}, ) # create a pipeline object - fs_pipeline = FSPipeline(df_training=training_data, - df_testing=testing_data, - fs_stages=[univariate, multivariate, rf_classifier]) + fs_pipeline = FSPipeline( + df_training=training_data, + df_testing=testing_data, + fs_stages=[univariate, multivariate, rf_classifier], + ) # run the pipeline results = fs_pipeline.run() @@ -64,8 +65,8 @@ def test_feature_selection_pipeline(self): # print results print(results) - assert results.get('training_metric') > 0.9 + assert results.get("training_metric") > 0.9 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/fslite/tests/test_fsdataframe.py b/fslite/tests/test_fsdataframe.py index 6b67863..adda1bb 100644 --- a/fslite/tests/test_fsdataframe.py +++ b/fslite/tests/test_fsdataframe.py @@ -6,70 +6,69 @@ from fslite.fs.fdataframe import FSDataFrame + def test_initializes_fsdataframe(): # Create a sample DataFrame data = { - 'sample_id': [1, 2, 3], - 'label': ['A', 'B', 'C'], - 'feature1': [0.1, 0.2, 0.3], - 'feature2': [1.1, 1.2, 1.3] + "sample_id": [1, 2, 3], + "label": ["A", "B", "C"], + "feature1": [0.1, 0.2, 0.3], + "feature2": [1.1, 1.2, 1.3], } df = pd.DataFrame(data) # Initialize FSDataFrame - fs_df = FSDataFrame( - df=df, - sample_col='sample_id', - label_col='label' - ) + fs_df = FSDataFrame(df=df, sample_col="sample_id", label_col="label") # Assertions to check if the initialization is correct assert isinstance(fs_df, FSDataFrame) - assert fs_df.get_sample_col_name() == 'sample_id' + assert fs_df.get_sample_col_name() == "sample_id" + def test_scaler_df(): # Create a sample DataFrame data = { - 'sample_id': [1, 2, 3], - 'label': ['A', 'B', 'C'], - 'feature1': [0.1, 0.2, 0.3], - 'feature2': [1.1, 1.2, 1.3] + "sample_id": [1, 2, 3], + "label": ["A", "B", "C"], + "feature1": [0.1, 0.2, 0.3], + "feature2": [1.1, 1.2, 1.3], } df = pd.DataFrame(data) # Initialize FSDataFrame - fs_df = FSDataFrame( - df=df, - sample_col='sample_id', - label_col='label' - ) + fs_df = FSDataFrame(df=df, sample_col="sample_id", label_col="label") # Scale the DataFrame - fs_df.scale_features(scaler_method='standard') + fs_df.scale_features(scaler_method="standard") # Assertions to check if the scaling is correct assert fs_df.is_scaled() == True - assert fs_df.get_scaled_method() == 'standard' + assert fs_df.get_scaled_method() == "standard" + def test_memory_fsdataframe(): - def create_test_data(n_samples: int, n_features: int, zero_prob: float = 0.1, nan_prob: float = 0.05): + def create_test_data( + n_samples: int, n_features: int, zero_prob: float = 0.1, nan_prob: float = 0.05 + ): """Create test data for FSDataFrame.""" data = np.random.rand(n_samples, n_features) data[np.random.rand(n_samples, n_features) < zero_prob] = 0 data[np.random.rand(n_samples, n_features) < nan_prob] = np.nan - df = pd.DataFrame(data, columns=[f'feature_{i}' for i in range(n_features)]) - df['sample_id'] = [f'sample_{i}' for i in range(n_samples)] - df['label'] = np.random.choice(['A', 'B'], n_samples) + df = pd.DataFrame(data, columns=[f"feature_{i}" for i in range(n_features)]) + df["sample_id"] = [f"sample_{i}" for i in range(n_samples)] + df["label"] = np.random.choice(["A", "B"], n_samples) return df - def measure_memory_usage(n_samples: int, n_features: int, nan_prob = 0.01) -> float: + def measure_memory_usage(n_samples: int, n_features: int, nan_prob=0.01) -> float: """Measure memory usage of FSDataFrame for given number of samples and features.""" df = create_test_data(n_samples, n_features, nan_prob=nan_prob) - mem_usage = memory_usage((FSDataFrame, (df, 'sample_id', 'label')), max_iterations=1)[0] + mem_usage = memory_usage( + (FSDataFrame, (df, "sample_id", "label")), max_iterations=1 + )[0] gc.collect() # Force garbage collection to free memory return mem_usage @@ -84,25 +83,36 @@ def measure_memory_usage(n_samples: int, n_features: int, nan_prob = 0.01) -> fl for n_features in feature_sizes: for prob in nan_prob: mem_usage = measure_memory_usage(n_samples, n_features, nan_prob=prob) - results.append((n_samples, n_features, prob, mem_usage)) # Append prob to results + results.append( + (n_samples, n_features, prob, mem_usage) + ) # Append prob to results # Convert results to DataFrame - results_df = pd.DataFrame(results, columns=['Samples', 'Features', 'NAN Prob', 'Memory (MB)']) + results_df = pd.DataFrame( + results, columns=["Samples", "Features", "NAN Prob", "Memory (MB)"] + ) # Create 2D line plot plt.figure(figsize=(12, 8)) for feature_size in feature_sizes: for prob in nan_prob: - data = results_df[(results_df['Features'] == feature_size) & (results_df['NAN Prob'] == prob)] - plt.plot(data['Samples'], data['Memory (MB)'], marker='o', - label=f'{feature_size} Features - {prob} NAN Prob') - - plt.xlabel('Number of Samples') - plt.ylabel('Memory Usage (MB)') - plt.title('FSDataFrame Memory Usage') + data = results_df[ + (results_df["Features"] == feature_size) + & (results_df["NAN Prob"] == prob) + ] + plt.plot( + data["Samples"], + data["Memory (MB)"], + marker="o", + label=f"{feature_size} Features - {prob} NAN Prob", + ) + + plt.xlabel("Number of Samples") + plt.ylabel("Memory Usage (MB)") + plt.title("FSDataFrame Memory Usage") plt.legend() - plt.xscale('log') # Using log scale for x-axis to better visualize the range + plt.xscale("log") # Using log scale for x-axis to better visualize the range plt.tight_layout() plt.show() @@ -110,4 +120,3 @@ def measure_memory_usage(n_samples: int, n_features: int, nan_prob = 0.01) -> fl print(results_df.to_string(index=False)) # Initialize FSDataFrame with DataFrame having sparse numerical features and insufficient memory for dense matrix - diff --git a/fslite/tests/test_import_export.py b/fslite/tests/test_import_export.py index 05d801f..507f379 100644 --- a/fslite/tests/test_import_export.py +++ b/fslite/tests/test_import_export.py @@ -11,9 +11,11 @@ class TestImportExport(unittest.TestCase): def setUp(self) -> None: - init_spark(apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True) + init_spark( + apply_pyarrow_settings=True, + apply_extra_spark_settings=True, + apply_pandas_settings=True, + ) def tearDown(self) -> None: stop_spark_session() @@ -23,8 +25,7 @@ def test_import_tsv(self): Test import tsv file as Spark DataFrame. :return: None """ - df = import_table(path=get_tnbc_data_path(), - n_partitions=10) + df = import_table(path=get_tnbc_data_path(), n_partitions=10) self.assertIsInstance(df, pyspark.sql.DataFrame) self.assertEqual(df.count(), 44) @@ -34,12 +35,11 @@ def test_import_tsv_as_psdf(self): Test import tsv file as Pandas on Spark DataFrame (PoS). :return: None """ - df = import_table_as_psdf(path=get_tnbc_data_path(), - n_partitions=10) + df = import_table_as_psdf(path=get_tnbc_data_path(), n_partitions=10) self.assertIsInstance(df, ps.frame.DataFrame) - self.assertEqual(df.shape, (44, 502)) + self.assertEqual(df.shape, (44, 502)) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/fslite/tests/test_ml_methods.py b/fslite/tests/test_ml_methods.py index 1afd46f..5b624d5 100644 --- a/fslite/tests/test_ml_methods.py +++ b/fslite/tests/test_ml_methods.py @@ -1,9 +1,10 @@ import unittest -from pyspark.ml.classification import (RandomForestClassifier, - LogisticRegression) -from pyspark.ml.evaluation import (BinaryClassificationEvaluator, - MulticlassClassificationEvaluator) +from pyspark.ml.classification import RandomForestClassifier, LogisticRegression +from pyspark.ml.evaluation import ( + BinaryClassificationEvaluator, + MulticlassClassificationEvaluator, +) from fslite.config.context import init_spark, stop_spark_session from fslite.fs.core import FSDataFrame @@ -15,9 +16,11 @@ class MLMethodTest(unittest.TestCase): def setUp(self) -> None: - init_spark(apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True) + init_spark( + apply_pyarrow_settings=True, + apply_extra_spark_settings=True, + apply_pandas_settings=True, + ) def tearDown(self) -> None: stop_spark_session() @@ -25,20 +28,20 @@ def tearDown(self) -> None: @staticmethod def import_FSDataFrame(): df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) - fsdf = FSDataFrame(df, sample_col='Sample', label_col='label') + fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") return fsdf def test_build_model_using_cross_validator(self): fsdf = self.import_FSDataFrame() estimator = RandomForestClassifier() evaluator = BinaryClassificationEvaluator() - grid_params = {'numTrees': [10, 20, 30], 'maxDepth': [5, 10, 15]} + grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} ml_method = MLCVModel( estimator=estimator, evaluator=evaluator, estimator_params=None, grid_params=None, - cv_params=None + cv_params=None, ) print(ml_method._cross_validator.__str__()) @@ -51,21 +54,19 @@ def test_get_feature_scores_random_forest_classifier(self): # Create a RandomForestClassifier model estimator = RandomForestClassifier() evaluator = MulticlassClassificationEvaluator() - estimator_params = {'labelCol': 'label'} - grid_params = {'numTrees': [10, 20, 30], 'maxDepth': [5, 10, 15]} - cv_params = {'parallelism': 2, 'numFolds': 5, 'collectSubModels': False} + estimator_params = {"labelCol": "label"} + grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} + cv_params = {"parallelism": 2, "numFolds": 5, "collectSubModels": False} ml_method = MLCVModel( estimator=estimator, evaluator=evaluator, estimator_params=estimator_params, grid_params=grid_params, - cv_params=cv_params + cv_params=cv_params, ) - (ml_method - .fit(fsdf) - ) + (ml_method.fit(fsdf)) # Get the feature scores feature_scores = ml_method.get_feature_scores() @@ -74,11 +75,11 @@ def test_get_feature_scores_random_forest_classifier(self): assert not feature_scores.empty # Assert that the feature scores DataFrame has the expected columns - expected_columns = ['features', 'feature_index', 'scores', 'percentile_rank'] + expected_columns = ["features", "feature_index", "scores", "percentile_rank"] assert list(feature_scores.columns) == expected_columns # check if dataframe is sorted by scores (descending) - assert feature_scores['scores'].is_monotonic_decreasing + assert feature_scores["scores"].is_monotonic_decreasing print(feature_scores) @@ -87,22 +88,20 @@ def test_multilabel_rf_model(self): training_data, testing_data = fsdf.split_df(split_training_factor=0.8) estimator = RandomForestClassifier() - evaluator = MulticlassClassificationEvaluator(metricName='accuracy') - estimator_params = {'labelCol': 'label'} - grid_params = {'numTrees': [5, 10], 'maxDepth': [3, 5]} - cv_params = {'parallelism': 2, 'numFolds': 3} + evaluator = MulticlassClassificationEvaluator(metricName="accuracy") + estimator_params = {"labelCol": "label"} + grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} + cv_params = {"parallelism": 2, "numFolds": 3} ml_method = MLCVModel( estimator=estimator, evaluator=evaluator, estimator_params=estimator_params, grid_params=grid_params, - cv_params=cv_params + cv_params=cv_params, ) - (ml_method - .fit(training_data) - ) + (ml_method.fit(training_data)) # get the accuracy on training eval_training = ml_method.get_eval_metric_on_training() @@ -118,22 +117,20 @@ def test_multilabel_lr_model(self): training_data, testing_data = fsdf.split_df(split_training_factor=0.6) estimator = LogisticRegression() - evaluator = MulticlassClassificationEvaluator(metricName='accuracy') - estimator_params = {'labelCol': 'label'} - grid_params = {'regParam': [0.1, 0.01]} - cv_params = {'parallelism': 2, 'numFolds': 3} + evaluator = MulticlassClassificationEvaluator(metricName="accuracy") + estimator_params = {"labelCol": "label"} + grid_params = {"regParam": [0.1, 0.01]} + cv_params = {"parallelism": 2, "numFolds": 3} ml_method = MLCVModel( estimator=estimator, evaluator=evaluator, estimator_params=estimator_params, grid_params=grid_params, - cv_params=cv_params + cv_params=cv_params, ) - (ml_method - .fit(training_data) - ) + (ml_method.fit(training_data)) # get the accuracy on training eval_training = ml_method.get_eval_metric_on_training() @@ -150,19 +147,19 @@ def test_FSMLMethod(self): fsdf = self.import_FSDataFrame() training_data, testing_data = fsdf.split_df(split_training_factor=0.7) - estimator_params = {'labelCol': 'label'} - grid_params = {'numTrees': [5, 10], 'maxDepth': [3, 5]} - cv_params = {'parallelism': 2, 'numFolds': 3} + estimator_params = {"labelCol": "label"} + grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} + cv_params = {"parallelism": 2, "numFolds": 3} ml_method = FSMLMethod( - fs_method='rf_multilabel', + fs_method="rf_multilabel", rfe=True, rfe_iterations=2, percent_to_keep=0.9, estimator_params=estimator_params, - evaluator_params={'metricName': 'accuracy'}, + evaluator_params={"metricName": "accuracy"}, grid_params=grid_params, - cv_params=cv_params + cv_params=cv_params, ) filtered_fsdf = ml_method.select_features(training_data) @@ -176,5 +173,5 @@ def test_FSMLMethod(self): assert testing_acc > 0.7 -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/fslite/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py index 16ae0a5..e97d636 100644 --- a/fslite/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -4,6 +4,7 @@ from fslite.fs.univariate import univariate_filter + def test_univariate_filter_corr(): """ Test univariate_filter method with 'u_corr' method. @@ -11,17 +12,18 @@ def test_univariate_filter_corr(): """ # import tsv as pandas DataFrame - df = pd.read_csv(get_tnbc_data_path(), sep='\t') + df = pd.read_csv(get_tnbc_data_path(), sep="\t") # create FSDataFrame instance - fs_df = FSDataFrame(df=df,sample_col='Sample',label_col='label') + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") - fsdf_filtered = univariate_filter(fs_df, univariate_method='u_corr', corr_threshold=0.3) + fsdf_filtered = univariate_filter( + fs_df, univariate_method="u_corr", corr_threshold=0.3 + ) assert fs_df.count_features() == 500 assert fsdf_filtered.count_features() == 211 # Export the filtered DataFrame as Pandas DataFrame df_filtered = fsdf_filtered.to_pandas() - df_filtered.to_csv('filtered_tnbc_data.csv', index=False) - + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) diff --git a/fslite/utils/generic.py b/fslite/utils/generic.py index 5674998..9ab70e8 100644 --- a/fslite/utils/generic.py +++ b/fslite/utils/generic.py @@ -11,10 +11,12 @@ def tag(label: str): :param label: tag label :return: decorator """ + def decorator(func): def wrapper(*args, **kwargs): print(f"Tag for {func.__name__}: {label}") return func(*args, **kwargs) return wrapper + return decorator diff --git a/fslite/utils/io.py b/fslite/utils/io.py index 85adb13..74c202c 100644 --- a/fslite/utils/io.py +++ b/fslite/utils/io.py @@ -8,10 +8,9 @@ warnings.filterwarnings("ignore") -def import_table(path: str, - header: bool = True, - sep: str = "\t", - n_partitions: int = 5) -> pyspark.sql.DataFrame: +def import_table( + path: str, header: bool = True, sep: str = "\t", n_partitions: int = 5 +) -> pyspark.sql.DataFrame: """ Import tsv file as Spark DataFrame. @@ -28,19 +27,17 @@ def import_table(path: str, if _sc is None: raise ValueError("Active Spark Session not found...") - sdf = (_sc - .read - .option("delimiter", sep) - .option("header", header) - .option("inferSchema", "true") - .csv(path) - .repartition(n_partitions) - ) + sdf = ( + _sc.read.option("delimiter", sep) + .option("header", header) + .option("inferSchema", "true") + .csv(path) + .repartition(n_partitions) + ) return sdf -def import_parquet(path: str, - header: bool = True) -> pyspark.sql.DataFrame: +def import_parquet(path: str, header: bool = True) -> pyspark.sql.DataFrame: """ Import parquet file as Spark DataFrame. @@ -55,18 +52,13 @@ def import_parquet(path: str, if _sc is None: raise ValueError("Active Spark Session not found...") - sdf = (_sc - .read - .option("header", header) - .option("inferSchema", "true") - .parquet(path) - ) + sdf = _sc.read.option("header", header).option("inferSchema", "true").parquet(path) return sdf -def import_table_as_psdf(path: str, - sep: str = "\t", - n_partitions: int = 5) -> pyspark.pandas.DataFrame: +def import_table_as_psdf( + path: str, sep: str = "\t", n_partitions: int = 5 +) -> pyspark.pandas.DataFrame: """ Import tsv file as Pandas on Spark DataFrame @@ -80,13 +72,10 @@ def import_table_as_psdf(path: str, import pyspark.pandas as ps # apply settings for pandas on spark api - [ps.set_option(k, PANDAS_ON_SPARK_API_SETTINGS.get(k)) - for k in PANDAS_ON_SPARK_API_SETTINGS.keys()] - - psdf = (ps - .read_csv(path, - sep=sep) - .spark - .repartition(n_partitions) - ) + [ + ps.set_option(k, PANDAS_ON_SPARK_API_SETTINGS.get(k)) + for k in PANDAS_ON_SPARK_API_SETTINGS.keys() + ] + + psdf = ps.read_csv(path, sep=sep).spark.repartition(n_partitions) return psdf diff --git a/setup.py b/setup.py index 73c353d..b4bb0c4 100644 --- a/setup.py +++ b/setup.py @@ -4,13 +4,13 @@ long_description = fh.read() setup( - name='fslite', - version='0.0.1', - url='https://github.com/bigbio/fsspark', - license='Apache-2.0', - author='Enrique Audain Martinez', - author_email='enrique.audain@gmail.com', - description='Feature selection in Spark', + name="fslite", + version="0.0.1", + url="https://github.com/bigbio/fsspark", + license="Apache-2.0", + author="Enrique Audain Martinez", + author_email="enrique.audain@gmail.com", + description="Feature selection in Spark", long_description=long_description, long_description_content_type="text/markdown", packages=find_packages(), @@ -20,7 +20,7 @@ "networkx", "setuptools", "pandas", - "pyarrow" + "pyarrow", ], classifiers=[ # Classifiers for your package @@ -28,5 +28,5 @@ "License :: OSI Approved :: MIT License", "Operating System :: POSIX :: Linux", ], - python_requires='>=3.9.0', + python_requires=">=3.9.0", ) From 1fafeb57637ab9b6afe8ab7e37bc886aabd3db99 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 10:19:54 +0100 Subject: [PATCH 24/62] clean more code. --- fslite/fs/constants.py | 8 +- fslite/fs/methods.py | 10 +- fslite/fs/ml.py | 883 +++++++++++++++++++------------------- fslite/fs/multivariate.py | 320 +++++++------- fslite/fs/utils.py | 27 +- 5 files changed, 616 insertions(+), 632 deletions(-) diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index 87ab22b..0a10c92 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -79,7 +79,13 @@ def get_fs_univariate_methods() -> List: Get the list of univariate methods implemented in the library :return: list """ - return get_fs_method_by_class["univariate"] + return get_fs_method_by_class("univariate") + +def get_fs_multivariate_methods() -> List: + return get_fs_method_by_class("multivariate") + +def get_fs_ml_methods() -> List: + return get_fs_method_by_class("ml") def is_valid_univariate_method(method_name: str) -> bool: diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index 7fbb6c6..e0de8ec 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -1,8 +1,8 @@ from abc import ABC, abstractmethod from typing import List, Type, Union, Tuple, Optional, Dict, Any -from fslite.fs.constants import ML_METHODS, UNIVARIATE_METHODS, MULTIVARIATE_METHODS -from fslite.fs.core import FSDataFrame +from fslite.fs.constants import FS_METHODS, get_fs_multivariate_methods, get_fs_ml_methods +from fslite.fs.fdataframe import FSDataFrame from fslite.fs.ml import MLCVModel from fslite.fs.multivariate import multivariate_filter from fslite.fs.univariate import univariate_filter @@ -88,7 +88,7 @@ class FSUnivariate(FSMethod): kwargs (dict): Additional keyword arguments for the feature selection method. """ - valid_methods = list(UNIVARIATE_METHODS.keys()) + valid_methods = list(FS_METHODS.keys()) def __init__(self, fs_method: str, **kwargs): """ @@ -161,7 +161,7 @@ class FSMultivariate(FSMethod): selected_features = fs_multivariate.select_features(fsdf) """ - valid_methods = list(MULTIVARIATE_METHODS.keys()) + valid_methods = list(get_fs_multivariate_methods()) def __init__(self, fs_method: str, **kwargs): """ @@ -225,7 +225,7 @@ class FSMLMethod(FSMethod): kwargs (dict): Additional keyword arguments for the feature selection method. """ - valid_methods = list(ML_METHODS.keys()) + valid_methods = list(get_fs_ml_methods()) _ml_model: MLCVModel = None def __init__( diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index bf97fae..362d793 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -1,452 +1,431 @@ -""" - -A set of pre-defined ML algorithms wrapped with cross-validation approach -for feature selection (e.g., rank by feature importance) and prediction. - -""" - -import warnings -from typing import List, Any, Dict, Optional, Union - -import pandas as pd -from pyspark.ml import Estimator, Model -from pyspark.ml.classification import ( - RandomForestClassificationModel, - LinearSVCModel, - RandomForestClassifier, - LinearSVC, - LogisticRegression, - LogisticRegressionModel, -) -from pyspark.ml.evaluation import ( - Evaluator, - BinaryClassificationEvaluator, - MulticlassClassificationEvaluator, - RegressionEvaluator, -) -from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor -from pyspark.ml.tuning import ( - CrossValidator, - ParamGridBuilder, - CrossValidatorModel, - Param, -) - -from fslite.fs.constants import ( - RF_BINARY, - LSVC_BINARY, - FM_BINARY, - RF_MULTILABEL, - LR_MULTILABEL, - RF_REGRESSION, - FM_REGRESSION, - ML_METHODS, -) -from fslite.fs.core import FSDataFrame - -ESTIMATORS_CLASSES = [ - RandomForestClassifier, - RandomForestRegressionModel, - LinearSVC, - LogisticRegression, -] -EVALUATORS_CLASSES = [ - BinaryClassificationEvaluator, - MulticlassClassificationEvaluator, - RegressionEvaluator, -] - - -# Define an abstract class that allow to create a factory of models -# with the same interface -# This class allows to perform the following operations: -# - Define an Estimator -# - Define an Evaluator -# - Define a grid of parameters (model tuning) -# - Define a cross-validator (model fitting) -class MLCVModel: - """ - A factory class for creating various machine learning models with Spark MLlib. - ML model are created using a cross-validator approach for hyperparameter tuning. - """ - - _cross_validator: CrossValidator = None - _fitted_cv_model: CrossValidatorModel = None - _best_model: Model = None - _fsdf: FSDataFrame = None - - def __init__( - self, - estimator: Union[ - RandomForestClassifier - | RandomForestRegressionModel - | LinearSVC - | LogisticRegression - ], - evaluator: Union[ - BinaryClassificationEvaluator - | MulticlassClassificationEvaluator - | RegressionEvaluator - ], - estimator_params: Optional[Dict[str, Any]] = None, - evaluator_params: Optional[Dict[str, Any]] = None, - grid_params: Optional[Dict[str, List[Any]]] = None, - cv_params: Optional[Dict[str, Any]] = None, - ): - """ - Initializes the MLModel with optional estimator, evaluator, and parameter specifications. - """ - self.estimator = estimator - self.evaluator = evaluator - self.estimator_params = estimator_params - self.evaluator_params = evaluator_params - self.grid_params = grid_params - self.cv_params = cv_params - - self._initialize_model() - - def _initialize_model(self): - # Validate and set estimator parameters - if self.estimator: - self._validate_estimator(self.estimator) - self._validate_estimator_params(self.estimator_params) - self._set_estimator_params() - - # Validate and evaluator - if self.evaluator: - self._validate_evaluator(self.evaluator) - self._validate_evaluator_params(self.evaluator_params) - self._set_evaluator_params() - - # Parse and set grid parameters - if self.grid_params: - self.grid_params = self._parse_grid_params(self.grid_params) - - # Initialize and set cross-validator parameters - self._set_cross_validator() - - def _parse_grid_params( - self, grid_params: Dict[str, List[Any]] - ) -> List[Dict[Param, Any]]: - """ - Parse the grid parameters to create a list of dictionaries. - - :param grid_params: A dictionary containing the parameter names as keys and a list of values as values. - :return: A list of dictionaries, where each dictionary represents a set of parameter values. - """ - grid = ParamGridBuilder() - for param, values in grid_params.items(): - if hasattr(self.estimator, param): - grid = grid.addGrid(getattr(self.estimator, param), values) - else: - raise AttributeError( - f"{self.estimator.__class__.__name__} does not have attribute {param}" - ) - return grid.build() - - def _validate_estimator(self, estimator: Estimator) -> "MLCVModel": - """ - Validate the estimator. - - :param estimator: The estimator to validate. - :return: The validated estimator. - """ - # check estimator is an instance of ESTIMATORS_CLASSES - if not isinstance(estimator, tuple(ESTIMATORS_CLASSES)): - raise ValueError(f"Estimator must be an instance of {ESTIMATORS_CLASSES}") - return self - - def _validate_evaluator(self, evaluator: Evaluator) -> "MLCVModel": - """ - Validate the evaluator. - - :param evaluator: The evaluator to validate. - :return: The validated evaluator. - """ - # check evaluator is an instance of EVALUATORS_CLASSES - if not isinstance(evaluator, tuple(EVALUATORS_CLASSES)): - raise ValueError(f"Evaluator must be an instance of {EVALUATORS_CLASSES}") - return self - - def _validate_estimator_params(self, estimator_params: Dict[str, Any]) -> None: - """ - Validate the estimator parameters. - - :param estimator_params: A dictionary containing the parameter names as keys and values as values. - """ - if estimator_params is None: - return - for param, _ in estimator_params.items(): - if not self.estimator.hasParam(param): - raise AttributeError( - f"{self.estimator.__class__.__name__} does not have attribute {param}" - ) - - def _validate_evaluator_params(self, evaluator_params: Dict[str, Any]) -> None: - """ - Validate the evaluator parameters. - - :param evaluator_params: A dictionary containing the parameter names as keys and values as values. - """ - if evaluator_params is None: - return - for param, _ in evaluator_params.items(): - if not self.evaluator.hasParam(param): - raise AttributeError( - f"{self.evaluator.__class__.__name__} does not have attribute {param}" - ) - - def _set_evaluator_params(self) -> "MLCVModel": - """ - Set evaluator parameters. - """ - if self.evaluator_params is not None: - self.evaluator = self.evaluator.setParams(**self.evaluator_params) - return self - - def _set_estimator_params(self) -> "MLCVModel": - """ - Set estimator parameters. - """ - if self.estimator_params is not None: - self.estimator = self.estimator.setParams(**self.estimator_params) - return self - - def _set_cv_params(self, cv_params: Dict[str, Any]) -> "MLCVModel": - """ - Parse the cross-validator parameters to create an instance of CrossValidator. - - :param cv_params: A dictionary containing the parameter names as keys and values as values. - :return: An instance of CrossValidator. - """ - - for param, value in cv_params.items(): - if hasattr(self._cross_validator, param): - setattr(self._cross_validator, param, value) - else: - raise AttributeError( - f"{self._cross_validator.__class__.__name__} does not have attribute {param}" - ) - return self - - def _set_cross_validator(self) -> "MLCVModel": - """ - Build the model using the cross-validator. - - :return: The CrossValidator model. - """ - try: - self._cross_validator = CrossValidator( - estimator=self.estimator, - estimatorParamMaps=self.grid_params, - evaluator=self.evaluator, - ) - if self.cv_params is not None: - self._cross_validator = self._cross_validator.setParams( - **self.cv_params - ) - return self - except Exception as e: - print(f"An error occurred while creating the CrossValidator: {str(e)}") - # Handle the exception or raise it to be handled by the caller - raise - - def fit(self, fsdf: FSDataFrame) -> "MLCVModel": - """ - Fit the model using the cross-validator. - - :return: The CrossValidatorModel after fitting. - """ - # Extract the Spark DataFrame and label column name from FSDataFrame - self._fsdf = fsdf - - if ( - self._cross_validator is None - or self.estimator is None - or self.evaluator is None - ): - raise ValueError( - "Cross-validator, estimator, or evaluator not set properly." - ) - - self._fitted_cv_model = self._cross_validator.fit(self._fsdf.get_sdf_vector()) - return self - - def _get_best_model(self) -> Model: - """ - Get the best model from the fitted CrossValidatorModel. - - :return: The best model. - """ - if self._fitted_cv_model is None: - raise ValueError( - "CrossValidatorModel not fitted. Use fit() to fit the model." - ) - self._best_model = self._fitted_cv_model.bestModel - return self._best_model - - # define a static method that allows to set a ml model based on the model type - @staticmethod - def create_model( - model_type: str, - estimator_params: Dict[str, Any] = None, - evaluator_params: Dict[str, Any] = None, - grid_params: Dict[str, List[Any]] = None, - cv_params: Dict[str, Any] = None, - ) -> "MLCVModel": - """ - Set a machine learning model based on the model type. - - :param model_type: The type of model to set. - :param estimator_params: Parameters for the estimator. - :param evaluator_params: Parameters for the evaluator. - :param grid_params: A dictionary containing the parameter names as keys and a list of values as values. - :param cv_params: Parameters for the cross-validator. - - :return: An instance of MLModel. - """ - if model_type == RF_BINARY: - estimator = RandomForestClassifier() - evaluator = BinaryClassificationEvaluator() - elif model_type == LSVC_BINARY: - estimator = LinearSVC() - evaluator = BinaryClassificationEvaluator() - elif model_type == RF_MULTILABEL: - estimator = RandomForestClassifier() - evaluator = MulticlassClassificationEvaluator() - elif model_type == LR_MULTILABEL: - estimator = LogisticRegression() - evaluator = MulticlassClassificationEvaluator() - elif model_type == RF_REGRESSION: - estimator = RandomForestRegressor() - evaluator = RegressionEvaluator() - else: - raise ValueError( - f"Unsupported model type: {model_type}." - f"Supported model types are: {list(ML_METHODS.keys())}" - ) - - ml_method = MLCVModel( - estimator=estimator, - evaluator=evaluator, - estimator_params=estimator_params, - evaluator_params=evaluator_params, - grid_params=grid_params, - cv_params=cv_params, - ) - - return ml_method - - def get_eval_metric_name(self) -> str: - """ - Get the evaluation metric name. - - :return: The evaluation metric name. - """ - return self.evaluator.getMetricName() - - def get_feature_scores(self) -> pd.DataFrame: - - # TODO: This function should be able to parse all available models. - - indexed_features = self._fsdf.get_features_indexed() - best_model = self._get_best_model() - - # raise exception if the model is not none - if best_model is None: - raise ValueError( - "No ML model have been fitted. Use fit() to fit the model." - ) - - df_features = pd.DataFrame(indexed_features.to_numpy(), columns=["features"]) - - if isinstance( - best_model, (RandomForestClassificationModel, RandomForestRegressionModel) - ): - df_scores = pd.DataFrame( - data=best_model.featureImportances.toArray(), columns=["scores"] - ) - - df_scores = df_scores.reset_index(level=0).rename( - columns={"index": "feature_index"} - ) - - # merge the feature scores with the feature names - df = df_features.merge( - df_scores, how="right", left_index=True, right_index=True - ) # index-to-index merging - - # sort the dataframe by scores in descending order - df = df.sort_values(by="scores", ascending=False) - - # add feature percentile rank to the features_scores dataframe - df["percentile_rank"] = df["scores"].rank(pct=True) - - return df - - else: - raise ValueError( - "Unsupported model type. " - "Only RandomForestClassificationModel, " - "RandomForestRegressionModel, and LinearSVCModel are supported." - ) - - def get_eval_metric_on_training(self) -> float: - """ - Get the evaluation metric on training data from a trained CrossValidatorModel (best model). - - :return: A dictionary containing the evaluation metric name and value. - """ - - # TODO: This function should be able to parse all available models. - - # get the best model from the fitted cross-validator model - best_model = self._get_best_model() - - # get the eval metric name from the evaluator - eval_metric_name = self.get_eval_metric_name() - - if isinstance( - best_model, (RandomForestClassificationModel, LogisticRegressionModel) - ): - metric_value = getattr(best_model.summary, eval_metric_name) - - elif isinstance(best_model, LinearSVCModel): - metric_value = getattr(best_model.summary(), eval_metric_name) - - else: - warnings.warn("Unsupported model type. Unable to get evaluation metric.") - metric_value = None - - return metric_value - - def get_eval_metric_on_testing(self, test_data: FSDataFrame) -> float: - """ - Get accuracy on test data from a trained CrossValidatorModel (best model). - - :param test_data: The test data as a FSDataFrame object. - :return: accuracy - """ - - # TODO: This function should be able to parse all available models. - - # get the best model from the fitted cross-validator model - best_model = self._get_best_model() - - # get test data features harmonized with training features - training_features = self._fsdf.get_features_names() - test_data = test_data.filter_features(training_features, keep=True) - - # predict the test data - predictions = None - if isinstance( - best_model, - (RandomForestClassificationModel, LinearSVCModel, LogisticRegressionModel), - ): - predictions = best_model.transform(test_data.get_sdf_vector()) - - metric_value = None - if predictions is not None: - metric_value = self.evaluator.evaluate(predictions) - - return metric_value +# """ +# +# A set of pre-defined ML algorithms wrapped with cross-validation approach +# for feature selection (e.g., rank by feature importance) and prediction. +# +# """ +# +# import warnings +# from typing import List, Any, Dict, Optional, Union +# +# import pandas as pd +# +# +# from fslite.fs.constants import ( +# RF_BINARY, +# LSVC_BINARY, +# FM_BINARY, +# RF_MULTILABEL, +# LR_MULTILABEL, +# RF_REGRESSION, +# FM_REGRESSION, +# ML_METHODS, +# ) +# from fslite.fs.core import FSDataFrame +# +# ESTIMATORS_CLASSES = [ +# RandomForestClassifier, +# RandomForestRegressionModel, +# LinearSVC, +# LogisticRegression, +# ] +# EVALUATORS_CLASSES = [ +# BinaryClassificationEvaluator, +# MulticlassClassificationEvaluator, +# RegressionEvaluator, +# ] +# +# +# # Define an abstract class that allow to create a factory of models +# # with the same interface +# # This class allows to perform the following operations: +# # - Define an Estimator +# # - Define an Evaluator +# # - Define a grid of parameters (model tuning) +# # - Define a cross-validator (model fitting) +# class MLCVModel: +# """ +# A factory class for creating various machine learning models with Spark MLlib. +# ML model are created using a cross-validator approach for hyperparameter tuning. +# """ +# +# _cross_validator: CrossValidator = None +# _fitted_cv_model: CrossValidatorModel = None +# _best_model: Model = None +# _fsdf: FSDataFrame = None +# +# def __init__( +# self, +# estimator: Union[ +# RandomForestClassifier +# | RandomForestRegressionModel +# | LinearSVC +# | LogisticRegression +# ], +# evaluator: Union[ +# BinaryClassificationEvaluator +# | MulticlassClassificationEvaluator +# | RegressionEvaluator +# ], +# estimator_params: Optional[Dict[str, Any]] = None, +# evaluator_params: Optional[Dict[str, Any]] = None, +# grid_params: Optional[Dict[str, List[Any]]] = None, +# cv_params: Optional[Dict[str, Any]] = None, +# ): +# """ +# Initializes the MLModel with optional estimator, evaluator, and parameter specifications. +# """ +# self.estimator = estimator +# self.evaluator = evaluator +# self.estimator_params = estimator_params +# self.evaluator_params = evaluator_params +# self.grid_params = grid_params +# self.cv_params = cv_params +# +# self._initialize_model() +# +# def _initialize_model(self): +# # Validate and set estimator parameters +# if self.estimator: +# self._validate_estimator(self.estimator) +# self._validate_estimator_params(self.estimator_params) +# self._set_estimator_params() +# +# # Validate and evaluator +# if self.evaluator: +# self._validate_evaluator(self.evaluator) +# self._validate_evaluator_params(self.evaluator_params) +# self._set_evaluator_params() +# +# # Parse and set grid parameters +# if self.grid_params: +# self.grid_params = self._parse_grid_params(self.grid_params) +# +# # Initialize and set cross-validator parameters +# self._set_cross_validator() +# +# def _parse_grid_params( +# self, grid_params: Dict[str, List[Any]] +# ) -> List[Dict[Param, Any]]: +# """ +# Parse the grid parameters to create a list of dictionaries. +# +# :param grid_params: A dictionary containing the parameter names as keys and a list of values as values. +# :return: A list of dictionaries, where each dictionary represents a set of parameter values. +# """ +# grid = ParamGridBuilder() +# for param, values in grid_params.items(): +# if hasattr(self.estimator, param): +# grid = grid.addGrid(getattr(self.estimator, param), values) +# else: +# raise AttributeError( +# f"{self.estimator.__class__.__name__} does not have attribute {param}" +# ) +# return grid.build() +# +# def _validate_estimator(self, estimator: Estimator) -> "MLCVModel": +# """ +# Validate the estimator. +# +# :param estimator: The estimator to validate. +# :return: The validated estimator. +# """ +# # check estimator is an instance of ESTIMATORS_CLASSES +# if not isinstance(estimator, tuple(ESTIMATORS_CLASSES)): +# raise ValueError(f"Estimator must be an instance of {ESTIMATORS_CLASSES}") +# return self +# +# def _validate_evaluator(self, evaluator: Evaluator) -> "MLCVModel": +# """ +# Validate the evaluator. +# +# :param evaluator: The evaluator to validate. +# :return: The validated evaluator. +# """ +# # check evaluator is an instance of EVALUATORS_CLASSES +# if not isinstance(evaluator, tuple(EVALUATORS_CLASSES)): +# raise ValueError(f"Evaluator must be an instance of {EVALUATORS_CLASSES}") +# return self +# +# def _validate_estimator_params(self, estimator_params: Dict[str, Any]) -> None: +# """ +# Validate the estimator parameters. +# +# :param estimator_params: A dictionary containing the parameter names as keys and values as values. +# """ +# if estimator_params is None: +# return +# for param, _ in estimator_params.items(): +# if not self.estimator.hasParam(param): +# raise AttributeError( +# f"{self.estimator.__class__.__name__} does not have attribute {param}" +# ) +# +# def _validate_evaluator_params(self, evaluator_params: Dict[str, Any]) -> None: +# """ +# Validate the evaluator parameters. +# +# :param evaluator_params: A dictionary containing the parameter names as keys and values as values. +# """ +# if evaluator_params is None: +# return +# for param, _ in evaluator_params.items(): +# if not self.evaluator.hasParam(param): +# raise AttributeError( +# f"{self.evaluator.__class__.__name__} does not have attribute {param}" +# ) +# +# def _set_evaluator_params(self) -> "MLCVModel": +# """ +# Set evaluator parameters. +# """ +# if self.evaluator_params is not None: +# self.evaluator = self.evaluator.setParams(**self.evaluator_params) +# return self +# +# def _set_estimator_params(self) -> "MLCVModel": +# """ +# Set estimator parameters. +# """ +# if self.estimator_params is not None: +# self.estimator = self.estimator.setParams(**self.estimator_params) +# return self +# +# def _set_cv_params(self, cv_params: Dict[str, Any]) -> "MLCVModel": +# """ +# Parse the cross-validator parameters to create an instance of CrossValidator. +# +# :param cv_params: A dictionary containing the parameter names as keys and values as values. +# :return: An instance of CrossValidator. +# """ +# +# for param, value in cv_params.items(): +# if hasattr(self._cross_validator, param): +# setattr(self._cross_validator, param, value) +# else: +# raise AttributeError( +# f"{self._cross_validator.__class__.__name__} does not have attribute {param}" +# ) +# return self +# +# def _set_cross_validator(self) -> "MLCVModel": +# """ +# Build the model using the cross-validator. +# +# :return: The CrossValidator model. +# """ +# try: +# self._cross_validator = CrossValidator( +# estimator=self.estimator, +# estimatorParamMaps=self.grid_params, +# evaluator=self.evaluator, +# ) +# if self.cv_params is not None: +# self._cross_validator = self._cross_validator.setParams( +# **self.cv_params +# ) +# return self +# except Exception as e: +# print(f"An error occurred while creating the CrossValidator: {str(e)}") +# # Handle the exception or raise it to be handled by the caller +# raise +# +# def fit(self, fsdf: FSDataFrame) -> "MLCVModel": +# """ +# Fit the model using the cross-validator. +# +# :return: The CrossValidatorModel after fitting. +# """ +# # Extract the Spark DataFrame and label column name from FSDataFrame +# self._fsdf = fsdf +# +# if ( +# self._cross_validator is None +# or self.estimator is None +# or self.evaluator is None +# ): +# raise ValueError( +# "Cross-validator, estimator, or evaluator not set properly." +# ) +# +# self._fitted_cv_model = self._cross_validator.fit(self._fsdf.get_sdf_vector()) +# return self +# +# def _get_best_model(self) -> Model: +# """ +# Get the best model from the fitted CrossValidatorModel. +# +# :return: The best model. +# """ +# if self._fitted_cv_model is None: +# raise ValueError( +# "CrossValidatorModel not fitted. Use fit() to fit the model." +# ) +# self._best_model = self._fitted_cv_model.bestModel +# return self._best_model +# +# # define a static method that allows to set a ml model based on the model type +# @staticmethod +# def create_model( +# model_type: str, +# estimator_params: Dict[str, Any] = None, +# evaluator_params: Dict[str, Any] = None, +# grid_params: Dict[str, List[Any]] = None, +# cv_params: Dict[str, Any] = None, +# ) -> "MLCVModel": +# """ +# Set a machine learning model based on the model type. +# +# :param model_type: The type of model to set. +# :param estimator_params: Parameters for the estimator. +# :param evaluator_params: Parameters for the evaluator. +# :param grid_params: A dictionary containing the parameter names as keys and a list of values as values. +# :param cv_params: Parameters for the cross-validator. +# +# :return: An instance of MLModel. +# """ +# if model_type == RF_BINARY: +# estimator = RandomForestClassifier() +# evaluator = BinaryClassificationEvaluator() +# elif model_type == LSVC_BINARY: +# estimator = LinearSVC() +# evaluator = BinaryClassificationEvaluator() +# elif model_type == RF_MULTILABEL: +# estimator = RandomForestClassifier() +# evaluator = MulticlassClassificationEvaluator() +# elif model_type == LR_MULTILABEL: +# estimator = LogisticRegression() +# evaluator = MulticlassClassificationEvaluator() +# elif model_type == RF_REGRESSION: +# estimator = RandomForestRegressor() +# evaluator = RegressionEvaluator() +# else: +# raise ValueError( +# f"Unsupported model type: {model_type}." +# f"Supported model types are: {list(ML_METHODS.keys())}" +# ) +# +# ml_method = MLCVModel( +# estimator=estimator, +# evaluator=evaluator, +# estimator_params=estimator_params, +# evaluator_params=evaluator_params, +# grid_params=grid_params, +# cv_params=cv_params, +# ) +# +# return ml_method +# +# def get_eval_metric_name(self) -> str: +# """ +# Get the evaluation metric name. +# +# :return: The evaluation metric name. +# """ +# return self.evaluator.getMetricName() +# +# def get_feature_scores(self) -> pd.DataFrame: +# +# # TODO: This function should be able to parse all available models. +# +# indexed_features = self._fsdf.get_features_indexed() +# best_model = self._get_best_model() +# +# # raise exception if the model is not none +# if best_model is None: +# raise ValueError( +# "No ML model have been fitted. Use fit() to fit the model." +# ) +# +# df_features = pd.DataFrame(indexed_features.to_numpy(), columns=["features"]) +# +# if isinstance( +# best_model, (RandomForestClassificationModel, RandomForestRegressionModel) +# ): +# df_scores = pd.DataFrame( +# data=best_model.featureImportances.toArray(), columns=["scores"] +# ) +# +# df_scores = df_scores.reset_index(level=0).rename( +# columns={"index": "feature_index"} +# ) +# +# # merge the feature scores with the feature names +# df = df_features.merge( +# df_scores, how="right", left_index=True, right_index=True +# ) # index-to-index merging +# +# # sort the dataframe by scores in descending order +# df = df.sort_values(by="scores", ascending=False) +# +# # add feature percentile rank to the features_scores dataframe +# df["percentile_rank"] = df["scores"].rank(pct=True) +# +# return df +# +# else: +# raise ValueError( +# "Unsupported model type. " +# "Only RandomForestClassificationModel, " +# "RandomForestRegressionModel, and LinearSVCModel are supported." +# ) +# +# def get_eval_metric_on_training(self) -> float: +# """ +# Get the evaluation metric on training data from a trained CrossValidatorModel (best model). +# +# :return: A dictionary containing the evaluation metric name and value. +# """ +# +# # TODO: This function should be able to parse all available models. +# +# # get the best model from the fitted cross-validator model +# best_model = self._get_best_model() +# +# # get the eval metric name from the evaluator +# eval_metric_name = self.get_eval_metric_name() +# +# if isinstance( +# best_model, (RandomForestClassificationModel, LogisticRegressionModel) +# ): +# metric_value = getattr(best_model.summary, eval_metric_name) +# +# elif isinstance(best_model, LinearSVCModel): +# metric_value = getattr(best_model.summary(), eval_metric_name) +# +# else: +# warnings.warn("Unsupported model type. Unable to get evaluation metric.") +# metric_value = None +# +# return metric_value +# +# def get_eval_metric_on_testing(self, test_data: FSDataFrame) -> float: +# """ +# Get accuracy on test data from a trained CrossValidatorModel (best model). +# +# :param test_data: The test data as a FSDataFrame object. +# :return: accuracy +# """ +# +# # TODO: This function should be able to parse all available models. +# +# # get the best model from the fitted cross-validator model +# best_model = self._get_best_model() +# +# # get test data features harmonized with training features +# training_features = self._fsdf.get_features_names() +# test_data = test_data.filter_features(training_features, keep=True) +# +# # predict the test data +# predictions = None +# if isinstance( +# best_model, +# (RandomForestClassificationModel, LinearSVCModel, LogisticRegressionModel), +# ): +# predictions = best_model.transform(test_data.get_sdf_vector()) +# +# metric_value = None +# if predictions is not None: +# metric_value = self.evaluator.evaluate(predictions) +# +# return metric_value diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index 3db3414..b2c63e1 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -1,160 +1,160 @@ -import logging -from typing import List - -import numpy as np -import pyspark -from pyspark.ml.feature import VarianceThresholdSelector -from pyspark.ml.stat import Correlation - -from fslite.fs.constants import ( - MULTIVARIATE_METHODS, - MULTIVARIATE_CORRELATION, - MULTIVARIATE_VARIANCE, -) - -from fslite.fs.core import FSDataFrame -from fslite.fs.utils import find_maximal_independent_set -from fslite.utils.generic import tag - -logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") -logger = logging.getLogger("FSSPARK:MULTIVARIATE") -logger.setLevel(logging.INFO) - - -@tag("experimental") -def _compute_correlation_matrix( - sdf: pyspark.sql.DataFrame, - features_col: str = "features", - corr_method: str = "pearson", -) -> np.ndarray: - """ - Compute features Matrix Correlation. - - :param sdf: Spark DataFrame - :param features_col: Name of the feature column vector name. - :param corr_method: One of `pearson` (default) or `spearman`. - - :return: Numpy array. - """ - - logger.warning( - "Warning: Computed matrix correlation will be collected into the drive with this implementation.\n" - "This may cause memory issues. Use it preferably with small datasets." - ) - logger.info(f"Computing correlation matrix using {corr_method} method.") - - mcorr = Correlation.corr(sdf, features_col, corr_method).collect()[0][0].toArray() - return mcorr - - -@tag("experimental") -def multivariate_correlation_selector( - fsdf: FSDataFrame, - strict: bool = True, - corr_threshold: float = 0.75, - corr_method: str = "pearson", -) -> List[str]: - """ - Compute the correlation matrix (Pearson) among input features and select those below a specified threshold. - - :param fsdf: Input FSDataFrame - :param strict: If True (default), apply hard filtering (strict) to remove highly correlated features. - Otherwise, find the maximal independent set of highly correlated features (approximate method). - `Warning`: The approximate method is experimental. - :param corr_threshold: Minimal correlation threshold to consider two features correlated. - :param corr_method: One of `pearson` (default) or `spearman`. - - :return: List of selected features names - """ - - colum_vector_features = "features" - sdf = fsdf.get_sdf_vector(output_column_vector=colum_vector_features) - - # compute correlation matrix - mcorr = _compute_correlation_matrix( - sdf, features_col=colum_vector_features, corr_method=corr_method - ) - - mcorr = np.abs(mcorr) # get absolute correlation value - combs_above_cutoff = ( - np.triu(mcorr, k=1) > corr_threshold - ) # create bool matrix that meet criteria - correlated_col_index = tuple( - np.column_stack(np.where(combs_above_cutoff)) - ) # get correlated pairs cols index - - index_to_remove = set() - if strict: - # hard filtering method - # Original implementation: https://www.rdocumentation.org/packages/caret/versions/6.0-93/topics/findCorrelation - cols_mean = np.mean(mcorr, axis=1) # get cols index mean - for pairs in correlated_col_index: - i = pairs[0] - j = pairs[1] - index_to_remove.add(i if cols_mean[i] > cols_mean[j] else j) - else: - # approximate method - index_to_remove = find_maximal_independent_set(correlated_col_index, keep=False) - - features = fsdf.get_features_names() # get all current features - features_to_remove = fsdf.get_features_by_index(index_to_remove) - selected_features = [sf for sf in features if sf not in features_to_remove] - - return selected_features - - -@tag("spark implementation") -def multivariate_variance_selector( - fsdf: FSDataFrame, variance_threshold: float = 0.0 -) -> List[str]: - """ - Select features after removing low-variance ones (e.g., features with quasi-constant value across samples). - - :param fsdf: Input FSDataFrame - :param variance_threshold: Minimal variance value allowed to select a feature. - - :return: List of selected features names - """ - - colum_vector_features = "features" - sdf = fsdf.get_sdf_vector(output_column_vector=colum_vector_features) - - selector = VarianceThresholdSelector() - ( - selector.setFeaturesCol(colum_vector_features) - .setOutputCol("selectedFeatures") - .setVarianceThreshold(variance_threshold) - ) - - model = selector.fit(sdf) - selected_features_indices = set(model.selectedFeatures) - selected_features = fsdf.get_features_by_index(selected_features_indices) - - return selected_features - - -def multivariate_filter( - fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs -) -> FSDataFrame: - """ - Filter features after applying a multivariate feature selector method. - - :param fsdf: Input FSDataFrame - :param multivariate_method: Multivariate selector method. - Possible values are 'm_corr' or 'variance'. - - :return: Filtered FSDataFrame - """ - if multivariate_method == MULTIVARIATE_CORRELATION: - selected_features = multivariate_correlation_selector(fsdf, **kwargs) - elif multivariate_method == MULTIVARIATE_VARIANCE: - selected_features = multivariate_variance_selector(fsdf, **kwargs) - else: - raise ValueError( - f"Invalid multivariate method: {multivariate_method}. " - f"Choose one of {MULTIVARIATE_METHODS.keys()}." - ) - - logger.info(f"Applying multivariate filter {multivariate_method}.") - - return fsdf.filter_features(selected_features, keep=True) +# import logging +# from typing import List +# +# import numpy as np +# import pyspark +# from pyspark.ml.feature import VarianceThresholdSelector +# from pyspark.ml.stat import Correlation +# +# from fslite.fs.constants import ( +# MULTIVARIATE_METHODS, +# MULTIVARIATE_CORRELATION, +# MULTIVARIATE_VARIANCE, +# ) +# +# from fslite.fs.core import FSDataFrame +# from fslite.fs.utils import find_maximal_independent_set +# from fslite.utils.generic import tag +# +# logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") +# logger = logging.getLogger("FSSPARK:MULTIVARIATE") +# logger.setLevel(logging.INFO) +# +# +# @tag("experimental") +# def _compute_correlation_matrix( +# sdf: pyspark.sql.DataFrame, +# features_col: str = "features", +# corr_method: str = "pearson", +# ) -> np.ndarray: +# """ +# Compute features Matrix Correlation. +# +# :param sdf: Spark DataFrame +# :param features_col: Name of the feature column vector name. +# :param corr_method: One of `pearson` (default) or `spearman`. +# +# :return: Numpy array. +# """ +# +# logger.warning( +# "Warning: Computed matrix correlation will be collected into the drive with this implementation.\n" +# "This may cause memory issues. Use it preferably with small datasets." +# ) +# logger.info(f"Computing correlation matrix using {corr_method} method.") +# +# mcorr = Correlation.corr(sdf, features_col, corr_method).collect()[0][0].toArray() +# return mcorr +# +# +# @tag("experimental") +# def multivariate_correlation_selector( +# fsdf: FSDataFrame, +# strict: bool = True, +# corr_threshold: float = 0.75, +# corr_method: str = "pearson", +# ) -> List[str]: +# """ +# Compute the correlation matrix (Pearson) among input features and select those below a specified threshold. +# +# :param fsdf: Input FSDataFrame +# :param strict: If True (default), apply hard filtering (strict) to remove highly correlated features. +# Otherwise, find the maximal independent set of highly correlated features (approximate method). +# `Warning`: The approximate method is experimental. +# :param corr_threshold: Minimal correlation threshold to consider two features correlated. +# :param corr_method: One of `pearson` (default) or `spearman`. +# +# :return: List of selected features names +# """ +# +# colum_vector_features = "features" +# sdf = fsdf.get_sdf_vector(output_column_vector=colum_vector_features) +# +# # compute correlation matrix +# mcorr = _compute_correlation_matrix( +# sdf, features_col=colum_vector_features, corr_method=corr_method +# ) +# +# mcorr = np.abs(mcorr) # get absolute correlation value +# combs_above_cutoff = ( +# np.triu(mcorr, k=1) > corr_threshold +# ) # create bool matrix that meet criteria +# correlated_col_index = tuple( +# np.column_stack(np.where(combs_above_cutoff)) +# ) # get correlated pairs cols index +# +# index_to_remove = set() +# if strict: +# # hard filtering method +# # Original implementation: https://www.rdocumentation.org/packages/caret/versions/6.0-93/topics/findCorrelation +# cols_mean = np.mean(mcorr, axis=1) # get cols index mean +# for pairs in correlated_col_index: +# i = pairs[0] +# j = pairs[1] +# index_to_remove.add(i if cols_mean[i] > cols_mean[j] else j) +# else: +# # approximate method +# index_to_remove = find_maximal_independent_set(correlated_col_index, keep=False) +# +# features = fsdf.get_features_names() # get all current features +# features_to_remove = fsdf.get_features_by_index(index_to_remove) +# selected_features = [sf for sf in features if sf not in features_to_remove] +# +# return selected_features +# +# +# @tag("spark implementation") +# def multivariate_variance_selector( +# fsdf: FSDataFrame, variance_threshold: float = 0.0 +# ) -> List[str]: +# """ +# Select features after removing low-variance ones (e.g., features with quasi-constant value across samples). +# +# :param fsdf: Input FSDataFrame +# :param variance_threshold: Minimal variance value allowed to select a feature. +# +# :return: List of selected features names +# """ +# +# colum_vector_features = "features" +# sdf = fsdf.get_sdf_vector(output_column_vector=colum_vector_features) +# +# selector = VarianceThresholdSelector() +# ( +# selector.setFeaturesCol(colum_vector_features) +# .setOutputCol("selectedFeatures") +# .setVarianceThreshold(variance_threshold) +# ) +# +# model = selector.fit(sdf) +# selected_features_indices = set(model.selectedFeatures) +# selected_features = fsdf.get_features_by_index(selected_features_indices) +# +# return selected_features +# +# +# def multivariate_filter( +# fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs +# ) -> FSDataFrame: +# """ +# Filter features after applying a multivariate feature selector method. +# +# :param fsdf: Input FSDataFrame +# :param multivariate_method: Multivariate selector method. +# Possible values are 'm_corr' or 'variance'. +# +# :return: Filtered FSDataFrame +# """ +# if multivariate_method == MULTIVARIATE_CORRELATION: +# selected_features = multivariate_correlation_selector(fsdf, **kwargs) +# elif multivariate_method == MULTIVARIATE_VARIANCE: +# selected_features = multivariate_variance_selector(fsdf, **kwargs) +# else: +# raise ValueError( +# f"Invalid multivariate method: {multivariate_method}. " +# f"Choose one of {MULTIVARIATE_METHODS.keys()}." +# ) +# +# logger.info(f"Applying multivariate filter {multivariate_method}.") +# +# return fsdf.filter_features(selected_features, keep=True) diff --git a/fslite/fs/utils.py b/fslite/fs/utils.py index 80ca059..649ad30 100644 --- a/fslite/fs/utils.py +++ b/fslite/fs/utils.py @@ -2,11 +2,10 @@ from typing import Dict, Tuple, Set import networkx as nx -import pyspark.sql.functions as f from networkx.algorithms.mis import maximal_independent_set -from pyspark.ml.feature import Imputer +from sklearn.impute import SimpleImputer -from fslite.fs.core import FSDataFrame +from fslite.fs.fdataframe import FSDataFrame from fslite.utils.generic import tag logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") @@ -27,17 +26,17 @@ def compute_missingness_rate(fsdf: FSDataFrame) -> Dict[str, float]: n_instances = fsdf.count_instances() # number of instances/samples. features = fsdf.get_features_names() # list of features (column) names - missing_rates = sdf.select( - [ - ( - f.sum(f.when(f.isnan(sdf[c]) | f.isnull(sdf[c]), 1).otherwise(0)) - / n_instances - ).alias(c) - for c in features - ] - ) + # missing_rates = sdf.select( + # [ + # ( + # f.sum(f.when(f.isnan(sdf[c]) | f.isnull(sdf[c]), 1).otherwise(0)) + # / n_instances + # ).alias(c) + # for c in features + # ] + # ) - return missing_rates.first().asDict() + # return missing_rates.first().asDict() def remove_features_by_missingness_rate( @@ -79,7 +78,7 @@ def impute_missing(fsdf: FSDataFrame, strategy: str = "mean") -> FSDataFrame: col_features = fsdf.get_features_names() sdf_imputed = ( - Imputer() + SimpleImputer() .setStrategy(strategy) .setInputCols(col_features) .setOutputCols(col_features) From f2ce6642001a152758654d41cf35f9d2a86d99d9 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 13:46:13 +0100 Subject: [PATCH 25/62] clean more code. --- fslite/fs/constants.py | 2 ++ fslite/fs/methods.py | 6 +++++- fslite/fs/univariate.py | 16 ++++++++-------- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index 0a10c92..439faab 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -81,9 +81,11 @@ def get_fs_univariate_methods() -> List: """ return get_fs_method_by_class("univariate") + def get_fs_multivariate_methods() -> List: return get_fs_method_by_class("multivariate") + def get_fs_ml_methods() -> List: return get_fs_method_by_class("ml") diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index e0de8ec..6787218 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -1,7 +1,11 @@ from abc import ABC, abstractmethod from typing import List, Type, Union, Tuple, Optional, Dict, Any -from fslite.fs.constants import FS_METHODS, get_fs_multivariate_methods, get_fs_ml_methods +from fslite.fs.constants import ( + FS_METHODS, + get_fs_multivariate_methods, + get_fs_ml_methods, +) from fslite.fs.fdataframe import FSDataFrame from fslite.fs.ml import MLCVModel from fslite.fs.multivariate import multivariate_filter diff --git a/fslite/fs/univariate.py b/fslite/fs/univariate.py index ee53b22..f584776 100644 --- a/fslite/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -33,7 +33,7 @@ def compute_univariate_corr(df: FSDataFrame) -> Dict[int, float]: def univariate_correlation_selector( - df: FSDataFrame, corr_threshold: float = 0.3 + df: FSDataFrame, corr_threshold: float = 0.3 ) -> List[int]: """ Select features based on their correlation with a label (class), if the correlation value is less than the specified @@ -54,12 +54,12 @@ def univariate_correlation_selector( def univariate_selector( - df: pd.DataFrame, - features: List[str], - label: str, - label_type: str = "categorical", - selection_mode: str = "percentile", - selection_threshold: float = 0.8, + df: pd.DataFrame, + features: List[str], + label: str, + label_type: str = "categorical", + selection_mode: str = "percentile", + selection_threshold: float = 0.8, ) -> List[str]: """ Wrapper for scikit-learn's `SelectKBest` feature selector. @@ -106,7 +106,7 @@ def univariate_selector( def univariate_filter( - df: FSDataFrame, univariate_method: str = "u_corr", **kwargs + df: FSDataFrame, univariate_method: str = "u_corr", **kwargs ) -> FSDataFrame: """ Filter features after applying a univariate feature selector method. From 6d1f54a7948e3feb157d02db1ad73dcfeda62c4a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 13:48:56 +0100 Subject: [PATCH 26/62] update in dependencies --- environment.yml | 12 +++++++----- setup.py | 5 +++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/environment.yml b/environment.yml index 8fe1a48..583bb46 100644 --- a/environment.yml +++ b/environment.yml @@ -6,8 +6,10 @@ dependencies: - python==3.10 - pip - pip: - - setuptools~=65.5.0 - - pyspark~=3.3.0 - - networkx~=2.8.7 - - numpy~=1.23.4 - - pyarrow~=8.0.0 + - setuptools + - networkx + - numpy + - pyarrow + - pandas + - scipy + - scikit-learn diff --git a/setup.py b/setup.py index b4bb0c4..ae048c5 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="fslite", version="0.0.1", - url="https://github.com/bigbio/fsspark", + url="https://github.com/bigbio/fslite", license="Apache-2.0", author="Enrique Audain Martinez", author_email="enrique.audain@gmail.com", @@ -15,12 +15,13 @@ long_description_content_type="text/markdown", packages=find_packages(), install_requires=[ - "pyspark", "numpy", "networkx", "setuptools", "pandas", "pyarrow", + "scikit-learn", + "scipy", ], classifiers=[ # Classifiers for your package From a0181aa3ada963f5d942aa6f3b5ee81773d1bf8e Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 13:53:35 +0100 Subject: [PATCH 27/62] update in dependencies --- environment.yml | 2 ++ requirements.txt | 4 +++- setup.py | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 583bb46..4998c16 100644 --- a/environment.yml +++ b/environment.yml @@ -13,3 +13,5 @@ dependencies: - pandas - scipy - scikit-learn + - psutil + - pytest diff --git a/requirements.txt b/requirements.txt index 94ce35f..870ed82 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ numpy setuptools pandas scikit-learn -scipy \ No newline at end of file +scipy +psutil +pytest \ No newline at end of file diff --git a/setup.py b/setup.py index ae048c5..a7bcaf4 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ "pyarrow", "scikit-learn", "scipy", + "psutil", ], classifiers=[ # Classifiers for your package From 4a9362148de3de74879c60bfaf855cdf6b4b528a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 13:56:46 +0100 Subject: [PATCH 28/62] update in dependencies --- fslite/tests/test_data_preprocessing.py | 170 ++++++------ fslite/tests/test_fs_pipeline.py | 144 +++++----- fslite/tests/test_import_export.py | 90 +++--- fslite/tests/test_ml_methods.py | 354 ++++++++++++------------ 4 files changed, 379 insertions(+), 379 deletions(-) diff --git a/fslite/tests/test_data_preprocessing.py b/fslite/tests/test_data_preprocessing.py index a247491..dbf9f43 100644 --- a/fslite/tests/test_data_preprocessing.py +++ b/fslite/tests/test_data_preprocessing.py @@ -1,85 +1,85 @@ -import unittest - -import numpy as np - -from fslite.config.context import init_spark, stop_spark_session -from fslite.fs.core import FSDataFrame -from fslite.fs.utils import ( - compute_missingness_rate, - remove_features_by_missingness_rate, - impute_missing, -) -from fslite.utils.datasets import get_tnbc_data_missing_values_path -from fslite.utils.io import import_table_as_psdf - - -class TestDataPreprocessing(unittest.TestCase): - """ - Define testing methods for data preprocessing (e.g, scaling, imputation, etc.) - - """ - - def setUp(self) -> None: - init_spark( - apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True, - ) - - def tearDown(self) -> None: - stop_spark_session() - - @staticmethod - def import_FSDataFrame() -> FSDataFrame: - """ - Import FSDataFrame object with missing values. - Number of samples: 44 - Number of features: 10 (5 with missing values) - :return: - """ - df = import_table_as_psdf(get_tnbc_data_missing_values_path(), n_partitions=5) - fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") - return fsdf - - def test_compute_missingness_rate(self): - """ - Test compute_missingness_rate method. - :return: None - """ - - fsdf = self.import_FSDataFrame() - features_missing_rates = compute_missingness_rate(fsdf) - self.assertEqual(features_missing_rates.get("tr|E9PBJ4"), 0.0) - self.assertAlmostEqual(features_missing_rates.get("sp|P07437"), 0.295, places=2) - - def test_filter_by_missingness_rate(self): - """ - Test filter_missingness_rate method. - :return: None - """ - - fsdf = self.import_FSDataFrame() - fsdf = remove_features_by_missingness_rate(fsdf, threshold=0.15) - # print number of features - print(f"Number of remaining features: {fsdf.count_features()}") - - self.assertEqual(fsdf.count_features(), 6) - - def test_impute_missing(self): - """ - Test impute_missing method. Impute missing values using the mean across columns. - :return: None - """ - - fsdf = self.import_FSDataFrame() - fsdf = impute_missing(fsdf, strategy="mean") - - # Collect features as array - array = fsdf._collect_features_as_array() - - # Check if there are no missing (NaNs) or null values - self.assertFalse(np.isnan(array).any()) - - -if __name__ == "__main__": - unittest.main() +# import unittest +# +# import numpy as np +# +# from fslite.config.context import init_spark, stop_spark_session +# from fslite.fs.core import FSDataFrame +# from fslite.fs.utils import ( +# compute_missingness_rate, +# remove_features_by_missingness_rate, +# impute_missing, +# ) +# from fslite.utils.datasets import get_tnbc_data_missing_values_path +# from fslite.utils.io import import_table_as_psdf +# +# +# class TestDataPreprocessing(unittest.TestCase): +# """ +# Define testing methods for data preprocessing (e.g, scaling, imputation, etc.) +# +# """ +# +# def setUp(self) -> None: +# init_spark( +# apply_pyarrow_settings=True, +# apply_extra_spark_settings=True, +# apply_pandas_settings=True, +# ) +# +# def tearDown(self) -> None: +# stop_spark_session() +# +# @staticmethod +# def import_FSDataFrame() -> FSDataFrame: +# """ +# Import FSDataFrame object with missing values. +# Number of samples: 44 +# Number of features: 10 (5 with missing values) +# :return: +# """ +# df = import_table_as_psdf(get_tnbc_data_missing_values_path(), n_partitions=5) +# fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") +# return fsdf +# +# def test_compute_missingness_rate(self): +# """ +# Test compute_missingness_rate method. +# :return: None +# """ +# +# fsdf = self.import_FSDataFrame() +# features_missing_rates = compute_missingness_rate(fsdf) +# self.assertEqual(features_missing_rates.get("tr|E9PBJ4"), 0.0) +# self.assertAlmostEqual(features_missing_rates.get("sp|P07437"), 0.295, places=2) +# +# def test_filter_by_missingness_rate(self): +# """ +# Test filter_missingness_rate method. +# :return: None +# """ +# +# fsdf = self.import_FSDataFrame() +# fsdf = remove_features_by_missingness_rate(fsdf, threshold=0.15) +# # print number of features +# print(f"Number of remaining features: {fsdf.count_features()}") +# +# self.assertEqual(fsdf.count_features(), 6) +# +# def test_impute_missing(self): +# """ +# Test impute_missing method. Impute missing values using the mean across columns. +# :return: None +# """ +# +# fsdf = self.import_FSDataFrame() +# fsdf = impute_missing(fsdf, strategy="mean") +# +# # Collect features as array +# array = fsdf._collect_features_as_array() +# +# # Check if there are no missing (NaNs) or null values +# self.assertFalse(np.isnan(array).any()) +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/fslite/tests/test_fs_pipeline.py b/fslite/tests/test_fs_pipeline.py index 6b8176e..42be655 100644 --- a/fslite/tests/test_fs_pipeline.py +++ b/fslite/tests/test_fs_pipeline.py @@ -1,72 +1,72 @@ -import unittest - -from fslite.config.context import init_spark, stop_spark_session -from fslite.fs.core import FSDataFrame -from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod -from fslite.utils.datasets import get_tnbc_data_path -from fslite.utils.io import import_table_as_psdf - - -class FeatureSelectionPipelineTest(unittest.TestCase): - - def setUp(self) -> None: - init_spark( - apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True, - ) - - def tearDown(self) -> None: - stop_spark_session() - - @staticmethod - def import_FSDataFrame(): - df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) - fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") - return fsdf - - def test_feature_selection_pipeline(self): - fsdf = self.import_FSDataFrame() - - training_data, testing_data = fsdf.split_df(split_training_factor=0.6) - - # create a Univariate object - univariate = FSUnivariate( - fs_method="anova", selection_mode="percentile", selection_threshold=0.8 - ) - - # create a Multivariate object - multivariate = FSMultivariate( - fs_method="m_corr", corr_threshold=0.75, corr_method="pearson" - ) - - # create a MLMethod object - rf_classifier = FSMLMethod( - fs_method="rf_multilabel", - rfe=True, - rfe_iterations=2, - percent_to_keep=0.9, - estimator_params={"labelCol": "label"}, - evaluator_params={"metricName": "accuracy"}, - grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]}, - cv_params={"parallelism": 2, "numFolds": 5}, - ) - - # create a pipeline object - fs_pipeline = FSPipeline( - df_training=training_data, - df_testing=testing_data, - fs_stages=[univariate, multivariate, rf_classifier], - ) - - # run the pipeline - results = fs_pipeline.run() - - # print results - print(results) - - assert results.get("training_metric") > 0.9 - - -if __name__ == "__main__": - unittest.main() +# import unittest +# +# from fslite.config.context import init_spark, stop_spark_session +# from fslite.fs.core import FSDataFrame +# from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod +# from fslite.utils.datasets import get_tnbc_data_path +# from fslite.utils.io import import_table_as_psdf +# +# +# class FeatureSelectionPipelineTest(unittest.TestCase): +# +# def setUp(self) -> None: +# init_spark( +# apply_pyarrow_settings=True, +# apply_extra_spark_settings=True, +# apply_pandas_settings=True, +# ) +# +# def tearDown(self) -> None: +# stop_spark_session() +# +# @staticmethod +# def import_FSDataFrame(): +# df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) +# fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") +# return fsdf +# +# def test_feature_selection_pipeline(self): +# fsdf = self.import_FSDataFrame() +# +# training_data, testing_data = fsdf.split_df(split_training_factor=0.6) +# +# # create a Univariate object +# univariate = FSUnivariate( +# fs_method="anova", selection_mode="percentile", selection_threshold=0.8 +# ) +# +# # create a Multivariate object +# multivariate = FSMultivariate( +# fs_method="m_corr", corr_threshold=0.75, corr_method="pearson" +# ) +# +# # create a MLMethod object +# rf_classifier = FSMLMethod( +# fs_method="rf_multilabel", +# rfe=True, +# rfe_iterations=2, +# percent_to_keep=0.9, +# estimator_params={"labelCol": "label"}, +# evaluator_params={"metricName": "accuracy"}, +# grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]}, +# cv_params={"parallelism": 2, "numFolds": 5}, +# ) +# +# # create a pipeline object +# fs_pipeline = FSPipeline( +# df_training=training_data, +# df_testing=testing_data, +# fs_stages=[univariate, multivariate, rf_classifier], +# ) +# +# # run the pipeline +# results = fs_pipeline.run() +# +# # print results +# print(results) +# +# assert results.get("training_metric") > 0.9 +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/fslite/tests/test_import_export.py b/fslite/tests/test_import_export.py index 507f379..32ee27a 100644 --- a/fslite/tests/test_import_export.py +++ b/fslite/tests/test_import_export.py @@ -1,45 +1,45 @@ -import unittest - -import pyspark -import pyspark.pandas as ps - -from fslite.config.context import init_spark, stop_spark_session -from fslite.utils.datasets import get_tnbc_data_path -from fslite.utils.io import import_table, import_table_as_psdf - - -class TestImportExport(unittest.TestCase): - - def setUp(self) -> None: - init_spark( - apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True, - ) - - def tearDown(self) -> None: - stop_spark_session() - - def test_import_tsv(self): - """ - Test import tsv file as Spark DataFrame. - :return: None - """ - df = import_table(path=get_tnbc_data_path(), n_partitions=10) - - self.assertIsInstance(df, pyspark.sql.DataFrame) - self.assertEqual(df.count(), 44) - - def test_import_tsv_as_psdf(self): - """ - Test import tsv file as Pandas on Spark DataFrame (PoS). - :return: None - """ - df = import_table_as_psdf(path=get_tnbc_data_path(), n_partitions=10) - - self.assertIsInstance(df, ps.frame.DataFrame) - self.assertEqual(df.shape, (44, 502)) - - -if __name__ == "__main__": - unittest.main() +# import unittest +# +# import pyspark +# import pyspark.pandas as ps +# +# from fslite.config.context import init_spark, stop_spark_session +# from fslite.utils.datasets import get_tnbc_data_path +# from fslite.utils.io import import_table, import_table_as_psdf +# +# +# class TestImportExport(unittest.TestCase): +# +# def setUp(self) -> None: +# init_spark( +# apply_pyarrow_settings=True, +# apply_extra_spark_settings=True, +# apply_pandas_settings=True, +# ) +# +# def tearDown(self) -> None: +# stop_spark_session() +# +# def test_import_tsv(self): +# """ +# Test import tsv file as Spark DataFrame. +# :return: None +# """ +# df = import_table(path=get_tnbc_data_path(), n_partitions=10) +# +# self.assertIsInstance(df, pyspark.sql.DataFrame) +# self.assertEqual(df.count(), 44) +# +# def test_import_tsv_as_psdf(self): +# """ +# Test import tsv file as Pandas on Spark DataFrame (PoS). +# :return: None +# """ +# df = import_table_as_psdf(path=get_tnbc_data_path(), n_partitions=10) +# +# self.assertIsInstance(df, ps.frame.DataFrame) +# self.assertEqual(df.shape, (44, 502)) +# +# +# if __name__ == "__main__": +# unittest.main() diff --git a/fslite/tests/test_ml_methods.py b/fslite/tests/test_ml_methods.py index 5b624d5..b46b2b9 100644 --- a/fslite/tests/test_ml_methods.py +++ b/fslite/tests/test_ml_methods.py @@ -1,177 +1,177 @@ -import unittest - -from pyspark.ml.classification import RandomForestClassifier, LogisticRegression -from pyspark.ml.evaluation import ( - BinaryClassificationEvaluator, - MulticlassClassificationEvaluator, -) - -from fslite.config.context import init_spark, stop_spark_session -from fslite.fs.core import FSDataFrame -from fslite.fs.ml import MLCVModel -from fslite.utils.datasets import get_tnbc_data_path -from fslite.utils.io import import_table_as_psdf - - -class MLMethodTest(unittest.TestCase): - - def setUp(self) -> None: - init_spark( - apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True, - ) - - def tearDown(self) -> None: - stop_spark_session() - - @staticmethod - def import_FSDataFrame(): - df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) - fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") - return fsdf - - def test_build_model_using_cross_validator(self): - fsdf = self.import_FSDataFrame() - estimator = RandomForestClassifier() - evaluator = BinaryClassificationEvaluator() - grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} - ml_method = MLCVModel( - estimator=estimator, - evaluator=evaluator, - estimator_params=None, - grid_params=None, - cv_params=None, - ) - - print(ml_method._cross_validator.__str__()) - assert ml_method._cross_validator is not None - - def test_get_feature_scores_random_forest_classifier(self): - # Create a sample FSDataFrame - fsdf = self.import_FSDataFrame() - - # Create a RandomForestClassifier model - estimator = RandomForestClassifier() - evaluator = MulticlassClassificationEvaluator() - estimator_params = {"labelCol": "label"} - grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} - cv_params = {"parallelism": 2, "numFolds": 5, "collectSubModels": False} - - ml_method = MLCVModel( - estimator=estimator, - evaluator=evaluator, - estimator_params=estimator_params, - grid_params=grid_params, - cv_params=cv_params, - ) - - (ml_method.fit(fsdf)) - - # Get the feature scores - feature_scores = ml_method.get_feature_scores() - - # Assert that the feature scores DataFrame is not empty - assert not feature_scores.empty - - # Assert that the feature scores DataFrame has the expected columns - expected_columns = ["features", "feature_index", "scores", "percentile_rank"] - assert list(feature_scores.columns) == expected_columns - - # check if dataframe is sorted by scores (descending) - assert feature_scores["scores"].is_monotonic_decreasing - - print(feature_scores) - - def test_multilabel_rf_model(self): - fsdf = self.import_FSDataFrame() - training_data, testing_data = fsdf.split_df(split_training_factor=0.8) - - estimator = RandomForestClassifier() - evaluator = MulticlassClassificationEvaluator(metricName="accuracy") - estimator_params = {"labelCol": "label"} - grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} - cv_params = {"parallelism": 2, "numFolds": 3} - - ml_method = MLCVModel( - estimator=estimator, - evaluator=evaluator, - estimator_params=estimator_params, - grid_params=grid_params, - cv_params=cv_params, - ) - - (ml_method.fit(training_data)) - - # get the accuracy on training - eval_training = ml_method.get_eval_metric_on_training() - print(f"Accuracy on training data: {eval_training}") - - # get the accuracy on testing - testing_acc = ml_method.get_eval_metric_on_testing(testing_data) - print(f"Accuracy on test data: {testing_acc}") - assert testing_acc > 0.7 - - def test_multilabel_lr_model(self): - fsdf = self.import_FSDataFrame() - training_data, testing_data = fsdf.split_df(split_training_factor=0.6) - - estimator = LogisticRegression() - evaluator = MulticlassClassificationEvaluator(metricName="accuracy") - estimator_params = {"labelCol": "label"} - grid_params = {"regParam": [0.1, 0.01]} - cv_params = {"parallelism": 2, "numFolds": 3} - - ml_method = MLCVModel( - estimator=estimator, - evaluator=evaluator, - estimator_params=estimator_params, - grid_params=grid_params, - cv_params=cv_params, - ) - - (ml_method.fit(training_data)) - - # get the accuracy on training - eval_training = ml_method.get_eval_metric_on_training() - print(f"Accuracy on training data: {eval_training}") - - # get the accuracy on testing - testing_acc = ml_method.get_eval_metric_on_testing(testing_data) - print(f"Accuracy on test data: {testing_acc}") - assert testing_acc > 0.7 - - def test_FSMLMethod(self): - from fslite.fs.methods import FSMLMethod - - fsdf = self.import_FSDataFrame() - training_data, testing_data = fsdf.split_df(split_training_factor=0.7) - - estimator_params = {"labelCol": "label"} - grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} - cv_params = {"parallelism": 2, "numFolds": 3} - - ml_method = FSMLMethod( - fs_method="rf_multilabel", - rfe=True, - rfe_iterations=2, - percent_to_keep=0.9, - estimator_params=estimator_params, - evaluator_params={"metricName": "accuracy"}, - grid_params=grid_params, - cv_params=cv_params, - ) - - filtered_fsdf = ml_method.select_features(training_data) - - training_acc = ml_method.get_eval_metric_on_training() - print(f"Training accuracy: {training_acc}") - assert training_acc > 0.8 - - testing_acc = ml_method.get_eval_metric_on_testing(testing_data) - print(f"Testing accuracy: {testing_acc}") - assert testing_acc > 0.7 - - -if __name__ == "__main__": - unittest.main() +# import unittest +# +# from pyspark.ml.classification import RandomForestClassifier, LogisticRegression +# from pyspark.ml.evaluation import ( +# BinaryClassificationEvaluator, +# MulticlassClassificationEvaluator, +# ) +# +# from fslite.config.context import init_spark, stop_spark_session +# from fslite.fs.core import FSDataFrame +# from fslite.fs.ml import MLCVModel +# from fslite.utils.datasets import get_tnbc_data_path +# from fslite.utils.io import import_table_as_psdf +# +# +# class MLMethodTest(unittest.TestCase): +# +# def setUp(self) -> None: +# init_spark( +# apply_pyarrow_settings=True, +# apply_extra_spark_settings=True, +# apply_pandas_settings=True, +# ) +# +# def tearDown(self) -> None: +# stop_spark_session() +# +# @staticmethod +# def import_FSDataFrame(): +# df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) +# fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") +# return fsdf +# +# def test_build_model_using_cross_validator(self): +# fsdf = self.import_FSDataFrame() +# estimator = RandomForestClassifier() +# evaluator = BinaryClassificationEvaluator() +# grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} +# ml_method = MLCVModel( +# estimator=estimator, +# evaluator=evaluator, +# estimator_params=None, +# grid_params=None, +# cv_params=None, +# ) +# +# print(ml_method._cross_validator.__str__()) +# assert ml_method._cross_validator is not None +# +# def test_get_feature_scores_random_forest_classifier(self): +# # Create a sample FSDataFrame +# fsdf = self.import_FSDataFrame() +# +# # Create a RandomForestClassifier model +# estimator = RandomForestClassifier() +# evaluator = MulticlassClassificationEvaluator() +# estimator_params = {"labelCol": "label"} +# grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} +# cv_params = {"parallelism": 2, "numFolds": 5, "collectSubModels": False} +# +# ml_method = MLCVModel( +# estimator=estimator, +# evaluator=evaluator, +# estimator_params=estimator_params, +# grid_params=grid_params, +# cv_params=cv_params, +# ) +# +# (ml_method.fit(fsdf)) +# +# # Get the feature scores +# feature_scores = ml_method.get_feature_scores() +# +# # Assert that the feature scores DataFrame is not empty +# assert not feature_scores.empty +# +# # Assert that the feature scores DataFrame has the expected columns +# expected_columns = ["features", "feature_index", "scores", "percentile_rank"] +# assert list(feature_scores.columns) == expected_columns +# +# # check if dataframe is sorted by scores (descending) +# assert feature_scores["scores"].is_monotonic_decreasing +# +# print(feature_scores) +# +# def test_multilabel_rf_model(self): +# fsdf = self.import_FSDataFrame() +# training_data, testing_data = fsdf.split_df(split_training_factor=0.8) +# +# estimator = RandomForestClassifier() +# evaluator = MulticlassClassificationEvaluator(metricName="accuracy") +# estimator_params = {"labelCol": "label"} +# grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} +# cv_params = {"parallelism": 2, "numFolds": 3} +# +# ml_method = MLCVModel( +# estimator=estimator, +# evaluator=evaluator, +# estimator_params=estimator_params, +# grid_params=grid_params, +# cv_params=cv_params, +# ) +# +# (ml_method.fit(training_data)) +# +# # get the accuracy on training +# eval_training = ml_method.get_eval_metric_on_training() +# print(f"Accuracy on training data: {eval_training}") +# +# # get the accuracy on testing +# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) +# print(f"Accuracy on test data: {testing_acc}") +# assert testing_acc > 0.7 +# +# def test_multilabel_lr_model(self): +# fsdf = self.import_FSDataFrame() +# training_data, testing_data = fsdf.split_df(split_training_factor=0.6) +# +# estimator = LogisticRegression() +# evaluator = MulticlassClassificationEvaluator(metricName="accuracy") +# estimator_params = {"labelCol": "label"} +# grid_params = {"regParam": [0.1, 0.01]} +# cv_params = {"parallelism": 2, "numFolds": 3} +# +# ml_method = MLCVModel( +# estimator=estimator, +# evaluator=evaluator, +# estimator_params=estimator_params, +# grid_params=grid_params, +# cv_params=cv_params, +# ) +# +# (ml_method.fit(training_data)) +# +# # get the accuracy on training +# eval_training = ml_method.get_eval_metric_on_training() +# print(f"Accuracy on training data: {eval_training}") +# +# # get the accuracy on testing +# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) +# print(f"Accuracy on test data: {testing_acc}") +# assert testing_acc > 0.7 +# +# def test_FSMLMethod(self): +# from fslite.fs.methods import FSMLMethod +# +# fsdf = self.import_FSDataFrame() +# training_data, testing_data = fsdf.split_df(split_training_factor=0.7) +# +# estimator_params = {"labelCol": "label"} +# grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} +# cv_params = {"parallelism": 2, "numFolds": 3} +# +# ml_method = FSMLMethod( +# fs_method="rf_multilabel", +# rfe=True, +# rfe_iterations=2, +# percent_to_keep=0.9, +# estimator_params=estimator_params, +# evaluator_params={"metricName": "accuracy"}, +# grid_params=grid_params, +# cv_params=cv_params, +# ) +# +# filtered_fsdf = ml_method.select_features(training_data) +# +# training_acc = ml_method.get_eval_metric_on_training() +# print(f"Training accuracy: {training_acc}") +# assert training_acc > 0.8 +# +# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) +# print(f"Testing accuracy: {testing_acc}") +# assert testing_acc > 0.7 +# +# +# if __name__ == "__main__": +# unittest.main() From 5d70dfc299b1afe4ae2b933a1c979d3ee030fa99 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 14:03:44 +0100 Subject: [PATCH 29/62] update in dependencies --- environment.yml | 1 + requirements.txt | 3 ++- setup.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 4998c16..d1ff29f 100644 --- a/environment.yml +++ b/environment.yml @@ -15,3 +15,4 @@ dependencies: - scikit-learn - psutil - pytest + - matplotlib diff --git a/requirements.txt b/requirements.txt index 870ed82..aa9b013 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ pandas scikit-learn scipy psutil -pytest \ No newline at end of file +pytest +matplotlib \ No newline at end of file diff --git a/setup.py b/setup.py index a7bcaf4..a6a20a1 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ "scikit-learn", "scipy", "psutil", + "matplotlib" ], classifiers=[ # Classifiers for your package From 0eddddd6ff490e3d7aa39eb89f4b5121626f4d00 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 14:06:34 +0100 Subject: [PATCH 30/62] update in dependencies --- environment.yml | 1 + requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index d1ff29f..373ccaf 100644 --- a/environment.yml +++ b/environment.yml @@ -16,3 +16,4 @@ dependencies: - psutil - pytest - matplotlib + - memory-profiler diff --git a/requirements.txt b/requirements.txt index aa9b013..75bbbe8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ scikit-learn scipy psutil pytest -matplotlib \ No newline at end of file +matplotlib +memory-profiler \ No newline at end of file From 94703eb2628cb8cda98821fd63dee2b7c490c765 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sun, 22 Sep 2024 19:56:35 +0100 Subject: [PATCH 31/62] smaller tests for CI/CD --- fslite/tests/test_fsdataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslite/tests/test_fsdataframe.py b/fslite/tests/test_fsdataframe.py index adda1bb..21879c7 100644 --- a/fslite/tests/test_fsdataframe.py +++ b/fslite/tests/test_fsdataframe.py @@ -73,7 +73,7 @@ def measure_memory_usage(n_samples: int, n_features: int, nan_prob=0.01) -> floa return mem_usage # Define test cases - feature_sizes = [1000, 5000, 10000, 50000, 100_000, 1_000_000] + feature_sizes = [1000, 5000, 10000] sample_sizes = [100, 500, 1000] nan_prob = [0.05, 0.1, 0.2, 0.5] From f67a259f2a8003cec53d4b9ea566d5cd0ca32e86 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 06:11:32 +0100 Subject: [PATCH 32/62] smaller tests for CI/CD --- fslite/fs/constants.py | 22 +++ fslite/fs/fdataframe.py | 4 +- fslite/fs/methods.py | 406 +------------------------------------- fslite/fs/ml.py | 283 ++++++++++++++++++++++---- fslite/fs/multivariate.py | 295 ++++++++++++++------------- fslite/fs/univariate.py | 143 ++++++++++---- setup.py | 2 +- 7 files changed, 537 insertions(+), 618 deletions(-) diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index 439faab..c45dd65 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -101,6 +101,28 @@ def is_valid_univariate_method(method_name: str) -> bool: return True return False +def is_valid_multivariate_method(method_name: str) -> bool: + """ + This method check if the given method name is a supported multivariate method + :param method_name method name + :return: boolean + """ + for method in FS_METHODS["multivariate"]["methods"]: + if method["name"].lower() == method_name: + return True + return False + +def is_valid_ml_method(method_name: str) -> bool: + """ + This method check if the given method name is a supported machine learning method + :param method_name method name + :return: boolean + """ + for method in FS_METHODS["ml"]["methods"]: + if method["name"].lower() == method_name: + return True + return False + def get_fs_method_by_class(fs_class: str) -> List: """ diff --git a/fslite/fs/fdataframe.py b/fslite/fs/fdataframe.py index 3553014..b5938fc 100644 --- a/fslite/fs/fdataframe.py +++ b/fslite/fs/fdataframe.py @@ -25,10 +25,10 @@ class FSDataFrame: FSDataFrame is a representation of a DataFrame with some functionalities to perform feature selection. An object from FSDataFrame is basically represented by a DataFrame with samples as rows and features as columns, with extra distributed indexed pandas series for - features names and samples labels. + feature names and samples labels. An object of FSDataFrame offers an interface to a DataFrame, a Pandas on DataFrame - (e.g. suitable for visualization) or a DataFrame with features as a Dense column vector (e.g. suitable for + (e.g., suitable for visualization) or a DataFrame with features as a Dense column vector (e.g. suitable for applying most algorithms from MLib API). It can also be split in training and testing dataset and filtered by removing selected features (by name or index). diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index 6787218..31bfa92 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -5,6 +5,7 @@ FS_METHODS, get_fs_multivariate_methods, get_fs_ml_methods, + get_fs_method_details, ) from fslite.fs.fdataframe import FSDataFrame from fslite.fs.ml import MLCVModel @@ -14,10 +15,12 @@ class FSMethod(ABC): """ - A general class for feature selection methods. + Feature selection abtract class, this class defines the basic structure of a feature selection method. + From this class are derived the specific feature selection methods like FSUnivariate, + FSMultivariate and FSMLMethod. """ - valid_methods: Tuple[str] + valid_methods: List[str] = [] def __init__(self, fs_method, **kwargs): """ @@ -30,23 +33,14 @@ def __init__(self, fs_method, **kwargs): self.kwargs = kwargs self.validate_method(fs_method) - @property - def valid_methods(self): - """ - Get the valid methods for feature selection. - - :return: A tuple of valid methods. - """ - return tuple(self.valid_methods) - @abstractmethod - def validate_method(self, fs_method: str): + def validate_method(self, fs_method: str) -> bool: """ Abstract method to validate the feature selection method. :param fs_method: The feature selection method to be validated. """ - pass + return get_fs_method_details(fs_method) is not None @abstractmethod def select_features(self, fsdf: FSDataFrame): @@ -57,15 +51,6 @@ def select_features(self, fsdf: FSDataFrame): """ pass - @abstractmethod - def validate_params(self, **kwargs): - """ - Abstract method to validate the parameters for the feature selection method. - - :param kwargs: The parameters to be validated. - """ - pass - def get_params(self): """ Get the parameters for the feature selection method. @@ -82,383 +67,6 @@ def set_params(self, **kwargs): """ self.kwargs.update(kwargs) - -class FSUnivariate(FSMethod): - """ - A class for univariate feature selection methods. - - Attributes: - fs_method (str): The univariate method to be used for feature selection. - kwargs (dict): Additional keyword arguments for the feature selection method. - """ - - valid_methods = list(FS_METHODS.keys()) - - def __init__(self, fs_method: str, **kwargs): - """ - Initialize the univariate feature selection method with the specified parameters. - - Parameters: - fs_method: The univariate method to be used for feature selection. - kwargs: Additional keyword arguments for the feature selection method. - """ - - super().__init__(fs_method, **kwargs) - self.validate_method(fs_method) - - def validate_method(self, fs_method: str): - """ - Validate the univariate method. - - Parameters: - fs_method: The univariate method to be validated. - """ - - if fs_method not in self.valid_methods: - raise InvalidMethodError( - f"Invalid univariate method: {fs_method}. " - f"Accepted methods are {', '.join(self.valid_methods)}" - ) - - def validate_params(self, **kwargs): - """ - Validate the parameters for the univariate method. - - Parameters: - kwargs: The parameters to be validated. - """ - # Additional validation is done directly in the underlying feature selection method - pass - - def select_features(self, fsdf) -> FSDataFrame: - """ - Select features using the specified univariate method. - - Parameters: - fsdf: The data frame on which feature selection is to be performed. - - Returns: - The selected features. - """ - - return univariate_filter(fsdf, univariate_method=self.fs_method, **self.kwargs) - - def __str__(self): - return f"FSUnivariate(method={self.fs_method}, kwargs={self.kwargs})" - - def __repr__(self): - return self.__str__() - - -class FSMultivariate(FSMethod): - """ - The FSMultivariate class is a subclass of the FSMethod class and is used for multivariate - feature selection methods. It provides a way to select features using different multivariate methods such as - multivariate correlation and variance. - - Example Usage - ------------- - # Create an instance of FSMultivariate with multivariate_method='m_corr' - fs_multivariate = FSMultivariate(multivariate_method='m_corr') - - # Select features using the multivariate method - selected_features = fs_multivariate.select_features(fsdf) - """ - - valid_methods = list(get_fs_multivariate_methods()) - - def __init__(self, fs_method: str, **kwargs): - """ - Initialize the multivariate feature selection method with the specified parameters. - - Parameters: - fsdf: The data frame on which feature selection is to be performed. - fs_method: The multivariate method to be used for feature selection. - kwargs: Additional keyword arguments for the feature selection method. - """ - - super().__init__(fs_method, **kwargs) - self.validate_method(fs_method) - - def validate_method(self, multivariate_method: str): - """ - Validate the multivariate method. - - Parameters: - multivariate_method: The multivariate method to be validated. - """ - - if multivariate_method not in self.valid_methods: - raise InvalidMethodError( - f"Invalid multivariate method: " - f"{multivariate_method}. Accepted methods are {', '.join(self.valid_methods)}" - ) - - def validate_params(self, **kwargs): - """ - Validate the parameters for the multivariate method. - - Parameters: - kwargs: The parameters to be validated. - """ - # Additional validation is done directly in the underlying feature selection method - pass - - def select_features(self, fsdf: FSDataFrame): - """ - Select features using the specified multivariate method. - """ - - return multivariate_filter( - fsdf, multivariate_method=self.fs_method, **self.kwargs - ) - - def __str__(self): - return f"FSMultivariate(multivariate_method={self.fs_method}, kwargs={self.kwargs})" - - def __repr__(self): - return self.__str__() - - -class FSMLMethod(FSMethod): - """ - A class for machine learning feature selection methods. - - Attributes: - fs_method (str): The machine learning method to be used for feature selection. - kwargs (dict): Additional keyword arguments for the feature selection method. - """ - - valid_methods = list(get_fs_ml_methods()) - _ml_model: MLCVModel = None - - def __init__( - self, - fs_method: str, - rfe: bool = False, - rfe_iterations: int = 3, - percent_to_keep: float = 0.90, - **kwargs, - ): - """ - Initialize the machine learning feature selection method with the specified parameters. - - Parameters: - fs_method: The machine learning method to be used for feature selection. - kwargs: Additional keyword arguments for the feature selection method. - """ - - super().__init__(fs_method, **kwargs) - self.validate_method(fs_method) - - # set the estimator, grid and cv parameters (or none if not provided) - self.estimator_params = kwargs.get( - "estimator_params", None - ) # estimator parameters - self.evaluator_params = kwargs.get( - "evaluator_params", None - ) # evaluator parameters - self.grid_params = kwargs.get("grid_params", None) # grid parameters - self.cv_params = kwargs.get("cv_params", None) # cross-validation parameters - - # set the machine learning model - self._ml_model = self._set_ml_model() - - # parameters to control the recursive feature elimination process (rfe) - self.rfe = rfe - self.percent_to_keep = percent_to_keep - self.rfe_iterations = rfe_iterations - - # performance metrics - self.rfe_training_metric: list = ( - [] - ) # performance metrics on training for each rfe iteration - self.training_metric = None # performance metrics on training (final model) - self.testing_metric = None # performance metrics on testing (final model) - - # feature importance - self.feature_scores = None - - def validate_method(self, fs_method: str): - """ - Validate the machine learning method. - - Parameters: - fs_method: The machine learning method to be validated. - """ - - if fs_method not in self.valid_methods: - raise InvalidMethodError( - f"Invalid machine learning method: {fs_method}. Accepted methods are {', '.join(self.valid_methods)}" - ) - - def validate_params(self, **kwargs): - """ - Validate the parameters for the machine learning method. - - Parameters: - kwargs: The parameters to be validated. - """ - # Additional validation is done directly in the underlying feature selection method - pass - - def _set_ml_model(self): - """ - Select the machine learning model to be used for feature selection. - - Returns: - The machine learning model. - """ - - model_type = self.fs_method - - self._ml_model = MLCVModel.create_model( - model_type=model_type, - estimator_params=self.estimator_params, - evaluator_params=self.evaluator_params, - grid_params=self.grid_params, - cv_params=self.cv_params, - ) - - return self._ml_model - - def _fit_and_filter(self, df: FSDataFrame) -> FSDataFrame: - - # fit the current machine learning model - self._ml_model.fit(df) - - # get feature scores - feature_scores = self._ml_model.get_feature_scores() - - # get feature based on the (percentile) threshold provided - # expected a dataframe sorted by scores in descending order - selected_features = feature_scores.iloc[ - : int(self.percent_to_keep * len(feature_scores)) - ]["feature_index"] - - return df.filter_features_by_index(selected_features, keep=True) - - def select_features(self, fsdf: FSDataFrame) -> FSDataFrame: - """ - Select features using the specified machine learning method. - - Parameters: - fsdf: The data frame on which feature selection is to be performed. - - Returns: - FSDataFrame: The data frame with selected features. - """ - - if fsdf is None or fsdf.count_features() == 0 or fsdf.count_instances() == 0: - raise ValueError( - "The data frame is empty or does not contain any features." - ) - - fsdf = self._fit_and_filter(fsdf) - - # Recursive feature elimination - if self.rfe: - for iteration in range(self.rfe_iterations): - print( - f"RFE: running {iteration + 1} of {self.rfe_iterations} iterations..." - ) - fsdf = self._fit_and_filter(fsdf) - # collect the performance metrics on training for every rfe iteration - self.rfe_training_metric.append( - self._ml_model.get_eval_metric_on_training() - ) - - # get the final performance metric on training - self.training_metric = self._ml_model.get_eval_metric_on_training() - - # get the feature scores after feature selection - self.feature_scores = self._ml_model.get_feature_scores() - - return fsdf - - def get_eval_metric_name(self): - """ - Get the evaluation metric name. - - Returns: - The evaluation metric name. - """ - - if self._ml_model is None: - raise ValueError("No machine learning model is available.") - - return self._ml_model.get_eval_metric_name() - - def get_eval_metric_on_training_rfe(self): - """ - Get the evaluation metric on the training data for each RFE iteration. - - Returns: - The evaluation metric on the training data for each RFE iteration. - """ - if self.rfe_training_metric is None: - raise ValueError( - "No training metric is available. Run the select_features method first." - ) - return self.rfe_training_metric - - def get_eval_metric_on_training(self): - """ - Get the evaluation metric on the training data. - - Returns: - The evaluation metric on the training data. - """ - if self.training_metric is None: - raise ValueError( - "No training metric is available. Run the select_features method first." - ) - return self.training_metric - - def get_eval_metric_on_testing(self, fsdf: FSDataFrame): - """ - Evaluate the machine learning method on the testing data. - - Parameters: - fsdf: The testing data frame on which the machine learning method is to be evaluated. - - Returns: - The evaluation metric on the testing data. - """ - - if fsdf is None or fsdf.count_features() == 0 or fsdf.count_instances() == 0: - raise ValueError( - "The testing data frame is empty or does not contain any features." - ) - - # evaluate the model on the testing data - eval_metric = self._ml_model.get_eval_metric_on_testing(fsdf) - self.testing_metric = eval_metric - - return eval_metric - - def get_feature_scores(self): - """ - Get the feature scores after feature selection. - - Returns: - The feature scores as a pandas DataFrame. - """ - - if self.feature_scores is None: - raise ValueError( - "Feature scores are not available. Run the feature selection method first." - ) - - return self.feature_scores - - def __str__(self): - return f"FSMLMethod(method={self.fs_method}, kwargs={self.kwargs})" - - def __repr__(self): - return self.__str__() - - class FSPipeline: """ The FSPipeline class creates a pipeline of feature selection methods. It provides a way to diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index 362d793..77c9f69 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -1,39 +1,250 @@ -# """ -# -# A set of pre-defined ML algorithms wrapped with cross-validation approach -# for feature selection (e.g., rank by feature importance) and prediction. -# -# """ -# -# import warnings -# from typing import List, Any, Dict, Optional, Union -# -# import pandas as pd -# -# -# from fslite.fs.constants import ( -# RF_BINARY, -# LSVC_BINARY, -# FM_BINARY, -# RF_MULTILABEL, -# LR_MULTILABEL, -# RF_REGRESSION, -# FM_REGRESSION, -# ML_METHODS, -# ) -# from fslite.fs.core import FSDataFrame -# -# ESTIMATORS_CLASSES = [ -# RandomForestClassifier, -# RandomForestRegressionModel, -# LinearSVC, -# LogisticRegression, -# ] -# EVALUATORS_CLASSES = [ -# BinaryClassificationEvaluator, -# MulticlassClassificationEvaluator, -# RegressionEvaluator, -# ] +""" + +A set of pre-defined ML algorithms wrapped with cross-validation approach +for feature selection (e.g., rank by feature importance) and prediction. + +""" + +import warnings +from typing import List, Any, Dict, Optional, Union + +import pandas as pd +from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method +from fslite.fs.fdataframe import FSDataFrame +from fslite.fs.methods import FSMethod, InvalidMethodError + + +class FSMLMethod(FSMethod): + """ + A class for machine learning feature selection methods. + + Attributes: + fs_method (str): The machine learning method to be used for feature selection. + kwargs (dict): Additional keyword arguments for the feature selection method. + """ + + valid_methods = get_fs_ml_methods() + _ml_model: MLCVModel = None + + def __init__( + self, + fs_method: str, + rfe: bool = False, + rfe_iterations: int = 3, + percent_to_keep: float = 0.90, + **kwargs, + ): + """ + Initialize the machine learning feature selection method with the specified parameters. + + Parameters: + fs_method: The machine learning method to be used for feature selection. + kwargs: Additional keyword arguments for the feature selection method. + """ + + super().__init__(fs_method, **kwargs) + self.validate_method(fs_method) + + # set the estimator, grid and cv parameters (or none if not provided) + self.estimator_params = kwargs.get( + "estimator_params", None + ) # estimator parameters + self.evaluator_params = kwargs.get( + "evaluator_params", None + ) # evaluator parameters + self.grid_params = kwargs.get("grid_params", None) # grid parameters + self.cv_params = kwargs.get("cv_params", None) # cross-validation parameters + + # set the machine learning model + self._ml_model = self._set_ml_model() + + # parameters to control the recursive feature elimination process (rfe) + self.rfe = rfe + self.percent_to_keep = percent_to_keep + self.rfe_iterations = rfe_iterations + + # performance metrics + self.rfe_training_metric: list = ( + [] + ) # performance metrics on training for each rfe iteration + self.training_metric = None # performance metrics on training (final model) + self.testing_metric = None # performance metrics on testing (final model) + + # feature importance + self.feature_scores = None + + def validate_method(self, fs_method: str): + """ + Validate the machine learning method. + + Parameters: + fs_method: The machine learning method to be validated. + """ + + if not is_valid_ml_method(fs_method): + raise InvalidMethodError( + f"Invalid machine learning method: {fs_method}. Accepted methods are {', '.join(self.valid_methods)}" + ) + + def _set_ml_model(self): + """ + Select the machine learning model to be used for feature selection. + + Returns: + The machine learning model. + """ + + model_type = self.fs_method + + self._ml_model = MLCVModel.create_model( + model_type=model_type, + estimator_params=self.estimator_params, + evaluator_params=self.evaluator_params, + grid_params=self.grid_params, + cv_params=self.cv_params, + ) + + return self._ml_model + + def _fit_and_filter(self, df: FSDataFrame) -> FSDataFrame: + + # fit the current machine learning model + self._ml_model.fit(df) + + # get feature scores + feature_scores = self._ml_model.get_feature_scores() + + # get feature based on the (percentile) threshold provided + # expected a dataframe sorted by scores in descending order + selected_features = feature_scores.iloc[ + : int(self.percent_to_keep * len(feature_scores)) + ]["feature_index"] + + return df.select_features_by_index(selected_features, keep=True) + + def select_features(self, fsdf: FSDataFrame) -> FSDataFrame: + """ + Select features using the specified machine learning method. + + Parameters: + fsdf: The data frame on which feature selection is to be performed. + + Returns: + FSDataFrame: The data frame with selected features. + """ + + if fsdf is None or fsdf.count_features() == 0 or fsdf.count_instances() == 0: + raise ValueError( + "The data frame is empty or does not contain any features." + ) + + fsdf = self._fit_and_filter(fsdf) + + # Recursive feature elimination + if self.rfe: + for iteration in range(self.rfe_iterations): + print( + f"RFE: running {iteration + 1} of {self.rfe_iterations} iterations..." + ) + fsdf = self._fit_and_filter(fsdf) + # collect the performance metrics on training for every rfe iteration + self.rfe_training_metric.append( + self._ml_model.get_eval_metric_on_training() + ) + + # get the final performance metric on training + self.training_metric = self._ml_model.get_eval_metric_on_training() + + # get the feature scores after feature selection + self.feature_scores = self._ml_model.get_feature_scores() + + return fsdf + + def get_eval_metric_name(self): + """ + Get the evaluation metric name. + + Returns: + The evaluation metric name. + """ + + if self._ml_model is None: + raise ValueError("No machine learning model is available.") + + return self._ml_model.get_eval_metric_name() + + def get_eval_metric_on_training_rfe(self): + """ + Get the evaluation metric on the training data for each RFE iteration. + + Returns: + The evaluation metric on the training data for each RFE iteration. + """ + if self.rfe_training_metric is None: + raise ValueError( + "No training metric is available. Run the select_features method first." + ) + return self.rfe_training_metric + + def get_eval_metric_on_training(self): + """ + Get the evaluation metric on the training data. + + Returns: + The evaluation metric on the training data. + """ + if self.training_metric is None: + raise ValueError( + "No training metric is available. Run the select_features method first." + ) + return self.training_metric + + def get_eval_metric_on_testing(self, fsdf: FSDataFrame): + """ + Evaluate the machine learning method on the testing data. + + Parameters: + fsdf: The testing data frame on which the machine learning method is to be evaluated. + + Returns: + The evaluation metric on the testing data. + """ + + if fsdf is None or fsdf.count_features() == 0 or fsdf.count_instances() == 0: + raise ValueError( + "The testing data frame is empty or does not contain any features." + ) + + # evaluate the model on the testing data + eval_metric = self._ml_model.get_eval_metric_on_testing(fsdf) + self.testing_metric = eval_metric + + return eval_metric + + def get_feature_scores(self): + """ + Get the feature scores after feature selection. + + Returns: + The feature scores as a pandas DataFrame. + """ + + if self.feature_scores is None: + raise ValueError( + "Feature scores are not available. Run the feature selection method first." + ) + + return self.feature_scores + + def __str__(self): + return f"FSMLMethod(method={self.fs_method}, kwargs={self.kwargs})" + + def __repr__(self): + return self.__str__() + + + + # # # # Define an abstract class that allow to create a factory of models diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index b2c63e1..d59ca5e 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -20,141 +20,160 @@ # logger = logging.getLogger("FSSPARK:MULTIVARIATE") # logger.setLevel(logging.INFO) # -# -# @tag("experimental") -# def _compute_correlation_matrix( -# sdf: pyspark.sql.DataFrame, -# features_col: str = "features", -# corr_method: str = "pearson", -# ) -> np.ndarray: -# """ -# Compute features Matrix Correlation. -# -# :param sdf: Spark DataFrame -# :param features_col: Name of the feature column vector name. -# :param corr_method: One of `pearson` (default) or `spearman`. -# -# :return: Numpy array. -# """ -# -# logger.warning( -# "Warning: Computed matrix correlation will be collected into the drive with this implementation.\n" -# "This may cause memory issues. Use it preferably with small datasets." -# ) -# logger.info(f"Computing correlation matrix using {corr_method} method.") -# -# mcorr = Correlation.corr(sdf, features_col, corr_method).collect()[0][0].toArray() -# return mcorr -# -# -# @tag("experimental") -# def multivariate_correlation_selector( -# fsdf: FSDataFrame, -# strict: bool = True, -# corr_threshold: float = 0.75, -# corr_method: str = "pearson", -# ) -> List[str]: -# """ -# Compute the correlation matrix (Pearson) among input features and select those below a specified threshold. -# -# :param fsdf: Input FSDataFrame -# :param strict: If True (default), apply hard filtering (strict) to remove highly correlated features. -# Otherwise, find the maximal independent set of highly correlated features (approximate method). -# `Warning`: The approximate method is experimental. -# :param corr_threshold: Minimal correlation threshold to consider two features correlated. -# :param corr_method: One of `pearson` (default) or `spearman`. -# -# :return: List of selected features names -# """ -# -# colum_vector_features = "features" -# sdf = fsdf.get_sdf_vector(output_column_vector=colum_vector_features) -# -# # compute correlation matrix -# mcorr = _compute_correlation_matrix( -# sdf, features_col=colum_vector_features, corr_method=corr_method -# ) -# -# mcorr = np.abs(mcorr) # get absolute correlation value -# combs_above_cutoff = ( -# np.triu(mcorr, k=1) > corr_threshold -# ) # create bool matrix that meet criteria -# correlated_col_index = tuple( -# np.column_stack(np.where(combs_above_cutoff)) -# ) # get correlated pairs cols index -# -# index_to_remove = set() -# if strict: -# # hard filtering method -# # Original implementation: https://www.rdocumentation.org/packages/caret/versions/6.0-93/topics/findCorrelation -# cols_mean = np.mean(mcorr, axis=1) # get cols index mean -# for pairs in correlated_col_index: -# i = pairs[0] -# j = pairs[1] -# index_to_remove.add(i if cols_mean[i] > cols_mean[j] else j) -# else: -# # approximate method -# index_to_remove = find_maximal_independent_set(correlated_col_index, keep=False) -# -# features = fsdf.get_features_names() # get all current features -# features_to_remove = fsdf.get_features_by_index(index_to_remove) -# selected_features = [sf for sf in features if sf not in features_to_remove] -# -# return selected_features -# -# -# @tag("spark implementation") -# def multivariate_variance_selector( -# fsdf: FSDataFrame, variance_threshold: float = 0.0 -# ) -> List[str]: -# """ -# Select features after removing low-variance ones (e.g., features with quasi-constant value across samples). -# -# :param fsdf: Input FSDataFrame -# :param variance_threshold: Minimal variance value allowed to select a feature. -# -# :return: List of selected features names -# """ -# -# colum_vector_features = "features" -# sdf = fsdf.get_sdf_vector(output_column_vector=colum_vector_features) -# -# selector = VarianceThresholdSelector() -# ( -# selector.setFeaturesCol(colum_vector_features) -# .setOutputCol("selectedFeatures") -# .setVarianceThreshold(variance_threshold) -# ) -# -# model = selector.fit(sdf) -# selected_features_indices = set(model.selectedFeatures) -# selected_features = fsdf.get_features_by_index(selected_features_indices) -# -# return selected_features -# -# -# def multivariate_filter( -# fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs -# ) -> FSDataFrame: -# """ -# Filter features after applying a multivariate feature selector method. -# -# :param fsdf: Input FSDataFrame -# :param multivariate_method: Multivariate selector method. -# Possible values are 'm_corr' or 'variance'. -# -# :return: Filtered FSDataFrame -# """ -# if multivariate_method == MULTIVARIATE_CORRELATION: -# selected_features = multivariate_correlation_selector(fsdf, **kwargs) -# elif multivariate_method == MULTIVARIATE_VARIANCE: -# selected_features = multivariate_variance_selector(fsdf, **kwargs) -# else: -# raise ValueError( -# f"Invalid multivariate method: {multivariate_method}. " -# f"Choose one of {MULTIVARIATE_METHODS.keys()}." -# ) -# -# logger.info(f"Applying multivariate filter {multivariate_method}.") -# -# return fsdf.filter_features(selected_features, keep=True) +import logging +from typing import List + +import numpy as np +from scipy.stats import spearmanr + +from fslite.fs.constants import get_fs_multivariate_methods +from fslite.fs.fdataframe import FSDataFrame +from fslite.fs.methods import FSMethod, InvalidMethodError +from fslite.fs.utils import find_maximal_independent_set + +logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") +logger = logging.getLogger("FS:UNIVARIATE") +logger.setLevel(logging.INFO) + +class FSMultivariate(FSMethod): + """ + The FSMultivariate class is a subclass of the FSMethod class and is used for multivariate + feature selection methods. It provides a way to select features using different multivariate methods such as + multivariate correlation and variance. + + Example Usage + ------------- + # Create an instance of FSMultivariate with multivariate_method='m_corr' + fs_multivariate = FSMultivariate(multivariate_method='m_corr') + + # Select features using the multivariate method + selected_features = fs_multivariate.select_features(fsdf) + """ + + valid_methods = get_fs_multivariate_methods() + + def __init__(self, fs_method: str, **kwargs): + """ + Initialize the multivariate feature selection method with the specified parameters. + + Parameters: + fsdf: The data frame on which feature selection is to be performed. + fs_method: The multivariate method to be used for feature selection. + kwargs: Additional keyword arguments for the feature selection method. + """ + + super().__init__(fs_method, **kwargs) + self.validate_method(fs_method) + + def validate_method(self, multivariate_method: str): + """ + Validate the multivariate method. + + Parameters: + multivariate_method: The multivariate method to be validated. + """ + + if not is_valid_multivariate_method(multivariate_method): + raise InvalidMethodError( + f"Invalid multivariate method: " + f"{multivariate_method}. Accepted methods are {', '.join(self.valid_methods)}" + ) + def select_features(self, fsdf: FSDataFrame): + """ + Select features using the specified multivariate method. + """ + + return self.multivariate_filter( + fsdf, multivariate_method=self.fs_method, **self.kwargs + ) + + def multivariate_filter( + fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs + ) -> FSDataFrame: + """ + Filter features after applying a multivariate feature selector method. + + :param fsdf: Input FSDataFrame + :param multivariate_method: Multivariate selector method. + Possible values are 'm_corr' or 'variance'. + + :return: Filtered FSDataFrame + """ + if multivariate_method == "m_corr": + selected_features = multivariate_correlation_selector(fsdf, **kwargs) + elif multivariate_method == "variance": + selected_features = multivariate_variance_selector(fsdf, **kwargs) + else: + raise ValueError( + f"Invalid multivariate method: {multivariate_method}. " + f"Choose one of {get_fs_multivariate_methods()}." + ) + + logger.info(f"Applying multivariate filter {multivariate_method}.") + + return fsdf.select_features_by_index(selected_features) + + def __str__(self): + return f"FSMultivariate(multivariate_method={self.fs_method}, kwargs={self.kwargs})" + + def __repr__(self): + return self.__str__() + + +def multivariate_correlation_selector( + fsdf: FSDataFrame, + strict: bool = True, + corr_threshold: float = 0.75, + corr_method: str = "pearson", +) -> List[str]: + """ + Compute the correlation matrix among input features and select those below a specified threshold. + + :param fsdf: Input FSDataFrame object. + :param strict: If True (default), apply hard filtering to remove highly correlated features. + Otherwise, find the maximal independent set of highly correlated features (experimental). + :param corr_threshold: Minimal correlation threshold to consider two features correlated. + :param corr_method: Correlation method - 'pearson' (default) or 'spearman'. + + :return: List of selected feature names. + """ + # Retrieve the feature matrix + matrix = fsdf.get_matrix() + + # Retrieve feature names + feature_names = fsdf.get_features_names() + + # Compute correlation matrix + if corr_method == "pearson": + corr_matrix = np.corrcoef(matrix, rowvar=False) + elif corr_method == "spearman": + corr_matrix, _ = spearmanr(matrix) + else: + raise ValueError(f"Unsupported correlation method '{corr_method}'. Use 'pearson' or 'spearman'.") + + # Get absolute values of correlations to check magnitude + corr_matrix = np.abs(corr_matrix) + + # Find pairs of features with correlation above the threshold + combs_above_cutoff = np.triu(corr_matrix, k=1) > corr_threshold + correlated_pairs = np.column_stack(np.where(combs_above_cutoff)) + + # Set of indices to remove + index_to_remove = set() + if strict: + # Strict filtering: remove features with higher mean correlations + col_means = np.mean(corr_matrix, axis=1) + for i, j in correlated_pairs: + if col_means[i] > col_means[j]: + index_to_remove.add(i) + else: + index_to_remove.add(j) + else: + # Experimental approximate method + index_to_remove = find_maximal_independent_set(correlated_pairs, keep=False) + + # Select feature names to keep + features_to_remove = [feature_names[i] for i in index_to_remove] + selected_features = [f for f in feature_names if f not in features_to_remove] + + return selected_features diff --git a/fslite/fs/univariate.py b/fslite/fs/univariate.py index f584776..e9a8144 100644 --- a/fslite/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -7,12 +7,113 @@ from fslite.fs.constants import get_fs_univariate_methods, is_valid_univariate_method from fslite.fs.fdataframe import FSDataFrame +from fslite.fs.methods import FSMethod, InvalidMethodError logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("FS:UNIVARIATE") logger.setLevel(logging.INFO) +class FSUnivariate(FSMethod): + """ + A class for univariate feature selection methods. + + Attributes: + fs_method (str): The univariate method to be used for feature selection. + kwargs (dict): Additional keyword arguments for the feature selection method. + """ + + valid_methods = get_fs_univariate_methods() + + def __init__(self, fs_method: str, **kwargs): + """ + Initialize the univariate feature selection method with the specified parameters. + + Parameters: + fs_method: The univariate method to be used for feature selection. + kwargs: Additional keyword arguments for the feature selection method. + """ + + super().__init__(fs_method, **kwargs) + self.validate_method(fs_method) + + def validate_method(self, fs_method: str): + """ + Validate the univariate method. + + Parameters: + fs_method: The univariate method to be validated. + """ + + if not is_valid_univariate_method(fs_method): + raise InvalidMethodError( + f"Invalid univariate method: {fs_method}. " + f"Accepted methods are {', '.join(self.valid_methods)}" + ) + + def select_features(self, fsdf) -> FSDataFrame: + """ + Select features using the specified univariate method. + + Parameters: + fsdf: The data frame on which feature selection is to be performed. + + Returns: + The selected features. + """ + + return self.univariate_filter( + fsdf, univariate_method=self.fs_method, **self.kwargs + ) + + def __str__(self): + return f"FSUnivariate(method={self.fs_method}, kwargs={self.kwargs})" + + def __repr__(self): + return self.__str__() + + def univariate_filter( + self, df: FSDataFrame, univariate_method: str = "u_corr", **kwargs + ) -> FSDataFrame: + """ + Filter features after applying a univariate feature selector method. + + :param df: Input DataFrame + :param univariate_method: Univariate selector method ('u_corr', 'anova', 'f_regression') + + :return: Filtered DataFrame with selected features + """ + + if not is_valid_univariate_method(univariate_method): + raise NotImplementedError( + "The provided method {} is not implemented !! please select one from this list {}".format( + univariate_method, get_fs_univariate_methods() + ) + ) + + selected_features = [] + + if univariate_method == "anova": + # TODO: Implement ANOVA selector + # selected_features = univariate_selector(df, features, label, label_type='categorical', **kwargs) + pass + elif univariate_method == "f_regression": + # TODO: Implement F-regression selector + # selected_features = univariate_selector(df, features, label, label_type='continuous', **kwargs) + pass + elif univariate_method == "u_corr": + selected_features = univariate_correlation_selector(df, **kwargs) + + logger.info(f"Applying univariate filter using method: {univariate_method}") + + if len(selected_features) == 0: + logger.warning("No features selected. Returning original DataFrame.") + return df + else: + logger.info(f"Selected {len(selected_features)} features...") + return df.select_features_by_index(selected_features) + + def compute_univariate_corr(df: FSDataFrame) -> Dict[int, float]: """ Compute the correlation coefficient between every column (features) in the input NumPy array and the label (class) @@ -103,45 +204,3 @@ def univariate_selector( selected_features = [features[i] for i in selected_indices] return selected_features - - -def univariate_filter( - df: FSDataFrame, univariate_method: str = "u_corr", **kwargs -) -> FSDataFrame: - """ - Filter features after applying a univariate feature selector method. - - :param df: Input DataFrame - :param univariate_method: Univariate selector method ('u_corr', 'anova', 'f_regression') - - :return: Filtered DataFrame with selected features - """ - - if not is_valid_univariate_method(univariate_method): - raise NotImplementedError( - "The provided method {} is not implemented !! please select one from this list {}".format( - univariate_method, get_fs_univariate_methods() - ) - ) - - selected_features = [] - - if univariate_method == "anova": - # TODO: Implement ANOVA selector - # selected_features = univariate_selector(df, features, label, label_type='categorical', **kwargs) - pass - elif univariate_method == "f_regression": - # TODO: Implement F-regression selector - # selected_features = univariate_selector(df, features, label, label_type='continuous', **kwargs) - pass - elif univariate_method == "u_corr": - selected_features = univariate_correlation_selector(df, **kwargs) - - logger.info(f"Applying univariate filter using method: {univariate_method}") - - if len(selected_features) == 0: - logger.warning("No features selected. Returning original DataFrame.") - return df - else: - logger.info(f"Selected {len(selected_features)} features...") - return df.select_features_by_index(selected_features) diff --git a/setup.py b/setup.py index a6a20a1..63593d3 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ "scikit-learn", "scipy", "psutil", - "matplotlib" + "matplotlib", ], classifiers=[ # Classifiers for your package From 7a08e8252777655e8d8e8d183cb16197cc854faf Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 06:15:07 +0100 Subject: [PATCH 33/62] Another refactoring --- fslite/fs/methods.py | 259 ++++++++++++------------ fslite/tests/test_univariate_methods.py | 9 +- 2 files changed, 131 insertions(+), 137 deletions(-) diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index 31bfa92..9ed62f3 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -1,16 +1,9 @@ from abc import ABC, abstractmethod -from typing import List, Type, Union, Tuple, Optional, Dict, Any - -from fslite.fs.constants import ( - FS_METHODS, - get_fs_multivariate_methods, - get_fs_ml_methods, - get_fs_method_details, -) +from typing import List, Type, Union, Optional, Dict, Any + +from fslite.fs.constants import get_fs_method_details from fslite.fs.fdataframe import FSDataFrame -from fslite.fs.ml import MLCVModel -from fslite.fs.multivariate import multivariate_filter -from fslite.fs.univariate import univariate_filter + class FSMethod(ABC): @@ -67,128 +60,128 @@ def set_params(self, **kwargs): """ self.kwargs.update(kwargs) -class FSPipeline: - """ - The FSPipeline class creates a pipeline of feature selection methods. It provides a way to - chain multiple feature selection methods together to create a pipeline of feature selection methods. - - Example Usage - ------------- - # Create an instance of FSPipeline with the specified feature selection methods - fs_pipeline = FSPipeline(fs_methods=[FSUnivariate('anova'), FSMultivariate('m_corr')]) - - # Select features using the pipeline - selected_features = fs_pipeline.select_features(fsdf) - """ - - _valid_methods: List[Type[Union[FSUnivariate, FSMultivariate, FSMLMethod]]] = [ - FSUnivariate, - FSMultivariate, - FSMLMethod, - ] - - def __init__( - self, - df_training: FSDataFrame, - df_testing: Optional[FSDataFrame], - fs_stages: List[Union[FSUnivariate, FSMultivariate, FSMLMethod]], - ): - """ - Initialize the feature selection pipeline with the specified feature selection methods. - - Parameters: - df_training: The training data frame on which the feature selection pipeline is to be run. - df_testing: The testing data frame on which the ML wrapper method (if any) is to be evaluated. - fs_stages: A list of feature selection methods to be used in the pipeline. - """ - - self.df_training = df_training - self.df_testing = df_testing - self.fs_stages = fs_stages - self.validate_methods() - - self.pipeline_results = {} - - def validate_methods(self): - """ - Validate the feature selection methods in the pipeline. - """ - # check if the pipeline contains at least one feature selection method - if len(self.fs_stages) == 0: - raise ValueError( - "The pipeline must contain at least one feature selection method." - ) - - # check if the feature selection methods are valid - if not all( - isinstance(method, tuple(self._valid_methods)) for method in self.fs_stages - ): - raise InvalidMethodError( - f"Invalid feature selection method. " - f"Accepted methods are {', '.join([str(m) for m in self._valid_methods])}" - ) - - # check if only one ML method is used in the pipeline - ml_methods = [ - method for method in self.fs_stages if isinstance(method, FSMLMethod) - ] - if len(ml_methods) > 1: - raise ValueError("Only one ML method is allowed in the pipeline.") - - def run(self) -> Dict[str, Any]: - """ - Run the feature selection pipeline. - - Returns: - A dictionary with the results of the feature selection pipeline. - """ - - # apply each feature selection method in the pipeline sequentially - n_stages = len(self.fs_stages) - fsdf_tmp = self.df_training - - self.pipeline_results.update(n_stages=n_stages) - - for i, method in enumerate(self.fs_stages): - print( - f"Running stage {i + 1} of {n_stages} of the feature selection pipeline: {method}" - ) - if isinstance(method, FSMLMethod): - - fsdf_tmp = method.select_features(fsdf_tmp) - - # collect the results during the feature selection process (rfe iterations, feature scores, etc.) - self.pipeline_results.update(rfe_iterations=method.rfe_iterations) - self.pipeline_results.update(feature_scores=method.get_feature_scores()) - self.pipeline_results.update(eval_metric=method.get_eval_metric_name()) - self.pipeline_results.update( - rfe_training_metric=method.get_eval_metric_on_training_rfe() - ) - self.pipeline_results.update( - training_metric=method.get_eval_metric_on_training() - ) - - if self.df_testing is not None: - - # evaluate the final model on the testing data (if available) - testing_metric = method.get_eval_metric_on_testing(self.df_testing) - self.pipeline_results.update(testing_metric=testing_metric) - - else: - fsdf_tmp = method.select_features(fsdf_tmp) - - self.pipeline_results.update( - n_initial_features=self.df_training.count_features() - ) - self.pipeline_results.update(n_selected_features=fsdf_tmp.count_features()) - - return self.pipeline_results - - def __str__(self): - return f"FSPipeline(fs_methods={self.fs_stages})" - - def __repr__(self): - return self.__str__() +# class FSPipeline: +# """ +# The FSPipeline class creates a pipeline of feature selection methods. It provides a way to +# chain multiple feature selection methods together to create a pipeline of feature selection methods. +# +# Example Usage +# ------------- +# # Create an instance of FSPipeline with the specified feature selection methods +# fs_pipeline = FSPipeline(fs_methods=[FSUnivariate('anova'), FSMultivariate('m_corr')]) +# +# # Select features using the pipeline +# selected_features = fs_pipeline.select_features(fsdf) +# """ +# +# _valid_methods: List[Type[Union[FSUnivariate, FSMultivariate, FSMLMethod]]] = [ +# FSUnivariate, +# FSMultivariate, +# FSMLMethod, +# ] +# +# def __init__( +# self, +# df_training: FSDataFrame, +# df_testing: Optional[FSDataFrame], +# fs_stages: List[Union[FSUnivariate, FSMultivariate, FSMLMethod]], +# ): +# """ +# Initialize the feature selection pipeline with the specified feature selection methods. +# +# Parameters: +# df_training: The training data frame on which the feature selection pipeline is to be run. +# df_testing: The testing data frame on which the ML wrapper method (if any) is to be evaluated. +# fs_stages: A list of feature selection methods to be used in the pipeline. +# """ +# +# self.df_training = df_training +# self.df_testing = df_testing +# self.fs_stages = fs_stages +# self.validate_methods() +# +# self.pipeline_results = {} +# +# def validate_methods(self): +# """ +# Validate the feature selection methods in the pipeline. +# """ +# # check if the pipeline contains at least one feature selection method +# if len(self.fs_stages) == 0: +# raise ValueError( +# "The pipeline must contain at least one feature selection method." +# ) +# +# # check if the feature selection methods are valid +# if not all( +# isinstance(method, tuple(self._valid_methods)) for method in self.fs_stages +# ): +# raise InvalidMethodError( +# f"Invalid feature selection method. " +# f"Accepted methods are {', '.join([str(m) for m in self._valid_methods])}" +# ) +# +# # check if only one ML method is used in the pipeline +# ml_methods = [ +# method for method in self.fs_stages if isinstance(method, FSMLMethod) +# ] +# if len(ml_methods) > 1: +# raise ValueError("Only one ML method is allowed in the pipeline.") +# +# def run(self) -> Dict[str, Any]: +# """ +# Run the feature selection pipeline. +# +# Returns: +# A dictionary with the results of the feature selection pipeline. +# """ +# +# # apply each feature selection method in the pipeline sequentially +# n_stages = len(self.fs_stages) +# fsdf_tmp = self.df_training +# +# self.pipeline_results.update(n_stages=n_stages) +# +# for i, method in enumerate(self.fs_stages): +# print( +# f"Running stage {i + 1} of {n_stages} of the feature selection pipeline: {method}" +# ) +# if isinstance(method, FSMLMethod): +# +# fsdf_tmp = method.select_features(fsdf_tmp) +# +# # collect the results during the feature selection process (rfe iterations, feature scores, etc.) +# self.pipeline_results.update(rfe_iterations=method.rfe_iterations) +# self.pipeline_results.update(feature_scores=method.get_feature_scores()) +# self.pipeline_results.update(eval_metric=method.get_eval_metric_name()) +# self.pipeline_results.update( +# rfe_training_metric=method.get_eval_metric_on_training_rfe() +# ) +# self.pipeline_results.update( +# training_metric=method.get_eval_metric_on_training() +# ) +# +# if self.df_testing is not None: +# +# # evaluate the final model on the testing data (if available) +# testing_metric = method.get_eval_metric_on_testing(self.df_testing) +# self.pipeline_results.update(testing_metric=testing_metric) +# +# else: +# fsdf_tmp = method.select_features(fsdf_tmp) +# +# self.pipeline_results.update( +# n_initial_features=self.df_training.count_features() +# ) +# self.pipeline_results.update(n_selected_features=fsdf_tmp.count_features()) +# +# return self.pipeline_results +# +# def __str__(self): +# return f"FSPipeline(fs_methods={self.fs_stages})" +# +# def __repr__(self): +# return self.__str__() class InvalidMethodError(Exception): diff --git a/fslite/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py index e97d636..4d16e0f 100644 --- a/fslite/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -2,7 +2,7 @@ from fslite.utils.datasets import get_tnbc_data_path from fslite.fs.fdataframe import FSDataFrame -from fslite.fs.univariate import univariate_filter +from fslite.fs.univariate import FSUnivariate def test_univariate_filter_corr(): @@ -17,9 +17,10 @@ def test_univariate_filter_corr(): # create FSDataFrame instance fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") - fsdf_filtered = univariate_filter( - fs_df, univariate_method="u_corr", corr_threshold=0.3 - ) + # create FSUnivariate instance + fs_univariate = FSUnivariate(fs_method="u_corr", corr_threshold=0.3) + + fsdf_filtered = fs_univariate.select_features(fs_df) assert fs_df.count_features() == 500 assert fsdf_filtered.count_features() == 211 From 5e56b2125e3c56036f3211ad64a7d41958bafab9 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 06:16:26 +0100 Subject: [PATCH 34/62] Another refactoring --- docs/README.data.md | 10 +++++----- docs/README.methods.md | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/README.data.md b/docs/README.data.md index bb82603..5305b74 100644 --- a/docs/README.data.md +++ b/docs/README.data.md @@ -1,9 +1,9 @@ -## fsspark - data structures +## fslite - data structures --- -`fsspark` is a Python package that provides a set of tools for feature selection in Spark. -Here we describe the main data structures used in `fsspark` and how to use them. +`fslite` is a Python package that provides a set of tools for feature selection in Spark. +Here we describe the main data structures used in `fslite` and how to use them. ### Input data @@ -32,7 +32,7 @@ The following is an example of a TSV file with a binary response variable: ### Import functions -`fsspark` provides two main functions to import data from a TSV file. +`fslite` provides two main functions to import data from a TSV file. - `import_table` - Import data from a TSV file into a Spark Data Frame (sdf). @@ -57,7 +57,7 @@ psdf = import_table_as_psdf('data.tsv.bgz', ### The Feature Selection Spark Data Frame (FSDataFrame) -The `FSDataFrame` (**Figure 1**) is a core functionality of `fsspark`. It is a wrapper around a Spark Data Frame (sdf) +The `FSDataFrame` (**Figure 1**) is a core functionality of `fslite`. It is a wrapper around a Spark Data Frame (sdf) that provides a set of methods to facilitate feature selection tasks. The `FSDataFrame` is initialized with a Spark Data Frame (sdf) or a Pandas on Spark Data Frame (psdf) and two mandatory arguments: `sample_col` and `label_col`. The `sample_col` argument is the name of the column in the sdf that diff --git a/docs/README.methods.md b/docs/README.methods.md index 2a1149e..9359f4c 100644 --- a/docs/README.methods.md +++ b/docs/README.methods.md @@ -1,10 +1,10 @@ -# fsspark - features selection methods +# fslite - features selection methods --- -`fsspark `includes a set of methods to perform feature selection and machine learning based on spark. -A typical workflow written using `fsspark` can be divided roughly in four major stages: +`fslite `includes a set of methods to perform feature selection and machine learning based on spark. +A typical workflow written using `fslite` can be divided roughly in four major stages: 1) data pre-processing. 2) univariate filters. From 9b74ada7ec933dc5ce82e86b3f3069e2d367239f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 06:20:37 +0100 Subject: [PATCH 35/62] Another refactoring --- fslite/fs/constants.py | 2 ++ fslite/fs/methods.py | 4 ++-- fslite/fs/ml.py | 6 ----- fslite/fs/multivariate.py | 29 ++++--------------------- fslite/pipeline/fs_pipeline_example.py | 1 + fslite/tests/generate_big_tests.py | 2 +- fslite/tests/test_fsdataframe.py | 5 +++-- fslite/tests/test_univariate_methods.py | 4 ++-- fslite/utils/io.py | 1 - 9 files changed, 15 insertions(+), 39 deletions(-) diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index c45dd65..da81329 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -101,6 +101,7 @@ def is_valid_univariate_method(method_name: str) -> bool: return True return False + def is_valid_multivariate_method(method_name: str) -> bool: """ This method check if the given method name is a supported multivariate method @@ -112,6 +113,7 @@ def is_valid_multivariate_method(method_name: str) -> bool: return True return False + def is_valid_ml_method(method_name: str) -> bool: """ This method check if the given method name is a supported machine learning method diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index 9ed62f3..adf67fe 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -1,11 +1,10 @@ from abc import ABC, abstractmethod -from typing import List, Type, Union, Optional, Dict, Any +from typing import List from fslite.fs.constants import get_fs_method_details from fslite.fs.fdataframe import FSDataFrame - class FSMethod(ABC): """ Feature selection abtract class, this class defines the basic structure of a feature selection method. @@ -60,6 +59,7 @@ def set_params(self, **kwargs): """ self.kwargs.update(kwargs) + # class FSPipeline: # """ # The FSPipeline class creates a pipeline of feature selection methods. It provides a way to diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index 77c9f69..30bd5a7 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -5,10 +5,6 @@ """ -import warnings -from typing import List, Any, Dict, Optional, Union - -import pandas as pd from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method from fslite.fs.fdataframe import FSDataFrame from fslite.fs.methods import FSMethod, InvalidMethodError @@ -243,8 +239,6 @@ def __repr__(self): return self.__str__() - - # # # # Define an abstract class that allow to create a factory of models diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index d59ca5e..444a5b5 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -1,38 +1,16 @@ -# import logging -# from typing import List -# -# import numpy as np -# import pyspark -# from pyspark.ml.feature import VarianceThresholdSelector -# from pyspark.ml.stat import Correlation -# -# from fslite.fs.constants import ( -# MULTIVARIATE_METHODS, -# MULTIVARIATE_CORRELATION, -# MULTIVARIATE_VARIANCE, -# ) -# -# from fslite.fs.core import FSDataFrame -# from fslite.fs.utils import find_maximal_independent_set -# from fslite.utils.generic import tag -# -# logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") -# logger = logging.getLogger("FSSPARK:MULTIVARIATE") -# logger.setLevel(logging.INFO) -# import logging from typing import List import numpy as np from scipy.stats import spearmanr -from fslite.fs.constants import get_fs_multivariate_methods +from fslite.fs.constants import get_fs_multivariate_methods, is_valid_multivariate_method from fslite.fs.fdataframe import FSDataFrame from fslite.fs.methods import FSMethod, InvalidMethodError from fslite.fs.utils import find_maximal_independent_set logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") -logger = logging.getLogger("FS:UNIVARIATE") +logger = logging.getLogger("FS:MULTIVARIATE") logger.setLevel(logging.INFO) class FSMultivariate(FSMethod): @@ -102,7 +80,8 @@ def multivariate_filter( if multivariate_method == "m_corr": selected_features = multivariate_correlation_selector(fsdf, **kwargs) elif multivariate_method == "variance": - selected_features = multivariate_variance_selector(fsdf, **kwargs) + # selected_features = multivariate_variance_selector(fsdf, **kwargs) + logging.info("Variance method not implemented yet.") else: raise ValueError( f"Invalid multivariate method: {multivariate_method}. " diff --git a/fslite/pipeline/fs_pipeline_example.py b/fslite/pipeline/fs_pipeline_example.py index 32159e9..1d3c539 100644 --- a/fslite/pipeline/fs_pipeline_example.py +++ b/fslite/pipeline/fs_pipeline_example.py @@ -8,6 +8,7 @@ from fslite.config.context import init_spark, stop_spark_session from fslite.fs.core import FSDataFrame + from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod from fslite.utils.datasets import get_tnbc_data_path from fslite.utils.io import import_table_as_psdf diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py index ccc6f19..94d9c5f 100644 --- a/fslite/tests/generate_big_tests.py +++ b/fslite/tests/generate_big_tests.py @@ -1,7 +1,7 @@ import logging -import pandas as pd import numpy as np +import pandas as pd import pyarrow as pa import pyarrow.parquet as pq diff --git a/fslite/tests/test_fsdataframe.py b/fslite/tests/test_fsdataframe.py index 21879c7..039637c 100644 --- a/fslite/tests/test_fsdataframe.py +++ b/fslite/tests/test_fsdataframe.py @@ -1,8 +1,9 @@ +import gc + +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import matplotlib.pyplot as plt from memory_profiler import memory_usage -import gc from fslite.fs.fdataframe import FSDataFrame diff --git a/fslite/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py index 4d16e0f..1da779e 100644 --- a/fslite/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -1,8 +1,8 @@ import pandas as pd -from fslite.utils.datasets import get_tnbc_data_path -from fslite.fs.fdataframe import FSDataFrame +from fslite.fs.fdataframe import FSDataFrame from fslite.fs.univariate import FSUnivariate +from fslite.utils.datasets import get_tnbc_data_path def test_univariate_filter_corr(): diff --git a/fslite/utils/io.py b/fslite/utils/io.py index 74c202c..02dd07f 100644 --- a/fslite/utils/io.py +++ b/fslite/utils/io.py @@ -2,7 +2,6 @@ import pyspark.pandas import pyspark.sql - from fslite.config.context import PANDAS_ON_SPARK_API_SETTINGS warnings.filterwarnings("ignore") From b1c4ad5be292f0a954d534e41e7044143ecfec6a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 06:27:08 +0100 Subject: [PATCH 36/62] refactoring ml methods --- fslite/fs/ml.py | 548 ++++++++++++++---------------------------------- 1 file changed, 153 insertions(+), 395 deletions(-) diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index 30bd5a7..6de1fd4 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -4,10 +4,19 @@ for feature selection (e.g., rank by feature importance) and prediction. """ +from typing import Union, Optional, Dict, Any from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method from fslite.fs.fdataframe import FSDataFrame from fslite.fs.methods import FSMethod, InvalidMethodError +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.svm import SVC, LinearSVC +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, make_scorer +import pandas as pd + +from fslite.fs.ml import MLCVModel class FSMLMethod(FSMethod): @@ -238,399 +247,148 @@ def __str__(self): def __repr__(self): return self.__str__() +class MLCVModel: + """ + A factory class for creating various machine learning models with scikit-learn. + ML models are created using a cross-validator approach for hyperparameter tuning. + """ + + _grid_search: GridSearchCV = None + _best_model: object = None + _fsdf: FSDataFrame = None + + def __init__( + self, + estimator: Union[ + RandomForestClassifier, + RandomForestRegressor, + LinearSVC, + LogisticRegression, + SVC + ], + scoring: str, + estimator_params: Optional[Dict[str, Any]] = None, + grid_params: Optional[Dict[str, List[Any]]] = None, + cv: int = 5, + ): + """ + Initializes the MLModel with optional estimator, scoring method, and parameter specifications. + """ + self.estimator = estimator + self.scoring = scoring + self.estimator_params = estimator_params + self.grid_params = grid_params + self.cv = cv + + self._initialize_model() + + def _initialize_model(self): + if self.estimator_params: + self.estimator.set_params(**self.estimator_params) + + if self.grid_params: + self._grid_search = GridSearchCV( + estimator=self.estimator, + param_grid=self.grid_params, + scoring=self.scoring, + cv=self.cv + ) + + def fit(self, fsdf: FSDataFrame) -> "MLCVModel": + """ + Fit the model using the cross-validator. + """ + self._fsdf = fsdf + X, y = self._fsdf.get_features_and_labels() + + if self._grid_search: + self._grid_search.fit(X, y) + self._best_model = self._grid_search.best_estimator_ + else: + self.estimator.fit(X, y) + self._best_model = self.estimator -# -# -# # Define an abstract class that allow to create a factory of models -# # with the same interface -# # This class allows to perform the following operations: -# # - Define an Estimator -# # - Define an Evaluator -# # - Define a grid of parameters (model tuning) -# # - Define a cross-validator (model fitting) -# class MLCVModel: -# """ -# A factory class for creating various machine learning models with Spark MLlib. -# ML model are created using a cross-validator approach for hyperparameter tuning. -# """ -# -# _cross_validator: CrossValidator = None -# _fitted_cv_model: CrossValidatorModel = None -# _best_model: Model = None -# _fsdf: FSDataFrame = None -# -# def __init__( -# self, -# estimator: Union[ -# RandomForestClassifier -# | RandomForestRegressionModel -# | LinearSVC -# | LogisticRegression -# ], -# evaluator: Union[ -# BinaryClassificationEvaluator -# | MulticlassClassificationEvaluator -# | RegressionEvaluator -# ], -# estimator_params: Optional[Dict[str, Any]] = None, -# evaluator_params: Optional[Dict[str, Any]] = None, -# grid_params: Optional[Dict[str, List[Any]]] = None, -# cv_params: Optional[Dict[str, Any]] = None, -# ): -# """ -# Initializes the MLModel with optional estimator, evaluator, and parameter specifications. -# """ -# self.estimator = estimator -# self.evaluator = evaluator -# self.estimator_params = estimator_params -# self.evaluator_params = evaluator_params -# self.grid_params = grid_params -# self.cv_params = cv_params -# -# self._initialize_model() -# -# def _initialize_model(self): -# # Validate and set estimator parameters -# if self.estimator: -# self._validate_estimator(self.estimator) -# self._validate_estimator_params(self.estimator_params) -# self._set_estimator_params() -# -# # Validate and evaluator -# if self.evaluator: -# self._validate_evaluator(self.evaluator) -# self._validate_evaluator_params(self.evaluator_params) -# self._set_evaluator_params() -# -# # Parse and set grid parameters -# if self.grid_params: -# self.grid_params = self._parse_grid_params(self.grid_params) -# -# # Initialize and set cross-validator parameters -# self._set_cross_validator() -# -# def _parse_grid_params( -# self, grid_params: Dict[str, List[Any]] -# ) -> List[Dict[Param, Any]]: -# """ -# Parse the grid parameters to create a list of dictionaries. -# -# :param grid_params: A dictionary containing the parameter names as keys and a list of values as values. -# :return: A list of dictionaries, where each dictionary represents a set of parameter values. -# """ -# grid = ParamGridBuilder() -# for param, values in grid_params.items(): -# if hasattr(self.estimator, param): -# grid = grid.addGrid(getattr(self.estimator, param), values) -# else: -# raise AttributeError( -# f"{self.estimator.__class__.__name__} does not have attribute {param}" -# ) -# return grid.build() -# -# def _validate_estimator(self, estimator: Estimator) -> "MLCVModel": -# """ -# Validate the estimator. -# -# :param estimator: The estimator to validate. -# :return: The validated estimator. -# """ -# # check estimator is an instance of ESTIMATORS_CLASSES -# if not isinstance(estimator, tuple(ESTIMATORS_CLASSES)): -# raise ValueError(f"Estimator must be an instance of {ESTIMATORS_CLASSES}") -# return self -# -# def _validate_evaluator(self, evaluator: Evaluator) -> "MLCVModel": -# """ -# Validate the evaluator. -# -# :param evaluator: The evaluator to validate. -# :return: The validated evaluator. -# """ -# # check evaluator is an instance of EVALUATORS_CLASSES -# if not isinstance(evaluator, tuple(EVALUATORS_CLASSES)): -# raise ValueError(f"Evaluator must be an instance of {EVALUATORS_CLASSES}") -# return self -# -# def _validate_estimator_params(self, estimator_params: Dict[str, Any]) -> None: -# """ -# Validate the estimator parameters. -# -# :param estimator_params: A dictionary containing the parameter names as keys and values as values. -# """ -# if estimator_params is None: -# return -# for param, _ in estimator_params.items(): -# if not self.estimator.hasParam(param): -# raise AttributeError( -# f"{self.estimator.__class__.__name__} does not have attribute {param}" -# ) -# -# def _validate_evaluator_params(self, evaluator_params: Dict[str, Any]) -> None: -# """ -# Validate the evaluator parameters. -# -# :param evaluator_params: A dictionary containing the parameter names as keys and values as values. -# """ -# if evaluator_params is None: -# return -# for param, _ in evaluator_params.items(): -# if not self.evaluator.hasParam(param): -# raise AttributeError( -# f"{self.evaluator.__class__.__name__} does not have attribute {param}" -# ) -# -# def _set_evaluator_params(self) -> "MLCVModel": -# """ -# Set evaluator parameters. -# """ -# if self.evaluator_params is not None: -# self.evaluator = self.evaluator.setParams(**self.evaluator_params) -# return self -# -# def _set_estimator_params(self) -> "MLCVModel": -# """ -# Set estimator parameters. -# """ -# if self.estimator_params is not None: -# self.estimator = self.estimator.setParams(**self.estimator_params) -# return self -# -# def _set_cv_params(self, cv_params: Dict[str, Any]) -> "MLCVModel": -# """ -# Parse the cross-validator parameters to create an instance of CrossValidator. -# -# :param cv_params: A dictionary containing the parameter names as keys and values as values. -# :return: An instance of CrossValidator. -# """ -# -# for param, value in cv_params.items(): -# if hasattr(self._cross_validator, param): -# setattr(self._cross_validator, param, value) -# else: -# raise AttributeError( -# f"{self._cross_validator.__class__.__name__} does not have attribute {param}" -# ) -# return self -# -# def _set_cross_validator(self) -> "MLCVModel": -# """ -# Build the model using the cross-validator. -# -# :return: The CrossValidator model. -# """ -# try: -# self._cross_validator = CrossValidator( -# estimator=self.estimator, -# estimatorParamMaps=self.grid_params, -# evaluator=self.evaluator, -# ) -# if self.cv_params is not None: -# self._cross_validator = self._cross_validator.setParams( -# **self.cv_params -# ) -# return self -# except Exception as e: -# print(f"An error occurred while creating the CrossValidator: {str(e)}") -# # Handle the exception or raise it to be handled by the caller -# raise -# -# def fit(self, fsdf: FSDataFrame) -> "MLCVModel": -# """ -# Fit the model using the cross-validator. -# -# :return: The CrossValidatorModel after fitting. -# """ -# # Extract the Spark DataFrame and label column name from FSDataFrame -# self._fsdf = fsdf -# -# if ( -# self._cross_validator is None -# or self.estimator is None -# or self.evaluator is None -# ): -# raise ValueError( -# "Cross-validator, estimator, or evaluator not set properly." -# ) -# -# self._fitted_cv_model = self._cross_validator.fit(self._fsdf.get_sdf_vector()) -# return self -# -# def _get_best_model(self) -> Model: -# """ -# Get the best model from the fitted CrossValidatorModel. -# -# :return: The best model. -# """ -# if self._fitted_cv_model is None: -# raise ValueError( -# "CrossValidatorModel not fitted. Use fit() to fit the model." -# ) -# self._best_model = self._fitted_cv_model.bestModel -# return self._best_model -# -# # define a static method that allows to set a ml model based on the model type -# @staticmethod -# def create_model( -# model_type: str, -# estimator_params: Dict[str, Any] = None, -# evaluator_params: Dict[str, Any] = None, -# grid_params: Dict[str, List[Any]] = None, -# cv_params: Dict[str, Any] = None, -# ) -> "MLCVModel": -# """ -# Set a machine learning model based on the model type. -# -# :param model_type: The type of model to set. -# :param estimator_params: Parameters for the estimator. -# :param evaluator_params: Parameters for the evaluator. -# :param grid_params: A dictionary containing the parameter names as keys and a list of values as values. -# :param cv_params: Parameters for the cross-validator. -# -# :return: An instance of MLModel. -# """ -# if model_type == RF_BINARY: -# estimator = RandomForestClassifier() -# evaluator = BinaryClassificationEvaluator() -# elif model_type == LSVC_BINARY: -# estimator = LinearSVC() -# evaluator = BinaryClassificationEvaluator() -# elif model_type == RF_MULTILABEL: -# estimator = RandomForestClassifier() -# evaluator = MulticlassClassificationEvaluator() -# elif model_type == LR_MULTILABEL: -# estimator = LogisticRegression() -# evaluator = MulticlassClassificationEvaluator() -# elif model_type == RF_REGRESSION: -# estimator = RandomForestRegressor() -# evaluator = RegressionEvaluator() -# else: -# raise ValueError( -# f"Unsupported model type: {model_type}." -# f"Supported model types are: {list(ML_METHODS.keys())}" -# ) -# -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=estimator_params, -# evaluator_params=evaluator_params, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# return ml_method -# -# def get_eval_metric_name(self) -> str: -# """ -# Get the evaluation metric name. -# -# :return: The evaluation metric name. -# """ -# return self.evaluator.getMetricName() -# -# def get_feature_scores(self) -> pd.DataFrame: -# -# # TODO: This function should be able to parse all available models. -# -# indexed_features = self._fsdf.get_features_indexed() -# best_model = self._get_best_model() -# -# # raise exception if the model is not none -# if best_model is None: -# raise ValueError( -# "No ML model have been fitted. Use fit() to fit the model." -# ) -# -# df_features = pd.DataFrame(indexed_features.to_numpy(), columns=["features"]) -# -# if isinstance( -# best_model, (RandomForestClassificationModel, RandomForestRegressionModel) -# ): -# df_scores = pd.DataFrame( -# data=best_model.featureImportances.toArray(), columns=["scores"] -# ) -# -# df_scores = df_scores.reset_index(level=0).rename( -# columns={"index": "feature_index"} -# ) -# -# # merge the feature scores with the feature names -# df = df_features.merge( -# df_scores, how="right", left_index=True, right_index=True -# ) # index-to-index merging -# -# # sort the dataframe by scores in descending order -# df = df.sort_values(by="scores", ascending=False) -# -# # add feature percentile rank to the features_scores dataframe -# df["percentile_rank"] = df["scores"].rank(pct=True) -# -# return df -# -# else: -# raise ValueError( -# "Unsupported model type. " -# "Only RandomForestClassificationModel, " -# "RandomForestRegressionModel, and LinearSVCModel are supported." -# ) -# -# def get_eval_metric_on_training(self) -> float: -# """ -# Get the evaluation metric on training data from a trained CrossValidatorModel (best model). -# -# :return: A dictionary containing the evaluation metric name and value. -# """ -# -# # TODO: This function should be able to parse all available models. -# -# # get the best model from the fitted cross-validator model -# best_model = self._get_best_model() -# -# # get the eval metric name from the evaluator -# eval_metric_name = self.get_eval_metric_name() -# -# if isinstance( -# best_model, (RandomForestClassificationModel, LogisticRegressionModel) -# ): -# metric_value = getattr(best_model.summary, eval_metric_name) -# -# elif isinstance(best_model, LinearSVCModel): -# metric_value = getattr(best_model.summary(), eval_metric_name) -# -# else: -# warnings.warn("Unsupported model type. Unable to get evaluation metric.") -# metric_value = None -# -# return metric_value -# -# def get_eval_metric_on_testing(self, test_data: FSDataFrame) -> float: -# """ -# Get accuracy on test data from a trained CrossValidatorModel (best model). -# -# :param test_data: The test data as a FSDataFrame object. -# :return: accuracy -# """ -# -# # TODO: This function should be able to parse all available models. -# -# # get the best model from the fitted cross-validator model -# best_model = self._get_best_model() -# -# # get test data features harmonized with training features -# training_features = self._fsdf.get_features_names() -# test_data = test_data.filter_features(training_features, keep=True) -# -# # predict the test data -# predictions = None -# if isinstance( -# best_model, -# (RandomForestClassificationModel, LinearSVCModel, LogisticRegressionModel), -# ): -# predictions = best_model.transform(test_data.get_sdf_vector()) -# -# metric_value = None -# if predictions is not None: -# metric_value = self.evaluator.evaluate(predictions) -# -# return metric_value + return self + + def _get_best_model(self): + if self._best_model is None: + raise ValueError("No model has been fitted. Use fit() to fit the model.") + return self._best_model + + def get_feature_scores(self) -> pd.DataFrame: + """ + Get feature importance scores from the best model. + """ + if not isinstance(self._best_model, (RandomForestClassifier, RandomForestRegressor)): + raise ValueError("Feature importance is only available for tree-based models.") + + features = self._fsdf.get_feature_names() + importances = self._best_model.feature_importances_ + df = pd.DataFrame({ + 'feature': features, + 'importance': importances + }).sort_values(by='importance', ascending=False) + + return df + + def get_eval_metric_on_training(self) -> float: + """ + Get the evaluation metric on training data from the best model. + """ + X_train, y_train = self._fsdf.get_features_and_labels() + y_pred = self._best_model.predict(X_train) + + if self.scoring == 'accuracy': + return accuracy_score(y_train, y_pred) + elif self.scoring == 'f1': + return f1_score(y_train, y_pred) + elif self.scoring == 'roc_auc': + return roc_auc_score(y_train, y_pred) + else: + raise ValueError("Unsupported scoring method.") + + def get_eval_metric_on_testing(self, test_data: FSDataFrame) -> float: + """ + Get evaluation metric on test data from the trained model. + """ + X_test, y_test = test_data.get_features_and_labels() + y_pred = self._best_model.predict(X_test) + + if self.scoring == 'accuracy': + return accuracy_score(y_test, y_pred) + elif self.scoring == 'f1': + return f1_score(y_test, y_pred) + elif self.scoring == 'roc_auc': + return roc_auc_score(y_test, y_pred) + else: + raise ValueError("Unsupported scoring method.") + + @staticmethod + def create_model( + model_type: str, + estimator_params: Dict[str, Any] = None, + grid_params: Dict[str, List[Any]] = None, + scoring: str = 'accuracy', + cv: int = 5 + ) -> "MLCVModel": + """ + Create an ML model based on the model type. + """ + if model_type == "RF_BINARY": + estimator = RandomForestClassifier() + elif model_type == "LSVC_BINARY": + estimator = LinearSVC() + elif model_type == "RF_REGRESSION": + estimator = RandomForestRegressor() + elif model_type == "LOGISTIC_REGRESSION": + estimator = LogisticRegression() + else: + raise ValueError(f"Unsupported model type: {model_type}.") + + return MLCVModel( + estimator=estimator, + scoring=scoring, + estimator_params=estimator_params, + grid_params=grid_params, + cv=cv, + ) From c657be9fe9e2e80251a487aa8a7c2b21d8e1d9d1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 06:48:52 +0100 Subject: [PATCH 37/62] refactoring ml methods --- fslite/fs/ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index 6de1fd4..0a797a9 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -4,7 +4,7 @@ for feature selection (e.g., rank by feature importance) and prediction. """ -from typing import Union, Optional, Dict, Any +from typing import Union, Optional, Dict, Any, List from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method from fslite.fs.fdataframe import FSDataFrame From c46167c7ab59a113a6bf55d76ddd0fc806085c0b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 09:23:10 +0100 Subject: [PATCH 38/62] added file for experiments --- docs/EXPERIMENTS.md | 4 ++++ fslite/fs/constants.py | 11 +++++++++++ fslite/fs/ml.py | 37 +++++++++++++++++++++---------------- fslite/fs/multivariate.py | 19 +++++++++++++------ 4 files changed, 49 insertions(+), 22 deletions(-) create mode 100644 docs/EXPERIMENTS.md diff --git a/docs/EXPERIMENTS.md b/docs/EXPERIMENTS.md new file mode 100644 index 0000000..777d1e0 --- /dev/null +++ b/docs/EXPERIMENTS.md @@ -0,0 +1,4 @@ +## Experiments and Benchmarks + +This document contains the experiments and benchmarks that were conducted to evaluate the performance of fslite. +The experiments were conducted on the following datasets: \ No newline at end of file diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index da81329..79f9433 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -7,6 +7,11 @@ FS_METHODS = { "univariate": { "title": "Univariate Feature Selection", + "description": "Univariate feature selection refers to the process of selecting the most relevant features for " + "a machine learning model by evaluating each feature individually with respect to the target " + "variable using univariate statistical tests. It simplifies the feature selection process by " + "treating each feature independently and assessing its contribution to the predictive " + "performance of the model.", "methods": [ { "name": "anova", @@ -18,6 +23,11 @@ }, "multivariate": { "title": "Multivariate Feature Selection", + "description": "Multivariate feature selection is a method of selecting features by evaluating them in " + "combination rather than individually. Unlike univariate feature selection, which treats each " + "feature separately, multivariate feature selection considers the relationships and interactions " + "between multiple features and the target variable. This method aims to identify a subset of " + "features that work well together to improve the performance of a machine learning model.", "methods": [ {"name": "m_corr", "description": "Multivariate Correlation"}, {"name": "variance", "description": "Multivariate Variance"}, @@ -25,6 +35,7 @@ }, "ml": { "title": "Machine Learning Wrapper", + "description": "Machine learning wrapper methods are feature selection techniques that use a machine learning ", "methods": [ {"name": "rf_binary", "description": "Random Forest Binary Classifier"}, {"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"}, diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index 0a797a9..bf115e1 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -4,6 +4,7 @@ for feature selection (e.g., rank by feature importance) and prediction. """ + from typing import Union, Optional, Dict, Any, List from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method @@ -247,6 +248,7 @@ def __str__(self): def __repr__(self): return self.__str__() + class MLCVModel: """ A factory class for creating various machine learning models with scikit-learn. @@ -264,7 +266,7 @@ def __init__( RandomForestRegressor, LinearSVC, LogisticRegression, - SVC + SVC, ], scoring: str, estimator_params: Optional[Dict[str, Any]] = None, @@ -291,7 +293,7 @@ def _initialize_model(self): estimator=self.estimator, param_grid=self.grid_params, scoring=self.scoring, - cv=self.cv + cv=self.cv, ) def fit(self, fsdf: FSDataFrame) -> "MLCVModel": @@ -319,15 +321,18 @@ def get_feature_scores(self) -> pd.DataFrame: """ Get feature importance scores from the best model. """ - if not isinstance(self._best_model, (RandomForestClassifier, RandomForestRegressor)): - raise ValueError("Feature importance is only available for tree-based models.") + if not isinstance( + self._best_model, (RandomForestClassifier, RandomForestRegressor) + ): + raise ValueError( + "Feature importance is only available for tree-based models." + ) features = self._fsdf.get_feature_names() importances = self._best_model.feature_importances_ - df = pd.DataFrame({ - 'feature': features, - 'importance': importances - }).sort_values(by='importance', ascending=False) + df = pd.DataFrame({"feature": features, "importance": importances}).sort_values( + by="importance", ascending=False + ) return df @@ -338,11 +343,11 @@ def get_eval_metric_on_training(self) -> float: X_train, y_train = self._fsdf.get_features_and_labels() y_pred = self._best_model.predict(X_train) - if self.scoring == 'accuracy': + if self.scoring == "accuracy": return accuracy_score(y_train, y_pred) - elif self.scoring == 'f1': + elif self.scoring == "f1": return f1_score(y_train, y_pred) - elif self.scoring == 'roc_auc': + elif self.scoring == "roc_auc": return roc_auc_score(y_train, y_pred) else: raise ValueError("Unsupported scoring method.") @@ -354,11 +359,11 @@ def get_eval_metric_on_testing(self, test_data: FSDataFrame) -> float: X_test, y_test = test_data.get_features_and_labels() y_pred = self._best_model.predict(X_test) - if self.scoring == 'accuracy': + if self.scoring == "accuracy": return accuracy_score(y_test, y_pred) - elif self.scoring == 'f1': + elif self.scoring == "f1": return f1_score(y_test, y_pred) - elif self.scoring == 'roc_auc': + elif self.scoring == "roc_auc": return roc_auc_score(y_test, y_pred) else: raise ValueError("Unsupported scoring method.") @@ -368,8 +373,8 @@ def create_model( model_type: str, estimator_params: Dict[str, Any] = None, grid_params: Dict[str, List[Any]] = None, - scoring: str = 'accuracy', - cv: int = 5 + scoring: str = "accuracy", + cv: int = 5, ) -> "MLCVModel": """ Create an ML model based on the model type. diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index 444a5b5..cc23705 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -4,7 +4,10 @@ import numpy as np from scipy.stats import spearmanr -from fslite.fs.constants import get_fs_multivariate_methods, is_valid_multivariate_method +from fslite.fs.constants import ( + get_fs_multivariate_methods, + is_valid_multivariate_method, +) from fslite.fs.fdataframe import FSDataFrame from fslite.fs.methods import FSMethod, InvalidMethodError from fslite.fs.utils import find_maximal_independent_set @@ -13,6 +16,7 @@ logger = logging.getLogger("FS:MULTIVARIATE") logger.setLevel(logging.INFO) + class FSMultivariate(FSMethod): """ The FSMultivariate class is a subclass of the FSMethod class and is used for multivariate @@ -56,6 +60,7 @@ def validate_method(self, multivariate_method: str): f"Invalid multivariate method: " f"{multivariate_method}. Accepted methods are {', '.join(self.valid_methods)}" ) + def select_features(self, fsdf: FSDataFrame): """ Select features using the specified multivariate method. @@ -100,10 +105,10 @@ def __repr__(self): def multivariate_correlation_selector( - fsdf: FSDataFrame, - strict: bool = True, - corr_threshold: float = 0.75, - corr_method: str = "pearson", + fsdf: FSDataFrame, + strict: bool = True, + corr_threshold: float = 0.75, + corr_method: str = "pearson", ) -> List[str]: """ Compute the correlation matrix among input features and select those below a specified threshold. @@ -128,7 +133,9 @@ def multivariate_correlation_selector( elif corr_method == "spearman": corr_matrix, _ = spearmanr(matrix) else: - raise ValueError(f"Unsupported correlation method '{corr_method}'. Use 'pearson' or 'spearman'.") + raise ValueError( + f"Unsupported correlation method '{corr_method}'. Use 'pearson' or 'spearman'." + ) # Get absolute values of correlations to check magnitude corr_matrix = np.abs(corr_matrix) From 7b06d1e47eac164c9939433ecdffca15dd9fb349 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 09:30:43 +0100 Subject: [PATCH 39/62] minor comments --- fslite/fs/univariate.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/fslite/fs/univariate.py b/fslite/fs/univariate.py index e9a8144..899ba33 100644 --- a/fslite/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -17,10 +17,6 @@ class FSUnivariate(FSMethod): """ A class for univariate feature selection methods. - - Attributes: - fs_method (str): The univariate method to be used for feature selection. - kwargs (dict): Additional keyword arguments for the feature selection method. """ valid_methods = get_fs_univariate_methods() @@ -29,11 +25,9 @@ def __init__(self, fs_method: str, **kwargs): """ Initialize the univariate feature selection method with the specified parameters. - Parameters: - fs_method: The univariate method to be used for feature selection. - kwargs: Additional keyword arguments for the feature selection method. + :param fs_method: The univariate method to be used for feature selection. + :param kwargs: Additional keyword arguments for the feature selection method. """ - super().__init__(fs_method, **kwargs) self.validate_method(fs_method) @@ -41,8 +35,7 @@ def validate_method(self, fs_method: str): """ Validate the univariate method. - Parameters: - fs_method: The univariate method to be validated. + :param fs_method: The univariate method to be validated. """ if not is_valid_univariate_method(fs_method): @@ -55,11 +48,8 @@ def select_features(self, fsdf) -> FSDataFrame: """ Select features using the specified univariate method. - Parameters: - fsdf: The data frame on which feature selection is to be performed. - - Returns: - The selected features. + :param fsdf: The data frame on which feature selection is to be performed. + :return fsdf: The data frame with selected features. """ return self.univariate_filter( @@ -80,7 +70,6 @@ def univariate_filter( :param df: Input DataFrame :param univariate_method: Univariate selector method ('u_corr', 'anova', 'f_regression') - :return: Filtered DataFrame with selected features """ From 35f58a2be13b9f94d1764f64e496d0e860881496 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 09:44:53 +0100 Subject: [PATCH 40/62] minor refinements --- fslite/fs/fdataframe.py | 6 +- fslite/fs/multivariate.py | 4 +- fslite/fs/univariate.py | 39 +++---- fslite/pipeline/fs_pipeline_example.py | 138 ++++++++++++------------- fslite/tests/test_fsdataframe.py | 6 +- fslite/utils/io.py | 78 -------------- 6 files changed, 96 insertions(+), 175 deletions(-) diff --git a/fslite/fs/fdataframe.py b/fslite/fs/fdataframe.py index b5938fc..eec3352 100644 --- a/fslite/fs/fdataframe.py +++ b/fslite/fs/fdataframe.py @@ -1,5 +1,5 @@ import logging -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Union import numpy import numpy as np @@ -28,7 +28,7 @@ class FSDataFrame: feature names and samples labels. An object of FSDataFrame offers an interface to a DataFrame, a Pandas on DataFrame - (e.g., suitable for visualization) or a DataFrame with features as a Dense column vector (e.g. suitable for + (e.g., suitable for visualization) or a DataFrame with features as a Dense column vector (e.g. suitable for applying most algorithms from MLib API). It can also be split in training and testing dataset and filtered by removing selected features (by name or index). @@ -142,7 +142,7 @@ def __init__( self.__is_scaled = (False, None) - def get_feature_matrix(self) -> numpy.array: + def get_feature_matrix(self) -> Union[np.ndarray, sparse.csr_matrix]: return self.__matrix def get_label_vector(self) -> numpy.array: diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index cc23705..43e2036 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -122,10 +122,10 @@ def multivariate_correlation_selector( :return: List of selected feature names. """ # Retrieve the feature matrix - matrix = fsdf.get_matrix() + matrix = fsdf.get_feature_matrix() # Retrieve feature names - feature_names = fsdf.get_features_names() + feature_names = fsdf.get_label_col_name() # Compute correlation matrix if corr_method == "pearson": diff --git a/fslite/fs/univariate.py b/fslite/fs/univariate.py index 899ba33..1cfb8ba 100644 --- a/fslite/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -103,25 +103,6 @@ def univariate_filter( return df.select_features_by_index(selected_features) -def compute_univariate_corr(df: FSDataFrame) -> Dict[int, float]: - """ - Compute the correlation coefficient between every column (features) in the input NumPy array and the label (class) - using a dictionary comprehension. - - :param df: Input FSDataFrame - :return: Return dict {feature_index -> corr} - """ - - f_matrix = df.get_feature_matrix() # get the feature matrix - labels = df.get_label_vector() # get the label vector - features_index = range(f_matrix.shape[1]) # get the feature index - - return { - f_index: abs(np.corrcoef(f_matrix[:, f_index], labels)[0, 1]) - for f_index in features_index - } - - def univariate_correlation_selector( df: FSDataFrame, corr_threshold: float = 0.3 ) -> List[int]: @@ -134,7 +115,27 @@ def univariate_correlation_selector( :return: List of selected feature indices """ + + def compute_univariate_corr(df: FSDataFrame) -> Dict[int, float]: + """ + Compute the correlation coefficient between every column (features) in the input NumPy array and the label (class) + using a dictionary comprehension. + + :param df: Input FSDataFrame + :return: Return dict {feature_index -> corr} + """ + + f_matrix = df.get_feature_matrix() # get the feature matrix + labels = df.get_label_vector() # get the label vector + features_index = range(f_matrix.shape[1]) # get the feature index + + return { + f_index: abs(np.corrcoef(f_matrix[:, f_index], labels)[0, 1]) + for f_index in features_index + } + correlations = compute_univariate_corr(df) + selected_features = [ feature_index for feature_index, corr in correlations.items() diff --git a/fslite/pipeline/fs_pipeline_example.py b/fslite/pipeline/fs_pipeline_example.py index 1d3c539..96c1378 100644 --- a/fslite/pipeline/fs_pipeline_example.py +++ b/fslite/pipeline/fs_pipeline_example.py @@ -1,69 +1,69 @@ -""" -Example of a feature selection pipeline implemented in fslite. - -After data import and pre-processing, the pipeline applies univariate correlation filter, -multivariate correlation filter and Randon Forest classification. - -""" - -from fslite.config.context import init_spark, stop_spark_session -from fslite.fs.core import FSDataFrame - -from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod -from fslite.utils.datasets import get_tnbc_data_path -from fslite.utils.io import import_table_as_psdf - -# Init spark -init_spark( - apply_pyarrow_settings=True, - apply_extra_spark_settings=True, - apply_pandas_settings=True, -) - -# 1. Import data -df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) -fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") - -# 2. Split data -training_data, testing_data = fsdf.split_df(split_training_factor=0.6) - -# 3. Set feature selection methods -# create a Univariate object -univariate = FSUnivariate( - fs_method="anova", selection_mode="percentile", selection_threshold=0.8 -) - -# create a Multivariate object -multivariate = FSMultivariate( - fs_method="m_corr", corr_threshold=0.75, corr_method="pearson" -) - -# create a MLMethod object -rf_classifier = FSMLMethod( - fs_method="rf_multilabel", - rfe=True, - rfe_iterations=2, - percent_to_keep=0.9, - estimator_params={"labelCol": "label"}, - evaluator_params={"metricName": "accuracy"}, - grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]}, - cv_params={"parallelism": 2, "numFolds": 5}, -) - -# 4. Create a pipeline object -fs_pipeline = FSPipeline( - df_training=training_data, - df_testing=testing_data, - fs_stages=[univariate, multivariate, rf_classifier], -) - -# 5. Run the pipeline -results = fs_pipeline.run() - -# Print results -print(f"Accuracy on training: {results['training_metric']}") -print(f"Accuracy on testing: {results['testing_metric']}") -print(results.get("feature_scores")) - - -stop_spark_session() +# """ +# Example of a feature selection pipeline implemented in fslite. +# +# After data import and pre-processing, the pipeline applies univariate correlation filter, +# multivariate correlation filter and Randon Forest classification. +# +# """ +# +# from fslite.config.context import init_spark, stop_spark_session +# from fslite.fs.core import FSDataFrame +# +# from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod +# from fslite.utils.datasets import get_tnbc_data_path +# from fslite.utils.io import import_table_as_psdf +# +# # Init spark +# init_spark( +# apply_pyarrow_settings=True, +# apply_extra_spark_settings=True, +# apply_pandas_settings=True, +# ) +# +# # 1. Import data +# df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) +# fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") +# +# # 2. Split data +# training_data, testing_data = fsdf.split_df(split_training_factor=0.6) +# +# # 3. Set feature selection methods +# # create a Univariate object +# univariate = FSUnivariate( +# fs_method="anova", selection_mode="percentile", selection_threshold=0.8 +# ) +# +# # create a Multivariate object +# multivariate = FSMultivariate( +# fs_method="m_corr", corr_threshold=0.75, corr_method="pearson" +# ) +# +# # create a MLMethod object +# rf_classifier = FSMLMethod( +# fs_method="rf_multilabel", +# rfe=True, +# rfe_iterations=2, +# percent_to_keep=0.9, +# estimator_params={"labelCol": "label"}, +# evaluator_params={"metricName": "accuracy"}, +# grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]}, +# cv_params={"parallelism": 2, "numFolds": 5}, +# ) +# +# # 4. Create a pipeline object +# fs_pipeline = FSPipeline( +# df_training=training_data, +# df_testing=testing_data, +# fs_stages=[univariate, multivariate, rf_classifier], +# ) +# +# # 5. Run the pipeline +# results = fs_pipeline.run() +# +# # Print results +# print(f"Accuracy on training: {results['training_metric']}") +# print(f"Accuracy on testing: {results['testing_metric']}") +# print(results.get("feature_scores")) +# +# +# stop_spark_session() diff --git a/fslite/tests/test_fsdataframe.py b/fslite/tests/test_fsdataframe.py index 039637c..3bd0ad8 100644 --- a/fslite/tests/test_fsdataframe.py +++ b/fslite/tests/test_fsdataframe.py @@ -9,7 +9,6 @@ def test_initializes_fsdataframe(): - # Create a sample DataFrame data = { "sample_id": [1, 2, 3], @@ -29,7 +28,6 @@ def test_initializes_fsdataframe(): def test_scaler_df(): - # Create a sample DataFrame data = { "sample_id": [1, 2, 3], @@ -52,7 +50,7 @@ def test_scaler_df(): def test_memory_fsdataframe(): def create_test_data( - n_samples: int, n_features: int, zero_prob: float = 0.1, nan_prob: float = 0.05 + n_samples: int, n_features: int, zero_prob: float = 0.1, nan_prob: float = 0.05 ): """Create test data for FSDataFrame.""" data = np.random.rand(n_samples, n_features) @@ -101,7 +99,7 @@ def measure_memory_usage(n_samples: int, n_features: int, nan_prob=0.01) -> floa data = results_df[ (results_df["Features"] == feature_size) & (results_df["NAN Prob"] == prob) - ] + ] plt.plot( data["Samples"], data["Memory (MB)"], diff --git a/fslite/utils/io.py b/fslite/utils/io.py index 02dd07f..139597f 100644 --- a/fslite/utils/io.py +++ b/fslite/utils/io.py @@ -1,80 +1,2 @@ -import warnings -import pyspark.pandas -import pyspark.sql -from fslite.config.context import PANDAS_ON_SPARK_API_SETTINGS -warnings.filterwarnings("ignore") - - -def import_table( - path: str, header: bool = True, sep: str = "\t", n_partitions: int = 5 -) -> pyspark.sql.DataFrame: - """ - Import tsv file as Spark DataFrame. - - :param path: File path - :param header: True if the first row is header. - :param sep: Column separator - :param n_partitions: Minimal number of partitions - - :return: Spark DataFrame - """ - - _sc = pyspark.sql.SparkSession.getActiveSession() - - if _sc is None: - raise ValueError("Active Spark Session not found...") - - sdf = ( - _sc.read.option("delimiter", sep) - .option("header", header) - .option("inferSchema", "true") - .csv(path) - .repartition(n_partitions) - ) - return sdf - - -def import_parquet(path: str, header: bool = True) -> pyspark.sql.DataFrame: - """ - Import parquet file as Spark DataFrame. - - :param path: File path - :param header: True if the first row is header. - - :return: Spark DataFrame - """ - - _sc = pyspark.sql.SparkSession.getActiveSession() - - if _sc is None: - raise ValueError("Active Spark Session not found...") - - sdf = _sc.read.option("header", header).option("inferSchema", "true").parquet(path) - return sdf - - -def import_table_as_psdf( - path: str, sep: str = "\t", n_partitions: int = 5 -) -> pyspark.pandas.DataFrame: - """ - Import tsv file as Pandas on Spark DataFrame - - :param path: Path to TSV file - :param sep: Column separator (default: "\t") - :param n_partitions: Minimal number of partitions - - :return: Pandas on Spark DataFrame - """ - - import pyspark.pandas as ps - - # apply settings for pandas on spark api - [ - ps.set_option(k, PANDAS_ON_SPARK_API_SETTINGS.get(k)) - for k in PANDAS_ON_SPARK_API_SETTINGS.keys() - ] - - psdf = ps.read_csv(path, sep=sep).spark.repartition(n_partitions) - return psdf From 43dddb7c8f54c7acd6d92a17cdb0858486a91f94 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Mon, 23 Sep 2024 15:19:04 +0100 Subject: [PATCH 41/62] minor refinements --- examples/sc_human_atlas.py | 0 fslite/fs/constants.py | 26 ++------ fslite/fs/fdataframe.py | 2 + fslite/tests/test_data_preprocessing.py | 85 ------------------------- fslite/tests/test_import_export.py | 45 ------------- 5 files changed, 7 insertions(+), 151 deletions(-) create mode 100644 examples/sc_human_atlas.py delete mode 100644 fslite/tests/test_data_preprocessing.py delete mode 100644 fslite/tests/test_import_export.py diff --git a/examples/sc_human_atlas.py b/examples/sc_human_atlas.py new file mode 100644 index 0000000..e69de29 diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index 79f9433..70379ab 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -13,10 +13,7 @@ "treating each feature independently and assessing its contribution to the predictive " "performance of the model.", "methods": [ - { - "name": "anova", - "description": "Univariate ANOVA feature selection (f-classification)", - }, + {"name": "anova","description": "Univariate ANOVA feature selection (f-classification)"}, {"name": "u_corr", "description": "Univariate correlation"}, {"name": "f_regression", "description": "Univariate f-regression"}, ], @@ -39,28 +36,15 @@ "methods": [ {"name": "rf_binary", "description": "Random Forest Binary Classifier"}, {"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"}, - { - "name": "fm_binary", - "description": "Factorization Machine Binary Classifier", - }, - { - "name": "rf_multilabel", - "description": "Random Forest Multi-label Classifier", - }, - { - "name": "lg_multilabel", - "description": "Logistic Regression Multi-label Classifier", - }, + {"name": "fm_binary", "description": "Factorization Machine Binary Classifier"}, + {"name": "rf_multilabel", "description": "Random Forest Multi-label Classifier"}, + {"name": "lg_multilabel","description": "Logistic Regression Multi-label Classifier"}, {"name": "rf_regression", "description": "Random Forest Regression"}, - { - "name": "fm_regression", - "description": "Factorization Machine Regression", - }, + {"name": "fm_regression","description": "Factorization Machine Regression"}, ], }, } - def get_fs_methods(): """ Get the list of feature selection methods diff --git a/fslite/fs/fdataframe.py b/fslite/fs/fdataframe.py index eec3352..25a4dc3 100644 --- a/fslite/fs/fdataframe.py +++ b/fslite/fs/fdataframe.py @@ -61,6 +61,7 @@ def __init__( in the feature matrix exceeds this value, the matrix is stored in a sparse format unless memory allows. :param memory_threshold: Proportion of system memory available to use before deciding on sparse/dense. """ + # TODO: We are loading full data into memory, look for other options. Maybe Dask? self.__df = df.copy() # Check for necessary columns @@ -90,6 +91,7 @@ def __init__( self.__labels = df[label_col].tolist() # Encode labels + # TODO: Check if labels are categorical or continuous? For now, assume categorical label_encoder = LabelEncoder() self.__labels_matrix = label_encoder.fit_transform(df[label_col]).tolist() columns_to_drop.append(label_col) diff --git a/fslite/tests/test_data_preprocessing.py b/fslite/tests/test_data_preprocessing.py deleted file mode 100644 index dbf9f43..0000000 --- a/fslite/tests/test_data_preprocessing.py +++ /dev/null @@ -1,85 +0,0 @@ -# import unittest -# -# import numpy as np -# -# from fslite.config.context import init_spark, stop_spark_session -# from fslite.fs.core import FSDataFrame -# from fslite.fs.utils import ( -# compute_missingness_rate, -# remove_features_by_missingness_rate, -# impute_missing, -# ) -# from fslite.utils.datasets import get_tnbc_data_missing_values_path -# from fslite.utils.io import import_table_as_psdf -# -# -# class TestDataPreprocessing(unittest.TestCase): -# """ -# Define testing methods for data preprocessing (e.g, scaling, imputation, etc.) -# -# """ -# -# def setUp(self) -> None: -# init_spark( -# apply_pyarrow_settings=True, -# apply_extra_spark_settings=True, -# apply_pandas_settings=True, -# ) -# -# def tearDown(self) -> None: -# stop_spark_session() -# -# @staticmethod -# def import_FSDataFrame() -> FSDataFrame: -# """ -# Import FSDataFrame object with missing values. -# Number of samples: 44 -# Number of features: 10 (5 with missing values) -# :return: -# """ -# df = import_table_as_psdf(get_tnbc_data_missing_values_path(), n_partitions=5) -# fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") -# return fsdf -# -# def test_compute_missingness_rate(self): -# """ -# Test compute_missingness_rate method. -# :return: None -# """ -# -# fsdf = self.import_FSDataFrame() -# features_missing_rates = compute_missingness_rate(fsdf) -# self.assertEqual(features_missing_rates.get("tr|E9PBJ4"), 0.0) -# self.assertAlmostEqual(features_missing_rates.get("sp|P07437"), 0.295, places=2) -# -# def test_filter_by_missingness_rate(self): -# """ -# Test filter_missingness_rate method. -# :return: None -# """ -# -# fsdf = self.import_FSDataFrame() -# fsdf = remove_features_by_missingness_rate(fsdf, threshold=0.15) -# # print number of features -# print(f"Number of remaining features: {fsdf.count_features()}") -# -# self.assertEqual(fsdf.count_features(), 6) -# -# def test_impute_missing(self): -# """ -# Test impute_missing method. Impute missing values using the mean across columns. -# :return: None -# """ -# -# fsdf = self.import_FSDataFrame() -# fsdf = impute_missing(fsdf, strategy="mean") -# -# # Collect features as array -# array = fsdf._collect_features_as_array() -# -# # Check if there are no missing (NaNs) or null values -# self.assertFalse(np.isnan(array).any()) -# -# -# if __name__ == "__main__": -# unittest.main() diff --git a/fslite/tests/test_import_export.py b/fslite/tests/test_import_export.py deleted file mode 100644 index 32ee27a..0000000 --- a/fslite/tests/test_import_export.py +++ /dev/null @@ -1,45 +0,0 @@ -# import unittest -# -# import pyspark -# import pyspark.pandas as ps -# -# from fslite.config.context import init_spark, stop_spark_session -# from fslite.utils.datasets import get_tnbc_data_path -# from fslite.utils.io import import_table, import_table_as_psdf -# -# -# class TestImportExport(unittest.TestCase): -# -# def setUp(self) -> None: -# init_spark( -# apply_pyarrow_settings=True, -# apply_extra_spark_settings=True, -# apply_pandas_settings=True, -# ) -# -# def tearDown(self) -> None: -# stop_spark_session() -# -# def test_import_tsv(self): -# """ -# Test import tsv file as Spark DataFrame. -# :return: None -# """ -# df = import_table(path=get_tnbc_data_path(), n_partitions=10) -# -# self.assertIsInstance(df, pyspark.sql.DataFrame) -# self.assertEqual(df.count(), 44) -# -# def test_import_tsv_as_psdf(self): -# """ -# Test import tsv file as Pandas on Spark DataFrame (PoS). -# :return: None -# """ -# df = import_table_as_psdf(path=get_tnbc_data_path(), n_partitions=10) -# -# self.assertIsInstance(df, ps.frame.DataFrame) -# self.assertEqual(df.shape, (44, 502)) -# -# -# if __name__ == "__main__": -# unittest.main() From b6e8eabfda68babbd3b0b01651e3507cd83dbeb0 Mon Sep 17 00:00:00 2001 From: enriquea Date: Mon, 23 Sep 2024 17:25:10 +0200 Subject: [PATCH 42/62] added example script to parse single-cell data --- examples/1.parse_single_cell_data.py | 107 +++++++++++++++++++++++++++ examples/2.concat_parquet_files.py | 53 +++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 examples/1.parse_single_cell_data.py create mode 100644 examples/2.concat_parquet_files.py diff --git a/examples/1.parse_single_cell_data.py b/examples/1.parse_single_cell_data.py new file mode 100644 index 0000000..953e7ee --- /dev/null +++ b/examples/1.parse_single_cell_data.py @@ -0,0 +1,107 @@ +# Import and convert to parquet a single-cell dataset: GSE156793 (loom format) +# GEO URL: +# https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE156793&format=file&file=GSE156793%5FS3%5Fgene%5Fcount%2Eloom%2Egz + +# import libraries +import pandas as pd +import loompy + +# define the path to the loom file +loom_file = "/path/to/loom/GSE156793_S3_gene_count.loom" + +# connect to the loom file +ds = loompy.connect(loom_file) + +# get shape of the data +ds.shape + +# retrieve the row attributes +ds.ra.keys() + +# get gene ids +gene_ids = ds.ra["gene_id"] +gene_ids[0:10] + +# get the column attributes +ds.ca.keys() + +# get sample metadata +sample_id = ds.ca["sample"] +cell_cluster = ds.ca["Main_cluster_name"] +assay = ds.ca["Assay"] +development_day = ds.ca["Development_day"] + +# make a dataframe with the sample metadata +sample_df = pd.DataFrame({"sample_id": sample_id, + "cell_cluster": cell_cluster, + "assay": assay, + "development_day": development_day}) + +# print the first 5 rows +sample_df.head() + +# Make 'cell_cluster' a categorical variable encoded as an integer +sample_df["cell_cluster"] = sample_df["cell_cluster"].astype("category") +sample_df["cell_cluster_id"] = sample_df["cell_cluster"].cat.codes + +# Make 'assay' a categorical variable encoded as an integer +sample_df["assay"] = sample_df["assay"].astype("category") +sample_df["assay_id"] = sample_df["assay"].cat.codes + +# Make 'sample_id' the index +sample_df = sample_df.set_index("sample_id") + +# Show the first 5 rows +sample_df.head() + +# Save the sample metadata to parquet +(sample_df + .reset_index() + .to_parquet("sample_metadata.parquet.gz", + index=False, + engine="auto", + compression="gzip") + ) + + +# transpose dataset and convert to parquet. +# process the data per chunks. +chunk_size = 2000 +for (ix, selection, view) in ds.scan(axis=1, batch_size=chunk_size): + # retrieve the chunk + matrix_chunk = view[:, :] + + # transpose the data + matrix_chunk_t = matrix_chunk.T + + # convert to pandas dataframe + df_chunk = pd.DataFrame(matrix_chunk_t, + index=sample_id[selection.tolist()], + columns=gene_ids) + + # merge chunk with sample metadata + df_chunk = pd.merge( + left=sample_df[['cell_cluster_id', 'development_day', 'assay_id']], + right=df_chunk, + how="inner", + left_index=True, + right_index=True, + sort=False, + copy=True, + indicator=False, + validate="one_to_one" + ) + + # reset the index + df_chunk = df_chunk.reset_index() + + # rename the index column + df_chunk = df_chunk.rename(columns={"index": "sample_id"}) + + # save the chunk to parquet + df_chunk.to_parquet(f"gene_count_chunk_{ix}.parquet.gz", + index=False, + engine="fastparquet", + compression="gzip") + + print(f"Chunk {ix} saved") diff --git a/examples/2.concat_parquet_files.py b/examples/2.concat_parquet_files.py new file mode 100644 index 0000000..aa994db --- /dev/null +++ b/examples/2.concat_parquet_files.py @@ -0,0 +1,53 @@ +import pyspark +from pyspark.sql import SparkSession + +# create spark session +spark = SparkSession.builder \ + .master("local[*]") \ + .appName("fsspark") \ + .config("spark.sql.execution.arrow.pyspark.enabled", "true") \ + .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \ + .config("spark.sql.pivotMaxValues", "100000") \ + .config("spark.network.timeout", "100000") \ + .config("spark.sql.session.timeZone", "UTC") \ + .config("spark.executor.memory", "80g") \ + .config("spark.driver.memory", "100g") \ + .config("spark.memory.offHeap.enabled", "true") \ + .config("spark.memory.offHeap.size", "8g") \ + .config("spark.sql.session.timeZone", "UTC") \ + .getOrCreate() + + +# get all absolute paths of files in a directory +def get_files_paths(directory, extension: str = "parquet.gz"): + """ + Get all files paths in a directory. + :param extension: str, file extension. + :param directory: str, directory path. + :return: list, list of files paths. + """ + import os + files_paths = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith(extension): + files_paths.append(os.path.join(root, file)) + return files_paths + + +# get all files paths +files_paths = get_files_paths(directory="/mnt/nfs/user-data/eam/GSE156793/sdf", + extension="parquet.gz") + +# read all parquet files as spark dataframe and write (append) them to a single parquet file +for file_path in files_paths: + print("Processing file: {}".format(file_path)) + df = (spark + .read + .parquet(file_path) + .repartition(5) + ) + + df.write.parquet("/mnt/nfs/user-data/eam/GSE156793/GSE156793.sample_x_gene.parquet", + mode='append', + compression='gzip') \ No newline at end of file From 07a9dc54772ec66ed0be3c49cd365b38ebbe77c3 Mon Sep 17 00:00:00 2001 From: enriquea Date: Mon, 23 Sep 2024 23:07:08 +0200 Subject: [PATCH 43/62] implemented univariate selector methods (from sci-learn) and added tests. --- fslite/fs/constants.py | 6 +- fslite/fs/univariate.py | 139 +++++++++++++----------- fslite/tests/test_univariate_methods.py | 115 +++++++++++++++++++- 3 files changed, 192 insertions(+), 68 deletions(-) diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index 70379ab..52c08d1 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -13,9 +13,11 @@ "treating each feature independently and assessing its contribution to the predictive " "performance of the model.", "methods": [ - {"name": "anova","description": "Univariate ANOVA feature selection (f-classification)"}, - {"name": "u_corr", "description": "Univariate correlation"}, + {"name": "anova", "description": "Univariate ANOVA feature selection (f-classification)"}, + {"name": "u_corr", "description": "Univariate Pearson's correlation"}, {"name": "f_regression", "description": "Univariate f-regression"}, + {"name": "mutual_info_regression", "description": "Univariate mutual information regression"}, + {"name": "mutual_info_classification", "description": "Univariate mutual information classification"}, ], }, "multivariate": { diff --git a/fslite/fs/univariate.py b/fslite/fs/univariate.py index 1cfb8ba..47eea14 100644 --- a/fslite/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -2,8 +2,11 @@ from typing import Dict, List import numpy as np -import pandas as pd -from sklearn.feature_selection import SelectKBest, f_classif, f_regression +from sklearn.feature_selection import (GenericUnivariateSelect, + f_classif, + f_regression, + mutual_info_classif, + mutual_info_regression) from fslite.fs.constants import get_fs_univariate_methods, is_valid_univariate_method from fslite.fs.fdataframe import FSDataFrame @@ -62,14 +65,65 @@ def __str__(self): def __repr__(self): return self.__str__() + def univariate_feature_selector( + self, + df: FSDataFrame, + score_func: str = "f_classif", + selection_mode: str = "percentile", + selection_threshold: float = 0.8 + ) -> List[int]: + """ + Wrapper for scikit-learn's `GenericUnivariateSelect` feature selector, supporting multiple scoring functions. + + :param df: Input FSDataFrame + :param score_func: The score function to use for feature selection. Options are: + - 'f_classif': ANOVA F-value for classification tasks. + - 'f_regression': F-value for regression tasks. + - 'mutual_info_classif': Mutual information for classification. + - 'mutual_info_regression': Mutual information for regression. + :param selection_mode: Mode for feature selection (e.g. 'percentile' or 'k_best'). + :param selection_threshold: The percentage or number of features to select based on the selection mode. + + :return: List of selected feature indices. + """ + # Define the score function based on input + score_func_mapping = { + "f_classif": f_classif, + "f_regression": f_regression, + "mutual_info_classif": mutual_info_classif, + "mutual_info_regression": mutual_info_regression, + } + + if score_func not in score_func_mapping: + raise ValueError(f"Invalid score_func '{score_func}'. Valid options are: {list(score_func_mapping.keys())}") + + # Extract the score function + selected_score_func = score_func_mapping[score_func] + + # Get the feature matrix and label vector from the FSDataFrame + f_matrix = df.get_feature_matrix() + y = df.get_label_vector() + + # Configure the selector using the provided score function and selection mode/threshold + selector = GenericUnivariateSelect(score_func=selected_score_func, + mode=selection_mode, + param=selection_threshold) + + # Fit the selector and get only the selected feature indices (not the transformed matrix) + _ = selector.fit_transform(f_matrix, y) + selected_features = selector.get_support(indices=True) + + return list(selected_features) + def univariate_filter( - self, df: FSDataFrame, univariate_method: str = "u_corr", **kwargs + self, df: FSDataFrame, univariate_method: str = "u_corr", **kwargs ) -> FSDataFrame: """ Filter features after applying a univariate feature selector method. :param df: Input DataFrame - :param univariate_method: Univariate selector method ('u_corr', 'anova', 'f_regression') + :param univariate_method: Univariate selector method ('u_corr', 'anova', 'f_regression', + 'mutual_info_classification', 'mutual_info_regression') :return: Filtered DataFrame with selected features """ @@ -83,17 +137,21 @@ def univariate_filter( selected_features = [] if univariate_method == "anova": - # TODO: Implement ANOVA selector - # selected_features = univariate_selector(df, features, label, label_type='categorical', **kwargs) - pass + selected_features = self.univariate_feature_selector(df, score_func="f_classif", **kwargs) elif univariate_method == "f_regression": - # TODO: Implement F-regression selector - # selected_features = univariate_selector(df, features, label, label_type='continuous', **kwargs) - pass + selected_features = self.univariate_feature_selector(df, score_func="f_regression", **kwargs) elif univariate_method == "u_corr": selected_features = univariate_correlation_selector(df, **kwargs) + elif univariate_method == "mutual_info_classification": + selected_features = self.univariate_feature_selector(df, score_func="mutual_info_classif", **kwargs) + elif univariate_method == "mutual_info_regression": + selected_features = self.univariate_feature_selector(df, score_func="mutual_info_regression", **kwargs) - logger.info(f"Applying univariate filter using method: {univariate_method}") + logger.info( + f"Applying univariate filter using method: {univariate_method} \n" + f" with selection mode: {kwargs.get('selection_mode')} \n" + f" and selection threshold: {kwargs.get('selection_threshold')}" + ) if len(selected_features) == 0: logger.warning("No features selected. Returning original DataFrame.") @@ -104,14 +162,16 @@ def univariate_filter( def univariate_correlation_selector( - df: FSDataFrame, corr_threshold: float = 0.3 + df: FSDataFrame, + selection_threshold: float = 0.3 ) -> List[int]: """ + TODO: Replace this implementation with sci-learn's GenericUnivariateSelect with score_func='f_regression' Select features based on their correlation with a label (class), if the correlation value is less than the specified threshold. :param df: Input DataFrame - :param corr_threshold: Maximum allowed correlation threshold + :param selection_threshold: Maximum allowed correlation threshold :return: List of selected feature indices """ @@ -139,58 +199,7 @@ def compute_univariate_corr(df: FSDataFrame) -> Dict[int, float]: selected_features = [ feature_index for feature_index, corr in correlations.items() - if corr <= corr_threshold + if corr <= selection_threshold ] return selected_features - -def univariate_selector( - df: pd.DataFrame, - features: List[str], - label: str, - label_type: str = "categorical", - selection_mode: str = "percentile", - selection_threshold: float = 0.8, -) -> List[str]: - """ - Wrapper for scikit-learn's `SelectKBest` feature selector. - If the label is categorical, ANOVA test is used; if continuous, F-regression test is used. - - :param df: Input DataFrame - :param features: List of feature column names - :param label: Label column name - :param label_type: Type of label ('categorical' or 'continuous') - :param selection_mode: Mode for feature selection ('percentile' or 'k_best') - :param selection_threshold: Number of features to select or the percentage of features - - :return: List of selected feature names - """ - - X = df[features].values - y = df[label].values - - if label_type == "categorical": - logger.info("ANOVA (F-classification) univariate feature selection") - selector = SelectKBest(score_func=f_classif) - elif label_type == "continuous": - logger.info("F-value (F-regression) univariate feature selection") - selector = SelectKBest(score_func=f_regression) - else: - raise ValueError("`label_type` must be one of 'categorical' or 'continuous'") - - if selection_mode == "percentile": - selector.set_params(k="all") # We'll handle the percentile threshold manually - selector.fit(X, y) - scores = selector.scores_ - selected_indices = [ - i - for i, score in enumerate(scores) - if score >= selection_threshold * max(scores) - ] - else: - selector.set_params(k=int(selection_threshold)) - selector.fit(X, y) - selected_indices = selector.get_support(indices=True) - - selected_features = [features[i] for i in selected_indices] - return selected_features diff --git a/fslite/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py index 1da779e..7dca51c 100644 --- a/fslite/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -18,7 +18,8 @@ def test_univariate_filter_corr(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSUnivariate instance - fs_univariate = FSUnivariate(fs_method="u_corr", corr_threshold=0.3) + fs_univariate = FSUnivariate(fs_method="u_corr", + selection_threshold=0.3) fsdf_filtered = fs_univariate.select_features(fs_df) @@ -28,3 +29,115 @@ def test_univariate_filter_corr(): # Export the filtered DataFrame as Pandas DataFrame df_filtered = fsdf_filtered.to_pandas() df_filtered.to_csv("filtered_tnbc_data.csv", index=False) + + +# test the univariate_filter method with 'anova' method +def test_univariate_filter_anova(): + """ + Test univariate_filter method with 'anova' method. + :return: None + """ + + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep="\t") + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") + + # create FSUnivariate instance + fs_univariate = FSUnivariate(fs_method="anova", + selection_mode="percentile", + selection_threshold=0.8) + + fsdf_filtered = fs_univariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 4 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) + + +# test the univariate_filter method with 'mutual_info_classification' method +def test_univariate_filter_mutual_info_classification(): + """ + Test univariate_filter method with 'mutual_info_classification' method. + :return: None + """ + + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep="\t") + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") + + # create FSUnivariate instance + fs_univariate = FSUnivariate(fs_method="mutual_info_classification", + selection_mode="k_best", + selection_threshold=30) + + fsdf_filtered = fs_univariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 30 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) + + +# test the univariate_filter method with 'mutual_info_regression' method +def test_univariate_filter_mutual_info_regression(): + """ + Test univariate_filter method with 'mutual_info_regression' method. + :return: None + """ + + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep="\t") + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") + + # create FSUnivariate instance + fs_univariate = FSUnivariate(fs_method="mutual_info_regression", + selection_mode="percentile", + selection_threshold=0.8) + + fsdf_filtered = fs_univariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 4 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) + + +# test the univariate_filter method with f-regression method +def test_univariate_filter_f_regression(): + """ + Test univariate_filter method with f_regression method. + :return: None + """ + + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep="\t") + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") + + # create FSUnivariate instance + fs_univariate = FSUnivariate(fs_method="f_regression", + selection_mode="percentile", + selection_threshold=0.8) + + fsdf_filtered = fs_univariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 4 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) From 6c29cd8a2a602ef242ee99b45194ac1fcd1613bb Mon Sep 17 00:00:00 2001 From: enriquea Date: Tue, 24 Sep 2024 17:29:15 +0200 Subject: [PATCH 44/62] added implementation for multivariate methods: variance and matrix_correlation --- fslite/fs/multivariate.py | 102 ++++++++++++++++++++++++++++---------- fslite/fs/utils.py | 20 +++++++- 2 files changed, 96 insertions(+), 26 deletions(-) diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index 43e2036..340af83 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -10,7 +10,7 @@ ) from fslite.fs.fdataframe import FSDataFrame from fslite.fs.methods import FSMethod, InvalidMethodError -from fslite.fs.utils import find_maximal_independent_set +from fslite.fs.utils import find_maximal_independent_set, percentile_rank logging.basicConfig(format="%(levelname)s (%(name)s %(lineno)s): %(message)s") logger = logging.getLogger("FS:MULTIVARIATE") @@ -70,9 +70,11 @@ def select_features(self, fsdf: FSDataFrame): fsdf, multivariate_method=self.fs_method, **self.kwargs ) - def multivariate_filter( - fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs - ) -> FSDataFrame: + def multivariate_filter(self, + fsdf: FSDataFrame, + multivariate_method: str = "m_corr", + **kwargs + ) -> FSDataFrame: """ Filter features after applying a multivariate feature selector method. @@ -85,7 +87,7 @@ def multivariate_filter( if multivariate_method == "m_corr": selected_features = multivariate_correlation_selector(fsdf, **kwargs) elif multivariate_method == "variance": - # selected_features = multivariate_variance_selector(fsdf, **kwargs) + selected_features = multivariate_variance_selector(fsdf, **kwargs) logging.info("Variance method not implemented yet.") else: raise ValueError( @@ -105,33 +107,34 @@ def __repr__(self): def multivariate_correlation_selector( - fsdf: FSDataFrame, - strict: bool = True, - corr_threshold: float = 0.75, - corr_method: str = "pearson", -) -> List[str]: + fsdf: FSDataFrame, + selection_mode: str = 'strict', + selection_threshold: float = 0.75, + corr_method: str = "pearson", +) -> List[int]: """ Compute the correlation matrix among input features and select those below a specified threshold. :param fsdf: Input FSDataFrame object. - :param strict: If True (default), apply hard filtering to remove highly correlated features. - Otherwise, find the maximal independent set of highly correlated features (experimental). - :param corr_threshold: Minimal correlation threshold to consider two features correlated. + :param selection_mode: If 'strict' (default), apply hard filtering to remove highly correlated features. + Otherwise, 'approximate' find the maximal independent set of highly correlated + features (experimental). + :param selection_threshold: Minimal correlation threshold to consider two features correlated. :param corr_method: Correlation method - 'pearson' (default) or 'spearman'. - :return: List of selected feature names. + :return: List of selected feature indices """ # Retrieve the feature matrix - matrix = fsdf.get_feature_matrix() + f_matrix = fsdf.get_feature_matrix() - # Retrieve feature names - feature_names = fsdf.get_label_col_name() + # Get features indexes from matrix + features_indexes = list(range(f_matrix.shape[1])) # Compute correlation matrix if corr_method == "pearson": - corr_matrix = np.corrcoef(matrix, rowvar=False) + corr_matrix = np.corrcoef(f_matrix, rowvar=False) elif corr_method == "spearman": - corr_matrix, _ = spearmanr(matrix) + corr_matrix, _ = spearmanr(f_matrix) else: raise ValueError( f"Unsupported correlation method '{corr_method}'. Use 'pearson' or 'spearman'." @@ -141,12 +144,12 @@ def multivariate_correlation_selector( corr_matrix = np.abs(corr_matrix) # Find pairs of features with correlation above the threshold - combs_above_cutoff = np.triu(corr_matrix, k=1) > corr_threshold + combs_above_cutoff = np.triu(corr_matrix, k=1) > selection_threshold correlated_pairs = np.column_stack(np.where(combs_above_cutoff)) # Set of indices to remove index_to_remove = set() - if strict: + if selection_mode == "strict": # Strict filtering: remove features with higher mean correlations col_means = np.mean(corr_matrix, axis=1) for i, j in correlated_pairs: @@ -154,12 +157,61 @@ def multivariate_correlation_selector( index_to_remove.add(i) else: index_to_remove.add(j) - else: + elif selection_mode == "approximate": # Experimental approximate method index_to_remove = find_maximal_independent_set(correlated_pairs, keep=False) + else: + raise ValueError( + f"Unsupported selection mode '{selection_mode}'. Use 'strict' or 'approximate'." + ) - # Select feature names to keep - features_to_remove = [feature_names[i] for i in index_to_remove] - selected_features = [f for f in feature_names if f not in features_to_remove] + # Select feature index to keep + selected_features = [i for i in features_indexes if i not in index_to_remove] return selected_features + + +def multivariate_variance_selector(fsdf: FSDataFrame, + selection_mode: str = "k_best", + selection_threshold: float = 0.0) -> List[int]: + """ + Filter features based on variance threshold. + + :param selection_mode: "percentile" or "k_best" (default). If "percentile", the threshold is a percentile of the + variance distribution. If "k_best", the threshold is the k highest variances. + Default is "k_best" with selection_threshold=0.0 (i.e. remove features with same values + in all samples). + :param fsdf: Input FSDataFrame object. + :param selection_threshold: Minimal variance threshold to keep a feature. + Default is 0.0 (i.e. remove features with same values in all samples). + + :return: List of selected feature indices + """ + + # Retrieve the feature matrix + f_matrix = fsdf.get_feature_matrix() + + # Compute variances (across samples) for each feature + features_variances = np.var(f_matrix, axis=0) + + # print to log variance mean across features + logger.info(f"Mean variance across features: {np.mean(features_variances)}") + print(f"Mean variance across features: {np.mean(features_variances)}") + + if selection_mode == "k_best": + # keep indices of features with variance above the threshold + selected_features = np.where(features_variances > selection_threshold)[0] + elif selection_mode == "percentile": + # compute the percentile rank of variances + variances_pct_rank = percentile_rank(features_variances) + # keep indices of features with variance above the threshold + selected_features = np.where(variances_pct_rank > selection_threshold)[0] + else: + raise ValueError( + f"Unsupported selection mode '{selection_mode}'. Use 'percentile' or 'k_best'." + ) + + logger.info(f"Feature selection mode: {selection_mode}. \n" + f"Number of features selected: {len(selected_features)}") + + return list(selected_features) diff --git a/fslite/fs/utils.py b/fslite/fs/utils.py index 649ad30..31e3e7e 100644 --- a/fslite/fs/utils.py +++ b/fslite/fs/utils.py @@ -2,7 +2,9 @@ from typing import Dict, Tuple, Set import networkx as nx +import numpy as np from networkx.algorithms.mis import maximal_independent_set +from scipy.stats import rankdata from sklearn.impute import SimpleImputer from fslite.fs.fdataframe import FSDataFrame @@ -40,7 +42,7 @@ def compute_missingness_rate(fsdf: FSDataFrame) -> Dict[str, float]: def remove_features_by_missingness_rate( - fsdf: FSDataFrame, threshold: float = 0.15 + fsdf: FSDataFrame, threshold: float = 0.15 ) -> FSDataFrame: """ Remove features from FSDataFrame with missingness rate higher or equal than a specified threshold. @@ -118,3 +120,19 @@ def find_maximal_independent_set(pairs: Tuple[int], keep: bool = True) -> Set[in return set(max_ind_set) else: return set([int(i) for i in graph.nodes if i not in max_ind_set]) + + +# define a function to convert a numerical vector to percentile ranks + +def percentile_rank(vector: np.array) -> np.array: + """ + Convert a numerical vector to percentile ranks. + + :param vector: Numerical vector. + :return: Vector of percentile ranks. + """ + # Rank the data and then normalize by the size of the vector to get percentiles + ranks = rankdata(vector, method='average') + percentile_ranks = ranks / len(vector) + + return percentile_ranks From 5cbd7dab721738430f03ba791252f71c851c2229 Mon Sep 17 00:00:00 2001 From: enriquea Date: Tue, 24 Sep 2024 17:29:32 +0200 Subject: [PATCH 45/62] added tests for multivariate --- fslite/tests/test_multivariate_methods.py | 122 ++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 fslite/tests/test_multivariate_methods.py diff --git a/fslite/tests/test_multivariate_methods.py b/fslite/tests/test_multivariate_methods.py new file mode 100644 index 0000000..6948d9d --- /dev/null +++ b/fslite/tests/test_multivariate_methods.py @@ -0,0 +1,122 @@ +import pandas as pd + +from fslite.fs.fdataframe import FSDataFrame +from fslite.fs.multivariate import FSMultivariate +from fslite.utils.datasets import get_tnbc_data_path + + +# test multivariate_filter method with 'm_corr' method +def test_multivariate_filter_corr_strict_mode(): + """ + Test multivariate_filter method with 'm_corr' method. + :return: None + """ + + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep="\t") + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") + + # create FSMultivariate instance + fs_multivariate = FSMultivariate(fs_method="m_corr", + selection_mode="strict", + selection_threshold=0.75) + + fsdf_filtered = fs_multivariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 239 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) + + +# test multivariate_filter method with 'm_corr' method in approximate mode +def test_multivariate_filter_corr_approximate_mode(): + """ + Test multivariate_filter method with 'm_corr' method in approximate mode. + :return: None + """ + + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep="\t") + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") + + # create FSMultivariate instance + fs_multivariate = FSMultivariate(fs_method="m_corr", + selection_mode="approximate", + selection_threshold=0.75) + + fsdf_filtered = fs_multivariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + + # test if number of features selected is within the expected range [240-260] + assert 240 <= fsdf_filtered.count_features() <= 260 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) + + +# test multivariate_filter method with 'variance' method +def test_multivariate_filter_variance_percentile_mode(): + """ + Test multivariate_filter method with 'variance' method. + :return: None + """ + + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep="\t") + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") + + # create FSMultivariate instance + fs_multivariate = FSMultivariate(fs_method="variance", + selection_mode="percentile", + selection_threshold=0.2) + + fsdf_filtered = fs_multivariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 400 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) + + +# test multivariate_filter method with 'variance' method in k_best mode +def test_multivariate_filter_variance_k_best_mode(): + """ + Test multivariate_filter method with 'variance' method in k_best mode. + :return: None + """ + + # import tsv as pandas DataFrame + df = pd.read_csv(get_tnbc_data_path(), sep="\t") + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") + + # create FSMultivariate instance + fs_multivariate = FSMultivariate(fs_method="variance", + selection_mode="k_best", + selection_threshold=68100000.0 + # TODO: check this value (should be normalized variance?) + ) + + fsdf_filtered = fs_multivariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 87 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("filtered_tnbc_data.csv", index=False) + From cc493f66a4685fa8e8b186e542ec63543a678faf Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Tue, 24 Sep 2024 18:30:49 +0100 Subject: [PATCH 46/62] loom2parquet examples --- examples/2.concat_parquet_files.py | 53 ---------------- ...gle_cell_data.py => loom2parquetchunks.py} | 17 +++-- examples/loom2parquetmerge.py | 62 +++++++++++++++++++ examples/sc_human_atlas.py | 0 4 files changed, 74 insertions(+), 58 deletions(-) delete mode 100644 examples/2.concat_parquet_files.py rename examples/{1.parse_single_cell_data.py => loom2parquetchunks.py} (84%) create mode 100644 examples/loom2parquetmerge.py delete mode 100644 examples/sc_human_atlas.py diff --git a/examples/2.concat_parquet_files.py b/examples/2.concat_parquet_files.py deleted file mode 100644 index aa994db..0000000 --- a/examples/2.concat_parquet_files.py +++ /dev/null @@ -1,53 +0,0 @@ -import pyspark -from pyspark.sql import SparkSession - -# create spark session -spark = SparkSession.builder \ - .master("local[*]") \ - .appName("fsspark") \ - .config("spark.sql.execution.arrow.pyspark.enabled", "true") \ - .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "true") \ - .config("spark.sql.pivotMaxValues", "100000") \ - .config("spark.network.timeout", "100000") \ - .config("spark.sql.session.timeZone", "UTC") \ - .config("spark.executor.memory", "80g") \ - .config("spark.driver.memory", "100g") \ - .config("spark.memory.offHeap.enabled", "true") \ - .config("spark.memory.offHeap.size", "8g") \ - .config("spark.sql.session.timeZone", "UTC") \ - .getOrCreate() - - -# get all absolute paths of files in a directory -def get_files_paths(directory, extension: str = "parquet.gz"): - """ - Get all files paths in a directory. - :param extension: str, file extension. - :param directory: str, directory path. - :return: list, list of files paths. - """ - import os - files_paths = [] - for root, dirs, files in os.walk(directory): - for file in files: - if file.endswith(extension): - files_paths.append(os.path.join(root, file)) - return files_paths - - -# get all files paths -files_paths = get_files_paths(directory="/mnt/nfs/user-data/eam/GSE156793/sdf", - extension="parquet.gz") - -# read all parquet files as spark dataframe and write (append) them to a single parquet file -for file_path in files_paths: - print("Processing file: {}".format(file_path)) - df = (spark - .read - .parquet(file_path) - .repartition(5) - ) - - df.write.parquet("/mnt/nfs/user-data/eam/GSE156793/GSE156793.sample_x_gene.parquet", - mode='append', - compression='gzip') \ No newline at end of file diff --git a/examples/1.parse_single_cell_data.py b/examples/loom2parquetchunks.py similarity index 84% rename from examples/1.parse_single_cell_data.py rename to examples/loom2parquetchunks.py index 953e7ee..5667518 100644 --- a/examples/1.parse_single_cell_data.py +++ b/examples/loom2parquetchunks.py @@ -7,7 +7,7 @@ import loompy # define the path to the loom file -loom_file = "/path/to/loom/GSE156793_S3_gene_count.loom" +loom_file = "GSE156793_S3_gene_count.loom" # connect to the loom file ds = loompy.connect(loom_file) @@ -31,7 +31,7 @@ assay = ds.ca["Assay"] development_day = ds.ca["Development_day"] -# make a dataframe with the sample metadata +# make a dataframe with the sample metadata, define the columns types sample_df = pd.DataFrame({"sample_id": sample_id, "cell_cluster": cell_cluster, "assay": assay, @@ -57,7 +57,7 @@ # Save the sample metadata to parquet (sample_df .reset_index() - .to_parquet("sample_metadata.parquet.gz", + .to_parquet("sample_metadata.parquet", index=False, engine="auto", compression="gzip") @@ -67,6 +67,8 @@ # transpose dataset and convert to parquet. # process the data per chunks. chunk_size = 2000 +number_chunks = 1000 # Number of chunks to process, if None, all chunks are processed +count = 0 for (ix, selection, view) in ds.scan(axis=1, batch_size=chunk_size): # retrieve the chunk matrix_chunk = view[:, :] @@ -99,9 +101,14 @@ df_chunk = df_chunk.rename(columns={"index": "sample_id"}) # save the chunk to parquet - df_chunk.to_parquet(f"gene_count_chunk_{ix}.parquet.gz", + df_chunk.to_parquet(f"gene_count_chunk_{ix}.parquet", index=False, - engine="fastparquet", + engine="pyarrow", compression="gzip") print(f"Chunk {ix} saved") + count = count + 1 + + # break the loop if the number of chunks is reached + if number_chunks is not None and count >= number_chunks: + break diff --git a/examples/loom2parquetmerge.py b/examples/loom2parquetmerge.py new file mode 100644 index 0000000..20f1c45 --- /dev/null +++ b/examples/loom2parquetmerge.py @@ -0,0 +1,62 @@ +import os +import pyarrow.parquet as pq +import pyarrow as pa + + +# get all absolute paths of files in a directory +def get_files_paths(directory, extension: str = "parquet"): + """ + Get all file paths in a directory. + :param extension: str, file extension. + :param directory: str, directory path. + :return: list, list of file paths. + """ + files_paths = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith(extension): + files_paths.append(os.path.join(root, file)) + return files_paths + + +def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=10000): + """ + Concatenate multiple parquet files in an incremental fashion to avoid memory overload. + + :param files_paths: List of parquet file paths. + :param output_path: Path to the output parquet file. + :param batch_size: Number of rows to read from each file at a time. + """ + writer = None + + for file_path in files_paths: + print(f"Processing file: {file_path}") + parquet_file = pq.ParquetFile(file_path) + + # Read the file in batches to avoid memory overload + for batch in parquet_file.iter_batches(batch_size=batch_size): + # Convert the batch to a PyArrow Table + table = pa.Table.from_batches([batch]) + + # If the writer is not initialized, create a new Parquet writer + if writer is None: + writer = pq.ParquetWriter(output_path, table.schema, compression='gzip') + + # Write the batch to the output Parquet file + writer.write_table(table) + + # Close the writer after all batches are written + if writer is not None: + writer.close() + print(f"Concatenated parquet file written to {output_path}") + + +# Get all files paths +files_paths = get_files_paths(directory="./", + extension="parquet") + +# Output path for the final concatenated parquet file +output_path = "GSE156793.parquet" + +# Concatenate the parquet files and write to a single file incrementally +concatenate_parquet_files_incremental(files_paths, output_path, batch_size=10000) diff --git a/examples/sc_human_atlas.py b/examples/sc_human_atlas.py deleted file mode 100644 index e69de29..0000000 From 4250a4e8a96d29df5b0fa9579c61c65316a8d4ee Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:11:40 +0100 Subject: [PATCH 47/62] Update fslite/fs/utils.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/fs/utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fslite/fs/utils.py b/fslite/fs/utils.py index 31e3e7e..4d0ff3c 100644 --- a/fslite/fs/utils.py +++ b/fslite/fs/utils.py @@ -131,8 +131,4 @@ def percentile_rank(vector: np.array) -> np.array: :param vector: Numerical vector. :return: Vector of percentile ranks. """ - # Rank the data and then normalize by the size of the vector to get percentiles - ranks = rankdata(vector, method='average') - percentile_ranks = ranks / len(vector) - - return percentile_ranks + return np.percentile(vector, np.linspace(0, 100, len(vector))) From cc4e79434f8be580bc1ca83d6e20d1608eeb5e9b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:11:47 +0100 Subject: [PATCH 48/62] Update fslite/tests/test_ml_methods.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/tests/test_ml_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslite/tests/test_ml_methods.py b/fslite/tests/test_ml_methods.py index b46b2b9..20920e6 100644 --- a/fslite/tests/test_ml_methods.py +++ b/fslite/tests/test_ml_methods.py @@ -110,7 +110,7 @@ # # get the accuracy on testing # testing_acc = ml_method.get_eval_metric_on_testing(testing_data) # print(f"Accuracy on test data: {testing_acc}") -# assert testing_acc > 0.7 +assert 0.65 < testing_acc < 0.95, f"Testing accuracy {testing_acc} is out of expected range" # # def test_multilabel_lr_model(self): # fsdf = self.import_FSDataFrame() From 0ccd98dec15cb2602900eac59e7b0cf220e94f9c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:11:57 +0100 Subject: [PATCH 49/62] Update fslite/tests/generate_big_tests.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/tests/generate_big_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py index 94d9c5f..68413dd 100644 --- a/fslite/tests/generate_big_tests.py +++ b/fslite/tests/generate_big_tests.py @@ -6,7 +6,7 @@ import pyarrow.parquet as pq -def test_generate_big_dataset(): +def generate_large_test_dataset(): # Parameters for the dataset n_samples = 1200 n_features = 10_000 From 0e24e2cedd39ec04c1ac076df3d47eae797d678f Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:13:08 +0100 Subject: [PATCH 50/62] Update fslite/tests/generate_big_tests.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/tests/generate_big_tests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py index 68413dd..1ea7a31 100644 --- a/fslite/tests/generate_big_tests.py +++ b/fslite/tests/generate_big_tests.py @@ -36,10 +36,11 @@ def generate_large_test_dataset(): chunk_labels = labels[chunk_start:chunk_end] # Generate chunk of features - chunk_features = { - f"feature{i}": np.random.rand(chunk_end - chunk_start) + # Generate chunk of features + chunk_features = ( + (f"feature{i}", np.random.rand(chunk_end - chunk_start)) for i in range(1, n_features + 1) - } + ) # Create DataFrame chunk chunk_data = {"sample_id": chunk_sample_ids, "label": chunk_labels} From 82a1a86ef2e55bf04871097a94a5ca04bcffb1ae Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:14:15 +0100 Subject: [PATCH 51/62] delete ML methods --- examples/loom2parquetchunks.py | 4 +- fslite/tests/test_ml_methods.py | 177 -------------------------------- 2 files changed, 2 insertions(+), 179 deletions(-) delete mode 100644 fslite/tests/test_ml_methods.py diff --git a/examples/loom2parquetchunks.py b/examples/loom2parquetchunks.py index 5667518..a4cc52d 100644 --- a/examples/loom2parquetchunks.py +++ b/examples/loom2parquetchunks.py @@ -66,8 +66,8 @@ # transpose dataset and convert to parquet. # process the data per chunks. -chunk_size = 2000 -number_chunks = 1000 # Number of chunks to process, if None, all chunks are processed +chunk_size = 50000 +number_chunks = 50 # Number of chunks to process, if None, all chunks are processed count = 0 for (ix, selection, view) in ds.scan(axis=1, batch_size=chunk_size): # retrieve the chunk diff --git a/fslite/tests/test_ml_methods.py b/fslite/tests/test_ml_methods.py deleted file mode 100644 index 20920e6..0000000 --- a/fslite/tests/test_ml_methods.py +++ /dev/null @@ -1,177 +0,0 @@ -# import unittest -# -# from pyspark.ml.classification import RandomForestClassifier, LogisticRegression -# from pyspark.ml.evaluation import ( -# BinaryClassificationEvaluator, -# MulticlassClassificationEvaluator, -# ) -# -# from fslite.config.context import init_spark, stop_spark_session -# from fslite.fs.core import FSDataFrame -# from fslite.fs.ml import MLCVModel -# from fslite.utils.datasets import get_tnbc_data_path -# from fslite.utils.io import import_table_as_psdf -# -# -# class MLMethodTest(unittest.TestCase): -# -# def setUp(self) -> None: -# init_spark( -# apply_pyarrow_settings=True, -# apply_extra_spark_settings=True, -# apply_pandas_settings=True, -# ) -# -# def tearDown(self) -> None: -# stop_spark_session() -# -# @staticmethod -# def import_FSDataFrame(): -# df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) -# fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") -# return fsdf -# -# def test_build_model_using_cross_validator(self): -# fsdf = self.import_FSDataFrame() -# estimator = RandomForestClassifier() -# evaluator = BinaryClassificationEvaluator() -# grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=None, -# grid_params=None, -# cv_params=None, -# ) -# -# print(ml_method._cross_validator.__str__()) -# assert ml_method._cross_validator is not None -# -# def test_get_feature_scores_random_forest_classifier(self): -# # Create a sample FSDataFrame -# fsdf = self.import_FSDataFrame() -# -# # Create a RandomForestClassifier model -# estimator = RandomForestClassifier() -# evaluator = MulticlassClassificationEvaluator() -# estimator_params = {"labelCol": "label"} -# grid_params = {"numTrees": [10, 20, 30], "maxDepth": [5, 10, 15]} -# cv_params = {"parallelism": 2, "numFolds": 5, "collectSubModels": False} -# -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=estimator_params, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# (ml_method.fit(fsdf)) -# -# # Get the feature scores -# feature_scores = ml_method.get_feature_scores() -# -# # Assert that the feature scores DataFrame is not empty -# assert not feature_scores.empty -# -# # Assert that the feature scores DataFrame has the expected columns -# expected_columns = ["features", "feature_index", "scores", "percentile_rank"] -# assert list(feature_scores.columns) == expected_columns -# -# # check if dataframe is sorted by scores (descending) -# assert feature_scores["scores"].is_monotonic_decreasing -# -# print(feature_scores) -# -# def test_multilabel_rf_model(self): -# fsdf = self.import_FSDataFrame() -# training_data, testing_data = fsdf.split_df(split_training_factor=0.8) -# -# estimator = RandomForestClassifier() -# evaluator = MulticlassClassificationEvaluator(metricName="accuracy") -# estimator_params = {"labelCol": "label"} -# grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} -# cv_params = {"parallelism": 2, "numFolds": 3} -# -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=estimator_params, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# (ml_method.fit(training_data)) -# -# # get the accuracy on training -# eval_training = ml_method.get_eval_metric_on_training() -# print(f"Accuracy on training data: {eval_training}") -# -# # get the accuracy on testing -# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) -# print(f"Accuracy on test data: {testing_acc}") -assert 0.65 < testing_acc < 0.95, f"Testing accuracy {testing_acc} is out of expected range" -# -# def test_multilabel_lr_model(self): -# fsdf = self.import_FSDataFrame() -# training_data, testing_data = fsdf.split_df(split_training_factor=0.6) -# -# estimator = LogisticRegression() -# evaluator = MulticlassClassificationEvaluator(metricName="accuracy") -# estimator_params = {"labelCol": "label"} -# grid_params = {"regParam": [0.1, 0.01]} -# cv_params = {"parallelism": 2, "numFolds": 3} -# -# ml_method = MLCVModel( -# estimator=estimator, -# evaluator=evaluator, -# estimator_params=estimator_params, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# (ml_method.fit(training_data)) -# -# # get the accuracy on training -# eval_training = ml_method.get_eval_metric_on_training() -# print(f"Accuracy on training data: {eval_training}") -# -# # get the accuracy on testing -# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) -# print(f"Accuracy on test data: {testing_acc}") -# assert testing_acc > 0.7 -# -# def test_FSMLMethod(self): -# from fslite.fs.methods import FSMLMethod -# -# fsdf = self.import_FSDataFrame() -# training_data, testing_data = fsdf.split_df(split_training_factor=0.7) -# -# estimator_params = {"labelCol": "label"} -# grid_params = {"numTrees": [5, 10], "maxDepth": [3, 5]} -# cv_params = {"parallelism": 2, "numFolds": 3} -# -# ml_method = FSMLMethod( -# fs_method="rf_multilabel", -# rfe=True, -# rfe_iterations=2, -# percent_to_keep=0.9, -# estimator_params=estimator_params, -# evaluator_params={"metricName": "accuracy"}, -# grid_params=grid_params, -# cv_params=cv_params, -# ) -# -# filtered_fsdf = ml_method.select_features(training_data) -# -# training_acc = ml_method.get_eval_metric_on_training() -# print(f"Training accuracy: {training_acc}") -# assert training_acc > 0.8 -# -# testing_acc = ml_method.get_eval_metric_on_testing(testing_data) -# print(f"Testing accuracy: {testing_acc}") -# assert testing_acc > 0.7 -# -# -# if __name__ == "__main__": -# unittest.main() From e2f7b9cd485dd69131d83f31031cff1f4617118a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:22:10 +0100 Subject: [PATCH 52/62] delete ML methods --- fslite/tests/test_multivariate_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslite/tests/test_multivariate_methods.py b/fslite/tests/test_multivariate_methods.py index 6948d9d..16ae672 100644 --- a/fslite/tests/test_multivariate_methods.py +++ b/fslite/tests/test_multivariate_methods.py @@ -84,7 +84,7 @@ def test_multivariate_filter_variance_percentile_mode(): fsdf_filtered = fs_multivariate.select_features(fs_df) assert fs_df.count_features() == 500 - assert fsdf_filtered.count_features() == 400 + assert fsdf_filtered.count_features() == 500 # Export the filtered DataFrame as Pandas DataFrame df_filtered = fsdf_filtered.to_pandas() From 5a91f1489f7775186d617114a4c13ecab06df3d1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:25:33 +0100 Subject: [PATCH 53/62] Update examples/loom2parquetmerge.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- examples/loom2parquetmerge.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/examples/loom2parquetmerge.py b/examples/loom2parquetmerge.py index 20f1c45..04cd41b 100644 --- a/examples/loom2parquetmerge.py +++ b/examples/loom2parquetmerge.py @@ -28,7 +28,7 @@ def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=1 :param batch_size: Number of rows to read from each file at a time. """ writer = None - +with pq.ParquetWriter(output_path, schema=None, compression='gzip') as writer: for file_path in files_paths: print(f"Processing file: {file_path}") parquet_file = pq.ParquetFile(file_path) @@ -38,16 +38,10 @@ def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=1 # Convert the batch to a PyArrow Table table = pa.Table.from_batches([batch]) - # If the writer is not initialized, create a new Parquet writer - if writer is None: - writer = pq.ParquetWriter(output_path, table.schema, compression='gzip') - # Write the batch to the output Parquet file writer.write_table(table) - # Close the writer after all batches are written - if writer is not None: - writer.close() +print(f"Concatenated parquet file written to {output_path}") print(f"Concatenated parquet file written to {output_path}") From 718b743413f52568fef1df8e17c75f58e969f5e9 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:25:41 +0100 Subject: [PATCH 54/62] Update fslite/tests/generate_big_tests.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/tests/generate_big_tests.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py index 1ea7a31..4497de0 100644 --- a/fslite/tests/generate_big_tests.py +++ b/fslite/tests/generate_big_tests.py @@ -36,10 +36,11 @@ def generate_large_test_dataset(): chunk_labels = labels[chunk_start:chunk_end] # Generate chunk of features - # Generate chunk of features - chunk_features = ( - (f"feature{i}", np.random.rand(chunk_end - chunk_start)) + rng = np.random.default_rng() + chunk_features = { + f"feature{i}": rng.random(chunk_end - chunk_start) for i in range(1, n_features + 1) + } ) # Create DataFrame chunk From 681a823752bba5f75256f883c9bde429b919ed18 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:25:49 +0100 Subject: [PATCH 55/62] Update fslite/fs/methods.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/fs/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index adf67fe..133b528 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -184,10 +184,10 @@ def set_params(self, **kwargs): # return self.__str__() -class InvalidMethodError(Exception): +class InvalidMethodError(ValueError): """ Error raised when an invalid feature selection method is used. """ def __init__(self, message): - super().__init__(message) + super().__init__(f"Invalid feature selection method: {message}") From 86081174f3133c607f840e1e9cb9eb1baed11d4a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:26:40 +0100 Subject: [PATCH 56/62] Update fslite/fs/ml.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/fs/ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index bf115e1..bf15962 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -140,7 +140,7 @@ def select_features(self, fsdf: FSDataFrame) -> FSDataFrame: """ if fsdf is None or fsdf.count_features() == 0 or fsdf.count_instances() == 0: - raise ValueError( + raise InvalidDataError( "The data frame is empty or does not contain any features." ) From 6ecbaca95cebee432f3ae560f8f76bedc67b56ef Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:26:48 +0100 Subject: [PATCH 57/62] Update fslite/fs/fdataframe.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/fs/fdataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fslite/fs/fdataframe.py b/fslite/fs/fdataframe.py index 25a4dc3..1ca7910 100644 --- a/fslite/fs/fdataframe.py +++ b/fslite/fs/fdataframe.py @@ -109,7 +109,7 @@ def __init__( # Check sparsity num_elements = numerical_df.size - num_zeros = (numerical_df == 0).sum().sum() + num_zeros = np.count_nonzero(numerical_df == 0) sparsity = num_zeros / num_elements dense_matrix_size = numerical_df.memory_usage(deep=True).sum() # In bytes From d5cc9748b50b38c54f84f40b324a05fbed5db9d8 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:27:08 +0100 Subject: [PATCH 58/62] Update fslite/fs/multivariate.py Co-authored-by: codiumai-pr-agent-pro[bot] <151058649+codiumai-pr-agent-pro[bot]@users.noreply.github.com> --- fslite/fs/multivariate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index 340af83..73ce07f 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -166,7 +166,9 @@ def multivariate_correlation_selector( ) # Select feature index to keep - selected_features = [i for i in features_indexes if i not in index_to_remove] + mask = np.ones(len(features_indexes), dtype=bool) + mask[list(index_to_remove)] = False + selected_features = np.array(features_indexes)[mask] return selected_features From 7ee27c8d60c1bd78dad4cea4a73e48af6b02e50a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:30:23 +0100 Subject: [PATCH 59/62] delete ML methods --- examples/loom2parquetchunks.py | 48 ++++---- fslite/fs/constants.py | 52 ++++++--- fslite/fs/methods.py | 133 ++-------------------- fslite/fs/ml.py | 2 +- fslite/fs/multivariate.py | 32 +++--- fslite/fs/univariate.py | 64 ++++++----- fslite/fs/utils.py | 3 +- fslite/tests/test_fsdataframe.py | 4 +- fslite/tests/test_multivariate_methods.py | 30 ++--- fslite/tests/test_univariate_methods.py | 31 ++--- fslite/utils/io.py | 1 - 11 files changed, 163 insertions(+), 237 deletions(-) diff --git a/examples/loom2parquetchunks.py b/examples/loom2parquetchunks.py index a4cc52d..1c1c035 100644 --- a/examples/loom2parquetchunks.py +++ b/examples/loom2parquetchunks.py @@ -32,10 +32,14 @@ development_day = ds.ca["Development_day"] # make a dataframe with the sample metadata, define the columns types -sample_df = pd.DataFrame({"sample_id": sample_id, - "cell_cluster": cell_cluster, - "assay": assay, - "development_day": development_day}) +sample_df = pd.DataFrame( + { + "sample_id": sample_id, + "cell_cluster": cell_cluster, + "assay": assay, + "development_day": development_day, + } +) # print the first 5 rows sample_df.head() @@ -55,21 +59,19 @@ sample_df.head() # Save the sample metadata to parquet -(sample_df - .reset_index() - .to_parquet("sample_metadata.parquet", - index=False, - engine="auto", - compression="gzip") - ) +( + sample_df.reset_index().to_parquet( + "sample_metadata.parquet", index=False, engine="auto", compression="gzip" + ) +) # transpose dataset and convert to parquet. # process the data per chunks. chunk_size = 50000 -number_chunks = 50 # Number of chunks to process, if None, all chunks are processed +number_chunks = 50 # Number of chunks to process, if None, all chunks are processed count = 0 -for (ix, selection, view) in ds.scan(axis=1, batch_size=chunk_size): +for ix, selection, view in ds.scan(axis=1, batch_size=chunk_size): # retrieve the chunk matrix_chunk = view[:, :] @@ -77,13 +79,13 @@ matrix_chunk_t = matrix_chunk.T # convert to pandas dataframe - df_chunk = pd.DataFrame(matrix_chunk_t, - index=sample_id[selection.tolist()], - columns=gene_ids) + df_chunk = pd.DataFrame( + matrix_chunk_t, index=sample_id[selection.tolist()], columns=gene_ids + ) # merge chunk with sample metadata df_chunk = pd.merge( - left=sample_df[['cell_cluster_id', 'development_day', 'assay_id']], + left=sample_df[["cell_cluster_id", "development_day", "assay_id"]], right=df_chunk, how="inner", left_index=True, @@ -91,7 +93,7 @@ sort=False, copy=True, indicator=False, - validate="one_to_one" + validate="one_to_one", ) # reset the index @@ -101,10 +103,12 @@ df_chunk = df_chunk.rename(columns={"index": "sample_id"}) # save the chunk to parquet - df_chunk.to_parquet(f"gene_count_chunk_{ix}.parquet", - index=False, - engine="pyarrow", - compression="gzip") + df_chunk.to_parquet( + f"gene_count_chunk_{ix}.parquet", + index=False, + engine="pyarrow", + compression="gzip", + ) print(f"Chunk {ix} saved") count = count + 1 diff --git a/fslite/fs/constants.py b/fslite/fs/constants.py index 52c08d1..49817bf 100644 --- a/fslite/fs/constants.py +++ b/fslite/fs/constants.py @@ -8,25 +8,34 @@ "univariate": { "title": "Univariate Feature Selection", "description": "Univariate feature selection refers to the process of selecting the most relevant features for " - "a machine learning model by evaluating each feature individually with respect to the target " - "variable using univariate statistical tests. It simplifies the feature selection process by " - "treating each feature independently and assessing its contribution to the predictive " - "performance of the model.", + "a machine learning model by evaluating each feature individually with respect to the target " + "variable using univariate statistical tests. It simplifies the feature selection process by " + "treating each feature independently and assessing its contribution to the predictive " + "performance of the model.", "methods": [ - {"name": "anova", "description": "Univariate ANOVA feature selection (f-classification)"}, + { + "name": "anova", + "description": "Univariate ANOVA feature selection (f-classification)", + }, {"name": "u_corr", "description": "Univariate Pearson's correlation"}, {"name": "f_regression", "description": "Univariate f-regression"}, - {"name": "mutual_info_regression", "description": "Univariate mutual information regression"}, - {"name": "mutual_info_classification", "description": "Univariate mutual information classification"}, + { + "name": "mutual_info_regression", + "description": "Univariate mutual information regression", + }, + { + "name": "mutual_info_classification", + "description": "Univariate mutual information classification", + }, ], }, "multivariate": { "title": "Multivariate Feature Selection", "description": "Multivariate feature selection is a method of selecting features by evaluating them in " - "combination rather than individually. Unlike univariate feature selection, which treats each " - "feature separately, multivariate feature selection considers the relationships and interactions " - "between multiple features and the target variable. This method aims to identify a subset of " - "features that work well together to improve the performance of a machine learning model.", + "combination rather than individually. Unlike univariate feature selection, which treats each " + "feature separately, multivariate feature selection considers the relationships and interactions " + "between multiple features and the target variable. This method aims to identify a subset of " + "features that work well together to improve the performance of a machine learning model.", "methods": [ {"name": "m_corr", "description": "Multivariate Correlation"}, {"name": "variance", "description": "Multivariate Variance"}, @@ -38,15 +47,28 @@ "methods": [ {"name": "rf_binary", "description": "Random Forest Binary Classifier"}, {"name": "lsvc_binary", "description": "Linear SVC Binary Classifier"}, - {"name": "fm_binary", "description": "Factorization Machine Binary Classifier"}, - {"name": "rf_multilabel", "description": "Random Forest Multi-label Classifier"}, - {"name": "lg_multilabel","description": "Logistic Regression Multi-label Classifier"}, + { + "name": "fm_binary", + "description": "Factorization Machine Binary Classifier", + }, + { + "name": "rf_multilabel", + "description": "Random Forest Multi-label Classifier", + }, + { + "name": "lg_multilabel", + "description": "Logistic Regression Multi-label Classifier", + }, {"name": "rf_regression", "description": "Random Forest Regression"}, - {"name": "fm_regression","description": "Factorization Machine Regression"}, + { + "name": "fm_regression", + "description": "Factorization Machine Regression", + }, ], }, } + def get_fs_methods(): """ Get the list of feature selection methods diff --git a/fslite/fs/methods.py b/fslite/fs/methods.py index 133b528..43ba272 100644 --- a/fslite/fs/methods.py +++ b/fslite/fs/methods.py @@ -60,130 +60,6 @@ def set_params(self, **kwargs): self.kwargs.update(kwargs) -# class FSPipeline: -# """ -# The FSPipeline class creates a pipeline of feature selection methods. It provides a way to -# chain multiple feature selection methods together to create a pipeline of feature selection methods. -# -# Example Usage -# ------------- -# # Create an instance of FSPipeline with the specified feature selection methods -# fs_pipeline = FSPipeline(fs_methods=[FSUnivariate('anova'), FSMultivariate('m_corr')]) -# -# # Select features using the pipeline -# selected_features = fs_pipeline.select_features(fsdf) -# """ -# -# _valid_methods: List[Type[Union[FSUnivariate, FSMultivariate, FSMLMethod]]] = [ -# FSUnivariate, -# FSMultivariate, -# FSMLMethod, -# ] -# -# def __init__( -# self, -# df_training: FSDataFrame, -# df_testing: Optional[FSDataFrame], -# fs_stages: List[Union[FSUnivariate, FSMultivariate, FSMLMethod]], -# ): -# """ -# Initialize the feature selection pipeline with the specified feature selection methods. -# -# Parameters: -# df_training: The training data frame on which the feature selection pipeline is to be run. -# df_testing: The testing data frame on which the ML wrapper method (if any) is to be evaluated. -# fs_stages: A list of feature selection methods to be used in the pipeline. -# """ -# -# self.df_training = df_training -# self.df_testing = df_testing -# self.fs_stages = fs_stages -# self.validate_methods() -# -# self.pipeline_results = {} -# -# def validate_methods(self): -# """ -# Validate the feature selection methods in the pipeline. -# """ -# # check if the pipeline contains at least one feature selection method -# if len(self.fs_stages) == 0: -# raise ValueError( -# "The pipeline must contain at least one feature selection method." -# ) -# -# # check if the feature selection methods are valid -# if not all( -# isinstance(method, tuple(self._valid_methods)) for method in self.fs_stages -# ): -# raise InvalidMethodError( -# f"Invalid feature selection method. " -# f"Accepted methods are {', '.join([str(m) for m in self._valid_methods])}" -# ) -# -# # check if only one ML method is used in the pipeline -# ml_methods = [ -# method for method in self.fs_stages if isinstance(method, FSMLMethod) -# ] -# if len(ml_methods) > 1: -# raise ValueError("Only one ML method is allowed in the pipeline.") -# -# def run(self) -> Dict[str, Any]: -# """ -# Run the feature selection pipeline. -# -# Returns: -# A dictionary with the results of the feature selection pipeline. -# """ -# -# # apply each feature selection method in the pipeline sequentially -# n_stages = len(self.fs_stages) -# fsdf_tmp = self.df_training -# -# self.pipeline_results.update(n_stages=n_stages) -# -# for i, method in enumerate(self.fs_stages): -# print( -# f"Running stage {i + 1} of {n_stages} of the feature selection pipeline: {method}" -# ) -# if isinstance(method, FSMLMethod): -# -# fsdf_tmp = method.select_features(fsdf_tmp) -# -# # collect the results during the feature selection process (rfe iterations, feature scores, etc.) -# self.pipeline_results.update(rfe_iterations=method.rfe_iterations) -# self.pipeline_results.update(feature_scores=method.get_feature_scores()) -# self.pipeline_results.update(eval_metric=method.get_eval_metric_name()) -# self.pipeline_results.update( -# rfe_training_metric=method.get_eval_metric_on_training_rfe() -# ) -# self.pipeline_results.update( -# training_metric=method.get_eval_metric_on_training() -# ) -# -# if self.df_testing is not None: -# -# # evaluate the final model on the testing data (if available) -# testing_metric = method.get_eval_metric_on_testing(self.df_testing) -# self.pipeline_results.update(testing_metric=testing_metric) -# -# else: -# fsdf_tmp = method.select_features(fsdf_tmp) -# -# self.pipeline_results.update( -# n_initial_features=self.df_training.count_features() -# ) -# self.pipeline_results.update(n_selected_features=fsdf_tmp.count_features()) -# -# return self.pipeline_results -# -# def __str__(self): -# return f"FSPipeline(fs_methods={self.fs_stages})" -# -# def __repr__(self): -# return self.__str__() - - class InvalidMethodError(ValueError): """ Error raised when an invalid feature selection method is used. @@ -191,3 +67,12 @@ class InvalidMethodError(ValueError): def __init__(self, message): super().__init__(f"Invalid feature selection method: {message}") + + +class InvalidDataError(ValueError): + """ + Error raised when an invalid feature selection method is used. + """ + + def __init__(self, message): + super().__init__(f"Invalid data frame: {message}") diff --git a/fslite/fs/ml.py b/fslite/fs/ml.py index bf15962..e3cb394 100644 --- a/fslite/fs/ml.py +++ b/fslite/fs/ml.py @@ -9,7 +9,7 @@ from fslite.fs.constants import get_fs_ml_methods, is_valid_ml_method from fslite.fs.fdataframe import FSDataFrame -from fslite.fs.methods import FSMethod, InvalidMethodError +from fslite.fs.methods import FSMethod, InvalidMethodError, InvalidDataError from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.svm import SVC, LinearSVC from sklearn.linear_model import LogisticRegression diff --git a/fslite/fs/multivariate.py b/fslite/fs/multivariate.py index 73ce07f..43c495d 100644 --- a/fslite/fs/multivariate.py +++ b/fslite/fs/multivariate.py @@ -70,11 +70,9 @@ def select_features(self, fsdf: FSDataFrame): fsdf, multivariate_method=self.fs_method, **self.kwargs ) - def multivariate_filter(self, - fsdf: FSDataFrame, - multivariate_method: str = "m_corr", - **kwargs - ) -> FSDataFrame: + def multivariate_filter( + self, fsdf: FSDataFrame, multivariate_method: str = "m_corr", **kwargs + ) -> FSDataFrame: """ Filter features after applying a multivariate feature selector method. @@ -107,10 +105,10 @@ def __repr__(self): def multivariate_correlation_selector( - fsdf: FSDataFrame, - selection_mode: str = 'strict', - selection_threshold: float = 0.75, - corr_method: str = "pearson", + fsdf: FSDataFrame, + selection_mode: str = "strict", + selection_threshold: float = 0.75, + corr_method: str = "pearson", ) -> List[int]: """ Compute the correlation matrix among input features and select those below a specified threshold. @@ -173,9 +171,9 @@ def multivariate_correlation_selector( return selected_features -def multivariate_variance_selector(fsdf: FSDataFrame, - selection_mode: str = "k_best", - selection_threshold: float = 0.0) -> List[int]: +def multivariate_variance_selector( + fsdf: FSDataFrame, selection_mode: str = "k_best", selection_threshold: float = 0.0 +) -> List[int]: """ Filter features based on variance threshold. @@ -189,7 +187,7 @@ def multivariate_variance_selector(fsdf: FSDataFrame, :return: List of selected feature indices """ - + # Retrieve the feature matrix f_matrix = fsdf.get_feature_matrix() @@ -212,8 +210,10 @@ def multivariate_variance_selector(fsdf: FSDataFrame, raise ValueError( f"Unsupported selection mode '{selection_mode}'. Use 'percentile' or 'k_best'." ) - - logger.info(f"Feature selection mode: {selection_mode}. \n" - f"Number of features selected: {len(selected_features)}") + + logger.info( + f"Feature selection mode: {selection_mode}. \n" + f"Number of features selected: {len(selected_features)}" + ) return list(selected_features) diff --git a/fslite/fs/univariate.py b/fslite/fs/univariate.py index 47eea14..b7ba4be 100644 --- a/fslite/fs/univariate.py +++ b/fslite/fs/univariate.py @@ -2,11 +2,13 @@ from typing import Dict, List import numpy as np -from sklearn.feature_selection import (GenericUnivariateSelect, - f_classif, - f_regression, - mutual_info_classif, - mutual_info_regression) +from sklearn.feature_selection import ( + GenericUnivariateSelect, + f_classif, + f_regression, + mutual_info_classif, + mutual_info_regression, +) from fslite.fs.constants import get_fs_univariate_methods, is_valid_univariate_method from fslite.fs.fdataframe import FSDataFrame @@ -66,11 +68,11 @@ def __repr__(self): return self.__str__() def univariate_feature_selector( - self, - df: FSDataFrame, - score_func: str = "f_classif", - selection_mode: str = "percentile", - selection_threshold: float = 0.8 + self, + df: FSDataFrame, + score_func: str = "f_classif", + selection_mode: str = "percentile", + selection_threshold: float = 0.8, ) -> List[int]: """ Wrapper for scikit-learn's `GenericUnivariateSelect` feature selector, supporting multiple scoring functions. @@ -95,7 +97,9 @@ def univariate_feature_selector( } if score_func not in score_func_mapping: - raise ValueError(f"Invalid score_func '{score_func}'. Valid options are: {list(score_func_mapping.keys())}") + raise ValueError( + f"Invalid score_func '{score_func}'. Valid options are: {list(score_func_mapping.keys())}" + ) # Extract the score function selected_score_func = score_func_mapping[score_func] @@ -105,9 +109,11 @@ def univariate_feature_selector( y = df.get_label_vector() # Configure the selector using the provided score function and selection mode/threshold - selector = GenericUnivariateSelect(score_func=selected_score_func, - mode=selection_mode, - param=selection_threshold) + selector = GenericUnivariateSelect( + score_func=selected_score_func, + mode=selection_mode, + param=selection_threshold, + ) # Fit the selector and get only the selected feature indices (not the transformed matrix) _ = selector.fit_transform(f_matrix, y) @@ -116,7 +122,7 @@ def univariate_feature_selector( return list(selected_features) def univariate_filter( - self, df: FSDataFrame, univariate_method: str = "u_corr", **kwargs + self, df: FSDataFrame, univariate_method: str = "u_corr", **kwargs ) -> FSDataFrame: """ Filter features after applying a univariate feature selector method. @@ -137,21 +143,29 @@ def univariate_filter( selected_features = [] if univariate_method == "anova": - selected_features = self.univariate_feature_selector(df, score_func="f_classif", **kwargs) + selected_features = self.univariate_feature_selector( + df, score_func="f_classif", **kwargs + ) elif univariate_method == "f_regression": - selected_features = self.univariate_feature_selector(df, score_func="f_regression", **kwargs) + selected_features = self.univariate_feature_selector( + df, score_func="f_regression", **kwargs + ) elif univariate_method == "u_corr": selected_features = univariate_correlation_selector(df, **kwargs) elif univariate_method == "mutual_info_classification": - selected_features = self.univariate_feature_selector(df, score_func="mutual_info_classif", **kwargs) + selected_features = self.univariate_feature_selector( + df, score_func="mutual_info_classif", **kwargs + ) elif univariate_method == "mutual_info_regression": - selected_features = self.univariate_feature_selector(df, score_func="mutual_info_regression", **kwargs) + selected_features = self.univariate_feature_selector( + df, score_func="mutual_info_regression", **kwargs + ) logger.info( - f"Applying univariate filter using method: {univariate_method} \n" - f" with selection mode: {kwargs.get('selection_mode')} \n" - f" and selection threshold: {kwargs.get('selection_threshold')}" - ) + f"Applying univariate filter using method: {univariate_method} \n" + f" with selection mode: {kwargs.get('selection_mode')} \n" + f" and selection threshold: {kwargs.get('selection_threshold')}" + ) if len(selected_features) == 0: logger.warning("No features selected. Returning original DataFrame.") @@ -162,8 +176,7 @@ def univariate_filter( def univariate_correlation_selector( - df: FSDataFrame, - selection_threshold: float = 0.3 + df: FSDataFrame, selection_threshold: float = 0.3 ) -> List[int]: """ TODO: Replace this implementation with sci-learn's GenericUnivariateSelect with score_func='f_regression' @@ -202,4 +215,3 @@ def compute_univariate_corr(df: FSDataFrame) -> Dict[int, float]: if corr <= selection_threshold ] return selected_features - diff --git a/fslite/fs/utils.py b/fslite/fs/utils.py index 4d0ff3c..a36a089 100644 --- a/fslite/fs/utils.py +++ b/fslite/fs/utils.py @@ -42,7 +42,7 @@ def compute_missingness_rate(fsdf: FSDataFrame) -> Dict[str, float]: def remove_features_by_missingness_rate( - fsdf: FSDataFrame, threshold: float = 0.15 + fsdf: FSDataFrame, threshold: float = 0.15 ) -> FSDataFrame: """ Remove features from FSDataFrame with missingness rate higher or equal than a specified threshold. @@ -124,6 +124,7 @@ def find_maximal_independent_set(pairs: Tuple[int], keep: bool = True) -> Set[in # define a function to convert a numerical vector to percentile ranks + def percentile_rank(vector: np.array) -> np.array: """ Convert a numerical vector to percentile ranks. diff --git a/fslite/tests/test_fsdataframe.py b/fslite/tests/test_fsdataframe.py index 3bd0ad8..cda8bce 100644 --- a/fslite/tests/test_fsdataframe.py +++ b/fslite/tests/test_fsdataframe.py @@ -50,7 +50,7 @@ def test_scaler_df(): def test_memory_fsdataframe(): def create_test_data( - n_samples: int, n_features: int, zero_prob: float = 0.1, nan_prob: float = 0.05 + n_samples: int, n_features: int, zero_prob: float = 0.1, nan_prob: float = 0.05 ): """Create test data for FSDataFrame.""" data = np.random.rand(n_samples, n_features) @@ -99,7 +99,7 @@ def measure_memory_usage(n_samples: int, n_features: int, nan_prob=0.01) -> floa data = results_df[ (results_df["Features"] == feature_size) & (results_df["NAN Prob"] == prob) - ] + ] plt.plot( data["Samples"], data["Memory (MB)"], diff --git a/fslite/tests/test_multivariate_methods.py b/fslite/tests/test_multivariate_methods.py index 16ae672..ba0f52a 100644 --- a/fslite/tests/test_multivariate_methods.py +++ b/fslite/tests/test_multivariate_methods.py @@ -19,9 +19,9 @@ def test_multivariate_filter_corr_strict_mode(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSMultivariate instance - fs_multivariate = FSMultivariate(fs_method="m_corr", - selection_mode="strict", - selection_threshold=0.75) + fs_multivariate = FSMultivariate( + fs_method="m_corr", selection_mode="strict", selection_threshold=0.75 + ) fsdf_filtered = fs_multivariate.select_features(fs_df) @@ -47,9 +47,9 @@ def test_multivariate_filter_corr_approximate_mode(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSMultivariate instance - fs_multivariate = FSMultivariate(fs_method="m_corr", - selection_mode="approximate", - selection_threshold=0.75) + fs_multivariate = FSMultivariate( + fs_method="m_corr", selection_mode="approximate", selection_threshold=0.75 + ) fsdf_filtered = fs_multivariate.select_features(fs_df) @@ -77,9 +77,9 @@ def test_multivariate_filter_variance_percentile_mode(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSMultivariate instance - fs_multivariate = FSMultivariate(fs_method="variance", - selection_mode="percentile", - selection_threshold=0.2) + fs_multivariate = FSMultivariate( + fs_method="variance", selection_mode="percentile", selection_threshold=0.2 + ) fsdf_filtered = fs_multivariate.select_features(fs_df) @@ -105,11 +105,12 @@ def test_multivariate_filter_variance_k_best_mode(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSMultivariate instance - fs_multivariate = FSMultivariate(fs_method="variance", - selection_mode="k_best", - selection_threshold=68100000.0 - # TODO: check this value (should be normalized variance?) - ) + fs_multivariate = FSMultivariate( + fs_method="variance", + selection_mode="k_best", + selection_threshold=68100000.0, + # TODO: check this value (should be normalized variance?) + ) fsdf_filtered = fs_multivariate.select_features(fs_df) @@ -119,4 +120,3 @@ def test_multivariate_filter_variance_k_best_mode(): # Export the filtered DataFrame as Pandas DataFrame df_filtered = fsdf_filtered.to_pandas() df_filtered.to_csv("filtered_tnbc_data.csv", index=False) - diff --git a/fslite/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py index 7dca51c..d096069 100644 --- a/fslite/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -18,8 +18,7 @@ def test_univariate_filter_corr(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSUnivariate instance - fs_univariate = FSUnivariate(fs_method="u_corr", - selection_threshold=0.3) + fs_univariate = FSUnivariate(fs_method="u_corr", selection_threshold=0.3) fsdf_filtered = fs_univariate.select_features(fs_df) @@ -45,9 +44,9 @@ def test_univariate_filter_anova(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSUnivariate instance - fs_univariate = FSUnivariate(fs_method="anova", - selection_mode="percentile", - selection_threshold=0.8) + fs_univariate = FSUnivariate( + fs_method="anova", selection_mode="percentile", selection_threshold=0.8 + ) fsdf_filtered = fs_univariate.select_features(fs_df) @@ -73,9 +72,11 @@ def test_univariate_filter_mutual_info_classification(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSUnivariate instance - fs_univariate = FSUnivariate(fs_method="mutual_info_classification", - selection_mode="k_best", - selection_threshold=30) + fs_univariate = FSUnivariate( + fs_method="mutual_info_classification", + selection_mode="k_best", + selection_threshold=30, + ) fsdf_filtered = fs_univariate.select_features(fs_df) @@ -101,9 +102,11 @@ def test_univariate_filter_mutual_info_regression(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSUnivariate instance - fs_univariate = FSUnivariate(fs_method="mutual_info_regression", - selection_mode="percentile", - selection_threshold=0.8) + fs_univariate = FSUnivariate( + fs_method="mutual_info_regression", + selection_mode="percentile", + selection_threshold=0.8, + ) fsdf_filtered = fs_univariate.select_features(fs_df) @@ -129,9 +132,9 @@ def test_univariate_filter_f_regression(): fs_df = FSDataFrame(df=df, sample_col="Sample", label_col="label") # create FSUnivariate instance - fs_univariate = FSUnivariate(fs_method="f_regression", - selection_mode="percentile", - selection_threshold=0.8) + fs_univariate = FSUnivariate( + fs_method="f_regression", selection_mode="percentile", selection_threshold=0.8 + ) fsdf_filtered = fs_univariate.select_features(fs_df) diff --git a/fslite/utils/io.py b/fslite/utils/io.py index 139597f..8b13789 100644 --- a/fslite/utils/io.py +++ b/fslite/utils/io.py @@ -1,2 +1 @@ - From d1f74d6cfc87a33a96f5afe5b000336607ffec41 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 13:42:10 +0100 Subject: [PATCH 60/62] delete ML methods --- examples/loom2parquetmerge.py | 10 ++++- fslite/tests/generate_big_tests.py | 1 - fslite/tests/test_fs_pipeline.py | 72 ------------------------------ 3 files changed, 8 insertions(+), 75 deletions(-) delete mode 100644 fslite/tests/test_fs_pipeline.py diff --git a/examples/loom2parquetmerge.py b/examples/loom2parquetmerge.py index 04cd41b..20f1c45 100644 --- a/examples/loom2parquetmerge.py +++ b/examples/loom2parquetmerge.py @@ -28,7 +28,7 @@ def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=1 :param batch_size: Number of rows to read from each file at a time. """ writer = None -with pq.ParquetWriter(output_path, schema=None, compression='gzip') as writer: + for file_path in files_paths: print(f"Processing file: {file_path}") parquet_file = pq.ParquetFile(file_path) @@ -38,10 +38,16 @@ def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=1 # Convert the batch to a PyArrow Table table = pa.Table.from_batches([batch]) + # If the writer is not initialized, create a new Parquet writer + if writer is None: + writer = pq.ParquetWriter(output_path, table.schema, compression='gzip') + # Write the batch to the output Parquet file writer.write_table(table) -print(f"Concatenated parquet file written to {output_path}") + # Close the writer after all batches are written + if writer is not None: + writer.close() print(f"Concatenated parquet file written to {output_path}") diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py index 4497de0..0efc849 100644 --- a/fslite/tests/generate_big_tests.py +++ b/fslite/tests/generate_big_tests.py @@ -41,7 +41,6 @@ def generate_large_test_dataset(): f"feature{i}": rng.random(chunk_end - chunk_start) for i in range(1, n_features + 1) } - ) # Create DataFrame chunk chunk_data = {"sample_id": chunk_sample_ids, "label": chunk_labels} diff --git a/fslite/tests/test_fs_pipeline.py b/fslite/tests/test_fs_pipeline.py deleted file mode 100644 index 42be655..0000000 --- a/fslite/tests/test_fs_pipeline.py +++ /dev/null @@ -1,72 +0,0 @@ -# import unittest -# -# from fslite.config.context import init_spark, stop_spark_session -# from fslite.fs.core import FSDataFrame -# from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod -# from fslite.utils.datasets import get_tnbc_data_path -# from fslite.utils.io import import_table_as_psdf -# -# -# class FeatureSelectionPipelineTest(unittest.TestCase): -# -# def setUp(self) -> None: -# init_spark( -# apply_pyarrow_settings=True, -# apply_extra_spark_settings=True, -# apply_pandas_settings=True, -# ) -# -# def tearDown(self) -> None: -# stop_spark_session() -# -# @staticmethod -# def import_FSDataFrame(): -# df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5) -# fsdf = FSDataFrame(df, sample_col="Sample", label_col="label") -# return fsdf -# -# def test_feature_selection_pipeline(self): -# fsdf = self.import_FSDataFrame() -# -# training_data, testing_data = fsdf.split_df(split_training_factor=0.6) -# -# # create a Univariate object -# univariate = FSUnivariate( -# fs_method="anova", selection_mode="percentile", selection_threshold=0.8 -# ) -# -# # create a Multivariate object -# multivariate = FSMultivariate( -# fs_method="m_corr", corr_threshold=0.75, corr_method="pearson" -# ) -# -# # create a MLMethod object -# rf_classifier = FSMLMethod( -# fs_method="rf_multilabel", -# rfe=True, -# rfe_iterations=2, -# percent_to_keep=0.9, -# estimator_params={"labelCol": "label"}, -# evaluator_params={"metricName": "accuracy"}, -# grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]}, -# cv_params={"parallelism": 2, "numFolds": 5}, -# ) -# -# # create a pipeline object -# fs_pipeline = FSPipeline( -# df_training=training_data, -# df_testing=testing_data, -# fs_stages=[univariate, multivariate, rf_classifier], -# ) -# -# # run the pipeline -# results = fs_pipeline.run() -# -# # print results -# print(results) -# -# assert results.get("training_metric") > 0.9 -# -# -# if __name__ == "__main__": -# unittest.main() From 39094875fbbd41bff54352adc2ce2b89fa1db3bd Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Wed, 25 Sep 2024 17:03:29 +0100 Subject: [PATCH 61/62] refactoring parquet SC generation --- examples/loom2parquetchunks.py | 46 +++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/examples/loom2parquetchunks.py b/examples/loom2parquetchunks.py index 1c1c035..711ef7a 100644 --- a/examples/loom2parquetchunks.py +++ b/examples/loom2parquetchunks.py @@ -5,6 +5,8 @@ # import libraries import pandas as pd import loompy +import pyarrow.parquet as pq +import pyarrow as pa # define the path to the loom file loom_file = "GSE156793_S3_gene_count.loom" @@ -32,8 +34,7 @@ development_day = ds.ca["Development_day"] # make a dataframe with the sample metadata, define the columns types -sample_df = pd.DataFrame( - { +sample_df = pd.DataFrame({ "sample_id": sample_id, "cell_cluster": cell_cluster, "assay": assay, @@ -68,9 +69,11 @@ # transpose dataset and convert to parquet. # process the data per chunks. -chunk_size = 50000 -number_chunks = 50 # Number of chunks to process, if None, all chunks are processed +chunk_size = 10000 +writer = None count = 0 +number_chunks = 10 # number of chunks to process + for ix, selection, view in ds.scan(axis=1, batch_size=chunk_size): # retrieve the chunk matrix_chunk = view[:, :] @@ -102,17 +105,32 @@ # rename the index column df_chunk = df_chunk.rename(columns={"index": "sample_id"}) - # save the chunk to parquet - df_chunk.to_parquet( - f"gene_count_chunk_{ix}.parquet", - index=False, - engine="pyarrow", - compression="gzip", - ) + if writer is None: + # define the schema + schema = pa.schema( + [ + pa.field("sample_id", pa.string()), + pa.field("cell_cluster_id", pa.int8()), + pa.field("development_day", pa.int64()), + pa.field("assay_id", pa.int8()), + ] + + [pa.field(gene_id, pa.float32()) for gene_id in gene_ids] + ) + + print(len(list(df_chunk.columns))) + print(len(schema)) + + # create the parquet writer + writer = pq.ParquetWriter("GSE156793.parquet", schema, compression="snappy") + + writer.write_table(pa.Table.from_pandas(df_chunk, preserve_index=False)) print(f"Chunk {ix} saved") - count = count + 1 - # break the loop if the number of chunks is reached - if number_chunks is not None and count >= number_chunks: + count += 1 + if count >= number_chunks: break + +if writer is not None: + writer.close() + print(f"Concatenated parquet file written to GSE156793.parquet") From 07cb77126100aed75546608066470c25b61dd5a3 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 26 Sep 2024 06:23:38 +0100 Subject: [PATCH 62/62] small changes --- examples/loom2parquetmerge.py | 62 ------------------------- fslite/fs/fdataframe.py | 58 +++++++++-------------- fslite/tests/test_univariate_methods.py | 25 ++++++++++ 3 files changed, 46 insertions(+), 99 deletions(-) delete mode 100644 examples/loom2parquetmerge.py diff --git a/examples/loom2parquetmerge.py b/examples/loom2parquetmerge.py deleted file mode 100644 index 20f1c45..0000000 --- a/examples/loom2parquetmerge.py +++ /dev/null @@ -1,62 +0,0 @@ -import os -import pyarrow.parquet as pq -import pyarrow as pa - - -# get all absolute paths of files in a directory -def get_files_paths(directory, extension: str = "parquet"): - """ - Get all file paths in a directory. - :param extension: str, file extension. - :param directory: str, directory path. - :return: list, list of file paths. - """ - files_paths = [] - for root, dirs, files in os.walk(directory): - for file in files: - if file.endswith(extension): - files_paths.append(os.path.join(root, file)) - return files_paths - - -def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=10000): - """ - Concatenate multiple parquet files in an incremental fashion to avoid memory overload. - - :param files_paths: List of parquet file paths. - :param output_path: Path to the output parquet file. - :param batch_size: Number of rows to read from each file at a time. - """ - writer = None - - for file_path in files_paths: - print(f"Processing file: {file_path}") - parquet_file = pq.ParquetFile(file_path) - - # Read the file in batches to avoid memory overload - for batch in parquet_file.iter_batches(batch_size=batch_size): - # Convert the batch to a PyArrow Table - table = pa.Table.from_batches([batch]) - - # If the writer is not initialized, create a new Parquet writer - if writer is None: - writer = pq.ParquetWriter(output_path, table.schema, compression='gzip') - - # Write the batch to the output Parquet file - writer.write_table(table) - - # Close the writer after all batches are written - if writer is not None: - writer.close() - print(f"Concatenated parquet file written to {output_path}") - - -# Get all files paths -files_paths = get_files_paths(directory="./", - extension="parquet") - -# Output path for the final concatenated parquet file -output_path = "GSE156793.parquet" - -# Concatenate the parquet files and write to a single file incrementally -concatenate_parquet_files_incremental(files_paths, output_path, batch_size=10000) diff --git a/fslite/fs/fdataframe.py b/fslite/fs/fdataframe.py index 1ca7910..318bffd 100644 --- a/fslite/fs/fdataframe.py +++ b/fslite/fs/fdataframe.py @@ -38,14 +38,12 @@ class FSDataFrame: """ def __init__( - self, - df: pd.DataFrame, - sample_col: Optional[str] = None, - label_col: Optional[str] = None, - sparse_threshold: float = 0.7, # Threshold for sparsity - memory_threshold: Optional[ - float - ] = 0.75, # Proportion of system memory to use for dense arrays + self, + df: pd.DataFrame, + sample_col: Optional[str] = None, + label_col: Optional[str] = None, + sparse_threshold: float = 0.7, # Threshold for sparsity + memory_threshold: Optional[float] = 0.75, # Proportion of system memory to use for dense arrays ): """ Create an instance of FSDataFrame. @@ -61,21 +59,15 @@ def __init__( in the feature matrix exceeds this value, the matrix is stored in a sparse format unless memory allows. :param memory_threshold: Proportion of system memory available to use before deciding on sparse/dense. """ - # TODO: We are loading full data into memory, look for other options. Maybe Dask? - self.__df = df.copy() - - # Check for necessary columns - columns_to_drop = [] + # Copy the DataFrame for internal usage + self.__df = df # Handle sample column if sample_col: if sample_col not in df.columns: - raise ValueError( - f"Sample column '{sample_col}' not found in DataFrame." - ) + raise ValueError(f"Sample column '{sample_col}' not found in DataFrame.") self.__sample_col = sample_col self.__samples = df[sample_col].tolist() - columns_to_drop.append(sample_col) else: self.__sample_col = None self.__samples = [] @@ -90,34 +82,32 @@ def __init__( self.__label_col = label_col self.__labels = df[label_col].tolist() - # Encode labels - # TODO: Check if labels are categorical or continuous? For now, assume categorical + # Encode labels (assume categorical for now) label_encoder = LabelEncoder() self.__labels_matrix = label_encoder.fit_transform(df[label_col]).tolist() - columns_to_drop.append(label_col) - # Drop both sample and label columns in one step - self.__df = self.__df.drop(columns=columns_to_drop) + # Select only numerical columns, excluding sample_col and label_col + feature_columns = df.select_dtypes(include=[np.number]).columns.tolist() + self.__original_features = [col for col in feature_columns if col not in [sample_col, label_col]] - # Extract features - self.__original_features = self.__df.columns.tolist() + # Select only the feature columns directly (no drop) + numerical_df = df[self.__original_features] - # Ensure only numerical features are retained - numerical_df = self.__df.select_dtypes(include=[np.number]) if numerical_df.empty: raise ValueError("No numerical features found in the DataFrame.") - # Check sparsity + # Calculate sparsity num_elements = numerical_df.size - num_zeros = np.count_nonzero(numerical_df == 0) + num_zeros = (numerical_df == 0).sum().sum() sparsity = num_zeros / num_elements + # Estimate memory usage dense_matrix_size = numerical_df.memory_usage(deep=True).sum() # In bytes available_memory = psutil.virtual_memory().available # In bytes + # Handle sparse or dense matrix based on sparsity and available memory if sparsity > sparse_threshold: if dense_matrix_size < memory_threshold * available_memory: - # Use dense matrix if enough memory is available logging.info( f"Data is sparse (sparsity={sparsity:.2f}) but enough memory available. " f"Using a dense matrix." @@ -125,20 +115,14 @@ def __init__( self.__matrix = numerical_df.to_numpy(dtype=np.float32) self.__is_sparse = False else: - # Use sparse matrix due to memory constraints logging.info( f"Data is sparse (sparsity={sparsity:.2f}), memory insufficient for dense matrix. " f"Using a sparse matrix representation." ) - self.__matrix = sparse.csr_matrix( - numerical_df.to_numpy(dtype=np.float32) - ) + self.__matrix = sparse.csr_matrix(numerical_df.to_numpy(dtype=np.float32)) self.__is_sparse = True else: - # Use dense matrix since it's not sparse - logging.info( - f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix." - ) + logging.info(f"Data is not sparse (sparsity={sparsity:.2f}), using a dense matrix.") self.__matrix = numerical_df.to_numpy(dtype=np.float32) self.__is_sparse = False diff --git a/fslite/tests/test_univariate_methods.py b/fslite/tests/test_univariate_methods.py index d096069..278393f 100644 --- a/fslite/tests/test_univariate_methods.py +++ b/fslite/tests/test_univariate_methods.py @@ -1,4 +1,5 @@ import pandas as pd +import psutil from fslite.fs.fdataframe import FSDataFrame from fslite.fs.univariate import FSUnivariate @@ -29,6 +30,30 @@ def test_univariate_filter_corr(): df_filtered = fsdf_filtered.to_pandas() df_filtered.to_csv("filtered_tnbc_data.csv", index=False) +def test_univariate_filter_big_corr(): + # import tsv as pandas DataFrame + df = pd.read_parquet(path="../../examples/GSE156793.parquet") + df.drop(columns=["development_day", "assay_id"], inplace=True) + print(df.shape[1]) + + dense_matrix_size = (df.memory_usage(deep=True).sum() / 1e+6) # In megabytes + available_memory = (psutil.virtual_memory().available / 1e+6) # In megabytes + + # create FSDataFrame instance + fs_df = FSDataFrame(df=df, sample_col="sample_id", label_col="cell_cluster_id") + + # create FSUnivariate instance + fs_univariate = FSUnivariate(fs_method="u_corr", selection_threshold=0.3) + + fsdf_filtered = fs_univariate.select_features(fs_df) + + assert fs_df.count_features() == 500 + assert fsdf_filtered.count_features() == 211 + + # Export the filtered DataFrame as Pandas DataFrame + df_filtered = fsdf_filtered.to_pandas() + df_filtered.to_csv("single_cell_output.csv", index=False) + # test the univariate_filter method with 'anova' method def test_univariate_filter_anova():