From d1f74d6cfc87a33a96f5afe5b000336607ffec41 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol
Date: Wed, 25 Sep 2024 13:42:10 +0100
Subject: [PATCH] delete ML methods

---
 examples/loom2parquetmerge.py      | 10 ++++-
 fslite/tests/generate_big_tests.py |  1 -
 fslite/tests/test_fs_pipeline.py   | 72 ------------------------------
 3 files changed, 8 insertions(+), 75 deletions(-)
 delete mode 100644 fslite/tests/test_fs_pipeline.py

diff --git a/examples/loom2parquetmerge.py b/examples/loom2parquetmerge.py
index 04cd41b..20f1c45 100644
--- a/examples/loom2parquetmerge.py
+++ b/examples/loom2parquetmerge.py
@@ -28,7 +28,7 @@ def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=1
     :param batch_size: Number of rows to read from each file at a time.
     """
     writer = None
-    with pq.ParquetWriter(output_path, schema=None, compression='gzip') as writer:
+
     for file_path in files_paths:
         print(f"Processing file: {file_path}")
         parquet_file = pq.ParquetFile(file_path)
@@ -38,10 +38,16 @@
             # Convert the batch to a PyArrow Table
             table = pa.Table.from_batches([batch])
 
+            # If the writer is not initialized, create a new Parquet writer
+            if writer is None:
+                writer = pq.ParquetWriter(output_path, table.schema, compression='gzip')
+
             # Write the batch to the output Parquet file
             writer.write_table(table)
 
-print(f"Concatenated parquet file written to {output_path}")
+    # Close the writer after all batches are written
+    if writer is not None:
+        writer.close()
     print(f"Concatenated parquet file written to {output_path}")
 
 
diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py
index 4497de0..0efc849 100644
--- a/fslite/tests/generate_big_tests.py
+++ b/fslite/tests/generate_big_tests.py
@@ -41,7 +41,6 @@ def generate_large_test_dataset():
             f"feature{i}": rng.random(chunk_end - chunk_start)
             for i in range(1, n_features + 1)
         }
-        )
 
         # Create DataFrame chunk
         chunk_data = {"sample_id": chunk_sample_ids, "label": chunk_labels}
diff --git a/fslite/tests/test_fs_pipeline.py b/fslite/tests/test_fs_pipeline.py
deleted file mode 100644
index 42be655..0000000
--- a/fslite/tests/test_fs_pipeline.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# import unittest
-#
-# from fslite.config.context import init_spark, stop_spark_session
-# from fslite.fs.core import FSDataFrame
-# from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod
-# from fslite.utils.datasets import get_tnbc_data_path
-# from fslite.utils.io import import_table_as_psdf
-#
-#
-# class FeatureSelectionPipelineTest(unittest.TestCase):
-#
-#     def setUp(self) -> None:
-#         init_spark(
-#             apply_pyarrow_settings=True,
-#             apply_extra_spark_settings=True,
-#             apply_pandas_settings=True,
-#         )
-#
-#     def tearDown(self) -> None:
-#         stop_spark_session()
-#
-#     @staticmethod
-#     def import_FSDataFrame():
-#         df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5)
-#         fsdf = FSDataFrame(df, sample_col="Sample", label_col="label")
-#         return fsdf
-#
-#     def test_feature_selection_pipeline(self):
-#         fsdf = self.import_FSDataFrame()
-#
-#         training_data, testing_data = fsdf.split_df(split_training_factor=0.6)
-#
-#         # create a Univariate object
-#         univariate = FSUnivariate(
-#             fs_method="anova", selection_mode="percentile", selection_threshold=0.8
-#         )
-#
-#         # create a Multivariate object
-#         multivariate = FSMultivariate(
-#             fs_method="m_corr", corr_threshold=0.75, corr_method="pearson"
-#         )
-#
-#         # create a MLMethod object
-#         rf_classifier = FSMLMethod(
-#             fs_method="rf_multilabel",
-#             rfe=True,
-#             rfe_iterations=2,
-#             percent_to_keep=0.9,
-#             estimator_params={"labelCol": "label"},
-#             evaluator_params={"metricName": "accuracy"},
-#             grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]},
-#             cv_params={"parallelism": 2, "numFolds": 5},
-#         )
-#
-#         # create a pipeline object
-#         fs_pipeline = FSPipeline(
-#             df_training=training_data,
-#             df_testing=testing_data,
-#             fs_stages=[univariate, multivariate, rf_classifier],
-#         )
-#
-#         # run the pipeline
-#         results = fs_pipeline.run()
-#
-#         # print results
-#         print(results)
-#
-#         assert results.get("training_metric") > 0.9
-#
-#
-# if __name__ == "__main__":
-#     unittest.main()
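
Note on the loom2parquetmerge.py hunks: they replace a broken "with
pq.ParquetWriter(..., schema=None)" block with lazy writer creation, because
ParquetWriter requires a concrete schema, which is only known once the first
batch has been read. A minimal standalone sketch of that pattern (pyarrow
assumed installed; function and file names below are hypothetical, not part
of this patch):

    import pyarrow as pa
    import pyarrow.parquet as pq

    def concat_parquet(paths, out_path, batch_size=10000):
        writer = None
        for path in paths:
            parquet_file = pq.ParquetFile(path)
            # Stream row batches so no input file is fully loaded into memory
            for batch in parquet_file.iter_batches(batch_size=batch_size):
                table = pa.Table.from_batches([batch])
                # Create the writer lazily from the first batch's schema;
                # passing schema=None (the deleted line) fails because a
                # real schema is required
                if writer is None:
                    writer = pq.ParquetWriter(out_path, table.schema, compression="gzip")
                writer.write_table(table)
        # Close only if at least one batch was written
        if writer is not None:
            writer.close()

    # Hypothetical usage: inputs must share a compatible schema
    concat_parquet(["part-0.parquet", "part-1.parquet"], "merged.parquet")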