From d1f74d6cfc87a33a96f5afe5b000336607ffec41 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol
Date: Wed, 25 Sep 2024 13:42:10 +0100
Subject: [PATCH] delete ML methods

---
 examples/loom2parquetmerge.py      | 10 ++++-
 fslite/tests/generate_big_tests.py |  1 -
 fslite/tests/test_fs_pipeline.py   | 72 ------------------------------
 3 files changed, 8 insertions(+), 75 deletions(-)
 delete mode 100644 fslite/tests/test_fs_pipeline.py

diff --git a/examples/loom2parquetmerge.py b/examples/loom2parquetmerge.py
index 04cd41b..20f1c45 100644
--- a/examples/loom2parquetmerge.py
+++ b/examples/loom2parquetmerge.py
@@ -28,7 +28,7 @@ def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=1
     :param batch_size: Number of rows to read from each file at a time.
     """
     writer = None
-    with pq.ParquetWriter(output_path, schema=None, compression='gzip') as writer:
+
     for file_path in files_paths:
         print(f"Processing file: {file_path}")
         parquet_file = pq.ParquetFile(file_path)
@@ -38,10 +38,16 @@
             # Convert the batch to a PyArrow Table
             table = pa.Table.from_batches([batch])
 
+            # If the writer is not initialized, create a new Parquet writer
+            if writer is None:
+                writer = pq.ParquetWriter(output_path, table.schema, compression='gzip')
+
             # Write the batch to the output Parquet file
             writer.write_table(table)
 
-print(f"Concatenated parquet file written to {output_path}")
+    # Close the writer after all batches are written
+    if writer is not None:
+        writer.close()
     print(f"Concatenated parquet file written to {output_path}")
 
 
diff --git a/fslite/tests/generate_big_tests.py b/fslite/tests/generate_big_tests.py
index 4497de0..0efc849 100644
--- a/fslite/tests/generate_big_tests.py
+++ b/fslite/tests/generate_big_tests.py
@@ -41,7 +41,6 @@ def generate_large_test_dataset():
             f"feature{i}": rng.random(chunk_end - chunk_start)
             for i in range(1, n_features + 1)
         }
-        )
 
         # Create DataFrame chunk
         chunk_data = {"sample_id": chunk_sample_ids, "label": chunk_labels}
diff --git a/fslite/tests/test_fs_pipeline.py b/fslite/tests/test_fs_pipeline.py
deleted file mode 100644
index 42be655..0000000
--- a/fslite/tests/test_fs_pipeline.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# import unittest
-#
-# from fslite.config.context import init_spark, stop_spark_session
-# from fslite.fs.core import FSDataFrame
-# from fslite.fs.methods import FSPipeline, FSUnivariate, FSMultivariate, FSMLMethod
-# from fslite.utils.datasets import get_tnbc_data_path
-# from fslite.utils.io import import_table_as_psdf
-#
-#
-# class FeatureSelectionPipelineTest(unittest.TestCase):
-#
-#     def setUp(self) -> None:
-#         init_spark(
-#             apply_pyarrow_settings=True,
-#             apply_extra_spark_settings=True,
-#             apply_pandas_settings=True,
-#         )
-#
-#     def tearDown(self) -> None:
-#         stop_spark_session()
-#
-#     @staticmethod
-#     def import_FSDataFrame():
-#         df = import_table_as_psdf(get_tnbc_data_path(), n_partitions=5)
-#         fsdf = FSDataFrame(df, sample_col="Sample", label_col="label")
-#         return fsdf
-#
-#     def test_feature_selection_pipeline(self):
-#         fsdf = self.import_FSDataFrame()
-#
-#         training_data, testing_data = fsdf.split_df(split_training_factor=0.6)
-#
-#         # create a Univariate object
-#         univariate = FSUnivariate(
-#             fs_method="anova", selection_mode="percentile", selection_threshold=0.8
-#         )
-#
-#         # create a Multivariate object
-#         multivariate = FSMultivariate(
-#             fs_method="m_corr", corr_threshold=0.75, corr_method="pearson"
-#         )
-#
-#         # create a MLMethod object
-#         rf_classifier = FSMLMethod(
-#             fs_method="rf_multilabel",
-#             rfe=True,
-#             rfe_iterations=2,
-#             percent_to_keep=0.9,
-#             estimator_params={"labelCol": "label"},
-#             evaluator_params={"metricName": "accuracy"},
-#             grid_params={"numTrees": [10, 15], "maxDepth": [5, 10]},
-#             cv_params={"parallelism": 2, "numFolds": 5},
-#         )
-#
-#         # create a pipeline object
-#         fs_pipeline = FSPipeline(
-#             df_training=training_data,
-#             df_testing=testing_data,
-#             fs_stages=[univariate, multivariate, rf_classifier],
-#         )
-#
-#         # run the pipeline
-#         results = fs_pipeline.run()
-#
-#         # print results
-#         print(results)
-#
-#         assert results.get("training_metric") > 0.9
-#
-#
-# if __name__ == "__main__":
-#     unittest.main()
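
Note on the loom2parquetmerge.py hunks: they replace a broken "with
pq.ParquetWriter(..., schema=None)" block with lazy writer creation, because
ParquetWriter requires a concrete schema, which is only known once the first
batch has been read. A minimal standalone sketch of that pattern (pyarrow
assumed installed; function and file names below are hypothetical, not part
of this patch):

    import pyarrow as pa
    import pyarrow.parquet as pq

    def concat_parquet(paths, out_path, batch_size=10000):
        writer = None
        for path in paths:
            parquet_file = pq.ParquetFile(path)
            # Stream row batches so no input file is fully loaded into memory
            for batch in parquet_file.iter_batches(batch_size=batch_size):
                table = pa.Table.from_batches([batch])
                # Create the writer lazily from the first batch's schema;
                # passing schema=None (the deleted line) fails because a
                # real schema is required
                if writer is None:
                    writer = pq.ParquetWriter(out_path, table.schema, compression="gzip")
                writer.write_table(table)
        # Close only if at least one batch was written
        if writer is not None:
            writer.close()

    # Hypothetical usage: inputs must share a compatible schema
    concat_parquet(["part-0.parquet", "part-1.parquet"], "merged.parquet")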