
Commit d1f74d6
delete ML methods
ypriverol committed Sep 25, 2024 (1 parent: 7ee27c8)
Showing 3 changed files with 8 additions and 75 deletions.
10 changes: 8 additions & 2 deletions examples/loom2parquetmerge.py
@@ -28,7 +28,7 @@ def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=1
     :param batch_size: Number of rows to read from each file at a time.
     """
-    with pq.ParquetWriter(output_path, schema=None, compression='gzip') as writer:
+    writer = None
 
     for file_path in files_paths:
         print(f"Processing file: {file_path}")
         parquet_file = pq.ParquetFile(file_path)
@@ -38,10 +38,16 @@ def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=1
             # Convert the batch to a PyArrow Table
             table = pa.Table.from_batches([batch])
 
+            # If the writer is not initialized, create a new Parquet writer
+            if writer is None:
+                writer = pq.ParquetWriter(output_path, table.schema, compression='gzip')
             # Write the batch to the output Parquet file
             writer.write_table(table)
 
-    print(f"Concatenated parquet file written to {output_path}")
+    # Close the writer after all batches are written
+    if writer is not None:
+        writer.close()
+    print(f"Concatenated parquet file written to {output_path}")


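Putting the two hunks together, the merge routine after this commit reads roughly as follows. This is a sketch reconstructed from the diff context, not a verbatim copy of the file: it assumes the module imports `pyarrow as pa` and `pyarrow.parquet as pq` (the aliases used in the changed lines), and the `batch_size` default is truncated in the hunk header, so a placeholder value is used here.

```python
import pyarrow as pa
import pyarrow.parquet as pq


def concatenate_parquet_files_incremental(files_paths, output_path, batch_size=100_000):
    """
    Concatenate Parquet files incrementally to keep memory use bounded.

    :param files_paths: Paths of the input Parquet files (assumed name).
    :param output_path: Path of the concatenated output file.
    :param batch_size: Number of rows to read from each file at a time.
    """
    writer = None

    for file_path in files_paths:
        print(f"Processing file: {file_path}")
        parquet_file = pq.ParquetFile(file_path)

        # Stream each input in fixed-size row batches instead of loading it whole
        for batch in parquet_file.iter_batches(batch_size=batch_size):
            # Convert the batch to a PyArrow Table
            table = pa.Table.from_batches([batch])

            # If the writer is not initialized, create a new Parquet writer,
            # taking the schema from the first batch actually read
            if writer is None:
                writer = pq.ParquetWriter(output_path, table.schema, compression='gzip')

            # Write the batch to the output Parquet file
            writer.write_table(table)

    # Close the writer after all batches are written
    if writer is not None:
        writer.close()
    print(f"Concatenated parquet file written to {output_path}")
```

Deferring writer construction until the first batch arrives is what makes the fix work: `pq.ParquetWriter` needs a concrete schema, which the deleted `schema=None` call could not supply.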
1 change: 0 additions & 1 deletion fslite/tests/generate_big_tests.py
@@ -41,7 +41,6 @@ def generate_large_test_dataset():
             f"feature{i}": rng.random(chunk_end - chunk_start)
             for i in range(1, n_features + 1)
         }
-        )
 
         # Create DataFrame chunk
         chunk_data = {"sample_id": chunk_sample_ids, "label": chunk_labels}
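The deleted parenthesis was a stray token in the chunked dataset generator, most plausibly left over from an earlier wrapper call around the feature dictionary. For orientation, here is a minimal, self-contained sketch of the pattern those lines belong to; everything outside the lines visible in the diff (`n_samples`, `chunk_size`, the label and sample-ID construction, the DataFrame assembly) is a hypothetical reconstruction, not code from the repository.

```python
import numpy as np
import pandas as pd


def generate_large_test_dataset(n_samples=1_000, n_features=50, chunk_size=200):
    """Build a labeled feature table in chunks, mirroring the diff's pattern."""
    rng = np.random.default_rng(42)
    chunks = []

    for chunk_start in range(0, n_samples, chunk_size):
        chunk_end = min(chunk_start + chunk_size, n_samples)
        chunk_sample_ids = [f"sample{j}" for j in range(chunk_start, chunk_end)]
        chunk_labels = rng.integers(0, 2, chunk_end - chunk_start)

        # Per-chunk feature columns; the commit removes a stray ')' that
        # followed this dictionary's closing brace
        features = {
            f"feature{i}": rng.random(chunk_end - chunk_start)
            for i in range(1, n_features + 1)
        }

        # Create DataFrame chunk
        chunk_data = {"sample_id": chunk_sample_ids, "label": chunk_labels}
        chunk_data.update(features)
        chunks.append(pd.DataFrame(chunk_data))

    return pd.concat(chunks, ignore_index=True)
```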
72 changes: 0 additions & 72 deletions fslite/tests/test_fs_pipeline.py

This file was deleted.
