Commit c31d51e

Enforce exact_match=True when listing JSON file for `get_estimated_time` for MPH (#467)

* Enforce exact_match=True for get_estimated_time

* Ensure we're not searching for the actual extension

* Modify list_files to use "correct" syntax for exact_match=True and filter out non-.json

* Make workflow of getting bin file easier

* Remove the == 0

* Add processing.json files to test

* Fix bin file generation by simulating update only after .json file written (not immediately after .bin file written)

* Formatting to avoid going over 100 characters per line

* Add trailing comma to please black linter

* Print statement debugging

* See if an OSError is being thrown

* More debugging statements: need to see the MPH process

* Test the presence and contents of log_out

* See what the actual log path is being set to

* Try adding a simple logging statement at the beginning to force

* Try to set logging propagation to be False

* Fix pyproject.toml logging

* Additional changes to pytest pyproject.toml

* Lowercase true for pyproject.toml

* Attempt to force pytest fixture to write to log file

* Add logging import

* Remove whitespace

* Try to fix logging issue

* Overwrite pytest.txt with blank stuff

* More testing fixes

* Use the absolute path to the directory for accessing the log

* Adjust error message

* Don't print add_blank

* Check if the error is happening during the renaming phases

* Add some more tests in

* Actually get image_stitching to print out debug

* Pin watchdog to version 6

* Update lock file

* Add debug workflow to GitHub PR

* Remove the lock entirely

* Nuke file timer functionality

* Test deleting the lock AND the timer

* Try pushing old slow copy tissue data

* removed debug

* See if sleeping for 2 seconds will prevent the toffy segmentation error from popping up

* black formatting

* Massive test overhaul: see how many tests pass now

* Remove old _slow_copy_tissue... function

* Linting checks

* PYCODESTYLE

* Add fixes to temporary work around list_folders and list_files in rosetta

* Strict matching required between underscores: "pulse_height" will not match "pulse_heights_..."

* Fix list_folders in normalize_image.py to correctly list all FOVs, add TODO for fixing verify_same_elements edge case

* Fix list_folders in merge_partial_runs

* Formatting

* Remove print statements

* Remove debug comment

* Reset fov_watcher.py back to original

* More resetting

* Remove unnecessary try-except block in fov_watcher_test.py

* Never mind, add old try-except back in (but don't print watcher lag)

* Fix copy_image_files with bad list_folders calls removed

* ruff linting

---------

Co-authored-by: Alex Kong <[email protected]>
Co-authored-by: Sricharan Reddy Varra <[email protected]>
3 people authored Dec 4, 2024
1 parent 5f16d6f commit c31d51e
Showing 14 changed files with 1,917 additions and 1,603 deletions.
Empty file (×4)
3,350 changes: 1,798 additions & 1,552 deletions poetry.lock

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion pyproject.toml
@@ -44,7 +44,7 @@ numpy = "1.*"
 natsort = "^8"
 seaborn = "^0.13"
 scikit-learn = "^1"
-watchdog = "^3"
+watchdog = "^6"
 tqdm = "^4"
 scipy = "^1.10.1"
 pandas = "^2"
@@ -128,3 +128,10 @@ filterwarnings = [
 ]
 testpaths = ["tests"]
 norecursedirs = ["tests/utilities"]
+
+log_cli = true
+log_level = "INFO"
+log_file = "pytest.txt"
+log_file_level = "INFO"
+log_format = "%(asctime)s %(levelname)s %(message)s"
+log_date_format = "%Y-%m-%d %H:%M:%S"
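
These options route pytest's captured log records to pytest.txt at the repository root, which is what lets fov_watcher_test read its assertions back out of the log. A minimal sketch of the pattern (a hypothetical test, not part of this diff), assuming the configuration above:

import logging
from pathlib import Path


def test_log_file_roundtrip():
    # with log_file = "pytest.txt", pytest writes INFO-and-above records
    # to <rootdir>/pytest.txt while the suite runs
    logging.getLogger(__name__).info("non-zero file size...")

    # the log file sits at the repo root, one level above tests/
    log_path = Path(__file__).parents[1] / "pytest.txt"
    assert "non-zero file size..." in log_path.read_text()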
6 changes: 3 additions & 3 deletions src/toffy/mph_comp.py
@@ -24,12 +24,12 @@ def get_estimated_time(bin_file_dir, fov):
     io_utils.validate_paths(bin_file_dir)
 
     # get fov json file in bin_file_path
-    json_file = io_utils.list_files(bin_file_dir, fov + ".json")
-    if len(json_file) == 0:
+    json_file = os.path.join(bin_file_dir, f"{fov}.json")
+    if not os.path.exists(json_file):
         raise FileNotFoundError(f"The FOV name supplied doesn't have a JSON file: {fov}")
 
     # retrieve estimated time (frame dimensions x pixel dwell time)
-    run_metadata = read_json_file(os.path.join(bin_file_dir, json_file[0]), encoding="utf-8")
+    run_metadata = read_json_file(json_file, encoding="utf-8")
     try:
         size = run_metadata.get("frameSize")
         time = run_metadata.get("dwellTimeMillis")
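
Replacing io_utils.list_files with a direct os.path.exists check sidesteps substring matching entirely. A minimal sketch of the failure mode being avoided (hypothetical file names, mirroring the "_processing" siblings added to the test data):

# substring listing over-selects: both names contain the FOV prefix
files = ["fov-1-scan-1.json", "fov-1-scan-1_processing.json"]
matches = [f for f in files if "fov-1-scan-1" in f]
assert len(matches) == 2

# exact matching selects only the intended file; in the new
# get_estimated_time, a failed os.path.exists check raises FileNotFoundError
exact = [f for f in files if f == "fov-1-scan-1.json"]
assert exact == ["fov-1-scan-1.json"]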
7 changes: 6 additions & 1 deletion src/toffy/normalize.py
@@ -831,7 +831,9 @@ def normalize_image_data(
     # create normalization function for mapping MPH to counts
     norm_json = read_json_file(norm_func_path)
 
-    img_fovs = io_utils.list_folders(img_dir, "fov")
+    # TODO: list_folders does not handle cases such as "fov0" correctly
+    # need to add a fix in to alpineer to deal with this
+    img_fovs = [f for f in os.listdir(img_dir) if "fov" in f]
 
     norm_weights, norm_name = norm_json["weights"], norm_json["name"]
     norm_func = create_prediction_function(norm_name, norm_weights)
@@ -852,6 +854,9 @@
 
     # make sure FOVs used to construct tuning curve are same ones being normalized
     pulse_fovs = np.unique(pulse_height_df["fov"])
+
+    # TODO: verify_same_elements needs to throw a ValueError in the special case
+    # where one list is empty but the other isn't
     misc_utils.verify_same_elements(image_data_fovs=img_fovs, pulse_height_csv_files=pulse_fovs)
 
     # loop over each fov
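
The TODO above, and the matching ones in reorg.py and rosetta.py below, all trace back to the same substring-matching gap in alpineer's list_folders. As a sketch of the delimiter-aware matching the commit message calls for ("pulse_height" must not match "pulse_heights_..."), and an assumption about the eventual fix rather than code in this PR:

import os
import re


def list_folders_strict(base_dir, prefix):
    # hypothetical helper: accept the prefix only when followed by the end
    # of the name, a digit, or a separator, so "fov" matches "fov0" and
    # "fov-1-scan-1" while "pulse_height" rejects "pulse_heights_0"
    pattern = re.compile(rf"^{re.escape(prefix)}($|[\d_.-])")
    return [
        f
        for f in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, f)) and pattern.match(f)
    ]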
4 changes: 3 additions & 1 deletion src/toffy/reorg.py
@@ -20,7 +20,9 @@ def merge_partial_runs(cohort_dir, run_string):
     os.makedirs(output_folder)
 
     # get list of matching subfolders
-    partial_folders = io_utils.list_folders(cohort_dir, substrs=run_string)
+    # TODO: list_folders does not handle cases such as "run_dup2" correctly
+    # need to add a fix in to alpineer to deal with this
+    partial_folders = [f for f in os.listdir(cohort_dir) if run_string in f]
     partial_folders = [partial for partial in partial_folders if partial != run_string]
 
     if len(partial_folders) == 0:
10 changes: 7 additions & 3 deletions src/toffy/rosetta.py
@@ -288,7 +288,9 @@ def compensate_image_data(
     io_utils.validate_paths([raw_data_dir, comp_data_dir, comp_mat_path])
 
     # get list of all fovs
-    fovs = io_utils.list_folders(raw_data_dir, substrs="fov")
+    # TODO: list_folders does not handle cases such as "fov0" correctly
+    # need to add a fix in to alpineer to deal with this
+    fovs = [f for f in os.listdir(raw_data_dir) if "fov" in f]
 
     # load csv files
     comp_mat = pd.read_csv(comp_mat_path, index_col=0)
@@ -713,6 +715,8 @@ def create_rosetta_matrices(
     mult_matrix.to_csv(os.path.join(save_dir, comp_name))
 
 
+# TODO: anything with [f for f in os.listdir(...) ...] needs to be changed
+# after list_folders with substrs specified is fixed
 def copy_image_files(
     cohort_name, run_names, rosetta_testing_dir, extracted_imgs_dir, fovs_per_run=5
 ):
@@ -735,7 +739,7 @@ def copy_image_files(
     for run in run_names:
         if not os.path.exists(os.path.join(extracted_imgs_dir, run)):
             raise ValueError(f"{run} is not a valid run name found in {extracted_imgs_dir}")
-        fovs_in_run = io_utils.list_folders(os.path.join(extracted_imgs_dir, run), substrs="fov")
+        fovs_in_run = [f for f in os.listdir(os.path.join(extracted_imgs_dir, run)) if "fov" in f]
         # check number of fovs in each run
         if len(fovs_in_run) < fovs_per_run:
             small_runs.append(run)
@@ -765,7 +769,7 @@ def copy_image_files(
     for i, run in enumerate(ns.natsorted(run_names_process)):
         run_path = os.path.join(extracted_imgs_dir, run)
 
-        fovs_in_run = io_utils.list_folders(run_path, substrs="fov")
+        fovs_in_run = [f for f in os.listdir(run_path) if "fov" in f]
         fovs_in_run = ns.natsorted(fovs_in_run)
         rosetta_fovs = random.sample(fovs_in_run, k=fovs_per_run)
 
109 changes: 79 additions & 30 deletions tests/fov_watcher_test.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import shutil
 import tempfile
@@ -35,6 +36,11 @@
 SLOW_COPY_INTERVAL_S = 1
 
 
+def _reset_logging_file():
+    with open(os.path.join(Path(__file__).parents[1], "pytest.txt"), "w"):
+        pass
+
+
 def _slow_copy_sample_tissue_data(
     dest: str, delta: int = 10, one_blank: bool = False, temp_bin: bool = False
 ):
@@ -71,18 +77,25 @@ def _slow_copy_sample_tissue_data(
             os.rename(copied_tissue_path, os.path.join(dest, tissue_file))
         else:
             shutil.copy(tissue_path, dest)
-
-    # get all .bin files
-    bin_files = [bfile for bfile in sorted(os.listdir(COMBINED_DATA_PATH)) if ".bin" in bfile]
-
-    # simulate updating the creation time for some .bin files, this tests _check_bin_updates
-    for i, bfile in enumerate(bin_files):
-        if i % 2 == 0:
-            shutil.copy(
-                os.path.join(COMBINED_DATA_PATH, bfile), os.path.join(dest, bfile + ".temp")
-            )
-            os.remove(os.path.join(dest, bfile))
-            os.rename(os.path.join(dest, bfile + ".temp"), os.path.join(dest, bfile))
+        tissue_data = os.path.splitext(tissue_file)
+
+        # simulate a .bin file update to test _check_bin_updates
+        if tissue_data[1] == ".json" and "_processing" not in tissue_data[0]:
+            # ensure a sleep so the update doesn't interfere with an existing extraction
+            time.sleep(2)
+            bin_file_name = tissue_data[0] + ".bin"
+
+            # make sure only to update non-blank bin files
+            if os.path.getsize(os.path.join(dest, bin_file_name)) != 0:
+                shutil.copy(
+                    os.path.join(COMBINED_DATA_PATH, bin_file_name),
+                    os.path.join(dest, bin_file_name + ".temp"),
+                )
+                os.remove(os.path.join(dest, bin_file_name))
+                os.rename(
+                    os.path.join(dest, bin_file_name + ".temp"),
+                    os.path.join(dest, bin_file_name),
+                )
 
 
 COMBINED_RUN_JSON_SPOOF = {
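
Triggering the simulated update off the .json write matters because of how the watcher defines a finished FOV. A sketch of the completeness rule the simulation relies on (an assumption about fov_watcher's behaviour, not code from this diff): a FOV only counts as done once both its .bin and its .json exist, so re-copying the .bin after the .json has landed exercises _check_bin_updates rather than the initial extraction path.

import os


def fov_is_complete(dest, fov_name):
    # hypothetical predicate: data file and metadata must both be present
    return all(
        os.path.exists(os.path.join(dest, fov_name + ext)) for ext in (".bin", ".json")
    )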
@@ -255,7 +268,6 @@ def test_watcher(
     add_blank,
     temp_bin,
 ):
-    print("The watcher start lag is: %d" % watcher_start_lag)
     try:
         with tempfile.TemporaryDirectory() as tmpdir:
             tiff_out_dir = os.path.join(tmpdir, "cb_0", RUN_DIR_NAME)
@@ -371,9 +383,13 @@
 
             res_scan.get()
 
-            with open(os.path.join(log_out, "test_run_log.txt")) as f:
+            with open(os.path.join(Path(__file__).parents[1], "pytest.txt")) as f:
                 logtxt = f.read()
-            assert add_blank == ("non-zero file size..." in logtxt)
+            try:
+                assert add_blank == ("non-zero file size..." in logtxt)
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
             fovs = [
                 bin_file.split(".")[0]
@@ -390,32 +406,65 @@
             fovs = fovs[1:]
 
             # extract tiffs check
-            print("TIFF validator check")
             validators[0](os.path.join(tmpdir, "cb_0", RUN_DIR_NAME), fovs, bad_fovs)
             if kwargs["extract_prof"]:
-                validators[0](
-                    os.path.join(tmpdir, "cb_0", RUN_DIR_NAME + "_proficient"), fovs, bad_fovs
-                )
+                try:
+                    validators[0](
+                        os.path.join(tmpdir, "cb_0", RUN_DIR_NAME + "_proficient"), fovs, bad_fovs
+                    )
+                except AssertionError as e:
+                    _reset_logging_file()
+                    raise AssertionError(e)
             else:
-                assert not os.path.exists(
-                    os.path.join(tmpdir, "cb_0", RUN_DIR_NAME) + "_proficient"
-                )
+                try:
+                    assert not os.path.exists(
+                        os.path.join(tmpdir, "cb_0", RUN_DIR_NAME) + "_proficient"
+                    )
+                except AssertionError as e:
+                    _reset_logging_file()
+                    raise AssertionError(e)
 
             # qc check
-            validators[1](os.path.join(tmpdir, "cb_1", RUN_DIR_NAME), fovs, bad_fovs)
-            print("QC check")
+            try:
+                validators[1](os.path.join(tmpdir, "cb_1", RUN_DIR_NAME), fovs, bad_fovs)
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
             # mph check
-            validators[2](
-                os.path.join(tmpdir, "cb_2", RUN_DIR_NAME),
-                os.path.join(tmpdir, "cb_2_plots", RUN_DIR_NAME),
-                fovs,
-                bad_fovs,
-            )
-            print("MPH check")
+            try:
+                validators[2](
+                    os.path.join(tmpdir, "cb_2", RUN_DIR_NAME),
+                    os.path.join(tmpdir, "cb_2_plots", RUN_DIR_NAME),
+                    fovs,
+                    bad_fovs,
+                )
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
             # stitch images check
-            validators[3](os.path.join(tmpdir, "cb_0", RUN_DIR_NAME, f"{RUN_DIR_NAME}_stitched"))
-            print("Stitch images check")
+            try:
+                validators[3](
+                    os.path.join(tmpdir, "cb_0", RUN_DIR_NAME, f"{RUN_DIR_NAME}_stitched")
+                )
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
             # pulse heights check
-            validators[4](os.path.join(tmpdir, "cb_3", RUN_DIR_NAME), fovs, bad_fovs)
-            print("Pulse heights check")
+            try:
+                validators[4](os.path.join(tmpdir, "cb_3", RUN_DIR_NAME), fovs, bad_fovs)
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
+            _reset_logging_file()
+
     except OSError:
         warnings.warn("Temporary file cleanup was incomplete.")
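
Each validator call above repeats the except AssertionError, _reset_logging_file(), re-raise pattern so the shared pytest.txt is truncated even when a check fails. A more compact sketch of the same guarantee (a hypothetical refactor, not what this PR does) would hang the cleanup on a fixture finalizer:

import pytest


@pytest.fixture
def clean_log_file():
    # hypothetical fixture: run the test, then truncate pytest.txt
    # whether it passed or failed
    yield
    _reset_logging_file()

test_watcher would then request clean_log_file and drop the per-check try/except blocks.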
12 changes: 6 additions & 6 deletions tests/normalize_test.py
@@ -169,9 +169,9 @@ def test_combine_run_metrics(metrics, warn_overwrite_test):
             name_prof, values_df_prof = metric[0], pd.DataFrame(metric[1])
             values_df_prof.to_csv(os.path.join(temp_dir, name_prof), index=False)
 
-        normalize.combine_run_metrics(temp_dir, "pulse_height")
+        normalize.combine_run_metrics(temp_dir, "pulse_heights")
 
-        combined_data = pd.read_csv(os.path.join(temp_dir, "pulse_height_combined.csv"))
+        combined_data = pd.read_csv(os.path.join(temp_dir, "pulse_heights_combined.csv"))
 
         assert np.array_equal(combined_data.columns, ["pulse_height", "mass", "fov"])
         assert len(combined_data) == len(metrics["deficient"]) * 10
@@ -180,24 +180,24 @@ def test_combine_run_metrics(metrics, warn_overwrite_test):
         # NOTE: only if warn_overwrite turned on
         if warn_overwrite_test:
            with pytest.warns(UserWarning, match="previously generated"):
-                normalize.combine_run_metrics(temp_dir, "pulse_height", warn_overwrite_test)
+                normalize.combine_run_metrics(temp_dir, "pulse_heights", warn_overwrite_test)
         else:
-            normalize.combine_run_metrics(temp_dir, "pulse_height", warn_overwrite_test)
+            normalize.combine_run_metrics(temp_dir, "pulse_heights", warn_overwrite_test)
 
         # check that files with different lengths raises error
         name, bad_vals = metrics["deficient"][0][0], pd.DataFrame(metrics["deficient"][0][1])
         bad_vals = bad_vals.loc[0:5, :]
         bad_vals.to_csv(os.path.join(temp_dir, name), index=False)
 
         with pytest.raises(ValueError, match="files are the same length"):
-            normalize.combine_run_metrics(temp_dir, "pulse_height")
+            normalize.combine_run_metrics(temp_dir, "pulse_heights")
 
         # empty directory raises error
         empty_dir = os.path.join(temp_dir, "empty")
         os.makedirs(empty_dir)
 
         with pytest.raises(ValueError, match="No files"):
-            normalize.combine_run_metrics(empty_dir, "pulse_height")
+            normalize.combine_run_metrics(empty_dir, "pulse_heights")
 
 
 @parametrize_with_cases("dir_names, mph_dfs, count_dfs", test_cases.TuningCurveFiles)
12 changes: 7 additions & 5 deletions tests/rosetta_test.py
@@ -360,8 +360,8 @@ def test_compensate_image_data(
 
             for folder in format_folders:
                 # check that all files were created
-                output_files = io_utils.list_files(os.path.join(output_dir, fovs[0], folder), ".tif")
-                output_files = [chan.split(".tif")[0] for chan in output_files]
+                output_files = io_utils.list_files(os.path.join(output_dir, fovs[0], folder), ".tiff")
+                output_files = [chan.split(".tiff")[0] for chan in output_files]
 
                 if output_masses is None or len(output_masses) == 3:
                     assert set(output_files) == set(panel_info["Target"].values)
@@ -752,6 +752,8 @@ def mock_img_size(run_dir, fov_list=None):
     return sizes[run]
 
 
+# TODO: anything with [f for f in os.listdir(...) ...] needs to be changed
+# after list_folders with substrs specified is fixed
 def test_copy_image_files(mocker):
     mocker.patch("toffy.image_stitching.get_max_img_size", mock_img_size)
 
@@ -791,7 +793,7 @@ def test_copy_image_files(mocker):
         extracted_fov_dir = os.path.join(temp_dir2, "cohort_name", "extracted_images")
         assert len(io_utils.list_folders(extracted_fov_dir)) == 12
         for i in range(1, 4):
-            assert len(list(io_utils.list_folders(extracted_fov_dir, f"run_{i}"))) == 4
+            assert len([f for f in os.listdir(extracted_fov_dir) if f"run_{i}" in f]) == 4
         assert len(list(io_utils.list_folders(extracted_fov_dir, "stitched_images"))) == 0
 
         # check that files in fov folders are copied
@@ -808,8 +810,8 @@ def test_copy_image_files(mocker):
         # check that correct total and per run fovs are copied, assert run 3 didn't get copied
         assert len(io_utils.list_folders(extracted_fov_dir)) == 10
         for i in range(1, 3):
-            assert len(io_utils.list_folders(extracted_fov_dir, f"run_{i}")) == 5
-        assert len(io_utils.list_folders(extracted_fov_dir, "run_3")) == 0
+            assert len([f for f in os.listdir(extracted_fov_dir) if f"run_{i}" in f]) == 5
+        assert len([f for f in os.listdir(extracted_fov_dir) if "run_3" in f]) == 0
 
         # check that files in fov folders are copied
         for folder in io_utils.list_folders(extracted_fov_dir):
1 change: 0 additions & 1 deletion tests/utils/test_utils.py
@@ -19,7 +19,6 @@
 from toffy.settings import QC_COLUMNS, QC_SUFFIXES
 
 TEST_CHANNELS = [
-    "Calprotectin",
     "Chymase",
     "SMA",
     "Vimentin",
