Commit c31d51e

Enforce exact_match=True when listing JSON file for `get_estimated_time` for MPH (#467)

* Enforce exact_match=True for get_estimated_time

* Ensure we're not searching for the actual extension

* Modify list_files to use "correct" syntax for exact_match=True and filter out non-.json

* Make workflow of getting bin file easier

* Remove the == 0

* Add processing.json files to test

* Fix bin file generation by simulating update only after .json file written (not immediately after .bin file written)

* Formatting to avoid going over 100 characters per line

* Add trailing comma to please black linter

* Print statement debugging

* See if an OSError is being thrown

* More debugging statements: need to see the MPH process

* Test the presence and contents of log_out

* See what the actual log path is being set to

* Try adding a simple logging statement at the beginning to force

* Try to set logging propagation to be False

* Fix pyproject.toml logging

* Additional changes to pytest pyproject.toml

* Lowercase true for pyproject.toml

* Attempt to force pytest fixture to write to log file

* Add logging import

* Remove whitespace

* Try to fix logging issue

* Overwrite pytest.txt with blank stuff

* More testing fixes

* Use the absolute path to the directory for accessing the log

* Adjust error message

* Don't print add_blank

* Check if the error is happening during the renaming phases

* Add some more tests in

* Actually get image_stitching to print out debug

* Pin watchdog to version 6

* Update lock file

* Add debug workflow to GitHub PR

* Remove the lock entirely

* Nuke file timer functionality

* Test deleting the lock AND the timer

* Try pushing old slow copy tissue data

* removed debug

* See if sleeping for 2 seconds will prevent the toffy segmentation error from popping up

* black formatting

* Massive test overhaul: see how many tests pass now

* Remove old _slow_copy_tissue... function

* Linting checks

* PYCODESTYLE

* Add fixes to temporary work around list_folders and list_files in rosetta

* Strict matching required between underscores: "pulse_height" will not match "pulse_heights_..."

* Fix list_folders in normalize_image.py to correctly list all FOVs, add TODO for fixing verify_same_elements edge case

* Fix list_folders in merge_partial_runs

* Formatting

* Remove print statements

* Remove debug comment

* Reset fov_watcher.py back to original

* More resetting

* Remove unnecessary try-except block in fov_watcher_test.py

* Never mind, add old try-except back in (but don't print watcher lag)

* Fix copy_image_files with bad list_folders calls removed

* ruff linting

---------

Co-authored-by: Alex Kong <[email protected]>
Co-authored-by: Sricharan Reddy Varra <[email protected]>
3 people authored Dec 4, 2024
1 parent 5f16d6f commit c31d51e
Showing 14 changed files with 1,917 additions and 1,603 deletions.
Empty file (×4)
3,350 changes: 1,798 additions & 1,552 deletions poetry.lock

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion pyproject.toml
@@ -44,7 +44,7 @@ numpy = "1.*"
 natsort = "^8"
 seaborn = "^0.13"
 scikit-learn = "^1"
-watchdog = "^3"
+watchdog = "^6"
 tqdm = "^4"
 scipy = "^1.10.1"
 pandas = "^2"
@@ -128,3 +128,10 @@ filterwarnings = [
 ]
 testpaths = ["tests"]
 norecursedirs = ["tests/utilities"]
+
+log_cli = true
+log_level = "INFO"
+log_file = "pytest.txt"
+log_file_level = "INFO"
+log_format = "%(asctime)s %(levelname)s %(message)s"
+log_date_format = "%Y-%m-%d %H:%M:%S"
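
These options route pytest's captured log records to pytest.txt at the repository root, which is what lets fov_watcher_test read its assertions back out of the log. A minimal sketch of the pattern (a hypothetical test, not part of this diff), assuming the configuration above:

import logging
from pathlib import Path


def test_log_file_roundtrip():
    # with log_file = "pytest.txt", pytest writes INFO-and-above records
    # to <rootdir>/pytest.txt while the suite runs
    logging.getLogger(__name__).info("non-zero file size...")

    # the log file sits at the repo root, one level above tests/
    log_path = Path(__file__).parents[1] / "pytest.txt"
    assert "non-zero file size..." in log_path.read_text()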
6 changes: 3 additions & 3 deletions src/toffy/mph_comp.py
@@ -24,12 +24,12 @@ def get_estimated_time(bin_file_dir, fov):
     io_utils.validate_paths(bin_file_dir)
 
     # get fov json file in bin_file_path
-    json_file = io_utils.list_files(bin_file_dir, fov + ".json")
-    if len(json_file) == 0:
+    json_file = os.path.join(bin_file_dir, f"{fov}.json")
+    if not os.path.exists(json_file):
         raise FileNotFoundError(f"The FOV name supplied doesn't have a JSON file: {fov}")
 
     # retrieve estimated time (frame dimensions x pixel dwell time)
-    run_metadata = read_json_file(os.path.join(bin_file_dir, json_file[0]), encoding="utf-8")
+    run_metadata = read_json_file(json_file, encoding="utf-8")
     try:
         size = run_metadata.get("frameSize")
         time = run_metadata.get("dwellTimeMillis")
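
Replacing io_utils.list_files with a direct os.path.exists check sidesteps substring matching entirely. A minimal sketch of the failure mode being avoided (hypothetical file names, mirroring the "_processing" siblings added to the test data):

# substring listing over-selects: both names contain the FOV prefix
files = ["fov-1-scan-1.json", "fov-1-scan-1_processing.json"]
matches = [f for f in files if "fov-1-scan-1" in f]
assert len(matches) == 2

# exact matching selects only the intended file; in the new
# get_estimated_time, a failed os.path.exists check raises FileNotFoundError
exact = [f for f in files if f == "fov-1-scan-1.json"]
assert exact == ["fov-1-scan-1.json"]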
7 changes: 6 additions & 1 deletion src/toffy/normalize.py
@@ -831,7 +831,9 @@ def normalize_image_data(
     # create normalization function for mapping MPH to counts
     norm_json = read_json_file(norm_func_path)
 
-    img_fovs = io_utils.list_folders(img_dir, "fov")
+    # TODO: list_folders does not handle cases such as "fov0" correctly
+    # need to add a fix in to alpineer to deal with this
+    img_fovs = [f for f in os.listdir(img_dir) if "fov" in f]
 
     norm_weights, norm_name = norm_json["weights"], norm_json["name"]
     norm_func = create_prediction_function(norm_name, norm_weights)
@@ -852,6 +854,9 @@
 
     # make sure FOVs used to construct tuning curve are same ones being normalized
     pulse_fovs = np.unique(pulse_height_df["fov"])
+
+    # TODO: verify_same_elements needs to throw a ValueError in the special case
+    # where one list is empty but the other isn't
     misc_utils.verify_same_elements(image_data_fovs=img_fovs, pulse_height_csv_files=pulse_fovs)
 
     # loop over each fov
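
The TODO above, and the matching ones in reorg.py and rosetta.py below, all trace back to the same substring-matching gap in alpineer's list_folders. As a sketch of the delimiter-aware matching the commit message calls for ("pulse_height" must not match "pulse_heights_..."), and an assumption about the eventual fix rather than code in this PR:

import os
import re


def list_folders_strict(base_dir, prefix):
    # hypothetical helper: accept the prefix only when followed by the end
    # of the name, a digit, or a separator, so "fov" matches "fov0" and
    # "fov-1-scan-1" while "pulse_height" rejects "pulse_heights_0"
    pattern = re.compile(rf"^{re.escape(prefix)}($|[\d_.-])")
    return [
        f
        for f in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, f)) and pattern.match(f)
    ]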
4 changes: 3 additions & 1 deletion src/toffy/reorg.py
@@ -20,7 +20,9 @@ def merge_partial_runs(cohort_dir, run_string):
     os.makedirs(output_folder)
 
     # get list of matching subfolders
-    partial_folders = io_utils.list_folders(cohort_dir, substrs=run_string)
+    # TODO: list_folders does not handle cases such as "run_dup2" correctly
+    # need to add a fix in to alpineer to deal with this
+    partial_folders = [f for f in os.listdir(cohort_dir) if run_string in f]
     partial_folders = [partial for partial in partial_folders if partial != run_string]
 
     if len(partial_folders) == 0:
10 changes: 7 additions & 3 deletions src/toffy/rosetta.py
@@ -288,7 +288,9 @@ def compensate_image_data(
     io_utils.validate_paths([raw_data_dir, comp_data_dir, comp_mat_path])
 
     # get list of all fovs
-    fovs = io_utils.list_folders(raw_data_dir, substrs="fov")
+    # TODO: list_folders does not handle cases such as "fov0" correctly
+    # need to add a fix in to alpineer to deal with this
+    fovs = [f for f in os.listdir(raw_data_dir) if "fov" in f]
 
     # load csv files
     comp_mat = pd.read_csv(comp_mat_path, index_col=0)
@@ -713,6 +715,8 @@ def create_rosetta_matrices(
     mult_matrix.to_csv(os.path.join(save_dir, comp_name))
 
 
+# TODO: anything with [f for f in os.listdir(...) ...] needs to be changed
+# after list_folders with substrs specified is fixed
 def copy_image_files(
     cohort_name, run_names, rosetta_testing_dir, extracted_imgs_dir, fovs_per_run=5
 ):
@@ -735,7 +739,7 @@ def copy_image_files(
     for run in run_names:
         if not os.path.exists(os.path.join(extracted_imgs_dir, run)):
             raise ValueError(f"{run} is not a valid run name found in {extracted_imgs_dir}")
-        fovs_in_run = io_utils.list_folders(os.path.join(extracted_imgs_dir, run), substrs="fov")
+        fovs_in_run = [f for f in os.listdir(os.path.join(extracted_imgs_dir, run)) if "fov" in f]
         # check number of fovs in each run
         if len(fovs_in_run) < fovs_per_run:
             small_runs.append(run)
@@ -765,7 +769,7 @@ def copy_image_files(
     for i, run in enumerate(ns.natsorted(run_names_process)):
         run_path = os.path.join(extracted_imgs_dir, run)
 
-        fovs_in_run = io_utils.list_folders(run_path, substrs="fov")
+        fovs_in_run = [f for f in os.listdir(run_path) if "fov" in f]
         fovs_in_run = ns.natsorted(fovs_in_run)
         rosetta_fovs = random.sample(fovs_in_run, k=fovs_per_run)
 
109 changes: 79 additions & 30 deletions tests/fov_watcher_test.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import shutil
 import tempfile
@@ -35,6 +36,11 @@
 SLOW_COPY_INTERVAL_S = 1
 
 
+def _reset_logging_file():
+    with open(os.path.join(Path(__file__).parents[1], "pytest.txt"), "w"):
+        pass
+
+
 def _slow_copy_sample_tissue_data(
     dest: str, delta: int = 10, one_blank: bool = False, temp_bin: bool = False
 ):
@@ -71,18 +77,25 @@ def _slow_copy_sample_tissue_data(
             os.rename(copied_tissue_path, os.path.join(dest, tissue_file))
         else:
             shutil.copy(tissue_path, dest)
-
-    # get all .bin files
-    bin_files = [bfile for bfile in sorted(os.listdir(COMBINED_DATA_PATH)) if ".bin" in bfile]
-
-    # simulate updating the creation time for some .bin files, this tests _check_bin_updates
-    for i, bfile in enumerate(bin_files):
-        if i % 2 == 0:
-            shutil.copy(
-                os.path.join(COMBINED_DATA_PATH, bfile), os.path.join(dest, bfile + ".temp")
-            )
-            os.remove(os.path.join(dest, bfile))
-            os.rename(os.path.join(dest, bfile + ".temp"), os.path.join(dest, bfile))
+        tissue_data = os.path.splitext(tissue_file)
+
+        # simulate a .bin file update to test _check_bin_updates
+        if tissue_data[1] == ".json" and "_processing" not in tissue_data[0]:
+            # ensure a sleep so the update doesn't interfere with an existing extraction
+            time.sleep(2)
+            bin_file_name = tissue_data[0] + ".bin"
+
+            # make sure only to update non-blank bin files
+            if os.path.getsize(os.path.join(dest, bin_file_name)) != 0:
+                shutil.copy(
+                    os.path.join(COMBINED_DATA_PATH, bin_file_name),
+                    os.path.join(dest, bin_file_name + ".temp"),
+                )
+                os.remove(os.path.join(dest, bin_file_name))
+                os.rename(
+                    os.path.join(dest, bin_file_name + ".temp"),
+                    os.path.join(dest, bin_file_name),
+                )
 
 
 COMBINED_RUN_JSON_SPOOF = {
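
Triggering the simulated update off the .json write matters because of how the watcher defines a finished FOV. A sketch of the completeness rule the simulation relies on (an assumption about fov_watcher's behaviour, not code from this diff): a FOV only counts as done once both its .bin and its .json exist, so re-copying the .bin after the .json has landed exercises _check_bin_updates rather than the initial extraction path.

import os


def fov_is_complete(dest, fov_name):
    # hypothetical predicate: data file and metadata must both be present
    return all(
        os.path.exists(os.path.join(dest, fov_name + ext)) for ext in (".bin", ".json")
    )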
@@ -255,7 +268,6 @@ def test_watcher(
     add_blank,
     temp_bin,
 ):
-    print("The watcher start lag is: %d" % watcher_start_lag)
     try:
         with tempfile.TemporaryDirectory() as tmpdir:
             tiff_out_dir = os.path.join(tmpdir, "cb_0", RUN_DIR_NAME)
@@ -371,9 +383,13 @@
 
             res_scan.get()
 
-            with open(os.path.join(log_out, "test_run_log.txt")) as f:
+            with open(os.path.join(Path(__file__).parents[1], "pytest.txt")) as f:
                 logtxt = f.read()
-            assert add_blank == ("non-zero file size..." in logtxt)
+            try:
+                assert add_blank == ("non-zero file size..." in logtxt)
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
             fovs = [
                 bin_file.split(".")[0]
@@ -390,32 +406,65 @@
             fovs = fovs[1:]
 
             # extract tiffs check
-            print("TIFF validator check")
             validators[0](os.path.join(tmpdir, "cb_0", RUN_DIR_NAME), fovs, bad_fovs)
             if kwargs["extract_prof"]:
-                validators[0](
-                    os.path.join(tmpdir, "cb_0", RUN_DIR_NAME + "_proficient"), fovs, bad_fovs
-                )
+                try:
+                    validators[0](
+                        os.path.join(tmpdir, "cb_0", RUN_DIR_NAME + "_proficient"), fovs, bad_fovs
+                    )
+                except AssertionError as e:
+                    _reset_logging_file()
+                    raise AssertionError(e)
             else:
-                assert not os.path.exists(
-                    os.path.join(tmpdir, "cb_0", RUN_DIR_NAME) + "_proficient"
-                )
+                try:
+                    assert not os.path.exists(
+                        os.path.join(tmpdir, "cb_0", RUN_DIR_NAME) + "_proficient"
+                    )
+                except AssertionError as e:
+                    _reset_logging_file()
+                    raise AssertionError(e)
 
             # qc check
-            validators[1](os.path.join(tmpdir, "cb_1", RUN_DIR_NAME), fovs, bad_fovs)
-            print("QC check")
+            try:
+                validators[1](os.path.join(tmpdir, "cb_1", RUN_DIR_NAME), fovs, bad_fovs)
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
             # mph check
-            validators[2](
-                os.path.join(tmpdir, "cb_2", RUN_DIR_NAME),
-                os.path.join(tmpdir, "cb_2_plots", RUN_DIR_NAME),
-                fovs,
-                bad_fovs,
-            )
-            print("MPH check")
+            try:
+                validators[2](
+                    os.path.join(tmpdir, "cb_2", RUN_DIR_NAME),
+                    os.path.join(tmpdir, "cb_2_plots", RUN_DIR_NAME),
+                    fovs,
+                    bad_fovs,
+                )
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
             # stitch images check
-            validators[3](os.path.join(tmpdir, "cb_0", RUN_DIR_NAME, f"{RUN_DIR_NAME}_stitched"))
-            print("Stitch images check")
+            try:
+                validators[3](
+                    os.path.join(tmpdir, "cb_0", RUN_DIR_NAME, f"{RUN_DIR_NAME}_stitched")
+                )
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
             # pulse heights check
-            validators[4](os.path.join(tmpdir, "cb_3", RUN_DIR_NAME), fovs, bad_fovs)
-            print("Pulse heights check")
+            try:
+                validators[4](os.path.join(tmpdir, "cb_3", RUN_DIR_NAME), fovs, bad_fovs)
+            except AssertionError as e:
+                _reset_logging_file()
+                raise AssertionError(e)
 
+            _reset_logging_file()
+
     except OSError:
         warnings.warn("Temporary file cleanup was incomplete.")
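
Each validator call above repeats the except AssertionError, _reset_logging_file(), re-raise pattern so the shared pytest.txt is truncated even when a check fails. A more compact sketch of the same guarantee (a hypothetical refactor, not what this PR does) would hang the cleanup on a fixture finalizer:

import pytest


@pytest.fixture
def clean_log_file():
    # hypothetical fixture: run the test, then truncate pytest.txt
    # whether it passed or failed
    yield
    _reset_logging_file()

test_watcher would then request clean_log_file and drop the per-check try/except blocks.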
12 changes: 6 additions & 6 deletions tests/normalize_test.py
@@ -169,9 +169,9 @@ def test_combine_run_metrics(metrics, warn_overwrite_test):
             name_prof, values_df_prof = metric[0], pd.DataFrame(metric[1])
             values_df_prof.to_csv(os.path.join(temp_dir, name_prof), index=False)
 
-        normalize.combine_run_metrics(temp_dir, "pulse_height")
+        normalize.combine_run_metrics(temp_dir, "pulse_heights")
 
-        combined_data = pd.read_csv(os.path.join(temp_dir, "pulse_height_combined.csv"))
+        combined_data = pd.read_csv(os.path.join(temp_dir, "pulse_heights_combined.csv"))
 
         assert np.array_equal(combined_data.columns, ["pulse_height", "mass", "fov"])
         assert len(combined_data) == len(metrics["deficient"]) * 10
@@ -180,24 +180,24 @@ def test_combine_run_metrics(metrics, warn_overwrite_test):
         # NOTE: only if warn_overwrite turned on
         if warn_overwrite_test:
            with pytest.warns(UserWarning, match="previously generated"):
-                normalize.combine_run_metrics(temp_dir, "pulse_height", warn_overwrite_test)
+                normalize.combine_run_metrics(temp_dir, "pulse_heights", warn_overwrite_test)
         else:
-            normalize.combine_run_metrics(temp_dir, "pulse_height", warn_overwrite_test)
+            normalize.combine_run_metrics(temp_dir, "pulse_heights", warn_overwrite_test)
 
         # check that files with different lengths raises error
         name, bad_vals = metrics["deficient"][0][0], pd.DataFrame(metrics["deficient"][0][1])
         bad_vals = bad_vals.loc[0:5, :]
         bad_vals.to_csv(os.path.join(temp_dir, name), index=False)
 
         with pytest.raises(ValueError, match="files are the same length"):
-            normalize.combine_run_metrics(temp_dir, "pulse_height")
+            normalize.combine_run_metrics(temp_dir, "pulse_heights")
 
         # empty directory raises error
         empty_dir = os.path.join(temp_dir, "empty")
         os.makedirs(empty_dir)
 
         with pytest.raises(ValueError, match="No files"):
-            normalize.combine_run_metrics(empty_dir, "pulse_height")
+            normalize.combine_run_metrics(empty_dir, "pulse_heights")
 
 
 @parametrize_with_cases("dir_names, mph_dfs, count_dfs", test_cases.TuningCurveFiles)
12 changes: 7 additions & 5 deletions tests/rosetta_test.py
@@ -360,8 +360,8 @@ def test_compensate_image_data(
 
             for folder in format_folders:
                 # check that all files were created
-                output_files = io_utils.list_files(os.path.join(output_dir, fovs[0], folder), ".tif")
-                output_files = [chan.split(".tif")[0] for chan in output_files]
+                output_files = io_utils.list_files(os.path.join(output_dir, fovs[0], folder), ".tiff")
+                output_files = [chan.split(".tiff")[0] for chan in output_files]
 
                 if output_masses is None or len(output_masses) == 3:
                     assert set(output_files) == set(panel_info["Target"].values)
@@ -752,6 +752,8 @@ def mock_img_size(run_dir, fov_list=None):
     return sizes[run]
 
 
+# TODO: anything with [f for f in os.listdir(...) ...] needs to be changed
+# after list_folders with substrs specified is fixed
 def test_copy_image_files(mocker):
     mocker.patch("toffy.image_stitching.get_max_img_size", mock_img_size)
 
@@ -791,7 +793,7 @@ def test_copy_image_files(mocker):
         extracted_fov_dir = os.path.join(temp_dir2, "cohort_name", "extracted_images")
         assert len(io_utils.list_folders(extracted_fov_dir)) == 12
         for i in range(1, 4):
-            assert len(list(io_utils.list_folders(extracted_fov_dir, f"run_{i}"))) == 4
+            assert len([f for f in os.listdir(extracted_fov_dir) if f"run_{i}" in f]) == 4
         assert len(list(io_utils.list_folders(extracted_fov_dir, "stitched_images"))) == 0
 
         # check that files in fov folders are copied
@@ -808,8 +810,8 @@ def test_copy_image_files(mocker):
         # check that correct total and per run fovs are copied, assert run 3 didn't get copied
         assert len(io_utils.list_folders(extracted_fov_dir)) == 10
         for i in range(1, 3):
-            assert len(io_utils.list_folders(extracted_fov_dir, f"run_{i}")) == 5
-        assert len(io_utils.list_folders(extracted_fov_dir, "run_3")) == 0
+            assert len([f for f in os.listdir(extracted_fov_dir) if f"run_{i}" in f]) == 5
+        assert len([f for f in os.listdir(extracted_fov_dir) if "run_3" in f]) == 0
 
         # check that files in fov folders are copied
         for folder in io_utils.list_folders(extracted_fov_dir):
1 change: 0 additions & 1 deletion tests/utils/test_utils.py
@@ -19,7 +19,6 @@
 from toffy.settings import QC_COLUMNS, QC_SUFFIXES
 
 TEST_CHANNELS = [
-    "Calprotectin",
     "Chymase",
     "SMA",
     "Vimentin",
