From f63a9edce2bede0fe2ed1e95720c3f910a07ee95 Mon Sep 17 00:00:00 2001 From: michaelbornholdt Date: Thu, 14 Oct 2021 10:33:20 -0400 Subject: [PATCH 1/5] Quick fix to precision recall --- cytominer_eval/evaluate.py | 7 +++++++ cytominer_eval/operations/precision_recall.py | 14 +++++++++----- .../test_operations/test_precision_recall.py | 18 ++++++++++-------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py index 166cdfb..48286b4 100644 --- a/cytominer_eval/evaluate.py +++ b/cytominer_eval/evaluate.py @@ -23,6 +23,7 @@ def evaluate( features: List[str], meta_features: List[str], replicate_groups: Union[List[str], dict], + precision_recall_groupby_columns: List[str], operation: str = "replicate_reproducibility", similarity_metric: str = "pearson", replicate_reproducibility_quantile: float = 0.95, @@ -32,6 +33,7 @@ def evaluate( grit_replicate_summary_method: str = "mean", mp_value_params: dict = {}, enrichment_percentile: Union[float, List[float]] = 0.99, + ): r"""Evaluate profile quality and strength. @@ -63,6 +65,10 @@ def evaluate( guides targeting the same genes. See also :py:func:`cytominer_eval.operations.grit` and :py:func:`cytominer_eval.transform.util.check_replicate_groups`. + precision_recall_groupby_columns : List of str + Only used for precision_recall + Column by which the sim mat is grouped and by which the precision is calculated. + For example, if groupby_column = Metadata_sample then the precision recall is calculated for each sample. operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional The specific evaluation metric to calculate. The default is "replicate_reproducibility". @@ -130,6 +136,7 @@ def evaluate( metric_result = precision_recall( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, + groupby_columns=precision_recall_groupby_columns, k=precision_recall_k, ) elif operation == "grit": diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py index 248ee74..71b4258 100644 --- a/cytominer_eval/operations/precision_recall.py +++ b/cytominer_eval/operations/precision_recall.py @@ -14,6 +14,7 @@ def precision_recall( similarity_melted_df: pd.DataFrame, replicate_groups: List[str], + groupby_columns: List[str], k: Union[int, List[int]], ) -> pd.DataFrame: """Determine the precision and recall at k for all unique replicate groups @@ -28,13 +29,16 @@ def precision_recall( replicate_groups : List a list of metadata column names in the original profile dataframe to use as replicate columns. + groupby_columns : List of str + column by which the sim mat is grouped and by which the precision is calculated. + For example, if groupby_column = Metadata_sample then the precision recall is calculated for each sample. k : List of ints or int an integer indicating how many pairwise comparisons to threshold. Returns ------- pandas.DataFrame - precision and recall metrics for all replicate groups given k + precision and recall metrics for all groupby_column groups given k """ # Determine pairwise replicates and make sure to sort based on the metric! similarity_melted_df = assign_replicates( @@ -46,9 +50,9 @@ def precision_recall( # Extract out specific columns pair_ids = set_pair_ids() - replicate_group_cols = [ + groupby_cols_suffix = [ "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"]) - for x in replicate_groups + for x in groupby_columns ] # iterate over all k precision_recall_df = pd.DataFrame() @@ -57,11 +61,11 @@ def precision_recall( for k_ in k: # Calculate precision and recall for all groups precision_recall_df_at_k = similarity_melted_df.groupby( - replicate_group_cols + groupby_cols_suffix ).apply(lambda x: calculate_precision_recall(x, k=k_)) precision_recall_df = precision_recall_df.append(precision_recall_df_at_k) # Rename the columns back to the replicate groups provided - rename_cols = dict(zip(replicate_group_cols, replicate_groups)) + rename_cols = dict(zip(groupby_cols_suffix, groupby_columns)) return precision_recall_df.reset_index().rename(rename_cols, axis="columns") diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py index 75f0bc7..26b852b 100644 --- a/cytominer_eval/tests/test_operations/test_precision_recall.py +++ b/cytominer_eval/tests/test_operations/test_precision_recall.py @@ -1,15 +1,14 @@ import os import random -import pytest import pathlib import tempfile -import numpy as np import pandas as pd + from cytominer_eval.transform import metric_melt from cytominer_eval.operations import precision_recall -random.seed(123) +random.seed(42) tmpdir = tempfile.gettempdir() # Load CRISPR dataset @@ -37,32 +36,35 @@ replicate_groups = ["Metadata_gene_name", "Metadata_cell_line"] +groupby_columns = ['Metadata_pert_name', 'Image_Metadata_Well'] def test_precision_recall(): result_list = precision_recall( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, + groupby_columns=groupby_columns, k=[5, 10], ) result_int = precision_recall( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, + groupby_columns=groupby_columns, k=5, ) assert len(result_list.k.unique()) == 2 assert result_list.k.unique()[0] == 5 - # ITGAV has a really strong profile + # ITGAV-1 has a really strong profile assert ( result_list.sort_values(by="recall", ascending=False) .reset_index(drop=True) .iloc[0, :] - .Metadata_gene_name - == "ITGAV" + .Metadata_pert_name + == "ITGAV-2" ) - assert all(x in result_list.columns for x in replicate_groups) + assert all(x in result_list.columns for x in groupby_columns) - assert result_int.equals(result_list.query("k == 5")) + assert result_int.equals(result_list.query("k == 5")) \ No newline at end of file From cdf362fd72469562dd5f8075ac867ee7049230bb Mon Sep 17 00:00:00 2001 From: michaelbornholdt Date: Thu, 14 Oct 2021 15:12:40 -0400 Subject: [PATCH 2/5] Fixed test_eval --- cytominer_eval/evaluate.py | 12 ++++++------ cytominer_eval/operations/precision_recall.py | 10 ++++++---- cytominer_eval/tests/test_evaluate.py | 5 +++++ .../tests/test_operations/test_precision_recall.py | 8 +++++--- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py index 48286b4..8767f1a 100644 --- a/cytominer_eval/evaluate.py +++ b/cytominer_eval/evaluate.py @@ -23,8 +23,8 @@ def evaluate( features: List[str], meta_features: List[str], replicate_groups: Union[List[str], dict], - precision_recall_groupby_columns: List[str], operation: str = "replicate_reproducibility", + groupby_columns: List[str] = "Metadata_moa", similarity_metric: str = "pearson", replicate_reproducibility_quantile: float = 0.95, replicate_reproducibility_return_median_cor: bool = False, @@ -65,13 +65,13 @@ def evaluate( guides targeting the same genes. See also :py:func:`cytominer_eval.operations.grit` and :py:func:`cytominer_eval.transform.util.check_replicate_groups`. - precision_recall_groupby_columns : List of str - Only used for precision_recall - Column by which the sim mat is grouped and by which the precision is calculated. - For example, if groupby_column = Metadata_sample then the precision recall is calculated for each sample. operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional The specific evaluation metric to calculate. The default is "replicate_reproducibility". + groupby_columns : List of str + Only used for operation = 'precision_recall' and 'hit@k' + Column by which the similarity matrix is grouped and by which the operation is calculated. + For example, if groupby_column = "Metadata_broad_sample" then precision/recall is calculated for each sample. similarity_metric: {'pearson', 'spearman', 'kendall'}, optional How to calculate pairwise similarity. Defaults to "pearson". We use the input in pandas.DataFrame.cor(). The default is "pearson". @@ -136,7 +136,7 @@ def evaluate( metric_result = precision_recall( similarity_melted_df=similarity_melted_df, replicate_groups=replicate_groups, - groupby_columns=precision_recall_groupby_columns, + groupby_columns=groupby_columns, k=precision_recall_k, ) elif operation == "grit": diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py index 71b4258..82ccb2b 100644 --- a/cytominer_eval/operations/precision_recall.py +++ b/cytominer_eval/operations/precision_recall.py @@ -27,11 +27,13 @@ def precision_recall( samples. Importantly, it must follow the exact structure as output from :py:func:`cytominer_eval.transform.transform.metric_melt`. replicate_groups : List - a list of metadata column names in the original profile dataframe to use as - replicate columns. + a list of metadata column names in the original profile dataframe to use as replicate columns. groupby_columns : List of str - column by which the sim mat is grouped and by which the precision is calculated. - For example, if groupby_column = Metadata_sample then the precision recall is calculated for each sample. + Column by which the similarity matrix is grouped and by which the precision/recall is calculated. + For example, if groupby_column = Metadata_sample then the precision is calculated for each sample. + Calculating the precision by sample is the default + but it is mathematically not incorrect to calculate the precision at the MOA level. + This is just less intuitive to understand. k : List of ints or int an integer indicating how many pairwise comparisons to threshold. diff --git a/cytominer_eval/tests/test_evaluate.py b/cytominer_eval/tests/test_evaluate.py index d972398..8323161 100644 --- a/cytominer_eval/tests/test_evaluate.py +++ b/cytominer_eval/tests/test_evaluate.py @@ -134,6 +134,9 @@ def test_evaluate_precision_recall(): }, } + gene_groupby_columns = ['Metadata_pert_name'] + compound_groupby_columns = ['Metadata_broad_sample'] + for k in ks: # first test the function with k = float, later we test with k = list of floats @@ -142,6 +145,7 @@ def test_evaluate_precision_recall(): features=gene_features, meta_features=gene_meta_features, replicate_groups=gene_groups, + groupby_columns=gene_groupby_columns, operation="precision_recall", similarity_metric="pearson", precision_recall_k=k, @@ -161,6 +165,7 @@ def test_evaluate_precision_recall(): features=compound_features, meta_features=compound_meta_features, replicate_groups=["Metadata_broad_sample"], + groupby_columns=compound_groupby_columns, operation="precision_recall", similarity_metric="pearson", precision_recall_k=[k], diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py index 26b852b..3273112 100644 --- a/cytominer_eval/tests/test_operations/test_precision_recall.py +++ b/cytominer_eval/tests/test_operations/test_precision_recall.py @@ -4,6 +4,8 @@ import tempfile import pandas as pd +# import sys +# sys.path.insert(0, "/Users/mbornhol/git/mycyto/cytominer-eval") from cytominer_eval.transform import metric_melt from cytominer_eval.operations import precision_recall @@ -36,7 +38,7 @@ replicate_groups = ["Metadata_gene_name", "Metadata_cell_line"] -groupby_columns = ['Metadata_pert_name', 'Image_Metadata_Well'] +groupby_columns = ['Metadata_pert_name'] def test_precision_recall(): result_list = precision_recall( @@ -62,9 +64,9 @@ def test_precision_recall(): .reset_index(drop=True) .iloc[0, :] .Metadata_pert_name - == "ITGAV-2" + == "ITGAV-1" ) assert all(x in result_list.columns for x in groupby_columns) - assert result_int.equals(result_list.query("k == 5")) \ No newline at end of file + assert result_int.equals(result_list.query("k == 5")) From 2f52623c711861d88ab7177dde26b59436d37851 Mon Sep 17 00:00:00 2001 From: michaelbornholdt Date: Thu, 14 Oct 2021 15:29:45 -0400 Subject: [PATCH 3/5] Formatting --- cytominer_eval/operations/precision_recall.py | 2 +- cytominer_eval/tests/test_evaluate.py | 18 +++++------------- .../test_operations/test_precision_recall.py | 3 ++- 3 files changed, 8 insertions(+), 15 deletions(-) diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py index 82ccb2b..9fb2565 100644 --- a/cytominer_eval/operations/precision_recall.py +++ b/cytominer_eval/operations/precision_recall.py @@ -17,7 +17,7 @@ def precision_recall( groupby_columns: List[str], k: Union[int, List[int]], ) -> pd.DataFrame: - """Determine the precision and recall at k for all unique replicate groups + """Determine the precision and recall at k for all unique groupby_columns samples based on a predefined similarity metric (see cytominer_eval.transform.metric_melt) Parameters diff --git a/cytominer_eval/tests/test_evaluate.py b/cytominer_eval/tests/test_evaluate.py index 8323161..fd51926 100644 --- a/cytominer_eval/tests/test_evaluate.py +++ b/cytominer_eval/tests/test_evaluate.py @@ -113,11 +113,7 @@ def test_evaluate_replicate_reprod_return_cor_true(): assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949 assert sorted(med_cor_df.columns.tolist()) == sorted( - [ - "Metadata_gene_name", - "Metadata_pert_name", - "similarity_metric", - ] + ["Metadata_gene_name", "Metadata_pert_name", "similarity_metric",] ) @@ -134,8 +130,8 @@ def test_evaluate_precision_recall(): }, } - gene_groupby_columns = ['Metadata_pert_name'] - compound_groupby_columns = ['Metadata_broad_sample'] + gene_groupby_columns = ["Metadata_pert_name"] + compound_groupby_columns = ["Metadata_broad_sample"] for k in ks: @@ -213,9 +209,7 @@ def test_evaluate_grit(): top_result = ( grit_results_df.sort_values(by="grit", ascending=False) .reset_index(drop=True) - .iloc[ - 0, - ] + .iloc[0,] ) assert np.round(top_result.grit, 4) == 2.3352 assert top_result.group == "PTK2" @@ -241,9 +235,7 @@ def test_evaluate_grit(): top_result = ( grit_results_df.sort_values(by="grit", ascending=False) .reset_index(drop=True) - .iloc[ - 0, - ] + .iloc[0,] ) assert np.round(top_result.grit, 4) == 0.9990 diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py index 3273112..29edd6b 100644 --- a/cytominer_eval/tests/test_operations/test_precision_recall.py +++ b/cytominer_eval/tests/test_operations/test_precision_recall.py @@ -38,7 +38,8 @@ replicate_groups = ["Metadata_gene_name", "Metadata_cell_line"] -groupby_columns = ['Metadata_pert_name'] +groupby_columns = ["Metadata_pert_name"] + def test_precision_recall(): result_list = precision_recall( From 1834ffb7df20e450359651f9fb2272c6bcc4b636 Mon Sep 17 00:00:00 2001 From: michaelbornholdt Date: Fri, 15 Oct 2021 14:53:46 -0400 Subject: [PATCH 4/5] delete leftover --- cytominer_eval/tests/test_operations/test_precision_recall.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py index 29edd6b..b3a20a4 100644 --- a/cytominer_eval/tests/test_operations/test_precision_recall.py +++ b/cytominer_eval/tests/test_operations/test_precision_recall.py @@ -1,17 +1,13 @@ import os import random import pathlib -import tempfile import pandas as pd -# import sys -# sys.path.insert(0, "/Users/mbornhol/git/mycyto/cytominer-eval") from cytominer_eval.transform import metric_melt from cytominer_eval.operations import precision_recall random.seed(42) -tmpdir = tempfile.gettempdir() # Load CRISPR dataset example_file = "SQ00014610_normalized_feature_select.csv.gz" From 212e2a5b1aa19da5cf8539dacd4cf2636f92a9b8 Mon Sep 17 00:00:00 2001 From: michaelbornholdt Date: Fri, 15 Oct 2021 16:10:30 -0400 Subject: [PATCH 5/5] Readme update --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 44ceda1..3381914 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,7 @@ Currently, five metric operations are supported: 3. mp-value 4. Grit 5. Enrichment +6. Hit@k ## Demos