From f63a9edce2bede0fe2ed1e95720c3f910a07ee95 Mon Sep 17 00:00:00 2001
From: michaelbornholdt <michael.bornholdt@outlook.com>
Date: Thu, 14 Oct 2021 10:33:20 -0400
Subject: [PATCH 1/5] Quick fix to precision recall

---
 cytominer_eval/evaluate.py                     |  7 +++++++
 cytominer_eval/operations/precision_recall.py  | 14 +++++++++-----
 .../test_operations/test_precision_recall.py   | 18 ++++++++++--------
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py
index 166cdfb..48286b4 100644
--- a/cytominer_eval/evaluate.py
+++ b/cytominer_eval/evaluate.py
@@ -23,6 +23,7 @@ def evaluate(
     features: List[str],
     meta_features: List[str],
     replicate_groups: Union[List[str], dict],
+    precision_recall_groupby_columns: List[str],
     operation: str = "replicate_reproducibility",
     similarity_metric: str = "pearson",
     replicate_reproducibility_quantile: float = 0.95,
@@ -32,6 +33,7 @@ def evaluate(
     grit_replicate_summary_method: str = "mean",
     mp_value_params: dict = {},
     enrichment_percentile: Union[float, List[float]] = 0.99,
+
 ):
     r"""Evaluate profile quality and strength.
 
@@ -63,6 +65,10 @@ def evaluate(
         guides targeting the same genes. See also
         :py:func:`cytominer_eval.operations.grit` and
         :py:func:`cytominer_eval.transform.util.check_replicate_groups`.
+    precision_recall_groupby_columns : List of str
+        Only used for precision_recall
+        Column by which the sim mat is grouped and by which the precision is calculated.
+        For example, if groupby_column = Metadata_sample then the precision recall is calculated for each sample.
     operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional
         The specific evaluation metric to calculate. The default is
         "replicate_reproducibility".
@@ -130,6 +136,7 @@ def evaluate(
         metric_result = precision_recall(
             similarity_melted_df=similarity_melted_df,
             replicate_groups=replicate_groups,
+            groupby_columns=precision_recall_groupby_columns,
             k=precision_recall_k,
         )
     elif operation == "grit":
diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py
index 248ee74..71b4258 100644
--- a/cytominer_eval/operations/precision_recall.py
+++ b/cytominer_eval/operations/precision_recall.py
@@ -14,6 +14,7 @@
 def precision_recall(
     similarity_melted_df: pd.DataFrame,
     replicate_groups: List[str],
+    groupby_columns: List[str],
     k: Union[int, List[int]],
 ) -> pd.DataFrame:
     """Determine the precision and recall at k for all unique replicate groups
@@ -28,13 +29,16 @@ def precision_recall(
     replicate_groups : List
         a list of metadata column names in the original profile dataframe to use as
         replicate columns.
+    groupby_columns : List of str
+        column by which the sim mat is grouped and by which the precision is calculated.
+        For example, if groupby_column = Metadata_sample then the precision recall is calculated for each sample.
     k : List of ints or int
         an integer indicating how many pairwise comparisons to threshold.
 
     Returns
     -------
     pandas.DataFrame
-        precision and recall metrics for all replicate groups given k
+        precision and recall metrics for all groupby_columns groups given k
     """
     # Determine pairwise replicates and make sure to sort based on the metric!
     similarity_melted_df = assign_replicates(
@@ -46,9 +50,9 @@ def precision_recall(
 
     # Extract out specific columns
     pair_ids = set_pair_ids()
-    replicate_group_cols = [
+    groupby_cols_suffix = [
         "{x}{suf}".format(x=x, suf=pair_ids[list(pair_ids)[0]]["suffix"])
-        for x in replicate_groups
+        for x in groupby_columns
     ]
     # iterate over all k
     precision_recall_df = pd.DataFrame()
@@ -57,11 +61,11 @@ def precision_recall(
     for k_ in k:
         # Calculate precision and recall for all groups
         precision_recall_df_at_k = similarity_melted_df.groupby(
-            replicate_group_cols
+            groupby_cols_suffix
         ).apply(lambda x: calculate_precision_recall(x, k=k_))
         precision_recall_df = precision_recall_df.append(precision_recall_df_at_k)
 
     # Rename the columns back to the replicate groups provided
-    rename_cols = dict(zip(replicate_group_cols, replicate_groups))
+    rename_cols = dict(zip(groupby_cols_suffix, groupby_columns))
 
     return precision_recall_df.reset_index().rename(rename_cols, axis="columns")
diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py
index 75f0bc7..26b852b 100644
--- a/cytominer_eval/tests/test_operations/test_precision_recall.py
+++ b/cytominer_eval/tests/test_operations/test_precision_recall.py
@@ -1,15 +1,14 @@
 import os
 import random
-import pytest
 import pathlib
 import tempfile
-import numpy as np
 import pandas as pd
 
+
 from cytominer_eval.transform import metric_melt
 from cytominer_eval.operations import precision_recall
 
-random.seed(123)
+random.seed(42)
 tmpdir = tempfile.gettempdir()
 
 # Load CRISPR dataset
@@ -37,32 +36,35 @@
 
 replicate_groups = ["Metadata_gene_name", "Metadata_cell_line"]
 
+groupby_columns = ['Metadata_pert_name', 'Image_Metadata_Well']
 
 def test_precision_recall():
     result_list = precision_recall(
         similarity_melted_df=similarity_melted_df,
         replicate_groups=replicate_groups,
+        groupby_columns=groupby_columns,
         k=[5, 10],
     )
 
     result_int = precision_recall(
         similarity_melted_df=similarity_melted_df,
         replicate_groups=replicate_groups,
+        groupby_columns=groupby_columns,
         k=5,
     )
 
     assert len(result_list.k.unique()) == 2
     assert result_list.k.unique()[0] == 5
 
-    # ITGAV has a really strong profile
+    # ITGAV-1 has a really strong profile
     assert (
         result_list.sort_values(by="recall", ascending=False)
         .reset_index(drop=True)
         .iloc[0, :]
-        .Metadata_gene_name
-        == "ITGAV"
+        .Metadata_pert_name
+        == "ITGAV-2"
     )
 
-    assert all(x in result_list.columns for x in replicate_groups)
+    assert all(x in result_list.columns for x in groupby_columns)
 
-    assert result_int.equals(result_list.query("k == 5"))
+    assert result_int.equals(result_list.query("k == 5"))
\ No newline at end of file

From cdf362fd72469562dd5f8075ac867ee7049230bb Mon Sep 17 00:00:00 2001
From: michaelbornholdt <michael.bornholdt@outlook.com>
Date: Thu, 14 Oct 2021 15:12:40 -0400
Subject: [PATCH 2/5] Fixed test_eval

---
 cytominer_eval/evaluate.py                           | 12 ++++++------
 cytominer_eval/operations/precision_recall.py        | 10 ++++++----
 cytominer_eval/tests/test_evaluate.py                |  5 +++++
 .../tests/test_operations/test_precision_recall.py   |  8 +++++---
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/cytominer_eval/evaluate.py b/cytominer_eval/evaluate.py
index 48286b4..8767f1a 100644
--- a/cytominer_eval/evaluate.py
+++ b/cytominer_eval/evaluate.py
@@ -23,8 +23,8 @@ def evaluate(
     features: List[str],
     meta_features: List[str],
     replicate_groups: Union[List[str], dict],
-    precision_recall_groupby_columns: List[str],
     operation: str = "replicate_reproducibility",
+    groupby_columns: List[str] = ["Metadata_moa"],
     similarity_metric: str = "pearson",
     replicate_reproducibility_quantile: float = 0.95,
     replicate_reproducibility_return_median_cor: bool = False,
@@ -65,13 +65,13 @@ def evaluate(
         guides targeting the same genes. See also
         :py:func:`cytominer_eval.operations.grit` and
         :py:func:`cytominer_eval.transform.util.check_replicate_groups`.
-    precision_recall_groupby_columns : List of str
-        Only used for precision_recall
-        Column by which the sim mat is grouped and by which the precision is calculated.
-        For example, if groupby_column = Metadata_sample then the precision recall is calculated for each sample.
     operation : {'replicate_reproducibility', 'precision_recall', 'grit', 'mp_value'}, optional
         The specific evaluation metric to calculate. The default is
         "replicate_reproducibility".
+    groupby_columns : List of str, optional
+        Only used when operation is 'precision_recall' or 'hit@k'.
+        Columns by which the similarity matrix is grouped; the operation is calculated once per group.
+        For example, if groupby_columns = ["Metadata_broad_sample"] then precision/recall is calculated for each sample.
     similarity_metric: {'pearson', 'spearman', 'kendall'}, optional
         How to calculate pairwise similarity. Defaults to "pearson". We use the input
         in pandas.DataFrame.cor(). The default is "pearson".
@@ -136,7 +136,7 @@ def evaluate(
         metric_result = precision_recall(
             similarity_melted_df=similarity_melted_df,
             replicate_groups=replicate_groups,
-            groupby_columns=precision_recall_groupby_columns,
+            groupby_columns=groupby_columns,
             k=precision_recall_k,
         )
     elif operation == "grit":
diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py
index 71b4258..82ccb2b 100644
--- a/cytominer_eval/operations/precision_recall.py
+++ b/cytominer_eval/operations/precision_recall.py
@@ -27,11 +27,13 @@ def precision_recall(
         samples. Importantly, it must follow the exact structure as output from
         :py:func:`cytominer_eval.transform.transform.metric_melt`.
     replicate_groups : List
-        a list of metadata column names in the original profile dataframe to use as
-        replicate columns.
+        a list of metadata column names in the original profile dataframe to use as replicate columns.
     groupby_columns : List of str
-        column by which the sim mat is grouped and by which the precision is calculated.
-        For example, if groupby_column = Metadata_sample then the precision recall is calculated for each sample.
+        Columns by which the similarity matrix is grouped and by which precision/recall is calculated.
+        For example, if groupby_columns = ["Metadata_sample"] then the precision is calculated for each sample.
+        Calculating the precision per sample is the conventional choice,
+        but it is also mathematically valid to calculate the precision at the MOA level;
+        the result is just less intuitive to interpret.
     k : List of ints or int
         an integer indicating how many pairwise comparisons to threshold.
 
diff --git a/cytominer_eval/tests/test_evaluate.py b/cytominer_eval/tests/test_evaluate.py
index d972398..8323161 100644
--- a/cytominer_eval/tests/test_evaluate.py
+++ b/cytominer_eval/tests/test_evaluate.py
@@ -134,6 +134,9 @@ def test_evaluate_precision_recall():
         },
     }
 
+    gene_groupby_columns = ['Metadata_pert_name']
+    compound_groupby_columns = ['Metadata_broad_sample']
+
     for k in ks:
 
         # first test the function with k = float, later we test with k = list of floats
@@ -142,6 +145,7 @@ def test_evaluate_precision_recall():
             features=gene_features,
             meta_features=gene_meta_features,
             replicate_groups=gene_groups,
+            groupby_columns=gene_groupby_columns,
             operation="precision_recall",
             similarity_metric="pearson",
             precision_recall_k=k,
@@ -161,6 +165,7 @@ def test_evaluate_precision_recall():
             features=compound_features,
             meta_features=compound_meta_features,
             replicate_groups=["Metadata_broad_sample"],
+            groupby_columns=compound_groupby_columns,
             operation="precision_recall",
             similarity_metric="pearson",
             precision_recall_k=[k],
diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py
index 26b852b..3273112 100644
--- a/cytominer_eval/tests/test_operations/test_precision_recall.py
+++ b/cytominer_eval/tests/test_operations/test_precision_recall.py
@@ -4,6 +4,8 @@
 import tempfile
 import pandas as pd
 
+# import sys
+# sys.path.insert(0, "/Users/mbornhol/git/mycyto/cytominer-eval")
 
 from cytominer_eval.transform import metric_melt
 from cytominer_eval.operations import precision_recall
@@ -36,7 +38,7 @@
 
 replicate_groups = ["Metadata_gene_name", "Metadata_cell_line"]
 
-groupby_columns = ['Metadata_pert_name', 'Image_Metadata_Well']
+groupby_columns = ['Metadata_pert_name']
 
 def test_precision_recall():
     result_list = precision_recall(
@@ -62,9 +64,9 @@ def test_precision_recall():
         .reset_index(drop=True)
         .iloc[0, :]
         .Metadata_pert_name
-        == "ITGAV-2"
+        == "ITGAV-1"
     )
 
     assert all(x in result_list.columns for x in groupby_columns)
 
-    assert result_int.equals(result_list.query("k == 5"))
\ No newline at end of file
+    assert result_int.equals(result_list.query("k == 5"))

From 2f52623c711861d88ab7177dde26b59436d37851 Mon Sep 17 00:00:00 2001
From: michaelbornholdt <michael.bornholdt@outlook.com>
Date: Thu, 14 Oct 2021 15:29:45 -0400
Subject: [PATCH 3/5] Formatting

---
 cytominer_eval/operations/precision_recall.py  |  2 +-
 cytominer_eval/tests/test_evaluate.py          | 18 +++++-------------
 .../test_operations/test_precision_recall.py   |  3 ++-
 3 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/cytominer_eval/operations/precision_recall.py b/cytominer_eval/operations/precision_recall.py
index 82ccb2b..9fb2565 100644
--- a/cytominer_eval/operations/precision_recall.py
+++ b/cytominer_eval/operations/precision_recall.py
@@ -17,7 +17,7 @@ def precision_recall(
     groupby_columns: List[str],
     k: Union[int, List[int]],
 ) -> pd.DataFrame:
-    """Determine the precision and recall at k for all unique replicate groups
+    """Determine the precision and recall at k for all unique groupby_columns samples
     based on a predefined similarity metric (see cytominer_eval.transform.metric_melt)
 
     Parameters
diff --git a/cytominer_eval/tests/test_evaluate.py b/cytominer_eval/tests/test_evaluate.py
index 8323161..fd51926 100644
--- a/cytominer_eval/tests/test_evaluate.py
+++ b/cytominer_eval/tests/test_evaluate.py
@@ -113,11 +113,7 @@ def test_evaluate_replicate_reprod_return_cor_true():
 
     assert np.round(med_cor_df.similarity_metric.max(), 3) == 0.949
     assert sorted(med_cor_df.columns.tolist()) == sorted(
-        [
-            "Metadata_gene_name",
-            "Metadata_pert_name",
-            "similarity_metric",
-        ]
+        ["Metadata_gene_name", "Metadata_pert_name", "similarity_metric",]
     )
 
 
@@ -134,8 +130,8 @@ def test_evaluate_precision_recall():
         },
     }
 
-    gene_groupby_columns = ['Metadata_pert_name']
-    compound_groupby_columns = ['Metadata_broad_sample']
+    gene_groupby_columns = ["Metadata_pert_name"]
+    compound_groupby_columns = ["Metadata_broad_sample"]
 
     for k in ks:
 
@@ -213,9 +209,7 @@ def test_evaluate_grit():
     top_result = (
         grit_results_df.sort_values(by="grit", ascending=False)
         .reset_index(drop=True)
-        .iloc[
-            0,
-        ]
+        .iloc[0,]
     )
     assert np.round(top_result.grit, 4) == 2.3352
     assert top_result.group == "PTK2"
@@ -241,9 +235,7 @@ def test_evaluate_grit():
     top_result = (
         grit_results_df.sort_values(by="grit", ascending=False)
         .reset_index(drop=True)
-        .iloc[
-            0,
-        ]
+        .iloc[0,]
     )
 
     assert np.round(top_result.grit, 4) == 0.9990
diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py
index 3273112..29edd6b 100644
--- a/cytominer_eval/tests/test_operations/test_precision_recall.py
+++ b/cytominer_eval/tests/test_operations/test_precision_recall.py
@@ -38,7 +38,8 @@
 
 replicate_groups = ["Metadata_gene_name", "Metadata_cell_line"]
 
-groupby_columns = ['Metadata_pert_name']
+groupby_columns = ["Metadata_pert_name"]
+
 
 def test_precision_recall():
     result_list = precision_recall(

From 1834ffb7df20e450359651f9fb2272c6bcc4b636 Mon Sep 17 00:00:00 2001
From: michaelbornholdt <michael.bornholdt@outlook.com>
Date: Fri, 15 Oct 2021 14:53:46 -0400
Subject: [PATCH 4/5] delete leftover

---
 cytominer_eval/tests/test_operations/test_precision_recall.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/cytominer_eval/tests/test_operations/test_precision_recall.py b/cytominer_eval/tests/test_operations/test_precision_recall.py
index 29edd6b..b3a20a4 100644
--- a/cytominer_eval/tests/test_operations/test_precision_recall.py
+++ b/cytominer_eval/tests/test_operations/test_precision_recall.py
@@ -1,17 +1,13 @@
 import os
 import random
 import pathlib
-import tempfile
 import pandas as pd
 
-# import sys
-# sys.path.insert(0, "/Users/mbornhol/git/mycyto/cytominer-eval")
 
 from cytominer_eval.transform import metric_melt
 from cytominer_eval.operations import precision_recall
 
 random.seed(42)
-tmpdir = tempfile.gettempdir()
 
 # Load CRISPR dataset
 example_file = "SQ00014610_normalized_feature_select.csv.gz"

From 212e2a5b1aa19da5cf8539dacd4cf2636f92a9b8 Mon Sep 17 00:00:00 2001
From: michaelbornholdt <michael.bornholdt@outlook.com>
Date: Fri, 15 Oct 2021 16:10:30 -0400
Subject: [PATCH 5/5] Readme update

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 44ceda1..3381914 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,7 @@ Currently, five metric operations are supported:
 3. mp-value
 4. Grit
 5. Enrichment
+6. Hit@k
 
 ## Demos