From 20300d5e7d3ec0f500b69ab8da4ad8c0a126b210 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 10 Jun 2024 14:57:14 -0500
Subject: [PATCH 01/10] changes to code to have ml agg per algo

---
 Snakefile           | 68 +++++++++++++++++++++++++++++++++------------
 config/config.yaml  |  6 +++-
 spras/config.py     |  2 +-
 test/test_config.py |  3 +-
 4 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/Snakefile b/Snakefile
index 140e0f71..4d012cd8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -11,7 +11,8 @@ import spras.config as _config
 SEP = '/'
 
 wildcard_constraints:
-    params="params-\w+"
+    params="params-\w+",
+    dataset="\w+"
 
 # Elsewhere we import this as config, but in the Snakefile, the variable config is already populated
 # with the parsed config.yaml. This is done by Snakemake, which magically pipes config into this file
@@ -77,14 +78,24 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-cytoscape.cys',out_dir=out_dir,sep=SEP,dataset=dataset_labels))
 
     if _config.config.analysis_include_ml:
-        final_input.extend(expand('{out_dir}{sep}{dataset}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
+
+    if _config.config.analysis_include_ml_aggregate_algo: # argument required
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
 
     if len(final_input) == 0:
         # No analysis added yet, so add reconstruction output files if they exist.
@@ -268,14 +279,14 @@ rule ml_analysis:
     input: 
         pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params)
     output: 
-        pca_image = SEP.join([out_dir, '{dataset}-pca.png']),
-        pca_variance= SEP.join([out_dir, '{dataset}-pca-variance.txt']),
-        pca_coordinates = SEP.join([out_dir, '{dataset}-pca-coordinates.txt']),
-        hac_image_vertical = SEP.join([out_dir, '{dataset}-hac-vertical.png']),
-        hac_clusters_vertical = SEP.join([out_dir, '{dataset}-hac-clusters-vertical.txt']),
-        hac_image_horizontal = SEP.join([out_dir, '{dataset}-hac-horizontal.png']),
-        hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-hac-clusters-horizontal.txt']),
-        ensemble_network_file = SEP.join([out_dir,'{dataset}-ensemble-pathway.txt'])
+        pca_image = SEP.join([out_dir, '{dataset}-ml', 'pca.png']),
+        pca_variance= SEP.join([out_dir, '{dataset}-ml', 'pca-variance.txt']),
+        pca_coordinates = SEP.join([out_dir, '{dataset}-ml', 'pca-coordinates.txt']),
+        hac_image_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-vertical.png']),
+        hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']),
+        hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']),
+        hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']),
+        ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt'])
     run: 
         summary_df = ml.summarize_networks(input.pathways)
         ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
@@ -283,6 +294,29 @@ rule ml_analysis:
         ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
         ml.ensemble_network(summary_df, output.ensemble_network_file)
 
+def collect_files_per_algo (wildcards):
+    filtered_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
+    return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_params)
+
+rule ml_analysis_aggregate_algo:
+    input:
+        pathways = collect_files_per_algo
+    output:
+        pca_image = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca.png']),
+        pca_variance= SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca-variance.txt']),
+        pca_coordinates = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca-coordinates.txt']),
+        hac_image_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-vertical.png']),
+        hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']),
+        hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']),
+        hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']),
+        ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt'])
+    run:
+        summary_df = ml.summarize_networks(input.pathways)
+        ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params)
+        ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params)
+        ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
+        ml.ensemble_network(summary_df, output.ensemble_network_file)
+
 # Remove the output directory
 rule clean:
     shell: f'rm -rf {out_dir}'
diff --git a/config/config.yaml b/config/config.yaml
index c31b2429..651924b9 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -37,7 +37,7 @@ container_registry:
 algorithms:
       - name: "pathlinker"
         params:
-              include: false
+              include: true
               run1:
                   k: range(100,201,100)
 
@@ -134,7 +134,11 @@ analysis:
         include: true
       # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
       ml:
+        # ml analysis per dataset
         include: true
+        # required: true or false
+        # adds ml analysis per algorithm output
+        aggregate_per_algorithm: true
         # specify how many principal components to calculate
         components: 2
         # boolean to show the labels on the pca graph
diff --git a/spras/config.py b/spras/config.py
index 100b2779..041dbe2b 100644
--- a/spras/config.py
+++ b/spras/config.py
@@ -204,8 +204,8 @@ def process_config(self, raw_config):
             self.hac_params["linkage"] = self.ml_params["linkage"]
         if "metric" in self.ml_params:
             self.hac_params["metric"] = self.ml_params ["metric"]
-
         self.analysis_include_summary = raw_config["analysis"]["summary"]["include"]
         self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"]
         self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"]
         self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
+        self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
diff --git a/test/test_config.py b/test/test_config.py
index c7497cec..f17190d0 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -26,7 +26,8 @@ def get_test_config():
                 "include": False
             },
             "ml": {
-                "include": False
+                "include": False,
+                "aggregate_per_algorithm": False
             },
             "graphspace": {
                 "include": False

From e31586a910d8b700708f10a461530303155b1172 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Tue, 11 Jun 2024 11:56:26 -0500
Subject: [PATCH 02/10] removed controllable parameter for now

---
 Snakefile          |  3 +--
 config/config.yaml | 10 +++++-----
 spras/config.py    |  2 +-
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/Snakefile b/Snakefile
index 4d012cd8..800e5ed7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -86,8 +86,7 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-
-    if _config.config.analysis_include_ml_aggregate_algo: # argument required
+    # if _config.config.analysis_include_ml_aggregate_algo: # argument required
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
diff --git a/config/config.yaml b/config/config.yaml
index 651924b9..25680b19 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -61,7 +61,7 @@ algorithms:
 
       - name: "meo"
         params:
-              include: true
+              include: false
               run1:
                   max_path_length: [3]
                   local_search: ["Yes"]
@@ -69,18 +69,18 @@ algorithms:
 
       - name: "mincostflow"
         params:
-              include: true
+              include: false
               run1:
                   flow: [1] # The flow must be an int
                   capacity: [1]
 
       - name: "allpairs"
         params:
-              include: true
+              include: false
 
       - name: "domino"
         params:
-              include: true
+              include: false
               run1:
                   slice_threshold: [0.3]
                   module_threshold: [0.05]
@@ -138,7 +138,7 @@ analysis:
         include: true
         # required: true or false
         # adds ml analysis per algorithm output
-        aggregate_per_algorithm: true
+        # aggregate_per_algorithm: true
         # specify how many principal components to calculate
         components: 2
         # boolean to show the labels on the pca graph
diff --git a/spras/config.py b/spras/config.py
index 041dbe2b..7dcf9ad2 100644
--- a/spras/config.py
+++ b/spras/config.py
@@ -208,4 +208,4 @@ def process_config(self, raw_config):
         self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"]
         self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"]
         self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
-        self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
+        # self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]

From a2dc174bf28b1cad1b01b79c3046377de76a2cc5 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Tue, 11 Jun 2024 11:57:20 -0500
Subject: [PATCH 03/10] removed controllable parameter for now from test_config

---
 test/test_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_config.py b/test/test_config.py
index f17190d0..70298d30 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -26,8 +26,8 @@ def get_test_config():
                 "include": False
             },
             "ml": {
-                "include": False,
-                "aggregate_per_algorithm": False
+                "include": False
+                # "aggregate_per_algorithm": False
             },
             "graphspace": {
                 "include": False

From 1da8c9b7746e1f3bb05aefbc3b713da30dbbc372 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Wed, 12 Jun 2024 12:22:14 -0500
Subject: [PATCH 04/10] make all algorithms true

---
 config/config.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 25680b19..481dd553 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -61,7 +61,7 @@ algorithms:
 
       - name: "meo"
         params:
-              include: false
+              include: true
               run1:
                   max_path_length: [3]
                   local_search: ["Yes"]
@@ -69,18 +69,18 @@ algorithms:
 
       - name: "mincostflow"
         params:
-              include: false
+              include: true
               run1:
                   flow: [1] # The flow must be an int
                   capacity: [1]
 
       - name: "allpairs"
         params:
-              include: false
+              include: true
 
       - name: "domino"
         params:
-              include: false
+              include: true
               run1:
                   slice_threshold: [0.3]
                   module_threshold: [0.05]

From 0d8ec19e87c5fe75d7dbbac4801de36537a35edf Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Wed, 12 Jun 2024 13:04:29 -0500
Subject: [PATCH 05/10] added back aggregate_per_algorithm param and added
 testing

---
 Snakefile           |  3 ++-
 config/config.yaml  |  5 ++---
 spras/config.py     |  6 +++++-
 test/test_config.py | 16 ++++++++++++++--
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/Snakefile b/Snakefile
index 800e5ed7..92ac9146 100644
--- a/Snakefile
+++ b/Snakefile
@@ -86,7 +86,8 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
-    # if _config.config.analysis_include_ml_aggregate_algo: # argument required
+    
+    if _config.config.analysis_include_ml_aggregate_algo:
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
diff --git a/config/config.yaml b/config/config.yaml
index 481dd553..5e543153 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -136,9 +136,8 @@ analysis:
       ml:
         # ml analysis per dataset
         include: true
-        # required: true or false
-        # adds ml analysis per algorithm output
-        # aggregate_per_algorithm: true
+        # required; adds ml analysis per algorithm output
+        aggregate_per_algorithm: true
         # specify how many principal components to calculate
         components: 2
         # boolean to show the labels on the pca graph
diff --git a/spras/config.py b/spras/config.py
index 7dcf9ad2..454df937 100644
--- a/spras/config.py
+++ b/spras/config.py
@@ -208,4 +208,8 @@ def process_config(self, raw_config):
         self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"]
         self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"]
         self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
-        # self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
+
+        if 'aggregate_per_algorithm' not in self.ml_params:
+            raise ValueError("The 'aggregate_per_algorithm' parameter must be set to either true or false in ml analysis parameters.")
+        else:
+            self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
\ No newline at end of file
diff --git a/test/test_config.py b/test/test_config.py
index 70298d30..18d5cf64 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -26,8 +26,8 @@ def get_test_config():
                 "include": False
             },
             "ml": {
-                "include": False
-                # "aggregate_per_algorithm": False
+                "include": False,
+                "aggregate_per_algorithm": False
             },
             "graphspace": {
                 "include": False
@@ -105,4 +105,16 @@ def test_config_container_registry(self):
         assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX)
 
 
+    def test_ml(self):
+        # test ml_aggregate
+        test_config = get_test_config()
+        config.init_global(test_config)
+        assert (config.config.analysis_include_ml_aggregate_algo == False)
+
+        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True
+        config.init_global(test_config)
+        assert (config.config.analysis_include_ml_aggregate_algo == True)
 
+        with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing
+            del test_config["analysis"]["ml"]["aggregate_per_algorithm"]
+            config.init_global(test_config)

From a787a885ab3ea72a31acfe05c0265fe1cfc750bd Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Thu, 13 Jun 2024 11:33:25 -0500
Subject: [PATCH 06/10] precommit

---
 spras/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spras/config.py b/spras/config.py
index 454df937..cd7f41f1 100644
--- a/spras/config.py
+++ b/spras/config.py
@@ -212,4 +212,4 @@ def process_config(self, raw_config):
         if 'aggregate_per_algorithm' not in self.ml_params:
             raise ValueError("The 'aggregate_per_algorithm' parameter must be set to either true or false in ml analysis parameters.")
         else:
-            self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
\ No newline at end of file
+            self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]

From 90e8e11dcb5b3f2f9f22684ad65c8a20b6ba2937 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 14 Jun 2024 14:37:59 -0500
Subject: [PATCH 07/10] update code to make rule not required and update
 ml-agg rule

---
 Snakefile           | 23 ++++++++++++++---------
 config/config.yaml  |  8 ++++----
 spras/config.py     |  3 ++-
 test/test_config.py | 16 ----------------
 4 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/Snakefile b/Snakefile
index 92ac9146..8cf152aa 100644
--- a/Snakefile
+++ b/Snakefile
@@ -24,7 +24,6 @@ algorithm_params = _config.config.algorithm_params
 algorithm_directed = _config.config.algorithm_directed
 pca_params = _config.config.pca_params
 hac_params = _config.config.hac_params
-
 FRAMEWORK = _config.config.container_framework
 print(f"Running {FRAMEWORK} containers")
 
@@ -36,6 +35,12 @@ algorithms = list(algorithm_params)
 algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()]
 dataset_labels = list(_config.config.datasets.keys())
 
+# Get algorithms that are running multiple parameter combinations
+def include_algorithm (algo):
+    return len(algorithm_params.get(algo, {})) > 1
+
+algorithms_mult_param_combos = [algo for algo in algorithms if include_algorithm(algo)]
+
 # Get the parameter dictionary for the specified
 # algorithm and parameter combination hash
 def reconstruction_params(algorithm, params_hash):
@@ -88,14 +93,14 @@ def make_final_input(wildcards):
         final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params))
     
     if _config.config.analysis_include_ml_aggregate_algo:
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
-        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
+        final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params))
 
     if len(final_input) == 0:
         # No analysis added yet, so add reconstruction output files if they exist.
diff --git a/config/config.yaml b/config/config.yaml
index 5e543153..e9e7c125 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -63,16 +63,16 @@ algorithms:
         params:
               include: true
               run1:
-                  max_path_length: [3]
+                  max_path_length: [3, 4, 5]
                   local_search: ["Yes"]
-                  rand_restarts: [10]
+                  rand_restarts: [10, 11]
 
       - name: "mincostflow"
         params:
               include: true
               run1:
                   flow: [1] # The flow must be an int
-                  capacity: [1]
+                  capacity: [1, 2, 3]
 
       - name: "allpairs"
         params:
@@ -136,7 +136,7 @@ analysis:
       ml:
         # ml analysis per dataset
         include: true
-        # required; adds ml analysis per algorithm output
+        # adds ml analysis per algorithm output
         aggregate_per_algorithm: true
         # specify how many principal components to calculate
         components: 2
diff --git a/spras/config.py b/spras/config.py
index cd7f41f1..fdd51165 100644
--- a/spras/config.py
+++ b/spras/config.py
@@ -204,12 +204,13 @@ def process_config(self, raw_config):
             self.hac_params["linkage"] = self.ml_params["linkage"]
         if "metric" in self.ml_params:
             self.hac_params["metric"] = self.ml_params ["metric"]
+
         self.analysis_include_summary = raw_config["analysis"]["summary"]["include"]
         self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"]
         self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"]
         self.analysis_include_ml = raw_config["analysis"]["ml"]["include"]
 
         if 'aggregate_per_algorithm' not in self.ml_params:
-            raise ValueError("The 'aggregate_per_algorithm' parameter must be set to either true or false in ml analysis parameters.")
+            self.analysis_include_ml_aggregate_algo = False
         else:
             self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"]
diff --git a/test/test_config.py b/test/test_config.py
index 18d5cf64..cd5772e5 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -27,7 +27,6 @@ def get_test_config():
             },
             "ml": {
                 "include": False,
-                "aggregate_per_algorithm": False
             },
             "graphspace": {
                 "include": False
@@ -103,18 +102,3 @@ def test_config_container_registry(self):
         test_config["container_registry"]["owner"] = ""
         config.init_global(test_config)
         assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX)
-
-
-    def test_ml(self):
-        # test ml_aggregate
-        test_config = get_test_config()
-        config.init_global(test_config)
-        assert (config.config.analysis_include_ml_aggregate_algo == False)
-
-        test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True
-        config.init_global(test_config)
-        assert (config.config.analysis_include_ml_aggregate_algo == True)
-
-        with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing
-            del test_config["analysis"]["ml"]["aggregate_per_algorithm"]
-            config.init_global(test_config)

From 08bc9873975609f032dc8133f16ebf25842ba469 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Fri, 14 Jun 2024 16:36:11 -0500
Subject: [PATCH 08/10] clean up

---
 config/config.yaml  | 6 +++---
 test/test_config.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index e9e7c125..5e76dae9 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -63,16 +63,16 @@ algorithms:
         params:
               include: true
               run1:
-                  max_path_length: [3, 4, 5]
+                  max_path_length: [3]
                   local_search: ["Yes"]
-                  rand_restarts: [10, 11]
+                  rand_restarts: [10]
 
       - name: "mincostflow"
         params:
               include: true
               run1:
                   flow: [1] # The flow must be an int
-                  capacity: [1, 2, 3]
+                  capacity: [1]
 
       - name: "allpairs"
         params:
diff --git a/test/test_config.py b/test/test_config.py
index cd5772e5..602f95af 100644
--- a/test/test_config.py
+++ b/test/test_config.py
@@ -26,7 +26,7 @@ def get_test_config():
                 "include": False
             },
             "ml": {
-                "include": False,
+                "include": False
             },
             "graphspace": {
                 "include": False

From 6c9e1f2e900128bf5f8dcb71046cc92838773dbe Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 17 Jun 2024 10:45:04 -0500
Subject: [PATCH 09/10] clean up

---
 config/config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/config/config.yaml b/config/config.yaml
index 5e76dae9..f9c3a9c0 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -137,6 +137,7 @@ analysis:
         # ml analysis per dataset
         include: true
         # adds ml analysis per algorithm output
+        # works only on algorithms with multiple parameter combinations chosen
         aggregate_per_algorithm: true
         # specify how many principal components to calculate
         components: 2

From 122fb66f463eb8b15e0c2e1be8ff3d7ca6c02449 Mon Sep 17 00:00:00 2001
From: ntalluri <nehatalluri@live.com>
Date: Mon, 24 Jun 2024 16:25:18 -0500
Subject: [PATCH 10/10] requested changes

---
 Snakefile          | 12 ++++++------
 config/config.yaml |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/Snakefile b/Snakefile
index 8cf152aa..6a9b513d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -36,10 +36,10 @@ algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, par
 dataset_labels = list(_config.config.datasets.keys())
 
 # Get algorithms that are running multiple parameter combinations
-def include_algorithm (algo):
+def algo_has_mult_param_combos(algo):
     return len(algorithm_params.get(algo, {})) > 1
 
-algorithms_mult_param_combos = [algo for algo in algorithms if include_algorithm(algo)]
+algorithms_mult_param_combos = [algo for algo in algorithms if algo_has_mult_param_combos(algo)]
 
 # Get the parameter dictionary for the specified
 # algorithm and parameter combination hash
@@ -299,13 +299,13 @@ rule ml_analysis:
         ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params)
         ml.ensemble_network(summary_df, output.ensemble_network_file)
 
-def collect_files_per_algo (wildcards):
-    filtered_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
-    return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_params)
+def collect_pathways_per_algo(wildcards):
+    filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param]
+    return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params)
 
 rule ml_analysis_aggregate_algo:
     input:
-        pathways = collect_files_per_algo
+        pathways = collect_pathways_per_algo
     output:
         pca_image = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca.png']),
         pca_variance= SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca-variance.txt']),
diff --git a/config/config.yaml b/config/config.yaml
index f9c3a9c0..5fe6083b 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -137,7 +137,7 @@ analysis:
         # ml analysis per dataset
         include: true
         # adds ml analysis per algorithm output
-        # works only on algorithms with multiple parameter combinations chosen
+        # only runs for algorithms with multiple parameter combinations chosen
         aggregate_per_algorithm: true
         # specify how many principal components to calculate
         components: 2