From 20300d5e7d3ec0f500b69ab8da4ad8c0a126b210 Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Mon, 10 Jun 2024 14:57:14 -0500 Subject: [PATCH 01/10] changes to code to have ml agg per algo --- Snakefile | 68 +++++++++++++++++++++++++++++++++------------ config/config.yaml | 6 +++- spras/config.py | 2 +- test/test_config.py | 3 +- 4 files changed, 59 insertions(+), 20 deletions(-) diff --git a/Snakefile b/Snakefile index 140e0f71..4d012cd8 100644 --- a/Snakefile +++ b/Snakefile @@ -11,7 +11,8 @@ import spras.config as _config SEP = '/' wildcard_constraints: - params="params-\w+" + params="params-\w+", + dataset="\w+" # Elsewhere we import this as config, but in the Snakefile, the variable config is already populated # with the parsed config.yaml. This is done by Snakemake, which magically pipes config into this file @@ -77,14 +78,24 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-cytoscape.cys',out_dir=out_dir,sep=SEP,dataset=dataset_labels)) if _config.config.analysis_include_ml: - final_input.extend(expand('{out_dir}{sep}{dataset}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) + + if _config.config.analysis_include_ml_aggregate_algo: # argument required + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. @@ -268,14 +279,14 @@ rule ml_analysis: input: pathways = expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=algorithms_with_params) output: - pca_image = SEP.join([out_dir, '{dataset}-pca.png']), - pca_variance= SEP.join([out_dir, '{dataset}-pca-variance.txt']), - pca_coordinates = SEP.join([out_dir, '{dataset}-pca-coordinates.txt']), - hac_image_vertical = SEP.join([out_dir, '{dataset}-hac-vertical.png']), - hac_clusters_vertical = SEP.join([out_dir, '{dataset}-hac-clusters-vertical.txt']), - hac_image_horizontal = SEP.join([out_dir, '{dataset}-hac-horizontal.png']), - hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-hac-clusters-horizontal.txt']), - ensemble_network_file = SEP.join([out_dir,'{dataset}-ensemble-pathway.txt']) + pca_image = SEP.join([out_dir, '{dataset}-ml', 'pca.png']), + pca_variance= SEP.join([out_dir, '{dataset}-ml', 'pca-variance.txt']), + pca_coordinates = SEP.join([out_dir, '{dataset}-ml', 'pca-coordinates.txt']), + hac_image_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-vertical.png']), + hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-vertical.txt']), + hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']), + hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']), + ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) run: summary_df = ml.summarize_networks(input.pathways) ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params) @@ -283,6 +294,29 @@ rule ml_analysis: ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) ml.ensemble_network(summary_df, output.ensemble_network_file) +def collect_files_per_algo (wildcards): + filtered_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] + return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_params) + +rule ml_analysis_aggregate_algo: + input: + pathways = collect_files_per_algo + output: + pca_image = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca.png']), + pca_variance= SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca-variance.txt']), + pca_coordinates = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca-coordinates.txt']), + hac_image_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-vertical.png']), + hac_clusters_vertical = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-vertical.txt']), + hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']), + hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']), + ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt']) + run: + summary_df = ml.summarize_networks(input.pathways) + ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params) + ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) + ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) + ml.ensemble_network(summary_df, output.ensemble_network_file) + # Remove the output directory rule clean: shell: f'rm -rf {out_dir}' diff --git a/config/config.yaml b/config/config.yaml index c31b2429..651924b9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -37,7 +37,7 @@ container_registry: algorithms: - name: "pathlinker" params: - include: false + include: true run1: k: range(100,201,100) @@ -134,7 +134,11 @@ analysis: include: true # Machine learning analysis (e.g. clustering) of the pathway output files for each dataset ml: + # ml analysis per dataset include: true + # required: true or false + # adds ml analysis per algorithm output + aggregate_per_algorithm: true # specify how many principal components to calculate components: 2 # boolean to show the labels on the pca graph diff --git a/spras/config.py b/spras/config.py index 100b2779..041dbe2b 100644 --- a/spras/config.py +++ b/spras/config.py @@ -204,8 +204,8 @@ def process_config(self, raw_config): self.hac_params["linkage"] = self.ml_params["linkage"] if "metric" in self.ml_params: self.hac_params["metric"] = self.ml_params ["metric"] - self.analysis_include_summary = raw_config["analysis"]["summary"]["include"] self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"] self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] + self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] diff --git a/test/test_config.py b/test/test_config.py index c7497cec..f17190d0 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -26,7 +26,8 @@ def get_test_config(): "include": False }, "ml": { - "include": False + "include": False, + "aggregate_per_algorithm": False }, "graphspace": { "include": False From e31586a910d8b700708f10a461530303155b1172 Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Tue, 11 Jun 2024 11:56:26 -0500 Subject: [PATCH 02/10] removed controllable parameter for now --- Snakefile | 3 +-- config/config.yaml | 10 +++++----- spras/config.py | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Snakefile b/Snakefile index 4d012cd8..800e5ed7 100644 --- a/Snakefile +++ b/Snakefile @@ -86,8 +86,7 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - - if _config.config.analysis_include_ml_aggregate_algo: # argument required + # if _config.config.analysis_include_ml_aggregate_algo: # argument required final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) diff --git a/config/config.yaml b/config/config.yaml index 651924b9..25680b19 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -61,7 +61,7 @@ algorithms: - name: "meo" params: - include: true + include: false run1: max_path_length: [3] local_search: ["Yes"] @@ -69,18 +69,18 @@ algorithms: - name: "mincostflow" params: - include: true + include: false run1: flow: [1] # The flow must be an int capacity: [1] - name: "allpairs" params: - include: true + include: false - name: "domino" params: - include: true + include: false run1: slice_threshold: [0.3] module_threshold: [0.05] @@ -138,7 +138,7 @@ analysis: include: true # required: true or false # adds ml analysis per algorithm output - aggregate_per_algorithm: true + # aggregate_per_algorithm: true # specify how many principal components to calculate components: 2 # boolean to show the labels on the pca graph diff --git a/spras/config.py b/spras/config.py index 041dbe2b..7dcf9ad2 100644 --- a/spras/config.py +++ b/spras/config.py @@ -208,4 +208,4 @@ def process_config(self, raw_config): self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"] self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] - self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] + # self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] From a2dc174bf28b1cad1b01b79c3046377de76a2cc5 Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Tue, 11 Jun 2024 11:57:20 -0500 Subject: [PATCH 03/10] removed controllable parameter for now from test_config --- test/test_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_config.py b/test/test_config.py index f17190d0..70298d30 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -26,8 +26,8 @@ def get_test_config(): "include": False }, "ml": { - "include": False, - "aggregate_per_algorithm": False + "include": False + # "aggregate_per_algorithm": False }, "graphspace": { "include": False From 1da8c9b7746e1f3bb05aefbc3b713da30dbbc372 Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Wed, 12 Jun 2024 12:22:14 -0500 Subject: [PATCH 04/10] make all algortihms true --- config/config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 25680b19..481dd553 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -61,7 +61,7 @@ algorithms: - name: "meo" params: - include: false + include: true run1: max_path_length: [3] local_search: ["Yes"] @@ -69,18 +69,18 @@ algorithms: - name: "mincostflow" params: - include: false + include: true run1: flow: [1] # The flow must be an int capacity: [1] - name: "allpairs" params: - include: false + include: true - name: "domino" params: - include: false + include: true run1: slice_threshold: [0.3] module_threshold: [0.05] From 0d8ec19e87c5fe75d7dbbac4801de36537a35edf Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Wed, 12 Jun 2024 13:04:29 -0500 Subject: [PATCH 05/10] added back aggregate_per_algorithm param and added testing --- Snakefile | 3 ++- config/config.yaml | 5 ++--- spras/config.py | 6 +++++- test/test_config.py | 16 ++++++++++++++-- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Snakefile b/Snakefile index 800e5ed7..92ac9146 100644 --- a/Snakefile +++ b/Snakefile @@ -86,7 +86,8 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) - # if _config.config.analysis_include_ml_aggregate_algo: # argument required + + if _config.config.analysis_include_ml_aggregate_algo: final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) diff --git a/config/config.yaml b/config/config.yaml index 481dd553..5e543153 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -136,9 +136,8 @@ analysis: ml: # ml analysis per dataset include: true - # required: true or false - # adds ml analysis per algorithm output - # aggregate_per_algorithm: true + # required; adds ml analysis per algorithm output + aggregate_per_algorithm: true # specify how many principal components to calculate components: 2 # boolean to show the labels on the pca graph diff --git a/spras/config.py b/spras/config.py index 7dcf9ad2..454df937 100644 --- a/spras/config.py +++ b/spras/config.py @@ -208,4 +208,8 @@ def process_config(self, raw_config): self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"] self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] - # self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] + + if 'aggregate_per_algorithm' not in self.ml_params: + raise ValueError("The 'aggregate_per_algorithm' parameter must be set to either true or false in ml analysis parameters.") + else: + self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] \ No newline at end of file diff --git a/test/test_config.py b/test/test_config.py index 70298d30..18d5cf64 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -26,8 +26,8 @@ def get_test_config(): "include": False }, "ml": { - "include": False - # "aggregate_per_algorithm": False + "include": False, + "aggregate_per_algorithm": False }, "graphspace": { "include": False @@ -105,4 +105,16 @@ def test_config_container_registry(self): assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX) + def test_ml(self): + # test ml_aggregate + test_config = get_test_config() + config.init_global(test_config) + assert (config.config.analysis_include_ml_aggregate_algo == False) + + test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True + config.init_global(test_config) + assert (config.config.analysis_include_ml_aggregate_algo == True) + with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing + del test_config["analysis"]["ml"]["aggregate_per_algorithm"] + config.init_global(test_config) From a787a885ab3ea72a31acfe05c0265fe1cfc750bd Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Thu, 13 Jun 2024 11:33:25 -0500 Subject: [PATCH 06/10] precommit --- spras/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/config.py b/spras/config.py index 454df937..cd7f41f1 100644 --- a/spras/config.py +++ b/spras/config.py @@ -212,4 +212,4 @@ def process_config(self, raw_config): if 'aggregate_per_algorithm' not in self.ml_params: raise ValueError("The 'aggregate_per_algorithm' parameter must be set to either true or false in ml analysis parameters.") else: - self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] \ No newline at end of file + self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] From 90e8e11dcb5b3f2f9f22684ad65c8a20b6ba2937 Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Fri, 14 Jun 2024 14:37:59 -0500 Subject: [PATCH 07/10] update code to make rule not required and updated ml-agg rule --- Snakefile | 23 ++++++++++++++--------- config/config.yaml | 8 ++++---- spras/config.py | 3 ++- test/test_config.py | 16 ---------------- 4 files changed, 20 insertions(+), 30 deletions(-) diff --git a/Snakefile b/Snakefile index 92ac9146..8cf152aa 100644 --- a/Snakefile +++ b/Snakefile @@ -24,7 +24,6 @@ algorithm_params = _config.config.algorithm_params algorithm_directed = _config.config.algorithm_directed pca_params = _config.config.pca_params hac_params = _config.config.hac_params - FRAMEWORK = _config.config.container_framework print(f"Running {FRAMEWORK} containers") @@ -36,6 +35,12 @@ algorithms = list(algorithm_params) algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, param_combos in algorithm_params.items() for params_hash in param_combos.keys()] dataset_labels = list(_config.config.datasets.keys()) +# Get algorithms that are running multiple parameter combinations +def include_algorithm (algo): + return len(algorithm_params.get(algo, {})) > 1 + +algorithms_mult_param_combos = [algo for algo in algorithms if include_algorithm(algo)] + # Get the parameter dictionary for the specified # algorithm and parameter combination hash def reconstruction_params(algorithm, params_hash): @@ -88,14 +93,14 @@ def make_final_input(wildcards): final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm_params=algorithms_with_params)) if _config.config.analysis_include_ml_aggregate_algo: - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) - final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-variance.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-pca-coordinates.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-vertical.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-vertical.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-horizontal.png',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-hac-clusters-horizontal.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) + final_input.extend(expand('{out_dir}{sep}{dataset}-ml{sep}{algorithm}-ensemble-pathway.txt',out_dir=out_dir,sep=SEP,dataset=dataset_labels,algorithm=algorithms_mult_param_combos,algorithm_params=algorithms_with_params)) if len(final_input) == 0: # No analysis added yet, so add reconstruction output files if they exist. diff --git a/config/config.yaml b/config/config.yaml index 5e543153..e9e7c125 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,16 +63,16 @@ algorithms: params: include: true run1: - max_path_length: [3] + max_path_length: [3, 4, 5] local_search: ["Yes"] - rand_restarts: [10] + rand_restarts: [10, 11] - name: "mincostflow" params: include: true run1: flow: [1] # The flow must be an int - capacity: [1] + capacity: [1, 2, 3] - name: "allpairs" params: @@ -136,7 +136,7 @@ analysis: ml: # ml analysis per dataset include: true - # required; adds ml analysis per algorithm output + # adds ml analysis per algorithm output aggregate_per_algorithm: true # specify how many principal components to calculate components: 2 diff --git a/spras/config.py b/spras/config.py index cd7f41f1..fdd51165 100644 --- a/spras/config.py +++ b/spras/config.py @@ -204,12 +204,13 @@ def process_config(self, raw_config): self.hac_params["linkage"] = self.ml_params["linkage"] if "metric" in self.ml_params: self.hac_params["metric"] = self.ml_params ["metric"] + self.analysis_include_summary = raw_config["analysis"]["summary"]["include"] self.analysis_include_graphspace = raw_config["analysis"]["graphspace"]["include"] self.analysis_include_cytoscape = raw_config["analysis"]["cytoscape"]["include"] self.analysis_include_ml = raw_config["analysis"]["ml"]["include"] if 'aggregate_per_algorithm' not in self.ml_params: - raise ValueError("The 'aggregate_per_algorithm' parameter must be set to either true or false in ml analysis parameters.") + self.analysis_include_ml_aggregate_algo = False else: self.analysis_include_ml_aggregate_algo = raw_config["analysis"]["ml"]["aggregate_per_algorithm"] diff --git a/test/test_config.py b/test/test_config.py index 18d5cf64..cd5772e5 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -27,7 +27,6 @@ def get_test_config(): }, "ml": { "include": False, - "aggregate_per_algorithm": False }, "graphspace": { "include": False @@ -103,18 +102,3 @@ def test_config_container_registry(self): test_config["container_registry"]["owner"] = "" config.init_global(test_config) assert (config.config.container_prefix == config.DEFAULT_CONTAINER_PREFIX) - - - def test_ml(self): - # test ml_aggregate - test_config = get_test_config() - config.init_global(test_config) - assert (config.config.analysis_include_ml_aggregate_algo == False) - - test_config["analysis"]["ml"]["aggregate_per_algorithm"] = True - config.init_global(test_config) - assert (config.config.analysis_include_ml_aggregate_algo == True) - - with pytest.raises(ValueError): #raises error if empty dataframe is used for post processing - del test_config["analysis"]["ml"]["aggregate_per_algorithm"] - config.init_global(test_config) From 08bc9873975609f032dc8133f16ebf25842ba469 Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Fri, 14 Jun 2024 16:36:11 -0500 Subject: [PATCH 08/10] clean up --- config/config.yaml | 6 +++--- test/test_config.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index e9e7c125..5e76dae9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,16 +63,16 @@ algorithms: params: include: true run1: - max_path_length: [3, 4, 5] + max_path_length: [3] local_search: ["Yes"] - rand_restarts: [10, 11] + rand_restarts: [10] - name: "mincostflow" params: include: true run1: flow: [1] # The flow must be an int - capacity: [1, 2, 3] + capacity: [1] - name: "allpairs" params: diff --git a/test/test_config.py b/test/test_config.py index cd5772e5..602f95af 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -26,7 +26,7 @@ def get_test_config(): "include": False }, "ml": { - "include": False, + "include": False }, "graphspace": { "include": False From 6c9e1f2e900128bf5f8dcb71046cc92838773dbe Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Mon, 17 Jun 2024 10:45:04 -0500 Subject: [PATCH 09/10] clean up --- config/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/config/config.yaml b/config/config.yaml index 5e76dae9..f9c3a9c0 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -137,6 +137,7 @@ analysis: # ml analysis per dataset include: true # adds ml analysis per algorithm output + # works only on algorithms with multiple parameter combinations chosen aggregate_per_algorithm: true # specify how many principal components to calculate components: 2 From 122fb66f463eb8b15e0c2e1be8ff3d7ca6c02449 Mon Sep 17 00:00:00 2001 From: ntalluri <nehatalluri@live.com> Date: Mon, 24 Jun 2024 16:25:18 -0500 Subject: [PATCH 10/10] requested changes --- Snakefile | 12 ++++++------ config/config.yaml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Snakefile b/Snakefile index 8cf152aa..6a9b513d 100644 --- a/Snakefile +++ b/Snakefile @@ -36,10 +36,10 @@ algorithms_with_params = [f'{algorithm}-params-{params_hash}' for algorithm, par dataset_labels = list(_config.config.datasets.keys()) # Get algorithms that are running multiple parameter combinations -def include_algorithm (algo): +def algo_has_mult_param_combos(algo): return len(algorithm_params.get(algo, {})) > 1 -algorithms_mult_param_combos = [algo for algo in algorithms if include_algorithm(algo)] +algorithms_mult_param_combos = [algo for algo in algorithms if algo_has_mult_param_combos(algo)] # Get the parameter dictionary for the specified # algorithm and parameter combination hash @@ -299,13 +299,13 @@ rule ml_analysis: ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) ml.ensemble_network(summary_df, output.ensemble_network_file) -def collect_files_per_algo (wildcards): - filtered_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] - return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_params) +def collect_pathways_per_algo(wildcards): + filtered_algo_params = [algo_param for algo_param in algorithms_with_params if wildcards.algorithm in algo_param] + return expand('{out_dir}{sep}{{dataset}}-{algorithm_params}{sep}pathway.txt', out_dir=out_dir, sep=SEP, algorithm_params=filtered_algo_params) rule ml_analysis_aggregate_algo: input: - pathways = collect_files_per_algo + pathways = collect_pathways_per_algo output: pca_image = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca.png']), pca_variance= SEP.join([out_dir, '{dataset}-ml', '{algorithm}-pca-variance.txt']), diff --git a/config/config.yaml b/config/config.yaml index f9c3a9c0..5fe6083b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -137,7 +137,7 @@ analysis: # ml analysis per dataset include: true # adds ml analysis per algorithm output - # works only on algorithms with multiple parameter combinations chosen + # only runs for algorithms with multiple parameter combinations chosen aggregate_per_algorithm: true # specify how many principal components to calculate components: 2