From 1ba375a3d8c754b54db692aaddd7b1d6fd2da61e Mon Sep 17 00:00:00 2001 From: abearab Date: Thu, 6 Jun 2024 15:07:13 -0700 Subject: [PATCH 01/50] update .gitignore file --- .gitignore | Bin 156 -> 165 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/.gitignore b/.gitignore index 57ac2c3499eed1aa3b5f03bdf5cb153dbd68e56a..5c791006da5f40065ab9f7d786b434858cd1b614 100644 GIT binary patch literal 165 zcmZw9F%H5o3Hoh_U2kxxbFrS5Z*qyt zV5cD1#SXGx=Q#gq6L+bh^wQ`l<>GHkAs#;8Q_g^BorxcU2s0P8UN;=TPitDkCyzFd K^?FB-cFG5NkvLHR literal 156 zcmYj~I|_h63`Acoc#$pKL%5B#2M|#~3qQDmczJcQSV)+VWIlOrN2A)f=xBwJAzgAZ zN?jHL6`q-iD85KqW1Z&6sol0izNj95iod^C)**O$DL3V{pVM*zwZ58vWfyOWFSP0! AApigX From 4fd9dd077a80d8dfe98e9d8abc954f5c4a119f0e Mon Sep 17 00:00:00 2001 From: abearab Date: Thu, 6 Jun 2024 15:08:07 -0700 Subject: [PATCH 02/50] add load functino for coessentiality #45 --- CanDI/candi/__init__.py | 7 ++++--- CanDI/candi/load.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 CanDI/candi/load.py diff --git a/CanDI/candi/__init__.py b/CanDI/candi/__init__.py index 5624efb..ad1e50e 100644 --- a/CanDI/candi/__init__.py +++ b/CanDI/candi/__init__.py @@ -1,4 +1,5 @@ -from CanDI.candi import data -data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects -from CanDI.candi.candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster) +from . import load +from . import data +data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects +from .candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster) diff --git a/CanDI/candi/load.py b/CanDI/candi/load.py new file mode 100644 index 0000000..42e27ef --- /dev/null +++ b/CanDI/candi/load.py @@ -0,0 +1,37 @@ +import numpy as np +import pandas as pd +import polars as pl +from CanDI import candi +from pathlib import Path + + +def coessentiality(pvalue_threshold = 10**-3, data_dir='auto'): + if data_dir == 'auto': + data_dir=str(Path(candi.__path__[0]).parent.absolute()) + '/setup/data/coessentiality' + else: + # check if the path exists and it contains the necessary files + if not Path(data_dir).exists(): + raise ValueError(f"Path {data_dir} does not exist") + if not Path(data_dir+'/genes.txt').exists(): + raise ValueError(f"Path {data_dir}/genes.txt does not exist") + if not Path(data_dir+'/GLS_sign.npy').exists(): + raise ValueError(f"Path {data_dir}/GLS_sign.npy does not exist") + if not Path(data_dir+'/GLS_p.npy').exists(): + raise ValueError(f"Path {data_dir}/GLS_p.npy does not exist") + + gene_names = pd.read_csv(f'{data_dir}/genes.txt',header=None,names=['gene_name'])['gene_name'] + + GLS_sign = np.load(f'{data_dir}/GLS_sign.npy') + GLS_p = np.load(f'{data_dir}/GLS_p.npy') + + coessentiality_mat = pd.DataFrame((-1*np.log10(GLS_p)) * GLS_sign, columns = gene_names, index = gene_names).reset_index() + coessentiality_mat = pl.from_dataframe(coessentiality_mat) + + coessentiality_df = coessentiality_mat.melt('gene_name') + coessentiality_df.columns = ['gene_1','gene_2','coessentiality'] + coessentiality_df = coessentiality_df.filter(~(pl.col('gene_1') == pl.col('gene_2'))) + coessentiality_df = coessentiality_df.filter(pl.col('coessentiality') > -np.log10(pvalue_threshold)) + + out = coessentiality_df.to_pandas() + + return out From 4fa9aa84922ae4cbda518cc4d4f13e6d47bcec25 Mon Sep 17 00:00:00 2001 From: abearab Date: Tue, 11 Jun 2024 20:28:15 -0700 Subject: [PATCH 03/50] add `diffexp` module related to #39 --- CanDI/pipelines/__init__.py | 0 CanDI/pipelines/diffexp.py | 52 +++++++++++++++++++++++++++++++++++++ scripts/run_deseq.r | 35 ------------------------- 3 files changed, 52 insertions(+), 35 deletions(-) create mode 100644 CanDI/pipelines/__init__.py create mode 100644 CanDI/pipelines/diffexp.py delete mode 100644 scripts/run_deseq.r diff --git a/CanDI/pipelines/__init__.py b/CanDI/pipelines/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/CanDI/pipelines/diffexp.py b/CanDI/pipelines/diffexp.py new file mode 100644 index 0000000..ea4a581 --- /dev/null +++ b/CanDI/pipelines/diffexp.py @@ -0,0 +1,52 @@ +import numpy as np +import pandas as pd +import anndata as ad + +from pydeseq2.dds import DeseqDataSet +from pydeseq2.default_inference import DefaultInference +from pydeseq2.ds import DeseqStats +from adpbulk import ADPBulk + + +def pseudobulk_by_group(adt, groups, method="mean"): + # initialize the object + adpb = ADPBulk(adt, groupby=groups, method=method) + + # perform the pseudobulking + pseudobulk_matrix = adpb.fit_transform() + + # retrieve the sample metadata (useful for easy incorporation with edgeR) + sample_meta = adpb.get_meta() + + out = ad.AnnData( + X=pseudobulk_matrix, + obs=sample_meta.set_index('SampleName') + ) + + return out + + +def run_deseq(adata, design, tested_level, ref_level, n_cpus=8): + + inference = DefaultInference(n_cpus=n_cpus) + + dds = DeseqDataSet( + counts=adata.to_df().astype(int), + metadata=adata.obs, + design_factors=design, # compare samples based on the "condition" + refit_cooks=True, + inference=inference, + ) + + dds.deseq2() + + stat_res = DeseqStats( + dds, + contrast=[design, tested_level, ref_level], + inference=inference + ) + stat_res.summary() + + df = stat_res.results_df + + return df \ No newline at end of file diff --git a/scripts/run_deseq.r b/scripts/run_deseq.r deleted file mode 100644 index f4f1088..0000000 --- a/scripts/run_deseq.r +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env Rscript -library(DESeq2) - -args = commandArgs(trailingOnly=TRUE) - -#read data -counts.mat <- read.csv(args[1]) -coldata <- read.csv(args[2]) - -#name metadata columns -colnames(coldata) <- c("sample.id", "condition") -#convert datatype to factor -coldata[,-1] <- as.factor(coldata[, -1]) -#Match sample ids to sample columns in counts matrix -coldata$sample.id <- sub("-", ".", coldata$sample.id) - -#init dds object -dds <- DESeqDataSetFromMatrix(countData=counts.mat, - colData=coldata, - design= ~condition, - tidy = TRUE) - -dds <- estimateSizeFactors(dds) -#Filter lowly expressed genes -idx <- rowSums(counts(dds, normalized=TRUE) >= 5) >= 3 -dds <- dds[idx,] - -dds <- DESeq(dds) #run deseq -res <- results(dds) #get results -#Show results -print(summary(res)) -print(head(res)) - -write.csv(res, args[3]) #save results - From 1324a41a51a32eeb5c3a201b68adec0cee93fddd Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 00:26:33 -0700 Subject: [PATCH 04/50] add `cfig_path` argument --- CanDI/setup/reset_config.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/CanDI/setup/reset_config.py b/CanDI/setup/reset_config.py index 7453023..6c53dba 100644 --- a/CanDI/setup/reset_config.py +++ b/CanDI/setup/reset_config.py @@ -4,9 +4,24 @@ from manager import Manager -def main(): +def write_cfig(cfig_path, parser): + + write_file = Manager.write_config + write_file(cfig_path, parser) + - cfig_path = os.path.dirname(os.path.realpath(__file__)) + "/data/config.ini" +def main(cfig_path='auto'): + """ + This function will reset the config file to only contain the default sections. + This is useful if you want to reset the config file to its original state. + """ + if cfig_path == 'auto': + cfig_path = os.path.dirname(os.path.realpath(__file__)) + "/data/config.ini" + elif os.path.exists(cfig_path) == False: + raise FileNotFoundError("Config file not found") + elif os.path.exists(cfig_path) == True: + print("Using config file at: " + cfig_path) + parser = configparser.ConfigParser() parser.read(cfig_path) @@ -21,10 +36,5 @@ def main(): write_cfig(cfig_path, parser) -def write_cfig(cfig_path, parser): - - write_file = Manager.write_config - write_file(cfig_path, parser) - if __name__ == "__main__": main() From dc54404e8a0577b7e4ebdd4c0578b6235095e9cb Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 00:27:46 -0700 Subject: [PATCH 05/50] make `Manager` as parent class --- CanDI/setup/manager.py | 116 +++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 50 deletions(-) diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index 44aef56..1996e4a 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -10,15 +10,17 @@ import pandas as pd import requests + class Manager(object): """The Manager class handles interations with the datasources and the config file. It is used to setup of the config file upon installation. All data downloading is done by Manager """ - def __init__(self): + def __init__(self, cfig_path='auto'): manager_path = os.path.dirname(os.path.realpath(__file__)) - cfig_path = manager_path + "/data/config.ini" + if cfig_path == 'auto': + cfig_path = manager_path + "/data/config.ini" parser = configparser.ConfigParser() parser.read(cfig_path) @@ -26,9 +28,18 @@ def __init__(self): self.cfig_path = Path(cfig_path) self.parser = parser + @staticmethod + def write_config(cfig_path, parser): - def sanger_download(): - pass + print("Writing config file") + with open(cfig_path, "w") as f: + parser.write(f) + f.close() + + +class BroadDepMap(Manager): + def __init__(self, cfig_path='auto'): + super().__init__(cfig_path) def get_depmap_info(self, release="latest"): @@ -44,7 +55,6 @@ def get_depmap_info(self, release="latest"): self.parser["depmap_urls"] = self.download_info self.parser["depmap_files"] = self.depmap_files - def parse_release(self): download_urls = {} @@ -70,16 +80,21 @@ def get_release(self, release): return release_info["releaseName"] - def format_filename(self, filename): + def format_filename(self, filename, release): + # set candi_name to the filename without the extension candi_name = filename.split(".")[0] - if "CRISPR_" in candi_name: - candi_name = candi_name[len("CRISPR_"):] - elif "CCLE_" in candi_name: - candi_name = candi_name[len("CCLE_"):] - if 'v2' in candi_name: - candi_name = candi_name[:-len("_v2")] + if release == "21Q4": + if "CRISPR_" in candi_name: + candi_name = candi_name[len("CRISPR_"):] + elif "CCLE_" in candi_name: + candi_name = candi_name[len("CCLE_"):] + if 'v2' in candi_name: + candi_name = candi_name[:-len("_v2")] + else: + #TODO: add more cases for different releases, e.g. 24Q4 new file formats + pass return candi_name @@ -114,14 +129,12 @@ def fetch_url(self, entry): downloads[filename] = str(path) - def parallel_fetch(self, entries): print("Starting Pool") with ThreadPoolExecutor(max_workers=4) as executor: for i in entries: executor.submit(self.fetch_url, i) - def download_defaults(self): default_sources = json.loads(self.parser.get("defaults","downloads")) @@ -130,7 +143,6 @@ def download_defaults(self): entries = [self.manage_request(i, "depmap") for i in to_download] self.parallel_fetch(entries) - def manage_request(self, name, path, filename=False): if filename: @@ -169,44 +181,48 @@ def depmap_autoformat(self): df = pd.read_csv(v, low_memory=False, memory_map=True) self.format_depmap_data(df, v) - def format_depmap_data(self, df, path): + def format_depmap_data(self, df, path, release): - if ("AAAS (8086)" in df.columns) or ("AAAS (ENSG00000094914)" in df.columns): + if release == "21Q4": + if ("AAAS (8086)" in df.columns) or ("AAAS (ENSG00000094914)" in df.columns): - df.rename(columns = lambda s: s.split(" ")[0], inplace=True) + df.rename(columns = lambda s: s.split(" ")[0], inplace=True) - if "Unnamed:" in df.columns: - df.rename(columns={"Unnamed:":"DepMap_ID"}, inplace=True) + if "Unnamed:" in df.columns: + df.rename(columns={"Unnamed:":"DepMap_ID"}, inplace=True) - df = df.set_index("DepMap_ID").T - df.reset_index(inplace=True) - df.rename(columns={"index":"gene"}, inplace=True) - df.set_index("gene", inplace=True) - df.to_csv(path) + df = df.set_index("DepMap_ID").T + df.reset_index(inplace=True) + df.rename(columns={"index":"gene"}, inplace=True) + df.set_index("gene", inplace=True) + df.to_csv(path) - if "Protein_Change" in df.columns: + if "Protein_Change" in df.columns: - try: - df.drop("Unnamed: 0", axis=1, inplace=True) - df.to_csv(path, index=False) - except KeyError: - pass + try: + df.drop("Unnamed: 0", axis=1, inplace=True) + df.to_csv(path, index=False) + except KeyError: + pass - if "Hugo_Symbol" in df.columns: - try: - df.rename(columns={"Hugo_Symbol": "gene"}, inplace=True) - df.to_csv(path, index=False) - except KeyError: - pass + if "Hugo_Symbol" in df.columns: + try: + df.rename(columns={"Hugo_Symbol": "gene"}, inplace=True) + df.to_csv(path, index=False) + except KeyError: + pass - if "LeftGene" in df.columns: - for col in df.columns: - if "Gene" in col: - split_cols = df[col].str.split(" ", expand=True) - df[col] = split_cols[0] - df[col[:-4] + "EnsemblID"] = split_cols[1].str.replace("(", "").str.replace(")", "") + if "LeftGene" in df.columns: + for col in df.columns: + if "Gene" in col: + split_cols = df[col].str.split(" ", expand=True) + df[col] = split_cols[0] + df[col[:-4] + "EnsemblID"] = split_cols[1].str.replace("(", "").str.replace(")", "") - df.to_csv(path, index=False) + df.to_csv(path, index=False) + else: + #TODO: add more cases for different releases, e.g. 24Q4 new file formats + pass try: formatted = self.parser["formatted"] @@ -217,13 +233,13 @@ def format_depmap_data(self, df, path): formatted[path.split("/")[-1]] = path - @staticmethod - def write_config(cfig_path, parser): +class SangerDepMap(Manager): + def __init__(self, cfig_path='auto'): + super().__init__(cfig_path) + + def sanger_download(): + pass - print("Writing config file") - with open(cfig_path, "w") as f: - parser.write(f) - f.close() if __name__ == "__main__": m = Manager() From f47ff47f9459e8a63273ed49dccadff7319f41aa Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 00:40:40 -0700 Subject: [PATCH 06/50] add additional packages --- candi.yml | 7 +++++++ requirements.txt | 6 +++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/candi.yml b/candi.yml index 3c3a84b..c394b5b 100644 --- a/candi.yml +++ b/candi.yml @@ -2,6 +2,13 @@ name: candi dependencies: - python==3.9 - pandas + - numpy + - polars - configparser - requests - tqdm + - pip + - pip: + - pydeseq2 + - adpbulk + - pytdc \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8504299..8c1dd7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,9 @@ pandas +numpy +polars +anndata configparser requests tqdm - +adpbulk +pydeseq2 \ No newline at end of file From 538cf572c8d80c4b075e24d54994dd082e0f543b Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 01:07:02 -0700 Subject: [PATCH 07/50] fix import --- CanDI/candi/candi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CanDI/candi/candi.py b/CanDI/candi/candi.py index c422fd8..53da444 100644 --- a/CanDI/candi/candi.py +++ b/CanDI/candi/candi.py @@ -5,7 +5,7 @@ import pandas as pd import numpy as np from . import data, grabber -from . import entity +from ..structures import entity class SubsetHandler(object): From 2aa82b62c3e292222f4cf640274177377d935bef Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 01:07:42 -0700 Subject: [PATCH 08/50] mend --- CanDI/candi/__init__.py | 3 ++- CanDI/candi/data.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/CanDI/candi/__init__.py b/CanDI/candi/__init__.py index ad1e50e..a2245f0 100644 --- a/CanDI/candi/__init__.py +++ b/CanDI/candi/__init__.py @@ -1,5 +1,6 @@ from . import load - from . import data + data = data.Data() #Global object data instantiated on import required for access by GeneQuery Objects + from .candi import (Gene, CellLine, Organelle, Cancer, CellLineCluster, GeneCluster) diff --git a/CanDI/candi/data.py b/CanDI/candi/data.py index 5a2921c..b0934b8 100644 --- a/CanDI/candi/data.py +++ b/CanDI/candi/data.py @@ -14,10 +14,15 @@ class Data(object): can be tuned to load specific datasets upon import by editing config.ini can call Data.load() to load any specific dataset """ - def __init__(self): - - self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup' - config_path = self._file_path / 'data/config.ini' + def __init__(self, config_path='auto', verbose=False): + + if config_path == 'auto': + self._file_path = Path(os.path.dirname(os.path.realpath(__file__))).parent.absolute() / 'setup' + config_path = self._file_path / 'data/config.ini' + elif os.path.exists(config_path) == False: + raise FileNotFoundError("Config file not found at {}".format(config_path)) + elif os.path.exists(config_path) == True: + if verbose: print("Using config file at {}".format(config_path)) parser = configparser.ConfigParser() #parses config for data sources parser.read(config_path) From 00e25438e102ad5844ff93a528142e2ea8fdfaef Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 02:05:42 -0700 Subject: [PATCH 09/50] minor fixes --- CanDI/setup/manager.py | 135 +++++++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 60 deletions(-) diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index 8daa587..1a14184 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -1,37 +1,50 @@ +""" +The manager module handles interations with the datasources +and the config file. It is used to setup of the config file upon installation. +All data downloading is done by Manager class and its subclasses. +""" + import os import configparser import json import time import requests -import shutil import pandas as pd from time import sleep from pathlib import Path from concurrent.futures import ThreadPoolExecutor -from .dataverse import depmap_dataverse_download +from .dataverse import depmap_dataverse_download, CANDI_DATAVERSE_DOI class Manager(object): - """The Manager class handles interations with the datasources - and the config file. It is used to setup of the config file upon installation. - All data downloading is done by Manager - """ - def __init__(self, cfig_path='auto', download_source=None, data_dir=None): - - if data_dir: - manager_path = data_dir - else: + def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): + """Initializes the Manager class + + Args: + manager_path (str, optional): The path to the manager directory. This is where the data will be stored. + cfig_path (str, optional): The path to the config file. + """ + if manager_path == 'auto': manager_path = os.path.dirname(os.path.realpath(__file__)) + else: + # make sure the path is a directory and exists or create it + if not os.path.exists(manager_path): + os.makedirs(manager_path) + if cfig_path == 'auto': - - cfig_path = manager_path + "/data/config.ini" + cfig_path = manager_path + "/data/config.ini" + + if verbose: + print(f"Manager Path: {manager_path}") + print(f"Config Path: {cfig_path}") + parser = configparser.ConfigParser() parser.read(cfig_path.replace(".ini", ".draft.ini")) self.manager_path = manager_path self.cfig_path = Path(cfig_path) self.parser = parser - self.download_source = download_source + @staticmethod def write_config(cfig_path, parser): @@ -41,10 +54,55 @@ def write_config(cfig_path, parser): f.close() -class BroadDepMap(Manager): - def __init__(self, cfig_path='auto'): - super().__init__(cfig_path) +class DataverseDepMap(Manager): + def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): + super().__init__(manager_path, cfig_path, verbose) + self.release = '21Q4' # default release uploded to CanDI dataverse + self.download_source = 'dataverse, ' + CANDI_DATAVERSE_DOI + + def download_reformatted_data(self): + # depmap release + if not os.path.exists(self.manager_path + '/data/'): + os.makedirs(self.manager_path + '/data/') + + if not os.path.exists(self.manager_path + '/data/depmap/'): + os.makedirs(self.manager_path + '/data/depmap/') + + if self.download_source == "dataverse": + urls, file_names = depmap_dataverse_download( + self.manager_path + '/data/depmap/', + return_type= ["url", "name"] + ) + depmap_urls = { + file: url for url, file in zip(urls, file_names) + } + + depmap_files = {} + for file in file_names: + f_key = file.split('.')[0] + f_key = f_key.replace('CCLE_','') + f_key = f_key.replace('CRISPR_','') + depmap_files[f_key] = file + + formatted = { + f'{self.manager_path}/data/depmap/{file}': file for file in file_names + if 'readme' not in file.lower() + } + + self.parser["depmap_urls"] = depmap_urls + self.parser["depmap_files"] = depmap_files + self.parser["formatted"] = formatted + + else: + raise RuntimeError("Set download source to 'dataverse' before running download_formated_data") + + +class BroadDepMap(Manager): + def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): + super().__init__(manager_path, cfig_path, verbose) + self.download_source = 'Broad DepMap, https://depmap.org/' + def get_depmap_info(self, release="latest"): depmap = self.parser["download_urls"]["depmap"] @@ -237,52 +295,9 @@ def format_depmap_data(self, df, path, release): formatted[path.split("/")[-1]] = path - def download_reformatted_data(self, depmap_release=''): - if not os.path.exists(self.manager_path + '/data/'): - os.makedirs(self.manager_path + '/data/') - - if not os.path.exists(self.manager_path + '/data/depmap/'): - os.makedirs(self.manager_path + '/data/depmap/') - - if self.download_source == "dataverse": - urls, file_names = depmap_dataverse_download( - self.manager_path + '/data/depmap/', - return_type= ["url", "name"] - ) - - depmap_urls = { - file: url for url, file in zip(urls, file_names) - } - - depmap_files = {} - for file in file_names: - f_key = file.split('.')[0] - f_key = f_key.replace('CCLE_','') - f_key = f_key.replace('CRISPR_','') - depmap_files[f_key] = file - - formatted = { - f'{self.manager_path}/data/depmap/{file}': file for file in file_names - if 'readme' not in file.lower() - } - - self.parser["depmap_urls"] = depmap_urls - self.parser["depmap_files"] = depmap_files - self.parser["formatted"] = formatted - - else: - raise RuntimeError("Set download source to 'dataverse' before running download_formated_data") - class SangerDepMap(Manager): def __init__(self, cfig_path='auto'): super().__init__(cfig_path) def sanger_download(): pass - - -if __name__ == "__main__": - m = Manager() - #m.depmap_download("fusions") - m.depmap_autoformat() - m.write_config(m.cfig_path, m.parser) From 026bb752983efed459bc0155a3f06895f4f4b42e Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 02:05:51 -0700 Subject: [PATCH 10/50] minor fixes --- CanDI/setup/install.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py index 0042e94..c94fd2c 100644 --- a/CanDI/setup/install.py +++ b/CanDI/setup/install.py @@ -1,21 +1,22 @@ import argparse -from .manager import Manager +from manager import DataverseDepMap, BroadDepMap + def main(): parser = argparse.ArgumentParser() parser.add_argument("--source", help="Specify the download source", default="dataverse") - parser.add_argument("--data_dir", help="Specify the data directory", default=None) + parser.add_argument("--data_dir", help="Specify the data directory", default='auto') args = parser.parse_args() if args.source == 'dataverse': print("Downloading data from Dataverse") - m = Manager(download_source=args.source, data_dir=args.data_dir) + m = DataverseDepMap(manager_path=args.data_dir, verbose=True) m.download_reformatted_data() m.write_config(m.cfig_path, m.parser) elif args.source == 'depmap': print("Downloading data from DepMap") - m = Manager(download_source=args.source, data_dir=args.data_dir) + m = BroadDepMap(manager_path=args.data_dir, verbose=True) m.get_depmap_info() m.write_config(m.cfig_path, m.parser) m.download_defaults() From f0904aff66ecb830fec7fcbf4e25094037d1bac7 Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 02:08:25 -0700 Subject: [PATCH 11/50] mend --- CanDI/setup/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index 1a14184..564cc5a 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -13,7 +13,7 @@ from time import sleep from pathlib import Path from concurrent.futures import ThreadPoolExecutor -from .dataverse import depmap_dataverse_download, CANDI_DATAVERSE_DOI +from dataverse import depmap_dataverse_download, CANDI_DATAVERSE_DOI class Manager(object): From fe07d580b5187547115c393a63ca5b21ad0518c3 Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 02:09:56 -0700 Subject: [PATCH 12/50] mend --- CanDI/setup/manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index 564cc5a..84ca218 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -61,7 +61,6 @@ def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): self.download_source = 'dataverse, ' + CANDI_DATAVERSE_DOI def download_reformatted_data(self): - # depmap release if not os.path.exists(self.manager_path + '/data/'): os.makedirs(self.manager_path + '/data/') From 119f5419bd309a93946d989b1fedc5d75ccaa6b8 Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 02:16:06 -0700 Subject: [PATCH 13/50] mend --- CanDI/setup/manager.py | 50 +++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index 84ca218..5d10f88 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -67,34 +67,30 @@ def download_reformatted_data(self): if not os.path.exists(self.manager_path + '/data/depmap/'): os.makedirs(self.manager_path + '/data/depmap/') - if self.download_source == "dataverse": - urls, file_names = depmap_dataverse_download( - self.manager_path + '/data/depmap/', - return_type= ["url", "name"] - ) - - depmap_urls = { - file: url for url, file in zip(urls, file_names) - } - - depmap_files = {} - for file in file_names: - f_key = file.split('.')[0] - f_key = f_key.replace('CCLE_','') - f_key = f_key.replace('CRISPR_','') - depmap_files[f_key] = file - - formatted = { - f'{self.manager_path}/data/depmap/{file}': file for file in file_names - if 'readme' not in file.lower() - } - - self.parser["depmap_urls"] = depmap_urls - self.parser["depmap_files"] = depmap_files - self.parser["formatted"] = formatted + urls, file_names = depmap_dataverse_download( + self.manager_path + '/data/depmap/', + return_type= ["url", "name"] + ) - else: - raise RuntimeError("Set download source to 'dataverse' before running download_formated_data") + depmap_urls = { + file: url for url, file in zip(urls, file_names) + } + + depmap_files = {} + for file in file_names: + f_key = file.split('.')[0] + f_key = f_key.replace('CCLE_','') + f_key = f_key.replace('CRISPR_','') + depmap_files[f_key] = file + + formatted = { + f'{self.manager_path}/data/depmap/{file}': file for file in file_names + if 'readme' not in file.lower() + } + + self.parser["depmap_urls"] = depmap_urls + self.parser["depmap_files"] = depmap_files + self.parser["formatted"] = formatted class BroadDepMap(Manager): From 654f98f0ef3d5d371496ca0507e0e7bc4d353106 Mon Sep 17 00:00:00 2001 From: abearab Date: Sun, 23 Jun 2024 02:24:01 -0700 Subject: [PATCH 14/50] bump version 0.1.2 --- CanDI/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CanDI/__version__.py b/CanDI/__version__.py index cab7576..f20f41e 100644 --- a/CanDI/__version__.py +++ b/CanDI/__version__.py @@ -1 +1 @@ -version = "0.1.1" \ No newline at end of file +version = "0.1.2" \ No newline at end of file From 1a9a0d10ff4f5566015803c9acc4f576fc9b4d31 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 01:36:03 -0700 Subject: [PATCH 15/50] relative import --- CanDI/setup/install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py index c94fd2c..8798f8a 100644 --- a/CanDI/setup/install.py +++ b/CanDI/setup/install.py @@ -1,5 +1,5 @@ import argparse -from manager import DataverseDepMap, BroadDepMap +from .manager import DataverseDepMap, BroadDepMap def main(): From 65effc7dfb9c546659406a987958b56537217052 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 01:38:58 -0700 Subject: [PATCH 16/50] switch to python >3.11 --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index c394b5b..8ccf81d 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ name: candi dependencies: - - python==3.9 + - python>=3.11,<4.0 - pandas - numpy - polars From dcb70b0e20b6f7af6ac5628ceed41404c0710924 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 01:42:37 -0700 Subject: [PATCH 17/50] mend --- environment.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 8ccf81d..6570f0e 100644 --- a/environment.yml +++ b/environment.yml @@ -10,5 +10,4 @@ dependencies: - pip - pip: - pydeseq2 - - adpbulk - - pytdc \ No newline at end of file + - adpbulk \ No newline at end of file From 6675812c4421bbeccc2ba6ef72e6f80fc21cb1bc Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 02:08:46 -0700 Subject: [PATCH 18/50] mend --- CanDI/candi/candi.py | 2 +- CanDI/candi/data.py | 5 +++-- CanDI/setup/install.py | 2 +- CanDI/setup/manager.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/CanDI/candi/candi.py b/CanDI/candi/candi.py index 53da444..eab6e45 100644 --- a/CanDI/candi/candi.py +++ b/CanDI/candi/candi.py @@ -1,6 +1,6 @@ # Classes for handling data aggregations import operator -from collections import OrderedDict, MutableSequence +from collections.abc import MutableSequence import itertools as it import pandas as pd import numpy as np diff --git a/CanDI/candi/data.py b/CanDI/candi/data.py index b0934b8..d653aa0 100644 --- a/CanDI/candi/data.py +++ b/CanDI/candi/data.py @@ -28,12 +28,13 @@ def __init__(self, config_path='auto', verbose=False): parser.read(config_path) self._parser = parser - #self._verify_install() + self._verify_install() self._init_sources() self._init_depmap_paths() - # self._init_index_tables() + self._init_index_tables() def _verify_install(self): #ensures data being loaded is present + #TODO: add more checks for different data sources try: assert "depmap_urls" in self._parser.sections() except AssertionError: diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py index 8798f8a..c94fd2c 100644 --- a/CanDI/setup/install.py +++ b/CanDI/setup/install.py @@ -1,5 +1,5 @@ import argparse -from .manager import DataverseDepMap, BroadDepMap +from manager import DataverseDepMap, BroadDepMap def main(): diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index 5d10f88..caa9024 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -39,7 +39,7 @@ def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): print(f"Config Path: {cfig_path}") parser = configparser.ConfigParser() - parser.read(cfig_path.replace(".ini", ".draft.ini")) + parser.read(cfig_path) #.replace(".ini", ".draft.ini")) self.manager_path = manager_path self.cfig_path = Path(cfig_path) From 1a6703a6e10f492e063a73523552be1df5b0e8ec Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 02:12:39 -0700 Subject: [PATCH 19/50] switch to python >3.11 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c15190e..51db702 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ packages=find_packages(), long_description=long_description, long_description_content_type='text/x-rst', - python_requires='>=3.9', + python_requires='>=3.11,<4.0', install_requires=[ "pandas", "configparser", From 2c8ca703bb9276ad0d49d5485c38630c2ce42b4b Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 02:18:07 -0700 Subject: [PATCH 20/50] draft `Manager` class test --- tests/test_candi.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_candi.py b/tests/test_candi.py index dd64b59..74f1a08 100644 --- a/tests/test_candi.py +++ b/tests/test_candi.py @@ -2,6 +2,7 @@ import pandas as pd import numpy as np from CanDI.structures.entity import Entity +from CanDI.setup.manager import Manager class testEntity(unittest.TestCase): @@ -92,5 +93,6 @@ def test_canc_filters(self): self.assertIsInstance(over, pd.core.frame.DataFrame) self.assertIsInstance(under, pd.core.frame.DataFrame) - - +class testManager(unittest.TestCase): + #TODO: Implement tests for Manager class + pass \ No newline at end of file From 87b92bc0ad043d0364727d9102c1c5f398856253 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 02:20:22 -0700 Subject: [PATCH 21/50] update `.gitignore` --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5c79100..5647774 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,10 @@ CanDI/setup/data/coessentiality *.swp tests.py +build **/__pycache__/ **/.ipynb_checkpoints/ **.pyc -*.DS_store \ No newline at end of file +*.DS_store +*.egg-info From b0a3d18f202c7ab25bcfd0323cfe63fe02984b76 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 02:30:03 -0700 Subject: [PATCH 22/50] relative import --- CanDI/setup/install.py | 2 +- CanDI/setup/manager.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py index c94fd2c..8798f8a 100644 --- a/CanDI/setup/install.py +++ b/CanDI/setup/install.py @@ -1,5 +1,5 @@ import argparse -from manager import DataverseDepMap, BroadDepMap +from .manager import DataverseDepMap, BroadDepMap def main(): diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index caa9024..fa8d629 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -13,7 +13,7 @@ from time import sleep from pathlib import Path from concurrent.futures import ThreadPoolExecutor -from dataverse import depmap_dataverse_download, CANDI_DATAVERSE_DOI +from .dataverse import depmap_dataverse_download, CANDI_DATAVERSE_DOI class Manager(object): From 686cd03c9c458ee2f488d37cdfe03fed34e94ab9 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 02:48:02 -0700 Subject: [PATCH 23/50] add `data_paths` --- CanDI/setup/manager.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index fa8d629..d3233e1 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -88,9 +88,17 @@ def download_reformatted_data(self): if 'readme' not in file.lower() } + data_paths = { + 'depmap': 'data/depmap/', + 'genes': 'data/genes/', + 'corum': 'data/complexes/', + 'location': 'data/location/' + } + self.parser["depmap_urls"] = depmap_urls self.parser["depmap_files"] = depmap_files self.parser["formatted"] = formatted + self.parser["data_paths"] = data_paths class BroadDepMap(Manager): From 2b511fa9936f4cf310e3449127a150dfa591090a Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 02:51:43 -0700 Subject: [PATCH 24/50] switch to python >3.11 --- .github/workflows/python-package.yml | 2 +- .github/workflows/python-publish.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index af02bed..e5e6c9a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9"] # ["3.8", "3.9", "3.10"] + python-version: ["3.11"] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4a1e620..9913050 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -15,7 +15,7 @@ jobs: fail-fast: false matrix: os-version: ["ubuntu-latest"] - python-version: ["3.9"] # ["3.8", "3.9", "3.10"] + python-version: ["3.11"] steps: - uses: actions/checkout@v3 From 8254f37c56f45782e2fe4d17c41c2033874ba1c9 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:08:26 -0700 Subject: [PATCH 25/50] update README --- README.rst | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/README.rst b/README.rst index 8c26095..e1c416c 100644 --- a/README.rst +++ b/README.rst @@ -5,32 +5,19 @@ CanDI - A global cancer data integrator |DOI| |Dataverse| -Package Installation --------------------- +Installation +------------ -CanDI is now available on `PyPI `_ and can be installed with pip: +CanDI is now available on `PyPI `_ and can be installed with pip. +Then, a command from CanDI will automatically download stable datasets from `Dataverse `_. .. code:: bash + # Package Installation pip install PyCanDI -___ -For the latest version (development version) install from GitHub: - -.. code:: bash - - pip install git+https://github.com/GilbertLabUCSF/CanDI.git - - -Prepare Datasets -~~~~~~~~~~~~~~~~ - -The python command from CanDI will automatically download and modify -datasets. - -.. code:: bash - - python CanDI/CanDI/setup/install.py + # Prepare Datasets + candi-install Downloaded and formatted datasets would organize this way: @@ -52,6 +39,11 @@ Downloaded and formatted datasets would organize this way: └── locations └── merged_locations.csv +___ + +Note: Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users +based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. + Package Usage ------------- From 6dc9482d85a7c44be614f942f6acf33163ca6b65 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:09:17 -0700 Subject: [PATCH 26/50] mend --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index e1c416c..0bc6b96 100644 --- a/README.rst +++ b/README.rst @@ -39,7 +39,8 @@ Downloaded and formatted datasets would organize this way: └── locations └── merged_locations.csv -___ + +__ Note: Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. From cee3f98e77fbe51446520b7ec07fcc6a8c996b76 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:11:16 -0700 Subject: [PATCH 27/50] mend --- README.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 0bc6b96..b47a1ff 100644 --- a/README.rst +++ b/README.rst @@ -40,7 +40,9 @@ Downloaded and formatted datasets would organize this way: └── merged_locations.csv -__ +.. |br| raw:: html + +
Note: Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. From de5280949a2b43ea4e6c94e070ae6e73c565e087 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:12:38 -0700 Subject: [PATCH 28/50] mend --- README.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.rst b/README.rst index b47a1ff..2358247 100644 --- a/README.rst +++ b/README.rst @@ -40,9 +40,7 @@ Downloaded and formatted datasets would organize this way: └── merged_locations.csv -.. |br| raw:: html - -
+| Note: Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. From 52f39b5a7cb218d9e3502c82f8111ed6ffb5b966 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:16:41 -0700 Subject: [PATCH 29/50] mend --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 2358247..2af6015 100644 --- a/README.rst +++ b/README.rst @@ -40,10 +40,10 @@ Downloaded and formatted datasets would organize this way: └── merged_locations.csv -| +.. note:: + Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users + based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. -Note: Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users -based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. Package Usage ------------- From 1b00131a695bebaec0d04987ea07ba93cb08758c Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:20:18 -0700 Subject: [PATCH 30/50] mend --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 2af6015..21668b1 100644 --- a/README.rst +++ b/README.rst @@ -42,7 +42,7 @@ Downloaded and formatted datasets would organize this way: .. note:: Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users - based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. + based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. Package Usage From 422b2b360bed47d4ac3f77b5c71886c03121e0a7 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:20:42 -0700 Subject: [PATCH 31/50] mend --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 21668b1..2af6015 100644 --- a/README.rst +++ b/README.rst @@ -42,7 +42,7 @@ Downloaded and formatted datasets would organize this way: .. note:: Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users - based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. + based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. Package Usage From 3f045bc16a2716d1c440e56cbc660d0e985aaea0 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:22:39 -0700 Subject: [PATCH 32/50] mend --- README.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 2af6015..dc83b28 100644 --- a/README.rst +++ b/README.rst @@ -40,9 +40,9 @@ Downloaded and formatted datasets would organize this way: └── merged_locations.csv -.. note:: - Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users - based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets. +**Note:** + *Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users + based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets.* Package Usage From a6cdb6806d15ac0f000e178769b619cde62bf612 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:25:09 -0700 Subject: [PATCH 33/50] mend --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index dc83b28..f76f640 100644 --- a/README.rst +++ b/README.rst @@ -40,7 +40,7 @@ Downloaded and formatted datasets would organize this way: └── merged_locations.csv -**Note:** +.. **Note:**:: *Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets.* From 410d29873e8d0faf4a20980bbb3933949e917d3c Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:26:18 -0700 Subject: [PATCH 34/50] mend --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index f76f640..954cd6e 100644 --- a/README.rst +++ b/README.rst @@ -40,7 +40,7 @@ Downloaded and formatted datasets would organize this way: └── merged_locations.csv -.. **Note:**:: +**Note**: *Currently, DepMap API is not available for public use. Therefore, we are providing the preprocessed datasets for the users based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets.* From 33cf352c0be051b42ad058783916446fd01e6d05 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:30:08 -0700 Subject: [PATCH 35/50] add citation --- README.rst | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 954cd6e..e4bd235 100644 --- a/README.rst +++ b/README.rst @@ -45,7 +45,7 @@ Downloaded and formatted datasets would organize this way: based on DepMap 21Q4 release. DepMap API will be available in the future to download the latest datasets.* -Package Usage +Usage ------------- Import CanDI into python @@ -72,6 +72,17 @@ CanDI Objects - ``GeneCluster`` : Provides cross dataset indexing for a group of user defined genes. +Citation +-------- + +If you use CanDI in your research, please cite the following paper: + +Yogodzinski C, Arab A, Pritchard JR, Goodarzi H, Gilbert LA. +A global cancer data integrator reveals principles of synthetic lethality, sex disparity and immunotherapy. +Genome Med. 2021;13(1):167. Published 2021 Oct 18. doi:10.1186/s13073-021-00987-8 + + + .. |Documentation Status| image:: https://readthedocs.org/projects/candi/badge/?version=latest :target: https://candi.readthedocs.io/en/latest/?badge=latest From 11ee76f298e3f7a2e2433eba77a7d0715b0eda28 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:30:59 -0700 Subject: [PATCH 36/50] mend --- README.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index e4bd235..b0bcaf5 100644 --- a/README.rst +++ b/README.rst @@ -77,9 +77,11 @@ Citation If you use CanDI in your research, please cite the following paper: -Yogodzinski C, Arab A, Pritchard JR, Goodarzi H, Gilbert LA. -A global cancer data integrator reveals principles of synthetic lethality, sex disparity and immunotherapy. -Genome Med. 2021;13(1):167. Published 2021 Oct 18. doi:10.1186/s13073-021-00987-8 +.. code:: bibtex + + Yogodzinski C, Arab A, Pritchard JR, Goodarzi H, Gilbert LA. + A global cancer data integrator reveals principles of synthetic lethality, sex disparity and immunotherapy. + Genome Med. 2021;13(1):167. Published 2021 Oct 18. doi:10.1186/s13073-021-00987-8 From 74034130e348ef7d5ff5ed7548c096938e407e19 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:33:50 -0700 Subject: [PATCH 37/50] add badge --- README.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index b0bcaf5..398e53f 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,7 @@ CanDI - A global cancer data integrator ======================================= -|Documentation Status| -|DOI| -|Dataverse| +|PyPI| |Documentation Status| |DOI| |Dataverse| Installation ------------ @@ -85,6 +83,9 @@ If you use CanDI in your research, please cite the following paper: +.. |PyPI| image:: https://img.shields.io/pypi/v/PyCanDI + :target: https://pypi.org/project/PyCanDI/ + .. |Documentation Status| image:: https://readthedocs.org/projects/candi/badge/?version=latest :target: https://candi.readthedocs.io/en/latest/?badge=latest From c97223c018bd043f2fcca6cf9b7d716226277a88 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 03:35:34 -0700 Subject: [PATCH 38/50] add downloads --- README.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 398e53f..c1c8222 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,7 @@ CanDI - A global cancer data integrator ======================================= -|PyPI| |Documentation Status| |DOI| |Dataverse| +|PyPI| |Downloads| |Documentation Status| |DOI| |Dataverse| Installation ------------ @@ -89,6 +89,9 @@ If you use CanDI in your research, please cite the following paper: .. |Documentation Status| image:: https://readthedocs.org/projects/candi/badge/?version=latest :target: https://candi.readthedocs.io/en/latest/?badge=latest +.. |Downloads| image:: https://static.pepy.tech/badge/pycandi + :target: https://pepy.tech/project/pycandi + .. |DOI| image:: https://zenodo.org/badge/DOI/10.1186/s13073-021-00987-8.svg :target: https://doi.org/10.1186/s13073-021-00987-8 From 198c163772039a64708c0410c6a1cb28aca740b5 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 04:10:08 -0700 Subject: [PATCH 39/50] mend --- CanDI/setup/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index d3233e1..ace40ec 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -39,7 +39,7 @@ def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): print(f"Config Path: {cfig_path}") parser = configparser.ConfigParser() - parser.read(cfig_path) #.replace(".ini", ".draft.ini")) + parser.read(cfig_path.replace(".ini", ".draft.ini")) self.manager_path = manager_path self.cfig_path = Path(cfig_path) From 33083a57f909e7a9cf07caea018b4ac25095811e Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 04:36:42 -0700 Subject: [PATCH 40/50] update .gitignore file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5647774..cdbe076 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ build **.pyc *.DS_store *.egg-info +*.eggs From 940eae37058c3901a855efca895456d8c66eaba9 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 04:37:17 -0700 Subject: [PATCH 41/50] set `include_package_data` as true --- setup.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 51db702..b5a85ff 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,12 @@ name='PyCanDI', description='A cancer data integration package', version=version, - packages=find_packages(), + + packages=find_packages(exclude=['tests', 'test_*']), + long_description=long_description, long_description_content_type='text/x-rst', + python_requires='>=3.11,<4.0', install_requires=[ "pandas", @@ -21,14 +24,21 @@ "tqdm", ], url = 'https://github.com/GilbertLabUCSF/CanDI', + entry_points={ 'console_scripts': [ 'candi-install = CanDI.setup.install:main', + 'candi-uninstall = CanDI.setup.uninstall:main', ], }, + classifiers=[ 'Intended Audience :: Science/Research', 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', ], + + include_package_data=True, + setup_requires=['setuptools_scm'], + ) From 0c812030a4da36fdfd184abf51b3eac158650d73 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 04:37:37 -0700 Subject: [PATCH 42/50] add uninstall scripts --- CanDI/setup/uninstall.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 CanDI/setup/uninstall.py diff --git a/CanDI/setup/uninstall.py b/CanDI/setup/uninstall.py new file mode 100644 index 0000000..c57dd29 --- /dev/null +++ b/CanDI/setup/uninstall.py @@ -0,0 +1,19 @@ +import shutil +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--database", help="Specify the database to uninstall", default="depmap") + parser.add_argument("--data_dir", help="Specify the data directory", default='auto') + args = parser.parse_args() + + if args.database == 'depmap': + print("Uninstalling DepMap data") + shutil.rmtree(args.data_dir + "/data/depmap/") + + else: + raise ValueError("Invalid database. Currently only 'depmap' is supported") + +if __name__ == "__main__": + main() \ No newline at end of file From 1d0e9b1e976e3eabcaee3970b48eaa87478e0f07 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 04:48:03 -0700 Subject: [PATCH 43/50] mend --- CanDI/setup/install.py | 6 +++--- CanDI/setup/uninstall.py | 13 +++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py index 8798f8a..f546909 100644 --- a/CanDI/setup/install.py +++ b/CanDI/setup/install.py @@ -5,18 +5,18 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--source", help="Specify the download source", default="dataverse") - parser.add_argument("--data_dir", help="Specify the data directory", default='auto') + parser.add_argument("--directory", help="Specify the parent data directory", default='auto') args = parser.parse_args() if args.source == 'dataverse': print("Downloading data from Dataverse") - m = DataverseDepMap(manager_path=args.data_dir, verbose=True) + m = DataverseDepMap(manager_path=args.directory, verbose=True) m.download_reformatted_data() m.write_config(m.cfig_path, m.parser) elif args.source == 'depmap': print("Downloading data from DepMap") - m = BroadDepMap(manager_path=args.data_dir, verbose=True) + m = BroadDepMap(manager_path=args.directory, verbose=True) m.get_depmap_info() m.write_config(m.cfig_path, m.parser) m.download_defaults() diff --git a/CanDI/setup/uninstall.py b/CanDI/setup/uninstall.py index c57dd29..bf74942 100644 --- a/CanDI/setup/uninstall.py +++ b/CanDI/setup/uninstall.py @@ -1,17 +1,26 @@ +import os import shutil import argparse +from .manager import Manager def main(): parser = argparse.ArgumentParser() parser.add_argument("--database", help="Specify the database to uninstall", default="depmap") - parser.add_argument("--data_dir", help="Specify the data directory", default='auto') + parser.add_argument("--directory", help="Specify the data parent directory", default='auto') args = parser.parse_args() if args.database == 'depmap': print("Uninstalling DepMap data") - shutil.rmtree(args.data_dir + "/data/depmap/") + m = Manager() + + if args.directory == 'auto': + shutil.rmtree(m.manager_path + "/data/depmap/") + elif os.path.exists(args.directory): + shutil.rmtree(m.manager_path + "/data/depmap/") + else: + raise ValueError("Invalid data directory") else: raise ValueError("Invalid database. Currently only 'depmap' is supported") From 3d1ac9e7fa78ad9fe90ff110a754716287d1feaf Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 05:01:37 -0700 Subject: [PATCH 44/50] mend --- CanDI/setup/uninstall.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/CanDI/setup/uninstall.py b/CanDI/setup/uninstall.py index bf74942..0037a56 100644 --- a/CanDI/setup/uninstall.py +++ b/CanDI/setup/uninstall.py @@ -1,4 +1,5 @@ import os +import sys import shutil import argparse from .manager import Manager @@ -11,16 +12,22 @@ def main(): args = parser.parse_args() if args.database == 'depmap': - print("Uninstalling DepMap data") + print("Uninstalling CanDI: removing DepMap data") m = Manager() if args.directory == 'auto': - shutil.rmtree(m.manager_path + "/data/depmap/") + depmap_path = m.manager_path + "/data/depmap/" elif os.path.exists(args.directory): - shutil.rmtree(m.manager_path + "/data/depmap/") + depmap_path = args.directory + "/data/depmap/" else: - raise ValueError("Invalid data directory") + sys.exit("Exit: Invalid directory path!") + + if not os.path.exists(depmap_path): + sys.exit("Exit: Directory does not contain DepMap data") + else: + os.listdir(depmap_path) + shutil.rmtree(depmap_path) else: raise ValueError("Invalid database. Currently only 'depmap' is supported") From d74c0fa382f5120df5cf09666c68cce23f5e85d1 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 05:20:55 -0700 Subject: [PATCH 45/50] minor debug add `on_bad_lines` option https://stackoverflow.com/questions/18039057/pandas-parser-cparsererror-error-tokenizing-data --- CanDI/candi/data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/CanDI/candi/data.py b/CanDI/candi/data.py index d653aa0..7ae72df 100644 --- a/CanDI/candi/data.py +++ b/CanDI/candi/data.py @@ -97,6 +97,7 @@ def _handle_autoload(method, path): df = pd.read_csv(path, memory_map=True, low_memory=False, + on_bad_lines='warn', index_col="DepMap_ID") elif method == "locations": From 04db890534937885cebe51c1eef6de49bfbe6be1 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 05:33:41 -0700 Subject: [PATCH 46/50] debug --- CanDI/candi/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CanDI/candi/data.py b/CanDI/candi/data.py index 7ae72df..24fafcb 100644 --- a/CanDI/candi/data.py +++ b/CanDI/candi/data.py @@ -97,7 +97,7 @@ def _handle_autoload(method, path): df = pd.read_csv(path, memory_map=True, low_memory=False, - on_bad_lines='warn', + sep='\t', index_col="DepMap_ID") elif method == "locations": From bd6491739ed204ee82deb87c42ff66d6c72d6617 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 05:38:22 -0700 Subject: [PATCH 47/50] debug https://stackoverflow.com/questions/59809785/i-get-a-attributeerror-module-collections-has-no-attribute-iterable-when-i --- CanDI/structures/handlers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CanDI/structures/handlers.py b/CanDI/structures/handlers.py index 4a6b9d5..bbfed65 100644 --- a/CanDI/structures/handlers.py +++ b/CanDI/structures/handlers.py @@ -1,9 +1,10 @@ import operator import pandas as pd import numpy as np -import collections +from collections.abc import Iterable import six + class BinaryFilter: """BinaryFilter class filters datasets based on a specific threshold. It's often useful to filter essentiality, expression, copy number etc. @@ -146,7 +147,7 @@ def _get_variant(mut_dat, variant, item, all_except=False): assert item in mut_dat[variant].unique(), "{0} not found, options are: {1}".format(item, mut_dat[variant].unique()) - if isinstance(item, collections.Iterable) and not isinstance(item, six.string_types): + if isinstance(item, Iterable) and not isinstance(item, six.string_types): method = lambda x,y: mut_dat.loc[mut_dat[x].isin(y)] else: From e6003d213c48412abec10bdf82376b649b6ef172 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 16:11:31 -0700 Subject: [PATCH 48/50] major changes in CanDI setup scripts and harmonize coessentiality installation --- CanDI/candi/load.py | 37 -------- CanDI/setup/dataverse.py | 189 ++++++++++++++++++++++++--------------- CanDI/setup/install.py | 50 +++++++---- CanDI/setup/manager.py | 90 ++++++++++++++++++- 4 files changed, 236 insertions(+), 130 deletions(-) delete mode 100644 CanDI/candi/load.py diff --git a/CanDI/candi/load.py b/CanDI/candi/load.py deleted file mode 100644 index 42e27ef..0000000 --- a/CanDI/candi/load.py +++ /dev/null @@ -1,37 +0,0 @@ -import numpy as np -import pandas as pd -import polars as pl -from CanDI import candi -from pathlib import Path - - -def coessentiality(pvalue_threshold = 10**-3, data_dir='auto'): - if data_dir == 'auto': - data_dir=str(Path(candi.__path__[0]).parent.absolute()) + '/setup/data/coessentiality' - else: - # check if the path exists and it contains the necessary files - if not Path(data_dir).exists(): - raise ValueError(f"Path {data_dir} does not exist") - if not Path(data_dir+'/genes.txt').exists(): - raise ValueError(f"Path {data_dir}/genes.txt does not exist") - if not Path(data_dir+'/GLS_sign.npy').exists(): - raise ValueError(f"Path {data_dir}/GLS_sign.npy does not exist") - if not Path(data_dir+'/GLS_p.npy').exists(): - raise ValueError(f"Path {data_dir}/GLS_p.npy does not exist") - - gene_names = pd.read_csv(f'{data_dir}/genes.txt',header=None,names=['gene_name'])['gene_name'] - - GLS_sign = np.load(f'{data_dir}/GLS_sign.npy') - GLS_p = np.load(f'{data_dir}/GLS_p.npy') - - coessentiality_mat = pd.DataFrame((-1*np.log10(GLS_p)) * GLS_sign, columns = gene_names, index = gene_names).reset_index() - coessentiality_mat = pl.from_dataframe(coessentiality_mat) - - coessentiality_df = coessentiality_mat.melt('gene_name') - coessentiality_df.columns = ['gene_1','gene_2','coessentiality'] - coessentiality_df = coessentiality_df.filter(~(pl.col('gene_1') == pl.col('gene_2'))) - coessentiality_df = coessentiality_df.filter(pl.col('coessentiality') > -np.log10(pvalue_threshold)) - - out = coessentiality_df.to_pandas() - - return out diff --git a/CanDI/setup/dataverse.py b/CanDI/setup/dataverse.py index 1dac195..ea0aee6 100644 --- a/CanDI/setup/dataverse.py +++ b/CanDI/setup/dataverse.py @@ -9,6 +9,18 @@ CANDI_DATAVERSE_DOI = 'doi:10.7910/DVN/JIAT0H' + +### Datasets Metadata ### + +coessentiality_dataset_names = [ + 'genes', + # 10273535 + 'GLS_p', + # 10273534 + 'GLS_sign', + # 10273533 +] + depmap_dataset_names = [ 'CCLE_expression', 'CCLE_fusions', @@ -22,6 +34,11 @@ ] name2type = { + # Coessentiality datasets + 'genes': 'txt', + 'GLS_p': 'npy', + 'GLS_sign': 'npy', + # DepMap datasets 'CCLE_expression': 'csv', 'CCLE_fusions': 'csv', 'CCLE_gene_cn': 'csv', @@ -34,6 +51,11 @@ } name2id = { + # Coessentiality datasets + 'genes': 10273535, + 'GLS_p': 10273534, + 'GLS_sign': 10273533, + # DepMap datasets 'CCLE_expression': 8076862, 'CCLE_fusions': 10085763, 'CCLE_gene_cn': 8076861, @@ -46,6 +68,7 @@ } +### Utility functions ### def print_sys(s): """system print @@ -55,80 +78,102 @@ def print_sys(s): print(s, flush = True, file = sys.stderr) -def dataverse_download(url, path, name, types): - """dataverse download helper with progress bar - - Args: - url (str): the url of the dataset - path (str): the path to save the dataset - name (str): the dataset name - types (dict): a dictionary mapping from the dataset name to the file format - """ - save_path = os.path.join(path, f"{name}.{types[name]}") - response = requests.get(url, stream=True) - total_size_in_bytes = int(response.headers.get("content-length", 0)) - block_size = 1024 - progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) - with open(save_path, "wb") as file: - for data in response.iter_content(block_size): - progress_bar.update(len(data)) - file.write(data) - progress_bar.close() - - -def download_wrapper(name, path, return_type=None): - """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files - - Args: - name (str): the rough dataset query name - path (str): the path to save the dataset - return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"] +### Downloading scripts ### + +class Downloader: + def __init__(self): + pass + + def _dataverse_download(self, url, path, name, types): + """dataverse download helper with progress bar + + Args: + url (str): the url of the dataset + path (str): the path to save the dataset + name (str): the dataset name + types (dict): a dictionary mapping from the dataset name to the file format + """ + save_path = os.path.join(path, f"{name}.{types[name]}") + response = requests.get(url, stream=True) + total_size_in_bytes = int(response.headers.get("content-length", 0)) + block_size = 1024 + progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + with open(save_path, "wb") as file: + for data in response.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + progress_bar.close() + + + def _download_wrapper(self, name, path, return_type=None): + """wrapper for downloading a dataset given the name and path, for csv,pkl,tsv or similar files + + Args: + name (str): the rough dataset query name + path (str): the path to save the dataset + return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"] + + Returns: + str: the exact dataset query name + """ + server_path = "https://dataverse.harvard.edu/api/access/datafile/" + + url = server_path + str(name2id[name]) + + if not os.path.exists(path): + os.mkdir(path) + + file_name = f"{name}.{name2type[name]}" + + if os.path.exists(os.path.join(path, file_name)): + print_sys("Found local copy...") + os.path.join(path, file_name) + else: + print_sys("Downloading...") + self._dataverse_download(url, path, name, name2type) + + if return_type == "url": + return url + elif return_type == "name": + return file_name + elif return_type == ["url", "name"]: + return url, file_name - Returns: - str: the exact dataset query name - """ - server_path = "https://dataverse.harvard.edu/api/access/datafile/" - - url = server_path + str(name2id[name]) - - if not os.path.exists(path): - os.mkdir(path) - - file_name = f"{name}.{name2type[name]}" - - if os.path.exists(os.path.join(path, file_name)): - print_sys("Found local copy...") - os.path.join(path, file_name) - else: - print_sys("Downloading...") - dataverse_download(url, path, name, name2type) - if return_type == "url": - return url - elif return_type == "name": - return file_name - elif return_type == ["url", "name"]: - return url, file_name - - -def depmap_dataverse_download(path, return_type=None): - """download all datasets to the path + def run(self, path, datasets, return_type=None): + """download all datasets to the path + + Args: + path (str): the path to save the datasets + return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"] + """ + url_list = [] + file_names = [] + + for name in datasets: + url, file_name = self._download_wrapper(name, path, return_type=["url", "name"]) + url_list.append(url) + file_names.append(file_name) + + if return_type == "url": + return url_list + elif return_type == "name": + return file_names + elif return_type == ["url", "name"]: + return url_list, file_names + + +class DepMapDownloader(Downloader): + def __init__(self): + super().__init__() + + def download(self, path, return_type=None): + return self.run(path, depmap_dataset_names, return_type) - Args: - path (str): the path to save the datasets - return_type (str, optional): the return type. Defaults to None. Can be "url", "name", or ["url", "name"] - """ - url_list = [] - file_names = [] - for name in depmap_dataset_names: - url, file_name = download_wrapper(name, path, return_type=["url", "name"]) - url_list.append(url) - file_names.append(file_name) +class CoessentialityDownloader(Downloader): + def __init__(self): + super().__init__() - if return_type == "url": - return url_list - elif return_type == "name": - return file_names - elif return_type == ["url", "name"]: - return url_list, file_names + def download(self, path, return_type=None): + return self.run(path, coessentiality_dataset_names, return_type) \ No newline at end of file diff --git a/CanDI/setup/install.py b/CanDI/setup/install.py index f546909..29eed77 100644 --- a/CanDI/setup/install.py +++ b/CanDI/setup/install.py @@ -1,31 +1,45 @@ import argparse -from .manager import DataverseDepMap, BroadDepMap +from . import manager def main(): parser = argparse.ArgumentParser() + parser.add_argument("--database", help="Specify the database to download", default="depmap") parser.add_argument("--source", help="Specify the download source", default="dataverse") parser.add_argument("--directory", help="Specify the parent data directory", default='auto') args = parser.parse_args() - if args.source == 'dataverse': - print("Downloading data from Dataverse") - m = DataverseDepMap(manager_path=args.directory, verbose=True) - m.download_reformatted_data() - m.write_config(m.cfig_path, m.parser) - - elif args.source == 'depmap': - print("Downloading data from DepMap") - m = BroadDepMap(manager_path=args.directory, verbose=True) - m.get_depmap_info() - m.write_config(m.cfig_path, m.parser) - m.download_defaults() - m.write_config(m.cfig_path, m.parser) - m.depmap_autoformat() - m.write_config(m.cfig_path, m.parser) + if args.database == 'depmap': + if args.source == 'dataverse': + print("Downloading data from Dataverse") + m = manager.DataverseDepMap(manager_path=args.directory, verbose=True) + m.download_reformatted_data() + m.write_config(m.cfig_path, m.parser) + + elif args.source == 'depmap': + print("Downloading data from DepMap") + m = manager.BroadDepMap(manager_path=args.directory, verbose=True) + m.get_depmap_info() + m.write_config(m.cfig_path, m.parser) + m.download_defaults() + m.write_config(m.cfig_path, m.parser) + m.depmap_autoformat() + m.write_config(m.cfig_path, m.parser) - else: - raise ValueError("Invalid source. Please specify either 'dataverse' or 'depmap'") + else: + raise ValueError("Invalid source. Please specify either 'dataverse' or 'depmap'") + if args.database == 'coessentiality': + if args.source == 'dataverse': + print("Downloading data from Dataverse") + m = manager.DataverseCoessentiality(manager_path=args.directory, verbose=True) + m.download_raw_files() + m.coessentiality_autoformat() + m.write_config(m.cfig_path, m.parser) + + else: + raise ValueError("Invalid source. Coessentiality data is only available on `dataverse`!") + + if __name__ == "__main__": main() \ No newline at end of file diff --git a/CanDI/setup/manager.py b/CanDI/setup/manager.py index ace40ec..8efacf0 100644 --- a/CanDI/setup/manager.py +++ b/CanDI/setup/manager.py @@ -9,11 +9,13 @@ import json import time import requests +import numpy as np +import polars as pl import pandas as pd from time import sleep from pathlib import Path from concurrent.futures import ThreadPoolExecutor -from .dataverse import depmap_dataverse_download, CANDI_DATAVERSE_DOI +from . import dataverse class Manager(object): @@ -58,7 +60,7 @@ class DataverseDepMap(Manager): def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): super().__init__(manager_path, cfig_path, verbose) self.release = '21Q4' # default release uploded to CanDI dataverse - self.download_source = 'dataverse, ' + CANDI_DATAVERSE_DOI + self.download_source = 'dataverse, ' + dataverse.CANDI_DATAVERSE_DOI def download_reformatted_data(self): if not os.path.exists(self.manager_path + '/data/'): @@ -67,7 +69,8 @@ def download_reformatted_data(self): if not os.path.exists(self.manager_path + '/data/depmap/'): os.makedirs(self.manager_path + '/data/depmap/') - urls, file_names = depmap_dataverse_download( + session = dataverse.DepMapDownloader() + urls, file_names = session.download( self.manager_path + '/data/depmap/', return_type= ["url", "name"] ) @@ -304,3 +307,84 @@ def __init__(self, cfig_path='auto'): def sanger_download(): pass + + +class DataverseCoessentiality(Manager): + def __init__(self, manager_path='auto', cfig_path='auto', verbose=False): + super().__init__(manager_path, cfig_path, verbose) + self.download_source = 'Dataverse' + self.reference = 'https://github.com/kundajelab/coessentiality' + self.verbose = verbose + + def download_raw_files(self): + if not os.path.exists(self.manager_path + '/data/'): + os.makedirs(self.manager_path + '/data/') + + if not os.path.exists(self.manager_path + '/data/coessentiality/'): + os.makedirs(self.manager_path + '/data/coessentiality/') + + session = dataverse.CoessentialityDownloader() + urls, file_names = session.download( + self.manager_path + '/data/coessentiality/', + return_type= ["url", "name"] + ) + + self.urls = urls + self.file_names = file_names + + def _load_coessentiality_matrix(self): + data_dir = f'{self.manager_path}/data/coessentiality' + + gene_names = pd.read_csv( + f'{data_dir}/genes.txt',header=None,names=['gene_name'] + )['gene_name'] + + GLS_sign = np.load(f'{data_dir}/GLS_sign.npy') + GLS_p = np.load(f'{data_dir}/GLS_p.npy') + + self.matrix = pl.from_dataframe( + pd.DataFrame((-1*np.log10(GLS_p)) * GLS_sign, columns = gene_names, index = gene_names).reset_index() + ) + + def _get_coessentiality_df(self, pvalue_threshold = 10**-3): + df = self.matrix.melt('gene_name') + df.columns = ['gene_1','gene_2','coessentiality'] + df = df.filter(~(pl.col('gene_1') == pl.col('gene_2'))) + df = df.filter(pl.col('coessentiality') > -np.log10(pvalue_threshold)) + + self.df = df + self.pvalue_threshold = pvalue_threshold + + def coessentiality_autoformat(self): + + if self.verbose: print("Building Coessentiality Matrix ...", end=' ') + self._load_coessentiality_matrix() + self.matrix.to_pandas().to_csv( + f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv' + ) + if self.verbose: print("Done!") + + if self.verbose: print("Building Coessentiality DataFrame ...", end=' ') + self._get_coessentiality_df() + self.df.to_pandas().to_csv( + f'{self.manager_path}/data/coessentiality/coessentiality_df.csv' + ) + if self.verbose: print("Done!") + + self.parser['data_paths'] = { + 'coessentiality': 'data/coessentiality/' + } + + self.parser['formatted'] = { + 'coessentiality_matrix.csv': f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv', + 'coessentiality_df.csv': f'{self.manager_path}/data/coessentiality/coessentiality_df.csv' + } + + self.parser['depmap_files'] = { + 'coessentiality': f'{self.manager_path}/data/coessentiality/coessentiality_df.csv', + 'coessentiality_matrix': f'{self.manager_path}/data/coessentiality/coessentiality_matrix.csv', + # 'coessentiality_signs': f'{self.manager_path}/data/coessentiality/GLS_sign.npy', + # 'coessentiality_pvalues': f'{self.manager_path}/data/coessentiality/GLS_p.npy', + # 'gene_names': f'{self.manager_path}/data/coessentiality/genes.txt', + # 'pvalue_threshold': self.pvalue_threshold, + } \ No newline at end of file From f2f2aba64ab42afda33c9933ee0daca3c38db855 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 16:38:16 -0700 Subject: [PATCH 49/50] draft coessentiality module --- CanDI/pipelines/coessentiality/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 CanDI/pipelines/coessentiality/__init__.py diff --git a/CanDI/pipelines/coessentiality/__init__.py b/CanDI/pipelines/coessentiality/__init__.py new file mode 100644 index 0000000..e69de29 From fbdfee211b0ea07d8a49abb3f6113696d28074b4 Mon Sep 17 00:00:00 2001 From: abearab Date: Mon, 24 Jun 2024 16:40:17 -0700 Subject: [PATCH 50/50] bump version 0.2.0 --- CanDI/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CanDI/__version__.py b/CanDI/__version__.py index f20f41e..1fee926 100644 --- a/CanDI/__version__.py +++ b/CanDI/__version__.py @@ -1 +1 @@ -version = "0.1.2" \ No newline at end of file +version = "0.2.0" \ No newline at end of file