diff --git a/screenpro/load.py b/screenpro/load.py index 93b599a..5239b87 100644 --- a/screenpro/load.py +++ b/screenpro/load.py @@ -2,12 +2,83 @@ Module for loading screen datasets """ -# import import pickle import pandas as pd +from .utils import check_protospacer_length, trim_protospacer + + +def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, protospacer_length=19, verbose=True): + '''Load Cas9 sgRNA library table for single or dual guide design. + ''' + library = pd.read_csv( + library_path, + sep=sep, + index_col=index_col, + ) + + ## Evaluate library table and reformat columns for downstream analysis + # I would like to name the target column 'target' if it is named 'gene'! + if 'gene' in library.columns: + # rename gene column to target + library = library.rename(columns={'gene': 'target'}) + + if library_type == "single_guide_design": + eval_columns = ['target', 'sgID', 'protospacer'] + + # Upper case protospacer sequences + library['protospacer'] = library['protospacer'].str.upper() + + for col in eval_columns: + if col not in library.columns: + raise ValueError(f"Column '{col}' not found in library table.") + + library = library[eval_columns] + + elif library_type == "dual_guide_design": + eval_columns = [ + 'target', 'sgID_AB', + 'sgID_A', 'protospacer_A', + 'sgID_B', 'protospacer_B', + 'sequence' + ] + + # Upper case protospacer sequences + library['protospacer_A'] = library['protospacer_A'].str.upper() + library['protospacer_B'] = library['protospacer_B'].str.upper() + + # # TODO: Enable trimming of protospacer sequences through command line arguments. + for protospacer_col in ['protospacer_A', 'protospacer_B']: + in_length = check_protospacer_length(library, protospacer_col) + if in_length == protospacer_length: + pass + elif in_length > protospacer_length: + if verbose: print(f"Trimming protospacer sequences in '{protospacer_col}' column.") + library = trim_protospacer( + library, protospacer_col, + '5prime', + in_length - protospacer_length + ) + + elif in_length < protospacer_length: + raise ValueError( + f"Input protospacer length for '{protospacer_col}'is less than {protospacer_length}" + ) + + # if 'sequence' not in library.columns: + library['sequence'] = library['protospacer_A'] + ';' + library['protospacer_B'] + + for col in eval_columns: + if col not in library.columns: + raise ValueError(f"Column '{col}' not found in library table.") + + library = library[eval_columns] + + if verbose: print("Library table successfully loaded.") + + return library + -# functions def loadScreenProcessingData(experimentName, collapsedToTranscripts=True, premergedCounts=False): """ Load ScreenProcessing outputs diff --git a/screenpro/main.py b/screenpro/main.py index 4fffd57..5dad79a 100644 --- a/screenpro/main.py +++ b/screenpro/main.py @@ -7,14 +7,14 @@ from .__init__ import __version__ from . import ngs -def add_fq2cnt_parser(parent_subparsers, parent): - name = "fq2cnt", +def add_counter_parser(parent_subparsers, parent): + name = "counter" desc = "Process FASTQ files to count sgRNA sequences." help = """ Example usage: - `screenpro fq2cnt --single-guide-design -l -s ` - `screenpro fq2cnt --dual-guide-design -l -s ` + `screenpro counter --single-guide-design -l -s ` + `screenpro counter --dual-guide-design -l -s ` """ @@ -91,8 +91,8 @@ def main(): ## screenpro commands - # fq2cnt subcommand - fq2cnt_parser = add_fq2cnt_parser(parent_subparsers, parent) + # counter subcommand + counter_parser = add_counter_parser(parent_subparsers, parent) ## Define return values args = parent_parser.parse_args() @@ -124,7 +124,7 @@ def main(): # Show module specific help if only module but no further arguments are given command_to_parser = { - "fq2cnt": fq2cnt_parser, + "counter": counter_parser, } if len(sys.argv) == 2: @@ -134,8 +134,8 @@ def main(): parent_parser.print_help(sys.stderr) sys.exit(1) - ## fq2cnt return - if args.command == "fq2cnt": + ## counter return + if args.command == "counter": ### Perform checks on input arguments ## Check if library platform is provided by user if args.single_guide_design: @@ -175,5 +175,5 @@ def main(): ## 3. Load FASTQ files and count sgRNA sequences # Save count matrix - # TODO: Implement fq2cnt workflow. + # TODO: Implement counter workflow. # process FASTQ files to generate counts per sample and then save a count matrix. diff --git a/screenpro/ngs/__init__.py b/screenpro/ngs/__init__.py index 8f660ec..2d7ab93 100644 --- a/screenpro/ngs/__init__.py +++ b/screenpro/ngs/__init__.py @@ -33,103 +33,6 @@ - None of the protospacer A and B is present in the reference library ''' -import pandas as pd -from simple_colors import green - -from . import cas9 from . import cas12 - -class Counter: - '''Class to count sequences from FASTQ files - ''' - - def __init__(self, cas_type, library_type): - self.cas_type = cas_type - self.library_type = library_type - - def load_library(self, library_path): - '''Load library file - ''' - if self.cas_type == 'cas9': - library = pd.read_csv( - library_path, - sep='\t', - index_col=0, - ) - - # I would like to name the target column 'target' if it is named 'gene'! - if 'gene' in library.columns: - # rename gene column to target - library = library.rename(columns={'gene': 'target'}) - - # Evaluate library table - if self.library_type == "single_guide_design": - pass - - elif self.library_type == "dual_guide_design": - # reformat columns for downstream analysis - if 'sgID_AB' in library.columns: - library = library.set_index('sgID_AB') - library = library.rename( - columns={'protospacer_A':'protospacer_a','protospacer_B':'protospacer_b'} - ) - - if 'sequence' not in library.columns: - # TODO: Enable trimming of protospacer sequences through command line arguments. - library.protospacer_a = library.protospacer_a.str.upper() - library.protospacer_b = library.protospacer_b.str[1:].str.upper() - library['sequence'] = library.protospacer_a + ';' + library.protospacer_b - - # check required columns are present: - eval_columns = ['target', 'sequence', 'protospacer_A', 'protospacer_B', 'sequence'] - for col in eval_columns: - if col not in library.columns: - raise ValueError(f"Column '{col}' not found in library table.") - - library = library[eval_columns] - - elif self.cas_type == 'cas12': - raise NotImplementedError("Cas12 library is not yet implemented.") - - self.library = library - - def get_matrix(self, fastq_dir, samples,library,get_recombinant=False, cas_type='cas9',verbose=False): - '''Get count matrix for given samples - ''' - if self.cas_type == 'cas9': - if self.library_type == "single_guide_design": - # TODO: Implement codes to build count matrix for given samples - pass - elif self.library_type == "dual_guide_design": - counts = {} - for sample_id in samples: - if verbose: print(green(sample_id, ['bold'])) - df_count = cas9.fastq_to_count_dual_guide( - R1_fastq_file_path=f'{fastq_dir}/{sample_id}_R1.fastq.gz', - R2_fastq_file_path=f'{fastq_dir}/{sample_id}_R2.fastq.gz', - trim5p_pos1_start=1, - trim5p_pos1_length=19, - trim5p_pos2_start=1, - trim5p_pos2_length=19, - verbose=verbose - ) - if not get_recombinant: - counts[sample_id] = cas9.map_to_library_dual_guide( - df_count=df_count, - library=library, - get_recombinant=False, - return_type='mapped', - verbose=verbose - ) - elif get_recombinant: - raise NotImplementedError("Recombinant count matrix are not yet implemented.") - - counts_mat = pd.concat([ - counts[sample_id].to_pandas()['count'].rename(sample_id) - for sample_id in counts.keys() - ],axis=1).fillna(0) - if cas_type == 'cas12': - # TODO: Implement codes to build count matrix for given samples - raise NotImplementedError("Cas12 count matrix is not yet implemented.") - - return counts_mat +from . import cas9 +from .counter import Counter \ No newline at end of file diff --git a/screenpro/ngs/cas9.py b/screenpro/ngs/cas9.py index 1e2f278..edef75f 100644 --- a/screenpro/ngs/cas9.py +++ b/screenpro/ngs/cas9.py @@ -87,15 +87,52 @@ def fastq_to_count_dual_guide( return df_count +def map_to_library_single_guide(df_count, library, return_type='all', verbose=False): + # get counts for given input + res = df_count.clone() #cheap deepcopy/clone + res = res.sort('count', descending=True) + res = res.with_columns( + pl.col("sequence").alias("sequence"), + ) + + res_map = pl.DataFrame(library).join( + res, on="sequence", how="left" + ) + + if return_type == 'unmapped' or return_type == 'all': + res_unmap = res.join( + pl.DataFrame(library), on="sequence", how="anti" + ) + + if verbose: + print("% mapped reads", + 100 * \ + res_map.to_pandas()['count'].fillna(0).sum() / \ + int(res.select(pl.sum("count")).to_pandas()['count']) + ) + + if return_type == 'unmapped': + return res_unmap + elif return_type == 'mapped': + return res_map + elif return_type == 'all': + return {'full': res, 'mapped': res_map, 'unmapped': res_unmap} + else: + raise ValueError("return_type must be either 'unmapped', 'mapped', or 'all'") + + def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_type='all', verbose=False): # get counts for given input - res = df_count.copy() + res = df_count.clone() #cheap deepcopy/clone + res = res.rename( + {'protospacer_a':'protospacer_A','protospacer_b':'protospacer_B'} + ) res = res.sort('count', descending=True) res = res.with_columns( pl.concat_str( [ - pl.col("protospacer_a"), - pl.col("protospacer_b") + pl.col("protospacer_A"), + pl.col("protospacer_B") ], separator=";", ).alias("sequence"), @@ -104,6 +141,12 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t res_map = pl.DataFrame(library).join( res, on="sequence", how="left" ) + + if get_recombinant or return_type == 'unmapped' or return_type == 'all': + res_unmap = res.join( + pl.DataFrame(library), on="sequence", how="anti" + ) + if verbose: print("% mapped reads", 100 * \ @@ -112,9 +155,6 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t ) if get_recombinant: - res_unmap = res.join( - pl.DataFrame(library), on="sequence", how="anti" - ) if verbose: print("% unmapped reads", @@ -124,12 +164,12 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t ) res_unmap_remapped_a = res_unmap.join( - pl.DataFrame(library[['sgID_A','protospacer_a']]), on=["protospacer_a"], how="left" + pl.DataFrame(library[['sgID_A','protospacer_A']]), on=["protospacer_A"], how="left" ) res_recomb_events = res_unmap_remapped_a.join( - pl.DataFrame(library[['sgID_B','protospacer_b']]), - on=["protospacer_b"], how="left" + pl.DataFrame(library[['sgID_B','protospacer_B']]), + on=["protospacer_B"], how="left" ) if verbose: print("% fully remapped recombination events", @@ -138,15 +178,19 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t int(res.select(pl.sum("count")).to_pandas()['count']) ) - if get_recombinant and return_type == 'all': - sample_count = {'full': res,'mapped': res_map,'recomb': res_recomb_events} - return sample_count - elif get_recombinant and return_type == 'recomb': - return res_recomb_events - elif not get_recombinant and return_type == 'all': - sample_count = {'full': res,'mapped': res_map} - return sample_count - elif not get_recombinant and return_type == 'mapped': + if return_type == 'unmapped': + return res_unmap + elif return_type == 'mapped': return res_map + elif return_type == 'recombinant': + if get_recombinant: + return res_recomb_events + else: + raise ValueError("get_recombinant must be set to True to calculate recombinant events") + elif return_type == 'all': + if get_recombinant: + return {'full': res,'mapped': res_map,'recombinant': res_recomb_events, 'unmapped': res_unmap} + else: + return {'full': res,'mapped': res_map, 'unmapped': res_unmap} else: - raise ValueError("return_type must be either 'all' or 'mapped' or 'recomb'") + raise ValueError("return_type must be either 'unmapped', 'mapped', 'recombinant', or 'all'") diff --git a/screenpro/ngs/counter.py b/screenpro/ngs/counter.py new file mode 100644 index 0000000..b3b2c21 --- /dev/null +++ b/screenpro/ngs/counter.py @@ -0,0 +1,217 @@ +import pandas as pd +import polars as pl +import anndata as ad +# import multiprocess as mp + +from . import cas9 +from . import cas12 +from ..load import load_cas9_sgRNA_library +from ..utils import find_low_counts +from simple_colors import green + + +class Counter: + '''Class to count sequences from FASTQ files + ''' + + def __init__(self, cas_type, library_type): + self.cas_type = cas_type + self.library_type = library_type + self.counts_dict = None + self.counts_mat = None + self.recombinants = None + + def load_library(self, library_path, sep='\t', index_col=0, verbose=False): + '''Load library file + ''' + if self.cas_type == 'cas9': + library = load_cas9_sgRNA_library(library_path, library_type=self.library_type, sep=sep, index_col=index_col, verbose=verbose) + + elif self.cas_type == 'cas12': + raise NotImplementedError("Cas12 library is not yet implemented.") + + # covert to polar DataFrame + library = pl.from_pandas(library) + + self.library = library + + def _get_sgRNA_table(self): + + if self.cas_type == 'cas9': + if self.library_type == "single_guide_design": + sgRNA_table = self.library.to_pandas()[['target','sgID','protospacer']].set_index('sgID') + + elif self.library_type == "dual_guide_design": + sgRNA_table = pd.concat([ + self.library.to_pandas()[['target','sgID_A','protospacer_A']].rename(columns={'sgID_A':'sgID','protospacer_A':'protospacer'}), + self.library.to_pandas()[['target','sgID_B', 'protospacer_B']].rename(columns={'sgID_B':'sgID','protospacer_B':'protospacer'}) + ]).set_index('sgID') + + return sgRNA_table + + def get_counts_matrix(self, fastq_dir, samples,get_recombinant=False, cas_type='cas9', parallel=False, verbose=False): + '''Get count matrix for given samples + ''' + if self.cas_type == 'cas9': + counts = {} + + if self.library_type == "single_guide_design": + # TODO: Implement codes to build count matrix for given samples + pass + elif self.library_type == "dual_guide_design": + if get_recombinant: recombinants = {} + + # TODO: Fix the function for parallel processing + def process_sample(sample_id): + if verbose: print(green(sample_id, ['bold'])) + df_count = cas9.fastq_to_count_dual_guide( + R1_fastq_file_path=f'{fastq_dir}/{sample_id}_R1.fastq.gz', + R2_fastq_file_path=f'{fastq_dir}/{sample_id}_R2.fastq.gz', + trim5p_pos1_start=1, + trim5p_pos1_length=19, + trim5p_pos2_start=1, + trim5p_pos2_length=19, + verbose=verbose + ) + if not get_recombinant: + counts[sample_id] = cas9.map_to_library_dual_guide( + df_count=df_count, + library=self.library, + get_recombinant=False, + return_type='mapped', + verbose=verbose + ) + elif get_recombinant: + cnt = cas9.map_to_library_dual_guide( + df_count=df_count, + library=self.library, + get_recombinant=True, + return_type='all', + verbose=verbose + ) + counts[sample_id] = cnt['mapped'] + recombinants[sample_id] = cnt['recombinant'] + + if parallel: + raise NotImplementedError("Parallel processing is not yet implemented.") + # pool = mp.Pool(len(samples)) + # pool.map(process_sample, samples) + # pool.close() + # pool.join() + + else: + for sample_id in samples: + process_sample(sample_id) + + counts_mat = pd.concat([ + counts[sample_id].to_pandas().set_index('sgID_AB')['count'].rename(sample_id) + for sample_id in counts.keys() + ],axis=1).fillna(0) + + if cas_type == 'cas12': + # TODO: Implement codes to build count matrix for given samples + raise NotImplementedError("Cas12 count matrix is not yet implemented.") + + self.counts_dict = counts + self.counts_mat = counts_mat + if get_recombinant: + self.recombinants = recombinants + + def load_counts_matrix(self, counts_mat_path, **kwargs): + '''Load count matrix + ''' + self.counts_mat = pd.read_csv(counts_mat_path, **kwargs) + + def build_counts_anndata(self, source='library'): + '''Build AnnData object from count matrix + ''' + if source == 'recombinant' and self.library_type == "single_guide_design": + raise ValueError("Recombinants are not applicable for single guide design!") + if source == 'recombinant' and self.recombinants is None: + raise ValueError("Recombinants are not available. If applicable, please set get_recombinant=True in get_counts_matrix method.") + + if self.library_type == "single_guide_design": + adata = ad.AnnData( + X = self.counts_mat.T, + var = self.library.to_pandas().set_index('sgID') + ) + + elif self.library_type == "dual_guide_design": + adata = ad.AnnData( + X = self.counts_mat.T, + var = self.library.to_pandas().set_index('sgID_AB') + ) + + if source == 'recombinant': + counts_recombinants = {} + + for sample in self.recombinants.keys(): + d = self.recombinants[sample].drop_nulls() + d = d.to_pandas() + counts_recombinants[sample] = d.set_index(['sgID_A','sgID_B'])['count'] + + counts_recombinants = pd.concat(counts_recombinants,axis=1).fillna(0) + # counts_recombinants = counts_recombinants[counts_recombinants.eq(0).sum(axis=1)<=6] + + counts_recombinants = pd.concat([ + counts_recombinants, + pd.concat([ + # add non-targeting counts from the main count matrix + self.counts_mat[self.counts_mat.index.str.contains('non-targeting')], + # add non-targeting counts from the main count matrix + self.library.to_pandas().set_index('sgID_AB')[['sgID_A','sgID_B']][self.counts_mat.index.str.contains('non-targeting')] + ],axis=1).set_index(['sgID_A','sgID_B']) + ]) + + var_table = pd.DataFrame( + counts_recombinants.index.to_list(), + index = ['|'.join(i) for i in counts_recombinants.index.to_list()], + columns=['sgID_A','sgID_B']) + + var_table.index.name = 'sgID_AB' + + sgRNA_table = self._get_sgRNA_table() + + var_table = pd.concat([ + var_table.reset_index().reset_index(drop=True), + sgRNA_table.loc[var_table['sgID_A']].rename(columns={'target':'target_A', 'protospacer':'protospacer_A'}).reset_index(drop=True), + sgRNA_table.loc[var_table['sgID_B']].rename(columns={'target':'target_B', 'protospacer':'protospacer_B'}).reset_index(drop=True), + ], axis=1).set_index('sgID_AB') + + var_table['targetType'] = '' + + var_table.loc[ + (var_table.target_A.eq('negative_control')) & + (var_table.target_B.eq('negative_control')),'targetType'] = 'negCtrl' + + var_table.loc[ + ~(var_table.target_A.eq('negative_control')) & + (var_table.target_B.eq('negative_control')),'targetType'] = 'gene-ctrl' + + var_table.loc[ + (var_table.target_A.eq('negative_control')) & + ~(var_table.target_B.eq('negative_control')),'targetType'] = 'ctrl-gene' + + var_table.loc[ + ~(var_table.target_A.eq('negative_control')) & + ~(var_table.target_B.eq('negative_control')),'targetType'] = 'gene-gene' + + var_table.index.name = None + + var_table['target'] = var_table['target_A'] + '|' + var_table['target_B'] + var_table['sequence'] = var_table['protospacer_A'] + ';' + var_table['protospacer_B'] + + rdata = ad.AnnData( + X = counts_recombinants.T.to_numpy(), + var = var_table, + obs = adata.obs + ) + + find_low_counts(rdata, minimum_reads=0) + + rdata = rdata[:,~rdata.var.low_count] + + if source == 'mapped' or source == 'library': + return adata + elif source == 'recombinant': + return rdata diff --git a/screenpro/utils.py b/screenpro/utils.py index f668f21..f8ec5de 100644 --- a/screenpro/utils.py +++ b/screenpro/utils.py @@ -2,6 +2,25 @@ import numpy as np +def check_protospacer_length(library, protospacer_col): + lengths = list(set(library[protospacer_col].str.len())) + if len(lengths) > 1: + raise ValueError(f"Protospacer lengths are not uniform: {lengths}") + else: + length = lengths[0] + return length + + +def trim_protospacer(library, protospacer_col, trim_side, trim_len): + if trim_side == '5prime': + library[protospacer_col] = library[protospacer_col].str[trim_len:].str.upper() + + elif trim_side == '3prime': + library[protospacer_col] = library[protospacer_col].str[:-trim_len].str.upper() + + return library + + def find_low_counts(adata, filter_type='either', minimum_reads=50): """ Label variables with low counts in either or all samples.