ArcInstitute · abearab · May 7, 2024 · Apr 26, 2024 · May 1, 2024 · May 1, 2024
diff --git a/screenpro/load.py b/screenpro/load.py
@@ -2,12 +2,83 @@
 Module for loading screen datasets
 """
 
-# import
 import pickle
 import pandas as pd
 
+from .utils import check_protospacer_length, trim_protospacer
+
+
+def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, protospacer_length=19, verbose=True):
+    '''Load Cas9 sgRNA library table for single or dual guide design.
+    '''
+    library = pd.read_csv(
+        library_path,
+        sep=sep,
+        index_col=index_col,
+    )
+
+    ## Evaluate library table and reformat columns for downstream analysis
+    # I would like to name the target column 'target' if it is named 'gene'!
+    if 'gene' in library.columns:
+        # rename gene column to target
+        library = library.rename(columns={'gene': 'target'})
+
+    if library_type == "single_guide_design":
+        eval_columns = ['target', 'sgID', 'protospacer']
+
+        # Upper case protospacer sequences
+        library['protospacer'] = library['protospacer'].str.upper()
+
+        for col in eval_columns:
+            if col not in library.columns:
+                raise ValueError(f"Column '{col}' not found in library table.")
+
+        library = library[eval_columns]
+
+    elif library_type == "dual_guide_design":
+        eval_columns = [
+            'target', 'sgID_AB',
+            'sgID_A', 'protospacer_A', 
+            'sgID_B', 'protospacer_B', 
+            'sequence'
+        ]
+
+        # Upper case protospacer sequences
+        library['protospacer_A'] = library['protospacer_A'].str.upper()
+        library['protospacer_B'] = library['protospacer_B'].str.upper()
+
+        # # TODO: Enable trimming of protospacer sequences through command line arguments.
+        for protospacer_col in ['protospacer_A', 'protospacer_B']:
+            in_length = check_protospacer_length(library, protospacer_col) 
+            if in_length == protospacer_length:
+                pass
+            elif in_length > protospacer_length:
+                if verbose: print(f"Trimming protospacer sequences in '{protospacer_col}' column.")
+                library = trim_protospacer(
+                    library, protospacer_col, 
+                    '5prime', 
+                    in_length - protospacer_length
+                )
+
+            elif in_length < protospacer_length:
+                raise ValueError(
+                    f"Input protospacer length for '{protospacer_col}'is less than {protospacer_length}"
+                )
+
+        # if 'sequence' not in library.columns:
+        library['sequence'] = library['protospacer_A'] + ';' + library['protospacer_B']
+
+        for col in eval_columns:
+            if col not in library.columns:
+                raise ValueError(f"Column '{col}' not found in library table.")
+
+        library = library[eval_columns]
+
+    if verbose: print("Library table successfully loaded.")
+
+    return library
+
 
-# functions
 def loadScreenProcessingData(experimentName, collapsedToTranscripts=True, premergedCounts=False):
     """
     Load ScreenProcessing outputs

diff --git a/screenpro/main.py b/screenpro/main.py
@@ -7,14 +7,14 @@
 from .__init__ import __version__
 from . import ngs
 
-def add_fq2cnt_parser(parent_subparsers, parent):
-    name = "fq2cnt",
+def add_counter_parser(parent_subparsers, parent):
+    name = "counter"
     desc = "Process FASTQ files to count sgRNA sequences."
     help = """
     Example usage:
 
-        `screenpro fq2cnt --single-guide-design -l <path-to-library> -s <path-to-sample-sheet>`
-        `screenpro fq2cnt --dual-guide-design -l <path-to-library> -s <path-to-sample-sheet>`
+        `screenpro counter --single-guide-design -l <path-to-library> -s <path-to-sample-sheet>`
+        `screenpro counter --dual-guide-design -l <path-to-library> -s <path-to-sample-sheet>`
 
     """
 
@@ -91,8 +91,8 @@ def main():
 
     ## screenpro commands
 
-    # fq2cnt subcommand
-    fq2cnt_parser = add_fq2cnt_parser(parent_subparsers, parent)
+    # counter subcommand
+    counter_parser = add_counter_parser(parent_subparsers, parent)
 
     ## Define return values
     args = parent_parser.parse_args()
@@ -124,7 +124,7 @@ def main():
 
     # Show module specific help if only module but no further arguments are given
     command_to_parser = {
-        "fq2cnt": fq2cnt_parser,
+        "counter": counter_parser,
     }
 
     if len(sys.argv) == 2:
@@ -134,8 +134,8 @@ def main():
             parent_parser.print_help(sys.stderr)
         sys.exit(1)
 
-    ## fq2cnt return
-    if args.command == "fq2cnt":
+    ## counter return
+    if args.command == "counter":
         ### Perform checks on input arguments
         ## Check if library platform is provided by user
         if args.single_guide_design:
@@ -175,5 +175,5 @@ def main():
         ## 3. Load FASTQ files and count sgRNA sequences
         # Save count matrix
 
-        # TODO: Implement fq2cnt workflow.
+        # TODO: Implement counter workflow.
         # process FASTQ files to generate counts per sample and then save a count matrix.
diff --git a/screenpro/ngs/__init__.py b/screenpro/ngs/__init__.py
@@ -33,103 +33,6 @@
 - None of the protospacer A and B is present in the reference library
 '''
 
-import pandas as pd
-from simple_colors import green
-
-from . import cas9
 from . import cas12
-
-class Counter:
-    '''Class to count sequences from FASTQ files
-    '''
-
-    def __init__(self, cas_type, library_type):
-        self.cas_type = cas_type
-        self.library_type = library_type        
-
-    def load_library(self, library_path):
-        '''Load library file
-        '''
-        if self.cas_type == 'cas9':
-            library = pd.read_csv(
-                library_path,
-                sep='\t',
-                index_col=0,
-            )
-
-            # I would like to name the target column 'target' if it is named 'gene'!
-            if 'gene' in library.columns:
-                # rename gene column to target
-                library = library.rename(columns={'gene': 'target'})
-
-            # Evaluate library table
-            if self.library_type == "single_guide_design":
-                pass
-
-            elif self.library_type == "dual_guide_design":
-                # reformat columns for downstream analysis
-                if 'sgID_AB' in library.columns:
-                    library = library.set_index('sgID_AB')
-                library = library.rename(
-                    columns={'protospacer_A':'protospacer_a','protospacer_B':'protospacer_b'}
-                )
-
-                if 'sequence' not in library.columns:
-                    # TODO: Enable trimming of protospacer sequences through command line arguments.
-                    library.protospacer_a = library.protospacer_a.str.upper()
-                    library.protospacer_b = library.protospacer_b.str[1:].str.upper()
-                    library['sequence'] = library.protospacer_a + ';' + library.protospacer_b
-
-                # check required columns are present:
-                eval_columns = ['target', 'sequence', 'protospacer_A', 'protospacer_B', 'sequence']
-                for col in eval_columns:
-                    if col not in library.columns:
-                        raise ValueError(f"Column '{col}' not found in library table.")
-
-                library = library[eval_columns]
-
-        elif self.cas_type == 'cas12':
-            raise NotImplementedError("Cas12 library is not yet implemented.")
-
-        self.library = library
-
-    def get_matrix(self, fastq_dir, samples,library,get_recombinant=False, cas_type='cas9',verbose=False):
-        '''Get count matrix for given samples
-        '''
-        if self.cas_type == 'cas9':
-            if self.library_type == "single_guide_design":
-                # TODO: Implement codes to build count matrix for given samples
-                pass
-            elif self.library_type == "dual_guide_design":
-                counts = {}
-                for sample_id in samples:
-                    if verbose: print(green(sample_id, ['bold']))
-                    df_count = cas9.fastq_to_count_dual_guide(
-                        R1_fastq_file_path=f'{fastq_dir}/{sample_id}_R1.fastq.gz',
-                        R2_fastq_file_path=f'{fastq_dir}/{sample_id}_R2.fastq.gz',
-                        trim5p_pos1_start=1,
-                        trim5p_pos1_length=19,
-                        trim5p_pos2_start=1,
-                        trim5p_pos2_length=19,
-                        verbose=verbose
-                    )
-                    if not get_recombinant:
-                        counts[sample_id] = cas9.map_to_library_dual_guide(
-                            df_count=df_count,
-                            library=library,
-                            get_recombinant=False,
-                            return_type='mapped',
-                            verbose=verbose
-                        )
-                    elif get_recombinant:
-                        raise NotImplementedError("Recombinant count matrix are not yet implemented.")
-
-            counts_mat = pd.concat([
-                counts[sample_id].to_pandas()['count'].rename(sample_id)
-                for sample_id in counts.keys()
-            ],axis=1).fillna(0)
-        if cas_type == 'cas12':
-            # TODO: Implement codes to build count matrix for given samples
-            raise NotImplementedError("Cas12 count matrix is not yet implemented.")
-
-        return counts_mat
+from . import cas9
+from .counter import Counter
diff --git a/screenpro/ngs/cas9.py b/screenpro/ngs/cas9.py
@@ -87,15 +87,52 @@ def fastq_to_count_dual_guide(
     return df_count
 
 
+def map_to_library_single_guide(df_count, library, return_type='all', verbose=False):
+    # get counts for given input
+    res = df_count.clone() #cheap deepcopy/clone
+    res = res.sort('count', descending=True)
+    res = res.with_columns(
+        pl.col("sequence").alias("sequence"),
+    )
+
+    res_map = pl.DataFrame(library).join(
+            res, on="sequence", how="left"
+        )
+
+    if return_type == 'unmapped' or return_type == 'all':
+        res_unmap = res.join(
+            pl.DataFrame(library), on="sequence", how="anti"
+        )
+
+    if verbose:
+        print("% mapped reads",
+            100 * \
+            res_map.to_pandas()['count'].fillna(0).sum() / \
+            int(res.select(pl.sum("count")).to_pandas()['count'])
+        )
+
+    if return_type == 'unmapped':
+        return res_unmap
+    elif return_type == 'mapped':
+        return res_map
+    elif return_type == 'all':
+        return {'full': res, 'mapped': res_map, 'unmapped': res_unmap}
+    else:
+        raise ValueError("return_type must be either 'unmapped', 'mapped', or 'all'")
+
+
 def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_type='all', verbose=False):
     # get counts for given input
-    res = df_count.copy()
+    res = df_count.clone() #cheap deepcopy/clone
+    res = res.rename(
+        {'protospacer_a':'protospacer_A','protospacer_b':'protospacer_B'}
+    )
     res = res.sort('count', descending=True)
     res = res.with_columns(
         pl.concat_str(
             [
-                pl.col("protospacer_a"),
-                pl.col("protospacer_b")
+                pl.col("protospacer_A"),
+                pl.col("protospacer_B")
             ],
             separator=";",
         ).alias("sequence"),
@@ -104,6 +141,12 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t
     res_map = pl.DataFrame(library).join(
             res, on="sequence", how="left"
         )
+
+    if get_recombinant or return_type == 'unmapped' or return_type == 'all':
+        res_unmap = res.join(
+            pl.DataFrame(library), on="sequence", how="anti"
+        )
+
     if verbose:
         print("% mapped reads",
             100 * \
@@ -112,9 +155,6 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t
         )
 
     if get_recombinant:
-        res_unmap = res.join(
-            pl.DataFrame(library), on="sequence", how="anti"
-        )
 
         if verbose:
             print("% unmapped reads",
@@ -124,12 +164,12 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t
             )
 
         res_unmap_remapped_a = res_unmap.join(
-            pl.DataFrame(library[['sgID_A','protospacer_a']]), on=["protospacer_a"], how="left"
+            pl.DataFrame(library[['sgID_A','protospacer_A']]), on=["protospacer_A"], how="left"
         )
 
         res_recomb_events = res_unmap_remapped_a.join(
-            pl.DataFrame(library[['sgID_B','protospacer_b']]), 
-            on=["protospacer_b"], how="left"
+            pl.DataFrame(library[['sgID_B','protospacer_B']]), 
+            on=["protospacer_B"], how="left"
         )
         if verbose:            
             print("% fully remapped recombination events",
@@ -138,15 +178,19 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t
                 int(res.select(pl.sum("count")).to_pandas()['count'])
             )
 
-    if get_recombinant and return_type == 'all':
-        sample_count = {'full': res,'mapped': res_map,'recomb': res_recomb_events}
-        return sample_count
-    elif get_recombinant and return_type == 'recomb':
-        return res_recomb_events
-    elif not get_recombinant and return_type == 'all':
-        sample_count = {'full': res,'mapped': res_map}
-        return sample_count
-    elif not get_recombinant and return_type == 'mapped':
+    if return_type == 'unmapped':
+        return res_unmap
+    elif return_type == 'mapped':
         return res_map
+    elif return_type == 'recombinant':
+        if get_recombinant:
+            return res_recomb_events
+        else:
+            raise ValueError("get_recombinant must be set to True to calculate recombinant events")
+    elif return_type == 'all':
+        if get_recombinant:
+            return {'full': res,'mapped': res_map,'recombinant': res_recomb_events, 'unmapped': res_unmap}
+        else:
+            return {'full': res,'mapped': res_map, 'unmapped': res_unmap}
     else:
-        raise ValueError("return_type must be either 'all' or 'mapped' or 'recomb'")
+        raise ValueError("return_type must be either 'unmapped', 'mapped', 'recombinant', or 'all'")