Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve core functionalities for CLI #53

Merged
merged 35 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
6f4abe6
Merge branch 'master' into cli
abearab Apr 26, 2024
0f44e20
Merge branch 'master' into cli
abearab May 1, 2024
0cd329e
refactor fq2cnt to counter
abearab May 1, 2024
fc25156
remove `,`
abearab May 2, 2024
a401008
add `map_to_library_single_guide` function
abearab May 2, 2024
ad6210c
add `load_cas9_sgRNA_library` function
abearab May 2, 2024
347f895
update `Counter` module
abearab May 2, 2024
bb226be
convert library to polars
abearab May 2, 2024
35047c5
no index
abearab May 2, 2024
d180345
fix copy
abearab May 2, 2024
b00544d
add verbose
abearab May 2, 2024
0ba6acc
fix a missing variable
abearab May 2, 2024
125d72a
trim first
abearab May 2, 2024
8fbe5dc
rename `recomb` to `recombinant`
abearab May 2, 2024
cb28f0f
support recombinants
abearab May 2, 2024
53fbfc4
fix index
abearab May 2, 2024
2db3596
fix `get_recombinant` input
abearab May 3, 2024
2e834e3
fix column names
abearab May 3, 2024
1f020d8
fix rename for polars
abearab May 3, 2024
2ae6ca3
add `index_col` as input arg
abearab May 5, 2024
1e6c876
add `index_col` as input arg
abearab May 5, 2024
e0e7d63
mend
abearab May 5, 2024
ce044da
try parallel on samples
abearab May 6, 2024
5e19a23
mend
abearab May 6, 2024
0473db7
use multiprocessing
abearab May 6, 2024
efec6d4
typo
abearab May 6, 2024
36d7f82
use multiprocessing
abearab May 6, 2024
0ec06b9
use multiprocessing
abearab May 6, 2024
2691d1f
add todo line
abearab May 6, 2024
186e2dc
use multiprocess
abearab May 6, 2024
abb73d6
add `NotImplementedError` for parallel option
abearab May 6, 2024
d6e5cdd
mend
abearab May 6, 2024
1c27123
allow unmapped return
abearab May 7, 2024
9e71ec7
improve `Counter` class
abearab May 7, 2024
d3c1e2c
new file for counter module
abearab May 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 73 additions & 2 deletions screenpro/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,83 @@
Module for loading screen datasets
"""

# import
import pickle
import pandas as pd

from .utils import check_protospacer_length, trim_protospacer


def load_cas9_sgRNA_library(library_path, library_type, sep='\t', index_col=0, protospacer_length=19, verbose=True):
'''Load Cas9 sgRNA library table for single or dual guide design.
'''
library = pd.read_csv(
library_path,
sep=sep,
index_col=index_col,
)

## Evaluate library table and reformat columns for downstream analysis
# I would like to name the target column 'target' if it is named 'gene'!
if 'gene' in library.columns:
# rename gene column to target
library = library.rename(columns={'gene': 'target'})

if library_type == "single_guide_design":
eval_columns = ['target', 'sgID', 'protospacer']

# Upper case protospacer sequences
library['protospacer'] = library['protospacer'].str.upper()

for col in eval_columns:
if col not in library.columns:
raise ValueError(f"Column '{col}' not found in library table.")

library = library[eval_columns]

elif library_type == "dual_guide_design":
eval_columns = [
'target', 'sgID_AB',
'sgID_A', 'protospacer_A',
'sgID_B', 'protospacer_B',
'sequence'
]

# Upper case protospacer sequences
library['protospacer_A'] = library['protospacer_A'].str.upper()
library['protospacer_B'] = library['protospacer_B'].str.upper()

# # TODO: Enable trimming of protospacer sequences through command line arguments.
for protospacer_col in ['protospacer_A', 'protospacer_B']:
in_length = check_protospacer_length(library, protospacer_col)
if in_length == protospacer_length:
pass
elif in_length > protospacer_length:
if verbose: print(f"Trimming protospacer sequences in '{protospacer_col}' column.")
library = trim_protospacer(
library, protospacer_col,
'5prime',
in_length - protospacer_length
)

elif in_length < protospacer_length:
raise ValueError(
f"Input protospacer length for '{protospacer_col}'is less than {protospacer_length}"
)

# if 'sequence' not in library.columns:
library['sequence'] = library['protospacer_A'] + ';' + library['protospacer_B']

for col in eval_columns:
if col not in library.columns:
raise ValueError(f"Column '{col}' not found in library table.")

library = library[eval_columns]

if verbose: print("Library table successfully loaded.")

return library


# functions
def loadScreenProcessingData(experimentName, collapsedToTranscripts=True, premergedCounts=False):
"""
Load ScreenProcessing outputs
Expand Down
20 changes: 10 additions & 10 deletions screenpro/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
from .__init__ import __version__
from . import ngs

def add_fq2cnt_parser(parent_subparsers, parent):
name = "fq2cnt",
def add_counter_parser(parent_subparsers, parent):
name = "counter"
desc = "Process FASTQ files to count sgRNA sequences."
help = """
Example usage:

`screenpro fq2cnt --single-guide-design -l <path-to-library> -s <path-to-sample-sheet>`
`screenpro fq2cnt --dual-guide-design -l <path-to-library> -s <path-to-sample-sheet>`
`screenpro counter --single-guide-design -l <path-to-library> -s <path-to-sample-sheet>`
`screenpro counter --dual-guide-design -l <path-to-library> -s <path-to-sample-sheet>`

"""

Expand Down Expand Up @@ -91,8 +91,8 @@ def main():

## screenpro commands

# fq2cnt subcommand
fq2cnt_parser = add_fq2cnt_parser(parent_subparsers, parent)
# counter subcommand
counter_parser = add_counter_parser(parent_subparsers, parent)

## Define return values
args = parent_parser.parse_args()
Expand Down Expand Up @@ -124,7 +124,7 @@ def main():

# Show module specific help if only module but no further arguments are given
command_to_parser = {
"fq2cnt": fq2cnt_parser,
"counter": counter_parser,
}

if len(sys.argv) == 2:
Expand All @@ -134,8 +134,8 @@ def main():
parent_parser.print_help(sys.stderr)
sys.exit(1)

## fq2cnt return
if args.command == "fq2cnt":
## counter return
if args.command == "counter":
### Perform checks on input arguments
## Check if library platform is provided by user
if args.single_guide_design:
Expand Down Expand Up @@ -175,5 +175,5 @@ def main():
## 3. Load FASTQ files and count sgRNA sequences
# Save count matrix

# TODO: Implement fq2cnt workflow.
# TODO: Implement counter workflow.
# process FASTQ files to generate counts per sample and then save a count matrix.
101 changes: 2 additions & 99 deletions screenpro/ngs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,103 +33,6 @@
- None of the protospacer A and B is present in the reference library
'''

import pandas as pd
from simple_colors import green

from . import cas9
from . import cas12

class Counter:
'''Class to count sequences from FASTQ files
'''

def __init__(self, cas_type, library_type):
self.cas_type = cas_type
self.library_type = library_type

def load_library(self, library_path):
'''Load library file
'''
if self.cas_type == 'cas9':
library = pd.read_csv(
library_path,
sep='\t',
index_col=0,
)

# I would like to name the target column 'target' if it is named 'gene'!
if 'gene' in library.columns:
# rename gene column to target
library = library.rename(columns={'gene': 'target'})

# Evaluate library table
if self.library_type == "single_guide_design":
pass

elif self.library_type == "dual_guide_design":
# reformat columns for downstream analysis
if 'sgID_AB' in library.columns:
library = library.set_index('sgID_AB')
library = library.rename(
columns={'protospacer_A':'protospacer_a','protospacer_B':'protospacer_b'}
)

if 'sequence' not in library.columns:
# TODO: Enable trimming of protospacer sequences through command line arguments.
library.protospacer_a = library.protospacer_a.str.upper()
library.protospacer_b = library.protospacer_b.str[1:].str.upper()
library['sequence'] = library.protospacer_a + ';' + library.protospacer_b

# check required columns are present:
eval_columns = ['target', 'sequence', 'protospacer_A', 'protospacer_B', 'sequence']
for col in eval_columns:
if col not in library.columns:
raise ValueError(f"Column '{col}' not found in library table.")

library = library[eval_columns]

elif self.cas_type == 'cas12':
raise NotImplementedError("Cas12 library is not yet implemented.")

self.library = library

def get_matrix(self, fastq_dir, samples,library,get_recombinant=False, cas_type='cas9',verbose=False):
'''Get count matrix for given samples
'''
if self.cas_type == 'cas9':
if self.library_type == "single_guide_design":
# TODO: Implement codes to build count matrix for given samples
pass
elif self.library_type == "dual_guide_design":
counts = {}
for sample_id in samples:
if verbose: print(green(sample_id, ['bold']))
df_count = cas9.fastq_to_count_dual_guide(
R1_fastq_file_path=f'{fastq_dir}/{sample_id}_R1.fastq.gz',
R2_fastq_file_path=f'{fastq_dir}/{sample_id}_R2.fastq.gz',
trim5p_pos1_start=1,
trim5p_pos1_length=19,
trim5p_pos2_start=1,
trim5p_pos2_length=19,
verbose=verbose
)
if not get_recombinant:
counts[sample_id] = cas9.map_to_library_dual_guide(
df_count=df_count,
library=library,
get_recombinant=False,
return_type='mapped',
verbose=verbose
)
elif get_recombinant:
raise NotImplementedError("Recombinant count matrix are not yet implemented.")

counts_mat = pd.concat([
counts[sample_id].to_pandas()['count'].rename(sample_id)
for sample_id in counts.keys()
],axis=1).fillna(0)
if cas_type == 'cas12':
# TODO: Implement codes to build count matrix for given samples
raise NotImplementedError("Cas12 count matrix is not yet implemented.")

return counts_mat
from . import cas9
from .counter import Counter
82 changes: 63 additions & 19 deletions screenpro/ngs/cas9.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,15 +87,52 @@ def fastq_to_count_dual_guide(
return df_count


def map_to_library_single_guide(df_count, library, return_type='all', verbose=False):
# get counts for given input
res = df_count.clone() #cheap deepcopy/clone
res = res.sort('count', descending=True)
res = res.with_columns(
pl.col("sequence").alias("sequence"),
)

res_map = pl.DataFrame(library).join(
res, on="sequence", how="left"
)

if return_type == 'unmapped' or return_type == 'all':
res_unmap = res.join(
pl.DataFrame(library), on="sequence", how="anti"
)

if verbose:
print("% mapped reads",
100 * \
res_map.to_pandas()['count'].fillna(0).sum() / \
int(res.select(pl.sum("count")).to_pandas()['count'])
)

if return_type == 'unmapped':
return res_unmap
elif return_type == 'mapped':
return res_map
elif return_type == 'all':
return {'full': res, 'mapped': res_map, 'unmapped': res_unmap}
else:
raise ValueError("return_type must be either 'unmapped', 'mapped', or 'all'")


def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_type='all', verbose=False):
# get counts for given input
res = df_count.copy()
res = df_count.clone() #cheap deepcopy/clone
res = res.rename(
{'protospacer_a':'protospacer_A','protospacer_b':'protospacer_B'}
)
res = res.sort('count', descending=True)
res = res.with_columns(
pl.concat_str(
[
pl.col("protospacer_a"),
pl.col("protospacer_b")
pl.col("protospacer_A"),
pl.col("protospacer_B")
],
separator=";",
).alias("sequence"),
Expand All @@ -104,6 +141,12 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t
res_map = pl.DataFrame(library).join(
res, on="sequence", how="left"
)

if get_recombinant or return_type == 'unmapped' or return_type == 'all':
res_unmap = res.join(
pl.DataFrame(library), on="sequence", how="anti"
)

if verbose:
print("% mapped reads",
100 * \
Expand All @@ -112,9 +155,6 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t
)

if get_recombinant:
res_unmap = res.join(
pl.DataFrame(library), on="sequence", how="anti"
)

if verbose:
print("% unmapped reads",
Expand All @@ -124,12 +164,12 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t
)

res_unmap_remapped_a = res_unmap.join(
pl.DataFrame(library[['sgID_A','protospacer_a']]), on=["protospacer_a"], how="left"
pl.DataFrame(library[['sgID_A','protospacer_A']]), on=["protospacer_A"], how="left"
)

res_recomb_events = res_unmap_remapped_a.join(
pl.DataFrame(library[['sgID_B','protospacer_b']]),
on=["protospacer_b"], how="left"
pl.DataFrame(library[['sgID_B','protospacer_B']]),
on=["protospacer_B"], how="left"
)
if verbose:
print("% fully remapped recombination events",
Expand All @@ -138,15 +178,19 @@ def map_to_library_dual_guide(df_count, library, get_recombinant=False, return_t
int(res.select(pl.sum("count")).to_pandas()['count'])
)

if get_recombinant and return_type == 'all':
sample_count = {'full': res,'mapped': res_map,'recomb': res_recomb_events}
return sample_count
elif get_recombinant and return_type == 'recomb':
return res_recomb_events
elif not get_recombinant and return_type == 'all':
sample_count = {'full': res,'mapped': res_map}
return sample_count
elif not get_recombinant and return_type == 'mapped':
if return_type == 'unmapped':
return res_unmap
elif return_type == 'mapped':
return res_map
elif return_type == 'recombinant':
if get_recombinant:
return res_recomb_events
else:
raise ValueError("get_recombinant must be set to True to calculate recombinant events")
elif return_type == 'all':
if get_recombinant:
return {'full': res,'mapped': res_map,'recombinant': res_recomb_events, 'unmapped': res_unmap}
else:
return {'full': res,'mapped': res_map, 'unmapped': res_unmap}
else:
raise ValueError("return_type must be either 'all' or 'mapped' or 'recomb'")
raise ValueError("return_type must be either 'unmapped', 'mapped', 'recombinant', or 'all'")
Loading
Loading