forked from bminkenberg/CRISPR-PLANTv2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline-commands.txt
98 lines (98 loc) · 12.1 KB
/
pipeline-commands.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# 09-27-2018 Bastian Minkenberg. Commands used to execute CRISPR-PLANTv2 pipeline:
# Step 1: extract every 20 nt spacer followed by an NGG PAM, and every 20 nt spacer followed by an NAG PAM (the NAG set is used later as the alternative-PAM off-target database).
# CUSTOMIZATION STEPS: replace GENOME.FA with the genome of interest, NGG with the PAM of choice, and NAG with the alternative PAM to be considered in the off-target search.
# modified by DL
fuzznuc -complement true \
  -pattern NNNNNNNNNNNNNNNNNNNNNGG \
  -sequence GENOME.FA \
  -outfile 20NNGG_GENOME.fuzznuc
fuzznuc -complement true \
  -pattern NNNNNNNNNNNNNNNNNNNNNAG \
  -sequence GENOME.FA \
  -outfile 20NNAG_GENOME.fuzznuc
# Convert each fuzznuc report into a FASTA file in which every target site gets the unique identifier "Chromosome:start_position-end_position:strand". "strand" is shown as empty for plus or "rc" for minus.
for site_set in 20NNGG 20NNAG; do
  perl perl_scripts/fuzznuc2fasta.pl "${site_set}_GENOME.fuzznuc" > "${site_set}_GENOME.fa"
done
# Mask low-complexity sequence with DUST (hard masking) and keep only the sites that stay 100% unmasked.
vsearch --fastx_mask 20NNGG_GENOME.fa \
  --qmask dust --hardmask \
  --fastaout dusted_20NNGG_GENOME.fa \
  --min_unmasked_pct 100
# Reduce every 20 nt spacer to its 15 nt seed region.
python ./python-scripts/cp_20_to_15.py \
  dusted_20NNGG_GENOME.fa \
  dusted_15NNGG_GENOME.fa
# Search the seeds against themselves for exact matches; seeds without an identical partner end up in the .notmatched file.
vsearch --search_exact dusted_15NNGG_GENOME.fa \
  --db dusted_15NNGG_GENOME.fa \
  --self --strand plus \
  --qmask none --dbmask none \
  --notmatched dusted_15NNGG_GENOME.fa.notmatched
# Map the unique seeds back to their original 20 nt spacers; the result is the spacer candidate list.
python ./python-scripts/cp_15_back_to_20.py \
  dusted_15NNGG_GENOME.fa.notmatched \
  dusted_20NNGG_GENOME.fa \
  GENOME_NGG_spacer_candidates.fa
# Global alignment of all NGG spacer candidates against the NGG candidate set and against all NAG sites. This step may require "divide and conquer" and a supercomputer cluster.
# NOTE(review): the NGG search uses the candidate file itself as its database rather than 20NNGG_GENOME.fa - confirm that this is the intended NGG off-target set.
vsearch --usearch_global GENOME_NGG_spacer_candidates.fa \
  --db GENOME_NGG_spacer_candidates.fa \
  --self --wordlength 7 --minseqlength 1 \
  --maxaccepts 20 --maxrejects 0 --minwordmatches 1 \
  --id 0.65 --strand plus --iddef 4 \
  --userfields query+target+id+alnlen+gaps+mism+ids+qihi+qilo \
  --userout global_all_NGG_to_NGG_GENOME.userout
vsearch --usearch_global GENOME_NGG_spacer_candidates.fa \
  --db 20NNAG_GENOME.fa \
  --self --wordlength 7 --minseqlength 1 \
  --maxaccepts 20 --maxrejects 0 --minwordmatches 1 \
  --id 0.65 --strand plus --iddef 4 \
  --userfields query+target+id+alnlen+gaps+mism+ids+qihi+qilo \
  --userout global_all_NGG_to_NAG_GENOME.userout
# Local alignment of the same queries against the same two databases; results are written in BLAST6 format (.b6). This step may require "divide and conquer" and a supercomputer cluster.
usearch -ublast GENOME_NGG_spacer_candidates.fa \
  -db GENOME_NGG_spacer_candidates.fa \
  -self -wordlength 5 -maxaccepts 20 \
  -id 0.7 -strand plus -evalue 1000 \
  -blast6out local_all_NGG_to_NGG_GENOME.b6
usearch -ublast GENOME_NGG_spacer_candidates.fa \
  -db 20NNAG_GENOME.fa \
  -self -wordlength 5 -maxaccepts 20 \
  -id 0.7 -strand plus -evalue 1000 \
  -blast6out local_all_NGG_to_NAG_GENOME.b6
# Look at the alignment results and extract IDs and difference number of any spacer carrying off-targets with up to 3 differences.
# Both the 2MM and the 3MM tables are built for both the NGG and the NAG searches, because the steps below consume all of them (the NGG 3MM, NAG 3MM and NAG 2MM ID lists drive the classification).
# The local (ublast) results are read from the .b6 files written by "-blast6out" above; the global results from the .userout files written by "--userout".
# If the alignments were run in chunks ("divide and conquer"), concatenate the chunk outputs into the .userout / .b6 files named here before running this step.
python ./python-scripts/cp_global_2MM.py global_all_NGG_to_NGG_GENOME.userout global_all_NGG_to_NGG_GENOME.userout.2MM
python ./python-scripts/cp_global_2MM.py global_all_NGG_to_NAG_GENOME.userout global_all_NGG_to_NAG_GENOME.userout.2MM
python ./python-scripts/cp_global_3MM.py global_all_NGG_to_NGG_GENOME.userout global_all_NGG_to_NGG_GENOME.userout.3MM
python ./python-scripts/cp_global_3MM.py global_all_NGG_to_NAG_GENOME.userout global_all_NGG_to_NAG_GENOME.userout.3MM
python ./python-scripts/cp_local_2MM.py local_all_NGG_to_NGG_GENOME.b6 local_all_NGG_to_NGG_GENOME.b6.2MM
python ./python-scripts/cp_local_2MM.py local_all_NGG_to_NAG_GENOME.b6 local_all_NGG_to_NAG_GENOME.b6.2MM
python ./python-scripts/cp_local_3MM.py local_all_NGG_to_NGG_GENOME.b6 local_all_NGG_to_NGG_GENOME.b6.3MM
python ./python-scripts/cp_local_3MM.py local_all_NGG_to_NAG_GENOME.b6 local_all_NGG_to_NAG_GENOME.b6.3MM
# Remove all target sites from the off-target lists that are only 10 bp away from each other as these should not be considered off-targets
python ./python-scripts/cp_remove_10bp_adjacent_spacers.py global_all_NGG_to_NGG_GENOME.userout.2MM global_all_NGG_to_NGG_GENOME.userout.2MM.10bp
python ./python-scripts/cp_remove_10bp_adjacent_spacers.py global_all_NGG_to_NAG_GENOME.userout.2MM global_all_NGG_to_NAG_GENOME.userout.2MM.10bp
python ./python-scripts/cp_remove_10bp_adjacent_spacers.py global_all_NGG_to_NGG_GENOME.userout.3MM global_all_NGG_to_NGG_GENOME.userout.3MM.10bp
python ./python-scripts/cp_remove_10bp_adjacent_spacers.py global_all_NGG_to_NAG_GENOME.userout.3MM global_all_NGG_to_NAG_GENOME.userout.3MM.10bp
python ./python-scripts/cp_remove_10bp_adjacent_spacers.py local_all_NGG_to_NGG_GENOME.b6.2MM local_all_NGG_to_NGG_GENOME.b6.2MM.10bp
python ./python-scripts/cp_remove_10bp_adjacent_spacers.py local_all_NGG_to_NAG_GENOME.b6.2MM local_all_NGG_to_NAG_GENOME.b6.2MM.10bp
python ./python-scripts/cp_remove_10bp_adjacent_spacers.py local_all_NGG_to_NGG_GENOME.b6.3MM local_all_NGG_to_NGG_GENOME.b6.3MM.10bp
python ./python-scripts/cp_remove_10bp_adjacent_spacers.py local_all_NGG_to_NAG_GENOME.b6.3MM local_all_NGG_to_NAG_GENOME.b6.3MM.10bp
# merge results from global and local alignments
cat global_all_NGG_to_NGG_GENOME.userout.2MM.10bp local_all_NGG_to_NGG_GENOME.b6.2MM.10bp > joined_global_and_local_NGG_GENOME.2MM.10bp
cat global_all_NGG_to_NAG_GENOME.userout.2MM.10bp local_all_NGG_to_NAG_GENOME.b6.2MM.10bp > joined_global_and_local_NAG_GENOME.2MM.10bp
cat global_all_NGG_to_NGG_GENOME.userout.3MM.10bp local_all_NGG_to_NGG_GENOME.b6.3MM.10bp > joined_global_and_local_NGG_GENOME.3MM.10bp
cat global_all_NGG_to_NAG_GENOME.userout.3MM.10bp local_all_NGG_to_NAG_GENOME.b6.3MM.10bp > joined_global_and_local_NAG_GENOME.3MM.10bp
# Take the resulting files and save the off-target IDs (first tab-separated column, de-duplicated) in new files for later comparison.
# The -f option is given before the file operand: option-after-operand only works with GNU argument permutation and fails with BSD/macOS cut or under POSIXLY_CORRECT.
cut -f 1 joined_global_and_local_NGG_GENOME.2MM.10bp | sort | uniq > joined_global_and_local_NGG_GENOME.2MM.ids
cut -f 1 joined_global_and_local_NAG_GENOME.2MM.10bp | sort | uniq > joined_global_and_local_NAG_GENOME.2MM.ids
cut -f 1 joined_global_and_local_NGG_GENOME.3MM.10bp | sort | uniq > joined_global_and_local_NGG_GENOME.3MM.ids
cut -f 1 joined_global_and_local_NAG_GENOME.3MM.10bp | sort | uniq > joined_global_and_local_NAG_GENOME.3MM.ids
# Collect the ID of each spacer candidate: take the lines containing ">" and drop their first character.
grep ">" ./GENOME_NGG_spacer_candidates.fa \
  | cut -c2- \
  > GENOME_NGG_spacer_candidates.ids
# Remove every candidate whose ID appears in the NGG off-target list (up to 3 differences) and store the NGG-specific candidate IDs in a new file.
python ./python-scripts/cp_compare_ids.py \
  joined_global_and_local_NGG_GENOME.3MM.ids \
  GENOME_NGG_spacer_candidates.ids \
  3MM_cleaned_GENOME_NGG_spacer_candidates.ids
# Sort the surviving IDs, then pull the matching records out of the candidate FASTA file.
sort 3MM_cleaned_GENOME_NGG_spacer_candidates.ids \
  > sorted_3MM_cleaned_GENOME_NGG_spacer_candidates.ids
python ./python-scripts/cp_extract_from_candidate_list.py \
  sorted_3MM_cleaned_GENOME_NGG_spacer_candidates.ids \
  GENOME_NGG_spacer_candidates.fa \
  3MM_cleaned_GENOME_NGG_spacers.fa
# Remove first 10 nt to keep 10 nt seed. Compare seeds and classify by A and B depending on unique 10 or 15 nt seed region
# NOTE(review): this script is the only one without the "cp_" prefix used by the other python scripts - confirm the file name in python-scripts/.
python ./python-scripts/20_to_10.py 3MM_cleaned_GENOME_NGG_spacers.fa 10N_3MM_cleaned_GENOME_NGG_spacers.fa
# Exact self-search of the 10 nt seeds: seeds without an identical partner (--notmatched) go to class A, seeds with one (--matched) go to class B.
vsearch --threads 4 --search_exact 10N_3MM_cleaned_GENOME_NGG_spacers.fa --db 10N_3MM_cleaned_GENOME_NGG_spacers.fa --self --strand plus --qmask none --dbmask none --notmatched 10N_A_GENOME_NGG_spacers.fa --matched 10N_B_GENOME_NGG_spacers.fa
# Convert both classes back to the full 20 nt spacers taken from the 3MM-cleaned spacer file.
python ./python-scripts/cp_15_back_to_20.py 10N_B_GENOME_NGG_spacers.fa 3MM_cleaned_GENOME_NGG_spacers.fa B_GENOME_NGG_spacers.fa
python ./python-scripts/cp_15_back_to_20.py 10N_A_GENOME_NGG_spacers.fa 3MM_cleaned_GENOME_NGG_spacers.fa A_GENOME_NGG_spacers.fa
# Repeat the seed comparison against NAG sites instead of NGG sites: any class A or class B spacer whose 15 nt seed exactly matches a 15 nt NAG seed is pushed to category A2 or B2.
# The NAG database is reduced to 15 nt seeds once; each class is then processed on its own.
python ./python-scripts/cp_20_to_15.py 20NNAG_GENOME.fa 15N_20NNAG_GENOME.fa
for class in A B; do
  python ./python-scripts/cp_20_to_15.py \
    "${class}_GENOME_NGG_spacers.fa" \
    "15N_${class}_GENOME_NGG_spacers.fa"
  # Seeds with an exact NAG match go to category 2 (--matched); the rest stay in "<class>_minus_<class>2" (--notmatched).
  vsearch --search_exact "15N_${class}_GENOME_NGG_spacers.fa" \
    --db 15N_20NNAG_GENOME.fa \
    --self --strand plus \
    --qmask none --dbmask none \
    --notmatched "15N_${class}_minus_${class}2_GENOME_NGG_spacers.fa" \
    --matched "15N_${class}2_GENOME_NGG_spacers.fa"
  python ./python-scripts/cp_15_back_to_20.py \
    "15N_${class}2_GENOME_NGG_spacers.fa" \
    3MM_cleaned_GENOME_NGG_spacers.fa \
    "${class}2_GENOME_NGG_spacers.fa"
done
# Build categories A0 and B0. For each class:
#   1. list the IDs of the candidates left after removing category 2,
#   2. compare them with the genome-wide NAG off-target list (up to 3 differences) to create the category 0 IDs,
#   3. sort the category 0 IDs,
#   4. save category 0 in a new fasta file by extraction from the candidate list by ID.
for class in A B; do
  grep '>' "15N_${class}_minus_${class}2_GENOME_NGG_spacers.fa" \
    | cut -c2- \
    > "${class}_minus_${class}2_GENOME_NGG_spacers.ids"
  python ./python-scripts/cp_compare_ids.py \
    joined_global_and_local_NAG_GENOME.3MM.ids \
    "${class}_minus_${class}2_GENOME_NGG_spacers.ids" \
    "${class}0_GENOME_NGG_spacers.ids"
  sort "${class}0_GENOME_NGG_spacers.ids" > "sorted_${class}0_GENOME_NGG_spacers.ids"
  python ./python-scripts/cp_extract_from_candidate_list.py \
    "sorted_${class}0_GENOME_NGG_spacers.ids" \
    GENOME_NGG_spacer_candidates.fa \
    "${class}0_GENOME_NGG_spacers.fa"
done
# Build the 0.1 and 1 categories. For each class:
#   1. subtract the category 0 IDs from the "class minus category 2" IDs,
#   2. remove any IDs with 1 or 2 differences to other target sites (NAG 2MM list) to create the 0.1 category,
#   3. subtract the 0.1 IDs from the leftover of step 1 to create the 1 category.
for class in A B; do
  python ./python-scripts/cp_compare_ids.py \
    "${class}0_GENOME_NGG_spacers.ids" \
    "${class}_minus_${class}2_GENOME_NGG_spacers.ids" \
    "${class}_minus_${class}2_${class}0_GENOME_NGG_spacers.ids"
  python ./python-scripts/cp_compare_ids.py \
    joined_global_and_local_NAG_GENOME.2MM.ids \
    "${class}_minus_${class}2_${class}0_GENOME_NGG_spacers.ids" \
    "${class}0.1_GENOME_NGG_spacers.ids"
  python ./python-scripts/cp_compare_ids.py \
    "${class}0.1_GENOME_NGG_spacers.ids" \
    "${class}_minus_${class}2_${class}0_GENOME_NGG_spacers.ids" \
    "${class}1_GENOME_NGG_spacers.ids"
done
# Sort the ID list of each remaining category before creating the final fasta files.
for category in A0.1 B0.1 A1 B1; do
  sort "${category}_GENOME_NGG_spacers.ids" > "sorted_${category}_GENOME_NGG_spacers.ids"
done
# Use the sorted IDs to extract the target sites from the original candidate list; this creates the final fasta file of each category.
for category in A0.1 B0.1 A1 B1; do
  python ./python-scripts/cp_extract_from_candidate_list.py \
    "sorted_${category}_GENOME_NGG_spacers.ids" \
    GENOME_NGG_spacer_candidates.fa \
    "${category}_GENOME_NGG_spacers.fa"
done
# This concludes the full genome-wide analysis and classification of the CRISPR/Cas9 target sites of a genome. For the CRISPR-PLANTv2 website, we created tables compatible with an SQL database. An example script that converts the fasta files into a searchable table is provided as cp_create_database.py.
# 09-27-2018 Bastian Minkenberg