diff --git a/one-offs/hpc/peaks_liftover.bash b/one-offs/hpc/peaks_liftover.bash new file mode 100644 index 00000000..20f032ad --- /dev/null +++ b/one-offs/hpc/peaks_liftover.bash @@ -0,0 +1,14 @@ +#!/bin/bash +#SBATCH --mem=8G +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 +#SBATCH --time=24:00:00 +#SBATCH --account=rrg-bourqueg-ad + +module load nixpkgs/16.09 gcc/7.3.0 +module load kentutils/20180716 +module load python/3.8 + +source env/bin/activate + +python3 ./peaks_liftover.py diff --git a/one-offs/hpc/peaks_liftover.py b/one-offs/hpc/peaks_liftover.py new file mode 100644 index 00000000..f4a7e835 --- /dev/null +++ b/one-offs/hpc/peaks_liftover.py @@ -0,0 +1,141 @@ +import csv +import pysam +import subprocess +import tqdm + +from typing import Dict, List, Set, Tuple + + +HG38_CHAIN_FILE = "/home/dlough2/epivar/hg19ToHg38.over.chain" +HG38_VCF = "/home/dlough2/scratch/epivar/allSamples.hc.vqsr.mil.snpId.snpeff.dbnsfp.GRCh38.vcf.gz" + + +def lift_over_peaks(peaks_to_lift_over: Set[str]) -> Dict[str, str]: # return dict { old: new } + lift_over_in_bed = "./peaks-liftover-in.bed" + lift_over_out_bed = "./peaks-liftover-out.bed" + + peaks = list(peaks_to_lift_over) + + with open(lift_over_in_bed, "w") as fh: + for pi, p in enumerate(peaks): + fh.write("\t".join(p.split("_")) + f"\tpeak{pi}\n") + + subprocess.run(( + "liftOver", + lift_over_in_bed, + HG38_CHAIN_FILE, + lift_over_out_bed, + "./lift-over-discard", + "-bedPlus=3", + "-tab", + "-minMatch=0.75", # reduce minimum number of bases needed to remap, so we get as many peaks as possible + )) + + lifted_over_peaks: List[Tuple[str, str]] = [] + with open(lift_over_out_bed, "r") as fh: + for line in fh: + line_data = line.strip().split("\t") + lifted_over_peaks.append(("\t".join(line_data[:3]), line_data[3])) + + res: Dict[str, str] = {} + + peaks_iter = iter(enumerate(peaks)) + lifted_over_peaks_iter = iter(lifted_over_peaks) + + pi, p = next(peaks_iter) + for lifted_p, lifted_p_name in lifted_over_peaks_iter: + while lifted_p_name != f"peak{pi}": + pi, p = next(peaks_iter) + # names are the same, so we have a successful mapping + res[p] = lifted_p + + return res + + +def load_qtl_rows_and_snps(fp: str, feature_type: str) -> Tuple[List[dict], Set[str], Set[str]]: + qtl_rows: List[dict] = [] + peaks_to_lift_over: Set[str] = set() + snps_to_lift_over: Set[str] = set() + + with open(fp, "r", newline="") as fh: + reader = csv.DictReader(fh, delimiter=",", quotechar='"') + for p in tqdm.tqdm(filter(lambda pp: pp["feature_type"] == feature_type, reader), desc="qtl peaks"): + feature = p["feature"] + if feature.startswith("chr") and len(feature.split("_")) != 3: + continue + qtl_rows.append(p) + if feature.startswith("chr"): + peaks_to_lift_over.add(feature) + snps_to_lift_over.add(p["rsID"]) + + return qtl_rows, peaks_to_lift_over, snps_to_lift_over + + +def lift_over_qtl_rows(qtl_rows: List[dict], snps_to_lift_over: Set[str], lifted_over_peaks: Dict[str, str]): + vcf = pysam.VariantFile(HG38_VCF) + lifted_over_snps: Dict[str, str] = {} # dict of { rsID: chr#_######## } + + for contig in vcf.header.contigs: + if "_" in contig: + continue + + print(f"Lifting over SNPs in {contig}") + + for variant in tqdm.tqdm(vcf.fetch(contig), desc="vcf"): + if variant.id not in snps_to_lift_over: + continue + lifted_over_snps[variant.id] = f"{contig}_{variant.pos}" + + print(f"{len(lifted_over_snps)} lifted over SNPs") + + for row in qtl_rows: + if (rs_id := row["rsID"]) in lifted_over_snps: + feature: str = row["feature"] + yield { + **row, + "snp": lifted_over_snps[rs_id], + "feature": lifted_over_peaks[feature] if feature.startswith("chr") else feature, + } + + +def main(): + gene_peaks: List[dict] = [] + peaks_to_lift_over: Set[str] = set() + + with open("./flu-infection-gene-peaks.csv", "r", newline="") as fh: + reader = csv.DictReader(fh, delimiter=",", quotechar='"') + for p in tqdm.tqdm(reader, desc="gene-peaks"): + gene_peaks.append(p) + peak = p["peak_ids"] + if len(peak.split("_")) == 3: + peaks_to_lift_over.add(peak) + + qtl_rows: List[dict] + snps_to_lift_over: Set[str] + qtl_rows, pq, snps_to_lift_over = load_qtl_rows_and_snps("./QTLs_complete_RNA-seq.csv", "RNA-seq") + peaks_to_lift_over |= pq + + print(f"{len(peaks_to_lift_over)} peaks to lift over") + print(f"{len(snps_to_lift_over)} SNPs to lift over") + + lifted_over_peaks: Dict[str, str] = lift_over_peaks(peaks_to_lift_over) + print(f"{len(lifted_over_peaks)} lifted over peaks") + + with open("./flu-infection-gene-peaks-hg38.csv", "w") as fh: + writer = csv.DictWriter(fh, ("symbol", "peak_ids", "feature_type"), delimiter=",", quotechar='"') + for gp in gene_peaks: + if (lifted_over_peak := lifted_over_peaks.get(gp["peak_ids"])) is not None: + writer.writerow({**gp, "peak_ids": lifted_over_peak}) + + with open("./QTLs_complete_RNA-seq_hg38.csv", "w") as fh: + writer = csv.DictWriter( + fh, + ("rsID", "snp", "feature", "pvalue.NI", "pvalue.Flu", "feature_type"), + delimiter=",", + quotechar='"') + for row in lift_over_qtl_rows(qtl_rows, snps_to_lift_over, lifted_over_peaks): + writer.writerow(row) + + +if __name__ == "__main__": + main()