diff --git a/annotation/models/models.py b/annotation/models/models.py index 7f7b9eda0..e2a980fd6 100644 --- a/annotation/models/models.py +++ b/annotation/models/models.py @@ -37,7 +37,7 @@ from ontology.models import OntologyVersion from patients.models_enums import GnomADPopulation from snpdb.models import GenomeBuild, Variant, VariantGridColumn, Q, VCF, DBSNP_PATTERN, VARIANT_PATTERN, \ - HGVS_UNCLEANED_PATTERN, Allele + HGVS_UNCLEANED_PATTERN, Allele, VARIANT_SYMBOLIC_PATTERN from snpdb.models.models_enums import ImportStatus @@ -1174,6 +1174,7 @@ def get_entry_type(line: str) -> ManualVariantEntryType: DBSNP_PATTERN: ManualVariantEntryType.DBSNP, HGVS_UNCLEANED_PATTERN: ManualVariantEntryType.HGVS, VARIANT_PATTERN: ManualVariantEntryType.VARIANT, + VARIANT_SYMBOLIC_PATTERN: ManualVariantEntryType.VARIANT, } for k, v in MATCHERS.items(): if k.match(line): diff --git a/annotation/models/models_enums.py b/annotation/models/models_enums.py index 787606aad..94e101450 100644 --- a/annotation/models/models_enums.py +++ b/annotation/models/models_enums.py @@ -204,6 +204,7 @@ class ManualVariantEntryType(models.TextChoices): DBSNP = "d", "dbSNP" HGVS = "h", "HGVS" VARIANT = "v", "Variant" + VARIANT_SYMBOLIC = "s", "Variant (Symbolic)" UNKNOWN = "u", "Unknown" diff --git a/annotation/tasks/process_manual_variants_task.py b/annotation/tasks/process_manual_variants_task.py index 7caa39387..c6d258831 100644 --- a/annotation/tasks/process_manual_variants_task.py +++ b/annotation/tasks/process_manual_variants_task.py @@ -25,7 +25,7 @@ def get_manual_variant_coordinates(mve: ManualVariantEntry) -> List[VariantCoord elif mve.entry_type == ManualVariantEntryType.HGVS: variant_coordinates.append(get_hgvs_variant_coordinate(mve.entry_text, mve.genome_build)) elif mve.entry_type == ManualVariantEntryType.VARIANT: - variant_coordinates.append(VariantCoordinate.from_string(mve.entry_text)) + variant_coordinates.append(VariantCoordinate.from_string(mve.entry_text, mve.genome_build)) else: raise ValueError(f"Could not convert entry type of {mve.entry_type}") return variant_coordinates diff --git a/library/genomics/vcf_utils.py b/library/genomics/vcf_utils.py index 5078e14fc..66b84be45 100644 --- a/library/genomics/vcf_utils.py +++ b/library/genomics/vcf_utils.py @@ -44,16 +44,27 @@ def write_vcf_from_tuples(vcf_filename, variant_tuples, tuples_have_id_field=Fal vcf_tuples = ((chrom, start, end, ".", ref, alt) for (chrom, start, end, ref, alt) in variant_tuples) vcf_tuples = sorted(vcf_tuples, key=operator.itemgetter(0, 1, 3, 4)) + + info = [ + '##INFO=', + '##INFO=', + ] columns = "\t".join(["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]) - header = "\n".join(["##fileformat=VCFv4.1", "##source=VariantGrid", "#" + columns]) - empty_columns = "\t." * 3 # QUAL/FILTER/INFO always empty + header = "\n".join(["##fileformat=VCFv4.1", "##source=VariantGrid"] + info + ["#" + columns]) + empty_qual_filter_info = ('.', '.', '.') with open(vcf_filename, "wt", encoding="utf-8") as f: f.write(header + "\n") for vcf_record in vcf_tuples: (chrom, position, end, id_col, ref, alt) = vcf_record - if alt == Variant.REFERENCE_ALT: - alt = "." - line = "\t".join((chrom, str(position), str(id_col), ref, alt)) + empty_columns + if Sequence.allele_is_symbolic(alt): + svlen = end - position + svtype = alt[1:-1] # Strip off brackets + qual_filter_info = (".", ".", f"SVLEN={svlen};SVTYPE={svtype}") + else: + qual_filter_info = empty_qual_filter_info + if alt == Variant.REFERENCE_ALT: + alt = "." + line = "\t".join((chrom, str(position), str(id_col), ref, alt) + qual_filter_info) f.write(line + "\n") diff --git a/snpdb/models/models_variant.py b/snpdb/models/models_variant.py index bcf482842..afb3b3ae8 100644 --- a/snpdb/models/models_variant.py +++ b/snpdb/models/models_variant.py @@ -33,6 +33,8 @@ LOCUS_PATTERN = re.compile(r"^([^:]+)\s*:\s*(\d+)[,\s]*([GATC]+)$", re.IGNORECASE) LOCUS_NO_REF_PATTERN = re.compile(r"^([^:]+)\s*:\s*(\d+)$") VARIANT_PATTERN = re.compile(r"^(MT|(?:chr)?(?:[XYM]|\d+))\s*:\s*(\d+)[,\s]*([GATC]+)>(=|[GATC]+)$", re.IGNORECASE) +# This is our internal format for symbolic (ie / etc) +VARIANT_SYMBOLIC_PATTERN = re.compile(r"^(MT|(?:chr)?(?:[XYM]|\d+))\s*:\s*(\d+)\s*-\s*(\d+)\s*<(DEL|DUP|INS|INV|CNV)>$", re.IGNORECASE) # matches anything hgvs-like before any fixes HGVS_UNCLEANED_PATTERN = re.compile(r"(^(N[MC]_|ENST)\d+.*:|[cnmg]\.|[^:]:[cnmg]).*\d+", re.IGNORECASE) @@ -275,24 +277,37 @@ def as_tuple(self) -> Tuple: return self.chrom, self.start, self.end, self.ref, self.alt def __str__(self): - if self.is_symbolic(): - s = f"{self.chrom}:{self.start}-{self.end} {self.ref}>{self.alt}" + return self.format() + + def format(self): + if Sequence.allele_is_symbolic(self.alt): + return f"{self.chrom}:{self.start}-{self.end} {self.alt}" else: - s = f"{self.chrom}:{self.start} {self.ref}>{self.alt}" - return s + return f"{self.chrom}:{self.start} {self.ref}>{self.alt}" @staticmethod - def from_string(variant_string: str, regex_pattern=VARIANT_PATTERN): - # This will only ever match standard (non-symbolic variants) - if full_match := regex_pattern.fullmatch(variant_string): + def from_string(variant_string: str, genome_build=None): + """ Pass in genome build to be able to set REF from symbolic (will be N otherwise) """ + if full_match := VARIANT_PATTERN.fullmatch(variant_string): chrom = full_match.group(1) start = int(full_match.group(2)) ref = full_match.group(3) alt = full_match.group(4) return VariantCoordinate.from_start_only(chrom, start, ref, alt) + elif full_match := VARIANT_SYMBOLIC_PATTERN.fullmatch(variant_string): + chrom, start, end, alt = full_match.groups() + start = int(start) + end = int(end) + alt = "<" + alt + ">" # captured what was inside of brackets ie <()> + if genome_build: + contig_sequence = genome_build.genome_fasta.fasta[chrom] + ref = contig_sequence[start:start+1].upper() + else: + ref = "N" + return VariantCoordinate(chrom, start, end, ref, alt) - raise ValueError(f"{variant_string=} did not match agaisnt {regex_pattern=}") - + regex_patterns = ", ".join((str(s) for s in (VARIANT_PATTERN, VARIANT_SYMBOLIC_PATTERN))) + raise ValueError(f"{variant_string=} did not match against {regex_patterns=}") @staticmethod def from_start_only(chrom: str, start: int, ref: str, alt: str): @@ -493,38 +508,13 @@ def validate(genome_build, chrom, position) -> List[str]: errors.append(f"Chromsome/contig '{chrom}' not a valid in genome build {genome_build}") return errors - @staticmethod - def tuple_to_spdi(chrom, start, end, ref, alt) -> str: - """ SPDI: data model for variants and applications at NCBI - @see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7523648/ """ - if Sequence.allele_is_symbolic(alt): - length = end - start - if alt == VCFSymbolicAllele.DEL: - d_str = length - i_str = "" - elif alt == VCFSymbolicAllele.INS: - d_str = "" - i_str = length - else: - raise ValueError(f"Don't know how to handle alt={alt}") - else: - d_str = ref - i_str = alt - return f"{chrom}:{start}:{d_str}:{i_str}" - @staticmethod def format_tuple(chrom, start, end, ref, alt, abbreviate=False) -> str: - is_symbolic = Sequence.allele_is_symbolic(ref) or Sequence.allele_is_symbolic(alt) - if is_symbolic: - if alt == "": - # There's not really a format for these - return f"CNV {chrom}:{start}-{end}" - return Variant.tuple_to_spdi(chrom, start, end, ref, alt) - - if abbreviate: + if abbreviate and not Sequence.allele_is_symbolic(alt): ref = Sequence.abbreviate(ref) alt = Sequence.abbreviate(alt) - return f"{chrom}:{start} {ref}>{alt}" + vc = VariantCoordinate(chrom, start, end, ref, alt) + return vc.format() @staticmethod def get_from_string(variant_string: str, genome_build: GenomeBuild) -> Optional['Variant']: diff --git a/upload/models/models.py b/upload/models/models.py index 77dd6aebf..cc4df3cb6 100644 --- a/upload/models/models.py +++ b/upload/models/models.py @@ -569,13 +569,23 @@ class ModifiedImportedVariant(models.Model): def genome_build(self): return self.import_info.upload_step.genome_build + @staticmethod + def _to_variant_coordinate(old_variant) -> VariantCoordinate: + if full_match := ModifiedImportedVariant.VT_OLD_VARIANT_PATTERN.fullmatch(old_variant): + chrom = full_match.group(1) + start = int(full_match.group(2)) + ref = full_match.group(3) + alt = full_match.group(4) + return VariantCoordinate.from_start_only(chrom, start, ref, alt) + raise ValueError(f"{old_variant} didn't match regex {ModifiedImportedVariant.VT_OLD_VARIANT_PATTERN}") + @staticmethod def format_old_variant(old_variant: str, genome_build: GenomeBuild) -> List[str]: """ We need consistent formatting (case and use of chrom) so we can retrieve it easily. May return multiple values """ formatted_old_variants = [] for ov in ModifiedImportedVariant._split_old_variant(old_variant): - vc = VariantCoordinate.from_string(ov, regex_pattern=ModifiedImportedVariant.VT_OLD_VARIANT_PATTERN) + vc = ModifiedImportedVariant._to_variant_coordinate(ov) contig = genome_build.chrom_contig_mappings[vc.chrom] variant_coordinate = VariantCoordinate(contig.name, vc.start, vc.end, vc.ref, vc.alt) formatted_old_variants.append(ModifiedImportedVariant.get_old_variant_from_variant_coordinate(variant_coordinate))