Skip to content

Commit

Permalink
#892 - create manual variant / Variant.format_tuple - do internal ran…
Browse files Browse the repository at this point in the history
…ge format instead of SPDI
  • Loading branch information
davmlaw committed Sep 14, 2023
1 parent e881681 commit 0092752
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 45 deletions.
3 changes: 2 additions & 1 deletion annotation/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from ontology.models import OntologyVersion
from patients.models_enums import GnomADPopulation
from snpdb.models import GenomeBuild, Variant, VariantGridColumn, Q, VCF, DBSNP_PATTERN, VARIANT_PATTERN, \
HGVS_UNCLEANED_PATTERN, Allele
HGVS_UNCLEANED_PATTERN, Allele, VARIANT_SYMBOLIC_PATTERN
from snpdb.models.models_enums import ImportStatus


Expand Down Expand Up @@ -1174,6 +1174,7 @@ def get_entry_type(line: str) -> ManualVariantEntryType:
DBSNP_PATTERN: ManualVariantEntryType.DBSNP,
HGVS_UNCLEANED_PATTERN: ManualVariantEntryType.HGVS,
VARIANT_PATTERN: ManualVariantEntryType.VARIANT,
VARIANT_SYMBOLIC_PATTERN: ManualVariantEntryType.VARIANT,
}
for k, v in MATCHERS.items():
if k.match(line):
Expand Down
1 change: 1 addition & 0 deletions annotation/models/models_enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ class ManualVariantEntryType(models.TextChoices):
DBSNP = "d", "dbSNP"
HGVS = "h", "HGVS"
VARIANT = "v", "Variant"
VARIANT_SYMBOLIC = "s", "Variant (Symbolic)"
UNKNOWN = "u", "Unknown"


Expand Down
2 changes: 1 addition & 1 deletion annotation/tasks/process_manual_variants_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def get_manual_variant_coordinates(mve: ManualVariantEntry) -> List[VariantCoord
elif mve.entry_type == ManualVariantEntryType.HGVS:
variant_coordinates.append(get_hgvs_variant_coordinate(mve.entry_text, mve.genome_build))
elif mve.entry_type == ManualVariantEntryType.VARIANT:
variant_coordinates.append(VariantCoordinate.from_string(mve.entry_text))
variant_coordinates.append(VariantCoordinate.from_string(mve.entry_text, mve.genome_build))
else:
raise ValueError(f"Could not convert entry type of {mve.entry_type}")
return variant_coordinates
Expand Down
21 changes: 16 additions & 5 deletions library/genomics/vcf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,27 @@ def write_vcf_from_tuples(vcf_filename, variant_tuples, tuples_have_id_field=Fal
vcf_tuples = ((chrom, start, end, ".", ref, alt) for (chrom, start, end, ref, alt) in variant_tuples)

vcf_tuples = sorted(vcf_tuples, key=operator.itemgetter(0, 1, 3, 4))

info = [
'##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">',
'##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
]
columns = "\t".join(["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"])
header = "\n".join(["##fileformat=VCFv4.1", "##source=VariantGrid", "#" + columns])
empty_columns = "\t." * 3 # QUAL/FILTER/INFO always empty
header = "\n".join(["##fileformat=VCFv4.1", "##source=VariantGrid"] + info + ["#" + columns])
empty_qual_filter_info = ('.', '.', '.')
with open(vcf_filename, "wt", encoding="utf-8") as f:
f.write(header + "\n")
for vcf_record in vcf_tuples:
(chrom, position, end, id_col, ref, alt) = vcf_record
if alt == Variant.REFERENCE_ALT:
alt = "."
line = "\t".join((chrom, str(position), str(id_col), ref, alt)) + empty_columns
if Sequence.allele_is_symbolic(alt):
svlen = end - position
svtype = alt[1:-1] # Strip off brackets
qual_filter_info = (".", ".", f"SVLEN={svlen};SVTYPE={svtype}")
else:
qual_filter_info = empty_qual_filter_info
if alt == Variant.REFERENCE_ALT:
alt = "."
line = "\t".join((chrom, str(position), str(id_col), ref, alt) + qual_filter_info)
f.write(line + "\n")


Expand Down
64 changes: 27 additions & 37 deletions snpdb/models/models_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
LOCUS_PATTERN = re.compile(r"^([^:]+)\s*:\s*(\d+)[,\s]*([GATC]+)$", re.IGNORECASE)
LOCUS_NO_REF_PATTERN = re.compile(r"^([^:]+)\s*:\s*(\d+)$")
VARIANT_PATTERN = re.compile(r"^(MT|(?:chr)?(?:[XYM]|\d+))\s*:\s*(\d+)[,\s]*([GATC]+)>(=|[GATC]+)$", re.IGNORECASE)
# This is our internal format for symbolic (ie <DEL>/<DUP> etc)
VARIANT_SYMBOLIC_PATTERN = re.compile(r"^(MT|(?:chr)?(?:[XYM]|\d+))\s*:\s*(\d+)\s*-\s*(\d+)\s*<(DEL|DUP|INS|INV|CNV)>$", re.IGNORECASE)
# matches anything hgvs-like before any fixes
HGVS_UNCLEANED_PATTERN = re.compile(r"(^(N[MC]_|ENST)\d+.*:|[cnmg]\.|[^:]:[cnmg]).*\d+", re.IGNORECASE)

Expand Down Expand Up @@ -275,24 +277,37 @@ def as_tuple(self) -> Tuple:
return self.chrom, self.start, self.end, self.ref, self.alt

def __str__(self):
if self.is_symbolic():
s = f"{self.chrom}:{self.start}-{self.end} {self.ref}>{self.alt}"
return self.format()

def format(self):
if Sequence.allele_is_symbolic(self.alt):
return f"{self.chrom}:{self.start}-{self.end} {self.alt}"
else:
s = f"{self.chrom}:{self.start} {self.ref}>{self.alt}"
return s
return f"{self.chrom}:{self.start} {self.ref}>{self.alt}"

@staticmethod
def from_string(variant_string: str, regex_pattern=VARIANT_PATTERN):
# This will only ever match standard (non-symbolic variants)
if full_match := regex_pattern.fullmatch(variant_string):
def from_string(variant_string: str, genome_build=None):
""" Pass in genome build to be able to set REF from symbolic (will be N otherwise) """
if full_match := VARIANT_PATTERN.fullmatch(variant_string):
chrom = full_match.group(1)
start = int(full_match.group(2))
ref = full_match.group(3)
alt = full_match.group(4)
return VariantCoordinate.from_start_only(chrom, start, ref, alt)
elif full_match := VARIANT_SYMBOLIC_PATTERN.fullmatch(variant_string):
chrom, start, end, alt = full_match.groups()
start = int(start)
end = int(end)
alt = "<" + alt + ">" # captured what was inside of brackets ie <()>
if genome_build:
contig_sequence = genome_build.genome_fasta.fasta[chrom]
ref = contig_sequence[start:start+1].upper()
else:
ref = "N"
return VariantCoordinate(chrom, start, end, ref, alt)

raise ValueError(f"{variant_string=} did not match agaisnt {regex_pattern=}")

regex_patterns = ", ".join((str(s) for s in (VARIANT_PATTERN, VARIANT_SYMBOLIC_PATTERN)))
raise ValueError(f"{variant_string=} did not match against {regex_patterns=}")

@staticmethod
def from_start_only(chrom: str, start: int, ref: str, alt: str):
Expand Down Expand Up @@ -493,38 +508,13 @@ def validate(genome_build, chrom, position) -> List[str]:
errors.append(f"Chromsome/contig '{chrom}' not a valid in genome build {genome_build}")
return errors

@staticmethod
def tuple_to_spdi(chrom, start, end, ref, alt) -> str:
""" SPDI: data model for variants and applications at NCBI
@see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7523648/ """
if Sequence.allele_is_symbolic(alt):
length = end - start
if alt == VCFSymbolicAllele.DEL:
d_str = length
i_str = ""
elif alt == VCFSymbolicAllele.INS:
d_str = ""
i_str = length
else:
raise ValueError(f"Don't know how to handle alt={alt}")
else:
d_str = ref
i_str = alt
return f"{chrom}:{start}:{d_str}:{i_str}"

@staticmethod
def format_tuple(chrom, start, end, ref, alt, abbreviate=False) -> str:
is_symbolic = Sequence.allele_is_symbolic(ref) or Sequence.allele_is_symbolic(alt)
if is_symbolic:
if alt == "<CNV>":
# There's not really a format for these
return f"CNV {chrom}:{start}-{end}"
return Variant.tuple_to_spdi(chrom, start, end, ref, alt)

if abbreviate:
if abbreviate and not Sequence.allele_is_symbolic(alt):
ref = Sequence.abbreviate(ref)
alt = Sequence.abbreviate(alt)
return f"{chrom}:{start} {ref}>{alt}"
vc = VariantCoordinate(chrom, start, end, ref, alt)
return vc.format()

@staticmethod
def get_from_string(variant_string: str, genome_build: GenomeBuild) -> Optional['Variant']:
Expand Down
12 changes: 11 additions & 1 deletion upload/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,13 +569,23 @@ class ModifiedImportedVariant(models.Model):
def genome_build(self):
return self.import_info.upload_step.genome_build

@staticmethod
def _to_variant_coordinate(old_variant) -> VariantCoordinate:
if full_match := ModifiedImportedVariant.VT_OLD_VARIANT_PATTERN.fullmatch(old_variant):
chrom = full_match.group(1)
start = int(full_match.group(2))
ref = full_match.group(3)
alt = full_match.group(4)
return VariantCoordinate.from_start_only(chrom, start, ref, alt)
raise ValueError(f"{old_variant} didn't match regex {ModifiedImportedVariant.VT_OLD_VARIANT_PATTERN}")

@staticmethod
def format_old_variant(old_variant: str, genome_build: GenomeBuild) -> List[str]:
""" We need consistent formatting (case and use of chrom) so we can retrieve it easily.
May return multiple values """
formatted_old_variants = []
for ov in ModifiedImportedVariant._split_old_variant(old_variant):
vc = VariantCoordinate.from_string(ov, regex_pattern=ModifiedImportedVariant.VT_OLD_VARIANT_PATTERN)
vc = ModifiedImportedVariant._to_variant_coordinate(ov)
contig = genome_build.chrom_contig_mappings[vc.chrom]
variant_coordinate = VariantCoordinate(contig.name, vc.start, vc.end, vc.ref, vc.alt)
formatted_old_variants.append(ModifiedImportedVariant.get_old_variant_from_variant_coordinate(variant_coordinate))
Expand Down

0 comments on commit 0092752

Please sign in to comment.