Skip to content

Commit

Permalink
fix: removing duplicates in conditions JSONL (#86)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Jan 19, 2024
1 parent 482bdd0 commit 16edd9b
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 36 deletions.
2 changes: 1 addition & 1 deletion rules/output/annonars/genes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,6 @@ rule output_annonars_genes: # -- build annonars genes RocksDB file
--value date={wildcards.date} \
\
--value v_annonars={wildcards.v_annonars} \
--value v_downloader={PV.downloader}
--value v_downloader={PV.downloader} \
> {output.spec_yaml}
"""
52 changes: 17 additions & 35 deletions scripts/genes-integrate-diseases.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,17 +831,6 @@ def __lt__(self, other: "ResultContainer") -> bool:
)


class GeneDiseaseKey(BaseModel):
"""Key for a gene-disease association."""

model_config = ConfigDict(frozen=True)

#: Gene HGNC ID.
hgnc_id: str
#: Disease database ID.
disease_id: str


class Integrator:
"""Implementation of the integration algorithm."""

Expand Down Expand Up @@ -880,29 +869,26 @@ class Integrator:
def __init__(self):
"""Initialise the integrator."""
#: Mapping from `(gene_hgnc_id, disease_id)` to `GeneDiseaseAssociationEntry`.
self.disease_assocs: Dict[GeneDiseaseKey, GeneDiseaseAssociation] = {}
self.disease_assocs: Dict[str, List[GeneDiseaseAssociation]] = {}
#: Mapping from `hgnc_id` to list of `PanelappAssociation`s.
self.panelapp_assocs: Dict[str, List[PanelappAssociation]] = {}

def register_disease_assoc(self, assoc: GeneDiseaseAssociation):
"""Register a gene-disease association."""
found_list = set()
for disease_id in assoc.disease_ids:
key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id)
if key in self.disease_assocs:
found_list.add(self.disease_assocs[key])
found_list = self.disease_assocs.get(assoc.hgnc_id, [])
if not found_list:
for disease_id in assoc.disease_ids:
key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id)
self.disease_assocs[key] = assoc
self.disease_assocs[assoc.hgnc_id] = [assoc]
else:
if len(found_list) != 1:
logger.warning(f"Found multiple associations for {assoc.hgnc_id}")
merged_any = False
new_list = []
for found in found_list:
found = found.merge(assoc)
for disease_id in assoc.disease_ids:
key = GeneDiseaseKey(hgnc_id=assoc.hgnc_id, disease_id=disease_id)
self.disease_assocs[key] = found
if set(assoc.disease_ids) & set(found.disease_ids):
merged_any = True
found = found.merge(assoc)
new_list.append(found)
if not merged_any:
new_list.append(assoc)
self.disease_assocs[assoc.hgnc_id] = new_list

def run(self, pickle_path: Optional[str] = None):
logger.info("Building gene-disease map...")
Expand All @@ -918,7 +904,8 @@ def run(self, pickle_path: Optional[str] = None):
for hgnc_id in sorted(
set(
chain(
(k.hgnc_id for k in self.disease_assocs.keys()), self.panelapp_assocs.keys()
(hgnc_id for hgnc_id in self.disease_assocs.keys()),
self.panelapp_assocs.keys(),
)
)
)
Expand All @@ -929,14 +916,9 @@ def run(self, pickle_path: Optional[str] = None):
"panelapp_associations": tuple(sorted(assocs, key=lambda a: a.confidence_level))
}
)
for key, assoc in self.disease_assocs.items():
disease_assoc = conditions_by_hgnc[key.hgnc_id].disease_associations
conditions_by_hgnc[key.hgnc_id] = conditions_by_hgnc[key.hgnc_id].model_copy(
update={
"disease_associations": tuple(
sorted(chain(disease_assoc, [assoc]), key=lambda a: a.confidence)
)
}
for hgnc_id, assocs in self.disease_assocs.items():
conditions_by_hgnc[hgnc_id] = conditions_by_hgnc[hgnc_id].model_copy(
update={"disease_associations": tuple(sorted(assocs, key=lambda a: a.confidence))}
)
result = ResultContainer(results=tuple(conditions_by_hgnc.values()))
for assoc in result.results:
Expand Down

0 comments on commit 16edd9b

Please sign in to comment.