Skip to content

Commit

Permalink
Merge pull request #620 from iQuxLE/bgee_ingest_split_odd_intersection_symbol
Browse files Browse the repository at this point in the history

splitting apart intersection-symbol IDs in bgee ingest
  • Loading branch information
iQuxLE authored Jan 27, 2025
2 parents 79768f4 + beccbfd commit 24d9e97
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 26 deletions.
51 changes: 33 additions & 18 deletions src/monarch_ingest/ingests/bgee/gene_to_expression_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,20 @@ def write_group(rows: List, koza_app: KozaApp):
koza_app (KozaApp): The KozaApp to use for output of rows.
"""
for row in rows:
association = GeneToExpressionSiteAssociation(
id="uuid:" + str(uuid.uuid1()),
subject="ENSEMBL:" + row['Gene ID'],
predicate='biolink:expressed_in',
object=row['Anatomical entity ID'],
primary_knowledge_source="infores:bgee",
aggregator_knowledge_source=["infores:monarchinitiative"],
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.not_provided,
)

koza_app.write(association)
anatomical_entities = row['Anatomical entity ID'].split(' ∩ ')
for anatomical_entity in anatomical_entities:
association = GeneToExpressionSiteAssociation(
id="uuid:" + str(uuid.uuid1()),
subject="ENSEMBL:" + row['Gene ID'],
predicate='biolink:expressed_in',
object=anatomical_entity.strip(),
primary_knowledge_source="infores:bgee",
aggregator_knowledge_source=["infores:monarchinitiative"],
knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
agent_type=AgentTypeEnum.not_provided,
)

koza_app.write(association)


def get_row_group(koza_app: KozaApp, col: str = 'Gene ID') -> Union[List, None]:
Expand All @@ -68,12 +70,25 @@ def get_row_group(koza_app: KozaApp, col: str = 'Gene ID') -> Union[List, None]:
elif koza_app.previous_row is None:
return None

rows = [koza_app.previous_row]
current_row = koza_app.get_row()

while rows[0][col] == current_row[col]:
rows.append(current_row)
current_row = koza_app.get_row()
rows = []
current_row = koza_app.previous_row

while current_row[col] == koza_app.previous_row[col]:
if " ∩ " in current_row['Anatomical entity ID']:
multiple_entities = [
entity.strip().replace('"','') for entity in current_row['Anatomical entity ID'].split(' ∩ ')
]
for entity in multiple_entities:
split_row = current_row.copy()
split_row['Anatomical entity ID'] = entity.strip()
rows.append(split_row)
else:
rows.append(current_row)

try:
current_row = koza_app.get_row()
except StopIteration:
break

koza_app.previous_row = current_row
return rows
Expand Down
Binary file added tests/unit/bgee/test_bgee_2.tsv.gz
Binary file not shown.
59 changes: 51 additions & 8 deletions tests/unit/bgee/test_bgee_gene_to_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,15 @@ def get_koza_rows(mock_koza: KozaApp, n_rows: int) -> List[Dict]:
"""
rows = []
for i in range(0, n_rows):
rows.append(mock_koza.get_row())
row = mock_koza.get_row()
if " ∩ " in row['Anatomical entity ID']:
entities = [entity.strip().replace('"','') for entity in row['Anatomical entity ID'].split(" ∩ ")]
for entity in entities:
split_row = row.copy()
split_row['Anatomical entity ID'] = entity.strip()
rows.append(split_row)
else:
rows.append(row)
return rows


Expand All @@ -87,14 +95,20 @@ def bgee_test_output_format() -> str:


@pytest.fixture
def bgee_test_files() -> List[str]:
def bgee_test_file() -> List[str]:
return ["tests/unit/bgee/test_bgee.tsv.gz"]

@pytest.fixture
def bgee_test_file_2() -> List[str]:
    """Return the single-element path list pointing at the second Bgee test archive."""
    archive_paths = ["tests/unit/bgee/test_bgee_2.tsv.gz"]
    return archive_paths

@pytest.fixture
def bgee_mock_koza_rows(bgee_yaml, global_table, bgee_test_output, bgee_test_output_format, bgee_test_files) -> KozaApp:
return get_mock_koza(bgee_yaml, global_table, bgee_test_output, bgee_test_output_format, bgee_test_files)
def bgee_mock_koza_rows(bgee_yaml, global_table, bgee_test_output, bgee_test_output_format, bgee_test_file) -> KozaApp:
return get_mock_koza(bgee_yaml, global_table, bgee_test_output, bgee_test_output_format, bgee_test_file)

@pytest.fixture
def bgee_mock_koza_rows_2(
    bgee_yaml,
    global_table,
    bgee_test_output,
    bgee_test_output_format,
    bgee_test_file_2,
) -> KozaApp:
    """Build a mock KozaApp over the second Bgee test file, for reading raw rows."""
    mock_app = get_mock_koza(
        bgee_yaml,
        global_table,
        bgee_test_output,
        bgee_test_output_format,
        bgee_test_file_2,
    )
    return mock_app

@pytest.fixture
def row_group_1(bgee_mock_koza_rows) -> List[Dict]:
Expand All @@ -106,6 +120,10 @@ def row_group_2(bgee_mock_koza_rows) -> List[Dict]:
_ = get_koza_rows(bgee_mock_koza_rows, 5)
return get_koza_rows(bgee_mock_koza_rows, 22)

@pytest.fixture
def row_group_3(bgee_mock_koza_rows_2) -> List[Dict]:
    """Read three source rows from the second Bgee test file via `get_koza_rows`.

    The returned list may be longer than three: `get_koza_rows` expands any
    row whose 'Anatomical entity ID' contains ' ∩ ' into one row per entity.
    """
    source_row_count = 3
    expanded_rows = get_koza_rows(bgee_mock_koza_rows_2, source_row_count)
    return expanded_rows


@pytest.fixture
def filter_col() -> str:
Expand Down Expand Up @@ -146,8 +164,12 @@ def test_filter_group_by_rank_long(row_group_2, filter_col, smallest_n):


@pytest.fixture
def bgee_mock_koza(bgee_yaml, global_table, bgee_test_output, bgee_test_output_format, bgee_test_files) -> KozaApp:
return get_mock_koza(bgee_yaml, global_table, bgee_test_output, bgee_test_output_format, bgee_test_files)
def bgee_mock_koza(bgee_yaml, global_table, bgee_test_output, bgee_test_output_format, bgee_test_file) -> KozaApp:
return get_mock_koza(bgee_yaml, global_table, bgee_test_output, bgee_test_output_format, bgee_test_file)

@pytest.fixture
def bgee_mock_koza_2(
    bgee_yaml,
    global_table,
    bgee_test_output,
    bgee_test_output_format,
    bgee_test_file_2,
) -> KozaApp:
    """Build a mock KozaApp over the second Bgee test file, for the function under test."""
    mock_app = get_mock_koza(
        bgee_yaml,
        global_table,
        bgee_test_output,
        bgee_test_output_format,
        bgee_test_file_2,
    )
    return mock_app


def test_write_group(row_group_1, bgee_mock_koza):
Expand All @@ -165,17 +187,38 @@ def test_write_group(row_group_1, bgee_mock_koza):
assert item.subject == 'ENSEMBL:ENSSSCG00000000002'
assert item.object == object_list[index]

def test_write_group_2(row_group_3, bgee_mock_koza_2):
    """Check `write_group` output for rows taken from the second Bgee test file.

    `row_group_3` holds five rows spanning two genes. Each row must yield
    exactly one GeneToExpressionSiteAssociation with the expected subject,
    object and predicate, and every association id must be unique.
    """
    write_group(row_group_3, bgee_mock_koza_2)
    write_result: list[GeneToExpressionSiteAssociation] = bgee_mock_koza_2._entities
    assert len(write_result) == 5

    object_list = [
        'UBERON:0000473',
        'CL:0000089',
        'UBERON:0000123',
        'UBERON:0000473',
        'CL:0000089',
    ]
    # Expected subjects (gene CURIEs), in the same order as object_list.
    subject_list = [
        'ENSEMBL:ENSSSCG00000000419',
        'ENSEMBL:ENSSSCG00000000419',
        'ENSEMBL:ENSSSCG00000000419',
        'ENSEMBL:ENSSSCG00000000457',
        'ENSEMBL:ENSSSCG00000000457',
    ]

    # Track every id seen so far, so that non-adjacent duplicates are caught
    # too (comparing only against the previous id would miss them).
    seen_ids = set()
    for item, expected_subject, expected_object in zip(write_result, subject_list, object_list):
        assert isinstance(item, GeneToExpressionSiteAssociation)
        assert item.id not in seen_ids
        seen_ids.add(item.id)
        assert item.category == ['biolink:GeneToExpressionSiteAssociation']
        assert item.predicate == 'biolink:expressed_in'
        assert item.subject == expected_subject
        assert item.object == expected_object

def test_get_row_group(bgee_mock_koza, row_group_1, filter_col) -> List:
def test_get_row_group(bgee_mock_koza, row_group_1, filter_col):
row_group = get_row_group(bgee_mock_koza)

assert isinstance(row_group, list)
assert len(row_group) == 5
for i in row_group:
assert isinstance(i, dict)

assert row_group == row_group_1

def test_get_row_group_inter_split(bgee_mock_koza_2, row_group_3, filter_col) -> None:
    """Check that `get_row_group` splits ' ∩ '-joined anatomical entity IDs.

    `get_row_group` is called with its default grouping column ('Gene ID'),
    so it is expected to return only the first group: three rows once the
    intersection IDs are split apart. `row_group_3` is read from the same
    test file through a separate KozaApp and is expected to hold five rows.
    """
    row_group = get_row_group(bgee_mock_koza_2)

    assert isinstance(row_group, list)
    # First group only, after splitting intersection IDs.
    assert len(row_group) == 3
    # Three source rows expanded by `get_koza_rows`.
    assert len(row_group_3) == 5
    for row in row_group:
        assert isinstance(row, dict)

# Skipping process_koza_sources for now: it relies entirely on the functions tested above but reaches deeper into Koza.
# def test_process_koza_source(bgee_mock_koza):
Expand Down

0 comments on commit 24d9e97

Please sign in to comment.