This repository has been archived by the owner on Jan 13, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
hpo_demo.py
69 lines (54 loc) · 1.95 KB
/
hpo_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 16 10:25:39 2019
@author: danielbean
"""
## load HPO ontology into a directed graph in memory and use it to build
## an example risk score
from HPOLocalMapper import HPOLocalMapper
import ScoreBuilder as sb
import examples.Scorer as scorer
import json
import pandas as pd
# ---------------------------------------------------------------------------
# 1. Build the score definition
# ---------------------------------------------------------------------------
# Create the HPO mapper and parse the HPO .obo ontology file.
hpo_mapper = HPOLocalMapper({})
obo_file = 'HPO/hpo_2019-04-15.obo.txt'
hpo_mapper.parse(obo_file)

# Create a builder instance.
builder = sb.ScoreBuilder()

# Register the HPO mapper under the name "HPO_local" so the definition file
# can refer to it by that name.
builder.add_mapper('HPO_local', hpo_mapper)

# Register the Test mapper so manually created features can be passed through.
tm = sb.TestMapper({})  # empty configuration for test mapper
builder.add_mapper('Test', tm)

# Use the builder to parse the definition file and do the mapping.
definition = builder.build('input_files/hpo_demo_definition.csv')

# ---------------------------------------------------------------------------
# 2. Load the patient data the definition will be applied to
# ---------------------------------------------------------------------------
with open('example_data/risk_nlp_data_hpo.txt') as f:
    pt = json.load(f)

# ---------------------------------------------------------------------------
# 3. Make a dataframe of patients x risk factors
# ---------------------------------------------------------------------------
# print(...) with a single argument prints the same text under Python 2 and
# also parses under Python 3.
print("generate risk factor dataframe")
df_data = []
for p in pt:
    # NLP features pre-filtered to those relevant to the score and above
    # any required confidence threshold. Copy the dict so the derived columns
    # added below do not modify the loaded JSON records in `pt`.
    pt_data = dict(p['features'])

    # Identifiers.
    pt_data['patient_id'] = p['identifiers']['patient_id']

    # Create any components of the score we've defined manually rather
    # than from NLP.
    pt_data['Female'] = pt_data['_male'] == 0
    pt_age = pt_data['_age']
    pt_data['Age_gte75'] = pt_age >= 75
    pt_data['Age_65-74'] = 65 <= pt_age < 75
    pt_data['Age_gt65'] = pt_age > 65

    df_data.append(pt_data)

# One row per patient, one column per feature key.
df = pd.DataFrame(df_data)

# ---------------------------------------------------------------------------
# 4. Calculate the risk score
# ---------------------------------------------------------------------------
demo = scorer.Scorer(definition)
# NOTE(review): the second argument is presumably the list of identifier
# columns to carry through rather than score -- confirm against Scorer.score.
demo_result = demo.score(df, ['patient_id'])
demo_scores = demo_result['scores']
print(demo_scores.head())