forked from arinbjornk/BiobankRead
-
Notifications
You must be signed in to change notification settings - Fork 1
/
test_HF.py
64 lines (46 loc) · 2.21 KB
/
test_HF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
### Configure and initialise BiobankRead.
### NOTE: every <...> token below is a template placeholder and is not valid
### Python as written; replace each one with a real value (quoted strings for
### the file locations) before running this script.
### `__builtin__` is the Python 2 name of the builtins module, so this script
### targets Python 2.
import __builtin__
## The three settings are attached to `__builtin__` before the package is
## imported -- presumably BiobankRead2 reads them from there; keep this order.
__builtin__.namehtml=<my_html_file_location>
__builtin__.namecsv=<my_csv_file_location>
__builtin__.n =<n_of_subjects>
### Import the BiobankRead package
import BiobankRead2.BiobankRead2 as UKBr
## From here on `UKBr` names the BiobankRead instance, not the module
## (the module alias is deliberately rebound).
UKBr = UKBr.BiobankRead()
### optionally import additional packages
import pandas as pd
import numpy as np
### Catalogue of all variables available in the loaded files
All_vars = UKBr.Vars

### Body mass index, extracted with baseline_only=True
df_BMI = UKBr.extract_variable('Body mass index(BMI)', baseline_only=True)

### Systolic blood pressure (SBP): look up the matching variables in the
### catalogue, then extract all of them into one data frame
SBP = UKBr.search_in_list(ls=All_vars, key='Systolic')
Df_Systolic = UKBr.extract_many_vars(SBP)

### Average the two automated SBP readings row by row and store the result
### as a new column of Df_Systolic
Fields = [
    'Systolic blood pressure, automated reading-0.0',
    'Systolic blood pressure, automated reading-0.1',
]
automated_sbp_mean = Df_Systolic[Fields].mean(axis=1)
Df_Systolic['Systolic blood pressure, automated reading'] = automated_sbp_mean
#### Moving on to Heart failure variable
#### specify how we define heart failure (HF) under ICD10
HFs = ['I110','I132','I500','I501','I509']
### Open Hes records
HES_records=UKBr.HES_tsv_read(filename=<location of HES tsv file>)
### Find HFs in HES record file - ICD10 diagnosis column
HES_HFs = UKBr.HES_code_match(df=HES_records, icds=HFs, which='diagnosis')
## to select from other diagnosis columns:
## ICD9: which=''
## OPCS: which='opcs'
### Only keep first instance of HF for each subjects
date='epistart'
## date can otherwise be specified to: 'admidate'
First_times_HF = UKBr.HES_first_time(HES_HFs,date)
## Create binary variable for HF
First_dates_HF['HF_icd10'] = 1
############### If pandas has been loaded previously #####################
### Merge all variables together
DF = pd.merge(df_BMI,SBP,on='eid',how='inner') ## 'inner' keeps the intersection of the samples in df_BMI and SBP
DF = pd.merge(DF,First_dates_HF[['eid','HF_icd10']],on='eid',how='outer') ## 'outer' keeps the union of the samples in DF and First_dates_HF
## Most subjects won't have a record of HF, and thus won't have any entry in 'First_dates_HF'; Mark then with '0' in the 'HF_icd10' variable in 'DF'
DF['HF_icd10'].fillna(0,inplace=True)
### Save the final result
DF.to_csv(<location\file_name.csv>, sep=',',index=None)