-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_custom.py
80 lines (68 loc) · 2.89 KB
/
preprocess_custom.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from neo4j import GraphDatabase
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
def fetch_data(session, query, output_path='output.jsonl'):
    """Run a query and return its rows as JSON text and as a DataFrame.

    Every record yielded by ``session.run(query)`` is indexed with the keys
    ``subject_id``, ``disease``, ``phenotypes`` and ``proteins``.

    Args:
        session: Object whose ``run(query)`` returns an iterable of records
            supporting ``record[key]`` access.
        query: Query passed unchanged to ``session.run``.
        output_path: File the JSON text is written to. Defaults to the
            previously hard-coded ``output.jsonl``.

    Returns:
        A ``(json_output, data)`` tuple. ``data`` is a DataFrame with the
        columns ``subject_id``, ``pheno_type``, ``protien`` and ``disease``.
        ``json_output`` is that table serialised with ``orient='records'``
        using the keys ``id``, ``positive_phenotypes``, ``protien`` and
        ``true_diseases``.
    """
    # The 'protien' spelling is kept on purpose: it is a column name that
    # consumers of the returned DataFrame look up.
    rows = [
        {
            "subject_id": record['subject_id'],
            "pheno_type": record['phenotypes'],
            "protien": record['proteins'],
            "disease": record['disease'],
        }
        for record in session.run(query)
    ]
    data = pd.DataFrame(rows)

    # The rename must use the DataFrame's own column name ('pheno_type');
    # the query field name 'phenotypes' is not a column here, so using it
    # silently left the phenotype key un-renamed in the JSON.
    json_columns = {
        'subject_id': 'id',
        'pheno_type': 'positive_phenotypes',
        'disease': 'true_diseases',
    }
    json_output = data.rename(columns=json_columns).to_json(
        orient='records', indent=4
    )

    # NOTE(review): despite the ".jsonl" default name, the text written is a
    # single indented JSON array, not one JSON object per line.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(json_output)
    return json_output, data
# type=0: one row per Biological_sample with its distinct protein ids,
# distinct phenotype ids, and a disease flag that is 0 when the linked
# disease is named 'control' and 1 otherwise.
_BINARY_DISEASE_QUERY = """
MATCH (b:Biological_sample)
OPTIONAL MATCH (b)-[:HAS_PROTEIN]->(p:Protein)
OPTIONAL MATCH (b)-[:HAS_PHENOTYPE]->(ph:Phenotype)
OPTIONAL MATCH (b)-[:HAS_DISEASE]->(d:Disease)
RETURN b.subjectid AS subject_id,
collect(DISTINCT p.id) AS proteins,
collect(DISTINCT ph.id) AS phenotypes,
CASE WHEN d.name = 'control' THEN 0 ELSE 1 END AS disease
"""

# type=1: only samples whose disease is not 'control'. Each disease synonym
# containing 'ICD10CM:' yields a row whose disease value is the single
# character at offset size('ICD10CM:') of that synonym
# (presumably the leading letter of the ICD-10 code -- confirm).
_ICD10_DISEASE_QUERY = """
MATCH (b:Biological_sample)-[:HAS_DISEASE]->(d:Disease)
WHERE NOT d.name = 'control'
OPTIONAL MATCH (b)-[:HAS_PROTEIN]->(p:Protein)
OPTIONAL MATCH (b)-[:HAS_PHENOTYPE]->(ph:Phenotype)
WITH b,
collect(DISTINCT p.id) AS proteins,
collect(DISTINCT ph.id) AS phenotypes,
d.synonyms AS synonyms
UNWIND synonyms AS synonym
WITH b, proteins, phenotypes, synonym
WHERE synonym CONTAINS 'ICD10CM:'
RETURN b.subjectid AS subject_id,
proteins,
phenotypes,
substring(synonym, size('ICD10CM:'), 1) AS disease
"""

_QUERIES_BY_TYPE = {0: _BINARY_DISEASE_QUERY, 1: _ICD10_DISEASE_QUERY}


def main(type=1):
    """Query the graph database and return the result of ``fetch_data``.

    Args:
        type: Selects the query to run. ``0`` labels every sample with a
            0/1 disease flag; ``1`` (default) returns only non-control
            samples labelled from their ``ICD10CM:`` synonym. The name
            shadows the builtin but is kept because callers pass it by
            keyword.

    Returns:
        The ``(json_text, dataframe)`` tuple produced by ``fetch_data``.

    Raises:
        ValueError: If ``type`` is neither 0 nor 1. This is checked before
            any connection is opened; previously the function connected
            first and then failed on an unbound ``query`` variable.
    """
    try:
        query = _QUERIES_BY_TYPE[type]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unsupported type {type!r}; expected 0 or 1"
        ) from None

    # NOTE(review): the endpoint and credentials are hard-coded in source;
    # consider loading them from configuration or the environment.
    uri = "bolt://83.229.84.12:7687"
    username = "tumaiReadonly"
    password = "MAKEATHON2024"
    database = "graph2.db"
    AUTH = (username, password)

    with GraphDatabase.driver(uri, auth=AUTH) as driver:
        # Fail fast if the server is unreachable or the login is rejected.
        driver.verify_connectivity()
        with driver.session(database=database) as session:
            jsondata, data = fetch_data(session, query)
            return jsondata, data
def _binarize_column(data, column):
    """Return a 0/1 indicator DataFrame for a column holding label lists."""
    binarizer = MultiLabelBinarizer()
    encoded = binarizer.fit_transform(data[column])
    # Reuse the caller's index so a later column-wise concat lines rows up
    # even when ``data`` was filtered or re-indexed beforehand.
    return pd.DataFrame(encoded, columns=binarizer.classes_, index=data.index)


def process_data(data, include_proteins=False):
    """Build a feature table from the DataFrame returned by ``fetch_data``.

    Args:
        data: DataFrame with the columns ``subject_id``, ``pheno_type`` and
            ``disease``; ``protien`` is also required when
            ``include_proteins`` is true. ``pheno_type`` / ``protien`` hold
            one collection of labels per row.
        include_proteins: When true, protein indicator columns are added
            after the phenotype columns. Defaults to ``False``, which
            matches the previous output (phenotype features only).

    Returns:
        DataFrame made of ``subject_id``, one indicator column per distinct
        phenotype label, optionally one per distinct protein label, and
        finally ``disease``.
    """
    frames = [data[['subject_id']], _binarize_column(data, 'pheno_type')]
    if include_proteins:
        # Proteins are only encoded on request; fitting them unconditionally
        # was wasted work because the result was never used.
        frames.append(_binarize_column(data, 'protien'))
    frames.append(data['disease'])
    return pd.concat(frames, axis=1)
# Module-level entry point: calls main with type=1 every time this file is
# executed or imported.
# NOTE(review): if this module is ever imported by other code, consider
# wrapping this call in an `if __name__ == "__main__":` guard.
main(type=1)