forked from IRlabDroso/Lumar_GUI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWeb_scrapping_odorants.py
executable file
·83 lines (66 loc) · 3.71 KB
/
Web_scrapping_odorants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pubchempy as pcp
import csv
import re
##### Foundind CIDs for the odours ######
with open("Odorants.csv", "r") as f:
reader = csv.reader(f, delimiter=";")
first_line = True
notfound = []
CIDs = []
for i in reader:
if re.search("Water", i[0]):
continue
elif first_line:
first_line = False
continue
else:
print(i)
CID = pcp.get_cids(str(i[0]), 'name',
'substance', list_return='flat')
if len(CID) == 0:
notfound.append(i[0])
print("notfound")
else:
print(CID[0])
CIDs.append(CID[0])
print(notfound)
stillnotfound = []
for i in notfound:
new_label = re.sub("-", " ", i)
CID = pcp.get_cids(str(new_label), 'name', 'substance', list_return='flat')
if len(CID) == 0:
stillnotfound.append(new_label)
print("notfound")
else:
CIDs.append(CID[0])
print(stillnotfound)
print(CIDs)
##### Web scrapping Part #####
chromedriver_autoinstaller.install()
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome()
driver.implicitly_wait(5)
compound_list = {}
for CID in CIDs:
url = "https://pubchem.ncbi.nlm.nih.gov/compound/" + str(CID)
driver.get(url)
name_content = driver.find_element(
By.XPATH, "//meta[@property='og:title']")
name_compound = name_content.get_attribute("content")
compound_list[CID] = name_compound
driver.quit()
print(compound_list)
##### Web scrapping Part #####
compound_list = {7439: 'Carvone', 14525: 'Fenchone', 8129: '1-Heptanol', 8103: '1-Hexanol', 8914: 'Nonan-1-ol', 18827: '1-Octen-3-OL', 6276: '1-Pentanol', 650: '2,3-Butanedione', 8461: '2,4-Dinitrotoluene', 6569: 'Methyl Ethyl Ketone', 10976: '2-Heptanol', 8051: '2-Heptanone', 18522: '2-Methyltetrahydrofuran-3-one', 7895: '2-Pentanone', 3776: 'Isopropyl Alcohol', 11509: '3-Hexanone', 246728: '3-Octanone', 9862: '6-Methyl-5-hepten-2-one', 175: 'Acetate', 179: 'Acetoin', 7410: 'Acetophenone', 6654: 'alpha-PINENE',
6713932: '(4R,5R)-5-(1,2-dihydroxyethyl)-3,4-dihydroxy-2-oxolanone', 240: 'Benzaldehyde', 11529: 'Butyl propionate', 264: 'Butyric Acid', 273: 'Cadaverine', 7794: 'Citronellal', 62378: 'Dihydrojasmone', 402: 'Hydrogen Sulfide', 22311: 'Limonene', 174: 'Ethylene Glycol', 62572: 'Ethyl 3-hydroxybutyrate', 8857: 'Ethyl Acetate', 7749: 'Ethyl propionate', 12191: '2-Butenoic acid, ethyl ester', 7344: 'Ethyl lactate', 6184: 'Hexanal', 798: 'Indole', 6590: 'Isobutyric acid', 91604: 'Lyral', 1254: 'Menthol', 7128: 'Methyl isoeugenol', 6584: 'Methyl acetate', 4133: 'Methyl Salicylate', 335: 'o-Cresol', 454: 'Octanal', 998: 'Phenylacetaldehyde', 6054: '2-Phenylethanol', 1032: 'Propionic Acid', 7997: 'Propyl acetate', 1049: 'Pyridine', 31268: 'Pyrrolidine', 19801: 'Dec-2-enal', 10460: 'Hexenal', 9251: 'Thietane', 288227: '4a,5-Dimethyl-3-(prop-1-en-2-yl)-1,2,3,4,4a,5,6,7-octahydronaphthalene', 8468: 'Vanillic Acid', 31272: 'Butyl acetate', 7165: 'Ethyl benzoate', 7342: 'Ethyl isobutyrate', 7780: '3,7-Dimethylocta-2,6-dienyl acetate', 12180: 'Methyl butyrate', 7824: 'Methyl hexanoate', 11124: 'Methyl propionate', 12348: 'Pentyl acetate'}
with open("Corrected_odors.csv", "r") as filetoread, open("Corrected_odors.csv", "w") as filetowrite:
reader = csv.reader(filetoread, delimiter=",")
writer = csv.writer(filetowrite, dialect='excel', delimiter=',')
writer.writerow(["Odor_id", "Odor_name"])
already_in = []
for i in reader:
already_in.append(i[0])
for id, name in compound_list.items(), reader:
if id not in already_in:
writer.writerow([id, name])