-
Notifications
You must be signed in to change notification settings - Fork 1
/
SortedBandGapPredictor.py
142 lines (112 loc) · 3.48 KB
/
SortedBandGapPredictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from pymatgen import *
from numpy import zeros, mean
from sklearn import *
import matplotlib.pyplot as plt
trainFile = open("bandgapDFT.csv", "r").readlines()
def naiveVectorize(composition):
vector = zeros((MAX_Z))
for element in composition:
fraction = composition.get_atomic_fraction(element)
vector[element.Z - 1] = fraction
return(vector)
materials = []
bandgaps = []
naiveFeatures = []
MAX_Z = 100
for line in trainFile:
split = str.split(line, ',')
if(float(split[1]) == 0):
x = 1
material = Composition(split[0])
materials.append(material)
naiveFeatures.append(naiveVectorize(material))
bandgaps.append(float(split[1]))
baselineError = mean(abs(mean(bandgaps) - bandgaps))
print("The MAE of always guessing the average band gap is: " +
str(round(baselineError, 3)) + " eV")
linear = linear_model.Ridge(alpha=0.5)
cv = cross_validation.ShuffleSplit(len(bandgaps),
n_iter=10, test_size=0.1, random_state=0)
scores = cross_validation.cross_val_score(
linear,
naiveFeatures,
bandgaps,
cv=cv,
scoring='mean_absolute_error')
print("The MAE of the linear ridge using the naive features: " +
str(round(abs(mean(scores)), 3)) + " eV")
physicalFeatures = []
atmno = []
plotter = {}
plotter2 = {}
plotter3 = {}
it = 0
for material in materials:
theseFeatures = []
fraction = []
atomicNo = []
eneg = []
group = []
for element in material:
fraction.append(material.get_atomic_fraction(element))
atomicNo.append(float(element.Z))
eneg.append(element.X)
group.append(float(element.group))
mustReverse = False
if fraction[1] > fraction[0]:
mustReverse = True
for features in [fraction, atomicNo, eneg, group]:
if mustReverse:
features.reverse()
theseFeatures.append(fraction[0] / fraction[1])
theseFeatures.append(eneg[0] - eneg[1])
theseFeatures.append(group[0])
theseFeatures.append(group[1])
theseFeatures.append(atomicNo[0] + atomicNo[1])
physicalFeatures.append(theseFeatures)
ZZ = 0
for z in atomicNo:
ZZ += z
atmno.append(ZZ)
plotter[bandgaps[it]] = ZZ
plotter2[bandgaps[it]] = eneg[0] - eneg[1]
plotter3[bandgaps[it]] = fraction[0] / fraction[1]
it += 1
linear = linear_model.Ridge(alpha=0.5)
plt.plot(plotter3.values(), plotter3.keys(), 'b*')
plt.xlabel('Atomic Fraction')
plt.ylabel('Band Gap')
plt.show()
plt.plot(plotter2.values(), plotter2.keys(), 'bo')
plt.xlabel('Electro negativity difference')
plt.ylabel('Band Gap')
plt.show()
b = sorted(plotter.iteritems(), key=lambda (x, y): float(x))
key = []
val = []
for i in b:
key.append(i[0])
val.append(i[1])
plt.xlabel('Molecular weight')
plt.ylabel('Band Gap')
plt.plot(val, key, 'ro')
plt.show()
cv = cross_validation.ShuffleSplit(len(bandgaps),
n_iter=10, test_size=0.1, random_state=0)
scores = cross_validation.cross_val_score(
linear,
physicalFeatures,
bandgaps,
cv=cv,
scoring='mean_absolute_error')
print("The MAE of the linear ridge using the physicalFeatures: " +
str(round(abs(mean(scores)), 3)) + " eV")
rfr = ensemble.RandomForestRegressor(n_estimators=10)
scores = cross_validation.cross_val_score(
rfr,
physicalFeatures,
bandgaps,
cv=cv,
scoring='mean_absolute_error')
print("The MAE of random forrest using physicalFeatures feature set is: " +
str(round(abs(mean(scores)), 3)) + " eV")