-
Notifications
You must be signed in to change notification settings - Fork 41
/
Copy pathmodel.py
122 lines (80 loc) · 3.37 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -- encoding:UTF-8 --
#-- Author: TNT_000 by Abner yang
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from function import *
from scipy.sparse import hstack
from matplotlib import pyplot
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import average_precision_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
#-- map eval function
# -*- encoding:utf-8 -*-
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
import time
def evalerror(predict, true):
    """Print the macro-averaged average-precision score for a prediction.

    Parameters
    ----------
    predict : array-like of prediction scores.
    true    : array-like of ground-truth binary labels.

    NOTE(review): the parameter order is (predict, true) but they are
    forwarded to average_precision_score as (y_true=true, y_score=predict);
    callers must pass predictions first.
    """
    # print() with a single argument is valid in both Python 2 and 3;
    # the original bare `print x` statement is a SyntaxError on Python 3.
    print(average_precision_score(true, predict, average='macro', sample_weight=None))
def map_eval(true, predict):
    """Compute Mean Average Precision (MAP) for binary labels.

    Items are ranked by predicted score in descending order; at every rank
    holding a positive label the running precision (positives seen / rank)
    is recorded, and the mean of those precisions is returned.

    Parameters
    ----------
    true    : array-like of 0/1 ground-truth labels.
    predict : array-like of prediction scores, same length as `true`.

    Returns
    -------
    float
        The MAP score; 0.0 when `true` contains no positive labels
        (the original np.mean([]) produced NaN plus a runtime warning).
    """
    result = pd.DataFrame({'true': true, 'predict': predict})
    # DataFrame.sort() was removed from pandas; sort_values is the
    # supported API and produces the same descending ordering.
    result = result.sort_values('predict', ascending=False)
    precisions = []
    hits = 0
    for rank, label in enumerate(result['true'].values, start=1):
        if label == 1:
            hits += 1
            precisions.append(float(hits) / rank)
    # Guard the no-positives case instead of averaging an empty list.
    map_score = float(np.mean(precisions)) if precisions else 0.0
    print(map_score)
    return map_score
#-- xgboost local train-test Model frame
def xgbLocalModel(trainFeature, testFeature, trainLabel, testLabel, params, rounds):
    """Train xgboost on a local train/test split and return test predictions.

    Parameters
    ----------
    trainFeature, testFeature : feature matrices accepted by xgb.DMatrix.
    trainLabel, testLabel     : binary label arrays (0/1).
    params                    : xgboost parameter dict; mutated in place
                                (scale_pos_weight is overwritten here).
    rounds                    : number of boosting rounds.

    Returns
    -------
    Predicted scores for `testFeature`.
    """
    # Balance classes: ratio of negative to positive training examples.
    # float(...) replaces the C-style (float)(...) cast; Py2 prints fixed.
    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])
    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)
    dtest = xgb.DMatrix(testFeature, label=testLabel)
    # Eval set first so the test metric is printed alongside train metric.
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    print('run local: ' + 'round: ' + str(rounds))
    model = xgb.train(params, dtrain, rounds, watchlist, verbose_eval=20)  # , feval=evalerror
    predict = model.predict(dtest)
    return predict
#-- xgboost cross-validation Model frame
def xgbCVModel(trainFeature, trainLabel, rounds, folds, params):
    """Run xgboost cross-validation and return the per-round metric table.

    Parameters
    ----------
    trainFeature : feature matrix accepted by xgb.DMatrix.
    trainLabel   : binary label array (0/1).
    rounds       : number of boosting rounds.
    folds        : number of CV folds.
    params       : xgboost parameter dict; mutated in place
                   (scale_pos_weight is overwritten here).

    Returns
    -------
    The evaluation history produced by xgb.cv.
    """
    # Balance classes: ratio of negative to positive training examples.
    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])
    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)
    # Py2 print statement replaced with the print() function.
    print('run cv: ' + 'round: ' + str(rounds) + ' folds: ' + str(folds))
    res = xgb.cv(params, dtrain, rounds, nfold=folds, verbose_eval=20)
    return res
#-- xgboost online predict Model frame
def xgbPredictModel(trainFeature, trainLabel, testFeature, params, rounds):
    """Train on the full training set, predict the test set, and dump
    feature importances to ../importance/im.csv.

    Parameters
    ----------
    trainFeature, testFeature : feature matrices accepted by xgb.DMatrix.
    trainLabel                : binary label array (0/1).
    params                    : xgboost parameter dict; mutated in place
                                (scale_pos_weight is overwritten here).
    rounds                    : number of boosting rounds.

    Returns
    -------
    (model, predict) : the trained booster and its scores on testFeature.
    """
    dtrain = xgb.DMatrix(trainFeature, label=trainLabel)
    # Dummy all-zero labels: true test labels are unknown at predict time.
    dtest = xgb.DMatrix(testFeature, label=np.zeros(testFeature.shape[0]))
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    # Balance classes: ratio of negative to positive training examples.
    params['scale_pos_weight'] = float(len(trainLabel[trainLabel == 0])) / len(trainLabel[trainLabel == 1])
    print(params['scale_pos_weight'])
    model = xgb.train(params, dtrain, rounds, watchlist, verbose_eval=100)
    # list() makes the Py3 dict-items view a concrete sequence for pandas;
    # DataFrame.sort() was removed from pandas — sort_values is the API.
    importance = pd.DataFrame(list(model.get_fscore().items()),
                              columns=['feature', 'importance']).sort_values('importance', ascending=False)
    predict = model.predict(dtest)
    importance.to_csv('../importance/im.csv', index=False)
    return model, predict