forked from loyalzc/tencent_ad
-
Notifications
You must be signed in to change notification settings - Fork 0
/
big_old_baseline.py
107 lines (97 loc) · 4.35 KB
/
big_old_baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# coding=utf-8
# @author:bryan
# blog: https://blog.csdn.net/bryan__
# github: https://github.com/YouChouNoBB/2018-tencent-ad-competition-baseline
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse
import os
# Ad attributes, joined onto the samples by `aid` further down.
ad_feature = pd.read_csv('../data/adFeature.csv')

# userFeature.data stores one record per line as '|'-separated fields, each
# field being "<name> <value> <value> ...".  Convert it to CSV once; later
# runs reuse the cached file.
if not os.path.exists('../data/userFeature.csv'):
    userFeature_data = []
    with open('../data/userFeature.data', 'r') as f:
        for i, line in enumerate(f):
            userFeature_dict = {}
            for each in line.strip().split('|'):
                # The first token is the field name; everything after the
                # first space is kept as one space-joined string.
                name, _, values = each.partition(' ')
                userFeature_dict[name] = values
            userFeature_data.append(userFeature_dict)
            if i % 100000 == 0:
                print(i)  # coarse progress indicator
    pd.DataFrame(userFeature_data).to_csv('../data/userFeature.csv', index=False)
    # Drop the parsed records before the CSV is loaded to keep peak memory down.
    del userFeature_data

# Always load from the CSV cache.  A frame built directly from the parsed
# records holds every column (including `uid`) as text, whereas read_csv
# infers numeric dtypes; going through the cache on both paths makes the
# first run and later runs produce the same frame for the merge on `uid`.
user_feature = pd.read_csv('../data/userFeature.csv')
# Labelled rows and the rows that still have to be scored.
train = pd.read_csv('../data/train.csv')
predict = pd.read_csv('../data/test1.csv')

# Recode label -1 as 0 in the training rows; -1 is then reused below as the
# marker for rows that have no label yet.
negative_rows = train['label'] == -1
train.loc[negative_rows, 'label'] = 0
predict['label'] = -1

# Stack both sets, attach ad and user attributes by key, and replace every
# missing value with the string '-1'.
data = (
    pd.concat([train, predict])
    .merge(ad_feature, on='aid', how='left')
    .merge(user_feature, on='uid', how='left')
    .fillna('-1')
)
# Columns that are label-encoded and then one-hot encoded further down.
one_hot_feature = [
    'LBS',
    'age',
    'carrier',
    'consumptionAbility',
    'education',
    'gender',
    'house',
    'os',
    'ct',
    'marriageStatus',
    'advertiserId',
    'campaignId',
    'creativeId',
    'adCategoryId',
    'productId',
    'productType',
]

# Columns that are turned into token-count features with CountVectorizer.
vector_feature = [
    'appIdAction',
    'appIdInstall',
    'interest1',
    'interest2',
    'interest3',
    'interest4',
    'interest5',
    'kw1',
    'kw2',
    'kw3',
    'topic1',
    'topic2',
    'topic3',
]
# Replace each categorical column with dense integer codes from LabelEncoder.
for feature in one_hot_feature:
    try:
        # Prefer integer values so numerically equal entries share one code.
        encoder_input = data[feature].apply(int)
    except (ValueError, TypeError):
        # Some value is not a plain integer (e.g. a space-joined string).
        # Encode the column as text instead; casting everything to str keeps
        # the input uniformly typed for LabelEncoder.
        encoder_input = data[feature].astype(str)
    data[feature] = LabelEncoder().fit_transform(encoder_input)
# Rows with a real label form the training set; `label` is split off as the
# target vector.
train = data[data.label != -1]
train_y = train.pop('label')
# To evaluate locally with LGB_test, hold out part of the training rows, e.g.:
# train, test, train_y, test_y = train_test_split(train,train_y,test_size=0.2, random_state=2018)

# Rows still tagged -1 are the ones to be scored.
test = data[data.label == -1]
# Explicit copy: LGB_predict assigns a 'score' column into `res`, and doing
# that on a column slice of `test` triggers pandas' chained-assignment warning.
res = test[['aid', 'uid']].copy()
test = test.drop('label', axis=1)
def _as_column(frame, name):
    """Return column `name` of `frame` as an (n_rows, 1) array."""
    return frame[name].values.reshape(-1, 1)


enc = OneHotEncoder()

# `creativeSize` is carried over unchanged as the first column of both
# feature matrices; every encoded block is appended to its right.
train_x, test_x = train[['creativeSize']], test[['creativeSize']]

for feature in one_hot_feature:
    # Fit on train and test rows together, then encode each part separately.
    enc.fit(_as_column(data, feature))
    train_x = sparse.hstack((train_x, enc.transform(_as_column(train, feature))))
    test_x = sparse.hstack((test_x, enc.transform(_as_column(test, feature))))
print('one-hot prepared !')
# token_pattern=r'\S+' keeps every whitespace-separated token.  The default
# pattern only matches tokens of two or more word characters, so it would
# silently drop any single-character token (e.g. a one-digit id) as well as
# the '-1' marker used for missing values.
cv = CountVectorizer(token_pattern=r'\S+')
for feature in vector_feature:
    # Build the vocabulary from train and test rows together, then append the
    # token-count block for this column to each feature matrix.
    cv.fit(data[feature])
    train_x = sparse.hstack((train_x, cv.transform(train[feature])))
    test_x = sparse.hstack((test_x, cv.transform(test[feature])))
print('cv prepared !')
def LGB_test(train_x, train_y, test_x, test_y):
    """Fit a LightGBM binary classifier and report its AUC on a hold-out set.

    Args:
        train_x: Feature matrix used for fitting.
        train_y: Labels for `train_x`.
        test_x: Hold-out feature matrix, passed as the second eval set.
        test_y: Labels for `test_x`.

    Returns:
        A `(clf, auc)` tuple: the fitted classifier and the value stored in
        `clf.best_score_['valid_1']['auc']`.
    """
    print("LGB test")
    params = dict(
        boosting_type='gbdt',
        num_leaves=31,
        reg_alpha=0.0,
        reg_lambda=1,
        max_depth=-1,
        n_estimators=1000,
        objective='binary',
        subsample=0.7,
        colsample_bytree=0.7,
        subsample_freq=1,
        learning_rate=0.05,
        min_child_weight=50,
        random_state=2018,
        n_jobs=-1,
    )
    clf = lgb.LGBMClassifier(**params)

    # AUC is tracked on both the training data and the hold-out data.
    # NOTE(review): newer LightGBM releases configure early stopping through
    # callbacks rather than this fit() keyword - confirm against the
    # installed version.
    eval_sets = [(train_x, train_y), (test_x, test_y)]
    clf.fit(
        train_x,
        train_y,
        eval_set=eval_sets,
        eval_metric='auc',
        early_stopping_rounds=100,
    )
    # print(clf.feature_importances_)

    # 'valid_1' presumably names the second eval_set entry, i.e. the hold-out
    # data - verify against the LightGBM naming rules.
    holdout_auc = clf.best_score_['valid_1']['auc']
    return clf, holdout_auc
def LGB_predict(train_x, train_y, test_x, res):
    """Fit on the full training data, score `test_x`, and write the submission.

    Args:
        train_x: Feature matrix used for fitting.
        train_y: Labels for `train_x`.
        test_x: Feature matrix of the rows to score.
        res: DataFrame that receives the predictions; a 'score' column is
            added to it in place. Presumably row-aligned with `test_x` -
            verify against the caller.

    Returns:
        The fitted classifier.

    Side effects:
        Writes '../data/submission.csv' and packs it into
        '../data/baseline.zip'.
    """
    # Local import: only this function needs it for packaging the result.
    import zipfile

    print("LGB predict")
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=1500, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50, random_state=2018, n_jobs=-1
    )
    # The only eval set here is the training data itself.
    # NOTE(review): newer LightGBM releases configure early stopping through
    # callbacks rather than this fit() keyword - confirm against the
    # installed version.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y)], eval_metric='auc', early_stopping_rounds=100)

    # Column 1 of predict_proba is used as the score, rounded to 6 decimals.
    res['score'] = clf.predict_proba(test_x)[:, 1]
    res['score'] = res['score'].apply(lambda x: float('%.6f' % x))
    res.to_csv('../data/submission.csv', index=False)

    # Pack the submission with the standard library instead of shelling out
    # to an external `zip` binary, whose absence or failure went unnoticed.
    with zipfile.ZipFile('../data/baseline.zip', 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.write('../data/submission.csv')
    return clf
# Script entry point: train on every labelled row and write the submission.
model = LGB_predict(
    train_x=train_x,
    train_y=train_y,
    test_x=test_x,
    res=res,
)