-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchauld_rmodel.R
249 lines (195 loc) · 11.4 KB
/
chauld_rmodel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#TODO
# 1. FirstSeenDate for various OS Versions etc
# 2. 'Age' derived from FirstSeenDate
# 3. Variance in RAM/Storage by OEMModelIdentifier
# 4. Where locale does not match country
# 5. Encode CensusOSVersion and AVSigVersion to appropriately deal with likely unseen data in test; Lots of feature ideas here https://www.kaggle.com/tunguz/ms-malware-adversarial-validation
# Parallel backend: registerDoMC() and %dopar% are used further down to encode
# the version columns with one worker per column.
library(doParallel)
library(doMC)
library(foreach)
# NOTE(review): clusterCores is not referenced again in the visible script
# (the worker count below is length(versionVars) and the LightGBM thread count
# is hard-coded) - confirm whether it is still needed.
clusterCores <- 20
# NOTE(review): devtools is not called anywhere in the visible script.
library(devtools)
# dplyr supplies n_distinct() and the group_by/summarise pipeline used below.
library(dplyr)
library(lightgbm)
library(data.table)
# featuretoolsR is only referenced from the commented-out Deep Feature
# Synthesis experiment further down.
library(featuretoolsR)
# DescTools supplies CoefVar(), Skew(), Kurt() and the trimmed Mean().
library(DescTools)
# fst is only mentioned in a TODO about faster loading; no fst call is visible.
library(fst)
# Seed R's RNG so that R-side random draws (the rnorm() noise column created
# while loading) are repeatable between runs.
set.seed(2001)
#---------------------------
# TODO: Move loading across to the fst package per
# https://blog.revolutionanalytics.com/2017/02/fst-fast-serialization-of-r-data-frames.html
#
# Read train and test, split the label off the training part and stack both
# parts into a single table `dt` so that every feature below is computed over
# train + test together.
#   N - number of training rows (they are the first N rows of `dt`)
#   y - training labels (HasDetections), in file order
cat("Loading data...\n")
train_part <- fread("../input/train.csv", drop = "MachineIdentifier")
N <- nrow(train_part)
y <- train_part[["HasDetections"]]
set(train_part, j = "HasDetections", value = NULL)
test_part <- fread("../input/test.csv", drop = "MachineIdentifier")
# idcol = TRUE adds a `.id` column holding the position of the source table in
# the list (1 for the train part, 2 for the test part).
dt <- rbindlist(
  list(train_part, test_part),
  use.names = TRUE,
  fill = TRUE,
  idcol = TRUE
)
rm(train_part, test_part)
# `.rowId` remembers the original row order (later sections re-key the table);
# `.noise` is a standard-normal random column.
dt[, c(".rowId", ".noise") := list(.I, rnorm(.N))]
setkeyv(dt, ".rowId")
#---------------------------
# Status output for the feature section. The second message has no trailing
# newline, exactly as before.
# Version ordering approach per
# https://www.kaggle.com/sionek/ordering-version-variables
cat("Adding features...\n", "Ordering version columns", sep = "")

# Dotted four-part version columns that get re-encoded as ordered integers.
versionVars <- c(
  "AvSigVersion",
  "EngineVersion",
  "AppVersion",
  "Census_OSVersion"
)

# Two-step regex rewrite that zero-pads every version component to a fixed
# width, so that plain string sorting equals numeric version ordering.
# Step 1 (pattern1 -> pattern2): capture the four numeric parts and left-pad
#   them generously ("1.273.1735.0" -> "01.0000273.00001735.00000").
pattern1 <- "^([0-9]+)\\D([0-9]+)\\D([0-9]+)\\D([0-9]+)$"
pattern2 <- "0\\1.0000\\2.0000\\3.0000\\4"
# Step 2 (pattern3 -> pattern4): keep only the last 2 digits of the first part
#   and the last 5 digits of the other parts ("01.00273.01735.00000").
pattern3 <- "[0-9]*([0-9]{2})\\D[0-9]*([0-9]{5})\\D[0-9]*([0-9]{5})\\D[0-9]*([0-9]{5})"
pattern4 <- "\\1.\\2.\\3.\\4"
# Replace each version column by the integer rank of its zero-padded string
# form. Ranks are computed over the whole table (train + test rows together),
# one forked doMC worker per version column.
cat("Transforming version variables into ordered integers...\n")
registerDoMC(length(versionVars))
cat(paste('Running in parallel with ', getDoParWorkers(), ' workers'))
combinedDT <- foreach(i = seq_along(versionVars), .combine = cbind) %dopar% {
  f <- versionVars[i]
  # Read the column directly; there is no need to route the lookup through
  # dt[...] and .SD just to fetch one vector.
  padded <- gsub(pattern3, pattern4, gsub(pattern1, pattern2, dt[[f]]))
  # Factor levels are sorted, so the integer codes follow version order.
  as.integer(as.factor(padded))
}
# cbind yields an integer matrix with one column per version variable;
# give the columns their original names back.
combinedDT <- setnames(as.data.table(combinedDT), versionVars)
# Swap the character version columns for the encoded ones (they end up as the
# last columns of `dt`).
dt <- cbind(dt[, -(versionVars), with = FALSE], combinedDT)
combinedDT <- NULL
# Back to sequential execution for the rest of the script.
registerDoSEQ()
# Relative frequency of each key combination: group size divided by the total
# number of rows in `dt`. Despite the "count." prefix the stored values are
# fractions. Column names are "count." followed by the key names joined
# with ".".
count_key_sets <- list(
  c("AvSigVersion", "Wdft_IsGamer"),
  c("Census_ProcessorCoreCount", "Wdft_RegionIdentifier"),
  c("Census_ProcessorCoreCount", "Census_OEMNameIdentifier"),
  c("GeoNameIdentifier", "Census_OEMNameIdentifier", "Census_OSBuildRevision"),
  "OsBuildLab"
)
for (count_keys in count_key_sets) {
  count_col <- paste0("count.", paste(count_keys, collapse = "."))
  dt[, (count_col) := .N / nrow(dt), by = count_keys]
}
# Flag whether the raw SmartScreen string is one of the listed spellings
# (including the empty string). This must run before the character columns
# are integer-encoded.
cat("Fixing SmartScreen Flags")
smartscreen_known <- c(
  '', 'RequireAdmin', 'ExistsNotSet', 'Off', 'Warn', 'Prompt', 'Block'
)
dt[, SmartScreenIsValid := SmartScreen %in% smartscreen_known]
#---------------------------
# Integer-encode every remaining character column: each distinct string is
# replaced by the index of its (sorted) factor level; NA stays NA.
cat("Converting character columns...\n")
# TODO: Correct handling of categoricals
# vapply() always returns a logical vector, unlike sapply(), whose result type
# depends on the input.
cats <- names(dt)[vapply(dt, is.character, logical(1L))]
if (length(cats) > 0L) {
  dt[, (cats) := lapply(.SD, function(x) as.integer(as.factor(x))), .SDcols = cats]
}
rm(cats); invisible(gc())
#-----------------------------------
#cat('Deep Feature Synthesis')
# Create entityset
#es <- as_entityset(dt, index = "key", entity_id = "dt", id = "entities")
#es$normalize_entity(base_entity_id = "dt",new_entity_id = "CountryIdentifier",index="CountryIdentifier",make_time_index=F)
#ft_matrix <-
# dfs(es,
# target_entity = "dt",
# agg_primitives = as.list("mean")
# )
#-----------------------------
cat("Hand rolled features")
# Per-Census_OEMModelIdentifier distribution statistics, written back onto
# every row of the group. New columns are named "<Stat>.<source column>".
# TODO: Check on NAs
setkeyv(dt, "Census_OEMModelIdentifier")

# DescTools statistics with the argument sets used throughout this section.
oem_stats <- list(
  CoV = function(v) CoefVar(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE),
  Skew = function(v) Skew(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE),
  Kurt = function(v) Kurt(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE)
)

# Spread of RAM, system volume size and core count within a model; intended
# as a proxy for off-the-shelf versus home-built machines.
oem_hw_cols <- c(
  "Census_TotalPhysicalRAM",
  "Census_SystemVolumeTotalCapacity",
  "Census_ProcessorCoreCount"
)
for (oem_src in oem_hw_cols) {
  for (oem_stat in names(oem_stats)) {
    oem_fn <- oem_stats[[oem_stat]]
    oem_new <- paste0(oem_stat, ".", oem_src)
    dt[, (oem_new) := oem_fn(.SD[[1L]]),
       by = "Census_OEMModelIdentifier", .SDcols = oem_src]
  }
}

# Battery charge counts: intended as a proxy for how long a model has been on
# the market and for the device lifecycle. Skew/Kurt are called here with
# na.rm only, exactly as before.
dt[, CoV.Census_InternalBatteryNumberOfCharges :=
     CoefVar(Census_InternalBatteryNumberOfCharges,
             unbiased = FALSE, conf.level = NA, na.rm = TRUE),
   by = "Census_OEMModelIdentifier"]
dt[, Skew.Census_InternalBatteryNumberOfCharges :=
     Skew(Census_InternalBatteryNumberOfCharges, na.rm = TRUE),
   by = "Census_OEMModelIdentifier"]
dt[, Kurt.Census_InternalBatteryNumberOfCharges :=
     Kurt(Census_InternalBatteryNumberOfCharges, na.rm = TRUE),
   by = "Census_OEMModelIdentifier"]

# Antivirus signature version within a model ("is this a wild-west machine
# type or not?"): 10%-trimmed mean first, then the three statistics above.
oem_avsig_stats <- c(
  list(Mean = function(v) Mean(v, trim = 0.1, na.rm = TRUE)),
  oem_stats
)
for (oem_stat in names(oem_avsig_stats)) {
  oem_fn <- oem_avsig_stats[[oem_stat]]
  oem_new <- paste0(oem_stat, ".AvSigVersion_by_Census_OEMModelIdentifier")
  dt[, (oem_new) := oem_fn(.SD[[1L]]),
     by = "Census_OEMModelIdentifier", .SDcols = "AvSigVersion"]
}

# Number of distinct signature versions seen for the model.
dt[, CountUnique.AvSigVersion := n_distinct(AvSigVersion, na.rm = TRUE),
   by = "Census_OEMModelIdentifier"]
# Per-CountryIdentifier statistics of three version columns: 10%-trimmed
# mean, coefficient of variation, skewness and kurtosis. The CoV columns are
# meant to capture how well a country keeps up to date.
setkeyv(dt, "CountryIdentifier")

country_stats <- list(
  Mean = function(v) Mean(v, trim = 0.1, na.rm = TRUE),
  CoV = function(v) CoefVar(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE),
  Skew = function(v) Skew(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE),
  Kurt = function(v) Kurt(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE)
)

# source column -> label used after "<Stat>." in the new column name
# (only AvSigVersion carries the "_by_CountryIdentifier" suffix).
country_labels <- c(
  AvSigVersion = "AvSigVersion_by_CountryIdentifier",
  AppVersion = "AppVersion",
  Census_OSVersion = "Census_OSVersion"
)

for (country_src in names(country_labels)) {
  for (country_stat in names(country_stats)) {
    country_fn <- country_stats[[country_stat]]
    country_new <- paste0(country_stat, ".", country_labels[[country_src]])
    dt[, (country_new) := country_fn(.SD[[1L]]),
       by = "CountryIdentifier", .SDcols = country_src]
  }
}
# Per-Wdft_RegionIdentifier statistics of AvSigVersion: 10%-trimmed mean,
# coefficient of variation (how well a region keeps up to date), skewness
# and kurtosis.
setkeyv(dt, "Wdft_RegionIdentifier")

region_stats <- list(
  Mean = function(v) Mean(v, trim = 0.1, na.rm = TRUE),
  CoV = function(v) CoefVar(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE),
  Skew = function(v) Skew(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE),
  Kurt = function(v) Kurt(v, unbiased = FALSE, conf.level = NA, na.rm = TRUE)
)

for (region_stat in names(region_stats)) {
  region_fn <- region_stats[[region_stat]]
  region_new <- paste0(region_stat, ".AvSigVersion_by_Wdft_RegionIdentifier")
  dt[, (region_new) := region_fn(.SD[[1L]]),
     by = "Wdft_RegionIdentifier", .SDcols = "AvSigVersion"]
}
# Proportion of each locale within its country.
#   count.LocaleEnglishNameIdentifier.CountryIdentifier - number of rows with
#     this (CountryIdentifier, LocaleEnglishNameIdentifier) pair
#   freq.LocaleEnglishNameIdentifier.CountryIdentifier  - that count divided
#     by the number of rows of the country
# Both columns are added by reference with grouped `:=`. This avoids building
# a separate summary table and inner-joining it back, which copied the whole
# table and would silently drop rows (and so misalign the labels) if a key
# ever failed to match.
# TODO: Build a generalized function for doing frequencies by group!
dt[, count.LocaleEnglishNameIdentifier.CountryIdentifier := .N,
   by = c("CountryIdentifier", "LocaleEnglishNameIdentifier")]
# The pair counts of one country sum to the country's row count, so dividing
# by .N per country equals count / sum(count over the country's locales).
dt[, freq.LocaleEnglishNameIdentifier.CountryIdentifier :=
     count.LocaleEnglishNameIdentifier.CountryIdentifier / .N,
   by = "CountryIdentifier"]
# Keep the column layout a keyed merge on these two columns would produce
# (join columns first, new columns last), so the feature matrix column order
# is unchanged.
setcolorder(dt, c("CountryIdentifier", "LocaleEnglishNameIdentifier"))
#---------------------------
# Restore the original row order, convert to a numeric matrix and split it
# back into the training part (first N rows, paired with labels y) and the
# test part (remaining rows).
cat("Preparing data for boosting...\n")
setkey(dt, .rowId)
dt[, .rowId := NULL]
# The first N rows must be the labelled training rows for the split below.
stopifnot(length(y) == N, nrow(dt) >= N)
# NOTE(review): the helper columns `.id` (train/test marker, constant within
# the training rows) and `.noise` (random) are still part of the matrix -
# confirm that this is intended.
dm <- data.matrix(dt)
# A logical mask instead of 1:N / -(1:N): correct even for N == 0, and
# drop = FALSE keeps a matrix when only one row is selected.
is_train <- seq_len(nrow(dm)) <= N
tr <- lgb.Dataset(data = dm[is_train, , drop = FALSE], label = y)
te <- dm[!is_train, , drop = FALSE]
#---------------------------
cat("Training model and predicting...\n")
# Submission template; its HasDetections column is overwritten further down.
subm <- fread("../input/sample_submission.csv")
# TODO:
#   Treat NA correctly
#   Treat categorical correctly

# 5-fold cross-validation with early stopping, used to pick the number of
# boosting rounds for the final models.
i <- 0.7  # column and row sampling fraction for the CV run
p <- c(
  list(
    boosting_type = "gbdt",
    objective = "binary",
    metric = "auc",
    nthread = 26
  ),
  # tree shape and learning rate
  list(
    learning_rate = 0.1,
    max_depth = 6,
    num_leaves = 40
  ),
  # sampling
  list(
    feature_fraction = i,
    bagging_fraction = i,
    bagging_freq = 1
  ),
  # regularisation
  list(
    lambda_l1 = 0.1,
    lambda_l2 = 0.1
  ),
  # device settings; `metrics` is a LightGBM alias of `metric` and is kept
  # here unchanged
  list(
    device = 'gpu',
    max_bin = 63,
    metrics = 'auc',
    early_stopping_rounds = 20
  )
)
m_cv <- lgb.cv(params = p, data = tr, nrounds = 10000, nfold = 5, verbose = 1)
cv_score <- m_cv$best_score
best_iter <- m_cv$best_iter
# Only the score and the iteration count are kept; free the CV object.
rm(m_cv); invisible(gc())
# Train n_bags models that differ only in their sampling fraction and average
# their test predictions into subm$HasDetections.
# TODO: Parallelize with doAzureParallel
# TODO: Exclude .noise
n_bags <- 3
# Train about 10% longer than the CV optimum, since the full training set is
# larger than a CV fold. The number of rounds must be a whole number, so the
# product is truncated once here instead of passing a fractional value to
# lgb.train() on every iteration.
n_rounds <- max(1L, as.integer(best_iter * 1.1))
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as the rows of test.csv - confirm.
subm[, HasDetections := 0]
for (i in seq(0.5, 0.9, length.out = n_bags)) {
  # Same settings as the cross-validation run, except for the sampling
  # fraction i and no early stopping. feature_fraction / bagging_fraction are
  # the canonical names of the sub_feature / sub_row aliases; the duplicate
  # `metrics` alias of `metric` is dropped.
  p <- list(boosting_type = "gbdt",
            objective = "binary",
            metric = "auc",
            nthread = 26,
            learning_rate = 0.1,
            max_depth = 6,
            num_leaves = 40,
            feature_fraction = i,
            bagging_fraction = i,
            bagging_freq = 1,
            lambda_l1 = 0.1,
            lambda_l2 = 0.1,
            device = 'gpu',
            max_bin = 63)
  m_gbm <- lgb.train(p, tr, n_rounds, verbose = 0)
  # Running mean over the bags.
  subm[, HasDetections := HasDetections + predict(m_gbm, te) / n_bags]
  # m_gbm is deliberately not removed: the model of the last bag is used for
  # the feature importance report after the loop.
}
# Feature importance of `m_gbm`, i.e. the model left over from the last
# training iteration; plot the top 50 features.
featImp <- lgb.importance(model = m_gbm)
lgb.plot.importance(featImp, top_n = 50, left_margin = 15)

# Sanity check: eyeball the distribution of the averaged predictions.
hist(subm$HasDetections)

#---------------------------
cat("Making submission file...\n")
fwrite(x = subm, file = "ms_malware_new.csv")