-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.R
118 lines (90 loc) · 3.85 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
library(data.table)
library(ggplot2)
library(zoo)
# Load the training and test sets ----
# import() is expected to come from Import_and_Explore.R and to return a list
# with `train` and `test` elements; both are used as data.tables below.
source('Import_and_Explore.R')
datasets <- import()
train <- datasets$train
test <- datasets$test

# Only AvgSales is removed from train. Assortment and StoreType stay, because
# the pooled models later in this script (caret, regsubsets, rfe, randomForest)
# use them as predictors. The per-store GLM loops select their columns
# explicitly, so the extra columns do not affect them.
storespecific <- 'AvgSales'
cols <- setdiff(colnames(train), storespecific)
train <- train[, ..cols]

# Remove the store-level columns and any Sales/Customers columns from test;
# the per-store models write predicted Customers and Sales back into test.
storespecific <- c('Assortment', 'StoreType', 'Sales', 'Customers', 'AvgSales')
cols <- setdiff(colnames(test), storespecific)
test <- test[, ..cols]
# Per-store customer model ----
# Fits one Gamma GLM (log link) per store and writes the predicted customer
# counts to train$CustPreds (in-sample) and test$Customers (out-of-sample).
stores <- unique(train$Store)
for (s in stores) {
  sub <- train[Store == s]
  modelcols <- c('Customers', 'DayOfWeek', 'month', 'Open', 'Promo')
  sub <- sub[, ..modelcols]
  # Keep only the columns that take more than one distinct value in this store.
  modelcols <- modelcols[vapply(sub, uniqueN, integer(1)) > 1]
  sub <- sub[, ..modelcols]
  # The model is fitted on Customers + 1 (sub is a copy, train is untouched);
  # presumably this keeps the Gamma response strictly positive on days with
  # zero customers.
  sub[, Customers := Customers + 1]
  model <- glm(Customers ~ ., data = sub, family = Gamma(link = 'log'))
  # Subtract the 1 again so that the predictions are on the original
  # Customers scale rather than the shifted one.
  train[Store == s, CustPreds := fitted(model) - 1]
  # type = 'response' applies the inverse link, replacing exp(predict(...)).
  preds <- predict(model, newdata = test[Store == s], type = 'response') - 1
  test[Store == s, Customers := preds]
}

# In-sample error of the customer model: mean absolute error and RMSE.
train[, custerr := Customers - CustPreds]
mean(abs(train$custerr))
sqrt(mean(train$custerr^2))
# Per-store sales model ----
# Fits one Gamma GLM (log link) per store with Customers as an additional
# predictor, and writes the predicted sales to train$SalesPreds (in-sample)
# and test$Sales (out-of-sample).
for (s in stores) {
  sub <- train[Store == s]
  modelcols <- c('Sales', 'Customers', 'DayOfWeek', 'month', 'Open', 'Promo')
  sub <- sub[, ..modelcols]
  # Keep only the columns that take more than one distinct value in this store.
  modelcols <- modelcols[vapply(sub, uniqueN, integer(1)) > 1]
  sub <- sub[, ..modelcols]
  # The model is fitted on Sales + 1 (sub is a copy, train is untouched);
  # presumably this keeps the Gamma response strictly positive on days with
  # zero sales.
  sub[, Sales := Sales + 1]
  model <- glm(Sales ~ ., data = sub, family = Gamma(link = 'log'))
  # Subtract the 1 again so that the predictions are on the original Sales
  # scale rather than the shifted one.
  train[Store == s, SalesPreds := fitted(model) - 1]
  # predict() needs a Customers column in test; the customer-model loop
  # earlier in this script writes it.
  # type = 'response' applies the inverse link, replacing exp(predict(...)).
  preds <- predict(model, newdata = test[Store == s], type = 'response') - 1
  test[Store == s, Sales := preds]
}

# In-sample error of the sales model: mean absolute error and RMSE.
train[, saleserr := Sales - SalesPreds]
mean(abs(train$saleserr))
sqrt(mean(train$saleserr^2))
# Exploratory look at a single store (262) ----
sub <- train[Store == 262]
# Distribution of Sales coloured by weekday, and Sales against Customers.
ggplot(sub, aes(Sales, fill = DayOfWeek)) + geom_histogram()
ggplot(sub, aes(x = Sales, y = Customers)) + geom_point()

# Make the 7th level of DayOfWeek the reference level, both in the subset and
# in train (the change to train also affects the models fitted further down).
# NOTE(review): relevel() requires DayOfWeek to be an unordered factor - confirm.
sub <- within(sub, DayOfWeek <- relevel(DayOfWeek, ref = 7))
train <- within(train, DayOfWeek <- relevel(DayOfWeek, ref = 7))

# Assortment and StoreType are not part of this formula: a single-store
# subset cannot estimate store-level effects (presumably they are constant
# within a store - confirm).
model <- lm(Sales ~ DayOfWeek:month + Open + Promo, sub)
sumry <- summary(model)
sumry
# Pooled models over all stores with caret ----
# NOTE(review): the formulas below need Assortment and StoreType columns to be
# present in train - confirm against the data-loading step.
library(caret)

# 10-fold cross-validation, shared by both caret::train() calls below. The
# object gets its own name so it does not shadow caret's trainControl().
cv_control <- trainControl(method = "cv", number = 10)

# Lasso (alpha = 1) over a grid of lambda values.
tuneGrid <- expand.grid(alpha = 1, lambda = seq(0, 10, by = 0.1))
modelFit <- train(
  Sales ~ DayOfWeek:month + Open + Promo + Assortment + StoreType,
  data = train,
  method = "glmnet",
  trControl = cv_control,
  family = "gaussian",
  tuneGrid = tuneGrid
)

# cv.glmnet() needs a numeric predictor matrix, so the predictors are expanded
# with model.matrix() (intercept column removed; glmnet adds its own).
# Building response and matrix from one model frame keeps their rows aligned
# if rows with missing values are dropped.
glmnet_formula <- Sales ~ DayOfWeek + month + Open + Promo + Assortment + StoreType
glmnet_frame <- model.frame(glmnet_formula, data = train)
resp <- model.response(glmnet_frame)
reg <- model.matrix(glmnet_formula, data = glmnet_frame)[, -1, drop = FALSE]
# glmnet is never attached in this script, so the call is namespace-qualified.
cvfit <- glmnet::cv.glmnet(reg, resp)

# Linear model under the same cross-validation, used for variable importance.
model <- train(
  Sales ~ DayOfWeek:month + Open + Promo + Assortment + StoreType,
  data = train,
  method = "lm",
  trControl = cv_control
)
importance <- varImp(model, scale = FALSE)
print(importance)
plot(importance)
# Subset selection and recursive feature elimination ----
# NOTE(review): the calls below need Assortment and StoreType columns to be
# present in train - confirm against the data-loading step.
library(leaps)

# leaps() requires both a predictor matrix and a response.
# NOTE(review): Sales is assumed to be the intended response, as in every
# other model in this script - confirm. The result is overwritten right below.
model <- leaps(x = as.matrix(train[, .(Customers)]), y = train$Sales)

# Exhaustive best-subset search; report the adjusted R^2 per model size and
# the size that maximises it.
model <- regsubsets(
  Sales ~ DayOfWeek + month + Open + Promo + Assortment + StoreType,
  data = train,
  method = 'exhaustive',
  nvmax = 40
)
sumry <- summary(model)
sumry$adjr2
which.max(sumry$adjr2)

# Recursive feature elimination with random-forest helpers and 10-fold CV.
# x is passed as a plain data.frame and y as a vector: a one-column data.table
# as y has length 1 and does not match the number of rows of x.
# nvmax is a regsubsets() argument, not an rfe() one, so it is not passed here.
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
results <- rfe(
  x = as.data.frame(
    train[, .(DayOfWeek, month, Open, Promo, Assortment, StoreType)]
  ),
  y = train$Sales,
  sizes = 1:6,
  rfeControl = control
)
# Random forest on a random subsample of train ----
# NOTE(review): the formula needs Assortment and StoreType columns to be
# present in train - confirm against the data-loading step.
library(randomForest)

# Fit on at most 100000 randomly chosen rows; the cap keeps the sampling valid
# when train has fewer rows than that.
modelrf <- randomForest(
  Sales ~ DayOfWeek + month + Open + Promo + Assortment + StoreType,
  data = train,
  subset = sample.int(nrow(train), min(100000, nrow(train)))
)

# Predict on all of train and collect actual values, predictions and errors.
preds <- predict(modelrf, train)
err <- data.table(
  sales = train$Sales,
  preds = preds,
  err = train$Sales - preds
)

# Mean absolute error, computed on the error column only (taking the mean of
# the whole table does not give a number).
mae <- mean(abs(err$err))

# Long format (variable / value) for plotting.
merr <- melt(err, measure.vars = c('sales', 'preds', 'err'))

# Compare the distributions of actual and predicted sales.
# NOTE(review): the original plot had no geom layer and drew nothing; a
# boxplot is assumed to be the intended comparison - confirm.
ggplot(merr[variable %in% c('sales', 'preds')], aes(variable, value)) +
  geom_boxplot()

# Actual vs. predicted sales on a random sample of at most 10000 rows.
ggplot(err[sample.int(nrow(err), min(10000, nrow(err)))], aes(sales, preds)) +
  geom_point()

# Sparse design matrix of all remaining train columns, without an intercept.
# Matrix is never attached in this script, so the call is namespace-qualified.
sparse_matrix <- Matrix::sparse.model.matrix(Sales ~ . - 1, data = train)