-
Notifications
You must be signed in to change notification settings - Fork 0
/
03a_a3_alternative.R
197 lines (170 loc) · 8.48 KB
/
03a_a3_alternative.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# ------------------------ Learning from Data: Assignment 3 -------------------#
#
# Lukas Schmid ----
#
# -----------------------------------------------------------------------------#
# Preliminaries ----------------------------------------------------------------
## loading packages
library(dplyr)
library(tidyr)
library(purrr)
library(ggplot2)
library(cowplot) # for more tweaking with graphics (namespace will be given)
library(magrittr)
## setting the ggplot-theme
theme_set(theme_minimal())
#------------------------------------------------------------------------------#
#------------------------------------------------------------------------------#
# Task 2 -----------------------------------------------------------------------
#------------------------------------------------------------------------------#
# In this task you will explore the famous MNIST handwritten digits dataset.
# To load the dataset and display specific handwritten digits use the code
# provided under https://gist.github.com/brendano/39760 .
#------------------------------------------------------------------------------#
# load the data ----------------------------------------------------------------
#------------------------------------------------------------------------------#
#----------------------------------------------################################-
# prepare the functions that can load the data ################################-
#----------------------------------------------################################-
## what follows is copied from https://gist.github.com/brendano/39760
## Load the MNIST digit recognition dataset into R
## http://yann.lecun.com/exdb/mnist/
## assume you have all 4 files and gunzip'd them
## creates train$n, train$x, train$y and test$n, test$x, test$y
## e.g. train$x is a 60000 x 784 matrix, each row is one digit (28x28)
## call: show_digit(train$x[5,]) to see a digit.
## brendan o'connor - gist.github.com/39760 - anyall.org
load_mnist <- function() {
load_image_file <- function(filename) {
ret = list()
f = file(filename,'rb')
readBin(f,'integer',n=1,size=4,endian='big')
ret$n = readBin(f,'integer',n=1,size=4,endian='big')
nrow = readBin(f,'integer',n=1,size=4,endian='big')
ncol = readBin(f,'integer',n=1,size=4,endian='big')
x = readBin(f,'integer',n=ret$n*nrow*ncol,size=1,signed=F)
ret$x = matrix(x, ncol=nrow*ncol, byrow=T)
close(f)
ret
}
load_label_file <- function(filename) {
f = file(filename,'rb')
readBin(f,'integer',n=1,size=4,endian='big')
n = readBin(f,'integer',n=1,size=4,endian='big')
y = readBin(f,'integer',n=n,size=1,signed=F)
close(f)
y
}
train <<- load_image_file('mnist/train-images.idx3-ubyte')
test <<- load_image_file('mnist/t10k-images.idx3-ubyte')
train$y <<- load_label_file('mnist/train-labels.idx1-ubyte')
test$y <<- load_label_file('mnist/t10k-labels.idx1-ubyte')
}
show_digit <- function(arr784, col=gray(12:1/12), ...) {
image(matrix(arr784, nrow=28)[,28:1], col=col, ...)
}
# -----------------------######################################################-
# actually load the data ######################################################-
# -----------------------######################################################-
# actually loading the data - we assume that test and train never get separated,
# so they either both exist or don't; when we dont have the data as .RDS-object,
# we generate the data and save it as .RDS; if we have the data, but don't have
# the objects in our workspace, we load the obejct from the data; else, we have
# both and don't have to do anything
if(!file.exists("MINST_rds/03_test_raw.rds") |
!file.exists("MINST_rds/03_test_raw.rds")) {
load_mnist()
saveRDS(train, file = "MINST_rds/train_raw.rds")
saveRDS(test, file = "MINST_rds/test_raw.rds")
print("Extracted the data from the original files, saved it as .rds and
loaded them into the workspace")
} else if (!exists("train") | !exists("test")) {
test <- readRDS("MINST_rds/03_test_raw.rds")
train <- readRDS("MINST_rds/03_test_raw.rds")
print("Files already existed as .rds-files. Only loaded them into the
workspace.")
} else {
print("Files and objects in the workspace already existed. Did nothing.")
}
#------------------------------------------------------------------------------#
# create symmetry --------------------------------------------------------------
#------------------------------------------------------------------------------#
symmetryValues <- rowSums(abs(train$x[, 1:392] - train$x[, 784:393]))
# Symmetric values are at the opposite end of the vector in which they are
# stored. Thus, I take the second half of the vector in descending order and
# subtract it from the first half. Symmetry is the sum of the absolute distances
# of all values.
#------------------------------------------------------------------------------#
# create intensity -------------------------------------------------------------
#------------------------------------------------------------------------------#
intensityValues <- rowSums(train$x)
# Intensity is simply the sum of the intensity values in each digit.
#------------------------------------------------------------------------------#
# compare symmetry and intensity -----------------------------------------------
#------------------------------------------------------------------------------#
# uniform distribution of both variables
data <- tibble(symmetryValues, intensityValues, digit=train$y)
data %>%
pivot_longer(cols=c(symmetryValues, intensityValues)) %>%
ggplot(aes(x=value, fill=factor(digit), color=factor(digit))) +
geom_density(alpha=0.3) +
facet_wrap(facets=vars(name))
# bivariate distribution for only 1s and 5s
data %>%
filter(digit==1 | digit==5) %>%
ggplot(aes(x=symmetryValues, y=intensityValues,
fill=factor(digit), color=factor(digit))) +
geom_point(binwidth=0.1, shape=19, stroke=0, stat="bin2d") +
geom_density_2d() +
scale_fill_manual(values=c("1"="red", "5"="blue")) +
scale_colour_manual(values=c("1"="red3", "5"="blue3")) +
scale_alpha_continuous(guide=F) +
labs(x="Point Symmetry", y="Intensity", colour="Digit", fill="Digit",
title="Intensity vs. Point Symmetry for digits from the MNIST-dataset",
caption=paste0("Only 1's and 5's from the test-dataset were used. N = ",
nrow(filter(data, digit==1 | digit==5)), ". \n",
"Both variables are normalised over all digits
(sd=1, mean=1)")) +
guides(fill = guide_legend(override.aes = list(size=4)))
# classification performance of a logistic regression for the two types of
# digits 1 and 5
## first, we have to transform y so it is either 1 or 0
data <- data %>%
filter(digit==1 | digit==5) %>%
mutate(y = ifelse(digit==1, 0, 1))
model <- glm(y ~ symmetryValues+intensityValues, data,
family="binomial")
summary(model)
#------------------------------------------------------------------------------#
# create other features --------------------------------------------------------
#------------------------------------------------------------------------------#
# remember: my data works row-wise (every row in train$x is one digit), but the
# digits themselves are different: if the values are put into a 28-28-matrix,
# row-wise, then I get the correct image
# prominence of vertical lines
matrix(t(train$x), )
rowIntensity = rowSums(cols)
horizontalityValues = rowMeans(matrix(rowIntensity, nrow=10000))
tibble(horizontalityValues, digit=train$y) %>%
ggplot(aes(x=horizontalityValues, fill=factor(digit), color=factor(digit))) +
geom_density(alpha=0.3)
# prominence of vertical lines
# similarity to typical digits
## create the typical digits: average grey-scale-value for every pixel of
## a certain digit
average_digits = matrix(0, 10, 784)
# creating the empty matrix that contains
# the grey scale values for all the digits
for (digit in 1:10) {
average_digits[digit,] <- colMeans(train$x[train$y==digit, ])
}
# plotting the average digits
## we need to create a matrix that is made up of smaller matrices
image_matrix <- matrix(0, 28*5, 28*2)
# the image matrix has
image(matrix(average_digits[1,], nrow=28)[,28:1], col=gray(12:1/12))
#------------------------------------------------------------------------------#
# other visual inspections -----------------------------------------------------
#------------------------------------------------------------------------------#
# distribution of grey-scale values
plot(as.vector(train$x), type="h")