-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patheda.jl
400 lines (296 loc) · 14.3 KB
/
eda.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
# This is a loose port of original EDA of Netflix 2020 Dataset in R by ygterl (https://github.com/ygterl/EDA-Netflix-2020-in-R)
## load required libraries
using DataFrames, CategoricalArrays # packages for data frame manipulation
using CSV # package for CSV loading
using StatsBase # base package for statistics
using Queryverse # metapackage for DS
using Gadfly, Cairo # packages for plotting
using Colors # package for color manipulation
using Dates # package for date manipulation
include("./plot_style.jl")
## reading data file
netflix_df = CSV.read("./netflix_titles.csv", delim=',', header=1, dateformat="U d, Y") |> DataFrame
## next, it's useful to take a good look at the dataset
println("Number of obs: ", nrow(netflix_df)) # number of observations
println("Number of cols: ", ncol(netflix_df)) # number of variables (columns)
println("Cols' names: ", names(netflix_df)) # list variables
println("Cols' types: ", [eltype(col) for col = eachcol(netflix_df)]) # list variables types
first(netflix_df, 5) # listing first 5 obs
last(netflix_df, 5) # listing last 5 obs
## discard show id, it's not really relevant
netflix_df = netflix_df |> @select(-:show_id) |> DataFrame
## investigate number of missings in each column
for col in names(netflix_df)
println("number of missings in $col col: ", count(ismissing, netflix_df[:, col]))
end
## investigate number of complete cases (with no missing values in any column)
println("number of complete cases: ", nrow(dropmissing(netflix_df)))
dropmissing!(netflix_df) # retaining only complete cases
## investigate most common values (modes) in each columns
for col in names(netflix_df)
println("most common value in $col col: ", modes(netflix_df[:, col]))
end
## unfornately, some columns are comma separated ones - we'll have to split them, converting into arrays
netflix_df[!, :country_arr] = map(x -> split(x, ", "), netflix_df.country) # countries
netflix_df[!, :genre_arr] = map(x -> split(x, ", "), replace.(netflix_df.listed_in, "&" => "&")) # genres
netflix_df[!, :director_arr] = map(x -> split(x, ", "), netflix_df.director) # directors
netflix_df[!, :actor_arr] = map(x -> split(x, ", "), netflix_df.cast) # actors
## when processing, @mapmany does the magic of unnesting and processing these arrays
## 1: explore individual feature (univariate) distribution
### general plotting settings
Gadfly.push_theme(plot_style())
set_default_plot_size(21cm, 10cm)
### 1a: type
type_df = netflix_df |>
@groupby(_.type) |>
@map({type=key(_), count=length(_)}) |>
@mutate(count_str=string(_.count)) |>
@orderby_descending(_.count) |>
DataFrame
print(type_df)
plot(type_df, y=:type, x=:count, label=:count_str,
Geom.bar(orientation=:horizontal), Geom.label(position=:right),
Guide.title("Titles by types")) |> PNG("plot1a-types.png")
### 1b: country
top_countries_df = netflix_df |>
@mapmany(_.country_arr, {title=_.title, country=__}) |>
@groupby(_.country) |>
@map({country=key(_), count=length(_), count_str=string(length(_))}) |>
@orderby_descending(_.count) |>
@take(10) |>
DataFrame
print(top_countries_df)
plot(top_countries_df, y=:country, x=:count, label=:count_str,
Geom.bar(orientation=:horizontal), Geom.label(position=:right),
Guide.title("Top 10 countries")) |> PNG("plot1b-countries.png")
### 1c: release year
release_year_df = netflix_df |>
@groupby(_.release_year) |>
@map({release_year=string(key(_)), count=length(_)}) |>
@mutate(count_str=string(_.count)) |>
@orderby_descending(_.count) |>
DataFrame
top_release_year = first(release_year_df, 10)
print(top_release_year)
plot(top_release_year, y=:release_year, x=:count, label=:count_str,
Geom.bar(orientation=:horizontal), Geom.label(position=:right),
Guide.title("Top 10 release years")) |> PNG("plot1c-release_year.png")
### 1d: director
top_directors_df = netflix_df |>
@mapmany(_.director_arr, {title=_.title, director=__}) |>
@groupby(_.director) |>
@map({director=key(_), count=length(_), count_str=string(length(_))}) |>
@orderby_descending(_.count) |>
@take(10) |>
DataFrame
print(top_directors_df)
plot(top_directors_df, y=:director, x=:count, label=:count_str,
Geom.bar(orientation=:horizontal), Geom.label(position=:right),
Guide.title("Top 10 directors")) |> PNG("plot1d-directors.png")
### 1e: actor
top_actors_df = netflix_df |>
@mapmany(_.actor_arr, {title=_.title, actor=__}) |>
@groupby(_.actor) |>
@map({actor=key(_), count=length(_), count_str=string(length(_))}) |>
@orderby_descending(_.count) |>
@take(10) |>
DataFrame
print(top_actors_df)
plot(top_actors_df, y=:actor, x=:count, label=:count_str,
Geom.bar(orientation=:horizontal), Geom.label(position=:right),
Guide.title("Top 10 actors")) |> PNG("plot1e-actors.png")
### 1f: genre
### listed_in is also a comma separated column - we'll have to split it
top_genres_df = netflix_df |>
@mapmany(_.genre_arr, {title=_.title, genre=__}) |>
@groupby(_.genre) |>
@map({genre=key(_), count=length(_), count_str=string(length(_))}) |>
@orderby_descending(_.count) |>
@take(10) |>
DataFrame
print(top_genres_df)
plot(top_genres_df, y=:genre, x=:count, label=:count_str,
Geom.bar(orientation=:horizontal), Geom.label(position=:right),
Guide.title("Top 10 genres")) |> PNG("plot1f-genres.png")
### 1g: mature rating
rating_df = netflix_df |>
@groupby(_.rating) |>
@map({rating=key(_), count=length(_)}) |>
@mutate(count_str=string(_.count)) |>
@orderby_descending(_.count) |>
DataFrame
print(rating_df)
plot(rating_df, y=:rating, x=:count, label=:count_str,
Geom.bar(orientation=:horizontal), Geom.label(position=:right),
Guide.title("Titles by mature rating")) |> PNG("plot1g-rating.png")
## 2: next, let's do some multivariate analysis
### first, do some frequency table analysis
unnested_netflix_df = netflix_df |>
@mutate(added_year=Dates.year(_.date_added)) |>
@mapmany(_.country_arr, {title=_.title, type=_.type, rating=_.rating, added_year=_.added_year, country=__, genre_arr=_.genre_arr, director_arr=_.director_arr, actor_arr=_.actor_arr}) |>
@mapmany(_.genre_arr, {title=_.title, type=_.type, rating=_.rating, added_year=_.added_year, country=_.country, genre=__, director_arr=_.director_arr, actor_arr=_.actor_arr}) |>
@mapmany(_.director_arr, {title=_.title, type=_.type, rating=_.rating, added_year=_.added_year, country=_.country, genre=_.genre, director=__, actor_arr=_.actor_arr}) |>
@mapmany(_.actor_arr, {title=_.title, type=_.type, rating=_.rating, added_year=_.added_year, country=_.country, genre=_.genre, director=_.director, actor=__}) |>
DataFrame
### freq tables:
### I: type x country
ft1 = unnested_netflix_df |>
@groupby({_.type, _.country}) |>
@map({type=key(_)[1], country=key(_)[2], n=length(_)}) |>
@orderby(_.type) |> @thenby(_.country) |>
DataFrame
# long form, more readable, but missing out combinations that did not appear
ft1 |> CSV.write("freqtable_type_country-long.csv")
# wide form, allow for an overview including missing cases
wft1 = unstack(ft1, :country, :n)
wft1 |> CSV.write("freqtable_type_country-wide.csv")
### II: type x genre
ft2 = unnested_netflix_df |>
@groupby({_.type, _.genre}) |>
@map({type=key(_)[1], genre=key(_)[2], n=length(_)}) |>
@orderby(_.type) |> @thenby(_.genre) |>
DataFrame
# long form, more readable, but missing out combinations that did not appear
ft2 |> CSV.write("freqtable_type_genre-long.csv")
# wide form, allow for an overview including missing cases
wft2 = unstack(ft2, :genre, :n)
wft2 |> CSV.write("freqtable_type_genre-wide.csv")
### III: type x director
ft3 = unnested_netflix_df |>
@groupby({_.type, _.director}) |>
@map({type=key(_)[1], director=key(_)[2], n=length(_)}) |>
@orderby(_.type) |> @thenby(_.director) |>
DataFrame
# long form, more readable, but missing out combinations that did not appear
ft3 |> CSV.write("freqtable_type_director-long.csv")
# wide form, allow for an overview including missing cases
wft3 = unstack(ft3, :director, :n)
wft3 |> CSV.write("freqtable_type_director-wide.csv")
### IV: type x actor
ft4 = unnested_netflix_df |>
@groupby({_.type, _.actor}) |>
@map({type=key(_)[1], actor=key(_)[2], n=length(_)}) |>
@orderby(_.type) |> @thenby(_.actor) |>
DataFrame
# long form, more readable, but missing out combinations that did not appear
ft4 |> CSV.write("freqtable_type_actor-long.csv")
# wide form, allow for an overview including missing cases
wft4 = unstack(ft4, :actor, :n)
wft4 |> CSV.write("freqtable_type_actor-wide.csv")
### V: genre x country
ft5 = unnested_netflix_df |>
@groupby({_.genre, _.country}) |>
@map({genre=key(_)[1], country=key(_)[2], n=length(_)}) |>
@orderby(_.genre) |> @thenby(_.country) |>
DataFrame
# long form, more readable, but missing out combinations that did not appear
ft5 |> CSV.write("freqtable_genre_country-long.csv")
# wide form, allow for an overview including missing cases
wft5 = unstack(ft5, :country, :n)
wft5 |> CSV.write("freqtable_genre_country-wide.csv")
### VI: genre x director
ft6 = unnested_netflix_df |>
@groupby({_.genre, _.director}) |>
@map({genre=key(_)[1], director=key(_)[2], n=length(_)}) |>
@orderby(_.genre) |> @thenby(_.director) |>
DataFrame
# long form, more readable, but missing out combinations that did not appear
ft6 |> CSV.write("freqtable_genre_director-long.csv")
# wide form, allow for an overview including missing cases
wft6 = unstack(ft6, :director, :n)
wft6 |> CSV.write("freqtable_genre_director-wide.csv")
### VII: genre x actor
ft7 = unnested_netflix_df |>
@groupby({_.genre, _.actor}) |>
@map({genre=key(_)[1], actor=key(_)[2], n=length(_)}) |>
@orderby(_.genre) |> @thenby(_.actor) |>
DataFrame
# long form, more readable, but missing out combinations that did not appear
ft7 |> CSV.write("freqtable_genre_actor-long.csv")
# wide form, allow for an overview including missing cases
wft7 = unstack(ft7, :actor, :n)
wft7 |> CSV.write("freqtable_genre_actor-wide.csv")
### VIII: genre x rating
ft8 = unnested_netflix_df |>
@groupby({_.genre, _.rating}) |>
@map({genre=key(_)[1], rating=key(_)[2], n=length(_)}) |>
@orderby(_.genre) |> @thenby(_.rating) |>
DataFrame
# long form, more readable, but missing out combinations that did not appear
ft8 |> CSV.write("freqtable_genre_rating-long.csv")
# wide form, allow for an overview including missing cases
wft8 = unstack(ft8, :rating, :n)
wft8 |> CSV.write("freqtable_genre_rating-wide.csv")
### secondly, make some multivariate plots
### general plotting settings
set_default_plot_size(30cm, 18cm)
### a: type x release year
type_year_df = netflix_df |>
@groupby({_.type, _.release_year}) |>
@map({type=key(_)[1], release_year=key(_)[2], count=length(_)}) |>
@mutate(count_str=string(_.count)) |>
@orderby_descending(_.count) |>
DataFrame
print(type_year_df)
plot(type_year_df, y=:count, x=:release_year, color=:type,
Geom.line, Geom.point,
style(line_width=0.8mm),
Guide.title("Evolution of type by release year")) |> PNG("plot2a1-type-release_year.png")
plot(type_year_df, y=:count, x=:release_year, color=:type,
Geom.bar(position=:stack),
Guide.title("Evolution of type by release year")) |> PNG("plot2a2-type-release_year.png")
### b: type x added year
type_added_year_df = netflix_df |>
@mutate(added_year=Dates.year(_.date_added)) |>
@groupby({_.type, _.added_year}) |>
@map({type=key(_)[1], added_year=key(_)[2], count=length(_)}) |>
@mutate(count_str=string(_.count)) |>
@orderby_descending(_.count) |>
DataFrame
print(type_added_year_df)
plot(type_added_year_df, y=:count, x=:added_year, color=:type,
Geom.line, Geom.point,
style(line_width=0.8mm),
Guide.title("Evolution of type by added year")) |> PNG("plot2b1-type-added_year.png")
plot(type_added_year_df, y=:count, x=:added_year, color=:type,
Geom.bar(position=:stack),
Guide.title("Evolution of type by added year")) |> PNG("plot2b2-type-added_year.png")
### c: country x type x added year
country_type_added_year_df = netflix_df |>
@mutate(added_year=Dates.year(_.date_added)) |>
@mapmany(_.country_arr, {title=_.title, type=_.type, added_year=_.added_year, country=__}) |>
@groupby({_.country, _.type, _.added_year}) |>
@map({country=key(_)[1], type=key(_)[2], added_year=key(_)[3], count=length(_)}) |>
@filter(_.count >= 60) |>
DataFrame
print(country_type_added_year_df)
plot(country_type_added_year_df, y=:count, x=:added_year, color=:country,
Geom.bar(position=:stack),
Guide.title("Number of titles by country, added year")) |> PNG("plot2c1-country-added_year.png")
plot(country_type_added_year_df, y=:count, x=:country, color=:type,
Geom.bar(position=:stack),
Guide.title("Number of titles by country, type")) |> PNG("plot2c2-country-type.png")
plot(country_type_added_year_df, x=:added_year, y=:count, xgroup=:type, ygroup=:country,
Geom.subplot_grid(Geom.bar),
Guide.title("Top country x added year x type")) |> PNG("plot2c3-country-type-added_year.png")
### d: country x genre x added year
country_genre_added_year_df = netflix_df |>
@mutate(added_year=Dates.year(_.date_added)) |>
@mapmany(_.country_arr, {title=_.title, added_year=_.added_year, country=__, genre_arr=_.genre_arr}) |>
@mapmany(_.genre_arr, {title=_.title, added_year=_.added_year, country=_.country, genre=__}) |>
@groupby({_.country, _.genre, _.added_year}) |>
@map({country=key(_)[1], genre=key(_)[2], added_year=string(key(_)[3]), count=length(_)}) |>
@filter(_.count >= 50) |>
DataFrame
print(country_genre_added_year_df)
plot(country_genre_added_year_df, y=:count, x=:country, color=:genre,
Geom.bar(position=:stack),
# Scale.color_discrete(n -> convert(Vector{Color}, colormap("RdBu", n)) ), # use a red-blue palette
Scale.color_discrete(n -> convert(Vector{Color}, # use a custom palette
Scale.lab_gradient("darkorchid4", "snow2", "darkorange2").(0:(1/(n-1)):1)
) ),
Guide.title("Number of titles by country, genre")) |> PNG("plot2d1-country-genre.png")
plot(country_genre_added_year_df, x=:added_year, y=:count, xgroup=:genre, ygroup=:country,
Geom.subplot_grid(Geom.bar),
style(minor_label_font_size=09pt),
Guide.title("Top country x added year x genre")) |> PNG("plot2d2-country-genre-added_year.png")