-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgeostack.tex
411 lines (360 loc) · 16.4 KB
/
geostack.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
\documentclass[hyperref={pdfpagelabels=true}]{beamer}
\usepackage{lmodern}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
%
%You are free to:
%
% Share — copy and redistribute the material in any medium or format
% Adapt — remix, transform, and build upon the material
% for any purpose, even commercially.
%
% The licensor cannot revoke these freedoms as long as you follow the license terms.
%
%Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
%
%ShareAlike — If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\definecolor{dred}{rgb}{0.647059, 0.164706, 0.164706}
\definecolor{dgreen}{rgb}{0., 0.545098, 0.545098}
\usecolortheme[named=dred]{structure}
\title{A GEO-stack for Big Data}
\subtitle{ Driving Spatial Analysis Beyond the Limits of
Traditional Storage}
\author{Joana Sim\~{o}es}
\author[shortname]{Joana Sim\~{o}es \inst{1}}
\institute[shortinst]{\inst{1} Bdigital, CASA, CICS.NOVA}
%\date{\today}
\titlegraphic{\includegraphics[width=.35\textwidth]{bigdata2.png}}
\usepackage{beamerthemeshadow}
%\usepackage{beamerthemesplit}
\usepackage{listings}
\newcommand{\soooo}{H$_2$SO$_4$}
%fdl stuff
\usepackage{hyperref}
\hypersetup{colorlinks,
citecolor=black,
filecolor=black,
linkcolor=black,
urlcolor=black,
bookmarksopen=true,
pdftex}
\hfuzz = .6pt % avoid black boxes
\lstset{language=SQL}
\begin{document}
\setbeamertemplate{footline}[page number]
\setbeamertemplate{navigation symbols}{}
\begin{frame}
\titlepage
%\begin{titlepage}
%\centering{
% \includegraphics[scale=0.2]{bigdata2.png}
% \includegraphics[scale=0.2]{bigdata1.png}
%}
%\end{titlepage}
\end{frame}
\begin{frame}
\begin{figure}
\includegraphics[width=\textwidth]{wordcloud.jpg}
\end{figure}
\tiny{Big Data Word Cloud; source: \url{http://olap.com/big-data/}}
\end{frame}
\begin{frame}
\frametitle{Table of Contents}
%\tiny{
\tableofcontents%}
\end{frame}
\begin{frame}
\frametitle{Warning}
\begin{figure}
\includegraphics[scale=0.40]{warning.jpeg}
\end{figure}
\textbackslash *
This presentation may contain tech talk, such as: databases, clusters, NoSQL, cloud, parallelism, scalability.\\
If you are susceptible to these concepts, you may want to leave the room \textbf{now}.\\
From this point beyond, you are at your own risk!
\textbackslash *
\end{frame}
\section{The Value of Data}
\begin{frame}
\frametitle{The Value of Data}
%Not a new idea
\textbf{Not a New Idea!}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{itemize}
\item<1->Matthew Fontaine Maury (1806--1873).
\item<2->Foresaw the hidden value in captains' ship logs, when analysed collectively.
\item<2->Used time series data to carry out analysis that would enable him to recommend optimal shipping routes.%based on winds and currents
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[scale=0.40]{maury.jpg}
\end{figure}
\end{column}
\end{columns}
%American sailor,
%A leg injury turned him to study meteorology, astrology, oceanography, cartography
\end{frame}
%\includegraphics[scale=0.2]{wordcloud_ds.png}\\
%sexiest job of the 21st Century’
%\tiny{Data Science Word Cloud; source:\url{http://data.library.virginia.edu/statlab/}}
\begin{frame}
\frametitle{The Value of Data}
\textbf{Data Mining, Open-source and Crowd-sourcing}
\small{
\begin{itemize}
\item<2->In 1848, Captain Jackson was the first person to try the route recommended by Maury, and as a result he was able to save 17 days on his outbound trip.
\item<3->Apart from collecting existing logs, Maury encouraged the collection of more regular and systematic time series, by creating a template.
\item<4->Collected Data: longitude, latitude, currents, magnetic variation, air and water temperature, general wind direction, etc.
\end{itemize}
}
\begin{figure}
\includegraphics[scale=0.15]{log.jpg}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Data Analysis}
\textbf{A multidisciplinary field}
\begin{figure}
\includegraphics[width=0.8\textwidth]{data_analysis.png}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Data Analysis}
%In the past few years, this is what people were using
\textbf{Traditional Stack}
\begin{itemize}
\item Spreadsheets (e.g.: Excel, OpenOffice)
\item RDBMS (e.g.: Oracle, PostgreSQL, MySQL)
\item Statistical Packages (e.g.: R, Matlab)
\item GIS Packages (e.g.: QGIS)
\item Scripting and programming languages (e.g.: Python, Java)
\item Libraries
\end{itemize}
\end{frame}
\section{The Big Data Revolution}
\begin{frame}
\frametitle{Big Data}
\textbf{What changed in recent years?}
\begin{itemize}
\item Differences in the way global business and transportation are done have exploded the volume of traditional data sources.%a much larger scale for traditional types of data
\item Widespread increase in data from sensors. %Internet of things: wide arrays of machines that report back to servers or communicate directly with each other
\end{itemize}
\vspace{5mm}
\begin{figure}
\includegraphics[width=0.4\textwidth]{cluster.jpg}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Big Data}
\textbf{Smart Citizen Project}
\begin{columns}
\begin{column}{0.5\textwidth}\small{
\begin{itemize}
\item<2->Platform to generate participatory processes of people in the cities, by connecting data, people and knowledge.
\item<3->Based on geolocation, Internet and free hardware and software for data collection and sharing.
\item<4->Relies on the production of objects to connect people with their environment and their city.
\end{itemize} }
\end{column}
\begin{column}{0.5\textwidth}
\includegraphics[width=\textwidth]{smart_city.png}\\
%Although it initally started in BCN (FabLab), these crowd sourcing data collection structures have the potential to increase exponentially.
\tiny{\url{https://smartcitizen.me/}}
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Big Data}
\small{
\begin{itemize}
\item<2->A great deal of this data is actually geo-located (e.g.: satellite navigation coordinates, IP addresses). % satellite navigation
%Unlike other sciences (e.g.: astronomy,medicine) till recently geography suffered from the lack of available spatial data.
\item<3->Geography finally has the opportunity to switch from being based on guesses and samples to becoming a truly data-driven science.
\end{itemize}
}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{old_map.png}
\end{figure}
\tiny{Based on reports, Fillipo Arrieta published maps describing a multi-stage containment plan designed to limit the plague in Bari (1694).}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{ebola_ushahidi.jpg}
\end{figure}
\tiny{The iLab used the Ushahidi platform to collect and display crowd-sourced information about Ebola in Liberia (2014).}
\end{column}
\end{columns}
%In 1694 Bari, Italy administrator Fillipo Arrieta published two maps that carefully detailed a multi-stage containment plan designed to limit plague incursions in Bari, Italy.
\end{frame}
\begin{frame}
\frametitle{Big Data}
\textbf{What characterizes Big Data?}%3vs
\begin{itemize}
\item<2->\textbf{Volume}: Large amounts of data.
\item<3->\textbf{Variety}: Different types of structured, unstructured, and multi-structured data.
\item<4->\textbf{Velocity}: Needs to be analyzed quickly.
\end{itemize}
\begin{figure}
\includegraphics[width=0.4\textwidth]{3vs.png}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Technological Challenges}
These characteristics map into challenges:
\begin{itemize}
\item Scalability %horizontal scalability vs vertical scalability
\item Heterogeneity % different data sources, with completely different structure
\item Low latency % real time
\end{itemize}
\vspace{5mm}
When the traditional stack is no longer enough, a paradigm shift is required.
\begin{figure}
\includegraphics[width=0.3\textwidth]{paradigm_shift.jpg}
\end{figure}
%spatial working on the edge
\end{frame}
\begin{frame}
\frametitle{RDBMS vs NoSQL}
%Relational Model formulated by Codd in 1970, represents reality through: tables, rows and columns.
\begin{figure}
\includegraphics[width=0.2\textwidth]{nosql.jpg}
\end{figure}
\begin{itemize}
\item<2->NoSQL databases trade away some capabilities of relational databases (SQL), in order to improve scalability.%SQL language and optimizer
\item<3->Advantages: NoSQL databases are simpler, can handle semi-structured and denormalized data, and have higher scalability.
\item<4->Disadvantages: loss of the abstraction provided by the query optimizer, which increases the complexity of the applications.% the logic is implemented and optimized in the application, an extra burden for the developer
\item<5->Recently, tools were developed that bring back the full power of the SQL language to the NoSQL ecosystem (e.g.: Apache Drill, Hive).
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{RDBMS vs NoSQL}
\begin{figure}
\includegraphics[width=0.8\textwidth]{sql_vs_nosql.jpg}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Examples}
\begin{figure}
\includegraphics[width=\textwidth]{examples.jpg}%key value,columnar, big table, document, graph
\end{figure}
\end{frame}
\begin{frame}
\frametitle{MapReduce}
Programming model for processing and generating large data sets with a parallel, distributed algorithm on a cluster.
%It is basically composed by two operations, inspired by functional programming
\begin{itemize}
\item<1->Map(): performs filtering and sorting.%such as sorting students by first name into queues, one queue for each name
\item<1->Reduce(): performs a summary operation. % such as counting the number of students in each queue, yielding name frequencies
\item<2->The framework coordinates the processing, by marshalling the distributed servers, running the tasks in parallel and managing all communications and data between the various parts of the system.% true parallel system, that can be deployed on the cloud
\item<3->There are many libraries that implement MapReduce.%Apache Hadoop is a popular Free and Open-Source implementation.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{A Big Data Approach}
\textbf{Does this mean we have to throw away our traditional tools and methods?}
%not really. mix and match
\begin{itemize}
\item<2->First ensure that you \textbf{really} have, or will have Big Data at some point in the future.%scalability requirement. Don't go for this approach, for instance if you are working with a small static data sample, that you can approach with a traditional stack (example: number of voters in a local election 2014)
\item<3->Then identify the stages of the workflow that are bottlenecks, in terms of the current technologies.%Most often you will not have Big Data all along the workflow, and nearly always the results of a Big Data workflow are not Big Data (we want to achieve dimensionality reduction).
\item<4->It is possible to mix \& match.%To have a mixed stack combining SQL and noSQL technologies
\end{itemize}
\end{frame}
\section{A Use Case}
\begin{frame}
\textbf{Analysing geo-located Tweets}
\begin{itemize}
\item<2->The purpose of this use case was to analyse the stream of geo-located Tweets, as a sensor for citizen presence.
\item<3->Number of tweets in Catalunya in around 3 months: about 6 million.
\item<4->Continuous stream of data.%This value is increasing everyday.
\item<5->This amount of data is not easily assimilated by the ``human eye'', so we decided to create clusters of Tweets.
\end{itemize}
\begin{figure}
\includegraphics[width=0.4\textwidth]{bigdata1.png}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{A Use Case}
\textbf{Clustering}
\begin{itemize}
\item<2->It is a descriptive data mining technique, often used for dimensionality reduction.
\item<3->It groups a set of objects in such a way that objects in the same group are more similar to each other than to objects in other groups.
\item<4->Strictly, it corresponds to a family of unsupervised machine learning algorithms.
\item<5->We wanted to implement this concept using only Hadoop, and apply it to spatial attributes.%No ML libraries (Mahout)
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Technological Stack}
\begin{figure}
\includegraphics[width=0.7\textwidth]{stack.png}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Workflow}
\begin{figure}
\includegraphics[width=0.5\textwidth]{geo3.png}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{Results}
Using this workflow we were able to turn the original point cloud into a density grid, and then into clusters.
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{heatmap_hive.png}
\end{figure}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[width=\textwidth]{clusters_hive.png}
\end{figure}
\end{column}
\end{columns}
\vspace{5mm}
Clusters expose patterns of presence over time, and can help us to re-define boundaries for the city that are data-driven, rather than administrative.%bottom up vs top-down
%As we identified and solved the bottlenecks with the relevant tools, in theory this algorithm is scalable to any Petabytes of data.%quadrillions of bytes
\end{frame}
\section{Final Remarks}
\begin{frame}
\frametitle{Final Remarks}
\begin{itemize}
\item<2->It is OK to use non-scalable tools at certain points of the workflow.
\item<3->We are working on the edge of existing technologies; some functions have not been implemented yet, and bugs are common.
\item<4->Since it is a niche, this is even more true for spatial technologies.
\item<5->There are no ready-made solutions: the particular stack and workflow should be compiled for the specific case study.
\item<6->This is one stack to solve this problem; it is not the only one, and it may not even be the ``best'' one.% if such a things exists
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Acknowledgements}
I would like to thank Ellen Friedman (MapR, Apache Mahout, Apache Drill), for her inspiring work on Big Data and Machine Learning.
\begin{figure}
\includegraphics[width=0.4\textwidth]{ellen2.jpg}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{References}
\begin{itemize}
\item Aji, A., Wang, F., Vo, H., Lee, R., Liu, Q., Zhang, X. and Saltz, J. ``Hadoop-GIS: A High Performance Spatial Data Warehousing System over MapReduce''. Proceedings of the VLDB Endowment International Conference on Very Large Data Bases, 6(11), p.~1009. (2013)
\item Cuesta, H. ``Practical Data Analysis''. Packt Publishing (2013)
\item Dunning, T. and Friedman, E. ``Time Series Databases: New Ways to Store and Access Data''. O'Reilly Media; 1st edition (October, 2014)
\item Myatt, G. and Johnson, W. ``Making Sense Of Data I: A Practical Guide to Exploratory Data Analysis and Data Mining''. O'Reilly; 2nd edition (2014).
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Thank You!}
\begin{figure}
\includegraphics[width=0.2\textwidth]{thanks.jpg}
\end{figure}
This presentation is available at: \centering{\\ \url{http://tinyurl.com/pcmgzxp}\\}
\vspace{5mm}
Next:
\begin{itemize}
\item<2-> 9as Jornadas de SIG Libre -- 26th--27th March 2015, Girona.
\end{itemize}
\end{frame}
\end{document}