From 4182878dc796011d2dd08d887c0596cea19371cc Mon Sep 17 00:00:00 2001
From: Pondpaun7z <pppangpondpp7@gmail.com>
Date: Thu, 16 Nov 2017 16:59:30 +0700
Subject: [PATCH 1/6] Updated Spark to 2

---
 Dockerfile   | 10 +++++-----
 README.md    | 14 +++++++-------
 bootstrap.sh |  8 ++++----
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 7fbcaf0..586e729 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,14 +1,14 @@
-FROM sequenceiq/hadoop-docker:2.6.0
+FROM sequenceiq/hadoop-docker:2.7.0
 MAINTAINER SequenceIQ
 
-#support for Hadoop 2.6.0
-RUN curl -s http://d3kbcqa49mib13.cloudfront.net/spark-1.6.1-bin-hadoop2.6.tgz | tar -xz -C /usr/local/
-RUN cd /usr/local && ln -s spark-1.6.1-bin-hadoop2.6 spark
+#support for Hadoop 2.7.0
+RUN curl -s http://d3kbcqa49mib13.cloudfront.net/spark-2.0.0-bin-hadoop2.7.tgz | tar -xz -C /usr/local/
+RUN cd /usr/local && ln -s spark-2.0.0-bin-hadoop2.7 spark
 ENV SPARK_HOME /usr/local/spark
 RUN mkdir $SPARK_HOME/yarn-remote-client
 ADD yarn-remote-client $SPARK_HOME/yarn-remote-client
 
-RUN $BOOTSTRAP && $HADOOP_PREFIX/bin/hadoop dfsadmin -safemode leave && $HADOOP_PREFIX/bin/hdfs dfs -put $SPARK_HOME-1.6.1-bin-hadoop2.6/lib /spark
+RUN $BOOTSTRAP && $HADOOP_PREFIX/bin/hadoop dfsadmin -safemode leave && $HADOOP_PREFIX/bin/hdfs dfs -put $SPARK_HOME-2.0.0-bin-hadoop2.7/jars /spark
 
 ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
 ENV PATH $PATH:$SPARK_HOME/bin:$HADOOP_PREFIX/bin
diff --git a/README.md b/README.md
index 7158578..f9f1b5b 100644
--- a/README.md
+++ b/README.md
@@ -10,12 +10,12 @@ The base Hadoop Docker image is also available as an official [Docker image](htt
 
 ##Pull the image from Docker Repository
 ```
-docker pull sequenceiq/spark:1.6.0
+docker pull sequenceiq/spark:2.0.0
 ```
 
 ## Building the image
 ```
-docker build --rm -t sequenceiq/spark:1.6.0 .
+docker build --rm -t sequenceiq/spark:2.0.0
 ```
 
 ## Running the image
@@ -24,16 +24,16 @@ docker build --rm -t sequenceiq/spark:1.6.0 .
 * in your /etc/hosts file add $(boot2docker ip) as host 'sandbox' to make it easier to access your sandbox UI
 * open yarn UI ports when running container
 ```
-docker run -it -p 8088:8088 -p 8042:8042 -p 4040:4040 -h sandbox sequenceiq/spark:1.6.0 bash
+docker run -it -p 8088:8088 -p 8042:8042 -p 4040:4040 -h sandbox sequenceiq/spark:2.0.0 bash
 ```
 or
 ```
-docker run -d -h sandbox sequenceiq/spark:1.6.0 -d
+docker run -d -h sandbox sequenceiq/spark:2.0.0 -d
 ```
 
 ## Versions
 ```
-Hadoop 2.6.0 and Apache Spark v1.6.0 on Centos
+Hadoop 2.7.0 and Apache Spark v2.0.0 on Centos
 ```
 
 ## Testing
@@ -71,7 +71,7 @@ spark-submit \
 --driver-memory 1g \
 --executor-memory 1g \
 --executor-cores 1 \
-$SPARK_HOME/lib/spark-examples-1.6.0-hadoop2.6.0.jar
+$SPARK_HOME/lib/spark-examples-2.0.0-hadoop2.7.0.jar
 ```
 
 Estimating Pi (yarn-client mode):
@@ -84,7 +84,7 @@ spark-submit \
 --driver-memory 1g \
 --executor-memory 1g \
 --executor-cores 1 \
-$SPARK_HOME/lib/spark-examples-1.6.0-hadoop2.6.0.jar
+$SPARK_HOME/lib/spark-examples-2.0.0-hadoop2.7.0.jar
 ```
 
 ### Submitting from the outside of the container
diff --git a/bootstrap.sh b/bootstrap.sh
index c01eeda..3ceecde 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -13,7 +13,7 @@ cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do  echo == $cp;
 sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
 
 # setting spark defaults
-echo spark.yarn.jar hdfs:///spark/spark-assembly-1.6.0-hadoop2.6.0.jar > $SPARK_HOME/conf/spark-defaults.conf
+echo spark.yarn.jar hdfs:///spark/spark-yarn_2.11-2.0.0.jar > $SPARK_HOME/conf/spark-defaults.conf
 cp $SPARK_HOME/conf/metrics.properties.template $SPARK_HOME/conf/metrics.properties
 
 service sshd start
@@ -25,8 +25,8 @@ $HADOOP_PREFIX/sbin/start-yarn.sh
 CMD=${1:-"exit 0"}
 if [[ "$CMD" == "-d" ]];
 then
-	service sshd stop
-	/usr/sbin/sshd -D -d
+  service sshd stop
+  /usr/sbin/sshd -D -d
 else
-	/bin/bash -c "$*"
+  /bin/bash -c "$*"
 fi

From 826cb837a7cb13cdce2dfd84bffeeff886bec8b3 Mon Sep 17 00:00:00 2001
From: Pondpaun7z <pppangpondpp7@gmail.com>
Date: Fri, 17 Nov 2017 22:37:12 +0700
Subject: [PATCH 2/6] Added config

---
 Dockerfile   | 10 ++++------
 bootstrap.sh |  5 +----
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 586e729..5320176 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,23 +2,21 @@ FROM sequenceiq/hadoop-docker:2.7.0
 MAINTAINER SequenceIQ
 
 #support for Hadoop 2.7.0
-RUN curl -s http://d3kbcqa49mib13.cloudfront.net/spark-2.0.0-bin-hadoop2.7.tgz | tar -xz -C /usr/local/
-RUN cd /usr/local && ln -s spark-2.0.0-bin-hadoop2.7 spark
+RUN curl -s http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz | tar -xz -C /usr/local/
+RUN cd /usr/local && ln -s spark-2.1.0-bin-hadoop2.7 spark
 ENV SPARK_HOME /usr/local/spark
 RUN mkdir $SPARK_HOME/yarn-remote-client
 ADD yarn-remote-client $SPARK_HOME/yarn-remote-client
 
-RUN $BOOTSTRAP && $HADOOP_PREFIX/bin/hadoop dfsadmin -safemode leave && $HADOOP_PREFIX/bin/hdfs dfs -put $SPARK_HOME-2.0.0-bin-hadoop2.7/jars /spark
+RUN $BOOTSTRAP && $HADOOP_PREFIX/bin/hadoop dfsadmin -safemode leave && $HADOOP_PREFIX/bin/hdfs dfs -put $SPARK_HOME-2.1.0-bin-hadoop2.7/jars /spark
 
 ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
 ENV PATH $PATH:$SPARK_HOME/bin:$HADOOP_PREFIX/bin
+
 # update boot script
 COPY bootstrap.sh /etc/bootstrap.sh
 RUN chown root.root /etc/bootstrap.sh
 RUN chmod 700 /etc/bootstrap.sh
 
-#install R
-RUN rpm -ivh http://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm
-RUN yum -y install R
 
 ENTRYPOINT ["/etc/bootstrap.sh"]
diff --git a/bootstrap.sh b/bootstrap.sh
index 3ceecde..b994454 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -12,14 +12,11 @@ cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do  echo == $cp;
 # altering the core-site configuration
 sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
 
-# setting spark defaults
-echo spark.yarn.jar hdfs:///spark/spark-yarn_2.11-2.0.0.jar > $SPARK_HOME/conf/spark-defaults.conf
-cp $SPARK_HOME/conf/metrics.properties.template $SPARK_HOME/conf/metrics.properties
 
 service sshd start
 $HADOOP_PREFIX/sbin/start-dfs.sh
 $HADOOP_PREFIX/sbin/start-yarn.sh
-
+$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
 
 
 CMD=${1:-"exit 0"}

From 4d06f2d02d29811121bc85b1451dba205256a817 Mon Sep 17 00:00:00 2001
From: Pondpaun7z <pppangpondpp7@gmail.com>
Date: Fri, 17 Nov 2017 22:41:21 +0700
Subject: [PATCH 3/6] updated README.md

---
 README.md | 31 +++++++++++--------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index f9f1b5b..35e6061 100644
--- a/README.md
+++ b/README.md
@@ -10,12 +10,12 @@ The base Hadoop Docker image is also available as an official [Docker image](htt
 
 ##Pull the image from Docker Repository
 ```
-docker pull sequenceiq/spark:2.0.0
+docker pull sequenceiq/spark:2.1.0
 ```
 
 ## Building the image
 ```
-docker build --rm -t sequenceiq/spark:2.0.0
+docker build --rm -t sequenceiq/spark:2.1.0
 ```
 
 ## Running the image
@@ -24,16 +24,16 @@ docker build --rm -t sequenceiq/spark:2.0.0
 * in your /etc/hosts file add $(boot2docker ip) as host 'sandbox' to make it easier to access your sandbox UI
 * open yarn UI ports when running container
 ```
-docker run -it -p 8088:8088 -p 8042:8042 -p 4040:4040 -h sandbox sequenceiq/spark:2.0.0 bash
+docker run -it -p 8088:8088 -p 8042:8042 -p 4040:4040 -h sandbox sequenceiq/spark:2.1.0 bash
 ```
 or
 ```
-docker run -d -h sandbox sequenceiq/spark:2.0.0 -d
+docker run -d -h sandbox sequenceiq/spark:2.1.0 -d
 ```
 
 ## Versions
 ```
-Hadoop 2.7.0 and Apache Spark v2.0.0 on Centos
+Hadoop 2.7.0 and Apache Spark v2.1.0 on Centos
 ```
 
 ## Testing
@@ -47,7 +47,7 @@ In yarn-client mode, the driver runs in the client process, and the application
 ```
 # run the spark shell
 spark-shell \
---master yarn-client \
+--master yarn \
 --driver-memory 1g \
 --executor-memory 1g \
 --executor-cores 1
@@ -66,26 +66,17 @@ Estimating Pi (yarn-cluster mode):
 # note you must specify --files argument in cluster mode to enable metrics
 spark-submit \
 --class org.apache.spark.examples.SparkPi \
---files $SPARK_HOME/conf/metrics.properties \
---master yarn-cluster \
+--master yarn --deploy-mode cluster \
 --driver-memory 1g \
 --executor-memory 1g \
 --executor-cores 1 \
-$SPARK_HOME/lib/spark-examples-2.0.0-hadoop2.7.0.jar
+$SPARK_HOME/examples/jars/spark-examples*.jar
 ```
 
-Estimating Pi (yarn-client mode):
 
-```
-# execute the the following command which should print the "Pi is roughly 3.1418" to the screen
-spark-submit \
---class org.apache.spark.examples.SparkPi \
---master yarn-client \
---driver-memory 1g \
---executor-memory 1g \
---executor-cores 1 \
-$SPARK_HOME/lib/spark-examples-2.0.0-hadoop2.7.0.jar
-```
+### View Result in hadoop
+
+visit `http://localhost:8088`
 
 ### Submitting from the outside of the container
 To use Spark from outside of the container it is necessary to set the YARN_CONF_DIR environment variable to directory with a configuration appropriate for the docker. The repository contains such configuration in the yarn-remote-client directory.

From b4f6c6712bd37103659b42d034ca6a2872ea0d14 Mon Sep 17 00:00:00 2001
From: yohei1126 <yohei.onishi@fastretailing.com>
Date: Sat, 13 Jan 2018 19:26:21 +0900
Subject: [PATCH 4/6] support spark2.2.0

---
 Dockerfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 5320176..27fa944 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,13 +2,13 @@ FROM sequenceiq/hadoop-docker:2.7.0
 MAINTAINER SequenceIQ
 
 #support for Hadoop 2.7.0
-RUN curl -s http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz | tar -xz -C /usr/local/
-RUN cd /usr/local && ln -s spark-2.1.0-bin-hadoop2.7 spark
+RUN curl -s http://d3kbcqa49mib13.cloudfront.net/spark-2.2.0-bin-hadoop2.7.tgz | tar -xz -C /usr/local/
+RUN cd /usr/local && ln -s spark-2.2.0-bin-hadoop2.7 spark
 ENV SPARK_HOME /usr/local/spark
 RUN mkdir $SPARK_HOME/yarn-remote-client
 ADD yarn-remote-client $SPARK_HOME/yarn-remote-client
 
-RUN $BOOTSTRAP && $HADOOP_PREFIX/bin/hadoop dfsadmin -safemode leave && $HADOOP_PREFIX/bin/hdfs dfs -put $SPARK_HOME-2.1.0-bin-hadoop2.7/jars /spark
+RUN $BOOTSTRAP && $HADOOP_PREFIX/bin/hadoop dfsadmin -safemode leave && $HADOOP_PREFIX/bin/hdfs dfs -put $SPARK_HOME-2.2.0-bin-hadoop2.7/jars /spark
 
 ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
 ENV PATH $PATH:$SPARK_HOME/bin:$HADOOP_PREFIX/bin

From 27723cfa6e67b88ac6f31c356642ee7c8ccb2581 Mon Sep 17 00:00:00 2001
From: yohei1126 <yohei.onishi@fastretailing.com>
Date: Sat, 13 Jan 2018 19:30:49 +0900
Subject: [PATCH 5/6] fix README

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 35e6061..473aad6 100644
--- a/README.md
+++ b/README.md
@@ -8,14 +8,14 @@ Apache Spark on Docker
 This repository contains a Docker file to build a Docker image with Apache Spark. This Docker image depends on our previous [Hadoop Docker](https://github.com/sequenceiq/hadoop-docker) image, available at the SequenceIQ [GitHub](https://github.com/sequenceiq) page.
 The base Hadoop Docker image is also available as an official [Docker image](https://registry.hub.docker.com/u/sequenceiq/hadoop-docker/).
 
-##Pull the image from Docker Repository
+## Pull the image from Docker Repository
 ```
-docker pull sequenceiq/spark:2.1.0
+docker pull sequenceiq/spark:2.2.0
 ```
 
 ## Building the image
 ```
-docker build --rm -t sequenceiq/spark:2.1.0
+docker build --rm -t sequenceiq/spark:2.2.0 .
 ```
 
 ## Running the image
@@ -24,16 +24,16 @@ docker build --rm -t sequenceiq/spark:2.1.0
 * in your /etc/hosts file add $(boot2docker ip) as host 'sandbox' to make it easier to access your sandbox UI
 * open yarn UI ports when running container
 ```
-docker run -it -p 8088:8088 -p 8042:8042 -p 4040:4040 -h sandbox sequenceiq/spark:2.1.0 bash
+docker run -it -p 8088:8088 -p 8042:8042 -p 4040:4040 -h sandbox sequenceiq/spark:2.2.0 bash
 ```
 or
 ```
-docker run -d -h sandbox sequenceiq/spark:2.1.0 -d
+docker run -d -h sandbox sequenceiq/spark:2.2.0 -d
 ```
 
 ## Versions
 ```
-Hadoop 2.7.0 and Apache Spark v2.1.0 on Centos
+Hadoop 2.7.0 and Apache Spark v2.2.0 on CentOS
 ```
 
 ## Testing

From 0a736516070086e4b146e511439b003bd611571b Mon Sep 17 00:00:00 2001
From: yohei1126 <yohei.onishi@fastretailing.com>
Date: Sun, 21 Jan 2018 17:22:24 +0900
Subject: [PATCH 6/6] use yohei1126/hadoop-docker:2.7.1

---
 Dockerfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 27fa944..2fb1d9b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
-FROM sequenceiq/hadoop-docker:2.7.0
-MAINTAINER SequenceIQ
+FROM yohei1126/hadoop-docker:2.7.1
+MAINTAINER yohei1126
 
-#support for Hadoop 2.7.0
+#support for Hadoop 2.7.1
 RUN curl -s http://d3kbcqa49mib13.cloudfront.net/spark-2.2.0-bin-hadoop2.7.tgz | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s spark-2.2.0-bin-hadoop2.7 spark
 ENV SPARK_HOME /usr/local/spark