Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial integration with AutoDist #72

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions examples/autodist/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Copyright 2020 Petuum, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# NOTE: the original file declared python:3.6.12 and pytorch stages that were
# never referenced by any COPY --from; only the tensorflow stage contributes
# to the final image, so the dead stages were removed.
FROM tensorflow/tensorflow:2.2.0-gpu

# Make all relative COPY/RUN paths below deterministic instead of relying on
# the base image's default working directory.
WORKDIR /root

# Build and install adaptdl from the local source tree (the build context
# must be the repository root).
COPY adaptdl adaptdl
COPY examples/requirements.txt .

RUN cd adaptdl && python3 setup.py bdist_wheel

ARG ADAPTDL_VERSION=0.0.0
# Install the freshly-built wheel and the example requirements, and drop the
# build output in the same layer so it never persists in the image.
RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl \
    && pip install -r requirements.txt \
    && rm -rf adaptdl/dist

# ---- autodist environment -------------------------------------------------
SHELL ["/bin/bash", "-cu"]

RUN rm -rf /etc/bash.bashrc

# Tooling needed to fetch and build autodist; apt lists are removed in the
# same layer to keep the image small (packages sorted alphabetically).
RUN apt-get update && apt-get install -y --allow-downgrades \
        --allow-change-held-packages --no-install-recommends \
        build-essential \
        curl \
        git \
        unzip \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*

# BUG FIX: the base image ships Python 3.6 and the unversioned get-pip.py
# dropped 3.6 support, so use the version-pinned bootstrap script.
RUN curl -fsSL -o get-pip.py https://bootstrap.pypa.io/pip/3.6/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

COPY bert_config.json bert_config.json
COPY tf_examples.tfrecord tf_examples.tfrecord

RUN git clone https://github.com/petuum/autodist.git
# Absolute WORKDIR (relative WORKDIRs compound and are error-prone).
WORKDIR /root/autodist
RUN git checkout integration
# Fetch protoc, extract it, and delete the archive in the same layer so the
# zip never persists in the image.
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip \
    && unzip protoc-3.11.0-linux-x86_64.zip \
    && rm protoc-3.11.0-linux-x86_64.zip
RUN PROTOC=$(pwd)/bin/protoc python setup.py build
RUN pip install -e .[dev]
RUN pip install tensorflow_hub
# BUG FIX: the original repeated a relative `WORKDIR autodist` here, which
# would have created /root/autodist/autodist; removed as a no-op mistake.

# ---- ssh setup ------------------------------------------------------------
# BUG FIX: `apt-get update` runs in the same layer as the install so the
# package index is never stale from a cached earlier layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends openssh-client openssh-server \
    && mkdir -p /var/run/sshd \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /root
# BUG FIX: ssh-keygen needs an explicit empty passphrase (-N '') or the build
# blocks waiting for interactive input. Also dropped the useless `| cat`.
# SECURITY NOTE(review): baking a private key into the image means anyone
# with the image can ssh into these containers — acceptable only for this
# throwaway example.
RUN mkdir -p /root/.ssh \
    && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
    && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys \
    && chown -R root /root/.ssh \
    && chmod 700 /root/.ssh \
    && chmod 600 /root/.ssh/authorized_keys

# Enable root login with the key generated above so replicas can reach each
# other over ssh.
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
    && sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Allow OpenSSH to talk to containers without asking for confirmation
# (grep reads the file directly; no useless `cat |`).
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

ENV PYTHONUNBUFFERED=true
28 changes: 28 additions & 0 deletions examples/autodist/adaptdljob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# AdaptDLJob running the autodist BERT pretraining benchmark with the
# AdaptDL integration entry point. Starts with 2 replicas, 1 GPU each.
apiVersion: adaptdl.petuum.com/v1
kind: AdaptDLJob
metadata:
  generateName: integration-
spec:
  minReplicas: 2
  template:
    spec:
      containers:
      - name: main
        command:
        - python3
        - /root/autodist/examples/benchmark/bert_with_adaptdl.py
        # BUG FIX: was "-input_files" (single dash) while every other flag
        # uses "--"; normalized for consistency.
        - --input_files=/root/tf_examples.tfrecord
        - --bert_config_file=/root/bert_config.json
        - --num_train_epochs=1
        - --num_steps_per_epoch=1000
        - --learning_rate=5e-5
        - --steps_per_loop=1
        - --autodist_strategy=PS
        env:
        # Tells the benchmark script it is running under AdaptDL.
        - name: ADAPTDL
          value: "true"
        resources:
          limits:
            nvidia.com/gpu: 1


13 changes: 13 additions & 0 deletions examples/autodist/bert_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 512,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 30522
}
Binary file added examples/autodist/tf_examples.tfrecord
Binary file not shown.
96 changes: 96 additions & 0 deletions examples/integration/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright 2020 Petuum, Inc. All Rights Reserved.
DachengLi1 marked this conversation as resolved.
Show resolved Hide resolved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# NOTE: the original file declared python:3.6.12 and pytorch stages that were
# never referenced by any COPY --from; only the tensorflow stage contributes
# to the final image, so the dead stages were removed.
FROM tensorflow/tensorflow:2.2.0-gpu

# Make all relative COPY/RUN paths below deterministic instead of relying on
# the base image's default working directory.
WORKDIR /root

# Build and install adaptdl from the local source tree (the build context
# must be the repository root).
COPY adaptdl adaptdl
COPY examples/requirements.txt .

RUN cd adaptdl && python3 setup.py bdist_wheel

ARG ADAPTDL_VERSION=0.0.0
# Install the freshly-built wheel and the example requirements, and drop the
# build output in the same layer so it never persists in the image.
RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl \
    && pip install -r requirements.txt \
    && rm -rf adaptdl/dist

COPY examples examples_adaptdl

# ---- autodist environment -------------------------------------------------
SHELL ["/bin/bash", "-cu"]

RUN rm -rf /etc/bash.bashrc

# Tooling needed to build autodist; apt lists are removed in the same layer
# to keep the image small (packages sorted alphabetically).
RUN apt-get update && apt-get install -y --allow-downgrades \
        --allow-change-held-packages --no-install-recommends \
        build-essential \
        curl \
        git \
        unzip \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*

# BUG FIX: the base image ships Python 3.6 and the unversioned get-pip.py
# dropped 3.6 support, so use the version-pinned bootstrap script.
RUN curl -fsSL -o get-pip.py https://bootstrap.pypa.io/pip/3.6/get-pip.py && \
    python get-pip.py && \
    rm get-pip.py

COPY bert_config.json bert_config.json
COPY tf_examples.tfrecord tf_examples.tfrecord
# NOTE(review): COPYing autodist assumes a local checkout inside the build
# context; a fresh clone of the AdaptDL repo will not have it. Consider
# `git clone https://github.com/petuum/autodist.git` instead, as done in
# examples/autodist/Dockerfile.
COPY autodist autodist

RUN pip install tensorflow_hub
# BUG FIX: the original ran a no-op `RUN cd autodist`, downloaded the protoc
# zip with wget and then immediately shadowed it with a COPY of the same
# archive, unzipped it into /root, and pointed PROTOC at autodist/bin/protoc
# — a path that unzip never created. Build inside the autodist tree instead,
# matching examples/autodist/Dockerfile, and drop the archive in-layer.
WORKDIR /root/autodist
RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip \
    && unzip protoc-3.11.0-linux-x86_64.zip \
    && rm protoc-3.11.0-linux-x86_64.zip
RUN PROTOC=$(pwd)/bin/protoc python setup.py build
# Remove the sample resource spec so autodist does not pick it up at runtime.
RUN rm ./examples/resource_spec.yml
RUN pip install -e .[dev]

# ---- ssh setup ------------------------------------------------------------
# Install OpenSSH to communicate between containers.
# BUG FIX: `apt-get update` runs in the same layer as the install so the
# package index is never stale from a cached earlier layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends openssh-client openssh-server \
    && mkdir -p /var/run/sshd \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /root
# BUG FIX: ssh-keygen needs an explicit empty passphrase (-N '') or the build
# blocks waiting for interactive input. Also dropped the useless `| cat`.
# SECURITY NOTE(review): baking a private key into the image means anyone
# with the image can ssh into these containers — acceptable only for this
# throwaway example.
RUN mkdir -p /root/.ssh \
    && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
    && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys \
    && chown -R root /root/.ssh \
    && chmod 700 /root/.ssh \
    && chmod 600 /root/.ssh/authorized_keys

# Enable root login with the key generated above so replicas can reach each
# other over ssh.
RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
    && sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config

# Allow OpenSSH to talk to containers without asking for confirmation
# (grep reads the file directly; no useless `cat |`).
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config

ENV PYTHONUNBUFFERED=true
25 changes: 25 additions & 0 deletions examples/integration/adaptdljob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# AdaptDLJob running the plain autodist BERT pretraining benchmark.
# Starts with 2 replicas, 1 GPU each.
apiVersion: adaptdl.petuum.com/v1
kind: AdaptDLJob
metadata:
  generateName: integration-
spec:
  minReplicas: 2
  template:
    spec:
      containers:
      - name: main
        command:
        - python3
        - /root/autodist/examples/benchmark/bert.py
        # BUG FIX: was "-input_files" (single dash) while every other flag
        # uses "--"; normalized for consistency.
        - --input_files=/root/tf_examples.tfrecord
        - --bert_config_file=/root/bert_config.json
        - --num_train_epochs=1
        - --num_steps_per_epoch=1000
        - --learning_rate=5e-5
        - --steps_per_loop=1
        - --autodist_strategy=PS
        resources:
          limits:
            nvidia.com/gpu: 1


13 changes: 13 additions & 0 deletions examples/integration/bert_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 512,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"type_vocab_size": 2,
"vocab_size": 30522
}
Binary file added examples/integration/tf_examples.tfrecord
Binary file not shown.
22 changes: 22 additions & 0 deletions sched/adaptdl_sched/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ async def _handle_discover(self, request):
group = request.match_info["group"]
timeout = int(request.query.get("timeout", "30"))
pod_ip_list = None
pod_gpu_list = None
async with kubernetes.watch.Watch() as w:
stream = w.stream(self._core_api.list_namespaced_pod, namespace,
label_selector="adaptdl/job={}".format(name),
Expand All @@ -62,6 +63,27 @@ async def _handle_discover(self, request):
if pod_ip_list is None:
pod_ip_list = [None] * replicas
pod_ip_list[rank] = pod.status.pod_ip
try:
gpu_request = request.rel_url.query["gpu"]
except KeyError:
gpu_request = False
if gpu_request:
if pod_gpu_list is None:
pod_gpu_list = [None] * replicas
container = pod.spec.containers
assert len(container) == 1
pod_gpu_list[rank] = \
int(container[0].resources.requests[
'nvidia.com/gpu'])
if all(pod_gpu is not None
for pod_gpu in pod_gpu_list) and \
all(pod_ip is not None
for pod_ip in pod_ip_list):
assert len(pod_ip_list) == len(pod_gpu_list)
return_list = [(pod_ip_list[i], pod_gpu_list[i])
for i in range(len(pod_ip_list))]
LOG.info(return_list)
return web.json_response(return_list)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shall we unify the return values with L87?

if all(pod_ip is not None for pod_ip in pod_ip_list):
return web.json_response(pod_ip_list)
return web.json_response(status=408) # Timeout.
Expand Down