diff --git a/examples/autodist/Dockerfile b/examples/autodist/Dockerfile
new file mode 100644
--- /dev/null
+++ b/examples/autodist/Dockerfile
@@ -0,0 +1,93 @@
+# Copyright 2020 Petuum, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Image for the AutoDist BERT example run as an AdaptDL job.
+# COPY sources below are relative to the build context, which must be the
+# repository root (it provides adaptdl/ and examples/).
+FROM tensorflow/tensorflow:2.2.0-gpu
+
+# Install apps
+COPY adaptdl adaptdl
+COPY examples/requirements.txt .
+
+RUN cd adaptdl && python3 setup.py bdist_wheel
+
+ARG ADAPTDL_VERSION=0.0.0
+# Delete the wheel in the layer that installs it so it is not kept in the image.
+RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl && \
+    rm -rf adaptdl/dist
+RUN pip install -r requirements.txt
+
+# autodist env
+SHELL ["/bin/bash", "-cu"]
+
+RUN rm -rf /etc/bash.bashrc
+
+RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
+        build-essential \
+        git \
+        curl \
+        vim \
+        wget \
+        unzip && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /root
+# pip is already installed (it is used above), so upgrade it in place instead
+# of downloading and executing a bootstrap script.
+RUN python -m pip install --upgrade pip
+
+COPY examples/autodist/bert_config.json bert_config.json
+COPY examples/autodist/tf_examples.tfrecord tf_examples.tfrecord
+RUN git clone https://github.com/petuum/autodist.git
+WORKDIR /root/autodist
+RUN git checkout integration
+# Fetch, unpack and delete the protoc archive in a single layer.
+RUN wget -O /tmp/protoc.zip https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip && \
+    unzip -o /tmp/protoc.zip && \
+    rm /tmp/protoc.zip
+RUN PROTOC=$(pwd)/bin/protoc python setup.py build
+# Quoted so the shell cannot treat [dev] as a glob pattern.
+RUN pip install -e '.[dev]'
+RUN pip install tensorflow_hub
+
+# setup ssh
+# Refresh the package index in this layer; the lists were removed above.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends openssh-client openssh-server && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir -p /var/run/sshd
+
+WORKDIR /root
+# NOTE(review): the key pair is generated at build time, so every container
+# started from this image shares the same private key.
+RUN mkdir -p /root/.ssh && \
+    ssh-keygen -q -t rsa -N "" -f /root/.ssh/id_rsa && \
+    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
+    chown -R root /root/.ssh && \
+    chmod 700 /root/.ssh && \
+    chmod 600 /root/.ssh/authorized_keys
+
+RUN sed -i \
+        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' \
+        -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' \
+        /etc/ssh/sshd_config
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+ENV PYTHONUNBUFFERED=true
diff --git a/examples/autodist/adaptdljob.yaml b/examples/autodist/adaptdljob.yaml
new file mode 100644
--- /dev/null
+++ b/examples/autodist/adaptdljob.yaml
@@ -0,0 +1,26 @@
+apiVersion: adaptdl.petuum.com/v1
+kind: AdaptDLJob
+metadata:
+  generateName: integration-
+spec:
+  minReplicas: 2
+  template:
+    spec:
+      containers:
+      - name: main
+        command:
+        - python3
+        - /root/autodist/examples/benchmark/bert_with_adaptdl.py
+        - --input_files=/root/tf_examples.tfrecord
+        - --bert_config_file=/root/bert_config.json
+        - --num_train_epochs=1
+        - --num_steps_per_epoch=1000
+        - --learning_rate=5e-5
+        - --steps_per_loop=1
+        - --autodist_strategy=PS
+        env:
+        - name: ADAPTDL
+          value: "true"
+        resources:
+          limits:
+            nvidia.com/gpu: 1
diff --git a/examples/autodist/bert_config.json b/examples/autodist/bert_config.json
new file mode 100644
index 00000000..a7efa973
--- /dev/null
+++ b/examples/autodist/bert_config.json
@@ -0,0 +1,13 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "type_vocab_size": 2,
+  "vocab_size": 30522
+}
diff --git a/examples/autodist/tf_examples.tfrecord b/examples/autodist/tf_examples.tfrecord
new file mode 100644
index 00000000..49bad5ad
Binary files /dev/null and b/examples/autodist/tf_examples.tfrecord differ
diff --git a/examples/integration/Dockerfile b/examples/integration/Dockerfile
new file mode 100644
--- /dev/null
+++ b/examples/integration/Dockerfile
@@ -0,0 +1,97 @@
+# Copyright 2020 Petuum, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Image for running the AutoDist BERT benchmark from a local AutoDist checkout.
+# COPY sources below are relative to the build context, which must be the
+# repository root (it provides adaptdl/ and examples/).
+FROM tensorflow/tensorflow:2.2.0-gpu
+
+# Install apps
+COPY adaptdl adaptdl
+COPY examples/requirements.txt .
+
+RUN cd adaptdl && python3 setup.py bdist_wheel
+
+ARG ADAPTDL_VERSION=0.0.0
+# Delete the wheel in the layer that installs it so it is not kept in the image.
+RUN ADAPTDL_VERSION=${ADAPTDL_VERSION} pip install adaptdl/dist/*.whl && \
+    rm -rf adaptdl/dist
+RUN pip install -r requirements.txt
+
+WORKDIR /root
+COPY examples examples_adaptdl
+
+# autodist env
+SHELL ["/bin/bash", "-cu"]
+
+RUN rm -rf /etc/bash.bashrc
+
+RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
+        build-essential \
+        git \
+        curl \
+        vim \
+        wget \
+        unzip && \
+    rm -rf /var/lib/apt/lists/*
+
+# pip is already installed (it is used above), so upgrade it in place instead
+# of downloading and executing a bootstrap script.
+RUN python -m pip install --upgrade pip
+
+COPY examples/integration/bert_config.json bert_config.json
+COPY examples/integration/tf_examples.tfrecord tf_examples.tfrecord
+# NOTE(review): autodist/ is not part of this repository; a checkout of
+# petuum/autodist has to be placed at the root of the build context.
+COPY autodist autodist
+RUN pip install tensorflow_hub
+
+# Build from inside the checkout so PROTOC and setup.py resolve against it.
+WORKDIR /root/autodist
+RUN wget -O /tmp/protoc.zip https://github.com/protocolbuffers/protobuf/releases/download/v3.11.0/protoc-3.11.0-linux-x86_64.zip && \
+    unzip -o /tmp/protoc.zip && \
+    rm /tmp/protoc.zip
+RUN PROTOC=$(pwd)/bin/protoc python setup.py build
+RUN rm ./examples/resource_spec.yml
+RUN pip install -e '.[dev]'
+
+# setup ssh
+# Install OpenSSH to communicate between containers
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends openssh-client openssh-server && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir -p /var/run/sshd
+
+WORKDIR /root
+# NOTE(review): the key pair is generated at build time, so every container
+# started from this image shares the same private key.
+RUN mkdir -p /root/.ssh && \
+    ssh-keygen -q -t rsa -N "" -f /root/.ssh/id_rsa && \
+    cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
+    chown -R root /root/.ssh && \
+    chmod 700 /root/.ssh && \
+    chmod 600 /root/.ssh/authorized_keys
+
+RUN sed -i \
+        -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' \
+        -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' \
+        /etc/ssh/sshd_config
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+ENV PYTHONUNBUFFERED=true
diff --git a/examples/integration/adaptdljob.yaml b/examples/integration/adaptdljob.yaml
new file mode 100644
--- /dev/null
+++ b/examples/integration/adaptdljob.yaml
@@ -0,0 +1,23 @@
+apiVersion: adaptdl.petuum.com/v1
+kind: AdaptDLJob
+metadata:
+  generateName: integration-
+spec:
+  minReplicas: 2
+  template:
+    spec:
+      containers:
+      - name: main
+        command:
+        - python3
+        - /root/autodist/examples/benchmark/bert.py
+        - --input_files=/root/tf_examples.tfrecord
+        - --bert_config_file=/root/bert_config.json
+        - --num_train_epochs=1
+        - --num_steps_per_epoch=1000
+        - --learning_rate=5e-5
+        - --steps_per_loop=1
+        - --autodist_strategy=PS
+        resources:
+          limits:
+            nvidia.com/gpu: 1
diff --git a/examples/integration/bert_config.json b/examples/integration/bert_config.json
new file mode 100644
index 00000000..a7efa973
--- /dev/null
+++ b/examples/integration/bert_config.json
@@ -0,0 +1,13 @@
+{
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "type_vocab_size": 2,
+  "vocab_size": 30522
+}
diff --git a/examples/integration/tf_examples.tfrecord b/examples/integration/tf_examples.tfrecord
new file mode 100644
index 00000000..49bad5ad
Binary files /dev/null and b/examples/integration/tf_examples.tfrecord differ
diff --git a/sched/adaptdl_sched/supervisor.py b/sched/adaptdl_sched/supervisor.py
index af88d889..69edeeb6 100644
--- a/sched/adaptdl_sched/supervisor.py
+++ b/sched/adaptdl_sched/supervisor.py
@@ -49,6 +49,7 @@ async def _handle_discover(self, request):
         group = request.match_info["group"]
         timeout = int(request.query.get("timeout", "30"))
         pod_ip_list = None
+        pod_gpu_list = None
         async with kubernetes.watch.Watch() as w:
             stream = w.stream(self._core_api.list_namespaced_pod, namespace,
                               label_selector="adaptdl/job={}".format(name),
@@ -62,6 +63,27 @@ async def _handle_discover(self, request):
                     if pod_ip_list is None:
                         pod_ip_list = [None] * replicas
                     pod_ip_list[rank] = pod.status.pod_ip
+                    try:
+                        gpu_request = request.rel_url.query["gpu"]
+                    except KeyError:
+                        gpu_request = False
+                    if gpu_request:
+                        if pod_gpu_list is None:
+                            pod_gpu_list = [None] * replicas
+                        container = pod.spec.containers
+                        assert len(container) == 1
+                        pod_gpu_list[rank] = \
+                            int(container[0].resources.requests[
+                                'nvidia.com/gpu'])
+                        if all(pod_gpu is not None
+                               for pod_gpu in pod_gpu_list) and \
+                                all(pod_ip is not None
+                                    for pod_ip in pod_ip_list):
+                            assert len(pod_ip_list) == len(pod_gpu_list)
+                            return_list = [(pod_ip_list[i], pod_gpu_list[i])
+                                           for i in range(len(pod_ip_list))]
+                            LOG.info(return_list)
+                            return web.json_response(return_list)
                     if all(pod_ip is not None for pod_ip in pod_ip_list):
                         return web.json_response(pod_ip_list)
         return web.json_response(status=408)  # Timeout.