Skip to content

Commit

Permalink
update Dockerfiles
Browse files Browse the repository at this point in the history
  • Loading branch information
matbun committed Nov 3, 2024
1 parent 78abafe commit 67641b6
Show file tree
Hide file tree
Showing 2 changed files with 625 additions and 13 deletions.
170 changes: 157 additions & 13 deletions env-files/torch/jupyter/jupyter-base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ ENV PATH="${CONDA_DIR}/bin:${PATH}" \
HOME="/home/${NB_USER}"

# Copy a script that we will use to correct permissions after running certain commands
COPY fix-permissions /usr/local/bin/fix-permissions
COPY env-files/torch/jupyter/jupyter-base/fix-permissions /usr/local/bin/fix-permissions
RUN chmod a+rx /usr/local/bin/fix-permissions

# Enable prompt color in the skeleton .bashrc before creating the default NB_USER
Expand Down Expand Up @@ -99,7 +99,7 @@ RUN mkdir "/home/${NB_USER}/work" && \
# Correct permissions
# Do all this in a single RUN command to avoid duplicating all of the
# files across image layers when the permissions change
COPY --chown="${NB_UID}:${NB_GID}" initial-condarc "${CONDA_DIR}/.condarc"
COPY --chown="${NB_UID}:${NB_GID}" env-files/torch/jupyter/jupyter-base/initial-condarc "${CONDA_DIR}/.condarc"
WORKDIR /tmp
RUN set -x && \
arch=$(uname -m) && \
Expand All @@ -113,6 +113,8 @@ RUN set -x && \
rm /tmp/micromamba.tar.bz2 && \
PYTHON_SPECIFIER="python=${PYTHON_VERSION}" && \
if [[ "${PYTHON_VERSION}" == "default" ]]; then PYTHON_SPECIFIER="python"; fi && \
# Matteo: fix error https://stackoverflow.com/a/77701510
./micromamba clean --locks && \
# Install the packages
./micromamba install \
--root-prefix="${CONDA_DIR}" \
Expand All @@ -123,7 +125,8 @@ RUN set -x && \
'jupyter_core' && \
rm micromamba && \
# Pin major.minor version of python
mamba list python | grep '^python ' | tr -s ' ' | cut -d ' ' -f 1,2 >> "${CONDA_DIR}/conda-meta/pinned" && \
# Matteo: I had to comment out this line as it was causing the build to fail
# mamba list python | grep '^python ' | tr -s ' ' | cut -d ' ' -f 1,2 >> "${CONDA_DIR}/conda-meta/pinned" && \
mamba clean --all -f -y && \
fix-permissions "${CONDA_DIR}" && \
fix-permissions "/home/${NB_USER}"
Expand All @@ -133,7 +136,7 @@ ENTRYPOINT ["tini", "-g", "--"]
CMD ["start.sh"]

# Copy local files as late as possible to avoid cache busting
COPY start.sh /usr/local/bin/
COPY env-files/torch/jupyter/jupyter-base/start.sh /usr/local/bin/

# Switch back to jovyan to avoid accidental container runs as root
USER ${NB_UID}
Expand Down Expand Up @@ -200,9 +203,9 @@ EXPOSE $JUPYTER_PORT
CMD ["start-notebook.sh"]

# Copy local files as late as possible to avoid cache busting
COPY start-notebook.sh start-singleuser.sh /usr/local/bin/
COPY env-files/torch/jupyter/jupyter-base/start-notebook.sh env-files/torch/jupyter/jupyter-base/start-singleuser.sh /usr/local/bin/
# Currently need to have both jupyter_notebook_config and jupyter_server_config to support classic and lab
COPY jupyter_server_config.py docker_healthcheck.py /etc/jupyter/
COPY env-files/torch/jupyter/jupyter-base/jupyter_server_config.py env-files/torch/jupyter/jupyter-base/docker_healthcheck.py /etc/jupyter/

# Fix permissions on /etc/jupyter as root
USER root
Expand Down Expand Up @@ -266,7 +269,7 @@ RUN update-alternatives --install /usr/bin/nano nano /bin/nano-tiny 10
USER ${NB_UID}

# Add R mimetype option to specify how the plot returns from R to the browser
COPY --chown=${NB_UID}:${NB_GID} Rprofile.site /opt/conda/lib/R/etc/
COPY --chown=${NB_UID}:${NB_GID} env-files/torch/jupyter/jupyter-base/Rprofile.site /opt/conda/lib/R/etc/


###################################################
Expand Down Expand Up @@ -356,8 +359,9 @@ WORKDIR "${HOME}"
USER $NB_UID

# jupyterhub must be < 2
RUN conda install -y -n base mamba \
&& mamba install -y -c conda-forge python-gfal2 \
# Matteo: had to remove the line with conda as conda is not found
# RUN conda install -y -n base mamba \
RUN mamba install -y -c conda-forge python-gfal2 \
nodejs \
jupyterlab"<4" \
notebook"<7" \
Expand All @@ -372,7 +376,7 @@ RUN conda install -y -n base mamba \
ipywidgets \
asyncssh \
peewee \
&& conda clean --all -f -y
&& mamba clean --all -f -y

USER root

Expand Down Expand Up @@ -427,18 +431,18 @@ RUN mkdir /certs \
# update-ca-trust

# Add async ssh script
COPY asyncssh_config.py /opt/ssh/jupyterhub-singleuser
COPY env-files/torch/jupyter/jupyter-base/asyncssh_config.py /opt/ssh/jupyterhub-singleuser
RUN fix-permissions /opt/ssh/jupyterhub-singleuser \
&& chown -R $NB_UID /opt/ssh/jupyterhub-singleuser \
&& chmod +x /opt/ssh/jupyterhub-singleuser

# Setup extension Rucio instance config
COPY configure.py /opt/setup-rucio-jupyterlab/configure.py
COPY env-files/torch/jupyter/jupyter-base/configure.py /opt/setup-rucio-jupyterlab/configure.py
RUN fix-permissions /opt/setup-rucio-jupyterlab/configure.py \
&& chown -R $NB_UID /opt/setup-rucio-jupyterlab/configure.py \
&& chmod +x /opt/setup-rucio-jupyterlab/configure.py

COPY setup.sh /usr/local/bin/setup.sh
COPY env-files/torch/jupyter/jupyter-base/setup.sh /usr/local/bin/setup.sh
RUN fix-permissions /usr/local/bin/setup.sh \
&& sed -i -e 's/\r$/\n/' /usr/local/bin/setup.sh \
&& chmod +x /usr/local/bin/setup.sh
Expand All @@ -461,4 +465,144 @@ ENV JUPYTER_ENABLE_LAB=yes
USER $NB_UID
WORKDIR $HOME

# CMD ["setup.sh", "start-notebook.sh"]

###################################################
# Adapted CUDA itwinai ############################
###################################################

# Rucio extension for JupyterLab 3: refresh pip, install the pinned
# rucio-jupyterlab release, then enable its server extension in the
# environment prefix (--sys-prefix).
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir rucio-jupyterlab==0.10.0 && \
    jupyter serverextension enable --py rucio_jupyterlab --sys-prefix

#############################
### itwinai ###
#############################

# USER root

# RUN apt-get update && apt-get install -y \
# # Needed (at least) by horovod wheel builder
# build-essential \
# # Needed (at least) by horovod wheel builder
# cmake \
# # Needed by OpenMPI for efficient compression at startup
# zlib1g \
# # Needed by OpenMPI for efficient compression at startup
# zlib1g-dev \
# && apt-get clean -y && rm -rf /var/lib/apt/lists/*

# # OpenMPI
# WORKDIR /tmp/ompi
# ENV OPENMPI_VERSION=4.1.6 \
# OPENMPI_MINOR=4.1
# ENV OPENMPI_URL="https://download.open-mpi.org/release/open-mpi/v${OPENMPI_MINOR}/openmpi-${OPENMPI_VERSION}.tar.gz"
# ENV OPENMPI_DIR=/opt/openmpi-${OPENMPI_VERSION}
# ENV PATH="${OPENMPI_DIR}/bin:${PATH}"
# ENV LD_LIBRARY_PATH="${OPENMPI_DIR}/lib:${LD_LIBRARY_PATH}"
# ENV MANPATH=${OPENMPI_DIR}/share/man:${MANPATH}
# RUN wget -q -O openmpi-$OPENMPI_VERSION.tar.gz $OPENMPI_URL && tar xzf openmpi-$OPENMPI_VERSION.tar.gz \
# && cd openmpi-$OPENMPI_VERSION && ./configure --prefix=$OPENMPI_DIR && make install

# # Nvidia software
# WORKDIR /tmp/cuda
# # CUDA Toolkit:
# # - https://developer.nvidia.com/cuda-downloads
# # - Installation guide: https://docs.nvidia.com/cuda/cuda-quick-start-guide/index.html#ubuntu
# # - cuda-toolkit metapackage: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#meta-packages
# # cuDNN:
# # - https://docs.nvidia.com/deeplearning/cudnn/latest/installation/linux.html#installing-cudnn-on-linux
# # NCCL:
# # - https://docs.nvidia.com/deeplearning/nccl/install-guide/index.html#debian
# # *NOTE* to correctly install Apex below, CUDA toolkit version must match with the torch CUDA backend version
# ENV CUDA_VERSION=12.4 \
# CUDA_TOOLKIT_VERSION=12-4 \
# CUDA_MAJOR_VERSION=12
# RUN wget -q -O cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
# && dpkg -i cuda-keyring.deb \
# && apt-get update && apt-get install -y \
# # CUDA toolkit metapackage (does not include the Nvidia driver)
# cuda-toolkit-${CUDA_TOOLKIT_VERSION} \
# # cuDNN
# cudnn-cuda-${CUDA_MAJOR_VERSION} \
# # NCCL
# libnccl2 \
# libnccl-dev \
# # Nvidia driver, as explained here: https://developer.nvidia.com/cuda-downloads
# nvidia-open \
# && apt-get clean -y && rm -rf /var/lib/apt/lists/*
# ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin${PATH:+:${PATH}}\
# LD_LIBRARY_PATH=/usr/local/cuda-${CUDA_VERSION}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# # Cleanup
# RUN rm -rf /tmp/*

# USER $NB_UID

# Torch and smaller deps
RUN pip install --no-cache-dir --upgrade pip \
    # https://github.com/mpi4py/mpi4py/pull/431
    # The VAR=value prefix only applies to this one command, so there is
    # nothing to unset afterwards.
    && SETUPTOOLS_USE_DISTUTILS=local python -m pip install --no-cache-dir mpi4py \
    # Install torch
    # NOTE: pip treats --index-url as a command-wide option, so every package
    # listed below (not only torch*) is resolved from the PyTorch cu124 index.
    # It is placed first to make that scope explicit.
    && pip install --no-cache-dir \
        --index-url https://download.pytorch.org/whl/cu124 \
        'numpy<2' \
        packaging \
        py-cpuinfo \
        # Quoted so the shell can never expand the trailing * as a glob
        'torch==2.4.*' \
        torchaudio \
        torchvision \
        wheel

# Apex: https://github.com/NVIDIA/apex
# (needed for DeepSpeed *_FUSED optional build options)
# Note: it will take more than an hour to build
WORKDIR /tmp
# Shallow clone: only the current tree is needed to build, not the history.
# The checkout is removed in the same layer so it does not end up in the image.
RUN git clone --depth 1 https://github.com/NVIDIA/apex && cd apex \
    && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ \
    && rm -rf /tmp/apex
# Transformer engine: https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/index.html
# (needed for DeepSpeed *TRANSFORMER* optional build options)
# Note: it will take about half an hour to build
# The requirement is quoted so the shell cannot treat the [pytorch] extra as a
# glob pattern.
RUN pip install --no-cache-dir 'transformer_engine[pytorch]'

# DeepSpeed, Horovod and other deps
# The variables below are set before the pip install that follows so that the
# Horovod and DeepSpeed installers can pick them up.
# NOTE(review): because they are declared with ENV they also persist in the
# runtime environment of the final image; confirm this is intended.
ENV HOROVOD_WITH_PYTORCH=1 \
    HOROVOD_WITHOUT_TENSORFLOW=1 \
    HOROVOD_WITHOUT_MXNET=1 \
    CMAKE_CXX_STANDARD=17 \
    HOROVOD_MPI_THREADS_DISABLE=1 \
    HOROVOD_CPU_OPERATIONS=MPI \
    HOROVOD_GPU_ALLREDUCE=NCCL \
    HOROVOD_NCCL_LINK=SHARED \
    # DeepSpeed
    DS_BUILD_UTILS=1 \
    DS_BUILD_AIO=1 \
    DS_BUILD_FUSED_ADAM=1 \
    DS_BUILD_FUSED_LAMB=1 \
    DS_BUILD_TRANSFORMER=1 \
    DS_BUILD_STOCHASTIC_TRANSFORMER=1 \
    DS_BUILD_TRANSFORMER_INFERENCE=1
# Horovod and prov4ml are pinned to specific commits.
# 'ray[tune]' already pulls in ray itself, and is quoted so the shell cannot
# treat the [tune] extra as a glob pattern.
RUN pip install --no-cache-dir \
    deepspeed \
    git+https://github.com/horovod/horovod.git@3a31d93 \
    "prov4ml[linux]@git+https://github.com/matbun/ProvML@6faafd4" \
    'ray[tune]'


# Core itwinai lib
WORKDIR "$HOME/itwinai"
# COPY creates files owned by root unless --chown is given; the install below
# runs as the notebook user, so hand the sources over to it (same form as the
# Rprofile.site COPY earlier in this file).
COPY --chown=${NB_UID}:${NB_GID} pyproject.toml ./
# When the source is a directory, COPY copies its *contents*, not the directory
# itself. The destination is named explicitly so the src/ layout is preserved.
# NOTE(review): assumes pyproject.toml looks for packages under src/ - confirm.
COPY --chown=${NB_UID}:${NB_GID} src ./src
# Install the library with its torch and dev extras, then fail the build early
# if the torch / DeepSpeed / Horovod stack cannot be used.
RUN pip install --no-cache-dir ".[torch,dev]" \
    && itwinai sanity-check --torch \
        --optional-deps deepspeed \
        --optional-deps horovod

# # Additional pip deps
# ARG REQUIREMENTS=env-files/torch/requirements/requirements.txt
# COPY "${REQUIREMENTS}" additional-requirements.txt
# RUN pip install --no-cache-dir -r additional-requirements.txt

# Leave the itwinai source directory and start containers from the user's home
WORKDIR "${HOME}"

# Default command: setup.sh with start-notebook.sh as its argument.
# NOTE(review): presumably setup.sh ends by exec-ing its arguments so that the
# notebook server is started - confirm against setup.sh.
CMD ["setup.sh", "start-notebook.sh"]
Loading

0 comments on commit 67641b6

Please sign in to comment.