forked from opendatahub-io/vllm-gaudi
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathDockerfile.hpu.ubi
124 lines (93 loc) · 4.02 KB
/
Dockerfile.hpu.ubi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Base image for every stage below. Override with
#   --build-arg BASE_IMAGE=<registry>/<image>:<tag>
# to build against a different Habana PyTorch installer image.
ARG BASE_IMAGE=vault.habana.ai/gaudi-docker/1.19.1/rhel9.4/habanalabs/pytorch-installer-2.5.1:1.19.1-26

## Habana base #################################################################
# Common parent stage: OS packages are refreshed once here and inherited by
# all later stages.
FROM ${BASE_IMAGE} AS habana-base

# The package update below requires root.
USER root

# NOTE(review): VLLM_TARGET_DEVICE is presumably read by the vLLM build to
# select the HPU backend - confirm against setup.py.
ENV VLLM_TARGET_DEVICE="hpu"
# Matches the release tag of the default BASE_IMAGE; keep the two in sync when
# BASE_IMAGE is overridden.
ENV HABANA_SOFTWARE_VERSION="1.19.1-26"

# Pull in available package updates and drop the dnf metadata in the same
# layer so it does not persist in the image.
RUN dnf -y update --best --allowerasing --skip-broken && dnf clean all

WORKDIR /workspace
## Python Installer #################################################################
# Installs the requested Python interpreter and creates the virtual environment
# at /opt/vllm that the later stages install into.
FROM habana-base AS python-install

# Python minor version to install from the distribution repositories.
ARG PYTHON_VERSION=3.11

# Put the virtual environment first on PATH so that plain `python3` / `pip`
# resolve to it in this stage and in every stage derived from it.
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# --system-site-packages keeps packages from the system site-packages
# directory importable inside the venv.
# NOTE(review): presumably needed so the PyTorch/Habana packages shipped in the
# base image stay visible - confirm.
# pip runs with --no-cache-dir because this step has no cache mount, so a pip
# cache would otherwise be stored in the layer.
RUN dnf install -y --setopt=install_weak_deps=0 --nodocs \
        python${PYTHON_VERSION}-devel \
        python${PYTHON_VERSION}-pip \
        python${PYTHON_VERSION}-wheel \
    && python${PYTHON_VERSION} -m venv "$VIRTUAL_ENV" --system-site-packages \
    && pip install --no-cache-dir -U pip wheel setuptools \
    && dnf clean all
## Python Habana base #################################################################
# Adds the HPU Python requirements on top of the bare virtual environment.
FROM python-install AS python-habana-base

# VIRTUAL_ENV and PATH are inherited from the python-install stage; setting
# them again here would only prepend the venv bin directory to PATH twice.

# install Habana Software and common dependencies
# The requirement files are bind-mounted rather than copied, so they do not
# become part of this layer; the pip cache mount keeps downloads out of the
# image while still speeding up rebuilds.
# NOTE(review): requirements-common.txt is mounted but not passed to pip
# directly - presumably requirements-hpu.txt includes it via `-r`; confirm.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,source=requirements-hpu.txt,target=requirements-hpu.txt \
    pip install -r requirements-hpu.txt
## Builder #####################################################################
# Builds the vLLM wheel into /workspace/dist; later stages bind-mount that
# directory instead of inheriting from this stage.
FROM python-habana-base AS build

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-hpu.txt requirements-hpu.txt
COPY pyproject.toml pyproject.toml

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}

# HPU currently doesn't support LoRA, so the punica kernels are not built.
# ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Copy the entire directory before building wheel
COPY vllm vllm

ENV CCACHE_DIR=/root/.cache/ccache

# CFLAGS is exported before CXXFLAGS is derived from it. Writing both as
# arguments to a single `env` invocation does not work: the shell expands
# "$CFLAGS" before `env` applies the assignment, so CXXFLAGS would never
# receive -march=haswell.
# NOTE(review): .git is bind-mounted for the build only - presumably so the
# package version can be derived from git metadata; confirm.
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=.git,target=/workspace/.git \
    export CFLAGS="-march=haswell" \
    && export CXXFLAGS="$CFLAGS $CXXFLAGS" \
    && export CMAKE_BUILD_TYPE=Release \
    && python3 setup.py bdist_wheel --dist-dir=dist
## Release #####################################################################
# Runtime image for the OpenAI-compatible API server. It starts again from
# python-install and installs the wheel produced by the build stage.
FROM python-install AS vllm-openai

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin/:$PATH

# Triton needs a C compiler
RUN dnf install -y --setopt=install_weak_deps=0 --nodocs gcc && \
    dnf clean all

# install vllm wheel first, so that torch etc will be installed
# The wheel directory is bind-mounted from the build stage, so the wheel file
# itself is not stored in a layer of this image.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    pip install $(echo dist/*.whl)'[tensorizer]' "jinja2>=3.1.5" --verbose

# Runtime defaults. HOME is set before the user is created because the chmod
# in the next step refers to $HOME.
ENV HOME=/home/vllm \
    PORT=8000 \
    HF_HUB_OFFLINE=1 \
    PT_HPU_MAX_COMPOUND_OP_SIZE=10 \
    VLLM_NO_USAGE_STATS=1 \
    VLLM_USAGE_SOURCE=production-docker-image

# setup non-root user for OpenShift
# The user belongs to group 0 and the listed directories are made
# group-writable.
RUN umask 002 && \
    useradd --uid 2000 --gid 0 vllm && \
    chmod g+rwx $HOME /usr/src /workspace

COPY LICENSE /licenses/vllm.md

# Copy only .jinja files from example directory to template directory
COPY examples/*.jinja /app/data/template/

# Run as the unprivileged user created above.
USER 2000

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
## gRPC adapter ################################################################
# Extends the OpenAI server image with the TGIS gRPC adapter and makes the
# adapter the container entrypoint.
FROM vllm-openai AS vllm-grpc-adapter

# Installing into the virtual environment requires root; the unprivileged user
# is restored below.
USER root

# The wheel is listed again next to the pinned adapter so pip resolves the
# adapter's dependencies together with the locally built vllm wheel.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
    pip install $(echo dist/*.whl)'[tensorizer]' vllm-tgis-adapter==0.6.1

# As an optimization, vLLM disables logprobs when using spec decoding by
# default, but this would be unexpected to users of a hosted model that
# happens to have spec decoding
# see: https://github.com/vllm-project/vllm/pull/6485
ENV GRPC_PORT=8033 \
    PORT=8000 \
    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false

USER 2000

ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]