From c5d632318de53971119351fb7376d4d3a799e00a Mon Sep 17 00:00:00 2001 From: Laurentiu Bradin <109964136+z103cb@users.noreply.github.com> Date: Tue, 30 Apr 2024 13:55:08 +0300 Subject: [PATCH 01/46] chore: add fork OWNERS --- OWNERS | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 OWNERS diff --git a/OWNERS b/OWNERS new file mode 100644 index 000000000000..7bbb710d21ff --- /dev/null +++ b/OWNERS @@ -0,0 +1,21 @@ +approvers: + - danielezonca + - dtrifiro + - heyselbi + - israel-hdez + - Jooho + - rpancham + - spolti + - terrytangyuan + - vaibhavjainwiz + - VedantMahabaleshwarkar + - Xaenalt + - z103cb +reviewers: + - dtrifiro + - heyselbi + - rpancham + - terrytangyuan + - vaibhavjainwiz + - Xaenalt + - z103cb From 49743ae46dea92f67bf06a26a31c8db4469d2f72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 21 May 2024 10:47:03 +0200 Subject: [PATCH 02/46] add ubi Dockerfile --- Dockerfile.ubi | 244 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 Dockerfile.ubi diff --git a/Dockerfile.ubi b/Dockerfile.ubi new file mode 100644 index 000000000000..58663927a1c2 --- /dev/null +++ b/Dockerfile.ubi @@ -0,0 +1,244 @@ +# Please update any changes made here to +# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst + +## Global Args ################################################################# +ARG BASE_UBI_IMAGE_TAG=9.4 +ARG PYTHON_VERSION=3.11 + +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" + + +## Base Layer ################################################################## +FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base +ARG PYTHON_VERSION + +RUN microdnf install -y \ + python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \ + && microdnf clean all + +WORKDIR /workspace + +ENV LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +# Some utils for dev purposes - tar required for kubectl cp +RUN microdnf install -y \ + which procps 
findutils tar vim git\ + && microdnf clean all + + +## Python Installer ############################################################ +FROM base as python-install + +ARG PYTHON_VERSION + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +RUN microdnf install -y \ + python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \ + python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel && microdnf clean all + + +## CUDA Base ################################################################### +FROM python-install as cuda-base + +# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if +# this env var is set to 12.2.0, even though it's compatible +#ENV CUDA_VERSION=12.2.0 \ +ENV CUDA_VERSION=12.0.0 \ + NV_CUDA_LIB_VERSION=12.2.0-1 \ + NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility \ + NV_CUDA_CUDART_VERSION=12.2.53-1 \ + NV_CUDA_COMPAT_VERSION=535.104.12 + +RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \ + https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo + +RUN microdnf install -y \ + cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \ + cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \ + && microdnf clean all + + +ARG CUDA_HOME="/usr/local/cuda" +ENV CUDA_HOME=${CUDA_HOME}\ + PATH="${CUDA_HOME}/bin:${PATH}" \ + LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}" + + +## CUDA Development ############################################################ +FROM cuda-base as cuda-devel + +ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \ + NV_NVML_DEV_VERSION=12.2.81-1 \ + NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \ + NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \ + NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2 + +RUN microdnf install -y \ + cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \ + cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \ + cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \ + 
cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \ + cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \ + libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \ + libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \ + libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \ + && microdnf clean all + +ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs" + +# Workaround for https://github.com/openai/triton/issues/2507 and +# https://github.com/pytorch/pytorch/issues/107960 -- hopefully +# this won't be needed for future versions of this docker image +# or future versions of triton. +RUN ldconfig /usr/local/cuda-12.2/compat/ + +## Python cuda base ################################################################# +FROM cuda-devel AS python-cuda-base + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# install cuda and common dependencies +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ + pip install \ + -r requirements-cuda.txt + +## Development ################################################################# +FROM python-cuda-base AS dev + +# install build and runtime dependencies +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ + --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \ + pip3 install \ + -r requirements-cuda.txt \ + -r requirements-dev.txt + +## Proto Compilation ########################################################### +FROM python-install AS gen-protos + +ENV PATH=/opt/vllm/bin/:$PATH + +RUN microdnf install -y \ + make \ + findutils \ + && microdnf clean all + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=Makefile,target=Makefile \ + --mount=type=bind,source=proto,target=proto \ + make 
gen-protos + +## Builder ##################################################################### +FROM dev AS build + +# install build dependencies +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \ + pip install -r requirements-build.txt + +# install compiler cache to speed up compilation leveraging local or remote caching +RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y ccache && microdnf clean all +# install build dependencies + +# copy input files +COPY csrc csrc +COPY setup.py setup.py +COPY cmake cmake +COPY CMakeLists.txt CMakeLists.txt +COPY requirements-common.txt requirements-common.txt +COPY requirements-cuda.txt requirements-cuda.txt +COPY pyproject.toml pyproject.toml + +ARG TORCH_CUDA_ARCH_LIST +ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST + +# max jobs used by Ninja to build extensions +ARG max_jobs=2 +ENV MAX_JOBS=${max_jobs} +# number of threads used by nvcc +ARG nvcc_threads=8 +ENV NVCC_THREADS=$nvcc_threads +# make sure punica kernels are built (for LoRA) +ENV VLLM_INSTALL_PUNICA_KERNELS=1 + +# Make sure the cuda environment is in the PATH +ENV PATH=/usr/local/cuda/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Copy the entire directory before building wheel +COPY vllm vllm + +# Copy over the generated *.pb2 files +COPY --from=gen-protos /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb + +ENV CCACHE_DIR=/root/.cache/ccache +RUN --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/root/.cache/pip \ + CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist + +## Release ##################################################################### +# Note from the non-UBI Dockerfile: +# We used base cuda image because pytorch installs its own cuda libraries. 
+# However pynccl depends on cuda libraries so we had to switch to the runtime image +# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda +FROM python-install AS vllm-openai + +WORKDIR /workspace + +ENV VIRTUAL_ENV=/opt/vllm +ENV PATH=$VIRTUAL_ENV/bin/:$PATH + +# Triton needs a CC compiler +RUN microdnf install -y gcc \ + && microdnf clean all + +# install vllm wheel first, so that torch etc will be installed +RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ + --mount=type=cache,target=/root/.cache/pip \ + pip install dist/*.whl --verbose + +# vllm requires a specific nccl version built from source distribution +# See https://github.com/NVIDIA/nccl/issues/1234 +RUN pip install \ + -v \ + --force-reinstall \ + --no-binary="all" \ + --no-cache-dir \ + "vllm-nccl-cu12==2.18.1.0.4.0" && \ + mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/ && \ + chmod 0755 /opt/vllm/lib/libnccl.so.2.18.1 + + +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install \ + # additional dependencies for the TGIS gRPC server + grpcio-tools==1.63.0 \ + # additional dependencies for openai api_server + accelerate==0.30.0 \ + # hf_transfer for faster HF hub downloads + hf_transfer==0.1.6 + +ENV HF_HUB_OFFLINE=1 \ + PORT=8000 \ + GRPC_PORT=8033 \ + HOME=/home/vllm \ + VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \ + VLLM_USAGE_SOURCE=production-docker-image \ + VLLM_WORKER_MULTIPROC_METHOD=fork + +# setup non-root user for OpenShift +RUN umask 002 \ + && useradd --uid 2000 --gid 0 vllm \ + && chmod g+rwx $HOME /usr/src /workspace + +COPY LICENSE /licenses/vllm.md + +USER 2000 +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] From d66716f87bd2438a509cd205381b6416045e0910 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 21 May 2024 10:47:37 +0200 Subject: [PATCH 03/46] Dockerfile.ubi: remove references to grpc/protos --- Dockerfile.ubi | 31 
------------------------------- 1 file changed, 31 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 58663927a1c2..3226a24ba1ea 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -1,6 +1,3 @@ -# Please update any changes made here to -# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst - ## Global Args ################################################################# ARG BASE_UBI_IMAGE_TAG=9.4 ARG PYTHON_VERSION=3.11 @@ -120,21 +117,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ -r requirements-cuda.txt \ -r requirements-dev.txt -## Proto Compilation ########################################################### -FROM python-install AS gen-protos - -ENV PATH=/opt/vllm/bin/:$PATH - -RUN microdnf install -y \ - make \ - findutils \ - && microdnf clean all - -RUN --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,source=Makefile,target=Makefile \ - --mount=type=bind,source=proto,target=proto \ - make gen-protos - ## Builder ##################################################################### FROM dev AS build @@ -175,9 +157,6 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH # Copy the entire directory before building wheel COPY vllm vllm -# Copy over the generated *.pb2 files -COPY --from=gen-protos /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb - ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ @@ -216,18 +195,8 @@ RUN pip install \ chmod 0755 /opt/vllm/lib/libnccl.so.2.18.1 -RUN --mount=type=cache,target=/root/.cache/pip \ - pip install \ - # additional dependencies for the TGIS gRPC server - grpcio-tools==1.63.0 \ - # additional dependencies for openai api_server - accelerate==0.30.0 \ - # hf_transfer for faster HF hub downloads - hf_transfer==0.1.6 - ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ - GRPC_PORT=8033 \ HOME=/home/vllm \ VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \ VLLM_USAGE_SOURCE=production-docker-image \ From 
4ea368bc8679cc90661054a5548b5f91a132f893 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 28 May 2024 18:31:39 +0200 Subject: [PATCH 04/46] Dockerfile.ubi: use vllm-tgis-adapter --- Dockerfile.ubi | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 3226a24ba1ea..1a11dbb33e5e 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -211,3 +211,15 @@ COPY LICENSE /licenses/vllm.md USER 2000 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] + + +FROM vllm-openai as vllm-grpc-adapter + +USER root + +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install vllm-tgis-adapter + +ENV GRPC_PORT=8033 +USER 2000 +ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"] From c8f42be2b121faa02c6fcf6b889140fa133de01d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 3 Jun 2024 11:24:37 +0200 Subject: [PATCH 05/46] gha: add sync workflow --- .github/workflows/sync-with-upstream.yml | 91 ++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 .github/workflows/sync-with-upstream.yml diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml new file mode 100644 index 000000000000..5f009b897a3b --- /dev/null +++ b/.github/workflows/sync-with-upstream.yml @@ -0,0 +1,91 @@ +name: "Sync with upstream" + +on: + schedule: + - cron: 20 4 * * * + + workflow_dispatch: + + +env: + # repo to fetch changes from + UPSTREAM_REPO: vllm-project/vllm + # branch to sync + BRANCH: main + +jobs: + upstream-sync: + name: Sync with upstream + runs-on: ubuntu-latest + permissions: + pull-requests: write + contents: write + + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Fetch upstream repo + run: | + git remote add upstream https://github.com/${UPSTREAM_REPO} + git fetch upstream + + - name: Check diff + id: diff + shell: bash + run: | + echo 'diff<<EOF' >> $GITHUB_OUTPUT + git diff --stat upstream/${BRANCH} |
tee -a >(cat >> $GITHUB_OUTPUT) + echo 'EOF' >> $GITHUB_OUTPUT + + - name: Create PR + if: ${{ steps.diff.outputs.diff != '' }} + env: + GH_TOKEN: ${{ github.token }} + run: | + set -xeu + + git_hash="$(git rev-parse upstream/${BRANCH})" + echo "git_hash=$git_hash" >> $GITHUB_OUTPUT + git_describe="$(git describe --tags upstream/${BRANCH})" + echo "git_describe=$git_describe" >> $GITHUB_OUTPUT + + # echo 'commits<<EOF' >> $GITHUB_OUTPUT + # git log --oneline ..upstream/${BRANCH} >> $GITHUB_OUTPUT + # echo 'EOF' >> $GITHUB_OUTPUT + + upstream_url="https://github.com/${UPSTREAM_REPO}" + upstream_branch="$upstream_url/tree/${BRANCH}" + + title="Sync with upstream@${git_describe}" + body="Merge [${UPSTREAM_REPO}]($upstream_url):[${BRANCH}]($upstream_branch)@[${git_describe}](${upstream_url}/commit/$git_hash) into $BRANCH" + + gh repo set-default $GITHUB_REPOSITORY + pr_number=$(gh pr list -S "Sync with upstream@" --json number --jq '.[0].number') + + if [[ -z $pr_number ]]; then + echo "Creating PR" + gh pr create \ + --head $(echo $UPSTREAM_REPO | sed 's|/|:|g'):${BRANCH} \ + --base ${BRANCH} \ + --label code-sync \ + --title "$title" \ + --body "$body" \ + --no-maintainer-edit + exit 0 + fi + + echo "Checking if PR is up-to-date" + + git fetch ${upstream_url} refs/pull/${pr_number}/head + if git diff --stat --exit-code upstream/main FETCH_HEAD; then + echo "PR is up-to-date" + exit 0 + fi + + echo "Updating PR \#${pr_number}" + gh pr edit \ + $pr_number \ + --body "$body" \ + --title "$title" From 6f1bd871179f077341bea30736542e2cd6d9e69f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 10 Jun 2024 17:31:35 +0200 Subject: [PATCH 06/46] Dockerfile.ubi: use distributed-executor-backend=mp as default --- Dockerfile.ubi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 1a11dbb33e5e..a65645647070 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -210,7 +210,7 @@ RUN umask 002 \ COPY LICENSE
/licenses/vllm.md USER 2000 -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--distributed-executor-backend=mp"] FROM vllm-openai as vllm-grpc-adapter @@ -222,4 +222,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV GRPC_PORT=8033 USER 2000 -ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"] +ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--distributed-executor-backend=mp"] From 7823f55494001085390da1cfb3d4e8e430d2e706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Thu, 13 Jun 2024 10:26:34 +0200 Subject: [PATCH 07/46] Dockerfile.ubi: remove vllm-nccl workaround Fixed upstream in https://github.com/vllm-project/vllm/pull/5091 --- Dockerfile.ubi | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index a65645647070..6c2bd732c5df 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -183,22 +183,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ pip install dist/*.whl --verbose -# vllm requires a specific nccl version built from source distribution -# See https://github.com/NVIDIA/nccl/issues/1234 -RUN pip install \ - -v \ - --force-reinstall \ - --no-binary="all" \ - --no-cache-dir \ - "vllm-nccl-cu12==2.18.1.0.4.0" && \ - mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/ && \ - chmod 0755 /opt/vllm/lib/libnccl.so.2.18.1 - - ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ HOME=/home/vllm \ - VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \ VLLM_USAGE_SOURCE=production-docker-image \ VLLM_WORKER_MULTIPROC_METHOD=fork From 8b537843340cd1abeedd58c6e61f16c32645c895 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 18 Jun 2024 15:44:52 +0200 Subject: [PATCH 08/46] Dockerfile.ubi: add missing requirements-*.txt bind mounts --- Dockerfile.ubi | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/Dockerfile.ubi b/Dockerfile.ubi index 6c2bd732c5df..6d85a0869622 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -113,6 +113,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \ + --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \ + --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \ pip3 install \ -r requirements-cuda.txt \ -r requirements-dev.txt From 6c478a6b156970ece288aec0cc8a887774478471 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 29 May 2024 11:01:40 +0000 Subject: [PATCH 09/46] add triton CustomCacheManger fixes RHOAIENG-8043 Co-authored-by: Chih-Chieh-Yang Signed-off-by: Thomas Parnell --- Dockerfile.ubi | 6 +++++- extras/custom_cache_manager.py | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) create mode 100644 extras/custom_cache_manager.py diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 6d85a0869622..294399be24c4 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -180,6 +180,9 @@ ENV PATH=$VIRTUAL_ENV/bin/:$PATH RUN microdnf install -y gcc \ && microdnf clean all +# Custom cache manager (fix for https://issues.redhat.com/browse/RHOAIENG-8043) +COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custom_cache_manager.py + # install vllm wheel first, so that torch etc will be installed RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ @@ -189,7 +192,8 @@ ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ HOME=/home/vllm \ VLLM_USAGE_SOURCE=production-docker-image \ - VLLM_WORKER_MULTIPROC_METHOD=fork + VLLM_WORKER_MULTIPROC_METHOD=fork \ + TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager" # setup non-root user for 
OpenShift RUN umask 002 \ diff --git a/extras/custom_cache_manager.py b/extras/custom_cache_manager.py new file mode 100644 index 000000000000..c83ed5b6e865 --- /dev/null +++ b/extras/custom_cache_manager.py @@ -0,0 +1,32 @@ +import os + +from triton.runtime.cache import (FileCacheManager, default_cache_dir, + default_dump_dir, default_override_dir) + + +class CustomCacheManager(FileCacheManager): + + def __init__(self, key, override=False, dump=False): + self.key = key + self.lock_path = None + if dump: + self.cache_dir = default_dump_dir() + self.cache_dir = os.path.join(self.cache_dir, self.key) + self.lock_path = os.path.join(self.cache_dir, "lock") + os.makedirs(self.cache_dir, exist_ok=True) + elif override: + self.cache_dir = default_override_dir() + self.cache_dir = os.path.join(self.cache_dir, self.key) + else: + # create cache directory if it doesn't exist + self.cache_dir = os.getenv("TRITON_CACHE_DIR", + "").strip() or default_cache_dir() + if self.cache_dir: + self.cache_dir = f"{self.cache_dir}_{os.getpid()}" + self.cache_dir = os.path.join(self.cache_dir, self.key) + self.lock_path = os.path.join(self.cache_dir, "lock") + os.makedirs(self.cache_dir, exist_ok=True) + else: + raise RuntimeError("Could not create or locate cache dir") + + print(f"Triton cache dir: {self.cache_dir=}") From cb6f44fe85eef4ff317f1b56c79678f2997633d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 19 Jun 2024 13:45:37 +0200 Subject: [PATCH 10/46] gha: sync-with-upstream workflow create PRs as draft --- .github/workflows/sync-with-upstream.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml index 5f009b897a3b..9e747de396ed 100644 --- a/.github/workflows/sync-with-upstream.yml +++ b/.github/workflows/sync-with-upstream.yml @@ -72,6 +72,7 @@ jobs: --label code-sync \ --title "$title" \ --body "$body" \ + --draft \ --no-maintainer-edit exit 0 fi From 
cb041a6dc036235ad7ac47bf28670d9839f4e7f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 19 Jun 2024 15:14:35 +0200 Subject: [PATCH 11/46] add smoke/unit tests scripts --- extras/smoke-test.sh | 73 ++++++++++++++++++++++++++++++++++++++++++++ extras/unit-tests.sh | 43 ++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 extras/smoke-test.sh create mode 100644 extras/unit-tests.sh diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh new file mode 100644 index 000000000000..f03edea4f619 --- /dev/null +++ b/extras/smoke-test.sh @@ -0,0 +1,73 @@ +#!/bin/bash +set -uxo pipefail + +# we will need to download test models off HF hub +unset HF_HUB_OFFLINE + +export HTTP_PORT=8080 +export GRPC_PORT=8033 + + +function wait_for(){ + trap "" ERR # we don't care about errors in this function + + name=$1 + shift + command=$@ + + max_retries=10 + until $command ; do + echo "Waiting for $name to be up (retries_left=$max_retries)..." + sleep 30 + max_retries=$((max_retries-1)) + if [[ max_retries -le 0 ]]; then + echo "Timed out waiting for $name server" >&2 + exit 1 + fi + done +} + +# stop the server on any errors +trap 'kill -9 $server_pid && exit 1' ERR + +# spin up the OpenAPI server in the background +python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT & +server_pid=$! 
+server_url="http://localhost:$HTTP_PORT" + +wait_for "http server" curl --verbose --connect-timeout 1 --fail-with-body --no-progress-meter "${server_url}/health" + +curl -v --no-progress-meter --fail-with-body \ + "${server_url}/v1/models" | python -m json.tool || \ + +curl -v --no-progress-meter --fail-with-body \ + --header "Content-Type: application/json" \ + --data '{ + "prompt": "A red fedora symbolizes ", + "model": "facebook/opt-125m" +}' \ + "${server_url}/v1/completions" | python -m json.tool + +echo "OpenAI API success" && kill -9 $server_pid + + +# spin up the grpc server in the background +python -m vllm_tgis_adapter --grpc-port $GRPC_PORT & +server_pid=$! +server_url="localhost:$GRPC_PORT" + +# get grpcurl +curl --no-progress-meter --location --output /tmp/grpcurl.tar.gz \ + https://github.com/fullstorydev/grpcurl/releases/download/v1.9.1/grpcurl_1.9.1_linux_x86_64.tar.gz +tar -xf /tmp/grpcurl.tar.gz --directory /tmp + +wait_for "grpc_server" grpc_healthcheck # healthcheck is part of vllm_tgis_adapter + +/tmp/grpcurl -v \ + -plaintext \ + -use-reflection \ + -d '{ "requests": [{"text": "A red fedora symbolizes "}]}' \ + "$server_url" \ + fmaas.GenerationService/Generate + +echo "GRPC API success" && kill -9 $server_pid diff --git a/extras/unit-tests.sh b/extras/unit-tests.sh new file mode 100644 index 000000000000..4739fb6d65ea --- /dev/null +++ b/extras/unit-tests.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# partially copied from from .buildkite/test-pipeline.yml + +cd tests || exit 1 + +# we will need to download test models off HF hub +unset HF_HUB_OFFLINE + +# basic correctness +pytest -v -s test_regression.py +pytest -v -s async_engine +VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py +VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py +VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py +VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s 
basic_correctness/test_chunked_prefill.py +VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py + +# core +pytest -v -s core + +# note: distributed tests are disabled + +# engine tests +pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py +# entrypoint +pytest -v -s entrypoints -m openai + +#inputs (note: multimodal tests are skipped) +pytest -v -s test_inputs.py + +#models +pytest -v -s models -m \"not vlm\" + +# misc +pytest -v -s prefix_caching +pytest -v -s samplers +pytest -v -s test_logits_processor.py +pytest -v -s models -m \"not vlm\" +pytest -v -s worker +VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s spec_decode +# pytest -v -s tensorizer_loader # disabled: requires libsodium +pytest -v -s metrics +pytest -v -s quantization From 4ef5daba3f3dd652a571939fc615094c3456bf4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Thu, 20 Jun 2024 19:24:04 +0200 Subject: [PATCH 12/46] extras: exit unit tests on err --- extras/unit-tests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/extras/unit-tests.sh b/extras/unit-tests.sh index 4739fb6d65ea..08b2388b646e 100644 --- a/extras/unit-tests.sh +++ b/extras/unit-tests.sh @@ -1,5 +1,6 @@ #!/bin/bash # partially copied from from .buildkite/test-pipeline.yml +set -e cd tests || exit 1 From 272ff919a910e7fa1201021a036a510bf1072b8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 28 May 2024 16:15:06 +0200 Subject: [PATCH 13/46] Dockerfile.ubi: misc improvements - get rid cuda-devel stage, use cuda 12.4 - add build flags - remove useless installs --- Dockerfile.ubi | 63 ++++++++------------------------------------------ 1 file changed, 10 insertions(+), 53 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 294399be24c4..e4861243e222 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -4,7 +4,6 @@ ARG PYTHON_VERSION=3.11 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" - ## Base Layer 
################################################################## FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base ARG PYTHON_VERSION @@ -39,61 +38,19 @@ RUN microdnf install -y \ ## CUDA Base ################################################################### FROM python-install as cuda-base -# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if -# this env var is set to 12.2.0, even though it's compatible -#ENV CUDA_VERSION=12.2.0 \ -ENV CUDA_VERSION=12.0.0 \ - NV_CUDA_LIB_VERSION=12.2.0-1 \ - NVIDIA_VISIBLE_DEVICES=all \ - NVIDIA_DRIVER_CAPABILITIES=compute,utility \ - NV_CUDA_CUDART_VERSION=12.2.53-1 \ - NV_CUDA_COMPAT_VERSION=535.104.12 - RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \ https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo RUN microdnf install -y \ - cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \ - cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \ - && microdnf clean all + cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \ + microdnf clean all - -ARG CUDA_HOME="/usr/local/cuda" -ENV CUDA_HOME=${CUDA_HOME}\ +ENV CUDA_HOME="/usr/local/cuda" \ PATH="${CUDA_HOME}/bin:${PATH}" \ LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}" - -## CUDA Development ############################################################ -FROM cuda-base as cuda-devel - -ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \ - NV_NVML_DEV_VERSION=12.2.81-1 \ - NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \ - NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \ - NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2 - -RUN microdnf install -y \ - cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \ - cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \ - cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \ - cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \ - cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \ - libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \ - libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \ - 
libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \ - && microdnf clean all - -ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs" - -# Workaround for https://github.com/openai/triton/issues/2507 and -# https://github.com/pytorch/pytorch/issues/107960 -- hopefully -# this won't be needed for future versions of this docker image -# or future versions of triton. -RUN ldconfig /usr/local/cuda-12.2/compat/ - ## Python cuda base ################################################################# -FROM cuda-devel AS python-cuda-base +FROM cuda-base AS python-cuda-base ENV VIRTUAL_ENV=/opt/vllm ENV PATH="$VIRTUAL_ENV/bin:$PATH" @@ -128,7 +85,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements-build.txt # install compiler cache to speed up compilation leveraging local or remote caching -RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y ccache && microdnf clean all +# git is required for the cutlass kernels +RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all # install build dependencies # copy input files @@ -162,13 +120,12 @@ COPY vllm vllm ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ - CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist + env CFLAGS="-march=haswell" \ + CXXFLAGS="$CFLAGS $CXXFLAGS" \ + CMAKE_BUILD_TYPE=Release \ + python3 setup.py bdist_wheel --dist-dir=dist ## Release ##################################################################### -# Note from the non-UBI Dockerfile: -# We used base cuda image because pytorch installs its own cuda libraries. 
-# However pynccl depends on cuda libraries so we had to switch to the runtime image -# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda FROM python-install AS vllm-openai WORKDIR /workspace From 57fd180627f676a73605bf28c814e74869c88d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Fri, 21 Jun 2024 10:03:48 +0200 Subject: [PATCH 14/46] update OWNERS --- OWNERS | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/OWNERS b/OWNERS index 7bbb710d21ff..dc965385e186 100644 --- a/OWNERS +++ b/OWNERS @@ -1,20 +1,17 @@ approvers: - - danielezonca - dtrifiro - heyselbi - - israel-hdez - - Jooho - rpancham - - spolti + - RH-steve-grubb - terrytangyuan - vaibhavjainwiz - - VedantMahabaleshwarkar - Xaenalt - z103cb reviewers: - dtrifiro - heyselbi - rpancham + - RH-steve-grubb - terrytangyuan - vaibhavjainwiz - Xaenalt From fc0df4152ba57c1ca0e77e36e085ac86fdf21129 Mon Sep 17 00:00:00 2001 From: Prashant Gupta Date: Tue, 25 Jun 2024 05:17:36 -0700 Subject: [PATCH 15/46] Dockerfile.ubi: use tensorizer (#64) add libsodium for tensorizer encryption --------- Signed-off-by: Prashant Gupta Co-authored-by: Daniele <36171005+dtrifiro@users.noreply.github.com> --- Dockerfile.ubi | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index e4861243e222..cef224e3e8bb 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -125,6 +125,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ CMAKE_BUILD_TYPE=Release \ python3 setup.py bdist_wheel --dist-dir=dist +#################### libsodium Build IMAGE #################### +FROM base as libsodium-builder + +RUN microdnf install -y gcc gzip \ + && microdnf clean all + +WORKDIR /usr/src/libsodium + +ARG LIBSODIUM_VERSION=1.0.20 +RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \ + && tar -xzvf 
libsodium*.tar.gz \ + && rm -f libsodium*.tar.gz \ + && mv libsodium*/* ./ + +RUN ./configure --prefix="/usr/" && make && make check + ## Release ##################################################################### FROM python-install AS vllm-openai @@ -143,7 +159,12 @@ COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custo # install vllm wheel first, so that torch etc will be installed RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ - pip install dist/*.whl --verbose + pip install $(echo dist/*.whl)'[tensorizer]' --verbose + +# Install libsodium for Tensorizer encryption +RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \ + cd /usr/src/libsodium \ + && make install ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ From 50677483b6b658b20a3869ce48782258dfbab1e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 26 Jun 2024 18:13:16 +0200 Subject: [PATCH 16/46] Dockerfile.ubi: pin vllm-tgis-adapter to 0.1.2 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index cef224e3e8bb..20bb7e7e2cb5 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -189,7 +189,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter + pip install vllm-tgis-adapter==0.1.2 ENV GRPC_PORT=8033 USER 2000 From b93cf7e5a7cb16d2b0c6cdebd202aafdb1b54373 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 2 Jul 2024 12:20:13 +0200 Subject: [PATCH 17/46] gha: fix fetch step in upstream sync workflow --- .github/workflows/sync-with-upstream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml index 9e747de396ed..9d87ff4e50da 100644 --- a/.github/workflows/sync-with-upstream.yml +++ 
b/.github/workflows/sync-with-upstream.yml @@ -79,7 +79,7 @@ jobs: echo "Checking if PR is up-to-date" - git fetch ${upstream_url} refs/pull/${pr_number}/head + git fetch origin refs/pull/${pr_number}/head if git diff --stat --exit-code upstream/main FETCH_HEAD; then echo "PR is up-to-date" exit 0 From ab6ab65c2ac3ea8078268fe635116add37347767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 2 Jul 2024 12:36:38 +0200 Subject: [PATCH 18/46] gha: always update sync workflow PR body/title --- .github/workflows/sync-with-upstream.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml index 9d87ff4e50da..53751552f4d2 100644 --- a/.github/workflows/sync-with-upstream.yml +++ b/.github/workflows/sync-with-upstream.yml @@ -77,14 +77,6 @@ jobs: exit 0 fi - echo "Checking if PR is up-to-date" - - git fetch origin refs/pull/${pr_number}/head - if git diff --stat --exit-code upstream/main FETCH_HEAD; then - echo "PR is up-to-date" - exit 0 - fi - echo "Updating PR \#${pr_number}" gh pr edit \ $pr_number \ From a419aa823339614a279e187a25afe4e1692b18ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 3 Jul 2024 17:28:25 +0200 Subject: [PATCH 19/46] Dockerfile.ubi: bump vllm-tgis-adapter to 0.1.3 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 20bb7e7e2cb5..822363161be2 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -189,7 +189,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.1.2 + pip install vllm-tgis-adapter==0.1.3 ENV GRPC_PORT=8033 USER 2000 From 456e93f1a82327f7f7e1f00bc16a1dfe13998b58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 10 Jul 2024 17:03:57 +0200 Subject: [PATCH 20/46] Dockerfile.ubi: get rid of 
--distributed-executor-backend=mp this is the default when `--worker-use-ray` is not provided and world-size > 1 --- Dockerfile.ubi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 822363161be2..c38c1be443c6 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -181,7 +181,7 @@ RUN umask 002 \ COPY LICENSE /licenses/vllm.md USER 2000 -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--distributed-executor-backend=mp"] +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] FROM vllm-openai as vllm-grpc-adapter @@ -193,4 +193,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV GRPC_PORT=8033 USER 2000 -ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--distributed-executor-backend=mp"] +ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"] From f67c2ca241c3ee71801b8a7772ef63739f8f6df9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 9 Jul 2024 14:53:07 +0200 Subject: [PATCH 21/46] Dockerfile.ubi: add flashinfer --- Dockerfile.ubi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index c38c1be443c6..cb5ec895ae49 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -166,6 +166,9 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/ cd /usr/src/libsodium \ && make install +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp311-cp311-linux_x86_64.whl + ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ HOME=/home/vllm \ From d554f4997123c4759a2ba23d9252f7872228b145 Mon Sep 17 00:00:00 2001 From: Prashant Gupta Date: Fri, 12 Jul 2024 08:59:40 -0700 Subject: [PATCH 22/46] pin adapter to 2.0.0 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index cb5ec895ae49..8ba87bbcb5e8 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ 
-192,7 +192,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.1.3 + pip install vllm-tgis-adapter==0.2.0 ENV GRPC_PORT=8033 USER 2000 From 11c65674977111a28f0d366de71ec908962ce074 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 15 Jul 2024 13:31:01 +0200 Subject: [PATCH 23/46] deps: bump flashinfer to 0.0.9 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 8ba87bbcb5e8..28bc2deb2c61 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -167,7 +167,7 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/ && make install RUN --mount=type=cache,target=/root/.cache/pip \ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp311-cp311-linux_x86_64.whl + pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ From fe79602265759ebea219279308e6b04adc64b0a8 Mon Sep 17 00:00:00 2001 From: Selbi Nuryyeva Date: Thu, 27 Jun 2024 16:27:11 -0400 Subject: [PATCH 24/46] Update OWNERS with IBM folks --- OWNERS | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/OWNERS b/OWNERS index dc965385e186..09b25dab41c0 100644 --- a/OWNERS +++ b/OWNERS @@ -1,17 +1,27 @@ approvers: - dtrifiro + - fialhocoelho - heyselbi - - rpancham + - joerunde + - maxdebayser + - njhill + - prashantgupta24 - RH-steve-grubb + - rpancham - terrytangyuan - vaibhavjainwiz - - Xaenalt - z103cb + - Xaenalt reviewers: - dtrifiro + - fialhocoelho - heyselbi - - rpancham + - joerunde + - maxdebayser + - njhill + - prashantgupta24 - RH-steve-grubb + - rpancham - terrytangyuan - vaibhavjainwiz - Xaenalt From 225d4a8c93b15f76bafd67b98f88a0576881f099 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 17 Jul 2024 20:07:42 +0200 Subject: [PATCH 25/46] Dockerfile.ubi: bind mount .git dir to allow inclusion of git commit hash --- Dockerfile.ubi | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 28bc2deb2c61..a7bd8f50bd24 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -120,6 +120,7 @@ COPY vllm vllm ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,target=/workspace/.git \ env CFLAGS="-march=haswell" \ CXXFLAGS="$CFLAGS $CXXFLAGS" \ CMAKE_BUILD_TYPE=Release \ From 9ded6e6af5986019b324dcc49d21a2a939994d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 17 Jul 2024 20:09:47 +0200 Subject: [PATCH 26/46] gha: remove reminder_comment --- .github/workflows/reminder_comment.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/reminder_comment.yml diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index 390c88bb6530..000000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: PR Reminder Comment Bot -on: - pull_request_target: - types: [opened] - -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@v6 - with: - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: 'šŸ‘‹ Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. 
\n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\nšŸš€' - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From bf5ff75167e5087445543ffde338299f8da1d458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Thu, 18 Jul 2024 19:08:35 +0200 Subject: [PATCH 27/46] Dockerfile: bump vllm-tgis-adapter to 0.2.1 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index a7bd8f50bd24..49a046044f1c 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -193,7 +193,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.2.0 + pip install vllm-tgis-adapter==0.2.1 ENV GRPC_PORT=8033 USER 2000 From ae6669b38a5fc4863329feef8d0c5014731699f7 Mon Sep 17 00:00:00 2001 From: Nathan Weinberg Date: Thu, 18 Jul 2024 14:11:52 -0500 Subject: [PATCH 28/46] fix: update setup.py to differentiate between fork and upstream Signed-off-by: Nathan Weinberg --- setup.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index b146299f8269..c5d1a3b3a2d4 100644 --- a/setup.py +++ b/setup.py @@ -454,18 +454,17 @@ def _read_requirements(filename: str) -> List[str]: package_data["vllm"].append("*.so") setup( - name="vllm", + name="vllm-odh", version=get_vllm_version(), - author="vLLM Team", + author="Open Data Hub Community", license="Apache 2.0", description=("A high-throughput and memory-efficient inference and " "serving engine for LLMs"), long_description=read_readme(), long_description_content_type="text/markdown", - url="https://github.com/vllm-project/vllm", + url="https://github.com/opendatahub-io/vllm", project_urls={ - "Homepage": "https://github.com/vllm-project/vllm", - 
"Documentation": "https://vllm.readthedocs.io/en/latest/", + "Homepage": "https://github.com/opendatahub-io/vllm", }, classifiers=[ "Programming Language :: Python :: 3.8", From 0dc9ba93338ef351ca70b91aaea0ebb204a36162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Fri, 19 Jul 2024 13:32:00 +0200 Subject: [PATCH 29/46] Dockerfile.ubi: properly mount .git dir --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 49a046044f1c..d3a93154e638 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -120,7 +120,7 @@ COPY vllm vllm ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ - --mount=type=bind,target=/workspace/.git \ + --mount=type=bind,src=.git,target=/workspace/.git \ env CFLAGS="-march=haswell" \ CXXFLAGS="$CFLAGS $CXXFLAGS" \ CMAKE_BUILD_TYPE=Release \ From efcd71c8d2992e189cf763754accec3d89935849 Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Fri, 19 Jul 2024 18:04:14 +0200 Subject: [PATCH 30/46] Revert "[CI/Build] fix: update setup.py to differentiate between fork and upstream" --- setup.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index c5d1a3b3a2d4..b146299f8269 100644 --- a/setup.py +++ b/setup.py @@ -454,17 +454,18 @@ def _read_requirements(filename: str) -> List[str]: package_data["vllm"].append("*.so") setup( - name="vllm-odh", + name="vllm", version=get_vllm_version(), - author="Open Data Hub Community", + author="vLLM Team", license="Apache 2.0", description=("A high-throughput and memory-efficient inference and " "serving engine for LLMs"), long_description=read_readme(), long_description_content_type="text/markdown", - url="https://github.com/opendatahub-io/vllm", + url="https://github.com/vllm-project/vllm", project_urls={ - "Homepage": "https://github.com/opendatahub-io/vllm", + 
"Homepage": "https://github.com/vllm-project/vllm", + "Documentation": "https://vllm.readthedocs.io/en/latest/", }, classifiers=[ "Programming Language :: Python :: 3.8", From 2b71ba51637687ab4bc81fc13f345060a9644746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Fri, 19 Jul 2024 18:21:15 +0200 Subject: [PATCH 31/46] Dockerfile.ubi: bump vllm-tgis-adapter to 0.2.2 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index d3a93154e638..566178cbd1b9 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -193,7 +193,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.2.1 + pip install vllm-tgis-adapter==0.2.2 ENV GRPC_PORT=8033 USER 2000 From 9e10cae16b4aea7df77055b10964c084dd814f2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 23 Jul 2024 20:21:56 +0200 Subject: [PATCH 32/46] gha: remove unused upstream workflows --- .github/workflows/add_label_automerge.yml | 21 ----------------- .github/workflows/add_label_ready_comment.yml | 23 ------------------- 2 files changed, 44 deletions(-) delete mode 100644 .github/workflows/add_label_automerge.yml delete mode 100644 .github/workflows/add_label_ready_comment.yml diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml deleted file mode 100644 index cd53b764c720..000000000000 --- a/.github/workflows/add_label_automerge.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Add label on auto-merge enabled -on: - pull_request_target: - types: - - auto_merge_enabled -jobs: - add-label-on-auto-merge: - runs-on: ubuntu-latest - steps: - - name: Add label - uses: actions/github-script@v5 - with: - script: | - github.rest.issues.addLabels({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - labels: ['ready'] - }) - env: - GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/add_label_ready_comment.yml b/.github/workflows/add_label_ready_comment.yml deleted file mode 100644 index 729c1452af03..000000000000 --- a/.github/workflows/add_label_ready_comment.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Add Ready Label on Ready Comment - -on: - issue_comment: - types: [created] - -jobs: - add-ready-label: - runs-on: ubuntu-latest - if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready') - steps: - - name: Add label - uses: actions/github-script@v5 - with: - script: | - github.rest.issues.addLabels({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - labels: ['ready'] - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From b209322be75eedade10d048cbb08b959cd3af538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 24 Jul 2024 17:35:41 +0200 Subject: [PATCH 33/46] deps: bump vllm-tgis-adapter to 0.2.3 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 566178cbd1b9..ae6bd0bc6dfd 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -193,7 +193,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.2.2 + pip install vllm-tgis-adapter==0.2.3 ENV GRPC_PORT=8033 USER 2000 From 8ca9ede12bce137cf530dbdd6889cf6fc547b466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 24 Jul 2024 13:11:01 +0200 Subject: [PATCH 34/46] Dockerfile.ubi: get rid of custom cache manager fixed in https://github.com/vllm-project/vllm/pull/6140 fixes https://issues.redhat.com/browse/RHOAIENG-8043 --- Dockerfile.ubi | 6 +----- extras/custom_cache_manager.py | 32 -------------------------------- 2 files changed, 1 insertion(+), 37 deletions(-) delete mode 100644 extras/custom_cache_manager.py diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 
ae6bd0bc6dfd..4462ce8a59c2 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -154,9 +154,6 @@ ENV PATH=$VIRTUAL_ENV/bin/:$PATH RUN microdnf install -y gcc \ && microdnf clean all -# Custom cache manager (fix for https://issues.redhat.com/browse/RHOAIENG-8043) -COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custom_cache_manager.py - # install vllm wheel first, so that torch etc will be installed RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ @@ -174,8 +171,7 @@ ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ HOME=/home/vllm \ VLLM_USAGE_SOURCE=production-docker-image \ - VLLM_WORKER_MULTIPROC_METHOD=fork \ - TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager" + VLLM_WORKER_MULTIPROC_METHOD=fork # setup non-root user for OpenShift RUN umask 002 \ diff --git a/extras/custom_cache_manager.py b/extras/custom_cache_manager.py deleted file mode 100644 index c83ed5b6e865..000000000000 --- a/extras/custom_cache_manager.py +++ /dev/null @@ -1,32 +0,0 @@ -import os - -from triton.runtime.cache import (FileCacheManager, default_cache_dir, - default_dump_dir, default_override_dir) - - -class CustomCacheManager(FileCacheManager): - - def __init__(self, key, override=False, dump=False): - self.key = key - self.lock_path = None - if dump: - self.cache_dir = default_dump_dir() - self.cache_dir = os.path.join(self.cache_dir, self.key) - self.lock_path = os.path.join(self.cache_dir, "lock") - os.makedirs(self.cache_dir, exist_ok=True) - elif override: - self.cache_dir = default_override_dir() - self.cache_dir = os.path.join(self.cache_dir, self.key) - else: - # create cache directory if it doesn't exist - self.cache_dir = os.getenv("TRITON_CACHE_DIR", - "").strip() or default_cache_dir() - if self.cache_dir: - self.cache_dir = f"{self.cache_dir}_{os.getpid()}" - self.cache_dir = os.path.join(self.cache_dir, self.key) - self.lock_path = os.path.join(self.cache_dir, "lock") - 
os.makedirs(self.cache_dir, exist_ok=True) - else: - raise RuntimeError("Could not create or locate cache dir") - - print(f"Triton cache dir: {self.cache_dir=}") From f850e50758f3a82974e9c1e3f618775fe8fbf4be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 6 Aug 2024 16:56:06 +0200 Subject: [PATCH 35/46] Dockerfile.ubi: add missing dependency --- Dockerfile.ubi | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 4462ce8a59c2..75082aa77502 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -71,6 +71,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \ --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \ + --mount=type=bind,source=requirements-adag.txt,target=requirements-adag.txt \ --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \ pip3 install \ -r requirements-cuda.txt \ From f41930e80064299c5ba11a41962b008b3fcbcd45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Wed, 24 Jul 2024 17:35:41 +0200 Subject: [PATCH 36/46] deps: bump vllm-tgis-adapter to 0.3.0 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 75082aa77502..69fd6d44f441 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -190,7 +190,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.2.3 + pip install vllm-tgis-adapter==0.3.0 ENV GRPC_PORT=8033 USER 2000 From 26cfbd9ede504d0ace858de14711420134bd4026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 13 Aug 2024 01:42:17 +0200 Subject: [PATCH 37/46] Dockerfile.ubi: force using python-installed cuda runtime libraries --- Dockerfile.ubi | 8 +++++++- 1 file changed, 7 insertions(+), 1 
deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 69fd6d44f441..d049267db49a 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -62,6 +62,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install \ -r requirements-cuda.txt + ## Development ################################################################# FROM python-cuda-base AS dev @@ -113,7 +114,6 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1 # Make sure the cuda environment is in the PATH ENV PATH=/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH # Copy the entire directory before building wheel COPY vllm vllm @@ -145,12 +145,18 @@ RUN ./configure --prefix="/usr/" && make && make check ## Release ##################################################################### FROM python-install AS vllm-openai +ARG PYTHON_VERSION WORKDIR /workspace ENV VIRTUAL_ENV=/opt/vllm ENV PATH=$VIRTUAL_ENV/bin/:$PATH +# force using the python venv's cuda runtime libraries +ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}" + # Triton needs a CC compiler RUN microdnf install -y gcc \ && microdnf clean all From ac48a82007db71579c80e82452290d2bc9a3d928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 13 Aug 2024 00:08:56 +0200 Subject: [PATCH 38/46] Dockerfile: use uv pip everywhere (it's faster) --- Dockerfile.ubi | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index d049267db49a..914cae070274 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -32,7 +32,7 @@ ENV VIRTUAL_ENV=/opt/vllm ENV PATH="$VIRTUAL_ENV/bin:$PATH" RUN microdnf install -y \ python${PYTHON_VERSION}-devel 
python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \ - python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel && microdnf clean all + python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all ## CUDA Base ################################################################### @@ -57,9 +57,10 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH" # install cuda and common dependencies RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ - pip install \ + uv pip install \ -r requirements-cuda.txt @@ -68,13 +69,14 @@ FROM python-cuda-base AS dev # install build and runtime dependencies RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \ --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \ --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \ --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \ --mount=type=bind,source=requirements-adag.txt,target=requirements-adag.txt \ --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \ - pip3 install \ + uv pip install \ -r requirements-cuda.txt \ -r requirements-dev.txt @@ -83,8 +85,9 @@ FROM dev AS build # install build dependencies RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \ - pip install -r requirements-build.txt + uv pip install -r requirements-build.txt # install compiler cache to speed up compilation leveraging local or remote caching # git is required for the cutlass kernels @@ -121,6 +124,7 @@ COPY vllm vllm ENV 
CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,src=.git,target=/workspace/.git \ env CFLAGS="-march=haswell" \ CXXFLAGS="$CFLAGS $CXXFLAGS" \ @@ -164,7 +168,8 @@ RUN microdnf install -y gcc \ # install vllm wheel first, so that torch etc will be installed RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \ --mount=type=cache,target=/root/.cache/pip \ - pip install $(echo dist/*.whl)'[tensorizer]' --verbose + --mount=type=cache,target=/root/.cache/uv \ + uv pip install $(echo dist/*.whl)'[tensorizer]' --verbose # Install libsodium for Tensorizer encryption RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \ @@ -172,7 +177,8 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/ && make install RUN --mount=type=cache,target=/root/.cache/pip \ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl + --mount=type=cache,target=/root/.cache/uv \ + uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ From 9460cfb8b869199c7631eb438ec78255e8b5a6d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 5 Aug 2024 18:17:52 +0200 Subject: [PATCH 39/46] Dockerfile.ubi: bump flashinfer to 0.1.2 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 914cae070274..53a6c90ecfbf 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -178,7 +178,7 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=cache,target=/root/.cache/uv \ - uv pip install 
https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl + uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp311-cp311-linux_x86_64.whl ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ From 204a1d43f390843d1c99778d27caffd222dd2142 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Thu, 8 Aug 2024 15:43:20 -0600 Subject: [PATCH 40/46] feat: allow long max seq length Signed-off-by: Travis Johnson --- Dockerfile.ubi | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 53a6c90ecfbf..39ee4a63c84a 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -183,6 +183,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV HF_HUB_OFFLINE=1 \ PORT=8000 \ HOME=/home/vllm \ + # Allow requested max length to exceed what is extracted from the + # config.json + # see: https://github.com/vllm-project/vllm/pull/7080 + VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ VLLM_USAGE_SOURCE=production-docker-image \ VLLM_WORKER_MULTIPROC_METHOD=fork From f49380d0f8c6b825e23d4c90b93378c24bd32b96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 13 Aug 2024 20:27:58 +0200 Subject: [PATCH 41/46] smoke test: kill server on timeout --- extras/smoke-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh index f03edea4f619..15bcd6b1984f 100644 --- a/extras/smoke-test.sh +++ b/extras/smoke-test.sh @@ -22,6 +22,7 @@ function wait_for(){ max_retries=$((max_retries-1)) if [[ max_retries -le 0 ]]; then echo "Timed out waiting for $name server" >&2 + kill -9 ${server_pid} exit 1 fi done From 19adb9d94eec543f46b998b4f460b976f259a651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Tue, 13 Aug 2024 23:29:47 +0200 Subject: [PATCH 42/46] Dockerfile.ubi: set vllm_tgis_adapter unicorn log level to warning --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 39ee4a63c84a..e185ac549f51 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -210,4 +210,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV GRPC_PORT=8033 USER 2000 -ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"] +ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"] From b361484b24e6586aea4d97eeac0fa158d0170c80 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Tue, 20 Aug 2024 12:25:56 -0600 Subject: [PATCH 43/46] fix: enable logprobs during spec decoding by default Signed-off-by: Travis Johnson --- Dockerfile.ubi | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index e185ac549f51..5308d690015c 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -181,7 +181,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp311-cp311-linux_x86_64.whl ENV HF_HUB_OFFLINE=1 \ - PORT=8000 \ HOME=/home/vllm \ # Allow requested max length to exceed what is extracted from the # config.json @@ -208,6 +207,13 @@ USER root RUN --mount=type=cache,target=/root/.cache/pip \ pip install vllm-tgis-adapter==0.3.0 -ENV GRPC_PORT=8033 +ENV GRPC_PORT=8033 \ + PORT=8000 \ + # As an optimization, vLLM disables logprobs when using spec decoding by + # default, but this would be unexpected to users of a hosted model that + # happens to have spec decoding + # see: https://github.com/vllm-project/vllm/pull/6485 + DISABLE_LOGPROBS_DURING_SPEC_DECODING=false + USER 2000 ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"] From b0e81ce31581cb6ecdd8b1230ad78b6274dee35b Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Wed, 21 Aug 2024 21:42:24 +0530 Subject: [PATCH 44/46] deps: bump vllm-tgis-adapter to 0.4.0 (#132) [changelog for 
0.4.0](https://github.com/opendatahub-io/vllm-tgis-adapter/releases/tag/0.4.0) https://issues.redhat.com/browse/RHOAIENG-11591 --- Dockerfile.ubi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 5308d690015c..3019951df117 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -205,7 +205,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.3.0 + pip install vllm-tgis-adapter==0.4.0 ENV GRPC_PORT=8033 \ PORT=8000 \ From 6cff676b566f7cb367417a793657dc7500c2db63 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 29 Aug 2024 16:40:26 -0400 Subject: [PATCH 45/46] Disable usage tracking This turns off tracking by default. If someone wants to, they can simply override this in yaml. --- Dockerfile.ubi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 3019951df117..230966ffc74a 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -187,7 +187,8 @@ ENV HF_HUB_OFFLINE=1 \ # see: https://github.com/vllm-project/vllm/pull/7080 VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ VLLM_USAGE_SOURCE=production-docker-image \ - VLLM_WORKER_MULTIPROC_METHOD=fork + VLLM_WORKER_MULTIPROC_METHOD=fork \ + VLLM_NO_USAGE_STATS=1 # setup non-root user for OpenShift RUN umask 002 \ From 9281890c8a04689df7042aac8e572d06b64a7d98 Mon Sep 17 00:00:00 2001 From: Jefferson Fialho Date: Wed, 4 Sep 2024 16:46:22 -0300 Subject: [PATCH 46/46] Updating ubi-tag e vllm-tgis-adapter Signed-off-by: Jefferson Fialho --- Dockerfile.ubi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.ubi b/Dockerfile.ubi index 230966ffc74a..6911548aee03 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -1,5 +1,5 @@ ## Global Args ################################################################# -ARG BASE_UBI_IMAGE_TAG=9.4 +ARG BASE_UBI_IMAGE_TAG=9.4-1227 ARG PYTHON_VERSION=3.11 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 
8.9 9.0+PTX" @@ -206,7 +206,7 @@ FROM vllm-openai as vllm-grpc-adapter USER root RUN --mount=type=cache,target=/root/.cache/pip \ - pip install vllm-tgis-adapter==0.4.0 + pip install vllm-tgis-adapter==0.4.1 ENV GRPC_PORT=8033 \ PORT=8000 \