From c5d632318de53971119351fb7376d4d3a799e00a Mon Sep 17 00:00:00 2001
From: Laurentiu Bradin <109964136+z103cb@users.noreply.github.com>
Date: Tue, 30 Apr 2024 13:55:08 +0300
Subject: [PATCH 01/46] chore: add fork OWNERS

---
 OWNERS | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 OWNERS

diff --git a/OWNERS b/OWNERS
new file mode 100644
index 000000000000..7bbb710d21ff
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,21 @@
+approvers:
+  - danielezonca
+  - dtrifiro
+  - heyselbi
+  - israel-hdez
+  - Jooho
+  - rpancham
+  - spolti
+  - terrytangyuan
+  - vaibhavjainwiz
+  - VedantMahabaleshwarkar
+  - Xaenalt
+  - z103cb
+reviewers:
+  - dtrifiro
+  - heyselbi
+  - rpancham
+  - terrytangyuan
+  - vaibhavjainwiz
+  - Xaenalt
+  - z103cb

From 49743ae46dea92f67bf06a26a31c8db4469d2f72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 21 May 2024 10:47:03 +0200
Subject: [PATCH 02/46] add ubi Dockerfile

---
 Dockerfile.ubi | 244 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 244 insertions(+)
 create mode 100644 Dockerfile.ubi

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
new file mode 100644
index 000000000000..58663927a1c2
--- /dev/null
+++ b/Dockerfile.ubi
@@ -0,0 +1,244 @@
+# Please update any changes made here to
+# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst
+
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.4
+ARG PYTHON_VERSION=3.11
+
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
+
+## Base Layer ##################################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
+ARG PYTHON_VERSION
+
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
+    && microdnf clean all
+
+WORKDIR /workspace
+
+ENV LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Some utils for dev purposes - tar required for kubectl cp
+RUN microdnf install -y \
+        which procps findutils tar vim git\
+    && microdnf clean all
+
+
+## Python Installer ############################################################
+FROM base as python-install
+
+ARG PYTHON_VERSION
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel && microdnf clean all
+
+
+## CUDA Base ###################################################################
+FROM python-install as cuda-base
+
+# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if
+# this env var is set to 12.2.0, even though it's compatible
+#ENV CUDA_VERSION=12.2.0 \
+ENV CUDA_VERSION=12.0.0 \
+    NV_CUDA_LIB_VERSION=12.2.0-1 \
+    NVIDIA_VISIBLE_DEVICES=all \
+    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+    NV_CUDA_CUDART_VERSION=12.2.53-1 \
+    NV_CUDA_COMPAT_VERSION=535.104.12
+
+RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
+        https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+
+RUN microdnf install -y \
+        cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
+        cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
+    && microdnf clean all
+
+
+ARG CUDA_HOME="/usr/local/cuda"
+ENV CUDA_HOME=${CUDA_HOME}\
+    PATH="${CUDA_HOME}/bin:${PATH}" \
+    LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
+
+
+## CUDA Development ############################################################
+FROM cuda-base as cuda-devel
+
+ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
+    NV_NVML_DEV_VERSION=12.2.81-1 \
+    NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
+    NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
+    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2
+
+RUN microdnf install -y \
+        cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
+        cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
+        cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
+        cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
+        cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
+        libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
+        libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
+        libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
+    && microdnf clean all
+
+ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
+
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.2/compat/
+
+## Python cuda base #################################################################
+FROM cuda-devel AS python-cuda-base
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# install cuda and common dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    pip install \
+        -r requirements-cuda.txt
+
+## Development #################################################################
+FROM python-cuda-base AS dev
+
+# install build and runtime dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    pip3 install \
+        -r requirements-cuda.txt \
+        -r requirements-dev.txt
+
+## Proto Compilation ###########################################################
+FROM python-install AS gen-protos
+
+ENV PATH=/opt/vllm/bin/:$PATH
+
+RUN microdnf install -y \
+        make \
+        findutils \
+    && microdnf clean all
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=Makefile,target=Makefile \
+    --mount=type=bind,source=proto,target=proto \
+    make gen-protos
+
+## Builder #####################################################################
+FROM dev AS build
+
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    pip install -r requirements-build.txt
+
+# install compiler cache to speed up compilation leveraging local or remote caching
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y ccache && microdnf clean all
+# install build dependencies
+
+# copy input files
+COPY csrc csrc
+COPY setup.py setup.py
+COPY cmake cmake
+COPY CMakeLists.txt CMakeLists.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+COPY pyproject.toml pyproject.toml
+
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Make sure the cuda environment is in the PATH
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Copy the entire directory before building wheel
+COPY vllm vllm
+
+# Copy over the generated *.pb2 files
+COPY --from=gen-protos /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb
+
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist
+
+## Release #####################################################################
+# Note from the non-UBI Dockerfile:
+# We used base cuda image because pytorch installs its own cuda libraries.
+# However pynccl depends on cuda libraries so we had to switch to the runtime image
+# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
+FROM python-install AS vllm-openai
+
+WORKDIR /workspace
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH=$VIRTUAL_ENV/bin/:$PATH
+
+# Triton needs a CC compiler
+RUN microdnf install -y gcc \
+    && microdnf clean all
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install dist/*.whl --verbose
+
+# vllm requires a specific nccl version built from source distribution
+# See https://github.com/NVIDIA/nccl/issues/1234
+RUN pip install \
+        -v \
+        --force-reinstall \
+        --no-binary="all" \
+        --no-cache-dir \
+        "vllm-nccl-cu12==2.18.1.0.4.0" && \
+    mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/ && \
+    chmod 0755 /opt/vllm/lib/libnccl.so.2.18.1
+
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install \
+        # additional dependencies for the TGIS gRPC server
+        grpcio-tools==1.63.0 \
+        # additional dependencies for openai api_server
+        accelerate==0.30.0 \
+        # hf_transfer for faster HF hub downloads
+        hf_transfer==0.1.6
+
+ENV HF_HUB_OFFLINE=1 \
+    PORT=8000 \
+    GRPC_PORT=8033 \
+    HOME=/home/vllm \
+    VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \
+    VLLM_USAGE_SOURCE=production-docker-image \
+    VLLM_WORKER_MULTIPROC_METHOD=fork
+
+# setup non-root user for OpenShift
+RUN umask 002 \
+    && useradd --uid 2000 --gid 0 vllm \
+    && chmod g+rwx $HOME /usr/src /workspace
+
+COPY LICENSE /licenses/vllm.md
+
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

From d66716f87bd2438a509cd205381b6416045e0910 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 21 May 2024 10:47:37 +0200
Subject: [PATCH 03/46] Dockerfile.ubi: remove references to grpc/protos

---
 Dockerfile.ubi | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 58663927a1c2..3226a24ba1ea 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -1,6 +1,3 @@
-# Please update any changes made here to
-# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst
-
 ## Global Args #################################################################
 ARG BASE_UBI_IMAGE_TAG=9.4
 ARG PYTHON_VERSION=3.11
@@ -120,21 +117,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         -r requirements-cuda.txt \
         -r requirements-dev.txt
 
-## Proto Compilation ###########################################################
-FROM python-install AS gen-protos
-
-ENV PATH=/opt/vllm/bin/:$PATH
-
-RUN microdnf install -y \
-        make \
-        findutils \
-    && microdnf clean all
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=Makefile,target=Makefile \
-    --mount=type=bind,source=proto,target=proto \
-    make gen-protos
-
 ## Builder #####################################################################
 FROM dev AS build
 
@@ -175,9 +157,6 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 # Copy the entire directory before building wheel
 COPY vllm vllm
 
-# Copy over the generated *.pb2 files
-COPY --from=gen-protos /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb
-
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
@@ -216,18 +195,8 @@ RUN pip install \
     chmod 0755 /opt/vllm/lib/libnccl.so.2.18.1
 
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install \
-        # additional dependencies for the TGIS gRPC server
-        grpcio-tools==1.63.0 \
-        # additional dependencies for openai api_server
-        accelerate==0.30.0 \
-        # hf_transfer for faster HF hub downloads
-        hf_transfer==0.1.6
-
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
-    GRPC_PORT=8033 \
     HOME=/home/vllm \
     VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \
     VLLM_USAGE_SOURCE=production-docker-image \

From 4ea368bc8679cc90661054a5548b5f91a132f893 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 28 May 2024 18:31:39 +0200
Subject: [PATCH 04/46] Dockerfile.ubi: use vllm-tgis-adapter

---
 Dockerfile.ubi | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 3226a24ba1ea..1a11dbb33e5e 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -211,3 +211,15 @@ COPY LICENSE /licenses/vllm.md
 
 USER 2000
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+
+
+FROM vllm-openai as vllm-grpc-adapter
+
+USER root
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install vllm-tgis-adapter
+
+ENV GRPC_PORT=8033
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"]

From c8f42be2b121faa02c6fcf6b889140fa133de01d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Mon, 3 Jun 2024 11:24:37 +0200
Subject: [PATCH 05/46] gha: add sync workflow

---
 .github/workflows/sync-with-upstream.yml | 91 ++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 .github/workflows/sync-with-upstream.yml

diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml
new file mode 100644
index 000000000000..5f009b897a3b
--- /dev/null
+++ b/.github/workflows/sync-with-upstream.yml
@@ -0,0 +1,91 @@
+name: "Sync with upstream"
+
+on:
+  schedule:
+    - cron: 20 4 * * *
+
+  workflow_dispatch:
+
+
+env:
+  # repo to fetch changes from
+  UPSTREAM_REPO: vllm-project/vllm
+ # branch to sync
+  BRANCH: main
+
+jobs:
+  upstream-sync:
+    name: Sync with upstream
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: write
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch upstream repo
+        run: |
+          git remote add upstream https://github.com/${UPSTREAM_REPO}
+          git fetch upstream
+
+      - name: Check diff
+        id: diff
+        shell: bash
+        run: |
+          echo 'diff<<EOF' >> $GITHUB_OUTPUT
+          git diff --stat upstream/${BRANCH} | tee -a "$GITHUB_OUTPUT"
+          echo 'EOF' >> $GITHUB_OUTPUT
+
+      - name: Create PR
+        if: ${{ steps.diff.outputs.diff != '' }}
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          set -xeu
+
+          git_hash="$(git rev-parse upstream/${BRANCH})"
+          echo "git_hash=$git_hash" >> $GITHUB_OUTPUT
+          git_describe="$(git describe --tags upstream/${BRANCH})"
+          echo "git_describe=$git_describe" >> $GITHUB_OUTPUT
+
+          # echo 'commits<<EOF' >> $GITHUB_OUTPUT
+          # git log --oneline ..upstream/${BRANCH} >> $GITHUB_OUTPUT
+          # echo 'EOF' >> $GITHUB_OUTPUT
+
+          upstream_url="https://github.com/${UPSTREAM_REPO}"
+          upstream_branch="$upstream_url/tree/${BRANCH}"
+
+          title="Sync with upstream@${git_describe}"
+          body="Merge [${UPSTREAM_REPO}]($upstream_url):[${BRANCH}]($upstream_branch)@[${git_describe}](${upstream_url}/commit/$git_hash) into $BRANCH"
+
+          gh repo set-default $GITHUB_REPOSITORY
+          pr_number=$(gh pr list -S "Sync with upstream@" --json number --jq '.[0].number')
+
+          if [[ -z $pr_number ]]; then
+            echo "Creating PR"
+            gh pr create \
+              --head $(echo $UPSTREAM_REPO | sed 's|/|:|g'):${BRANCH} \
+              --base ${BRANCH} \
+              --label code-sync \
+              --title "$title" \
+              --body "$body" \
+              --no-maintainer-edit
+            exit 0
+          fi
+
+          echo "Checking if PR is up-to-date"
+
+          git fetch ${upstream_url} refs/pull/${pr_number}/head
+          if git diff --stat --exit-code upstream/${BRANCH} FETCH_HEAD; then
+            echo "PR is up-to-date"
+            exit 0
+          fi
+
+          echo "Updating PR #${pr_number}"
+          gh pr edit \
+            $pr_number \
+            --body "$body" \
+            --title "$title"

From 6f1bd871179f077341bea30736542e2cd6d9e69f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Mon, 10 Jun 2024 17:31:35 +0200
Subject: [PATCH 06/46] Dockerfile.ubi: use distributed-executor-backend=mp as
 default

---
 Dockerfile.ubi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 1a11dbb33e5e..a65645647070 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -210,7 +210,7 @@ RUN umask 002 \
 COPY LICENSE /licenses/vllm.md
 
 USER 2000
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--distributed-executor-backend=mp"]
 
 
 FROM vllm-openai as vllm-grpc-adapter
@@ -222,4 +222,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 ENV GRPC_PORT=8033
 USER 2000
-ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"]
+ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--distributed-executor-backend=mp"]

From 7823f55494001085390da1cfb3d4e8e430d2e706 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Thu, 13 Jun 2024 10:26:34 +0200
Subject: [PATCH 07/46] Dockerfile.ubi: remove vllm-nccl workaround

Fixed upstream in https://github.com/vllm-project/vllm/pull/5091
---
 Dockerfile.ubi | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index a65645647070..6c2bd732c5df 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -183,22 +183,9 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     pip install dist/*.whl --verbose
 
-# vllm requires a specific nccl version built from source distribution
-# See https://github.com/NVIDIA/nccl/issues/1234
-RUN pip install \
-        -v \
-        --force-reinstall \
-        --no-binary="all" \
-        --no-cache-dir \
-        "vllm-nccl-cu12==2.18.1.0.4.0" && \
-    mv /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1 /opt/vllm/lib/ && \
-    chmod 0755 /opt/vllm/lib/libnccl.so.2.18.1
-
-
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
     HOME=/home/vllm \
-    VLLM_NCCL_SO_PATH=/opt/vllm/lib/libnccl.so.2.18.1 \
     VLLM_USAGE_SOURCE=production-docker-image \
     VLLM_WORKER_MULTIPROC_METHOD=fork
 

From 8b537843340cd1abeedd58c6e61f16c32645c895 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 18 Jun 2024 15:44:52 +0200
Subject: [PATCH 08/46] Dockerfile.ubi: add missing requirements-*.txt bind
 mounts

---
 Dockerfile.ubi | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 6c2bd732c5df..6d85a0869622 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -113,6 +113,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
     pip3 install \
         -r requirements-cuda.txt \
         -r requirements-dev.txt

From 6c478a6b156970ece288aec0cc8a887774478471 Mon Sep 17 00:00:00 2001
From: Thomas Parnell <tpa@zurich.ibm.com>
Date: Wed, 29 May 2024 11:01:40 +0000
Subject: [PATCH 09/46] add triton CustomCacheManager

fixes RHOAIENG-8043

Co-authored-by: Chih-Chieh-Yang <chih.chieh.yang@ibm.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
---
 Dockerfile.ubi                 |  6 +++++-
 extras/custom_cache_manager.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)
 create mode 100644 extras/custom_cache_manager.py

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 6d85a0869622..294399be24c4 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -180,6 +180,9 @@ ENV PATH=$VIRTUAL_ENV/bin/:$PATH
 RUN microdnf install -y gcc \
     && microdnf clean all
 
+# Custom cache manager (fix for https://issues.redhat.com/browse/RHOAIENG-8043)
+COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custom_cache_manager.py
+
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
@@ -189,7 +192,8 @@ ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
     HOME=/home/vllm \
     VLLM_USAGE_SOURCE=production-docker-image \
-    VLLM_WORKER_MULTIPROC_METHOD=fork
+    VLLM_WORKER_MULTIPROC_METHOD=fork \
+    TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager"
 
 # setup non-root user for OpenShift
 RUN umask 002 \
diff --git a/extras/custom_cache_manager.py b/extras/custom_cache_manager.py
new file mode 100644
index 000000000000..c83ed5b6e865
--- /dev/null
+++ b/extras/custom_cache_manager.py
@@ -0,0 +1,32 @@
+import os
+
+from triton.runtime.cache import (FileCacheManager, default_cache_dir,
+                                  default_dump_dir, default_override_dir)
+
+
+class CustomCacheManager(FileCacheManager):
+
+    def __init__(self, key, override=False, dump=False):
+        self.key = key
+        self.lock_path = None
+        if dump:
+            self.cache_dir = default_dump_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+            self.lock_path = os.path.join(self.cache_dir, "lock")
+            os.makedirs(self.cache_dir, exist_ok=True)
+        elif override:
+            self.cache_dir = default_override_dir()
+            self.cache_dir = os.path.join(self.cache_dir, self.key)
+        else:
+            # create cache directory if it doesn't exist
+            self.cache_dir = os.getenv("TRITON_CACHE_DIR",
+                                       "").strip() or default_cache_dir()
+            if self.cache_dir:
+                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+                self.cache_dir = os.path.join(self.cache_dir, self.key)
+                self.lock_path = os.path.join(self.cache_dir, "lock")
+                os.makedirs(self.cache_dir, exist_ok=True)
+            else:
+                raise RuntimeError("Could not create or locate cache dir")
+
+        print(f"Triton cache dir: {self.cache_dir=}")

From cb6f44fe85eef4ff317f1b56c79678f2997633d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 19 Jun 2024 13:45:37 +0200
Subject: [PATCH 10/46] gha: sync-with-upstream workflow create PRs as draft

---
 .github/workflows/sync-with-upstream.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml
index 5f009b897a3b..9e747de396ed 100644
--- a/.github/workflows/sync-with-upstream.yml
+++ b/.github/workflows/sync-with-upstream.yml
@@ -72,6 +72,7 @@ jobs:
               --label code-sync \
               --title "$title" \
               --body "$body" \
+              --draft \
               --no-maintainer-edit
             exit 0
           fi

From cb041a6dc036235ad7ac47bf28670d9839f4e7f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 19 Jun 2024 15:14:35 +0200
Subject: [PATCH 11/46] add smoke/unit tests scripts

---
 extras/smoke-test.sh | 73 ++++++++++++++++++++++++++++++++++++++++++++
 extras/unit-tests.sh | 43 ++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 extras/smoke-test.sh
 create mode 100644 extras/unit-tests.sh

diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh
new file mode 100644
index 000000000000..f03edea4f619
--- /dev/null
+++ b/extras/smoke-test.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+set -uxo pipefail
+
+# we will need to download test models off HF hub
+unset HF_HUB_OFFLINE
+
+export HTTP_PORT=8080
+export GRPC_PORT=8033
+
+
+function wait_for(){
+    trap "" ERR # we don't care about errors in this function
+
+    name=$1
+    shift
+    command=$@
+
+    max_retries=10
+    until $command ; do
+        echo "Waiting for $name to be up (retries_left=$max_retries)..."
+        sleep 30
+        max_retries=$((max_retries-1))
+        if [[ max_retries -le 0 ]]; then
+            echo "Timed out waiting for $name server" >&2
+            exit 1
+        fi
+    done
+}
+
+# stop the server on any errors
+trap 'kill -9 $server_pid && exit 1' ERR
+
+# spin up the OpenAPI server in the background
+python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT &
+server_pid=$!
+server_url="http://localhost:$HTTP_PORT"
+
+wait_for "http server" curl --verbose --connect-timeout 1 --fail-with-body --no-progress-meter "${server_url}/health"
+
+curl -v --no-progress-meter --fail-with-body \
+  "${server_url}/v1/models" | python -m json.tool
+
+curl -v --no-progress-meter --fail-with-body \
+  --header "Content-Type: application/json" \
+  --data '{
+    "prompt": "A red fedora symbolizes ",
+    "model": "facebook/opt-125m"
+}' \
+  "${server_url}/v1/completions" | python -m json.tool
+
+echo "OpenAI API success" && kill -9 $server_pid
+
+
+# spin up the grpc server in the background
+python -m vllm_tgis_adapter --grpc-port $GRPC_PORT &
+server_pid=$!
+server_url="localhost:$GRPC_PORT"
+
+# get grpcurl
+curl --no-progress-meter --location --output /tmp/grpcurl.tar.gz \
+  https://github.com/fullstorydev/grpcurl/releases/download/v1.9.1/grpcurl_1.9.1_linux_x86_64.tar.gz
+tar -xf /tmp/grpcurl.tar.gz --directory /tmp
+
+wait_for "grpc_server" grpc_healthcheck # healthcheck is part of vllm_tgis_adapter
+
+/tmp/grpcurl -v \
+    -plaintext \
+    -use-reflection \
+    -d '{ "requests": [{"text": "A red fedora symbolizes "}]}' \
+    "$server_url" \
+    fmaas.GenerationService/Generate
+
+echo "GRPC API success" && kill -9 $server_pid
diff --git a/extras/unit-tests.sh b/extras/unit-tests.sh
new file mode 100644
index 000000000000..4739fb6d65ea
--- /dev/null
+++ b/extras/unit-tests.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# partially copied from from .buildkite/test-pipeline.yml
+
+cd tests || exit 1
+
+# we will need to download test models off HF hub
+unset HF_HUB_OFFLINE
+
+# basic correctness
+pytest -v -s test_regression.py
+pytest -v -s async_engine
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+# core
+pytest -v -s core
+
+# note: distributed tests are disabled
+
+# engine tests
+pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+# entrypoint
+pytest -v -s entrypoints -m openai
+
+#inputs (note: multimodal tests are skipped)
+pytest -v -s test_inputs.py
+
+# models (the marker expression must reach pytest as a single argument)
+pytest -v -s models -m "not vlm"
+
+# misc
+pytest -v -s prefix_caching
+pytest -v -s samplers
+pytest -v -s test_logits_processor.py
+pytest -v -s models -m "not vlm"
+pytest -v -s worker
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s spec_decode
+# pytest -v -s tensorizer_loader # disabled: requires libsodium
+pytest -v -s metrics
+pytest -v -s quantization

From 4ef5daba3f3dd652a571939fc615094c3456bf4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Thu, 20 Jun 2024 19:24:04 +0200
Subject: [PATCH 12/46] extras: exit unit tests on err

---
 extras/unit-tests.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extras/unit-tests.sh b/extras/unit-tests.sh
index 4739fb6d65ea..08b2388b646e 100644
--- a/extras/unit-tests.sh
+++ b/extras/unit-tests.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 # partially copied from from .buildkite/test-pipeline.yml
+set -e
 
 cd tests || exit 1
 

From 272ff919a910e7fa1201021a036a510bf1072b8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 28 May 2024 16:15:06 +0200
Subject: [PATCH 13/46] Dockerfile.ubi: misc improvements

- get rid of the cuda-devel stage, use cuda 12.4
- add build flags
- remove useless installs
---
 Dockerfile.ubi | 63 ++++++++------------------------------------------
 1 file changed, 10 insertions(+), 53 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 294399be24c4..e4861243e222 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -4,7 +4,6 @@ ARG PYTHON_VERSION=3.11
 
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 
-
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
 ARG PYTHON_VERSION
@@ -39,61 +38,19 @@ RUN microdnf install -y \
 ## CUDA Base ###################################################################
 FROM python-install as cuda-base
 
-# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if
-# this env var is set to 12.2.0, even though it's compatible
-#ENV CUDA_VERSION=12.2.0 \
-ENV CUDA_VERSION=12.0.0 \
-    NV_CUDA_LIB_VERSION=12.2.0-1 \
-    NVIDIA_VISIBLE_DEVICES=all \
-    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
-    NV_CUDA_CUDART_VERSION=12.2.53-1 \
-    NV_CUDA_COMPAT_VERSION=535.104.12
-
 RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
         https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
 
 RUN microdnf install -y \
-        cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
-        cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
-    && microdnf clean all
+        cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
+    microdnf clean all
 
-
-ARG CUDA_HOME="/usr/local/cuda"
-ENV CUDA_HOME=${CUDA_HOME}\
-    PATH="${CUDA_HOME}/bin:${PATH}" \
+ENV CUDA_HOME="/usr/local/cuda"
+ENV PATH="${CUDA_HOME}/bin:${PATH}" \
     LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
 
-
-## CUDA Development ############################################################
-FROM cuda-base as cuda-devel
-
-ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
-    NV_NVML_DEV_VERSION=12.2.81-1 \
-    NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
-    NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
-    NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2
-
-RUN microdnf install -y \
-        cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
-        cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
-        cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
-        cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
-        cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
-        libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
-        libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
-        libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
-    && microdnf clean all
-
-ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
-
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.2/compat/
-
 ## Python cuda base #################################################################
-FROM cuda-devel AS python-cuda-base
+FROM cuda-base AS python-cuda-base
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
@@ -128,7 +85,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-build.txt
 
 # install compiler cache to speed up compilation leveraging local or remote caching
-RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y ccache && microdnf clean all
+# git is required for the cutlass kernels
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all
 # install build dependencies
 
 # copy input files
@@ -162,13 +120,12 @@ COPY vllm vllm
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
-    CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist
+    env CFLAGS="-march=haswell" \
+        CXXFLAGS="$CFLAGS $CXXFLAGS" \
+        CMAKE_BUILD_TYPE=Release \
+        python3 setup.py bdist_wheel --dist-dir=dist
 
 ## Release #####################################################################
-# Note from the non-UBI Dockerfile:
-# We used base cuda image because pytorch installs its own cuda libraries.
-# However pynccl depends on cuda libraries so we had to switch to the runtime image
-# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
 FROM python-install AS vllm-openai
 
 WORKDIR /workspace

From 57fd180627f676a73605bf28c814e74869c88d04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Fri, 21 Jun 2024 10:03:48 +0200
Subject: [PATCH 14/46] update OWNERS

---
 OWNERS | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/OWNERS b/OWNERS
index 7bbb710d21ff..dc965385e186 100644
--- a/OWNERS
+++ b/OWNERS
@@ -1,20 +1,17 @@
 approvers:
-  - danielezonca
   - dtrifiro
   - heyselbi
-  - israel-hdez
-  - Jooho
   - rpancham
-  - spolti
+  - RH-steve-grubb
   - terrytangyuan
   - vaibhavjainwiz
-  - VedantMahabaleshwarkar
   - Xaenalt
   - z103cb
 reviewers:
   - dtrifiro
   - heyselbi
   - rpancham
+  - RH-steve-grubb
   - terrytangyuan
   - vaibhavjainwiz
   - Xaenalt

From fc0df4152ba57c1ca0e77e36e085ac86fdf21129 Mon Sep 17 00:00:00 2001
From: Prashant Gupta <prashantgupta24@gmail.com>
Date: Tue, 25 Jun 2024 05:17:36 -0700
Subject: [PATCH 15/46] Dockerfile.ubi: use tensorizer (#64)

add libsodium for tensorizer encryption

---------

Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Daniele <36171005+dtrifiro@users.noreply.github.com>
---
 Dockerfile.ubi | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index e4861243e222..cef224e3e8bb 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -125,6 +125,22 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         CMAKE_BUILD_TYPE=Release \
         python3 setup.py bdist_wheel --dist-dir=dist
 
+#################### libsodium Build IMAGE ####################
+FROM base as libsodium-builder
+
+RUN microdnf install -y gcc gzip \
+    && microdnf clean all
+
+WORKDIR /usr/src/libsodium
+
+ARG LIBSODIUM_VERSION=1.0.20
+RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
+    && tar -xzvf libsodium*.tar.gz \
+    && rm -f libsodium*.tar.gz \
+    && mv libsodium*/* ./
+
+RUN ./configure --prefix="/usr/" && make && make check
+
 ## Release #####################################################################
 FROM python-install AS vllm-openai
 
@@ -143,7 +159,12 @@ COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custo
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
+
+# Install libsodium for Tensorizer encryption
+RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
+    cd /usr/src/libsodium \
+    && make install
 
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \

From 50677483b6b658b20a3869ce48782258dfbab1e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 26 Jun 2024 18:13:16 +0200
Subject: [PATCH 16/46] Dockerfile.ubi: pin vllm-tgis-adapter to 0.1.2

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index cef224e3e8bb..20bb7e7e2cb5 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -189,7 +189,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter
+    pip install vllm-tgis-adapter==0.1.2
 
 ENV GRPC_PORT=8033
 USER 2000

From b93cf7e5a7cb16d2b0c6cdebd202aafdb1b54373 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 2 Jul 2024 12:20:13 +0200
Subject: [PATCH 17/46] gha: fix fetch step in upstream sync workflow

---
 .github/workflows/sync-with-upstream.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml
index 9e747de396ed..9d87ff4e50da 100644
--- a/.github/workflows/sync-with-upstream.yml
+++ b/.github/workflows/sync-with-upstream.yml
@@ -79,7 +79,7 @@ jobs:
 
           echo "Checking if PR is up-to-date"
 
-          git fetch ${upstream_url} refs/pull/${pr_number}/head
+          git fetch origin refs/pull/${pr_number}/head
           if git diff --stat --exit-code upstream/main FETCH_HEAD; then
             echo "PR is up-to-date"
             exit 0

From ab6ab65c2ac3ea8078268fe635116add37347767 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 2 Jul 2024 12:36:38 +0200
Subject: [PATCH 18/46] gha: always update sync workflow PR body/title

---
 .github/workflows/sync-with-upstream.yml | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml
index 9d87ff4e50da..53751552f4d2 100644
--- a/.github/workflows/sync-with-upstream.yml
+++ b/.github/workflows/sync-with-upstream.yml
@@ -77,14 +77,6 @@ jobs:
             exit 0
           fi
 
-          echo "Checking if PR is up-to-date"
-
-          git fetch origin refs/pull/${pr_number}/head
-          if git diff --stat --exit-code upstream/main FETCH_HEAD; then
-            echo "PR is up-to-date"
-            exit 0
-          fi
-
           echo "Updating PR \#${pr_number}"
           gh pr edit \
             $pr_number \

From a419aa823339614a279e187a25afe4e1692b18ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 3 Jul 2024 17:28:25 +0200
Subject: [PATCH 19/46] Dockerfile.ubi: bump vllm-tgis-adapter to 0.1.3

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 20bb7e7e2cb5..822363161be2 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -189,7 +189,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.1.2
+    pip install vllm-tgis-adapter==0.1.3
 
 ENV GRPC_PORT=8033
 USER 2000

From 456e93f1a82327f7f7e1f00bc16a1dfe13998b58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 10 Jul 2024 17:03:57 +0200
Subject: [PATCH 20/46] Dockerfile.ubi: get rid of
 --distributed-executor-backend=mp

this is the default when `--worker-use-ray` is not provided and
world-size > 1
---
 Dockerfile.ubi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 822363161be2..c38c1be443c6 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -181,7 +181,7 @@ RUN umask 002 \
 COPY LICENSE /licenses/vllm.md
 
 USER 2000
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server", "--distributed-executor-backend=mp"]
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 
 
 FROM vllm-openai as vllm-grpc-adapter
@@ -193,4 +193,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 ENV GRPC_PORT=8033
 USER 2000
-ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--distributed-executor-backend=mp"]
+ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"]

From f67c2ca241c3ee71801b8a7772ef63739f8f6df9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 9 Jul 2024 14:53:07 +0200
Subject: [PATCH 21/46] Dockerfile.ubi: add flashinfer

---
 Dockerfile.ubi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index c38c1be443c6..cb5ec895ae49 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -166,6 +166,9 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/
     cd /usr/src/libsodium \
     && make install
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp311-cp311-linux_x86_64.whl
+
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
     HOME=/home/vllm \

From d554f4997123c4759a2ba23d9252f7872228b145 Mon Sep 17 00:00:00 2001
From: Prashant Gupta <prashantgupta24@gmail.com>
Date: Fri, 12 Jul 2024 08:59:40 -0700
Subject: [PATCH 22/46] pin adapter to 0.2.0

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index cb5ec895ae49..8ba87bbcb5e8 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -192,7 +192,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.1.3
+    pip install vllm-tgis-adapter==0.2.0
 
 ENV GRPC_PORT=8033
 USER 2000

From 11c65674977111a28f0d366de71ec908962ce074 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Mon, 15 Jul 2024 13:31:01 +0200
Subject: [PATCH 23/46] deps: bump flashinfer to 0.0.9

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 8ba87bbcb5e8..28bc2deb2c61 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -167,7 +167,7 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/
     && make install
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp311-cp311-linux_x86_64.whl
+    pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl
 
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \

From fe79602265759ebea219279308e6b04adc64b0a8 Mon Sep 17 00:00:00 2001
From: Selbi Nuryyeva <selbi@redhat.com>
Date: Thu, 27 Jun 2024 16:27:11 -0400
Subject: [PATCH 24/46] Update OWNERS with IBM folks

---
 OWNERS | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/OWNERS b/OWNERS
index dc965385e186..09b25dab41c0 100644
--- a/OWNERS
+++ b/OWNERS
@@ -1,17 +1,27 @@
 approvers:
   - dtrifiro
+  - fialhocoelho
   - heyselbi
-  - rpancham
+  - joerunde
+  - maxdebayser
+  - njhill
+  - prashantgupta24
   - RH-steve-grubb
+  - rpancham
   - terrytangyuan
   - vaibhavjainwiz
-  - Xaenalt
   - z103cb
+  - Xaenalt
 reviewers:
   - dtrifiro
+  - fialhocoelho
   - heyselbi
-  - rpancham
+  - joerunde
+  - maxdebayser
+  - njhill
+  - prashantgupta24
   - RH-steve-grubb
+  - rpancham
   - terrytangyuan
   - vaibhavjainwiz
   - Xaenalt

From 225d4a8c93b15f76bafd67b98f88a0576881f099 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 17 Jul 2024 20:07:42 +0200
Subject: [PATCH 25/46] Dockerfile.ubi: bind mount .git dir to allow inclusion
 of git commit hash

---
 Dockerfile.ubi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 28bc2deb2c61..a7bd8f50bd24 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -120,6 +120,7 @@ COPY vllm vllm
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,target=/workspace/.git \
     env CFLAGS="-march=haswell" \
         CXXFLAGS="$CFLAGS $CXXFLAGS" \
         CMAKE_BUILD_TYPE=Release \

From 9ded6e6af5986019b324dcc49d21a2a939994d7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 17 Jul 2024 20:09:47 +0200
Subject: [PATCH 26/46] gha: remove reminder_comment

---
 .github/workflows/reminder_comment.yml | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 .github/workflows/reminder_comment.yml

diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
deleted file mode 100644
index 390c88bb6530..000000000000
--- a/.github/workflows/reminder_comment.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-name: PR Reminder Comment Bot
-on:
-  pull_request_target:
-    types: [opened]
-
-jobs:
-  pr_reminder:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Remind to run full CI on PR
-        uses: actions/github-script@v6
-        with:
-          script: |
-            github.rest.issues.createComment({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
-            })
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

From bf5ff75167e5087445543ffde338299f8da1d458 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Thu, 18 Jul 2024 19:08:35 +0200
Subject: [PATCH 27/46] Dockerfile: bump vllm-tgis-adapter to 0.2.1

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index a7bd8f50bd24..49a046044f1c 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -193,7 +193,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.2.0
+    pip install vllm-tgis-adapter==0.2.1
 
 ENV GRPC_PORT=8033
 USER 2000

From ae6669b38a5fc4863329feef8d0c5014731699f7 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg <nweinber@redhat.com>
Date: Thu, 18 Jul 2024 14:11:52 -0500
Subject: [PATCH 28/46] fix: update setup.py to differentiate between fork and
 upstream

Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
---
 setup.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index b146299f8269..c5d1a3b3a2d4 100644
--- a/setup.py
+++ b/setup.py
@@ -454,18 +454,17 @@ def _read_requirements(filename: str) -> List[str]:
     package_data["vllm"].append("*.so")
 
 setup(
-    name="vllm",
+    name="vllm-odh",
     version=get_vllm_version(),
-    author="vLLM Team",
+    author="Open Data Hub Community",
     license="Apache 2.0",
     description=("A high-throughput and memory-efficient inference and "
                  "serving engine for LLMs"),
     long_description=read_readme(),
     long_description_content_type="text/markdown",
-    url="https://github.com/vllm-project/vllm",
+    url="https://github.com/opendatahub-io/vllm",
     project_urls={
-        "Homepage": "https://github.com/vllm-project/vllm",
-        "Documentation": "https://vllm.readthedocs.io/en/latest/",
+        "Homepage": "https://github.com/opendatahub-io/vllm",
     },
     classifiers=[
         "Programming Language :: Python :: 3.8",

From 0dc9ba93338ef351ca70b91aaea0ebb204a36162 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Fri, 19 Jul 2024 13:32:00 +0200
Subject: [PATCH 29/46] Dockerfile.ubi: properly mount .git dir

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 49a046044f1c..d3a93154e638 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -120,7 +120,7 @@ COPY vllm vllm
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,target=/workspace/.git \
+    --mount=type=bind,src=.git,target=/workspace/.git \
     env CFLAGS="-march=haswell" \
         CXXFLAGS="$CFLAGS $CXXFLAGS" \
         CMAKE_BUILD_TYPE=Release \

From efcd71c8d2992e189cf763754accec3d89935849 Mon Sep 17 00:00:00 2001
From: Daniele <36171005+dtrifiro@users.noreply.github.com>
Date: Fri, 19 Jul 2024 18:04:14 +0200
Subject: [PATCH 30/46] Revert "[CI/Build] fix: update setup.py to
 differentiate between fork and upstream"

---
 setup.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index c5d1a3b3a2d4..b146299f8269 100644
--- a/setup.py
+++ b/setup.py
@@ -454,17 +454,18 @@ def _read_requirements(filename: str) -> List[str]:
     package_data["vllm"].append("*.so")
 
 setup(
-    name="vllm-odh",
+    name="vllm",
     version=get_vllm_version(),
-    author="Open Data Hub Community",
+    author="vLLM Team",
     license="Apache 2.0",
     description=("A high-throughput and memory-efficient inference and "
                  "serving engine for LLMs"),
     long_description=read_readme(),
     long_description_content_type="text/markdown",
-    url="https://github.com/opendatahub-io/vllm",
+    url="https://github.com/vllm-project/vllm",
     project_urls={
-        "Homepage": "https://github.com/opendatahub-io/vllm",
+        "Homepage": "https://github.com/vllm-project/vllm",
+        "Documentation": "https://vllm.readthedocs.io/en/latest/",
     },
     classifiers=[
         "Programming Language :: Python :: 3.8",

From 2b71ba51637687ab4bc81fc13f345060a9644746 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Fri, 19 Jul 2024 18:21:15 +0200
Subject: [PATCH 31/46] Dockerfile.ubi: bump vllm-tgis-adapter to 0.2.2

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index d3a93154e638..566178cbd1b9 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -193,7 +193,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.2.1
+    pip install vllm-tgis-adapter==0.2.2
 
 ENV GRPC_PORT=8033
 USER 2000

From 9e10cae16b4aea7df77055b10964c084dd814f2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 23 Jul 2024 20:21:56 +0200
Subject: [PATCH 32/46] gha: remove unused upstream workflows

---
 .github/workflows/add_label_automerge.yml     | 21 -----------------
 .github/workflows/add_label_ready_comment.yml | 23 -------------------
 2 files changed, 44 deletions(-)
 delete mode 100644 .github/workflows/add_label_automerge.yml
 delete mode 100644 .github/workflows/add_label_ready_comment.yml

diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
deleted file mode 100644
index cd53b764c720..000000000000
--- a/.github/workflows/add_label_automerge.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-name: Add label on auto-merge enabled
-on:
-    pull_request_target:
-        types:
-            - auto_merge_enabled
-jobs:
-    add-label-on-auto-merge:
-        runs-on: ubuntu-latest
-        steps:
-            -   name: Add label
-                uses: actions/github-script@v5
-                with:
-                    script: |
-                        github.rest.issues.addLabels({
-                            owner: context.repo.owner,
-                            repo: context.repo.repo,
-                            issue_number: context.issue.number,
-                            labels: ['ready']
-                        })
-                env:
-                    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/add_label_ready_comment.yml b/.github/workflows/add_label_ready_comment.yml
deleted file mode 100644
index 729c1452af03..000000000000
--- a/.github/workflows/add_label_ready_comment.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Add Ready Label on Ready Comment
-
-on:
-  issue_comment:
-    types: [created]
-
-jobs:
-  add-ready-label:
-    runs-on: ubuntu-latest
-    if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
-    steps:
-        -   name: Add label
-            uses: actions/github-script@v5
-            with:
-                script: |
-                    github.rest.issues.addLabels({
-                        owner: context.repo.owner,
-                        repo: context.repo.repo,
-                        issue_number: context.issue.number,
-                        labels: ['ready']
-                    })
-            env:
-                GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

From b209322be75eedade10d048cbb08b959cd3af538 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 24 Jul 2024 17:35:41 +0200
Subject: [PATCH 33/46] deps: bump vllm-tgis-adapter to 0.2.3

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 566178cbd1b9..ae6bd0bc6dfd 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -193,7 +193,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.2.2
+    pip install vllm-tgis-adapter==0.2.3
 
 ENV GRPC_PORT=8033
 USER 2000

From 8ca9ede12bce137cf530dbdd6889cf6fc547b466 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 24 Jul 2024 13:11:01 +0200
Subject: [PATCH 34/46] Dockerfile.ubi: get rid of custom cache manager

fixed in https://github.com/vllm-project/vllm/pull/6140

fixes https://issues.redhat.com/browse/RHOAIENG-8043
---
 Dockerfile.ubi                 |  6 +-----
 extras/custom_cache_manager.py | 32 --------------------------------
 2 files changed, 1 insertion(+), 37 deletions(-)
 delete mode 100644 extras/custom_cache_manager.py

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index ae6bd0bc6dfd..4462ce8a59c2 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -154,9 +154,6 @@ ENV PATH=$VIRTUAL_ENV/bin/:$PATH
 RUN microdnf install -y gcc \
     && microdnf clean all
 
-# Custom cache manager (fix for https://issues.redhat.com/browse/RHOAIENG-8043)
-COPY extras/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/custom_cache_manager.py
-
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
@@ -174,8 +171,7 @@ ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
     HOME=/home/vllm \
     VLLM_USAGE_SOURCE=production-docker-image \
-    VLLM_WORKER_MULTIPROC_METHOD=fork \
-    TRITON_CACHE_MANAGER="custom_cache_manager:CustomCacheManager"
+    VLLM_WORKER_MULTIPROC_METHOD=fork
 
 # setup non-root user for OpenShift
 RUN umask 002 \
diff --git a/extras/custom_cache_manager.py b/extras/custom_cache_manager.py
deleted file mode 100644
index c83ed5b6e865..000000000000
--- a/extras/custom_cache_manager.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-
-from triton.runtime.cache import (FileCacheManager, default_cache_dir,
-                                  default_dump_dir, default_override_dir)
-
-
-class CustomCacheManager(FileCacheManager):
-
-    def __init__(self, key, override=False, dump=False):
-        self.key = key
-        self.lock_path = None
-        if dump:
-            self.cache_dir = default_dump_dir()
-            self.cache_dir = os.path.join(self.cache_dir, self.key)
-            self.lock_path = os.path.join(self.cache_dir, "lock")
-            os.makedirs(self.cache_dir, exist_ok=True)
-        elif override:
-            self.cache_dir = default_override_dir()
-            self.cache_dir = os.path.join(self.cache_dir, self.key)
-        else:
-            # create cache directory if it doesn't exist
-            self.cache_dir = os.getenv("TRITON_CACHE_DIR",
-                                       "").strip() or default_cache_dir()
-            if self.cache_dir:
-                self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
-                self.cache_dir = os.path.join(self.cache_dir, self.key)
-                self.lock_path = os.path.join(self.cache_dir, "lock")
-                os.makedirs(self.cache_dir, exist_ok=True)
-            else:
-                raise RuntimeError("Could not create or locate cache dir")
-
-        print(f"Triton cache dir: {self.cache_dir=}")

From f850e50758f3a82974e9c1e3f618775fe8fbf4be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 6 Aug 2024 16:56:06 +0200
Subject: [PATCH 35/46] Dockerfile.ubi: add missing dependency

---
 Dockerfile.ubi | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 4462ce8a59c2..75082aa77502 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -71,6 +71,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
     --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-adag.txt,target=requirements-adag.txt \
     --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
     pip3 install \
         -r requirements-cuda.txt \

From f41930e80064299c5ba11a41962b008b3fcbcd45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 24 Jul 2024 17:35:41 +0200
Subject: [PATCH 36/46] deps: bump vllm-tgis-adapter to 0.3.0

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 75082aa77502..69fd6d44f441 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -190,7 +190,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.2.3
+    pip install vllm-tgis-adapter==0.3.0
 
 ENV GRPC_PORT=8033
 USER 2000

From 26cfbd9ede504d0ace858de14711420134bd4026 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 13 Aug 2024 01:42:17 +0200
Subject: [PATCH 37/46] Dockerfile.ubi: force using python-installed cuda
 runtime libraries

---
 Dockerfile.ubi | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 69fd6d44f441..d049267db49a 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -62,6 +62,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install \
         -r requirements-cuda.txt
 
+
 ## Development #################################################################
 FROM python-cuda-base AS dev
 
@@ -113,7 +114,6 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
 # Make sure the cuda environment is in the PATH
 ENV PATH=/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 
 # Copy the entire directory before building wheel
 COPY vllm vllm
@@ -145,12 +145,18 @@ RUN ./configure --prefix="/usr/" && make && make check
 
 ## Release #####################################################################
 FROM python-install AS vllm-openai
+ARG PYTHON_VERSION
 
 WORKDIR /workspace
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH=$VIRTUAL_ENV/bin/:$PATH
 
+# force using the python venv's cuda runtime libraries
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
+
 # Triton needs a CC compiler
 RUN microdnf install -y gcc \
     && microdnf clean all

From ac48a82007db71579c80e82452290d2bc9a3d928 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 13 Aug 2024 00:08:56 +0200
Subject: [PATCH 38/46] Dockerfile: use uv pip everywhere (it's faster)

---
 Dockerfile.ubi | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index d049267db49a..914cae070274 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -32,7 +32,7 @@ ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN microdnf install -y \
     python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel && microdnf clean all
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
 
 
 ## CUDA Base ###################################################################
@@ -57,9 +57,10 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # install cuda and common dependencies
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    pip install \
+    uv pip install \
         -r requirements-cuda.txt
 
 
@@ -68,13 +69,14 @@ FROM python-cuda-base AS dev
 
 # install build and runtime dependencies
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
     --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
     --mount=type=bind,source=requirements-adag.txt,target=requirements-adag.txt \
     --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
-    pip3 install \
+    uv pip install \
         -r requirements-cuda.txt \
         -r requirements-dev.txt
 
@@ -83,8 +85,9 @@ FROM dev AS build
 
 # install build dependencies
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
-    pip install -r requirements-build.txt
+    uv pip install -r requirements-build.txt
 
 # install compiler cache to speed up compilation leveraging local or remote caching
 # git is required for the cutlass kernels
@@ -121,6 +124,7 @@ COPY vllm vllm
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,src=.git,target=/workspace/.git \
     env CFLAGS="-march=haswell" \
         CXXFLAGS="$CFLAGS $CXXFLAGS" \
@@ -164,7 +168,8 @@ RUN microdnf install -y gcc \
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install $(echo dist/*.whl)'[tensorizer]' --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
@@ -172,7 +177,8 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/
     && make install
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl
 
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \

From 9460cfb8b869199c7631eb438ec78255e8b5a6d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Mon, 5 Aug 2024 18:17:52 +0200
Subject: [PATCH 39/46] Dockerfile.ubi: bump flashinfer to 0.1.2

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 914cae070274..53a6c90ecfbf 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -178,7 +178,7 @@ RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=cache,target=/root/.cache/uv \
-    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl
+    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp311-cp311-linux_x86_64.whl
 
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \

From 204a1d43f390843d1c99778d27caffd222dd2142 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Thu, 8 Aug 2024 15:43:20 -0600
Subject: [PATCH 40/46] feat: allow long max seq length

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 Dockerfile.ubi | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 53a6c90ecfbf..39ee4a63c84a 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -183,6 +183,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
     HOME=/home/vllm \
+    # Allow requested max length to exceed what is extracted from the
+    # config.json
+    # see: https://github.com/vllm-project/vllm/pull/7080
+    VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
     VLLM_USAGE_SOURCE=production-docker-image \
     VLLM_WORKER_MULTIPROC_METHOD=fork
 

From f49380d0f8c6b825e23d4c90b93378c24bd32b96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 13 Aug 2024 20:27:58 +0200
Subject: [PATCH 41/46] smoke test: kill server on timeout

---
 extras/smoke-test.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh
index f03edea4f619..15bcd6b1984f 100644
--- a/extras/smoke-test.sh
+++ b/extras/smoke-test.sh
@@ -22,6 +22,7 @@ function wait_for(){
         max_retries=$((max_retries-1))
         if [[ max_retries -le 0 ]]; then
             echo "Timed out waiting for $name server" >&2
+            kill -9 ${server_pid}
             exit 1
         fi
     done

From 19adb9d94eec543f46b998b4f460b976f259a651 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 13 Aug 2024 23:29:47 +0200
Subject: [PATCH 42/46] Dockerfile.ubi: set vllm_tgis_adapter uvicorn log level
 to warning

---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 39ee4a63c84a..e185ac549f51 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -210,4 +210,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 ENV GRPC_PORT=8033
 USER 2000
-ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"]
+ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]

From b361484b24e6586aea4d97eeac0fa158d0170c80 Mon Sep 17 00:00:00 2001
From: Travis Johnson <tsjohnso@us.ibm.com>
Date: Tue, 20 Aug 2024 12:25:56 -0600
Subject: [PATCH 43/46] fix: enable logprobs during spec decoding by default

Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
---
 Dockerfile.ubi | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index e185ac549f51..5308d690015c 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -181,7 +181,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp311-cp311-linux_x86_64.whl
 
 ENV HF_HUB_OFFLINE=1 \
-    PORT=8000 \
     HOME=/home/vllm \
     # Allow requested max length to exceed what is extracted from the
     # config.json
@@ -208,6 +207,13 @@ USER root
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install vllm-tgis-adapter==0.3.0
 
-ENV GRPC_PORT=8033
+ENV GRPC_PORT=8033 \
+    PORT=8000 \
+    # As an optimization, vLLM disables logprobs when using spec decoding by
+    # default, but this would be unexpected to users of a hosted model that
+    # happens to have spec decoding
+    # see: https://github.com/vllm-project/vllm/pull/6485
+    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false
+
 USER 2000
 ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]

From b0e81ce31581cb6ecdd8b1230ad78b6274dee35b Mon Sep 17 00:00:00 2001
From: Vaibhav Jain <vajain@redhat.com>
Date: Wed, 21 Aug 2024 21:42:24 +0530
Subject: [PATCH 44/46] deps: bump vllm-tgis-adapter to 0.4.0 (#132)

[changelog for
0.4.0](https://github.com/opendatahub-io/vllm-tgis-adapter/releases/tag/0.4.0)


https://issues.redhat.com/browse/RHOAIENG-11591
---
 Dockerfile.ubi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 5308d690015c..3019951df117 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -205,7 +205,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.3.0
+    pip install vllm-tgis-adapter==0.4.0
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \

From 6cff676b566f7cb367417a793657dc7500c2db63 Mon Sep 17 00:00:00 2001
From: Steve Grubb <ausearch.1@gmail.com>
Date: Thu, 29 Aug 2024 16:40:26 -0400
Subject: [PATCH 45/46] Disable usage tracking

This turns off usage tracking by default. Anyone who wants it enabled
can override VLLM_NO_USAGE_STATS in their deployment YAML.
---
 Dockerfile.ubi | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 3019951df117..230966ffc74a 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -187,7 +187,8 @@ ENV HF_HUB_OFFLINE=1 \
     # see: https://github.com/vllm-project/vllm/pull/7080
     VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
     VLLM_USAGE_SOURCE=production-docker-image \
-    VLLM_WORKER_MULTIPROC_METHOD=fork
+    VLLM_WORKER_MULTIPROC_METHOD=fork \
+    VLLM_NO_USAGE_STATS=1
 
 # setup non-root user for OpenShift
 RUN umask 002 \

From 9281890c8a04689df7042aac8e572d06b64a7d98 Mon Sep 17 00:00:00 2001
From: Jefferson Fialho <jfialho@ibm.com>
Date: Wed, 4 Sep 2024 16:46:22 -0300
Subject: [PATCH 46/46] Updating ubi-tag and vllm-tgis-adapter

Signed-off-by: Jefferson Fialho <jfialho@ibm.com>
---
 Dockerfile.ubi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 230966ffc74a..6911548aee03 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -1,5 +1,5 @@
 ## Global Args #################################################################
-ARG BASE_UBI_IMAGE_TAG=9.4
+ARG BASE_UBI_IMAGE_TAG=9.4-1227
 ARG PYTHON_VERSION=3.11
 
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
@@ -206,7 +206,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.4.0
+    pip install vllm-tgis-adapter==0.4.1
 
 ENV GRPC_PORT=8033 \
     PORT=8000 \