
Updating ubi-tag and vllm-tgis-adapter #147

Open: wants to merge 49 commits into base: main

Commits (49):
c5d6323
chore: add fork OWNERS
z103cb Apr 30, 2024
49743ae
add ubi Dockerfile
dtrifiro May 21, 2024
d66716f
Dockerfile.ubi: remove references to grpc/protos
dtrifiro May 21, 2024
4ea368b
Dockerfile.ubi: use vllm-tgis-adapter
dtrifiro May 28, 2024
c8f42be
gha: add sync workflow
dtrifiro Jun 3, 2024
6f1bd87
Dockerfile.ubi: use distributed-executor-backend=mp as default
dtrifiro Jun 10, 2024
7823f55
Dockerfile.ubi: remove vllm-nccl workaround
dtrifiro Jun 13, 2024
8b53784
Dockerfile.ubi: add missing requirements-*.txt bind mounts
dtrifiro Jun 18, 2024
6c478a6
add triton CustomCacheManger
tdoublep May 29, 2024
cb6f44f
gha: sync-with-upstream workflow create PRs as draft
dtrifiro Jun 19, 2024
cb041a6
add smoke/unit tests scripts
dtrifiro Jun 19, 2024
4ef5dab
extras: exit unit tests on err
dtrifiro Jun 20, 2024
272ff91
Dockerfile.ubi: misc improvements
dtrifiro May 28, 2024
57fd180
update OWNERS
dtrifiro Jun 21, 2024
fc0df41
Dockerfile.ubi: use tensorizer (#64)
prashantgupta24 Jun 25, 2024
5067748
Dockerfile.ubi: pin vllm-tgis-adapter to 0.1.2
dtrifiro Jun 26, 2024
b93cf7e
gha: fix fetch step in upstream sync workflow
dtrifiro Jul 2, 2024
ab6ab65
gha: always update sync workflow PR body/title
dtrifiro Jul 2, 2024
a419aa8
Dockerfile.ubi: bump vllm-tgis-adapter to 0.1.3
dtrifiro Jul 3, 2024
456e93f
Dockerfile.ubi: get rid of --distributed-executor-backend=mp
dtrifiro Jul 10, 2024
f67c2ca
Dockerfile.ubi: add flashinfer
dtrifiro Jul 9, 2024
d554f49
pin adapter to 2.0.0
prashantgupta24 Jul 12, 2024
11c6567
deps: bump flashinfer to 0.0.9
dtrifiro Jul 15, 2024
fe79602
Update OWNERS with IBM folks
heyselbi Jun 27, 2024
225d4a8
Dockerfile.ubi: bind mount .git dir to allow inclusion of git commit …
dtrifiro Jul 17, 2024
9ded6e6
gha: remove reminder_comment
dtrifiro Jul 17, 2024
bf5ff75
Dockerfile: bump vllm-tgis-adapter to 0.2.1
dtrifiro Jul 18, 2024
ae6669b
fix: update setup.py to differentiate between fork and upstream
nathan-weinberg Jul 18, 2024
0dc9ba9
Dockerfile.ubi: properly mount .git dir
dtrifiro Jul 19, 2024
efcd71c
Revert "[CI/Build] fix: update setup.py to differentiate between fork…
dtrifiro Jul 19, 2024
2b71ba5
Dockerfile.ubi: bump vllm-tgis-adapter to 0.2.2
dtrifiro Jul 19, 2024
9e10cae
gha: remove unused upstream workflows
dtrifiro Jul 23, 2024
b209322
deps: bump vllm-tgis-adapter to 0.2.3
dtrifiro Jul 24, 2024
8ca9ede
Dockerfile.ubi: get rid of custom cache manager
dtrifiro Jul 24, 2024
f850e50
Dockerfile.ubi: add missing dependency
dtrifiro Aug 6, 2024
f41930e
deps: bump vllm-tgis-adapter to 0.3.0
dtrifiro Jul 24, 2024
26cfbd9
Dockerfile.ubi: force using python-installed cuda runtime libraries
dtrifiro Aug 12, 2024
ac48a82
Dockerfile: use uv pip everywhere (it's faster)
dtrifiro Aug 12, 2024
9460cfb
Dockerfile.ubi: bump flashinfer to 0.1.2
dtrifiro Aug 5, 2024
204a1d4
feat: allow long max seq length
tjohnson31415 Aug 8, 2024
f49380d
smoke test: kill server on timeout
dtrifiro Aug 13, 2024
19adb9d
Dockerfile.ubi: set vllm_tgis_adapter unicorn log level to warning
dtrifiro Aug 13, 2024
b361484
fix: enable logprobs during spec decoding by default
tjohnson31415 Aug 20, 2024
b0e81ce
deps: bump vllm-tgis-adapter to 0.4.0 (#132)
vaibhavjainwiz Aug 21, 2024
fcd968c
Merge commit '09c7792610ada9f88bbf87d32b472dd44bf23cc2' into sync_vllm
vaibhavjainwiz Aug 26, 2024
7e5d11b
Merge pull request #134 from vaibhavjainwiz/sync_vllm
vaibhavjainwiz Aug 27, 2024
6cff676
Disable usage tracking
stevegrubb Aug 29, 2024
d7abf74
Merge pull request #137 from RH-steve-grubb/do_not_track
vaibhavjainwiz Sep 2, 2024
9281890
Updating ubi-tag e vllm-tgis-adapter
fialhocoelho Sep 4, 2024
21 changes: 0 additions & 21 deletions .github/workflows/add_label_automerge.yml

This file was deleted.

23 changes: 0 additions & 23 deletions .github/workflows/add_label_ready_comment.yml

This file was deleted.

21 changes: 0 additions & 21 deletions .github/workflows/reminder_comment.yml

This file was deleted.

84 changes: 84 additions & 0 deletions .github/workflows/sync-with-upstream.yml
@@ -0,0 +1,84 @@
name: "Sync with upstream"

on:
  schedule:
    - cron: "20 4 * * *"

  workflow_dispatch:


env:
  # repo to fetch changes from
  UPSTREAM_REPO: vllm-project/vllm
  # branch to sync
  BRANCH: main

jobs:
  upstream-sync:
    name: Sync with upstream
    runs-on: ubuntu-latest
    permissions:
      pull-requests: write
      contents: write

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Fetch upstream repo
        run: |
          git remote add upstream https://github.com/${UPSTREAM_REPO}
          git fetch upstream

      - name: Check diff
        id: diff
        shell: bash
        run: |
          echo 'diff<<EOF' >> $GITHUB_OUTPUT
          git diff --stat upstream/${BRANCH} | tee -a >(cat >> $GITHUB_OUTPUT)
          echo 'EOF' >> $GITHUB_OUTPUT

      - name: Create PR
        if: ${{ steps.diff.outputs.diff != '' }}
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          set -xeu

          git_hash="$(git rev-parse upstream/${BRANCH})"
          echo "git_hash=$git_hash" >> $GITHUB_OUTPUT
          git_describe="$(git describe --tags upstream/${BRANCH})"
          echo "git_describe=$git_describe" >> $GITHUB_OUTPUT

          # echo 'commits<<EOF' >> $GITHUB_OUTPUT
          # git log --oneline ..upstream/${BRANCH} >> $GITHUB_OUTPUT
          # echo 'EOF' >> $GITHUB_OUTPUT

          upstream_url="https://github.com/${UPSTREAM_REPO}"
          upstream_branch="$upstream_url/tree/${BRANCH}"

          title="Sync with upstream@${git_describe}"
          body="Merge [${UPSTREAM_REPO}]($upstream_url):[${BRANCH}]($upstream_branch)@[${git_describe}](${upstream_url}/commit/$git_hash) into $BRANCH"

          gh repo set-default $GITHUB_REPOSITORY
          pr_number=$(gh pr list -S "Sync with upstream@" --json number --jq '.[0].number')

          if [[ -z $pr_number ]]; then
            echo "Creating PR"
            gh pr create \
              --head $(echo $UPSTREAM_REPO | sed 's|/|:|g'):${BRANCH} \
              --base ${BRANCH} \
              --label code-sync \
              --title "$title" \
              --body "$body" \
              --draft \
              --no-maintainer-edit
            exit 0
          fi

          echo "Updating PR #${pr_number}"
          gh pr edit \
            $pr_number \
            --body "$body" \
            --title "$title"
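The "Check diff" step above publishes a multiline value via `$GITHUB_OUTPUT` using a heredoc-style `name<<EOF … EOF` delimiter. A minimal local sketch of that pattern, with `GITHUB_OUTPUT` pointed at a scratch file (an assumption; on a real runner the variable is pre-set) and a stand-in for the `git diff --stat` output:

```shell
# Emulate the multiline step-output pattern from the "Check diff" step.
GITHUB_OUTPUT="$(mktemp)"

{
  echo 'diff<<EOF'
  # stand-in for: git diff --stat upstream/${BRANCH}
  printf '%s\n' ' Dockerfile.ubi | 220 ++++' ' 1 file changed'
  echo 'EOF'
} >> "$GITHUB_OUTPUT"

cat "$GITHUB_OUTPUT"
```

The `EOF` delimiter is what lets the value span multiple lines; a plain `name=value` line would stop at the first newline, which is why the workflow brackets the diff with the two `echo` lines.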
220 changes: 220 additions & 0 deletions Dockerfile.ubi
@@ -0,0 +1,220 @@
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.4-1227
Review thread on BASE_UBI_IMAGE_TAG:

> This is not required; using 9.4 will automatically fetch the latest available version.

> @dtrifiro IMO it's best to use specific version tags. Otherwise you have no idea what was actually pulled: the image contents depend on when the build happened to run, and the result isn't reproducible.

> We want to be able to say clearly that this build is based on this specific UBI image, without having to dig through CI logs or inspect the internals of the built image.

> I understand the reproducibility concern, but unless we configure something like renovate (https://docs.renovatebot.com) to open PRs automatically whenever a new UBI image is available, I think it'd be better to avoid the overhead of bumping the version manually, since that's easy to forget.

> I created an issue and will start working on configuring renovate for this repo.
ARG PYTHON_VERSION=3.11

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
ARG PYTHON_VERSION

RUN microdnf install -y \
python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
&& microdnf clean all

WORKDIR /workspace

ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp
RUN microdnf install -y \
which procps findutils tar vim git \
&& microdnf clean all


## Python Installer ############################################################
FROM base as python-install

ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
RUN microdnf install -y \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all


## CUDA Base ###################################################################
FROM python-install as cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo

RUN microdnf install -y \
cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
microdnf clean all

ENV CUDA_HOME="/usr/local/cuda" \
PATH="${CUDA_HOME}/bin:${PATH}" \
LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"

## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install cuda and common dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
uv pip install \
-r requirements-cuda.txt


## Development #################################################################
FROM python-cuda-base AS dev

# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
--mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
--mount=type=bind,source=requirements-adag.txt,target=requirements-adag.txt \
--mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
uv pip install \
-r requirements-cuda.txt \
-r requirements-dev.txt

## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
uv pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all
# install build dependencies

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure the cuda environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

# Copy the entire directory before building wheel
COPY vllm vllm

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=.git,target=/workspace/.git \
env CFLAGS="-march=haswell" \
CXXFLAGS="$CFLAGS $CXXFLAGS" \
CMAKE_BUILD_TYPE=Release \
python3 setup.py bdist_wheel --dist-dir=dist

#################### libsodium Build IMAGE ####################
FROM base as libsodium-builder

RUN microdnf install -y gcc gzip \
&& microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION=1.0.20
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
&& tar -xzvf libsodium*.tar.gz \
&& rm -f libsodium*.tar.gz \
&& mv libsodium*/* ./

RUN ./configure --prefix="/usr/" && make && make check

## Release #####################################################################
FROM python-install AS vllm-openai
ARG PYTHON_VERSION

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin/:$PATH

# force using the python venv's cuda runtime libraries
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
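Each of the three ENV lines above prepends one directory, so the last one listed (nvtx) ends up first on the loader's search path, ahead of any system-wide CUDA libraries. A shell sketch of the resulting ordering, with the venv path and Python version taken from the ARGs above:

```shell
# Reproduce the three LD_LIBRARY_PATH prepends from the ENV lines.
VIRTUAL_ENV=/opt/vllm
PYTHON_VERSION=3.11
NV="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia"

LD_LIBRARY_PATH="${NV}/cuda_nvrtc/lib:${LD_LIBRARY_PATH:-}"
LD_LIBRARY_PATH="${NV}/cuda_runtime/lib:${LD_LIBRARY_PATH}"
LD_LIBRARY_PATH="${NV}/nvtx/lib:${LD_LIBRARY_PATH}"

# Search order is now: nvtx, cuda_runtime, cuda_nvrtc, then anything inherited.
echo "$LD_LIBRARY_PATH"
```

Because later prepends win, the pip-installed `nvidia-*` runtime libraries shadow any copies baked into the base image, which is the point of the "force using the python venv's cuda runtime libraries" comment.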

# Triton needs a CC compiler
RUN microdnf install -y gcc \
&& microdnf clean all

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install $(echo dist/*.whl)'[tensorizer]' --verbose
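The `$(echo dist/*.whl)'[tensorizer]'` idiom in the RUN line above exists because pip/uv accept an extras suffix on a concrete file path, but the shell glob has to be expanded first before the suffix is glued on. A small sketch of that expansion (the wheel filename is a made-up placeholder, not the real build artifact):

```shell
# Show why the wheel glob is expanded via $(echo ...) before appending
# the extras suffix. The wheel name here is a hypothetical stand-in.
demo_dist="$(mktemp -d)"
touch "${demo_dist}/vllm-0.0.0-cp311-cp311-linux_x86_64.whl"

spec="$(echo ${demo_dist}/*.whl)[tensorizer]"
echo "$spec"
```

Written as a single glob, `dist/*.whl[tensorizer]` would be treated as one pattern (with `[tensorizer]` parsed as a character class) and match nothing, so the two-step form is needed.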

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
cd /usr/src/libsodium \
&& make install

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp311-cp311-linux_x86_64.whl

ENV HF_HUB_OFFLINE=1 \
HOME=/home/vllm \
# Allow requested max length to exceed what is extracted from the
# config.json
# see: https://github.com/vllm-project/vllm/pull/7080
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
VLLM_USAGE_SOURCE=production-docker-image \
VLLM_WORKER_MULTIPROC_METHOD=fork \
VLLM_NO_USAGE_STATS=1

# setup non-root user for OpenShift
RUN umask 002 \
&& useradd --uid 2000 --gid 0 vllm \
&& chmod g+rwx $HOME /usr/src /workspace

COPY LICENSE /licenses/vllm.md

USER 2000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


FROM vllm-openai as vllm-grpc-adapter

USER root

RUN --mount=type=cache,target=/root/.cache/pip \
pip install vllm-tgis-adapter==0.4.1

ENV GRPC_PORT=8033 \
PORT=8000 \
# As an optimization, vLLM disables logprobs when using spec decoding by
# default, but this would be unexpected to users of a hosted model that
# happens to have spec decoding
# see: https://github.com/vllm-project/vllm/pull/6485
DISABLE_LOGPROBS_DURING_SPEC_DECODING=false

USER 2000
ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]