diff --git a/.ci/docker/Dockerfile b/.ci/docker/Dockerfile
index 3d5f8ec7d69..cf3de769ea5 100644
--- a/.ci/docker/Dockerfile
+++ b/.ci/docker/Dockerfile
@@ -1,18 +1,52 @@
-FROM 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9:ba7f4e752ac8ffdec18f81381ed9c25d2962c074
+FROM ubuntu:20.04
 
-USER root
 ENV DEBIAN_FRONTEND noninteractive
 
-RUN apt-get update || sudo apt-get install libgnutls30
-RUN apt-get update
-RUN apt-get install -y --no-install-recommends unzip p7zip-full sox libsox-dev libsox-fmt-all rsync
+# Install common dependencies (so that this step can be cached separately)
+COPY ./install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
 
-COPY ./common.sh common.sh
-COPY ./requirements.txt /opt/conda
-RUN bash ./common.sh && rm common.sh /opt/conda/requirements.txt
+# Install user
+COPY ./install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
 
-COPY ./download_data.sh download_data.sh
-RUN bash download_data.sh && rm download_data.sh
+COPY ./install_docs_reqs.sh install_docs_reqs.sh
+RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
+
+# Install conda and other packages (e.g., numpy, pytest)
+# NB: CUDA_VERSION is declared here, before install_conda.sh runs, because that
+# script derives the magma-cuda package name from it.
+ENV CUDA_VERSION 12.1.1
+ENV ANACONDA_PYTHON_VERSION 3.10
+ENV CONDA_CMAKE yes
+ENV DOCS yes
+ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY ./requirements.txt /opt/conda/
+COPY ./install_conda.sh install_conda.sh
+COPY ./common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements.txt
+
+
+# Install gcc
+ENV GCC_VERSION 9
+COPY ./install_gcc.sh install_gcc.sh
+RUN bash ./install_gcc.sh && rm install_gcc.sh
+
+
+# Install cuda and cudnn
+# Download, run and delete the installer in a single layer so it never persists
+RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/install_cuda.sh -O install_cuda.sh \
+    && bash ./install_cuda.sh ${CUDA_VERSION} \
+    && rm install_cuda.sh
+ENV DESIRED_CUDA ${CUDA_VERSION}
+ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
+
+# Install CUDNN
+ENV CUDNN_VERSION 8
+COPY ./install_cudnn.sh install_cudnn.sh
+# Run and delete the script in the same layer
+RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash ./install_cudnn.sh; fi \
+    && rm install_cudnn.sh
 
 USER jenkins
 CMD ["bash"]
diff --git a/.ci/docker/common_utils.sh b/.ci/docker/common_utils.sh
new file mode 100644
index 00000000000..edfab470c42
--- /dev/null
+++ b/.ci/docker/common_utils.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Helpers sourced by install_conda.sh.
+# NB: every helper forwards its arguments with an unquoted $*, so they are
+# re-split on whitespace; individual arguments must not contain spaces.
+
+# Work around bug where devtoolset replaces sudo and breaks it.
+as_jenkins() {
+  # NB: unsetting the environment variables works around a conda bug
+  # https://github.com/conda/conda/issues/6576
+  # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
+  # NB: This must be run from a directory that jenkins has access to,
+  # works around https://github.com/conda/conda-package-handling/pull/34
+  sudo -E -H -u jenkins env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=$PATH" "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" $*
+}
+
+conda_install() {
+  # Ensure that the install command doesn't upgrade/downgrade Python
+  # This should be called as
+  #   conda_install pkg1 pkg2 ... [-c channel]
+  as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
+}
+
+# Run a command inside the py_$ANACONDA_PYTHON_VERSION conda env as jenkins
+conda_run() {
+  as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
+}
+
+# pip install into the py_$ANACONDA_PYTHON_VERSION conda env as jenkins
+pip_install() {
+  as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION pip install --progress-bar off $*
+}
diff --git a/.ci/docker/install_base.sh b/.ci/docker/install_base.sh
new file mode 100644
index 00000000000..e2ecfbb79a4
--- /dev/null
+++ b/.ci/docker/install_base.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker
+
+set -ex
+
+install_ubuntu() {
+  # Install common dependencies
+  apt-get update
+  # TODO: Some of these may not be necessary
+  # NB: Ubuntu 20.04 (the base image) ships CMake 3.16 as the `cmake` package.
+  # The version glob is quoted so the shell never expands it.
+  apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    "cmake=3.16*" \
+    curl \
+    gdb \
+    git \
+    jq \
+    libssl-dev \
+    rsync \
+    sudo \
+    unzip \
+    vim \
+    wget
+
+  # Cleanup package manager
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+}
+
+
+# Install base packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    install_ubuntu
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
diff --git a/.ci/docker/install_conda.sh b/.ci/docker/install_conda.sh
new file mode 100644
index 00000000000..93772616aa1
--- /dev/null
+++ b/.ci/docker/install_conda.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+set -ex
+
+# Optionally install conda
+if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
+  BASE_URL="https://repo.anaconda.com/miniconda"
+  CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
+
+  mkdir -p /opt/conda
+  chown jenkins:jenkins /opt/conda
+
+  source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+  pushd /tmp
+  wget -q "${BASE_URL}/${CONDA_FILE}"
+  # NB: Manually invoke bash per https://github.com/conda/conda/issues/10431
+  as_jenkins bash "${CONDA_FILE}" -b -f -p "/opt/conda"
+  # The installer is no longer needed; do not leave it in the image layer
+  rm -f "${CONDA_FILE}"
+  popd
+
+  # NB: Don't do this, rely on the rpath to get it right
+  #echo "/opt/conda/lib" > /etc/ld.so.conf.d/conda-python.conf
+  #ldconfig
+  sed -e 's|PATH="\(.*\)"|PATH="/opt/conda/bin:\1"|g' -i /etc/environment
+  export PATH="/opt/conda/bin:$PATH"
+
+  # Ensure we run conda in a directory that jenkins has write access to
+  pushd /opt/conda
+
+  # Prevent conda from updating to 4.14.0, which causes docker build failures
+  # See https://hud.pytorch.org/pytorch/pytorch/commit/754d7f05b6841e555cea5a4b2c505dd9e0baec1d
+  # Uncomment the below when resolved to track the latest conda update
+  # as_jenkins conda update -y -n base conda
+
+  # Install correct Python version
+  as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION"
+
+  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
+  # NOTE(review): CONDA_COMMON_DEPS is assigned but never passed to
+  # conda_install in this script -- confirm whether that is intentional.
+  CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
+
+  # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
+  # and libpython-static for torch deploy
+  conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"
+
+  # Use conda cmake in some cases. Conda cmake will be newer than our supported
+  # min version (3.5 for xenial and 3.10 for bionic), so we only do it in those
+  # following builds that we know should use conda. Specifically, Ubuntu bionic
+  # and focal cannot find conda mkl with stock cmake, so we need a cmake from conda
+  conda_install cmake
+
+  # Install magma matching the CUDA toolkit. The package suffix is the CUDA
+  # major and minor version without separators, e.g. 12.1.1 -> magma-cuda121.
+  # Skipped when CUDA_VERSION is unset, since no valid package name can be
+  # derived from an empty version.
+  if [ -n "$CUDA_VERSION" ]; then
+    CUDA_NO_FIRST_DOT="${CUDA_VERSION/./}"
+    conda_install "magma-cuda${CUDA_NO_FIRST_DOT%.*[0-9]}" -c pytorch
+  else
+    echo "CUDA_VERSION is not set; skipping magma-cuda"
+  fi
+
+  # Install some other packages, including those needed for Python test reporting
+  pip_install -r /opt/conda/requirements.txt
+
+  apt-get update
+  apt-get -y install expect-dev
+
+  # HACK HACK HACK
+  # gcc-9 for ubuntu-18.04 from http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu
+  # Pulls llibstdc++6 13.1.0-8ubuntu1~18.04 which is too new for conda
+  # So remove libstdc++6.so.3.29 installed by https://anaconda.org/anaconda/libstdcxx-ng/files?version=11.2.0
+  # Same is true for gcc-12 from Ubuntu-22.04
+  # NB: the pattern is quoted so the shell cannot glob-expand it against files
+  # in the current directory before grep sees it.
+  if grep -e '[12][82].04.[623]' /etc/issue >/dev/null; then
+    rm /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/libstdc++.so.6
+  fi
+
+  popd
+fi
diff --git a/.ci/docker/install_cudnn.sh b/.ci/docker/install_cudnn.sh
new file mode 100644
index 00000000000..f654c9fee24
--- /dev/null
+++ b/.ci/docker/install_cudnn.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Fail the image build if any step (download, extract, copy) fails.
+set -ex
+
+if [[ ${CUDNN_VERSION} == 8 ]]; then
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn
+    pushd tmp_cudnn
+    if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
+        CUDNN_URL="https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz"
+    elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
+        CUDNN_URL="https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz"
+    else
+        echo "Unsupported CUDA version ${CUDA_VERSION}" >&2
+        exit 1
+    fi
+
+    # -f makes curl exit non-zero on an HTTP error instead of saving the error page
+    curl --retry 3 -fOLs "${CUDNN_URL}"
+    tar xf "${CUDNN_NAME}.tar.xz"
+    cp -a "${CUDNN_NAME}"/include/* /usr/local/cuda/include/
+    cp -a "${CUDNN_NAME}"/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cudnn
+    ldconfig
+fi
diff --git a/.ci/docker/install_docs_reqs.sh b/.ci/docker/install_docs_reqs.sh
new file mode 100644
index 00000000000..b227ac3d2d8
--- /dev/null
+++ b/.ci/docker/install_docs_reqs.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker
+# NB: pipefail (plus curl -f) makes a failed download abort the build instead
+# of piping nothing, or an error page, into bash / apt-key.
+set -exo pipefail
+
+# This script already needs root for its bare apt-get calls, so sudo is not
+# used; presumably sudo's default env_reset would also hide DEBIAN_FRONTEND.
+
+apt-get update
+# Ignore error if gpg-agent doesn't exist (for Ubuntu 16.04)
+apt-get install -y gpg-agent || :
+
+curl --retry 3 -fsL https://deb.nodesource.com/setup_16.x | bash -
+apt-get install -y nodejs
+
+curl --retry 3 -fsS https://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add -
+echo "deb https://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list
+
+apt-get update
+apt-get install -y --no-install-recommends yarn
+yarn global add katex --prefix /usr/local
+
+apt-get -y install doxygen
+
+apt-get autoclean && apt-get clean
+rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/.ci/docker/install_gcc.sh b/.ci/docker/install_gcc.sh
new file mode 100644
index 00000000000..cdd9bcc15d5
--- /dev/null
+++ b/.ci/docker/install_gcc.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+set -ex
+
+# add-apt-repository is provided by software-properties-common, which
+# install_base.sh does not install, so make sure it is available first.
+apt-get update
+apt-get install -y software-properties-common
+
+# Need the official toolchain repo to get alternate packages
+# (-y: never wait for interactive confirmation during the image build)
+add-apt-repository -y ppa:ubuntu-toolchain-r/test
+apt-get update
+apt-get install -y "g++-${GCC_VERSION}"
+update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
+update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
+update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
+
+
+# Cleanup package manager
+apt-get autoclean && apt-get clean
+rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
diff --git a/.ci/docker/install_user.sh b/.ci/docker/install_user.sh
new file mode 100644
index 00000000000..974a7b12703
--- /dev/null
+++ b/.ci/docker/install_user.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Based off of https://github.com/pytorch/pytorch/tree/b52e0bf131a4e55cd987176f9c5a8d2ad6783b4f/.ci/docker
+set -ex
+
+# Mirror jenkins user in container
+# jenkins user as ec2-user should have the same user-id
+echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd
+echo "jenkins:x:1000:" >> /etc/group
+# Needed on focal or newer
+echo "jenkins:*:19110:0:99999:7:::" >>/etc/shadow
+
+# Create $HOME
+mkdir -p /var/lib/jenkins
+chown jenkins:jenkins /var/lib/jenkins
+mkdir -p /var/lib/jenkins/.ccache
+chown jenkins:jenkins /var/lib/jenkins/.ccache
+
+# Allow writing to /usr/local (for make install)
+chown jenkins:jenkins /usr/local
+
+# Allow sudo
+# TODO: Maybe we shouldn't
+echo 'jenkins ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/jenkins
+
+# Test that sudo works
+sudo -u jenkins sudo -v