diff --git a/.github/actions/set-channel/action.yml b/.github/actions/set-channel/action.yml
index 608f2cb16d..26872c18d9 100644
--- a/.github/actions/set-channel/action.yml
+++ b/.github/actions/set-channel/action.yml
@@ -2,7 +2,6 @@ name: Set Channel for Build Matrix Generation/Binary Build Step
 
 description: Add CHANNEL to GITHUB_ENV
 
-
 runs:
   using: composite
   steps:
diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml
index 0ef1ea3c22..69e6561a43 100644
--- a/.github/actions/setup-linux/action.yml
+++ b/.github/actions/setup-linux/action.yml
@@ -2,6 +2,12 @@ name: Setup Linux
 
 description: Set up Docker workspace on EC2
 
+inputs:
+  nvidia-driver-version:
+    description: If set will install required nvidia driver
+    required: false
+    default: ""
+
 runs:
   using: composite
   steps:
@@ -77,8 +83,14 @@ runs:
         echo "does=${needs}" >> $GITHUB_OUTPUT
 
     - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
+      if: ${{ steps.needs-nvidia-driver.outputs.does == 1 && inputs.nvidia-driver-version == '' }}
+      uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+
+    - name: Install nvidia driver specific version, nvidia-docker runtime, set GPU_FLAG
+      if: ${{ inputs.nvidia-driver-version != '' }}
       uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-      if: ${{ steps.needs-nvidia-driver.outputs.does == 1 }}
+      with:
+        driver-version: ${{ inputs.nvidia-driver-version }}
 
     - name: Kill any existing containers, clean up images
       shell: bash
diff --git a/.github/actions/setup-nvidia/action.yml b/.github/actions/setup-nvidia/action.yml
index c23fa7f560..8e01900009 100644
--- a/.github/actions/setup-nvidia/action.yml
+++ b/.github/actions/setup-nvidia/action.yml
@@ -85,6 +85,8 @@ runs:
                           echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
                       elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
                           echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
+                          # Version mismatch: silently (-s) uninstall the old driver before installing the expected one
+                          sudo nvidia-uninstall -s
                       else
                           HAS_NVIDIA_DRIVER=1
                           echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
@@ -104,6 +106,9 @@ runs:
                       sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
 
                       set +e
+                      # SIGKILL any running processes matching "nvidia"; pkill exits non-zero when none match, which is fine
+                      sudo pkill -KILL nvidia || true
+
                       # install nvidia driver
                       sudo /bin/bash /tmp/nvidia_driver -s --no-drm
                       NVIDIA_INSTALLATION_STATUS=$?
diff --git a/.github/workflows/test-setup-nvidia.yml b/.github/workflows/test-setup-nvidia.yml
index 565063d09c..11d1499ac0 100644
--- a/.github/workflows/test-setup-nvidia.yml
+++ b/.github/workflows/test-setup-nvidia.yml
@@ -24,3 +24,22 @@ jobs:
       - name: Test that setup-nvidia works
         uses: ./.github/actions/setup-nvidia
 
+  test-minimal-version:
+    strategy:
+      fail-fast: false
+      matrix:
+        runner-type:
+          - linux.g5.4xlarge.nvidia.gpu
+    name: Install NVIDIA driver on ${{ matrix.runner-type }}
+    runs-on: ${{ matrix.runner-type }}
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@v3
+      - name: Setup SSH
+        uses: ./.github/actions/setup-ssh
+        with:
+          github-secret: ${{ github.token }}
+      - name: Test that setup-nvidia works
+        uses: ./.github/actions/setup-nvidia
+        with:
+          driver-version: "470.161.03"