Skip to content

Commit

Permalink
Add nvidia-driver to linux_job workflow
Browse files Browse the repository at this point in the history
test

test

test

test

test

test

test

test

test

test

test

test

test

More changes

test

test

test

test
  • Loading branch information
atalman committed Jul 18, 2024
1 parent 0d1423e commit 5edb9f3
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 2 deletions.
1 change: 0 additions & 1 deletion .github/actions/set-channel/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ name: Set Channel for Build Matrix Generation/Binary Build Step

description: Add CHANNEL to GITHUB_ENV


runs:
using: composite
steps:
Expand Down
14 changes: 13 additions & 1 deletion .github/actions/setup-linux/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@ name: Setup Linux

description: Set up Docker workspace on EC2

# Inputs accepted by the Setup Linux composite action.
inputs:
  nvidia-driver-version:
    # Optional pin for the NVIDIA driver. The empty default selects the step
    # that calls setup-nvidia without an explicit version.
    description: >-
      NVIDIA driver version to install. When empty, setup-nvidia is invoked
      without an explicit version.
    required: false
    default: ""

runs:
using: composite
steps:
Expand Down Expand Up @@ -77,8 +83,14 @@ runs:
echo "does=${needs}" >> $GITHUB_OUTPUT
# Exactly one of the two steps below runs when the runner needs an NVIDIA
# driver: the first when no version was requested, the second when the caller
# pinned one via the nvidia-driver-version input.
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
  # String literals inside ${{ }} must use single quotes; double quotes are
  # rejected by the GitHub Actions expression parser.
  if: ${{ steps.needs-nvidia-driver.outputs.does == 1 && inputs.nvidia-driver-version == '' }}
  uses: pytorch/test-infra/.github/actions/setup-nvidia@main

- name: Install nvidia driver specific version, nvidia-docker runtime, set GPU_FLAG
  # Single `if` key (a mapping may not repeat a key): still require that the
  # runner needs a driver, and additionally that a version was requested.
  if: ${{ steps.needs-nvidia-driver.outputs.does == 1 && inputs.nvidia-driver-version != '' }}
  uses: pytorch/test-infra/.github/actions/setup-nvidia@main
  with:
    # Must be an expression, otherwise the literal text
    # "inputs.nvidia-driver-version" is passed as the version.
    # NOTE(review): input name follows the `driver-version` input used by the
    # setup-nvidia test workflow - confirm against setup-nvidia/action.yml.
    driver-version: ${{ inputs.nvidia-driver-version }}

- name: Kill any existing containers, clean up images
shell: bash
Expand Down
5 changes: 5 additions & 0 deletions .github/actions/setup-nvidia/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ runs:
echo "Failed to get NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
# First Uninstall old driver
sudo nvidia-uninstall -s
else
HAS_NVIDIA_DRIVER=1
echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
Expand All @@ -104,6 +106,9 @@ runs:
# Download the driver installer selected by $DRIVER_FN to /tmp/nvidia_driver.
sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
# Do not abort on failure from here on; the installer's exit status is
# captured explicitly below.
set +e
# Force-kill any running nvidia processes before installing.
# `kill 9 <pids>` (no dash) would send SIGTERM to PID 9 and to the listed
# PIDs; SIGKILL needs `-9`. pkill also copes with "no matching process",
# hence the `|| true`.
sudo pkill -9 nvidia || true
# install nvidia driver
sudo /bin/bash /tmp/nvidia_driver -s --no-drm
NVIDIA_INSTALLATION_STATUS=$?
Expand Down
19 changes: 19 additions & 0 deletions .github/workflows/test-setup-nvidia.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,22 @@ jobs:

- name: Test that setup-nvidia works
uses: ./.github/actions/setup-nvidia
# Runs the local setup-nvidia action with an explicitly pinned driver version
# on a GPU runner.
test-minimal-version:
  name: Install NVIDIA driver on ${{ matrix.runner-type }}
  runs-on: ${{ matrix.runner-type }}
  timeout-minutes: 15
  strategy:
    # Let every matrix entry run to completion even if another one fails.
    fail-fast: false
    matrix:
      runner-type:
        - linux.g5.4xlarge.nvidia.gpu
  steps:
    - uses: actions/checkout@v3

    - name: Setup SSH
      uses: ./.github/actions/setup-ssh
      with:
        github-secret: ${{ github.token }}

    - name: Test that setup-nvidia works
      uses: ./.github/actions/setup-nvidia
      with:
        # Quoted so the version stays a string.
        driver-version: "470.161.03"

0 comments on commit 5edb9f3

Please sign in to comment.