Skip to content

Commit

Permalink
Merge pull request #34 from coreweave/es/extras-expand-apex
Browse files Browse the repository at this point in the history
feat(torch-extras): Add `--distributed_*` and `--group_norm` to bundled Apex, fix CI on updates
  • Loading branch information
wbrown authored Aug 24, 2023
2 parents 4112cb1 + c5e7997 commit 39919ac
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 10 deletions.
6 changes: 1 addition & 5 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@ on:
tag-suffix:
required: false
type: string
large-runner:
required: false
type: boolean
default: false
outputs:
outcome:
description: "The outcome of the build"
Expand All @@ -33,7 +29,7 @@ on:
jobs:
build:
name: Build Images
runs-on: ${{ fromJSON(inputs.large-runner && '["self-hosted", "Linux", "chunky"]' || '["self-hosted", "Linux"]') }}
runs-on: [ self-hosted, Linux ]
outputs:
outcome: ${{ steps.docker-build.outcome }}
tags: ${{ steps.meta.outputs.tags }}
Expand Down
122 changes: 118 additions & 4 deletions .github/workflows/torch-extras.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,120 @@ on:
base-image:
required: true
type: string
skip-bases-check:
required: false
type: boolean
default: true

workflow_dispatch:
inputs:
tag:
required: true
required: false
description: "Tag suffix to identify the build"
type: string
base-image:
required: true
required: false
description: "Base image for the build"
type: string
skip-bases-check:
required: false
description: "Build from one specific image rather than the most recent releases from the main branch"
type: boolean
default: true

push:
paths:
- "torch-extras/**"
- ".github/workflows/torch-extras.yml"
- ".github/workflows/build.yml"


jobs:
build:
# Works out which already-published `torch` images torch-extras should be
# rebuilt on top of, and exposes them as a JSON array (`bases-list`) of
# {"tag": ..., "image": ...} objects for a downstream matrix build.
get-required-bases:
  # Runs unless skip-bases-check is explicitly true. The `inputs` context is
  # presumably empty on push events, so pushes take this path -- confirm
  # against the GitHub Actions contexts documentation.
  if: inputs.skip-bases-check != true
  runs-on: ["self-hosted", "Linux"]
  permissions:
    packages: read
  outputs:
    bases-list: ${{ steps.choose-bases.outputs.list }}
  steps:
    - uses: actions/checkout@v3
      with:
        # Full history, so both ends of the pushed range can be diffed below.
        fetch-depth: 0

    # Sets BASE_PROVIDED / NCCL_PROVIDED to 'true' when the pushed range also
    # touches files matching the torch base / torch nccl patterns below
    # (presumably because those workflows then call this one themselves with a
    # fresh base image -- confirm), and to 'false' otherwise, including for
    # every non-push event.
    - name: Check if torch-extras needs to be rebuilt from previous bases
      id: check-changed
      run: |
        BASE_PROVIDED=false
        NCCL_PROVIDED=false
        if [ "$EVENT_NAME" = 'push' ]; then
          # `before` is all zeros when a branch is first pushed and can be
          # unreachable after a force-push. In either case there is nothing
          # to diff against, so keep the "not provided" defaults (rebuild on
          # the latest published bases) instead of failing the whole job.
          if CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH" 2> /dev/null)"; then
            if echo "$CHANGED_FILES" \
              | grep -P '^(\./)?(torch/|\.github/workflows/torch(-base)?\.yml|\.github/workflows/build\.yml)' > /dev/null; then
              BASE_PROVIDED=true
            fi
            if echo "$CHANGED_FILES" \
              | grep -P '^(\./)?(torch/|\.github/workflows/torch(-nccl)?\.yml|\.github/workflows/build\.yml)' > /dev/null; then
              NCCL_PROVIDED=true
            fi
          else
            echo "::warning::Could not diff '$BEFORE_HASH'..'$AFTER_HASH'; assuming this push does not rebuild any torch bases"
          fi
        fi
        echo "BASE_PROVIDED=$BASE_PROVIDED" >> "$GITHUB_OUTPUT"
        echo "NCCL_PROVIDED=$NCCL_PROVIDED" >> "$GITHUB_OUTPUT"
      env:
        EVENT_NAME: ${{ github.event_name }}
        BEFORE_HASH: ${{ github.event.before }}
        AFTER_HASH: ${{ github.event.after }}

    # Lists the tags of ghcr.io/coreweave/ml-containers/torch, keeps those of
    # the form <7 hex chars>-base-* / <7 hex chars>-nccl-*, and outputs, per
    # flavour, every tag sharing the commit prefix of the last listed tag.
    # NOTE(review): `tail -1` treats the last listed tag as the newest; this
    # assumes the registry returns tags oldest-first -- TODO confirm.
    # NOTE(review): any grep that matches nothing returns non-zero and aborts
    # the `&&` chain, which presumably fails the step -- confirm that is the
    # intended behaviour when one flavour has no published tags.
    - name: Get latest torch container releases
      if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true'
      id: get-latest
      run: |
        RELEASES="$( \
          /bin/curl -f -s --oauth2-bearer "$(echo "$BEARER_TOKEN" | base64 -w 0)" \
          https://ghcr.io/v2/coreweave/ml-containers%2Ftorch/tags/list \
          | jq -r '.["tags"][]' \
          | grep -E '^[0-9a-f]{7}-(base|nccl)-' \
        )" && \
        BASE_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-base-')" && \
        NCCL_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-nccl-')" && \
        LATEST_BASE_COMMIT="$(echo "$BASE_RELEASES" | tail -1 | cut -c1-7)" && \
        LATEST_NCCL_COMMIT="$(echo "$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \
        LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \
        LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \
        echo "LATEST_BASE_IMAGES=$(echo $LATEST_BASE_IMAGES)" >> "$GITHUB_OUTPUT" && \
        echo "LATEST_NCCL_IMAGES=$(echo $LATEST_NCCL_IMAGES)" >> "$GITHUB_OUTPUT"
      env:
        BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    # Builds the `list` output: for each flavour that is not being provided by
    # this push, its latest tags are echoed into the step summary as a
    # Markdown list and converted to {"tag","image"} JSON objects, where "tag"
    # is the tag with its leading 7-character commit prefix and dash removed.
    # The unquoted `echo $VAR` in the previous step has already collapsed each
    # tag list onto one space-separated line; `xargs -n 1` splits it again.
    # NOTE(review): `\0` in the sed replacement (whole match) is a GNU sed
    # extension; the portable spelling is `&`.
    - name: Choose which torch containers to use as a build base
      if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true'
      id: choose-bases
      run: |
        TAG_PATTERN='^[0-9a-f]{7}-(.*)' && \
        JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}' && \
        TAG_TO_JSON() { sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; } && \
        SPLIT_TO_LINES() { xargs -n 1; } && \
        JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \
        echo '## Pre-existing `ghcr.io/coreweave/ml-containers/torch` images to build from' >> "$GITHUB_STEP_SUMMARY" && \
        echo "list=[$( \
          ( \
            if [ "$BASE_PROVIDED" = 'false' ]; then \
              echo "$LATEST_BASE_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \
              echo "$LATEST_BASE_IMAGES"; \
            fi && \
            if [ "$NCCL_PROVIDED" = 'false' ]; then \
              echo "$LATEST_NCCL_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \
              echo "$LATEST_NCCL_IMAGES"; \
            fi; \
          ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \
        )]" >> "$GITHUB_OUTPUT";
      env:
        BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }}
        NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }}
        LATEST_BASE_IMAGES: ${{ steps.get-latest.outputs.LATEST_BASE_IMAGES }}
        LATEST_NCCL_IMAGES: ${{ steps.get-latest.outputs.LATEST_NCCL_IMAGES }}

build-call:
if: inputs.skip-bases-check
strategy:
matrix:
flash-attn: [ 2.0.2, 1.0.9 ]
Expand All @@ -30,7 +129,22 @@ jobs:
image-name: torch-extras
folder: torch-extras
tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }}
large-runner: true
build-args: |
BASE_IMAGE=${{ inputs.base-image }}
FLASH_ATTN_VERSION=${{ matrix.flash-attn }}
# Rebuilds torch-extras on top of each pre-existing torch image selected by
# get-required-bases, once per flash-attn version in the matrix.
build-self:
  needs: get-required-bases
  # Skip when the bases list is missing/empty or is an empty JSON array.
  if: >-
    needs.get-required-bases.outputs.bases-list
    && needs.get-required-bases.outputs.bases-list != '[]'
  strategy:
    matrix:
      flash-attn:
        - "2.0.2"
        - "1.0.9"
      # Each entry is an object with `tag` and `image` fields (read below).
      bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }}
  uses: ./.github/workflows/build.yml
  with:
    image-name: torch-extras
    folder: torch-extras
    tag-suffix: ${{ matrix.bases.tag }}-flash_attn${{ matrix.flash-attn }}
    build-args: |
      BASE_IMAGE=${{ matrix.bases.image }}
      FLASH_ATTN_VERSION=${{ matrix.flash-attn }}
7 changes: 6 additions & 1 deletion torch-extras/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
ARG BASE_IMAGE
ARG DEEPSPEED_VERSION="0.9.4"
ARG FLASH_ATTN_VERSION="2.0.2"
ARG APEX_COMMIT="7b2e71b0d4013f8e2f9f1c8dd21980ff1d76f1b6"
ARG APEX_COMMIT="38a12698bc3cc95987bca270bcd6d025bb0be346"

FROM alpine/git:2.36.3 as flash-attn-downloader
WORKDIR /git
Expand Down Expand Up @@ -152,6 +152,8 @@ RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) &&
libnccl-dev=$LIBNCCL2_VERSION && \
apt-get clean

# --distributed_adam, --distributed_lamb, and --group_norm aren't documented
# in the Apex README, but are defined in its setup.py config.
RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
python3 -m pip install -U --no-cache-dir \
packaging setuptools wheel pip && \
Expand All @@ -162,9 +164,12 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
echo \
--cpp_ext \
--cuda_ext \
--distributed_adam \
--distributed_lamb \
--permutation_search \
--xentropy \
--focal_loss \
--group_norm \
--index_mul_2d \
--deprecated_fused_adam \
--deprecated_fused_lamb \
Expand Down

0 comments on commit 39919ac

Please sign in to comment.