Add Batch Optimization Scripts for Neuron Instances #500
Open: mattcjo wants to merge 24 commits into aws:main from mattcjo:batch-optimization-neuron
Commits (24)
af9fda0  Add python training script, requirements.txt (dependencies), and dock…
104fa93  Add github action to build bert-testing image on PR
477f672  Specify directory the BERT training image should be built in for the …
fb7d18f  Add default values and include in docker env for MASTER_ADDR and MAST…
b5aedc7  Slightly change env var value retrieval. Also ran a formatter to pret…
7f9480b  Update bert training dockerfile to include amazon specific packages f…
19613e1  Change Dockerfile.bert-training file name to just Dockerfile
974da50  Update git workflow to use new Dockerfile path since the name was upd…
5b4ae1a  Update Docker image to use Python version 3.10.12 and build from sour…
6bc3ef4  Merge remote-tracking branch 'upstream/main'
fa8d244  Remove extra line
f87ba65  Had been setting MASTER_ADDR and MASTER_PORT env vars twice. Removed …
7af6b13  Set each process to a GPU via local rank instead of overall rank
1a3ad52  Merge remote-tracking branch 'upstream/main'
1f5b1c9  Change comment describing section in dockerfile
b67026c  Merge branch 'aws:main' into main
4a8e0ec  parameterize number of gpus per node in Dockerfile and train.py
60ddc02  Merge remote-tracking branch 'upstream/main'
01d8270  formatting in train.py
21fd336  Merge remote-tracking branch 'upstream/main'
f250ede  Merge branch 'aws:main' into main
f000ec6  Add nvidia batch optimization scripts for both training and inference
21e27a0  Merge branch 'aws:main' into batch-optimization-neuron
7493cfd  Move Neuron scripts into neuron directory
Dockerfile (new file)
@@ -0,0 +1,73 @@
# Use Ubuntu 20.04 as the base image
FROM ubuntu:20.04

# Neuron SDK component versions
ARG NEURONX_FRAMEWORK_VERSION=2.11.0.0
ARG NEURONX_RUNTIME_LIB_VERSION=2.11.7.0
ARG NEURONX_TOOLS_VERSION=2.11.8.0
ARG NEURONX_CC_VERSION=2.11.8.0

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LD_LIBRARY_PATH="/opt/aws/neuron/lib:/usr/local/lib"
ENV PATH="/opt/aws/neuron/bin:$PATH"

# Install system dependencies, including libsqlite3-dev and libbz2-dev for the Python build
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    wget \
    zlib1g-dev \
    gnupg2 \
    libssl-dev \
    libffi-dev \
    libsqlite3-dev \
    libbz2-dev \
    libopenblas-dev \
    libomp5 \
    && rm -rf /var/lib/apt/lists/*

# Add the Neuron apt repository and install Neuron SDK components
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list && \
    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-NEURON.PUB | apt-key add - && \
    apt-get update && \
    apt-get install -y \
    aws-neuronx-tools=${NEURONX_TOOLS_VERSION} \
    aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION} \
    && rm -rf /var/lib/apt/lists/*

# Build and install Python 3.10.12 from source with sqlite3 and bz2 support
RUN wget -q https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
    tar -xzf Python-3.10.12.tgz && \
    cd Python-3.10.12 && \
    ./configure --enable-shared --enable-optimizations --with-ensurepip=install && \
    make -j $(nproc) && make install && \
    cd .. && rm -rf Python-3.10.12*

# Upgrade pip
RUN python3.10 -m pip install --upgrade pip

# Install Neuron-related Python packages from the Neuron pip repository
RUN python3.10 -m pip install --no-cache-dir \
    --extra-index-url https://pip.repos.neuron.amazonaws.com \
    torch-neuronx==${NEURONX_FRAMEWORK_VERSION} \
    torch-xla==1.13.* \
    torchvision

# Install additional Python packages
RUN python3.10 -m pip install --no-cache-dir \
    transformers==4.29 \
    numpy==1.23 \
    pynvml

# Set the working directory
WORKDIR /app

# Copy training and inference scripts
COPY train_bert_neuron.py /app/train_bert_neuron.py
COPY infer_bert_neuron.py /app/infer_bert_neuron.py
infer_bert_neuron.py (new file)
@@ -0,0 +1,82 @@
import os

# Unset XLA_FLAGS to avoid GPU-specific issues on Neuron
os.environ.pop('XLA_FLAGS', None)

import time
import torch
import torch_neuronx
from transformers import BertTokenizer, BertForPreTraining
from torch.utils.data import DataLoader, TensorDataset

def create_dummy_data(tokenizer, num_samples=1000, max_length=128):
    # Generate a synthetic dataset so no external data download is needed
    sentences = [
        f"This is a dummy sentence number {i}" for i in range(num_samples)
    ]
    tokenized_inputs = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    labels = tokenized_inputs.input_ids.detach().clone()
    next_sentence_labels = torch.randint(0, 2, (num_samples,))
    return TensorDataset(
        tokenized_inputs.input_ids,
        tokenized_inputs.attention_mask,
        labels,
        next_sentence_labels,
    )

def infer_bert_neuron(model, tokenizer, batch_sizes):
    dataset = create_dummy_data(tokenizer)
    results = []

    # Try batch sizes from largest to smallest; stop at the first that fits
    for batch_size in batch_sizes:
        try:
            dataloader = DataLoader(dataset, batch_size=batch_size)
            start_time = time.time()
            for batch in dataloader:
                inputs, masks, labels, next_sentence_labels = batch
                # A traced Neuron module takes CPU tensors, passed
                # positionally in the order used at trace time
                outputs = model(inputs, masks)
            end_time = time.time()
            inference_time = end_time - start_time
            throughput = len(dataset) / inference_time

            print(f"Batch Size: {batch_size}")
            print(f"Inference time: {inference_time:.2f} seconds")
            print(f"Throughput: {throughput:.2f} samples/second")

            results.append({
                'batch_size': batch_size,
                'throughput': throughput,
            })
            break  # Exit after the first successful batch size

        except RuntimeError as e:
            if 'out of memory' in str(e).lower():
                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
                torch.cuda.empty_cache()  # no-op on Neuron; retained from the GPU variant
                continue
            else:
                raise e

    print("Optimal Batch Size Found:")
    for res in results:
        print(f"Batch Size: {res['batch_size']}, Throughput: {res['throughput']:.2f} samples/sec")

def main():
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")

    # torch_neuronx.trace compiles the model for Neuron from CPU example
    # inputs; the traced graph is specialized to these exact shapes, so
    # sweeping batch sizes may require re-tracing with matching shapes
    example_inputs = (
        torch.randint(0, 2000, (1, 128)),
        torch.ones(1, 128, dtype=torch.long),
    )
    model_neuron = torch_neuronx.trace(model, example_inputs)

    batch_sizes = [128, 64, 32, 16, 8]
    infer_bert_neuron(model_neuron, tokenizer, batch_sizes)

if __name__ == "__main__":
    main()
train_bert_neuron.py (new file)
@@ -0,0 +1,103 @@
import os

# Unset XLA_FLAGS to avoid GPU-specific issues on Neuron
os.environ.pop('XLA_FLAGS', None)

import time
import torch
import torch_xla
import torch_xla.core.xla_model as xm
from transformers import BertForPreTraining, BertTokenizer
from torch.utils.data import DataLoader, TensorDataset

def create_dummy_data(tokenizer, num_samples=1000, max_length=128):
    # Generate a synthetic dataset so no external data download is needed
    sentences = [
        f"This is a dummy sentence number {i}" for i in range(num_samples)
    ]
    tokenized_inputs = tokenizer(
        sentences,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    labels = tokenized_inputs.input_ids.detach().clone()
    next_sentence_labels = torch.randint(0, 2, (num_samples,))
    return TensorDataset(
        tokenized_inputs.input_ids,
        tokenized_inputs.attention_mask,
        labels,
        next_sentence_labels,
    )

def train_bert_neuron(model, tokenizer, batch_sizes, device):
    model.train()
    model.to(device)

    dataset = create_dummy_data(tokenizer)
    results = []

    # Try batch sizes from largest to smallest; stop at the first that fits
    for batch_size in batch_sizes:
        try:
            train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

            # Measure training time for throughput calculation
            start_time = time.time()
            for batch in train_dataloader:
                optimizer.zero_grad()
                inputs, masks, labels, next_sentence_labels = batch
                inputs, masks, labels, next_sentence_labels = (
                    inputs.to(device),
                    masks.to(device),
                    labels.to(device),
                    next_sentence_labels.to(device),
                )
                outputs = model(
                    input_ids=inputs,
                    attention_mask=masks,
                    labels=labels,
                    next_sentence_label=next_sentence_labels,
                )
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                # XLA executes lazily; materialize each step's graph so the
                # work actually runs and the timing above is meaningful
                xm.mark_step()
            end_time = time.time()
            training_time = end_time - start_time
            throughput = len(dataset) / training_time

            print(f"Batch Size: {batch_size}")
            print(f"Training time: {training_time:.2f} seconds")
            print(f"Throughput: {throughput:.2f} samples/second")

            results.append({
                'batch_size': batch_size,
                'throughput': throughput,
            })
            break  # Exit after the first successful batch size

        except RuntimeError as e:
            if 'out of memory' in str(e).lower():
                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
                torch.cuda.empty_cache()  # no-op on Neuron; retained from the GPU variant
                continue
            else:
                raise e

    print("Optimal Batch Size Found:")
    for res in results:
        print(f"Batch Size: {res['batch_size']}, Throughput: {res['throughput']:.2f} samples/sec")

def main():
    device = xm.xla_device()

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForPreTraining.from_pretrained("bert-base-uncased")

    batch_sizes = [128, 64, 32, 16, 8]

    train_bert_neuron(model, tokenizer, batch_sizes, device)

if __name__ == "__main__":
    main()
Conversation
Reviewer: So this image supports inference and training for Neuron? Should we just put it under e2e2's images folder rather than hack? These Python scripts you could leave in /hack and then just volume mount them into the container.
mattcjo: Correct. Yeah, honestly I struggled with where to put these, and someone recommended hack a couple of weeks ago. The main use case right now is just to get the optimal batch size to support upcoming benchmarking efforts for our e2e tests. I could see it evolving in the future to being run automatically when certain dependencies are updated, or as new instance types become available.
Reviewer: So IIUC we can use the Neuron test for inference tuning, but you need an image for Neuron here that supports training as well? I'm trying to decouple the test image from the optimization suite/framework.
mattcjo: @ndbaker1 Use of the Dockerfile was just to make things more portable across instances as I did testing. Also, while it probably made no difference, there is slight overhead introduced by running in a container versus just a script. There are additional dependencies as well (e.g. the Neuron container runtime), which makes the optimization's environment closer to the tests' runtime environment.
mattcjo: @ndbaker1 @cartermckinnon Not sure I have a perfect answer for where these scripts/Dockerfile should go, but here's the full context:

- The training and inference tests that are part of e2e2 currently have suboptimal values for their batch parameter: a standard batch value is hardcoded for all of them, leaving many of an instance's GPUs underutilized.
- A major goal moving forward is to benchmark these tests on all instances and to understand what full peak performance looks like for each instance type.
- These new optimization scripts target a single GPU on an instance (even if the instance has multiple GPUs) and determine the maximum batch size that a GPU of that type can handle.
- The optimal batch value will then be used to determine the total batch size per instance (batch_size * num_gpus), enabling us to run benchmarking for each instance at full GPU utilization (like our customers would); see the sketch after this list.
- Separate training and inference scripts are needed because, depending on the "mode" of a model, more or less memory is utilized. The difference is significant: training requires large amounts of temporary values to be held in memory (gradients and optimizer state, as weights get updated during training), while inference does not (parameter values are static).
- The scripts were containerized to more closely mirror the tests' runtime environment of running on Kubernetes.
- A single Dockerfile was used for simplicity.
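A back-of-the-envelope sketch of those last points (a hypothetical illustration in Python; the helper names and numbers are examples, not code from this PR):

# Hypothetical sketch: why training tolerates smaller batches than
# inference, and how the per-instance batch follows from the single-device
# optimum. Activation memory, which grows with batch size, is omitted.

BYTES_PER_PARAM = 4  # fp32

def model_memory_bytes(num_params: int, training: bool) -> int:
    # Inference holds just the weights; training also holds gradients plus
    # two AdamW moments per parameter, roughly 4x the weights alone.
    multiplier = 4 if training else 1
    return num_params * BYTES_PER_PARAM * multiplier

def instance_batch_size(per_device_optimum: int, num_devices: int) -> int:
    # Every device runs the same per-device batch, so the effective
    # instance-level batch is batch_size * num_gpus.
    return per_device_optimum * num_devices

# BERT-base has roughly 110M parameters; suppose a single device tops out
# at a batch of 32 and the instance exposes 8 devices:
print(f"{model_memory_bytes(110_000_000, training=True) / 2**30:.1f} GiB")   # ~1.6 GiB
print(f"{model_memory_bytes(110_000_000, training=False) / 2**30:.1f} GiB")  # ~0.4 GiB
print(instance_batch_size(32, 8))  # 256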
Reviewer: Makes sense. Can we include this script in our existing test images so we don't need a separate pipeline for it? It will be easier to set up a periodic for this as well if it's all the same spec with a different command.
mattcjo: I like this; dependencies should be kept consistent anyway. Can't do this for Neuron yet, though: I'm just now noticing that the PR for Neuron BERT training/inference was closed and never merged. Will need to get that merged in first.