CI #21

Open · wants to merge 8 commits into main

2 changes: 1 addition & 1 deletion .ci/docker/requirements.txt
@@ -6,4 +6,4 @@ sentencepiece
 tiktoken
 blobfile
 tabulate
-transformers
+transformers
43 changes: 0 additions & 43 deletions .github/workflows/integration_test_4gpu.yaml

This file was deleted.

41 changes: 0 additions & 41 deletions .github/workflows/integration_test_8gpu.yaml

This file was deleted.

37 changes: 0 additions & 37 deletions .github/workflows/lint.yaml

This file was deleted.

1 change: 1 addition & 0 deletions .gitignore
@@ -18,3 +18,4 @@ torchtitan/datasets/**/*.model
 *.log
 error.json
 _remote_module_non_scriptable.py
+!.git/hooks/pre-commit
26 changes: 26 additions & 0 deletions .hooks/pre-commit
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+# Check if this script is symlinked to .git/hooks/pre-commit
+if [ "$(readlink -f "$0")" != "$(readlink -f "$(git rev-parse --git-dir)/hooks/pre-commit")" ]; then
+    echo "Error: pre-commit hook is not installed correctly."
+    echo "Please run the install_hooks.sh script in the repository root."
+    exit 1
+fi
+
+# Check if pre-commit is installed
+if ! command -v pre-commit >/dev/null 2>&1; then
+    echo "Error: pre-commit is not installed."
+    echo "Please install pre-commit using: pip install pre-commit"
+    echo "Then set up the pre-commit hooks using: pre-commit install"
+    exit 1
+fi
+
+# Check if pre-commit-msg is installed
+if ! grep -q "pre-commit-msg" "$(git rev-parse --git-dir)/hooks/commit-msg" 2>/dev/null; then
+    echo "Error: pre-commit-msg hook is not installed."
+    echo "Please set up the pre-commit-msg hook using: pre-commit install --hook-type commit-msg"
+    exit 1
+fi
+
+# If both are installed, run pre-commit
+exec pre-commit
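Taken together, the three checks above encode a one-time setup flow. A minimal sketch of that flow, assembled from the hook's own error messages (assuming it is run from the repository root):

```sh
sh install_hooks.sh                         # symlink .hooks/pre-commit into .git/hooks/
pip install pre-commit                      # provide the pre-commit CLI
pre-commit install --hook-type commit-msg   # install the commit-msg stage checked above
```

One caveat worth noting: a plain `pre-commit install`, as the second error message suggests, would replace the symlink created by install_hooks.sh with pre-commit's own generated hook, so the symlink check at the top would fail again.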
50 changes: 24 additions & 26 deletions .pre-commit-config.yaml
@@ -13,39 +13,37 @@ repos:
   - id: no-commit-to-branch
     args: ['--branch=main']
   - id: check-added-large-files
-    args: ['--maxkb=500']
+    args: ['--maxkb=5000']
   - id: end-of-file-fixer
     exclude: '^(.*\.svg)$'
 
-- repo: https://github.com/Lucas-C/pre-commit-hooks
-  rev: v1.5.4
-  hooks:
-  - id: insert-license
-    files: \.py$
-    args:
-    - --license-filepath
-    - docs/license_header.txt
+- repo: https://github.com/commitizen-tools/commitizen
+  rev: v3.29.0
+  hooks:
+  - id: commitizen
+    stages: [commit-msg]
 
-- repo: https://github.com/pycqa/flake8
-  rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
-  hooks:
-  - id: flake8
-    additional_dependencies:
-    - flake8-bugbear == 22.4.25
-    - pep8-naming == 0.12.1
-    - torchfix
-    args: ['--config=.flake8']
+- repo: https://github.com/pycqa/flake8
+  rev: 7.1.1
+  hooks:
+  - id: flake8
+    additional_dependencies:
+    - flake8-bugbear == 22.4.25
+    - pep8-naming == 0.12.1
+    - torchfix
+    args: ['--config=.flake8']
 
-- repo: https://github.com/omnilib/ufmt
-  rev: v2.3.0
-  hooks:
-  - id: ufmt
-    additional_dependencies:
-    - black == 22.12.0
-    - usort == 1.0.5
+- repo: https://github.com/omnilib/ufmt
+  rev: v2.7.2
+  hooks:
+  - id: ufmt
+    additional_dependencies:
+    - black == 22.12.0
+    - usort == 1.0.5
 
 - repo: https://github.com/jsh9/pydoclint
-  rev: d88180a8632bb1602a4d81344085cf320f288c5a
+  rev: 0.5.6
   hooks:
   - id: pydoclint
     args: [--config=pyproject.toml]
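As a quick local check of the re-pinned hooks, the standard pre-commit CLI can run them outside of a commit (hook ids as configured above):

```sh
pre-commit run --all-files          # run every configured hook against the whole tree
pre-commit run flake8 --all-files   # or exercise a single hook, e.g. the new flake8 pin
```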
2 changes: 1 addition & 1 deletion estimation.py
@@ -15,12 +15,12 @@
 from torch.testing._internal.distributed.fake_pg import FakeStore
 
 from torchtitan.config_manager import JobConfig
-from torchtitan.tokenizers.tokenizer import build_tokenizer
 from torchtitan.float8 import Float8Handler
 from torchtitan.logging import init_logger, logger
 from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config
 from torchtitan.optimizer import build_lr_schedulers, build_optimizers
 from torchtitan.parallelisms import models_parallelize_fns, ParallelDims
+from torchtitan.tokenizers.tokenizer import build_tokenizer
 from train import get_train_context
 
 
11 changes: 11 additions & 0 deletions install_hooks.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+# Get the .git directory
+GIT_DIR=$(git rev-parse --git-dir)
+
+# Create symbolic link for pre-commit hook
+ln -sf "../../.hooks/pre-commit" "${GIT_DIR}/hooks/pre-commit"
+
+echo "Git hooks installed successfully."
+echo "Please ensure you have pre-commit installed: pip install pre-commit"
+echo "Then run: pre-commit install && pre-commit install --hook-type commit-msg"
45 changes: 29 additions & 16 deletions submitit_train.py
@@ -1,31 +1,44 @@
 import submitit
-import datetime
-import yaml
-import os
 
 
 if __name__ == "__main__":
     executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j")
     executor.update_parameters(
-        name="titan", timeout_min=15,
+        name="titan",
+        timeout_min=15,
         gpus_per_node=2,
-        nodes=1, mem_gb=30, cpus_per_task=10,
-        slurm_array_parallelism=10
+        nodes=1,
+        mem_gb=30,
+        cpus_per_task=10,
+        slurm_array_parallelism=10,
     )
 
     jobs = []
     with executor.batch():
         for _ in range(1):
-            function = submitit.helpers.CommandFunction([
-                'python3', '-m', 'torch.distributed.run',
-                '--nproc_per_node', '2',
-                '--rdzv_backend', 'c10d',
-                '--rdzv_endpoint', 'localhost:0',
-                '--local-ranks-filter', '0',
-                '--role', 'rank', '--tee', '3',
-                'train.py', '--job.config_file', './train_configs/galactica_125m.toml',
-            ])
-            print(' '.join(function.command))
+            function = submitit.helpers.CommandFunction(
+                [
+                    "python3",
+                    "-m",
+                    "torch.distributed.run",
+                    "--nproc_per_node",
+                    "2",
+                    "--rdzv_backend",
+                    "c10d",
+                    "--rdzv_endpoint",
+                    "localhost:0",
+                    "--local-ranks-filter",
+                    "0",
+                    "--role",
+                    "rank",
+                    "--tee",
+                    "3",
+                    "train.py",
+                    "--job.config_file",
+                    "./train_configs/galactica_125m.toml",
+                ]
+            )
+            print(" ".join(function.command))
             # subprocess.run(function.command)
             job = executor.submit(function)
             jobs.append(job)
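For reference, the argv list that CommandFunction assembles above flattens to the following command, which should also be runnable directly, assuming two local GPUs and the galactica_125m config are available:

```sh
python3 -m torch.distributed.run \
    --nproc_per_node 2 \
    --rdzv_backend c10d \
    --rdzv_endpoint localhost:0 \
    --local-ranks-filter 0 \
    --role rank \
    --tee 3 \
    train.py --job.config_file ./train_configs/galactica_125m.toml
```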
4 changes: 3 additions & 1 deletion test/datasets/test_checkpoint.py
@@ -42,7 +42,9 @@ def _build_dataloader(
         self, dataset_name, dataset_path, batch_size, seq_len, world_size, rank
     ):
         tokenizer_type = "tiktoken"
-        tokenizer = build_tokenizer("tiktoken", "./torchtitan/tokenizers/chemlactica-125m")
+        tokenizer = build_tokenizer(
+            "tiktoken", "./torchtitan/tokenizers/chemlactica-125m"
+        )
         return build_hf_data_loader(
             dataset_name=dataset_name,
             dataset_path=dataset_path,
22 changes: 12 additions & 10 deletions torchtitan/aim.py
@@ -7,7 +7,7 @@
 from aim.sdk.utils import clean_repo_path, get_aim_repo_name
 
 
-class AimLogger():
+class AimLogger:
     def __init__(
         self,
         repo: Optional[str] = None,
@@ -17,9 +17,9 @@ def __init__(
         capture_terminal_logs: Optional[bool] = True,
         run_name: Optional[str] = None,
         run_hash: Optional[str] = None,
-        train_metric_prefix: Optional[str] = 'train_',
-        val_metric_prefix: Optional[str] = 'val_',
-        test_metric_prefix: Optional[str] = 'test_',
+        train_metric_prefix: Optional[str] = "train_",
+        val_metric_prefix: Optional[str] = "val_",
+        test_metric_prefix: Optional[str] = "test_",
     ):
         super().__init__()
 
@@ -64,23 +64,25 @@ def experiment(self) -> Run:
 
     def log_hyperparams(self, params: Dict[str, Any]):
         for key, value in params.items():
-            self.experiment.set(('hparams', key), value, strict=False)
+            self.experiment.set(("hparams", key), value, strict=False)
 
     def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None):
 
-        metric_items: Dict[str:Any] = {k: v for k, v in metrics.items()}  # for modifications to metric_items without affecting the original metrics
+        metric_items: Dict[str:Any] = {
+            k: v for k, v in metrics.items()
+        }  # for modifications to metric_items without affecting the original metrics
         for k, v in metric_items.items():
             name = k
             context = {}
             if self._train_metric_prefix and name.startswith(self._train_metric_prefix):
                 name = name[len(self._train_metric_prefix) :]
-                context['subset'] = 'train'
+                context["subset"] = "train"
             elif self._test_metric_prefix and name.startswith(self._test_metric_prefix):
                 name = name[len(self._test_metric_prefix) :]
-                context['subset'] = 'test'
+                context["subset"] = "test"
            elif self._val_metric_prefix and name.startswith(self._val_metric_prefix):
                 name = name[len(self._val_metric_prefix) :]
-                context['subset'] = 'val'
+                context["subset"] = "val"
             self.experiment.track(v, name=name, step=step, context=context)
 
     def finalize(self) -> None:
@@ -103,4 +105,4 @@ def name(self) -> str:
 
     @property
     def version(self) -> str:
-        return self.experiment.hash
+        return self.experiment.hash