Datamixer (for SFT) #187

Merged: 21 commits, Jul 24, 2024
29 changes: 29 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,29 @@
name: Tests

on:
push:
branches:
- main
- v*-release
pull_request:
branches:
- main

jobs:

unit-tests:
name: Run unit tests
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Setup Python environment
uses: actions/setup-python@v2
with:
python-version: 3.10.10
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pytest torch datasets transformers==4.40.2
- name: Run unit tests
run: pytest open_instruct/test_utils.py
12 changes: 10 additions & 2 deletions configs/train_configs/sft/default.yaml
@@ -4,7 +4,14 @@ use_flash_attn: true
tokenizer_name: allenai/OLMo-1.7-1B-hf
use_slow_tokenizer: false # olmo models only use fast tokenizers
# train_file: data/processed/tulu_v2/tulu_v2_data.jsonl
dataset_name: allenai/tulu-v2-sft-mixture
# dataset_name: allenai/tulu-v2-sft-mixture
dataset_mixer:
allenai/tulu-v2-sft-mixture: 0.5
HuggingFaceH4/no_robots: 0.8
# data/processed/tulu_v2/tulu_v2_filtered_data.jsonl: 0.1
# dataset_mixer:
# allenai/tulu-v2-sft-mixture: 100000
# HuggingFaceH4/no_robots: 5000
max_seq_length: 4096
max_train_samples: 10000
preprocessing_num_workers: 128
@@ -20,4 +27,5 @@ with_tracking: true
report_to:
- wandb
logging_steps: 1
add_bos: true # needed for GPTNeoX olmo models
add_bos: true # needed for GPTNeoX olmo models
dataset_mix_dir: null
27 changes: 27 additions & 0 deletions configs/train_configs/sft/olmo_7b_17_remix_sft.yaml
@@ -0,0 +1,27 @@
# Note, the exact model was trained on TPUs in a different repo
# Example file for mixing, use exact tulu 2 mixture for replication allenai/tulu-v2-sft-mixture
model_name_or_path: allenai/OLMo-1.7-7B-hf
model_revision: main
use_flash_attn: true
tokenizer_name: allenai/OLMo-1.7-7B-hf
use_slow_tokenizer: true
dataset_mixer:
allenai/wildjailbreak: 262000
/net/nfs.cirrascale/mosaic/oe-safety-datasets/vanilla_harmful_v.03/sampled_data/gpt-3.5-turbo_synthetic_harm_v03.jsonl: 21876
/net/nfs.cirrascale/mosaic/oe-safety-datasets/wildchat_lmsys_sexual/gpt4_lmsys_wildchat_dedup_50ksampled.jsonl: 16888
allenai/tulu-v2-sft-mixture: 326154
Comment on lines +11 to +12
Collaborator: Maybe it's helpful to specify the percentage instead of the absolute value?
Collaborator Author: Percentage also works!

max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 16 # effective batch size 128 for tulu 2
learning_rate: 2.0e-05
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 3
output_dir: output/OLMo-1.7-7B-safe-remix
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
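
A quick aside on the batch-size comments in these configs: the "effective batch size 128 for tulu 2" note is just the per-device batch size times the GPU count times the gradient-accumulation steps. A minimal sketch of that arithmetic, assuming the 8-GPU setup the comments mention (not code from this PR):

# effective batch size implied by the config comments above (assumed 8 GPUs)
per_device_train_batch_size = 1
num_gpus = 8
gradient_accumulation_steps = 16
effective_batch_size = per_device_train_batch_size * num_gpus * gradient_accumulation_steps
assert effective_batch_size == 128
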
22 changes: 22 additions & 0 deletions configs/train_configs/sft/olmo_7b_17_safe_sft.yaml
@@ -0,0 +1,22 @@
model_name_or_path: allenai/OLMo-1.7-7B-hf
model_revision: main
use_flash_attn: true
tokenizer_name: allenai/OLMo-1.7-7B-hf
use_slow_tokenizer: false # olmo models only use fast tokenizers
dataset_name: /net/nfs.cirrascale/mosaic/liweij/auto_jailbreak/data/safety_training_data/v3/mixtures/tulu2mix-all-vani_b-50000-vani_h-50000-adv_b-50000-adv_h-50000.jsonl
max_seq_length: 2048
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 16
learning_rate: 2.0e-06
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 3
output_dir: output/olmo_17_safe_sft/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
add_bos: true
2 changes: 1 addition & 1 deletion configs/train_configs/sft/olmo_7b_17_sft.yaml
@@ -3,7 +3,7 @@ model_revision: main
use_flash_attn: true
tokenizer_name: allenai/OLMo-1.7-7B-hf
use_slow_tokenizer: false # olmo models only use fast tokenizers
dataset_name: allenai/tulu-v2-sft-mixture
dataset_name: allenai/tulu-v2-sft-mixture-olmo-4096
max_seq_length: 4096
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
33 changes: 33 additions & 0 deletions configs/train_configs/sft/tulu2_7b_remix.yaml
@@ -0,0 +1,33 @@
# Note, the exact model was trained on TPUs in a different repo
# Example file for mixing, use exact tulu 2 mixture for replication allenai/tulu-v2-sft-mixture
model_name_or_path: meta-llama/Llama-2-7b
model_revision: main
use_flash_attn: true
tokenizer_name: meta-llama/Llama-2-7b
use_slow_tokenizer: true
dataset_mixer:
natolambert/tulu-v2-sft-mixture-flan: 50000
natolambert/tulu-v2-sft-mixture-cot: 49747
allenai/openassistant-guanaco-reformatted: 7708 # not exact subset
Vtuber-plan/sharegpt-cleaned: 114046 # original https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered
vicgalle/alpaca-gpt4: 20000
HuggingFaceH4/CodeAlpaca_20K: 18000 # original uses https://github.com/sahil280114/codealpaca
natolambert/tulu-v2-sft-mixture-lima: 1018 # original has 1030
WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000
Open-Orca/OpenOrca: 30000
natolambert/tulu-v2-sft-mixture-science: 7468 # original data slightly different
max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3
preprocessing_num_workers: 128
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
gradient_accumulation_steps: 16 # effective batch size 128 for tulu 2
learning_rate: 2.0e-05
lr_scheduler_type: linear
warmup_ratio: 0.03
weight_decay: 0.0
num_train_epochs: 3
output_dir: output/tulu_v2_7b/
with_tracking: true
report_to:
- wandb
logging_steps: 1
checkpointing_steps: epoch
4 changes: 2 additions & 2 deletions open_instruct/__init__.py
@@ -1,3 +1,3 @@
from .utils import ArgumentParserPlus, FlatArguments
from .utils import ArgumentParserPlus, FlatArguments, get_datasets

All = [FlatArguments, ArgumentParserPlus]
All = [FlatArguments, ArgumentParserPlus, get_datasets]
12 changes: 11 additions & 1 deletion open_instruct/finetune.py
@@ -46,7 +46,7 @@
get_scheduler,
)

from open_instruct.utils import ArgumentParserPlus, FlatArguments
from open_instruct.utils import ArgumentParserPlus, FlatArguments, get_datasets

logger = get_logger(__name__)

@@ -222,6 +222,16 @@ def main():
args.dataset_name,
args.dataset_config_name,
)
elif args.dataset_mixer is not None:
# mixing datasets via config
raw_datasets = get_datasets(
args.dataset_mixer,
configs=args.dataset_config_name,
splits=["train"],
save_data_dir=args.dataset_mix_dir,
columns_to_keep=["messages"],
need_columns=["messages"],
)
else:
data_files = {}
dataset_args = {}
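
For intuition about what the new dataset_mixer branch above does, here is a minimal sketch of the mixing semantics the configs and tests in this PR imply: values at or below 1 are read as fractions of a dataset's train split, larger values as absolute example counts, and only the requested columns are kept. This is an illustration, not the PR's get_datasets implementation; the helper name mix_train_splits is hypothetical.

# Hypothetical sketch of dataset mixing; not the actual get_datasets code.
from datasets import load_dataset, concatenate_datasets

def mix_train_splits(dataset_mixer, columns_to_keep=("messages",)):
    parts = []
    for name, frac_or_count in dataset_mixer.items():
        if frac_or_count < 0:
            raise ValueError("Dataset fractions / lengths cannot be negative.")
        ds = load_dataset(name, split="train")
        # values <= 1 act as fractions, larger values as absolute example counts
        n = int(frac_or_count * len(ds)) if frac_or_count <= 1 else int(frac_or_count)
        ds = ds.select(range(min(n, len(ds))))
        ds = ds.remove_columns([c for c in ds.column_names if c not in columns_to_keep])
        parts.append(ds)
    return concatenate_datasets(parts)
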
45 changes: 45 additions & 0 deletions open_instruct/mix_data.py
@@ -0,0 +1,45 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2024 AllenAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# script for mixing and saving data
from .utils import ArgumentParserPlus, FlatArguments, get_datasets

# Run as module for local imports, e.g.:
# python -m open_instruct.mix_data configs/train_configs/sft/default.yaml --save_data_dir=output/tmp/


def main():
parser = ArgumentParserPlus((FlatArguments))
args = parser.parse()

# assert that dataset_mixer is not None in the config
assert args.dataset_mixer is not None, "dataset_mixer is required in config"

raw_datasets = get_datasets(
args.dataset_mixer,
configs=args.dataset_config_name,
splits=["train"],
save_data_dir=args.dataset_mix_dir, # location where dataset is saved as json
columns_to_keep=["messages"],
)

# print first 5 samples of dataset
for i in range(5):
print(raw_datasets["train"][i])


if __name__ == "__main__":
main()
100 changes: 100 additions & 0 deletions open_instruct/test_utils.py
@@ -0,0 +1,100 @@
# coding=utf-8
# Copyright 2024 AllenAI Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copied from https://github.com/huggingface/alignment-handbook/blob/main/tests/test_data.py
import unittest

import pytest

from open_instruct.utils import FlatArguments, get_datasets


class GetDatasetsTest(unittest.TestCase):
"""Each of these test datasets has 100 examples"""

def assert_args_type(self):
dataset_mixer = {
"HuggingFaceH4/testing_alpaca_small": 0.5,
"HuggingFaceH4/testing_self_instruct_small": 0.3,
"HuggingFaceH4/testing_codealpaca_small": 0.2,
}
_ = FlatArguments(dataset_mixer, columns_to_keep=["prompt", "completion"])

def test_loading_data_args(self):
dataset_mixer = {
"HuggingFaceH4/testing_alpaca_small": 0.5,
"HuggingFaceH4/testing_self_instruct_small": 0.3,
"HuggingFaceH4/testing_codealpaca_small": 0.2,
}
datasets = get_datasets(dataset_mixer, columns_to_keep=["prompt", "completion"])
self.assertEqual(len(datasets["train"]), 100)
self.assertEqual(len(datasets["test"]), 300)

def test_loading_with_unit_fractions(self):
dataset_mixer = {
"HuggingFaceH4/testing_alpaca_small": 1.0,
"HuggingFaceH4/testing_self_instruct_small": 1.0,
"HuggingFaceH4/testing_codealpaca_small": 1.0,
}
datasets = get_datasets(dataset_mixer, columns_to_keep=["prompt", "completion"])
self.assertEqual(len(datasets["train"]), 300)
self.assertEqual(len(datasets["test"]), 300)

def test_loading_with_fractions_greater_than_unity(self):
dataset_mixer = {
"HuggingFaceH4/testing_alpaca_small": 0.7,
"HuggingFaceH4/testing_self_instruct_small": 0.4,
}
datasets = get_datasets(dataset_mixer, columns_to_keep=["prompt", "completion"])
self.assertEqual(len(datasets["train"]), 70 + 40)
self.assertEqual(len(datasets["test"]), 200)

def test_loading_fails_with_negative_fractions(self):
dataset_mixer = {
"HuggingFaceH4/testing_alpaca_small": 0.7,
"HuggingFaceH4/testing_self_instruct_small": -0.3,
}
with pytest.raises(ValueError, match=r"Dataset fractions / lengths cannot be negative."):
get_datasets(dataset_mixer, columns_to_keep=["prompt", "completion"])

def test_loading_single_split_with_unit_fractions(self):
dataset_mixer = {
"HuggingFaceH4/testing_alpaca_small": 1.0,
}
datasets = get_datasets(dataset_mixer, splits=["test"], columns_to_keep=["prompt", "completion"])
self.assertEqual(len(datasets["test"]), 100)
self.assertRaises(KeyError, lambda: datasets["train"])


# useful for checking if public datasets are still available
# class CheckTuluDatasetsTest(unittest.TestCase):
# """
# Try to rebuild Tulu from public sources
# """

# def test_loading_tulu(self):
# dataset_mixer = {
# "natolambert/tulu-v2-sft-mixture-flan": 50000,
# "natolambert/tulu-v2-sft-mixture-cot": 49747,
# "allenai/openassistant-guanaco-reformatted": 7708, # not exact subset
# "Vtuber-plan/sharegpt-cleaned": 114046,
# # original https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered
# "vicgalle/alpaca-gpt4": 20000,
# "HuggingFaceH4/CodeAlpaca_20K": 18000, # original uses https://github.com/sahil280114/codealpaca
# "natolambert/tulu-v2-sft-mixture-lima": 1018, # original has 1030
# "WizardLMTeam/WizardLM_evol_instruct_V2_196k": 30000,
# "Open-Orca/OpenOrca": 30000,
# "natolambert/tulu-v2-sft-mixture-science": 7468, # original data slightly different
# }
# _ = get_datasets(dataset_mixer, splits=["train"], columns_to_keep=["messages"])
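
As a quick local sanity check mirroring the unit tests above (a sketch, assuming the repo is installed and the 100-example HuggingFaceH4/testing_* datasets are reachable):

from open_instruct.utils import get_datasets

datasets = get_datasets(
    {
        "HuggingFaceH4/testing_alpaca_small": 0.5,
        "HuggingFaceH4/testing_codealpaca_small": 1.0,
    },
    columns_to_keep=["prompt", "completion"],
)
print(len(datasets["train"]))  # 50 + 100 = 150 under the fraction semantics tested above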