Commit

Merge remote-tracking branch 'upstream/main'
dchourasia committed Sep 21, 2024
2 parents 20d8000 + 926fb9b commit 80aed7a
Showing 10 changed files with 455 additions and 49 deletions.
17 changes: 14 additions & 3 deletions README.md
@@ -621,15 +621,26 @@ The list of configurations for various `fms_acceleration` plugins:
- [fused_ops_and_kernels](./tuning/config/acceleration_configs/fused_ops_and_kernels.py) (experimental):
- `--fused_lora`: fused lora for more efficient LoRA training.
- `--fast_kernels`: fast cross-entropy, rope, rms loss kernels.
- [attention_and_distributed_packing](./tuning/config/acceleration_configs/attention_and_distributed_packing.py) (experimental):
  - `--padding_free`: technique to process multiple examples in a single batch without adding padding tokens that waste compute.
  - `--multipack`: technique for *multi-gpu training* to balance out the number of tokens processed on each device, to minimize waiting time (a parsing sketch for these two flags follows this list).
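
The tests added in this commit exercise both of these flags through `transformers.HfArgumentParser`. Below is a minimal sketch of that parsing path, assuming the import path shown in the diff; combining both flags in a single parse is an assumption (the new tests parse them one at a time), though both are fields of the same dataclass.

```python
# Minimal sketch: parse the attention/packing flags into the dataclass used by
# the acceleration configs. Import path taken from the diff; combining both
# flags in one parse is an assumption (the new tests parse them separately).
import transformers

from tuning.config.acceleration_configs.attention_and_distributed_packing import (
    AttentionAndDistributedPackingConfig,
    MultiPack,
    PaddingFree,
)

parser = transformers.HfArgumentParser(
    dataclass_types=AttentionAndDistributedPackingConfig
)
(cfg,) = parser.parse_args_into_dataclasses(
    ["--padding_free", "huggingface", "--multipack", "16"]
)
assert isinstance(cfg.padding_free, PaddingFree)  # --padding_free huggingface
assert isinstance(cfg.multipack, MultiPack)       # --multipack 16
```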

Notes:
* `quantized_lora_config` requires that it be used along with LoRA tuning technique. See [LoRA tuning section](https://github.com/foundation-model-stack/fms-hf-tuning/tree/main?tab=readme-ov-file#lora-tuning-example) on the LoRA parameters to pass.
* When setting `--auto_gptq triton_v2`, note that you must also pass `--torch_dtype float16` and `--fp16`, or an exception will be raised. This is because these kernels only support this dtype.
* Currently, `fused_ops_and_kernels` is to be used together with QLoRA or GPTQ-LORA via the `quantized_lora_config`. In the future it may be made more flexible such that `fast_kernels` can even be used with full-finetuning.
* When using `fused_ops_and_kernels` together with `quantized_lora_config`,
make sure to appropriately set `--fused_lora auto_gptq True` or `bitsandbytes True`; the `True` sets `fast_lora==True`.
* Currently `fused_ops_and_kernels` only supports activating `fast_loss,fast_rsm_layernorm,fast_rope_embeddings` all to `True`, so pass `--fast_kernels True True True`.

* `fused_ops_and_kernels` works for full-finetuning, LoRA, QLoRA and GPTQ-LORA,
- pass `--fast_kernels True True True` for full finetuning/LoRA
- pass `--fast_kernels True True True --auto_gptq triton_v2 --fused_lora auto_gptq True` for GPTQ-LoRA
- pass `--fast_kernels True True True --bitsandbytes nf4 --fused_lora bitsandbytes True` for QLoRA
* Notes on Padding Free (a config-validation sketch follows this list)
  - works for both *single* and *multi-gpu* training.
  - works on both *pretokenized* and *untokenized* datasets.
- verified against the version found in HF main, merged in via PR https://github.com/huggingface/transformers/pull/31629.
* Notes on Multipack
- works only for *multi-gpu*.
- currently only includes the version of *multipack* optimized for linear attention implementations like *flash-attn*.
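
As the padding-free note above states, only the HuggingFace implementation is currently supported, and the new tests enforce this at config-construction time. A minimal sketch of that validation, assuming the import path shown in the diff:

```python
# Minimal sketch of the padding-free config validation exercised by the new
# tests: only the 'huggingface' method is accepted; anything else raises.
from tuning.config.acceleration_configs.attention_and_distributed_packing import (
    PaddingFree,
)

pf = PaddingFree(method="huggingface")  # accepted

try:
    PaddingFree(method="some-other-method")  # hypothetical invalid value
except ValueError as err:
    # Expected message (from the test):
    # "only 'huggingface' method currently supported."
    print(err)
```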

Set `TRANSFORMERS_VERBOSITY=info` to see the HuggingFace trainer printouts and verify that `AccelerationFramework` is activated!
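
The same effect can be had from Python by exporting the variable before `transformers` is imported; a minimal sketch (the training entry point itself is not shown in this diff):

```python
# Minimal sketch: raise the HuggingFace logging verbosity so the trainer
# printouts (including the AccelerationFramework activation message) are shown.
import os

# Set before importing transformers so the library reads it on first use.
os.environ["TRANSFORMERS_VERBOSITY"] = "info"

import transformers  # noqa: E402

print(transformers.logging.get_verbosity())  # 20 == INFO
```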

2 changes: 1 addition & 1 deletion tests/acceleration/spying_utils.py
@@ -36,7 +36,7 @@ def augmentation(

def get_callbacks_and_ready_for_train(self, *args, **kwargs):
spy["get_ready_for_train_calls"] += 1
return plugin_cls.get_callbacks_and_ready_for_train(self, args, **kwargs)
return plugin_cls.get_callbacks_and_ready_for_train(self, *args, **kwargs)

attributes = {
"model_loader": model_loader,
29 changes: 29 additions & 0 deletions tests/acceleration/test_acceleration_dataclasses.py
@@ -23,6 +23,11 @@
FusedOpsAndKernelsConfig,
QuantizedLoraConfig,
)
from tuning.config.acceleration_configs.attention_and_distributed_packing import (
AttentionAndDistributedPackingConfig,
MultiPack,
PaddingFree,
)
from tuning.config.acceleration_configs.fused_ops_and_kernels import (
FastKernelsConfig,
FusedLoraConfig,
@@ -65,6 +70,24 @@ def test_dataclass_parse_successfully():
assert cfg.auto_gptq is None
assert isinstance(cfg.bnb_qlora, BNBQLoraConfig)

    # 3. Specifying "--padding_free" will parse a PaddingFree class
parser = transformers.HfArgumentParser(
dataclass_types=AttentionAndDistributedPackingConfig
)
(cfg,) = parser.parse_args_into_dataclasses(
["--padding_free", "huggingface"],
)
assert isinstance(cfg.padding_free, PaddingFree)

    # 4. Specifying "--multipack" will parse a MultiPack class
parser = transformers.HfArgumentParser(
dataclass_types=AttentionAndDistributedPackingConfig
)
(cfg,) = parser.parse_args_into_dataclasses(
["--multipack", "16"],
)
assert isinstance(cfg.multipack, MultiPack)


def test_two_dataclasses_parse_successfully_together():
"""Ensure that the two dataclasses can parse arguments successfully
@@ -133,3 +156,9 @@ def test_dataclass_will_fail_to_accept_illegal_args():
ValueError, match="quant_type can only be either 'nf4' or 'fp4."
):
BNBQLoraConfig(quant_type="fake-quant-type")

# 3 padding-free plugin only supports huggingface models
with pytest.raises(
ValueError, match="only 'huggingface' method currently supported."
):
PaddingFree(method="invalid-method")
