
Commit 19ad9e7

Update example diff files
regisss committed Sep 25, 2024
1 parent c31dfab commit 19ad9e7
Showing 13 changed files with 102 additions and 65 deletions.
2 changes: 1 addition & 1 deletion tests/example_diff/run_audio_classification.txt
@@ -30,7 +30,7 @@
>
47,48c48,50
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
---
> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
> check_min_version("4.43.0")
2 changes: 1 addition & 1 deletion tests/example_diff/run_clip.txt
@@ -25,7 +25,7 @@
>
56,57c63,65
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
---
> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
> check_min_version("4.43.0")
2 changes: 1 addition & 1 deletion tests/example_diff/run_clm.txt
@@ -25,7 +25,7 @@
> from optimum.habana.utils import set_seed
57,58d52
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
60c54,60
< require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
---
143 changes: 90 additions & 53 deletions tests/example_diff/run_generation.txt
@@ -47,7 +47,7 @@
< from transformers.modeling_outputs import CausalLMOutputWithPast
---
> from optimum.habana.utils import get_hpu_memory_stats
60,188d42
60,216d42
< MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
<
< MODEL_CLASSES = {
@@ -177,7 +177,7 @@
<
< return num_layer, num_head, num_embedding_size_per_head
<
190,285c44,46
<
< def generate_past_key_values(model, batch_size, seq_len):
< num_block_layers, num_attention_heads, num_embedding_size_per_head = sparse_model_config(model.config)
< if model.config.model_type == "bloom":
@@ -205,7 +205,7 @@
< for _ in range(num_block_layers)
< )
< return past_key_values
<
218,285c44,46
<
< def prepare_jit_inputs(inputs, model, tokenizer):
< batch_size = len(inputs)
@@ -318,7 +318,7 @@
> )
> parser.add_argument(
> "--dataset_name",
297,298c86,92
297,298c86,97
< required=True,
< help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
---
@@ -329,20 +329,20 @@
> default=None,
> type=str,
> help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.",
300,304d93
> )
> parser.add_argument(
> "--do_sample",
> action="store_true",
> help="Whether to use sampling for generation.",
300,304d98
<
< parser.add_argument("--prompt", type=str, default="")
< parser.add_argument("--length", type=int, default=20)
< parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
<
306c95,113
306c100,113
< "--temperature",
---
> "--do_sample",
> action="store_true",
> help="Whether to use sampling for generation.",
> )
> parser.add_argument(
> "--num_beams",
> default=1,
> type=int,
@@ -362,7 +362,7 @@
< help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
---
> help="Degeneration penalty for contrastive search. penalty_alpha > 0 enables contrastive search.",
312c118,245
312c118,250
< "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
---
> "--trim_logits",
@@ -477,6 +477,11 @@
> help="Skip HPU Graph usage for first token to save memory",
> )
> parser.add_argument(
> "--show_graphs_count",
> action="store_true",
> help="Show statistics of HPU graph compilation.",
> )
> parser.add_argument(
> "--reuse_cache",
> action="store_true",
> help="Whether to reuse key/value cache for decoding. It should save memory.",
@@ -493,16 +498,16 @@
> "--reduce_recompile",
> action="store_true",
> help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)",
314,319d246
314,319d251
< parser.add_argument("--k", type=int, default=0)
< parser.add_argument("--p", type=float, default=0.9)
<
< parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.")
< parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.")
< parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")
321d247
321d252
< parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
323c249,269
323c254,264
< "--use_cpu",
---
> "--use_flash_attention",
@@ -516,33 +521,31 @@
> )
> parser.add_argument(
> "--flash_attention_causal_mask",
> action="store_true",
325c266
< help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
---
> help="Whether to enable Habana Flash Attention in causal mode on first token generation.",
> )
> parser.add_argument(
327d267
< parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
329c269
< "--fp16",
---
> "--flash_attention_fast_softmax",
> action="store_true",
331c271,295
< help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
---
> help="Whether to enable Habana Flash Attention in fast softmax mode.",
> )
> parser.add_argument(
> "--book_source",
325c271
< help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
---
> action="store_true",
> help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.",
327d272
< parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
329c274
< "--fp16",
---
> )
> parser.add_argument(
> "--torch_compile",
331c276
< help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
---
> action="store_true",
> help="Whether to use torch compiled model or not.",
333c278,314
< parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference")
---
> )
> parser.add_argument(
> "--ignore_eos",
> default=True,
@@ -556,7 +559,9 @@
> "--csp",
> type=str,
> help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.",
> )
333c297,324
< parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference")
---
> parser.add_argument(
> "--disk_offload",
> action="store_true",
@@ -579,25 +584,30 @@
> default="none",
> help="Run multi card with the specified parallel strategy. Choices are 'tp' for Tensor Parallel Strategy or 'none'.",
> )
> parser.add_argument(
> "--input_embeds",
> action="store_true",
> help="Whether to enable inputs_embeds or not.",
> )
>
336,337c317,318
336,337c327,328
< # Initialize the distributed state.
< distributed_state = PartialState(cpu=args.use_cpu)
---
> if args.torch_compile:
> args.use_hpu_graphs = False
339c320,321
339c330,331
< logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16}")
---
> if not args.use_hpu_graphs:
> args.limit_hpu_graphs = False
341,342c323,324
341,342c333,334
< if args.seed is not None:
< set_seed(args.seed)
---
> if args.use_flash_attention and not args.flash_attention_fast_softmax:
> args.flash_attention_fast_softmax = True
344,371c326,331
344,371c336,341
< # Initialize the model and tokenizer
< try:
< args.model_type = args.model_type.lower()
@@ -633,12 +643,12 @@
> "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag."
> )
> return args
373,376d332
373,376d342
< if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
< tokenizer_kwargs = {"add_space_before_punct_symbol": True}
< else:
< tokenizer_kwargs = {}
378,384c334,337
378,384c344,350
< encoded_prompt = tokenizer.encode(
< preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
< )
@@ -647,14 +657,26 @@
< encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=False, return_tensors="pt")
< encoded_prompt = encoded_prompt.to(distributed_state.device)
---
> def prepare_generation_embedding(model, model_name, input_tokens):
> batch_size = input_tokens["input_ids"].size(0)
>
> inputs_embeds = model.get_input_embeddings()(input_tokens["input_ids"])
>
> if inputs_embeds.size(0) != batch_size:
> inputs_embeds = inputs_embeds.expand(batch_size, -1, -1)
386,387c352,571
< if encoded_prompt.size()[-1] == 0:
< input_ids = None
---
> attention_mask = input_tokens["attention_mask"]
> return {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
>
>
> def main():
> parser = argparse.ArgumentParser()
> args = setup_parser(parser)
> model, assistant_model, tokenizer, generation_config = initialize_model(args, logger)
386,387c339,534
< if encoded_prompt.size()[-1] == 0:
< input_ids = None
---
>
> use_lazy_mode = True
> if args.torch_compile:
> use_lazy_mode = False
@@ -747,9 +769,22 @@
> for t in input_tokens:
> if torch.is_tensor(input_tokens[t]):
> input_tokens[t] = input_tokens[t].to(args.device)
>
> input_data = {}
> if args.input_embeds:
> inputs_embeds = prepare_generation_embedding(model, args.model_name_or_path, input_tokens)
> if inputs_embeds is not None:
> input_data.update(inputs_embeds)
> input_data.update(input_tokens)
> else:
> args.input_embeds = False
> input_data.update(input_tokens)
> else:
> input_data.update(input_tokens)
>
> iteration_times = []
> outputs = model.generate(
> **input_tokens,
> **input_data,
> generation_config=generation_config,
> assistant_model=assistant_model,
> lazy_mode=use_lazy_mode,
@@ -838,8 +873,10 @@
> with (output_dir / "results.json").open("w", encoding="utf-8") as f:
> json.dump(results, f, ensure_ascii=False, indent=4)
>
> stats = f"Throughput (including tokenization) = {throughput} tokens/second"
> stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}"
> stats = "Input embeds" if args.input_embeds else "Input tokens"
> stats = stats + f"\nThroughput (including tokenization) = {throughput} tokens/second"
> if args.show_graphs_count:
> stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}"
> separator = "-" * len(stats)
> print()
> print("Stats:")
@@ -851,7 +888,7 @@
> print(f"Graph compilation duration = {compilation_duration} seconds")
> print(separator)
> print()
389c536,553
389c573,590
< input_ids = encoded_prompt
---
> # Downloading and loading a dataset from the hub.
@@ -872,7 +909,7 @@
> .shuffle()
> .select(range(args.dataset_max_samples if args.dataset_max_samples > 0 else (raw_dataset[split]).num_rows))
> )
391,397c555,562
391,397c592,599
< if args.jit:
< jit_input_texts = ["enable jit"]
< jit_inputs = prepare_jit_inputs(jit_input_texts, model, tokenizer)
@@ -889,7 +926,7 @@
> logger.info(
> f"No column name was given so automatically choosing '{column_name}' for prompts. If you would like to use another column of the dataset, you can set the argument `--column_name`."
> )
399,437c564,584
399,437c601,621
< sig = inspect.signature(model.__call__)
< jit_inputs = tuple(jit_inputs[key] for key in sig.parameters if jit_inputs.get(key, None) is not None)
< traced_model = torch.jit.trace(model, jit_inputs, strict=False)
@@ -951,7 +988,7 @@
> preprocess_function,
> batched=True,
> desc="Running tokenizer on dataset",
438a586,668
438a623,705
> # After tokenization, we can remove the column of interest
> raw_dataset = raw_dataset.remove_columns([column_name])
> raw_dataset.set_format(type="torch")
@@ -1035,7 +1072,7 @@
>
> throughput = total_new_tokens_generated / duration
> # Print Stats
440,441c670,686
440,441c707,723
< generated_sequences.append(total_sequence)
< print(total_sequence)
---
@@ -1056,7 +1093,7 @@
> finalize_quantization(model)
> if args.const_serialization_path and os.path.isdir(args.const_serialization_path):
> import shutil
443c688
443c725
< return generated_sequences
---
> shutil.rmtree(args.const_serialization_path)
2 changes: 1 addition & 1 deletion tests/example_diff/run_glue.txt
@@ -21,7 +21,7 @@
> return ()
50,51c56,61
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
---
>
> logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion tests/example_diff/run_image_classification.txt
@@ -25,7 +25,7 @@
< """ Fine-tuning a 🤗 Transformers model for image classification"""
58,59c65,67
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
---
> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
> check_min_version("4.43.0")
2 changes: 1 addition & 1 deletion tests/example_diff/run_mlm.txt
@@ -20,7 +20,7 @@
> from optimum.habana.utils import set_seed
56,57d51
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
59c53,59
< require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
---
