
Commit 19ad9e7

Update example diff files
regisss committed Sep 25, 2024
1 parent c31dfab commit 19ad9e7
Showing 13 changed files with 102 additions and 65 deletions.
2 changes: 1 addition & 1 deletion tests/example_diff/run_audio_classification.txt
@@ -30,7 +30,7 @@
>
47,48c48,50
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
---
> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
> check_min_version("4.43.0")
2 changes: 1 addition & 1 deletion tests/example_diff/run_clip.txt
@@ -25,7 +25,7 @@
>
56,57c63,65
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
---
> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
> check_min_version("4.43.0")
2 changes: 1 addition & 1 deletion tests/example_diff/run_clm.txt
@@ -25,7 +25,7 @@
> from optimum.habana.utils import set_seed
57,58d52
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
60c54,60
< require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
---
143 changes: 90 additions & 53 deletions tests/example_diff/run_generation.txt
@@ -47,7 +47,7 @@
< from transformers.modeling_outputs import CausalLMOutputWithPast
---
> from optimum.habana.utils import get_hpu_memory_stats
60,188d42
60,216d42
< MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop
<
< MODEL_CLASSES = {
@@ -177,7 +177,7 @@
<
< return num_layer, num_head, num_embedding_size_per_head
<
190,285c44,46
<
< def generate_past_key_values(model, batch_size, seq_len):
< num_block_layers, num_attention_heads, num_embedding_size_per_head = sparse_model_config(model.config)
< if model.config.model_type == "bloom":
@@ -205,7 +205,7 @@
< for _ in range(num_block_layers)
< )
< return past_key_values
<
218,285c44,46
<
< def prepare_jit_inputs(inputs, model, tokenizer):
< batch_size = len(inputs)
@@ -318,7 +318,7 @@
> )
> parser.add_argument(
> "--dataset_name",
297,298c86,92
297,298c86,97
< required=True,
< help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
---
@@ -329,20 +329,20 @@
> default=None,
> type=str,
> help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.",
300,304d93
> )
> parser.add_argument(
> "--do_sample",
> action="store_true",
> help="Whether to use sampling for generation.",
300,304d98
<
< parser.add_argument("--prompt", type=str, default="")
< parser.add_argument("--length", type=int, default=20)
< parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped")
<
306c95,113
306c100,113
< "--temperature",
---
> "--do_sample",
> action="store_true",
> help="Whether to use sampling for generation.",
> )
> parser.add_argument(
> "--num_beams",
> default=1,
> type=int,
@@ -362,7 +362,7 @@
< help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
---
> help="Degeneration penalty for contrastive search. penalty_alpha > 0 enables contrastive search.",
312c118,245
312c118,250
< "--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2"
---
> "--trim_logits",
@@ -477,6 +477,11 @@
> help="Skip HPU Graph usage for first token to save memory",
> )
> parser.add_argument(
> "--show_graphs_count",
> action="store_true",
> help="Show statistics of HPU graph compilation.",
> )
> parser.add_argument(
> "--reuse_cache",
> action="store_true",
> help="Whether to reuse key/value cache for decoding. It should save memory.",
@@ -493,16 +498,16 @@
> "--reduce_recompile",
> action="store_true",
> help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)",
314,319d246
314,319d251
< parser.add_argument("--k", type=int, default=0)
< parser.add_argument("--p", type=float, default=0.9)
<
< parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.")
< parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.")
< parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")
321d247
321d252
< parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
323c249,269
323c254,264
< "--use_cpu",
---
> "--use_flash_attention",
@@ -516,33 +521,31 @@
> )
> parser.add_argument(
> "--flash_attention_causal_mask",
> action="store_true",
325c266
< help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
---
> help="Whether to enable Habana Flash Attention in causal mode on first token generation.",
> )
> parser.add_argument(
327d267
< parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
329c269
< "--fp16",
---
> "--flash_attention_fast_softmax",
> action="store_true",
331c271,295
< help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
---
> help="Whether to enable Habana Flash Attention in fast softmax mode.",
> )
> parser.add_argument(
> "--book_source",
325c271
< help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
---
> action="store_true",
> help="Whether to use project Guttenberg books data as input. Usefull for testing large sequence lenghts.",
327d272
< parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
329c274
< "--fp16",
---
> )
> parser.add_argument(
> "--torch_compile",
331c276
< help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
---
> action="store_true",
> help="Whether to use torch compiled model or not.",
333c278,314
< parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference")
---
> )
> parser.add_argument(
> "--ignore_eos",
> default=True,
@@ -556,7 +559,9 @@
> "--csp",
> type=str,
> help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.",
> )
333c297,324
< parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference")
---
> parser.add_argument(
> "--disk_offload",
> action="store_true",
@@ -579,25 +584,30 @@
> default="none",
> help="Run multi card with the specified parallel strategy. Choices are 'tp' for Tensor Parallel Strategy or 'none'.",
> )
> parser.add_argument(
> "--input_embeds",
> action="store_true",
> help="Whether to enable inputs_embeds or not.",
> )
>
336,337c317,318
336,337c327,328
< # Initialize the distributed state.
< distributed_state = PartialState(cpu=args.use_cpu)
---
> if args.torch_compile:
> args.use_hpu_graphs = False
339c320,321
339c330,331
< logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16}")
---
> if not args.use_hpu_graphs:
> args.limit_hpu_graphs = False
341,342c323,324
341,342c333,334
< if args.seed is not None:
< set_seed(args.seed)
---
> if args.use_flash_attention and not args.flash_attention_fast_softmax:
> args.flash_attention_fast_softmax = True
344,371c326,331
344,371c336,341
< # Initialize the model and tokenizer
< try:
< args.model_type = args.model_type.lower()
@@ -633,12 +643,12 @@
> "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag."
> )
> return args
373,376d332
373,376d342
< if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
< tokenizer_kwargs = {"add_space_before_punct_symbol": True}
< else:
< tokenizer_kwargs = {}
378,384c334,337
378,384c344,350
< encoded_prompt = tokenizer.encode(
< preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
< )
@@ -647,14 +657,26 @@
< encoded_prompt = tokenizer.encode(prefix + prompt_text, add_special_tokens=False, return_tensors="pt")
< encoded_prompt = encoded_prompt.to(distributed_state.device)
---
> def prepare_generation_embedding(model, model_name, input_tokens):
> batch_size = input_tokens["input_ids"].size(0)
>
> inputs_embeds = model.get_input_embeddings()(input_tokens["input_ids"])
>
> if inputs_embeds.size(0) != batch_size:
> inputs_embeds = inputs_embeds.expand(batch_size, -1, -1)
386,387c352,571
< if encoded_prompt.size()[-1] == 0:
< input_ids = None
---
> attention_mask = input_tokens["attention_mask"]
> return {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
>
>
> def main():
> parser = argparse.ArgumentParser()
> args = setup_parser(parser)
> model, assistant_model, tokenizer, generation_config = initialize_model(args, logger)
386,387c339,534
< if encoded_prompt.size()[-1] == 0:
< input_ids = None
---
>
> use_lazy_mode = True
> if args.torch_compile:
> use_lazy_mode = False
@@ -747,9 +769,22 @@
> for t in input_tokens:
> if torch.is_tensor(input_tokens[t]):
> input_tokens[t] = input_tokens[t].to(args.device)
>
> input_data = {}
> if args.input_embeds:
> inputs_embeds = prepare_generation_embedding(model, args.model_name_or_path, input_tokens)
> if inputs_embeds is not None:
> input_data.update(inputs_embeds)
> input_data.update(input_tokens)
> else:
> args.input_embeds = False
> input_data.update(input_tokens)
> else:
> input_data.update(input_tokens)
>
> iteration_times = []
> outputs = model.generate(
> **input_tokens,
> **input_data,
> generation_config=generation_config,
> assistant_model=assistant_model,
> lazy_mode=use_lazy_mode,
@@ -838,8 +873,10 @@
> with (output_dir / "results.json").open("w", encoding="utf-8") as f:
> json.dump(results, f, ensure_ascii=False, indent=4)
>
> stats = f"Throughput (including tokenization) = {throughput} tokens/second"
> stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}"
> stats = "Input embeds" if args.input_embeds else "Input tokens"
> stats = stats + f"\nThroughput (including tokenization) = {throughput} tokens/second"
> if args.show_graphs_count:
> stats = stats + f"\nNumber of HPU graphs = {count_hpu_graphs()}"
> separator = "-" * len(stats)
> print()
> print("Stats:")
@@ -851,7 +888,7 @@
> print(f"Graph compilation duration = {compilation_duration} seconds")
> print(separator)
> print()
389c536,553
389c573,590
< input_ids = encoded_prompt
---
> # Downloading and loading a dataset from the hub.
@@ -872,7 +909,7 @@
> .shuffle()
> .select(range(args.dataset_max_samples if args.dataset_max_samples > 0 else (raw_dataset[split]).num_rows))
> )
391,397c555,562
391,397c592,599
< if args.jit:
< jit_input_texts = ["enable jit"]
< jit_inputs = prepare_jit_inputs(jit_input_texts, model, tokenizer)
@@ -889,7 +926,7 @@
> logger.info(
> f"No column name was given so automatically choosing '{column_name}' for prompts. If you would like to use another column of the dataset, you can set the argument `--column_name`."
> )
399,437c564,584
399,437c601,621
< sig = inspect.signature(model.__call__)
< jit_inputs = tuple(jit_inputs[key] for key in sig.parameters if jit_inputs.get(key, None) is not None)
< traced_model = torch.jit.trace(model, jit_inputs, strict=False)
@@ -951,7 +988,7 @@
> preprocess_function,
> batched=True,
> desc="Running tokenizer on dataset",
438a586,668
438a623,705
> # After tokenization, we can remove the column of interest
> raw_dataset = raw_dataset.remove_columns([column_name])
> raw_dataset.set_format(type="torch")
@@ -1035,7 +1072,7 @@
>
> throughput = total_new_tokens_generated / duration
> # Print Stats
440,441c670,686
440,441c707,723
< generated_sequences.append(total_sequence)
< print(total_sequence)
---
@@ -1056,7 +1093,7 @@
> finalize_quantization(model)
> if args.const_serialization_path and os.path.isdir(args.const_serialization_path):
> import shutil
443c688
443c725
< return generated_sequences
---
> shutil.rmtree(args.const_serialization_path)
2 changes: 1 addition & 1 deletion tests/example_diff/run_glue.txt
@@ -21,7 +21,7 @@
> return ()
50,51c56,61
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
---
>
> logger = logging.getLogger(__name__)
2 changes: 1 addition & 1 deletion tests/example_diff/run_image_classification.txt
@@ -25,7 +25,7 @@
< """ Fine-tuning a 🤗 Transformers model for image classification"""
58,59c65,67
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
---
> # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks.
> check_min_version("4.43.0")
2 changes: 1 addition & 1 deletion tests/example_diff/run_mlm.txt
@@ -20,7 +20,7 @@
> from optimum.habana.utils import set_seed
56,57d51
< # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
< check_min_version("4.45.0.dev0")
< check_min_version("4.46.0.dev0")
59c53,59
< require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
---
