Commit b87d80e

Revert "Supporting llama int4 inference using AutoGPTQ in HPU (#166) (#1125)"

This reverts commit 2000967.
hsubramony committed Oct 1, 2024
1 parent ef442a4 commit b87d80e
Showing 3 changed files with 0 additions and 33 deletions.
24 changes: 0 additions & 24 deletions examples/text-generation/README.md
@@ -539,30 +539,6 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \

For more details see [documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html#using-fused-sdpa).

### Running with UINT4 using AutoGPTQ


Llama2-7b in UINT4 is enabled using the [AutoGPTQ fork](https://github.com/HabanaAI/AutoGPTQ), which provides quantization capabilities in PyTorch.
Currently, only UINT4 inference of pre-quantized models is supported.

You can run a *UINT4 quantized* model using AutoGPTQ with the argument `--gptq`.

Here is an example to run a quantized model on Llama2-7b `TheBloke/Llama-2-7b-Chat-GPTQ`:
```bash
python run_generation.py \
--attn_softmax_bf16 \
--model_name_or_path TheBloke/Llama-2-7b-Chat-GPTQ \
--use_hpu_graphs \
--limit_hpu_graphs \
--use_kv_cache \
--bucket_size 128 \
--bucket_internal \
--trim_logits \
--max_new_tokens 128 \
--batch_size 1 \
--bf16 \
--gptq
```

## Language Model Evaluation Harness

5 changes: 0 additions & 5 deletions examples/text-generation/run_generation.py
@@ -249,8 +249,6 @@ def setup_parser(parser):
help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)",
)

parser.add_argument("--gptq", action="store_true", help="Enable Quantization to 4 bit with AutoGPTQ")

parser.add_argument(
"--use_flash_attention",
action="store_true",
@@ -335,9 +333,6 @@ def setup_parser(parser):
        args.flash_attention_fast_softmax = True

    args.quant_config = os.getenv("QUANT_CONFIG", "")
    if args.quant_config and args.gptq:
        raise RuntimeError("Setting both quant_config and gptq is unsupported. ")

    if args.quant_config == "" and args.disk_offload:
        logger.warning(
            "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag."
4 changes: 0 additions & 4 deletions examples/text-generation/utils.py
@@ -241,10 +241,6 @@ def setup_model(args, model_dtype, model_kwargs, logger):
        from neural_compressor.torch.quantization import load

        model = load(model_name_or_path=args.model_name_or_path, format="huggingface", device="hpu", **model_kwargs)
    elif args.gptq:
        from transformers import GPTQConfig
        quantization_config = GPTQConfig(bits=4, use_exllama=False)
        model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs)
    else:
        if args.assistant_model is not None:
            assistant_model = AutoModelForCausalLM.from_pretrained(
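For reference, the reverted `--gptq` path in `utils.py` reduced to the snippet below: a minimal, standalone sketch of loading a pre-quantized GPTQ checkpoint through transformers' `GPTQConfig`. The model name and `bfloat16` dtype are taken from the README example above and are illustrative; the actual script also passed its own HPU-specific `model_kwargs`, and running this requires a GPTQ backend (on Gaudi, the HabanaAI AutoGPTQ fork) to be installed.

```python
# Illustrative sketch of the removed code path, not the script itself.
import torch
from transformers import AutoModelForCausalLM, GPTQConfig

# Pre-quantized UINT4 checkpoint from the README example (assumed here).
model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"

# Same config the removed code built: 4-bit weights, ExLlama kernels disabled.
quantization_config = GPTQConfig(bits=4, use_exllama=False)

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)
```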
