Hi, I received a loss of None when training the model. Can anyone help?
Simple reproduction Kaggle notebook: link
```python
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla as xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear
import torch_xla.distributed.parallel_loader as pl
import torch_xla.core.xla_env_vars as xenv
import torch_xla.debug.metrics as met
import torch_xla.distributed.spmd.xla_sharding as xs
from torch_xla.distributed.spmd.xla_sharding import Mesh
import torch_xla.runtime as xr
import re
from datasets import Dataset, load_dataset
import transformers
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
from transformers import logging as hf_logging

hf_logging.set_verbosity_error()
os.environ["PJRT_DEVICE"] = "TPU"


class CFG:
    NUM_EPOCHS = 1
    BATCH_SIZE = 24
    DROPOUT = 0.05
    MODEL_NAME = 'unsloth/Qwen2.5-7B-Instruct'
    SEED = 2024
    MAX_LENGTH = 4096
    NUM_WARMUP_STEPS = 128
    LR_MAX = 2e-4
    NUM_LABELS = 3
    LORA_RANK = 16
    LORA_ALPHA = 16
    LORA_MODULES = ['o_proj', 'v_proj', 'q_proj', 'k_proj']


FLAGS = {
    'MAX_INPUT': 64,
    'LOGGING_STEPS': 10,
    'NUM_EPOCHS': 3,
    'BATCH_SIZE': 24,
}
MAX_INPUT = 128
MODEL = "unsloth/Qwen2.5-7B-Instruct"


def get_dataset():
    tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'right'
    tokenizer.add_eos_token = True
    # save tokenizer to load offline during inference
    tokenizer.save_pretrained('tokenizer')

    max_seq_length = 4096
    tokenizer_x = AutoTokenizer.from_pretrained(CFG.MODEL_NAME, max_seq_length=max_seq_length)
    tokenizer_x.pad_token_id = tokenizer.eos_token_id

    df = datasets.load_dataset('stanfordnlp/imdb', split='train')
    # df = df['train']
    df = df.remove_columns(['label'])

    def preprocess(tasks, train_mode=True):
        return {"text": 'this is test'}

    df = df.map(preprocess, batched=False, remove_columns=df.column_names)
    print(df)

    def preprocess_function(example):
        x = tokenizer(example["text"], truncation=True, max_length=4096, padding='max_length')
        return {
            "input_ids": x.input_ids,
            "labels": 0,
            "attention_mask": x.attention_mask
        }

    data_train = df.map(preprocess_function, batched=False, num_proc=4).remove_columns(['text'])
    return data_train, tokenizer, FLAGS


##############################################################################################################################################
def train(data_train, tokenizer, FLAGS):
    # print('rank', rank)
    N_SAMPLES = len(data_train)
    STEPS_PER_EPOCH = N_SAMPLES // CFG.BATCH_SIZE
    METRICS = {'loss': [], 'accuracy': {'y_true': [], 'y_pred': []}}

    device = xm.xla_device()
    print('device', device)

    num_devices = xr.global_runtime_device_count()  # 8
    model_axis = 1
    mesh_shape = (1, num_devices // model_axis, model_axis)  # 2x4 on v3-8, 2x2 on v4-8
    device_ids = np.array(range(num_devices))
    mesh = Mesh(device_ids, mesh_shape, ('dcn', 'data', 'model'))
    print('world_size:', xm.xrt_world_size())

    rng = torch.Generator().manual_seed(42)
    training_loader = torch.utils.data.DataLoader(data_train,
                                                  batch_size=FLAGS['BATCH_SIZE'],
                                                  collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
                                                  # sampler=train_sampler,
                                                  drop_last=True,
                                                  generator=rng)
    sharding_spec = xs.ShardingSpec(mesh, (('dcn', 'data'), None))
    xla_train_loader = pl.MpDeviceLoader(training_loader,
                                         device=xm.xla_device(),
                                         input_sharding=sharding_spec,
                                         device_prefetch_size=16)

    base_model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16)
    base_model.config.pretraining_tp = 1
    tokenizer.pad_token = tokenizer.eos_token  # If pad_token is not set
    base_model.config.pad_token_id = tokenizer.pad_token_id  # Ensure the model respects the pad_token

    lora_config = LoraConfig(
        r=CFG.LORA_RANK,            # the dimension of the low-rank matrices
        lora_alpha=CFG.LORA_ALPHA,  # scaling factor for LoRA activations vs pre-trained weight activations
        lora_dropout=CFG.DROPOUT,
        bias='none',
        inference_mode=False,
        task_type='CAUSAL_LM',
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"])  # Only Use Output and Values Projection

    # Create LoRa Model
    model = get_peft_model(base_model, lora_config)
    model.print_trainable_parameters()

    model = apply_xla_patch_to_nn_linear(model, xs.xla_patched_nn_linear_forward)
    model = model.to(device)
    xm.master_print('Done load model')

    if hasattr(model, "tie_weights"):
        print('tie_weight')
        model.tie_weights()
    print(model)

    i = 0
    for name, param in model.named_parameters():
        i += 1
        if 1 > 0:
            if len(param.shape) == 1:
                continue
            if 'embed_tokens' in name:
                xs.mark_sharding(param, mesh, ('model', 'data'))
                # print('> [2D] Sharding tensor', name, param.shape)
            elif 'q_proj' in name or 'k_proj' in name or 'v_proj' in name:
                xs.mark_sharding(param, mesh, ('data', 'model'))
            elif 'o_proj' in name:
                xs.mark_sharding(param, mesh, ('model', 'data'))
            elif 'gate_proj' in name or 'up_proj' in name:
                xs.mark_sharding(param, mesh, ('model', 'data'))
            elif 'down_proj' in name:
                xs.mark_sharding(param, mesh, ('data', 'model'))
            elif 'lm_head' in name:
                xs.mark_sharding(param, mesh, ('model', 'data'))
            else:
                continue

    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.LR_MAX)
    # Cosine Learning Rate With Warmup
    lr_scheduler = transformers.get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=CFG.NUM_WARMUP_STEPS,
        num_training_steps=300 * CFG.NUM_EPOCHS)

    for epoch in range(1, FLAGS['NUM_EPOCHS'] + 1):
        s = time.time()
        for step, data in tqdm(enumerate(xla_train_loader)):
            with xla.step():
                if step % 200 == 0:
                    print(f'tracing {step=}')
                input_ids, labels, attention_mask = data.input_ids, data.labels, data.attention_mask
                optimizer.zero_grad()
                xm.master_print(input_ids)
                xm.master_print(attention_mask)
                xm.master_print(attention_mask.shape)  # (24, 4096)
                xm.master_print(input_ids.shape)       # (24, 4096)
                with xla.amp.autocast(xm.xla_device()):
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)  # Logits Float32
                    print(dir(outputs))
                    loss = outputs.loss  # None here
                xm.master_print(loss)
                loss.backward()
                lr_scheduler.step()
                xm.optimizer_step(optimizer)

        # If stopped, and to continue training in future on tpu we save model and optimizer
        xm.save({k: v.cpu() for k, v in model.named_parameters() if v.requires_grad},
                f'model_qwen2.5_cp_{epoch+1}_v1.pth')
        xm.save(optimizer.state_dict(), f'optimizer_qwen_2.5_cp_{epoch+1}_v1.pth')
        print(f'Model saved at epoch {epoch+1}')

    xm.rendezvous('init')
    return model


if __name__ == '__main__':
    print('Load')
    train_dataloader, tokenizer, FLAGS = get_dataset()
    import gc
    gc.collect()
    print(len(train_dataloader))
    xr.use_spmd()
    model = train(train_dataloader, tokenizer, FLAGS)
```
@JackCaoG Do you have any suggestions?
Hi @manh3152924,
This is because the labels are never passed as an argument to the model. In your code:
```python
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
```
Meanwhile, in the transformers repo you can see that the loss is only computed when `labels` is provided (https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2/modeling_qwen2.py#L1181-L1183):
```python
loss = None
if labels is not None:
    loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
```
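A minimal sketch of a fix, assuming you want the standard next-token language-modeling loss with padded positions ignored, is to derive `labels` from `input_ids` inside the training step and pass them to the forward call:

```python
# Sketch: build labels from input_ids and mask out padding with -100,
# the ignore index used by the model's internal cross-entropy loss.
labels = input_ids.clone()
labels[attention_mask == 0] = -100

with xla.amp.autocast(xm.xla_device()):
    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels)   # passing labels makes the model compute a loss
    loss = outputs.loss              # now a scalar tensor instead of None
```

Note that the `"labels": 0` value set in `preprocess_function` is a classification-style label; for causal LM training the labels tensor should have the same shape as `input_ids` (the model shifts it internally).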