diff --git a/modules/training.py b/modules/training.py
index b0e02400..2830ba07 100644
--- a/modules/training.py
+++ b/modules/training.py
@@ -341,7 +341,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
 
     # Populate target_modules list with chosen X_proj modules. Llama-based models only atm, non-llama will revert to default behavior.
     def list_target_modules(model_id):
-        if model_id != "llama":
+        if model_id != "llama" and model_id != "mistral":
             return model_to_lora_modules[model_id]
 
         available_modules = {
@@ -517,7 +517,8 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
     # == Start prepping the model itself ==
     if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
         logger.info("Getting model ready")
-        prepare_model_for_kbit_training(shared.model)
+        if 'quantization_config' in shared.model.config.to_dict():
+            prepare_model_for_kbit_training(shared.model)
 
     # base model is now frozen and should not be reused for any other LoRA training than this one
     shared.model_dirty_from_training = True
@@ -615,7 +616,8 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
             warmup_steps=math.ceil(warmup_steps / gradient_accumulation_steps),
             num_train_epochs=epochs,
             learning_rate=actual_lr,
-            fp16=False if shared.args.cpu else True,
+            fp16=False if shared.args.cpu or shared.args.bf16 else True,
+            bf16=shared.args.bf16,
             optim=optimizer,
             logging_steps=2 if stop_at_loss > 0 else 5,
             evaluation_strategy="steps" if eval_data is not None else "no",
@@ -627,7 +629,7 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en:
             # TODO: Enable multi-device support
             ddp_find_unused_parameters=None,
             no_cuda=shared.args.cpu,
-            use_ipex=True if is_torch_xpu_available and not shared.args.cpu else False
+            use_ipex=True if is_torch_xpu_available() and not shared.args.cpu else False
         ),
         data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
         callbacks=list([Callbacks()])
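
For reference, a minimal standalone sketch of the quantization guard introduced in the second hunk, assuming a transformers model loaded elsewhere and peft installed; the helper name maybe_prepare_for_kbit is hypothetical and not part of the codebase:

# Sketch only: mirrors the 'quantization_config' check added above.
from peft import prepare_model_for_kbit_training


def maybe_prepare_for_kbit(model):
    # Models loaded through bitsandbytes (4-bit/8-bit) carry a
    # quantization_config entry in their config; only those need the k-bit
    # preparation step, which freezes the base weights and upcasts
    # norms/embeddings for training stability.
    if 'quantization_config' in model.config.to_dict():
        model = prepare_model_for_kbit_training(model)
    return model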