Revert "Remove GPTQ-for-LLaMa monkey patch support"

This reverts commit e3d3565b2a.
This commit is contained in:
oobabooga 2023-08-10 08:39:41 -07:00
parent 16e2b117b4
commit c7f52bbdc1
6 changed files with 103 additions and 0 deletions

View File

@ -279,6 +279,7 @@ Optionally, you can use the following command-line flags:
| `--groupsize GROUPSIZE` | Group size. | | `--groupsize GROUPSIZE` | Group size. |
| `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. | | `--pre_layer PRE_LAYER [PRE_LAYER ...]` | The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg `--pre_layer 30 60`. |
| `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. | | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. |
| `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. |
#### DeepSpeed #### DeepSpeed

View File

@ -157,4 +157,31 @@ Output generated in 123.79 seconds (1.61 tokens/s, 199 tokens)
You can also use multiple GPUs with `pre_layer` if using the oobabooga fork of GPTQ, eg `--pre_layer 30 60` will load a LLaMA-30B model half onto your first GPU and half onto your second, or `--pre_layer 20 40` will load 20 layers onto GPU-0, 20 layers onto GPU-1, and 20 layers offloaded to CPU. You can also use multiple GPUs with `pre_layer` if using the oobabooga fork of GPTQ, eg `--pre_layer 30 60` will load a LLaMA-30B model half onto your first GPU and half onto your second, or `--pre_layer 20 40` will load 20 layers onto GPU-0, 20 layers onto GPU-1, and 20 layers offloaded to CPU.
### Using LoRAs with GPTQ-for-LLaMa
This requires using a monkey patch that is supported by this web UI: https://github.com/johnsmith0031/alpaca_lora_4bit
To use it:
1. Clone `johnsmith0031/alpaca_lora_4bit` into the repositories folder:
```
cd text-generation-webui/repositories
git clone https://github.com/johnsmith0031/alpaca_lora_4bit
```
⚠️ I have tested it with the following commit specifically: `2f704b93c961bf202937b10aac9322b092afdce0`
2. Install https://github.com/sterlind/GPTQ-for-LLaMa with this command:
```
pip install git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
```
3. Start the UI with the `--monkey-patch` flag:
```
python server.py --model llama-7b-4bit-128g --listen --lora tloen_alpaca-lora-7b --monkey-patch
```

View File

@ -11,6 +11,7 @@ This is the current state of LoRA integration in the web UI:
| Transformers | Full support in 16-bit, `--load-in-8bit`, `--load-in-4bit`, and CPU modes. | | Transformers | Full support in 16-bit, `--load-in-8bit`, `--load-in-4bit`, and CPU modes. |
| ExLlama | Single LoRA support. Fast to remove the LoRA afterwards. | | ExLlama | Single LoRA support. Fast to remove the LoRA afterwards. |
| AutoGPTQ | Single LoRA support. Removing the LoRA requires reloading the entire model.| | AutoGPTQ | Single LoRA support. Removing the LoRA requires reloading the entire model.|
| GPTQ-for-LLaMa | Full support with the [monkey patch](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#using-loras-with-gptq-for-llama). |
## Downloading a LoRA ## Downloading a LoRA

View File

@ -131,6 +131,14 @@ So, in effect, Loss is a balancing game: you want to get it low enough that it u
Note: if you see Loss start at or suddenly jump to exactly `0`, it is likely something has gone wrong in your training process (eg model corruption). Note: if you see Loss start at or suddenly jump to exactly `0`, it is likely something has gone wrong in your training process (eg model corruption).
## Note: 4-Bit Monkeypatch
The [4-bit LoRA monkeypatch](GPTQ-models-(4-bit-mode).md#using-loras-with-gptq-for-llama) works for training, but has side effects:
- VRAM usage is higher currently. You can reduce the `Micro Batch Size` to `1` to compensate.
- Models do funky things. LoRAs apply themselves, refuse to apply, spontaneously error out, and so on. It can be helpful to reload the base model or restart the WebUI between training/usage to minimize the chances of anything going haywire.
- Loading or working with multiple LoRAs at the same time doesn't currently work.
- Generally, recognize and treat the monkeypatch as the dirty temporary hack it is - it works, but isn't very stable. It will get better in time when everything is merged upstream for full official support.
## Legacy notes ## Legacy notes
LoRA training was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570). LoRA training was contributed by [mcmonkey4eva](https://github.com/mcmonkey4eva) in PR [#570](https://github.com/oobabooga/text-generation-webui/pull/570).

View File

@ -0,0 +1,43 @@
# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit
#
# Module preamble: extends the import path, imports the 4-bit helpers, and
# applies the PEFT patch as an import-time side effect.
import sys
from pathlib import Path
# Put the alpaca_lora_4bit clone first on the import search path. The path is
# relative, so it is resolved against the process's current working directory.
# This must run before the imports below — presumably `autograd_4bit`,
# `amp_wrapper` and `monkeypatch` are resolved from that clone (verify).
sys.path.insert(0, str(Path("repositories/alpaca_lora_4bit")))
import autograd_4bit
from amp_wrapper import AMPWrapper
from autograd_4bit import (
Autograd4bitQuantLinear,
load_llama_model_4bit_low_ram
)
from monkeypatch.peft_tuners_lora_monkey_patch import (
Linear4bitLt,
replace_peft_model_with_gptq_lora_model
)
from modules import shared
from modules.GPTQ_loader import find_quantized_model_file
# Executed at import time: merely importing this module calls the patch
# function. NOTE(review): judging by its name it swaps PEFT's LoRA model for a
# GPTQ-aware one — confirm against the alpaca_lora_4bit sources.
replace_peft_model_with_gptq_lora_model()
def load_model_llama(model_name):
    """Load a 4-bit LLaMA model and its tokenizer via alpaca_lora_4bit.

    The config directory is ``{shared.args.model_dir}/{model_name}`` and the
    checkpoint is whatever ``find_quantized_model_file`` returns for
    ``model_name``. After loading, the quantization tensors of every
    ``Autograd4bitQuantLinear`` / ``Linear4bitLt`` module are cast to half
    precision, the ``autograd_4bit`` module flags ``use_new`` and
    ``auto_switch`` are enabled, the whole model is cast with ``model.half()``
    and wrapped with ``AMPWrapper(...).apply_generate()``.

    Args:
        model_name: Name of the model folder inside ``shared.args.model_dir``.

    Returns:
        A ``(model, tokenizer)`` tuple as produced by
        ``load_llama_model_4bit_low_ram``.
    """
    config_dir = str(Path(f'{shared.args.model_dir}/{model_name}'))
    checkpoint = str(find_quantized_model_file(model_name))
    model, tokenizer = load_llama_model_4bit_low_ram(
        config_dir,
        checkpoint,
        groupsize=shared.args.groupsize,
        is_v1_model=False,
    )

    quant_layer_types = (Autograd4bitQuantLinear, Linear4bitLt)
    for _, module in model.named_modules():
        if not isinstance(module, quant_layer_types):
            continue

        # NOTE(review): only `zeros` is treated as v1-specific here; `scales`
        # and `bias` are cast for every quantized layer. This nesting follows
        # the upstream alpaca_lora_4bit example — confirm against the file.
        if module.is_v1_model:
            module.zeros = module.zeros.half()
        module.scales = module.scales.half()
        module.bias = module.bias.half()

    # Module-level switches on autograd_4bit; set before the model is cast.
    autograd_4bit.use_new = True
    autograd_4bit.auto_switch = True

    model.half()
    AMPWrapper(model).apply_generate()

    return model, tokenizer

View File

@ -270,6 +270,12 @@ def calc_trainable_parameters(model):
def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str): def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str):
if shared.args.monkey_patch:
from monkeypatch.peft_tuners_lora_monkey_patch import (
replace_peft_model_with_gptq_lora_model
)
replace_peft_model_with_gptq_lora_model()
global WANT_INTERRUPT global WANT_INTERRUPT
WANT_INTERRUPT = False WANT_INTERRUPT = False
@ -301,6 +307,15 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
time.sleep(5) time.sleep(5)
if shared.args.wbits > 0 and not shared.args.monkey_patch:
yield "LoRA training with GPTQ models requires loading with `--monkey-patch`"
return
elif not (shared.args.load_in_8bit or shared.args.load_in_4bit) and shared.args.wbits <= 0:
yield "It is highly recommended you use `--load-in-8bit` for LoRA training. *(Will continue anyway in 2 seconds, press `Interrupt` to stop.)*"
logger.warning("It is highly recommended you use `--load-in-8bit` for LoRA training.")
time.sleep(2) # Give it a moment for the message to show in UI before continuing
if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0: if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
yield "Cannot input zeroes." yield "Cannot input zeroes."
return return
@ -505,6 +520,14 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch
yield traceback.format_exc().replace('\n', '\n\n') yield traceback.format_exc().replace('\n', '\n\n')
return return
if shared.args.monkey_patch:
for n, m in lora_model.named_modules():
if '4bit' in str(type(m)):
if m.is_v1_model:
m.zeros = m.zeros.half()
m.scales = m.scales.half()
class Tracked(): class Tracked():
def __init__(self): def __init__(self):
self.current_steps = 0 self.current_steps = 0