Commit 3ef49397bb
Mirror of https://github.com/oobabooga/text-generation-webui.git, synced 2024-11-23 00:18:20 +01:00
characters/instruction-following/Airoboros-v1.2.yaml (new file, +4)

@@ -0,0 +1,4 @@
+user: "USER:"
+bot: "ASSISTANT:"
+turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n"
+context: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input.\n"
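For reference, the `<|user|>`, `<|bot|>`, `<|user-message|>`, and `<|bot-message|>` tokens in `turn_template` are placeholders: the first two expand to the `user` and `bot` prefixes defined above, the last two to the actual turn text. A minimal sketch of that expansion (the `render_turn` helper is illustrative, not the webui's actual rendering code):

```python
# Illustrative only: approximates how a turn_template is expanded into prompt text.
template = {
    "user": "USER:",
    "bot": "ASSISTANT:",
    "turn_template": "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n",
    "context": "A chat between a curious user and an assistant. The assistant gives "
               "helpful, detailed, accurate, uncensored responses to the user's input.\n",
}

def render_turn(user_message: str, bot_message: str) -> str:
    turn = template["turn_template"]
    turn = turn.replace("<|user|>", template["user"])
    turn = turn.replace("<|bot|>", template["bot"])
    turn = turn.replace("<|user-message|>", user_message)
    turn = turn.replace("<|bot-message|>", bot_message)
    return turn

print(template["context"] + render_turn("Hi there.", "Hello! How can I help?"))
# Output: the context sentence, followed by
# USER: Hi there.
# ASSISTANT: Hello! How can I help?
```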
characters/instruction-following/Llama-v2.yaml (new file, +4)

@@ -0,0 +1,4 @@
+user: ""
+bot: ""
+turn_template: "<|user|><|user-message|> [/INST] <|bot|><|bot-message|> [INST] "
+context: "[INST] <<SYS>>\nAnswer the questions.\n<</SYS>>\n"
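Since `user` and `bot` are empty strings here, the prompt structure comes entirely from the `[INST]`/`[/INST]` markers of the Llama-2 chat format. A rough sketch of how a single exchange renders under this template (same substitution idea as above; the question and answer are placeholders):

```python
# Illustrative rendering of one exchange under the Llama-v2 template above.
context = "[INST] <<SYS>>\nAnswer the questions.\n<</SYS>>\n"
turn_template = "<|user|><|user-message|> [/INST] <|bot|><|bot-message|> [INST] "

turn = (turn_template
        .replace("<|user|>", "")   # user prefix is empty in this template
        .replace("<|bot|>", "")    # bot prefix is empty too
        .replace("<|user-message|>", "What is the capital of France?")
        .replace("<|bot-message|>", "The capital of France is Paris."))

print(context + turn)
# [INST] <<SYS>>
# Answer the questions.
# <</SYS>>
# What is the capital of France? [/INST] The capital of France is Paris. [INST]
```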
@@ -142,12 +142,25 @@ python setup_cuda.py install

 ### Getting pre-converted LLaMA weights

-* Converted without `group-size` (better for the 7b model): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483891617
-* Converted with `group-size` (better from 13b upwards): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483941105
-
-⚠️ The tokenizer files in the sources above may be outdated. Make sure to obtain the universal LLaMA tokenizer as described [here](https://github.com/oobabooga/text-generation-webui/blob/main/docs/LLaMA-model.md#option-1-pre-converted-weights).
+These are models that you can simply download and place in your `models` folder.
+
+* Direct download (recommended):
+
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-7B-4bit-128g
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-13B-4bit-128g
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-30B-4bit-128g
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-65B-4bit-128g
+
+These models were converted with `desc_act=True`. They work just fine with ExLlama. For AutoGPTQ, they will only work on Linux with the `triton` option checked.
+
+* Torrent:
+
+https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483891617
+https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483941105
+
+These models were converted with `desc_act=False`. As such, they are less accurate, but they work with AutoGPTQ on Windows. The `128g` versions are better from 13b upwards, and worse for 7b. The tokenizer files in the torrents are outdated, in particular the files called `tokenizer_config.json` and `special_tokens_map.json`. Here you can find those files: https://huggingface.co/oobabooga/llama-tokenizer

 ### Starting the web UI:
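For context on the `group-size` and `desc_act` terms used above: both are GPTQ quantization options. A hedged sketch of where they appear if you quantize a model yourself with AutoGPTQ (paths and the calibration text are placeholders, and the API may differ between AutoGPTQ versions):

```python
# Sketch only: shows where group_size and desc_act enter an AutoGPTQ quantization.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

model_path = "path/to/llama-hf"  # placeholder: a transformers-format model folder
tokenizer = AutoTokenizer.from_pretrained(model_path)

quantize_config = BaseQuantizeConfig(
    bits=4,          # 4-bit weights, as in the repos linked above
    group_size=128,  # the "128g" in the repo names; -1 disables grouping
    desc_act=True,   # act-order; per the note above, AutoGPTQ then needs Linux + triton
)

model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config)
examples = [tokenizer("A chat between a curious user and an assistant.", return_tensors="pt")]
model.quantize(examples)                          # calibration pass
model.save_quantized("path/to/llama-4bit-128g")   # placeholder output folder
```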
@@ -9,10 +9,21 @@ This guide will cover usage through the official `transformers` implementation.

 ### Option 1: pre-converted weights

-* Torrent: https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789
-* Direct download: https://huggingface.co/Neko-Institute-of-Science
-
-⚠️ The tokenizers for the Torrent source above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, in particular the files called `tokenizer_config.json` and `special_tokens_map.json`. Here you can find those files: https://huggingface.co/oobabooga/llama-tokenizer
+* Direct download (recommended):
+
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-7B-HF
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-13B-HF
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-30B-HF
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-65B-HF
+
+* Torrent:
+
+https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789
+
+The tokenizer files in the torrent above are outdated, in particular the files called `tokenizer_config.json` and `special_tokens_map.json`. Here you can find those files: https://huggingface.co/oobabooga/llama-tokenizer

 ### Option 2: convert the weights yourself
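If you need to refresh those two files for a model you have already downloaded, one possible approach (a sketch using `huggingface_hub`; the destination folder is a placeholder) is to pull them straight from the `oobabooga/llama-tokenizer` repo:

```python
# Sketch: fetch the up-to-date tokenizer metadata files mentioned above and place
# them inside a model folder. The local_dir below is a placeholder.
from huggingface_hub import hf_hub_download

for filename in ("tokenizer_config.json", "special_tokens_map.json"):
    hf_hub_download(
        repo_id="oobabooga/llama-tokenizer",
        filename=filename,
        local_dir="models/LLaMA-7B-HF",  # placeholder: your model folder
    )
```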
docs/LLaMA-v2-model.md (new file, +35)

@@ -0,0 +1,35 @@
+# LLaMA-v2
+
+To convert LLaMA-v2 from the `.pth` format provided by Meta to transformers format, follow the steps below:
+
+1) `cd` into your `llama` folder (the one containing `download.sh` and the models that you downloaded):
+
+```
+cd llama
+```
+
+2) Clone the transformers library:
+
+```
+git clone 'https://github.com/huggingface/transformers'
+
+```
+
+3) Create symbolic links from the downloaded folders to names that the conversion script can recognize:
+
+```
+ln -s llama-2-7b 7B
+ln -s llama-2-13b 13B
+```
+
+4) Do the conversions:
+
+```
+mkdir llama-2-7b-hf llama-2-13b-hf
+python ./transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir . --model_size 7B --output_dir llama-2-7b-hf --safe_serialization true
+python ./transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir . --model_size 13B --output_dir llama-2-13b-hf --safe_serialization true
+```
+
+5) Move the output folders inside `text-generation-webui/models`
+
+6) Have fun
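After step 5, a quick sanity check (a sketch, assuming the 7B conversion above and that the folder has been moved into `text-generation-webui/models`) is to load the converted model directly with `transformers`:

```python
# Sketch: verify that the converted llama-2-7b-hf folder loads and generates.
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "models/llama-2-7b-hf"  # assumes the output of step 4, moved in step 5
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path, device_map="auto", torch_dtype="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]))
```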
@@ -186,6 +186,9 @@ llama-65b-gptq-3bit:
 .*airoboros:
   mode: 'instruct'
   instruction_template: 'Vicuna-v1.1'
+.*airoboros.*1.2:
+  mode: 'instruct'
+  instruction_template: 'Airoboros-v1.2'
 .*WizardLM-30B-V1.0:
   mode: 'instruct'
   instruction_template: 'Vicuna-v1.1'
@@ -269,3 +272,8 @@ TheBloke_WizardLM-30B-GPTQ:
 .*godzilla:
   mode: 'instruct'
   instruction_template: 'Alpaca'
+.*llama-(2|v2):
+  truncation_length: 4096
+.*llama-(2|v2).*chat:
+  mode: 'instruct'
+  instruction_template: 'Llama-v2'
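These keys are regular expressions matched against the model name, which is how `.*llama-(2|v2).*chat` catches both `llama-2-...-chat` and `llama-v2-...-chat` folders. A simplified sketch of the matching idea (later matches override earlier ones here; this mirrors the concept, not necessarily the webui's exact resolution code):

```python
import re

# Simplified model-settings resolution against pattern keys like the ones added above.
config = {
    ".*airoboros": {"mode": "instruct", "instruction_template": "Vicuna-v1.1"},
    ".*airoboros.*1.2": {"mode": "instruct", "instruction_template": "Airoboros-v1.2"},
    ".*llama-(2|v2)": {"truncation_length": 4096},
    ".*llama-(2|v2).*chat": {"mode": "instruct", "instruction_template": "Llama-v2"},
}

def resolve(model_name: str) -> dict:
    settings = {}
    for pattern, values in config.items():
        if re.match(pattern.lower(), model_name.lower()):
            settings.update(values)  # later patterns override earlier ones
    return settings

print(resolve("llama-2-13b-chat-hf"))
# {'truncation_length': 4096, 'mode': 'instruct', 'instruction_template': 'Llama-v2'}
```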
@@ -132,7 +132,7 @@ def add_lora_transformers(lora_names):
         if not shared.args.load_in_8bit and not shared.args.cpu:
             shared.model.half()
             if not hasattr(shared.model, "hf_device_map"):
-                if torch.has_mps:
+                if torch.backends.mps.is_available():
                     device = torch.device('mps')
                     shared.model = shared.model.to(device)
                 else:
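The change from `torch.has_mps` to `torch.backends.mps.is_available()` (repeated below in `modules/models.py` and `modules/text_generation.py`) replaces the deprecated attribute with the supported API. A minimal device-selection sketch using the new call:

```python
import torch

def pick_device() -> torch.device:
    # Prefer CUDA, then Apple's Metal (MPS) backend, then CPU.
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():  # replaces the deprecated torch.has_mps
        return torch.device("mps")
    return torch.device("cpu")

print(pick_device())
```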
@@ -42,7 +42,6 @@ class LlamacppHF(PreTrainedModel):

         # Make the forward call
         seq_tensor = torch.tensor(seq)
-        self.cache = seq_tensor
         if labels is None:
             if self.cache is None or not torch.equal(self.cache, seq_tensor[:-1]):
                 self.model.reset()
@@ -50,13 +49,15 @@ class LlamacppHF(PreTrainedModel):
             else:
                 self.model.eval([seq[-1]])

-            logits = torch.tensor(self.model.eval_logits)[-1].view(1, 1, -1).to(kwargs['input_ids'].device)
+            logits = torch.tensor(self.model.eval_logits[-1]).view(1, 1, -1).to(kwargs['input_ids'].device)
         else:
             self.model.reset()
             self.model.eval(seq)
             logits = torch.tensor(self.model.eval_logits)
             logits = logits.view(1, logits.shape[0], logits.shape[1]).to(kwargs['input_ids'].device)

+        self.cache = seq_tensor
+
         # Based on transformers/models/llama/modeling_llama.py
         loss = None
         if labels is not None:
@@ -96,6 +97,8 @@ class LlamacppHF(PreTrainedModel):
             'use_mlock': shared.args.mlock,
             'low_vram': shared.args.low_vram,
             'n_gpu_layers': shared.args.n_gpu_layers,
+            'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.),
+            'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
             'logits_all': True,
         }
@@ -50,7 +50,9 @@ class LlamaCppModel:
             'use_mmap': not shared.args.no_mmap,
             'use_mlock': shared.args.mlock,
             'low_vram': shared.args.low_vram,
-            'n_gpu_layers': shared.args.n_gpu_layers
+            'n_gpu_layers': shared.args.n_gpu_layers,
+            'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.),
+            'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
         }

         result.model = Llama(**params)
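For a sense of what the two new parameters do: `rope_freq_base` scales the RoPE base frequency by the NTK-style `alpha_value`, and `rope_freq_scale` is the inverse of the position-compression factor. A small worked example of the formulas above:

```python
# Worked example of the rope_freq_base / rope_freq_scale values passed to llama.cpp.
def rope_params(alpha_value: float, compress_pos_emb: float) -> tuple:
    rope_freq_base = 10000 * alpha_value ** (64 / 63.)
    rope_freq_scale = 1.0 / compress_pos_emb
    return rope_freq_base, rope_freq_scale

print(rope_params(1.0, 1.0))  # (10000.0, 1.0): defaults leave RoPE unchanged
print(rope_params(2.0, 1.0))  # (~20221.3, 1.0): NTK-style alpha scaling
print(rope_params(1.0, 2.0))  # (10000.0, 0.5): positions compressed by a factor of 2
```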
@@ -37,6 +37,8 @@ loaders_and_params = {
         'low_vram',
         'mlock',
         'llama_cpp_seed',
+        'compress_pos_emb',
+        'alpha_value',
     ],
     'llamacpp_HF': [
         'n_ctx',
@@ -47,6 +49,8 @@ loaders_and_params = {
         'low_vram',
         'mlock',
         'llama_cpp_seed',
+        'compress_pos_emb',
+        'alpha_value',
         'llamacpp_HF_info',
     ],
     'Transformers': [
@@ -147,7 +147,7 @@ def huggingface_loader(model_name):
     # Load the model in simple 16-bit mode by default
     if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]):
         model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code)
-        if torch.has_mps:
+        if torch.backends.mps.is_available():
             device = torch.device('mps')
             model = model.to(device)
         else:
@@ -167,7 +167,7 @@ def huggingface_loader(model_name):
             "trust_remote_code": shared.args.trust_remote_code
         }

-        if not any((shared.args.cpu, torch.cuda.is_available(), torch.has_mps)):
+        if not any((shared.args.cpu, torch.cuda.is_available(), torch.backends.mps.is_available())):
             logger.warning("torch.cuda.is_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.")
             shared.args.cpu = True
@@ -32,10 +32,10 @@ need_restart = False

 settings = {
     'dark_theme': False,
-    'autoload_model': True,
+    'autoload_model': False,
     'max_new_tokens': 200,
     'max_new_tokens_min': 1,
-    'max_new_tokens_max': 2000,
+    'max_new_tokens_max': 4096,
     'seed': -1,
     'character': 'None',
     'name1': 'You',
@@ -57,7 +57,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
         return input_ids.numpy()
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
-    elif torch.has_mps:
+    elif torch.backends.mps.is_available():
         device = torch.device('mps')
         return input_ids.to(device)
     else:
@@ -1,4 +1,4 @@
-accelerate==0.20.3
+accelerate==0.21.0
 colorama
 datasets
 einops
@@ -20,11 +20,11 @@ tensorboard
 wandb
 transformers==4.30.2
 git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524
-bitsandbytes==0.40.1.post1; platform_system != "Windows"
+bitsandbytes==0.40.2; platform_system != "Windows"
-https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.40.1.post1-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.40.2-py3-none-win_amd64.whl; platform_system == "Windows"
 llama-cpp-python==0.1.72; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.72/llama_cpp_python-0.1.72-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/exllama/releases/download/0.0.6/exllama-0.0.6+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/exllama/releases/download/0.0.6/exllama-0.0.6+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
server.py (13 changed lines)

@@ -277,20 +277,17 @@ def create_model_menus():
     load.click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         update_model_parameters, gradio('interface_state'), None).then(
-        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).then(
-        lambda: shared.lora_names, None, gradio('lora_menu'))
+        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False)

     unload.click(
         unload_model, None, None).then(
-        lambda: "Model unloaded", None, gradio('model_status')).then(
-        lambda: shared.lora_names, None, gradio('lora_menu'))
+        lambda: "Model unloaded", None, gradio('model_status'))

     reload.click(
         unload_model, None, None).then(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
         update_model_parameters, gradio('interface_state'), None).then(
-        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).then(
-        lambda: shared.lora_names, None, gradio('lora_menu'))
+        partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False)

     save_settings.click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
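The `.click(...).then(...)` calls above are Gradio event chaining: each `.then()` queues another callback on the same trigger, so dropping the final `.then(lambda: shared.lora_names, ...)` simply removes the last step of each chain. A stripped-down sketch of the pattern (toy functions and labels, not the webui's actual callbacks):

```python
import gradio as gr

# Toy illustration of the load.click(...).then(...) chaining edited above.
def gather_state():
    return "state gathered"

def load_model(state):
    return f"model loaded ({state})"

with gr.Blocks() as demo:
    load = gr.Button("Load")
    state = gr.Textbox(visible=False)
    status = gr.Textbox(label="Status")

    # Each .then() runs after the previous step finishes, on the same click event.
    load.click(gather_state, None, state).then(
        load_model, state, status)

demo.launch()
```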
@ -1128,10 +1125,6 @@ if __name__ == "__main__":
|
|||||||
if shared.args.model is not None:
|
if shared.args.model is not None:
|
||||||
shared.model_name = shared.args.model
|
shared.model_name = shared.args.model
|
||||||
|
|
||||||
# Only one model is available
|
|
||||||
elif len(available_models) == 1:
|
|
||||||
shared.model_name = available_models[0]
|
|
||||||
|
|
||||||
# Select the model from a command-line menu
|
# Select the model from a command-line menu
|
||||||
elif shared.args.model_menu:
|
elif shared.args.model_menu:
|
||||||
if len(available_models) == 0:
|
if len(available_models) == 0:
|
||||||
|
@@ -1,8 +1,8 @@
 dark_theme: false
-autoload_model: true
+autoload_model: false
 max_new_tokens: 200
 max_new_tokens_min: 1
-max_new_tokens_max: 2000
+max_new_tokens_max: 4096
 seed: -1
 character: None
 name1: You