From a199f2179999467b299fec3128e6298f5895c223 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 16 Jul 2023 20:49:48 -0700
Subject: [PATCH 01/18] Optimize llamacpp_hf a bit

---
 modules/llamacpp_hf.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 12212fec..5d05f5df 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -42,7 +42,6 @@ class LlamacppHF(PreTrainedModel):
 
         # Make the forward call
         seq_tensor = torch.tensor(seq)
-        self.cache = seq_tensor
         if labels is None:
             if self.cache is None or not torch.equal(self.cache, seq_tensor[:-1]):
                 self.model.reset()
@@ -50,13 +49,15 @@
             else:
                 self.model.eval([seq[-1]])
 
-            logits = torch.tensor(self.model.eval_logits)[-1].view(1, 1, -1).to(kwargs['input_ids'].device)
+            logits = torch.tensor(self.model.eval_logits[-1]).view(1, 1, -1).to(kwargs['input_ids'].device)
         else:
             self.model.reset()
             self.model.eval(seq)
             logits = torch.tensor(self.model.eval_logits)
             logits = logits.view(1, logits.shape[0], logits.shape[1]).to(kwargs['input_ids'].device)
 
+        self.cache = seq_tensor
+
         # Based on transformers/models/llama/modeling_llama.py
         loss = None
         if labels is not None:
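Editor's note: to make PATCH 01/18 easier to read, here is a minimal, illustrative sketch (not the actual module; class and attribute names are simplified) of the caching idea it implements: when the previously evaluated sequence is a prefix of the new one, only the newest token is fed to the llama.cpp model, and the cache is now updated after the forward call rather than before it.

```python
import torch


class TinyLlamaCppCache:
    """Illustrative only: mirrors the incremental-eval logic in llamacpp_hf.py."""

    def __init__(self, model):
        self.model = model   # assumed to expose reset(), eval(tokens) and eval_logits
        self.cache = None    # last evaluated token sequence, as a tensor

    def forward(self, seq):
        seq_tensor = torch.tensor(seq)
        if self.cache is None or not torch.equal(self.cache, seq_tensor[:-1]):
            # Cache miss: re-evaluate the whole prompt
            self.model.reset()
            self.model.eval(seq)
        else:
            # Cache hit: only the newly appended token needs to be evaluated
            self.model.eval([seq[-1]])

        # Update the cache *after* the call, as the patch above does
        self.cache = seq_tensor
        return torch.tensor(self.model.eval_logits[-1]).view(1, 1, -1)
```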
From 656b4577953530caeb3f850eeef87a90829bf5e8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Jul 2023 07:27:42 -0700
Subject: [PATCH 02/18] Add Airoboros-v1.2 template

---
 characters/instruction-following/Airoboros-v1.2.yaml | 4 ++++
 models/config.yaml                                   | 3 +++
 2 files changed, 7 insertions(+)
 create mode 100644 characters/instruction-following/Airoboros-v1.2.yaml

diff --git a/characters/instruction-following/Airoboros-v1.2.yaml b/characters/instruction-following/Airoboros-v1.2.yaml
new file mode 100644
index 00000000..7f1bfed6
--- /dev/null
+++ b/characters/instruction-following/Airoboros-v1.2.yaml
@@ -0,0 +1,4 @@
+user: "USER:"
+bot: "ASSISTANT:"
+turn_template: "<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n"
+context: "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input.\n"
diff --git a/models/config.yaml b/models/config.yaml
index 37e4273c..ef95c2eb 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -186,6 +186,9 @@ llama-65b-gptq-3bit:
 .*airoboros:
   mode: 'instruct'
   instruction_template: 'Vicuna-v1.1'
+.*airoboros.*1.2:
+  mode: 'instruct'
+  instruction_template: 'Airoboros-v1.2'
 .*WizardLM-30B-V1.0:
   mode: 'instruct'
   instruction_template: 'Vicuna-v1.1'

From b1a6ea68dd31158009a1910864d2047c5cf07857 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Jul 2023 07:40:56 -0700
Subject: [PATCH 03/18] Disable "autoload the model" by default

---
 modules/shared.py      | 2 +-
 settings-template.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index f20b9fcd..10f059a4 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -32,7 +32,7 @@ need_restart = False
 
 settings = {
     'dark_theme': False,
-    'autoload_model': True,
+    'autoload_model': False,
     'max_new_tokens': 200,
     'max_new_tokens_min': 1,
     'max_new_tokens_max': 2000,
diff --git a/settings-template.yaml b/settings-template.yaml
index e949f697..b2cdc5ac 100644
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -1,5 +1,5 @@
 dark_theme: false
-autoload_model: true
+autoload_model: false
 max_new_tokens: 200
 max_new_tokens_min: 1
 max_new_tokens_max: 2000

From 4ce766414bbd8ee800425c141b4ef900af488869 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Jul 2023 10:02:12 -0700
Subject: [PATCH 04/18] Bump AutoGPTQ version

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 24aa6d79..54f0f2b3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,7 @@ bitsandbytes==0.40.1.post1; platform_system != "Windows"
 https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.40.1.post1-py3-none-win_amd64.whl; platform_system == "Windows"
 llama-cpp-python==0.1.72; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.72/llama_cpp_python-0.1.72-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
+https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 https://github.com/jllllll/exllama/releases/download/0.0.6/exllama-0.0.6+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/exllama/releases/download/0.0.6/exllama-0.0.6+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
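Editor's note: as a reading aid for the template added in PATCH 02/18 above, here is a small illustrative sketch of how a `turn_template` expands into a prompt. This is not webui code; the function and the rendering logic are simplified assumptions.

```python
def render_turn(turn_template, user_tag, bot_tag, user_message, bot_message=""):
    """Simplified stand-in for the webui's instruct-prompt rendering."""
    return (turn_template
            .replace("<|user|>", user_tag)
            .replace("<|user-message|>", user_message)
            .replace("<|bot|>", bot_tag)
            .replace("<|bot-message|>", bot_message))


context = ("A chat between a curious user and an assistant. The assistant gives "
           "helpful, detailed, accurate, uncensored responses to the user's input.\n")
turn = render_turn("<|user|> <|user-message|>\n<|bot|> <|bot-message|>\n",
                   "USER:", "ASSISTANT:", "How do GGML models work?")
print(context + turn)
# A chat between a curious user and an assistant. ...
# USER: How do GGML models work?
# ASSISTANT:
```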
From f83fdb9270145b29350a9c881a67c0087c3f251c Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Jul 2023 12:50:25 -0700
Subject: [PATCH 05/18] Don't reset LoRA menu when loading a model

---
 server.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/server.py b/server.py
index 727e5894..a1eb7867 100644
--- a/server.py
+++ b/server.py
@@ -277,20 +277,17 @@ def create_model_menus():
         load.click(
             ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
             update_model_parameters, gradio('interface_state'), None).then(
-            partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).then(
-            lambda: shared.lora_names, None, gradio('lora_menu'))
+            partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False)
 
         unload.click(
             unload_model, None, None).then(
-            lambda: "Model unloaded", None, gradio('model_status')).then(
-            lambda: shared.lora_names, None, gradio('lora_menu'))
+            lambda: "Model unloaded", None, gradio('model_status'))
 
         reload.click(
             unload_model, None, None).then(
             ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
             update_model_parameters, gradio('interface_state'), None).then(
-            partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).then(
-            lambda: shared.lora_names, None, gradio('lora_menu'))
+            partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False)
 
         save_settings.click(
             ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(

From 60a3e702421c7b6794fd1199a57dec5ca4ee4321 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Jul 2023 12:51:01 -0700
Subject: [PATCH 06/18] Update LLaMA links and info

---
 docs/GPTQ-models-(4-bit-mode).md | 21 +++++++++++++++++----
 docs/LLaMA-model.md              | 17 ++++++++++++++---
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/docs/GPTQ-models-(4-bit-mode).md b/docs/GPTQ-models-(4-bit-mode).md
index 63a6ed5b..838595ef 100644
--- a/docs/GPTQ-models-(4-bit-mode).md
+++ b/docs/GPTQ-models-(4-bit-mode).md
@@ -142,12 +142,25 @@ python setup_cuda.py install
 
 ### Getting pre-converted LLaMA weights
 
-These are models that you can simply download and place in your `models` folder.
+* Direct download (recommended):
 
-* Converted without `group-size` (better for the 7b model): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483891617
-* Converted with `group-size` (better from 13b upwards): https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483941105
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-7B-4bit-128g
 
-⚠️ The tokenizer files in the sources above may be outdated. Make sure to obtain the universal LLaMA tokenizer as described [here](https://github.com/oobabooga/text-generation-webui/blob/main/docs/LLaMA-model.md#option-1-pre-converted-weights).
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-13B-4bit-128g
+
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-30B-4bit-128g
+
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-65B-4bit-128g
+
+These models were converted with `desc_act=True`. They work just fine with ExLlama. For AutoGPTQ, they will only work on Linux with the `triton` option checked.
+
+* Torrent:
+
+https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483891617
+
+https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1483941105
+
+These models were converted with `desc_act=False`. As such, they are less accurate, but they work with AutoGPTQ on Windows. The `128g` versions are better from 13b upwards, and worse for 7b. The tokenizer files in the torrents are outdated, in particular the files called `tokenizer_config.json` and `special_tokens_map.json`. Here you can find those files: https://huggingface.co/oobabooga/llama-tokenizer
 
 ### Starting the web UI:
 
diff --git a/docs/LLaMA-model.md b/docs/LLaMA-model.md
index cd655268..ba7350f5 100644
--- a/docs/LLaMA-model.md
+++ b/docs/LLaMA-model.md
@@ -9,10 +9,21 @@ This guide will cover usage through the official `transformers` implementation.
 
 ### Option 1: pre-converted weights
 
-* Torrent: https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789
-* Direct download: https://huggingface.co/Neko-Institute-of-Science
+* Direct download (recommended):
 
-⚠️ The tokenizers for the Torrent source above and also for many LLaMA fine-tunes available on Hugging Face may be outdated, in particular the files called `tokenizer_config.json` and `special_tokens_map.json`. Here you can find those files: https://huggingface.co/oobabooga/llama-tokenizer
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-7B-HF
+
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-13B-HF
+
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-30B-HF
+
+https://huggingface.co/Neko-Institute-of-Science/LLaMA-65B-HF
+
+* Torrent:
+
+https://github.com/oobabooga/text-generation-webui/pull/530#issuecomment-1484235789
+
+The tokenizer files in the torrent above are outdated, in particular the files called `tokenizer_config.json` and `special_tokens_map.json`. Here you can find those files: https://huggingface.co/oobabooga/llama-tokenizer
 
 ### Option 2: convert the weights yourself
 
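Editor's note: both documentation changes in PATCH 06/18 above point readers at the same fix, namely replacing the outdated `tokenizer_config.json` and `special_tokens_map.json` with the files from https://huggingface.co/oobabooga/llama-tokenizer. A small, hedged sketch of doing that with `huggingface_hub` follows; the target folder name is an assumption, adjust it to the model directory you actually use.

```python
from pathlib import Path
from huggingface_hub import hf_hub_download

# Assumption: the model to patch lives under text-generation-webui/models/<folder>
target = Path("models/LLaMA-7B-4bit-128g")
target.mkdir(parents=True, exist_ok=True)

for filename in ("tokenizer_config.json", "special_tokens_map.json"):
    # Download from the universal LLaMA tokenizer repo mentioned in the docs above
    cached = hf_hub_download(repo_id="oobabooga/llama-tokenizer", filename=filename)
    (target / filename).write_bytes(Path(cached).read_bytes())
    print(f"Replaced {target / filename}")
```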
From 02a5fe6aa24842285cbff21a557f56a454f0a130 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 17 Jul 2023 20:18:31 +0000
Subject: [PATCH 07/18] Bump accelerate from 0.20.3 to 0.21.0

Bumps [accelerate](https://github.com/huggingface/accelerate) from 0.20.3 to 0.21.0.
- [Release notes](https://github.com/huggingface/accelerate/releases)
- [Commits](https://github.com/huggingface/accelerate/compare/v0.20.3...v0.21.0)

---
updated-dependencies:
- dependency-name: accelerate
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot]

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 24aa6d79..8918aab9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-accelerate==0.20.3
+accelerate==0.21.0
 colorama
 datasets
 einops

From 5e5d926d2b5bbd01ae0414bdbed580568bbf4037 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Jul 2023 17:00:49 -0700
Subject: [PATCH 08/18] Prevent lists from flickering in chat mode while streaming

---
 css/chat.css | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/css/chat.css b/css/chat.css
index 45a518bc..e07f2f71 100644
--- a/css/chat.css
+++ b/css/chat.css
@@ -65,8 +65,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }
 
 .message-body li {
-    margin-top: 0.5em !important;
-    margin-bottom: 0.5em !important;
+    margin-top: 0 !important;
+    margin-bottom: 1.25em !important;
 }
 
 .message-body li > p {

From 8c1c2e0fae2be1dbd0415f8600eed5e7ddbcbfb6 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 17 Jul 2023 17:08:22 -0700
Subject: [PATCH 09/18] Increase max_new_tokens upper limit

---
 modules/shared.py      | 2 +-
 settings-template.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modules/shared.py b/modules/shared.py
index 10f059a4..08d88ff5 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -35,7 +35,7 @@ settings = {
     'autoload_model': False,
     'max_new_tokens': 200,
     'max_new_tokens_min': 1,
-    'max_new_tokens_max': 2000,
+    'max_new_tokens_max': 4096,
     'seed': -1,
     'character': 'None',
     'name1': 'You',
diff --git a/settings-template.yaml b/settings-template.yaml
index b2cdc5ac..ef9a7e7e 100644
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -2,7 +2,7 @@ dark_theme: false
 autoload_model: false
 max_new_tokens: 200
 max_new_tokens_min: 1
-max_new_tokens_max: 2000
+max_new_tokens_max: 4096
 seed: -1
 character: None
 name1: You

From 234c58ccd14b1dd40b370952d49886159adcf50d Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 17 Jul 2023 21:24:51 -0300
Subject: [PATCH 10/18] Bump bitsandbytes from 0.40.1.post1 to 0.40.2 (#3178)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index aef8648f..b22265e0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,7 +20,7 @@ tensorboard
 wandb
 transformers==4.30.2
 git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524
-bitsandbytes==0.40.1.post1; platform_system != "Windows"
+bitsandbytes==0.40.2; platform_system != "Windows"
 https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.40.1.post1-py3-none-win_amd64.whl; platform_system == "Windows"
 llama-cpp-python==0.1.72; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.72/llama_cpp_python-0.1.72-cp310-cp310-win_amd64.whl; platform_system == "Windows"
From 89e0d15cf5c4b06967a31a51e40fd194a2bfc71a Mon Sep 17 00:00:00 2001
From: appe233 <89209249+appe233@users.noreply.github.com>
Date: Tue, 18 Jul 2023 08:27:18 +0800
Subject: [PATCH 11/18] Use 'torch.backends.mps.is_available' to check if mps is supported (#3164)

---
 modules/LoRA.py            | 2 +-
 modules/models.py          | 4 ++--
 modules/text_generation.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/modules/LoRA.py b/modules/LoRA.py
index 0626c969..1350783f 100644
--- a/modules/LoRA.py
+++ b/modules/LoRA.py
@@ -132,7 +132,7 @@ def add_lora_transformers(lora_names):
     if not shared.args.load_in_8bit and not shared.args.cpu:
         shared.model.half()
         if not hasattr(shared.model, "hf_device_map"):
-            if torch.has_mps:
+            if torch.backends.mps.is_available():
                 device = torch.device('mps')
                 shared.model = shared.model.to(device)
             else:
diff --git a/modules/models.py b/modules/models.py
index 9d9ba951..232d5fa6 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -147,7 +147,7 @@ def huggingface_loader(model_name):
     # Load the model in simple 16-bit mode by default
     if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.load_in_4bit, shared.args.auto_devices, shared.args.disk, shared.args.deepspeed, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None]):
         model = LoaderClass.from_pretrained(Path(f"{shared.args.model_dir}/{model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16, trust_remote_code=shared.args.trust_remote_code)
-        if torch.has_mps:
+        if torch.backends.mps.is_available():
             device = torch.device('mps')
             model = model.to(device)
         else:
@@ -167,7 +167,7 @@ def huggingface_loader(model_name):
         "trust_remote_code": shared.args.trust_remote_code
     }
 
-    if not any((shared.args.cpu, torch.cuda.is_available(), torch.has_mps)):
+    if not any((shared.args.cpu, torch.cuda.is_available(), torch.backends.mps.is_available())):
         logger.warning("torch.cuda.is_available() returned False. This means that no GPU has been detected. Falling back to CPU mode.")
         shared.args.cpu = True
 
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 566c2f55..d3939d3f 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -57,7 +57,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
         return input_ids.numpy()
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
-    elif torch.has_mps:
+    elif torch.backends.mps.is_available():
         device = torch.device('mps')
         return input_ids.to(device)
     else:

From a69955377a172f6fda50a77dab31d2c87241c459 Mon Sep 17 00:00:00 2001
From: randoentity <137087500+randoentity@users.noreply.github.com>
Date: Tue, 18 Jul 2023 03:32:37 +0200
Subject: [PATCH 12/18] [GGML] Support for customizable RoPE (#3083)

---------

Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com>
---
 modules/llamacpp_hf.py    | 2 ++
 modules/llamacpp_model.py | 4 +++-
 modules/loaders.py        | 4 ++++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
index 5d05f5df..94d893c4 100644
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -97,6 +97,8 @@ class LlamacppHF(PreTrainedModel):
             'use_mlock': shared.args.mlock,
             'low_vram': shared.args.low_vram,
             'n_gpu_layers': shared.args.n_gpu_layers,
+            'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.),
+            'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
             'logits_all': True,
         }
 
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 86537a27..180b0f37 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -50,7 +50,9 @@ class LlamaCppModel:
             'use_mmap': not shared.args.no_mmap,
             'use_mlock': shared.args.mlock,
             'low_vram': shared.args.low_vram,
-            'n_gpu_layers': shared.args.n_gpu_layers
+            'n_gpu_layers': shared.args.n_gpu_layers,
+            'rope_freq_base': 10000 * shared.args.alpha_value ** (64/63.),
+            'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
         }
 
         result.model = Llama(**params)
diff --git a/modules/loaders.py b/modules/loaders.py
index da38c2f5..b760128f 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -37,6 +37,8 @@ loaders_and_params = {
         'low_vram',
         'mlock',
         'llama_cpp_seed',
+        'compress_pos_emb',
+        'alpha_value',
     ],
     'llamacpp_HF': [
         'n_ctx',
@@ -47,6 +49,8 @@
         'low_vram',
         'mlock',
         'llama_cpp_seed',
+        'compress_pos_emb',
+        'alpha_value',
         'llamacpp_HF_info',
     ],
     'Transformers': [
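Editor's note: as a quick aid to reading PATCH 12/18, here is a small, illustrative calculation (plain Python, with example values chosen here, not taken from the repo) of how the UI's `alpha_value` and `compress_pos_emb` settings map onto llama.cpp's RoPE parameters in the patch above.

```python
def llama_cpp_rope_params(alpha_value: float, compress_pos_emb: float) -> dict:
    """Mirror of the mapping added in PATCH 12/18 (illustrative only)."""
    return {
        # NTK-style scaling: raising alpha increases the RoPE base frequency
        "rope_freq_base": 10000 * alpha_value ** (64 / 63.0),
        # Linear position compression: the scale is the inverse of compress_pos_emb
        "rope_freq_scale": 1.0 / compress_pos_emb,
    }


print(llama_cpp_rope_params(alpha_value=1.0, compress_pos_emb=1.0))
# {'rope_freq_base': 10000.0, 'rope_freq_scale': 1.0}
print(llama_cpp_rope_params(alpha_value=2.0, compress_pos_emb=2.0))
# rope_freq_base is roughly 20221, rope_freq_scale is 0.5
```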
From d7a14174a29a99da5c23ffd48854a4e79ea0257c Mon Sep 17 00:00:00 2001
From: jllllll <3887729+jllllll@users.noreply.github.com>
Date: Tue, 18 Jul 2023 09:39:08 -0500
Subject: [PATCH 13/18] Remove auto-loading when only one model is available (#3187)

---
 server.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/server.py b/server.py
index a1eb7867..babdec2d 100644
--- a/server.py
+++ b/server.py
@@ -1125,10 +1125,6 @@ if __name__ == "__main__":
     if shared.args.model is not None:
         shared.model_name = shared.args.model
 
-    # Only one model is available
-    elif len(available_models) == 1:
-        shared.model_name = available_models[0]
-
     # Select the model from a command-line menu
     elif shared.args.model_menu:
         if len(available_models) == 0:

From c535f14e5f7ac6a89f0f460b699aff6868fe16be Mon Sep 17 00:00:00 2001
From: jllllll <3887729+jllllll@users.noreply.github.com>
Date: Tue, 18 Jul 2023 09:39:43 -0500
Subject: [PATCH 14/18] Bump bitsandbytes Windows wheel to 0.40.2 (#3186)

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index b22265e0..40ede730 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,7 +21,7 @@ wandb
 transformers==4.30.2
 git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524
 bitsandbytes==0.40.2; platform_system != "Windows"
-https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.40.1.post1-py3-none-win_amd64.whl; platform_system == "Windows"
+https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.40.2-py3-none-win_amd64.whl; platform_system == "Windows"
 llama-cpp-python==0.1.72; platform_system != "Windows"
 https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.72/llama_cpp_python-0.1.72-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.0/auto_gptq-0.3.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"

From 603c596616aa3a967139c3d1e9d4358fabd89f73 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 18 Jul 2023 10:29:56 -0700
Subject: [PATCH 15/18] Add LLaMA-v2 conversion instructions

---
 docs/LLaMA-v2-model.md | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 docs/LLaMA-v2-model.md

diff --git a/docs/LLaMA-v2-model.md b/docs/LLaMA-v2-model.md
new file mode 100644
index 00000000..ac06ccf6
--- /dev/null
+++ b/docs/LLaMA-v2-model.md
@@ -0,0 +1,35 @@
+# LLaMA-v2
+
+To convert LLaMA-v2 from the `.pth` format provided by Meta to transformers format, follow the steps below:
+
+1) `cd` into your `llama` folder (the one containing `download.sh` and the models that you downloaded):
+
+```
+cd llama
+```
+
+2) Clone the code in [this PR](https://github.com/huggingface/transformers/pull/24891):
+
+```
+git clone 'https://github.com/ArthurZucker/transformers' -b llama-tests
+
+```
+
+3) Create symbolic links from the downloaded folders to names that the conversion script can recognize:
+
+```
+ln -s llama-2-7b 7B
+ln -s llama-2-13b 13B
+```
+
+4) Do the conversions:
+
+```
+mkdir llama-2-7b-hf llama-2-13b-hf
+python ./transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir . --model_size 7B --output_dir llama-2-7b-hf --safe_serialization true
+python ./transformers/src/transformers/models/llama/convert_llama_weights_to_hf.py --input_dir . --model_size 13B --output_dir llama-2-13b-hf --safe_serialization true
+```
+
+5) Move the output folders inside `text-generation-webui/models`
+
+6) Have fun
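Editor's note: once the converted folders from PATCH 15/18 are in place, they load through the regular `transformers` API. A minimal, hedged sketch follows; the path, dtype and `device_map` choice are assumptions for illustration, not part of the patch.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed location: the folder produced in step 4 and moved in step 5
model_dir = "models/llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype=torch.float16,  # assumption: fp16 weights on a single GPU
    device_map="auto",          # requires accelerate, which is in requirements.txt
)

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```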
From e0631e309fa83602bb748a7a159a0d8f21428dbb Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 18 Jul 2023 17:19:18 -0300
Subject: [PATCH 16/18] Create instruction template for Llama-v2 (#3194)

---
 characters/instruction-following/Llama-v2.yaml | 4 ++++
 models/config.yaml                             | 5 +++++
 2 files changed, 9 insertions(+)
 create mode 100644 characters/instruction-following/Llama-v2.yaml

diff --git a/characters/instruction-following/Llama-v2.yaml b/characters/instruction-following/Llama-v2.yaml
new file mode 100644
index 00000000..3c1e27a9
--- /dev/null
+++ b/characters/instruction-following/Llama-v2.yaml
@@ -0,0 +1,4 @@
+user: ""
+bot: ""
+turn_template: "<|user|><|user-message|> [/INST] <|bot|><|bot-message|> </s><s>[INST] "
+context: "[INST] <<SYS>>\nAnswer the questions.\n<</SYS>>\n"
diff --git a/models/config.yaml b/models/config.yaml
index ef95c2eb..1cc5605c 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -272,3 +272,8 @@ TheBloke_WizardLM-30B-GPTQ:
 .*godzilla:
   mode: 'instruct'
   instruction_template: 'Alpaca'
+.*llama-(2|v2):
+  truncation_length: 4096
+.*llama-(2|v2).*chat:
+  mode: 'instruct'
+  instruction_template: 'Llama-v2'

From a2918176ea629e5ae98b2012bc413fd16b4b9cc0 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 18 Jul 2023 13:21:18 -0700
Subject: [PATCH 17/18] Update LLaMA-v2-model.md (thanks Panchovix)

---
 docs/LLaMA-v2-model.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/LLaMA-v2-model.md b/docs/LLaMA-v2-model.md
index ac06ccf6..55c6aa76 100644
--- a/docs/LLaMA-v2-model.md
+++ b/docs/LLaMA-v2-model.md
@@ -8,10 +8,10 @@ To convert LLaMA-v2 from the `.pth` format provided by Meta to transformers form
 cd llama
 ```
 
-2) Clone the code in [this PR](https://github.com/huggingface/transformers/pull/24891):
+2) Clone the transformers library:
 
 ```
-git clone 'https://github.com/ArthurZucker/transformers' -b llama-tests
+git clone 'https://github.com/huggingface/transformers'
 
 ```
 

From 070a8862789b70098032304b9638ef1190bc5782 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Tue, 18 Jul 2023 13:23:29 -0700
Subject: [PATCH 18/18] Revert "Prevent lists from flickering in chat mode while streaming"

This reverts commit 5e5d926d2b5bbd01ae0414bdbed580568bbf4037.

---
 css/chat.css | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/css/chat.css b/css/chat.css
index e07f2f71..45a518bc 100644
--- a/css/chat.css
+++ b/css/chat.css
@@ -65,8 +65,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
 }
 
 .message-body li {
-    margin-top: 0 !important;
-    margin-bottom: 1.25em !important;
+    margin-top: 0.5em !important;
+    margin-bottom: 0.5em !important;
 }
 
 .message-body li > p {
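Editor's note: PATCH 16/18 above also extends `models/config.yaml` with new regex keys. As a closing illustration, here is a simplified sketch of the kind of pattern matching that file relies on: each key is treated as a regex against the model folder name, and matching entries contribute settings. This is a stand-in, not the webui's actual implementation, and the assumption that later matches override earlier ones is mine.

```python
import re

# A few of the patterns added in the patches above, in config.yaml order
model_config = {
    r'.*airoboros': {'mode': 'instruct', 'instruction_template': 'Vicuna-v1.1'},
    r'.*airoboros.*1.2': {'mode': 'instruct', 'instruction_template': 'Airoboros-v1.2'},
    r'.*llama-(2|v2)': {'truncation_length': 4096},
    r'.*llama-(2|v2).*chat': {'mode': 'instruct', 'instruction_template': 'Llama-v2'},
}


def settings_for(model_name: str) -> dict:
    """Collect settings from every pattern that matches the model name."""
    settings = {}
    for pattern, values in model_config.items():
        if re.match(pattern.lower(), model_name.lower()):
            settings.update(values)
    return settings


print(settings_for("Llama-2-13B-chat-GGML"))
# {'truncation_length': 4096, 'mode': 'instruct', 'instruction_template': 'Llama-v2'}
```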