From fa0e68cefdf39d6f23ebd12fb31fb311943bc568 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Mar 2024 11:31:06 -0800 Subject: [PATCH 1/6] Installer: add back INSTALL_EXTENSIONS environment variable (for docker) --- README.md | 2 +- docker/amd/Dockerfile | 2 +- docker/cpu/Dockerfile | 2 +- docker/intel/Dockerfile | 2 +- docker/nvidia/Dockerfile | 2 +- one_click.py | 3 +++ 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 71a5c105..f3236beb 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ If you ever need to install something manually in the `installer_files` environm * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. * For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki). -* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, and `LAUNCH_AFTER_INSTALL` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE ./start_linux.sh`. +* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. ### Manual installation using Conda diff --git a/docker/amd/Dockerfile b/docker/amd/Dockerfile index 000303d4..365e88e3 100644 --- a/docker/amd/Dockerfile +++ b/docker/amd/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=B USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE ./start_linux.sh --verbose +RUN GPU_CHOICE=B USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile index 472fc652..04ccf94a 100644 --- a/docker/cpu/Dockerfile +++ b/docker/cpu/Dockerfile @@ -17,7 +17,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=N USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE ./start_linux.sh --verbose +RUN GPU_CHOICE=N USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime diff --git a/docker/intel/Dockerfile b/docker/intel/Dockerfile index 53fdf25b..bc67a185 100644 --- a/docker/intel/Dockerfile +++ b/docker/intel/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=D USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE ./start_linux.sh --verbose +RUN GPU_CHOICE=D USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile index 1414bd38..ca17c17b 100644 --- a/docker/nvidia/Dockerfile +++ b/docker/nvidia/Dockerfile @@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui -RUN GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE ./start_linux.sh --verbose +RUN GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose COPY CMD_FLAGS.txt /home/app/text-generation-webui/ EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui diff --git a/one_click.py b/one_click.py index c64ac460..a1d1e10d 100644 --- a/one_click.py +++ b/one_click.py @@ -356,6 +356,9 @@ def update_requirements(initial_installation=False, pull=True): print_big_message(f"File '{file_name}' was updated during 'git pull'. Please run the script again.") exit(1) + if os.environ.get("INSTALL_EXTENSIONS", "").lower() in ("yes", "y", "true", "1", "t", "on"): + install_extensions_requirements() + # Update PyTorch if not initial_installation: update_pytorch() From 2ec1d96c91441f3963715a3e144d4c3bf87ce364 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Mar 2024 23:02:25 -0300 Subject: [PATCH 2/6] Add cache_4bit option for ExLlamaV2 (#5645) --- README.md | 1 + modules/exllamav2.py | 3 +++ modules/exllamav2_hf.py | 3 +++ modules/loaders.py | 2 ++ modules/shared.py | 1 + modules/ui.py | 1 + modules/ui_model_menu.py | 1 + requirements.txt | 10 +++++----- requirements_amd.txt | 6 +++--- requirements_amd_noavx2.txt | 6 +++--- requirements_noavx2.txt | 10 +++++----- 11 files changed, 28 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index f3236beb..f34e9ca4 100644 --- a/README.md +++ b/README.md @@ -279,6 +279,7 @@ List of command-line flags |`--cfg-cache` | ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. | |`--no_flash_attn` | Force flash-attention to not be used. | |`--cache_8bit` | Use 8-bit cache to save VRAM. | +|`--cache_4bit` | Use 4-bit cache to save VRAM. | |`--num_experts_per_token NUM_EXPERTS_PER_TOKEN` | Number of experts to use for generation. Applies to MoE models like Mixtral. | #### AutoGPTQ diff --git a/modules/exllamav2.py b/modules/exllamav2.py index 34072d0f..d39bc315 100644 --- a/modules/exllamav2.py +++ b/modules/exllamav2.py @@ -6,6 +6,7 @@ from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Cache_8bit, + ExLlamaV2Cache_Q4, ExLlamaV2Config, ExLlamaV2Tokenizer ) @@ -60,6 +61,8 @@ class Exllamav2Model: if shared.args.cache_8bit: cache = ExLlamaV2Cache_8bit(model, lazy=shared.args.autosplit) + elif shared.args.cache_4bit: + cache = ExLlamaV2Cache_Q4(model, lazy=shared.args.autosplit) else: cache = ExLlamaV2Cache(model, lazy=shared.args.autosplit) diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index 1e21c2f1..55cdc9f4 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -8,6 +8,7 @@ from exllamav2 import ( ExLlamaV2, ExLlamaV2Cache, ExLlamaV2Cache_8bit, + ExLlamaV2Cache_Q4, ExLlamaV2Config ) from torch.nn import CrossEntropyLoss @@ -50,6 +51,8 @@ class Exllamav2HF(PreTrainedModel): if shared.args.cache_8bit: self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model, lazy=shared.args.autosplit) + elif shared.args.cache_4bit: + self.ex_cache = ExLlamaV2Cache_Q4(self.ex_model, lazy=shared.args.autosplit) else: self.ex_cache = ExLlamaV2Cache(self.ex_model, lazy=shared.args.autosplit) diff --git a/modules/loaders.py b/modules/loaders.py index 513fd910..330f2903 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -78,6 +78,7 @@ loaders_and_params = OrderedDict({ 'no_flash_attn', 'num_experts_per_token', 'cache_8bit', + 'cache_4bit', 'autosplit', 'alpha_value', 'compress_pos_emb', @@ -90,6 +91,7 @@ loaders_and_params = OrderedDict({ 'no_flash_attn', 'num_experts_per_token', 'cache_8bit', + 'cache_4bit', 'autosplit', 'alpha_value', 'compress_pos_emb', diff --git a/modules/shared.py b/modules/shared.py index 7bef04bf..0a0c678b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -139,6 +139,7 @@ group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequen group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') +group.add_argument('--cache_4bit', action='store_true', help='Use 4-bit cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') # AutoGPTQ diff --git a/modules/ui.py b/modules/ui.py index 6249bb48..6e1b12b0 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -76,6 +76,7 @@ def list_model_elements(): 'no_flash_attn', 'num_experts_per_token', 'cache_8bit', + 'cache_4bit', 'autosplit', 'threads', 'threads_batch', diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index ac6a8a8f..227ddc42 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -132,6 +132,7 @@ def create_ui(): shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') + shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use 4-bit cache to save VRAM.') shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') diff --git a/requirements.txt b/requirements.txt index fdf8c914..904051cb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,11 +56,11 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu1 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 7255cf2f..14778f3b 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -40,8 +40,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/ro https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.55+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index b949aa30..c00960c7 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp # AMD wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index f9a62a22..2e249a10 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -56,11 +56,11 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu1 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.1/exllamav2-0.0.14.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From bde7f00cae8306884c31d855092463ca04ce26ac Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Mar 2024 21:08:29 -0800 Subject: [PATCH 3/6] Change the exllamav2 version number --- requirements.txt | 10 +++++----- requirements_amd.txt | 6 +++--- requirements_amd_noavx2.txt | 6 +++--- requirements_noavx2.txt | 10 +++++----- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/requirements.txt b/requirements.txt index 904051cb..3028fe7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,11 +56,11 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu1 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements_amd.txt b/requirements_amd.txt index 14778f3b..f43f6ac4 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -40,8 +40,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/ro https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.55+rocm5.6.1-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index c00960c7..37dfcea2 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -38,8 +38,8 @@ https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cp # AMD wheels https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index 2e249a10..a4f9bcb9 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -56,11 +56,11 @@ https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu1 https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/exllamav2/releases/download/v0.0.14.2/exllamav2-0.0.14.2-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.0.15/exllamav2-0.0.15-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" From 303433001f5099acbf6ba840ffa9b9745fcfad2a Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Mar 2024 21:13:54 -0800 Subject: [PATCH 4/6] Fix a check in the installer --- one_click.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/one_click.py b/one_click.py index a1d1e10d..d266be18 100644 --- a/one_click.py +++ b/one_click.py @@ -342,7 +342,7 @@ def update_requirements(initial_installation=False, pull=True): files_to_check = [ 'start_linux.sh', 'start_macos.sh', 'start_windows.bat', 'start_wsl.bat', - 'update_linux.sh', 'update_macos.sh', 'update_windows.bat', 'update_wsl.bat', + 'update_wizard_linux.sh', 'update_wizard_macos.sh', 'update_wizard_windows.bat', 'update_wizard_wsl.bat', 'one_click.py' ] From bef08129bce8a969582a0132c282ab1eb47cfaa4 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 6 Mar 2024 21:43:11 -0800 Subject: [PATCH 5/6] Small fix for cuda 11.8 in the one-click installer --- one_click.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/one_click.py b/one_click.py index d266be18..0d543e30 100644 --- a/one_click.py +++ b/one_click.py @@ -390,7 +390,7 @@ def update_requirements(initial_installation=False, pull=True): if is_cuda118: textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements] if is_windows() and is_cuda118: # No flash-attention on Windows for CUDA 11 - textgen_requirements = [req for req in textgen_requirements if 'jllllll/flash-attention' not in req] + textgen_requirements = [req for req in textgen_requirements if 'oobabooga/flash-attention' not in req] with open('temp_requirements.txt', 'w') as file: file.write('\n'.join(textgen_requirements)) From 104573f7d49085b4916c87bbcd5a2c8a040c1747 Mon Sep 17 00:00:00 2001 From: Bartowski Date: Thu, 7 Mar 2024 11:08:21 -0500 Subject: [PATCH 6/6] Update cache_4bit documentation (#5649) --------- Co-authored-by: oobabooga <112222186+oobabooga@users.noreply.github.com> --- README.md | 2 +- docs/04 - Model Tab.md | 1 + modules/exllamav2_hf.py | 2 ++ modules/shared.py | 2 +- modules/ui_model_menu.py | 2 +- 5 files changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f34e9ca4..bb5d2810 100644 --- a/README.md +++ b/README.md @@ -279,7 +279,7 @@ List of command-line flags |`--cfg-cache` | ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader. | |`--no_flash_attn` | Force flash-attention to not be used. | |`--cache_8bit` | Use 8-bit cache to save VRAM. | -|`--cache_4bit` | Use 4-bit cache to save VRAM. | +|`--cache_4bit` | Use Q4 cache to save VRAM. | |`--num_experts_per_token NUM_EXPERTS_PER_TOKEN` | Number of experts to use for generation. Applies to MoE models like Mixtral. | #### AutoGPTQ diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md index 762f85e8..3766c96c 100644 --- a/docs/04 - Model Tab.md +++ b/docs/04 - Model Tab.md @@ -46,6 +46,7 @@ Examples: * **cfg-cache**: Creates a second cache to hold the CFG negative prompts. You need to set this if and only if you intend to use CFG in the "Parameters" > "Generation" tab. Checking this parameter doubles the cache VRAM usage. * **no_flash_attn**: Disables flash attention. Otherwise, it is automatically used as long as the library is installed. * **cache_8bit**: Create a 8-bit precision cache instead of a 16-bit one. This saves VRAM but increases perplexity (I don't know by how much). +* **cache_4bit**: Creates a Q4 cache using grouped quantization. ### ExLlamav2 diff --git a/modules/exllamav2_hf.py b/modules/exllamav2_hf.py index 55cdc9f4..9ab9cdc7 100644 --- a/modules/exllamav2_hf.py +++ b/modules/exllamav2_hf.py @@ -63,6 +63,8 @@ class Exllamav2HF(PreTrainedModel): if shared.args.cfg_cache: if shared.args.cache_8bit: self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model) + elif shared.args.cache_4bit: + self.ex_cache_negative = ExLlamaV2Cache_Q4(self.ex_model) else: self.ex_cache_negative = ExLlamaV2Cache(self.ex_model) diff --git a/modules/shared.py b/modules/shared.py index 0a0c678b..10a70001 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -139,7 +139,7 @@ group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequen group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') -group.add_argument('--cache_4bit', action='store_true', help='Use 4-bit cache to save VRAM.') +group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') # AutoGPTQ diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 227ddc42..c29db7d0 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -132,7 +132,7 @@ def create_ui(): shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') - shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use 4-bit cache to save VRAM.') + shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.') shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.')