diff --git a/.gitignore b/.gitignore
index a9c47a5a..bfb6d027 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ torch-dumps
 */*/pycache*
 venv/
 .venv/
+.vscode
 repositories

 settings.json
diff --git a/README.md b/README.md
index 169c894b..e0784e12 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Text generation web UI

-A gradio web UI for running Large Language Models like GPT-J 6B, OPT, GALACTICA, LLaMA, and Pygmalion.
+A gradio web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, OPT, and GALACTICA.

 Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation.

@@ -28,6 +28,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 * [DeepSpeed ZeRO-3 offload](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed).
 * Get responses via API, [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-streaming.py) or [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming.
 * [LLaMA model, including 4-bit GPTQ support](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model).
+* [llama.cpp support](https://github.com/oobabooga/text-generation-webui/wiki/llama.cpp-models). **\*NEW!\***
 * [RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model).
 * [Supports LoRAs](https://github.com/oobabooga/text-generation-webui/wiki/Using-LoRAs).
 * Supports softprompts.
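Editorial note, not part of the patch: for readers new to the llama.cpp support referenced above, ggml model files of the kind this patch also teaches `download-model.py` to recognize can be exercised directly from Python. The sketch below uses the llama-cpp-python bindings purely as an illustration; the web UI ships its own loader, and the model path and thread count are placeholders.

```python
# Illustrative only: running a ggml model with llama-cpp-python.
# The web UI's own llama.cpp integration may use different bindings.
from llama_cpp import Llama

llm = Llama(
    model_path="models/llama-7b/ggml-model-q4_0.bin",  # placeholder path to a ggml .bin file
    n_threads=8,                                       # roughly what the new --threads flag controls
)

result = llm("Building a website can be done in 10 simple steps:", max_tokens=64)
print(result["choices"][0]["text"])
```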
@@ -175,24 +176,31 @@ Optionally, you can use the following command-line flags:
 | Flag | Description |
 |------------------|-------------|
 | `-h`, `--help` | show this help message and exit |
-| `--model MODEL` | Name of the model to load by default. |
-| `--lora LORA` | Name of the LoRA to apply to the model by default. |
 | `--notebook` | Launch the web UI in notebook mode, where the output is written to the same text box as the input. |
 | `--chat` | Launch the web UI in chat mode.|
 | `--cai-chat` | Launch the web UI in chat mode with a style similar to Character.AI's. If the file `img_bot.png` or `img_bot.jpg` exists in the same folder as server.py, this image will be used as the bot's profile picture. Similarly, `img_me.png` or `img_me.jpg` will be used as your profile picture. |
+| `--model MODEL` | Name of the model to load by default. |
+| `--lora LORA` | Name of the LoRA to apply to the model by default. |
+| `--model-dir MODEL_DIR` | Path to directory with all the models |
+| `--lora-dir LORA_DIR` | Path to directory with all the loras |
+| `--no-stream` | Don't stream the text output in real time. |
+| `--settings SETTINGS_FILE` | Load the default interface settings from this json file. See `settings-template.json` for an example. If you create a file called `settings.json`, this file will be loaded by default without the need to use the `--settings` flag.|
+| `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. |
+| `--verbose` | Print the prompts to the terminal. |
 | `--cpu` | Use the CPU to generate text.|
+| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
+| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maxmimum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. You can also set values in MiB like `--gpu-memory 3500MiB`. |
+| `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.|
+| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
+| `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. |
 | `--load-in-8bit` | Load the model with 8-bit precision.|
+| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
+| `--no-cache` | Set `use_cache` to False while generating text. This reduces the VRAM usage a bit with a performance cost. |
+| `--threads` | Number of threads to use in llama.cpp. |
 | `--wbits WBITS` | GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. |
 | `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. |
 | `--groupsize GROUPSIZE` | GPTQ: Group size. |
 | `--pre_layer PRE_LAYER` | GPTQ: The number of layers to preload. |
-| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. |
-| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU.|
-| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. |
-| `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. |
-| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maxmimum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. You can also set values in MiB like `--gpu-memory 3500MiB`. |
-| `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Must be an integer number. Defaults to 99.|
-| `--no-cache` | Set `use_cache` to False while generating text. This reduces the VRAM usage a bit with a performance cost. |
 | `--flexgen` | Enable the use of FlexGen offloading. |
 | `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). |
 | `--compress-weight` | FlexGen: Whether to compress weight (default: False).|
@@ -202,12 +210,6 @@ Optionally, you can use the following command-line flags:
 | `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. |
 | `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". |
 | `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. |
-| `--no-stream` | Don't stream the text output in real time. |
-| `--settings SETTINGS_FILE` | Load the default interface settings from this json file. See `settings-template.json` for an example. If you create a file called `settings.json`, this file will be loaded by default without the need to use the `--settings` flag.|
-| `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. |
-| `--model-dir MODEL_DIR` | Path to directory with all the models |
-| `--lora-dir LORA_DIR` | Path to directory with all the loras |
-| `--verbose` | Print the prompts to the terminal. |
 | `--listen` | Make the web UI reachable from your local network. |
 | `--listen-port LISTEN_PORT` | The listening port that the server will use. |
 | `--share` | Create a public URL. This is useful for running the web UI on Google Colab or similar. |
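Editorial note, not part of the patch: the reordered offloading flags above (`--auto-devices`, `--gpu-memory`, `--cpu-memory`, `--disk-cache-dir`) broadly correspond to the device-map/max-memory machinery in transformers and accelerate. The following is a hypothetical sketch of that correspondence, not code from this repository; the model name and memory limits are placeholders.

```python
# Hypothetical sketch: per-device limits in the style of
# `--gpu-memory 10 5 --cpu-memory 99 --disk-cache-dir cache`.
from transformers import AutoModelForCausalLM

max_memory = {
    0: "10GiB",      # first GPU
    1: "5GiB",       # second GPU
    "cpu": "99GiB",  # RAM available for offloaded weights
}

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-1.3b",       # placeholder model
    device_map="auto",         # let accelerate split layers across devices
    max_memory=max_memory,
    offload_folder="cache",    # where remaining layers spill to disk
)
```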
diff --git a/download-model.py b/download-model.py
index 7e5f61b2..0f40ab50 100644
--- a/download-model.py
+++ b/download-model.py
@@ -9,6 +9,7 @@ python download-model.py facebook/opt-1.3b
 import argparse
 import base64
 import datetime
+import hashlib
 import json
 import re
 import sys
@@ -24,11 +25,28 @@ parser.add_argument('--branch', type=str, default='main', help='Name of the Git
 parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.')
 parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
 parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.')
+parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
+parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.')
 args = parser.parse_args()

 def get_file(url, output_folder):
-    r = requests.get(url, stream=True)
-    with open(output_folder / Path(url.rsplit('/', 1)[1]), 'wb') as f:
+    filename = Path(url.rsplit('/', 1)[1])
+    output_path = output_folder / filename
+    if output_path.exists() and not args.clean:
+        # Check if the file has already been downloaded completely
+        r = requests.get(url, stream=True)
+        total_size = int(r.headers.get('content-length', 0))
+        if output_path.stat().st_size >= total_size:
+            return
+        # Otherwise, resume the download from where it left off
+        headers = {'Range': f'bytes={output_path.stat().st_size}-'}
+        mode = 'ab'
+    else:
+        headers = {}
+        mode = 'wb'
+
+    r = requests.get(url, stream=True, headers=headers)
+    with open(output_path, mode) as f:
         total_size = int(r.headers.get('content-length', 0))
         block_size = 1024
         with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t:
@@ -97,6 +115,7 @@ def get_download_links_from_huggingface(model, branch):
     classifications = []
     has_pytorch = False
     has_pt = False
+    has_ggml = False
     has_safetensors = False
     is_lora = False
     while True:
@@ -114,6 +133,7 @@ def get_download_links_from_huggingface(model, branch):
             is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname)
             is_safetensors = re.match(".*\.safetensors", fname)
             is_pt = re.match(".*\.pt", fname)
+            is_ggml = re.match("ggml.*\.bin", fname)
             is_tokenizer = re.match("tokenizer.*\.model", fname)
             is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer

@@ -135,6 +155,9 @@ def get_download_links_from_huggingface(model, branch):
                     elif is_pt:
                         has_pt = True
                         classifications.append('pt')
+                    elif is_ggml:
+                        has_ggml = True
+                        classifications.append('ggml')

         cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
         cursor = base64.b64encode(cursor)
@@ -149,7 +172,7 @@ def get_download_links_from_huggingface(model, branch):
     return links, sha256, is_lora

 def download_files(file_list, output_folder, num_threads=8):
-    thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads)
+    thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads, disable=True)

 if __name__ == '__main__':
     model = args.MODEL
@@ -179,22 +202,48 @@ if __name__ == '__main__':
     output_folder = f"{'_'.join(model.split('/')[-2:])}"
     if branch != 'main':
         output_folder += f'_{branch}'
-
-    # Creating the folder and writing the metadata
     output_folder = Path(base_folder) / output_folder
-    if not output_folder.exists():
-        output_folder.mkdir()
-    with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
-        f.write(f'url: https://huggingface.co/{model}\n')
-        f.write(f'branch: {branch}\n')
-        f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
-        sha256_str = ''
-        for i in range(len(sha256)):
-            sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n'
-        if sha256_str != '':
-            f.write(f'sha256sum:\n{sha256_str}')
-
-    # Downloading the files
-    print(f"Downloading the model to {output_folder}")
-    download_files(links, output_folder, args.threads)
-    print()
+    if args.check:
+        # Validate the checksums
+        validated = True
+        for i in range(len(sha256)):
+            fpath = (output_folder / sha256[i][0])
+
+            if not fpath.exists():
+                print(f"The following file is missing: {fpath}")
+                validated = False
+                continue
+
+            with open(output_folder / sha256[i][0], "rb") as f:
+                bytes = f.read()
+                file_hash = hashlib.sha256(bytes).hexdigest()
+                if file_hash != sha256[i][1]:
+                    print(f'Checksum failed: {sha256[i][0]} {sha256[i][1]}')
+                    validated = False
+                else:
+                    print(f'Checksum validated: {sha256[i][0]} {sha256[i][1]}')
+
+        if validated:
+            print('[+] Validated checksums of all model files!')
+        else:
+            print('[-] Invalid checksums. Rerun download-model.py with the --clean flag.')
+
+    else:
+
+        # Creating the folder and writing the metadata
+        if not output_folder.exists():
+            output_folder.mkdir()
+
+        with open(output_folder / 'huggingface-metadata.txt', 'w') as f:
+            f.write(f'url: https://huggingface.co/{model}\n')
+            f.write(f'branch: {branch}\n')
+            f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n')
+            sha256_str = ''
+            for i in range(len(sha256)):
+                sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n'
+            if sha256_str != '':
+                f.write(f'sha256sum:\n{sha256_str}')
+
+        # Downloading the files
+        print(f"Downloading the model to {output_folder}")
+        download_files(links, output_folder, args.threads)
\ No newline at end of file
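Editorial note, not part of the patch: the `download-model.py` changes above add two behaviours, resuming interrupted downloads with an HTTP `Range` request (skipped when `--clean` is passed) and verifying downloaded files against their SHA-256 checksums (`--check`). Below is a self-contained sketch of the same pattern; the function names `resume_download` and `verify_sha256` are illustrative and do not exist in the repository.

```python
# Standalone sketch of the resume-and-verify pattern used by the patch above.
import hashlib
from pathlib import Path

import requests


def resume_download(url: str, output_path: Path) -> None:
    """Download url to output_path, resuming a partial file if one exists."""
    headers = {}
    mode = 'wb'
    if output_path.exists():
        local_size = output_path.stat().st_size
        remote_size = int(requests.head(url, allow_redirects=True).headers.get('content-length', 0))
        if local_size >= remote_size:
            return  # already complete
        headers = {'Range': f'bytes={local_size}-'}  # request only the missing tail
        mode = 'ab'

    with requests.get(url, stream=True, headers=headers) as r, open(output_path, mode) as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)


def verify_sha256(path: Path, expected_hex: str) -> bool:
    """Compare the file's SHA-256 digest against the expected hex string."""
    return hashlib.sha256(path.read_bytes()).hexdigest() == expected_hex
```

Like the patch, this verifier reads the whole file into memory before hashing; for multi-gigabyte weights, hashing in fixed-size chunks would keep memory use flat.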
diff --git a/extensions/gallery/script.py b/extensions/gallery/script.py
index fbf23bc9..c17d69ee 100644
--- a/extensions/gallery/script.py
+++ b/extensions/gallery/script.py
@@ -2,19 +2,29 @@ from pathlib import Path

 import gradio as gr

+from modules.chat import load_character
 from modules.html_generator import get_image_cache
+from modules.shared import gradio, settings


-def generate_html():
+def generate_css():
     css = """
-    .character-gallery {
+    .character-gallery > .gallery {
         margin: 1rem 0;
-        display: grid;
+        display: grid !important;
         grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
         grid-column-gap: 0.4rem;
         grid-row-gap: 1.2rem;
     }

+    .character-gallery > .label {
+        display: none !important;
+    }
+
+    .character-gallery button.gallery-item {
+        display: contents;
+    }
+
     .character-container {
         cursor: pointer;
         text-align: center;
@@ -45,14 +55,16 @@ def generate_html():
         overflow-wrap: anywhere;
     }
     """
+    return css

-    container_html = f'