diff --git a/README.md b/README.md index 9d2e1b00..24c04711 100644 --- a/README.md +++ b/README.md @@ -212,7 +212,7 @@ Optionally, you can use the following command-line flags: | Flag | Description | |--------------------------------------------|-------------| -| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, llamacpp, rwkv, flexgen | +| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen | #### Accelerate/transformers diff --git a/characters/instruction-following/Starchat-Beta.yaml b/characters/instruction-following/Starchat-Beta.yaml new file mode 100644 index 00000000..2af4ee6b --- /dev/null +++ b/characters/instruction-following/Starchat-Beta.yaml @@ -0,0 +1,4 @@ +user: "<|user|>" +bot: "<|assistant|>" +context: "<|system|>\n<|end|>\n" +turn_template: "<|user|>\n<|user-message|><|end|>\n<|bot|>\n<|bot-message|><|end|>\n" diff --git a/characters/instruction-following/Tulu.yaml b/characters/instruction-following/Tulu.yaml new file mode 100644 index 00000000..13dd14f9 --- /dev/null +++ b/characters/instruction-following/Tulu.yaml @@ -0,0 +1,4 @@ +user: "<|user|>" +bot: "<|assistant|>" +context: "" +turn_template: "<|user|>\n<|user-message|>\n<|bot|>\n<|bot-message|>\n" diff --git a/css/chat.css b/css/chat.css index f8cb765a..fcf19ee0 100644 --- a/css/chat.css +++ b/css/chat.css @@ -93,3 +93,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .message-body :not(pre) > code { white-space: normal !important; } + +@media print { + body { + visibility: hidden; + } + + .chat { + visibility: visible; + position: absolute; + left: 0; + top: 0; + max-width: none; + max-height: none; + width: 100%; + height: fit-content; + display: flex; + flex-direction: column-reverse; + } +} diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 132d4dd4..575281b1 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -17,6 +17,10 @@ margin-bottom: 1.25em !important; } +.message-body ul, .message-body ol { + margin-bottom: 1.25em !important; +} + .dark .message-body p em { color: rgb(198, 202, 214) !important; } diff --git a/docker/Dockerfile b/docker/Dockerfile index be6fcfc2..7cc0ff15 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -26,7 +26,7 @@ LABEL maintainer="Your Name " LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI" RUN apt-get update && \ - apt-get install --no-install-recommends -y libportaudio2 libasound-dev git python3 python3-pip make g++ && \ + apt-get install --no-install-recommends -y python3-dev libportaudio2 libasound-dev git python3 python3-pip make g++ && \ rm -rf /var/lib/apt/lists/* RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv diff --git a/download-model.py b/download-model.py index 540f94c6..e04aedc3 100644 --- a/download-model.py +++ b/download-model.py @@ -1,5 +1,5 @@ ''' -Downloads models from Hugging Face to models/model-name. +Downloads models from Hugging Face to models/username_modelname. 
Example: python download-model.py facebook/opt-1.3b @@ -11,8 +11,8 @@ import base64 import datetime import hashlib import json -import re import os +import re import sys from pathlib import Path @@ -21,63 +21,12 @@ import tqdm from tqdm.contrib.concurrent import thread_map -def select_model_from_default_options(): - models = { - "OPT 6.7B": ("facebook", "opt-6.7b", "main"), - "OPT 2.7B": ("facebook", "opt-2.7b", "main"), - "OPT 1.3B": ("facebook", "opt-1.3b", "main"), - "OPT 350M": ("facebook", "opt-350m", "main"), - "GALACTICA 6.7B": ("facebook", "galactica-6.7b", "main"), - "GALACTICA 1.3B": ("facebook", "galactica-1.3b", "main"), - "GALACTICA 125M": ("facebook", "galactica-125m", "main"), - "Pythia-6.9B-deduped": ("EleutherAI", "pythia-6.9b-deduped", "main"), - "Pythia-2.8B-deduped": ("EleutherAI", "pythia-2.8b-deduped", "main"), - "Pythia-1.4B-deduped": ("EleutherAI", "pythia-1.4b-deduped", "main"), - "Pythia-410M-deduped": ("EleutherAI", "pythia-410m-deduped", "main"), - } - - choices = {} - print("Select the model that you want to download:\n") - for i, name in enumerate(models): - char = chr(ord('A') + i) - choices[char] = name - print(f"{char}) {name}") - - char_hugging = chr(ord('A') + len(models)) - print(f"{char_hugging}) Manually specify a Hugging Face model") - char_exit = chr(ord('A') + len(models) + 1) - print(f"{char_exit}) Do not download a model") - print() - print("Input> ", end='') - choice = input()[0].strip().upper() - if choice == char_exit: - exit() - elif choice == char_hugging: - print("""\nType the name of your desired Hugging Face model in the format organization/name. - -Examples: -facebook/opt-1.3b -EleutherAI/pythia-1.4b-deduped -""") - - print("Input> ", end='') - model = input() - branch = "main" - else: - arr = models[choices[choice]] - model = f"{arr[0]}/{arr[1]}" - branch = arr[2] - - return model, branch - - class ModelDownloader: def __init__(self): self.s = requests.Session() if os.getenv('HF_USER') is not None and os.getenv('HF_PASS') is not None: self.s.auth = (os.getenv('HF_USER'), os.getenv('HF_PASS')) - def sanitize_model_and_branch_names(self, model, branch): if model[-1] == '/': model = model[:-1] @@ -92,7 +41,6 @@ class ModelDownloader: return model, branch - def get_download_links_from_huggingface(self, model, branch, text_only=False): base = "https://huggingface.co" page = f"/api/models/{model}/tree/{branch}" @@ -163,7 +111,6 @@ class ModelDownloader: return links, sha256, is_lora - def get_output_folder(self, model, branch, is_lora, base_folder=None): if base_folder is None: base_folder = 'models' if not is_lora else 'loras' @@ -171,59 +118,64 @@ class ModelDownloader: output_folder = f"{'_'.join(model.split('/')[-2:])}" if branch != 'main': output_folder += f'_{branch}' + output_folder = Path(base_folder) / output_folder return output_folder - def get_single_file(self, url, output_folder, start_from_scratch=False): filename = Path(url.rsplit('/', 1)[1]) output_path = output_folder / filename + headers = {} + mode = 'wb' if output_path.exists() and not start_from_scratch: + # Check if the file has already been downloaded completely r = self.s.get(url, stream=True, timeout=20) total_size = int(r.headers.get('content-length', 0)) if output_path.stat().st_size >= total_size: return + # Otherwise, resume the download from where it left off headers = {'Range': f'bytes={output_path.stat().st_size}-'} mode = 'ab' - else: - headers = {} - mode = 'wb' - r = self.s.get(url, stream=True, headers=headers, timeout=20) - with open(output_path, mode) as 
f: + with self.s.get(url, stream=True, headers=headers, timeout=20) as r: + r.raise_for_status() # Do not continue the download if the request was unsuccessful total_size = int(r.headers.get('content-length', 0)) - block_size = 1024 - with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: - for data in r.iter_content(block_size): - t.update(len(data)) - f.write(data) - + block_size = 1024 * 1024 # 1MB + with open(output_path, mode) as f: + with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: + count = 0 + for data in r.iter_content(block_size): + t.update(len(data)) + f.write(data) + if total_size != 0 and self.progress_bar is not None: + count += len(data) + self.progress_bar(float(count) / float(total_size), f"Downloading {filename}") def start_download_threads(self, file_list, output_folder, start_from_scratch=False, threads=1): thread_map(lambda url: self.get_single_file(url, output_folder, start_from_scratch=start_from_scratch), file_list, max_workers=threads, disable=True) + def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=1): + self.progress_bar = progress_bar - def download_model_files(self, model, branch, links, sha256, output_folder, start_from_scratch=False, threads=1): # Creating the folder and writing the metadata - if not output_folder.exists(): - output_folder.mkdir(parents=True, exist_ok=True) - with open(output_folder / 'huggingface-metadata.txt', 'w') as f: - f.write(f'url: https://huggingface.co/{model}\n') - f.write(f'branch: {branch}\n') - f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n') - sha256_str = '' - for i in range(len(sha256)): - sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n' - if sha256_str != '': - f.write(f'sha256sum:\n{sha256_str}') + output_folder.mkdir(parents=True, exist_ok=True) + metadata = f'url: https://huggingface.co/{model}\n' \ + f'branch: {branch}\n' \ + f'download date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n' + + sha256_str = '\n'.join([f' {item[1]} {item[0]}' for item in sha256]) + if sha256_str: + metadata += f'sha256sum:\n{sha256_str}' + + metadata += '\n' + (output_folder / 'huggingface-metadata.txt').write_text(metadata) # Downloading the files print(f"Downloading the model to {output_folder}") self.start_download_threads(links, output_folder, start_from_scratch=start_from_scratch, threads=threads) - def check_model_files(self, model, branch, links, sha256, output_folder): # Validate the checksums validated = True @@ -264,8 +216,6 @@ if __name__ == '__main__': branch = args.branch model = args.MODEL - if model is None: - model, branch = select_model_from_default_options() downloader = ModelDownloader() # Cleaning up the model/branch names diff --git a/models/config.yaml b/models/config.yaml index 715bbf71..318b0822 100644 --- a/models/config.yaml +++ b/models/config.yaml @@ -50,7 +50,7 @@ llama-65b-gptq-3bit: .*vicuna.*v0: mode: 'instruct' instruction_template: 'Vicuna-v0' -.*vicuna.*(1.1|1_1): +.*vicuna.*(1.1|1_1|1.3|1_3): mode: 'instruct' instruction_template: 'Vicuna-v1.1' .*wizard.*vicuna: @@ -184,7 +184,7 @@ llama-65b-gptq-3bit: .*Nous-Hermes-13b: mode: 'instruct' instruction_template: 'Alpaca' -.*airoboros-13b-gpt4: +.*airoboros: mode: 'instruct' instruction_template: 'Vicuna-v1.1' .*WizardLM-30B-V1.0: @@ -193,7 +193,7 @@ 
llama-65b-gptq-3bit: TheBloke_WizardLM-30B-GPTQ: mode: 'instruct' instruction_template: 'Vicuna-v1.1' -.*(A|a)lpa(cino|sta): +.*alpa(cino|sta): mode: 'instruct' instruction_template: 'Alpaca' .*hippogriff: @@ -202,9 +202,33 @@ TheBloke_WizardLM-30B-GPTQ: .*gpt4all-.*-snoozy: mode: 'instruct' instruction_template: 'WizardLM' -.*(L|l)azarus: +.*lazarus: mode: 'instruct' instruction_template: 'Alpaca' -.*(G|g)uanaco-.*(7|13|33|65)(b|B): +.*guanaco-.*(7|13|33|65)b: mode: 'instruct' instruction_template: 'Guanaco' +.*hypermantis: + mode: 'instruct' + instruction_template: 'Alpaca' +.*open-llama-.*-open-instruct: + mode: 'instruct' + instruction_template: 'Alpaca' +.*starcoder-gpteacher-code-instruct: + mode: 'instruct' + instruction_template: 'Alpaca' +.*tulu: + mode: 'instruct' + instruction_template: 'Tulu' +.*chronos: + mode: 'instruct' + instruction_template: 'Alpaca' +.*samantha: + mode: 'instruct' + instruction_template: 'Samantha' +.*wizardcoder: + mode: 'instruct' + instruction_template: 'Alpaca' +.*starchat-beta: + mode: 'instruct' + instruction_template: 'Starchat-Beta' \ No newline at end of file diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py new file mode 100644 index 00000000..27cac374 --- /dev/null +++ b/modules/exllama_hf.py @@ -0,0 +1,82 @@ +import os +import sys +from pathlib import Path +from typing import * + +import torch +from transformers import ( + GenerationConfig, + LlamaTokenizer, + PretrainedConfig, + PreTrainedModel +) +from transformers.modeling_outputs import CausalLMOutputWithPast + +from modules import shared +from modules.logging_colors import logger +from modules.relative_imports import RelativeImport + +with RelativeImport("repositories/exllama"): + from model import ExLlama, ExLlamaCache, ExLlamaConfig + + +class ExllamaHF(PreTrainedModel): + def __init__(self, config: ExLlamaConfig): + super().__init__(PretrainedConfig()) + self.ex_config = config + self.ex_model = ExLlama(self.ex_config) + self.generation_config = GenerationConfig() + + def _validate_model_class(self): + pass + + def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): + pass + + def prepare_inputs_for_generation(self, input_ids, **kwargs): + return {'input_ids': input_ids, **kwargs} + + @property + def device(self) -> torch.device: + # TODO: May cause problem on multi-gpu inference? 
+ return torch.device(0) + + def __call__(self, *args, **kwargs): + # TODO: Some decoding methods (such as Contrastive Search) may not work at this time + assert len(args) == 0, 'no *args should be passed to forward' + use_cache = kwargs['use_cache'] + seq = kwargs['input_ids'][0].tolist() + cache = kwargs['past_key_values'] if 'past_key_values' in kwargs else None + if cache is None: + cache = ExLlamaCache(self.ex_model) + self.ex_model.forward(torch.tensor([seq[:-1]], dtype=torch.long), cache, preprocess_only=True) + logits = self.ex_model.forward(torch.tensor([seq[-1:]], dtype=torch.long), cache).to(self.device) + return CausalLMOutputWithPast(logits=logits, past_key_values=cache if use_cache else None) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported" + if isinstance(pretrained_model_name_or_path, str): + pretrained_model_name_or_path = Path(pretrained_model_name_or_path) + + pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path) + config = ExLlamaConfig(pretrained_model_name_or_path / 'config.json') + + # from 'oobabooga/text-generation-webui/modules/exllama.py' + weight_path = None + for ext in ['.safetensors', '.pt', '.bin']: + found = list(pretrained_model_name_or_path.glob(f"*{ext}")) + if len(found) > 0: + weight_path = found[-1] + break + assert weight_path is not None, f'could not find weight in "{pretrained_model_name_or_path}"' + + config.model_path = str(weight_path) + + # This slowes down a bit but align better with autogptq generation. + # TODO: Should give user choice to tune the exllama config + config.act_order = True + config.fused_attn = False + config.fused_mlp_thd = 0 + + return ExllamaHF(config) \ No newline at end of file diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py index 9f6122d9..10a852db 100644 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -52,9 +52,9 @@ class LlamaCppModel: 'n_gpu_layers': shared.args.n_gpu_layers } - self.model = Llama(**params) + result.model = Llama(**params) if cache_capacity > 0: - self.model.set_cache(LlamaCache(capacity_bytes=cache_capacity)) + result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity)) # This is ugly, but the model and the tokenizer are the same object in this library. 
return result, result diff --git a/modules/loaders.py b/modules/loaders.py index ac6f80bd..21642023 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -55,6 +55,10 @@ loaders_and_params = { 'ExLlama' : [ 'gpu_split', 'exllama_info', + ], + 'ExLlama_HF' : [ + 'gpu_split', + 'exllama_HF_info', ] } diff --git a/modules/models.py b/modules/models.py index 1aba66c5..574e1646 100644 --- a/modules/models.py +++ b/modules/models.py @@ -49,7 +49,8 @@ def load_model(model_name, loader=None): 'llama.cpp': llamacpp_loader, 'FlexGen': flexgen_loader, 'RWKV': RWKV_loader, - 'ExLlama': ExLlama_loader + 'ExLlama': ExLlama_loader, + 'ExLlama_HF': ExLlama_HF_loader } if loader is None: @@ -278,6 +279,12 @@ def ExLlama_loader(model_name): return model, tokenizer +def ExLlama_HF_loader(model_name): + from modules.exllama_hf import ExllamaHF + + return ExllamaHF.from_pretrained(model_name) + + def get_max_memory_dict(): max_memory = {} if shared.args.gpu_memory: diff --git a/modules/shared.py b/modules/shared.py index ecc03fc9..e065b76b 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -98,7 +98,7 @@ parser.add_argument('--extensions', type=str, nargs="+", help='The list of exten parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') # Model loader -parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, llamacpp, rwkv, flexgen') +parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen') # Accelerate/transformers parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. 
Warning: Training on CPU is extremely slow.') @@ -218,6 +218,8 @@ def fix_loader_name(name): return 'GPTQ-for-LLaMa' elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: return 'ExLlama' + elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']: + return 'ExLlama_HF' if args.loader is not None: diff --git a/modules/text_generation.py b/modules/text_generation.py index 0d2f55c2..d0965b8a 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -104,9 +104,8 @@ def get_reply_from_output_ids(output_ids, input_ids, original_question, state, i else: new_tokens = len(output_ids) - len(input_ids[0]) reply = decode(output_ids[-new_tokens:], state['skip_special_tokens']) - # Prevent LlamaTokenizer from skipping a space - if type(shared.tokenizer) is transformers.LlamaTokenizer and len(output_ids) > 0: + if type(shared.tokenizer) in [transformers.LlamaTokenizer, transformers.LlamaTokenizerFast] and len(output_ids) > 0: if shared.tokenizer.convert_ids_to_tokens(int(output_ids[-new_tokens])).startswith('▁'): reply = ' ' + reply diff --git a/modules/training.py b/modules/training.py index 75ba82ca..65f1668a 100644 --- a/modules/training.py +++ b/modules/training.py @@ -11,7 +11,7 @@ import gradio as gr import torch import transformers from datasets import Dataset, load_dataset -from peft import (LoraConfig, get_peft_model, prepare_model_for_int8_training, +from peft import (LoraConfig, get_peft_model, prepare_model_for_kbit_training, set_peft_model_state_dict) from modules import shared, ui, utils @@ -30,14 +30,17 @@ try: MODEL_CLASSES = {v: k for k, v in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES} except: standard_modules = ["q_proj", "v_proj"] - model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"]} + model_to_lora_modules = {"llama": standard_modules, "opt": standard_modules, "gptj": standard_modules, "gpt_neox": ["query_key_value"], "rw":["query_key_value"]} MODEL_CLASSES = { "LlamaForCausalLM": "llama", "OPTForCausalLM": "opt", "GPTJForCausalLM": "gptj", - "GPTNeoXForCausalLM": "gpt_neox" + "GPTNeoXForCausalLM": "gpt_neox", + "RWForCausalLM": "rw" + } +train_log = {} WANT_INTERRUPT = False PARAMETERS = ["lora_name", "always_override", "save_steps", "micro_batch_size", "batch_size", "epochs", "learning_rate", "lr_scheduler_type", "lora_rank", "lora_alpha", "lora_dropout", "cutoff_len", "dataset", "eval_dataset", "format", "eval_steps", "raw_text_file", "overlap_len", "newline_favor_len", "higher_rank_limit", "warmup_steps", "optimizer", "hard_cut_string", "train_only_after"] @@ -357,7 +360,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch # == Start prepping the model itself == if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'): logger.info("Getting model ready...") - prepare_model_for_int8_training(shared.model) + prepare_model_for_kbit_training(shared.model) logger.info("Prepping for training...") config = LoraConfig( @@ -406,12 +409,19 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch control.should_training_stop = True elif state.global_step > 0 and actual_save_steps > 0 and state.global_step % actual_save_steps == 0: lora_model.save_pretrained(f"{lora_file_path}/checkpoint-{tracked.current_steps}/") + # Save log + with open(f"{lora_file_path}/checkpoint-{tracked.current_steps}/training_log.json", 'w', encoding='utf-8') as file: + json.dump(train_log, file, indent=2) + 
def on_substep_end(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs): tracked.current_steps += 1 if WANT_INTERRUPT: control.should_epoch_stop = True control.should_training_stop = True + + def on_log(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, logs, **kwargs): + train_log.update(logs) trainer = transformers.Trainer( model=lora_model, @@ -448,7 +458,7 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch # == Save parameters for reuse == with open(f"{lora_file_path}/training_parameters.json", 'w', encoding='utf-8') as file: vars = locals() - json.dump({x: vars[x] for x in PARAMETERS}, file) + json.dump({x: vars[x] for x in PARAMETERS}, file, indent=2) # == Main run and monitor loop == logger.info("Starting training...") @@ -462,7 +472,9 @@ def do_train(lora_name: str, always_override: bool, save_steps: int, micro_batch # Note: save in the thread in case the gradio thread breaks (eg browser closed) lora_model.save_pretrained(lora_file_path) logger.info("LoRA training run is completed and saved.") - tracked.did_save = True + # Save log + with open(f"{lora_file_path}/training_log.json", 'w', encoding='utf-8') as file: + json.dump(train_log, file, indent=2) thread = threading.Thread(target=threaded_run) thread.start() diff --git a/requirements.txt b/requirements.txt index f149a5a4..c1a5a9a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,10 +17,10 @@ tqdm scipy transformers==4.30.2 git+https://github.com/huggingface/peft@03eb378eb914fbee709ff7c86ba5b1d033b89524 -bitsandbytes==0.39.0; platform_system != "Windows" -https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.39.0-py3-none-any.whl; platform_system == "Windows" -llama-cpp-python==0.1.62; platform_system != "Windows" -https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.62/llama_cpp_python-0.1.62-cp310-cp310-win_amd64.whl; platform_system == "Windows" +bitsandbytes==0.39.1; platform_system != "Windows" +https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl; platform_system == "Windows" +llama-cpp-python==0.1.64; platform_system != "Windows" +https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.64/llama_cpp_python-0.1.64-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.2.2/auto_gptq-0.2.2+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/jllllll/exllama/releases/download/0.0.1/exllama-0.0.1+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows" diff --git a/server.py b/server.py index 4198dd69..ff4f2c3f 100644 --- a/server.py +++ b/server.py @@ -122,7 +122,7 @@ def count_tokens(text): return 'Couldn\'t count the number of tokens. Is a tokenizer loaded?' 
-def download_model_wrapper(repo_id): +def download_model_wrapper(repo_id, progress=gr.Progress()): try: downloader_module = importlib.import_module("download-model") downloader = downloader_module.ModelDownloader() @@ -131,6 +131,7 @@ def download_model_wrapper(repo_id): branch = repo_id_parts[1] if len(repo_id_parts) > 1 else "main" check = False + progress(0.0) yield ("Cleaning up the model/branch names") model, branch = downloader.sanitize_model_and_branch_names(model, branch) @@ -141,13 +142,16 @@ def download_model_wrapper(repo_id): output_folder = downloader.get_output_folder(model, branch, is_lora) if check: + progress(0.5) yield ("Checking previously downloaded files") downloader.check_model_files(model, branch, links, sha256, output_folder) + progress(1.0) else: yield (f"Downloading files to {output_folder}") - downloader.download_model_files(model, branch, links, sha256, output_folder, threads=1) + downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=1) yield ("Done!") except: + progress(1.0) yield traceback.format_exc() @@ -193,7 +197,7 @@ def create_model_menus(): with gr.Row(): with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "llama.cpp"], value=None) + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "AutoGPTQ", "GPTQ-for-LLaMa", "ExLlama", "ExLlama_HF", "llama.cpp"], value=None) with gr.Box(): with gr.Row(): with gr.Column(): @@ -233,6 +237,7 @@ def create_model_menus(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Make sure to inspect the .py files inside the model folder before loading it with this option enabled.') shared.gradio['gptq_for_llama_info'] = gr.Markdown('GPTQ-for-LLaMa is currently 2x faster than AutoGPTQ on some systems. It is installed by default with the one-click installers. Otherwise, it has to be installed manually following the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/GPTQ-models-(4-bit-mode).md#installation-1).') shared.gradio['exllama_info'] = gr.Markdown('ExLlama has to be installed manually. See the instructions here: [instructions](https://github.com/oobabooga/text-generation-webui/blob/main/docs/ExLlama.md).') + shared.gradio['exllama_HF_info'] = gr.Markdown('ExLlama_HF is a wrapper that lets you use ExLlama like a Transformers model, which means it can use the Transformers samplers. It\'s still a bit buggy, so feel free to help out by fixing issues.\n\nCheck out PR [#2777](https://github.com/oobabooga/text-generation-webui/pull/2777) for more details.') with gr.Column(): with gr.Row(): @@ -276,7 +281,7 @@ def create_model_menus(): save_model_settings, [shared.gradio[k] for k in ['model_menu', 'interface_state']], shared.gradio['model_status'], show_progress=False) shared.gradio['lora_menu_apply'].click(load_lora_wrapper, shared.gradio['lora_menu'], shared.gradio['model_status'], show_progress=False) - shared.gradio['download_model_button'].click(download_model_wrapper, shared.gradio['custom_model_menu'], shared.gradio['model_status'], show_progress=False) + shared.gradio['download_model_button'].click(download_model_wrapper, shared.gradio['custom_model_menu'], shared.gradio['model_status'], show_progress=True) shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), shared.gradio['autoload_model'], load)
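
The new instruction templates (`characters/instruction-following/Starchat-Beta.yaml` and `Tulu.yaml`) are consumed by the webui's instruct-mode prompt builder. The snippet below is only an illustrative stand-in for that builder, showing how the `context` and `turn_template` fields of the StarChat-Beta template expand into a prompt; the example conversation is made up.

```python
# Simplified stand-in for the instruct-mode prompt builder (not webui code), shown to
# make the Starchat-Beta.yaml format concrete.
template = {
    'user': '<|user|>',
    'bot': '<|assistant|>',
    'context': '<|system|>\n<|end|>\n',
    'turn_template': '<|user|>\n<|user-message|><|end|>\n<|bot|>\n<|bot-message|><|end|>\n',
}


def render(history, template):
    prompt = template['context']
    for user_message, bot_message in history:
        turn = template['turn_template'].replace('<|user|>', template['user']).replace('<|bot|>', template['bot'])
        turn = turn.replace('<|user-message|>', user_message).replace('<|bot-message|>', bot_message)
        prompt += turn

    return prompt


history = [('Write a hello world program in C.', '#include <stdio.h>\nint main() { puts("Hello, world!"); return 0; }')]
print(render(history, template))
```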
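
The rewritten `ModelDownloader.get_single_file` streams each file in 1 MB blocks, sends a `Range` header to resume interrupted downloads, and stops early via `raise_for_status()` when the request fails. Here is a standalone sketch of that resume-and-stream pattern, with a placeholder URL and without the completeness check and Gradio progress callback that the real method also handles:

```python
# Standalone sketch of the resumable streaming download used by download-model.py.
# The URL in the usage example is a placeholder, not a value taken from the diff.
from pathlib import Path

import requests
import tqdm


def download_file(url, output_folder: Path, block_size=1024 * 1024):
    output_folder.mkdir(parents=True, exist_ok=True)
    output_path = output_folder / url.rsplit('/', 1)[1]

    headers = {}
    mode = 'wb'
    if output_path.exists():
        # Ask the server for only the bytes that are still missing and append to the file.
        headers = {'Range': f'bytes={output_path.stat().st_size}-'}
        mode = 'ab'

    with requests.get(url, stream=True, headers=headers, timeout=20) as r:
        r.raise_for_status()  # abort instead of writing an error page into the file
        total_size = int(r.headers.get('content-length', 0))
        with open(output_path, mode) as f, tqdm.tqdm(total=total_size, unit='iB', unit_scale=True) as t:
            for data in r.iter_content(block_size):
                t.update(len(data))
                f.write(data)


# Example (placeholder URL):
# download_file('https://huggingface.co/facebook/opt-1.3b/resolve/main/config.json',
#               Path('models/facebook_opt-1.3b'))
```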
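
The `models/config.yaml` additions map model-name patterns such as `.*tulu` and `.*starchat-beta` to instruction templates. The sketch below is a simplified stand-in for the webui's settings matcher; it assumes both the pattern and the model name are lowercased before matching, which is what the simplified patterns (for example `.*lazarus` replacing `.*(L|l)azarus`) rely on.

```python
# Simplified illustration of pattern-based template selection from models/config.yaml.
# Assumption: patterns and model names are compared in lowercase, and later matching
# entries override earlier ones.
import re

import yaml

with open('models/config.yaml') as f:
    model_config = yaml.safe_load(f)


def pick_instruction_template(model_name):
    template = None
    for pattern, settings in model_config.items():
        if re.match(pattern.lower(), model_name.lower()) and 'instruction_template' in settings:
            template = settings['instruction_template']

    return template


print(pick_instruction_template('TheBloke_tulu-30B-GPTQ'))  # Tulu
print(pick_instruction_template('starchat-beta-GPTQ'))      # Starchat-Beta
```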
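
`modules/exllama_hf.py` wraps ExLlama in a `PreTrainedModel` subclass so that generation can go through the standard Transformers code path and its samplers, which is what the new `ExLlama_HF` loader exposes. Below is a minimal usage sketch, not taken from the diff: it assumes the script is run from the webui root with `repositories/exllama` cloned, and the model folder name is a placeholder for a real GPTQ model under `models/`.

```python
# Hypothetical usage sketch for the ExLlama_HF wrapper: drive it with the regular
# Transformers generate() API. The model folder name and prompt are placeholders.
from transformers import LlamaTokenizer

from modules.exllama_hf import ExllamaHF

model_name = 'TheBloke_WizardLM-30B-GPTQ'  # placeholder folder under models/
model = ExllamaHF.from_pretrained(model_name)
tokenizer = LlamaTokenizer.from_pretrained(f'models/{model_name}')

input_ids = tokenizer.encode('Write a haiku about GPUs.', return_tensors='pt')

# Because ExllamaHF subclasses PreTrainedModel, the usual Transformers sampling
# parameters apply here instead of ExLlama's own sampler.
output_ids = model.generate(
    input_ids,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```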
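
The training changes collect every metrics dict reported by the Trainer into a module-level `train_log` through an `on_log` callback and dump it as `training_log.json` alongside the checkpoints and the finished LoRA. A minimal sketch of that pattern follows; the callback name, wiring, and output path are placeholders rather than code from the diff.

```python
# Minimal sketch of the train_log / on_log pattern added to modules/training.py.
import json

import transformers

train_log = {}


class LogCollector(transformers.TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        # The Trainer calls this with dicts like {'loss': ..., 'learning_rate': ..., 'epoch': ...}
        if logs is not None:
            train_log.update(logs)


def save_train_log(output_dir):
    with open(f"{output_dir}/training_log.json", 'w', encoding='utf-8') as file:
        json.dump(train_log, file, indent=2)


# Usage: pass LogCollector() via the callbacks= argument of transformers.Trainer, then call
# save_train_log() at each checkpoint and once more after training completes.
```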