diff --git a/README.md b/README.md
index ad2ad1ed..8ceb077c 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Features
 
 * 3 interface modes: default, notebook, and chat
-* Multiple model backends: transformers, llama.cpp, ExLlama, AutoGPTQ, GPTQ-for-LLaMa
+* Multiple model backends: transformers, llama.cpp, ExLlama, AutoGPTQ, GPTQ-for-LLaMa, ctransformers
 * Dropdown menu for quickly switching between different models
 * LoRA: load and unload LoRAs on the fly, train a new LoRA
 * Precise instruction templates for chat mode, including Llama 2, Alpaca, Vicuna, WizardLM, StableLM, and many others
diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py
new file mode 100644
index 00000000..b3d001d3
--- /dev/null
+++ b/modules/ctransformers_model.py
@@ -0,0 +1,76 @@
+from ctransformers import AutoConfig, AutoModelForCausalLM
+
+from modules import shared
+from modules.callbacks import Iteratorize
+from modules.logging_colors import logger
+
+
+class CtransformersModel:
+    def __init__(self):
+        pass
+
+    @classmethod
+    def from_pretrained(self, path):
+        result = self()
+
+        # ctransformers uses -1 for random seed
+        config = AutoConfig.from_pretrained(
+            str(path),
+            threads=shared.args.threads,
+            gpu_layers=shared.args.n_gpu_layers,
+            batch_size=shared.args.n_batch,
+            stream=True,
+            seed=(-1 if shared.args.llama_cpp_seed == 0 else shared.args.llama_cpp_seed)
+        )
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+            str(result.model_dir(path) if result.model_type_is_auto() else path),
+            model_type=(None if result.model_type_is_auto() else shared.args.model_type),
+            config=config
+        )
+
+        logger.info(f'Using ctransformers model_type: {self.model.model_type} for {self.model.model_path}')
+        return result, result
+
+    def model_type_is_auto(self):
+        return shared.args.model_type == "Auto" or shared.args.model_type == "None"
+
+    def model_dir(self, path):
+        if path.is_file():
+            return path.parent
+
+        return path
+
+    def encode(self, string, **kwargs):
+        return self.model.tokenize(string)
+
+    def decode(self, ids):
+        return self.model.detokenize(ids)
+
+    def generate(self, prompt, state, callback=None):
+        prompt = prompt if type(prompt) is str else prompt.decode()
+        generator = self.model._stream(
+            prompt=prompt,
+            max_new_tokens=state['max_new_tokens'],
+            temperature=state['temperature'],
+            top_p=state['top_p'],
+            top_k=state['top_k'],
+            repetition_penalty=state['repetition_penalty'],
+            threads=shared.args.threads
+        )
+
+        output = ""
+        for token in generator:
+            if callback:
+                callback(token)
+
+            output += token
+
+        return output
+
+    def generate_with_streaming(self, *args, **kwargs):
+        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
+            reply = ''
+            for token in generator:
+                reply += token
+                yield reply
diff --git a/modules/loaders.py b/modules/loaders.py
index 07bc455c..fa5e03c2 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -1,10 +1,43 @@
 import functools
+from collections import OrderedDict
 
 import gradio as gr
 
 from modules import shared
 
-loaders_and_params = {
+loaders_and_params = OrderedDict({
+    'Transformers': [
+        'cpu_memory',
+        'gpu_memory',
+        'trust_remote_code',
+        'load_in_8bit',
+        'bf16',
+        'cpu',
+        'disk',
+        'auto_devices',
+        'load_in_4bit',
+        'use_double_quant',
+        'quant_type',
+        'compute_dtype',
+        'trust_remote_code',
+        'alpha_value',
+        'compress_pos_emb',
+        'transformers_info'
+    ],
+    'ExLlama_HF': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'compress_pos_emb',
+        'exllama_HF_info',
+    ],
+    'ExLlama': [
+        'gpu_split',
+        'max_seq_len',
+        'alpha_value',
+        'compress_pos_emb',
+        'exllama_info',
+    ],
     'AutoGPTQ': [
         'triton',
         'no_inject_fused_attention',
@@ -59,39 +92,15 @@ loaders_and_params = {
         'cpu',
         'llamacpp_HF_info',
     ],
-    'Transformers': [
-        'cpu_memory',
-        'gpu_memory',
-        'trust_remote_code',
-        'load_in_8bit',
-        'bf16',
-        'cpu',
-        'disk',
-        'auto_devices',
-        'load_in_4bit',
-        'use_double_quant',
-        'quant_type',
-        'compute_dtype',
-        'trust_remote_code',
-        'alpha_value',
-        'compress_pos_emb',
-        'transformers_info'
-    ],
-    'ExLlama': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'compress_pos_emb',
-        'exllama_info',
-    ],
-    'ExLlama_HF': [
-        'gpu_split',
-        'max_seq_len',
-        'alpha_value',
-        'compress_pos_emb',
-        'exllama_HF_info',
+    'ctransformers': [
+        'n_ctx',
+        'n_gpu_layers',
+        'n_batch',
+        'threads',
+        'model_type',
+        'llama_cpp_seed',
     ],
-}
+})
 
 loaders_samplers = {
     'Transformers': {
@@ -256,6 +265,33 @@ loaders_samplers = {
         'skip_special_tokens',
         'auto_max_new_tokens',
     },
+    'ctransformers': {
+        'temperature',
+        'top_p',
+        'top_k',
+        'repetition_penalty',
+    }
+}
+
+loaders_model_types = {
+    'GPTQ-for-LLaMa': [
+        "None",
+        "llama",
+        "opt",
+        "gptj"
+    ],
+    'ctransformers': [
+        "None",
+        "gpt2",
+        "gptj",
+        "gptneox",
+        "llama",
+        "mpt",
+        "dollyv2",
+        "replit",
+        "starcoder",
+        "falcon"
+    ],
 }
 
 
@@ -277,6 +313,13 @@ def blacklist_samplers(loader):
     return [gr.update(visible=True) if sampler in loaders_samplers[loader] else gr.update(visible=False) for sampler in all_samplers]
 
 
+def get_model_types(loader):
+    if loader in loaders_model_types:
+        return loaders_model_types[loader]
+
+    return ["None"]
+
+
 def get_gpu_memory_keys():
     return [k for k in shared.gradio if k.startswith('gpu_memory')]
 
diff --git a/modules/models.py b/modules/models.py
index aad142c1..d60aecd0 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -58,7 +58,8 @@ def load_model(model_name, loader=None):
         'llamacpp_HF': llamacpp_HF_loader,
         'RWKV': RWKV_loader,
         'ExLlama': ExLlama_loader,
-        'ExLlama_HF': ExLlama_HF_loader
+        'ExLlama_HF': ExLlama_HF_loader,
+        'ctransformers': ctransformers_loader,
     }
 
     p = Path(model_name)
@@ -242,7 +243,7 @@ def llamacpp_loader(model_name):
     else:
         model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin'))[0]
 
-    logger.info(f"llama.cpp weights detected: {model_file}\n")
+    logger.info(f"llama.cpp weights detected: {model_file}")
     model, tokenizer = LlamaCppModel.from_pretrained(model_file)
     return model, tokenizer
 
@@ -268,6 +269,24 @@ def llamacpp_HF_loader(model_name):
     return model, tokenizer
 
 
+def ctransformers_loader(model_name):
+    from modules.ctransformers_model import CtransformersModel
+
+    path = Path(f'{shared.args.model_dir}/{model_name}')
+    ctrans = CtransformersModel()
+    if ctrans.model_type_is_auto():
+        model_file = path
+    else:
+        if path.is_file():
+            model_file = path
+        else:
+            model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.bin'))[0]
+
+    logger.info(f'ctransformers weights detected: {model_file}')
+    model, tokenizer = ctrans.from_pretrained(model_file)
+    return model, tokenizer
+
+
 def GPTQ_loader(model_name):
 
     # Monkey patch
diff --git a/modules/shared.py b/modules/shared.py
index 224fa6aa..cb6f0ae1 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -215,6 +215,8 @@ def fix_loader_name(name):
         return 'ExLlama'
     elif name in ['exllama-hf', 'exllama_hf', 'exllama hf', 'ex-llama-hf', 'ex_llama_hf']:
         return 'ExLlama_HF'
+    elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
+        return 'ctransformers'
 
 
 def add_extension(name):
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 6e95414b..30e81355 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -41,7 +41,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False):
             yield ''
             return
 
-        if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']:
+        if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel']:
             generate_func = generate_reply_custom
         else:
             generate_func = generate_reply_HF
@@ -90,7 +90,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False):
 
 
 def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_length=None):
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel']:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'CtransformersModel']:
         input_ids = shared.tokenizer.encode(str(prompt))
         input_ids = np.array(input_ids).reshape(1, len(input_ids))
     else:
@@ -104,7 +104,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel', 'CtransformersModel'] or shared.args.cpu:
         return input_ids
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 0c1042f6..7b852a44 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -63,7 +63,7 @@ def create_ui():
 
         with gr.Row():
             with gr.Column():
-                shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=["Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value=None)
+                shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None)
                 with gr.Box():
                     with gr.Row():
                         with gr.Column():
@@ -84,7 +84,7 @@ def create_ui():
 
                             shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")
                             shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None")
-                            shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None")
+                            shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None")
                             shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                             shared.gradio['autogptq_info'] = gr.Markdown('* ExLlama_HF is recommended over AutoGPTQ for models derived from LLaMA.')
                             shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
@@ -127,7 +127,9 @@ def create_ui():
 
 
 def create_event_handlers():
-    shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params()))
+    shared.gradio['loader'].change(
+        loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then(
+        lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type'))
 
     # In this event handler, the interface state is read and updated
     # with the model defaults (if any), and then the model is loaded
diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py
index 75bce9b1..4b9fb918 100644
--- a/modules/ui_parameters.py
+++ b/modules/ui_parameters.py
@@ -16,7 +16,7 @@ def create_ui(default_preset):
                     shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button')
 
             with gr.Column():
-                shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All", "Transformers", "ExLlama_HF", "ExLlama", "AutoGPTQ", "GPTQ-for-LLaMa", "llama.cpp", "llamacpp_HF"], value="All", elem_classes='slim-dropdown')
+                shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown')
 
     with gr.Row():
         with gr.Column():
diff --git a/requirements.txt b/requirements.txt
index b27e14c5..ec6a7e47 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -40,3 +40,6 @@ https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/text
 # GPTQ-for-LLaMa
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
+
+# ctransformers
+https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.20+cu117-py3-none-any.whl
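
For reference, below is a minimal standalone sketch of the ctransformers calls that CtransformersModel wraps in this patch (AutoConfig.from_pretrained, AutoModelForCausalLM.from_pretrained, tokenize/detokenize, and the _stream generator). The model file path, the model_type value, and the sampling settings are illustrative placeholders rather than part of the patch:

# Standalone sketch of the ctransformers usage wrapped by CtransformersModel above.
from ctransformers import AutoConfig, AutoModelForCausalLM

model_file = 'models/llama-2-7b.ggmlv3.q4_K_M.bin'  # hypothetical local GGML file

config = AutoConfig.from_pretrained(
    model_file,
    threads=8,        # shared.args.threads in the web UI
    gpu_layers=0,     # shared.args.n_gpu_layers
    batch_size=512,   # shared.args.n_batch
    stream=True,
    seed=-1           # -1 means random seed, as noted in CtransformersModel
)
model = AutoModelForCausalLM.from_pretrained(model_file, model_type='llama', config=config)

# Token ID round-trip, as used by encode()/decode()
ids = model.tokenize('Hello')
print(model.detokenize(ids))

# Streaming generation, mirroring CtransformersModel.generate()
output = ''
for token in model._stream(
        prompt='Write one sentence about GGML quantization.',
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        top_k=40,
        repetition_penalty=1.1,
        threads=8):
    output += token

print(output)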