diff --git a/README.md b/README.md
index 6b92448c..9f3e81bd 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 ## Features
 
 * 3 interface modes: default (two columns), notebook, and chat.
-* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [CTransformers](https://github.com/marella/ctransformers), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp).
+* Multiple model backends: [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp) (through [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)), [ExLlamaV2](https://github.com/turboderp/exllamav2), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa), [QuIP#](https://github.com/Cornell-RelaxML/quip-sharp).
 * Dropdown menu for quickly switching between different models.
 * Large number of extensions (built-in and user-contributed), including Coqui TTS for realistic voice outputs, Whisper STT for voice inputs, translation, [multimodal pipelines](https://github.com/oobabooga/text-generation-webui/tree/main/extensions/multimodal), vector databases, Stable Diffusion integration, and a lot more. See [the wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [the extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details.
 * [Chat with custom characters](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab#character).
@@ -221,7 +221,7 @@ List of command-line flags
 
 | Flag | Description |
 |--------------------------------------------|-------------|
-| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#. |
+| `--loader LOADER` | Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#. |
 
 #### Accelerate/transformers
 
@@ -308,12 +308,6 @@ List of command-line flags
 | `--checkpoint CHECKPOINT` | The path to the quantized checkpoint file. If not specified, it will be automatically detected. |
 | `--monkey-patch` | Apply the monkey patch for using LoRAs with quantized models. |
 
-#### ctransformers
-
-| Flag | Description |
-|-------------|-------------|
-| `--model_type MODEL_TYPE` | Model type of pre-quantized model. Currently gpt2, gptj, gptneox, falcon, llama, mpt, starcoder (gptbigcode), dollyv2, and replit are supported. |
-
 #### HQQ
 
 | Flag | Description |
diff --git a/docs/04 - Model Tab.md b/docs/04 - Model Tab.md
index 05b85b48..7c168e89 100644
--- a/docs/04 - Model Tab.md
+++ b/docs/04 - Model Tab.md
@@ -105,12 +105,6 @@ It has an additional parameter:
 
 * **logits_all**: Needs to be checked if you want to evaluate the perplexity of the llama.cpp model using the "Training" > "Perplexity evaluation" tab. Otherwise, leave it unchecked, as it makes prompt processing slower.
 
-### ctransformers
-
-Loads: GGUF/GGML models.
-
-Similar to llama.cpp but it works for certain GGUF/GGML models not originally supported by llama.cpp like Falcon, StarCoder, StarChat, and GPT-J.
-
 ### AutoAWQ
 
 Loads: AWQ models.
diff --git a/docs/What Works.md b/docs/What Works.md
index 354da1dd..6c0d4c84 100644
--- a/docs/What Works.md
+++ b/docs/What Works.md
@@ -10,7 +10,6 @@
 | AutoGPTQ | ✅ | ❌ | ❌ | ✅ | ✅ |
 | AutoAWQ | ? | ❌ | ? | ? | ✅ |
 | GPTQ-for-LLaMa | ✅\*\* | ✅\*\*\* | ✅ | ✅ | ✅ |
-| ctransformers | ❌ | ❌ | ❌ | ❌ | ❌ |
 | QuIP# | ? | ? | ? | ? | ✅ |
 | HQQ | ? | ? | ? | ? | ✅ |
 
diff --git a/modules/ctransformers_model.py b/modules/ctransformers_model.py
deleted file mode 100644
index 70ce92f5..00000000
--- a/modules/ctransformers_model.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from ctransformers import AutoConfig, AutoModelForCausalLM
-
-from modules import shared
-from modules.callbacks import Iteratorize
-from modules.logging_colors import logger
-
-
-class CtransformersModel:
-    def __init__(self):
-        pass
-
-    @classmethod
-    def from_pretrained(cls, path):
-        result = cls()
-
-        config = AutoConfig.from_pretrained(
-            str(path),
-            threads=shared.args.threads if shared.args.threads != 0 else -1,
-            gpu_layers=shared.args.n_gpu_layers,
-            batch_size=shared.args.n_batch,
-            context_length=shared.args.n_ctx,
-            stream=True,
-            mmap=not shared.args.no_mmap,
-            mlock=shared.args.mlock
-        )
-
-        result.model = AutoModelForCausalLM.from_pretrained(
-            str(result.model_dir(path) if result.model_type_is_auto() else path),
-            model_type=(None if result.model_type_is_auto() else shared.args.model_type),
-            config=config
-        )
-
-        logger.info(f'Using ctransformers model_type: {result.model.model_type} for {result.model.model_path}')
-        return result, result
-
-    def model_type_is_auto(self):
-        return shared.args.model_type is None or shared.args.model_type == "Auto" or shared.args.model_type == "None"
-
-    def model_dir(self, path):
-        if path.is_file():
-            return path.parent
-
-        return path
-
-    def encode(self, string, **kwargs):
-        return self.model.tokenize(string)
-
-    def decode(self, ids):
-        return self.model.detokenize(ids)
-
-    def generate(self, prompt, state, callback=None):
-        prompt = prompt if type(prompt) is str else prompt.decode()
-        # ctransformers uses -1 for random seed
-        generator = self.model(
-            prompt=prompt,
-            max_new_tokens=state['max_new_tokens'],
-            temperature=state['temperature'],
-            top_p=state['top_p'],
-            top_k=state['top_k'],
-            repetition_penalty=state['repetition_penalty'],
-            last_n_tokens=state['repetition_penalty_range'],
-            seed=int(state['seed'])
-        )
-
-        output = ""
-        for token in generator:
-            if callback:
-                callback(token)
-
-            output += token
-
-        return output
-
-    def generate_with_streaming(self, *args, **kwargs):
-        with Iteratorize(self.generate, args, kwargs, callback=None) as generator:
-            reply = ''
-            for token in generator:
-                reply += token
-                yield reply
diff --git a/modules/loaders.py b/modules/loaders.py
index 60fe8aa6..23477339 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -138,15 +138,6 @@ loaders_and_params = OrderedDict({
         'no_use_fast',
         'gptq_for_llama_info',
     ],
-    'ctransformers': [
-        'n_ctx',
-        'n_gpu_layers',
-        'n_batch',
-        'threads',
-        'model_type',
-        'no_mmap',
-        'mlock'
-    ],
     'QuIP#': [
         'trust_remote_code',
         'no_use_fast',
@@ -332,13 +323,6 @@ loaders_samplers = {
         'skip_special_tokens',
         'auto_max_new_tokens',
     },
-    'ctransformers': {
-        'temperature',
-        'top_p',
-        'top_k',
-        'repetition_penalty',
-        'repetition_penalty_range',
-    },
 }
 
 loaders_model_types = {
@@ -348,19 +332,6 @@ loaders_model_types = {
         "opt",
         "gptj"
     ],
-    'ctransformers': [
-        "None",
-        "gpt2",
-        "gptj",
-        "gptneox",
-        "llama",
-        "mpt",
-        "dollyv2",
-        "replit",
-        "starcoder",
-        "gptbigcode",
-        "falcon"
-    ],
 }
 
 
diff --git a/modules/models.py b/modules/models.py
index 541c6301..98349705 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -67,7 +67,6 @@ def load_model(model_name, loader=None):
         'llamacpp_HF': llamacpp_HF_loader,
         'ExLlamav2': ExLlamav2_loader,
         'ExLlamav2_HF': ExLlamav2_HF_loader,
-        'ctransformers': ctransformers_loader,
         'AutoAWQ': AutoAWQ_loader,
         'QuIP#': QuipSharp_loader,
         'HQQ': HQQ_loader,
@@ -97,7 +96,7 @@ def load_model(model_name, loader=None):
     shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings})
     if loader.lower().startswith('exllama'):
         shared.settings['truncation_length'] = shared.args.max_seq_len
-    elif loader in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
+    elif loader in ['llama.cpp', 'llamacpp_HF']:
         shared.settings['truncation_length'] = shared.args.n_ctx
 
     logger.info(f"LOADER: \"{loader}\"")
@@ -265,33 +264,6 @@ def llamacpp_HF_loader(model_name):
     return model
 
 
-def ctransformers_loader(model_name):
-    from modules.ctransformers_model import CtransformersModel
-
-    path = Path(f'{shared.args.model_dir}/{model_name}')
-    ctrans = CtransformersModel()
-    if ctrans.model_type_is_auto():
-        model_file = path
-    else:
-        if path.is_file():
-            model_file = path
-        else:
-            entries = Path(f'{shared.args.model_dir}/{model_name}')
-            gguf = list(entries.glob('*.gguf'))
-            bin = list(entries.glob('*.bin'))
-            if len(gguf) > 0:
-                model_file = gguf[0]
-            elif len(bin) > 0:
-                model_file = bin[0]
-            else:
-                logger.error("Could not find a model for ctransformers.")
-                return None, None
-
-    logger.info(f'ctransformers weights detected: \"{model_file}\"')
-    model, tokenizer = ctrans.from_pretrained(model_file)
-    return model, tokenizer
-
-
 def AutoAWQ_loader(model_name):
     from awq import AutoAWQForCausalLM
 
diff --git a/modules/models_settings.py b/modules/models_settings.py
index 76effa87..12a2db82 100644
--- a/modules/models_settings.py
+++ b/modules/models_settings.py
@@ -48,7 +48,7 @@ def get_model_metadata(model):
     model_settings['loader'] = loader
 
     # GGUF metadata
-    if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
+    if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF']:
         path = Path(f'{shared.args.model_dir}/{model}')
         if path.is_file():
             model_file = path
@@ -231,7 +231,7 @@ def apply_model_settings_to_state(model, state):
         loader = model_settings.pop('loader')
 
         # If the user is using an alternative loader for the same model type, let them keep using it
-        if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']) and not (loader == 'llama.cpp' and state['loader'] in ['ctransformers']):
+        if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']):
            state['loader'] = loader
 
     for k in model_settings:
diff --git a/modules/shared.py b/modules/shared.py
index ecfdb3be..a48b281c 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -88,7 +88,7 @@ group.add_argument('--chat-buttons', action='store_true', help='Show buttons on
 
 # Model loader
 group = parser.add_argument_group('Model loader')
-group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, ctransformers, QuIP#.')
+group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#.')
 
 # Transformers/Accelerate
 group = parser.add_argument_group('Transformers/Accelerate')
@@ -259,8 +259,6 @@ def fix_loader_name(name):
         return 'ExLlamav2'
     elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
         return 'ExLlamav2_HF'
-    elif name in ['ctransformers', 'ctranforemrs', 'ctransformer']:
-        return 'ctransformers'
     elif name in ['autoawq', 'awq', 'auto-awq']:
         return 'AutoAWQ'
     elif name in ['quip#', 'quip-sharp', 'quipsharp', 'quip_sharp']:
diff --git a/modules/text_generation.py b/modules/text_generation.py
index 724bb0f0..f99c605e 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -46,7 +46,7 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap
             yield ''
             return
 
-        if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel']:
+        if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']:
             generate_func = generate_reply_custom
         else:
             generate_func = generate_reply_HF
@@ -114,7 +114,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if shared.tokenizer is None:
         raise ValueError('No tokenizer is loaded')
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'CtransformersModel', 'Exllamav2Model']:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']:
         input_ids = shared.tokenizer.encode(str(prompt))
         if shared.model.__class__.__name__ not in ['Exllamav2Model']:
             input_ids = np.array(input_ids).reshape(1, len(input_ids))
@@ -128,7 +128,7 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt
     if truncation_length is not None:
         input_ids = input_ids[:, -truncation_length:]
 
-    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'CtransformersModel'] or shared.args.cpu:
+    if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu:
         return input_ids
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
index 9f2729e2..8d6122d2 100644
--- a/modules/ui_model_menu.py
+++ b/modules/ui_model_menu.py
@@ -330,7 +330,7 @@ def update_truncation_length(current_length, state):
     if 'loader' in state:
         if state['loader'].lower().startswith('exllama'):
             return state['max_seq_len']
-        elif state['loader'] in ['llama.cpp', 'llamacpp_HF', 'ctransformers']:
+        elif state['loader'] in ['llama.cpp', 'llamacpp_HF']:
             return state['n_ctx']
 
     return current_length
diff --git a/requirements.txt b/requirements.txt
index 3835b954..25efdf53 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -68,5 +68,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX2/ctransformers-0.2.27+cu121-py3-none-any.whl
 autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
index 9fe6e0a0..60cd75d5 100644
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -68,5 +68,4 @@ https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/jllllll/ctransformers-cuBLAS-wheels/releases/download/AVX/ctransformers-0.2.27+cu121-py3-none-any.whl
 autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows"