Remove GGML support

oobabooga 2023-09-11 07:30:56 -07:00
parent cc7b7ba153
commit ed86878f02
15 changed files with 24 additions and 123 deletions

View File

@@ -158,7 +158,7 @@ text-generation-webui
 │   │   └── tokenizer.model
 ```
-* GGML/GGUF models are a single file and should be placed directly into `models`. Example:
+* GGUF models are a single file and should be placed directly into `models`. Example:
 ```
 text-generation-webui
@@ -260,7 +260,7 @@ Optionally, you can use the following command-line flags:
 | `--quant_type QUANT_TYPE` | quant_type for 4-bit. Valid options: nf4, fp4. |
 | `--use_double_quant` | use_double_quant for 4-bit. |
-#### GGML/GGUF (for llama.cpp and ctransformers)
+#### GGUF (for llama.cpp and ctransformers)
 | Flag | Description |
 |-------------|-------------|
@@ -279,8 +279,6 @@ Optionally, you can use the following command-line flags:
 | `--cache-capacity CACHE_CAPACITY` | Maximum cache capacity. Examples: 2000MiB, 2GiB. When provided without units, bytes will be assumed. |
 | `--tensor_split TENSOR_SPLIT` | Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17 |
 | `--llama_cpp_seed SEED` | Seed for llama-cpp models. Default 0 (random). |
-| `--n_gqa N_GQA` | GGML only (not used by GGUF): Grouped-Query Attention. Must be 8 for llama-2 70b. |
-| `--rms_norm_eps RMS_NORM_EPS` | GGML only (not used by GGUF): 5e-6 is a good value for llama-2 models. |
 | `--cpu` | Use the CPU version of llama-cpp-python instead of the GPU-accelerated version. |
 |`--cfg-cache` | llamacpp_HF: Create an additional cache for CFG negative prompts. |

View File

@@ -9,16 +9,14 @@ llama.cpp is the best backend in two important scenarios:
 #### Pre-converted
-Download the GGUF or GGML models directly into your `text-generation-webui/models` folder. It will be a single file.
+Download the GGUF models directly into your `text-generation-webui/models` folder. It will be a single file.
-* For GGUF models, make sure its name contains `.gguf`.
-* For GGML models, make sure its name contains `ggml` and ends in `.bin`.
-`q4_K_M` quantization is recommended.
+* Make sure its name ends in `.gguf`.
+* `q4_K_M` quantization is recommended.
 #### Convert Llama yourself
-Follow the instructions in the llama.cpp README to generate a ggml: https://github.com/ggerganov/llama.cpp#prepare-data--run
+Follow the instructions in the llama.cpp README to generate a GGUF: https://github.com/ggerganov/llama.cpp#prepare-data--run
 ## GPU acceleration
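GGUF files can also be recognized by the 4-byte magic at the start of the file, which is the check performed by the `is_gguf()` helper removed later in this commit. A minimal sketch for sanity-checking a download (the filename is a placeholder, not part of the repository):

```python
from pathlib import Path

def looks_like_gguf(path: Path) -> bool:
    """Return True if the file starts with the 4-byte GGUF magic."""
    with open(path, "rb") as f:
        return f.read(4) == b"GGUF"

# Hypothetical file placed under text-generation-webui/models
model_file = Path("models/llama-2-7b.Q4_K_M.gguf")
print(model_file.name, "is GGUF:", looks_like_gguf(model_file))
```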

View File

@@ -58,7 +58,6 @@ class ModelDownloader:
         has_pytorch = False
         has_pt = False
         has_gguf = False
-        has_ggml = False
         has_safetensors = False
         is_lora = False
         while True:
@@ -83,10 +82,9 @@ class ModelDownloader:
                 is_safetensors = re.match(r".*\.safetensors", fname)
                 is_pt = re.match(r".*\.pt", fname)
                 is_gguf = re.match(r'.*\.gguf', fname)
-                is_ggml = re.match(r".*ggml.*\.bin", fname)
                 is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname)
                 is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer
-                if any((is_pytorch, is_safetensors, is_pt, is_gguf, is_ggml, is_tokenizer, is_text)):
+                if any((is_pytorch, is_safetensors, is_pt, is_gguf, is_tokenizer, is_text)):
                     if 'lfs' in dict[i]:
                         sha256.append([fname, dict[i]['lfs']['oid']])
@@ -109,9 +107,6 @@ class ModelDownloader:
                     elif is_gguf:
                         has_gguf = True
                         classifications.append('gguf')
-                    elif is_ggml:
-                        has_ggml = True
-                        classifications.append('ggml')
             cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50'
             cursor = base64.b64encode(cursor)
@@ -123,13 +118,8 @@ class ModelDownloader:
                 if classifications[i] in ['pytorch', 'pt']:
                     links.pop(i)
-        # If both GGML and GGUF are available, download GGUF only
-        if has_ggml and has_gguf:
-            for i in range(len(classifications) - 1, -1, -1):
-                if classifications[i] == 'ggml':
-                    links.pop(i)
-        return links, sha256, is_lora, ((has_ggml or has_gguf) and specific_file is not None)
+        is_llamacpp = has_gguf and specific_file is not None
+        return links, sha256, is_lora, is_llamacpp
     def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, base_folder=None):
         if base_folder is None:
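For illustration, here is a standalone sketch of the simplified filename classification above (made-up filenames; only a subset of the downloader's checks is shown). After this commit, only `.gguf` files are recognized as llama.cpp weights:

```python
import re

# Hypothetical file listing from a model repository
filenames = [
    "model.safetensors",
    "llama-2-7b.Q4_K_M.gguf",
    "llama-2-7b.ggmlv3.q4_0.bin",  # old GGML naming: no longer recognized
    "tokenizer.model",
]

for fname in filenames:
    is_safetensors = re.match(r".*\.safetensors", fname)
    is_gguf = re.match(r".*\.gguf", fname)
    is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname)
    kept = any((is_safetensors, is_gguf, is_tokenizer))
    print(f"{fname}: {'downloaded' if kept else 'skipped'}")
```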

View File

@@ -63,7 +63,6 @@ llama-65b-gptq-3bit:
 .*vicuna.*(1.5|1_5):
   instruction_template: 'Vicuna-v1.1'
   truncation_length: 4096
-  rms_norm_eps: 5.0e-6
 .*stable.*vicuna:
   instruction_template: 'StableVicuna'
 (?!.*chat).*chinese-vicuna:
@@ -211,24 +210,19 @@ llama-65b-gptq-3bit:
   instruction_template: 'Alpaca'
 .*llama-(2|v2):
   truncation_length: 4096
-  rms_norm_eps: 5.0e-6
 .*llama-(2|v2).*chat:
   instruction_template: 'Llama-v2'
-.*70b.*ggml.*\.bin:
-  n_gqa: 8
 .*newhope:
   instruction_template: 'NewHope'
 .*stablebeluga2:
   instruction_template: 'StableBeluga2'
   truncation_length: 4096
-  rms_norm_eps: 5.0e-6
 .*openchat:
   instruction_template: 'OpenChat'
 .*falcon.*-instruct:
 .*(openorca-platypus2):
   instruction_template: 'OpenOrca-Platypus2'
   custom_stopping_strings: '"### Instruction:", "### Response:"'
-  rms_norm_eps: 5.0e-6
 .*codellama:
   rope_freq_base: 1000000
 .*codellama.*instruct:

View File

@@ -9,39 +9,23 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from modules import RoPE, shared
 from modules.logging_colors import logger
-from modules.utils import is_gguf
 import llama_cpp
-try:
-    import llama_cpp_ggml
-except:
-    llama_cpp_ggml = llama_cpp
 if torch.cuda.is_available() and not torch.version.hip:
     try:
         import llama_cpp_cuda
     except:
         llama_cpp_cuda = None
-    try:
-        import llama_cpp_ggml_cuda
-    except:
-        llama_cpp_ggml_cuda = llama_cpp_cuda
 else:
     llama_cpp_cuda = None
-    llama_cpp_ggml_cuda = None
-def llama_cpp_lib(model_file: Union[str, Path] = None):
-    if model_file is not None:
-        gguf_model = is_gguf(model_file)
-    else:
-        gguf_model = True
+def llama_cpp_lib():
     if shared.args.cpu or llama_cpp_cuda is None:
-        return llama_cpp if gguf_model else llama_cpp_ggml
+        return llama_cpp
     else:
-        return llama_cpp_cuda if gguf_model else llama_cpp_ggml_cuda
+        return llama_cpp_cuda
 class LlamacppHF(PreTrainedModel):
@@ -64,7 +48,7 @@ class LlamacppHF(PreTrainedModel):
             'n_tokens': self.model.n_tokens,
             'input_ids': self.model.input_ids.copy(),
             'scores': self.model.scores.copy(),
-            'ctx': llama_cpp_lib(path).llama_new_context_with_model(model.model, model.params)
+            'ctx': llama_cpp_lib().llama_new_context_with_model(model.model, model.params)
         }
     def _validate_model_class(self):
@@ -181,7 +165,7 @@ class LlamacppHF(PreTrainedModel):
         if path.is_file():
             model_file = path
         else:
-            model_file = (list(path.glob('*.gguf*')) + list(path.glob('*ggml*.bin')))[0]
+            model_file = list(path.glob('*.gguf'))[0]
         logger.info(f"llama.cpp weights detected: {model_file}\n")
@@ -207,14 +191,7 @@ class LlamacppHF(PreTrainedModel):
             'logits_all': True,
         }
-        if not is_gguf(model_file):
-            ggml_params = {
-                'n_gqa': shared.args.n_gqa or None,
-                'rms_norm_eps': shared.args.rms_norm_eps or None,
-            }
-            params = params | ggml_params
-        Llama = llama_cpp_lib(model_file).Llama
+        Llama = llama_cpp_lib().Llama
         model = Llama(**params)
         return LlamacppHF(model, model_file)

View File

@@ -1,7 +1,5 @@
 import re
 from functools import partial
-from pathlib import Path
-from typing import Union
 import torch
@@ -9,39 +7,23 @@ from modules import RoPE, shared
 from modules.callbacks import Iteratorize
 from modules.logging_colors import logger
 from modules.text_generation import get_max_prompt_length
-from modules.utils import is_gguf
 import llama_cpp
-try:
-    import llama_cpp_ggml
-except:
-    llama_cpp_ggml = llama_cpp
 if torch.cuda.is_available() and not torch.version.hip:
     try:
         import llama_cpp_cuda
     except:
         llama_cpp_cuda = None
-    try:
-        import llama_cpp_ggml_cuda
-    except:
-        llama_cpp_ggml_cuda = llama_cpp_cuda
 else:
     llama_cpp_cuda = None
-    llama_cpp_ggml_cuda = None
-def llama_cpp_lib(model_file: Union[str, Path] = None):
-    if model_file is not None:
-        gguf_model = is_gguf(model_file)
-    else:
-        gguf_model = True
+def llama_cpp_lib():
     if shared.args.cpu or llama_cpp_cuda is None:
-        return llama_cpp if gguf_model else llama_cpp_ggml
+        return llama_cpp
     else:
-        return llama_cpp_cuda if gguf_model else llama_cpp_ggml_cuda
+        return llama_cpp_cuda
 def ban_eos_logits_processor(eos_token, input_ids, logits):
@@ -59,8 +41,8 @@ class LlamaCppModel:
     @classmethod
     def from_pretrained(self, path):
-        Llama = llama_cpp_lib(path).Llama
-        LlamaCache = llama_cpp_lib(path).LlamaCache
+        Llama = llama_cpp_lib().Llama
+        LlamaCache = llama_cpp_lib().LlamaCache
         result = self()
         cache_capacity = 0
@@ -95,13 +77,6 @@ class LlamaCppModel:
             'rope_freq_scale': 1.0 / shared.args.compress_pos_emb,
         }
-        if not is_gguf(path):
-            ggml_params = {
-                'n_gqa': shared.args.n_gqa or None,
-                'rms_norm_eps': shared.args.rms_norm_eps or None,
-            }
-            params = params | ggml_params
         result.model = Llama(**params)
         if cache_capacity > 0:
             result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity))
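For context, the simplified loader above boils down to constructing a `llama_cpp.Llama` object from a single GGUF file. A minimal sketch using llama-cpp-python directly (the model path and parameter values are illustrative, not taken from this repository):

```python
from llama_cpp import Llama

# Hypothetical GGUF file placed in text-generation-webui/models
llm = Llama(
    model_path="models/llama-2-7b.Q4_K_M.gguf",
    n_ctx=2048,      # prompt context size, same default as --n_ctx above
    n_gpu_layers=0,  # 0 keeps everything on the CPU; raise it to offload layers
)
output = llm("Hello, my name is", max_tokens=16)
print(output["choices"][0]["text"])
```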

View File

@@ -68,8 +68,6 @@ loaders_and_params = OrderedDict({
     ],
     'llama.cpp': [
         'n_ctx',
-        'n_gqa',
-        'rms_norm_eps',
         'n_gpu_layers',
         'tensor_split',
         'n_batch',
@@ -86,8 +84,6 @@ loaders_and_params = OrderedDict({
     ],
     'llamacpp_HF': [
         'n_ctx',
-        'n_gqa',
-        'rms_norm_eps',
         'n_gpu_layers',
         'tensor_split',
         'n_batch',

View File

@@ -241,7 +241,7 @@ def llamacpp_loader(model_name):
     if path.is_file():
         model_file = path
     else:
-        model_file = (list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf*')) + list(Path(f'{shared.args.model_dir}/{model_name}').glob('*ggml*.bin')))[0]
+        model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0]
     logger.info(f"llama.cpp weights detected: {model_file}")
     model, tokenizer = LlamaCppModel.from_pretrained(model_file)

View File

@@ -24,9 +24,9 @@ def infer_loader(model_name):
         loader = None
     elif Path(f'{shared.args.model_dir}/{model_name}/quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0):
         loader = 'AutoGPTQ'
-    elif len(list(path_to_model.glob('*.gguf*')) + list(path_to_model.glob('*ggml*.bin'))) > 0:
+    elif len(list(path_to_model.glob('*.gguf'))) > 0:
         loader = 'llama.cpp'
-    elif re.match(r'.*\.gguf|.*ggml.*\.bin', model_name.lower()):
+    elif re.match(r'.*\.gguf', model_name.lower()):
         loader = 'llama.cpp'
     elif re.match(r'.*rwkv.*\.pth', model_name.lower()):
         loader = 'RWKV'

View File

@@ -126,8 +126,6 @@ parser.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layer
 parser.add_argument('--tensor_split', type=str, default=None, help="Split the model across multiple GPUs, comma-separated list of proportions, e.g. 18,17")
 parser.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.')
 parser.add_argument('--llama_cpp_seed', type=int, default=0, help='Seed for llama-cpp models. Default 0 (random)')
-parser.add_argument('--n_gqa', type=int, default=0, help='grouped-query attention. Must be 8 for llama-2 70b.')
-parser.add_argument('--rms_norm_eps', type=float, default=0, help='5e-6 is a good value for llama-2 models.')
 # GPTQ
 parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')

View File

@@ -73,8 +73,6 @@ def list_model_elements():
         'n_gpu_layers',
         'tensor_split',
         'n_ctx',
-        'n_gqa',
-        'rms_norm_eps',
         'llama_cpp_seed',
         'gpu_split',
         'max_seq_len',

View File

@@ -82,8 +82,6 @@ def create_ui():
                 shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx)
                 shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
                 shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
-                shared.gradio['n_gqa'] = gr.Slider(minimum=0, maximum=16, step=1, label="n_gqa", value=shared.args.n_gqa, info='GGML only (not used by GGUF): Grouped-Query Attention. Must be 8 for llama-2 70b.')
-                shared.gradio['rms_norm_eps'] = gr.Slider(minimum=0, maximum=1e-5, step=1e-6, label="rms_norm_eps", value=shared.args.rms_norm_eps, info='GGML only (not used by GGUF): 5e-6 is a good value for llama-2 models.')
                 shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=str(shared.args.wbits) if shared.args.wbits > 0 else "None")
                 shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=str(shared.args.groupsize) if shared.args.groupsize > 0 else "None")
@@ -128,7 +126,7 @@ def create_ui():
                 shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.')
                 shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. To download a single file, enter its name in the second box.")
-                shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF/GGML)", show_label=False, max_lines=1)
+                shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1)
                 with gr.Row():
                     shared.gradio['download_model_button'] = gr.Button("Download", variant='primary')
                     shared.gradio['get_file_list'] = gr.Button("Get file list")

View File

@@ -2,7 +2,6 @@ import os
 import re
 from datetime import datetime
 from pathlib import Path
-from typing import Union
 from modules import shared
 from modules.logging_colors import logger
@@ -125,15 +124,3 @@ def get_datasets(path: str, ext: str):
 def get_available_chat_styles():
     return sorted(set(('-'.join(k.stem.split('-')[1:]) for k in Path('css').glob('chat_style*.css'))), key=natural_keys)
-def is_gguf(path: Union[str, Path]) -> bool:
-    '''
-    Determines if a llama.cpp model is in GGUF format
-    Copied from ctransformers utils.py
-    '''
-    path = str(Path(path).resolve())
-    with open(path, "rb") as f:
-        magic = f.read(4)
-    return magic == "GGUF".encode()

View File

@@ -44,12 +44,6 @@ https://github.com/abetlen/llama-cpp-python/releases/download/v0.1.84/llama_cpp_
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.84+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.1.84+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-# llama-cpp-python with GGML support
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python_ggml-0.1.78+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python_ggml-0.1.78+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_ggml_cuda-0.1.78+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
-https://github.com/jllllll/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_ggml_cuda-0.1.78+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"
 # GPTQ-for-LLaMa
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-win_amd64.whl; platform_system == "Windows"
 https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.0/gptq_for_llama-0.1.0+cu117-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64"

View File

@@ -177,8 +177,6 @@ if __name__ == "__main__":
         'skip_special_tokens': shared.settings['skip_special_tokens'],
         'custom_stopping_strings': shared.settings['custom_stopping_strings'],
         'truncation_length': shared.settings['truncation_length'],
-        'n_gqa': 0,
-        'rms_norm_eps': 0,
         'rope_freq_base': 0,
     }