diff --git a/README.md b/README.md
index 3bbaf84e..868e822b 100644
--- a/README.md
+++ b/README.md
@@ -220,8 +220,10 @@ Optionally, you can use the following command-line flags:
 
 | Flag | Description |
 |-------------|-------------|
-| `--threads` | Number of threads to use in llama.cpp. |
-| `--n_batch` | Processing batch size for llama.cpp. |
+| `--threads` | Number of threads to use. |
+| `--n_batch` | Maximum number of prompt tokens to batch together when calling llama_eval. |
+| `--no-mmap` | Prevent mmap from being used. |
+| `--mlock` | Force the system to keep the model in RAM. |
 
 #### GPTQ
 
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 9461db10..d19eea27 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -1,78 +1,63 @@
-import multiprocessing
+'''
+Based on
+https://github.com/abetlen/llama-cpp-python
 
-import llamacpp
+Documentation:
+https://abetlen.github.io/llama-cpp-python/
+'''
+
+from llama_cpp import Llama, LlamaCache
 
 from modules import shared
 from modules.callbacks import Iteratorize
 
 
-class LlamaCppTokenizer:
-    """A thin wrapper over the llamacpp tokenizer"""
-    def __init__(self, model: llamacpp.LlamaInference):
-        self._tokenizer = model.get_tokenizer()
-        self.eos_token_id = 2
-        self.bos_token_id = 0
-
-    @classmethod
-    def from_model(cls, model: llamacpp.LlamaInference):
-        return cls(model)
-
-    def encode(self, prompt: str):
-        return self._tokenizer.tokenize(prompt)
-
-    def decode(self, ids):
-        return self._tokenizer.detokenize(ids)
-
-
 class LlamaCppModel:
     def __init__(self):
         self.initialized = False
 
     @classmethod
     def from_pretrained(self, path):
-        params = llamacpp.InferenceParams()
-        params.path_model = str(path)
-        params.n_threads = shared.args.threads or multiprocessing.cpu_count() // 2
-
-        _model = llamacpp.LlamaInference(params)
-        result = self()
-        result.model = _model
-        result.params = params
-        tokenizer = LlamaCppTokenizer.from_model(_model)
-        return result, tokenizer
+        result = self()
+
+        params = {
+            'model_path': str(path),
+            'n_ctx': 2048,
+            'seed': 0,
+            'n_threads': shared.args.threads or None,
+            'n_batch': shared.args.n_batch,
+            'use_mmap': not shared.args.no_mmap,
+            'use_mlock': shared.args.mlock
+        }
+        self.model = Llama(**params)
+        self.model.set_cache(LlamaCache)
+
+        # This is ugly, but the model and the tokenizer are the same object in this library.
+        return result, result
+
+    def encode(self, string):
+        if type(string) is str:
+            string = string.encode()
+        return self.model.tokenize(string)
 
     def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None):
-        params = self.params
-        params.n_predict = token_count
-        params.top_p = top_p
-        params.top_k = top_k
-        params.temp = temperature
-        params.repeat_penalty = repetition_penalty
-        # params.repeat_last_n = repeat_last_n
+        if type(context) is str:
+            context = context.encode()
+        tokens = self.model.tokenize(context)
 
-        # self.model.params = params
-        self.model.add_bos()
-        self.model.update_input(context)
+        output = b""
+        count = 0
+        for token in self.model.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repetition_penalty):
+            text = self.model.detokenize([token])
+            output += text
+            if callback:
+                callback(text.decode())
 
-        output = ""
-        is_end_of_text = False
-        ctr = 0
-        while ctr < token_count and not is_end_of_text:
-            if self.model.has_unconsumed_input():
-                self.model.ingest_all_pending_input()
-            else:
-                self.model.eval()
-                token = self.model.sample()
-                text = self.model.token_to_str(token)
-                output += text
-                is_end_of_text = token == self.model.token_eos()
-                if callback:
-                    callback(text)
-                ctr += 1
+            count += 1
+            if count >= token_count or (token == self.model.token_eos()):
+                break
 
-        return output
+        return output.decode()
 
     def generate_with_streaming(self, **kwargs):
         with Iteratorize(self.generate, kwargs, callback=None) as generator:
diff --git a/modules/llamacpp_model_alternative.py b/modules/llamacpp_model_alternative.py
deleted file mode 100644
index 2671f227..00000000
--- a/modules/llamacpp_model_alternative.py
+++ /dev/null
@@ -1,65 +0,0 @@
-'''
-Based on
-https://github.com/abetlen/llama-cpp-python
-
-Documentation:
-https://abetlen.github.io/llama-cpp-python/
-'''
-
-from llama_cpp import Llama, LlamaCache
-
-from modules import shared
-from modules.callbacks import Iteratorize
-
-
-class LlamaCppModel:
-    def __init__(self):
-        self.initialized = False
-
-    @classmethod
-    def from_pretrained(self, path):
-        result = self()
-
-        params = {
-            'model_path': str(path),
-            'n_ctx': 2048,
-            'seed': 0,
-            'n_threads': shared.args.threads or None,
-            'n_batch': shared.args.n_batch
-        }
-        self.model = Llama(**params)
-        self.model.set_cache(LlamaCache)
-
-        # This is ugly, but the model and the tokenizer are the same object in this library.
-        return result, result
-
-    def encode(self, string):
-        if type(string) is str:
-            string = string.encode()
-        return self.model.tokenize(string)
-
-    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None):
-        if type(context) is str:
-            context = context.encode()
-        tokens = self.model.tokenize(context)
-
-        output = b""
-        count = 0
-        for token in self.model.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repetition_penalty):
-            text = self.model.detokenize([token])
-            output += text
-            if callback:
-                callback(text.decode())
-
-            count += 1
-            if count >= token_count or (token == self.model.token_eos()):
-                break
-
-        return output.decode()
-
-    def generate_with_streaming(self, **kwargs):
-        with Iteratorize(self.generate, kwargs, callback=None) as generator:
-            reply = ''
-            for token in generator:
-                reply += token
-                yield reply
diff --git a/modules/models.py b/modules/models.py
index a6c45cca..e2193e6c 100644
--- a/modules/models.py
+++ b/modules/models.py
@@ -129,7 +129,7 @@ def load_model(model_name):
 
     # llamacpp model
     elif shared.model_type == 'llamacpp':
-        from modules.llamacpp_model_alternative import LlamaCppModel
+        from modules.llamacpp_model import LlamaCppModel
 
         path = Path(f'{shared.args.model_dir}/{model_name}')
         if path.is_file():
diff --git a/modules/shared.py b/modules/shared.py
index 849b9cef..6ee82385 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -120,8 +120,10 @@ parser.add_argument('--sdp-attention', action='store_true', help="Use torch 2.0'
 parser.add_argument('--trust-remote-code', action='store_true', help="Set trust_remote_code=True while loading a model. Necessary for ChatGLM.")
 
 # llama.cpp
-parser.add_argument('--threads', type=int, default=0, help='Number of threads to use in llama.cpp.')
-parser.add_argument('--n_batch', type=int, default=8, help='Processing batch size for llama.cpp.')
+parser.add_argument('--threads', type=int, default=0, help='Number of threads to use.')
+parser.add_argument('--n_batch', type=int, default=512, help='Maximum number of prompt tokens to batch together when calling llama_eval.')
+parser.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.')
+parser.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.')
 
 # GPTQ
 parser.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
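
For reference, a minimal standalone sketch of the llama-cpp-python calls that the new modules/llamacpp_model.py builds on. The model path, prompt, token budget, and sampling values below are placeholders rather than values taken from the change; the constructor arguments mirror the params dict built in from_pretrained(), and the loop mirrors LlamaCppModel.generate().

from llama_cpp import Llama

# Constructor arguments mirror the params dict in from_pretrained().
model = Llama(
    model_path='models/ggml-model-q4_0.bin',  # placeholder path
    n_ctx=2048,
    seed=0,
    n_threads=None,   # library default; what --threads 0 resolves to
    n_batch=512,      # --n_batch: prompt tokens batched per llama_eval call
    use_mmap=True,    # disabled by --no-mmap
    use_mlock=False,  # enabled by --mlock
)

# Same streaming loop as LlamaCppModel.generate(): tokenize a byte string,
# sample one token at a time, and stop at the token budget or at EOS.
tokens = model.tokenize(b'Building a website can be done in 10 simple steps:')
output, count = b'', 0
for token in model.generate(tokens, top_k=50, top_p=0.95, temp=0.7, repeat_penalty=1.1):
    output += model.detokenize([token])
    count += 1
    if count >= 128 or token == model.token_eos():
        break

print(output.decode())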