mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-23 16:38:21 +01:00
commit
11288d11d4
@ -53,7 +53,10 @@ def add_lora_exllama(lora_names):
|
|||||||
|
|
||||||
lora_path = get_lora_path(lora_names[0])
|
lora_path = get_lora_path(lora_names[0])
|
||||||
lora_config_path = lora_path / "adapter_config.json"
|
lora_config_path = lora_path / "adapter_config.json"
|
||||||
lora_adapter_path = lora_path / "adapter_model.bin"
|
for file_name in ["adapter_model.safetensors", "adapter_model.bin"]:
|
||||||
|
file_path = lora_path / file_name
|
||||||
|
if file_path.is_file():
|
||||||
|
lora_adapter_path = file_path
|
||||||
|
|
||||||
logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
|
logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
|
||||||
if shared.model.__class__.__name__ == 'ExllamaModel':
|
if shared.model.__class__.__name__ == 'ExllamaModel':
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
import random
|
|
||||||
import traceback
|
import traceback
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -10,7 +9,7 @@ from exllamav2 import (
|
|||||||
ExLlamaV2Config,
|
ExLlamaV2Config,
|
||||||
ExLlamaV2Tokenizer
|
ExLlamaV2Tokenizer
|
||||||
)
|
)
|
||||||
from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
|
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
|
||||||
|
|
||||||
from modules import shared
|
from modules import shared
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
@ -64,7 +63,7 @@ class Exllamav2Model:
|
|||||||
else:
|
else:
|
||||||
cache = ExLlamaV2Cache(model)
|
cache = ExLlamaV2Cache(model)
|
||||||
|
|
||||||
generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
|
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
|
||||||
|
|
||||||
result = self()
|
result = self()
|
||||||
result.model = model
|
result.model = model
|
||||||
@ -115,41 +114,21 @@ class Exllamav2Model:
|
|||||||
|
|
||||||
ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
|
ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
|
||||||
ids = ids[:, -get_max_prompt_length(state):]
|
ids = ids[:, -get_max_prompt_length(state):]
|
||||||
initial_len = ids.shape[-1]
|
|
||||||
|
|
||||||
if state['auto_max_new_tokens']:
|
if state['auto_max_new_tokens']:
|
||||||
max_new_tokens = state['truncation_length'] - ids.shape[-1]
|
max_new_tokens = state['truncation_length'] - ids.shape[-1]
|
||||||
else:
|
else:
|
||||||
max_new_tokens = state['max_new_tokens']
|
max_new_tokens = state['max_new_tokens']
|
||||||
|
|
||||||
# _gen_begin_base
|
self.generator.begin_stream(ids, settings, loras=self.loras)
|
||||||
self.cache.current_seq_len = 0
|
|
||||||
self.model.forward(ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras)
|
|
||||||
|
|
||||||
has_leading_space = False
|
decoded_text = ''
|
||||||
for i in range(max_new_tokens):
|
for i in range(max_new_tokens):
|
||||||
logits = self.model.forward(ids[:, -1:], self.cache, input_mask=None, loras=self.loras).float().cpu()
|
chunk, eos, _ = self.generator.stream()
|
||||||
token, _, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random(), self.tokenizer)
|
if eos or shared.stop_everything:
|
||||||
ids = torch.cat([ids, token], dim=1)
|
|
||||||
|
|
||||||
if i == 0 and self.tokenizer.tokenizer.id_to_piece(int(token)).startswith('▁'):
|
|
||||||
has_leading_space = True
|
|
||||||
|
|
||||||
decoded_text = self.tokenizer.decode(ids[:, initial_len:], decode_special_tokens=not state['skip_special_tokens'])[0]
|
|
||||||
if has_leading_space:
|
|
||||||
decoded_text = ' ' + decoded_text
|
|
||||||
|
|
||||||
# Check the partial unicode character
|
|
||||||
if chr(0xfffd) in decoded_text:
|
|
||||||
is_last = i == max_new_tokens - 1
|
|
||||||
is_stopping = token.item() == self.tokenizer.eos_token_id or shared.stop_everything
|
|
||||||
# If we are not at the end of the generation, we skip this token
|
|
||||||
if not (is_last or is_stopping):
|
|
||||||
continue
|
|
||||||
|
|
||||||
if token.item() == self.tokenizer.eos_token_id or shared.stop_everything:
|
|
||||||
break
|
break
|
||||||
|
|
||||||
|
decoded_text += chunk
|
||||||
yield decoded_text
|
yield decoded_text
|
||||||
|
|
||||||
def generate(self, prompt, state):
|
def generate(self, prompt, state):
|
||||||
|
@ -413,12 +413,8 @@ def ExLlamav2_HF_loader(model_name):
|
|||||||
|
|
||||||
|
|
||||||
def HQQ_loader(model_name):
|
def HQQ_loader(model_name):
|
||||||
try:
|
|
||||||
from hqq.core.quantize import HQQBackend, HQQLinear
|
from hqq.core.quantize import HQQBackend, HQQLinear
|
||||||
from hqq.engine.hf import HQQModelForCausalLM
|
from hqq.engine.hf import HQQModelForCausalLM
|
||||||
except ModuleNotFoundError:
|
|
||||||
logger.error("HQQ is not installed. You can install it with:\n\npip install hqq")
|
|
||||||
return None
|
|
||||||
|
|
||||||
logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")
|
logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")
|
||||||
|
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
|
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
|
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
|
exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11
|
exllamav2==0.0.11
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11
|
exllamav2==0.0.11
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11
|
exllamav2==0.0.11
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11
|
exllamav2==0.0.11
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
|
exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
@ -4,6 +4,7 @@ datasets
|
|||||||
einops
|
einops
|
||||||
exllamav2==0.0.11
|
exllamav2==0.0.11
|
||||||
gradio==3.50.*
|
gradio==3.50.*
|
||||||
|
hqq==0.1.1.post1
|
||||||
markdown
|
markdown
|
||||||
numpy==1.24.*
|
numpy==1.24.*
|
||||||
optimum==1.16.*
|
optimum==1.16.*
|
||||||
|
Loading…
Reference in New Issue
Block a user