mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2024-11-22 08:07:56 +01:00)

Remove flexgen support

This commit is contained in:
parent 5134d5b1c6
commit 75c2dd38cf
@@ -1,63 +0,0 @@
'''
Converts a transformers model to a format compatible with flexgen.
'''

import argparse
import os
from pathlib import Path

import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54))
parser.add_argument('MODEL', type=str, default=None, nargs='?', help="Path to the input model.")
args = parser.parse_args()


def disable_torch_init():
    """
    Disable the redundant torch default initialization to accelerate model creation.
    """
    import torch
    global torch_linear_init_backup
    global torch_layer_norm_init_backup

    torch_linear_init_backup = torch.nn.Linear.reset_parameters
    setattr(torch.nn.Linear, "reset_parameters", lambda self: None)

    torch_layer_norm_init_backup = torch.nn.LayerNorm.reset_parameters
    setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)


def restore_torch_init():
    """Rollback the change made by disable_torch_init."""
    import torch
    setattr(torch.nn.Linear, "reset_parameters", torch_linear_init_backup)
    setattr(torch.nn.LayerNorm, "reset_parameters", torch_layer_norm_init_backup)


if __name__ == '__main__':
    path = Path(args.MODEL)
    model_name = path.name

    print(f"Loading {model_name}...")
    # disable_torch_init()
    model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    # restore_torch_init()

    tokenizer = AutoTokenizer.from_pretrained(path)

    out_folder = Path(f"models/{model_name}-np")
    if not Path(out_folder).exists():
        os.mkdir(out_folder)

    print(f"Saving the converted model to {out_folder}...")
    for name, param in tqdm(list(model.model.named_parameters())):
        name = name.replace("decoder.final_layer_norm", "decoder.layer_norm")
        param_path = os.path.join(out_folder, name)
        with open(param_path, "wb") as f:
            np.save(f, param.cpu().detach().numpy())
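For reference, the converter above writes each parameter tensor into `models/<model_name>-np/` with `np.save`, one file per parameter and no `.npy` extension. A converted folder can be spot-checked by loading a few of those arrays back; this is a minimal editorial sketch (the `opt-1.3b` path is only an example), not part of the removed script:

```python
# Sanity-check a converted FlexGen folder (example path, adjust as needed).
import numpy as np
from pathlib import Path

out_folder = Path("models/opt-1.3b-np")  # produced by the converter above
for param_file in sorted(out_folder.iterdir())[:5]:
    # Each file was written with np.save(), so np.load() recovers the array
    # even though the file name carries no .npy extension.
    arr = np.load(param_file)
    print(f"{param_file.name}: shape={arr.shape}, dtype={arr.dtype}")
```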
@@ -1,64 +0,0 @@
> FlexGen is a high-throughput generation engine for running large language models with limited GPU memory (e.g., a 16GB T4 GPU or a 24GB RTX3090 gaming card!).

https://github.com/FMInference/FlexGen

## Installation

No additional installation steps are necessary. FlexGen is in the `requirements.txt` file for this project.

## Converting a model

FlexGen only works with the OPT model, and it needs to be converted to numpy format before starting the web UI:

```
python convert-to-flexgen.py models/opt-1.3b/
```

The output will be saved to `models/opt-1.3b-np/`.

## Usage

The basic command is the following:

```
python server.py --model opt-1.3b --loader flexgen
```

For large models, the RAM usage may be too high and your computer may freeze. If that happens, you can try this:

```
python server.py --model opt-1.3b --loader flexgen --compress-weight
```

With this second command, I was able to run both OPT-6.7b and OPT-13B with **2GB VRAM**, and the speed was good in both cases.

You can also manually set the offload strategy with

```
python server.py --model opt-1.3b --loader flexgen --percent 0 100 100 0 100 0
```

where the six numbers after `--percent` are:

```
the percentage of weight on GPU
the percentage of weight on CPU
the percentage of attention cache on GPU
the percentage of attention cache on CPU
the percentage of activations on GPU
the percentage of activations on CPU
```

You should typically only change the first two numbers. If their sum is less than 100, the remaining layers will be offloaded to the disk, by default into the `text-generation-webui/cache` folder.

## Performance

In my experiments with OPT-30B using a RTX 3090 on Linux, I have obtained these results:

* `--loader flexgen --compress-weight --percent 0 100 100 0 100 0`: 0.99 seconds per token.
* `--loader flexgen --compress-weight --percent 100 0 100 0 100 0`: 0.765 seconds per token.

## Limitations

* Only works with the OPT models.
* Only two generation parameters are available: `temperature` and `do_sample`.
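As a cross-reference for the `--percent` values documented above: the loader removed later in this commit forwarded those six numbers directly into FlexGen's `Policy`. The sketch below restates that mapping with hard-coded example values; it mirrors the deleted `flexgen_loader` shown further down in this diff (the per-argument comments are inferred from the documentation above, not from FlexGen's own docs):

```python
# Sketch: how the six --percent values were passed to FlexGen's Policy.
# Mirrors the flexgen_loader removed in this commit; paths and values are examples.
from flexgen.flex_opt import CompressionConfig, ExecutionEnv, OptLM, Policy

percent = [0, 100, 100, 0, 100, 0]  # weights GPU/CPU, attention cache GPU/CPU, activations GPU/CPU

env = ExecutionEnv.create("cache")  # disk offload directory (the web UI used its cache/ folder)
policy = Policy(1, 1,  # same leading arguments as the removed loader
                percent[0], percent[1],  # % of weights on GPU / CPU
                percent[2], percent[3],  # % of attention cache on GPU / CPU
                percent[4], percent[5],  # % of activations on GPU / CPU
                overlap=True, sep_layer=True, pin_weight=True,
                cpu_cache_compute=False, attn_sparsity=1.0,
                compress_weight=False,  # True corresponds to --compress-weight
                comp_weight_config=CompressionConfig(num_bits=4, group_size=64, group_dim=0, symmetric=False),
                compress_cache=False,
                comp_cache_config=CompressionConfig(num_bits=4, group_size=64, group_dim=2, symmetric=False))

model = OptLM("facebook/opt-1.3b", env, "models", policy)  # model id, execution env, model dir, policy
```

If `percent[0] + percent[1]` is below 100, the remaining weights spill to the disk cache directory, as noted in the documentation above.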
@@ -56,7 +56,6 @@ def load_model(model_name, loader=None):
         'GPTQ-for-LLaMa': GPTQ_loader,
         'llama.cpp': llamacpp_loader,
         'llamacpp_HF': llamacpp_HF_loader,
-        'FlexGen': flexgen_loader,
         'RWKV': RWKV_loader,
         'ExLlama': ExLlama_loader,
         'ExLlama_HF': ExLlama_HF_loader
@@ -221,32 +220,6 @@ def huggingface_loader(model_name):
     return model


-def flexgen_loader(model_name):
-    from flexgen.flex_opt import CompressionConfig, ExecutionEnv, OptLM, Policy
-
-    # Initialize environment
-    env = ExecutionEnv.create(shared.args.disk_cache_dir)
-
-    # Offloading policy
-    policy = Policy(1, 1,
-                    shared.args.percent[0], shared.args.percent[1],
-                    shared.args.percent[2], shared.args.percent[3],
-                    shared.args.percent[4], shared.args.percent[5],
-                    overlap=True, sep_layer=True, pin_weight=shared.args.pin_weight,
-                    cpu_cache_compute=False, attn_sparsity=1.0,
-                    compress_weight=shared.args.compress_weight,
-                    comp_weight_config=CompressionConfig(
-                        num_bits=4, group_size=64,
-                        group_dim=0, symmetric=False),
-                    compress_cache=False,
-                    comp_cache_config=CompressionConfig(
-                        num_bits=4, group_size=64,
-                        group_dim=2, symmetric=False))
-
-    model = OptLM(f"facebook/{model_name}", env, shared.args.model_dir, policy)
-    return model
-
-
 def RWKV_loader(model_name):
     from modules.RWKV import RWKVModel, RWKVTokenizer

@@ -30,8 +30,6 @@ def infer_loader(model_name):
         loader = 'llama.cpp'
     elif re.match('.*rwkv.*\.pth', model_name.lower()):
         loader = 'RWKV'
-    elif shared.args.flexgen:
-        loader = 'FlexGen'
     else:
         loader = 'Transformers'

@@ -95,7 +95,7 @@ parser.add_argument('--extensions', type=str, nargs="+", help='The list of exten
 parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.')

 # Model loader
-parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen')
+parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv')

 # Accelerate/transformers
 parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.')
@@ -156,7 +156,6 @@ parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM
 parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.")

 # FlexGen
-parser.add_argument('--flexgen', action='store_true', help='DEPRECATED')
 parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).')
 parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.")
 parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).")
@@ -202,9 +201,6 @@ if args.autogptq:
 if args.gptq_for_llama:
     logger.warning('--gptq-for-llama has been deprecated and will be removed soon. Use --loader gptq-for-llama instead.')
     args.loader = 'gptq-for-llama'
-if args.flexgen:
-    logger.warning('--flexgen has been deprecated and will be removed soon. Use --loader flexgen instead.')
-    args.loader = 'FlexGen'

 # Security warnings
 if args.trust_remote_code:
@@ -53,8 +53,6 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt

     if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu:
         return input_ids
-    elif shared.args.flexgen:
-        return input_ids.numpy()
     elif shared.args.deepspeed:
         return input_ids.to(device=local_rank)
     elif torch.backends.mps.is_available():
@@ -182,8 +180,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False):

     if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']:
         generate_func = generate_reply_custom
-    elif shared.args.flexgen:
-        generate_func = generate_reply_flexgen
     else:
         generate_func = generate_reply_HF

@@ -339,66 +335,3 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str
     new_tokens = len(encode(original_question + reply)[0]) - original_tokens
     print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
     return
-
-
-def generate_reply_flexgen(question, original_question, seed, state, stopping_strings=None, is_chat=False):
-    generate_params = {}
-    for k in ['max_new_tokens', 'do_sample', 'temperature']:
-        generate_params[k] = state[k]
-
-    if state['stream']:
-        generate_params['max_new_tokens'] = 8
-
-    # Encode the input
-    input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
-    output = input_ids[0]
-
-    # Find the eos tokens
-    eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
-    if not state['ban_eos_token']:
-        generate_params['stop'] = eos_token_ids[-1]
-
-    # Add the encoded tokens to generate_params
-    question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None)
-    original_input_ids = input_ids
-    generate_params.update({'inputs': input_ids})
-    if inputs_embeds is not None:
-        generate_params.update({'inputs_embeds': inputs_embeds})
-
-    t0 = time.time()
-    try:
-        if not is_chat:
-            yield ''
-
-        # Generate the entire reply at once.
-        if not state['stream']:
-            with torch.no_grad():
-                output = shared.model.generate(**generate_params)[0]
-
-            yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat)
-
-        # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
-        else:
-            for i in range(state['max_new_tokens'] // 8 + 1):
-                if shared.stop_everything:
-                    break
-
-                clear_torch_cache()
-                with torch.no_grad():
-                    output = shared.model.generate(**generate_params)[0]
-
-                if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
-                    break
-
-                yield get_reply_from_output_ids(output, original_input_ids, original_question, state)
-                input_ids = np.reshape(output, (1, output.shape[0]))
-                generate_params.update({'inputs': input_ids})
-
-    except Exception:
-        traceback.print_exc()
-    finally:
-        t1 = time.time()
-        original_tokens = len(original_input_ids[0])
-        new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
-        print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
-        return
@@ -71,10 +71,7 @@ def natural_keys(text):


 def get_available_models():
-    if shared.args.flexgen:
-        return sorted([re.sub('-np$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if item.name.endswith('-np')], key=natural_keys)
-    else:
-        return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys)
+    return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys)


 def get_available_presets():
@@ -321,7 +321,7 @@ def create_settings_menus(default_preset):
     with gr.Row():
         with gr.Column():
             with gr.Row():
-                shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset', elem_classes='slim-dropdown')
+                shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Generation parameters preset', elem_classes='slim-dropdown')
                 ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button')
                 shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button')
                 shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button')