diff --git a/convert-to-flexgen.py b/convert-to-flexgen.py deleted file mode 100644 index 7654593b..00000000 --- a/convert-to-flexgen.py +++ /dev/null @@ -1,63 +0,0 @@ -''' - -Converts a transformers model to a format compatible with flexgen. - -''' - -import argparse -import os -from pathlib import Path - -import numpy as np -import torch -from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer - -parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54)) -parser.add_argument('MODEL', type=str, default=None, nargs='?', help="Path to the input model.") -args = parser.parse_args() - - -def disable_torch_init(): - """ - Disable the redundant torch default initialization to accelerate model creation. - """ - import torch - global torch_linear_init_backup - global torch_layer_norm_init_backup - - torch_linear_init_backup = torch.nn.Linear.reset_parameters - setattr(torch.nn.Linear, "reset_parameters", lambda self: None) - - torch_layer_norm_init_backup = torch.nn.LayerNorm.reset_parameters - setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) - - -def restore_torch_init(): - """Rollback the change made by disable_torch_init.""" - import torch - setattr(torch.nn.Linear, "reset_parameters", torch_linear_init_backup) - setattr(torch.nn.LayerNorm, "reset_parameters", torch_layer_norm_init_backup) - - -if __name__ == '__main__': - path = Path(args.MODEL) - model_name = path.name - - print(f"Loading {model_name}...") - # disable_torch_init() - model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, low_cpu_mem_usage=True) - # restore_torch_init() - - tokenizer = AutoTokenizer.from_pretrained(path) - - out_folder = Path(f"models/{model_name}-np") - if not Path(out_folder).exists(): - os.mkdir(out_folder) - - print(f"Saving the converted model to {out_folder}...") - for name, param in tqdm(list(model.model.named_parameters())): - name = name.replace("decoder.final_layer_norm", "decoder.layer_norm") - param_path = os.path.join(out_folder, name) - with open(param_path, "wb") as f: - np.save(f, param.cpu().detach().numpy()) diff --git a/docs/FlexGen.md b/docs/FlexGen.md deleted file mode 100644 index 931cc36f..00000000 --- a/docs/FlexGen.md +++ /dev/null @@ -1,64 +0,0 @@ ->FlexGen is a high-throughput generation engine for running large language models with limited GPU memory (e.g., a 16GB T4 GPU or a 24GB RTX3090 gaming card!). - -https://github.com/FMInference/FlexGen - -## Installation - -No additional installation steps are necessary. FlexGen is in the `requirements.txt` file for this project. - -## Converting a model - -FlexGen only works with the OPT model, and it needs to be converted to numpy format before starting the web UI: - -``` -python convert-to-flexgen.py models/opt-1.3b/ -``` - -The output will be saved to `models/opt-1.3b-np/`. - -## Usage - -The basic command is the following: - -``` -python server.py --model opt-1.3b --loader flexgen -``` - -For large models, the RAM usage may be too high and your computer may freeze. If that happens, you can try this: - -``` -python server.py --model opt-1.3b --loader flexgen --compress-weight -``` - -With this second command, I was able to run both OPT-6.7b and OPT-13B with **2GB VRAM**, and the speed was good in both cases. 
- -You can also manually set the offload strategy with - -``` -python server.py --model opt-1.3b --loader flexgen --percent 0 100 100 0 100 0 -``` - -where the six numbers after `--percent` are: - -``` -the percentage of weight on GPU -the percentage of weight on CPU -the percentage of attention cache on GPU -the percentage of attention cache on CPU -the percentage of activations on GPU -the percentage of activations on CPU -``` - -You should typically only change the first two numbers. If their sum is less than 100, the remaining layers will be offloaded to the disk, by default into the `text-generation-webui/cache` folder. - -## Performance - -In my experiments with OPT-30B using a RTX 3090 on Linux, I have obtained these results: - -* `--loader flexgen --compress-weight --percent 0 100 100 0 100 0`: 0.99 seconds per token. -* `--loader flexgen --compress-weight --percent 100 0 100 0 100 0`: 0.765 seconds per token. - -## Limitations - -* Only works with the OPT models. -* Only two generation parameters are available: `temperature` and `do_sample`. \ No newline at end of file diff --git a/modules/models.py b/modules/models.py index 232d5fa6..4866893a 100644 --- a/modules/models.py +++ b/modules/models.py @@ -56,7 +56,6 @@ def load_model(model_name, loader=None): 'GPTQ-for-LLaMa': GPTQ_loader, 'llama.cpp': llamacpp_loader, 'llamacpp_HF': llamacpp_HF_loader, - 'FlexGen': flexgen_loader, 'RWKV': RWKV_loader, 'ExLlama': ExLlama_loader, 'ExLlama_HF': ExLlama_HF_loader @@ -221,32 +220,6 @@ def huggingface_loader(model_name): return model -def flexgen_loader(model_name): - from flexgen.flex_opt import CompressionConfig, ExecutionEnv, OptLM, Policy - - # Initialize environment - env = ExecutionEnv.create(shared.args.disk_cache_dir) - - # Offloading policy - policy = Policy(1, 1, - shared.args.percent[0], shared.args.percent[1], - shared.args.percent[2], shared.args.percent[3], - shared.args.percent[4], shared.args.percent[5], - overlap=True, sep_layer=True, pin_weight=shared.args.pin_weight, - cpu_cache_compute=False, attn_sparsity=1.0, - compress_weight=shared.args.compress_weight, - comp_weight_config=CompressionConfig( - num_bits=4, group_size=64, - group_dim=0, symmetric=False), - compress_cache=False, - comp_cache_config=CompressionConfig( - num_bits=4, group_size=64, - group_dim=2, symmetric=False)) - - model = OptLM(f"facebook/{model_name}", env, shared.args.model_dir, policy) - return model - - def RWKV_loader(model_name): from modules.RWKV import RWKVModel, RWKVTokenizer diff --git a/modules/models_settings.py b/modules/models_settings.py index 3f37e48d..9319582e 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -30,8 +30,6 @@ def infer_loader(model_name): loader = 'llama.cpp' elif re.match('.*rwkv.*\.pth', model_name.lower()): loader = 'RWKV' - elif shared.args.flexgen: - loader = 'FlexGen' else: loader = 'Transformers' diff --git a/modules/shared.py b/modules/shared.py index f45a5683..937b4c51 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -95,7 +95,7 @@ parser.add_argument('--extensions', type=str, nargs="+", help='The list of exten parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') # Model loader -parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. 
Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv, flexgen') +parser.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: transformers, autogptq, gptq-for-llama, exllama, exllama_hf, llamacpp, rwkv') # Accelerate/transformers parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text. Warning: Training on CPU is extremely slow.') @@ -156,7 +156,6 @@ parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.") # FlexGen -parser.add_argument('--flexgen', action='store_true', help='DEPRECATED') parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).') parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.") parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).") @@ -202,9 +201,6 @@ if args.autogptq: if args.gptq_for_llama: logger.warning('--gptq-for-llama has been deprecated and will be removed soon. Use --loader gptq-for-llama instead.') args.loader = 'gptq-for-llama' -if args.flexgen: - logger.warning('--flexgen has been deprecated and will be removed soon. Use --loader flexgen instead.') - args.loader = 'FlexGen' # Security warnings if args.trust_remote_code: diff --git a/modules/text_generation.py b/modules/text_generation.py index d3939d3f..e1be6aa3 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -53,8 +53,6 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel'] or shared.args.cpu: return input_ids - elif shared.args.flexgen: - return input_ids.numpy() elif shared.args.deepspeed: return input_ids.to(device=local_rank) elif torch.backends.mps.is_available(): @@ -182,8 +180,6 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False): if shared.model.__class__.__name__ in ['LlamaCppModel', 'RWKVModel', 'ExllamaModel']: generate_func = generate_reply_custom - elif shared.args.flexgen: - generate_func = generate_reply_flexgen else: generate_func = generate_reply_HF @@ -339,66 +335,3 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str new_tokens = len(encode(original_question + reply)[0]) - original_tokens print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') return - - -def generate_reply_flexgen(question, original_question, seed, state, stopping_strings=None, is_chat=False): - generate_params = {} - for k in ['max_new_tokens', 'do_sample', 'temperature']: - generate_params[k] = state[k] - - if state['stream']: - generate_params['max_new_tokens'] = 8 - - # Encode the input - input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state)) - output = input_ids[0] - - # Find the eos tokens - eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else [] - if not state['ban_eos_token']: - generate_params['stop'] = eos_token_ids[-1] - - # Add the encoded tokens to 
generate_params - question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None) - original_input_ids = input_ids - generate_params.update({'inputs': input_ids}) - if inputs_embeds is not None: - generate_params.update({'inputs_embeds': inputs_embeds}) - - t0 = time.time() - try: - if not is_chat: - yield '' - - # Generate the entire reply at once. - if not state['stream']: - with torch.no_grad(): - output = shared.model.generate(**generate_params)[0] - - yield get_reply_from_output_ids(output, input_ids, original_question, state, is_chat=is_chat) - - # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria' - else: - for i in range(state['max_new_tokens'] // 8 + 1): - if shared.stop_everything: - break - - clear_torch_cache() - with torch.no_grad(): - output = shared.model.generate(**generate_params)[0] - - if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)): - break - - yield get_reply_from_output_ids(output, original_input_ids, original_question, state) - input_ids = np.reshape(output, (1, output.shape[0])) - generate_params.update({'inputs': input_ids}) - - except Exception: - traceback.print_exc() - finally: - t1 = time.time() - original_tokens = len(original_input_ids[0]) - new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0) - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') - return diff --git a/modules/utils.py b/modules/utils.py index e257de2d..9ae5dc86 100644 --- a/modules/utils.py +++ b/modules/utils.py @@ -71,10 +71,7 @@ def natural_keys(text): def get_available_models(): - if shared.args.flexgen: - return sorted([re.sub('-np$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if item.name.endswith('-np')], key=natural_keys) - else: - return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys) + return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json', '.yaml'))], key=natural_keys) def get_available_presets(): diff --git a/server.py b/server.py index 5157de0d..35b43176 100644 --- a/server.py +++ b/server.py @@ -321,7 +321,7 @@ def create_settings_menus(default_preset): with gr.Row(): with gr.Column(): with gr.Row(): - shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset', elem_classes='slim-dropdown') + shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset, label='Generation parameters preset', elem_classes='slim-dropdown') ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button') shared.gradio['save_preset'] = gr.Button('💾', elem_classes='refresh-button') shared.gradio['delete_preset'] = gr.Button('🗑️', elem_classes='refresh-button')
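
For reference, the removed FlexGen offload path can still be reproduced outside the web UI with a short standalone script. The sketch below is reconstructed from the deleted `flexgen_loader` and `docs/FlexGen.md` above: the `flexgen.flex_opt` imports and the `Policy`/`CompressionConfig` arguments are copied from that deleted code, while the cache directory, model directory, and model name are illustrative placeholders rather than anything mandated by FlexGen itself.

```
# Sketch reconstructed from the removed flexgen_loader; paths and model name are placeholders.
from flexgen.flex_opt import CompressionConfig, ExecutionEnv, OptLM, Policy

percent = [0, 100, 100, 0, 100, 0]  # weights GPU/CPU, attention cache GPU/CPU, activations GPU/CPU

# Disk offload goes into this directory (the web UI used its "cache" folder by default).
env = ExecutionEnv.create("cache")

policy = Policy(
    1, 1,                    # batch settings, passed as 1, 1 by the removed loader
    percent[0], percent[1],  # % of weights on GPU / CPU (the remainder goes to disk)
    percent[2], percent[3],  # % of attention cache on GPU / CPU
    percent[4], percent[5],  # % of activations on GPU / CPU
    overlap=True, sep_layer=True, pin_weight=True,
    cpu_cache_compute=False, attn_sparsity=1.0,
    compress_weight=False,   # set to True to mirror --compress-weight
    comp_weight_config=CompressionConfig(num_bits=4, group_size=64, group_dim=0, symmetric=False),
    compress_cache=False,
    comp_cache_config=CompressionConfig(num_bits=4, group_size=64, group_dim=2, symmetric=False),
)

# Expects the numpy weights produced by convert-to-flexgen.py, e.g. under models/opt-1.3b-np/.
model = OptLM("facebook/opt-1.3b", env, "models", policy)
```

As the deleted docs note, if the two weight percentages sum to less than 100, the remaining weights are offloaded to disk.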