From c0fd7f3257780e2edf761d535fa248f79e4ffac2 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Mon, 22 May 2023 19:37:24 -0300
Subject: [PATCH] Add mirostat parameters for llama.cpp (#2287)

---
 api-example-chat-stream.py    |  3 +++
 api-example-chat.py           |  3 +++
 api-example-stream.py         |  3 +++
 api-example.py                |  3 +++
 css/main.css                  |  1 -
 docs/Generation-parameters.md | 23 +++++++++++++++++++++
 docs/README.md                |  1 +
 extensions/api/util.py        |  3 +++
 extensions/openai/script.py   |  6 ++++++
 modules/llamacpp_model.py     |  5 ++++-
 modules/text_generation.py    |  4 ++++
 modules/ui.py                 |  2 +-
 server.py                     | 38 ++++++++++++++++++++++++++------------
 13 files changed, 80 insertions(+), 15 deletions(-)
 create mode 100644 docs/Generation-parameters.md

diff --git a/api-example-chat-stream.py b/api-example-chat-stream.py
index 55cd706d..3ed7f241 100644
--- a/api-example-chat-stream.py
+++ b/api-example-chat-stream.py
@@ -46,6 +46,9 @@ async def run(user_input, history):
         'penalty_alpha': 0,
         'length_penalty': 1,
         'early_stopping': False,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5,
+        'mirostat_eta': 0.1,
         'seed': -1,
         'add_bos_token': True,
         'truncation_length': 2048,
diff --git a/api-example-chat.py b/api-example-chat.py
index 3a98008e..6d8208bb 100644
--- a/api-example-chat.py
+++ b/api-example-chat.py
@@ -40,6 +40,9 @@ def run(user_input, history):
         'penalty_alpha': 0,
         'length_penalty': 1,
         'early_stopping': False,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5,
+        'mirostat_eta': 0.1,
         'seed': -1,
         'add_bos_token': True,
         'truncation_length': 2048,
diff --git a/api-example-stream.py b/api-example-stream.py
index 326bac45..71eaa30c 100644
--- a/api-example-stream.py
+++ b/api-example-stream.py
@@ -34,6 +34,9 @@ async def run(context):
         'penalty_alpha': 0,
         'length_penalty': 1,
         'early_stopping': False,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5,
+        'mirostat_eta': 0.1,
         'seed': -1,
         'add_bos_token': True,
         'truncation_length': 2048,
diff --git a/api-example.py b/api-example.py
index 45c2864d..5cd0243a 100644
--- a/api-example.py
+++ b/api-example.py
@@ -26,6 +26,9 @@ def run(prompt):
         'penalty_alpha': 0,
         'length_penalty': 1,
         'early_stopping': False,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5,
+        'mirostat_eta': 0.1,
         'seed': -1,
         'add_bos_token': True,
         'truncation_length': 2048,
diff --git a/css/main.css b/css/main.css
index c9891737..e6ad9a38 100644
--- a/css/main.css
+++ b/css/main.css
@@ -34,7 +34,6 @@

 .dark a {
     color: white !important;
-    text-decoration: none !important;
 }

 ol li p, ul li p {
diff --git a/docs/Generation-parameters.md b/docs/Generation-parameters.md
new file mode 100644
index 00000000..18e2889e
--- /dev/null
+++ b/docs/Generation-parameters.md
@@ -0,0 +1,23 @@
+# Generation parameters
+
+For a description of the generation parameters provided by the transformers library, see this link: https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
+
+### llama.cpp
+
+llama.cpp only uses the following parameters:
+
+* temperature
+* top_p
+* top_k
+* repetition_penalty
+* mirostat_mode
+* mirostat_tau
+* mirostat_eta
+
+### RWKV
+
+RWKV only uses the following parameters:
+
+* temperature
+* top_p
+* top_k
diff --git a/docs/README.md b/docs/README.md
index b31026f4..37c4fe37 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -7,6 +7,7 @@
 * [Using LoRAs](Using-LoRAs.md)
 * [llama.cpp models](llama.cpp-models.md)
 * [RWKV model](RWKV-model.md)
+* [Generation parameters](Generation-parameters.md)
 * [Extensions](Extensions.md)
 * [Chat mode](Chat-mode.md)
 * [DeepSpeed](DeepSpeed.md)
diff --git a/extensions/api/util.py b/extensions/api/util.py
index 27c6d7e9..bd86f8d1 100644
--- a/extensions/api/util.py
+++ b/extensions/api/util.py
@@ -26,6 +26,9 @@ def build_parameters(body, chat=False):
         'penalty_alpha': float(body.get('penalty_alpha', 0)),
         'length_penalty': float(body.get('length_penalty', 1)),
         'early_stopping': bool(body.get('early_stopping', False)),
+        'mirostat_mode': int(body.get('mirostat_mode', 0)),
+        'mirostat_tau': float(body.get('mirostat_tau', 5)),
+        'mirostat_eta': float(body.get('mirostat_eta', 0.1)),
         'seed': int(body.get('seed', -1)),
         'add_bos_token': bool(body.get('add_bos_token', True)),
         'truncation_length': int(body.get('truncation_length', body.get('max_context_length', 2048))),
diff --git a/extensions/openai/script.py b/extensions/openai/script.py
index 9e560d94..49a16b07 100644
--- a/extensions/openai/script.py
+++ b/extensions/openai/script.py
@@ -216,6 +216,9 @@ class Handler(BaseHTTPRequestHandler):
             'penalty_alpha': 0.0,
             'length_penalty': 1,
             'early_stopping': False,
+            'mirostat_mode': 0,
+            'mirostat_tau': 5,
+            'mirostat_eta': 0.1,
             'ban_eos_token': False,
             'skip_special_tokens': True,
         }
@@ -526,6 +529,9 @@ class Handler(BaseHTTPRequestHandler):
             'penalty_alpha': 0.0,
             'length_penalty': 1,
             'early_stopping': False,
+            'mirostat_mode': 0,
+            'mirostat_tau': 5,
+            'mirostat_eta': 0.1,
             'ban_eos_token': False,
             'skip_special_tokens': True,
             'custom_stopping_strings': [],
diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py
index 94830898..2d351b43 100644
--- a/modules/llamacpp_model.py
+++ b/modules/llamacpp_model.py
@@ -59,7 +59,7 @@ class LlamaCppModel:
             string = string.encode()
         return self.model.tokenize(string)

-    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None):
+    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, mirostat_mode=0, mirostat_tau=5, mirostat_eta=0.1, callback=None):
         context = context if type(context) is str else context.decode()
         completion_chunks = self.model.create_completion(
             prompt=context,
@@ -68,6 +68,9 @@
             top_p=top_p,
             top_k=top_k,
             repeat_penalty=repetition_penalty,
+            mirostat_mode=int(mirostat_mode),
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
             stream=True
         )
         output = ""
diff --git a/modules/text_generation.py b/modules/text_generation.py
index e5fa4467..904d0d48 100644
--- a/modules/text_generation.py
+++ b/modules/text_generation.py
@@ -294,6 +294,10 @@ def generate_reply_custom(question, original_question, seed, state, eos_token=No
     for k in ['temperature', 'top_p', 'top_k', 'repetition_penalty']:
         generate_params[k] = state[k]

+    if shared.model_type == 'llamacpp':
+        for k in ['mirostat_mode', 'mirostat_tau', 'mirostat_eta']:
+            generate_params[k] = state[k]
+
     t0 = time.time()
     reply = ''
     try:
diff --git a/modules/ui.py b/modules/ui.py
index 702ec99c..683a1fba 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -37,7 +37,7 @@ def list_model_elements():


 def list_interface_input_elements(chat=False):
-    elements = ['max_new_tokens', 'seed', 'temperature', 'top_p', 'top_k', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'do_sample', 'penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping', 'add_bos_token', 'ban_eos_token', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', 'preset_menu', 'stream']
+    elements = ['max_new_tokens', 'seed', 'temperature', 'top_p', 'top_k', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'do_sample', 'penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'add_bos_token', 'ban_eos_token', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', 'preset_menu', 'stream']
     if chat:
         elements += ['name1', 'name2', 'greeting', 'context', 'chat_prompt_size', 'chat_generation_attempts', 'stop_at_newline', 'mode', 'instruction_template', 'character_menu', 'name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template', 'chat_style', 'chat-instruct_command']

diff --git a/server.py b/server.py
index 97097460..061ddbb5 100644
--- a/server.py
+++ b/server.py
@@ -97,7 +97,11 @@ def load_preset_values(preset_menu, state, return_dict=False):
         'length_penalty': 1,
         'no_repeat_ngram_size': 0,
         'early_stopping': False,
+        'mirostat_mode': 0,
+        'mirostat_tau': 5.0,
+        'mirostat_eta': 0.1,
     }
+
     with open(Path(f'presets/{preset_menu}.txt'), 'r') as infile:
         preset = infile.read()
     for i in preset.splitlines():
@@ -110,7 +114,7 @@
         return generate_params
     else:
         state.update(generate_params)
-        return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']]
+        return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']]


 def upload_soft_prompt(file):
@@ -434,27 +438,32 @@ def create_settings_menus(default_preset):
     with gr.Row():
         with gr.Column():
             with gr.Row():
-                shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset')
-                ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button')
-            with gr.Column():
-                shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)')
+                with gr.Column():
+                    with gr.Row():
+                        shared.gradio['preset_menu'] = gr.Dropdown(choices=utils.get_available_presets(), value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset')
+                        ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': utils.get_available_presets()}, 'refresh-button')
+
+                with gr.Column():
+                    shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)')

-    with gr.Row():
-        with gr.Column():
             with gr.Box():
-                gr.Markdown('Main parameters ([click here to view technical documentation](https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig))')
+                gr.Markdown('Main parameters')
                 with gr.Row():
                     with gr.Column():
                         shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature', info='Primary factor to control randomness of outputs. 0 = deterministic (only the most likely token is used). Higher value = more randomness.')
                         shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p', info='If not set to 1, select tokens with probabilities adding up to less than this number. Higher value = higher range of possible random results.')
                         shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k', info='Similar to top_p, but select instead only the top_k most likely tokens. Higher value = higher range of possible random results.')
                         shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p', info='If not set to 1, select only tokens that are at least this much more likely to appear than random tokens, given the prior text.')
+                        shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff', info='In units of 1e-4')
+                        shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff', info='In units of 1e-4')
+
                     with gr.Column():
                         shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty', info='Exponential penalty factor for repeating prior tokens. 1 means no penalty, higher value = less repetition, lower value = more repetition.')
                         shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty', info='Also known as the "Hallucinations filter". Used to penalize tokens that are *not* in the prior text. Higher value = more likely to stay in context, lower value = more likely to diverge.')
                         shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size', info='If not set to 0, specifies the length of token sets that are completely blocked from repeating at all. Higher values = blocks larger phrases, lower values = blocks words or letters from repeating. Only 0 or high values are a good idea in most cases.')
                         shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'], label='min_length', info='Minimum generation length in tokens.')
-                        shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample')
+                        shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample')
+
         with gr.Column():
             with gr.Box():
                 with gr.Row():
@@ -468,9 +477,12 @@
                         shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping')

                     with gr.Column():
+                        gr.Markdown('Mirostat (for llama.cpp)')
+                        shared.gradio['mirostat_mode'] = gr.Slider(0, 2, step=1, value=generate_params['mirostat_mode'], label='mirostat_mode')
+                        shared.gradio['mirostat_tau'] = gr.Slider(0, 10, step=0.01, value=generate_params['mirostat_tau'], label='mirostat_tau')
+                        shared.gradio['mirostat_eta'] = gr.Slider(0, 1, step=0.01, value=generate_params['mirostat_eta'], label='mirostat_eta')
+
                         gr.Markdown('Other')
-                        shared.gradio['epsilon_cutoff'] = gr.Slider(0, 9, value=generate_params['epsilon_cutoff'], step=0.01, label='epsilon_cutoff', info='In units of 1e-4')
-                        shared.gradio['eta_cutoff'] = gr.Slider(0, 20, value=generate_params['eta_cutoff'], step=0.01, label='eta_cutoff', info='In units of 1e-4')
     with gr.Accordion('Soft prompt', open=False):
         with gr.Row():
             shared.gradio['softprompts_menu'] = gr.Dropdown(choices=utils.get_available_softprompts(), value='None', label='Soft prompt')
@@ -492,7 +504,9 @@
         shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.')
         shared.gradio['stream'] = gr.Checkbox(value=not shared.args.no_stream, label='Activate text streaming')

-    shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio[k] for k in ['preset_menu', 'interface_state']], [shared.gradio[k] for k in ['interface_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']])
+    gr.Markdown('[Click here for more information.](https://github.com/oobabooga/text-generation-webui/docs/Generation-parameters.md)')
+
+    shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio[k] for k in ['preset_menu', 'interface_state']], [shared.gradio[k] for k in ['interface_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta']])
     shared.gradio['softprompts_menu'].change(load_soft_prompt, shared.gradio['softprompts_menu'], shared.gradio['softprompts_menu'], show_progress=True)
     shared.gradio['upload_softprompt'].upload(upload_soft_prompt, shared.gradio['upload_softprompt'], shared.gradio['softprompts_menu'])
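
For reference, the sketch below (not part of the patch) shows how the three new sampling parameters reach llama-cpp-python's create_completion() call, mirroring the change to LlamaCppModel.generate() above. The model path and prompt are placeholders, and mirostat_mode follows the llama.cpp convention of 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0.

```python
# Illustrative only: call llama-cpp-python directly with the same mirostat
# arguments that the patched LlamaCppModel.generate() forwards.
from llama_cpp import Llama

# Placeholder model path -- substitute a real GGML model file.
llm = Llama(model_path="models/llama-7b.ggmlv3.q4_0.bin")

completion_chunks = llm.create_completion(
    prompt="Write a haiku about entropy.",
    max_tokens=128,
    temperature=0.7,
    top_p=0.9,
    top_k=40,
    repeat_penalty=1.1,
    mirostat_mode=2,   # 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0
    mirostat_tau=5.0,  # target "surprise" (entropy); lower = more focused output
    mirostat_eta=0.1,  # learning rate used to steer sampling toward tau
    stream=True,
)

# Collect the streamed chunks, as the patched generate() loop does.
output = ""
for chunk in completion_chunks:
    output += chunk["choices"][0]["text"]

print(output)
```

In the web UI, the same three values are exposed as the mirostat_mode, mirostat_tau, and mirostat_eta sliders added above, and the patched API examples accept them as keys of the same names in the request body.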