diff --git a/README.md b/README.md
index 2c836f40..c289a5b2 100644
--- a/README.md
+++ b/README.md
@@ -266,6 +266,8 @@ Optionally, you can use the following command-line flags:
 | Flag             | Description |
 |------------------|-------------|
 |`--gpu-split`     | Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. `20,7,7` |
+|`--max_seq_len MAX_SEQ_LEN`           | Maximum sequence length (default: 2048). |
+|`--compress_pos_emb COMPRESS_POS_EMB` | Positional embeddings compression factor (integer, default: 1). Should typically be set to max_seq_len / 2048, e.g. `2` when max_seq_len is 4096. |
 
 #### GPTQ-for-LLaMa
 
diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py
index 1b80a33d..046f3d08 100644
--- a/api-examples/api-example-chat-stream.py
+++ b/api-examples/api-example-chat-stream.py
@@ -29,7 +29,6 @@ async def run(user_input, history):
         'regenerate': False,
         '_continue': False,
         'stop_at_newline': False,
-        'chat_prompt_size': 2048,
         'chat_generation_attempts': 1,
         'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
 
diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py
index fb2847d4..7048043a 100644
--- a/api-examples/api-example-chat.py
+++ b/api-examples/api-example-chat.py
@@ -23,7 +23,6 @@ def run(user_input, history):
         'regenerate': False,
         '_continue': False,
         'stop_at_newline': False,
-        'chat_prompt_size': 2048,
         'chat_generation_attempts': 1,
         'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
 
diff --git a/extensions/api/util.py b/extensions/api/util.py
index 22f8a80a..01ae1636 100644
--- a/extensions/api/util.py
+++ b/extensions/api/util.py
@@ -53,7 +53,6 @@ def build_parameters(body, chat=False):
         name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)
         generate_params.update({
             'stop_at_newline': bool(body.get('stop_at_newline', shared.settings['stop_at_newline'])),
-            'chat_prompt_size': int(body.get('chat_prompt_size', shared.settings['chat_prompt_size'])),
             'chat_generation_attempts': int(body.get('chat_generation_attempts', shared.settings['chat_generation_attempts'])),
             'mode': str(body.get('mode', 'chat')),
             'name1': name1,
diff --git a/extensions/openai/cache_embedding_model.py b/extensions/openai/cache_embedding_model.py
old mode 100755
new mode 100644
diff --git a/models/config.yaml b/models/config.yaml
index 4b01a6f2..704fb8bf 100644
--- a/models/config.yaml
+++ b/models/config.yaml
@@ -104,12 +104,8 @@ llama-65b-gptq-3bit:
   mode: 'instruct'
   instruction_template: 'StableLM'
   truncation_length: 4096
-  chat_prompt_size: 4096
-  chat_prompt_size_max: 4096
 .*stablelm-base:
   truncation_length: 4096
-  chat_prompt_size: 4096
-  chat_prompt_size_max: 4096
 .*wizardlm:
   mode: 'instruct'
   model_type: 'llama'
@@ -237,8 +233,6 @@ TheBloke_WizardLM-30B-GPTQ:
   instruction_template: 'Minotaur'
 .*minotaur-15b:
   truncation_length: 8192
-  chat_prompt_size: 8192
-  chat_prompt_size_max: 8192
 .*orca_mini:
   mode: 'instruct'
   instruction_template: 'Orca Mini'
diff --git a/modules/chat.py b/modules/chat.py
index e30fc516..4329f673 100644
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -57,7 +57,7 @@ def generate_chat_prompt(user_input, state, **kwargs):
     is_instruct = state['mode'] == 'instruct'
 
     # Find the maximum prompt size
-    max_length = min(get_max_prompt_length(state), state['chat_prompt_size'])
+    max_length = get_max_prompt_length(state)
     all_substrings = {
         'chat': get_turn_substrings(state, instruct=False),
         'instruct': get_turn_substrings(state, instruct=True)
diff --git a/modules/exllama.py b/modules/exllama.py
index 87ac924f..449926eb 100644
--- a/modules/exllama.py
+++ b/modules/exllama.py
@@ -46,6 +46,8 @@ class ExllamaModel:
 
         config = ExLlamaConfig(str(model_config_path))
         config.model_path = str(model_path)
+        config.max_seq_len = shared.args.max_seq_len
+        config.compress_pos_emb = shared.args.compress_pos_emb
         if shared.args.gpu_split:
             config.set_auto_map(shared.args.gpu_split)
             config.gpu_peer_fix = True
diff --git a/modules/exllama_hf.py b/modules/exllama_hf.py
index c54eee4a..6db620ba 100644
--- a/modules/exllama_hf.py
+++ b/modules/exllama_hf.py
@@ -91,7 +91,8 @@ class ExllamaHF(PreTrainedModel):
         assert weight_path is not None, f'could not find weight in "{pretrained_model_name_or_path}"'
 
         config.model_path = str(weight_path)
-
+        config.max_seq_len = shared.args.max_seq_len
+        config.compress_pos_emb = shared.args.compress_pos_emb
         if shared.args.gpu_split:
             config.set_auto_map(shared.args.gpu_split)
             config.gpu_peer_fix = True
diff --git a/modules/loaders.py b/modules/loaders.py
index 35202a77..44e893fb 100644
--- a/modules/loaders.py
+++ b/modules/loaders.py
@@ -55,10 +55,14 @@ loaders_and_params = {
     ],
     'ExLlama' : [
         'gpu_split',
+        'max_seq_len',
+        'compress_pos_emb',
         'exllama_info',
     ],
     'ExLlama_HF' : [
         'gpu_split',
+        'max_seq_len',
+        'compress_pos_emb',
         'exllama_HF_info',
     ]
 }
diff --git a/modules/shared.py b/modules/shared.py
index 6258648b..dfa9cd38 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -51,15 +51,12 @@ settings = {
     'skip_special_tokens': True,
     'truncation_length': 2048,
     'truncation_length_min': 0,
-    'truncation_length_max': 8192,
+    'truncation_length_max': 16384,
     'mode': 'chat',
     'start_with': '',
     'chat_style': 'cai-chat',
     'instruction_template': 'None',
     'chat-instruct_command': 'Continue the chat dialogue below. Write a single reply for the character "<|character|>".\n\n<|prompt|>',
-    'chat_prompt_size': 2048,
-    'chat_prompt_size_min': 0,
-    'chat_prompt_size_max': 8192,
     'chat_generation_attempts': 1,
     'chat_generation_attempts_min': 1,
     'chat_generation_attempts_max': 10,
@@ -152,6 +149,8 @@ parser.add_argument('--desc_act', action='store_true', help='For models that don
 
 # ExLlama
 parser.add_argument('--gpu-split', type=str, help="Comma-separated list of VRAM (in GB) to use per GPU device for model layers, e.g. 20,7,7")
+parser.add_argument('--max_seq_len', type=int, default=2048, help="Maximum sequence length.")
+parser.add_argument('--compress_pos_emb', type=int, default=1, help="Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.")
 
 # FlexGen
 parser.add_argument('--flexgen', action='store_true', help='DEPRECATED')
diff --git a/modules/ui.py b/modules/ui.py
index 6d09fbc3..9f8cd5ab 100644
--- a/modules/ui.py
+++ b/modules/ui.py
@@ -30,7 +30,7 @@ theme = gr.themes.Default(
 
 
 def list_model_elements():
-    elements = ['loader', 'cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed', 'gpu_split']
+    elements = ['loader', 'cpu_memory', 'auto_devices', 'disk', 'cpu', 'bf16', 'load_in_8bit', 'trust_remote_code', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', 'model_type', 'pre_layer', 'triton', 'desc_act', 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', 'threads', 'n_batch', 'no_mmap', 'mlock', 'n_gpu_layers', 'n_ctx', 'llama_cpp_seed', 'gpu_split', 'max_seq_len', 'compress_pos_emb']
     for i in range(torch.cuda.device_count()):
         elements.append(f'gpu_memory_{i}')
 
@@ -40,7 +40,7 @@ def list_model_elements():
 def list_interface_input_elements(chat=False):
     elements = ['max_new_tokens', 'seed', 'temperature', 'top_p', 'top_k', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'repetition_penalty', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'min_length', 'do_sample', 'penalty_alpha', 'num_beams', 'length_penalty', 'early_stopping', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'add_bos_token', 'ban_eos_token', 'truncation_length', 'custom_stopping_strings', 'skip_special_tokens', 'preset_menu', 'stream', 'tfs', 'top_a']
     if chat:
-        elements += ['name1', 'name2', 'greeting', 'context', 'chat_prompt_size', 'chat_generation_attempts', 'stop_at_newline', 'mode', 'instruction_template', 'character_menu', 'name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template', 'chat_style', 'chat-instruct_command']
+        elements += ['name1', 'name2', 'greeting', 'context', 'chat_generation_attempts', 'stop_at_newline', 'mode', 'instruction_template', 'character_menu', 'name1_instruct', 'name2_instruct', 'context_instruct', 'turn_template', 'chat_style', 'chat-instruct_command']
 
     elements += list_model_elements()
     return elements
diff --git a/server.py b/server.py
index a4f73bcc..455b6539 100644
--- a/server.py
+++ b/server.py
@@ -216,13 +216,15 @@ def create_model_menus():
                         shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads)
                         shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, value=shared.args.n_batch)
                         shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=128, value=shared.args.n_gpu_layers)
-                        shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=8192, step=1, label="n_ctx", value=shared.args.n_ctx)
+                        shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=16384, step=256, label="n_ctx", value=shared.args.n_ctx)
                         shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None")
                         shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None")
                         shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None", "llama", "opt", "gptj"], value=shared.args.model_type or "None")
                         shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0)
                         shared.gradio['autogptq_info'] = gr.Markdown('On some systems, AutoGPTQ can be 2x slower than GPTQ-for-LLaMa. You can manually select the GPTQ-for-LLaMa loader above.')
                         shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7')
+                        shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=2048, maximum=16384, step=256, info='Maximum sequence length.')
+                        shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should typically be set to max_seq_len / 2048.')
 
                     with gr.Column():
                         shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
@@ -300,10 +302,9 @@ def create_chat_settings_menus():
         with gr.Row():
             with gr.Column():
                 shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens'])
-                shared.gradio['chat_prompt_size'] = gr.Slider(minimum=shared.settings['chat_prompt_size_min'], maximum=shared.settings['chat_prompt_size_max'], step=1, label='chat_prompt_size', info='Set limit on prompt size by removing old messages (while retaining context and user input)', value=shared.settings['chat_prompt_size'])
+                shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)', info='New generations will be called until either this number is reached or no new content is generated between two iterations.')
 
             with gr.Column():
-                shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)', info='New generations will be called until either this number is reached or no new content is generated between two iterations.')
                 shared.gradio['stop_at_newline'] = gr.Checkbox(value=shared.settings['stop_at_newline'], label='Stop generating at new line character')
 
 
@@ -366,7 +367,7 @@ def create_settings_menus(default_preset):
             with gr.Box():
                 with gr.Row():
                     with gr.Column():
-                        shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=1, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
+                        shared.gradio['truncation_length'] = gr.Slider(value=shared.settings['truncation_length'], minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.')
                         shared.gradio['custom_stopping_strings'] = gr.Textbox(lines=1, value=shared.settings["custom_stopping_strings"] or None, label='Custom stopping strings', info='In addition to the defaults. Written between "" and separated by commas. For instance: "\\nYour Assistant:", "\\nThe assistant:"')
                     with gr.Column():
                         shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.')
diff --git a/settings-template.yaml b/settings-template.yaml
index c9a24b75..e949f697 100644
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -20,7 +20,7 @@ ban_eos_token: false
 skip_special_tokens: true
 truncation_length: 2048
 truncation_length_min: 0
-truncation_length_max: 8192
+truncation_length_max: 16384
 mode: chat
 start_with: ''
 chat_style: cai-chat
@@ -30,9 +30,6 @@ chat-instruct_command: 'Continue the chat dialogue below. Write a single reply f
 
 
   <|prompt|>'
-chat_prompt_size: 2048
-chat_prompt_size_min: 0
-chat_prompt_size_max: 8192
 chat_generation_attempts: 1
 chat_generation_attempts_min: 1
 chat_generation_attempts_max: 10