From d9fabdde40af766837d3cc7d0758189ab0f6ea8d Mon Sep 17 00:00:00 2001
From: atriantafy
Date: Wed, 12 Jul 2023 04:01:03 +0100
Subject: [PATCH] Add context_instruct to API. Load default model instruction
 template … (#2688)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api-examples/api-example-chat-stream.py |  5 ++-
 api-examples/api-example-chat.py        |  3 +-
 api-examples/api-example-model.py       | 50 ++++++++++++-------------
 api-examples/api-example-stream.py      |  2 +-
 api-examples/api-example.py             |  2 +-
 extensions/api/util.py                  |  7 +++-
 6 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/api-examples/api-example-chat-stream.py b/api-examples/api-example-chat-stream.py
index 8e37b569..14f6f9d6 100644
--- a/api-examples/api-example-chat-stream.py
+++ b/api-examples/api-example-chat-stream.py
@@ -23,7 +23,8 @@ async def run(user_input, history):
         'history': history,
         'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
         'character': 'Example',
-        'instruction_template': 'Vicuna-v1.1',
+        'instruction_template': 'Vicuna-v1.1',  # Will get autodetected if unset
+        # 'context_instruct': '',  # Optional
         'your_name': 'You',
 
         'regenerate': False,
@@ -34,7 +35,7 @@ async def run(user_input, history):
 
         # Generation params. If 'preset' is set to different than 'None', the values
         # in presets/preset-name.yaml are used instead of the individual numbers.
-        'preset': 'None', 
+        'preset': 'None',
         'do_sample': True,
         'temperature': 0.7,
         'top_p': 0.1,
diff --git a/api-examples/api-example-chat.py b/api-examples/api-example-chat.py
index 23f2f186..0e155c63 100644
--- a/api-examples/api-example-chat.py
+++ b/api-examples/api-example-chat.py
@@ -17,7 +17,8 @@ def run(user_input, history):
         'history': history,
         'mode': 'instruct',  # Valid options: 'chat', 'chat-instruct', 'instruct'
         'character': 'Example',
-        'instruction_template': 'Vicuna-v1.1',
+        'instruction_template': 'Vicuna-v1.1',  # Will get autodetected if unset
+        # 'context_instruct': '',  # Optional
         'your_name': 'You',
 
         'regenerate': False,
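Usage sketch for the two new request fields above (not part of the patch): it assumes the same /api/v1/chat endpoint and response shape as api-examples/api-example-chat.py; the host and prompt values are placeholders.

import requests

HOST = 'localhost:5000'

def chat(user_input, history):
    request = {
        'user_input': user_input,
        'history': history,
        'mode': 'instruct',
        # 'instruction_template' is deliberately omitted: with this patch the
        # server falls back to its configured default, then to 'Vicuna-v1.1'.
        # New optional field: overrides the template's instruction context.
        'context_instruct': 'Answer in exactly one sentence.',
        'max_new_tokens': 200,
    }
    response = requests.post(f'http://{HOST}/api/v1/chat', json=request)
    if response.status_code == 200:
        # Same response shape as the chat examples in this patch.
        return response.json()['results'][0]['history']

history = {'internal': [], 'visible': []}
print(chat('What is the capital of France?', history))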
"disk": false, + # "disk_cache_dir": "cache", }, } @@ -104,26 +105,25 @@ def complex_model_load(model): req['args']['load_in_8bit'] = True elif '-hf' in model or 'fp16' in model: if '7b' in model: - req['args']['bf16'] = True # for 24GB + req['args']['bf16'] = True # for 24GB elif '13b' in model: - req['args']['load_in_8bit'] = True # for 24GB + req['args']['load_in_8bit'] = True # for 24GB elif 'ggml' in model: - #req['args']['threads'] = 16 + # req['args']['threads'] = 16 if '7b' in model: req['args']['n_gpu_layers'] = 100 elif '13b' in model: req['args']['n_gpu_layers'] = 100 elif '30b' in model or '33b' in model: - req['args']['n_gpu_layers'] = 59 # 24GB + req['args']['n_gpu_layers'] = 59 # 24GB elif '65b' in model: - req['args']['n_gpu_layers'] = 42 # 24GB + req['args']['n_gpu_layers'] = 42 # 24GB elif 'rwkv' in model: req['args']['rwkv_cuda_on'] = True if '14b' in model: - req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB + req['args']['rwkv_strategy'] = 'cuda f16i8' # 24GB else: - req['args']['rwkv_strategy'] = 'cuda f16' # 24GB - + req['args']['rwkv_strategy'] = 'cuda f16' # 24GB return model_api(req) @@ -134,7 +134,7 @@ if __name__ == '__main__': resp = complex_model_load(model) if 'error' in resp: - print (f"❌ {model} FAIL Error: {resp['error']['message']}") + print(f"❌ {model} FAIL Error: {resp['error']['message']}") continue else: print_basic_model_info(resp) @@ -142,12 +142,12 @@ if __name__ == '__main__': ans = generate("0,1,1,2,3,5,8,13,", tokens=2) if '21' in ans: - print (f"✅ {model} PASS ({ans})") + print(f"✅ {model} PASS ({ans})") else: - print (f"❌ {model} FAIL ({ans})") + print(f"❌ {model} FAIL ({ans})") except Exception as e: - print (f"❌ {model} FAIL Exception: {repr(e)}") + print(f"❌ {model} FAIL Exception: {repr(e)}") # 0,1,1,2,3,5,8,13, is the fibonacci sequence, the next number is 21. diff --git a/api-examples/api-example-stream.py b/api-examples/api-example-stream.py index 79a01e4d..1ae5a91c 100644 --- a/api-examples/api-example-stream.py +++ b/api-examples/api-example-stream.py @@ -23,7 +23,7 @@ async def run(context): # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. - 'preset': 'None', + 'preset': 'None', 'do_sample': True, 'temperature': 0.7, 'top_p': 0.1, diff --git a/api-examples/api-example.py b/api-examples/api-example.py index b09823c3..4e45de9e 100644 --- a/api-examples/api-example.py +++ b/api-examples/api-example.py @@ -15,7 +15,7 @@ def run(prompt): # Generation params. If 'preset' is set to different than 'None', the values # in presets/preset-name.yaml are used instead of the individual numbers. 
diff --git a/extensions/api/util.py b/extensions/api/util.py
index a89365ce..a25c7885 100644
--- a/extensions/api/util.py
+++ b/extensions/api/util.py
@@ -59,7 +59,10 @@ def build_parameters(body, chat=False):
 
     if chat:
         character = body.get('character')
-        instruction_template = body.get('instruction_template')
+        instruction_template = body.get('instruction_template', shared.settings['instruction_template'])
+        if str(instruction_template) == "None":
+            instruction_template = "Vicuna-v1.1"
+
         name1, name2, _, greeting, context, _ = load_character_memoized(character, str(body.get('your_name', shared.settings['name1'])), shared.settings['name2'], instruct=False)
         name1_instruct, name2_instruct, _, _, context_instruct, turn_template = load_character_memoized(instruction_template, '', '', instruct=True)
         generate_params.update({
@@ -72,7 +75,7 @@ def build_parameters(body, chat=False):
             'greeting': greeting,
             'name1_instruct': name1_instruct,
             'name2_instruct': name2_instruct,
-            'context_instruct': context_instruct,
+            'context_instruct': body.get('context_instruct', context_instruct),
             'turn_template': turn_template,
             'chat-instruct_command': str(body.get('chat-instruct_command', shared.settings['chat-instruct_command'])),
             'history': body.get('history', {'internal': [], 'visible': []})
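The net effect of the util.py change, as a simplified, runnable paraphrase of the patched build_parameters() (resolve_instruct and its arguments are illustrative names, not code from the patch):

def resolve_instruct(body, settings, template_context):
    # 1. Template precedence: request field > server default > 'Vicuna-v1.1'.
    template = body.get('instruction_template', settings['instruction_template'])
    if str(template) == "None":
        template = "Vicuna-v1.1"
    # 2. Context precedence: an explicit 'context_instruct' in the request
    #    overrides the context loaded from the template file.
    context = body.get('context_instruct', template_context)
    return template, context

# A request that names no template but overrides the context:
template, context = resolve_instruct(
    {'context_instruct': 'Be terse.'},
    {'instruction_template': None},
    'A chat between a user and an assistant.')
print(template, '|', context)  # Vicuna-v1.1 | Be terse.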