mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-25 09:19:23 +01:00
Supercharging superbooga (#3272)
This commit is contained in:
parent
ad00b8eb26
commit
0845724a89
3
.gitignore
vendored
3
.gitignore
vendored
@ -33,4 +33,5 @@ models/config-user.yaml
|
|||||||
|
|
||||||
.DS_Store
|
.DS_Store
|
||||||
Thumbs.db
|
Thumbs.db
|
||||||
installer_files/
|
.chroma
|
||||||
|
installer_files
|
||||||
|
207
extensions/superboogav2/api.py
Normal file
207
extensions/superboogav2/api.py
Normal file
@ -0,0 +1,207 @@
|
|||||||
|
"""
|
||||||
|
This module is responsible for the VectorDB API. It currently supports:
|
||||||
|
* DELETE api/v1/clear
|
||||||
|
- Clears the whole DB.
|
||||||
|
* POST api/v1/add
|
||||||
|
- Add some corpus to the DB. You can also specify metadata to be added alongside it.
|
||||||
|
* POST api/v1/delete
|
||||||
|
- Delete specific records with given metadata.
|
||||||
|
* POST api/v1/get
|
||||||
|
- Get results from chromaDB.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
||||||
|
from urllib.parse import urlparse, parse_qs
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
|
from modules import shared
|
||||||
|
from modules.logging_colors import logger
|
||||||
|
|
||||||
|
from .chromadb import ChromaCollector
|
||||||
|
from .data_processor import process_and_add_to_collector
|
||||||
|
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
|
||||||
|
class CustomThreadingHTTPServer(ThreadingHTTPServer):
    """ThreadingHTTPServer variant that carries a ChromaCollector along.

    The collector is handed to every request handler instance so each
    request thread can talk to the same vector DB.
    """

    def __init__(self, server_address, RequestHandlerClass, collector: ChromaCollector, bind_and_activate=True):
        # Keep a reference before the base class binds/activates the socket.
        self.collector = collector
        super(CustomThreadingHTTPServer, self).__init__(server_address, RequestHandlerClass, bind_and_activate)

    def finish_request(self, request, client_address):
        # Override the default so the handler also receives the collector.
        handler_cls = self.RequestHandlerClass
        handler_cls(request, client_address, self, self.collector)
class Handler(BaseHTTPRequestHandler):
    """HTTP request handler exposing the ChromaDB collector as a small REST API.

    Endpoints:
        POST   /api/v1/add     (or /api/add)    - add a corpus (plus optional metadata) to the DB
        POST   /api/v1/delete  (or /api/delete) - delete records matching the given metadata
        POST   /api/v1/get     (or /api/get)    - query the DB for the most relevant chunks
        DELETE /api/v1/clear   (or /api/clear)  - wipe the whole DB
    All responses are JSON; CORS headers are added on every response.
    """

    def __init__(self, request, client_address, server, collector: ChromaCollector):
        # Must be set before super().__init__, which processes the request
        # immediately inside the constructor.
        self.collector = collector
        super().__init__(request, client_address, server)

    def _send_json_error(self, status_code: int, message: str):
        # Shared helper: emit a JSON error body with the given HTTP status.
        self.send_response(status_code)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        response = json.dumps({"error": message})
        self.wfile.write(response.encode('utf-8'))

    def _send_412_error(self, message):
        self._send_json_error(412, message)

    def _send_404_error(self):
        self._send_json_error(404, "Resource not found")

    def _send_400_error(self, error_message: str):
        self._send_json_error(400, error_message)

    def _send_200_response(self, message: str):
        """Send a 200 with a JSON body.

        `message` may be a plain string (wrapped as {"message": ...}) or an
        already-structured dict/list, serialized as-is.
        """
        self.send_response(200)
        self.send_header("Content-type", "application/json")
        self.end_headers()

        if isinstance(message, str):
            response = json.dumps({"message": message})
        else:
            response = json.dumps(message)

        self.wfile.write(response.encode('utf-8'))

    def _handle_get(self, search_strings: list[str], n_results: int, max_token_count: int, sort_param: str):
        """Query the collector, sorted per `sort_param` (distance is the default)."""
        if sort_param == parameters.SORT_ID:
            results = self.collector.get_sorted_by_id(search_strings, n_results, max_token_count)
        else:
            # parameters.SORT_DISTANCE, or any unrecognized value, falls back to distance.
            results = self.collector.get_sorted_by_dist(search_strings, n_results, max_token_count)

        return {
            "results": results
        }

    def do_GET(self):
        # No GET endpoints are exposed.
        self._send_404_error()

    def do_POST(self):
        try:
            content_length = int(self.headers['Content-Length'])
            body = json.loads(self.rfile.read(content_length).decode('utf-8'))

            parsed_path = urlparse(self.path)
            path = parsed_path.path
            query_params = parse_qs(parsed_path.query)

            if path in ['/api/v1/add', '/api/add']:
                corpus = body.get('corpus')
                if corpus is None:
                    self._send_412_error("Missing parameter 'corpus'")
                    return

                clear_before_adding = body.get('clear_before_adding', False)
                metadata = body.get('metadata')
                process_and_add_to_collector(corpus, self.collector, clear_before_adding, metadata)
                self._send_200_response("Data successfully added")

            elif path in ['/api/v1/delete', '/api/delete']:
                metadata = body.get('metadata')
                # BUG FIX: this previously tested `corpus`, which is unbound in
                # this branch (NameError swallowed by the except below).
                if metadata is None:
                    self._send_412_error("Missing parameter 'metadata'")
                    return

                self.collector.delete(ids_to_delete=None, where=metadata)
                self._send_200_response("Data successfully deleted")

            elif path in ['/api/v1/get', '/api/get']:
                search_strings = body.get('search_strings')
                if search_strings is None:
                    self._send_412_error("Missing parameter 'search_strings'")
                    return

                # Fall back to the extension's configured defaults.
                n_results = body.get('n_results')
                if n_results is None:
                    n_results = parameters.get_chunk_count()

                max_token_count = body.get('max_token_count')
                if max_token_count is None:
                    max_token_count = parameters.get_max_token_count()

                # Sort order comes from the query string, not the JSON body.
                sort_param = query_params.get('sort', ['distance'])[0]

                results = self._handle_get(search_strings, n_results, max_token_count, sort_param)
                self._send_200_response(results)

            else:
                self._send_404_error()
        except Exception as e:
            # Broad by design: any parse/DB failure becomes a 400 with the message.
            self._send_400_error(str(e))

    def do_DELETE(self):
        try:
            parsed_path = urlparse(self.path)
            path = parsed_path.path
            query_params = parse_qs(parsed_path.query)

            if path in ['/api/v1/clear', '/api/clear']:
                self.collector.clear()
                self._send_200_response("Data successfully cleared")
            else:
                self._send_404_error()
        except Exception as e:
            self._send_400_error(str(e))

    def do_OPTIONS(self):
        # CORS preflight: headers are injected by end_headers().
        self.send_response(200)
        self.end_headers()

    def end_headers(self):
        # Permissive CORS so browser extensions / web UIs can call the API.
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', '*')
        self.send_header('Access-Control-Allow-Headers', '*')
        self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')
        super().end_headers()
|
class APIManager:
    """Owns the lifecycle of the ChromaDB HTTP API server.

    Wraps a CustomThreadingHTTPServer that serves requests on a daemon
    thread; start_server/stop_server are idempotent-ish (starting twice is
    a no-op, stopping when not running is a no-op).
    """

    def __init__(self, collector: ChromaCollector):
        self.server = None              # CustomThreadingHTTPServer while running, else None
        self.collector = collector
        self.is_running = False

    def start_server(self, port: int):
        """Start the API server on `port`; no-op if already running."""
        if self.server is not None:
            # Use the project logger instead of print for consistency.
            logger.info('Server already running.')
            return

        # Bind to all interfaces only when the user opted in via --listen.
        address = '0.0.0.0' if shared.args.listen else '127.0.0.1'
        self.server = CustomThreadingHTTPServer((address, port), Handler, self.collector)

        logger.info(f'Starting chromaDB API at http://{address}:{port}/api')

        # Daemon thread so the API never blocks interpreter shutdown.
        Thread(target=self.server.serve_forever, daemon=True).start()

        self.is_running = True

    def stop_server(self):
        """Shut down and close the server if it is running."""
        if self.server is not None:
            # FIX: dropped the pointless f-string prefix (no placeholders).
            logger.info('Stopping chromaDB API.')
            self.server.shutdown()
            self.server.server_close()
            self.server = None
            self.is_running = False

    def is_server_running(self):
        """Return True while the server thread is serving."""
        return self.is_running
72
extensions/superboogav2/benchmark.py
Normal file
72
extensions/superboogav2/benchmark.py
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
"""
|
||||||
|
This module implements a benchmark function to evaluate the performance of the embedding pipeline. It expects a configuration JSON file. It must have questions and expected retrieved text.
|
||||||
|
For each question, it's essential to have variants of that question. Language is fluid and each person might have their own spin on how they may ask it.
|
||||||
|
|
||||||
|
At the end, it will save the results inside a benchmark_{sysdate}.txt file in the main directory.
|
||||||
|
|
||||||
|
The benchmark function will return the score as an integer.
|
||||||
|
"""
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .data_processor import process_and_add_to_collector, preprocess_text
|
||||||
|
from .parameters import get_chunk_count, get_max_token_count
|
||||||
|
from .utils import create_metadata_source
|
||||||
|
|
||||||
|
def benchmark(config_path, collector):
    """Score the embedding pipeline against a benchmark config.

    The JSON file at `config_path` is a list of items, each with a "text"
    path to a corpus file and "questions" containing question variants and
    the "criteria" snippets expected among the retrieved chunks. Results
    are appended to benchmark_{sysdate}.txt in the working directory.

    Args:
        config_path: path to the benchmark JSON configuration.
        collector: ChromaCollector the corpus is loaded into and queried.

    Returns:
        (total_points, max_points) tuple of ints.

    Raises:
        FileNotFoundError: if a corpus file referenced by the config is missing.
    """
    # Timestamp the log file so repeated runs don't clobber each other.
    sysdate = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"benchmark_{sysdate}.txt"

    # Open the log file in append mode
    with open(filename, 'a') as log:
        with open(config_path, 'r') as f:
            data = json.load(f)

        total_points = 0
        max_points = 0

        for item in data:
            filepath = item["text"]
            corpus = ""

            # Check if the file exists
            if os.path.isfile(Path(filepath)):
                # Load the corpus and (re)populate the collector with it.
                with open(Path(filepath), 'r') as file:
                    corpus = file.read()
                process_and_add_to_collector(corpus, collector, True, create_metadata_source('benchmark'))
            else:
                # BUG FIX: the original raised a bare f-string, which is a
                # TypeError in Python 3 (exceptions must derive from
                # BaseException). Raise a proper exception instead.
                raise FileNotFoundError(f'Cannot find specified file {filepath}.')

            for question_group in item["questions"]:
                question_variants = question_group["question_variants"]
                criteria = question_group["criteria"]

                for q in question_variants:
                    max_points += len(criteria)
                    processed_text = preprocess_text(q)

                    # Get the most similar chunks
                    results = collector.get_sorted_by_dist(processed_text, n_results=get_chunk_count(), max_token_count=get_max_token_count())

                    points = 0

                    # A criterion scores one point if it appears verbatim in
                    # any retrieved chunk; each criterion counts at most once.
                    for c in criteria:
                        for p in results:
                            if c in p:
                                points += 1
                                total_points += 1
                                break

                    info = f"The question '{q}' scored {points}/{len(criteria)} points."
                    print(info, file=log)

                print('\n---\n', file=log)

        print(f'##Total points:\n\n{total_points}/{max_points}', file=log)

    return total_points, max_points
9471
extensions/superboogav2/benchmark_texts/aircraft_lease.txt
Normal file
9471
extensions/superboogav2/benchmark_texts/aircraft_lease.txt
Normal file
File diff suppressed because it is too large
Load Diff
291
extensions/superboogav2/benchmark_texts/questions.json
Normal file
291
extensions/superboogav2/benchmark_texts/questions.json
Normal file
@ -0,0 +1,291 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"text": "extensions/superboogav2/benchmark_texts/aircraft_lease.txt",
|
||||||
|
"questions": [
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What is a wet lease?",
|
||||||
|
"Agh, I'm really wracking my brain here, but can't figure it out. What is a wet lease?",
|
||||||
|
"I've been trying to wrap my head around this concept and it's just not clicking. Could you elucidate the concept of a wet lease?",
|
||||||
|
"I'm finding it so hard to understand this whole wet lease thing! Would you be so kind as to explicate on the matter of what is known as a wet lease in the domain of aviation?",
|
||||||
|
"I've spent hours trying to grasp this and I'm still lost. Could you engage in a detailed exploration of the intricate and nuanced topic that is the wet lease, as it is commonly understood and applied within the broad and complex ecosystem of aviation?"
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"WET LEASE shall mean any arrangement whereby Lessee agrees to\n operate the Aircraft under a contractual arrangement with a third party\n pursuant to which no rights to any aircraft specifically identified by serial\n number or registration number are granted to such third party and pursuant to\n which the Aircraft (i) remains at all times under the sole and complete\n operational control of Lessee",
|
||||||
|
"(ii) shall be operated solely by cockpit crew\n employed by Lessee possessing all current certificates and licenses required by\n applicable Laws and (iii) shall be maintained by Lessee in accordance with all\n the provisions of the Lease including, but not limited to, Sections 6(d) and 12\n (it being understood and agreed by Lessor and Lessee that any Wet Lease shall,\n by its terms, be in all cases subject and subordinate to this Lease)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What is PBGC?",
|
||||||
|
"I'm stumped! Can you tell me what PBGC is?",
|
||||||
|
"I've been racking my brain trying to understand PBGC. Would you mind explaining the concept to me?",
|
||||||
|
"I've been trying to grasp what PBGC represents in the context of pension benefits, but I'm not getting it. Would it be possible for you to expound upon the matter?",
|
||||||
|
"I'm having trouble understanding the multi-dimensional entity known as 'PBGC'. Could you embark on an exploration of it? How is it traditionally comprehended, interpreted, and implemented within the sphere of pension benefits?"
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"PBGC shall mean the Pension Benefit Guaranty Corporation\n established pursuant to Subtitle A of Part IV of ERISA, and any successor\n thereof."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What is LIEN?",
|
||||||
|
"I can't seem to find information on what LIEN is. Can you help me?",
|
||||||
|
"I'm feeling stuck. I can't seem to find any information on what LIEN is. Could you provide some insights?",
|
||||||
|
"It's like I'm chasing my tail here. I've been searching high and low and I just can't seem to find anything that clearly explains what LIEN is. Could you lend me a hand in understanding it?",
|
||||||
|
"I'm in a bit of a pickle. I've scoured the internet, flipped through countless books, and I still can't seem to find any definitive information on what LIEN is. Could you possibly help me get my head around it?"
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"LIEN shall mean any mortgage, pledge, lien, charge,\n encumbrance, lease, exercise of rights, security interest or claim."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What happens if the Lease is terminated by operation of law?",
|
||||||
|
"I'm a bit lost here. What happens if the Lease is terminated by operation of law? Can you help me understand?",
|
||||||
|
"I've been trying to figure this out but I'm stuck. What exactly happens if the Lease is terminated by operation of law? Could you explain?",
|
||||||
|
"I've been poring over this and I'm still not clear. What exactly transpires if the Lease is terminated by operation of law? I'd appreciate your help in understanding this.",
|
||||||
|
"I'm really hitting a wall here. I've been trying to understand this, but it feels like I'm reading a foreign language. What's the end result if the Lease is terminated by operation of law? Any help in understanding this, particularly a detailed breakdown, would be greatly appreciated."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"If for any reason whatsoever this Lease shall be terminated\n in whole or in part by operation of law (other than termination under any\n bankruptcy laws as now or hereafter in effect), Lessee nonetheless agrees to\n pay to Lessor amounts equal to the Rent payments hereunder at the time such\n payments would have become due and payable in accordance with the terms hereof",
|
||||||
|
"had this Lease not been terminated so long as Lessee is able to use, possess\n and quietly enjoy the Aircraft, and so long as such payments are made and all\n other terms and conditions hereof are complied\n\n -16-\n\n\n<PAGE>\n\n\nwith by Lessor and Lessee, Lessor and Lessee will deem this Lease to remain in\nfull force and effect."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What happens if a discrepancy or malfunction is detected during the Acceptance Flight?",
|
||||||
|
"I'm having difficulty understanding this part. What exactly happens if a discrepancy or malfunction is detected during the Acceptance Flight? Can you provide a thorough explanation?",
|
||||||
|
"I'm stuck on this one. I'm struggling to comprehend what steps are taken if a discrepancy or malfunction is detected during the Acceptance Flight. Could you possibly explain this in detail?",
|
||||||
|
"I've been poring over this issue for a while, and it's not clicking. What steps are taken or what are the implications if a discrepancy or malfunction is detected during the Acceptance Flight? I'd appreciate a comprehensive explanation.",
|
||||||
|
"I'm really hitting a wall here. I've been trying to understand, but it's like I'm reading a foreign language. What's the end result or the next step if a discrepancy or malfunction is detected during the Acceptance Flight? Any help in understanding this, particularly a detailed breakdown, would be greatly appreciated."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"If, during the Acceptance Flight, no discrepancy or malfunction is\n detected with respect to the airworthiness or operational nature of\n the Aircraft by normal airline standards, then (i) the delivery of\n the Aircraft from Lessor to Lessee hereunder shall occur, the\n Aircraft shall be accepted by Lessee hereunder whilst the Aircraft\n is located at the Delivery Location, and Lessee shall execute and\n deliver the Lease Supplement, (ii) th",
|
||||||
|
"e Aircraft shall continue en\n route to a location selected by Lessee (the \"Ferry Location\"), under\n the operational control of Lessee from the time of such delivery and\n acceptance (the Acceptance Flight shall terminate at the time of\n such delivery and acceptance, and that portion of the flight from\n the Delivery Location to the Ferry Location is herein called the\n \"Ferry Flight\"), and (iii) Lessee shall bear the costs of the flight\n ",
|
||||||
|
" crew, fuel and other costs\n\n\n -12-\n\n\n<PAGE>\n\n\n relating to the Ferry Flight and shall reimburse the Beneficiaries\n therefor promptly following receipt of the Beneficiaries, invoice\n therefor, PROVIDED, HOWEVER, that, if any discrepancy or malfunction\n is detected with respect to the airworthiness during the Acceptance\n Flight, then, at Lessee's option after consultation with Lessor,\n either ",
|
||||||
|
"(A) the Aircraft shall be delivered to and accepted by Lessee\n at the Delivery Location and shall be ferried to the Ferry Location,\n as provided in clauses (i), (ii) and (iii) above, where Lessee shall\n remedy such discrepancy or malfunction at the cost (without mark up)\n of the Beneficiaries (provided that such subsequent repair or\n maintenance work shall not affect Lessee's acceptance of the\n Aircraft hereunder), or (B) the Aircraft s",
|
||||||
|
"hall return to the\n Beneficiaries' storage and maintenance facility where such\n discrepancy or malfunction shall be corrected at the Beneficiaries'\n expense, in which case the Delivery Date shall be postponed to such\n date as the Beneficiaries shall advise, subject to the second\n paragraph of Section 3(c) below. Any discrepancy or malfunction\n detected of an airworthiness nature shall be corrected by Lessee or\n the Beneficiari",
|
||||||
|
"es in accordance with clause (A) or (B) above, as\n applicable, at Beneficiaries, expense. If during the Acceptance\n Flight a discrepancy or malfunction is detected with respect to the\n operational nature of the Aircraft by normal airline standards but\n no discrepancy or malfunction is detected with respect to the\n airworthiness of the Aircraft, then the Aircraft shall be delivered\n to and accepted by Lessee at the Delivery Location as p",
|
||||||
|
"rovided in\n clause (A) above, and Lessee shall remedy such discrepancy or\n malfunction at the cost (without mark up) of the Beneficiaries.\n\n In anticipation of the occurrence of the Delivery Date, the\n Beneficiaries retained a flight crew to conduct the Acceptance\n Flight and will incur costs relating to such retention. In\n connection therewith, Lessee agrees to reimburse the Beneficiaries,\n promptly following Lessee's receipt",
|
||||||
|
" of an invoice therefor, for\n one-half of the costs incurred by the Beneficiaries in connection\n with retaining such flight crew for the Acceptance Flight, which\n costs relate to the period commencing on and including November 29,\n 1995 and ending on the day immediately preceding the Delivery Date."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What condition must the Aircraft meet before being delivered to the Lessee?",
|
||||||
|
"I'm having some trouble understanding this part. Could you please clarify what condition the Aircraft must meet before being delivered to the Lessee? I would appreciate a detailed explanation.",
|
||||||
|
"I'm stuck on this point. I'm finding it difficult to understand the specific condition the Aircraft must be in before being handed over to the Lessee. Could you possibly provide a comprehensive explanation?",
|
||||||
|
"I'm feeling a bit lost here. I'm having trouble understanding the exact condition or standard that the Aircraft must meet before being delivered to the Lessee. Could you provide a detailed walkthrough of the requirements?",
|
||||||
|
"I've hit a bit of a wall with this one. I've been trying my best to understand this, but it's proving to be quite complex. What is the precise condition that the Aircraft must meet before it can be delivered to the Lessee? Any help in understanding this, particularly a detailed explanation, would be of great help."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(d) Lessee's obligation to lease the Aircraft hereunder from\nLessor shall also be conditioned upon the Aircraft being delivered to Lessee in\nthe following condition:\n\n (1) The Aircraft shall be airworthy and in good\n operating condition\n\n\n -11-\n\n<PAGE>\n\n\n with all of the Aircraft equipment, components and systems;\n\n (2) The Aircraft shall be clean;\n\n (3) The Airc",
|
||||||
|
"raft shall meet the requirements for\n airworthiness certification by the FAA;\n\n (4) A borescope of the Engines and the inspection of\n the APU in accordance with the Manufacturer's or APU manufacturer's\n recommendation shall have been performed at the direction of Lessee\n but under the control and at the cost of the Beneficiaries, and any\n discrepancies discovered in connection therewith shall have been\n corrected;"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What rights does the Lessee waive under section 4(c)?",
|
||||||
|
"Can you tell me about the rights that the Lessee gives up under section 4(c)?",
|
||||||
|
"I'm having some difficulty here, could you please explain to me what rights the Lessee is forfeiting under the terms of section 4(c)?",
|
||||||
|
"I'm really struggling to understand this part, it's quite complex. Could you clarify what rights the Lessee is explicitly waiving as per section 4(c) in this agreement?",
|
||||||
|
"I'm pulling my hair out! What does this even mean? Can you assist me in deciphering what rights the Lessee is giving up or putting aside according to section 4(c)? I'm finding this part particularly challenging to grasp."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(c) PROHIBITION AGAINST SETOFF, COUNTERCLAIM, ETC. This Lease\n is a net lease. Subject to Section 20(f), Lessee's obligation to pay all Rent\n hereunder shall be absolute and unconditional and shall not be affected or\n reduced by any circumstance, including, without limitation, (i) any setoff,\n counterclaim, recoupment, defense or other right which Lessee may have against\n Lessor, any Beneficiary, the Manufacturer, the Engine Manufacturer, any seller\n of or person providing services with respect ",
|
||||||
|
"to the Aircraft or any other\n Person, for any reason whatsoever; (ii) any defect in the title, airworthiness\n or eligibility for registration under applicable Law, or any condition, design,\n operation or fitness for use of, or any damage to or loss or destruction of,\n the Aircraft, or any interruption or cessation in the use or possession thereof\n by Lessee for any reason whatsoever, whether arising out of or related to an\n act or omission of Lessee, or any other Person; (iii) any Liens with res",
|
||||||
|
"pect to\n the Aircraft; (iv) the invalidity or unenforceability or lack of due\n authorization or other infirmity of this Lease or any absence of right, power\n or authority of Lessor or Lessee to enter into this Lease; (v) any insolvency,\n bankruptcy, reorganization or similar proceedings by or against Lessor or\n Lessee; (vi) any other circumstance or happening of any nature whatsoever,\n similar to any of the foregoing; or (vii) any Taxes (other Taxes to which\n Lessee's indemnity does not extend p",
|
||||||
|
"ursuant to the provisions of Section 10);\n it being the express intention of Lessor and Lessee that all Rent payable\n hereunder shall be payable in all events, unless the obligation to pay the same\n shall be terminated pursuant to the express provisions of this Lease. Nothing\n in this paragraph (c) shall constitute a waiver by Lessee of any right or claim\n that Lessee may separately assert against Lessor or any Beneficiary.\n\n Lessee hereby waives, to the extent permitted by app",
|
||||||
|
"licable\n Law, any and all rights which it may now have or which at any time hereafter\n may be conferred upon it, by Law or otherwise, to terminate this Lease or any\n obligation imposed upon Lessee hereunder or in relation hereto.\n\n If for any reason whatsoever this Lease shall be terminated\n in whole or in part by operation of law (other than termination under any\n bankruptcy laws as now or hereafter in effect), Lessee nonetheless agrees to\n pay to Lessor amounts equal to the R",
|
||||||
|
"ent payments hereunder at the time such\n payments would have become due and payable in accordance with the terms hereof\n had this Lease not been terminated so long as Lessee is able to use, possess\n and quietly enjoy the Aircraft, and so long as such payments are made and all\n other terms and conditions hereof are complied\n\n -16-\n\n\n<PAGE>\n\n\nwith by Lessor and Lessee, Lessor and Lessee will deem this Lease to remain in\nfull force and effect."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"Can the Lessor and Beneficiaries conduct inspections without notice under certain conditions? What are those conditions?",
|
||||||
|
"Is it possible for the Lessor and Beneficiaries to carry out inspections without prior notice, given specific circumstances? If so, could you explain what these circumstances might be?",
|
||||||
|
"I'm finding myself a bit confused here. Can the Lessor and Beneficiaries, under any special conditions, perform inspections without providing any advance notice? If yes, what exactly are these special conditions?",
|
||||||
|
"I'm at my wit's end! Can the Lessor and Beneficiaries actually go ahead and conduct inspections without giving a heads up, but only when certain conditions are met? What exactly are these conditions that would allow for such actions?",
|
||||||
|
"I'm really trying to get my head around this, but I could use some assistance. Is it within the Lessor and Beneficiaries' rights to initiate inspections without any forewarning, but only under certain predefined circumstances? What are these circumstances exactly?"
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"Lessee shall permit Lessor, each Beneficiary and their\n respective designees on at least seven (7) days' prior written notice to visit\n and inspect the Aircraft, its condition, use and operation and the records\n maintained in connection therewith during normal business hours; PROVIDED,\n HOWEVER, that this shall not unreasonably interfere with Lessee's quiet use and\n enjoyment of the Aircraft PROVIDED FURTHER, HOWEVER, that Lessor or the\n Beneficiaries may conduct such visit and inspection at any",
|
||||||
|
" time and with or\n without notice if an Event of Default has occurred and is continuing."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What aircraft-related information will the Lessee provide on a monthly and annual basis?",
|
||||||
|
"Could you let me know what type of aircraft-related details the Lessee is obligated to provide on a monthly and annual basis?",
|
||||||
|
"I'm finding it a bit tricky to understand this part - could you help me clarify what specific aircraft-related data or information is the Lessee expected to report on both a monthly and an annual basis?",
|
||||||
|
"I'm really trying to grapple with this agreement. Could you assist me in figuring out the exact nature of the aircraft-related information that the Lessee is required to furnish on a consistent monthly and annual basis?",
|
||||||
|
"I'm genuinely struggling here! What does it mean exactly? What is the exact nature and extent of the aircraft-related data or information that the Lessee has to provide routinely, both on a monthly and an annual basis? I'm having a hard time understanding the specificities of this provision."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(v) Lessee will use its reasonable efforts to provide the\n Beneficiaries on or before the fifth day of each calendar month\n commencing with the next calendar month of the Delivery Date, and shall\n in any event provide to the Beneficiaries upon request of a Beneficiary,\n with a properly completed Monthly Aircraft Utilization and Status Report\n in the Form of Exhibit J hereto for the preceding calendar month\n operation of the aircraft;\n\n (vi) Lessee ",
|
||||||
|
"will use its reasonable efforts to provide the\n Beneficiaries, on or before the 15th day of January of each year\n (commencing with January 1996), and shall in any event provide Lessor and\n the Beneficiaries upon request of a Beneficiary in English, the\n information and documentation for the preceding calendar year as listed\n in Exhibit K hereto; PROVIDED, HOWEVER, that if (i) a Default or an Event\n of Default shall have occurred and be continuing or (ii) Lessee's\n ",
|
||||||
|
" financial condition changes adversely from its financial condition at the\n time of the Delivery Date, then, upon notice and a request from Lessor or\n a Beneficiary, Lessee shall provide such information on a quarterly basis\n on the 15th day of each January, April, July and October, commencing with\n the first of such dates to follow the date of such notice."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"Under what conditions can Lessee consolidate, merge, or transfer assets without Lessor's prior written consent according to the text?",
|
||||||
|
"Could you explain under which specific circumstances the Lessee is allowed to consolidate, merge, or transfer assets without needing the Lessor's prior written approval, as stated in the text?",
|
||||||
|
"I'm having a bit of trouble with this section, could you clarify the exact conditions under which the Lessee is permitted to consolidate, merge, or transfer assets without first obtaining the Lessor's written consent, as outlined in the text?",
|
||||||
|
"I'm really wracking my brain here trying to understand the terms. Can you help me decipher under which exact circumstances or conditions the Lessee can execute consolidation, merging, or asset transfer without needing prior written consent from the Lessor, as the text suggests?",
|
||||||
|
"I'm pulling my hair out here! What on earth does it mean? What are the specific conditions or circumstances under which the Lessee can consolidate, merge, or transfer assets without having to acquire the Lessor's prior written consent, as it's described in the text? This is really a tough one to crack!"
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(iv) CONSOLIDATION, MERGER, ETC. Without the prior written\n consent of Lessor and each Beneficiary, Lessee shall not consolidate with,\n merge with or merge into any other Person or convey, transfer or lease\n substantially all of its assets as an entirety to any other Person unless, upon\n and after giving effect to such transaction, (A) the surviving entity has at\n least the same net worth and gross assets as the Lessee immediately prior to\n such transaction, such surviving entity is Certified Ai",
|
||||||
|
"r Carrier and a \"citizen\n of the United States\" as defined in Section 101(16) of the Federal Aviation\n Act, (C) Lessor shall continue to be entitled to the benefits of Section 1110\n of the United States Bankruptcy Code, as in effect from time to time, and (D)\n each of the Operative Documents shall continue in full force and effect and\n shall constitute the legally binding and enforceable obligation of such\n surviving entity."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"Who is responsible for replacing any parts on the Aircraft that become worn out, damaged, etc?",
|
||||||
|
"Could you please specify who holds the responsibility for replacing any parts of the Aircraft that may become worn out, damaged, or similarly affected?",
|
||||||
|
"I'm having a little trouble understanding this part. Who exactly is tasked with the responsibility of replacing any components of the Aircraft that may get worn out, damaged, or otherwise impaired?",
|
||||||
|
"I'm really scratching my head trying to figure out who precisely is designated to handle the replacement of any Aircraft parts that become worn out, damaged, or in similar conditions? This aspect seems a bit complicated.",
|
||||||
|
"I'm on the verge of losing it! Who in the world is charged with the duty of replacing any parts of the Aircraft that get worn out, damaged, or anything like that? I'm really finding it tough to get my head around this point."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(a) REPLACEMENT OF PARTS. Lessee, at its own cost and\n expense, will promptly replace all Parts which may from time to time become\n worn out, lost, stolen, destroyed, seized, confiscated, damaged beyond repair\n or permanently rendered unfit for use for any reason whatsoever. In addition,\n in the ordinary course of maintenance, service, repair, overhaul or testing,\n Lessee may at its own cost and expense cause to be removed any Parts, whether\n or not worn out, destroyed, damaged beyond repair or ",
|
||||||
|
"permanently rendered unfit\n for use, provided that Lessee shall replace at its own cost and expense such\n Parts as promptly as practicable. All replacement Parts shall be free and clear\n of all Liens, other than Liens permitted by Section 14 hereof, shall be in at\n least the same modification status and service bulletin accomplishment status,\n shall be fully interchangeable as to form, fit and function, shall have been\n overhauled or repaired and inspected by an agency acceptable to the FAA and\n",
|
||||||
|
" shall be in as good an operating condition as, and have a utility at least\n equal to and a value and remaining warranty reasonably approximating, the Parts\n replaced (assuming such replaced Parts were in the condition and repair in\n which they were required to be maintained by the terms hereof) and all\n historical records since new or last overhaul relating to such Parts (and all\n historical records since manufacture with respect to Engines, Landing Gears,\n the APU and all life limited parts in",
|
||||||
|
"stalled on any Engine, Landing Gear or\n APU) shall be maintained by Lessee."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"Who bears responsibility if alterations, modifications or additions to the Aircraft result in any loss of revenue or grounding?",
|
||||||
|
"Can you clarify who would take responsibility if any alterations, modifications, or additions made to the Aircraft cause any loss of revenue or result in grounding?",
|
||||||
|
"I'm having some difficulty here. Could you please specify who should shoulder the responsibility if any changes, modifications or additions to the Aircraft lead to any form of revenue loss or cause the aircraft to be grounded?",
|
||||||
|
"I'm really trying to understand this, but it's complex. Could you elucidate who is to bear the brunt if alterations, modifications, or additions to the Aircraft culminate in a loss of revenue or result in the grounding of the aircraft?",
|
||||||
|
"I'm pulling my hair out over this! Who on earth would bear the responsibility if any alterations, modifications, or additions that are made to the Aircraft end up causing some form of revenue loss or force the aircraft to be grounded? I'm finding this part particularly challenging to comprehend."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"In no event shall Lessor bear any liability or cost for any\n alteration, modification or addition to, or for any grounding or suspension of\n certification of, the Aircraft, or for any loss of revenue arising therefrom.\n Lessee shall make no material alterations, modifications or additions to the\n Aircraft (such as removal of seats, galleys, lavatories, major avionics\n equipment or the like) that would affect the marketability of the Aircraft\n without Lessor's and each Beneficiary's prior written",
|
||||||
|
" consent. if Lessor and\n each Beneficiary grant such consent, title to such removed Parts shall remain\n with Lessor and Lessor and the Beneficiaries may request Lessee to reinstall\n such Parts prior to termination of this Lease. If Lessor or Beneficiaries\n request Lessee to reinstall such Parts, title to the Parts removed shall vest\n in Lessee. All costs associated with such removal and reinstallation shall be\n borne by Lessee."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"Who is the assignor and who is the assignee?",
|
||||||
|
"Can you help me identify who the assignor is and who takes the role of the assignee?",
|
||||||
|
"I'm having some trouble figuring this out. Could you clarify for me who exactly is the assignor and who is designated as the assignee in this context?",
|
||||||
|
"I'm really wrestling with this, it seems a bit tricky. Could you help me to understand who exactly is acting as the assignor and who is being recognized as the assignee in this particular scenario?",
|
||||||
|
"I'm at my wits' end here! What does it mean? Who exactly is playing the role of the assignor and who is being referred to as the assignee in this situation? This is proving to be quite a tough nut to crack!"
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"ASSIGNOR: ALOHA AIRLINES, INC.,\n A HAWAII CORPORATION",
|
||||||
|
"ASSIGNEE: ALOHA AIRLINES, INC., A\n DELAWARE CORPORATION"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What does it mean when the Assignee is referred to as a 'Certified Air Carrier'?",
|
||||||
|
"Could you clarify what is implied when the Assignee is labeled as a 'Certified Air Carrier'?",
|
||||||
|
"I'm having a hard time understanding this. Can you explain what the term 'Certified Air Carrier' means when it is applied to the Assignee in this context?",
|
||||||
|
"I'm really struggling here to understand this terminology. Could you assist in explaining what it means when the Assignee is characterized as a 'Certified Air Carrier' in this particular situation?",
|
||||||
|
"I'm almost at the end of my tether! What does this even mean? Can you help me grasp the meaning when the Assignee is designated as a 'Certified Air Carrier'? This particular terminology is really throwing me for a loop!"
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(e) Assignee is a Certified Air Carrier and holds all\nlicenses, certificates, permits and franchises from the appropriate agencies of\nthe United States of America and/or all other governmental authorities having\njurisdiction which are necessary to authorize the Assignee to engage in air\ntransport and to carry on its business as presently conducted and to be\nconducted with the Aircraft."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"Why is it important for the Assignee to be a 'citizen of the United States' as defined in 40102(a)(15) of Title 49 of the United States Code?",
|
||||||
|
"Could you help me understand why it's significant for the Assignee to be defined as a 'citizen of the United States' as per 40102(a)(15) of Title 49 of the United States Code?",
|
||||||
|
"I'm finding it a bit challenging to comprehend this part. Why is it crucial for the Assignee to be designated as a 'citizen of the United States', as defined under 40102(a)(15) of Title 49 of the United States Code?",
|
||||||
|
"I'm really trying to unravel this, but it seems quite complex. Could you elucidate why it's so imperative for the Assignee to be identified as a 'citizen of the United States', as per the definition provided in 40102(a)(15) of Title 49 of the United States Code?",
|
||||||
|
"I'm pulling my hair out over this! What does it even mean? Can you help me decipher why it's so essential for the Assignee to be considered a 'citizen of the United States', as stipulated in 40102(a)(15) of Title 49 of the United States Code? I'm finding this legal terminology particularly difficult to grasp."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(f) Assignee is a \"citizen of the United States\" as defined\nin 40102(a)(15) of Title 49 of the United States Code."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"How many days do I have to pay?",
|
||||||
|
"Could you specify the number of days I'm given to complete the payment?",
|
||||||
|
"I'm a bit unsure about the payment deadline. Could you clarify how many days exactly I have to make the payment?",
|
||||||
|
"I'm really trying to understand the payment terms. Could you help me ascertain the exact number of days that I am allotted to finalize the payment?",
|
||||||
|
"I'm so confused! What does this mean exactly? Can you help me comprehend the specific amount of time, in days, that I have been provided with to conclude the payment? I'm finding this financial term quite challenging to understand."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(e) TIMING OF PAYMENT. Any amount due and payable to the\n relevant Indemnitee pursuant to this Section 10 will be paid within 10 days\n after receipt of a written demand therefor from such Indemnitee accompanied by\n a written statement describing in reasonable detail the basis for such\n indemnity and the computation of the amount so payable; PROVIDED, HOWEVER, that\n such amount need not be paid by Lessee prior to the later of (i) five days\n prior to the date the applicable Tax is payable to the a",
|
||||||
|
"ppropriate Governmental\n Entity or taxing authority or (ii) in the case of amounts which are being\n contested by Lessee in good faith or by Lessor pursuant to Section 10(f), the\n date such contest is finally resolved. If requested in writing by Lessee, and\n at Lessee's sole cost and expense, any calculations by an Indemnitee of any\n amount due and payable\n\n -44-\n\n\n<PAGE>\n\n\n hereunder shall be subject to review and verification by a firm of independent\n certif",
|
||||||
|
"ied public accounts of internationally recognized stature selected by\n such Indemnitee and reasonably acceptable to Lessee (such approval not to be\n unreasonably withheld or delayed). Such Indemnitee shall make available to such\n accounting firm such information as shall be necessary for purposes of such\n review and verification (but such information shall be held by such accounting\n firm in strictest confidence and shall not in any event be disclosed or made\n available to Lessee). If the result",
|
||||||
|
" of such review is that Lessee was liable\n for a smaller amount, the excess payment shall be returned by such Indemnitee\n forthwith."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What currency should I pay in?",
|
||||||
|
"Could you please clarify in which currency I am expected to make the payment?",
|
||||||
|
"I'm a bit puzzled here, could you specify the exact currency I should use for the payment?",
|
||||||
|
"I'm really scratching my head trying to figure this out. Could you help me understand in which specific currency I am supposed to settle the payment?",
|
||||||
|
"I'm quite frustrated at this point! What exactly does it mean? Can you elucidate in which particular currency I'm required to execute the payment? I'm finding this point a bit difficult to decipher."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(i) PAYMENTS IN U.S. DOLLARS. All amounts to be paid hereunder to\nLessor or Lessee shall be paid in Dollars, in immediately available funds.\nLessee acknowledges that the specification of Dollars in this transaction is\nof the essence and that Dollars shall be the currency of account in any and\nall events. The obligations of Lessee or Lessor hereunder, to Lessor or\nLessee, respectively, shall not be discharged by an amount paid in another\ncurrency, whether pursuant to a judgment or otherwise, to t",
|
||||||
|
"he extent that the\n amount so paid on prompt conversion to Dollars under normal banking\nprocedures does not yield the amount of Dollars owing to Lessor."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What is the US registration number of the aircraft?",
|
||||||
|
"Could you please tell me the US registration number assigned to the aircraft?",
|
||||||
|
"I'm having some difficulty here. Could you specify the exact US registration number of the aircraft?",
|
||||||
|
"I'm really struggling to get this part. Could you assist me in figuring out what the specific US registration number for the aircraft is?",
|
||||||
|
"I'm pulling my hair out over this! What does it mean exactly? Can you help me decipher the precise US registration number that's associated with the aircraft? I'm finding it a bit challenging to understand."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"U.S.\n MODEL AND REGISTRATION MANUFACTURER'S\nITEM MANUFACTURER CONFIGURATION NUMBER SERIAL NUMBER\n-------------------------------------------------------------------------------------------------------------------\n<S> <C> <C> <C> <C>\n\nAircraft The Boeing Compa",
|
||||||
|
"ny 737-25A N685MA*"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What is the maximum duration that a safety or maintenance requirement can remain unaddressed on the aircraft, particularly in terms of airworthiness directives and mandatory orders?",
|
||||||
|
"How long can a safety or maintenance requirement, especially airworthiness directives and mandatory orders, be left unresolved?",
|
||||||
|
"How long can an airworthiness directive or mandatory order remain outstanding on the aircraft according to standard lease agreements?",
|
||||||
|
"What's the longest period that a safety or maintenance requirement, such as airworthiness directives and mandatory orders, can remain unmet on a leased aircraft?",
|
||||||
|
"What is the maximum allowable timeframe for a safety or maintenance requirement to be left unattended to on an aircraft, specifically referring to airworthiness directives and mandatory orders?"
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"(i) have had all repetitive airworthiness directives and mandatory\n orders and regulations in at least half-life or better condition;"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What are the payment locations?",
|
||||||
|
"Could you specify where exactly I should be making the payments? Are there particular bank accounts or locations?",
|
||||||
|
"I'm a bit puzzled here. Could you clarify the exact payment locations or bank accounts where I'm supposed to deposit the payments?",
|
||||||
|
"I'm really struggling to grasp this. Could you assist me in understanding the specific payment locations or bank accounts where I'm expected to send the payments?",
|
||||||
|
"I'm at my wit's end here! What does this mean? Can you help me figure out the precise locations or bank accounts where I'm supposed to carry out the payments? I'm finding this financial aspect particularly hard to comprehend."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"Payment Locations: For ITOCHU AirLease (Europe) Limited:\n\n Account Name: Citibank New York (ABA No.\n 021000089) for the account of Citibank\n Dublin (account no. 10994598) in favor of\n ITOCHU AirLease (Europe) Limited (account\n no. 1-00-6793-017)\n\n For Marubeni Airleasing (",
|
||||||
|
"U.K.) Limited:\n\n Harris Bank International Corporation\n ABA #026-007-760\n for the credit of\n The Mitsubishi Trust & Banking Corporation\n London Branch A/C#16011100\n UID No.107280\n for further credit to\n Marubeni Airleasi",
|
||||||
|
"ng (UK) Ltd.\n Account #020-404391\n\n With respect to payments by\n Lessee of Basic Rent hereunder, 62.682% of\n such amounts shall be paid to ITOCHU\n AirLease (Europe) Limited (as above\n provided) and 37.318% of such amounts\n shall be paid to Marubeni Airlea",
|
||||||
|
"sing (U.K.)\n Limited (as above provided)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"question_variants": [
|
||||||
|
"What is the revision number of the aircraft?",
|
||||||
|
"Could you please clarify what the revision number of the aircraft is?",
|
||||||
|
"I'm finding this a bit hard to grasp. Could you specify the exact revision number associated with the aircraft?",
|
||||||
|
"I'm really trying to understand this, but it's proving difficult. Could you assist me in determining the specific revision number that is attributed to the aircraft?",
|
||||||
|
"Agh! What does it even mean? Can you help me decipher the exact revision number that is tied to the aircraft? I'm finding this technical detail quite challenging to comprehend."
|
||||||
|
],
|
||||||
|
"criteria": [
|
||||||
|
"Detail Specification (737-25A-Rev. B)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
138
extensions/superboogav2/chat_handler.py
Normal file
138
extensions/superboogav2/chat_handler.py
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
"""
|
||||||
|
This module is responsible for modifying the chat prompt and history.
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
from modules import chat
|
||||||
|
from modules.text_generation import get_encoded_length
|
||||||
|
from modules.logging_colors import logger
|
||||||
|
from extensions.superboogav2.utils import create_context_text, create_metadata_source
|
||||||
|
|
||||||
|
from .data_processor import process_and_add_to_collector
|
||||||
|
from .chromadb import ChromaCollector
|
||||||
|
|
||||||
|
|
||||||
|
# Metadata tag attached to every chunk that is auto-inserted from the chat
# history, so previously inserted chat chunks can be found and deleted later.
CHAT_METADATA = create_metadata_source('automatic-chat-insert')

# UI interface mode names; both are treated as "instruct" by _is_instruct_mode.
INSTRUCT_MODE = 'instruct'
CHAT_INSTRUCT_MODE = 'chat-instruct'
|
||||||
|
|
||||||
|
|
||||||
|
def _is_instruct_mode(state: dict):
    """Return True when the UI state is in 'instruct' or 'chat-instruct' mode."""
    return state.get('mode') in (INSTRUCT_MODE, CHAT_INSTRUCT_MODE)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_tag_if_necessary(user_input: str):
    """Strip a leading or trailing '!c' marker from the input in manual mode.

    In automatic mode the input is returned untouched.
    """
    if parameters.get_is_manual():
        return re.sub(r'^\s*!c\s*|\s*!c\s*$', '', user_input)
    return user_input
|
||||||
|
|
||||||
|
|
||||||
|
def _should_query(input: str):
    """Decide whether the collector should be queried for this input."""
    if not parameters.get_is_manual():
        # Automatic mode: always query.
        return True

    # Manual mode: only query when the '!c' tag is present on some line.
    return bool(re.search(r'^\s*!c|!c\s*$', input, re.MULTILINE))
|
||||||
|
|
||||||
|
|
||||||
|
def _format_single_exchange(name, text):
|
||||||
|
if re.search(r':\s*$', name):
|
||||||
|
return '{} {}\n'.format(name, text)
|
||||||
|
else:
|
||||||
|
return '{}: {}\n'.format(name, text)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_names(state: dict):
    """Return (user_name, bot_name) for the current mode, with fallbacks.

    Instruct-style modes use the instruct name fields; otherwise the plain
    chat names are used. Empty names fall back to 'User' / 'Assistant'.
    """
    if _is_instruct_mode(state):
        user_name, bot_name = state['name1_instruct'], state['name2_instruct']
    else:
        user_name, bot_name = state['name1'], state['name2']

    return (user_name or 'User'), (bot_name or 'Assistant')
|
||||||
|
|
||||||
|
|
||||||
|
def _concatinate_history(history: dict, state: dict):
    """Flatten the internal chat history into one newline-separated string."""
    user_name, bot_name = _get_names(state)

    # Grab the internal history.
    internal_history = history['internal']
    assert isinstance(internal_history, list)

    lines = []
    for exchange in internal_history:
        assert isinstance(exchange, list)

        if len(exchange) >= 1:
            lines.append(_format_single_exchange(user_name, exchange[0]))
        if len(exchange) >= 2:
            lines.append(_format_single_exchange(bot_name, exchange[1]))

    # Every formatted line ends with '\n'; drop the trailing one.
    return ''.join(lines)[:-1]
|
||||||
|
|
||||||
|
|
||||||
|
def _hijack_last(context_text: str, history: dict, max_len: int, state: dict):
    """Overwrite one message in ``history['internal']`` with *context_text*.

    Walks the history from newest to oldest, remembering the last position
    at which the context would still fit within *max_len* tokens together
    with the newer history, then replaces that message in place. If the
    context fits nowhere, the history is left untouched and a warning is
    logged.
    """
    num_context_tokens = get_encoded_length(context_text)

    # Reversed so that names[j] matches the speaker when each exchange is
    # walked backwards below (bot message first, then user message).
    names = _get_names(state)[::-1]

    history_tokens = 0
    replace_position = None
    for i, messages in enumerate(reversed(history['internal'])):
        for j, message in enumerate(reversed(messages)):
            num_message_tokens = get_encoded_length(_format_single_exchange(names[j], message))

            # TODO: This is an extremely naive solution. A more robust implementation must be made.
            if history_tokens + num_context_tokens <= max_len:
                # This message can be replaced
                replace_position = (i, j)

            history_tokens += num_message_tokens

    if replace_position is None:
        # Logger.warn() is deprecated in the stdlib logging API; warning() is
        # the supported spelling.
        logger.warning("The provided context_text is too long to replace any message in the history.")
    else:
        # Replace the message at replace_position with context_text. The
        # negative indices convert the reversed (i, j) back to forward order.
        i, j = replace_position
        history['internal'][-i-1][-j-1] = context_text
|
||||||
|
|
||||||
|
|
||||||
|
def custom_generate_chat_prompt_internal(user_input: str, state: dict, collector: ChromaCollector, **kwargs):
    """Inject relevant DB context into the chat prompt before generation."""
    if parameters.get_add_chat_to_data():
        # Keep the collector's auto-inserted chat history fresh.
        history_as_text = _concatinate_history(kwargs['history'], state)

        if history_as_text:
            # Delete all documents that were auto-inserted, then re-insert
            # the processed history.
            collector.delete(ids_to_delete=None, where=CHAT_METADATA)
            process_and_add_to_collector(history_as_text, collector, False, CHAT_METADATA)

    if _should_query(user_input):
        user_input = _remove_tag_if_necessary(user_input)
        results = collector.get_sorted_by_dist(user_input, n_results=parameters.get_chunk_count(), max_token_count=int(parameters.get_max_token_count()))

        # Inject the retrieved context according to the configured strategy:
        # either modify the last message directly, or hijack an older one.
        strategy = parameters.get_injection_strategy()
        if strategy == parameters.APPEND_TO_LAST:
            user_input = user_input + create_context_text(results)
        elif strategy == parameters.PREPEND_TO_LAST:
            user_input = create_context_text(results) + user_input
        elif strategy == parameters.HIJACK_LAST_IN_CONTEXT:
            _hijack_last(create_context_text(results), kwargs['history'], state['truncation_length'], state)

    return chat.generate_chat_prompt(user_input, state, **kwargs)
|
376
extensions/superboogav2/chromadb.py
Normal file
376
extensions/superboogav2/chromadb.py
Normal file
@ -0,0 +1,376 @@
|
|||||||
|
import threading
|
||||||
|
import chromadb
|
||||||
|
import posthog
|
||||||
|
import torch
|
||||||
|
import math
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
from chromadb.config import Settings
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
from modules.logging_colors import logger
|
||||||
|
from modules.text_generation import encode, decode
|
||||||
|
|
||||||
|
# NOTE(review): posthog appears to be the telemetry client pulled in by
# chromadb — stubbing capture() prevents any analytics calls from leaving
# the machine. Confirm against the chromadb version in use.
logger.debug('Intercepting all calls to posthog.')
posthog.capture = lambda *args, **kwargs: None
|
||||||
|
|
||||||
|
|
||||||
|
class Collecter():
    """Abstract interface for a text collection back-end.

    Subclasses (e.g. ChromaCollector) store chunked texts together with
    their contextualized forms and starting offsets, and retrieve them by
    similarity to search strings.
    """

    def __init__(self):
        pass

    def add(self, texts: list[str], texts_with_context: list[str], starting_indices: list[int]):
        """Store texts, their contextualized forms, and their source offsets."""
        pass

    def get(self, search_strings: list[str], n_results: int) -> list[str]:
        """Return up to n_results stored texts matching the search strings."""
        pass

    def clear(self):
        """Remove everything from the collection."""
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class Embedder():
    """Abstract interface for turning text into embedding vectors."""

    def __init__(self):
        pass

    def embed(self, text: str) -> list[torch.Tensor]:
        """Return the embedding(s) for *text*; implemented by subclasses."""
        pass
|
||||||
|
|
||||||
|
class Info:
    """One retrieved excerpt: its source offset, text, distance and id."""

    def __init__(self, start_index, text_with_context, distance, id):
        self.text_with_context = text_with_context
        self.start_index = start_index
        self.distance = distance
        self.id = id

    def calculate_distance(self, other_info):
        """Combine this excerpt's distance with *other_info*'s.

        The combination strategy comes from the user parameters; the
        minimum of the two distances is the default for any unrecognized
        strategy.
        """
        # Read the strategy once instead of once per branch.
        strategy = parameters.get_new_dist_strategy()
        if strategy == parameters.DIST_MIN_STRATEGY:
            # Min
            return min(self.distance, other_info.distance)
        elif strategy == parameters.DIST_HARMONIC_STRATEGY:
            # Harmonic mean. Guard the degenerate case of two exact matches
            # (both distances 0), which would otherwise divide by zero.
            if self.distance + other_info.distance == 0:
                return 0
            return 2 * (self.distance * other_info.distance) / (self.distance + other_info.distance)
        elif strategy == parameters.DIST_GEOMETRIC_STRATEGY:
            # Geometric mean
            return (self.distance * other_info.distance) ** 0.5
        elif strategy == parameters.DIST_ARITHMETIC_STRATEGY:
            # Arithmetic mean
            return (self.distance + other_info.distance) / 2
        else:  # Min is default
            return min(self.distance, other_info.distance)

    def merge_with(self, other_info):
        """Merge overlapping or adjacent excerpts into one Info, or return None.

        The merged text keeps the earlier excerpt and appends the
        non-overlapping suffix of the later one; the merged distance is
        produced by calculate_distance().
        """
        s1 = self.text_with_context
        s2 = other_info.text_with_context
        s1_start = self.start_index
        s2_start = other_info.start_index

        new_dist = self.calculate_distance(other_info)

        if self.should_merge(s1, s2, s1_start, s2_start):
            if s1_start <= s2_start:
                if s1_start + len(s1) >= s2_start + len(s2):  # if s1 completely covers s2
                    return Info(s1_start, s1, new_dist, self.id)
                else:
                    overlap = max(0, s1_start + len(s1) - s2_start)
                    return Info(s1_start, s1 + s2[overlap:], new_dist, self.id)
            else:
                if s2_start + len(s2) >= s1_start + len(s1):  # if s2 completely covers s1
                    return Info(s2_start, s2, new_dist, other_info.id)
                else:
                    overlap = max(0, s2_start + len(s2) - s1_start)
                    return Info(s2_start, s2 + s1[overlap:], new_dist, other_info.id)

        return None

    @staticmethod
    def should_merge(s1, s2, s1_start, s2_start):
        """True when the two excerpts overlap or touch in the source text."""
        # Check if s1 and s2 are adjacent or overlapping
        s1_end = s1_start + len(s1)
        s2_end = s2_start + len(s2)

        return not (s1_end < s2_start or s2_end < s1_start)
|
||||||
|
|
||||||
|
class ChromaCollector(Collecter):
|
||||||
|
def __init__(self, embedder: Embedder):
    """Create an in-process chroma collection backed by *embedder*."""
    super().__init__()
    # In-process client with telemetry disabled (posthog is also stubbed
    # out at module level).
    self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
    self.embedder = embedder
    self.collection = self.chroma_client.create_collection(name="context", embedding_function=self.embedder.embed)
    # Bookkeeping kept alongside the chroma collection:
    self.ids = []  # every id handed out, in insertion order
    self.id_to_info = {}  # id -> {'text_with_context', 'start_index'}
    self.embeddings_cache = {}  # text -> embedding, to avoid re-embedding
    self.lock = threading.Lock() # Locking so the server doesn't break.
|
||||||
|
|
||||||
|
def add(self, texts: list[str], texts_with_context: list[str], starting_indices: list[int], metadatas: list[dict] = None):
    """Add texts (with context and source offsets) to the chroma collection.

    Embeddings are taken from the in-memory cache when available; cache
    misses are embedded in a single batch. All bookkeeping happens under
    the instance lock so concurrent server requests don't interleave.
    """
    with self.lock:
        assert metadatas is None or len(metadatas) == len(texts), "metadatas must be None or have the same length as texts"

        if len(texts) == 0:
            return

        new_ids = self._get_new_ids(len(texts))

        # Partition into cache hits (embedding already known) and misses.
        (existing_texts, existing_embeddings, existing_ids, existing_metas), \
        (non_existing_texts, non_existing_ids, non_existing_metas) = self._split_texts_by_cache_hit(texts, new_ids, metadatas)

        # If there are any already existing texts, add them all at once.
        if existing_texts:
            logger.info(f'Adding {len(existing_embeddings)} cached embeddings.')
            args = {'embeddings': existing_embeddings, 'documents': existing_texts, 'ids': existing_ids}
            if metadatas is not None:
                args['metadatas'] = existing_metas
            self.collection.add(**args)

        # If there are any non-existing texts, compute their embeddings all at once. Each call to embed has significant overhead.
        if non_existing_texts:
            non_existing_embeddings = self.embedder.embed(non_existing_texts).tolist()
            # Populate the cache so future adds of the same text are free.
            for text, embedding in zip(non_existing_texts, non_existing_embeddings):
                self.embeddings_cache[text] = embedding

            logger.info(f'Adding {len(non_existing_embeddings)} new embeddings.')
            args = {'embeddings': non_existing_embeddings, 'documents': non_existing_texts, 'ids': non_existing_ids}
            if metadatas is not None:
                args['metadatas'] = non_existing_metas
            self.collection.add(**args)

        # Create a dictionary that maps each ID to its context and starting index
        new_info = {
            id_: {'text_with_context': context, 'start_index': start_index}
            for id_, context, start_index in zip(new_ids, texts_with_context, starting_indices)
        }

        self.id_to_info.update(new_info)
        self.ids.extend(new_ids)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_texts_by_cache_hit(self, texts: list[str], new_ids: list[str], metadatas: list[dict]):
|
||||||
|
existing_texts, non_existing_texts = [], []
|
||||||
|
existing_embeddings = []
|
||||||
|
existing_ids, non_existing_ids = [], []
|
||||||
|
existing_metas, non_existing_metas = [], []
|
||||||
|
|
||||||
|
for i, text in enumerate(texts):
|
||||||
|
id_ = new_ids[i]
|
||||||
|
metadata = metadatas[i] if metadatas is not None else None
|
||||||
|
embedding = self.embeddings_cache.get(text)
|
||||||
|
if embedding:
|
||||||
|
existing_texts.append(text)
|
||||||
|
existing_embeddings.append(embedding)
|
||||||
|
existing_ids.append(id_)
|
||||||
|
existing_metas.append(metadata)
|
||||||
|
else:
|
||||||
|
non_existing_texts.append(text)
|
||||||
|
non_existing_ids.append(id_)
|
||||||
|
non_existing_metas.append(metadata)
|
||||||
|
|
||||||
|
return (existing_texts, existing_embeddings, existing_ids, existing_metas), \
|
||||||
|
(non_existing_texts, non_existing_ids, non_existing_metas)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_new_ids(self, num_new_ids: int):
|
||||||
|
if self.ids:
|
||||||
|
max_existing_id = max(int(id_) for id_ in self.ids)
|
||||||
|
else:
|
||||||
|
max_existing_id = -1
|
||||||
|
|
||||||
|
return [str(i + max_existing_id + 1) for i in range(num_new_ids)]
|
||||||
|
|
||||||
|
|
||||||
|
def _find_min_max_start_index(self):
|
||||||
|
max_index, min_index = 0, float('inf')
|
||||||
|
for _, val in self.id_to_info.items():
|
||||||
|
if val['start_index'] > max_index:
|
||||||
|
max_index = val['start_index']
|
||||||
|
if val['start_index'] < min_index:
|
||||||
|
min_index = val['start_index']
|
||||||
|
return min_index, max_index
|
||||||
|
|
||||||
|
|
||||||
|
# NB: Does not make sense to weigh excerpts from different documents.
|
||||||
|
# But let's say that's the user's problem. Perfect world scenario:
|
||||||
|
# Apply time weighing to different documents. For each document, then, add
|
||||||
|
# separate time weighing.
|
||||||
|
def _apply_sigmoid_time_weighing(self, infos: list[Info], document_len: int, time_steepness: float, time_power: float):
|
||||||
|
sigmoid = lambda x: 1 / (1 + np.exp(-x))
|
||||||
|
|
||||||
|
weights = sigmoid(time_steepness * np.linspace(-10, 10, document_len))
|
||||||
|
|
||||||
|
# Scale to [0,time_power] and shift it up to [1-time_power, 1]
|
||||||
|
weights = weights - min(weights)
|
||||||
|
weights = weights * (time_power / max(weights))
|
||||||
|
weights = weights + (1 - time_power)
|
||||||
|
|
||||||
|
# Reverse the weights
|
||||||
|
weights = weights[::-1]
|
||||||
|
|
||||||
|
for info in infos:
|
||||||
|
index = info.start_index
|
||||||
|
info.distance *= weights[index]
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_outliers_by_median_distance(self, infos: list[Info], significant_level: float):
    """Drop infos whose distance is well above the median distance.

    An info survives when its distance is at most
    significant_level * median(distances); the single closest match is
    always kept, even if the median test would reject it.
    """
    if not infos:
        # Nothing to filter.
        return []

    best = min(infos, key=lambda inf: inf.distance)
    cutoff = significant_level * np.median([inf.distance for inf in infos])

    kept = [inf for inf in infos if inf.distance <= cutoff]

    # The closest match always survives.
    if best not in kept:
        kept.append(best)

    return kept
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_infos(self, infos: list[Info]):
|
||||||
|
merged_infos = []
|
||||||
|
current_info = infos[0]
|
||||||
|
|
||||||
|
for next_info in infos[1:]:
|
||||||
|
merged = current_info.merge_with(next_info)
|
||||||
|
if merged is not None:
|
||||||
|
current_info = merged
|
||||||
|
else:
|
||||||
|
merged_infos.append(current_info)
|
||||||
|
current_info = next_info
|
||||||
|
|
||||||
|
merged_infos.append(current_info)
|
||||||
|
return merged_infos
|
||||||
|
|
||||||
|
|
||||||
|
# Main function for retrieving chunks by distance. It performs merging, time weighing, and mean filtering.
def _get_documents_ids_distances(self, search_strings: list[str], n_results: int):
    """Query chromaDB for the best-matching chunks across all search strings.

    Returns three parallel lists — (texts_with_context, ids, distances) —
    ordered by chunk start position, with overlapping chunks merged.
    Returns three empty lists when the collection is empty.
    """
    # Never request more results than there are records in the collection.
    n_results = min(len(self.ids), n_results)
    if n_results == 0:
        return [], [], []

    # Tolerate a bare string even though the annotation says list[str].
    if isinstance(search_strings, str):
        search_strings = [search_strings]

    infos = []
    min_start_index, max_start_index = self._find_min_max_start_index()

    for search_string in search_strings:
        # The overall result budget is split evenly across search strings.
        result = self.collection.query(query_texts=search_string, n_results=math.ceil(n_results / len(search_strings)), include=['distances'])
        # query() returns nested per-query lists; [0] is this single query's row.
        curr_infos = [Info(start_index=self.id_to_info[id]['start_index'],
                           text_with_context=self.id_to_info[id]['text_with_context'],
                           distance=distance, id=id)
                      for id, distance in zip(result['ids'][0], result['distances'][0])]

        # Weigh distances by document position, then drop far-from-median outliers.
        self._apply_sigmoid_time_weighing(infos=curr_infos, document_len=max_start_index - min_start_index + 1, time_steepness=parameters.get_time_steepness(), time_power=parameters.get_time_power())
        curr_infos = self._filter_outliers_by_median_distance(curr_infos, parameters.get_significant_level())
        infos.extend(curr_infos)

    # Sort by document position so adjacent/overlapping chunks can be merged.
    infos.sort(key=lambda x: x.start_index)
    infos = self._merge_infos(infos)

    texts_with_context = [inf.text_with_context for inf in infos]
    ids = [inf.id for inf in infos]
    distances = [inf.distance for inf in infos]

    return texts_with_context, ids, distances
|
||||||
|
|
||||||
|
|
||||||
|
# Get chunks by similarity
def get(self, search_strings: list[str], n_results: int) -> list[str]:
    """Return up to *n_results* chunk texts ranked by similarity."""
    with self.lock:
        texts, _ids, _dists = self._get_documents_ids_distances(search_strings, n_results)
    return texts
|
||||||
|
|
||||||
|
|
||||||
|
# Get ids by similarity
def get_ids(self, search_strings: list[str], n_results: int) -> list[str]:
    """Return the ids of up to *n_results* chunks ranked by similarity."""
    with self.lock:
        _texts, matched_ids, _dists = self._get_documents_ids_distances(search_strings, n_results)
    return matched_ids
|
||||||
|
|
||||||
|
|
||||||
|
# Cutoff token count
def _get_documents_up_to_token_count(self, documents: list[str], max_token_count: int):
    """Keep documents in order until the token budget is exhausted.

    The first document that would overflow *max_token_count* is truncated
    to the remaining allowance and iteration stops there.
    """
    # TODO: Move to caller; We add delimiters there which might go over the limit.
    kept = []
    used_tokens = 0

    for document in documents:
        tokens = encode(document)[0]
        if used_tokens + len(tokens) <= max_token_count:
            # Fits entirely within the remaining budget.
            kept.append(document)
            used_tokens += len(tokens)
            continue

        # This document overflows: keep only the remaining token allowance
        # and stop processing further documents.
        budget = max_token_count - used_tokens
        kept.append(decode(tokens[:budget], skip_special_tokens=True))
        break

    return kept
|
||||||
|
|
||||||
|
|
||||||
|
# Get chunks by similarity and then sort by ids
def get_sorted_by_ids(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]:
    """Retrieve the closest chunks, returned in document (id) order and
    capped at *max_token_count* tokens."""
    with self.lock:
        documents, ids, _ = self._get_documents_ids_distances(search_strings, n_results)
        # Fix: ids are numeric strings ('2', '10', ...), so the original
        # lexicographic sorted(zip(ids, documents)) ordered '10' before '2'.
        # Sort numerically to restore true document order.
        sorted_docs = [doc for _, doc in sorted(zip(ids, documents), key=lambda pair: int(pair[0]))]

        return self._get_documents_up_to_token_count(sorted_docs, max_token_count)
|
||||||
|
|
||||||
|
|
||||||
|
# Get chunks by similarity and then sort by distance (lowest distance is last).
def get_sorted_by_dist(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]:
    """Retrieve the closest chunks, token-capped, ordered so the best
    (lowest-distance) match comes last."""
    with self.lock:
        documents, _, distances = self._get_documents_ids_distances(search_strings, n_results)
        ranked = sorted(zip(documents, distances), key=lambda pair: pair[1])  # sorted lowest -> highest
        sorted_docs = [doc for doc, _ in ranked]

        # If a document is truncated or competely skipped, it would be with high distance.
        capped = self._get_documents_up_to_token_count(sorted_docs, max_token_count)
        capped.reverse()  # highest -> lowest

        return capped
|
||||||
|
|
||||||
|
|
||||||
|
def delete(self, ids_to_delete: list[str], where: dict):
    """Delete records matching *ids_to_delete* and the *where* metadata
    filter, keeping the in-memory bookkeeping in sync."""
    with self.lock:
        # Resolve which of the requested ids actually match both filters.
        ids_to_delete = self.collection.get(ids=ids_to_delete, where=where)['ids']
        self.collection.delete(ids=ids_to_delete, where=where)

        # Mirror the deletion in self.ids and self.id_to_info.
        removed = set(ids_to_delete)
        self.ids = [existing for existing in self.ids if existing not in removed]
        for gone in ids_to_delete:
            self.id_to_info.pop(gone, None)

        logger.info(f'Successfully deleted {len(ids_to_delete)} records from chromaDB.')
|
||||||
|
|
||||||
|
|
||||||
|
def clear(self):
    """Reset chromaDB and wipe all in-memory record bookkeeping."""
    with self.lock:
        self.chroma_client.reset()
        # Recreate a fresh, empty collection bound to the same embedder.
        self.collection = self.chroma_client.create_collection("context", embedding_function=self.embedder.embed)
        self.ids, self.id_to_info = [], {}

        logger.info('Successfully cleared all records and reset chromaDB.')
|
||||||
|
|
||||||
|
|
||||||
|
class SentenceTransformerEmbedder(Embedder):
    """Embedder backed by the sentence-transformers all-mpnet-base-v2 model."""

    def __init__(self) -> None:
        logger.debug('Creating Sentence Embedder...')
        # Loads (and on first run downloads) the pretrained model.
        self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        # Expose the model's encode callable directly as this embedder's
        # `embed` attribute. NOTE(review): presumably returns numpy vectors
        # (callers use .tolist() on the result) — confirm against
        # SentenceTransformer.encode.
        self.embed = self.model.encode
|
||||||
|
|
||||||
|
|
||||||
|
def make_collector():
    """Build a ChromaCollector wired to the default sentence-transformer embedder."""
    embedder = SentenceTransformerEmbedder()
    return ChromaCollector(embedder)
|
161
extensions/superboogav2/config.json
Normal file
161
extensions/superboogav2/config.json
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
{
|
||||||
|
"to_lower": {
|
||||||
|
"default": false,
|
||||||
|
"categories": [true, false],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"num_conversion": {
|
||||||
|
"default": null,
|
||||||
|
"categories": ["NUM_TO_WORD_METHOD", "NUM_TO_CHAR_METHOD", "NUM_TO_CHAR_LONG_METHOD", null],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"merge_spaces": {
|
||||||
|
"default": false,
|
||||||
|
"categories": [true, false],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"strip": {
|
||||||
|
"default": true,
|
||||||
|
"categories": [true, false],
|
||||||
|
"should_optimize": false
|
||||||
|
},
|
||||||
|
"remove_punctuation": {
|
||||||
|
"default": true,
|
||||||
|
"categories": [true, false],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"remove_stopwords": {
|
||||||
|
"default": false,
|
||||||
|
"categories": [true, false],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"remove_specific_pos": {
|
||||||
|
"default": false,
|
||||||
|
"categories": [true, false],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"lemmatize": {
|
||||||
|
"default": true,
|
||||||
|
"categories": [true, false],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"min_num_sent": {
|
||||||
|
"default": 1,
|
||||||
|
"categories": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 999999],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"delta_start": {
|
||||||
|
"default": 0,
|
||||||
|
"categories": [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"chunk_len1": {
|
||||||
|
"default": 500,
|
||||||
|
"categories": [50, 200, 250, 500, 600, 900, 1000],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"chunk_len2": {
|
||||||
|
"default": 500,
|
||||||
|
"categories": [0, 50, 200, 250, 500, 600, 900],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"chunk_len3": {
|
||||||
|
"default": 1000,
|
||||||
|
"categories": [0, 100, 150, 300, 400, 700, 800, 1000],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"chunk_len4": {
|
||||||
|
"default": 700,
|
||||||
|
"categories": [0, 100, 150, 300, 400, 700, 800],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"chunk_len_mask": {
|
||||||
|
"default": 15,
|
||||||
|
"categories": [3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15],
|
||||||
|
"should_optimize": false
|
||||||
|
},
|
||||||
|
"context_len_left": {
|
||||||
|
"default": 250,
|
||||||
|
"categories": [50, 100, 150, 200, 250, 300, 350, 400, 500, 600, 700, 800, 900, 1000],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"context_len_right": {
|
||||||
|
"default": 800,
|
||||||
|
"categories": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1500, 1600],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"new_dist_strategy": {
|
||||||
|
"default": "DIST_MIN_STRATEGY",
|
||||||
|
"categories": ["DIST_MIN_STRATEGY", "DIST_HARMONIC_STRATEGY", "DIST_GEOMETRIC_STRATEGY", "DIST_ARITHMETIC_STRATEGY"],
|
||||||
|
"should_optimize": false
|
||||||
|
},
|
||||||
|
"chunk_count": {
|
||||||
|
"default": 250,
|
||||||
|
"categories": [30, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"min_num_length": {
|
||||||
|
"default": 9,
|
||||||
|
"categories": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"significant_level": {
|
||||||
|
"default": 1.0,
|
||||||
|
"categories": [0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 999999],
|
||||||
|
"should_optimize": true
|
||||||
|
},
|
||||||
|
"time_steepness": {
|
||||||
|
"default": 0.01,
|
||||||
|
"categories": [0.01, 0.2, 0.4, 0.6, 0.8, 1.0],
|
||||||
|
"should_optimize": false
|
||||||
|
},
|
||||||
|
"time_power": {
|
||||||
|
"default": 0,
|
||||||
|
"categories": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
|
||||||
|
"should_optimize": false
|
||||||
|
},
|
||||||
|
"chunk_separator": {
|
||||||
|
"default": ""
|
||||||
|
},
|
||||||
|
"prefix": {
|
||||||
|
"default": "<<document chunk>>\n\n"
|
||||||
|
},
|
||||||
|
"data_separator": {
|
||||||
|
"default": "\n\n<<document chunk>>\n\n"
|
||||||
|
},
|
||||||
|
"postfix": {
|
||||||
|
"default": "\n\n<<document end>>\n\n"
|
||||||
|
},
|
||||||
|
"manual": {
|
||||||
|
"default": true
|
||||||
|
},
|
||||||
|
"add_chat_to_data": {
|
||||||
|
"default": true
|
||||||
|
},
|
||||||
|
"injection_strategy": {
|
||||||
|
"default": "PREPEND_TO_LAST",
|
||||||
|
"categories": ["PREPEND_TO_LAST", "APPEND_TO_LAST", "HIJACK_LAST_IN_CONTEXT"]
|
||||||
|
},
|
||||||
|
"chunk_regex": {
|
||||||
|
"default": "(?<==== ).*?(?= ===)|User story: \\d+"
|
||||||
|
},
|
||||||
|
"strong_cleanup": {
|
||||||
|
"default": false
|
||||||
|
},
|
||||||
|
"max_token_count": {
|
||||||
|
"default": 3072
|
||||||
|
},
|
||||||
|
"threads": {
|
||||||
|
"default": 4
|
||||||
|
},
|
||||||
|
"optimization_steps": {
|
||||||
|
"default": 100
|
||||||
|
},
|
||||||
|
"api_port": {
|
||||||
|
"default": 5002
|
||||||
|
},
|
||||||
|
"api_on": {
|
||||||
|
"default": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
199
extensions/superboogav2/data_preprocessor.py
Normal file
199
extensions/superboogav2/data_preprocessor.py
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
"""
|
||||||
|
This module contains utils for preprocessing the text before converting it to embeddings.
|
||||||
|
|
||||||
|
- TextPreprocessorBuilder preprocesses individual strings.
|
||||||
|
* lowering cases
|
||||||
|
* converting numbers to words or characters
|
||||||
|
* merging and stripping spaces
|
||||||
|
* removing punctuation
|
||||||
|
* removing stop words
|
||||||
|
* lemmatizing
|
||||||
|
* removing specific parts of speech (adverbs and interjections)
|
||||||
|
- TextSummarizer extracts the most important sentences from a long string using text-ranking.
|
||||||
|
"""
|
||||||
|
import pytextrank
|
||||||
|
import string
|
||||||
|
import spacy
|
||||||
|
import math
|
||||||
|
import nltk
|
||||||
|
import re
|
||||||
|
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import WordNetLemmatizer
|
||||||
|
from num2words import num2words
|
||||||
|
|
||||||
|
|
||||||
|
class TextPreprocessorBuilder:
    """Chainable text-normalization pipeline.

    Each mutator transforms ``self.text`` in place and returns ``self`` so
    steps can be chained; ``build()`` returns the final string.
    """

    # Shared, immutable resources loaded once per process.
    _stop_words = set(stopwords.words('english'))
    _lemmatizer = WordNetLemmatizer()

    # Tokenizer used by most steps: alternates word tokens with the non-word
    # runs between them, so "".join(tokens) reconstructs the text exactly.
    # Compiled once here instead of re-building the pattern in every call.
    _tokenize_pattern = re.compile(r'\b\w+\b|\W+')

    # Some of the functions are expensive. We cache the results.
    # NOTE(review): both caches grow without bound for the process lifetime.
    _lemmatizer_cache = {}
    _pos_remove_cache = {}

    def __init__(self, text: str):
        # Current state of the text as it moves through the pipeline.
        self.text = text

    def to_lower(self):
        """Lowercase words, but keep ALL-CAPS abbreviations/constants intact."""
        tokens = TextPreprocessorBuilder._tokenize_pattern.findall(self.text)
        for i, token in enumerate(tokens):
            # Only word tokens are candidates; separators pass through as-is.
            if re.match(r'^\w+$', token):
                # Skip fully-uppercase tokens, optionally with underscores
                # (e.g. NASA, SOME_CONSTANT). The original also tested
                # r'^[A-Z]+$' separately, but r'^[A-Z_]+$' subsumes it.
                if not re.match(r'^[A-Z_]+$', token):
                    tokens[i] = token.lower()
        self.text = "".join(tokens)
        return self

    def num_to_word(self, min_len: int = 1):
        """Spell out numbers of at least *min_len* digits as English words."""
        tokens = TextPreprocessorBuilder._tokenize_pattern.findall(self.text)
        for i, token in enumerate(tokens):
            # Check if token is a number of length `min_len` or more.
            if token.isdigit() and len(token) >= min_len:
                # This is done to pay better attention to numbers (e.g. ticket
                # numbers, thread numbers, post numbers).
                # 740700 will become "seven hundred and forty thousand seven hundred".
                tokens[i] = num2words(int(token)).replace(",", "")  # Remove commas from num2words.
        self.text = "".join(tokens)
        return self

    def num_to_char_long(self, min_len: int = 1):
        """Encode digits as letters with positional repetition.

        740700 becomes HHHHHHEEEEEAAAAHHHAAA: each digit d maps to
        chr(d + 65), repeated once per position counted from the right.
        """
        # Hoisted out of the loop (the original re-created this lambda on
        # every matching token).
        def convert_token(token):
            return ''.join((chr(int(digit) + 65) * (i + 1)) for i, digit in enumerate(token[::-1]))[::-1]

        tokens = TextPreprocessorBuilder._tokenize_pattern.findall(self.text)
        for i, token in enumerate(tokens):
            if token.isdigit() and len(token) >= min_len:
                tokens[i] = convert_token(token)
        self.text = "".join(tokens)
        return self

    def num_to_char(self, min_len: int = 1):
        """Encode digits as letters: 740700 becomes HEAHAA."""
        tokens = TextPreprocessorBuilder._tokenize_pattern.findall(self.text)
        for i, token in enumerate(tokens):
            if token.isdigit() and len(token) >= min_len:
                tokens[i] = ''.join(chr(int(digit) + 65) for digit in token)
        self.text = "".join(tokens)
        return self

    def merge_spaces(self):
        """Collapse runs of spaces into a single space."""
        self.text = re.sub(' +', ' ', self.text)
        return self

    def strip(self):
        """Trim leading and trailing whitespace."""
        self.text = self.text.strip()
        return self

    def remove_punctuation(self):
        """Delete all ASCII punctuation characters."""
        self.text = self.text.translate(str.maketrans('', '', string.punctuation))
        return self

    def remove_stopwords(self):
        """Drop English stop words; separator tokens are kept."""
        self.text = "".join([word for word in TextPreprocessorBuilder._tokenize_pattern.findall(self.text) if word not in TextPreprocessorBuilder._stop_words])
        return self

    def remove_specific_pos(self):
        """
        In the English language, adverbs and interjections rarely provide meaningful information.
        Removing them improves the embedding precision. Don't tell JK Rowling, though.
        """
        processed_text = TextPreprocessorBuilder._pos_remove_cache.get(self.text)
        if processed_text:
            self.text = processed_text
            return self

        tokens = TextPreprocessorBuilder._tokenize_pattern.findall(self.text)

        # Exclude adverbs (RB, RBR, RBS) and interjections (UH).
        excluded_tags = ['RB', 'RBR', 'RBS', 'UH']

        for i, token in enumerate(tokens):
            # Only word tokens are POS-tagged; separators pass through.
            if re.match(r'^\w+$', token):
                # Tag the word in isolation and drop it if its tag is excluded.
                pos = nltk.pos_tag([token])[0][1]
                if pos in excluded_tags:
                    tokens[i] = ''

        new_text = "".join(tokens)
        TextPreprocessorBuilder._pos_remove_cache[self.text] = new_text
        self.text = new_text

        return self

    def lemmatize(self):
        """Lemmatize every word token (result cached per input string)."""
        processed_text = TextPreprocessorBuilder._lemmatizer_cache.get(self.text)
        if processed_text:
            self.text = processed_text
            return self

        new_text = "".join([TextPreprocessorBuilder._lemmatizer.lemmatize(word) for word in TextPreprocessorBuilder._tokenize_pattern.findall(self.text)])
        TextPreprocessorBuilder._lemmatizer_cache[self.text] = new_text
        self.text = new_text

        return self

    def build(self):
        """Return the processed text."""
        return self.text
|
||||||
|
|
||||||
|
class TextSummarizer:
    """Extracts the most important sentences from long text via TextRank."""

    # Lazily-created shared spacy pipeline and an unbounded result cache.
    _nlp_pipeline = None
    _cache = {}

    @staticmethod
    def _load_nlp_pipeline():
        """Create the spacy+textrank pipeline on first use, reuse it after."""
        if TextSummarizer._nlp_pipeline is None:
            pipeline = spacy.load('en_core_web_sm')
            pipeline.add_pipe("textrank", last=True)
            TextSummarizer._nlp_pipeline = pipeline
        return TextSummarizer._nlp_pipeline

    @staticmethod
    def process_long_text(text: str, min_num_sent: int) -> list[str]:
        """
        This function applies a text summarization process on a given text string, extracting
        the most important sentences based on the principle that 20% of the content is responsible
        for 80% of the meaning (the Pareto Principle).

        Returns:
            list: A list of the most important sentences
        """
        # Serve from cache when the same (text, threshold) was seen before.
        key = (text, min_num_sent)
        hit = TextSummarizer._cache.get(key, None)
        if hit is not None:
            return hit

        doc = TextSummarizer._load_nlp_pipeline()(text)
        sentence_total = len(list(doc.sents))

        if sentence_total < min_num_sent:
            # Too short to summarize meaningfully; keep the text whole.
            summary = [text]
        else:
            # Keep roughly the top 20% of phrases and sentences, rounded up.
            phrase_budget = math.ceil(len(doc._.phrases) * 0.20)
            sentence_budget = math.ceil(sentence_total * 0.20)
            summary = [str(sent) for sent in doc._.textrank.summary(limit_phrases=phrase_budget, limit_sentences=sentence_budget)]

        # Store the result in cache before returning it.
        TextSummarizer._cache[key] = summary
        return summary
|
209
extensions/superboogav2/data_processor.py
Normal file
209
extensions/superboogav2/data_processor.py
Normal file
@ -0,0 +1,209 @@
|
|||||||
|
"""
|
||||||
|
This module is responsible for processing the corpus and feeding it into chromaDB. It will receive a corpus of text.
|
||||||
|
It will then split it into chunks of specified length. For each of those chunks, it will append surrounding context.
|
||||||
|
It will only include full words.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import bisect
|
||||||
|
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
from .data_preprocessor import TextPreprocessorBuilder, TextSummarizer
|
||||||
|
from .chromadb import ChromaCollector
|
||||||
|
|
||||||
|
def preprocess_text_no_summary(text) -> str:
    """Run the configured normalization pipeline on *text* (no summarization).

    Every step is gated by the corresponding flag in the `parameters` module;
    returns the processed string.
    """
    builder = TextPreprocessorBuilder(text)
    if parameters.should_to_lower():
        builder.to_lower()

    if parameters.should_remove_punctuation():
        builder.remove_punctuation()

    if parameters.should_remove_specific_pos():
        builder.remove_specific_pos()

    if parameters.should_remove_stopwords():
        # Fix: the original referenced the method without calling it
        # (`builder.remove_stopwords`), so this option silently did nothing.
        builder.remove_stopwords()

    if parameters.should_lemmatize():
        builder.lemmatize()

    if parameters.should_merge_spaces():
        # Fix: same missing-call bug as remove_stopwords above.
        builder.merge_spaces()

    if parameters.should_strip():
        builder.strip()

    # Number-conversion strategy is optional; fetch it once.
    strategy = parameters.get_num_conversion_strategy()
    if strategy:
        if strategy == parameters.NUM_TO_WORD_METHOD:
            builder.num_to_word(parameters.get_min_num_length())
        elif strategy == parameters.NUM_TO_CHAR_METHOD:
            builder.num_to_char(parameters.get_min_num_length())
        elif strategy == parameters.NUM_TO_CHAR_LONG_METHOD:
            builder.num_to_char_long(parameters.get_min_num_length())

    return builder.build()
|
||||||
|
|
||||||
|
|
||||||
|
def preprocess_text(text) -> list[str]:
    """Summarize *text* down to its key sentences, then preprocess each one."""
    key_sentences = TextSummarizer.process_long_text(text, parameters.get_min_num_sentences())
    return [preprocess_text_no_summary(sentence) for sentence in key_sentences]
|
||||||
|
|
||||||
|
|
||||||
|
def _create_chunks_with_context(corpus, chunk_len, context_left, context_right):
|
||||||
|
"""
|
||||||
|
This function takes a corpus of text and splits it into chunks of a specified length,
|
||||||
|
then adds a specified amount of context to each chunk. The context is added by first
|
||||||
|
going backwards from the start of the chunk and then going forwards from the end of the
|
||||||
|
chunk, ensuring that the context includes only whole words and that the total context length
|
||||||
|
does not exceed the specified limit. This function uses binary search for efficiency.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
chunks (list of str): The chunks of text.
|
||||||
|
chunks_with_context (list of str): The chunks of text with added context.
|
||||||
|
chunk_with_context_start_indices (list of int): The starting indices of each chunk with context in the corpus.
|
||||||
|
"""
|
||||||
|
words = re.split('(\\s+)', corpus)
|
||||||
|
word_start_indices = [0]
|
||||||
|
current_index = 0
|
||||||
|
|
||||||
|
for word in words:
|
||||||
|
current_index += len(word)
|
||||||
|
word_start_indices.append(current_index)
|
||||||
|
|
||||||
|
chunks, chunk_lengths, chunk_start_indices, chunk_with_context_start_indices = [], [], [], []
|
||||||
|
current_length = 0
|
||||||
|
current_index = 0
|
||||||
|
chunk = []
|
||||||
|
|
||||||
|
for word in words:
|
||||||
|
if current_length + len(word) > chunk_len:
|
||||||
|
chunks.append(''.join(chunk))
|
||||||
|
chunk_lengths.append(current_length)
|
||||||
|
chunk_start_indices.append(current_index - current_length)
|
||||||
|
chunk = [word]
|
||||||
|
current_length = len(word)
|
||||||
|
else:
|
||||||
|
chunk.append(word)
|
||||||
|
current_length += len(word)
|
||||||
|
current_index += len(word)
|
||||||
|
|
||||||
|
if chunk:
|
||||||
|
chunks.append(''.join(chunk))
|
||||||
|
chunk_lengths.append(current_length)
|
||||||
|
chunk_start_indices.append(current_index - current_length)
|
||||||
|
|
||||||
|
chunks_with_context = []
|
||||||
|
for start_index, chunk_length in zip(chunk_start_indices, chunk_lengths):
|
||||||
|
context_start_index = bisect.bisect_right(word_start_indices, start_index - context_left)
|
||||||
|
context_end_index = bisect.bisect_left(word_start_indices, start_index + chunk_length + context_right)
|
||||||
|
|
||||||
|
# Combine all the words in the context range (before, chunk, and after)
|
||||||
|
chunk_with_context = ''.join(words[context_start_index:context_end_index])
|
||||||
|
chunks_with_context.append(chunk_with_context)
|
||||||
|
|
||||||
|
# Determine the start index of the chunk with context
|
||||||
|
chunk_with_context_start_index = word_start_indices[context_start_index]
|
||||||
|
chunk_with_context_start_indices.append(chunk_with_context_start_index)
|
||||||
|
|
||||||
|
return chunks, chunks_with_context, chunk_with_context_start_indices
|
||||||
|
|
||||||
|
|
||||||
|
def _clear_chunks(data_chunks, data_chunks_with_context, data_chunk_starting_indices):
    """Filter out useless and near-duplicate chunks.

    Drops chunks with no alphanumeric content, and drops repeated chunk
    texts whose previous occurrence started within get_delta_start()
    characters. Returns the three filtered, parallel lists.
    """
    distinct_data_chunks = []
    distinct_data_chunks_with_context = []
    distinct_data_chunk_starting_indices = []

    # Maps chunk text -> starting index of its last kept occurrence.
    seen_chunks = {}

    for chunk, context, index in zip(data_chunks, data_chunks_with_context, data_chunk_starting_indices):
        # Skip the chunk if it does not contain any alphanumeric characters.
        if not any(char.isalnum() for char in chunk):
            continue

        seen_chunk_start = seen_chunks.get(chunk)
        # Fix: the original tested `if seen_chunk_start:`, which is falsy for
        # a legitimate starting index of 0, so duplicates of the very first
        # chunk were never de-duplicated. Compare against None explicitly.
        if seen_chunk_start is not None:
            # If we've already seen this exact chunk nearby, skip it.
            if abs(seen_chunk_start - index) < parameters.get_delta_start():
                continue

        distinct_data_chunks.append(chunk)
        distinct_data_chunks_with_context.append(context)
        distinct_data_chunk_starting_indices.append(index)

        seen_chunks[chunk] = index

    return distinct_data_chunks, distinct_data_chunks_with_context, distinct_data_chunk_starting_indices
|
||||||
|
|
||||||
|
|
||||||
|
def process_and_add_to_collector(corpus: str, collector: ChromaCollector, clear_collector_before_adding: bool, metadata: dict):
    """Split `corpus` into chunks and store them in `collector`.

    Two kinds of chunking passes contribute chunks: an optional regex pass
    (every regex match becomes a chunk), and one fixed-length pass per
    configured chunk length. Each chunk is stored together with its
    surrounding context and that context's starting index in the corpus.

    Args:
        corpus: The raw text to index.
        collector: Destination ChromaDB collector.
        clear_collector_before_adding: If True, wipe the collector first.
        metadata: Metadata dict attached to every chunk, or None.

    Raises:
        ValueError: If the context-length setting has more than two values.
    """
    # Parse the comma-separated length settings (avoid shadowing builtin `len`).
    chunk_lens = [int(part.strip()) for part in parameters.get_chunk_len().split(',')]
    context_len = [int(part.strip()) for part in parameters.get_context_len().split(',')]
    if len(context_len) >= 3:
        # Bug fix: raising a bare string is itself a TypeError in Python 3;
        # raise a real exception so the caller sees the intended message.
        raise ValueError(f"Context len has too many values: {len(context_len)}")
    if len(context_len) == 2:
        context_left = context_len[0]
        context_right = context_len[1]
    else:
        context_left = context_right = context_len[0]

    data_chunks = []
    data_chunks_with_context = []
    data_chunk_starting_indices = []

    # Regex pass: every match of chunk_regex becomes its own chunk with context.
    if parameters.get_chunk_regex():
        if parameters.get_chunk_separator():
            cumulative_length = 0  # Length of the corpus processed so far.
            sections = corpus.split(parameters.get_chunk_separator())
            for section in sections:
                for match in re.finditer(parameters.get_chunk_regex(), section):
                    chunk = match.group(0)
                    start_index = match.start()
                    end_index = start_index + len(chunk)
                    context = section[max(0, start_index - context_left):min(len(section), end_index + context_right)]
                    data_chunks.append(chunk)
                    data_chunks_with_context.append(context)
                    data_chunk_starting_indices.append(cumulative_length + max(0, start_index - context_left))
                cumulative_length += len(section) + len(parameters.get_chunk_separator())
        else:
            for match in re.finditer(parameters.get_chunk_regex(), corpus):
                chunk = match.group(0)
                start_index = match.start()
                end_index = start_index + len(chunk)
                context = corpus[max(0, start_index - context_left):min(len(corpus), end_index + context_right)]
                data_chunks.append(chunk)
                data_chunks_with_context.append(context)
                data_chunk_starting_indices.append(max(0, start_index - context_left))

    # Fixed-length passes: break the data into chunks, one pass per length.
    for chunk_len in chunk_lens:
        if parameters.get_chunk_separator():
            cumulative_length = 0  # Length of the corpus processed so far.
            sections = corpus.split(parameters.get_chunk_separator())
            for section in sections:
                chunks, chunks_with_context, context_start_indices = _create_chunks_with_context(section, chunk_len, context_left, context_right)
                # Offset the per-section indices into whole-corpus coordinates.
                context_start_indices = [cumulative_length + i for i in context_start_indices]
                data_chunks.extend(chunks)
                data_chunks_with_context.extend(chunks_with_context)
                data_chunk_starting_indices.extend(context_start_indices)
                cumulative_length += len(section) + len(parameters.get_chunk_separator())
        else:
            chunks, chunks_with_context, context_start_indices = _create_chunks_with_context(corpus, chunk_len, context_left, context_right)
            data_chunks.extend(chunks)
            data_chunks_with_context.extend(chunks_with_context)
            data_chunk_starting_indices.extend(context_start_indices)

    data_chunks = [preprocess_text_no_summary(chunk) for chunk in data_chunks]

    data_chunks, data_chunks_with_context, data_chunk_starting_indices = _clear_chunks(
        data_chunks, data_chunks_with_context, data_chunk_starting_indices
    )

    if clear_collector_before_adding:
        collector.clear()
    collector.add(data_chunks, data_chunks_with_context, data_chunk_starting_indices, [metadata] * len(data_chunks) if metadata is not None else None)
|
65
extensions/superboogav2/download_urls.py
Normal file
65
extensions/superboogav2/download_urls.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
import concurrent.futures
|
||||||
|
import requests
|
||||||
|
import re
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
from .data_processor import process_and_add_to_collector
|
||||||
|
from .utils import create_metadata_source
|
||||||
|
|
||||||
|
def _download_single(url):
    """Fetch `url` and return the raw response body.

    Raises a generic Exception on any non-200 status so callers can treat
    every failure mode uniformly.
    """
    response = requests.get(url, timeout=5)
    if response.status_code != 200:
        raise Exception("Failed to download URL")
    return response.content
|
||||||
|
|
||||||
|
|
||||||
|
def _download_urls(urls, threads=1):
    """Download `urls` concurrently, yielding ("done/total", results) progress pairs.

    Failed downloads are silently skipped (best effort). A final
    ("Done", results) pair is always yielded once every future has settled.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(_download_single, url) for url in urls]

        results = []
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            try:
                results.append(future.result())
            except Exception:
                # Best-effort: a failed URL simply contributes no content.
                pass
            else:
                completed += 1
                yield f"{completed}/{len(urls)}", results

        yield "Done", results
|
||||||
|
|
||||||
|
|
||||||
|
def feed_url_into_collector(urls, collector):
    """Download newline-separated `urls`, reduce the HTML to text, and index it.

    Yields cumulative progress strings for the UI while downloading and
    processing; the combined text is fed into `collector` at the end.
    """
    all_text = ''
    cumulative = ''

    url_list = urls.strip().split('\n')
    cumulative += f'Loading {len(url_list)} URLs with {parameters.get_num_threads()} threads...\n\n'
    yield cumulative

    contents = []
    for update, contents in _download_urls(url_list, threads=parameters.get_num_threads()):
        yield cumulative + update

    cumulative += 'Processing the HTML sources...'
    yield cumulative
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")

        # Drop script/style nodes entirely before extracting visible text.
        for script in soup(["script", "style"]):
            script.extract()

        strings = soup.stripped_strings
        if parameters.get_is_strong_cleanup():
            # Keep only strings that look like natural-language text.
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        all_text += '\n'.join([s.strip() for s in strings])

    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))
|
179
extensions/superboogav2/nltk_data/corpora/stopwords/english
Normal file
179
extensions/superboogav2/nltk_data/corpora/stopwords/english
Normal file
@ -0,0 +1,179 @@
|
|||||||
|
i
|
||||||
|
me
|
||||||
|
my
|
||||||
|
myself
|
||||||
|
we
|
||||||
|
our
|
||||||
|
ours
|
||||||
|
ourselves
|
||||||
|
you
|
||||||
|
you're
|
||||||
|
you've
|
||||||
|
you'll
|
||||||
|
you'd
|
||||||
|
your
|
||||||
|
yours
|
||||||
|
yourself
|
||||||
|
yourselves
|
||||||
|
he
|
||||||
|
him
|
||||||
|
his
|
||||||
|
himself
|
||||||
|
she
|
||||||
|
she's
|
||||||
|
her
|
||||||
|
hers
|
||||||
|
herself
|
||||||
|
it
|
||||||
|
it's
|
||||||
|
its
|
||||||
|
itself
|
||||||
|
they
|
||||||
|
them
|
||||||
|
their
|
||||||
|
theirs
|
||||||
|
themselves
|
||||||
|
what
|
||||||
|
which
|
||||||
|
who
|
||||||
|
whom
|
||||||
|
this
|
||||||
|
that
|
||||||
|
that'll
|
||||||
|
these
|
||||||
|
those
|
||||||
|
am
|
||||||
|
is
|
||||||
|
are
|
||||||
|
was
|
||||||
|
were
|
||||||
|
be
|
||||||
|
been
|
||||||
|
being
|
||||||
|
have
|
||||||
|
has
|
||||||
|
had
|
||||||
|
having
|
||||||
|
do
|
||||||
|
does
|
||||||
|
did
|
||||||
|
doing
|
||||||
|
a
|
||||||
|
an
|
||||||
|
the
|
||||||
|
and
|
||||||
|
but
|
||||||
|
if
|
||||||
|
or
|
||||||
|
because
|
||||||
|
as
|
||||||
|
until
|
||||||
|
while
|
||||||
|
of
|
||||||
|
at
|
||||||
|
by
|
||||||
|
for
|
||||||
|
with
|
||||||
|
about
|
||||||
|
against
|
||||||
|
between
|
||||||
|
into
|
||||||
|
through
|
||||||
|
during
|
||||||
|
before
|
||||||
|
after
|
||||||
|
above
|
||||||
|
below
|
||||||
|
to
|
||||||
|
from
|
||||||
|
up
|
||||||
|
down
|
||||||
|
in
|
||||||
|
out
|
||||||
|
on
|
||||||
|
off
|
||||||
|
over
|
||||||
|
under
|
||||||
|
again
|
||||||
|
further
|
||||||
|
then
|
||||||
|
once
|
||||||
|
here
|
||||||
|
there
|
||||||
|
when
|
||||||
|
where
|
||||||
|
why
|
||||||
|
how
|
||||||
|
all
|
||||||
|
any
|
||||||
|
both
|
||||||
|
each
|
||||||
|
few
|
||||||
|
more
|
||||||
|
most
|
||||||
|
other
|
||||||
|
some
|
||||||
|
such
|
||||||
|
no
|
||||||
|
nor
|
||||||
|
not
|
||||||
|
only
|
||||||
|
own
|
||||||
|
same
|
||||||
|
so
|
||||||
|
than
|
||||||
|
too
|
||||||
|
very
|
||||||
|
s
|
||||||
|
t
|
||||||
|
can
|
||||||
|
will
|
||||||
|
just
|
||||||
|
don
|
||||||
|
don't
|
||||||
|
should
|
||||||
|
should've
|
||||||
|
now
|
||||||
|
d
|
||||||
|
ll
|
||||||
|
m
|
||||||
|
o
|
||||||
|
re
|
||||||
|
ve
|
||||||
|
y
|
||||||
|
ain
|
||||||
|
aren
|
||||||
|
aren't
|
||||||
|
couldn
|
||||||
|
couldn't
|
||||||
|
didn
|
||||||
|
didn't
|
||||||
|
doesn
|
||||||
|
doesn't
|
||||||
|
hadn
|
||||||
|
hadn't
|
||||||
|
hasn
|
||||||
|
hasn't
|
||||||
|
haven
|
||||||
|
haven't
|
||||||
|
isn
|
||||||
|
isn't
|
||||||
|
ma
|
||||||
|
mightn
|
||||||
|
mightn't
|
||||||
|
mustn
|
||||||
|
mustn't
|
||||||
|
needn
|
||||||
|
needn't
|
||||||
|
shan
|
||||||
|
shan't
|
||||||
|
shouldn
|
||||||
|
shouldn't
|
||||||
|
wasn
|
||||||
|
wasn't
|
||||||
|
weren
|
||||||
|
weren't
|
||||||
|
won
|
||||||
|
won't
|
||||||
|
wouldn
|
||||||
|
wouldn't
|
BIN
extensions/superboogav2/nltk_data/corpora/wordnet.zip
Normal file
BIN
extensions/superboogav2/nltk_data/corpora/wordnet.zip
Normal file
Binary file not shown.
Binary file not shown.
40
extensions/superboogav2/notebook_handler.py
Normal file
40
extensions/superboogav2/notebook_handler.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
"""
|
||||||
|
This module is responsible for handling and modifying the notebook text.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
from modules import shared
|
||||||
|
from modules.logging_colors import logger
|
||||||
|
from extensions.superboogav2.utils import create_context_text
|
||||||
|
|
||||||
|
from .data_processor import preprocess_text
|
||||||
|
|
||||||
|
def _remove_special_tokens(string):
|
||||||
|
pattern = r'(<\|begin-user-input\|>|<\|end-user-input\|>|<\|injection-point\|>)'
|
||||||
|
return re.sub(pattern, '', string)
|
||||||
|
|
||||||
|
|
||||||
|
def input_modifier_internal(string, collector):
    """Inject relevant DB context into a notebook prompt.

    Finds the user input between the begin/end markers, queries `collector`
    for the closest chunks, substitutes them at the injection point, and
    finally strips every special token from the result.
    """
    # Sanity check: this handler is for notebook mode only.
    if shared.is_chat():
        return string

    # Locate the user's question between the input markers.
    user_input_pattern = re.compile(r"<\|begin-user-input\|>(.*?)<\|end-user-input\|>", re.DOTALL)
    user_input_match = user_input_pattern.search(string)
    if user_input_match:
        # Preprocess the user prompt the same way the stored chunks were.
        user_input = preprocess_text(user_input_match.group(1).strip())

        logger.debug(f"Preprocessed User Input: {user_input}")

        # Fetch the most similar chunks and splice them into the prompt.
        results = collector.get_sorted_by_dist(user_input, n_results=parameters.get_chunk_count(), max_token_count=int(parameters.get_max_token_count()))
        string = string.replace('<|injection-point|>', create_context_text(results))

    return _remove_special_tokens(string)
|
135
extensions/superboogav2/optimize.py
Normal file
135
extensions/superboogav2/optimize.py
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
"""
|
||||||
|
This module implements a hyperparameter optimization routine for the embedding application. It utilizes TPE optimization from Optuna.
|
||||||
|
|
||||||
|
Each run, the optimizer will set the default values inside the hyperparameters. At the end, it will output the best ones it has found.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import optuna
|
||||||
|
import gradio as gr
|
||||||
|
import numpy as np
|
||||||
|
import logging
|
||||||
|
import hashlib
|
||||||
|
logging.getLogger('optuna').setLevel(logging.WARNING)
|
||||||
|
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .benchmark import benchmark
|
||||||
|
from .parameters import Parameters
|
||||||
|
from modules.logging_colors import logger
|
||||||
|
|
||||||
|
|
||||||
|
# Format the parameters into markdown format.
|
||||||
|
def _markdown_hyperparams():
    """Render the current hyperparameter defaults as a markdown bullet list."""
    lines = []
    for param_name, param_value in Parameters.getInstance().hyperparameters.items():
        # Escape any markdown syntax in both the name and the default value.
        escaped_name = re.sub(r"([_*\[\]()~`>#+-.!])", r"\\\1", param_name)
        escaped_default = re.sub(r"([_*\[\]()~`>#+-.!])", r"\\\1", str(param_value['default'])) if param_value['default'] else ' '

        lines.append(f'* {escaped_name}: **{escaped_default}**')

    return '\n'.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
# Convert numpy types to python types.
|
||||||
|
def _convert_np_types(params):
|
||||||
|
for key in params:
|
||||||
|
if type(params[key]) == np.bool_:
|
||||||
|
params[key] = bool(params[key])
|
||||||
|
elif type(params[key]) == np.int64:
|
||||||
|
params[key] = int(params[key])
|
||||||
|
elif type(params[key]) == np.float64:
|
||||||
|
params[key] = float(params[key])
|
||||||
|
return params
|
||||||
|
|
||||||
|
|
||||||
|
# Set the default values for the hyperparameters.
|
||||||
|
def _set_hyperparameters(params):
    """Copy each value in `params` into the matching hyperparameter's default."""
    hyperparameters = Parameters.getInstance().hyperparameters
    for param_name, param_value in params.items():
        # Unknown parameter names are silently ignored.
        if param_name in hyperparameters:
            hyperparameters[param_name]['default'] = param_value
|
||||||
|
|
||||||
|
|
||||||
|
# Check if the parameter is for optimization.
|
||||||
|
def _is_optimization_param(val):
|
||||||
|
is_opt = val.get('should_optimize', False) # Either does not exist or is false
|
||||||
|
return is_opt
|
||||||
|
|
||||||
|
|
||||||
|
# Create a hashable representation of the parameters
|
||||||
|
def _get_params_hash(params):
|
||||||
|
params_str = json.dumps(params, sort_keys=True)
|
||||||
|
return hashlib.sha256(params_str.encode()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def optimize(collector, progress=gr.Progress()):
    """Optimize the hyperparameters with Optuna's TPE sampler and report the best set.

    Each trial samples the parameters marked `should_optimize`, applies them as
    the current defaults, and scores them with the benchmark. Returns a markdown
    summary of the best parameters; they are also written to `best_params.json`.
    """
    # Inform the user that something is happening.
    progress(0, desc='Setting Up...')

    current_step = 0   # Number of benchmarked (non-cached) trials so far.
    best_score = 0     # Best benchmark score seen across trials.
    scores_cache = {}  # params-hash -> score, so repeated trials are not re-benchmarked.

    def objective_function(trial):
        nonlocal current_step
        nonlocal best_score

        # Sample one categorical value for every parameter marked for optimization.
        params = {}
        for key, val in Parameters.getInstance().hyperparameters.items():
            if _is_optimization_param(val):
                params[key] = trial.suggest_categorical(key, val['categories'])

        _set_hyperparameters(params)

        params_hash = _get_params_hash(params)

        # If the score for these parameters is cached, reuse it.
        # Bug fix: the cached value must be negated like the fresh path below —
        # Optuna minimizes this objective, and fresh trials return -score, so
        # returning the raw cached score inverted the ranking for repeats.
        if params_hash in scores_cache:
            return -scores_cache[params_hash]

        # Benchmark the current set of parameters.
        score, max_score = benchmark(Path("extensions/superboogav2/benchmark_texts/questions.json"), collector)

        # Cache the (positive) score.
        scores_cache[params_hash] = score

        result = json.dumps(_convert_np_types(params), indent=4)
        result += f'\nScore: {score}/{max_score}'
        logger.debug(result)

        current_step += 1
        best_score = max(best_score, score)
        progress(current_step / parameters.get_optimization_steps(), desc=f'Optimizing... {current_step}/{parameters.get_optimization_steps()}')

        # Optuna minimizes, so higher benchmark scores map to lower objective values.
        return -score

    # Run the optimization.
    study = optuna.create_study()
    study.optimize(objective_function, n_trials=int(parameters.get_optimization_steps()))

    best_params = study.best_params
    _set_hyperparameters(best_params)

    # Convert results to a markdown string.
    str_result = f"## Best parameters:\n\n{_markdown_hyperparams()}\n\n## Score:\n\n{best_score}"

    # Save to JSON file.
    with open('best_params.json', 'w') as fp:
        json.dump(_convert_np_types(best_params), fp, indent=4)

    return str_result
|
369
extensions/superboogav2/parameters.py
Normal file
369
extensions/superboogav2/parameters.py
Normal file
@ -0,0 +1,369 @@
|
|||||||
|
"""
|
||||||
|
This module provides a singleton class `Parameters` that is used to manage all hyperparameters for the embedding application.
|
||||||
|
It expects a JSON file in `extensions/superboogav2/config.json`.
|
||||||
|
|
||||||
|
Each element in the JSON must have a `default` value which will be used for the current run. Elements can have `categories`.
|
||||||
|
These categories define the range in which the optimizer will search. If the element is tagged with `"should_optimize": false`,
|
||||||
|
then the optimizer will only ever use the default value.
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from modules.logging_colors import logger
|
||||||
|
|
||||||
|
|
||||||
|
NUM_TO_WORD_METHOD = 'Number to Word'
|
||||||
|
NUM_TO_CHAR_METHOD = 'Number to Char'
|
||||||
|
NUM_TO_CHAR_LONG_METHOD = 'Number to Multi-Char'
|
||||||
|
|
||||||
|
|
||||||
|
DIST_MIN_STRATEGY = 'Min of Two'
|
||||||
|
DIST_HARMONIC_STRATEGY = 'Harmonic Mean'
|
||||||
|
DIST_GEOMETRIC_STRATEGY = 'Geometric Mean'
|
||||||
|
DIST_ARITHMETIC_STRATEGY = 'Arithmetic Mean'
|
||||||
|
|
||||||
|
|
||||||
|
PREPEND_TO_LAST = 'Prepend to Last Message'
|
||||||
|
APPEND_TO_LAST = 'Append to Last Message'
|
||||||
|
HIJACK_LAST_IN_CONTEXT = 'Hijack Last Message in Context ⚠️ WIP ⚠️ (Works Partially)'
|
||||||
|
|
||||||
|
|
||||||
|
SORT_DISTANCE = 'distance'
|
||||||
|
SORT_ID = 'id'
|
||||||
|
|
||||||
|
|
||||||
|
class Parameters:
    """Singleton owning every hyperparameter, loaded from `extensions/superboogav2/config.json`.

    Each JSON entry carries a `default` value used for the current run and may
    declare `categories` that the optimizer searches over. Symbolic names in the
    JSON (e.g. 'DIST_MIN_STRATEGY') are resolved through `variable_mapping`.
    """

    _instance = None

    # Resolves symbolic names used in the config file to their display values.
    variable_mapping = {
        'NUM_TO_WORD_METHOD': NUM_TO_WORD_METHOD,
        'NUM_TO_CHAR_METHOD': NUM_TO_CHAR_METHOD,
        'NUM_TO_CHAR_LONG_METHOD': NUM_TO_CHAR_LONG_METHOD,
        'DIST_MIN_STRATEGY': DIST_MIN_STRATEGY,
        'DIST_HARMONIC_STRATEGY': DIST_HARMONIC_STRATEGY,
        'DIST_GEOMETRIC_STRATEGY': DIST_GEOMETRIC_STRATEGY,
        'DIST_ARITHMETIC_STRATEGY': DIST_ARITHMETIC_STRATEGY,
        'PREPEND_TO_LAST': PREPEND_TO_LAST,
        'APPEND_TO_LAST': APPEND_TO_LAST,
        'HIJACK_LAST_IN_CONTEXT': HIJACK_LAST_IN_CONTEXT,
    }

    @staticmethod
    def getInstance():
        """Return the singleton instance, creating it on first use."""
        if Parameters._instance is None:
            Parameters()
        return Parameters._instance

    def __init__(self):
        if Parameters._instance is not None:
            raise Exception("This class is a singleton!")
        Parameters._instance = self
        self.hyperparameters = self._load_from_json(Path("extensions/superboogav2/config.json"))

    def _load_from_json(self, file_path):
        """Load the hyperparameter dict from `file_path`, resolving symbolic names."""
        logger.debug('Loading hyperparameters...')

        with open(file_path, 'r') as file:
            data = json.load(file)

        # Replace symbolic variable names, in both defaults and categories.
        for key in data:
            entry = data[key]
            if "default" in entry and entry["default"] in self.variable_mapping:
                entry["default"] = self.variable_mapping[entry["default"]]
            if "categories" in entry:
                entry["categories"] = [self.variable_mapping.get(cat, cat) for cat in entry["categories"]]

        return data
|
||||||
|
|
||||||
|
|
||||||
|
def _get_default(key: str):
    """Return the current default value for hyperparameter `key`."""
    return Parameters.getInstance().hyperparameters[key]['default']


def _set_default(key: str, value):
    """Set the current default value for hyperparameter `key`."""
    Parameters.getInstance().hyperparameters[key]['default'] = value


def should_to_lower() -> bool:
    """Whether preprocessing lowercases the text."""
    return bool(_get_default('to_lower'))


def get_num_conversion_strategy() -> str:
    """The number-conversion method used during preprocessing."""
    return _get_default('num_conversion')


def should_merge_spaces() -> bool:
    """Whether preprocessing collapses repeated whitespace."""
    return bool(_get_default('merge_spaces'))


def should_strip() -> bool:
    """Whether preprocessing strips the text."""
    return bool(_get_default('strip'))


def should_remove_punctuation() -> bool:
    """Whether preprocessing removes punctuation."""
    return bool(_get_default('remove_punctuation'))


def should_remove_stopwords() -> bool:
    """Whether preprocessing removes stopwords."""
    return bool(_get_default('remove_stopwords'))


def should_remove_specific_pos() -> bool:
    """Whether preprocessing removes specific parts of speech."""
    return bool(_get_default('remove_specific_pos'))


def should_lemmatize() -> bool:
    """Whether preprocessing lemmatizes words."""
    return bool(_get_default('lemmatize'))


def get_min_num_sentences() -> int:
    """Minimum number of sentences required per chunk."""
    return int(_get_default('min_num_sent'))


def get_delta_start() -> int:
    """Distance threshold used when deduplicating near-identical chunks."""
    return int(_get_default('delta_start'))


def set_to_lower(value: bool):
    """Set whether preprocessing lowercases the text."""
    _set_default('to_lower', value)


def set_num_conversion_strategy(value: str):
    """Set the number-conversion method used during preprocessing."""
    _set_default('num_conversion', value)


def set_merge_spaces(value: bool):
    """Set whether preprocessing collapses repeated whitespace."""
    _set_default('merge_spaces', value)


def set_strip(value: bool):
    """Set whether preprocessing strips the text."""
    _set_default('strip', value)


def set_remove_punctuation(value: bool):
    """Set whether preprocessing removes punctuation."""
    _set_default('remove_punctuation', value)


def set_remove_stopwords(value: bool):
    """Set whether preprocessing removes stopwords."""
    _set_default('remove_stopwords', value)


def set_remove_specific_pos(value: bool):
    """Set whether preprocessing removes specific parts of speech."""
    _set_default('remove_specific_pos', value)


def set_lemmatize(value: bool):
    """Set whether preprocessing lemmatizes words."""
    _set_default('lemmatize', value)


def set_min_num_sentences(value: int):
    """Set the minimum number of sentences required per chunk."""
    _set_default('min_num_sent', value)


def set_delta_start(value: int):
    """Set the deduplication distance threshold."""
    _set_default('delta_start', value)


def get_chunk_len() -> str:
    """Return the enabled chunk lengths as a comma-separated string.

    Bit i of `chunk_len_mask` enables slot `chunk_len{i+1}`.
    """
    mask = _get_default('chunk_len_mask')
    # Collect each slot whose mask bit is set; falsy slot values are dropped,
    # matching the original behavior. (Avoids shadowing builtin `len`, which
    # the original comprehension did.)
    lens = [_get_default(f'chunk_len{i + 1}') for i in range(4) if mask & (1 << i)]
    return ','.join(str(chunk_len) for chunk_len in lens if chunk_len)


def set_chunk_len(val: str):
    """Parse a comma-separated length list into the four chunk_len slots.

    Lengths are sorted, stored into slots 1..4, and the enable mask is rebuilt.
    Only the first four lengths are used; extras are logged and skipped.
    """
    chunk_lens = sorted(int(part.strip()) for part in val.split(','))

    # Rebuild the mask from scratch: one bit per stored length.
    mask = 0
    for i, chunk_len in enumerate(chunk_lens[:4]):
        _set_default(f'chunk_len{i + 1}', chunk_len)
        mask |= (1 << i)
    _set_default('chunk_len_mask', mask)

    if len(chunk_lens) > 4:
        logger.warning(f'Only up to four chunk lengths are supported. Skipping {chunk_lens[4:]}')


def get_context_len() -> str:
    """Return the left/right context lengths as 'left,right'."""
    return f"{_get_default('context_len_left')},{_get_default('context_len_right')}"


def set_context_len(val: str):
    """Set context lengths from 'n' (symmetric) or 'left,right'; warn otherwise."""
    # NOTE: the digit check is applied to the raw token (before stripping),
    # matching the original behavior.
    context_lens = [int(part.strip()) for part in val.split(',') if part.isdigit()]
    if len(context_lens) == 1:
        _set_default('context_len_left', context_lens[0])
        _set_default('context_len_right', context_lens[0])
    elif len(context_lens) == 2:
        _set_default('context_len_left', context_lens[0])
        _set_default('context_len_right', context_lens[1])
    else:
        logger.warning(f'Incorrect context length received {val}. Skipping.')


def get_new_dist_strategy() -> str:
    """Strategy used to combine old and new distances."""
    return _get_default('new_dist_strategy')


def get_chunk_count() -> int:
    """Number of chunks to retrieve per query."""
    return int(_get_default('chunk_count'))


def get_min_num_length() -> int:
    """Minimum digit count for a number to be converted."""
    return int(_get_default('min_num_length'))


def get_significant_level() -> float:
    """Significance threshold for retrieved chunks."""
    return float(_get_default('significant_level'))


def get_time_steepness() -> float:
    """Steepness of the time-weighting curve."""
    return float(_get_default('time_steepness'))


def get_time_power() -> float:
    """Power of the time-weighting curve."""
    return float(_get_default('time_power'))


def get_chunk_separator() -> str:
    """Separator used to split the corpus into sections before chunking."""
    return _get_default('chunk_separator')


def get_prefix() -> str:
    """Text placed before the injected context."""
    return _get_default('prefix')


def get_data_separator() -> str:
    """Separator placed between injected chunks."""
    return _get_default('data_separator')


def get_postfix() -> str:
    """Text placed after the injected context."""
    return _get_default('postfix')


def get_is_manual() -> bool:
    """Whether manual mode is enabled."""
    return bool(_get_default('manual'))


def get_add_chat_to_data() -> bool:
    """Whether chat messages are added to the data store."""
    return bool(_get_default('add_chat_to_data'))


def get_injection_strategy() -> str:
    """How the retrieved context is injected into the prompt."""
    return _get_default('injection_strategy')


def get_chunk_regex() -> str:
    """Regex whose matches become their own chunks."""
    return _get_default('chunk_regex')


def get_is_strong_cleanup() -> bool:
    """Whether aggressive HTML text cleanup is enabled."""
    return bool(_get_default('strong_cleanup'))


def get_max_token_count() -> int:
    """Maximum token budget for injected context."""
    return int(_get_default('max_token_count'))


def get_num_threads() -> int:
    """Number of download threads."""
    return int(_get_default('threads'))


def get_optimization_steps() -> int:
    """Number of optimizer trials to run."""
    return int(_get_default('optimization_steps'))


def get_api_port() -> int:
    """Port for the extension's API server."""
    return int(_get_default('api_port'))


def get_api_on() -> bool:
    """Whether the extension's API server is enabled."""
    return bool(_get_default('api_on'))


def set_new_dist_strategy(value: str):
    """Set the distance-combination strategy."""
    _set_default('new_dist_strategy', value)


def set_chunk_count(value: int):
    """Set the number of chunks to retrieve per query."""
    _set_default('chunk_count', value)


def set_min_num_length(value: int):
    """Set the minimum digit count for number conversion."""
    _set_default('min_num_length', value)


def set_significant_level(value: float):
    """Set the significance threshold for retrieved chunks."""
    _set_default('significant_level', value)


def set_time_steepness(value: float):
    """Set the steepness of the time-weighting curve."""
    _set_default('time_steepness', value)


def set_time_power(value: float):
    """Set the power of the time-weighting curve."""
    _set_default('time_power', value)


def set_chunk_separator(value: str):
    """Set the section separator used before chunking."""
    _set_default('chunk_separator', value)


def set_prefix(value: str):
    """Set the text placed before the injected context."""
    _set_default('prefix', value)


def set_data_separator(value: str):
    """Set the separator placed between injected chunks."""
    _set_default('data_separator', value)


def set_postfix(value: str):
    """Set the text placed after the injected context."""
    _set_default('postfix', value)


def set_manual(value: bool):
    """Set whether manual mode is enabled."""
    _set_default('manual', value)


def set_add_chat_to_data(value: bool):
    """Set whether chat messages are added to the data store."""
    _set_default('add_chat_to_data', value)


def set_injection_strategy(value: str):
    """Set how the retrieved context is injected into the prompt."""
    _set_default('injection_strategy', value)


def set_chunk_regex(value: str):
    """Set the regex whose matches become their own chunks."""
    _set_default('chunk_regex', value)


def set_strong_cleanup(value: bool):
    """Set whether aggressive HTML text cleanup is enabled."""
    _set_default('strong_cleanup', value)


def set_max_token_count(value: int):
    """Set the maximum token budget for injected context."""
    _set_default('max_token_count', value)


def set_num_threads(value: int):
    """Set the number of download threads."""
    _set_default('threads', value)


def set_optimization_steps(value: int):
    """Set the number of optimizer trials to run."""
    _set_default('optimization_steps', value)


def set_api_port(value: int):
    """Set the port for the extension's API server."""
    _set_default('api_port', value)


def set_api_on(value: bool):
    """Set whether the extension's API server is enabled."""
    _set_default('api_on', value)
|
8
extensions/superboogav2/requirements.txt
Normal file
8
extensions/superboogav2/requirements.txt
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
beautifulsoup4==4.12.2
|
||||||
|
chromadb==0.3.18
|
||||||
|
lxml
|
||||||
|
optuna
|
||||||
|
pandas==2.0.3
|
||||||
|
posthog==2.4.2
|
||||||
|
sentence_transformers==2.2.2
|
||||||
|
spacy
|
355
extensions/superboogav2/script.py
Normal file
355
extensions/superboogav2/script.py
Normal file
@ -0,0 +1,355 @@
|
|||||||
|
"""
|
||||||
|
This file is responsible for the UI and how the application interacts with the rest of the system.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Point to where nltk will find the required data.
|
||||||
|
os.environ['NLTK_DATA'] = str(Path("extensions/superboogav2/nltk_data").resolve())
|
||||||
|
|
||||||
|
import textwrap
|
||||||
|
import codecs
|
||||||
|
import gradio as gr
|
||||||
|
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
from modules.logging_colors import logger
|
||||||
|
from modules import shared
|
||||||
|
|
||||||
|
from .utils import create_metadata_source
|
||||||
|
from .chromadb import make_collector
|
||||||
|
from .download_urls import feed_url_into_collector
|
||||||
|
from .data_processor import process_and_add_to_collector
|
||||||
|
from .benchmark import benchmark
|
||||||
|
from .optimize import optimize
|
||||||
|
from .notebook_handler import input_modifier_internal
|
||||||
|
from .chat_handler import custom_generate_chat_prompt_internal
|
||||||
|
from .api import APIManager
|
||||||
|
|
||||||
|
collector = None
|
||||||
|
api_manager = None
|
||||||
|
|
||||||
|
def setup():
    """Extension entry point, invoked once by the webui when the extension loads.

    Builds the global Chroma collector and the API manager, and immediately
    starts the API server if the saved parameters say the API should be on.
    """
    global collector
    global api_manager
    collector = make_collector()
    api_manager = APIManager(collector)

    if parameters.get_api_on():
        api_manager.start_server(parameters.get_api_port())
|
||||||
|
def _feed_data_into_collector(corpus):
    """Chunk raw pasted text and store it in the collector.

    Generator: yields markdown status strings that gradio streams into the
    `last_updated` component.
    """
    yield '### Processing data...'
    process_and_add_to_collector(corpus, collector, False, create_metadata_source('direct-text'))
    yield '### Done.'
|
|
||||||
|
|
||||||
|
def _feed_file_into_collector(file):
    """Decode an uploaded file and store its text in the collector.

    `file` is the raw bytes gradio provides for a `type='binary'` File input;
    assumes UTF-8 content. Generator yielding UI status strings.
    """
    yield '### Reading and processing the input dataset...'
    text = file.decode('utf-8')
    process_and_add_to_collector(text, collector, False, create_metadata_source('file'))
    yield '### Done.'
|
|
||||||
|
|
||||||
|
def _feed_url_into_collector(urls):
    """Download the newline-separated URLs into the collector.

    Forwards the downloader's progress messages to the UI as they arrive,
    then signals completion.
    """
    for i in feed_url_into_collector(urls, collector):
        yield i
    yield '### Done.'
|
|
||||||
|
|
||||||
|
def _begin_benchmark():
    """Run the bundled Q&A benchmark against the current collector/settings
    and return a markdown score summary for the UI."""
    score, max_score = benchmark(Path("extensions/superboogav2/benchmark_texts/questions.json"), collector)
    return f'**Score**: {score}/{max_score}'
|
|
||||||
|
|
||||||
|
def _begin_optimization(progress=gr.Progress()):
    """Run hyperparameter optimization, reporting progress to gradio.

    NOTE: `gr.Progress()` as a default argument is gradio's documented way of
    injecting a progress tracker — the definition-time evaluation is intended.
    Returns the optimization report plus the new value of every optimizable
    setting so the corresponding UI widgets refresh.
    """
    return optimize(collector, progress), *_get_optimizable_settings()
|
|
||||||
|
|
||||||
|
def _clear_data():
    """Drop every chunk currently stored in the Chroma collector."""
    collector.clear()
    return "### Data Cleared!"
|
|
||||||
|
|
||||||
|
def _get_optimizable_settings() -> list:
    """Collect the current values of every setting the optimizer may tune.

    The returned order must match `optimizable_params` in `ui()`, since the
    optimizer writes these values straight back into those components.
    """
    # (label, predicate) pairs; a label joins the pipeline when its
    # predicate is currently enabled in the parameters.
    pipeline_flags = (
        ('Lower Cases', parameters.should_to_lower),
        ('Remove Punctuation', parameters.should_remove_punctuation),
        ('Remove Adverbs', parameters.should_remove_specific_pos),
        ('Remove Stop Words', parameters.should_remove_stopwords),
        ('Lemmatize', parameters.should_lemmatize),
        ('Merge Spaces', parameters.should_merge_spaces),
        ('Strip Edges', parameters.should_strip),
    )
    preprocess_pipeline = [label for label, enabled in pipeline_flags if enabled()]

    return [
        parameters.get_time_power(),
        parameters.get_time_steepness(),
        parameters.get_significant_level(),
        parameters.get_min_num_sentences(),
        parameters.get_new_dist_strategy(),
        parameters.get_delta_start(),
        parameters.get_min_num_length(),
        parameters.get_num_conversion_strategy(),
        preprocess_pipeline,
        parameters.get_chunk_count(),
        parameters.get_context_len(),
        parameters.get_chunk_len(),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_settings(optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
                    preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count,
                    chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup):
    """Push every UI widget value into the `parameters` module.

    Wired to the `.input` event of every settings component, so it runs on any
    change. Free-text separators arrive with escape sequences typed literally
    (e.g. "\\n") and are decoded via `unicode_escape` before being stored.
    Finally starts or stops the API server to match the API toggle.
    """
    logger.debug('Applying settings.')

    try:
        parameters.set_optimization_steps(optimization_steps)
        parameters.set_significant_level(significant_level)
        parameters.set_min_num_sentences(min_sentences)
        parameters.set_new_dist_strategy(new_dist_strat)
        parameters.set_delta_start(delta_start)
        parameters.set_min_num_length(min_number_length)
        parameters.set_num_conversion_strategy(num_conversion)
        parameters.set_api_port(api_port)
        parameters.set_api_on(api_on)
        parameters.set_injection_strategy(injection_strategy)
        parameters.set_add_chat_to_data(add_chat_to_data)
        parameters.set_manual(manual)
        parameters.set_postfix(codecs.decode(postfix, 'unicode_escape'))
        parameters.set_data_separator(codecs.decode(data_separator, 'unicode_escape'))
        parameters.set_prefix(codecs.decode(prefix, 'unicode_escape'))
        parameters.set_max_token_count(max_token_count)
        parameters.set_time_power(time_power)
        parameters.set_time_steepness(time_steepness)
        parameters.set_chunk_count(chunk_count)
        parameters.set_chunk_separator(codecs.decode(chunk_sep, 'unicode_escape'))
        parameters.set_context_len(context_len)
        parameters.set_chunk_regex(chunk_regex)
        parameters.set_chunk_len(chunk_len)
        parameters.set_num_threads(threads)
        parameters.set_strong_cleanup(strong_cleanup)

        # Map each preprocessing label to its setter, then enable exactly the
        # labels currently ticked (this also explicitly disables the rest).
        preprocess_setters = {
            'Lower Cases': parameters.set_to_lower,
            'Remove Punctuation': parameters.set_remove_punctuation,
            'Remove Adverbs': parameters.set_remove_specific_pos,
            'Remove Stop Words': parameters.set_remove_stopwords,
            'Lemmatize': parameters.set_lemmatize,
            'Merge Spaces': parameters.set_merge_spaces,
            'Strip Edges': parameters.set_strip,
        }
        for label, setter in preprocess_setters.items():
            setter(label in preprocess_pipeline)

        # Based on API on/off, start or stop the server.
        if api_manager is not None:
            if parameters.get_api_on() and (not api_manager.is_server_running()):
                api_manager.start_server(parameters.get_api_port())
            elif (not parameters.get_api_on()) and api_manager.is_server_running():
                api_manager.stop_server()
    except Exception as e:
        # Fix: logger.warn() has been deprecated since Python 3.3; use warning().
        logger.warning(f'Could not properly apply settings: {str(e)}')
|
||||||
|
|
||||||
|
|
||||||
|
def custom_generate_chat_prompt(user_input, state, **kwargs):
    """webui hook (chat mode): delegate prompt construction to the chat
    handler, passing along the global collector."""
    return custom_generate_chat_prompt_internal(user_input, state, collector, **kwargs)
|
|
||||||
|
|
||||||
|
def input_modifier(string):
    """webui hook (notebook/default modes): delegate input rewriting to the
    notebook handler, passing along the global collector."""
    return input_modifier_internal(string, collector)
|
|
||||||
|
|
||||||
|
def ui():
    """Build the extension's gradio UI and wire each component to its handler."""
    # --- Help accordion -----------------------------------------------------
    with gr.Accordion("Click for more information...", open=False):
        gr.Markdown(textwrap.dedent("""

        ## About

        This extension takes a dataset as input, breaks it into chunks, and adds the result to a local/offline Chroma database.

        The database is then queried during inference time to get the excerpts that are closest to your input. The idea is to create an arbitrarily large pseudo context.

        The core methodology was developed and contributed by kaiokendev, who is working on improvements to the method in this repository: https://github.com/kaiokendev/superbig

        ## Data input

        Start by entering some data in the interface below and then clicking on "Load data".

        Each time you load some new data, the old chunks are discarded.

        ## Chat mode

        #### Instruct

        On each turn, the chunks will be compared to your current input and the most relevant matches will be appended to the input in the following format:

        ```
        Consider the excerpts below as additional context:
        ...
        ```

        The injection doesn't make it into the chat history. It is only used in the current generation.

        #### Regular chat

        The chunks from the external data sources are ignored, and the chroma database is built based on the chat history instead. The most relevant past exchanges relative to the present input are added to the context string. This way, the extension acts as a long term memory.

        ## Notebook/default modes

        Your question must be manually specified between `<|begin-user-input|>` and `<|end-user-input|>` tags, and the injection point must be specified with `<|injection-point|>`.

        The special tokens mentioned above (`<|begin-user-input|>`, `<|end-user-input|>`, and `<|injection-point|>`) are removed in the background before the text generation begins.

        Here is an example in Vicuna 1.1 format:

        ```
        A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.

        USER:
        <|injection-point|>

        <|begin-user-input|>What datasets are mentioned in the text above?<|end-user-input|>
        ASSISTANT:
        ```
        """))

    with gr.Row():
        with gr.Column(min_width=600):
            # --- Data-input tabs --------------------------------------------
            with gr.Tab("Text input"):
                data_input = gr.Textbox(lines=20, label='Input data')
                update_data = gr.Button('Load data')

            with gr.Tab("URL input"):
                url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
                strong_cleanup = gr.Checkbox(value=parameters.get_is_strong_cleanup(), label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
                threads = gr.Number(value=parameters.get_num_threads(), label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
                update_url = gr.Button('Load data')

            with gr.Tab("File input"):
                file_input = gr.File(label='Input file', type='binary')
                update_file = gr.Button('Load data')

            # --- Settings tab ----------------------------------------------
            with gr.Tab("Settings"):
                with gr.Accordion("Processing settings", open=True):
                    chunk_len = gr.Textbox(value=parameters.get_chunk_len(), label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
                    chunk_regex = gr.Textbox(value=parameters.get_chunk_regex(), label='Chunk regex', info='Will specifically add the captured text to the embeddings.')
                    context_len = gr.Textbox(value=parameters.get_context_len(), label='Context length', info='In characters, not tokens. How much context to load around each chunk.')
                    # Separators are shown escaped (e.g. "\n") so the user can type them literally.
                    chunk_sep = gr.Textbox(value=codecs.encode(parameters.get_chunk_separator(), 'unicode_escape').decode(), label='Chunk separator', info='Used to manually split chunks. Manually split chunks longer than chunk length are split again. This value is used when you click on "Load data".')

                with gr.Accordion("Generation settings", open=False):
                    chunk_count = gr.Number(value=parameters.get_chunk_count(), label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
                    max_token_count = gr.Number(value=parameters.get_max_token_count(), label='Max Context Tokens', info='The context length in tokens will not exceed this value.')
                    prefix = gr.Textbox(value=codecs.encode(parameters.get_prefix(), 'unicode_escape').decode(), label='Prefix', info='What to put before the injection point.')
                    data_separator = gr.Textbox(value=codecs.encode(parameters.get_data_separator(), 'unicode_escape').decode(), label='Data separator', info='When multiple pieces of distant data are added, they might be unrelated. It\'s important to separate them.')
                    postfix = gr.Textbox(value=codecs.encode(parameters.get_postfix(), 'unicode_escape').decode(), label='Postfix', info='What to put after the injection point.')
                    # Chat-mode-only controls (hidden outside chat mode).
                    with gr.Row():
                        manual = gr.Checkbox(value=parameters.get_is_manual(), label="Is Manual", info="Manually specify when to use ChromaDB. Insert `!c` at the start or end of the message to trigger a query.", visible=shared.is_chat())
                        add_chat_to_data = gr.Checkbox(value=parameters.get_add_chat_to_data(), label="Add Chat to Data", info="Automatically feed the chat history as you chat.", visible=shared.is_chat())
                    injection_strategy = gr.Radio(choices=[parameters.PREPEND_TO_LAST, parameters.APPEND_TO_LAST, parameters.HIJACK_LAST_IN_CONTEXT], value=parameters.get_injection_strategy(), label='Injection Strategy', info='Where to inject the messages in chat or instruct mode.', visible=shared.is_chat())
                    with gr.Row():
                        api_on = gr.Checkbox(value=parameters.get_api_on(), label="Turn on API", info="Check this to turn on the API service.")
                        api_port = gr.Number(value=parameters.get_api_port(), label="API Port", info="The port on which the API service will run.")

                with gr.Accordion("Advanced settings", open=False):
                    # Pre-tick the pipeline checkboxes from the saved parameters.
                    preprocess_set_choices = []
                    if parameters.should_to_lower():
                        preprocess_set_choices.append('Lower Cases')
                    if parameters.should_remove_punctuation():
                        preprocess_set_choices.append('Remove Punctuation')
                    if parameters.should_remove_specific_pos():
                        preprocess_set_choices.append('Remove Adverbs')
                    if parameters.should_remove_stopwords():
                        preprocess_set_choices.append('Remove Stop Words')
                    if parameters.should_lemmatize():
                        preprocess_set_choices.append('Lemmatize')
                    if parameters.should_merge_spaces():
                        preprocess_set_choices.append('Merge Spaces')
                    if parameters.should_strip():
                        preprocess_set_choices.append('Strip Edges')

                    preprocess_pipeline = gr.CheckboxGroup(label='Preprocessing pipeline', choices=[
                        'Lower Cases',
                        'Remove Punctuation',
                        'Remove Adverbs',
                        'Remove Stop Words',
                        'Lemmatize',
                        'Merge Spaces',
                        'Strip Edges',
                    ], value=preprocess_set_choices, interactive=True, info='How to preprocess the text before it is turned into an embedding.')

                    with gr.Row():
                        num_conversion = gr.Dropdown(choices=[parameters.NUM_TO_WORD_METHOD, parameters.NUM_TO_CHAR_METHOD, parameters.NUM_TO_CHAR_LONG_METHOD, 'None'], value=parameters.get_num_conversion_strategy(), label="Number Conversion Method", info='How to preprocess numbers before creating the embeddings.', interactive=True)
                        min_number_length = gr.Number(value=parameters.get_min_num_length(), label='Number Length Threshold', info='In digits. Only numbers that have at least that many digits will be converted.', interactive=True)

                    delta_start = gr.Number(value=parameters.get_delta_start(), label='Delta Start Index', info='If the system encounters two identical embeddings, and they both start within the same delta, then only the first will be considered.', interactive=True)
                    new_dist_strat = gr.Dropdown(choices=[parameters.DIST_MIN_STRATEGY, parameters.DIST_HARMONIC_STRATEGY, parameters.DIST_GEOMETRIC_STRATEGY, parameters.DIST_ARITHMETIC_STRATEGY], value=parameters.get_new_dist_strategy(), label="Distance Strategy", info='When two embedding texts are merged, the distance of the new piece will be decided using one of these strategies.', interactive=True)
                    # NOTE(review): "minumum" / "influencial" typos below are in
                    # user-facing strings and are left untouched here.
                    min_sentences = gr.Number(value=parameters.get_min_num_sentences(), label='Summary Threshold', info='In sentences. The minumum number of sentences to trigger text-rank summarization.', interactive=True)
                    significant_level = gr.Slider(0.8, 2, value=parameters.get_significant_level(), label='Significant Level', info='Defines the cut-off for what is considered a "significant" distance relative to the median distance among the returned samples.', interactive=True)
                    time_steepness = gr.Slider(0.01, 1.0, value=parameters.get_time_steepness(), label='Time Weighing Steepness', info='How differently two close excerpts are going to be weighed.')
                    time_power = gr.Slider(0.0, 1.0, value=parameters.get_time_power(), label='Time Weighing Power', info='How influencial is the weighing. At 1.0, old entries won\'t be considered')

            with gr.Tab("Benchmark"):
                benchmark_button = gr.Button('Benchmark')
                optimize_button = gr.Button('Optimize')
                optimization_steps = gr.Number(value=parameters.get_optimization_steps(), label='Optimization Steps', info='For how many steps to optimize.', interactive=True)

            clear_button = gr.Button('❌ Clear Data')

        with gr.Column():
            # Status area that every handler writes its markdown result into.
            last_updated = gr.Markdown()

    # The full component list handed to _apply_settings on any change; the
    # order must match _apply_settings' parameter order exactly.
    all_params = [optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
                  preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count,
                  chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup]
    # Components the optimizer may overwrite; order must match
    # _get_optimizable_settings().
    optimizable_params = [time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
                          preprocess_pipeline, chunk_count, context_len, chunk_len]

    update_data.click(_feed_data_into_collector, [data_input], last_updated, show_progress=False)
    update_url.click(_feed_url_into_collector, [url_input], last_updated, show_progress=False)
    update_file.click(_feed_file_into_collector, [file_input], last_updated, show_progress=False)
    benchmark_button.click(_begin_benchmark, [], last_updated, show_progress=True)
    optimize_button.click(_begin_optimization, [], [last_updated] + optimizable_params, show_progress=True)
    clear_button.click(_clear_data, [], last_updated, show_progress=False)

    # Re-apply all settings whenever any single settings component changes.
    optimization_steps.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    time_power.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    time_steepness.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    significant_level.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    min_sentences.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    new_dist_strat.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    delta_start.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    min_number_length.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    num_conversion.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    preprocess_pipeline.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    api_port.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    api_on.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    injection_strategy.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    add_chat_to_data.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    manual.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    postfix.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    data_separator.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    prefix.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    max_token_count.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    chunk_count.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    chunk_sep.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    context_len.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    chunk_regex.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    chunk_len.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    threads.input(fn=_apply_settings, inputs=all_params, show_progress=False)
    strong_cleanup.input(fn=_apply_settings, inputs=all_params, show_progress=False)
|
16
extensions/superboogav2/utils.py
Normal file
16
extensions/superboogav2/utils.py
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
"""
|
||||||
|
This module contains common functions across multiple other modules.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import extensions.superboogav2.parameters as parameters
|
||||||
|
|
||||||
|
# Create the context using the prefix + data_separator + postfix from parameters.
|
||||||
|
def create_context_text(results):
    """Assemble the injected context block: prefix, then the result excerpts
    joined with the data separator, then the postfix (all from parameters)."""
    joined_results = parameters.get_data_separator().join(results)
    return ''.join((parameters.get_prefix(), joined_results, parameters.get_postfix()))
|
||||||
|
|
||||||
|
|
||||||
|
# Create metadata with the specified source
|
||||||
|
def create_metadata_source(source: str):
    """Wrap a source label in the metadata dict shape the collector expects."""
    return dict(source=source)
|
@ -195,7 +195,7 @@ def update_requirements(initial_installation=False):
|
|||||||
print("Installing extensions requirements.")
|
print("Installing extensions requirements.")
|
||||||
extensions = next(os.walk("extensions"))[1]
|
extensions = next(os.walk("extensions"))[1]
|
||||||
for extension in extensions:
|
for extension in extensions:
|
||||||
if extension in ['superbooga']: # No wheels available for requirements
|
if extension in ['superbooga', 'superboogav2']: # No wheels available for requirements
|
||||||
continue
|
continue
|
||||||
|
|
||||||
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
|
extension_req_path = os.path.join("extensions", extension, "requirements.txt")
|
||||||
|
Loading…
Reference in New Issue
Block a user