import json
import re
import textwrap

import gradio as gr
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from modules import chat, shared
from modules.logging_colors import logger

from .chromadb import add_chunks_to_collector, make_collector
from .download_urls import download_urls

params = {
    'chunk_count': 5,
    'chunk_count_initial': 10,
    'time_weight': 0,
    'chunk_length': 700,
    'chunk_separator': '',
    'strong_cleanup': False,
    'semantic_cleanup': True,
    'semantic_weight': 0.5,
    'threads': 4,
}

collector = make_collector()
chat_collector = make_collector()
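
# 'collector' holds the chunks loaded from the external data sources (text, URL,
# file and search inputs), while 'chat_collector' is rebuilt from the chat
# history in regular chat mode and acts as a long-term memory.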


def feed_data_into_collector(corpus, chunk_len, chunk_sep):
    global collector

    # Defining variables
    chunk_len = int(chunk_len)
    chunk_sep = chunk_sep.replace(r'\n', '\n')
    cumulative = ''

    # Breaking the data into chunks and adding those to the db
    cumulative += "Breaking the input dataset...\n\n"
    yield cumulative
    if chunk_sep:
        data_chunks = corpus.split(chunk_sep)
        data_chunks = [[data_chunk[i:i + chunk_len] for i in range(0, len(data_chunk), chunk_len)] for data_chunk in data_chunks]
        data_chunks = [x for y in data_chunks for x in y]
    else:
        data_chunks = [corpus[i:i + chunk_len] for i in range(0, len(corpus), chunk_len)]

    cumulative += f"{len(data_chunks)} chunks have been found.\n\nAdding the chunks to the database...\n\n"
    yield cumulative
    add_chunks_to_collector(data_chunks, collector)
    cumulative += "Done."
    yield cumulative
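
# Illustrative note (not executed): with no separator, the corpus is cut into
# fixed-size character windows, e.g.
#
#   corpus = 'a' * 1500
#   chunks = [corpus[i:i + 700] for i in range(0, len(corpus), 700)]
#   # -> 3 chunks of lengths 700, 700 and 100
#
# If a chunk separator is set, the corpus is first split on it and each piece is
# then windowed independently in the same way.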


def feed_file_into_collector(file, chunk_len, chunk_sep):
    yield 'Reading the input dataset...\n\n'
    text = file.decode('utf-8')
    for i in feed_data_into_collector(text, chunk_len, chunk_sep):
        yield i
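
# Note: the uploaded file is decoded as UTF-8, so this path expects plain-text
# input rather than binary formats such as PDF.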


def feed_url_into_collector(urls, chunk_len, chunk_sep, strong_cleanup, threads):
    all_text = ''
    cumulative = ''

    urls = urls.strip().split('\n')
    cumulative += f'Loading {len(urls)} URLs with {threads} threads...\n\n'
    yield cumulative
    for update, contents in download_urls(urls, threads=threads):
        yield cumulative + update

    cumulative += 'Processing the HTML sources...'
    yield cumulative
    for content in contents:
        soup = BeautifulSoup(content, features="html.parser")
        for script in soup(["script", "style"]):
            script.extract()

        strings = soup.stripped_strings
        if strong_cleanup:
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        text = '\n'.join([s.strip() for s in strings])
        all_text += text

    for i in feed_data_into_collector(all_text, chunk_len, chunk_sep):
        yield i
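
# Note on 'strong_cleanup': the filter above keeps only strings that contain a
# Latin letter followed by a space, which tends to drop short navigation labels,
# dates and other boilerplate while keeping sentence-like, long-form text.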


def calculate_semantic_similarity(query_embedding, target_embedding):
    # Calculate the cosine similarity between the query embedding and the target embedding
    similarity = cosine_similarity(query_embedding.reshape(1, -1), target_embedding.reshape(1, -1))
    return similarity[0][0]
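
# Minimal usage sketch (illustrative, not executed at import time), assuming the
# same SentenceTransformer model used below:
#
#   model = SentenceTransformer('all-MiniLM-L6-v2')
#   a = model.encode(['solar power plants'])[0]
#   b = model.encode(['renewable energy generation'])[0]
#   calculate_semantic_similarity(a, b)  # in [-1, 1]; higher = more similar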


def feed_search_into_collector(query, chunk_len, chunk_sep, strong_cleanup, semantic_cleanup, semantic_requirement, threads):
    # Load the API credentials from the config file
    with open('custom_search_engine_keys.json') as key_file:
        key = json.load(key_file)

    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode([query])[0]

    # Set up the API endpoint and parameters
    url = "https://www.googleapis.com/customsearch/v1"

    # Retrieve the values from the config dictionary
    search_params = {
        "key": key.get("key", "default_key_value"),
        "cx": key.get("cx", "default_custom_engine_value"),
        "q": str(query),
    }

    if search_params["key"] == "default_key_value":
        print("You need to provide an API key by modifying custom_search_engine_keys.json in oobabooga_windows\\text-generation-webui.\nSkipping search")
        yield "No API key found in custom_search_engine_keys.json. Skipping search."
        return

    if search_params["cx"] == "default_custom_engine_value":
        print("You need to provide a CSE ID by modifying custom_search_engine_keys.json in oobabooga_windows\\text-generation-webui.\nSkipping search")
        yield "No CSE ID found in custom_search_engine_keys.json. Skipping search."
        return

    # Send the API request and parse the JSON response
    response = requests.get(url, params=search_params)
    data = response.json()

    # Get the result items (the API returns up to 10 results per request)
    search_items = data.get("items", [])

    # Iterate over the results and collect the URLs to download
    urls = ""
    for search_item in search_items:
        if semantic_cleanup:
            # Use the page title and snippet to semantically weight the search result
            title = search_item.get("title")
            snippet = search_item.get("snippet")

            target_sentence = str(title) + " " + str(snippet)
            target_embedding = model.encode([target_sentence])[0]

            similarity_score = calculate_semantic_similarity(query_embedding, target_embedding)
            if similarity_score < semantic_requirement:
                continue

        # Extract the page URL and add it to the URLs to download
        link = search_item.get("link")
        urls += link + "\n"

    # Call the original feed_url_into_collector function instead of duplicating the code
    result_generator = feed_url_into_collector(urls, chunk_len, chunk_sep, strong_cleanup, threads)

    # Consume the yielded values
    for result in result_generator:
        yield result
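
# The config file read above is expected to look like this (values come from the
# custom search engine setup described in the UI accordion below):
#
#   {
#       "key": "Custom search engine key",
#       "cx": "Custom search engine cx number"
#   }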


def apply_settings(chunk_count, chunk_count_initial, time_weight):
    global params
    params['chunk_count'] = int(chunk_count)
    params['chunk_count_initial'] = int(chunk_count_initial)
    params['time_weight'] = time_weight
    settings_to_display = {k: params[k] for k in params if k in ['chunk_count', 'chunk_count_initial', 'time_weight']}
    yield f"The following settings are now active: {str(settings_to_display)}"


def custom_generate_chat_prompt(user_input, state, **kwargs):
    global chat_collector

    if state['mode'] == 'instruct':
        results = collector.get_sorted(user_input, n_results=params['chunk_count'])
        additional_context = '\nYour reply should be based on the context below:\n\n' + '\n'.join(results)
        user_input += additional_context
        logger.info(f'\n\n=========\nAdding the following new context:\n{additional_context}\n=========\n')
    else:

        def make_single_exchange(id_):
            output = ''
            output += f"{state['name1']}: {shared.history['internal'][id_][0]}\n"
            output += f"{state['name2']}: {shared.history['internal'][id_][1]}\n"
            return output

        if len(shared.history['internal']) > params['chunk_count'] and user_input != '':
            chunks = []
            hist_size = len(shared.history['internal'])
            for i in range(hist_size - 1):
                chunks.append(make_single_exchange(i))

            add_chunks_to_collector(chunks, chat_collector)
            query = '\n'.join(shared.history['internal'][-1] + [user_input])
            try:
                best_ids = chat_collector.get_ids_sorted(query, n_results=params['chunk_count'], n_initial=params['chunk_count_initial'], time_weight=params['time_weight'])
                additional_context = '\n'
                for id_ in best_ids:
                    if shared.history['internal'][id_][0] != '<|BEGIN-VISIBLE-CHAT|>':
                        additional_context += make_single_exchange(id_)

                logger.warning(f'Adding the following new context:\n{additional_context}')
                state['context'] = state['context'].strip() + '\n' + additional_context
                kwargs['history'] = {
                    'internal': [shared.history['internal'][i] for i in range(hist_size) if i not in best_ids],
                    'visible': ''
                }
            except RuntimeError:
                logger.error("Couldn't query the database, moving on...")

    return chat.generate_chat_prompt(user_input, state, **kwargs)
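
# In instruct mode, the most relevant document chunks are appended directly to
# the user input; in regular chat mode, the most relevant past exchanges are
# removed from the prompt history and injected into the context string instead,
# which is what gives the extension its long-term memory behaviour.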


def remove_special_tokens(string):
    pattern = r'(<\|begin-user-input\|>|<\|end-user-input\|>|<\|injection-point\|>)'
    return re.sub(pattern, '', string)
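
# e.g. remove_special_tokens('<|begin-user-input|>Hello<|end-user-input|>')
# returns 'Hello'.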


def input_modifier(string):
    if shared.is_chat():
        return string

    # Find the user input
    pattern = re.compile(r"<\|begin-user-input\|>(.*?)<\|end-user-input\|>", re.DOTALL)
    match = re.search(pattern, string)
    if match:
        user_input = match.group(1).strip()

        # Get the most similar chunks
        results = collector.get_sorted(user_input, n_results=params['chunk_count'])

        # Make the injection
        string = string.replace('<|injection-point|>', '\n'.join(results))

    return remove_special_tokens(string)


def ui():
    with gr.Accordion("Click for more information...", open=False):
        gr.Markdown(textwrap.dedent("""

        ## About

        This extension takes a dataset as input, breaks it into chunks, and adds the result to a local/offline Chroma database.

        The database is then queried during inference time to get the excerpts that are closest to your input. The idea is to create an arbitrarily large pseudo context.

        The core methodology was developed and contributed by kaiokendev, who is working on improvements to the method in this repository: https://github.com/kaiokendev/superbig

        ## Data input

        Start by entering some data in the interface below and then clicking on "Load data".

        Each time you load some new data, the old chunks are discarded.

        ## Chat mode

        #### Instruct

        On each turn, the chunks will be compared to your current input and the most relevant matches will be appended to the input in the following format:

        ```
        Consider the excerpts below as additional context:
        ...
        ```

        The injection doesn't make it into the chat history. It is only used in the current generation.

        #### Regular chat

        The chunks from the external data sources are ignored, and the chroma database is built based on the chat history instead. The most relevant past exchanges relative to the present input are added to the context string. This way, the extension acts as a long term memory.

        ## Notebook/default modes

        Your question must be manually specified between `<|begin-user-input|>` and `<|end-user-input|>` tags, and the injection point must be specified with `<|injection-point|>`.

        The special tokens mentioned above (`<|begin-user-input|>`, `<|end-user-input|>`, and `<|injection-point|>`) are removed in the background before the text generation begins.

        Here is an example in Vicuna 1.1 format:

        ```
        A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.

        USER:

        <|begin-user-input|>
        What datasets are mentioned in the text below?
        <|end-user-input|>

        <|injection-point|>

        ASSISTANT:
        ```

        ⚠️ For best results, make sure to remove the spaces and new line characters after `ASSISTANT:`.

        *This extension is currently experimental and under development.*
        """))

    with gr.Row():
        with gr.Column(min_width=600):
            with gr.Tab("Text input"):
                data_input = gr.Textbox(lines=20, label='Input data')
                update_data = gr.Button('Load data')

            with gr.Tab("URL input"):
                url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
                strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
                threads = gr.Number(value=params['threads'], label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
                update_url = gr.Button('Load data')

            with gr.Tab("File input"):
                file_input = gr.File(label='Input file', type='binary')
                update_file = gr.Button('Load data')

            with gr.Tab("Search input"):
                search_term = gr.Textbox(lines=1, label='Search Input', info='Enter a Google search query. The returned results will be fed into the DB.')
                search_strong_cleanup = gr.Checkbox(value=params['strong_cleanup'], label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
                semantic_cleanup = gr.Checkbox(value=params['semantic_cleanup'], label='Require semantic similarity', info='Only download pages whose titles/snippets are semantically similar to the search query.')
                semantic_requirement = gr.Slider(0, 1, value=params['semantic_weight'], label='Semantic similarity requirement', info='Defines the strictness of the semantic filter. 0 = no culling of dissimilar pages.')
                search_threads = gr.Number(value=params['threads'], label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
                update_search = gr.Button('Load data')

                with gr.Accordion("Click for more information...", open=False):
                    gr.Markdown(textwrap.dedent("""

                    ## Installation/setup

                    Follow the instructions found here to set up a custom search engine with Google: https://www.thepythoncode.com/article/use-google-custom-search-engine-api-in-python

                    Then create a file called "custom_search_engine_keys.json", paste the following text into it, and replace the values with your key and cx from the previous step:

                    ```
                    {
                        "key": "Custom search engine key",
                        "cx": "Custom search engine cx number"
                    }
                    ```

                    ## Usage

                    Enter a search query above and press the "Load data" button. The search results will be added to the local chromaDB to be read into context at runtime.
                    """))

            with gr.Tab("Generation settings"):
                chunk_count = gr.Number(value=params['chunk_count'], label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
                gr.Markdown('Time weighting (optional, used to make recently added chunks more likely to appear)')
                time_weight = gr.Slider(0, 1, value=params['time_weight'], label='Time weight', info='Defines the strength of the time weighting. 0 = no time weighting.')
                chunk_count_initial = gr.Number(value=params['chunk_count_initial'], label='Initial chunk count', info='The number of closest-matching chunks retrieved for time weight reordering in chat mode. This should be >= chunk count. -1 = All chunks are retrieved. Only used if time_weight > 0.')

                update_settings = gr.Button('Apply changes')

            chunk_len = gr.Number(value=params['chunk_length'], label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
            chunk_sep = gr.Textbox(value=params['chunk_separator'], label='Chunk separator', info='Used to manually split chunks. Manually split chunks longer than chunk length are split again. This value is used when you click on "Load data".')

        with gr.Column():
            last_updated = gr.Markdown()

    update_data.click(feed_data_into_collector, [data_input, chunk_len, chunk_sep], last_updated, show_progress=False)
    update_url.click(feed_url_into_collector, [url_input, chunk_len, chunk_sep, strong_cleanup, threads], last_updated, show_progress=False)
    update_file.click(feed_file_into_collector, [file_input, chunk_len, chunk_sep], last_updated, show_progress=False)
    update_search.click(feed_search_into_collector, [search_term, chunk_len, chunk_sep, search_strong_cleanup, semantic_cleanup, semantic_requirement, search_threads], last_updated, show_progress=False)
    update_settings.click(apply_settings, [chunk_count, chunk_count_initial, time_weight], last_updated, show_progress=False)