Original idea by GuizzyQC, improved with bs4

Based on the Searx extension from https://github.com/GuizzyQC/text-generation-webui, with the page-text extraction reworked to use BeautifulSoup (bs4).
This commit is contained in:
catalpaaa 2023-04-20 16:38:33 -07:00
parent 96ba55501c
commit f99e1b102d
2 changed files with 125 additions and 0 deletions

@@ -0,0 +1 @@
trafilatura

@@ -0,0 +1,124 @@
import json
import re
from urllib.request import urlopen

import requests
import torch
from bs4 import BeautifulSoup
from bs4.element import Comment

import gradio as gr
import modules.chat as chat
import modules.shared as shared

torch._C._jit_set_profiling_mode(False)

params = {  # These can all be set in the settings.yml file
    'enable_search': True,
    'include_first_result_content': True,
    'include_result_summary': False,
    'max_characters_per_page': 4096,
    'searx_instance': "",
    'max_total_characters': 8192,
    'extra_query_information': "",
    'removal_list': ['\t', '\n', '\\n', '\\t'],
    'number_of_results': 1,
    'console_log': True
}
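
# HTML elements whose contents should not appear in the extracted page text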
html_element_blacklist = [
    '[document]',
    'noscript',
    'header',
    'meta',
    'head',
    'input',
    'script',
    'style'
]
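
# Fetch a page with urlopen and return its visible text, truncated to max_characters_per_page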
def url_to_text(url):
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.find_all(html_element_blacklist):
        tag.decompose()  # Drop scripts, styles, headers, etc. before extracting text
    text = soup.get_text(separator=" ", strip=True)
    for string in params['removal_list']:
        text = text.replace(string, " ")
    return text[:params['max_characters_per_page']]
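
# Example: "Tell me about the Hubble telescope" yields the search term "the Hubble telescope"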
def get_search_term(string):  # Checks whether the input looks like "search/tell me ... about <subject>" and returns the subject if so
    commands = ['search', 'tell me', 'give me a summary']
    marker = ['about']
    lowstr = string.lower()
    for s in ['\"', '\'']:
        lowstr = lowstr.replace(s, '')
    if any(command in lowstr for command in commands) and any(case in lowstr for case in marker):
        print("Found search term")
        parts = re.split('about', string, maxsplit=1, flags=re.IGNORECASE)
        if len(parts) > 1:
            return parts[1].strip()
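
# The configured instance must allow the JSON output format (format=json); many public
# SearX/SearXNG instances disable it, in which case the fallback message below is returned.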
def search_string(search_term):  # Main logic: sends the API request to Searx and returns the text to add to the context
    print("Searching about " + search_term + "...")
    query = f"{search_term} {params['extra_query_information']}".strip()
    r = requests.get(params['searx_instance'], params={'q': query, 'format': 'json', 'pageno': '1'})
    try:
        searchdata = r.json()
        searchdata = searchdata['results']
    except (ValueError, KeyError):  # No usable JSON came back
        new_context = "Tell me that you could not find the results I asked for."
    else:
        new_context = f"This is new information from after your knowledge cutoff date about {search_term}:\n"
        if params['include_first_result_content']:
            # Don't index past the number of results the instance actually returned
            for i in range(min(params['number_of_results'], len(searchdata))):
                new_context += url_to_text(searchdata[i]['url']) + "\n"
        if params['include_result_summary']:
            for result in searchdata:
                if 'content' in result:
                    new_context += result['content'] + "\n"
        new_context = new_context[:params['max_total_characters']]
    finally:
        if params['console_log']:
            print(new_context)
    return new_context
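
# input_modifier() is a standard text-generation-webui extension hook: it receives the
# user's input and returns the string that is actually sent to the model.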
def input_modifier(string):
    if params['enable_search'] and params['searx_instance']:
        search_term = get_search_term(string)
        if search_term:
            search_result = search_string(search_term)
            if search_result == "Tell me that you could not find the results I asked for.":  # The search failed, so ask the LLM to tell the user that
                return search_result
            else:
                return f"{search_result} Using the information I just gave you, without adding anything new, respond to this request: {string}"
    return string
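
# ui() is the extension hook the webui calls to add this extension's controls to the Gradio interface.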
def ui():
    with gr.Accordion("Searx Integration", open=True):
        with gr.Row():
            with gr.Column():
                enable_search = gr.Checkbox(value=params['enable_search'], label='Activate Searx integration')
                console_log = gr.Checkbox(value=params['console_log'], label='Display search results on console')
                include_first_result_content = gr.Checkbox(value=params['include_first_result_content'], label='Include content from the first result')
                number_of_results = gr.Slider(1, 10, value=params['number_of_results'], step=1, label='Number of results to fetch')
            with gr.Column():
                searx_instance = gr.Textbox(placeholder=params['searx_instance'], value=params['searx_instance'], label='Searx instance address')
                extra_query_information = gr.Textbox(placeholder=params['extra_query_information'], value=params['extra_query_information'], label='Extra info to pass in the Searx query')
                include_result_summary = gr.Checkbox(value=params['include_result_summary'], label='Include summary from each search result')
                max_characters_per_page = gr.Slider(256, 16384, value=params['max_characters_per_page'], step=64, label='Maximum characters per fetched page')
                max_total_characters = gr.Slider(256, 16384, value=params['max_total_characters'], step=64, label='Total maximum characters')

    # Keep the params dict in sync with the UI controls
    enable_search.change(lambda x: params.update({"enable_search": x}), enable_search, None)
    console_log.change(lambda x: params.update({"console_log": x}), console_log, None)
    include_first_result_content.change(lambda x: params.update({"include_first_result_content": x}), include_first_result_content, None)
    include_result_summary.change(lambda x: params.update({"include_result_summary": x}), include_result_summary, None)
    number_of_results.change(lambda x: params.update({"number_of_results": x}), number_of_results, None)
    max_characters_per_page.change(lambda x: params.update({"max_characters_per_page": x}), max_characters_per_page, None)
    searx_instance.change(lambda x: params.update({"searx_instance": x}), searx_instance, None)
    extra_query_information.change(lambda x: params.update({"extra_query_information": x}), extra_query_information, None)
    max_total_characters.change(lambda x: params.update({"max_total_characters": x}), max_total_characters, None)
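
For a quick manual check of the search path outside the chat UI, a sketch along these lines should work when run from the text-generation-webui root so that the modules package resolves; the module name, instance URL, and prompt below are placeholders, and the instance must allow format=json:

# Hypothetical smoke test; not part of this commit.
import script  # assumes the file above was saved as script.py in this extension's folder

script.params['searx_instance'] = "https://searx.example.org/search"  # placeholder instance search URL
prompt = "Tell me about the James Webb Space Telescope"
print(script.input_modifier(prompt)[:500])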