Original idea by GuizzyQC, improved with bs4

Based on the Searx extension from https://github.com/GuizzyQC/text-generation-webui, with the page-text extraction reworked to use BeautifulSoup (bs4).
This commit is contained in:
catalpaaa 2023-04-20 16:38:33 -07:00
parent 96ba55501c
commit f99e1b102d
2 changed files with 125 additions and 0 deletions

@@ -0,0 +1 @@
trafilatura

@@ -0,0 +1,124 @@
import json
import re
from urllib.request import urlopen

import requests
import torch
from bs4 import BeautifulSoup
from bs4.element import Comment

import gradio as gr
import modules.chat as chat
import modules.shared as shared

torch._C._jit_set_profiling_mode(False)

params = {  # These can all be set in the settings.yml file
    'enable_search': True,
    'include_first_result_content': True,
    'include_result_summary': False,
    'max_characters_per_page': 4096,
    'searx_instance': "",
    'max_total_characters': 8192,
    'extra_query_information': "",
    'removal_list': ['\t', '\n', '\\n', '\\t'],
    'number_of_results': 1,
    'console_log': True
}
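
# HTML elements whose contents should not appear in the extracted page text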
html_element_blacklist = [
    '[document]',
    'noscript',
    'header',
    'meta',
    'head',
    'input',
    'script',
    'style'
]
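
# Fetch a page with urlopen and return its visible text, truncated to max_characters_per_page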
def url_to_text(url):
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.find_all(html_element_blacklist):
        tag.decompose()  # Drop scripts, styles, headers, etc. before extracting text
    text = soup.get_text(separator=" ", strip=True)
    for string in params['removal_list']:
        text = text.replace(string, " ")
    return text[:params['max_characters_per_page']]
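
# Example: "Tell me about the Hubble telescope" yields the search term "the Hubble telescope"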
def get_search_term(string):  # Checks whether the input looks like "search/tell me ... about <subject>" and returns the subject if so
    commands = ['search', 'tell me', 'give me a summary']
    marker = ['about']
    lowstr = string.lower()
    for s in ['\"', '\'']:
        lowstr = lowstr.replace(s, '')
    if any(command in lowstr for command in commands) and any(case in lowstr for case in marker):
        print("Found search term")
        parts = re.split('about', string, maxsplit=1, flags=re.IGNORECASE)
        if len(parts) > 1:
            return parts[1].strip()
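
# The configured instance must allow the JSON output format (format=json); many public
# SearX/SearXNG instances disable it, in which case the fallback message below is returned.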
def search_string(search_term):  # Main logic: sends the API request to Searx and returns the text to add to the context
    print("Searching about " + search_term + "...")
    query = f"{search_term} {params['extra_query_information']}".strip()
    r = requests.get(params['searx_instance'], params={'q': query, 'format': 'json', 'pageno': '1'})
    try:
        searchdata = r.json()
        searchdata = searchdata['results']
    except (ValueError, KeyError):  # No usable JSON came back
        new_context = "Tell me that you could not find the results I asked for."
    else:
        new_context = f"This is new information from after your knowledge cutoff date about {search_term}:\n"
        if params['include_first_result_content']:
            # Don't index past the number of results the instance actually returned
            for i in range(min(params['number_of_results'], len(searchdata))):
                new_context += url_to_text(searchdata[i]['url']) + "\n"
        if params['include_result_summary']:
            for result in searchdata:
                if 'content' in result:
                    new_context += result['content'] + "\n"
        new_context = new_context[:params['max_total_characters']]
    finally:
        if params['console_log']:
            print(new_context)
    return new_context
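
# input_modifier() is a standard text-generation-webui extension hook: it receives the
# user's input and returns the string that is actually sent to the model.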
def input_modifier(string):
    if params['enable_search'] and params['searx_instance']:
        search_term = get_search_term(string)
        if search_term:
            search_result = search_string(search_term)
            if search_result == "Tell me that you could not find the results I asked for.":  # The search failed, so ask the LLM to tell the user that
                return search_result
            else:
                return f"{search_result} Using the information I just gave you, without adding anything new, respond to this request: {string}"
    return string
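
# ui() is the extension hook the webui calls to add this extension's controls to the Gradio interface.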
def ui():
    with gr.Accordion("Searx Integration", open=True):
        with gr.Row():
            with gr.Column():
                enable_search = gr.Checkbox(value=params['enable_search'], label='Activate Searx integration')
                console_log = gr.Checkbox(value=params['console_log'], label='Display search results on console')
                include_first_result_content = gr.Checkbox(value=params['include_first_result_content'], label='Include content from the first result')
                number_of_results = gr.Slider(1, 10, value=params['number_of_results'], step=1, label='Number of results to fetch')
            with gr.Column():
                searx_instance = gr.Textbox(placeholder=params['searx_instance'], value=params['searx_instance'], label='Searx instance address')
                extra_query_information = gr.Textbox(placeholder=params['extra_query_information'], value=params['extra_query_information'], label='Extra info to pass in the Searx query')
                include_result_summary = gr.Checkbox(value=params['include_result_summary'], label='Include summary from each search result')
                max_characters_per_page = gr.Slider(256, 16384, value=params['max_characters_per_page'], step=64, label='Maximum characters per fetched page')
                max_total_characters = gr.Slider(256, 16384, value=params['max_total_characters'], step=64, label='Total maximum characters')

    # Keep the params dict in sync with the UI controls
    enable_search.change(lambda x: params.update({"enable_search": x}), enable_search, None)
    console_log.change(lambda x: params.update({"console_log": x}), console_log, None)
    include_first_result_content.change(lambda x: params.update({"include_first_result_content": x}), include_first_result_content, None)
    include_result_summary.change(lambda x: params.update({"include_result_summary": x}), include_result_summary, None)
    number_of_results.change(lambda x: params.update({"number_of_results": x}), number_of_results, None)
    max_characters_per_page.change(lambda x: params.update({"max_characters_per_page": x}), max_characters_per_page, None)
    searx_instance.change(lambda x: params.update({"searx_instance": x}), searx_instance, None)
    extra_query_information.change(lambda x: params.update({"extra_query_information": x}), extra_query_information, None)
    max_total_characters.change(lambda x: params.update({"max_total_characters": x}), max_total_characters, None)
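
For a quick manual check of the search path outside the chat UI, a sketch along these lines should work when run from the text-generation-webui root so that the modules package resolves; the module name, instance URL, and prompt below are placeholders, and the instance must allow format=json:

# Hypothetical smoke test; not part of this commit.
import script  # assumes the file above was saved as script.py in this extension's folder

script.params['searx_instance'] = "https://searx.example.org/search"  # placeholder instance search URL
prompt = "Tell me about the James Webb Space Telescope"
print(script.input_modifier(prompt)[:500])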