mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2024-11-28 18:48:04 +01:00
original idea by GuizzyQC, improved with bs4
https://github.com/GuizzyQC/text-generation-webui but bs4
This commit is contained in:
parent
96ba55501c
commit
f99e1b102d
1
extensions/searx_integration/requirements.txt
Normal file
1
extensions/searx_integration/requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
trafilatura
|
124
extensions/searx_integration/script.py
Normal file
124
extensions/searx_integration/script.py
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
import json
|
||||||
|
import requests
|
||||||
|
import re
|
||||||
|
from urllib.request import urlopen
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
|
||||||
|
import gradio as gr
|
||||||
|
import modules.chat as chat
|
||||||
|
import modules.shared as shared
|
||||||
|
import requests
|
||||||
|
import torch
|
||||||
|
|
||||||
|
# NOTE(review): private torch API — turns off TorchScript JIT profiling mode,
# presumably to avoid profiling warm-up overhead in the webui; confirm it is
# still required with the torch version in use.
torch._C._jit_set_profiling_mode(False)
|
||||||
|
|
||||||
|
params = { # These can all be set in the settings.yml file
    'enable_search': True,                        # master switch for the extension
    'include_first_result_content': True,         # fetch and inline full page text of the top result(s)
    'include_result_summary': False,              # also append each result's 'content' snippet from Searx
    'max_characters_per_page': 4096,              # truncation limit applied per fetched page (url_to_text)
    'searx_instance': "",                         # base URL of the Searx instance; search is skipped when empty
    'max_total_characters': 8192,                 # truncation limit for the whole injected context
    'extra_query_information': "",                # extra terms appended to every Searx query
    'removal_list': ['\t', '\n', '\\n', '\\t'],   # substrings replaced by spaces in fetched page text
    'number_of_results': 1,                       # how many result pages to fetch when inlining content
    'console_log': True                           # echo the injected context to the console
}
|
||||||
|
|
||||||
|
# Tags whose subtrees are removed before text extraction in url_to_text(),
# so boilerplate (scripts, styles, metadata) does not pollute the context.
html_element_blacklist = [
    '[document]',
    'noscript',
    'header',
    'meta',
    'head',
    'input',
    'script',
    'style'
]
|
||||||
|
|
||||||
|
|
||||||
|
def url_to_text(url):
    """Fetch *url* and return its visible text content.

    Strips every tag listed in ``html_element_blacklist``, replaces each
    substring in ``params['removal_list']`` with a space, and truncates the
    result to ``params['max_characters_per_page']`` characters.

    Raises whatever ``urllib.request.urlopen`` raises on network/HTTP errors.
    """
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "lxml")
    # Remove boilerplate elements entirely so get_text() never sees them.
    for tag in soup.find_all(html_element_blacklist):
        tag.decompose()
    text = soup.get_text(strip=True)
    for string in params['removal_list']:
        # BUG FIX: str.replace returns a new string; the original discarded
        # the result, so the removal list had no effect.
        text = text.replace(string, " ")
    return text[0:params['max_characters_per_page']]
|
||||||
|
|
||||||
|
def get_search_term(string): # This checks if you say "search ... about" something, and if the "Activate Searx integration" checkbox is ticked will search about that
    """Return the text following 'about' when *string* looks like a search
    request (contains one of the trigger commands and the word 'about'),
    otherwise return None.

    Matching is case-insensitive and ignores single/double quotes.
    """
    commands = ['search', 'tell me', 'give me a summary']
    marker = ['about']
    lowstr = string.lower()
    # Quotes are ignored for trigger detection only; the returned subject
    # keeps the original text.
    for s in ['\"', '\'']:
        lowstr = lowstr.replace(s, '')
    if any(command in lowstr for command in commands) and any(case in lowstr for case in marker):
        print("Found search term")
        # BUG FIX: the trigger check above is case-insensitive, but the
        # original split the raw string on the literal 'about', raising
        # IndexError for inputs like "Tell me About X". Split
        # case-insensitively instead.
        subject = re.split('about', string, maxsplit=1, flags=re.IGNORECASE)[1]
        return subject
    return None
|
||||||
|
|
||||||
|
def search_string(search_term): # This is the main logic that sends the API request to Searx and returns the text to add to the context
    """Query the configured Searx instance about *search_term* and build the
    context string to inject into the prompt.

    On a malformed/empty API response, returns a fixed instruction sentence
    that input_modifier() recognizes by exact comparison. Network errors from
    requests.get() propagate to the caller, as before.
    """
    # BUG FIX: original log message was missing the space after "about".
    print("Searching about " + search_term + "...")
    query = f"{search_term} {params['extra_query_information']}"
    r = requests.get(params['searx_instance'], params={'q': query, 'format': 'json', 'pageno': '1'})
    try:
        # ValueError covers a non-JSON body (r.json() raises a ValueError
        # subclass); KeyError covers a JSON payload without 'results'.
        # The original bare `except:` also swallowed KeyboardInterrupt etc.
        searchdata = r.json()['results']
    except (ValueError, KeyError):
        # input_modifier() compares against this exact sentence — do not edit.
        new_context = "Tell me that you could not find the results I asked for."
    else:
        new_context = f"This is new information from after your knowledge cutoff date about {search_term} :\n"
        if params['include_first_result_content']:
            # ROBUSTNESS: slicing never raises, unlike the original
            # range()-indexing, which crashed with IndexError when Searx
            # returned fewer results than requested. int() guards against a
            # float coming back from the UI slider.
            for result in searchdata[:int(params['number_of_results'])]:
                new_context += url_to_text(result['url']) + "\n"
        if params['include_result_summary']:
            for result in searchdata:
                if 'content' in result:
                    new_context += result['content'] + "\n"
        new_context = new_context[0:params['max_total_characters']]
    finally:
        if params['console_log']:
            print(new_context)
    return new_context
|
||||||
|
|
||||||
|
def input_modifier(string):
    """Intercept the user's input before it reaches the model.

    When searching is enabled, an instance is configured, and the input looks
    like a search request, the Searx results are prepended to the request;
    otherwise the input passes through untouched.
    """
    # Guard clauses: pass the input through unless every trigger condition holds.
    if not params['enable_search'] or not params['searx_instance']:
        return string
    if not get_search_term(string):
        return string

    search_result = search_string(string)
    # search_string() signals failure with this exact sentence; forward it
    # alone so the LLM tells the user the search failed.
    if search_result == "Tell me that you could not find the results I asked for.": # If it failed to get a result, ask the LLM to tell user it did
        return search_result
    return f"{search_result} Using the information I just gave you, without adding any thing new, respond to this request: {string}"
|
||||||
|
|
||||||
|
def ui():
    """Build the extension's Gradio settings panel and wire every widget to
    its entry in the module-level ``params`` dict."""
    with gr.Accordion("Searx Integration", open=True):
        with gr.Row():
            with gr.Column():
                enable_search = gr.Checkbox(value=params['enable_search'], label='Activate Searx integration')
                console_log = gr.Checkbox(value=params['console_log'], label='Display search results on console')
                include_first_result_content = gr.Checkbox(value=params['include_first_result_content'], label='Include content from the first result')
                number_of_results = gr.Slider(1, 10, value=params['number_of_results'], step=1, label='Number of results to fetch')
            with gr.Column():
                searx_instance = gr.Textbox(placeholder=params['searx_instance'], value=params['searx_instance'], label='Searx instance address')
                extra_query_information = gr.Textbox(placeholder=params['extra_query_information'], value=params['extra_query_information'], label='Extra info to pass in Searx query')
                include_result_summary = gr.Checkbox(value=params['include_result_summary'], label='Include summary from each search result')
                max_characters_per_page = gr.Slider(256, 16384, value=params['max_characters_per_page'], step=64, label='Maximum characters per fetched pages')
                max_total_characters = gr.Slider(256, 16384, value=params['max_total_characters'], step=64, label='Total max characters')

    # Propagate each widget change back into params.
    enable_search.change(lambda x: params.update({"enable_search": x}), enable_search, None)
    # BUG FIX: originally wrote to a "console_display" key that nothing reads;
    # search_string() checks params['console_log'], so the checkbox had no effect.
    console_log.change(lambda x: params.update({"console_log": x}), console_log, None)
    include_first_result_content.change(lambda x: params.update({"include_first_result_content": x}), include_first_result_content, None)
    include_result_summary.change(lambda x: params.update({"include_result_summary": x}), include_result_summary, None)
    number_of_results.change(lambda x: params.update({"number_of_results": x}), number_of_results, None)
    max_characters_per_page.change(lambda x: params.update({"max_characters_per_page": x}), max_characters_per_page, None)
    searx_instance.change(lambda x: params.update({"searx_instance": x}), searx_instance, None)
    extra_query_information.change(lambda x: params.update({"extra_query_information": x}), extra_query_information, None)
    max_total_characters.change(lambda x: params.update({"max_total_characters": x}), max_total_characters, None)
|
Loading…
Reference in New Issue
Block a user