import json
import re
from urllib.request import urlopen

import requests
import torch
import gradio as gr
from bs4 import BeautifulSoup
from bs4.element import Comment

import modules.chat as chat
import modules.shared as shared

torch._C._jit_set_profiling_mode(False)

searx_integration_params = {
    # These can all be set in the settings.yml file
    'enable_search': True,
    'include_first_result_content': True,
    'include_result_summary': False,
    'max_characters_per_page': 4096,
    'searx_instance': "",
    'max_total_characters': 8192,
    'extra_query_information': "",
    'removal_list': ['\t', '\n', '\\n', '\\t'],
    'number_of_results': 1,
    'console_log': True
}

# Tags whose contents should never end up in the extracted page text
html_element_blacklist = [
    '[document]', 'noscript', 'header', 'meta',
    'head', 'input', 'script', 'style'
]


def url_to_text(url):
    # Fetch a page, strip blacklisted tags and return its visible text,
    # truncated to 'max_characters_per_page'
    html = urlopen(url).read()
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.find_all(html_element_blacklist):
        tag.decompose()
    text = soup.get_text(strip=True)
    for string in searx_integration_params['removal_list']:
        text = text.replace(string, " ")
    return text[0:searx_integration_params['max_characters_per_page']]


def get_search_term(string):
    # This checks if you say "search ... about" something; if the "Activate Searx
    # integration" checkbox is ticked, the part after "about" becomes the search term
    commands = ['search', 'tell me', 'give me a summary']
    marker = ['about']
    lowstr = string.lower()
    for s in ['\"', '\'']:
        lowstr = lowstr.replace(s, '')
    if any(command in lowstr for command in commands) and any(case in lowstr for case in marker):
        print("Found search term")
        subject = string.split('about', 1)[1].strip()
        return subject


def search_string(search_term):
    # This is the main logic: it sends the API request to Searx and returns the text to
    # add to the context. The instance must expose the JSON output format; 'results' is
    # a list of objects with at least a 'url' and usually a 'content' summary.
    print(f"Searching about {search_term}...")
    query = f"{search_term} {searx_integration_params['extra_query_information']}"
    r = requests.get(searx_integration_params['searx_instance'],
                     params={'q': query, 'format': 'json', 'pageno': '1'})
    try:
        searchdata = r.json()
        searchdata = searchdata['results']
    except Exception:
        new_context = "Tell me that you could not find the results I asked for."
else: new_context = f"This is new information from after your knowledge cutoff date about {search_term} :\n" if searx_integration_params['include_first_result_content']: for i in range(searx_integration_params['number_of_results']): new_context += url_to_text(searchdata[i]['url']) + "\n" if searx_integration_params['include_result_summary']: for result in searchdata: if 'content' in result: new_context += result['content'] + "\n" new_context = new_context[0:searx_integration_params['max_total_characters']] finally: if searx_integration_params['console_log']: print(new_context) return new_context def input_modifier(string): if searx_integration_params['enable_search'] and searx_integration_params['searx_instance']: if get_search_term(string): search_result = search_string(string) if search_result == "Tell me that you could not find the results I asked for.": # If it failed to get a result, ask the LLM to tell user it did return search_result else: return f"{search_result} Using the information I just gave you, without adding any thing new, respond to this request: {string}" return string def ui(): with gr.Accordion("Searx Integration", open=True): with gr.Row(): with gr.Column(): enable_search = gr.Checkbox(value=searx_integration_params['enable_search'], label='Activate Searx integration') console_log = gr.Checkbox(value=searx_integration_params['console_log'], label='Display search results on console') include_first_result_content = gr.Checkbox(value=searx_integration_params['include_first_result_content'], label='Include content from the first result') number_of_results = gr.Slider(1,10,value=searx_integration_params['number_of_results'],step=1,label='Number of results to fetch') with gr.Column(): searx_instance = gr.Textbox(placeholder=searx_integration_params['searx_instance'], value=searx_integration_params['searx_instance'], label='Searx instance address') extra_query_information = gr.Textbox(placeholder=searx_integration_params['extra_query_information'], value=searx_integration_params['extra_query_information'], label='Extra info to pass in Searx query') include_result_summary = gr.Checkbox(value=searx_integration_params['include_result_summary'], label='Include summary from each search result') max_characters_per_page = gr.Slider(256,16384,value=searx_integration_params['max_characters_per_page'],step=64,label='Maximum characters per fetched pages') max_total_characters = gr.Slider(256,16384,value=searx_integration_params['max_total_characters'],step=64,label='Total max characters') enable_search.change(lambda x: searx_integration_params.update({"enable_search": x}), enable_search, None) console_log.change(lambda x: searx_integration_params.update({"console_display": x}), console_log, None) include_first_result_content.change(lambda x: searx_integration_params.update({"include_first_result_content": x}), include_first_result_content, None) include_result_summary.change(lambda x: searx_integration_params.update({"include_result_summary": x}), include_result_summary, None) number_of_results.change(lambda x: searx_integration_params.update({"number_of_results": x}), number_of_results, None) max_characters_per_page.change(lambda x: searx_integration_params.update({"max_characters_per_page": x}), max_characters_per_page, None) searx_instance.change(lambda x: searx_integration_params.update({"searx_instance": x}), searx_instance, None) extra_query_information.change(lambda x: searx_integration_params.update({"extra_query_information": x}), extra_query_information, None) 
    max_total_characters.change(lambda x: searx_integration_params.update({"max_total_characters": x}), max_total_characters, None)
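

# A minimal, standalone sanity check of the trigger parsing — a sketch only, assuming
# the file can be run directly (the 'modules' package from text-generation-webui and
# torch still need to be importable). The example prompt is made up for illustration.
if __name__ == "__main__":
    example_prompt = "Tell me about the weather in Paris"
    term = get_search_term(example_prompt)
    print(repr(term))  # expected: 'the weather in Paris'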