From 1fc0b5041e3c903263b68065ce474f4e4e3a658a Mon Sep 17 00:00:00 2001 From: Juliano Henriquez Date: Tue, 11 Jul 2023 18:02:49 -0400 Subject: [PATCH] Substitute superbooga Beautiful Soup Parser (#2996) * add lxml to requirements add lxml to requirements * Change Beautiful Soup Parser Use the "lxml" parser, which might be more tolerant of certain kinds of parsing errors than "html.parser" and quicker at the same time. --- extensions/superbooga/requirements.txt | 1 + extensions/superbooga/script.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/extensions/superbooga/requirements.txt b/extensions/superbooga/requirements.txt index dd2cbde6..4d5a95a4 100644 --- a/extensions/superbooga/requirements.txt +++ b/extensions/superbooga/requirements.txt @@ -2,3 +2,4 @@ beautifulsoup4==4.12.2 chromadb==0.3.18 posthog==2.4.2 sentence_transformers==2.2.2 +lxml diff --git a/extensions/superbooga/script.py b/extensions/superbooga/script.py index c0d3f8eb..f67a956e 100644 --- a/extensions/superbooga/script.py +++ b/extensions/superbooga/script.py @@ -69,7 +69,7 @@ def feed_url_into_collector(urls, chunk_len, chunk_sep, strong_cleanup, threads) cumulative += 'Processing the HTML sources...' yield cumulative for content in contents: - soup = BeautifulSoup(content, features="html.parser") + soup = BeautifulSoup(content, features="lxml") for script in soup(["script", "style"]): script.extract()