diff --git a/extensions/superbooga/requirements.txt b/extensions/superbooga/requirements.txt index dd2cbde6..4d5a95a4 100644 --- a/extensions/superbooga/requirements.txt +++ b/extensions/superbooga/requirements.txt @@ -2,3 +2,4 @@ beautifulsoup4==4.12.2 chromadb==0.3.18 posthog==2.4.2 sentence_transformers==2.2.2 +lxml diff --git a/extensions/superbooga/script.py b/extensions/superbooga/script.py index c0d3f8eb..f67a956e 100644 --- a/extensions/superbooga/script.py +++ b/extensions/superbooga/script.py @@ -69,7 +69,7 @@ def feed_url_into_collector(urls, chunk_len, chunk_sep, strong_cleanup, threads) cumulative += 'Processing the HTML sources...' yield cumulative for content in contents: - soup = BeautifulSoup(content, features="html.parser") + soup = BeautifulSoup(content, features="lxml") for script in soup(["script", "style"]): script.extract()