diff --git a/huixiangdou/service/web_search.py b/huixiangdou/service/web_search.py index a0b79680..dd636929 100644 --- a/huixiangdou/service/web_search.py +++ b/huixiangdou/service/web_search.py @@ -7,7 +7,7 @@ import time import types -import nest_asyncio +# import nest_asyncio import pytoml import requests from bs4 import BeautifulSoup as BS @@ -18,33 +18,33 @@ from ..primitive import FileOperation from .helper import check_str_useful -import_pyppeteer = False -try: - from pyppeteer import launch - import_pyppeteer = True -except Exception as e: - # Fix ldd ~/.local/share/pyppeteer/local-chromium/1181205/chrome-linux/chrome | grep not - # apt install libgbm-dev - # See https://techoverflow.net/2020/09/29/how-to-fix-pyppeteer-pyppeteer-errors-browsererror-browser-closed-unexpectedly/ - logger.warning( - 'For better URL parsing, try `pip install pyppeteer` and see https://github.com/pyppeteer/pyppeteer/issues/442' - ) - - -async def fetch_chroumium_content(url): - browser = await launch(headless=True, - args=[ - '--no-sandbox', '--disable-dev-shm-usage', - '--disable-gpu', - '--disable-software-rasterizer', - '--disable-setuid-sandbox' - ]) - page = await browser.newPage() - await page.goto(url) - time.sleep(1) - content = await page.evaluate('document.body.innerText', force_expr=True) - await browser.close() - return content +# import_pyppeteer = False +# try: +# from pyppeteer import launch +# import_pyppeteer = True +# except Exception as e: +# # Fix ldd ~/.local/share/pyppeteer/local-chromium/1181205/chrome-linux/chrome | grep not +# # apt install libgbm-dev +# # See https://techoverflow.net/2020/09/29/how-to-fix-pyppeteer-pyppeteer-errors-browsererror-browser-closed-unexpectedly/ +# logger.warning( +# 'For better URL parsing, try `pip install pyppeteer` and see https://github.com/pyppeteer/pyppeteer/issues/442' +# ) + + +# async def fetch_chroumium_content(url): +# browser = await launch(headless=True, +# args=[ +# '--no-sandbox', '--disable-dev-shm-usage', +# '--disable-gpu', +# '--disable-software-rasterizer', +# '--disable-setuid-sandbox' +# ]) +# page = await browser.newPage() +# await page.goto(url) +# time.sleep(1) +# content = await page.evaluate('document.body.innerText', force_expr=True) +# await browser.close() +# return content class Article: @@ -138,12 +138,13 @@ def fetch_url(self, query: str, target_link: str, brief: str = ''): content = content.replace(' ', ' ') if not check_str_useful(content=content): - logger.info('retry with chromium {}'.format(target_link)) - nest_asyncio.apply() - content = asyncio.get_event_loop().run_until_complete( - fetch_chroumium_content(url=target_link)) - if not check_str_useful(content=content): - return None + return None + # logger.info('retry with chromium {}'.format(target_link)) + # nest_asyncio.apply() + # content = asyncio.get_event_loop().run_until_complete( + # fetch_chroumium_content(url=target_link)) + # if not check_str_useful(content=content): + # return None return Article(content=content, source=target_link, brief=brief) except Exception as e: diff --git a/requirements.txt b/requirements.txt index 6ba6e5f7..44dc7241 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,6 @@ einops faiss-gpu loguru lxml_html_clean -nest_asyncio networkx>=3.0 numpy<2.0.0 openai>=1.0.0