Skip to content

Commit

Permalink
fix(web_search.py): remove nest_asyncio (#357)
Browse files Browse the repository at this point in the history
* fix(web_search.py): close async pyppeteer
  • Loading branch information
tpoisonooo authored Aug 14, 2024
1 parent b782bdf commit d36d98e
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 35 deletions.
69 changes: 35 additions & 34 deletions huixiangdou/service/web_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import time
import types

import nest_asyncio
# import nest_asyncio
import pytoml
import requests
from bs4 import BeautifulSoup as BS
Expand All @@ -18,33 +18,33 @@
from ..primitive import FileOperation
from .helper import check_str_useful

import_pyppeteer = False
try:
from pyppeteer import launch
import_pyppeteer = True
except Exception as e:
# Fix ldd ~/.local/share/pyppeteer/local-chromium/1181205/chrome-linux/chrome | grep not
# apt install libgbm-dev
# See https://techoverflow.net/2020/09/29/how-to-fix-pyppeteer-pyppeteer-errors-browsererror-browser-closed-unexpectedly/
logger.warning(
'For better URL parsing, try `pip install pyppeteer` and see https://github.com/pyppeteer/pyppeteer/issues/442'
)


async def fetch_chroumium_content(url):
    """Fetch the rendered body text of *url* using a headless Chromium page.

    Args:
        url: The URL to load.

    Returns:
        The page's ``document.body.innerText`` as a string.
    """
    import asyncio  # local import: keeps this stdlib dependency with its only user

    browser = await launch(headless=True,
                           args=[
                               '--no-sandbox', '--disable-dev-shm-usage',
                               '--disable-gpu',
                               '--disable-software-rasterizer',
                               '--disable-setuid-sandbox'
                           ])
    try:
        page = await browser.newPage()
        await page.goto(url)
        # Give the page a moment to render. The original used time.sleep(1),
        # which blocks the whole event loop from inside a coroutine;
        # asyncio.sleep yields control while waiting.
        await asyncio.sleep(1)
        content = await page.evaluate('document.body.innerText',
                                      force_expr=True)
    finally:
        # Always close the browser — even if goto/evaluate raises — so
        # headless Chromium processes are not leaked.
        await browser.close()
    return content
# import_pyppeteer = False
# try:
# from pyppeteer import launch
# import_pyppeteer = True
# except Exception as e:
# # Fix ldd ~/.local/share/pyppeteer/local-chromium/1181205/chrome-linux/chrome | grep not
# # apt install libgbm-dev
# # See https://techoverflow.net/2020/09/29/how-to-fix-pyppeteer-pyppeteer-errors-browsererror-browser-closed-unexpectedly/
# logger.warning(
# 'For better URL parsing, try `pip install pyppeteer` and see https://github.com/pyppeteer/pyppeteer/issues/442'
# )


# async def fetch_chroumium_content(url):
# browser = await launch(headless=True,
# args=[
# '--no-sandbox', '--disable-dev-shm-usage',
# '--disable-gpu',
# '--disable-software-rasterizer',
# '--disable-setuid-sandbox'
# ])
# page = await browser.newPage()
# await page.goto(url)
# time.sleep(1)
# content = await page.evaluate('document.body.innerText', force_expr=True)
# await browser.close()
# return content


class Article:
Expand Down Expand Up @@ -138,12 +138,13 @@ def fetch_url(self, query: str, target_link: str, brief: str = ''):
content = content.replace(' ', ' ')

if not check_str_useful(content=content):
logger.info('retry with chromium {}'.format(target_link))
nest_asyncio.apply()
content = asyncio.get_event_loop().run_until_complete(
fetch_chroumium_content(url=target_link))
if not check_str_useful(content=content):
return None
return None
# logger.info('retry with chromium {}'.format(target_link))
# nest_asyncio.apply()
# content = asyncio.get_event_loop().run_until_complete(
# fetch_chroumium_content(url=target_link))
# if not check_str_useful(content=content):
# return None

return Article(content=content, source=target_link, brief=brief)
except Exception as e:
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ einops
faiss-gpu
loguru
lxml_html_clean
nest_asyncio
networkx>=3.0
numpy<2.0.0
openai>=1.0.0
Expand Down

0 comments on commit d36d98e

Please sign in to comment.