Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(web_search.py): remove nest_asyncio #357

Merged
merged 10 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 35 additions & 34 deletions huixiangdou/service/web_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import time
import types

import nest_asyncio
# import nest_asyncio
import pytoml
import requests
from bs4 import BeautifulSoup as BS
Expand All @@ -18,33 +18,33 @@
from ..primitive import FileOperation
from .helper import check_str_useful

# pyppeteer is optional: record whether it could be imported so callers can
# decide if the headless-Chromium fallback is available.
try:
    from pyppeteer import launch
except Exception:
    # If Chromium itself fails to start, look for missing shared libraries:
    #   ldd ~/.local/share/pyppeteer/local-chromium/1181205/chrome-linux/chrome | grep not
    # and install them, e.g. `apt install libgbm-dev`.
    # See https://techoverflow.net/2020/09/29/how-to-fix-pyppeteer-pyppeteer-errors-browsererror-browser-closed-unexpectedly/
    import_pyppeteer = False
    logger.warning(
        'For better URL parsing, try `pip install pyppeteer` and see https://github.com/pyppeteer/pyppeteer/issues/442'
    )
else:
    import_pyppeteer = True


async def fetch_chroumium_content(url):
    """Load ``url`` in headless Chromium and return the page's visible text.

    Args:
        url: Address of the page to open.

    Returns:
        The result of evaluating ``document.body.innerText`` in the page.

    Raises:
        Whatever ``launch``, ``newPage``, ``goto`` or ``evaluate`` raise; the
        browser is still closed in that case so no Chromium process is left
        running after a failed fetch.
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    import asyncio

    browser = await launch(headless=True,
                           args=[
                               '--no-sandbox', '--disable-dev-shm-usage',
                               '--disable-gpu',
                               '--disable-software-rasterizer',
                               '--disable-setuid-sandbox'
                           ])
    try:
        page = await browser.newPage()
        await page.goto(url)
        # One-second pause before reading the page (presumably to let
        # client-side rendering settle -- TODO confirm). It is awaited rather
        # than time.sleep()-ed so the event loop is not blocked.
        await asyncio.sleep(1)
        return await page.evaluate('document.body.innerText',
                                   force_expr=True)
    finally:
        # Always release the browser, including on navigation/evaluate errors.
        await browser.close()
# import_pyppeteer = False
# try:
# from pyppeteer import launch
# import_pyppeteer = True
# except Exception as e:
# # Fix ldd ~/.local/share/pyppeteer/local-chromium/1181205/chrome-linux/chrome | grep not
# # apt install libgbm-dev
# # See https://techoverflow.net/2020/09/29/how-to-fix-pyppeteer-pyppeteer-errors-browsererror-browser-closed-unexpectedly/
# logger.warning(
# 'For better URL parsing, try `pip install pyppeteer` and see https://github.com/pyppeteer/pyppeteer/issues/442'
# )


# async def fetch_chroumium_content(url):
# browser = await launch(headless=True,
# args=[
# '--no-sandbox', '--disable-dev-shm-usage',
# '--disable-gpu',
# '--disable-software-rasterizer',
# '--disable-setuid-sandbox'
# ])
# page = await browser.newPage()
# await page.goto(url)
# time.sleep(1)
# content = await page.evaluate('document.body.innerText', force_expr=True)
# await browser.close()
# return content


class Article:
Expand Down Expand Up @@ -138,12 +138,13 @@ def fetch_url(self, query: str, target_link: str, brief: str = ''):
content = content.replace(' ', ' ')

if not check_str_useful(content=content):
logger.info('retry with chromium {}'.format(target_link))
nest_asyncio.apply()
content = asyncio.get_event_loop().run_until_complete(
fetch_chroumium_content(url=target_link))
if not check_str_useful(content=content):
return None
return None
# logger.info('retry with chromium {}'.format(target_link))
# nest_asyncio.apply()
# content = asyncio.get_event_loop().run_until_complete(
# fetch_chroumium_content(url=target_link))
# if not check_str_useful(content=content):
# return None

return Article(content=content, source=target_link, brief=brief)
except Exception as e:
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ einops
faiss-gpu
loguru
lxml_html_clean
nest_asyncio
networkx>=3.0
numpy<2.0.0
openai>=1.0.0
Expand Down
Loading