Skip to content

Commit

Permalink
Goose3: using new top node html raw
Browse files Browse the repository at this point in the history
  • Loading branch information
jaesivsm committed Apr 21, 2024
1 parent 0d4ca00 commit 30ad95e
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 53 deletions.
1 change: 0 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ flask-restx = "==1.*"
goose3 = "==3.*"
gunicorn = "==22.*"
json-logging-py = "*" # needed by gunicorn config
lxml = "==5.*"
opml = "==0.*"
prometheus-distributed-client = "==1.*"
psycopg2-binary = "==2.*"
Expand Down
4 changes: 2 additions & 2 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

93 changes: 43 additions & 50 deletions jarr/lib/content_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from typing import Optional
from goose3 import Goose
from lxml import etree
from jarr.bootstrap import conf
from jarr.controllers.article import to_vector
from jarr.lib.enums import ArticleType, FeedType
Expand All @@ -15,8 +14,9 @@
logger = logging.getLogger(__name__)
IMG_ALT_MAX_LENGTH = 100
YOUTUBE_RE = re.compile(
# Matches youtube.com / youtu.be watch, embed and short URLs; group(5) is the
# video id (see the embedded generate() below, which reads yt_match.group(5)).
# NOTE(review): this span is a unified diff — the two single-quoted lines are
# the pre-change pattern and the two double-quoted ones the post-change
# pattern; adjacent string literals concatenate, so as copied this is NOT the
# real regex. Keep only one pair in the actual file.
# NOTE(review): the dot in "youtu.be" is unescaped and matches any character —
# confirm whether that is intended.
r'^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))'
r'(\/(?:[\w\-]+\?v=|embed\/|v\/)?)([\w\-]+)(\S+)?$'
r"^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))"
r"(\/(?:[\w\-]+\?v=|embed\/|v\/)?)([\w\-]+)(\S+)?$"
)


def is_embedded_link(link):
Expand All @@ -37,16 +37,16 @@ def _get_goose(self):
try:
self._page = goose.extract(self.article.link)
except Exception as error:
logger.error("something wrong happened while trying to fetch "
"%r: %r", self.article.link, error)
msg = "something wrong happened while trying to fetch %r: %r"
logger.error(msg, self.article.link, error)
if not self._page:
return False
lang = self._page.opengraph.get('locale') or self._page.meta_lang
self.extracted_infos['lang'] = clean_lang(lang)
self.extracted_infos['link'] = self._page.final_url
keywords = set(self._page.meta_keywords.split(', '))
self.extracted_infos['tags'] = set(self._page.tags).union(keywords)
self.extracted_infos['title'] = self._page.title
lang = self._page.opengraph.get("locale") or self._page.meta_lang
self.extracted_infos["lang"] = clean_lang(lang)
self.extracted_infos["link"] = self._page.final_url
keywords = set(self._page.meta_keywords.split(", "))
self.extracted_infos["tags"] = set(self._page.tags).union(keywords)
self.extracted_infos["title"] = self._page.title
return True

def get_vector(self):
Expand All @@ -55,31 +55,23 @@ def get_vector(self):
if self._page and self.extracted_infos:
return to_vector(self.extracted_infos, self._page)

def _from_goose_to_html(self, encoding="utf8"):
    """Rebuild an HTML string from the goose top node and its later siblings.

    Serializes the extracted top node, then every following sibling in
    document order, joins the markup, flattens newlines to spaces and
    decodes the bytes with *encoding*.
    """
    chunks = []
    node = self._page.top_node
    while True:
        chunks.append(etree.tostring(node, encoding=encoding))
        node = node.getnext()
        if node is None:
            break
    html = b"".join(chunks)
    return html.replace(b'\n', b' ').decode(encoding)

@staticmethod
def generate():
return {}

def generate_and_merge(self, content):
# Merge freshly generated content into the article's (migrated) content dict.
# NOTE(review): this span is a unified diff — consecutive duplicated
# statements below are the pre- and post-change versions of the same line,
# not intentional repetition; keep only one of each in the actual file.
content = migrate_content(content)
# if there is already some fetched content
already_fetched = any(cnt.get('type') == 'fetched'
for cnt in content.get('contents') or [])
already_fetched = any(
cnt.get("type") == "fetched"
for cnt in content.get("contents") or []
)
# Truncated-content feeds do not re-fetch: previously fetched content wins.
if isinstance(self, TruncatedContentGenerator) and already_fetched:
return content
article_content = self.generate()
# Nothing generated: return the existing content untouched.
if not article_content:
return content
# NOTE(review): diff artifact — old (single-quoted) and new (double-quoted)
# versions of the append both appear.
content['contents'].append(article_content)
content["contents"].append(article_content)
return content


Expand Down Expand Up @@ -120,15 +112,15 @@ def get_vector():
def generate(self):
# Build an embedded-media content entry for the article, or {} on failure.
# NOTE(review): unified-diff span — duplicated statements are old/new
# versions of the same line; keep only the double-quoted ones.
yt_match = YOUTUBE_RE.match(self.article.link)
if yt_match:
logger.info('%r constructing embedded youtube content '
'from article', self.article)
msg = "%r constructing embedded youtube content from article"
logger.info(msg, self.article)
try:
# group(5) is the video id captured by YOUTUBE_RE.
return {'type': 'youtube', 'link': yt_match.group(5)}
return {"type": "youtube", "link": yt_match.group(5)}
except IndexError:
pass
else:
# NOTE(review): the commit also rewords this log message
# ("embedded video" -> "embedded media") — a behavior-visible change.
logger.warning('embedded video not recognized %r',
self.article.link)
msg = "embedded media not recognized %r"
logger.warning(msg, self.article.link)
return {}


Expand All @@ -137,18 +129,18 @@ class TruncatedContentGenerator(ContentGenerator):
def generate(self):
# Fetch the full article page via goose and build a 'fetched' content entry.
# Returns {} if the parsed page cannot be rebuilt.
# NOTE(review): unified-diff span — duplicated statements are old/new
# versions; the substantive change replaces the local lxml serialization
# (_from_goose_to_html) with goose3's top_node_raw_html attribute.
if self._page is None:
self._get_goose()
content = {'type': 'fetched'}
content = {"type": "fetched"}
try:
content['content'] = self._from_goose_to_html()
content['link'] = remove_utm_tags(self._page.final_url)
content['title'] = self._page.title
content["content"] = self._page.top_node_raw_html
content["link"] = remove_utm_tags(self._page.final_url)
content["title"] = self._page.title
except Exception:
# Best effort: any extraction failure yields an empty result.
logger.exception("Could not rebuild parsed content for %r",
self.article)
msg = "Could not rebuild parsed content for %r"
logger.exception(msg, self.article)
return {}
if self.article.comments:
content['comments'] = self.article.comments
logger.debug('%r no special type found doing nothing', self.article)
content["comments"] = self.article.comments
logger.debug("%r no special type found doing nothing", self.article)
return content


Expand All @@ -168,9 +160,12 @@ def is_pure_reddit_post(self):
return self._is_pure_reddit_post
try:
split = urllib.parse.urlsplit(self.article.link)
paths = split.path.strip('/').split('/')
if ('reddit.com' in split.netloc
and paths[0] == 'r' and paths[2] == 'comments'):
paths = split.path.strip("/").split("/")
if (
"reddit.com" in split.netloc
and paths[0] == "r"
and paths[2] == "comments"
):
self._is_pure_reddit_post = True
except (AttributeError, IndexError):
pass
Expand Down Expand Up @@ -207,8 +202,7 @@ def get_content_generator(article):
if article.article_type and article.article_type in CONTENT_GENERATORS:
return CONTENT_GENERATORS[article.article_type](article)

if article.feed.feed_type \
and article.feed.feed_type in CONTENT_GENERATORS:
if article.feed.feed_type and article.feed.feed_type in CONTENT_GENERATORS:
return CONTENT_GENERATORS[article.feed.feed_type](article)

if article.feed.truncated_content:
Expand All @@ -218,12 +212,11 @@ def get_content_generator(article):


def migrate_content(content: dict):
# Migrate a legacy (v1) article-content dict to the v2 layout:
# {"v": 2, "contents": [ ...entries... ]}.
# NOTE(review): unified-diff span — duplicated statements are the old
# (single-quoted) and new (double-quoted) versions of the same lines.
content = content or {'v': 2, 'contents': []}
if content.get('v') == 2:
content = content or {"v": 2, "contents": []}
if content.get("v") == 2:
# Already v2: nothing to do.
return content
# Legacy media entries are dropped rather than migrated.
if content['type'] in {'image', 'audio', 'video'}:
return {'v': 2, 'contents': []}
if content['type'] == 'embedded': # migrating original embedded
return {'v': 2, 'contents': [{'type': content['player'],
'link': content['videoId']}]}
return {'v': 2, 'contents': [content]}
if content["type"] in {"image", "audio", "video"}:
return {"v": 2, "contents": []}
if content["type"] == "embedded": # migrating original embedded
content = {"type": content["player"], "link": content["videoId"]}
return {"v": 2, "contents": [content]}

0 comments on commit 30ad95e

Please sign in to comment.