Skip to content

Commit

Permalink
Goose3: using new top node html raw
Browse files Browse the repository at this point in the history
  • Loading branch information
jaesivsm committed Apr 21, 2024
1 parent 0d4ca00 commit 30ad95e
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 53 deletions.
1 change: 0 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ flask-restx = "==1.*"
goose3 = "==3.*"
gunicorn = "==22.*"
json-logging-py = "*" # needed by gunicorn config
lxml = "==5.*"
opml = "==0.*"
prometheus-distributed-client = "==1.*"
psycopg2-binary = "==2.*"
Expand Down
4 changes: 2 additions & 2 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

93 changes: 43 additions & 50 deletions jarr/lib/content_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from typing import Optional
from goose3 import Goose
from lxml import etree
from jarr.bootstrap import conf
from jarr.controllers.article import to_vector
from jarr.lib.enums import ArticleType, FeedType
Expand All @@ -15,8 +14,9 @@
logger = logging.getLogger(__name__)
IMG_ALT_MAX_LENGTH = 100
YOUTUBE_RE = re.compile(
# Matches youtube.com / youtu.be watch, embed and short URLs; group(5) is the
# video id (see the embedded generate() below, which reads yt_match.group(5)).
# NOTE(review): this span is a unified diff — the two single-quoted lines are
# the pre-change pattern and the two double-quoted ones the post-change
# pattern; adjacent string literals concatenate, so as copied this is NOT the
# real regex. Keep only one pair in the actual file.
# NOTE(review): the dot in "youtu.be" is unescaped and matches any character —
# confirm whether that is intended.
r'^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))'
r'(\/(?:[\w\-]+\?v=|embed\/|v\/)?)([\w\-]+)(\S+)?$'
r"^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))"
r"(\/(?:[\w\-]+\?v=|embed\/|v\/)?)([\w\-]+)(\S+)?$"
)


def is_embedded_link(link):
Expand All @@ -37,16 +37,16 @@ def _get_goose(self):
try:
self._page = goose.extract(self.article.link)
except Exception as error:
logger.error("something wrong happened while trying to fetch "
"%r: %r", self.article.link, error)
msg = "something wrong happened while trying to fetch %r: %r"
logger.error(msg, self.article.link, error)
if not self._page:
return False
lang = self._page.opengraph.get('locale') or self._page.meta_lang
self.extracted_infos['lang'] = clean_lang(lang)
self.extracted_infos['link'] = self._page.final_url
keywords = set(self._page.meta_keywords.split(', '))
self.extracted_infos['tags'] = set(self._page.tags).union(keywords)
self.extracted_infos['title'] = self._page.title
lang = self._page.opengraph.get("locale") or self._page.meta_lang
self.extracted_infos["lang"] = clean_lang(lang)
self.extracted_infos["link"] = self._page.final_url
keywords = set(self._page.meta_keywords.split(", "))
self.extracted_infos["tags"] = set(self._page.tags).union(keywords)
self.extracted_infos["title"] = self._page.title
return True

def get_vector(self):
Expand All @@ -55,31 +55,23 @@ def get_vector(self):
if self._page and self.extracted_infos:
return to_vector(self.extracted_infos, self._page)

def _from_goose_to_html(self, encoding="utf8"):
    """Rebuild an HTML string from the goose top node and its later siblings.

    Serializes the extracted top node, then every following sibling in
    document order, joins the markup, flattens newlines to spaces and
    decodes the bytes with *encoding*.
    """
    chunks = []
    node = self._page.top_node
    while True:
        chunks.append(etree.tostring(node, encoding=encoding))
        node = node.getnext()
        if node is None:
            break
    html = b"".join(chunks)
    return html.replace(b'\n', b' ').decode(encoding)

@staticmethod
def generate():
return {}

def generate_and_merge(self, content):
# Merge freshly generated content into the article's (migrated) content dict.
# NOTE(review): this span is a unified diff — consecutive duplicated
# statements below are the pre- and post-change versions of the same line,
# not intentional repetition; keep only one of each in the actual file.
content = migrate_content(content)
# if there is already some fetched content
already_fetched = any(cnt.get('type') == 'fetched'
for cnt in content.get('contents') or [])
already_fetched = any(
cnt.get("type") == "fetched"
for cnt in content.get("contents") or []
)
# Truncated-content feeds do not re-fetch: previously fetched content wins.
if isinstance(self, TruncatedContentGenerator) and already_fetched:
return content
article_content = self.generate()
# Nothing generated: return the existing content untouched.
if not article_content:
return content
# NOTE(review): diff artifact — old (single-quoted) and new (double-quoted)
# versions of the append both appear.
content['contents'].append(article_content)
content["contents"].append(article_content)
return content


Expand Down Expand Up @@ -120,15 +112,15 @@ def get_vector():
def generate(self):
# Build an embedded-media content entry for the article, or {} on failure.
# NOTE(review): unified-diff span — duplicated statements are old/new
# versions of the same line; keep only the double-quoted ones.
yt_match = YOUTUBE_RE.match(self.article.link)
if yt_match:
logger.info('%r constructing embedded youtube content '
'from article', self.article)
msg = "%r constructing embedded youtube content from article"
logger.info(msg, self.article)
try:
# group(5) is the video id captured by YOUTUBE_RE.
return {'type': 'youtube', 'link': yt_match.group(5)}
return {"type": "youtube", "link": yt_match.group(5)}
except IndexError:
pass
else:
# NOTE(review): the commit also rewords this log message
# ("embedded video" -> "embedded media") — a behavior-visible change.
logger.warning('embedded video not recognized %r',
self.article.link)
msg = "embedded media not recognized %r"
logger.warning(msg, self.article.link)
return {}


Expand All @@ -137,18 +129,18 @@ class TruncatedContentGenerator(ContentGenerator):
def generate(self):
# Fetch the full article page via goose and build a 'fetched' content entry.
# Returns {} if the parsed page cannot be rebuilt.
# NOTE(review): unified-diff span — duplicated statements are old/new
# versions; the substantive change replaces the local lxml serialization
# (_from_goose_to_html) with goose3's top_node_raw_html attribute.
if self._page is None:
self._get_goose()
content = {'type': 'fetched'}
content = {"type": "fetched"}
try:
content['content'] = self._from_goose_to_html()
content['link'] = remove_utm_tags(self._page.final_url)
content['title'] = self._page.title
content["content"] = self._page.top_node_raw_html
content["link"] = remove_utm_tags(self._page.final_url)
content["title"] = self._page.title
except Exception:
# Best effort: any extraction failure yields an empty result.
logger.exception("Could not rebuild parsed content for %r",
self.article)
msg = "Could not rebuild parsed content for %r"
logger.exception(msg, self.article)
return {}
if self.article.comments:
content['comments'] = self.article.comments
logger.debug('%r no special type found doing nothing', self.article)
content["comments"] = self.article.comments
logger.debug("%r no special type found doing nothing", self.article)
return content


Expand All @@ -168,9 +160,12 @@ def is_pure_reddit_post(self):
return self._is_pure_reddit_post
try:
split = urllib.parse.urlsplit(self.article.link)
paths = split.path.strip('/').split('/')
if ('reddit.com' in split.netloc
and paths[0] == 'r' and paths[2] == 'comments'):
paths = split.path.strip("/").split("/")
if (
"reddit.com" in split.netloc
and paths[0] == "r"
and paths[2] == "comments"
):
self._is_pure_reddit_post = True
except (AttributeError, IndexError):
pass
Expand Down Expand Up @@ -207,8 +202,7 @@ def get_content_generator(article):
if article.article_type and article.article_type in CONTENT_GENERATORS:
return CONTENT_GENERATORS[article.article_type](article)

if article.feed.feed_type \
and article.feed.feed_type in CONTENT_GENERATORS:
if article.feed.feed_type and article.feed.feed_type in CONTENT_GENERATORS:
return CONTENT_GENERATORS[article.feed.feed_type](article)

if article.feed.truncated_content:
Expand All @@ -218,12 +212,11 @@ def get_content_generator(article):


def migrate_content(content: dict):
# Migrate a legacy (v1) article-content dict to the v2 layout:
# {"v": 2, "contents": [ ...entries... ]}.
# NOTE(review): unified-diff span — duplicated statements are the old
# (single-quoted) and new (double-quoted) versions of the same lines.
content = content or {'v': 2, 'contents': []}
if content.get('v') == 2:
content = content or {"v": 2, "contents": []}
if content.get("v") == 2:
# Already v2: nothing to do.
return content
# Legacy media entries are dropped rather than migrated.
if content['type'] in {'image', 'audio', 'video'}:
return {'v': 2, 'contents': []}
if content['type'] == 'embedded': # migrating original embedded
return {'v': 2, 'contents': [{'type': content['player'],
'link': content['videoId']}]}
return {'v': 2, 'contents': [content]}
if content["type"] in {"image", "audio", "video"}:
return {"v": 2, "contents": []}
if content["type"] == "embedded": # migrating original embedded
content = {"type": content["player"], "link": content["videoId"]}
return {"v": 2, "contents": [content]}

0 comments on commit 30ad95e

Please sign in to comment.