extraction: improve spacing in item, cell and code blocks (#772)

* add preserve_space option to keep original format in code blocks
* fix #769
* fix unit tests
* fix type check
* set preserve_space as default option
* fix xml_tei_tests
* fix realworld_tests
* set preserve_space default to True and refine list extraction
* improve performance by replacing xpath search with iterative search
* improve performance and move item-related checks to utils
* minor fixes

---------

Co-authored-by: CodyInnowhere <[email protected]>
adbar · Feb 17, 2025 · 729b737 · 729b737
1 parent 139dfd6
commit 729b737
Show file tree

Hide file tree

Showing 5 changed files with 141 additions and 28 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -330,10 +330,10 @@ def test_formatting():
     my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
     assert '<hi rend="#b">This here is in bold font.</hi>' in my_result
     # titles as markdown
-    my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>'
+    my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b>Non-bold here</p></article></body></html>'
     my_document = html.fromstring(my_string)
     my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
-    assert my_result == '### Title\n\n**This here is in bold font.**'
+    assert my_result == '### Title\n\n**This here is in bold font.**Non-bold here'
     assert extract(my_string, output_format='markdown', config=ZERO_CONFIG) == my_result
     assert '<hi rend="#b">' in etree.tostring(bare_extraction(my_string, output_format='markdown', config=ZERO_CONFIG).body, encoding="unicode")
 
@@ -354,7 +354,7 @@ def test_formatting():
 Here is a code sample:
 
 `import trafilatura`"""
-    my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\n    trafilatura.extract("")</pre></article></body></html>')
+    my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\ntrafilatura.extract("")</pre></article></body></html>')
     my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
     print(my_result)
     assert my_result == """### Title
@@ -419,6 +419,18 @@ def test_formatting():
     my_result = extract(my_document, output_format='xml', include_links=True, config=ZERO_CONFIG)
     assert '<item>Number <ref target="test.html">2</ref></item>' in my_result
 
+    my_document = html.fromstring("""<html><body><article>
+        <ul>
+            <li>Number 0</li>
+            <li>Number <a href="test.html">1</a></li>
+            <li><a href="test.html">Number 2</a> n2</li>
+            <li>Number 3</li>
+            <li><p>Number 4</p> n4</li>
+        </ul>
+        Test</article></body></html>
+    """)
+    my_result = extract(my_document, output_format='markdown', include_links=True, config=ZERO_CONFIG)
+    assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html)n2\n- Number 3\n- Number 4 n4\n\nTest'
     # XML and Markdown formatting within <p>-tag
     my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
     my_result = extract(copy(my_document), fast=True, include_formatting=False, config=ZERO_CONFIG)
@@ -454,6 +466,27 @@ def test_formatting():
     my_result = extract(my_document, output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
     assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result
 
+    my_document = html.fromstring("""
+    <html><head><body><article>python code below:
+<pre><code>
+def test:
+    print('hello')
+    print('world')
+    </code></pre>
+    </article></body></html> 
+    """)
+    my_result = extract(my_document, output_format='markdown', include_formatting=True)
+    assert "python code below:\n```\ndef test:\n    print('hello')\n    print('world')\n    \n```" == my_result
+
+    my_result = extract(my_document, output_format='markdown', include_formatting=True)
+    assert """python code below:
+```
+def test:
+    print('hello')
+    print('world')
+    
+```""" == my_result
+
 
 def test_extract_with_metadata():
     '''Test extract_with_metadata method'''
@@ -1278,7 +1311,7 @@ def test_table_processing():
                  </article></body></html>
                  """
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert result == "| a | b | c |\n| a | b c | |"
+    assert result == "| a | b | c | \n| a | b c | |"
 
     htmlstring = """
                  <html><body><article>
@@ -1296,7 +1329,7 @@ def test_table_processing():
                  </article></body></html>
                  """
     result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
-    assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"
+    assert result == "| a | b | c | \n| a | b c | |\n| a | b c | |"
 
     htmlstring = """
                  <html><body><article>
@@ -1312,7 +1345,7 @@ def test_table_processing():
                  """
     result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
                      include_images=True, include_tables=True)
-    assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
+    assert result == "| a | b | c | \n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
 
     htmlstring = """
                  <html><body><article>
@@ -1328,7 +1361,7 @@ def test_table_processing():
                  """
     result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
                      include_images=True, include_tables=True)
-    assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
+    assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"
 
     htmlstring = """
                  <html><body><article>
@@ -1344,7 +1377,7 @@ def test_table_processing():
                  """
     result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
                      include_images=True, include_tables=True)
-    assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
+    assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"
 
     htmlstring = """
                  <html><body><article>
@@ -1360,7 +1393,7 @@ def test_table_processing():
                  """
     result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
                      include_images=True, include_tables=True)
-    assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
+    assert result == "| a | b | c | \n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
 
 
 def test_list_processing():

diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py
@@ -486,7 +486,7 @@ def test_replace_element_text():
     elem = Element("item")
     elem.text = "Test text"
     elem.tag = "item"
-    assert replace_element_text(elem, True) == "- Test text\n"
+    assert replace_element_text(elem, True) == "- Test text"
 
     elem = Element("ref")
     elem.text = "Link"

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -93,7 +93,8 @@ def determine_returnstring(document: Document, options: Extractor) -> str:
             header = ""
         returnstring = f"{header}{xmltotxt(document.body, options.formatting)}"
         if document.commentsbody is not None:
-            returnstring = f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
+            returnstring = \
+                f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
     # normalize Unicode format (defaults to NFC)
     return normalize_unicode(returnstring)
 

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -21,7 +21,7 @@
 
 from functools import lru_cache
 from itertools import islice
-from typing import Any, List, Literal, Optional, Tuple, Union
+from typing import Any, cast, List, Literal, Optional, Tuple, Union
 from unicodedata import normalize
 
 # response compression
@@ -464,4 +464,72 @@ def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:
 
 def is_in_table_cell(elem: _Element) -> bool:
     '''Check whether an element is in a table cell'''
-    return bool(elem.xpath('//ancestor::cell'))
+    # return elem.getparent() is not None and bool(elem.xpath('//ancestor::cell'))
+    if elem.getparent() is None:
+        return False
+    current: Optional[_Element] = elem
+    while current is not None:
+        if current.tag == 'cell':
+            return True
+        current = current.getparent()
+    return False
+
+
+def is_last_element_in_cell(elem: _Element) -> bool:
+    '''Check whether an element inside a table cell is the last child of its parent (for a cell itself: whether it has no children)'''
+    if not is_in_table_cell(elem): # shortcut
+        return False
+
+    if elem.tag == "cell":
+        children = elem.getchildren()
+        return not children or children[-1] == elem
+    else:
+        parent = cast(_Element, elem.getparent())
+        children = parent.getchildren()
+        return not children or children[-1] == elem
+
+
+def is_element_in_item(element: _Element) -> bool:
+    """Check whether an element is a list item or within a list item"""
+    current: Optional[_Element] = element
+    while current is not None:
+        if current.tag == 'item':
+            return True
+        current = current.getparent()
+    return False
+
+
+def is_first_element_in_item(element: _Element) -> bool:
+    """Check whether an element is the first element in list item"""
+    if element.tag == 'item' and element.text:
+        return True
+
+    current: Optional[_Element] = element
+    item_ancestor = None
+    while current is not None:
+        if current.tag == 'item':
+            item_ancestor = current
+            break
+        current = current.getparent()
+
+    if item_ancestor is None:
+        return False
+    elif not item_ancestor.text:
+        return True
+    return False
+
+
+def is_last_element_in_item(element: _Element) -> bool:
+    """Check whether an element is the last element in list item"""
+    if not is_element_in_item(element):
+        return False
+
+    # pure text only in list item
+    if element.tag == 'item':
+        return len(element.getchildren()) == 0
+    # element within list item
+    next_element = element.getnext()
+    if next_element is None:
+        return True
+    else:
+        return next_element.tag == 'item'
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -17,7 +17,8 @@
                         fromstring, tostring, DTD)
 
 from .settings import Document, Extractor
-from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test
+from .utils import is_element_in_item, is_first_element_in_item, is_in_table_cell, is_last_element_in_cell, \
+    is_last_element_in_item, sanitize, sanitize_tree, text_chars_test
 
 
 LOGGER = logging.getLogger(__name__)
@@ -35,7 +36,7 @@
 CONTROL_PARSER = XMLParser(remove_blank_text=True)
 
 NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
-SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref'}
+SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref', 'item', 'cell'}
 WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
 NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}
 
@@ -251,11 +252,13 @@ def validate_tei(xmldoc: _Element) -> bool:
 
 
 def replace_element_text(element: _Element, include_formatting: bool) -> str:
+    """Determine element text based on just the text of the element. One must deal with the tail separately."""
     elem_text = element.text or ""
-    "Determine element text based on just the text of the element. One must deal with the tail separately."
     # handle formatting: convert to markdown
     if include_formatting and element.text:
-        if element.tag == "head":
+        if element.tag in ('article', 'list', 'table'):
+            elem_text = elem_text.strip()
+        elif element.tag == "head":
             try:
                 number = int(element.get("rend")[1])  # type: ignore[index]
             except (TypeError, ValueError):
@@ -289,14 +292,16 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
         else:
             LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
     # cells
-    if element.tag == "cell":
+    if element.tag == 'cell':
         elem_text = elem_text.strip()
 
-        if elem_text:
+        if elem_text and not is_last_element_in_cell(element):
             elem_text = f"{elem_text} "
-    # lists
-    elif element.tag == "item" and elem_text:
-        elem_text = f"- {elem_text}\n"
+
+    # within lists
+    if is_first_element_in_item(element) and not is_in_table_cell(element):
+        elem_text = f"- {elem_text}"
+
     return elem_text
 
 
@@ -344,25 +349,31 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
                     returnlist.append(f'\n|{"---|" * max_span}\n')
             else:
                 returnlist.append("\n")
-        elif element.tag != "cell":
+        elif element.tag != "cell" and element.tag != 'item':
             # cells still need to append vertical bars
             # but nothing more to do with other textless elements
             return
 
     # Process text
 
     # Common elements (Now processes end-tag logic correctly)
-    if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
+    if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell") and not is_element_in_item(element):
         # spacing hack
         returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
     elif element.tag == "cell":
         returnlist.append(" | ")
-    elif element.tag not in SPECIAL_FORMATTING:
+    elif element.tag not in SPECIAL_FORMATTING and not is_last_element_in_cell(element): #  and not is_in_table_cell(element)
         returnlist.append(" ")
 
     # this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
-    if element.tail and not is_in_table_cell(element):
-        returnlist.append(element.tail)
+    # unless it's within a list item or a table
+    is_in_cell = is_in_table_cell(element)
+    if element.tail and not is_in_cell:
+        returnlist.append(element.tail.strip() if is_element_in_item(element) or element.tag=='list' else element.tail)
+
+    # outside table cells, end the list item with a newline once its last element has been processed
+    if is_last_element_in_item(element) and not is_in_cell:
+        returnlist.append('\n')
 
 
 def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
@@ -374,7 +385,7 @@ def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
 
     process_element(xmloutput, returnlist, include_formatting)
 
-    return unescape(sanitize("".join(returnlist)) or "")
+    return unescape(sanitize("".join(returnlist), True) or "")
 
 
 def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str: