Skip to content

Commit

Permalink
extraction: improve spacing in item, cell and code blocks (#772)
Browse files Browse the repository at this point in the history
* add preserve_space option to keep original format in code blocks

* fix #769

* fix ut

* fix type check

* set preserve_space as default option

* fix xml_tei_tests

* fix realworld_tests

* set preserve_space default to True and refine list extraction

* improve performance by replacing xpath search with iterative search

* improve performance and move item-related checks to utils

* minor fixes

---------

Co-authored-by: CodyInnowhere <[email protected]>
  • Loading branch information
unsleepy22 and CodyInnowhere authored Feb 17, 2025
1 parent 139dfd6 commit 729b737
Show file tree
Hide file tree
Showing 5 changed files with 141 additions and 28 deletions.
51 changes: 42 additions & 9 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,10 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
assert '<hi rend="#b">This here is in bold font.</hi>' in my_result
# titles as markdown
my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>'
my_string = '<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b>Non-bold here</p></article></body></html>'
my_document = html.fromstring(my_string)
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == '### Title\n\n**This here is in bold font.**'
assert my_result == '### Title\n\n**This here is in bold font.**Non-bold here'
assert extract(my_string, output_format='markdown', config=ZERO_CONFIG) == my_result
assert '<hi rend="#b">' in etree.tostring(bare_extraction(my_string, output_format='markdown', config=ZERO_CONFIG).body, encoding="unicode")

Expand All @@ -354,7 +354,7 @@ def test_formatting():
Here is a code sample:
`import trafilatura`"""
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\n trafilatura.extract("")</pre></article></body></html>')
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code><span>import</span> <span>something</span><br/>something.run("somewhere")</code><p>Sometimes code is wrapped using <code>pre</code> and <code>code</code>:</p><pre><code>import trafilatura\ntrafilatura.extract("")</code></pre><p>Less often code is wrapped using just <code>pre</code>:</p><pre>\ntrafilatura.extract("")</pre></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
print(my_result)
assert my_result == """### Title
Expand Down Expand Up @@ -419,6 +419,18 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', include_links=True, config=ZERO_CONFIG)
assert '<item>Number <ref target="test.html">2</ref></item>' in my_result

my_document = html.fromstring("""<html><body><article>
<ul>
<li>Number 0</li>
<li>Number <a href="test.html">1</a></li>
<li><a href="test.html">Number 2</a> n2</li>
<li>Number 3</li>
<li><p>Number 4</p> n4</li>
</ul>
Test</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_links=True, config=ZERO_CONFIG)
assert my_result == '- Number 0\n- Number [1](test.html)\n- [Number 2](test.html)n2\n- Number 3\n- Number 4 n4\n\nTest'
# XML and Markdown formatting within <p>-tag
my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
my_result = extract(copy(my_document), fast=True, include_formatting=False, config=ZERO_CONFIG)
Expand Down Expand Up @@ -454,6 +466,27 @@ def test_formatting():
my_result = extract(my_document, output_format='xml', fast=True, include_formatting=True, config=ZERO_CONFIG)
assert '<head rend="h4">1) The <code>in</code> Operator</head>' in my_result and '<p>The easiest way to check if a Python string contains a substring is to use the <code>in</code> operator. The <code>in</code> operator is used to check data structures for membership in Python. It returns a Boolean (either <code>True</code> or <code>False</code>) and can be used as follows:</p>' in my_result

my_document = html.fromstring("""
<html><head><body><article>python code below:
<pre><code>
def test:
print('hello')
print('world')
</code></pre>
</article></body></html>
""")
my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert "python code below:\n```\ndef test:\n print('hello')\n print('world')\n \n```" == my_result

my_result = extract(my_document, output_format='markdown', include_formatting=True)
assert """python code below:
```
def test:
print('hello')
print('world')
```""" == my_result


def test_extract_with_metadata():
'''Test extract_with_metadata method'''
Expand Down Expand Up @@ -1278,7 +1311,7 @@ def test_table_processing():
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |"
assert result == "| a | b | c | \n| a | b c | |"

htmlstring = """
<html><body><article>
Expand All @@ -1296,7 +1329,7 @@ def test_table_processing():
</article></body></html>
"""
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "| a | b | c |\n| a | b c | |\n| a | b c | |"
assert result == "| a | b | c | \n| a | b c | |\n| a | b c | |"

htmlstring = """
<html><body><article>
Expand All @@ -1312,7 +1345,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| a ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| a ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1328,7 +1361,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1344,7 +1377,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img](http://aa.bb/c.jpg) a | b c | d |"
assert result == "| a | b | c | \n| ![img](http://aa.bb/c.jpg) a | b c | d |"

htmlstring = """
<html><body><article>
Expand All @@ -1360,7 +1393,7 @@ def test_table_processing():
"""
result = extract(htmlstring, fast=True, output_format='markdown', config=ZERO_CONFIG,
include_images=True, include_tables=True)
assert result == "| a | b | c |\n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"
assert result == "| a | b | c | \n| ![img1](http://aa.bb/c.jpg) a ![img2](http://aa.bb/c.jpg) | b c | d |"


def test_list_processing():
Expand Down
2 changes: 1 addition & 1 deletion tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ def test_replace_element_text():
elem = Element("item")
elem.text = "Test text"
elem.tag = "item"
assert replace_element_text(elem, True) == "- Test text\n"
assert replace_element_text(elem, True) == "- Test text"

elem = Element("ref")
elem.text = "Link"
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def determine_returnstring(document: Document, options: Extractor) -> str:
header = ""
returnstring = f"{header}{xmltotxt(document.body, options.formatting)}"
if document.commentsbody is not None:
returnstring = f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
returnstring = \
f"{returnstring}\n{xmltotxt(document.commentsbody, options.formatting)}".strip()
# normalize Unicode format (defaults to NFC)
return normalize_unicode(returnstring)

Expand Down
72 changes: 70 additions & 2 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

from functools import lru_cache
from itertools import islice
from typing import Any, List, Literal, Optional, Tuple, Union
from typing import Any, cast, List, Literal, Optional, Tuple, Union
from unicodedata import normalize

# response compression
Expand Down Expand Up @@ -464,4 +464,72 @@ def copy_attributes(dest_elem: _Element, src_elem: _Element) -> None:

def is_in_table_cell(elem: _Element) -> bool:
    """Check whether an element is a table cell or is nested inside one.

    The element itself and then each of its ancestors are inspected for the
    tag ``cell``. An element without a parent is never reported as being in
    a cell, even if its own tag is ``cell``.

    Args:
        elem: The element whose position in the tree is examined.

    Returns:
        True if the element has a parent and either it or one of its
        ancestors carries the tag ``cell``, False otherwise.
    """
    # The ancestors are walked explicitly instead of evaluating the XPath
    # '//ancestor::cell': a path starting with '//' is resolved from the
    # document root, so it matches cells anywhere in the document rather
    # than only those enclosing `elem`.
    if elem.getparent() is None:
        return False
    current: Optional[_Element] = elem
    while current is not None:
        if current.tag == 'cell':
            return True
        current = current.getparent()
    return False


def is_last_element_in_cell(elem: _Element) -> bool:
    """Check whether an element is the last element in a table cell.

    Args:
        elem: The element to examine.

    Returns:
        False if ``is_in_table_cell(elem)`` is false. For a ``cell`` element,
        True only when it has no children. For any other element, True when
        it is the last child of its direct parent (the parent is not required
        to be the cell itself).
    """
    if not is_in_table_cell(elem):  # shortcut
        return False

    if elem.tag == "cell":
        # a cell cannot be its own child, so only a childless cell qualifies
        return len(elem) == 0

    # is_in_table_cell() only succeeds for elements that have a parent,
    # which therefore has at least one child (elem itself)
    parent = cast(_Element, elem.getparent())
    return parent[-1] == elem


def is_element_in_item(element: _Element) -> bool:
    """Tell whether the element is a list item itself or sits below one.

    Starting at the element, the parent chain is followed until a node
    tagged ``item`` is met or the chain ends.
    """
    node: Optional[_Element] = element
    # climb until an item is reached or there is no parent left
    while node is not None and node.tag != 'item':
        node = node.getparent()
    return node is not None


def is_first_element_in_item(element: _Element) -> bool:
    """Decide whether the element opens the content of a list item.

    An ``item`` element always qualifies. Any other element qualifies when
    its closest ``item`` ancestor carries no text of its own; elements
    outside of list items never qualify.
    """
    # NOTE(review): every descendant of a text-less item qualifies, not only
    # the leading one; confirm that callers expect this.
    if element.tag == 'item':
        return True

    ancestor: Optional[_Element] = element.getparent()
    while ancestor is not None:
        if ancestor.tag == 'item':
            # the item's own text, if any, comes before this element
            return not ancestor.text
        ancestor = ancestor.getparent()
    return False


def is_last_element_in_item(element: _Element) -> bool:
    """Check whether an element is the last element in a list item.

    Args:
        element: The element to examine.

    Returns:
        False if ``is_element_in_item(element)`` is false. For an ``item``
        element, True only when it has no children (pure text item). For any
        other element, True when it has no following sibling or when its
        following sibling is an ``item``.
    """
    if not is_element_in_item(element):
        return False

    # pure text only in list item
    if element.tag == 'item':
        return len(element) == 0

    # element within list item
    following = element.getnext()
    return following is None or following.tag == 'item'
41 changes: 26 additions & 15 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
fromstring, tostring, DTD)

from .settings import Document, Extractor
from .utils import is_in_table_cell, sanitize, sanitize_tree, text_chars_test
from .utils import is_element_in_item, is_first_element_in_item, is_in_table_cell, is_last_element_in_cell, \
is_last_element_in_item, sanitize, sanitize_tree, text_chars_test


LOGGER = logging.getLogger(__name__)
Expand All @@ -35,7 +36,7 @@
CONTROL_PARSER = XMLParser(remove_blank_text=True)

NEWLINE_ELEMS = {'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table'}
SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref'}
SPECIAL_FORMATTING = {'code', 'del', 'head', 'hi', 'ref', 'item', 'cell'}
WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}

Expand Down Expand Up @@ -251,11 +252,13 @@ def validate_tei(xmldoc: _Element) -> bool:


def replace_element_text(element: _Element, include_formatting: bool) -> str:
"""Determine element text based on just the text of the element. One must deal with the tail separately."""
elem_text = element.text or ""
"Determine element text based on just the text of the element. One must deal with the tail separately."
# handle formatting: convert to markdown
if include_formatting and element.text:
if element.tag == "head":
if element.tag in ('article', 'list', 'table'):
elem_text = elem_text.strip()
elif element.tag == "head":
try:
number = int(element.get("rend")[1]) # type: ignore[index]
except (TypeError, ValueError):
Expand Down Expand Up @@ -289,14 +292,16 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
else:
LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
# cells
if element.tag == "cell":
if element.tag == 'cell':
elem_text = elem_text.strip()

if elem_text:
if elem_text and not is_last_element_in_cell(element):
elem_text = f"{elem_text} "
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"

# within lists
if is_first_element_in_item(element) and not is_in_table_cell(element):
elem_text = f"- {elem_text}"

return elem_text


Expand Down Expand Up @@ -344,25 +349,31 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(f'\n|{"---|" * max_span}\n')
else:
returnlist.append("\n")
elif element.tag != "cell":
elif element.tag != "cell" and element.tag != 'item':
# cells still need to append vertical bars
# but nothing more to do with other textless elements
return

# Process text

# Common elements (Now processes end-tag logic correctly)
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell") and not is_element_in_item(element):
# spacing hack
returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
elif element.tag == "cell":
returnlist.append(" | ")
elif element.tag not in SPECIAL_FORMATTING:
elif element.tag not in SPECIAL_FORMATTING and not is_last_element_in_cell(element): # and not is_in_table_cell(element)
returnlist.append(" ")

# this is text that comes after the closing tag, so it should be after any NEWLINE_ELEMS
if element.tail and not is_in_table_cell(element):
returnlist.append(element.tail)
# unless it's within a list item or a table
is_in_cell = is_in_table_cell(element)
if element.tail and not is_in_cell:
returnlist.append(element.tail.strip() if is_element_in_item(element) or element.tag=='list' else element.tail)

# deal with list items alone
if is_last_element_in_item(element) and not is_in_cell:
returnlist.append('\n')


def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:
Expand All @@ -374,7 +385,7 @@ def xmltotxt(xmloutput: Optional[_Element], include_formatting: bool) -> str:

process_element(xmloutput, returnlist, include_formatting)

return unescape(sanitize("".join(returnlist)) or "")
return unescape(sanitize("".join(returnlist), True) or "")


def xmltocsv(document: Document, include_formatting: bool, *, delim: str = "\t", null: str = "null") -> str:
Expand Down

0 comments on commit 729b737

Please sign in to comment.