Fix formatting on Foodista (#93)
Currently, the content of the step-number div for instructions doesn't
have any spaces around the text itself, so bs4's .get_text would smash
everything together. This fixes that for better formatting.
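
For illustration, a minimal sketch of the behavior described above (not part of the commit; the HTML snippet is made up, but it mirrors Foodista's step-number markup):

import bs4

html = '<div><div class="step-number">1</div>Preheat the oven to 350F.</div>'
soup = bs4.BeautifulSoup(html, "html.parser")

# Without any spacing, .get_text() runs the step number straight into the text.
print(soup.get_text())  # -> 1Preheat the oven to 350F.

# Padding the step-number string, as the updated clean_text() does, keeps the
# number separated from the instruction that follows it.
for step in soup.find_all("div", class_="step-number"):
    step.string = f"{step.string} " if step.string == "1" else f" {step.string} "
print(soup.get_text())  # -> 1 Preheat the oven to 350F.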

It also filters out some documents that didn't have any text in them.
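
A second illustrative sketch for that filtering (again not from the repository; clean_section is a hypothetical stand-in for the updated clean_text, and the snippet is invented): once header-only sections are treated as empty, a document whose cleaned sections contain no text at all is dropped, which the new code signals by returning None from parse_page.

import bs4

SECTION_HEADERS = ("Tools", "Ingredients", "Preparation", "About", "Information")

def clean_section(tag) -> str:
    # Blocks that contain nothing but a bare section header count as empty.
    text = tag.get_text().strip()
    return "" if text in SECTION_HEADERS else text

soup = bs4.BeautifulSoup(
    '<div class="pane-node-field-tools">Tools</div><div class="pane-node-body"></div>',
    "html.parser",
)
cleaned = [clean_section(div) for div in soup.find_all("div")]
if not any(cleaned):
    print("document has no text; it would be filtered out")
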
blester125 authored Oct 18, 2024
1 parent d93d66e commit 98b768c
Showing 2 changed files with 47 additions and 30 deletions.
food/preprocess.py: 71 changes (43 additions, 28 deletions)
@@ -14,7 +14,7 @@
 import bs4
 import tqdm

-from licensed_pile import logs
+from licensed_pile import logs, utils
 from licensed_pile.write import ShardParallelProcessor

 parser = argparse.ArgumentParser(
@@ -47,10 +47,14 @@
     default=mp.cpu_count(),
     help="Number of processors for multicore.",
 )
+parser.add_argument(
+    "--meta",
+    help="Location to store Dolma Metadata information.",
+)

 # Dolma later sets the log level to error, need to override cls.get_logger() if
 # we want to see info methods.
-logs.configure_logging("dolma.FoodistaParallel")
+logs.configure_logging()


 class FoodistaParallel(ShardParallelProcessor):
@@ -67,7 +71,10 @@ def process_example(cls, example, **kwargs):

         html = example["text"]

-        text, authors, date = parse_page(html, example["id"])
+        with logger(id=example["id"]):
+            text, authors, date = parse_page(html)
+            if text is None:
+                return None

         example["text"] = text
         example["created"] = date
@@ -86,17 +93,22 @@ def clean_date(date: str) -> str:
     return date.strip()


-def clean_text(text: str) -> str:
-    text = text.strip()
+def clean_text(html) -> str:
+    for step in html.find_all("div", class_=("step-number")):
+        if step.string == "1":
+            step.string = f"{step.string} "
+        else:
+            step.string = f" {step.string} "
+    text = html.get_text().strip()
     # Remove when the text is only the section header.
-    if text in ("Tools", "Ingredients", "Preparation"):
+    if text in ("Tools", "Ingredients", "Preparation", "About", "Information"):
         return ""
     return text


-def parse_page(html, idx, include_user_id: bool = False):
+def parse_page(html, include_user_id: bool = False):
     """Convert a page's html to plain text for LLM training."""
-    logger = logs.get_logger("dolma.FoodistaParallel")
+    logger = logs.get_logger()
     soup = bs4.BeautifulSoup(html, "html.parser")

     result = []
@@ -107,35 +119,33 @@ def parse_page(html, idx, include_user_id: bool = False):
         title = title.get_text().strip()
         result.append(title)
     else:
-        logger.warning(f"Failed to find title for example: {idx}")
+        logger.warning("Failed to find title.")

     # Find the author's name (which is included in the text) and the user id (which
     # will be included in the metadata).
     if author := soup.find("div", class_="pane-node-author"):
         if user_id := author.find("a", class_="username"):
             user_id = user_id.get("href")
         else:
-            logger.warning(
-                f"Failed to find the user_id for the author in example: {idx}"
-            )
+            logger.warning("Failed to find the user_id for the author.")
         author = clean_author(author.get_text()).strip()
         result.append(f"By: {author}")
         if include_user_id:
             authors.append((author, user_id))
         else:
             authors.append(author)
     else:
-        logger.warning(f"Failed to find author for example: {idx}")
+        logger.warning("Failed to find author.")

     # Find the date it was published.
     if date := soup.find("div", class_="pane-node-created"):
         date = clean_date(date.get_text()).strip()
         result.append(f"Published: {date}")
     else:
-        logger.warning(f"Failed to find date for example: {idx}")
+        logger.warning("Failed to find date.")

     # Find the text of the page.
-    if text := soup.find_all(
+    if text_tags := soup.find_all(
         "div",
         class_=(
             "pane-node-body",
@@ -145,13 +155,18 @@ def parse_page(html, idx, include_user_id: bool = False):
             "pane-node-field-about",
         ),
     ):
+        text = []
         # Create an empty line between the header and the body text.
-        for t in text:
-            t = clean_text(t.get_text()).strip()
+        for t in text_tags:
+            t = clean_text(t).strip()
             if t:
-                result.append(f"\n{t}")
+                text.append(f"\n{t}")
+        if not text:
+            logger.warning(f"Cleaned text was empty.")
+            return None, None, None
+        result.extend(text)
     else:
-        logger.warning(f"Failed to find text for example: {idx}")
+        logger.warning("Failed to find text for example.")

     # Collect all comments first as we may filter them out later.
     comments = []
@@ -177,16 +192,16 @@ def parse_page(html, idx, include_user_id: bool = False):
                 user_id = comment_author["href"]
                 comment_author = comment_author.get_text().strip()
             else:
-                logger.warning(f"Failed to find comment author in example: {idx}")
+                logger.warning(f"Failed to find comment author.")
             # The date follows a <br> which bs4 wraps in <br>...</br>
             if comment_date := comment_submitted.find("br"):
                 comment_date = comment_date.get_text().strip()
             else:
-                logger.warning(f"Failed to find comment date in example: {idx}")
+                logger.warning(f"Failed to find comment date.")
             if comment_text := comment.find("div", class_="content"):
                 comment_text = comment_text.get_text().strip()
             else:
-                logger.warning(f"Failed to find comment text in example: {idx}")
+                logger.warning(f"Failed to find comment text.")

             # Some comments seems to be snippets from other sites, ignore those
             if comment_text.startswith("[...]") or comment_text.endswith("[...]"):
@@ -201,7 +216,7 @@ def parse_page(html, idx, include_user_id: bool = False):
             )
     else:
         # Not all articles have comments, so we don't call this an error.
-        logger.info(f"Didn't find comments for example: {idx}")
+        logger.info(f"Didn't find comments.")

     # Add non-filtered comments into the text.
     if comments:
@@ -230,17 +245,17 @@ def parse_date(date: str) -> datetime.datetime:
             return datetime.datetime.strptime(date, fmt).isoformat()
         except:
             pass
-    logger = logs.get_logger("dolma.FoodistaParallel")
+    logger = logs.get_logger()
     logger.warning(f"Filed to parse date: {date}")
     return date


 def main(args):
-    with TemporaryDirectory() as tempdir:
+    with utils.maybe_temp_dir(path=args.meta) as meta_dir:
         processor = FoodistaParallel(
-            source_prefix=os.path.join(args.input, "documents", "*.jsonl.gz"),
-            destination_prefix=os.path.join(args.output, "documents"),
-            metadata_prefix=tempdir,
+            source_prefix=utils.dolma_input(args.input, "*.jsonl.gz"),
+            destination_prefix=utils.dolma_output(args.output),
+            metadata_prefix=meta_dir,
             num_processes=args.processes,
         )
         processor(debug=args.debug)
licensed_pile/utils.py: 6 changes (4 additions, 2 deletions)
@@ -28,8 +28,10 @@ def removesuffix(s: str, suffix: str) -> str:
 def dolma_input(input_path: str, filepattern: str = "*.jsonl.gz") -> str:
     # If the input is directly to a file, or it is a glob that returns matches,
     # use as is.
-    if (os.path.exists(input_path) and os.path.isfile(input_path)) or glob.glob(
-        input_path, recursive=True
+    if (
+        (os.path.exists(input_path) and os.path.isfile(input_path))
+        or not os.path.isdir(input_path)
+        and glob.glob(input_path, recursive=True)
     ):
         return input_path
     # Otherwise it is probably meant as a directory, so add the ../documents/${filepattern}
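
One reading note on the reworked condition (an observation, not part of the change): in Python, "and" binds more tightly than "or", so the new check groups as shown in the sketch below. The helper name is only for illustration.

import glob
import os

def is_direct_input(input_path: str) -> bool:
    # Equivalent grouping of the updated dolma_input() check: use the path
    # as-is when it is an existing file, or when it is not a directory and
    # expands to at least one glob match.
    return (os.path.exists(input_path) and os.path.isfile(input_path)) or (
        not os.path.isdir(input_path)
        and bool(glob.glob(input_path, recursive=True))
    )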
