Skip to content

Commit

Permalink
Fix memory leak
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Feb 19, 2025
1 parent d95bffd commit 9c71302
Show file tree
Hide file tree
Showing 5 changed files with 3 additions and 7 deletions.
1 change: 0 additions & 1 deletion marker/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ def __init__(
if self.use_llm:
self.layout_builder_class = LLMLayoutBuilder

-@cache
def build_document(self, filepath: str):
provider_cls = provider_from_filepath(filepath)
layout_builder = self.resolve_dependencies(self.layout_builder_class)
Expand Down
1 change: 0 additions & 1 deletion marker/converters/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class TableConverter(PdfConverter):
)
converter_block_types: List[BlockTypes] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents)

-@cache
def build_document(self, filepath: str):
provider_cls = provider_from_filepath(filepath)
layout_builder = self.resolve_dependencies(self.layout_builder_class)
Expand Down
2 changes: 1 addition & 1 deletion marker/processors/llm/llm_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):

block_types = (BlockTypes.Line,)
image_remove_blocks = (BlockTypes.Equation,)
-text_math_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
+text_math_rewriting_prompt = r"""You are a text correction expert specializing in accurately reproducing text from images.
You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
Your task is to correct any errors in the extracted lines, including math, formatting, and other inaccuracies, and output the corrected lines in a JSON format.
The number of output lines MUST match the number of input lines. Stay as faithful to the original text as possible.
Expand Down
3 changes: 0 additions & 3 deletions marker/providers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,11 @@ def __init__(self, filepath: str, config=None):
assert max(self.page_range) < len(doc) and min(self.page_range) >= 0, \
f"Invalid page range, values must be between 0 and {len(doc) - 1}. Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."

-self.page_bboxes = {i: doc[i].get_bbox() for i in self.page_range}
-"""
if self.force_ocr:
# Manually assign page bboxes, since we can't get them from pdftext
self.page_bboxes = {i: doc[i].get_bbox() for i in self.page_range}
else:
self.page_lines = self.pdftext_extraction(doc)
-"""

@contextlib.contextmanager
def get_doc(self):
Expand Down
3 changes: 2 additions & 1 deletion marker/scripts/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def process_single_pdf(args):
@click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
@click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.")
@click.option("--debug_print", is_flag=True, default=False, help="Print debug information.")
+@click.option("--max_tasks_per_worker", type=int, default=10, help="Maximum number of tasks per worker process.")
@ConfigParser.common_options
def convert_cli(in_folder: str, **kwargs):
in_folder = os.path.abspath(in_folder)
Expand Down Expand Up @@ -125,7 +126,7 @@ def convert_cli(in_folder: str, **kwargs):
print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
task_args = [(f, kwargs) for f in files_to_convert]

-with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,), maxtasksperchild=1) as pool:
+with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,), maxtasksperchild=kwargs["max_tasks_per_worker"]) as pool:
pbar = tqdm(total=len(task_args), desc="Processing PDFs", unit="pdf")
for _ in pool.imap_unordered(process_single_pdf, task_args):
pbar.update(1)
Expand Down

0 comments on commit 9c71302

Please sign in to comment.