Fix memory leak

VikParuchuri · Feb 19, 2025 · 9c71302
1 parent d95bffd
commit 9c71302
Show file tree

Hide file tree

Showing 5 changed files with 3 additions and 7 deletions.
diff --git a/marker/converters/pdf.py b/marker/converters/pdf.py
@@ -135,7 +135,6 @@ def __init__(
         if self.use_llm:
             self.layout_builder_class = LLMLayoutBuilder
 
-    @cache
     def build_document(self, filepath: str):
         provider_cls = provider_from_filepath(filepath)
         layout_builder = self.resolve_dependencies(self.layout_builder_class)

diff --git a/marker/converters/table.py b/marker/converters/table.py
@@ -25,7 +25,6 @@ class TableConverter(PdfConverter):
     )
     converter_block_types: List[BlockTypes] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents)
 
-    @cache
     def build_document(self, filepath: str):
         provider_cls = provider_from_filepath(filepath)
         layout_builder = self.resolve_dependencies(self.layout_builder_class)

diff --git a/marker/processors/llm/llm_text.py b/marker/processors/llm/llm_text.py
@@ -21,7 +21,7 @@ class LLMTextProcessor(BaseLLMSimpleBlockProcessor):
 
     block_types = (BlockTypes.Line,)
     image_remove_blocks = (BlockTypes.Equation,)
-    text_math_rewriting_prompt = """You are a text correction expert specializing in accurately reproducing text from images.
+    text_math_rewriting_prompt = r"""You are a text correction expert specializing in accurately reproducing text from images.
 You will receive an image of a text block and a set of extracted lines corresponding to the text in the image.
 Your task is to correct any errors in the extracted lines, including math, formatting, and other inaccuracies, and output the corrected lines in a JSON format.
 The number of output lines MUST match the number of input lines.  Stay as faithful to the original text as possible.

diff --git a/marker/providers/pdf.py b/marker/providers/pdf.py
@@ -87,14 +87,11 @@ def __init__(self, filepath: str, config=None):
             assert max(self.page_range) < len(doc) and min(self.page_range) >= 0, \
                 f"Invalid page range, values must be between 0 and {len(doc) - 1}.  Min of provided page range is {min(self.page_range)} and max is {max(self.page_range)}."
 
-            self.page_bboxes = {i: doc[i].get_bbox() for i in self.page_range}
-            """
             if self.force_ocr:
                 # Manually assign page bboxes, since we can't get them from pdftext
                 self.page_bboxes = {i: doc[i].get_bbox() for i in self.page_range}
             else:
                 self.page_lines = self.pdftext_extraction(doc)
-            """
 
     @contextlib.contextmanager
     def get_doc(self):

diff --git a/marker/scripts/convert.py b/marker/scripts/convert.py
@@ -88,6 +88,7 @@ def process_single_pdf(args):
 @click.option("--workers", type=int, default=5, help="Number of worker processes to use.")
 @click.option("--skip_existing", is_flag=True, default=False, help="Skip existing converted files.")
 @click.option("--debug_print", is_flag=True, default=False, help="Print debug information.")
+@click.option("--max_tasks_per_worker", type=int, default=10, help="Maximum number of tasks per worker process.")
 @ConfigParser.common_options
 def convert_cli(in_folder: str, **kwargs):
     in_folder = os.path.abspath(in_folder)
@@ -125,7 +126,7 @@ def convert_cli(in_folder: str, **kwargs):
     print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
     task_args = [(f, kwargs) for f in files_to_convert]
 
-    with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,), maxtasksperchild=1) as pool:
+    with mp.Pool(processes=total_processes, initializer=worker_init, initargs=(model_dict,), maxtasksperchild=kwargs["max_tasks_per_worker"]) as pool:
         pbar = tqdm(total=len(task_args), desc="Processing PDFs", unit="pdf")
         for _ in pool.imap_unordered(process_single_pdf, task_args):
             pbar.update(1)