Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 13 additions & 3 deletions pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,8 @@ def to_markdown(
show_progress=False,
use_glyphs=False,
ignore_alpha=False,
min_table_rows=2,
min_table_cols=2,
) -> str:
"""Process the document and return the text of the selected pages.

Expand All @@ -354,8 +356,16 @@ def to_markdown(
show_progress: (bool, False) print progress as each page is processed.
use_glyphs: (bool, False) replace the Invalid Unicode by glyph numbers.
ignore_alpha: (bool, False) ignore text with alpha = 0 (transparent).
min_table_rows: (int, 2) minimum number of rows for a table to be included.
min_table_cols: (int, 2) minimum number of columns for a table to be included.

"""
# Validate min_table_rows and min_table_cols parameters
if not isinstance(min_table_rows, int) or min_table_rows < 1:
raise ValueError("min_table_rows must be a positive integer (>= 1)")
if not isinstance(min_table_cols, int) or min_table_cols < 1:
raise ValueError("min_table_cols must be a positive integer (>= 1)")

if write_images is False and embed_images is False and force_text is False:
raise ValueError("Image and text on images cannot both be suppressed.")
if embed_images is True:
Expand Down Expand Up @@ -929,7 +939,7 @@ def sort_words(words: list) -> list:
return nwords

def get_page_output(
doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS
doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS, min_table_rows, min_table_cols
):
"""Process one page.

Expand Down Expand Up @@ -1036,7 +1046,7 @@ def get_page_output(
tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
for t in tabs.tables:
# remove tables with too few rows or columns
if t.row_count < 2 or t.col_count < 2:
if t.row_count < min_table_rows or t.col_count < min_table_cols:
omitted_table_rects.append(pymupdf.Rect(t.bbox))
continue
parms.tabs.append(t)
Expand Down Expand Up @@ -1200,7 +1210,7 @@ def get_page_output(
pages = ProgressBar(pages)
for pno in pages:
parms = get_page_output(
doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS
doc, pno, margins, textflags, FILENAME, IGNORE_IMAGES, IGNORE_GRAPHICS, min_table_rows, min_table_cols
)
if page_chunks is False:
document_output += parms.md_string
Expand Down