diff --git a/benchmark.py b/benchmark.py
index 3d71e22..bb2398f 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -36,6 +36,7 @@ pypdf_get_text, pypdf_image_extraction,
     pypdf_watermarking,
     tika_get_text,
     pdfium_image_extraction,
+    textract_get_text,
 )
 from pdf_benchmark.output import write_benchmark_report
 from pdf_benchmark.score import get_text_extraction_score
@@ -218,6 +219,15 @@ def write_single_result(
             last_release_date="-",
             license="GPL",
         ),
+        "textract": Library(
+            "textract",
+            "textract",
+            "https://pypi.org/project/textract/",
+            text_extraction_function=textract_get_text,
+            version="2.0.0",
+            license="MIT",
+            last_release_date="2026-04-27",
+        ),
         # "borb": Library(
         #     "Borb",
         #     "borb",
diff --git a/pdf_benchmark/library_code.py b/pdf_benchmark/library_code.py
index 32938c2..62a9d12 100644
--- a/pdf_benchmark/library_code.py
+++ b/pdf_benchmark/library_code.py
@@ -225,6 +225,30 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
     return out_buffer.read()
 
 
+def textract_get_text(data: bytes) -> str:
+    """Extract the text of an in-memory PDF with textract.
+
+    ``textract.process`` is called with a file path, so ``data`` is first
+    written to a temporary ``.pdf`` file, which is removed afterwards.
+
+    :param data: Raw bytes of the PDF document.
+    :return: The extracted text, decoded as UTF-8 with undecodable bytes
+        replaced.
+    """
+    import textract
+
+    fd, filename = tempfile.mkstemp(suffix=".pdf")
+    try:
+        # Reuse the descriptor mkstemp already opened instead of opening the
+        # path a second time; the file is closed before textract reads it.
+        with os.fdopen(fd, "wb") as fp:
+            fp.write(data)
+        text = textract.process(filename).decode("utf-8", errors="replace")
+    finally:
+        os.remove(filename)
+    return text
+
+
 def tika_get_text(data: bytes) -> str:
     from tika import parser
 
diff --git a/requirements/main.in b/requirements/main.in
index d277654..1e1b09a 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -13,3 +13,4 @@ pymupdf
 pypdfium2
 pdfrw
 lxml
+textract