From d3101437bc78ece36a11a3828241a0a0700ac502 Mon Sep 17 00:00:00 2001
From: bosd <5e2fd43-d292-4c90-9d1f-74ff3436329a@anonaddy.me>
Date: Sat, 23 May 2026 12:55:37 +0200
Subject: [PATCH] Add textract text-extraction backend

textract 2.0.0 (MIT) wraps pdftotext for PDFs. Adds a textract_get_text
adapter, the Library entry, and the requirement.
---
 benchmark.py                  | 10 ++++++++++
 pdf_benchmark/library_code.py | 21 +++++++++++++++++++++
 requirements/main.in          |  1 +
 3 files changed, 32 insertions(+)

diff --git a/benchmark.py b/benchmark.py
index 3d71e22..bb2398f 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -36,6 +36,7 @@ pypdf_get_text, pypdf_image_extraction,
     pypdf_watermarking,
     tika_get_text,
     pdfium_image_extraction,
+    textract_get_text,
 )
 from pdf_benchmark.output import write_benchmark_report
 from pdf_benchmark.score import get_text_extraction_score
@@ -218,6 +219,15 @@ def write_single_result(
             last_release_date="-",
             license="GPL",
         ),
+        "textract": Library(
+            "textract",
+            "textract",
+            "https://pypi.org/project/textract/",
+            text_extraction_function=textract_get_text,
+            version="2.0.0",
+            license="MIT",
+            last_release_date="2026-04-27",
+        ),
         # "borb": Library(
         #     "Borb",
         #     "borb",
diff --git a/pdf_benchmark/library_code.py b/pdf_benchmark/library_code.py
index 32938c2..62a9d12 100644
--- a/pdf_benchmark/library_code.py
+++ b/pdf_benchmark/library_code.py
@@ -225,6 +225,27 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
     return out_buffer.read()
 
 
+def textract_get_text(data: bytes) -> str:
+    """Extract text from PDF bytes using textract.
+
+    textract.process() is given a file path, so the data is written to a
+    temporary ".pdf" file that is removed afterwards, even on failure.
+    Undecodable bytes in the output are replaced rather than raising.
+    """
+    import textract
+
+    fd, filename = tempfile.mkstemp(suffix=".pdf")
+    try:
+        # Write through the descriptor mkstemp already opened; the with
+        # block closes it before textract reads the file.
+        with os.fdopen(fd, "wb") as fp:
+            fp.write(data)
+        raw = textract.process(filename)
+    finally:
+        os.remove(filename)
+    return raw.decode("utf-8", errors="replace")
+
+
 def tika_get_text(data: bytes) -> str:
     from tika import parser
 
diff --git a/requirements/main.in b/requirements/main.in
index d277654..1e1b09a 100644
--- a/requirements/main.in
+++ b/requirements/main.in
@@ -13,3 +13,4 @@ pymupdf
 pypdfium2
 pdfrw
 lxml
+textract