py-pdf · bosd · May 23, 2026
diff --git a/benchmark.py b/benchmark.py
@@ -36,6 +36,7 @@
     pypdf_get_text,
     pypdf_image_extraction,
     pypdf_watermarking, tika_get_text, pdfium_image_extraction,
+    textract_get_text,
 )
 from pdf_benchmark.output import write_benchmark_report
 from pdf_benchmark.score import get_text_extraction_score
@@ -218,6 +219,15 @@ def write_single_result(
             last_release_date="-",
             license="GPL",
         ),
+        "textract": Library(
+            "textract",
+            "textract",
+            "https://pypi.org/project/textract/",
+            text_extraction_function=textract_get_text,
+            version="2.0.0",
+            license="MIT",
+            last_release_date="2026-04-27",
+        ),
         # "borb": Library(
         #     "Borb",
         #     "borb",

diff --git a/pdf_benchmark/library_code.py b/pdf_benchmark/library_code.py
@@ -225,6 +225,19 @@ def pdfrw_watermarking(watermark_data: bytes, data: bytes) -> bytes:
     return out_buffer.read()
 
 
+def textract_get_text(data: bytes) -> str:  # Return the text that textract extracts from in-memory PDF bytes.
+    import textract  # function-local import, same pattern as tika_get_text below
+    new_file, filename = tempfile.mkstemp(suffix=".pdf")  # (open OS-level fd, path); textract.process is given a path, so the bytes go through a temp file
+    try:
+        with open(filename, "wb") as fp:  # written via the path; the mkstemp fd itself is not used for I/O
+            fp.write(data)
+        text = textract.process(filename).decode("utf-8", errors="replace")  # undecodable bytes are replaced instead of raising
+    finally:  # cleanup runs whether or not writing/extraction raised
+        os.close(new_file)  # release the descriptor returned by mkstemp
+        os.remove(filename)  # delete the temp file
+    return text
+
+
 def tika_get_text(data: bytes) -> str:
     from tika import parser
 

diff --git a/requirements/main.in b/requirements/main.in
@@ -13,3 +13,4 @@ pymupdf
 pypdfium2
 pdfrw
 lxml
+textract
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ pymupdf @@
     pypdfium2
     pdfrw
     lxml
+    textract