From 0877e5e8ab7a7d687103247611dcb3623dbb4db0 Mon Sep 17 00:00:00 2001
From: JahnaviSingh2005 <Singhjahnavi031@gmail.com>
Date: Sun, 22 Mar 2026 10:33:45 +0530
Subject: [PATCH] Improve error messages across pipeline stages for better
 debugging

---
 src/glossapi/corpus/phase_clean.py    |  8 ++++----
 src/glossapi/corpus/phase_download.py |  6 +++---
 src/glossapi/corpus/phase_extract.py  |  7 ++++---
 src/glossapi/corpus/phase_ocr_math.py | 16 ++++++++--------
 src/glossapi/corpus/phase_sections.py |  6 +++---
 5 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py
index e5a4329..05f3505 100644
--- a/src/glossapi/corpus/phase_clean.py
+++ b/src/glossapi/corpus/phase_clean.py
@@ -52,7 +52,7 @@ def _load_rust_extension(self, module_name: str, manifest_relative: str):
             manifest = root_dir / manifest_relative
             if not manifest.exists():
                 raise RuntimeError(
-                    f"Cannot locate Cargo manifest for {module_name} at {manifest}"
+                    f"[Clean Phase] Cannot locate Cargo manifest for {module_name} at {manifest}"
                 )
             try:
                 subprocess.run(
@@ -74,7 +74,7 @@ def _load_rust_extension(self, module_name: str, manifest_relative: str):
                 return importlib.import_module(module_name)
             except Exception as build_err:
                 raise RuntimeError(
-                    f"Automatic build of {module_name} failed: {build_err}"
+                    f"[Clean Phase] Automatic build of {module_name} failed | Error: {build_err}"
                 )
 
     def _load_metrics_dataframe(
@@ -367,7 +367,7 @@ def finalize(self) -> None:
             # Do not abort the entire cleaning pass – proceed to evaluate gates
             # using existing metrics on disk. If the Rust report is available,
             # it will be merged below as usual.
-            self.logger.error("Rust cleaning pipeline failed (code=%s); proceeding with existing metrics", return_code)
+            self.logger.error("[Clean Phase] Rust cleaning pipeline failed (code=%s); proceeding with existing metrics", return_code)
 
         # ----- Parse metrics Parquet produced by Rust -----
         if report_parquet_path.exists():
@@ -385,7 +385,7 @@ def finalize(self) -> None:
                         }
                     )
             except Exception as e:
-                self.logger.warning("Failed to parse cleaning report %s: %s", report_parquet_path, e)
+                self.logger.warning("[Clean Phase] Failed to parse cleaning report %s | Error: %s", report_parquet_path, e)
         else:
             self.logger.warning("Cleaning report Parquet not found: %s", report_parquet_path)
 
diff --git a/src/glossapi/corpus/phase_download.py b/src/glossapi/corpus/phase_download.py
index c543076..6db19c0 100644
--- a/src/glossapi/corpus/phase_download.py
+++ b/src/glossapi/corpus/phase_download.py
@@ -58,7 +58,7 @@ def download(
         if input_parquet is None:
             parquet_files = list(self.input_dir.glob('*.parquet'))
             if not parquet_files:
-                raise ValueError(f"No parquet files found in {self.input_dir}")
+                raise ValueError(f"[Download Phase] No parquet files found in {self.input_dir}")
             input_parquet = parquet_files[0]
             self.logger.info(f"Using parquet file: {input_parquet}")
         else:
@@ -92,7 +92,7 @@ def download(
                 existing_results_path = specific_results_path
                 found_existing = True
             except Exception as e:
-                self.logger.warning(f"Failed to read specific download results: {e}")
+                self.logger.warning(f"[Download Phase] Failed to read specific download results: {e}")
         elif os.path.exists(partial_results_path):
             self.logger.info(f"Found partial download checkpoint: {partial_results_path}")
             try:
@@ -100,7 +100,7 @@ def download(
                 existing_results_path = partial_results_path
                 found_existing = True
             except Exception as e:
-                self.logger.warning(f"Failed to read partial results: {e}")
+                self.logger.warning(f"[Download Phase] Failed to read partial results: {e}")
 
         # If specific results not found, look in the directory for any download results
         if not found_existing and os.path.exists(download_results_dir):
diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py
index a748dcc..cb9adbd 100644
--- a/src/glossapi/corpus/phase_extract.py
+++ b/src/glossapi/corpus/phase_extract.py
@@ -66,7 +66,8 @@ def prime_extractor(
         try:
             setattr(self.extractor, "export_doc_json", bool(export_doc_json))
             setattr(self.extractor, "emit_formula_index", bool(emit_formula_index))
-        except Exception:
+        except Exception as e:
+            self.logger.debug(f"[Extract Phase] Failed to propagate extractor toggles: {e}")
             pass
         # Resolve backend preference (safe vs docling)
         backend_choice = self._resolve_phase1_backend(
@@ -298,7 +299,7 @@ def extract(
             try:
                 input_files = [Path(p) for p in file_paths]
             except Exception as exc:
-                raise ValueError(f"Invalid file path supplied to extract(): {exc}")
+                raise ValueError(f"[Extract Phase] Invalid file path supplied to extract(): {exc}")
             self.logger.info(f"[Worker Batch] Processing {len(input_files)} direct file paths")
         elif input_format.lower() == "all":
             input_files = []
@@ -722,7 +723,7 @@ def extract(
                 from ..gloss_extract import GlossExtract  # local import to avoid import-time heavy deps
                 self.extractor = GlossExtract(url_column=self.url_column)
             except Exception as e:
-                self.logger.error(f"Failed to initialize GlossExtract: {e}")
+                self.logger.error(f"[Extract Phase] Failed to initialize GlossExtract: {e}")
                 raise
         # Configure Phase-1 helpers on extractor
         try:
diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py
index 80afc7f..7df3a64 100644
--- a/src/glossapi/corpus/phase_ocr_math.py
+++ b/src/glossapi/corpus/phase_ocr_math.py
@@ -84,7 +84,7 @@ def ocr(
         # Normalize backend
         backend_norm = str(backend or "deepseek").strip().lower()
         if backend_norm != "deepseek":
-            raise ValueError("backend must be 'deepseek'")
+            raise ValueError("[OCR Phase] backend must be 'deepseek'")
 
         # CONTENT_DEBUG override (preferred uppercase alias)
         # Priority: CONTENT_DEBUG > INTERNAL_DEBUG > content_debug/internal_debug flags
@@ -300,7 +300,7 @@ def _run_math(stems: List[str]) -> None:
                         except Exception:
                             pass
                 if not devs:
-                    msg = "Multi-GPU math requested but no GPUs detected; aborting math enhancement"
+                    msg = "[Math Phase] Multi-GPU math requested but no GPUs detected; aborting math enhancement"
                     self.logger.error(msg)
                     raise RuntimeError(msg)
                 else:
@@ -584,7 +584,7 @@ def _run_math(stems: List[str]) -> None:
                         content_debug=bool(content_debug),
                     )
                 except Exception as _e:
-                    self.logger.error("DeepSeek OCR runner failed: %s", _e)
+                    self.logger.error("[OCR Phase] DeepSeek OCR runner failed | Error: %s", _e)
                     raise
             reran_ocr = True
             # Update metadata to reflect successful OCR reruns
@@ -633,7 +633,7 @@ def _run_math(stems: List[str]) -> None:
                     except Exception:
                         pass
             except Exception as _e:
-                self.logger.warning("Failed to update OCR success metadata: %s", _e)
+                self.logger.warning("[OCR Phase] Failed to update OCR success metadata | Error: %s", _e)
 
         if reran_ocr:
             try:
@@ -643,7 +643,7 @@ def _run_math(stems: List[str]) -> None:
                     drop_bad=False,
                 )
             except Exception as _e:
-                self.logger.warning("Cleaner refresh after OCR failed: %s", _e)
+                self.logger.warning("[OCR Phase] Cleaner refresh after OCR failed | Error: %s", _e)
 
         if mode_norm == "ocr_bad_then_math":
             try:
@@ -708,7 +708,7 @@ def _run_math(stems: List[str]) -> None:
                 except Exception:
                     pass
             except Exception as _e:
-                self.logger.warning("Phase‑2 enrichment after OCR failed: %s", _e)
+                self.logger.warning("[Math Phase] Phase‑2 enrichment after OCR failed | Error: %s", _e)
 
     def formula_enrich_from_json(
         self,
@@ -732,9 +732,9 @@ def formula_enrich_from_json(
         try:
             enrich_from_docling_json = getattr(_math_pkg, "enrich_from_docling_json")
         except AttributeError as exc:
-            raise RuntimeError("Math enrichment backend unavailable") from exc
+            raise RuntimeError("[Math Phase] Math enrichment backend unavailable") from exc
         if not callable(enrich_from_docling_json):
-            raise RuntimeError("Math enrichment backend missing 'enrich_from_docling_json'")
+            raise RuntimeError("[Math Phase] Math enrichment backend missing 'enrich_from_docling_json'")
         json_dir = self.output_dir / "json"
         md_dir = self.markdown_dir
         dl_dir = self.output_dir / "downloads"
diff --git a/src/glossapi/corpus/phase_sections.py b/src/glossapi/corpus/phase_sections.py
index c948829..593fd78 100644
--- a/src/glossapi/corpus/phase_sections.py
+++ b/src/glossapi/corpus/phase_sections.py
@@ -85,9 +85,9 @@ def section(self) -> None:
                             self._cache_metadata_parquet(parquet_path)
                             parquet_schema.write_metadata_parquet(df_meta, parquet_path)
                         except Exception as e:
-                            self.logger.warning(f"Failed to update processing_stage in {parquet_path}: {e}")
+                            self.logger.warning(f"[Section Phase] Failed to update processing_stage in {parquet_path} | Error: {e}")
                 except Exception as e:
-                    self.logger.warning(f"Error reading parquet file {parquet_path}: {e}")
+                    self.logger.warning(f"[Section Phase] Error reading parquet file {parquet_path} | Error: {e}")
             else:
                 self.logger.info("No metadata parquet found for section selection; will fall back to all markdown files")
 
@@ -102,7 +102,7 @@ def section(self) -> None:
                 for p in Path(self.markdown_dir).glob("*.md")
             ]
             if not good_filenames:
-                error_msg = "No markdown files found to section. Extraction might have failed."
+                error_msg = "[Section Phase] No markdown files found to section. Extraction might have failed."
                 self.logger.error(error_msg)
                 raise ValueError(error_msg)