From 0877e5e8ab7a7d687103247611dcb3623dbb4db0 Mon Sep 17 00:00:00 2001 From: JahnaviSingh2005 Date: Sun, 22 Mar 2026 10:33:45 +0530 Subject: [PATCH] Improve error messages across pipeline stages for better debugging --- src/glossapi/corpus/phase_clean.py | 8 ++++---- src/glossapi/corpus/phase_download.py | 6 +++--- src/glossapi/corpus/phase_extract.py | 7 ++++--- src/glossapi/corpus/phase_ocr_math.py | 16 ++++++++-------- src/glossapi/corpus/phase_sections.py | 6 +++--- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/glossapi/corpus/phase_clean.py b/src/glossapi/corpus/phase_clean.py index e5a4329..05f3505 100644 --- a/src/glossapi/corpus/phase_clean.py +++ b/src/glossapi/corpus/phase_clean.py @@ -52,7 +52,7 @@ def _load_rust_extension(self, module_name: str, manifest_relative: str): manifest = root_dir / manifest_relative if not manifest.exists(): raise RuntimeError( - f"Cannot locate Cargo manifest for {module_name} at {manifest}" + f"[Clean Phase] Cannot locate Cargo manifest for {module_name} at {manifest}" ) try: subprocess.run( @@ -74,7 +74,7 @@ def _load_rust_extension(self, module_name: str, manifest_relative: str): return importlib.import_module(module_name) except Exception as build_err: raise RuntimeError( - f"Automatic build of {module_name} failed: {build_err}" + f"[Clean Phase] Automatic build of {module_name} failed | Error: {build_err}" ) def _load_metrics_dataframe( @@ -367,7 +367,7 @@ def finalize(self) -> None: # Do not abort the entire cleaning pass – proceed to evaluate gates # using existing metrics on disk. If the Rust report is available, # it will be merged below as usual. - self.logger.error("Rust cleaning pipeline failed (code=%s); proceeding with existing metrics", return_code) + self.logger.error("[Clean Phase] Rust cleaning pipeline failed (code=%s); proceeding with existing metrics", return_code) # ----- Parse metrics Parquet produced by Rust ----- if report_parquet_path.exists(): @@ -385,7 +385,7 @@ def finalize(self) -> None: } ) except Exception as e: - self.logger.warning("Failed to parse cleaning report %s: %s", report_parquet_path, e) + self.logger.warning("[Clean Phase] Failed to parse cleaning report %s | Error: %s", report_parquet_path, e) else: self.logger.warning("Cleaning report Parquet not found: %s", report_parquet_path) diff --git a/src/glossapi/corpus/phase_download.py b/src/glossapi/corpus/phase_download.py index c543076..6db19c0 100644 --- a/src/glossapi/corpus/phase_download.py +++ b/src/glossapi/corpus/phase_download.py @@ -58,7 +58,7 @@ def download( if input_parquet is None: parquet_files = list(self.input_dir.glob('*.parquet')) if not parquet_files: - raise ValueError(f"No parquet files found in {self.input_dir}") + raise ValueError(f"[Download Phase] No parquet files found in {self.input_dir}") input_parquet = parquet_files[0] self.logger.info(f"Using parquet file: {input_parquet}") else: @@ -92,7 +92,7 @@ def download( existing_results_path = specific_results_path found_existing = True except Exception as e: - self.logger.warning(f"Failed to read specific download results: {e}") + self.logger.warning(f"[Download Phase] Failed to read specific download results: {e}") elif os.path.exists(partial_results_path): self.logger.info(f"Found partial download checkpoint: {partial_results_path}") try: @@ -100,7 +100,7 @@ def download( existing_results_path = partial_results_path found_existing = True except Exception as e: - self.logger.warning(f"Failed to read partial results: {e}") + self.logger.warning(f"[Download Phase] Failed to read partial results: {e}") # If specific results not found, look in the directory for any download results if not found_existing and os.path.exists(download_results_dir): diff --git a/src/glossapi/corpus/phase_extract.py b/src/glossapi/corpus/phase_extract.py index a748dcc..cb9adbd 100644 --- a/src/glossapi/corpus/phase_extract.py +++ b/src/glossapi/corpus/phase_extract.py @@ -66,7 +66,8 @@ def prime_extractor( try: setattr(self.extractor, "export_doc_json", bool(export_doc_json)) setattr(self.extractor, "emit_formula_index", bool(emit_formula_index)) - except Exception: + except Exception as e: + self.logger.debug(f"[Extract Phase] Failed to propagate extractor toggles: {e}") pass # Resolve backend preference (safe vs docling) backend_choice = self._resolve_phase1_backend( @@ -298,7 +299,7 @@ def extract( try: input_files = [Path(p) for p in file_paths] except Exception as exc: - raise ValueError(f"Invalid file path supplied to extract(): {exc}") + raise ValueError(f"[Extract Phase] Invalid file path supplied to extract(): {exc}") self.logger.info(f"[Worker Batch] Processing {len(input_files)} direct file paths") elif input_format.lower() == "all": input_files = [] @@ -722,7 +723,7 @@ def extract( from ..gloss_extract import GlossExtract # local import to avoid import-time heavy deps self.extractor = GlossExtract(url_column=self.url_column) except Exception as e: - self.logger.error(f"Failed to initialize GlossExtract: {e}") + self.logger.error(f"[Extract Phase] Failed to initialize GlossExtract: {e}") raise # Configure Phase-1 helpers on extractor try: diff --git a/src/glossapi/corpus/phase_ocr_math.py b/src/glossapi/corpus/phase_ocr_math.py index 80afc7f..7df3a64 100644 --- a/src/glossapi/corpus/phase_ocr_math.py +++ b/src/glossapi/corpus/phase_ocr_math.py @@ -84,7 +84,7 @@ def ocr( # Normalize backend backend_norm = str(backend or "deepseek").strip().lower() if backend_norm != "deepseek": - raise ValueError("backend must be 'deepseek'") + raise ValueError("[OCR Phase] backend must be 'deepseek'") # CONTENT_DEBUG override (preferred uppercase alias) # Priority: CONTENT_DEBUG > INTERNAL_DEBUG > content_debug/internal_debug flags @@ -300,7 +300,7 @@ def _run_math(stems: List[str]) -> None: except Exception: pass if not devs: - msg = "Multi-GPU math requested but no GPUs detected; aborting math enhancement" + msg = "[Math Phase] Multi-GPU math requested but no GPUs detected; aborting math enhancement" self.logger.error(msg) raise RuntimeError(msg) else: @@ -584,7 +584,7 @@ def _run_math(stems: List[str]) -> None: content_debug=bool(content_debug), ) except Exception as _e: - self.logger.error("DeepSeek OCR runner failed: %s", _e) + self.logger.error("[OCR Phase] DeepSeek OCR runner failed | Error: %s", _e) raise reran_ocr = True # Update metadata to reflect successful OCR reruns @@ -633,7 +633,7 @@ def _run_math(stems: List[str]) -> None: except Exception: pass except Exception as _e: - self.logger.warning("Failed to update OCR success metadata: %s", _e) + self.logger.warning("[OCR Phase] Failed to update OCR success metadata | Error: %s", _e) if reran_ocr: try: @@ -643,7 +643,7 @@ def _run_math(stems: List[str]) -> None: drop_bad=False, ) except Exception as _e: - self.logger.warning("Cleaner refresh after OCR failed: %s", _e) + self.logger.warning("[OCR Phase] Cleaner refresh after OCR failed | Error: %s", _e) if mode_norm == "ocr_bad_then_math": try: @@ -708,7 +708,7 @@ def _run_math(stems: List[str]) -> None: except Exception: pass except Exception as _e: - self.logger.warning("Phase‑2 enrichment after OCR failed: %s", _e) + self.logger.warning("[Math Phase] Phase‑2 enrichment after OCR failed | Error: %s", _e) def formula_enrich_from_json( self, @@ -732,9 +732,9 @@ def formula_enrich_from_json( try: enrich_from_docling_json = getattr(_math_pkg, "enrich_from_docling_json") except AttributeError as exc: - raise RuntimeError("Math enrichment backend unavailable") from exc + raise RuntimeError("[Math Phase] Math enrichment backend unavailable") from exc if not callable(enrich_from_docling_json): - raise RuntimeError("Math enrichment backend missing 'enrich_from_docling_json'") + raise RuntimeError("[Math Phase] Math enrichment backend missing 'enrich_from_docling_json'") json_dir = self.output_dir / "json" md_dir = self.markdown_dir dl_dir = self.output_dir / "downloads" diff --git a/src/glossapi/corpus/phase_sections.py b/src/glossapi/corpus/phase_sections.py index c948829..593fd78 100644 --- a/src/glossapi/corpus/phase_sections.py +++ b/src/glossapi/corpus/phase_sections.py @@ -85,9 +85,9 @@ def section(self) -> None: self._cache_metadata_parquet(parquet_path) parquet_schema.write_metadata_parquet(df_meta, parquet_path) except Exception as e: - self.logger.warning(f"Failed to update processing_stage in {parquet_path}: {e}") + self.logger.warning(f"[Section Phase] Failed to update processing_stage in {parquet_path} | Error: {e}") except Exception as e: - self.logger.warning(f"Error reading parquet file {parquet_path}: {e}") + self.logger.warning(f"[Section Phase] Error reading parquet file {parquet_path} | Error: {e}") else: self.logger.info("No metadata parquet found for section selection; will fall back to all markdown files") @@ -102,7 +102,7 @@ def section(self) -> None: for p in Path(self.markdown_dir).glob("*.md") ] if not good_filenames: - error_msg = "No markdown files found to section. Extraction might have failed." + error_msg = "[Section Phase] No markdown files found to section. Extraction might have failed." self.logger.error(error_msg) raise ValueError(error_msg)