Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/glossapi/corpus/phase_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def _load_rust_extension(self, module_name: str, manifest_relative: str):
manifest = root_dir / manifest_relative
if not manifest.exists():
raise RuntimeError(
f"Cannot locate Cargo manifest for {module_name} at {manifest}"
f"[Clean Phase] Cannot locate Cargo manifest for {module_name} at {manifest}"
)
try:
subprocess.run(
Expand All @@ -74,7 +74,7 @@ def _load_rust_extension(self, module_name: str, manifest_relative: str):
return importlib.import_module(module_name)
except Exception as build_err:
raise RuntimeError(
f"Automatic build of {module_name} failed: {build_err}"
f"[Clean Phase] Automatic build of {module_name} failed | Error: {build_err}"
)

def _load_metrics_dataframe(
Expand Down Expand Up @@ -367,7 +367,7 @@ def finalize(self) -> None:
# Do not abort the entire cleaning pass – proceed to evaluate gates
# using existing metrics on disk. If the Rust report is available,
# it will be merged below as usual.
self.logger.error("Rust cleaning pipeline failed (code=%s); proceeding with existing metrics", return_code)
self.logger.error("[Clean Phase] Rust cleaning pipeline failed (code=%s); proceeding with existing metrics", return_code)

# ----- Parse metrics Parquet produced by Rust -----
if report_parquet_path.exists():
Expand All @@ -385,7 +385,7 @@ def finalize(self) -> None:
}
)
except Exception as e:
self.logger.warning("Failed to parse cleaning report %s: %s", report_parquet_path, e)
self.logger.warning("[Clean Phase] Failed to parse cleaning report %s | Error: %s", report_parquet_path, e)
else:
self.logger.warning("Cleaning report Parquet not found: %s", report_parquet_path)

Expand Down
6 changes: 3 additions & 3 deletions src/glossapi/corpus/phase_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def download(
if input_parquet is None:
parquet_files = list(self.input_dir.glob('*.parquet'))
if not parquet_files:
raise ValueError(f"No parquet files found in {self.input_dir}")
raise ValueError(f"[Download Phase] No parquet files found in {self.input_dir}")
input_parquet = parquet_files[0]
self.logger.info(f"Using parquet file: {input_parquet}")
else:
Expand Down Expand Up @@ -92,15 +92,15 @@ def download(
existing_results_path = specific_results_path
found_existing = True
except Exception as e:
self.logger.warning(f"Failed to read specific download results: {e}")
self.logger.warning(f"[Download Phase] Failed to read specific download results: {e}")
elif os.path.exists(partial_results_path):
self.logger.info(f"Found partial download checkpoint: {partial_results_path}")
try:
existing_results = pd.read_parquet(partial_results_path)
existing_results_path = partial_results_path
found_existing = True
except Exception as e:
self.logger.warning(f"Failed to read partial results: {e}")
self.logger.warning(f"[Download Phase] Failed to read partial results: {e}")

# If specific results not found, look in the directory for any download results
if not found_existing and os.path.exists(download_results_dir):
Expand Down
7 changes: 4 additions & 3 deletions src/glossapi/corpus/phase_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ def prime_extractor(
try:
setattr(self.extractor, "export_doc_json", bool(export_doc_json))
setattr(self.extractor, "emit_formula_index", bool(emit_formula_index))
except Exception:
except Exception as e:
self.logger.debug(f"[Extract Phase] Failed to propagate extractor toggles: {e}")
pass
# Resolve backend preference (safe vs docling)
backend_choice = self._resolve_phase1_backend(
Expand Down Expand Up @@ -298,7 +299,7 @@ def extract(
try:
input_files = [Path(p) for p in file_paths]
except Exception as exc:
raise ValueError(f"Invalid file path supplied to extract(): {exc}")
raise ValueError(f"[Extract Phase] Invalid file path supplied to extract(): {exc}")
self.logger.info(f"[Worker Batch] Processing {len(input_files)} direct file paths")
elif input_format.lower() == "all":
input_files = []
Expand Down Expand Up @@ -722,7 +723,7 @@ def extract(
from ..gloss_extract import GlossExtract # local import to avoid import-time heavy deps
self.extractor = GlossExtract(url_column=self.url_column)
except Exception as e:
self.logger.error(f"Failed to initialize GlossExtract: {e}")
self.logger.error(f"[Extract Phase] Failed to initialize GlossExtract: {e}")
raise
# Configure Phase-1 helpers on extractor
try:
Expand Down
16 changes: 8 additions & 8 deletions src/glossapi/corpus/phase_ocr_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def ocr(
# Normalize backend
backend_norm = str(backend or "deepseek").strip().lower()
if backend_norm != "deepseek":
raise ValueError("backend must be 'deepseek'")
raise ValueError("[OCR Phase] backend must be 'deepseek'")

# CONTENT_DEBUG override (preferred uppercase alias)
# Priority: CONTENT_DEBUG > INTERNAL_DEBUG > content_debug/internal_debug flags
Expand Down Expand Up @@ -300,7 +300,7 @@ def _run_math(stems: List[str]) -> None:
except Exception:
pass
if not devs:
msg = "Multi-GPU math requested but no GPUs detected; aborting math enhancement"
msg = "[Math Phase] Multi-GPU math requested but no GPUs detected; aborting math enhancement"
self.logger.error(msg)
raise RuntimeError(msg)
else:
Expand Down Expand Up @@ -584,7 +584,7 @@ def _run_math(stems: List[str]) -> None:
content_debug=bool(content_debug),
)
except Exception as _e:
self.logger.error("DeepSeek OCR runner failed: %s", _e)
self.logger.error("[OCR Phase] DeepSeek OCR runner failed | Error: %s", _e)
raise
reran_ocr = True
# Update metadata to reflect successful OCR reruns
Expand Down Expand Up @@ -633,7 +633,7 @@ def _run_math(stems: List[str]) -> None:
except Exception:
pass
except Exception as _e:
self.logger.warning("Failed to update OCR success metadata: %s", _e)
self.logger.warning("[OCR Phase] Failed to update OCR success metadata | Error: %s", _e)

if reran_ocr:
try:
Expand All @@ -643,7 +643,7 @@ def _run_math(stems: List[str]) -> None:
drop_bad=False,
)
except Exception as _e:
self.logger.warning("Cleaner refresh after OCR failed: %s", _e)
self.logger.warning("[OCR Phase] Cleaner refresh after OCR failed | Error: %s", _e)

if mode_norm == "ocr_bad_then_math":
try:
Expand Down Expand Up @@ -708,7 +708,7 @@ def _run_math(stems: List[str]) -> None:
except Exception:
pass
except Exception as _e:
self.logger.warning("Phase‑2 enrichment after OCR failed: %s", _e)
self.logger.warning("[Math Phase] Phase‑2 enrichment after OCR failed | Error: %s", _e)

def formula_enrich_from_json(
self,
Expand All @@ -732,9 +732,9 @@ def formula_enrich_from_json(
try:
enrich_from_docling_json = getattr(_math_pkg, "enrich_from_docling_json")
except AttributeError as exc:
raise RuntimeError("Math enrichment backend unavailable") from exc
raise RuntimeError("[Math Phase] Math enrichment backend unavailable") from exc
if not callable(enrich_from_docling_json):
raise RuntimeError("Math enrichment backend missing 'enrich_from_docling_json'")
raise RuntimeError("[Math Phase] Math enrichment backend missing 'enrich_from_docling_json'")
json_dir = self.output_dir / "json"
md_dir = self.markdown_dir
dl_dir = self.output_dir / "downloads"
Expand Down
6 changes: 3 additions & 3 deletions src/glossapi/corpus/phase_sections.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,9 @@ def section(self) -> None:
self._cache_metadata_parquet(parquet_path)
parquet_schema.write_metadata_parquet(df_meta, parquet_path)
except Exception as e:
self.logger.warning(f"Failed to update processing_stage in {parquet_path}: {e}")
self.logger.warning(f"[Section Phase] Failed to update processing_stage in {parquet_path} | Error: {e}")
except Exception as e:
self.logger.warning(f"Error reading parquet file {parquet_path}: {e}")
self.logger.warning(f"[Section Phase] Error reading parquet file {parquet_path} | Error: {e}")
else:
self.logger.info("No metadata parquet found for section selection; will fall back to all markdown files")

Expand All @@ -102,7 +102,7 @@ def section(self) -> None:
for p in Path(self.markdown_dir).glob("*.md")
]
if not good_filenames:
error_msg = "No markdown files found to section. Extraction might have failed."
error_msg = "[Section Phase] No markdown files found to section. Extraction might have failed."
self.logger.error(error_msg)
raise ValueError(error_msg)

Expand Down