From 128908a954aea56ab9433a5328d09cf672061c7b Mon Sep 17 00:00:00 2001 From: mac Date: Mon, 23 Feb 2026 12:09:47 -0800 Subject: [PATCH 1/3] #85-add bugzila preprocessor for clang --- .../preprocessor/bugzila_preprocessor.py | 139 ++++++++++++++++++ requirements.txt | Bin 5828 -> 5968 bytes 2 files changed, 139 insertions(+) create mode 100644 pinecone_rag/preprocessor/bugzila_preprocessor.py diff --git a/pinecone_rag/preprocessor/bugzila_preprocessor.py b/pinecone_rag/preprocessor/bugzila_preprocessor.py new file mode 100644 index 0000000..782d034 --- /dev/null +++ b/pinecone_rag/preprocessor/bugzila_preprocessor.py @@ -0,0 +1,139 @@ +""" +Bugzilla issue preprocessor for Pinecone RAG. + +Reads JSON files under data/bugs/** and builds one Document per bug. +Expected JSON shape: +- bug: bug metadata +- comments: list of comment objects +""" + +import json +import logging +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +from langchain_core.documents import Document + +logger = logging.getLogger(__name__) + + +def _to_timestamp(value: Optional[str]) -> float: + """Convert ISO datetime string to Unix timestamp (UTC).""" + if not value: + return 0.0 + try: + dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ") + return dt.timestamp() + except Exception: + return 0.0 + + +def _is_valid_content(text: str, min_length: int) -> bool: + return bool(text and len(text.strip()) >= min_length) + + +def _build_content(bug: Dict[str, Any], comments: List[Any]) -> str: + lines: List[str] = [] + lines.append(f"Product: {bug.get('product', '')}") + lines.append(f"Component: {bug.get('component', '')}") + lines.append(f"Status: {bug.get('status', '')}") + lines.append(f"Resolution: {bug.get('resolution', '')}") + lines.append(f"Severity: {bug.get('severity', '')}") + lines.append(f"Priority: {bug.get('priority', '')}") + + keywords = bug.get("keywords") or [] + if isinstance(keywords, list) and keywords: + lines.append(f"Keywords: {', '.join(str(k) for k in keywords)}") + + lines.append("") + lines.append("Description and comments:") + + for comment in comments: + if not isinstance(comment, dict): + continue + text = (comment.get("text") or "").strip() + if not text: + continue + creator = comment.get("creator") or "" + created = comment.get("creation_time") or comment.get("time") or "" + lines.append(f"\n--- Comment by {creator} ({created}) ---\n{text}") + + return "\n".join(lines).strip() + + +def _load_bug_document(json_path: Path, min_content_length: int) -> Optional[Document]: + """Load one Bugzilla JSON file and convert it into a Document.""" + try: + data = json.loads(json_path.read_text(encoding="utf-8", errors="replace")) + except (json.JSONDecodeError, OSError) as exc: + logger.debug("Skip %s: %s", json_path.name, exc) + return None + + bug = data.get("bug") + if not isinstance(bug, dict): + logger.debug("Skip %s: missing bug object", json_path.name) + return None + + comments = data.get("comments") or [] + if not isinstance(comments, list): + comments = [] + + content = _build_content(bug, comments) + if not _is_valid_content(content, min_content_length): + logger.debug("Skip %s: content too short", json_path.name) + return None + + bug_id = bug.get("id", data.get("id", -1)) + is_open = bool(bug.get("is_open", False)) + last_change = bug.get("last_change_time") + created = bug.get("creation_time") + bug_url = bug.get("url") or f"https://bugs.llvm.org/show_bug.cgi?id={bug_id}" + + metadata: Dict[str, Any] = { + "type": "issue-bugzilla", + "number": bug_id, + "title": (bug.get("summary") or "").strip(), + "url": bug_url, + "author": bug.get("creator", "") or "", + "state": bug.get("status", "") or "", + "state_reason": bug.get("resolution", "") or "", + "created_at": _to_timestamp(created), + "updated_at": _to_timestamp(last_change), + "closed_at": 0.0 if is_open else _to_timestamp(last_change), + } + + return Document(page_content=content, metadata=metadata) + + +class BugIssuePreprocessor: + """Load Bugzilla issue JSON files from data/bugs and produce Documents.""" + + def __init__( + self, data_dir: str = "data/github/Clang/bugs", min_content_length: int = 10 + ): + self.data_dir = Path(data_dir) + self.min_content_length = min_content_length + + def load_documents(self, limit: Optional[int] = None) -> List[Document]: + """Load Bugzilla JSON files from data/github/Clang/bugs/**/*.json.""" + if not self.data_dir.exists(): + logger.warning("Bug data dir does not exist: %s", self.data_dir) + return [] + + json_paths = sorted(self.data_dir.rglob("*.json")) + if limit is not None: + json_paths = json_paths[:limit] + + documents: List[Document] = [] + for json_path in json_paths: + if json_path.name.startswith("."): + continue + doc = _load_bug_document(json_path, self.min_content_length) + if doc is not None: + documents.append(doc) + + logger.info( + "Loaded %d Bugzilla issue documents from %s", len(documents), self.data_dir + ) + return documents diff --git a/requirements.txt b/requirements.txt index ac66770a52e548214820548700a9127dfb79c958..b48c412a7145bfd09d96a64eddb6fb1a3cf81632 100644 GIT binary patch delta 1468 zcma)6O>7%g5Psg+iM@{3>%S#+H`Zpm+FHfl+OA1M1XYMQ6jV@^f&>T^<;NusuH)Jc z)TmN8A-G{a4n6cjPE~~}aX~#)s1#0B4;2Sa7OFUK;J|?cLP#LK_lzqL7qpt4nKv_U ze&2jkeh>_`U|4g8FS*0Z!SK5MvM$y|pI=w>#MAt0;t3ImI-w$Tu|eskcuVYx3;f3N zfI5MwD(NQe2BJfIDU+5J30Mdn$66?lcF4mgAs@Fxb^H=?4(YtDC7S9^X|Y7@6UySa zwtzLQh$~tJH#Hmg2|sBrjHwz{r;edL#ha#F+#-y~N5dY1u#2@YZNjU#6L#=JxBx9O z2Rq{6RK&$b#KwCOH>Bg!=wrA|$#;?B$I}*GBm~n1;>DA?c%A{Z#ST-gsWfCjRl*`u z+@Y*58cb7$E0e1#SruPR&*R~=18v5~+Kh#j8I$-lu8}_G6m+m9Ffk!x0?tber-v?Rvzl_CAjrWL#0A ziY$IxEOQKwkdt4BjylA0VE!3eS(K^UVavL7wxt|QbbI9bO4_HB-M1EtV`}j8PpVO(^q}TCH@+rzEA@@{| zQ8m?$YO=x;`S>AAscddXSvSPn%t30%D3)+8a}-xH9u6`#?qs+_GRI(KUDUEZUd!eg ze_vc;M=vTH8RMezTE#H?IKIr<7-c!?Sw4YW7N>Icd7ZwxY>M1-8{BRaU*kQ8+ycJH jc^Ks?Ff1R7R*60as&V7Ts~)b3u7Z8*Fs@miri;G-wd+Km delta 1216 zcmYLJO=uHg5dD&D{x;dfWOuX1Si)LMT1(m{v8_P@77x-xkzOpGq-})$q>VJuT09hy z;z?-v@TN$qA_|_xL-o*tRD|lGpa&HX9;Js;)QffAWE)xbF+006@6FG*AG$WP?t{O} zT`p}UYZDB$5>&eh9<~s?0_>#Inoz}<7#DM5L0l7cQKG9?bcr6mH5qkLh%C8Mgd?st z!dtEk;i{{baK~l0Aj7z*h)Kd}jz2@Og~NC^k;`s(2jPf2M|juW!mMR+og>P`57rz- zoTfV`H2KC&YUOu#fl&8!5|%vR9`(R~hw%yfx7okP9#a8JNjXorsDSs8k|tbNl7u_# z*;kOk@>2P-;BEDgdYKDYa@Kp4JWF0=c;U4OH@v9um$#oV>%;ehzJ~HCQ6yC_XFrkGT6{HBt~hD)1plJ=EW>`6>}((uHhi^4hMC)95l#U3nKDu5cTc` zQz=YwhOP;k+~E%7DYl1VZ57p*RCMl&iha1po@XkyVnelgo>l5dg%r%tjSSeJJ}1M0 z3Z6^|d8*;G91A5J>vyPKz78F4KAm%{OJUSp4Pyq?uq`*kePsO!W1f}8xFBv~;;M&(vwH?8i0}(_GM09y0a>B8mN5HbdZ+=9O$rD9=xhN*J64m8Yw4E*N z;A9Lrmtx5CG=_?{W9Z@@d(1et^#bEU9G#JIJBhx|iJH?B?01oF?D$rkw;_%^mX6E? z{Vd^(4!_hhJd+x2*-h>z&PPu4J01O~L+vi3W+0+qK+A}MOf>@)O9QIc4e0%5WcZ}k zX*cHR+)Z#!Oc;7F<+KUSMYEHqS?B-3Rr>aDmigvCVbMfa7tOSMXJVJWn3z&50Y%*j oFkehSOD%!(zLtQxtwfgj7kEc2&5H7LY8w1e3;Z2cn}VMG1I8};t^fc4 From a090a804f30dd445d70057fa4fcfedd59caed94f Mon Sep 17 00:00:00 2001 From: mac Date: Tue, 24 Feb 2026 09:33:25 -0800 Subject: [PATCH 2/3] #85-update preprocessor --- .../preprocessor/bugzila_preprocessor.py | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/pinecone_rag/preprocessor/bugzila_preprocessor.py b/pinecone_rag/preprocessor/bugzila_preprocessor.py index 782d034..68d1e2a 100644 --- a/pinecone_rag/preprocessor/bugzila_preprocessor.py +++ b/pinecone_rag/preprocessor/bugzila_preprocessor.py @@ -9,24 +9,14 @@ import json import logging -from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional from langchain_core.documents import Document -logger = logging.getLogger(__name__) - +from preprocessor.utility import get_timestamp_from_date -def _to_timestamp(value: Optional[str]) -> float: - """Convert ISO datetime string to Unix timestamp (UTC).""" - if not value: - return 0.0 - try: - dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ") - return dt.timestamp() - except Exception: - return 0.0 +logger = logging.getLogger(__name__) def _is_valid_content(text: str, min_length: int) -> bool: @@ -98,9 +88,9 @@ def _load_bug_document(json_path: Path, min_content_length: int) -> Optional[Doc "author": bug.get("creator", "") or "", "state": bug.get("status", "") or "", "state_reason": bug.get("resolution", "") or "", - "created_at": _to_timestamp(created), - "updated_at": _to_timestamp(last_change), - "closed_at": 0.0 if is_open else _to_timestamp(last_change), + "created_at": get_timestamp_from_date(created or "", 0.0), + "updated_at": get_timestamp_from_date(last_change or "", 0.0), + "closed_at": 0.0 if is_open else get_timestamp_from_date(last_change or "", 0.0), } return Document(page_content=content, metadata=metadata) From b3fd1d447e9944cdf62181305fef05f5d9520a60 Mon Sep 17 00:00:00 2001 From: mac Date: Tue, 24 Feb 2026 10:57:44 -0800 Subject: [PATCH 3/3] #85-add docstring --- pinecone_rag/preprocessor/bugzila_preprocessor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pinecone_rag/preprocessor/bugzila_preprocessor.py b/pinecone_rag/preprocessor/bugzila_preprocessor.py index 68d1e2a..b0bb0dc 100644 --- a/pinecone_rag/preprocessor/bugzila_preprocessor.py +++ b/pinecone_rag/preprocessor/bugzila_preprocessor.py @@ -20,10 +20,12 @@ def _is_valid_content(text: str, min_length: int) -> bool: + """Return True if text is non-empty and has at least min_length characters (after strip).""" return bool(text and len(text.strip()) >= min_length) def _build_content(bug: Dict[str, Any], comments: List[Any]) -> str: + """Build plain-text document content from bug metadata and comments.""" lines: List[str] = [] lines.append(f"Product: {bug.get('product', '')}") lines.append(f"Component: {bug.get('component', '')}") @@ -102,6 +104,7 @@ class BugIssuePreprocessor: def __init__( self, data_dir: str = "data/github/Clang/bugs", min_content_length: int = 10 ): + """Initialize with the directory containing Bugzilla JSON files and minimum content length.""" self.data_dir = Path(data_dir) self.min_content_length = min_content_length