diff --git a/Dockerfile b/Dockerfile index a7337638..bd453cbf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -79,7 +79,7 @@ RUN python3 -m playwright install chromium && \ npx -y playwright install chromium # Layer 9: Install PostgreSQL MCP server (Python, used via `pipx run postgres-mcp`) -RUN pipx install postgres-mcp +RUN pipx install postgres-mcp==0.3.0 # Set working directory WORKDIR /app diff --git a/README.md b/README.md index 31a33968..e4c36739 100644 --- a/README.md +++ b/README.md @@ -14,10 +14,13 @@ An evaluation suite for agentic models in real MCP tool environments (Notion / G MCPMark provides a reproducible, extensible benchmark for researchers and engineers: one-command tasks, isolated sandboxes, auto-resume for failures, unified metrics, and aggregated reports. +> 🚀 **MCPMark Verified is out** — a version-pinned, stabilized subset of the standard tasks for reproducible evaluation. On the Verified set, `gpt-5.5` (xhigh) leads at **92.9%** and `kimi-k2.7` reaches **81.1%**. See [#264](https://github.com/eval-sys/mcpmark/pull/264). + [![MCPMark](https://github.com/user-attachments/assets/dfc06a41-e387-45e3-bc98-db7097ffa3dc)](https://mcpmark.ai) ## News +- 🚀 **12 Jun** — **MCPMark Verified** is out: a version-pinned, stabilized subset of the standard tasks. On the Verified set, `gpt-5.5` (xhigh) leads at **92.9%** and `kimi-k2.7` reaches **81.1%**. See [#264](https://github.com/eval-sys/mcpmark/pull/264). - 📣 **27 May** — The previous Notion Source Hub page is deprecated; please use the new link: [MCPMark Source Hub](https://gossamer-sawfish-47c.notion.site/MCPMark-Source-Hub-dc32b7e8cebd82b8959b81ae322df87a). - 📌 **21 Jan** — Pinned MCP server versions for reproducible benchmarks: GitHub MCP Server `v0.15.0` (switched to Docker for version control), Notion MCP Server `@1.9.1` (Notion released 2.0 but it has many bugs, not recommended). See [#246](https://github.com/eval-sys/mcpmark/pull/246). 
- πŸ”₯ **13 Dec** β€” Added auto-compaction support (`--compaction-token`) to summarize long conversations and avoid context overflow during evaluation ([#236](https://github.com/eval-sys/mcpmark/pull/236])). diff --git a/pipeline.py b/pipeline.py index 6d932f6e..a2cc79b7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -113,7 +113,7 @@ def main(): parser.add_argument( "--reasoning-effort", default="default", - choices=["default", "minimal", "low", "medium", "high"], + choices=["default", "minimal", "low", "medium", "high", "xhigh", "max"], help="Reasoning effort level for supported models (default: None)", ) diff --git a/src/agents/base_agent.py b/src/agents/base_agent.py index c49b4cda..dcf7fcdd 100644 --- a/src/agents/base_agent.py +++ b/src/agents/base_agent.py @@ -176,7 +176,7 @@ def _create_stdio_server(self) -> MCPStdioServer: raise ValueError("Notion API key required") return MCPStdioServer( command="npx", - args=["-y", "@notionhq/notion-mcp-server"], + args=["-y", "@notionhq/notion-mcp-server@1.9.1"], env={ "OPENAPI_MCP_HEADERS": ( '{"Authorization": "Bearer ' + notion_key + '", ' @@ -193,7 +193,7 @@ def _create_stdio_server(self) -> MCPStdioServer: command="npx", args=[ "-y", - "@modelcontextprotocol/server-filesystem", + "@modelcontextprotocol/server-filesystem@2025.12.18", str(test_directory), ], ) @@ -204,7 +204,7 @@ def _create_stdio_server(self) -> MCPStdioServer: viewport_width = self.service_config.get("viewport_width", 1280) viewport_height = self.service_config.get("viewport_height", 720) - args = ["-y", "@playwright/mcp@latest"] + args = ["-y", "@playwright/mcp@0.0.68"] if headless: args.append("--headless") args.extend( @@ -234,7 +234,7 @@ def _create_stdio_server(self) -> MCPStdioServer: ) return MCPStdioServer( command="pipx", - args=["run", "postgres-mcp", "--access-mode=unrestricted"], + args=["run", "postgres-mcp==0.3.0", "--access-mode=unrestricted"], env={"DATABASE_URI": database_url}, ) diff --git a/src/agents/mcpmark_agent.py 
b/src/agents/mcpmark_agent.py index 6f072c5c..ef9b5df9 100644 --- a/src/agents/mcpmark_agent.py +++ b/src/agents/mcpmark_agent.py @@ -849,6 +849,10 @@ async def _execute_litellm_tool_loop( "model": self.litellm_input_model_name, "messages": messages, "api_key": self.api_key, + "max_tokens": 32768, + "temperature": 1.0, + "enforcer_mode": "on", + "think_mode": "on", } # Always use tools format if available - LiteLLM will handle conversion @@ -1131,7 +1135,7 @@ def _create_stdio_server(self) -> MCPStdioServer: command="npx", args=[ "-y", - "@modelcontextprotocol/server-filesystem", + "@modelcontextprotocol/server-filesystem@2025.12.18", str(test_directory), ], ) @@ -1142,7 +1146,7 @@ def _create_stdio_server(self) -> MCPStdioServer: viewport_width = self.service_config.get("viewport_width", 1280) viewport_height = self.service_config.get("viewport_height", 720) - args = ["-y", "@playwright/mcp@latest"] + args = ["-y", "@playwright/mcp@0.0.68"] if headless: args.append("--headless") args.extend( @@ -1176,7 +1180,7 @@ def _create_stdio_server(self) -> MCPStdioServer: return MCPStdioServer( command="pipx", - args=["run", "postgres-mcp", "--access-mode=unrestricted"], + args=["run", "postgres-mcp==0.3.0", "--access-mode=unrestricted"], env={"DATABASE_URI": database_url}, ) diff --git a/src/mcp_services/github/github_state_manager.py b/src/mcp_services/github/github_state_manager.py index ae07f0a9..4828bd17 100644 --- a/src/mcp_services/github/github_state_manager.py +++ b/src/mcp_services/github/github_state_manager.py @@ -255,7 +255,8 @@ def _push_repo( # Safety check: Prevent importing to public repositories # Public repos would send @ mention notifications to real users, causing spam - if not private: + # Exception: mcpmark-cicd needs to be public for GitHub Actions workflows to work properly + if not private and "mcpmark-cicd" not in template_dir.name: error_msg = ( "ERROR: Cannot import template to a public repository.\n\n" "Reason: The template contains @ mentions 
of real GitHub users from the original\n" diff --git a/src/model_config.py b/src/model_config.py index 11d1e67d..db5f5026 100644 --- a/src/model_config.py +++ b/src/model_config.py @@ -50,6 +50,11 @@ class ModelConfig: "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-5.2", }, + "gpt-5.5": { + "provider": "openai", + "api_key_var": "OPENAI_API_KEY", + "litellm_input_model_name": "openai/gpt-5.5", + }, "gpt-5": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", diff --git a/tasks/filesystem/standard/desktop/project_management/description.md b/tasks/filesystem/standard/desktop/project_management/description.md index 830cad53..22c892ca 100644 --- a/tasks/filesystem/standard/desktop/project_management/description.md +++ b/tasks/filesystem/standard/desktop/project_management/description.md @@ -1,6 +1,6 @@ Please use FileSystem tools to finish the following task: -1. **Create the main directory structure** in `desktop_2`: +1. **Create the main directory structure** in `desktop`: - Create a new directory in main directory called `organized_projects` - Inside `organized_projects`, create 3 main subdirectories: `experiments`, `learning`, and `personal` diff --git a/tasks/filesystem/standard/file_context/duplicates_searching/description.md b/tasks/filesystem/standard/file_context/duplicates_searching/description.md index 71400867..4d648667 100644 --- a/tasks/filesystem/standard/file_context/duplicates_searching/description.md +++ b/tasks/filesystem/standard/file_context/duplicates_searching/description.md @@ -6,7 +6,7 @@ You are given a directory containing multiple text files. Some files have identi ### Task Objectives -1. **Scan all text files** in the test directory to identify groups with identical content +1. **Find all the duplicate files** in the test directory with identical content based on the directory’s initial state. 2. **Create a 'duplicates' directory** in the test directory root 3. 
**Move all duplicate files** into the 'duplicates' directory 4. **Leave unique files** in their original location diff --git a/tasks/filesystem/standard/file_context/file_merging/description.md b/tasks/filesystem/standard/file_context/file_merging/description.md index 353b6d8c..6fff4a4e 100644 --- a/tasks/filesystem/standard/file_context/file_merging/description.md +++ b/tasks/filesystem/standard/file_context/file_merging/description.md @@ -9,4 +9,4 @@ You are given a directory containing multiple text files of varying sizes. Your 1. **Identify the 10 smallest .txt files** in the test directory 2. **Sort the selected files alphabetically** by filename 3. **Merge the content** of these files into a single file -4. **Add file headers** (file name) before each file's content +4. **Format of merged_content.txt**: For each file, write its full filename (e.g., "example.txt") on the first line. On the immediately following line(s), copy the entire content of the file. After the file content, insert exactly one empty line (unless it is the last file). Repeat this pattern for all 10 files in alphabetical order. 
diff --git a/tasks/filesystem/standard/file_context/uppercase/verify.py b/tasks/filesystem/standard/file_context/uppercase/verify.py index 0f93bd37..03e7bd5f 100644 --- a/tasks/filesystem/standard/file_context/uppercase/verify.py +++ b/tasks/filesystem/standard/file_context/uppercase/verify.py @@ -64,7 +64,8 @@ def verify_uppercase_content(test_dir: Path) -> bool: # Check if uppercase content is the uppercase version of original expected_uppercase = original_content.upper() - + uppercase_content = uppercase_content.strip() + expected_uppercase = expected_uppercase.strip() if uppercase_content != expected_uppercase: print(f"| ❌ File '{filename}' content is not properly converted to uppercase") return False diff --git a/tasks/filesystem/standard/file_property/time_classification/description.md b/tasks/filesystem/standard/file_property/time_classification/description.md index 80a7181f..f35450d0 100644 --- a/tasks/filesystem/standard/file_property/time_classification/description.md +++ b/tasks/filesystem/standard/file_property/time_classification/description.md @@ -2,13 +2,13 @@ Please use FileSystem tools to finish the following task: ### Task Description -Analyze the creation time (ctime) of all files in the test directory and organize them into a hierarchical directory structure based on their creation dates. +Analyze the last modified time (mtime) of all files in the test directory and organize them into a hierarchical directory structure based on their modification dates. ### Task Objectives 1. **Read metadata** of all files in the test directory -2. **Analyze creation times** (ctime) of all files (excluding .DS_Store) -3. **Create directory structure** organized by month/day based on creation time +2. **Analyze last modified times** (mtime) of all files (excluding .DS_Store), assuming China Standard Time (UTC+8) +3. **Create directory structure** organized by month/day based on last modified time 4. **Move files** to appropriate directories 5. 
**Create metadata analysis files** in each directory @@ -25,5 +25,5 @@ Create directories in the format: `MM/DD/` where: Create a file named `metadata_analyse.txt` in each directory containing exactly only two lines: -- **Line 1**: Oldest filename and its creation time (excluding .DS_Store) -- **Line 2**: Latest filename and its creation time (excluding .DS_Store) +- **Line 1**: Oldest filename and its last modified time (excluding .DS_Store) +- **Line 2**: Latest filename and its last modified time (excluding .DS_Store) diff --git a/tasks/filesystem/standard/file_property/time_classification/verify.py b/tasks/filesystem/standard/file_property/time_classification/verify.py index b6311913..36cacca3 100644 --- a/tasks/filesystem/standard/file_property/time_classification/verify.py +++ b/tasks/filesystem/standard/file_property/time_classification/verify.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Verification script for File Organization by Creation Time Task +Verification script for File Organization by Last Modification Time Task """ import sys diff --git a/tasks/filesystem/standard/folder_structure/structure_analysis/description.md b/tasks/filesystem/standard/folder_structure/structure_analysis/description.md index 5a92c887..3eef09b0 100644 --- a/tasks/filesystem/standard/folder_structure/structure_analysis/description.md +++ b/tasks/filesystem/standard/folder_structure/structure_analysis/description.md @@ -15,7 +15,7 @@ Do not try to use python code. 
Count the following information for the entire directory structure: - total number of files -- total number of folders +- total number of folders (exclude the folder named "complex_structure") - total size of the hole folder (in bytes, include .DS_Store only in this subtask) **Format (one item per line):** diff --git a/tasks/filesystem/standard/papers/author_folders/description.md b/tasks/filesystem/standard/papers/author_folders/description.md index 8fad2ba8..b44b459e 100644 --- a/tasks/filesystem/standard/papers/author_folders/description.md +++ b/tasks/filesystem/standard/papers/author_folders/description.md @@ -27,20 +27,20 @@ You are given a directory containing multiple paper files. You have a collection [given_task_folder]/ ├── [original HTML files remain untouched] ├── frequent_authors/ # Authors with ≥4 papers total -│ ├── smith_john/ +│ ├── john_smith/ │ │ └── [copied papers] -│ ├── johnson_sarah/ +│ ├── sarah_johnson/ │ │ └── [copied papers] │ └── ... └── 2025_authors/ # Authors with ≥3 papers in 2025 - ├── williams_david/ + ├── david_williams/ │ └── [copied 2025 papers] - ├── brown_emily/ + ├── emily_brown/ │ └── [copied 2025 papers] └── ... ``` #### Requirements: -- Author folder names should be **lowercase** with underscores replacing spaces/commas (e.g., `smith_john`, `williams_david`) +- Author folder names should be **lowercase** with underscores, using `firstname_lastname` format (e.g., `john_smith`, `david_williams`). Use only the first given name and the last name; middle names are ignored. 
- Papers should be **copied** (not moved) to preserve originals - Author extraction should handle various name formats correctly \ No newline at end of file diff --git a/tasks/filesystem/standard/votenet/requirements_writing/description.md b/tasks/filesystem/standard/votenet/requirements_writing/description.md index 24cb17e4..448aac2a 100644 --- a/tasks/filesystem/standard/votenet/requirements_writing/description.md +++ b/tasks/filesystem/standard/votenet/requirements_writing/description.md @@ -10,7 +10,6 @@ The VoteNet project is a 3D object detection framework for point clouds. Your ta 2. **Include all essential dependencies** needed to run the VoteNet codebase 3. **Ensure the file format is correct** (one dependency per line) 4. **Save the file as `requirements.txt`** in the current working directory -5. **Not just** pip install or conda install, your answer should contain **every necessary dependencies in the hole process of VoteNet**. ### Requirements diff --git a/tasks/filesystem/standard/votenet/requirements_writing/verify.py b/tasks/filesystem/standard/votenet/requirements_writing/verify.py index df99ca9d..e314b146 100644 --- a/tasks/filesystem/standard/votenet/requirements_writing/verify.py +++ b/tasks/filesystem/standard/votenet/requirements_writing/verify.py @@ -55,7 +55,6 @@ def verify_required_dependencies_present(test_dir: Path) -> bool: "opencv", "plyfile", "trimesh", - "pointnet2", "networkx" ] diff --git a/tasks/github/standard/claude-code/claude_collaboration_analysis/description.md b/tasks/github/standard/claude-code/claude_collaboration_analysis/description.md index d42eebb0..31f66dfa 100644 --- a/tasks/github/standard/claude-code/claude_collaboration_analysis/description.md +++ b/tasks/github/standard/claude-code/claude_collaboration_analysis/description.md @@ -1,10 +1,10 @@ I need you to analyze the collaboration patterns between human developers and Claude (the AI assistant) in the repository by examining all available commit history, then 
create a comprehensive analysis report and submit it as a new file to the repository. **Step 1: Commit History Analysis** -Analyze ALL commits in the repository to identify: +Analyze all commits reachable from the default branch (`main`) to identify: -1. **Claude Co-Authored Commits**: Find all commits that were co-authored by Claude (look for "Co-Authored-By: Claude " in commit messages) -2. **Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude +1. **Claude Co-Authored Commits**: Find all commits whose message contains a `Co-Authored-By: Claude ` trailer. Match case-insensitively (both `Co-Authored-By` and `Co-authored-by` count). Count each commit at most once. +2. **Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude. **Step 2: Create Collaboration Analysis Report** Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root with: @@ -23,7 +23,7 @@ Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root w ``` Include the top 3 developers by number of Claude collaborations. 
-**Step 3: Commit Analysis to Repository** +**Step 3: Commit the Analysis to Repository** Commit the `CLAUDE_COLLABORATION_ANALYSIS.md` file to the main branch with: - Commit message: "Add Claude AI collaboration analysis report" -- Ensure all statistics are accurate based on actual commit data \ No newline at end of file +- Ensure all statistics are accurate based on actual commit data diff --git a/tasks/github/standard/claude-code/feature_commit_tracking/verify.py b/tasks/github/standard/claude-code/feature_commit_tracking/verify.py index 90c6ad47..18bed08f 100644 --- a/tasks/github/standard/claude-code/feature_commit_tracking/verify.py +++ b/tasks/github/standard/claude-code/feature_commit_tracking/verify.py @@ -129,16 +129,23 @@ def verify_task() -> bool: } # Expected feature commits based on exploration + # For CHANGELOG Version 1.0.65, two valid answers exist: + # - 94dcaca5: merge commit that brought 1.0.65 into pr/2466-QwertyJack-main branch + # - 5faa082d: the actual commit that first added 1.0.65 content to CHANGELOG.md on main expected_features = { - "Shell Completion Scripts": "8a0febdd09bda32f38c351c0881784460d69997d", - "CHANGELOG Version 1.0.65": "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0", - "Rust Extraction Improvements": "50e58affdf1bfc7d875202bc040ebe0dcfb7d332", + "Shell Completion Scripts": ["8a0febdd09bda32f38c351c0881784460d69997d"], + "CHANGELOG Version 1.0.65": [ + "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0", + "5faa082d6e4e5300485daafb94615fe133175055", + ], + "Rust Extraction Improvements": ["50e58affdf1bfc7d875202bc040ebe0dcfb7d332"], } # Expected authors for each commit expected_authors = { "8a0febdd09bda32f38c351c0881784460d69997d": "gitmpr", "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "QwertyJack", + "5faa082d6e4e5300485daafb94615fe133175055": "actions-user", "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "alokdangre", } @@ -146,6 +153,7 @@ def verify_task() -> bool: expected_messages = { "8a0febdd09bda32f38c351c0881784460d69997d": "feat: 
add shell completions (bash, zsh, fish)", "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "Merge branch 'anthropics:main' into main", + "5faa082d6e4e5300485daafb94615fe133175055": "chore: Update CHANGELOG.md", "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "Enhance Rust extraction and output handling in workflows", } @@ -153,6 +161,7 @@ def verify_task() -> bool: expected_dates = { "8a0febdd09bda32f38c351c0881784460d69997d": "2025-08-01", "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "2025-08-02", + "5faa082d6e4e5300485daafb94615fe133175055": "2025-07-31", "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "2025-08-09", } @@ -197,7 +206,7 @@ def verify_task() -> bool: for feature in features: found_features[feature["name"]] = feature["sha"] - for feature_name, expected_sha in expected_features.items(): + for feature_name, expected_shas in expected_features.items(): if feature_name not in found_features: print( f"Error: Feature '{feature_name}' not found in table", file=sys.stderr @@ -205,9 +214,9 @@ def verify_task() -> bool: return False actual_sha = found_features[feature_name] - if actual_sha != expected_sha: + if actual_sha not in expected_shas: print( - f"Error: Wrong SHA for '{feature_name}'. Expected: {expected_sha}, Got: {actual_sha}", + f"Error: Wrong SHA for '{feature_name}'. Expected one of: {expected_shas}, Got: {actual_sha}", file=sys.stderr, ) return False @@ -216,8 +225,12 @@ def verify_task() -> bool: # 5. Verify each commit exists and has correct author print("5. 
Verifying commit details...") + all_expected_shas = set() + for shas in expected_features.values(): + all_expected_shas.update(shas) + for feature in features: - if feature["sha"] in expected_features.values(): + if feature["sha"] in all_expected_shas: success, commit_data = _verify_commit_exists( feature["sha"], headers, github_org ) diff --git a/tasks/github/standard/easyr1/qwen3_issue_management/verify.py b/tasks/github/standard/easyr1/qwen3_issue_management/verify.py index dc83d9b2..cbedf517 100644 --- a/tasks/github/standard/easyr1/qwen3_issue_management/verify.py +++ b/tasks/github/standard/easyr1/qwen3_issue_management/verify.py @@ -27,34 +27,26 @@ def _get_github_api( return False, None -def _search_github_issues( - query: str, headers: Dict[str, str] -) -> Tuple[bool, Optional[List]]: - """Search GitHub issues using the search API.""" - url = f"https://api.github.com/search/issues?q={query}&per_page=100" - try: - response = requests.get(url, headers=headers) - if response.status_code == 200: - data = response.json() - return True, data.get("items", []) - else: - print(f"Search API error: {response.status_code}", file=sys.stderr) - return False, None - except Exception as e: - print(f"Search exception: {e}", file=sys.stderr) - return False, None - - def _check_qwen3_issues_reopened(headers: Dict[str, str]) -> Tuple[bool, List]: """Check if all Qwen3 issues have been reopened and tagged.""" - # Search for all issues mentioning qwen3 (both open and closed) - github_org = os.environ.get("GITHUB_EVAL_ORG") - success, all_qwen3_issues = _search_github_issues( - f"repo:{github_org}/EasyR1 qwen3", headers - ) + # /search/issues is unreliable on freshly-imported repos (no index yet) and + # rejects unencoded queries. Fetch issues directly and filter client-side. 
+ success, issues = _get_github_api("issues?state=all&per_page=100", headers) + if not success or issues is None: + print("Error: Could not fetch issues for Qwen3 check", file=sys.stderr) + return False, [] - if not success or not all_qwen3_issues: - print("Error: Could not search for Qwen3 issues", file=sys.stderr) + # Exclude the summary issue itself β€” it mentions qwen3 by design but isn't + # one of the reopened issues it describes. + all_qwen3_issues = [ + i for i in issues + if i.get("pull_request") is None + and i.get("title") != "Reopened Qwen3 Issues Summary" + and "qwen3" in ((i.get("title") or "") + " " + (i.get("body") or "")).lower() + ] + + if not all_qwen3_issues: + print("Error: No Qwen3 issues found", file=sys.stderr) return False, [] reopened_issues = [] diff --git a/tasks/github/standard/harmony/issue_pr_commit_workflow/verify.py b/tasks/github/standard/harmony/issue_pr_commit_workflow/verify.py index 349e74f2..58df7985 100644 --- a/tasks/github/standard/harmony/issue_pr_commit_workflow/verify.py +++ b/tasks/github/standard/harmony/issue_pr_commit_workflow/verify.py @@ -139,6 +139,18 @@ def _get_pr_reviews( return [] +def _get_pr_review_comments( + pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony" +) -> List[Dict]: + """Get all review comments (inline comments on code) for a PR.""" + success, comments = _get_github_api( + f"pulls/{pr_number}/comments", headers, org, repo + ) + if success and comments: + return comments + return [] + + def _check_issue_comment_references( comments: List[Dict], pr_number: int, keywords: List[str] ) -> bool: @@ -170,12 +182,21 @@ def _check_headings_and_content( return has_headings and has_keywords -def _check_pr_review_content(reviews: List[Dict], keywords: List[str]) -> bool: - """Check if PR has review comments containing required keywords.""" +def _check_pr_review_content( + reviews: List[Dict], keywords: List[str], review_comments: Optional[List[Dict]] = None +) -> bool: + """Check if PR 
has review bodies or inline review comments containing required keywords.""" + # Check review top-level bodies for review in reviews: body = review.get("body", "") if body and all(keyword.lower() in body.lower() for keyword in keywords): return True + # Check inline review comments (code-level comments added during review) + if review_comments: + for comment in review_comments: + body = comment.get("body", "") + if body and all(keyword.lower() in body.lower() for keyword in keywords): + return True return False @@ -334,7 +355,8 @@ def verify() -> bool: # 5. Check PR review comments print("5. Verifying PR review comments...") reviews = _get_pr_reviews(pr_number, headers, github_org) - if not _check_pr_review_content(reviews, REVIEW_KEYWORDS): + review_comments = _get_pr_review_comments(pr_number, headers, github_org) + if not _check_pr_review_content(reviews, REVIEW_KEYWORDS, review_comments): print( "Error: PR missing review comment with required technical keywords", file=sys.stderr, diff --git a/tasks/github/standard/harmony/multi_branch_commit_aggregation/description.md b/tasks/github/standard/harmony/multi_branch_commit_aggregation/description.md index 266e680d..6758cb18 100644 --- a/tasks/github/standard/harmony/multi_branch_commit_aggregation/description.md +++ b/tasks/github/standard/harmony/multi_branch_commit_aggregation/description.md @@ -27,11 +27,13 @@ In the 'history-report-2025' branch, create a file called `BRANCH_COMMITS.json` **Step 3: Create Cross-Branch Analysis** Create a file `CROSS_BRANCH_ANALYSIS.md` that contains: - A section "## Top Contributors" listing the 3 contributors with the most commits on the main branch, sorted by commit count (format: "github_username: X commits") +- Use the GitHub username (i.e. 
`author.login` from the commits API), and include merge commits in the count +- If there's a tie at the 3rd position, listing either tied contributor is acceptable - Must include keywords: "contributors" **Step 4: Generate Merge Timeline** Create a file `MERGE_TIMELINE.txt` that lists the 10 most recent merge commits from the main branch: -- Format: `DATE | MERGE_COMMIT_MESSAGE | COMMIT_SHA` +- Format: `DATE | MERGE_COMMIT_MESSAGE | COMMIT_SHA` where DATE is `YYYY-MM-DD` (UTC) - List in reverse chronological order (newest first) - Only include actual merge commits (commits that have exactly 2 parent commits) - Note: While the commit messages reference PR numbers, those PRs no longer exist in the repository \ No newline at end of file diff --git a/tasks/github/standard/harmony/multi_branch_commit_aggregation/verify.py b/tasks/github/standard/harmony/multi_branch_commit_aggregation/verify.py index 6d8646ab..df4499ce 100644 --- a/tasks/github/standard/harmony/multi_branch_commit_aggregation/verify.py +++ b/tasks/github/standard/harmony/multi_branch_commit_aggregation/verify.py @@ -1,5 +1,6 @@ import sys import os +import re import requests from typing import Dict, Optional, Tuple import base64 @@ -195,31 +196,48 @@ def _check_cross_branch_analysis(content: str) -> bool: ) return False - # Verify the top 3 contributors with correct counts from main branch (order matters) - expected_contributors = [ + # Top 1 and Top 2 are uniquely determined; Top 3 is tied between axion66 and + # zhuohan123 (both have 2 commits on main), so accept either. 
+ fixed_top = [ "scott-oai: 35 commits", "egorsmkv: 4 commits", - "axion66: 2 commits", ] - - for contributor in expected_contributors: - if contributor not in content: + for entry in fixed_top: + if entry not in content: print( - f"Missing or incorrect contributor entry: {contributor}", + f"Missing or incorrect contributor entry: {entry}", file=sys.stderr, ) return False + tied_third = ("axion66: 2 commits", "zhuohan123: 2 commits") + if not any(t in content for t in tied_third): + print( + "Missing tied third contributor (axion66 or zhuohan123 at 2 commits)", + file=sys.stderr, + ) + return False + return True def _check_merge_timeline(content: str) -> bool: """Verify MERGE_TIMELINE.txt has correct format and expected merge commits.""" + # Normalize any ISO 8601 timestamps in the DATE column down to YYYY-MM-DD so + # that "2025-08-06T23:21:08Z" or "2025-08-06 23:21:08" both compare equal to + # the canonical "2025-08-06". This keeps the SHA / message / ordering checks + # strict while accepting any reasonable date representation. 
+ normalized = re.sub( + r"(\d{4}-\d{2}-\d{2})[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?Z?", + r"\1", + content, + ) + expected_timeline = [ "2025-08-06 | Merge pull request #29 from axion66/improve-readme-and-checks | 3efbf742533a375fc148d75513597e139329578b", "2025-08-06 | Merge pull request #30 from Yuan-ManX/harmony-format | 9d653a4c7382abc42d115014d195d9354e7ad357", "2025-08-06 | Merge pull request #28 from dkqjrm/fix-typo-format-md | 161e5fe2a57c63e9f8353c4c5b8faa3c3854bb5f", - "2025-08-05 | Merge pull request #26 from jordan-wu-97/jordan/fix-function-call-atomic-bool | 82b3afb9eb043343f322c937262cc50405e892c3", + "2025-08-06 | Merge pull request #26 from jordan-wu-97/jordan/fix-function-call-atomic-bool | 82b3afb9eb043343f322c937262cc50405e892c3", "2025-08-05 | Merge pull request #18 from openai/dev/scl/better-ci | b255cbeb6274adbea774f26fd9590922ce8874ed", "2025-08-05 | Merge pull request #21 from Tialo/main | 058ef3257c24fb099aac7960c10ce51c8e55d9fe", "2025-08-05 | Merge branch 'main' into dev/scl/better-ci | 6375a15ea1b0a486cbb1468964cf8f5800ff5a5c", @@ -228,9 +246,9 @@ def _check_merge_timeline(content: str) -> bool: "2025-08-05 | Merge pull request #17 from openai/dev/scl/add-docs-to-cargo | 64bca4cf327ebeafa0bbd0345650d86e2d02142f", ] - # Verify each expected timeline entry exists in the content + # Verify each expected timeline entry exists in the normalized content for i, expected_line in enumerate(expected_timeline): - if expected_line not in content: + if expected_line not in normalized: print(f"Missing expected timeline entry {i + 1} in MERGE_TIMELINE.txt", file=sys.stderr) print(f"Expected: {expected_line}", file=sys.stderr) return False diff --git a/tasks/github/standard/harmony/release_management_workflow/verify.py b/tasks/github/standard/harmony/release_management_workflow/verify.py index 2d046a4e..07f92f0e 100644 --- a/tasks/github/standard/harmony/release_management_workflow/verify.py +++ 
b/tasks/github/standard/harmony/release_management_workflow/verify.py @@ -134,14 +134,21 @@ def _check_pr_squash_merged( if not success or not commit: return False - # For squash and merge, the commit will have exactly one parent - # and the commit message typically includes the PR number + # Squash and merge produces a single new commit with one parent (the base + # branch tip); regular merge produces a commit with two parents. The + # message-content check distinguishes squash from rebase: GitHub's squash + # commit either auto-appends `(#N)` (when commit_title is left unset) or + # carries the agent-supplied commit_title (which in practice mirrors the + # PR title). Rebase, by contrast, preserves the original per-commit + # messages, which carry neither signal. parents = commit.get("parents", []) commit_message = commit.get("commit", {}).get("message", "") + pr_title = pr.get("title", "") - # Squash and merge commits have exactly 1 parent (the base branch) - # Regular merge commits have 2 parents (base and head branches) - if len(parents) == 1 and f"#{pr_number}" in commit_message: + if len(parents) == 1 and ( + f"#{pr_number}" in commit_message + or (pr_title and pr_title in commit_message) + ): return True return False diff --git a/tasks/github/standard/mcpmark-cicd/deployment_status_workflow/verify.py b/tasks/github/standard/mcpmark-cicd/deployment_status_workflow/verify.py index 02b9d397..a59ae82b 100644 --- a/tasks/github/standard/mcpmark-cicd/deployment_status_workflow/verify.py +++ b/tasks/github/standard/mcpmark-cicd/deployment_status_workflow/verify.py @@ -332,28 +332,69 @@ def _verify_deployment_issue( print(" βœ… Found rollback plan comment from GitHub Actions bot") # Check for required rollback plan elements - required_elements = [ - "**Previous Commit**:", - "**Current Commit**:", - "**Package Version**:", - "βœ… Executable rollback script created", - "βœ… Configuration backups saved", - "βœ… Dependency verification script prepared", - "βœ… 
Comprehensive rollback documentation generated", - "βœ… Compressed rollback package created", - "**SHA256**:", - "**Artifact**:", - "Quick Rollback Commands", + # Use flexible matching: accept both markdown bold ("**Key**:") and plain text ("Key:") + import re as _re + + def _flex_match(comment: str, keyword: str) -> bool: + """Match 'keyword:' with optional markdown bold wrapping.""" + pattern = r"(\*\*\s*)?" + _re.escape(keyword) + r"(\s*\*\*)?\s*:" + return bool(_re.search(pattern, comment)) + + field_keywords = [ + "Previous Commit", + "Current Commit", + "Package Version", + "SHA256", + "Artifact", ] - for element in required_elements: - if element not in rollback_comment: - errors.append(f"Missing element in rollback plan: '{element}'") + for kw in field_keywords: + if _flex_match(rollback_comment, kw): + print(f" βœ… Found rollback plan field: '{kw}'") else: - print(f" βœ… Found rollback plan element: '{element}'") + errors.append(f"Missing field in rollback plan: '{kw}'") + + # Check for at least 5 checkmarks (βœ…) with rollback-related keywords + # Accept any reasonable wording as long as the semantic component is mentioned + rollback_component_keywords = [ + r"rollback\s+script", + r"configuration\s+backup|config.*backup", + r"dependency\s+verification|dependency.*check", + r"rollback\s+documentation|rollback.*doc", + r"rollback\s+package|compressed.*package", + ] + + checkmark_lines = [ + line.strip() + for line in rollback_comment.split("\n") + if "βœ…" in line + ] + + matched_components = 0 + for kw_pattern in rollback_component_keywords: + for line in checkmark_lines: + if _re.search(kw_pattern, line, _re.IGNORECASE): + matched_components += 1 + print(f" βœ… Found rollback component checkmark matching: '{kw_pattern}'") + break + + if matched_components < 5: + errors.append( + f"Expected at least 5 rollback component checkmarks (βœ…), found {matched_components}" + ) + else: + print(f" βœ… All 5 rollback component checkmarks found") + + # Check for 
Quick Rollback Commands section + if "Quick Rollback Command" in rollback_comment or "rollback command" in rollback_comment.lower(): + print(" βœ… Found rollback commands section") + else: + errors.append("Missing 'Quick Rollback Commands' section in rollback plan") # Verify commit SHAs in rollback comment - if f"**Current Commit**: {head_sha}" in rollback_comment: + # Accept both "**Current Commit**: sha" and "Current Commit: sha" + current_sha_pattern = r"(?:\*\*\s*)?Current\s+Commit(?:\s*\*\*)?\s*:\s*" + _re.escape(head_sha) + if _re.search(current_sha_pattern, rollback_comment): print(f" βœ… Current commit SHA verified: {head_sha}") else: errors.append( @@ -361,30 +402,19 @@ def _verify_deployment_issue( ) # Extract and verify previous commit SHA - if "**Previous Commit**:" in rollback_comment: - import re - - prev_sha_match = re.search( - r"\*\*Previous Commit\*\*:\s*([a-f0-9]{40})", rollback_comment - ) - if prev_sha_match: - prev_sha = prev_sha_match.group(1) - print(f" βœ… Previous commit SHA found: {prev_sha}") - - # Verify it's a valid 40-character SHA - if len(prev_sha) != 40: - errors.append( - f"Previous commit SHA has invalid length: {len(prev_sha)}" - ) - else: - errors.append( - "Previous commit SHA format not found in rollback comment" - ) + prev_sha_pattern = r"(?:\*\*\s*)?Previous\s+Commit(?:\s*\*\*)?\s*:\s*([a-f0-9]{40})" + prev_sha_match = _re.search(prev_sha_pattern, rollback_comment) + if prev_sha_match: + prev_sha = prev_sha_match.group(1) + print(f" βœ… Previous commit SHA found: {prev_sha}") else: - errors.append("Previous commit SHA not found in rollback comment") + errors.append( + "Previous commit SHA (40-char hex) not found in rollback comment" + ) # Verify SHA256 checksum is present - sha256_match = re.search(r"\*\*SHA256\*\*:\s*([a-f0-9]{64})", rollback_comment) + sha256_pattern = r"(?:\*\*\s*)?SHA256(?:\s*\*\*)?\s*:\s*([a-f0-9]{64})" + sha256_match = _re.search(sha256_pattern, rollback_comment) if sha256_match: sha256_value = 
sha256_match.group(1) print(f" βœ… SHA256 checksum found: {sha256_value[:16]}...") diff --git a/tasks/github/standard/mcpmark-cicd/issue_management_workflow/description.md b/tasks/github/standard/mcpmark-cicd/issue_management_workflow/description.md index 32480c8f..a2dfbda7 100644 --- a/tasks/github/standard/mcpmark-cicd/issue_management_workflow/description.md +++ b/tasks/github/standard/mcpmark-cicd/issue_management_workflow/description.md @@ -20,11 +20,26 @@ Create `.github/workflows/issue-automation.yml` that triggers on `issues` events - For issues with a title containing "Epic", create exactly 4 sub-issues with the pattern: "[SUBTASK] [Original Title] - Task N: [Task Name]" - Task names: 1. Requirements Analysis, 2. Design and Architecture, 3. Implementation, 4. Testing and Documentation - Links sub-issues to parent using "Related to #[parent-number]" in sub-issue body - - Updates parent issue body with "## Epic Tasks" checklist linking to sub-issue numbers + - Updates parent issue body with an "## Epic Tasks" checklist that links to sub-issue numbers. Each checklist line MUST contain the literal substring `- [ ] #` (a GitHub-style task-list reference) AND the corresponding task name. The recommended line format is: + + ``` + - [ ] #<sub-issue-number> - Task <N>: <Task Name> + ``` + + Example for sub-issues #5–#8: + + ``` + ## Epic Tasks + + - [ ] #5 - Task 1: Requirements Analysis + - [ ] #6 - Task 2: Design and Architecture + - [ ] #7 - Task 3: Implementation + - [ ] #8 - Task 4: Testing and Documentation + ``` - All sub-issues get `enhancement` and `needs-review` labels ### 3. **auto-response** job: - - Checks if the issue author is creating their first issue in this repository (not first on GitHub globally, but first in this specific repo) + - Checks if the issue author is creating their first issue in this repository (not first on GitHub globally, but first in this specific repo). 
For the purpose of this check, automation-created sub-issues authored by `github-actions[bot]` do not count as prior issues by the human author. - If first issue in repo: adds `first-time-contributor` label and posts welcome message - Posts different responses based on issue type: - `bug` issues: comment must contain "Bug Report Guidelines" @@ -82,18 +97,24 @@ Create a comprehensive pull request and merge it to main: - Merge the pull request to main branch **Step 5: Test the Workflow** -Create test issues to demonstrate the issue automation workflow: +Create test issues to demonstrate the issue automation workflow. Create them in the order listed below, and ensure that the **Bug issue is the very first issue you (the human author) open in this repository** so that the first-time-contributor logic fires on it. + +When writing the issue bodies, be careful about which priority keywords you include. Priority is matched against title OR body, with the highest match winning, so avoid adding higher-priority keywords (e.g. `critical`, `urgent`, `production`, `outage`) when you want a lower priority outcome. 1. **Bug Issue**: "Bug: Login form validation not working" - - Expected: `bug`, `priority-high`, `needs-triage`→`needs-review`, milestone "v1.0.0" + - This must be the first issue authored by you in the repo (so that `first-time-contributor` is applied). + - Title/body should include high-priority wording (e.g. `important`, `high`, `blocking`) but MUST NOT contain any critical keyword (`critical`, `urgent`, `production`, `outage`). + - Expected: `bug`, `priority-high`, `first-time-contributor`, `needs-triage`→`needs-review`, milestone "v1.0.0" - Auto-response comment must contain "Bug Report Guidelines" 2. **Epic Issue**: "Epic: Redesign user dashboard interface" + - Title/body should include high-priority wording (`important`, `high`, `blocking`) and MUST NOT contain any critical keyword. 
- Expected: `epic`, `priority-high`, `needs-triage`→`needs-review`, milestone "v1.0.0" - Must create 4 sub-issues with `enhancement` and `needs-review` labels - - Parent updated with "## Epic Tasks" checklist, sub-issues linked with "Related to #[parent-number]" + - Parent updated with "## Epic Tasks" checklist whose lines contain `- [ ] #` plus the task name (see the example format under the task-breakdown job above), sub-issues linked back with "Related to #[parent-number]" - Auto-response comment must contain "Feature Request Process" 3. **Maintenance Issue**: "Weekly maintenance cleanup and refactor" + - Title/body should use medium/normal wording and MUST NOT contain any high or critical keyword. - Expected: `maintenance`, `priority-medium`, `needs-triage`→`needs-review`, no milestone - Auto-response comment must contain "Maintenance Guidelines" \ No newline at end of file diff --git a/tasks/github/standard/mcpmark-cicd/linting_ci_workflow/description.md b/tasks/github/standard/mcpmark-cicd/linting_ci_workflow/description.md index b1460a11..dbf6ecc6 100644 --- a/tasks/github/standard/mcpmark-cicd/linting_ci_workflow/description.md +++ b/tasks/github/standard/mcpmark-cicd/linting_ci_workflow/description.md @@ -36,7 +36,7 @@ Create the file `.github/workflows/lint.yml` with: - Uses ubuntu-latest runner - Sets up Node.js version 18 using actions/setup-node - Installs dependencies with npm ci -- Installs ESLint globally +- Installs ESLint v8 globally (`npm install -g eslint@8`) - Runs ESLint on all JavaScript files in src/ directories - Fails the workflow if linting errors are found diff --git a/tasks/github/standard/mcpmark-cicd/linting_ci_workflow/verify.py b/tasks/github/standard/mcpmark-cicd/linting_ci_workflow/verify.py index 6a0fcc50..a40683d1 100644 --- a/tasks/github/standard/mcpmark-cicd/linting_ci_workflow/verify.py +++ b/tasks/github/standard/mcpmark-cicd/linting_ci_workflow/verify.py @@ -263,22 +263,22 @@ def verify() -> bool: # First get the 
commits for this PR commits = _get_pr_commits(pr_number, headers, github_org) - if len(commits) != 2: + if len(commits) < 2: print( - f"Error: Expected exactly 2 commits, found {len(commits)}", file=sys.stderr + f"Error: Expected at least 2 commits, found {len(commits)}", file=sys.stderr ) return False - print("βœ“ Found exactly 2 commits as expected") + print(f"βœ“ Found {len(commits)} commits (>= 2 as required)") # Sort commits chronologically (oldest first) commits.sort(key=lambda x: x.get("commit", {}).get("author", {}).get("date", "")) first_commit_sha = commits[0].get("sha") - second_commit_sha = commits[1].get("sha") + second_commit_sha = commits[-1].get("sha") print(f"First commit (should fail): {first_commit_sha[:7]}") - print(f"Second commit (should pass): {second_commit_sha[:7]}") + print(f"Last commit (should pass): {second_commit_sha[:7]}") # Wait for workflows on both commits to complete print("Waiting for workflow completion on first commit...") @@ -286,7 +286,7 @@ def verify() -> bool: second_commit_runs = [] start_time = time.time() - timeout = 90 + timeout = 300 no_workflow_check_count = 0 while time.time() - start_time < timeout: diff --git a/tasks/github/standard/missing-semester/assign_contributor_labels/verify.py b/tasks/github/standard/missing-semester/assign_contributor_labels/verify.py index 2a9b90ff..a57de3e2 100644 --- a/tasks/github/standard/missing-semester/assign_contributor_labels/verify.py +++ b/tasks/github/standard/missing-semester/assign_contributor_labels/verify.py @@ -74,7 +74,7 @@ def verify() -> bool: 15: ["assigned-anishathalye"], # Issue #15 # PRs 21: ["assigned-anishathalye"], # PR #21 - 22: ["assigned-anishathalye"], # PR #22 + 22: ["assigned-jonhoo"], # PR #22 23: ["assigned-anishathalye"], # PR #23 24: ["assigned-anishathalye"], # PR #24 } diff --git a/tasks/github/standard/missing-semester/find_legacy_name/verify.py b/tasks/github/standard/missing-semester/find_legacy_name/verify.py index 517655da..98fe5d85 100644 --- 
a/tasks/github/standard/missing-semester/find_legacy_name/verify.py +++ b/tasks/github/standard/missing-semester/find_legacy_name/verify.py @@ -95,8 +95,11 @@ def verify() -> bool: # 2. Check that the content matches expected answer print("2. Verifying ANSWER.md content...") answer_content = answer_content.strip() - - if answer_content not in EXPECTED_CONTENTS: + + # Match case-insensitively: the title and domain are accepted regardless of + # capitalization (e.g. "hacker tools" / "Hacker-Tools.github.io" both pass). + expected_lower = {c.lower() for c in EXPECTED_CONTENTS} + if answer_content.lower() not in expected_lower: print(f"Error: ANSWER.md content does not match expected answer(s)", file=sys.stderr) print(f"Expected one of: {sorted(EXPECTED_CONTENTS)}", file=sys.stderr) print(f"Found: {answer_content}", file=sys.stderr) diff --git a/tasks/notion/standard/company_in_a_box/employee_onboarding/description.md b/tasks/notion/standard/company_in_a_box/employee_onboarding/description.md index 02a22309..dbe3fa58 100644 --- a/tasks/notion/standard/company_in_a_box/employee_onboarding/description.md +++ b/tasks/notion/standard/company_in_a_box/employee_onboarding/description.md @@ -1,15 +1,18 @@ Build an integrated **Employee Onboarding** system for the existing **Company In A Box** page. **Task Requirements:** -1. Create a new **database** titled **Employee Onboarding Checklist** with the following properties *exactly*: + +Under the top-level page **Company In A Box**, create a new child page titled **Onboarding Hub** that contains, in order: + +1. An **Employee Onboarding Checklist** database created inline at the top of the page, with the following properties *exactly*: β€’ **Employee Name** – title β€’ **Start Date** – date β€’ **Department** – select (options: Product, Marketing, Sales, HR, Engineering) - Populate this database with **3** sample new-hire pages covering three different departments. Every property in each entry must be filled. 
+ Populate it with **3** sample new-hire pages covering three different departments. Every property in each entry must be filled. + +2. A section headed **Benefits Overview** that includes linked mentions (@-mentions or link-to-page blocks) to **≥ 3** distinct benefit-policy pages from the **Company Wiki** (for example *Benefits policy*, *Vacation Policy*, *Corporate travel*). + +3. A section headed **30-Day Timeline** that presents a numbered list with **7** steps covering the first 30 days. **Each step must reference (via @-mention) an existing page or database**. -2. Under the top-level page **Company In A Box**, create a new child page titled **Onboarding Hub** containing, in order: - 1) The **Employee Onboarding Checklist** database embedded at the top. - 2) A section headed **Benefits Overview** that includes linked mentions (@-mentions or link-to-page blocks) to **≥ 3** distinct benefit-policy pages from the **Company Wiki** (for example *Benefits policy*, *Vacation Policy*, *Corporate travel*). - 3) A section headed **30-Day Timeline** that presents a numbered list with **7** steps covering the first 30 days. **Each step must reference (via @-mention) an existing page or database**. - 4) A section headed **Feedback Form** that provides **≥ 3** to-do items for new hires to check off. \ No newline at end of file +4. A section headed **Feedback Form** that provides **≥ 3** to-do items for new hires to check off. diff --git a/tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/description.md b/tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/description.md index 00aa6c76..67056b71 100644 --- a/tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/description.md +++ b/tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/description.md @@ -7,13 +7,14 @@ Create a quarterly business review dashboard in Notion based on the existing **C 1. 
A single **callout** block near the top that summarises progress toward the three *Current Goals* shown on the main page: • *LATAM expansion* • *Enterprise push* • *Employee engagement* (All three phrases must appear in the callout text.) - 2. Four separate **section headings** (any heading level) – one for each department (**Product**, **Marketing**, **Sales**, **Human Resources**) – placed below the callout. Under each heading list that department’s objectives in a progress-tracking format (e.g. to-dos, check-box list). Each objective from the **Company Goals** page must appear at least once. + 2. Four separate **section headings** (any heading level) – one for each department (**Product**, **Marketing**, **Sales**, **Human Resources**) – placed below the callout, each as its own heading block. Under each heading, list that department's objectives as a **to-do checklist**. Each objective from the **Company Goals** page must appear at least once under its owning department's heading. 3. Add a **database** named **Action Items** with the following properties *exactly*: • **Task Name** – title • **Department** – select (options: Product, Marketing, Sales, HR) • **Priority** – select (options: High, Medium, Low) • **Status** – status - Populate this database with **≥ 5** action-item pages derived from the departmental objectives, making sure every field in each entry is filled: - • **Task Name** & **Department** must correctly correspond to the underlying objective/department. + Populate this database with **≥ 5** action-item pages, **each from a different objective**, covering all four departments (at least one entry per department), and making sure every field in each entry is filled: + • **Task Name** must be the exact wording of the underlying objective. + • **Department** must be the department that owns that objective. • **Priority** and **Status** can be any allowed value, but they must **not** be left empty. 4. 
Keep the overall visual style consistent with the existing wiki (use headings, dividers, etc.). \ No newline at end of file diff --git a/tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/verify.py b/tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/verify.py index 5beaf4ba..4ca85af8 100644 --- a/tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/verify.py +++ b/tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/verify.py @@ -1,3 +1,4 @@ +import re import sys from typing import List from notion_client import Client @@ -9,14 +10,48 @@ def _contains_keywords(text: str, keywords: List[str]) -> bool: return all(kw.lower() in lowered for kw in keywords) +def _norm(text: str) -> str: + """Normalize whitespace and case for tolerant string comparison.""" + return " ".join(text.lower().split()) + + def verify(notion: Client, main_id: str = None) -> bool: """Programmatically verify that the dashboard page and its contents meet the requirements described in description.md. """ DASHBOARD_TITLE = "Q4 2024 Business Review Dashboard" PARENT_PAGE_TITLE = "Company In A Box" - CALL_OUT_KEYWORDS = ["latam", "enterprise", "employee engagement"] - DEPARTMENTS = ["Product", "Marketing", "Sales", "Human Resources"] + CALL_OUT_KEYWORDS = ["latam expansion", "enterprise push", "employee engagement"] + + # Section heading uses "Human Resources"; the database `Department` select + # uses the abbreviation "HR" (per description.md). The two sets are kept + # separate on purpose. + HEADING_DEPT_NAMES = ["Product", "Marketing", "Sales", "Human Resources"] + DB_DEPT_NAMES = {"Product", "Marketing", "Sales", "HR"} + + # Objectives sourced from the Company Goals page in the Company In A Box + # workspace. Keyed by the database department option name ("HR", not + # "Human Resources"), so it can be reused for entry-level checks below. 
+ OBJECTIVES_BY_DEPT = { + "Product": [ + "Launch 3 major features", + "Expand enterprise offering", + ], + "Marketing": [ + "Improve new user retention", + "Publish more help content to reduce burden on support", + ], + "Sales": [ + "Close 20 SMB deals", + "Create strong sales pitch", + ], + "HR": [ + "Fill hiring gaps", + "Improve the onboarding experience for new hires", + ], + } + ALL_OBJECTIVES = [obj for objs in OBJECTIVES_BY_DEPT.values() for obj in objs] + REQUIRED_DB_PROPERTIES = { "Task Name": "title", "Department": "select", @@ -25,12 +60,24 @@ def verify(notion: Client, main_id: str = None) -> bool: } PRIORITY_OPTIONS = {"High", "Medium", "Low"} - # 1. Locate the dashboard page + # 1. Locate the dashboard page. + # `main_id` is the duplicated seed root (Company In A Box). Search inside + # its direct children for a child_page titled DASHBOARD_TITLE. page_id = None if main_id: - found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id) - if found_id and obj_type == "page": - page_id = found_id + try: + children = notion.blocks.children.list(block_id=main_id).get( + "results", [] + ) + for block in children: + if ( + block.get("type") == "child_page" + and block.get("child_page", {}).get("title") == DASHBOARD_TITLE + ): + page_id = block["id"] + break + except Exception: + pass if not page_id: page_id = notion_utils.find_page(notion, DASHBOARD_TITLE) @@ -39,18 +86,27 @@ def verify(notion: Client, main_id: str = None) -> bool: print(f"Error: Page '{DASHBOARD_TITLE}' not found.", file=sys.stderr) return False - # Optional: ensure it is a child of Company In A Box + # Ensure the dashboard is a direct child of Company In A Box. + # When `main_id` is provided (the harness path), it IS the seed root, so + # compare ids directly. Otherwise fall back to a title comparison. 
try: page_obj = notion.pages.retrieve(page_id=page_id) parent_id = page_obj.get("parent", {}).get("page_id") - if parent_id: + if main_id: + if not parent_id or parent_id.replace("-", "") != main_id.replace("-", ""): + print( + f"Error: Dashboard page is not a direct child of '{PARENT_PAGE_TITLE}'.", + file=sys.stderr, + ) + return False + elif parent_id: parent_page = notion.pages.retrieve(page_id=parent_id) parent_title_rt = ( parent_page.get("properties", {}).get("title", {}).get("title", []) ) - parent_title = ( - parent_title_rt[0].get("plain_text") if parent_title_rt else None - ) + parent_title = "".join( + rt.get("plain_text", "") for rt in parent_title_rt + ) or None if parent_title != PARENT_PAGE_TITLE: print( f"Error: Dashboard page is not a direct child of '{PARENT_PAGE_TITLE}'.", @@ -76,23 +132,75 @@ def verify(notion: Client, main_id: str = None) -> bool: ) return False - # 3. Verify department section headings - found_depts = set() - for block in all_blocks: - if block.get("type") in {"heading_1", "heading_2", "heading_3"}: + # 3. Verify department section headings exist (word-boundary match so + # "Productivity" is not accepted in place of "Product"). Each department + # must be satisfied by a *distinct* heading block so a single combined + # heading like "Product / Marketing / Sales / Human Resources" cannot + # cover all four. 
+ dept_to_heading_block = {} + used_block_ids = set() + for dept in HEADING_DEPT_NAMES: + pattern = rf"\b{re.escape(dept)}\b" + for block in all_blocks: + if block.get("type") not in {"heading_1", "heading_2", "heading_3"}: + continue + if block["id"] in used_block_ids: + continue heading_text = notion_utils.get_block_plain_text(block) - for dept in DEPARTMENTS: - if dept.lower() in heading_text.lower(): - found_depts.add(dept) - if set(DEPARTMENTS) != found_depts: - missing = set(DEPARTMENTS) - found_depts + if re.search(pattern, heading_text, flags=re.IGNORECASE): + dept_to_heading_block[dept] = block["id"] + used_block_ids.add(block["id"]) + break + missing_headings = set(HEADING_DEPT_NAMES) - set(dept_to_heading_block) + if missing_headings: print( - f"Error: Missing department headings: {', '.join(missing)}.", + f"Error: Missing distinct department headings: {', '.join(sorted(missing_headings))}.", file=sys.stderr, ) return False - # 4. Verify Action Items database exists and has correct schema + # 4. Each objective from the Company Goals page must appear verbatim in a + # to-do (checkbox) block, and that to-do block must be located between its + # owning department's heading and the next department heading (so each + # objective lives under the correct section). 
+ block_index = {b["id"]: i for i, b in enumerate(all_blocks)} + heading_indices = sorted(block_index[bid] for bid in dept_to_heading_block.values()) + + def _section_range(dept: str): + start = block_index[dept_to_heading_block[dept]] + next_starts = [i for i in heading_indices if i > start] + end = next_starts[0] if next_starts else len(all_blocks) + return start, end + + todo_blocks = [b for b in all_blocks if b.get("type") == "to_do"] + todo_norm_by_id = { + b["id"]: _norm(notion_utils.get_block_plain_text(b)) for b in todo_blocks + } + + HEADING_TO_DB_DEPT = { + "Product": "Product", + "Marketing": "Marketing", + "Sales": "Sales", + "Human Resources": "HR", + } + for heading_dept in HEADING_DEPT_NAMES: + db_dept = HEADING_TO_DB_DEPT[heading_dept] + start, end = _section_range(heading_dept) + section_todo_norms = [ + todo_norm_by_id[b["id"]] + for b in todo_blocks + if start < block_index[b["id"]] < end + ] + for objective in OBJECTIVES_BY_DEPT[db_dept]: + norm_obj = _norm(objective) + if not any(norm_obj in t for t in section_todo_norms): + print( + f"Error: Objective '{objective}' not found in a to-do block under the '{heading_dept}' heading.", + file=sys.stderr, + ) + return False + + # 5. Verify Action Items database exists and has correct schema db_id = notion_utils.find_database_in_block(notion, page_id, "Action Items") if not db_id: print( @@ -129,17 +237,32 @@ def verify(notion: Client, main_id: str = None) -> bool: file=sys.stderr, ) return False - # Extra check for Priority options - if prop_name == "Priority": - options = {opt["name"] for opt in db_props[prop_name]["select"]["options"]} - if not PRIORITY_OPTIONS.issubset(options): - print( - f"Error: Priority property options must include High/Medium/Low. Current options: {options}", - file=sys.stderr, - ) - return False - # 5. 
Verify at least 5 action items exist + # Department select options must be exactly the four declared in description.md + dept_options = { + opt["name"] for opt in db_props["Department"]["select"]["options"] + } + if dept_options != DB_DEPT_NAMES: + print( + f"Error: Department property options must be exactly {sorted(DB_DEPT_NAMES)}. " + f"Got: {sorted(dept_options)}.", + file=sys.stderr, + ) + return False + + # Priority options must be exactly {High, Medium, Low} + priority_options = { + opt["name"] for opt in db_props["Priority"]["select"]["options"] + } + if priority_options != PRIORITY_OPTIONS: + print( + f"Error: Priority property options must be exactly {sorted(PRIORITY_OPTIONS)}. " + f"Got: {sorted(priority_options)}.", + file=sys.stderr, + ) + return False + + # 6. Verify at least 5 action items exist and content matches description. try: pages = notion.databases.query(database_id=db_id).get("results", []) except Exception as exc: @@ -150,13 +273,21 @@ def verify(notion: Client, main_id: str = None) -> bool: print("Error: Database contains fewer than 5 action items.", file=sys.stderr) return False - # Optional: Verify Department values valid + # Build reverse map objective -> department for entry validation + objective_to_dept = { + objective: dept + for dept, objs in OBJECTIVES_BY_DEPT.items() + for objective in objs + } + + seen_dept_in_entries = set() + seen_entry_pairs = set() for page in pages: props = page.get("properties", {}) - # Task Name must be non-empty + # Task Name must contain the verbatim text of one of the underlying objectives. 
title_rt = props.get("Task Name", {}).get("title", []) - task_name = title_rt[0].get("plain_text") if title_rt else "" + task_name = "".join(rt.get("plain_text", "") for rt in title_rt) if not task_name.strip(): print( f"Error: Action item '{page.get('id')}' is missing a Task Name.", @@ -164,25 +295,71 @@ def verify(notion: Client, main_id: str = None) -> bool: ) return False - # Department must be valid + task_name_norm = _norm(task_name) + matched_objective = None + for objective in ALL_OBJECTIVES: + if _norm(objective) in task_name_norm: + matched_objective = objective + break + if not matched_objective: + print( + f"Error: Action item Task Name '{task_name}' does not contain " + f"the verbatim text of any Company Goals objective.", + file=sys.stderr, + ) + return False + + # Department must match the department that owns the matched objective. dept_select = props.get("Department", {}).get("select", {}).get("name") - if not dept_select or dept_select not in DEPARTMENTS: + if not dept_select or dept_select not in DB_DEPT_NAMES: print( - f"Error: Action item '{page.get('id')}' has invalid or missing Department value.", + f"Error: Action item '{task_name}' has invalid or missing Department value: '{dept_select}'.", file=sys.stderr, ) return False - # Priority and Status must be set (any value) + expected_dept = objective_to_dept[matched_objective] + if dept_select != expected_dept: + print( + f"Error: Action item '{task_name}' has Department='{dept_select}' " + f"but objective '{matched_objective}' belongs to '{expected_dept}'.", + file=sys.stderr, + ) + return False + + seen_dept_in_entries.add(dept_select) + seen_entry_pairs.add((_norm(matched_objective), dept_select)) + + # Priority and Status must be set (any allowed value). 
priority_val = props.get("Priority", {}).get("select", {}).get("name") status_val = props.get("Status", {}).get("status", {}).get("name") if not priority_val or not status_val: print( - f"Error: Action item '{page.get('id')}' must have both Priority and Status set.", + f"Error: Action item '{task_name}' must have both Priority and Status set.", file=sys.stderr, ) return False + # All four departments must be represented across the entries. + missing_depts = DB_DEPT_NAMES - seen_dept_in_entries + if missing_depts: + print( + f"Error: Action items do not cover all four departments. " + f"Missing: {', '.join(sorted(missing_depts))}.", + file=sys.stderr, + ) + return False + + # Require at least 5 distinct (objective, department) pairs so the entries + # genuinely derive from different objectives rather than duplicates. + if len(seen_entry_pairs) < 5: + print( + f"Error: Action items must include at least 5 distinct " + f"(objective, department) pairs. Got {len(seen_entry_pairs)}.", + file=sys.stderr, + ) + return False + print( "Success: Verified Business Review Dashboard, departmental sections, callout, and Action Items database with β‰₯5 entries." ) diff --git a/tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/description.md b/tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/description.md index 66d8a67d..5c13a8a0 100644 --- a/tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/description.md +++ b/tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/description.md @@ -1,7 +1,7 @@ Your goal is to create a new study-session entry in the **Computer Science Student Dashboard** page. 1. Locate the β˜‘οΈ Habit tracker section of the page. -2. **Insert a new date section** immediately **after the existing `2022-09-02` to-do items but *before* the divider block** that follows them. 
Make sure the new date has proper formatting with a date mention and bold styling like the existing dates, and all to-do items should be unchecked initially. The new section should be inserted right after the 2022-09-02 to-do items but before the divider. +2. **Insert a new date section for `2025-01-29`** immediately **after the existing `2022-09-02` to-do items but *before* the divider block** that follows them. Make sure the new date has proper formatting with a date mention and bold styling like the existing dates, and all to-do items should be unchecked initially. The new section should be inserted right after the 2022-09-02 to-do items but before the divider. 3. Directly **beneath** this new date mention, add **exactly four unchecked to-do blocks** with the following plain text (including the leading emoji on each line): β€’ 🧠 Review algorithms for technical interview β€’ πŸ“š Study database systems chapter 7 diff --git a/tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/verify.py b/tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/verify.py index 763f7e69..4cff2a6a 100644 --- a/tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/verify.py +++ b/tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/verify.py @@ -106,10 +106,19 @@ def verify(notion: Client, main_id: str | None = None) -> bool: ) return False - # (2) Verify ordering - if not (index_previous_date < index_new_date < index_divider_after_previous): + # (2) Verify ordering: new date paragraph must come AFTER the last consecutive + # to-do under the 2022-09-02 paragraph and BEFORE the divider that follows. 
+ last_previous_todo_idx = index_previous_date + walk = index_previous_date + 1 + while walk < len(all_blocks) and all_blocks[walk].get("type") == "to_do": + last_previous_todo_idx = walk + walk += 1 + + if not (last_previous_todo_idx < index_new_date < index_divider_after_previous): print( - "Error: The 2025-01-29 section is positioned incorrectly.", file=sys.stderr + "Error: The 2025-01-29 section is positioned incorrectly " + "(must sit after the existing 2022-09-02 to-do items and before the divider).", + file=sys.stderr, ) return False @@ -122,30 +131,43 @@ def verify(notion: Client, main_id: str | None = None) -> bool: "⚑ Practice system design problems", "🎯 Complete data structures assignment", ] - expected_todos: Dict[str, bool] = { - _normalize_string(t): False for t in expected_texts - } + expected_set = {_normalize_string(t) for t in expected_texts} - # Look through the blocks that lie between the new date mention and the divider - for block in all_blocks[index_new_date + 1 : index_divider_after_previous]: - if block.get("type") != "to_do": - # Any non to-do block inside this range indicates mis-placement. - # We simply ignore it – correctness is determined by presence of required to-dos. - continue + # The blocks between the new date paragraph and the divider must be EXACTLY + # the four expected to-dos, directly beneath the date mention. 
+ new_section_blocks = all_blocks[index_new_date + 1 : index_divider_after_previous] + if len(new_section_blocks) != 4: + print( + f"Error: Expected exactly 4 blocks directly beneath the 2025-01-29 date " + f"(before the divider), found {len(new_section_blocks)}.", + file=sys.stderr, + ) + return False + found_texts: set[str] = set() + for block in new_section_blocks: + if block.get("type") != "to_do": + print( + f"Error: Block directly beneath the 2025-01-29 date is not a to-do " + f"(got type '{block.get('type')}').", + file=sys.stderr, + ) + return False plain_text = notion_utils.get_block_plain_text(block).strip() plain_text_norm = _normalize_string(plain_text) - if plain_text_norm in expected_todos: - # (3a) Verify the to-do is unchecked - if block["to_do"].get("checked", False): - print(f"Error: To-do '{plain_text}' is checked.", file=sys.stderr) - return False - expected_todos[plain_text_norm] = True - - missing_items = [text for text, found in expected_todos.items() if not found] + if block["to_do"].get("checked", False): + print(f"Error: To-do '{plain_text}' is checked.", file=sys.stderr) + return False + found_texts.add(plain_text_norm) + + missing_items = [t for t in expected_set if t not in found_texts] if missing_items: print(f"Error: Missing to-do items: {missing_items}", file=sys.stderr) return False + extra_items = [t for t in found_texts if t not in expected_set] + if extra_items: + print(f"Error: Unexpected to-do items: {extra_items}", file=sys.stderr) + return False # --------------------------------------------------------------------- # Success -------------------------------------------------------------- diff --git a/tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/description.md b/tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/description.md index 42821c7e..12152099 100644 --- a/tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/description.md +++ 
b/tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/description.md @@ -1,11 +1,11 @@ **Task Overview** -My IT knowledge base contains pages whose verification status has expired: +My IT knowledge base contains pages whose verification has lapsed and needs re-verification: **Task Requirements** 1. Locate the database named **"IT Homepage"** inside the main page **"It Trouble Shooting Hub"**. -2. Within that database, find every page (except for **"It Inventory"**) where the **Verification** property state contains `expired`. -3. For **each** expired page: +2. Within that database, find every page (except for **"It Inventory"**) where the **Verification** property state is `unverified`. +3. For **each** such page: β€’ Insert a **callout block** at the very top (as the first child block) whose rich-text content is: `VERIFICATION EXPIRED - This page needs review and re-verification` β€’ Set the callout’s icon to ⚠️. diff --git a/tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/verify.py b/tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/verify.py index 8ac5d898..cfedf55f 100644 --- a/tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/verify.py +++ b/tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/verify.py @@ -11,6 +11,16 @@ PRIORITY_HIGH = "High" STATUS_IN_PROGRESS = "In progress" +# Ground-truth list of pages that were 'expired' in the seed template. +# We hardcode by title because the live Verification.state drifts (Notion can +# transition expired -> unverified over time, and editing a page can clear the +# state), so re-querying it at verify time is unreliable. 
+EXPECTED_EXPIRED_TITLES = [ + "What Is IT in Charge Of", + "How to Submit a Ticket", + "Security Awareness Guide", +] + def _get_main_page_id(notion: Client, main_id: str | None) -> str | None: """Resolve the main page id starting from CLI arg or by title search.""" @@ -29,23 +39,46 @@ def _fetch_database_id( return notion_utils.find_database_in_block(notion, parent_page_id, db_title) -def _expired_pages(notion: Client, db_id: str) -> list[dict]: - """Return list of page objects with Verification.state == 'expired'.""" - # Query all pages (API max 100 per call). If many pages expected, iterate. - results = notion.databases.query(database_id=db_id).get("results", []) - expired = [] - for page in results: - verification_prop = page.get("properties", {}).get("Verification", {}) - state = verification_prop.get("verification", {}).get("state") - # Skip the IT Inventory database entry +def _query_all_pages(notion: Client, db_id: str) -> list[dict]: + """Query every page in a database, paginating until exhausted.""" + pages: list[dict] = [] + cursor: str | None = None + while True: + kwargs = {"database_id": db_id, "page_size": 100} + if cursor: + kwargs["start_cursor"] = cursor + resp = notion.databases.query(**kwargs) + pages.extend(resp.get("results", [])) + if not resp.get("has_more"): + break + cursor = resp.get("next_cursor") + return pages + + +def _expected_expired_pages(notion: Client, db_id: str) -> list[dict] | None: + """Resolve the seed-defined expired pages by title. + + Returns the list in the same order as EXPECTED_EXPIRED_TITLES, or None + if any expected title is missing in the database. 
+ """ + by_title: dict[str, dict] = {} + for page in _query_all_pages(notion, db_id): title_prop = page.get("properties", {}).get("Page", {}).get("title", []) title_text = title_prop[0].get("plain_text") if title_prop else "" - if title_text.strip().lower() == "it inventory": - continue + if title_text: + by_title[title_text.strip()] = page - if state and "expired" in state.lower(): - expired.append(page) - return expired + resolved: list[dict] = [] + for expected in EXPECTED_EXPIRED_TITLES: + page = by_title.get(expected) + if not page: + print( + f"Error: Expected expired page '{expected}' not found in IT Homepage.", + file=sys.stderr, + ) + return None + resolved.append(page) + return resolved def _check_callout_present(notion: Client, page_id: str) -> bool: @@ -68,9 +101,9 @@ def _check_callout_present(notion: Client, page_id: str) -> bool: if icon.get("type") != "emoji" or icon.get("emoji") != CALL_OUT_ICON: return False - # Check text content (callout rich text plain text) - plain_text = notion_utils.get_block_plain_text(first_block) - return CALL_OUT_TEXT in plain_text + # Check text content (callout rich text plain text) β€” exact match after trim + plain_text = notion_utils.get_block_plain_text(first_block).strip() + return plain_text == CALL_OUT_TEXT def _find_request_page(notion: Client, db_id: str) -> dict | None: @@ -94,24 +127,64 @@ def _check_request_properties(page: dict) -> bool: return priority == PRIORITY_HIGH and status == STATUS_IN_PROGRESS -def _request_page_contains_mentions( +def _request_page_bullets_match( notion: Client, request_page_id: str, expected_page_ids: list[str] ) -> bool: - children = notion.blocks.children.list(block_id=request_page_id, page_size=100).get( - "results", [] - ) + """Check the IT Request body bullets correspond exactly to the expected pages. 
+ + Strict reading of "each bullet is a mention of the page processed": + - bullet count == len(expected_page_ids) + - each bullet's rich_text contains exactly one element, of type + mention/page + - the set of mentioned page ids equals the set of expected page ids + """ + children = notion.blocks.children.list( + block_id=request_page_id, page_size=100 + ).get("results", []) bullet_blocks = [b for b in children if b.get("type") == "bulleted_list_item"] - mentioned_ids: set[str] = set() + + if len(bullet_blocks) != len(expected_page_ids): + print( + f"Error: IT Request body has {len(bullet_blocks)} bullets, " + f"expected exactly {len(expected_page_ids)}.", + file=sys.stderr, + ) + return False + + expected_set = {_normalize_id(pid) for pid in expected_page_ids} + mentioned_ids: list[str] = [] for block in bullet_blocks: rich_text = block.get("bulleted_list_item", {}).get("rich_text", []) - for rt in rich_text: - if rt.get("type") == "mention": - mention = rt.get("mention", {}) - if mention.get("type") == "page": - mentioned_ids.add(mention.get("page", {}).get("id")) - if len(mentioned_ids) < len(expected_page_ids): + page_mentions = [ + rt for rt in rich_text + if rt.get("type") == "mention" + and rt.get("mention", {}).get("type") == "page" + ] + if len(page_mentions) != 1: + print( + "Error: each IT Request bullet must contain exactly one page " + f"mention; found {len(page_mentions)}.", + file=sys.stderr, + ) + return False + mentioned_ids.append( + _normalize_id(page_mentions[0]["mention"]["page"].get("id", "")) + ) + + if set(mentioned_ids) != expected_set: + print( + f"Error: IT Request bullet mentions do not match expected pages.\n" + f" expected: {sorted(expected_set)}\n" + f" got: {sorted(set(mentioned_ids))}", + file=sys.stderr, + ) return False - return all(pid in mentioned_ids for pid in expected_page_ids) + return True + + +def _normalize_id(pid: str) -> str: + """Notion ids may appear with or without dashes; normalise for comparison.""" + return 
pid.replace("-", "").lower() def verify(notion: Client, main_id: str | None = None) -> bool: @@ -132,21 +205,20 @@ def verify(notion: Client, main_id: str | None = None) -> bool: ) return False - # Identify expired pages - expired_pages = _expired_pages(notion, it_home_db_id) - if not expired_pages: - print( - "Failure: No expired pages found; expected at least one for this task.", - file=sys.stderr, - ) + # Identify the expected expired pages by hardcoded title (ground truth) + expected_pages = _expected_expired_pages(notion, it_home_db_id) + if expected_pages is None: return False - # Verify callout on each expired page - for pg in expired_pages: + # Verify callout on each expected expired page + for pg in expected_pages: pid = pg["id"] + title_prop = pg.get("properties", {}).get("Page", {}).get("title", []) + title_text = title_prop[0].get("plain_text") if title_prop else pid if not _check_callout_present(notion, pid): print( - f"Failure: Callout missing or incorrect on page {pid}.", file=sys.stderr + f"Failure: Callout missing or incorrect on page '{title_text}'.", + file=sys.stderr, ) return False @@ -162,21 +234,11 @@ def verify(notion: Client, main_id: str | None = None) -> bool: print("Failure: Priority or Status incorrect on IT Request.", file=sys.stderr) return False - # Verify bullet list in IT Request body - expired_titles = [] - for p in expired_pages: - title_prop = p.get("properties", {}).get("Page", {}).get("title", []) - title_text = title_prop[0].get("plain_text") if title_prop else None - if title_text: - expired_titles.append(title_text) - expected_page_ids = [p["id"] for p in expired_pages] - if not _request_page_contains_mentions( + # Verify bullet list in IT Request body matches the expected pages exactly + expected_page_ids = [p["id"] for p in expected_pages] + if not _request_page_bullets_match( notion, request_page["id"], expected_page_ids ): - print( - "Failure: IT Request body does not contain mentions for all affected pages.", - 
file=sys.stderr, - ) return False print("Success: All verification checks passed.") diff --git a/tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/verify.py b/tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/verify.py index bc27d7f8..62fd8448 100644 --- a/tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/verify.py +++ b/tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/verify.py @@ -4,7 +4,17 @@ from tasks.utils import notion_utils -def verify_todo_database_correspondence(all_blocks, activities_by_day, _): +def _normalize_apos(s: str) -> str: + """Normalise curly apostrophes (U+2018, U+2019) to straight ones for comparison. + + The seed Travel Itinerary database uses a curly apostrophe in entries like + 'Rikuro's Namba Main Branch'. Agents that retype the name with a straight + apostrophe would otherwise fail substring matching here. + """ + return s.replace("’", "'").replace("β€˜", "'") + + +def verify_todo_database_correspondence(all_blocks, activities_by_day, visited_count): """ Verify that to-do items in the overview page correspond exactly to database activities. 
""" @@ -59,12 +69,15 @@ def verify_todo_database_correspondence(all_blocks, activities_by_day, _): if db_activity["city"]: expected_format += f" - {db_activity['city']}" - # Find matching to-do item + # Find matching to-do item (apostrophe-normalised) + expected_format_norm = _normalize_apos(expected_format) + db_name_norm = _normalize_apos(db_activity["name"]) matching_todo = None for todo in page_todos: + todo_text_norm = _normalize_apos(todo["text"]) if ( - expected_format in todo["text"] - or db_activity["name"] in todo["text"] + expected_format_norm in todo_text_norm + or db_name_norm in todo_text_norm ): matching_todo = todo break @@ -86,18 +99,22 @@ def verify_todo_database_correspondence(all_blocks, activities_by_day, _): ) return False - # Verify summary count matches checked to-dos + # Verify summary count matches the database's true visited count + expected_summary = ( + f"Total activities visited (from Day 1 to Day 3): {visited_count}" + ) for block in all_blocks: if block.get("type") == "paragraph": block_text = notion_utils.get_block_plain_text(block) - if "Total activities visited (from Day 1 to Day 3): 8" in block_text: + if expected_summary in block_text: print( - f"Success: Daily Itinerary Overview page created with correct structure. All {checked_todos_count} visited activities match database." + f"Success: Daily Itinerary Overview page created with correct structure. All {visited_count} visited activities match database." ) return True print( - f"Error: Summary shows incorrect visited activity count. Expected: {checked_todos_count} (based on checked to-do items)", + f"Error: Summary does not show the expected count {visited_count}. " + f"Expected line containing: {expected_summary!r}", file=sys.stderr, ) return False @@ -107,7 +124,11 @@ def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Daily Itinerary Overview page has been created correctly. 
""" - # Find the main Japan Travel Planner page + # Find the main Japan Travel Planner page. + # If main_id is supplied (the normal harness path) we MUST resolve it + # successfully; falling back to a global title search in that case can + # silently pick up a different copy of the page (e.g. another task's GT). + # The title fallback is only used when main_id was not supplied at all. page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( @@ -115,9 +136,15 @@ def verify(notion: Client, main_id: str = None) -> bool: ) if found_id and object_type == "page": page_id = found_id - - if not page_id: + else: + print( + f"Error: Could not resolve main_id {main_id!r} to an accessible page.", + file=sys.stderr, + ) + return False + else: page_id = notion_utils.find_page(notion, "Japan Travel Planner") + if not page_id: print("Error: Main 'Japan Travel Planner' page not found.", file=sys.stderr) return False @@ -138,19 +165,6 @@ def verify(notion: Client, main_id: str = None) -> bool: overview_page_id = result["id"] break - if not overview_page_id: - # Alternative method: check page title directly - for result in response.get("results", []): - title_list = ( - result.get("properties", {}).get("title", {}).get("title", []) - ) - for title_obj in title_list: - if "Daily Itinerary Overview" in title_obj.get("plain_text", ""): - overview_page_id = result["id"] - break - if overview_page_id: - break - except Exception as e: print( f"Error searching for Daily Itinerary Overview page: {e}", file=sys.stderr @@ -268,7 +282,6 @@ def verify(notion: Client, main_id: str = None) -> bool: # Organize database activities by day activities_by_day = {"Day 1": [], "Day 2": [], "Day 3": []} - visited_count = 0 for result in db_activities: properties = result.get("properties", {}) @@ -299,7 +312,6 @@ def verify(notion: Client, main_id: str = None) -> bool: elif prop_type == "checkbox": if prop_value.get("checkbox"): activity_info["visited"] = True - 
visited_count += 1 # Get day info elif "day" in prop_name.lower() and prop_type in [ @@ -319,6 +331,14 @@ def verify(notion: Client, main_id: str = None) -> bool: if activity_info["day"] and activity_info["name"]: activities_by_day[activity_info["day"]].append(activity_info) + # Visited count is restricted to Day 1-3, matching the summary text + visited_count = sum( + 1 + for day_acts in activities_by_day.values() + for a in day_acts + if a["visited"] + ) + # Now verify to-do items match database activities return verify_todo_database_correspondence( all_blocks, activities_by_day, visited_count diff --git a/tasks/notion/standard/japan_travel_planner/packing_progress_summary/description.md b/tasks/notion/standard/japan_travel_planner/packing_progress_summary/description.md index cd77f30b..28937cba 100644 --- a/tasks/notion/standard/japan_travel_planner/packing_progress_summary/description.md +++ b/tasks/notion/standard/japan_travel_planner/packing_progress_summary/description.md @@ -1,10 +1,10 @@ I'm preparing for my Japan trip and need to organize my packing list. Please help me: **Step 1: Update Items in the Packing List Database** -In the Clothes category, all items have already been packed except for the hat After this, check the `SIM Card` entry and the `Wallet` entry. +In the Clothes category, mark all items as packed except for the hat. After this, mark the `SIM Card` entry and the `Wallet` entry as packed. **Step 2: Create Packing Progress Summary** -After adding the items, create a new section in the main Japan Travel Planner page immediately after the "Packing List πŸ’Ό" heading. This section should contain: +After updating the items, create a new section in the main Japan Travel Planner page immediately after the "Packing List πŸ’Ό" heading. This section should contain: 1. A paragraph block with the bold text "**Packing Progress Summary**" 2. 
Followed by bullet list items showing statistics for each category in the format: diff --git a/tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/verify.py b/tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/verify.py index a4bd93d0..de35e337 100644 --- a/tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/verify.py +++ b/tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/verify.py @@ -3,6 +3,10 @@ from tasks.utils import notion_utils +def _norm(s: str) -> str: + return s.replace("’", "'").replace("β€―", " ") + + def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that restaurants from Day 1 of Travel Itinerary have corresponding expense entries. @@ -125,7 +129,7 @@ def verify(notion: Client, main_id: str = None) -> bool: expense_text = "".join( t.get("plain_text", "") for t in expense_prop.get("title", []) ) - if expense_text.strip() != restaurant_name: + if _norm(expense_text.strip()) != _norm(restaurant_name): continue # Check Date @@ -152,9 +156,7 @@ def verify(notion: Client, main_id: str = None) -> bool: comment_text = "".join( t.get("plain_text", "") for t in comment_prop.get("rich_text", []) ) - if comment_text.strip().replace( - "\u202f", " " - ) != expected_description.replace("\u202f", " "): + if _norm(comment_text.strip()) != _norm(expected_description): continue found_matching_expense = True diff --git a/tasks/notion/standard/online_resume/layout_adjustment/verify.py b/tasks/notion/standard/online_resume/layout_adjustment/verify.py index 3d705b54..9f07a9ac 100644 --- a/tasks/notion/standard/online_resume/layout_adjustment/verify.py +++ b/tasks/notion/standard/online_resume/layout_adjustment/verify.py @@ -109,63 +109,53 @@ def verify(notion: Client, main_id: str = None) -> bool: print("Error: Languages heading not found in left column.", file=sys.stderr) return False - # Look for Skills heading after Languages + # Look for Skills heading after Languages (any heading 
level is fine). + heading_types = ("heading_1", "heading_2", "heading_3") + skill_block_types = ("paragraph", "bulleted_list_item", "numbered_list_item") for i in range(languages_index + 1, len(left_column_blocks)): left_block = left_column_blocks[i] - + if ( - left_block.get("type") == "heading_2" + left_block.get("type") in heading_types and "Skills" in notion_utils.get_block_plain_text(left_block) ): skills_section_found = True - - # Check divider after Skills heading - if i + 1 < len(left_column_blocks): - next_block = left_column_blocks[i + 1] - if next_block.get("type") != "divider": + + # Collect skill rows directly after the Skills heading. + # Dividers, empty blocks, and other non-skill blocks are skipped; + # we stop only when we hit the next section heading. + for j in range(i + 1, len(left_column_blocks)): + skill_block = left_column_blocks[j] + block_type = skill_block.get("type") + + if block_type in heading_types: + break + if block_type not in skill_block_types: + continue + + skill_text = notion_utils.get_block_plain_text(skill_block) + if not skill_text or not skill_text.strip(): + continue + + # Check icon format + if skill_text.startswith("✨✨"): + skills_with_double_sparkles.append(skill_text) + elif skill_text.startswith("✨"): + skills_with_single_sparkle.append(skill_text) + else: print( - "Error: Divider not found after Skills heading.", + f"Error: Skill '{skill_text}' doesn't start with sparkle icon.", + file=sys.stderr, + ) + return False + + # Check format includes type in parentheses + if "(" not in skill_text or ")" not in skill_text: + print( + f"Error: Skill '{skill_text}' doesn't include type in parentheses.", file=sys.stderr, ) return False - - # Collect skills after divider - for j in range(i + 2, len(left_column_blocks)): - skill_block = left_column_blocks[j] - if skill_block.get("type") == "paragraph": - skill_text = notion_utils.get_block_plain_text(skill_block) - if skill_text and skill_text.strip(): # Check for non-empty text - 
# Check if text is bold - rich_text = skill_block.get("paragraph", {}).get("rich_text", []) - if rich_text and not rich_text[0].get("annotations", {}).get("bold"): - print( - f"Error: Skill '{skill_text}' is not bold.", - file=sys.stderr, - ) - return False - - # Check icon format - if skill_text.startswith("✨✨"): - skills_with_double_sparkles.append(skill_text) - elif skill_text.startswith("✨"): - skills_with_single_sparkle.append(skill_text) - else: - print( - f"Error: Skill '{skill_text}' doesn't start with sparkle icon.", - file=sys.stderr, - ) - return False - - # Check format includes type in parentheses - if "(" not in skill_text or ")" not in skill_text: - print( - f"Error: Skill '{skill_text}' doesn't include type in parentheses.", - file=sys.stderr, - ) - return False - elif skill_block.get("type") in ["heading_1", "heading_2", "heading_3"]: - # Stop when we reach another section - break break if not skills_section_found: diff --git a/tasks/notion/standard/online_resume/work_history_addition/description.md b/tasks/notion/standard/online_resume/work_history_addition/description.md index eabef045..4307a062 100644 --- a/tasks/notion/standard/online_resume/work_history_addition/description.md +++ b/tasks/notion/standard/online_resume/work_history_addition/description.md @@ -1,9 +1,15 @@ Hi! I realized I forgot to include one work experience on my resume page titled "Online Resume." Could you please help me add it to the "Work History" section? -The position is "Research Assistant," and it took place from January to August 2023. The description should be: "Assisted in conducting user experience research projects at my bachelor’s program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams." +The position is "Research Assistant," and it took place from January to August 2023. 
The description should be: "Assisted in conducting user experience research projects at my bachelor's program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams." -For the image or logo, please use the one from the "Education" section (my bachelor school) to keep everything consistent. +Since this is my most recent role, please put it as the **first** entry under Work History (above UI Design Internship). -Also, please make sure that the formatting β€” including font style, size, and layout β€” matches the existing entries in the Work History section so it looks seamless. +Please make sure that the formatting β€” including font style, size, and layout β€” matches the existing entries in the Work History section so it looks seamless. -Thank you! \ No newline at end of file +While you're at it, could you also: + +1. Update the summary callout near the top of the page so it mentions my new research experience (just weave the word "research" into the existing summary, keep it short β€” don't rewrite the whole thing). + +2. Add a new entry to my Skills database titled "Research Methodologies" with a Skill Level of 0.7. + +Thank you! diff --git a/tasks/notion/standard/online_resume/work_history_addition/verify.py b/tasks/notion/standard/online_resume/work_history_addition/verify.py index 2dad8a1b..785deb1e 100644 --- a/tasks/notion/standard/online_resume/work_history_addition/verify.py +++ b/tasks/notion/standard/online_resume/work_history_addition/verify.py @@ -2,186 +2,337 @@ from notion_client import Client from tasks.utils import notion_utils +EXPECTED_TITLE = "Research Assistant" +EXPECTED_DATE = "January - August 2023" +EXPECTED_DESCRIPTION = ( + "Assisted in conducting user experience research projects at my bachelor's program, " + "supporting data collection, analyzing user feedback, and preparing research reports. 
" + "Developed strong skills in research methodologies and improved collaboration with " + "interdisciplinary teams." +) +EXPECTED_SKILL_NAME = "Research Methodologies" +EXPECTED_SKILL_LEVEL = 0.7 +SUMMARY_KEYWORD = "research" +SUMMARY_PRESERVED_TOKEN = "Recent graduate" -def verify(notion: Client, main_id: str = None) -> bool: - """ - Verifies that the new work history entry for 'Research Assistant' has been added correctly. - """ - page_id = None - if main_id: - found_id, object_type = notion_utils.find_page_or_database_by_id( - notion, main_id + +def _list_children(notion: Client, parent_id: str) -> list[dict]: + out: list[dict] = [] + cursor: str | None = None + while True: + kwargs = {"block_id": parent_id, "page_size": 100} + if cursor: + kwargs["start_cursor"] = cursor + resp = notion.blocks.children.list(**kwargs) + out.extend(resp.get("results", [])) + if not resp.get("has_more"): + break + cursor = resp.get("next_cursor") + return out + + +def _annotations(block: dict) -> dict: + block_type = block.get("type") + if not block_type: + return {} + rich_text = block.get(block_type, {}).get("rich_text", []) + if not rich_text: + return {} + return rich_text[0].get("annotations", {}) + + +def _check_first_work_entry(notion: Client, page_id: str) -> bool: + all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) + + wh_block = None + edu_block = None + for b in all_blocks: + if b.get("type") != "heading_1": + continue + text = notion_utils.get_block_plain_text(b).strip() + if text == "Work History" and wh_block is None: + wh_block = b + elif text == "Education" and edu_block is None: + edu_block = b + + if not wh_block or not edu_block: + print( + "Error: Could not locate both 'Work History' and 'Education' headings.", + file=sys.stderr, ) - if found_id and object_type == "page": - page_id = found_id + return False - if not page_id: - page_id = notion_utils.find_page(notion, "Online Resume") - if not page_id: - print("Error: Page 'Online Resume' 
not found.", file=sys.stderr) + wh_parent_id = wh_block.get("parent", {}).get("block_id") + edu_parent_id = edu_block.get("parent", {}).get("block_id") + if not wh_parent_id or wh_parent_id != edu_parent_id: + print( + "Error: 'Work History' and 'Education' headings are not siblings under the " + "same parent block.", + file=sys.stderr, + ) + return False + + siblings = _list_children(notion, wh_parent_id) + wh_idx = next( + (i for i, b in enumerate(siblings) if b.get("id") == wh_block["id"]), -1 + ) + edu_idx = next( + (i for i, b in enumerate(siblings) if b.get("id") == edu_block["id"]), -1 + ) + if wh_idx < 0 or edu_idx < 0 or edu_idx <= wh_idx: + print( + "Error: Could not order 'Work History' before 'Education' in their parent.", + file=sys.stderr, + ) + return False + + first_column_list = None + for i in range(wh_idx + 1, edu_idx): + if siblings[i].get("type") == "column_list": + first_column_list = siblings[i] + break + + if not first_column_list: + print( + "Error: No column_list found between 'Work History' and 'Education'.", + file=sys.stderr, + ) + return False + + columns = _list_children(notion, first_column_list["id"]) + image_column = None + text_column = None + for col in columns: + if col.get("type") != "column": + continue + ratio = col.get("column", {}).get("width_ratio") + if ratio == 0.125 and image_column is None: + image_column = col + elif ratio == 0.875 and text_column is None: + text_column = col + + if not image_column or not text_column: + print( + "Error: First Work History column_list does not have the expected " + "width_ratios (0.125 / 0.875).", + file=sys.stderr, + ) + return False + + text_blocks = _list_children(notion, text_column["id"]) + paragraphs = [ + b + for b in text_blocks + if b.get("type") == "paragraph" + and notion_utils.get_block_plain_text(b).strip() + ] + if len(paragraphs) < 3: + print( + f"Error: First entry's text column has fewer than 3 non-empty " + f"paragraphs (got {len(paragraphs)}).", + file=sys.stderr, + 
) + return False + + title_block, date_block, description_block = paragraphs[0], paragraphs[1], paragraphs[2] + + title_text = notion_utils.get_block_plain_text(title_block).strip() + if title_text != EXPECTED_TITLE: + print( + f"Error: First Work History entry title is {title_text!r}, expected " + f"{EXPECTED_TITLE!r}. The Research Assistant entry must be the FIRST " + f"entry under Work History (above UI Design Internship).", + file=sys.stderr, + ) + return False + if not _annotations(title_block).get("bold"): + print("Error: Title is not bold.", file=sys.stderr) + return False + + date_text = notion_utils.get_block_plain_text(date_block).strip() + if date_text != EXPECTED_DATE: + print( + f"Error: Date paragraph is {date_text!r}, expected {EXPECTED_DATE!r}.", + file=sys.stderr, + ) + return False + date_annot = _annotations(date_block) + if not date_annot.get("italic"): + print( + f"Error: Date should be italic (annotations: {date_annot}).", + file=sys.stderr, + ) return False + if date_annot.get("color") != "gray": + print( + f"Error: Date color should be 'gray' (got {date_annot.get('color')!r}).", + file=sys.stderr, + ) + return False + + description_text = notion_utils.get_block_plain_text(description_block).strip() + if description_text != EXPECTED_DESCRIPTION: + print( + "Error: Description text mismatch.\n" + f" expected: {EXPECTED_DESCRIPTION!r}\n" + f" got: {description_text!r}", + file=sys.stderr, + ) + return False + desc_annot = _annotations(description_block) + if desc_annot.get("bold") or desc_annot.get("italic"): + print( + f"Error: Description should not be bold or italic (got {desc_annot}).", + file=sys.stderr, + ) + return False + if desc_annot.get("color") not in (None, "default"): + print( + f"Error: Description color should be 'default' (got " + f"{desc_annot.get('color')!r}).", + file=sys.stderr, + ) + return False + + return True + +def _check_summary_callout(notion: Client, page_id: str) -> bool: all_blocks = 
notion_utils.get_all_blocks_recursively(notion, page_id) - def find_image_url_under_heading(blocks, heading_text, notion_client): - heading_index = -1 - for i, block in enumerate(blocks): - block_type = block.get("type") - if block_type == "heading_1": - if heading_text in notion_utils.get_block_plain_text(block): - heading_index = i - break - - if heading_index == -1: - return None - - for i in range(heading_index + 1, len(blocks)): - block = blocks[i] - if block.get("type") in ["heading_1", "heading_2", "heading_3"]: - break - if block.get("type") == "image" and block.get("image", {}).get("file"): - return block.get("image", {}).get("file", {}).get("url") - if block.get("type") == "column_list": - column_list_id = block["id"] - columns = notion_utils.get_all_blocks_recursively( - notion_client, column_list_id - ) - for column in columns: - if column.get("type") == "column": - column_id = column["id"] - column_blocks = notion_utils.get_all_blocks_recursively( - notion_client, column_id - ) - for inner_block in column_blocks: - if inner_block.get("type") == "image" and inner_block.get( - "image", {} - ).get("file"): - return ( - inner_block.get("image", {}) - .get("file", {}) - .get("url") - ) - return None - - def get_block_annotations(block): - block_type = block.get("type") - if not block_type: - return {} - block_content = block.get(block_type) - if not block_content: - return {} - rich_text_list = block_content.get("rich_text", []) - if not rich_text_list: - return {} - return rich_text_list[0].get("annotations", {}) - - education_image_url = find_image_url_under_heading(all_blocks, "Education", notion) - if not education_image_url: - print( - "Error: Could not find the image in the 'Education' section.", - file=sys.stderr, - ) - return False - - heading_text = "Work History" - heading_index = -1 - for i, block in enumerate(all_blocks): - if block.get( - "type" - ) == "heading_1" and heading_text in notion_utils.get_block_plain_text(block): - heading_index = i 
+ work_history_idx = -1 + for i, b in enumerate(all_blocks): + if ( + b.get("type") == "heading_1" + and notion_utils.get_block_plain_text(b).strip() == "Work History" + ): + work_history_idx = i break + if work_history_idx < 0: + print( + "Error: 'Work History' heading not found while looking for summary callout.", + file=sys.stderr, + ) + return False - if heading_index == -1: - print(f"Error: Could not find the '{heading_text}' heading.", file=sys.stderr) + callouts = [ + b for b in all_blocks[:work_history_idx] if b.get("type") == "callout" + ] + if not callouts: + print( + "Error: No callout block found above the 'Work History' heading.", + file=sys.stderr, + ) return False - for i in range(heading_index + 1, len(all_blocks)): - block = all_blocks[i] - if block.get("type") in ["heading_1", "heading_2", "heading_3"]: + for callout in callouts: + text = notion_utils.get_block_plain_text(callout) + if SUMMARY_KEYWORD in text.lower() and SUMMARY_PRESERVED_TOKEN in text: + return True + + print( + f"Error: No callout above 'Work History' contains both the new keyword " + f"{SUMMARY_KEYWORD!r} (case-insensitive) and the original phrase " + f"{SUMMARY_PRESERVED_TOKEN!r}. 
The summary should be lightly edited, " + f"not rewritten.", + file=sys.stderr, + ) + for callout in callouts: + snippet = notion_utils.get_block_plain_text(callout)[:200] + print(f" callout text: {snippet!r}", file=sys.stderr) + return False + + +def _check_skills_database(notion: Client, page_id: str) -> bool: + all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) + skills_db_id = None + for b in all_blocks: + if b.get("type") == "child_database": + title = b.get("child_database", {}).get("title", "") + if title == "Skills": + skills_db_id = b["id"] + break + + if not skills_db_id: + print("Error: 'Skills' child database not found on the page.", file=sys.stderr) + return False + + rows: list[dict] = [] + cursor: str | None = None + while True: + kwargs = {"database_id": skills_db_id, "page_size": 100} + if cursor: + kwargs["start_cursor"] = cursor + resp = notion.databases.query(**kwargs) + rows.extend(resp.get("results", [])) + if not resp.get("has_more"): break + cursor = resp.get("next_cursor") - if block.get("type") == "column_list": - column_list_id = block["id"] - columns = notion_utils.get_all_blocks_recursively(notion, column_list_id) - if len(columns) < 2: - continue - - for column in columns: - if column.get("type") == "column": - if column.get("column", {}).get("width_ratio") == 0.125: - image_column = column - elif column.get("column", {}).get("width_ratio") == 0.875: - text_column = column - - image_column_blocks = notion_utils.get_all_blocks_recursively( - notion, image_column["id"] - ) - text_column_blocks = notion_utils.get_all_blocks_recursively( - notion, text_column["id"] - ) + for row in rows: + props = row.get("properties", {}) + title_prop = props.get("Skill", {}).get("title", []) + title_text = "".join(rt.get("plain_text", "") for rt in title_prop).strip() + if title_text != EXPECTED_SKILL_NAME: + continue + level = props.get("Skill Level", {}).get("number") + if level == EXPECTED_SKILL_LEVEL: + return True + print( + 
f"Error: Skills entry {EXPECTED_SKILL_NAME!r} has Skill Level={level!r}, " + f"expected {EXPECTED_SKILL_LEVEL}.", + file=sys.stderr, + ) + return False - column_image_url = None - for inner_block in image_column_blocks: - if inner_block.get("type") == "image" and inner_block.get( - "image", {} - ).get("file"): - column_image_url = ( - inner_block.get("image", {}).get("file", {}).get("url") - ) - break - - if ( - not column_image_url - or column_image_url[:100] != education_image_url[:100] - ): - continue - - for j, inner_block in enumerate(text_column_blocks): - if "Research Assistant" in notion_utils.get_block_plain_text( - inner_block - ): - title_annotations = get_block_annotations(inner_block) - if j + 2 < len(text_column_blocks): - date_block = text_column_blocks[j + 1] - description_block = text_column_blocks[j + 2] - - date_text = "January - August 2023" - description_text = "Assisted in conducting user experience research projects at my bachelor’s program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams." 
- - date_annotations = get_block_annotations(date_block) - description_annotations = get_block_annotations( - description_block - ) - - if ( - date_text in notion_utils.get_block_plain_text(date_block) - and description_text - in notion_utils.get_block_plain_text(description_block) - and title_annotations.get("bold") - and date_annotations.get("italic") - and date_annotations.get("color") == "gray" - and description_annotations.get("color") == "default" - and description_annotations.get("italic") != True - and description_annotations.get("bold") != True - ): - print("Success: Verified new work history entry.") - return True - - print("Failure: Could not verify the new work history entry.", file=sys.stderr) + print( + f"Error: Skills database has no entry titled {EXPECTED_SKILL_NAME!r}.", + file=sys.stderr, + ) return False -def main(): - """ - Executes the verification process and exits with a status code. - """ +def verify(notion: Client, main_id: str | None = None) -> bool: + page_id: str | None = None + if main_id: + found_id, object_type = notion_utils.find_page_or_database_by_id( + notion, main_id + ) + if found_id and object_type == "page": + page_id = found_id + else: + print( + f"Error: Could not resolve main_id {main_id!r} to an accessible page.", + file=sys.stderr, + ) + return False + else: + page_id = notion_utils.find_page(notion, "Online Resume") + + if not page_id: + print("Error: Page 'Online Resume' not found.", file=sys.stderr) + return False + + if not _check_first_work_entry(notion, page_id): + return False + if not _check_summary_callout(notion, page_id): + return False + if not _check_skills_database(notion, page_id): + return False + + print( + "Success: Verified Research Assistant entry (positioned first), summary " + "callout update, and Skills database entry." 
+ ) + return True + + +def main() -> None: notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) - else: - sys.exit(1) + sys.exit(1) if __name__ == "__main__": diff --git a/tasks/notion/standard/self_assessment/hyperfocus_analysis_report/description.md b/tasks/notion/standard/self_assessment/hyperfocus_analysis_report/description.md index 86f62990..3fb2291d 100644 --- a/tasks/notion/standard/self_assessment/hyperfocus_analysis_report/description.md +++ b/tasks/notion/standard/self_assessment/hyperfocus_analysis_report/description.md @@ -1,24 +1,27 @@ -Go to my Self Assessment page, and then create a hyperfocus analysis report by analyzing sessions with high productivity but significant challenges. +Go to my Self Assessment page, and then add an inline hyperfocus analysis section by analyzing sessions with high productivity but significant challenges. **Task Requirements:** -1. Create a new page titled "Hyperfocus Analysis Report" as a child of the Self Assessment page. The new page should be located between 'Why Use the Term "Hyperfocus"?' callout and the following divider line. +1. In the Self Assessment page, directly between the 'Why Use the Term "Hyperfocus"?' callout and the divider line that follows it, insert a hyperfocus analysis section consisting of: + - A summary callout (described in step 4) at the top of the section + - Per-session content (described in step 3) following the summary callout + Do NOT create a separate child page; insert the blocks inline as direct children of the Self Assessment page. 2. Query the "Hyperfocus Self-Assessment Worksheet" database to find all sessions where: - Work Completion Rate is greater than 80% (0.8) - At least one challenge is present in the Challenges field -3. For each qualifying session, create a section with: - - A heading showing the date and activity type (format: YYYY-MM-DD Activity) +3. 
For each qualifying session, append (after the summary callout) a section with: + - A level 2 heading showing the date and activity type (format: YYYY-MM-DD Activity) - A bullet list containing: - Focus factors used (e.g., Focus factors: XXX, YYY) - Energy level and mood (format: "Energy: X/10, Mood: Y/10") - Challenges faced (e.g., Challenges: XXX, YYY) - Strategies that helped overcome challenges (e.g., Strategies: XXX, YYY) - Work completion rate (format: "Completion: XX%") -4. At the top of the page, add a callout block (type: "info") with: +4. The summary callout (a callout block, type: "info") should contain: - Title: "Top 2 Most Effective Strategies" - Content: List the 2 most frequently used strategies from all sessions, each on a new line with format "• Strategy Name (used in X sessions)" + - Content: List the 2 strategies that appear most frequently across the entire "Hyperfocus Self-Assessment Worksheet" database (count every entry in the database — do NOT limit counting to the filtered sessions from step 2), each on a new line with format "• Strategy Name (used in X sessions)" **Structure Requirements:** -- The page must have the exact title "Hyperfocus Analysis Report" +- All inserted blocks must be located strictly between the 'Why Use the Term "Hyperfocus"?' 
callout and the divider line that follows it - Each session section must start with a level 2 heading - All session details must be in bullet point format -- The summary callout must be at the top of the page before any session details \ No newline at end of file +- The summary callout must come first in the inserted section, before any session heading diff --git a/tasks/notion/standard/self_assessment/hyperfocus_analysis_report/verify.py b/tasks/notion/standard/self_assessment/hyperfocus_analysis_report/verify.py index 02556de0..e5c9abd9 100644 --- a/tasks/notion/standard/self_assessment/hyperfocus_analysis_report/verify.py +++ b/tasks/notion/standard/self_assessment/hyperfocus_analysis_report/verify.py @@ -12,11 +12,9 @@ def validate_comma_separated(text: str, expected_items: list) -> bool: if not text or not expected_items: return False - # Extract items from text items = [item.strip().lower() for item in text.split(",")] expected_lower = [item.lower() for item in expected_items] - # Check if all expected items are present for expected in expected_lower: if not any(expected in item or item in expected for item in items): return False @@ -25,7 +23,9 @@ def validate_comma_separated(text: str, expected_items: list) -> bool: def verify(notion: Client, main_id: str = None) -> bool: """ - Verifies that the Hyperfocus Analysis Report has been created correctly. + Verifies that the inline hyperfocus analysis section has been inserted + between the 'Why Use the Term "Hyperfocus"?' callout and the divider + line that follows it inside the Self Assessment page. 
""" # Find the Self Assessment page self_assessment_page_id = main_id @@ -37,73 +37,43 @@ def verify(notion: Client, main_id: str = None) -> bool: self_assessment_page_id = found_id if not self_assessment_page_id: - # Try to find by name self_assessment_page_id = notion_utils.find_page(notion, "Self Assessment") if not self_assessment_page_id: print("Error: Self Assessment page not found.", file=sys.stderr) return False - # Find the Hyperfocus Analysis Report page - report_page_id = None - report_position = -1 - callout_position = -1 - divider_position = -1 children = notion.blocks.children.list(block_id=self_assessment_page_id).get( "results", [] ) - for i, child in enumerate(children): - # Track position of callout with "Why Use the Term" - if child.get("type") == "callout": - callout_text = notion_utils.get_block_plain_text(child) - if "Why Use the Term" in callout_text and "Hyperfocus" in callout_text: - callout_position = i - - # Track position of divider - elif child.get("type") == "divider": - if callout_position != -1 and divider_position == -1: - divider_position = i - - # Find the report page - elif child.get("type") == "child_page": - page_data = notion.pages.retrieve(page_id=child["id"]) - title_prop = ( - page_data.get("properties", {}).get("title", {}).get("title", []) - ) - if ( - title_prop - and title_prop[0].get("plain_text") == "Hyperfocus Analysis Report" - ): - report_page_id = child["id"] - report_position = i - - if not report_page_id: - print("Error: 'Hyperfocus Analysis Report' page not found.", file=sys.stderr) - return False - # Verify position - if callout_position == -1: + # Locate the 'Why Use the Term "Hyperfocus"?' callout and the first divider after it. 
+ callout_idx = -1 + divider_idx = -1 + for i, child in enumerate(children): + if callout_idx == -1 and child.get("type") == "callout": + text = notion_utils.get_block_plain_text(child) + if "Why Use the Term" in text and "Hyperfocus" in text: + callout_idx = i + elif callout_idx != -1 and child.get("type") == "divider": + divider_idx = i + break + + if callout_idx == -1: print( "Error: Could not find 'Why Use the Term \"Hyperfocus\"?' callout.", file=sys.stderr, ) return False - if divider_position == -1: + if divider_idx == -1: print("Error: Could not find divider after the callout.", file=sys.stderr) return False - if not (callout_position < report_position < divider_position): - print( - f"Error: Report page is not positioned between callout and divider. Positions: callout={callout_position}, report={report_position}, divider={divider_position}", - file=sys.stderr, - ) - return False + # Section blocks: strictly between the callout and the divider. + section_blocks = children[callout_idx + 1 : divider_idx] - # Get all blocks from the report page - all_blocks = notion_utils.get_all_blocks_recursively(notion, report_page_id) - - # Find the database in the Self Assessment page + # Find the worksheet database (recursively, since it lives inside a toggle). database_id = None for block in notion_utils.get_all_blocks_recursively( notion, self_assessment_page_id @@ -124,7 +94,20 @@ def verify(notion: Client, main_id: str = None) -> bool: ) return False - # Query database for sessions with >80% completion rate and challenges + # Top 2 strategies across ALL entries in the database (not filtered). 
+ all_sessions = notion.databases.query(database_id=database_id).get("results", []) + all_strategies = [] + for s in all_sessions: + strats = ( + s.get("properties", {}) + .get("Key Strategies Used", {}) + .get("multi_select", []) + ) + all_strategies.extend([x.get("name") for x in strats]) + strategy_counts = Counter(all_strategies) + top_2_strategies = strategy_counts.most_common(2) + + # Filtered sessions (>80% completion + at least one challenge). query_results = notion.databases.query( database_id=database_id, filter={ @@ -135,71 +118,40 @@ def verify(notion: Client, main_id: str = None) -> bool: }, ).get("results", []) - if not query_results: - print( - "Warning: No sessions found with >80% completion rate and challenges.", - file=sys.stderr, - ) - # Still check if the page structure is correct - - # Verify page structure - has_callout = False - has_top_strategies = False - session_count = 0 - found_sessions = {} # Track sessions by date for validation - - # Track strategies for validation - count from ALL sessions - all_sessions = notion.databases.query(database_id=database_id).get("results", []) - all_strategies = [] - for session in all_sessions: - strategies = ( - session.get("properties", {}) - .get("Key Strategies Used", {}) - .get("multi_select", []) - ) - all_strategies.extend([s.get("name") for s in strategies]) - - strategy_counts = Counter(all_strategies) - top_2_strategies = strategy_counts.most_common(2) - - # Build expected sessions from query results with all data expected_sessions = {} - for result in query_results: - date_prop = result.get("properties", {}).get("Date", {}).get("date", {}) + for r in query_results: + date_prop = r.get("properties", {}).get("Date", {}).get("date", {}) activity_prop = ( - result.get("properties", {}).get("Activity", {}).get("select", {}) + r.get("properties", {}).get("Activity", {}).get("select", {}) ) if date_prop and date_prop.get("start") and activity_prop: date_str = date_prop["start"] activity_name = 
activity_prop.get("name", "") - - # Extract all session data for validation focus_factors = [ f.get("name", "") - for f in result.get("properties", {}) + for f in r.get("properties", {}) .get("Focus Factors", {}) .get("multi_select", []) ] challenges = [ c.get("name", "") - for c in result.get("properties", {}) + for c in r.get("properties", {}) .get("Challenges", {}) .get("multi_select", []) ] strategies = [ s.get("name", "") - for s in result.get("properties", {}) + for s in r.get("properties", {}) .get("Key Strategies Used", {}) .get("multi_select", []) ] - energy = result.get("properties", {}).get("Energy Level", {}).get("number") - mood = result.get("properties", {}).get("Mood", {}).get("number") + energy = r.get("properties", {}).get("Energy Level", {}).get("number") + mood = r.get("properties", {}).get("Mood", {}).get("number") completion = ( - result.get("properties", {}) + r.get("properties", {}) .get("Work Completion Rate", {}) .get("number") ) - expected_sessions[date_str] = { "activity": activity_name, "focus_factors": focus_factors, @@ -210,52 +162,53 @@ def verify(notion: Client, main_id: str = None) -> bool: "completion": completion, } + # Walk section blocks. 
+ has_callout = False + has_top_strategies = False + callout_seen_at = -1 + found_sessions = {} + session_count = 0 current_session_date = None current_session_data = None - session_bullet_points = {} # Track bullet points for each session + session_bullet_points = {} - for i, block in enumerate(all_blocks): + for i, block in enumerate(section_blocks): block_type = block.get("type") - # Check for callout at the top - if block_type == "callout" and i < 5: # Should be near the top - callout_text = notion_utils.get_block_plain_text(block) - if "Top 2 Most Effective Strategies" in callout_text: + if block_type == "callout": + text = notion_utils.get_block_plain_text(block) + if "Top 2 Most Effective Strategies" in text: has_callout = True - # Check if it contains strategy information - s1, n1 = top_2_strategies[0] - s2, n2 = top_2_strategies[1] - t1 = f"{s1} (used in {n1} sessions)" - t2 = f"{s2} (used in {n2} sessions)" - - if t1 in callout_text and t2 in callout_text: - has_top_strategies = True - break + if callout_seen_at == -1: + callout_seen_at = i + if len(top_2_strategies) >= 2: + s1, n1 = top_2_strategies[0] + s2, n2 = top_2_strategies[1] + t1 = f"{s1} (used in {n1} sessions)" + t2 = f"{s2} (used in {n2} sessions)" + if t1 in text and t2 in text: + has_top_strategies = True - # Check for session headings with format YYYY-MM-DD Activity if block_type == "heading_2": heading_text = notion_utils.get_block_plain_text(block) - # Check if heading matches expected format - for date_str, session_data in expected_sessions.items(): - activity = session_data["activity"] - expected_heading = f"{date_str} {activity}" + # A new heading_2 closes the previous session's bullet scope. 
+ current_session_date = None + current_session_data = None + for date_str, sd in expected_sessions.items(): + expected_heading = f"{date_str} {sd['activity']}" if expected_heading in heading_text: - found_sessions[date_str] = session_data + found_sessions[date_str] = sd session_count += 1 current_session_date = date_str - current_session_data = session_data + current_session_data = sd session_bullet_points[date_str] = [] break - # Check for bullet points with session details if block_type == "bulleted_list_item" and current_session_data: bullet_text = notion_utils.get_block_plain_text(block) - - # Track bullet points for current session if current_session_date: session_bullet_points[current_session_date].append(bullet_text) - # Validate specific bullet point content if bullet_text.startswith("Focus factors"): content = bullet_text.split(":", 1)[1].strip() expected_factors = current_session_data.get("focus_factors", []) @@ -267,16 +220,13 @@ def verify(notion: Client, main_id: str = None) -> bool: return False elif "Energy" in bullet_text and "Mood" in bullet_text: - # Extract energy and mood values - energy_match = re.search(r"Energy:\s*(\d+)/10", bullet_text) - mood_match = re.search(r"Mood:\s*(\d+)/10", bullet_text) - - if energy_match and mood_match: - found_energy = int(energy_match.group(1)) - found_mood = int(mood_match.group(1)) + em = re.search(r"Energy:\s*(\d+)/10", bullet_text) + mm = re.search(r"Mood:\s*(\d+)/10", bullet_text) + if em and mm: + found_energy = int(em.group(1)) + found_mood = int(mm.group(1)) expected_energy = current_session_data.get("energy") expected_mood = current_session_data.get("mood") - if found_energy != expected_energy or found_mood != expected_mood: print( f"Error: Energy/Mood mismatch for {current_session_date}. 
Expected: Energy: {expected_energy}/10, Mood: {expected_mood}/10", @@ -313,15 +263,12 @@ def verify(notion: Client, main_id: str = None) -> bool: return False elif bullet_text.startswith("Completion"): - # Extract completion percentage - completion_match = re.search(r"Completion:\s*(\d+)%", bullet_text) - - if completion_match: - found_completion = int(completion_match.group(1)) + cm = re.search(r"Completion:\s*(\d+)%", bullet_text) + if cm: + found_completion = int(cm.group(1)) expected_completion = int( current_session_data.get("completion", 0) * 100 ) - if found_completion != expected_completion: print( f"Error: Completion rate mismatch for {current_session_date}. Expected: {expected_completion}%, Found: {found_completion}%", @@ -335,10 +282,10 @@ def verify(notion: Client, main_id: str = None) -> bool: ) return False - # Verify all sessions have complete bullet points + # Per-session bullet completeness. for date_str, bullets in session_bullet_points.items(): - bullets_text = " ".join(bullets) - required_items = [ + bt = " ".join(bullets) + required = [ "Focus factors", "Energy:", "Mood:", @@ -346,44 +293,51 @@ def verify(notion: Client, main_id: str = None) -> bool: "Strategies", "Completion", ] - missing_items = [] - - for item in required_items: - if item not in bullets_text: - missing_items.append(item) - - if missing_items: + missing = [r for r in required if r not in bt] + if missing: print( - f"Error: Missing bullet points for session {date_str}: {', '.join(missing_items)}", + f"Error: Missing bullet points for session {date_str}: {', '.join(missing)}", file=sys.stderr, ) return False - # Verify all requirements + # Final structural checks. if not has_callout: print( - "Error: Missing callout block with 'Top 2 Most Effective Strategies'.", + "Error: Missing callout block with 'Top 2 Most Effective Strategies' between the 'Why Use the Term \"Hyperfocus\"?' 
callout and the following divider.", file=sys.stderr, ) return False if not has_top_strategies and len(top_2_strategies) > 0: - print("Error: Callout doesn't contain strategy information.", file=sys.stderr) + print( + "Error: Callout doesn't contain correct top 2 strategy information.", + file=sys.stderr, + ) return False - if query_results and session_count == 0: - print("Error: No session sections found with proper headings.", file=sys.stderr) + # The summary callout must come before any session heading_2. + first_h2_idx = next( + (i for i, b in enumerate(section_blocks) if b.get("type") == "heading_2"), + len(section_blocks), + ) + if callout_seen_at == -1 or callout_seen_at > first_h2_idx: + print( + "Error: 'Top 2 Most Effective Strategies' callout must come before any session heading.", + file=sys.stderr, + ) return False - # Check if all expected sessions are present - missing_sessions = [] - for date_str in expected_sessions.keys(): - if date_str not in found_sessions: - missing_sessions.append(date_str) + if query_results and session_count == 0: + print( + "Error: No session sections found with proper headings.", file=sys.stderr + ) + return False - if missing_sessions: + missing = [d for d in expected_sessions if d not in found_sessions] + if missing: print( - f"Error: Missing session sections for dates: {', '.join(missing_sessions)}", + f"Error: Missing session sections for dates: {', '.join(missing)}", file=sys.stderr, ) return False @@ -395,7 +349,7 @@ def verify(notion: Client, main_id: str = None) -> bool: ) print( - "Success: Hyperfocus Analysis Report created with proper structure and content." + "Success: Hyperfocus analysis section created with proper structure and content." 
) return True diff --git a/tasks/notion/standard/standard_operating_procedure/section_organization/description.md b/tasks/notion/standard/standard_operating_procedure/section_organization/description.md index 24e38a95..a8593c0e 100644 --- a/tasks/notion/standard/standard_operating_procedure/section_organization/description.md +++ b/tasks/notion/standard/standard_operating_procedure/section_organization/description.md @@ -16,4 +16,4 @@ Modify the structure of the Standard Operating Procedure page in Notion by reorg - Position the "Tools" section in the left column - Position the "Terminologies" section in the right column - In the "Tools" column, add links to the Notion and Figma pages using appropriate reference blocks -- Preserve the original child pages from the "Tools" section in a toggle block placed below the column layout, with the toggle titled "original pages" \ No newline at end of file +- Preserve access to the original Notion and Figma child pages from the "Tools" section inside a toggle titled "original pages", positioned below the column layout (the original pages must remain reachable from this toggle) \ No newline at end of file diff --git a/tasks/notion/standard/standard_operating_procedure/section_organization/verify.py b/tasks/notion/standard/standard_operating_procedure/section_organization/verify.py index aa9506e3..12d21775 100644 --- a/tasks/notion/standard/standard_operating_procedure/section_organization/verify.py +++ b/tasks/notion/standard/standard_operating_procedure/section_organization/verify.py @@ -187,26 +187,52 @@ def verify(notion: Client, main_id: str = None) -> bool: print(f"Error getting toggle children: {e}", file=sys.stderr) return False - # Check for child_page blocks (Notion and Figma) + # Accept either nested child_page blocks (UI-built path) or + # link_to_page references (API-built path). 
Notion's public API does not + # expose a way to move/create a child_page block inside a toggle, so an + # API agent can only reference the original pages via link_to_page. + def _resolve_page_title(target_id: str) -> str: + try: + page = notion.pages.retrieve(page_id=target_id) + except Exception: + return "" + title_prop = page.get("properties", {}).get("title", {}) + rich_text = title_prop.get("title", []) + return "".join(rt.get("plain_text", "") for rt in rich_text) + notion_page_found = False figma_page_found = False - + for block in toggle_children: - if block.get("type") == "child_page": + btype = block.get("type") + if btype == "child_page": title = block.get("child_page", {}).get("title", "") - if title == "Notion": - notion_page_found = True - print("βœ“ Found 'Notion' child page in toggle") - elif title == "Figma": - figma_page_found = True - print("βœ“ Found 'Figma' child page in toggle") - + elif btype == "link_to_page": + target_id = block.get("link_to_page", {}).get("page_id") + title = _resolve_page_title(target_id) if target_id else "" + else: + continue + if title == "Notion": + notion_page_found = True + print(f"βœ“ Found 'Notion' page reference in toggle (via {btype})") + elif title == "Figma": + figma_page_found = True + print(f"βœ“ Found 'Figma' page reference in toggle (via {btype})") + if not notion_page_found: - print("Error: 'Notion' child page not found in toggle block.", file=sys.stderr) + print( + "Error: No 'Notion' page reference found in toggle block " + "(expected child_page or link_to_page targeting the original 'Notion' page).", + file=sys.stderr, + ) return False - + if not figma_page_found: - print("Error: 'Figma' child page not found in toggle block.", file=sys.stderr) + print( + "Error: No 'Figma' page reference found in toggle block " + "(expected child_page or link_to_page targeting the original 'Figma' page).", + file=sys.stderr, + ) return False # Step 6: Verify that original sections no longer exist at top level diff 
--git a/tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/description.md b/tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/description.md index 15131946..481e60f3 100644 --- a/tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/description.md +++ b/tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/description.md @@ -20,4 +20,3 @@ Use Playwright MCP tools to complete Cloudflare Turnstile authentication challen - Use the provided test credentials: testuser / password123 - Page shows success message inline, does not redirect to separate success page - Wait for all UI state changes before proceeding to next step -- Verify both Turnstile completion and form submission success diff --git a/tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/verify.py b/tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/verify.py index 7669cae0..7334db60 100644 --- a/tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/verify.py +++ b/tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/verify.py @@ -20,7 +20,7 @@ def get_model_response(): Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None diff --git a/tasks/playwright/standard/eval_web/extraction_table/data.csv b/tasks/playwright/standard/eval_web/extraction_table/data.csv index 483b32c8..8a0f438c 100644 --- a/tasks/playwright/standard/eval_web/extraction_table/data.csv +++ b/tasks/playwright/standard/eval_web/extraction_table/data.csv @@ -1,98 +1,98 @@ Title, Rating, Likes, Views, Replies -React 18 New Features Deep Dive, "4.8", 856, 12543, 89 -Vue 3 Composition API in Practice, "4.5", 743, 9876, 67 -Advanced TypeScript Types Guide, "4.9", 924, 15432, 102 -Node.js Performance Optimization, "4.2", 567, 8765, 45 -Frontend Engineering Best Practices, "4.7", 812, 11234, 78 -Microservices Architecture Patterns, "4.3", 634, 9543, 56 -Docker Containerization Deployment, "4.6", 789, 10876, 71 -Kubernetes Cluster Management, "4.4", 698, 9234, 63 -GraphQL API Design Principles, "4.8", 876, 13456, 94 -Webpack 5 Configuration Guide, "4.1", 523, 7654, 38 -Vite Build Tool Usage, "4.5", 745, 10123, 69 -ESLint Code Standards, "4.7", 823, 11567, 82 -Unit Testing Best Practices, "4.3", 612, 8934, 51 -Performance Monitoring & Optimization, "4.9", 945, 16234, 108 -Security Protection Strategies, "4.2", 578, 8456, 47 -Database Design Principles, "4.6", 767, 10567, 73 -Caching Strategies Implementation, "4.4", 689, 9123, 61 -Message Queue Applications, "4.8", 834, 12876, 87 -Distributed Systems Design, "4.0", 456, 6789, 34 -Cloud Native Development, "4.5", 723, 9876, 65 -DevOps Process Optimization, "4.7", 801, 11234, 79 -Machine Learning Introduction, "4.1", 534, 7543, 41 -Artificial Intelligence Applications, "4.6", 778, 10456, 74 -Blockchain Technology Fundamentals, "4.3", 645, 8765, 53 -Mobile Development Techniques, "4.9", 912, 14567, 97 -Cross-Platform 
Solutions, "4.2", 589, 8234, 48 -Progressive Web App Development, "4.8", 867, 12345, 91 -Web3 Development Guide, "4.4", 712, 9567, 64 -NFT Smart Contracts, "4.5", 756, 10234, 70 -DeFi Protocol Design, "4.7", 834, 11876, 83 -Game Engine Development, "4.3", 623, 8567, 52 -3D Graphics Rendering, "4.6", 789, 10678, 75 -Audio Video Processing, "4.1", 545, 7234, 42 -IoT Applications, "4.8", 856, 12567, 88 -Edge Computing Practices, "4.2", 567, 8345, 46 -5G Network Technology, "4.9", 923, 15123, 103 -Quantum Computing Principles, "4.4", 678, 9345, 62 -Bioinformatics Analysis, "4.5", 734, 9876, 68 -Data Science Methods, "4.7", 812, 11456, 80 -Algorithms and Data Structures, "4.3", 634, 8678, 54 -System Design Interview, "4.6", 778, 10345, 76 -Code Refactoring Techniques, "4.8", 845, 12234, 89 -Open Source Contributions, "4.2", 556, 7890, 43 -Technical Team Management, "4.5", 723, 9567, 66 -Product Thinking Development, "4.9", 901, 14234, 95 -User Experience Design, "4.1", 512, 7123, 39 -Interface Interaction Optimization, "4.7", 789, 10890, 77 -Accessibility Design, "4.4", 667, 8901, 58 -SEO Optimization Strategies, "4.6", 756, 10123, 72 -Social Media Operations, "4.3", 623, 8456, 55 -Serverless Architecture, "4.7", 834, 11234, 81 -API Gateway Design, "4.2", 567, 8765, 49 -Microservice Communication, "4.8", 892, 13567, 95 -Event-Driven Architecture, "4.5", 723, 9876, 67 -CQRS Pattern Implementation, "4.3", 645, 8234, 54 -Domain-Driven Design, "4.6", 778, 10456, 73 -Clean Architecture Principles, "4.4", 689, 9123, 62 -Hexagonal Architecture, "4.1", 534, 7543, 42 -Onion Architecture, "4.5", 712, 9567, 65 -Event Sourcing Patterns, "4.7", 823, 11876, 79 -Saga Pattern for Distributed Systems, "4.3", 612, 8934, 53 -Circuit Breaker Pattern, "4.8", 856, 12543, 87 -Bulkhead Pattern, "4.2", 578, 8456, 47 -Retry Pattern Implementation, "4.6", 767, 10567, 74 -Timeout Pattern, "4.4", 698, 9234, 63 -Rate Limiting Strategies, "4.9", 934, 15432, 103 -Load Balancing Techniques, "4.1", 523, 
7654, 39 -Service Mesh Architecture, "4.5", 745, 10123, 69 -Istio Service Mesh, "4.7", 812, 11567, 82 -Envoy Proxy Configuration, "4.3", 634, 9543, 56 -Consul Service Discovery, "4.6", 789, 10876, 71 -Kubernetes Ingress, "4.4", 676, 9345, 58 -Helm Chart Development, "4.8", 845, 12234, 89 -Terraform Infrastructure, "4.2", 556, 7890, 44 -Ansible Automation, "4.5", 723, 9567, 66 -Jenkins Pipeline, "4.7", 801, 11234, 78 -GitLab CI/CD, "4.3", 623, 8567, 52 -GitHub Actions, "4.6", 789, 10678, 75 -Azure DevOps, "4.1", 512, 7123, 41 -AWS CodePipeline, "4.8", 867, 12345, 91 -Docker Compose, "4.4", 712, 9567, 64 -Kubernetes Operators, "4.5", 756, 10234, 70 -Custom Resource Definitions, "4.7", 834, 11876, 83 -Pod Security Policies, "4.3", 623, 8567, 52 -Network Policies, "4.6", 789, 10678, 75 -RBAC Configuration, "4.1", 545, 7234, 42 -Secret Management, "4.8", 856, 12567, 88 -ConfigMap Usage, "4.2", 567, 8345, 46 -Persistent Volumes, "4.9", 923, 15123, 103 -StatefulSets, "4.4", 678, 9345, 62 -DaemonSets, "4.5", 734, 9876, 68 -Jobs and CronJobs, "4.7", 812, 11456, 80 -Horizontal Pod Autoscaler, "4.3", 634, 8678, 54 -Vertical Pod Autoscaler, "4.6", 778, 10345, 76 -Cluster Autoscaler, "4.8", 845, 12234, 89 -Resource Quotas, "4.2", 556, 7890, 43 -Limit Ranges, "4.5", 723, 9567, 66 +React 18 New Features Deep Dive, 4.8, 856, 12543, 89 +Vue 3 Composition API in Practice, 4.5, 743, 9876, 67 +Advanced TypeScript Types Guide, 4.9, 924, 15432, 102 +Node.js Performance Optimization, 4.2, 567, 8765, 45 +Frontend Engineering Best Practices, 4.7, 812, 11234, 78 +Microservices Architecture Patterns, 4.3, 634, 9543, 56 +Docker Containerization Deployment, 4.6, 789, 10876, 71 +Kubernetes Cluster Management, 4.4, 698, 9234, 63 +GraphQL API Design Principles, 4.8, 876, 13456, 94 +Webpack 5 Configuration Guide, 4.1, 523, 7654, 38 +Vite Build Tool Usage, 4.5, 745, 10123, 69 +ESLint Code Standards, 4.7, 823, 11567, 82 +Unit Testing Best Practices, 4.3, 612, 8934, 51 +Performance Monitoring & 
Optimization, 4.9, 945, 16234, 108 +Security Protection Strategies, 4.2, 578, 8456, 47 +Database Design Principles, 4.6, 767, 10567, 73 +Caching Strategies Implementation, 4.4, 689, 9123, 61 +Message Queue Applications, 4.8, 834, 12876, 87 +Distributed Systems Design, 4.0, 456, 6789, 34 +Cloud Native Development, 4.5, 723, 9876, 65 +DevOps Process Optimization, 4.7, 801, 11234, 79 +Machine Learning Introduction, 4.1, 534, 7543, 41 +Artificial Intelligence Applications, 4.6, 778, 10456, 74 +Blockchain Technology Fundamentals, 4.3, 645, 8765, 53 +Mobile Development Techniques, 4.9, 912, 14567, 97 +Cross-Platform Solutions, 4.2, 589, 8234, 48 +Progressive Web App Development, 4.8, 867, 12345, 91 +Web3 Development Guide, 4.4, 712, 9567, 64 +NFT Smart Contracts, 4.5, 756, 10234, 70 +DeFi Protocol Design, 4.7, 834, 11876, 83 +Game Engine Development, 4.3, 623, 8567, 52 +3D Graphics Rendering, 4.6, 789, 10678, 75 +Audio Video Processing, 4.1, 545, 7234, 42 +IoT Applications, 4.8, 856, 12567, 88 +Edge Computing Practices, 4.2, 567, 8345, 46 +5G Network Technology, 4.9, 923, 15123, 103 +Quantum Computing Principles, 4.4, 678, 9345, 62 +Bioinformatics Analysis, 4.5, 734, 9876, 68 +Data Science Methods, 4.7, 812, 11456, 80 +Algorithms and Data Structures, 4.3, 634, 8678, 54 +System Design Interview, 4.6, 778, 10345, 76 +Code Refactoring Techniques, 4.8, 845, 12234, 89 +Open Source Contributions, 4.2, 556, 7890, 43 +Technical Team Management, 4.5, 723, 9567, 66 +Product Thinking Development, 4.9, 901, 14234, 95 +User Experience Design, 4.1, 512, 7123, 39 +Interface Interaction Optimization, 4.7, 789, 10890, 77 +Accessibility Design, 4.4, 667, 8901, 58 +SEO Optimization Strategies, 4.6, 756, 10123, 72 +Social Media Operations, 4.3, 623, 8456, 55 +Serverless Architecture, 4.7, 834, 11234, 81 +API Gateway Design, 4.2, 567, 8765, 49 +Microservice Communication, 4.8, 892, 13567, 95 +Event-Driven Architecture, 4.5, 723, 9876, 67 +CQRS Pattern Implementation, 4.3, 645, 8234, 54 
+Domain-Driven Design, 4.6, 778, 10456, 73 +Clean Architecture Principles, 4.4, 689, 9123, 62 +Hexagonal Architecture, 4.1, 534, 7543, 42 +Onion Architecture, 4.5, 712, 9567, 65 +Event Sourcing Patterns, 4.7, 823, 11876, 79 +Saga Pattern for Distributed Systems, 4.3, 612, 8934, 53 +Circuit Breaker Pattern, 4.8, 856, 12543, 87 +Bulkhead Pattern, 4.2, 578, 8456, 47 +Retry Pattern Implementation, 4.6, 767, 10567, 74 +Timeout Pattern, 4.4, 698, 9234, 63 +Rate Limiting Strategies, 4.9, 934, 15432, 103 +Load Balancing Techniques, 4.1, 523, 7654, 39 +Service Mesh Architecture, 4.5, 745, 10123, 69 +Istio Service Mesh, 4.7, 812, 11567, 82 +Envoy Proxy Configuration, 4.3, 634, 9543, 56 +Consul Service Discovery, 4.6, 789, 10876, 71 +Kubernetes Ingress, 4.4, 676, 9345, 58 +Helm Chart Development, 4.8, 845, 12234, 89 +Terraform Infrastructure, 4.2, 556, 7890, 44 +Ansible Automation, 4.5, 723, 9567, 66 +Jenkins Pipeline, 4.7, 801, 11234, 78 +GitLab CI/CD, 4.3, 623, 8567, 52 +GitHub Actions, 4.6, 789, 10678, 75 +Azure DevOps, 4.1, 512, 7123, 41 +AWS CodePipeline, 4.8, 867, 12345, 91 +Docker Compose, 4.4, 712, 9567, 64 +Kubernetes Operators, 4.5, 756, 10234, 70 +Custom Resource Definitions, 4.7, 834, 11876, 83 +Pod Security Policies, 4.3, 623, 8567, 52 +Network Policies, 4.6, 789, 10678, 75 +RBAC Configuration, 4.1, 545, 7234, 42 +Secret Management, 4.8, 856, 12567, 88 +ConfigMap Usage, 4.2, 567, 8345, 46 +Persistent Volumes, 4.9, 923, 15123, 103 +StatefulSets, 4.4, 678, 9345, 62 +DaemonSets, 4.5, 734, 9876, 68 +Jobs and CronJobs, 4.7, 812, 11456, 80 +Horizontal Pod Autoscaler, 4.3, 634, 8678, 54 +Vertical Pod Autoscaler, 4.6, 778, 10345, 76 +Cluster Autoscaler, 4.8, 845, 12234, 89 +Resource Quotas, 4.2, 556, 7890, 43 +Limit Ranges, 4.5, 723, 9567, 66 diff --git a/tasks/playwright/standard/eval_web/extraction_table/description.md b/tasks/playwright/standard/eval_web/extraction_table/description.md index cee70019..0b263702 100644 --- 
a/tasks/playwright/standard/eval_web/extraction_table/description.md +++ b/tasks/playwright/standard/eval_web/extraction_table/description.md @@ -20,11 +20,11 @@ Use Playwright MCP tools to extract all data from the specified website and pres ```csv Title, Rating, Likes, Views, Replies -SEO Optimization, "4.6", 756, 10123, 72 -Vue 3 Composition API, "4.5", 743, 9876, 67 -Advanced TypeScript Types Guide, "4.9", 924, 15432, 102 -Node.js Performance Optimization, "4.2", 567, 8765, 45 -Frontend Engineering Best Practices, "4.7", 812, 11234, 78 +SEO Optimization, 4.6, 756, 10123, 72 +Vue 3 Composition API, 4.5, 743, 9876, 67 +Advanced TypeScript Types Guide, 4.9, 924, 15432, 102 +Node.js Performance Optimization, 4.2, 567, 8765, 45 +Frontend Engineering Best Practices, 4.7, 812, 11234, 78 ``` ## Notes: diff --git a/tasks/playwright/standard/eval_web/extraction_table/verify.py b/tasks/playwright/standard/eval_web/extraction_table/verify.py index 5bc5aa08..ff3152bc 100644 --- a/tasks/playwright/standard/eval_web/extraction_table/verify.py +++ b/tasks/playwright/standard/eval_web/extraction_table/verify.py @@ -13,11 +13,26 @@ import csv from io import StringIO -# Expected CSV header (must match exactly, including spaces) -EXPECTED_HEADER_LINE = "Title, Rating, Likes, Views, Replies" +# Expected CSV columns (order in output is flexible, but the set must match) EXPECTED_HEADERS = ["Title", "Rating", "Likes", "Views", "Replies"] + + +def _load_expected_rows(): + """Load ground-truth rows from data.csv as a set of normalized tuples.""" + path = os.path.join(os.path.dirname(__file__), "data.csv") + expected = set() + with open(path, newline='') as f: + reader = csv.reader(f) + next(reader) # skip header + for row in reader: + cells = [c.strip() for c in row] + expected.add((cells[0], float(cells[1]), int(cells[2]), int(cells[3]), int(cells[4]))) + return expected + + +EXPECTED_ROWS = _load_expected_rows() # Exact number of data rows (must match data.csv exactly) 
-EXPECTED_DATA_ROWS = 97 +EXPECTED_DATA_ROWS = len(EXPECTED_ROWS) def get_model_response(): @@ -26,7 +41,7 @@ def get_model_response(): Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"| MCP_MESSAGES: {messages_path}") + print(f"| MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("| Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -71,9 +86,9 @@ def extract_csv_from_response(response): lines = response.split('\n') csv_start = -1 - # Stricter header matching: look for lines containing "Title" and "Rating" + # Stricter header matching: look for lines containing all expected column names for i, line in enumerate(lines): - if "Title" in line and "Rating" in line and "Likes" in line: + if all(h in line for h in EXPECTED_HEADERS): csv_start = i break @@ -110,11 +125,6 @@ def validate_csv_data(csv_text): if len(lines) != expected_total_rows: return False, f"| CSV total row count mismatch, expected: {expected_total_rows} rows, actual: {len(lines)} rows" - # Check header row format (must match exactly) - header_line = lines[0].strip() - if header_line != EXPECTED_HEADER_LINE: - return False, f"| Header format mismatch, expected: '{EXPECTED_HEADER_LINE}', actual: '{header_line}'" - # Parse CSV to validate structure csv_reader = csv.reader(StringIO(csv_text)) rows = list(csv_reader) @@ -125,16 +135,25 @@ def validate_csv_data(csv_text): if len(row) != expected_columns: return False, f"| Row {i+1} column count incorrect, expected: {expected_columns} columns, actual: {len(row)} columns" + # Check header columns β€” order can vary, but the set must match exactly + header_cells = [c.strip() for c in rows[0]] + if set(header_cells) != set(EXPECTED_HEADERS): + missing_h = set(EXPECTED_HEADERS) - set(header_cells) + extra_h = set(header_cells) - set(EXPECTED_HEADERS) + return False, f"| Header columns mismatch, missing: {sorted(missing_h)}, extra: {sorted(extra_h)}" + 
col_idx = {name: i for i, name in enumerate(header_cells)} + # Validate data row format valid_rows = 0 + seen = set() for i, row in enumerate(rows[1:], 2): # Skip header, start from row 2 # Check if each column has data if not all(cell.strip() for cell in row): return False, f"| Row {i} contains empty data" # Check numeric column format (Rating, Likes, Views, Replies should not have quotes) - for col_idx, col_name in [(1, "Rating"), (2, "Likes"), (3, "Views"), (4, "Replies")]: - value = row[col_idx].strip() + for col_name in ("Rating", "Likes", "Views", "Replies"): + value = row[col_idx[col_name]].strip() # Check for quotes (should not have any) if value.startswith('"') and value.endswith('"'): @@ -150,12 +169,25 @@ def validate_csv_data(csv_text): if not value.isdigit(): return False, f"| Row {i} {col_name} should be pure digits, actual: {value}" + seen.add(( + row[col_idx["Title"]].strip(), + float(row[col_idx["Rating"]].strip()), + int(row[col_idx["Likes"]].strip()), + int(row[col_idx["Views"]].strip()), + int(row[col_idx["Replies"]].strip()), + )) valid_rows += 1 # Validate number of data rows if valid_rows != EXPECTED_DATA_ROWS: return False, f"| Valid data row count mismatch, expected: {EXPECTED_DATA_ROWS} rows, actual: {valid_rows} rows" + # Validate row contents match data.csv exactly (order independent) + missing = EXPECTED_ROWS - seen + extra = seen - EXPECTED_ROWS + if missing or extra: + return False, f"| Row content mismatch, missing: {len(missing)} row(s), extra: {len(extra)} row(s); sample missing: {sorted(missing)[:2]}" + return True, f"| CSV validation successful: format matches data.csv exactly, {valid_rows} valid data rows" except Exception as e: diff --git a/tasks/playwright/standard/web_search/birth_of_arvinxu/description.md b/tasks/playwright/standard/web_search/birth_of_arvinxu/description.md index 9455377e..81e5677b 100644 --- a/tasks/playwright/standard/web_search/birth_of_arvinxu/description.md +++ 
b/tasks/playwright/standard/web_search/birth_of_arvinxu/description.md @@ -4,5 +4,4 @@ Use Playwright MCP tools to search for information about the X profile https://x ## Requirements: -Extract the answer in specific format: - - just year,like 1990, 2001 +Output ONLY the 4-digit birth year (e.g. `1990`, `2001`), with no other text — no prose, units, punctuation, quotes, or surrounding whitespace. diff --git a/tasks/playwright/standard/web_search/birth_of_arvinxu/verify.py b/tasks/playwright/standard/web_search/birth_of_arvinxu/verify.py index 338905d1..8f3a59ce 100644 --- a/tasks/playwright/standard/web_search/birth_of_arvinxu/verify.py +++ b/tasks/playwright/standard/web_search/birth_of_arvinxu/verify.py @@ -63,22 +63,29 @@ def parse_ai_results(work_dir: Path) -> Dict[str, Any]: found_answer = False ai_responses = [] - for message in messages: - if message.get("role") == "assistant": - content = str(message.get("content", "")) + # Find the last completed assistant message + for message in reversed(messages): + if (message.get("role") == "assistant" and + message.get("status") == "completed" and + message.get("type") == "message"): + content = "" # Handle both string and list content formats - if isinstance(message.get("content"), list): - content = " ".join( - item.get("text", "") if isinstance(item, dict) else str(item) - for item in message.get("content", []) - ) + raw = message.get("content", "") + if isinstance(raw, list): + for item in raw: + if isinstance(item, dict) and item.get("type") in ["text", "output_text"]: + content = item.get("text", "") + break + elif isinstance(raw, str): + content = raw ai_responses.append(content) # Exact match (character-for-character, case-sensitive, no trimming) if content == EXPECTED_GROUND_TRUTH: found_answer = True + break return { "success": True, diff --git a/tasks/playwright/standard/web_search/r1_arxiv/description.md b/tasks/playwright/standard/web_search/r1_arxiv/description.md index d20946f5..24a4d098 100644 --- 
a/tasks/playwright/standard/web_search/r1_arxiv/description.md +++ b/tasks/playwright/standard/web_search/r1_arxiv/description.md @@ -1,19 +1,20 @@ # Web Search Task -Use Playwright MCP tools to search for the DeepSeek R1 research paper and extract all the paragraphs of the Conclusion section. +Use Playwright MCP tools to search for the **v1 (initial) version** of the DeepSeek R1 research paper on arXiv and extract all the paragraphs of the "Conclusion, Limitations, and Future Work" section. ## Requirements: -1. Search for the DeepSeek R1 research paper -2. Navigate to the paper and find the Conclusion section -3. Extract **ALL the paragraphs** of the Conclusion section -4. **Provide the content in Markdown format - no explanations, no additional text** +1. Search for the DeepSeek R1 research paper on arXiv +2. Navigate to the **v1 (initial) version** of the paper +3. Find the "Conclusion, Limitations, and Future Work" section +4. Extract **ALL the paragraphs** of the "Conclusion, Limitations, and Future Work" section +5. **Provide the content in Markdown format - no explanations, no additional text** ## Important Notes: - **Output ALL the paragraphs of text** - **Do NOT include any explanations, summaries, or additional content** -- **The response should contain ONLY the Conclusion section content formatted in Markdown** +- **The response should contain ONLY the "Conclusion, Limitations, and Future Work" section content formatted in Markdown** ## Expected Output: -All the paragraphs of the Conclusion section from the DeepSeek R1 paper, formatted in Markdown with proper paragraph structure and formatting. +All the paragraphs of the "Conclusion, Limitations, and Future Work" section from the DeepSeek R1 paper, formatted in Markdown with proper paragraph structure and formatting. 
diff --git a/tasks/playwright/standard/web_search/r1_arxiv/verify.py b/tasks/playwright/standard/web_search/r1_arxiv/verify.py index 73e7d2b9..30b23f67 100644 --- a/tasks/playwright/standard/web_search/r1_arxiv/verify.py +++ b/tasks/playwright/standard/web_search/r1_arxiv/verify.py @@ -9,6 +9,9 @@ import sys import json import os +import re +import difflib +import unicodedata from pathlib import Path from typing import Dict, Any @@ -19,6 +22,28 @@ # Expected ground truth content from content.txt EXPECTED_CONTENT_FILE = "content.txt" +# Similarity threshold for content match (after normalization) +SIMILARITY_THRESHOLD = 0.9 + + +def _normalize_content(s: str) -> str: + """Strip markdown / unicode / whitespace noise so comparison focuses on text content.""" + # Strip bold markers + s = re.sub(r'\*\*|__', '', s) + # Strip leading list markers and numbered items + s = re.sub(r'^[\s]*[-*+]\s+', '', s, flags=re.M) + s = re.sub(r'^[\s]*\d+\.\s+', '', s, flags=re.M) + # Unicode-normalize and collapse smart quotes / dashes to ASCII + s = unicodedata.normalize('NFKC', s) + s = s.translate(str.maketrans({ + 'β€˜': "'", '’': "'", + 'β€œ': '"', '”': '"', + '–': '-', 'β€”': '-', + })) + # Collapse whitespace, lowercase + s = re.sub(r'\s+', ' ', s).strip().lower() + return s + # ============================================================================= # MCP RESULT PARSING # ============================================================================= @@ -84,21 +109,28 @@ def parse_ai_results(work_dir: Path) -> Dict[str, Any]: ai_responses = [] extracted_content = "" - for message in messages: - if message.get("role") == "assistant": - content = str(message.get("content", "")) + # Find the last completed assistant message + for message in reversed(messages): + if (message.get("role") == "assistant" and + message.get("status") == "completed" and + message.get("type") == "message"): + content = "" # Handle both string and list content formats - if 
isinstance(message.get("content"), list): - content = " ".join( - item.get("text", "") if isinstance(item, dict) else str(item) - for item in message.get("content", []) - ) + raw = message.get("content", "") + if isinstance(raw, list): + for item in raw: + if isinstance(item, dict) and item.get("type") in ["text", "output_text"]: + content = item.get("text", "") + break + elif isinstance(raw, str): + content = raw ai_responses.append(content) # Store the last response as extracted content extracted_content = content + break return { "success": True, @@ -117,16 +149,19 @@ def compare_content(extracted: str, expected: str) -> Dict[str, Any]: if not extracted: return {"success": False, "error": "No extracted content found"} - # Normalize content for comparison (remove extra whitespace, normalize line breaks) - extracted_normalized = " ".join(extracted.split()) - expected_normalized = " ".join(expected.split()) + # Normalize markdown / unicode / whitespace noise on both sides + extracted_normalized = _normalize_content(extracted) + expected_normalized = _normalize_content(expected) - # Direct text comparison - content must be exactly the same - is_exact_match = extracted_normalized == expected_normalized + # Similarity-based comparison; threshold tolerates small residual noise + similarity = difflib.SequenceMatcher(None, expected_normalized, extracted_normalized).ratio() + is_exact_match = similarity >= SIMILARITY_THRESHOLD return { "success": True, "is_exact_match": is_exact_match, + "similarity": similarity, + "threshold": SIMILARITY_THRESHOLD, "extracted_length": len(extracted_normalized), "expected_length": len(expected_normalized), "extracted_preview": extracted_normalized[:100] + "..." 
if len(extracted_normalized) > 100 else extracted_normalized, @@ -181,14 +216,15 @@ def verify_task(work_dir: Path) -> bool: print(f"| Content comparison results:") print(f"| - Extracted length: {comparison['extracted_length']} characters") print(f"| - Expected length: {comparison['expected_length']} characters") + print(f"| - Similarity: {comparison['similarity']:.4f} (threshold: {comparison['threshold']})") print(f"| - Extracted preview: {comparison['extracted_preview']}") print(f"| - Expected preview: {comparison['expected_preview']}") if comparison['is_exact_match']: - print("| Task completed successfully! Content matches exactly.") + print(f"| Task completed successfully! Similarity {comparison['similarity']:.4f} >= {comparison['threshold']}.") return True else: - print("| Task verification failed. Content does not match exactly.") + print(f"| Task verification failed. Similarity {comparison['similarity']:.4f} < {comparison['threshold']}.") return False diff --git a/tasks/playwright_webarena/standard/reddit/ai_data_analyst/verify.py b/tasks/playwright_webarena/standard/reddit/ai_data_analyst/verify.py index b9d3151d..d7fea406 100644 --- a/tasks/playwright_webarena/standard/reddit/ai_data_analyst/verify.py +++ b/tasks/playwright_webarena/standard/reddit/ai_data_analyst/verify.py @@ -51,14 +51,13 @@ def parse_key_value_format(text): def normalize_text(text): """ - Normalize text for comparison by handling different quote styles and whitespace. + Normalize text for comparison by collapsing whitespace. 
""" if not isinstance(text, str): return str(text) - # Replace various quote styles with standard quotes - text = text.replace(""", "'").replace(""", "'") - text = text.replace('"', '"').replace('"', '"') + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') # Normalize whitespace text = " ".join(text.split()) @@ -181,13 +180,18 @@ async def verify() -> bool: extracted_data = parse_key_value_format(post_content) print(f"Extracted data: {extracted_data}", file=sys.stderr) - # Load expected values from label.txt + # Load expected values from label.txt β€” hard fail if missing label_path = Path(__file__).parent / "label.txt" - if label_path.exists(): - with open(label_path, "r") as f: - expected_text = f.read().strip() - expected_data = parse_key_value_format(expected_text) - print("Loaded expected values from label.txt", file=sys.stderr) + if not label_path.exists(): + print( + f"FAILED: Ground-truth file not found at {label_path}", + file=sys.stderr, + ) + return False + with open(label_path, "r") as f: + expected_text = f.read().strip() + expected_data = parse_key_value_format(expected_text) + print("Loaded expected values from label.txt", file=sys.stderr) # Verify all required keys are present required_keys = [ @@ -207,7 +211,7 @@ async def verify() -> bool: if missing_keys: print( - "FAILED: Missing required keys in submission: {', '.join(missing_keys)}", + f"FAILED: Missing required keys in submission: {', '.join(missing_keys)}", file=sys.stderr, ) print( @@ -219,64 +223,35 @@ async def verify() -> bool: # Validate data format and content errors = [] - # Check numeric fields - try: - post_count = int(extracted_data["Deeplearning_Post_Count"]) - if ( - "expected_data" in locals() - and "Deeplearning_Post_Count" in expected_data - ): - expected_count = int(expected_data["Deeplearning_Post_Count"]) - if post_count != expected_count: - errors.append( - f"Deeplearning_Post_Count mismatch: got {post_count}, expected 
{expected_count}" - ) - except ValueError: - errors.append( - f"Deeplearning_Post_Count must be a number, got: {extracted_data['Deeplearning_Post_Count']}" - ) - - # If we have expected data, compare against it - if "expected_data" in locals(): - # Compare each field - for key in required_keys: - if key in expected_data and key in extracted_data: - expected_val = normalize_text(expected_data[key]) - actual_val = normalize_text(extracted_data[key]) - - # For numeric fields, compare as integers - if key in [ - "Deeplearning_Post_Count", - "ChatGPT_Tool_Vote_Count", - "Page2_Top_Post_Votes", - ]: - try: - expected_int = int(expected_val) - actual_int = int(actual_val) - if expected_int != actual_int: - errors.append( - f"{key} mismatch: got {actual_int}, expected {expected_int}" - ) - except ValueError: - errors.append( - f"{key} should be numeric: got '{actual_val}'" - ) - else: - # For text fields, compare normalized text - if expected_val != actual_val: + # Compare each field against expected_data + for key in required_keys: + if key in expected_data and key in extracted_data: + expected_val = normalize_text(expected_data[key]) + actual_val = normalize_text(extracted_data[key]) + + # For numeric fields, compare as integers + if key in [ + "Deeplearning_Post_Count", + "ChatGPT_Tool_Vote_Count", + "Page2_Top_Post_Votes", + ]: + try: + expected_int = int(expected_val) + actual_int = int(actual_val) + if expected_int != actual_int: errors.append( - f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" + f"{key} mismatch: got {actual_int}, expected {expected_int}" ) - - else: - # If no expected data, just do basic validation - for key in required_keys: - if key not in extracted_data: - errors.append(f"Missing required key: {key}") - elif ( - not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]" - ): - errors.append(f"{key} was not filled in") + except ValueError: + errors.append( + f"{key} should be numeric: got '{actual_val}'" + ) + else: + # For 
text fields, compare normalized text + if expected_val != actual_val: + errors.append( + f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" + ) if errors: print( @@ -286,10 +261,9 @@ async def verify() -> bool: for error in errors: print(f" - {error}", file=sys.stderr) print("\nExpected values from label.txt:", file=sys.stderr) - if "expected_data" in locals(): - for key in required_keys: - if key in expected_data: - print(f" {key}: {expected_data[key]}", file=sys.stderr) + for key in required_keys: + if key in expected_data: + print(f" {key}: {expected_data[key]}", file=sys.stderr) return False # All checks passed diff --git a/tasks/playwright_webarena/standard/reddit/budget_europe_travel/verify.py b/tasks/playwright_webarena/standard/reddit/budget_europe_travel/verify.py index 387911c7..c481ab6a 100644 --- a/tasks/playwright_webarena/standard/reddit/budget_europe_travel/verify.py +++ b/tasks/playwright_webarena/standard/reddit/budget_europe_travel/verify.py @@ -7,23 +7,23 @@ BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") + def normalize_text(text): """ - Normalize text for comparison by handling different quote styles and whitespace. + Normalize text for comparison by collapsing whitespace. """ if not isinstance(text, str): return str(text) - - # Replace various quote styles with standard quotes - text = text.replace('\'', "'").replace('\'', "'") - text = text.replace('"', '"').replace('"', '"') - text = text.replace('&', '&') - + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + # Normalize whitespace - text = ' '.join(text.split()) - + text = " ".join(text.split()) + return text.strip() + async def verify() -> bool: """ Verifies that the budget Europe travel resource task has been completed correctly. 
@@ -193,7 +193,7 @@ async def verify() -> bool: wiki_title_elem = page.locator(selector) if await wiki_title_elem.count(): title_text = await wiki_title_elem.first.text_content() - if expected_wiki_title in title_text: + if title_text and normalize_text(title_text) == normalize_text(expected_wiki_title): wiki_title_found = True break @@ -257,37 +257,13 @@ async def verify() -> bool: else: print("βœ“ On search results page for 'travel insurance Europe'", file=sys.stderr) - # Check for upvoted posts - upvote_found = False - - # Method 1: Check for "Retract upvote" button (indicates user has upvoted) - retract_buttons = page.locator('button:has-text("Retract upvote")') + # Postmill renders vote buttons as icon-only (no text node), so + # match the title attribute. The title flips from "Upvote" to + # "Retract upvote" when the current user has upvoted. + retract_buttons = page.locator('button[title="Retract upvote"]') if await retract_buttons.count() > 0: print("βœ“ Found upvoted post (Retract upvote button present)", file=sys.stderr) - upvote_found = True - - # Method 2: Check for posts with upvote count >= 1 - if not upvote_found: - # Look for vote counts - vote_elements = page.locator('div.vote, span.vote-count, [class*="vote"]') - - for i in range(await vote_elements.count()): - vote_elem = vote_elements.nth(i) - vote_text = await vote_elem.text_content() - try: - # Extract number from vote text - import re - numbers = re.findall(r'\d+', vote_text) - if numbers: - vote_count = int(numbers[0]) - if vote_count >= 1: - print(f"βœ“ Found post with {vote_count} upvote(s)", file=sys.stderr) - upvote_found = True - break - except: - continue - - if not upvote_found: + else: print("❌ ERROR: No upvoted posts found in search results", file=sys.stderr) verification_passed = False diff --git a/tasks/playwright_webarena/standard/reddit/buyitforlife_research/verify.py b/tasks/playwright_webarena/standard/reddit/buyitforlife_research/verify.py index 0fc63e08..3058201d 100644 --- 
a/tasks/playwright_webarena/standard/reddit/buyitforlife_research/verify.py +++ b/tasks/playwright_webarena/standard/reddit/buyitforlife_research/verify.py @@ -36,14 +36,13 @@ def parse_markdown_list_format(text): def normalize_text(text): """ - Normalize text for comparison by handling different quote styles and whitespace. + Normalize text for comparison by collapsing whitespace. """ if not isinstance(text, str): return str(text) - # Replace various quote styles with standard quotes - text = text.replace(""", "'").replace(""", "'") - text = text.replace('"', '"').replace('"', '"') + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') # Normalize whitespace text = " ".join(text.split()) @@ -264,17 +263,6 @@ async def verify() -> bool: if expected_val != actual_val: errors.append(f"{key} mismatch: got '{actual_val}', expected '{expected_val}'") - # Verify upvotes are in descending order - try: - post1_votes = int(extracted_data["Post1_Upvotes"]) - post2_votes = int(extracted_data["Post2_Upvotes"]) - post3_votes = int(extracted_data["Post3_Upvotes"]) - - if not (post1_votes >= post2_votes >= post3_votes): - errors.append(f"Posts should be ordered by upvotes: {post1_votes} >= {post2_votes} >= {post3_votes}") - except (ValueError, KeyError): - pass # Already reported above - if errors: print("Error: Validation failed with the following issues:", file=sys.stderr) for error in errors: @@ -287,7 +275,6 @@ async def verify() -> bool: print("βœ“ Submission 'Research Report for BuyItForLife' found in correct forum", file=sys.stderr) print("βœ“ All 14 required fields present and correct", file=sys.stderr) print("βœ“ Data matches expected values from label.txt", file=sys.stderr) - print("βœ“ Posts ordered by upvotes (descending)", file=sys.stderr) return True except PlaywrightTimeoutError as e: diff --git a/tasks/playwright_webarena/standard/reddit/llm_research_summary/description.md 
b/tasks/playwright_webarena/standard/reddit/llm_research_summary/description.md index 2417a4d2..6da397b6 100644 --- a/tasks/playwright_webarena/standard/reddit/llm_research_summary/description.md +++ b/tasks/playwright_webarena/standard/reddit/llm_research_summary/description.md @@ -21,13 +21,13 @@ I need you to perform a comprehensive analysis of Large Language Model discussio - Total_LLM_Posts|FILL_VALUE - Top1_Title|FILL_VALUE - Top1_Upvotes|FILL_VALUE -- Top1_Date|FILL_VALUE +- Top1_Author|FILL_VALUE - Top2_Title|FILL_VALUE - Top2_Upvotes|FILL_VALUE -- Top2_Date|FILL_VALUE +- Top2_Author|FILL_VALUE - Top3_Title|FILL_VALUE - Top3_Upvotes|FILL_VALUE -- Top3_Date|FILL_VALUE +- Top3_Author|FILL_VALUE - Deeplearning_MostDiscussed|FILL_VALUE - Deeplearning_Comments|FILL_VALUE ``` \ No newline at end of file diff --git a/tasks/playwright_webarena/standard/reddit/llm_research_summary/label.txt b/tasks/playwright_webarena/standard/reddit/llm_research_summary/label.txt index 72b25e79..54ba45d0 100644 --- a/tasks/playwright_webarena/standard/reddit/llm_research_summary/label.txt +++ b/tasks/playwright_webarena/standard/reddit/llm_research_summary/label.txt @@ -1,12 +1,12 @@ -- Total_LLM_Posts|9 +- Total_LLM_Posts|8 - Top1_Title|[P] I made a command-line tool that explains your errors using ChatGPT (link in comments) - Top1_Upvotes|2655 -- Top1_Date|3 years ago +- Top1_Author|jsonathan - Top2_Title|[P] I built Adrenaline, a debugger that fixes errors and explains them with GPT-3 - Top2_Upvotes|1542 -- Top2_Date|3 years ago +- Top2_Author|jsonathan - Top3_Title|[N] OpenAI may have benchmarked GPT-4's coding ability on it's own training data - Top3_Upvotes|925 -- Top3_Date|2 years ago +- Top3_Author|Balance- - Deeplearning_MostDiscussed|Do companies actually care about their model's training/inference speed? 
- Deeplearning_Comments|39 \ No newline at end of file diff --git a/tasks/playwright_webarena/standard/reddit/llm_research_summary/verify.py b/tasks/playwright_webarena/standard/reddit/llm_research_summary/verify.py index 519f212c..621ae684 100644 --- a/tasks/playwright_webarena/standard/reddit/llm_research_summary/verify.py +++ b/tasks/playwright_webarena/standard/reddit/llm_research_summary/verify.py @@ -15,51 +15,42 @@ def parse_key_value_format(text): """ Parse the Key|Value format from the submission body. - Handles both pipe (|) and colon (:) separators for compatibility. """ data = {} - - # Try to parse with pipe separator first (expected format) + lines = text.strip().split('\n') for line in lines: line = line.strip() if not line: continue - + # Remove markdown list prefix if present if line.startswith('- '): line = line[2:] elif line.startswith('* '): line = line[2:] - - # Try pipe separator first + elif line.startswith('β€’ '): + line = line[2:] + if '|' in line: parts = line.split('|', 1) if len(parts) == 2: key = parts[0].strip() value = parts[1].strip() data[key] = value - # Fallback to colon separator for label.txt compatibility - elif ':' in line: - parts = line.split(':', 1) - if len(parts) == 2: - key = parts[0].strip() - value = parts[1].strip() - data[key] = value - + return data def normalize_text(text): """ - Normalize text for comparison by handling different quote styles and whitespace. + Normalize text for comparison by collapsing whitespace. 
""" if not isinstance(text, str): return str(text) - # Replace various quote styles with standard quotes - text = text.replace(""", "'").replace(""", "'") - text = text.replace('"', '"').replace('"', '"') + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') # Normalize whitespace text = " ".join(text.split()) @@ -168,26 +159,31 @@ async def verify() -> bool: extracted_data = parse_key_value_format(post_content) print(f"Extracted data: {extracted_data}", file=sys.stderr) - # Load expected values from label.txt + # Load expected values from label.txt β€” hard fail if missing label_path = Path(__file__).parent / "label.txt" - if label_path.exists(): - with open(label_path, "r") as f: - expected_text = f.read().strip() - expected_data = parse_key_value_format(expected_text) - print("Loaded expected values from label.txt", file=sys.stderr) + if not label_path.exists(): + print( + f"Error: Ground-truth file not found at {label_path}", + file=sys.stderr, + ) + return False + with open(label_path, "r") as f: + expected_text = f.read().strip() + expected_data = parse_key_value_format(expected_text) + print("Loaded expected values from label.txt", file=sys.stderr) # Verify all required keys are present required_keys = [ "Total_LLM_Posts", "Top1_Title", "Top1_Upvotes", - "Top1_Date", + "Top1_Author", "Top2_Title", "Top2_Upvotes", - "Top2_Date", + "Top2_Author", "Top3_Title", "Top3_Upvotes", - "Top3_Date", + "Top3_Author", "Deeplearning_MostDiscussed", "Deeplearning_Comments", ] @@ -204,79 +200,36 @@ async def verify() -> bool: ) return False - # Validate data format and content + # Compare each field against expected_data errors = [] - - # Check Total_LLM_Posts is a number and matches expected - try: - total_posts = int(extracted_data["Total_LLM_Posts"]) - if "expected_data" in locals() and "Total_LLM_Posts" in expected_data: - expected_total = int(expected_data["Total_LLM_Posts"]) - if total_posts != expected_total: - 
errors.append( - f"Total_LLM_Posts mismatch: got {total_posts}, expected {expected_total}" - ) - elif total_posts < 5: # Based on exploration, should be at least 5 - errors.append(f"Total_LLM_Posts seems too low: {total_posts}") - except ValueError: - errors.append( - f"Total_LLM_Posts must be a number, got: {extracted_data['Total_LLM_Posts']}" - ) - - # If we have expected data, compare against it - if "expected_data" in locals(): - # Compare each field - for key in required_keys: - if key in expected_data and key in extracted_data: - expected_val = normalize_text(expected_data[key]) - actual_val = normalize_text(extracted_data[key]) - - # For numeric fields, compare as integers - if ( - "Upvotes" in key - or "Comments" in key - or key == "Total_LLM_Posts" - ): - try: - expected_int = int(expected_val) - actual_int = int(actual_val) - if expected_int != actual_int: - errors.append( - f"{key} mismatch: got {actual_int}, expected {expected_int}" - ) - except ValueError: - errors.append( - f"{key} should be numeric: got '{actual_val}'" - ) - else: - # For text fields, compare normalized text - if expected_val != actual_val: + for key in required_keys: + if key in expected_data and key in extracted_data: + expected_val = normalize_text(expected_data[key]) + actual_val = normalize_text(extracted_data[key]) + + # For numeric fields, compare as integers + if ( + "Upvotes" in key + or "Comments" in key + or key == "Total_LLM_Posts" + ): + try: + expected_int = int(expected_val) + actual_int = int(actual_val) + if expected_int != actual_int: errors.append( - f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" + f"{key} mismatch: got {actual_int}, expected {expected_int}" ) - - else: - # If no expected data, just do basic validation - for key in required_keys: - if key not in extracted_data: - errors.append(f"Missing required key: {key}") - elif ( - not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]" - ): - errors.append(f"{key} was not filled in") 
- - # Verify upvotes are in descending order for top 3 - try: - top1_votes = int(extracted_data["Top1_Upvotes"]) - top2_votes = int(extracted_data["Top2_Upvotes"]) - top3_votes = int(extracted_data["Top3_Upvotes"]) - - if not (top1_votes >= top2_votes >= top3_votes): - errors.append( - f"Top posts should be ordered by upvotes: {top1_votes} >= {top2_votes} >= {top3_votes}" - ) - except (ValueError, KeyError): - pass # Already reported above + except ValueError: + errors.append( + f"{key} should be numeric: got '{actual_val}'" + ) + else: + # For text fields, compare normalized text + if expected_val != actual_val: + errors.append( + f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" + ) if errors: print( diff --git a/tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/verify.py b/tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/verify.py index 94be4319..270fd5b9 100644 --- a/tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/verify.py +++ b/tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/verify.py @@ -52,16 +52,17 @@ def parse_key_value_format(text): def normalize_text(text): """ - Normalize text for comparison by handling different quote styles and whitespace. + Normalize text for comparison by decoding the & HTML entity and collapsing whitespace. 
""" if not isinstance(text, str): return str(text) - # Replace various quote styles with standard quotes - text = text.replace(""", "'").replace(""", "'") - text = text.replace('"', '"').replace('"', '"') + # Decode & HTML entity text = text.replace("&", "&") + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + # Normalize whitespace text = " ".join(text.split()) @@ -171,13 +172,18 @@ async def verify() -> bool: extracted_data = parse_key_value_format(post_content) print(f"Extracted data: {extracted_data}", file=sys.stderr) - # Load expected values from label.txt + # Load expected values from label.txt β€” hard fail if missing label_path = Path(__file__).parent / "label.txt" - if label_path.exists(): - with open(label_path, "r") as f: - expected_text = f.read().strip() - expected_data = parse_key_value_format(expected_text) - print("Loaded expected values from label.txt", file=sys.stderr) + if not label_path.exists(): + print( + f"Error: Ground-truth file not found at {label_path}", + file=sys.stderr, + ) + return False + with open(label_path, "r") as f: + expected_text = f.read().strip() + expected_data = parse_key_value_format(expected_text) + print("Loaded expected values from label.txt", file=sys.stderr) # Verify all required keys are present required_keys = [ @@ -208,65 +214,37 @@ async def verify() -> bool: ) return False - # Validate data format and content + # Compare each field against expected_data errors = [] - - # Check Total_Year_Posts is a number and matches expected - try: - total_posts = int(extracted_data["Total_Year_Posts"]) - if "expected_data" in locals() and "Total_Year_Posts" in expected_data: - expected_total = int(expected_data["Total_Year_Posts"]) - if total_posts != expected_total: - errors.append( - f"Total_Year_Posts mismatch: got {total_posts}, expected {expected_total}" - ) - except ValueError: - errors.append( - f"Total_Year_Posts must be a number, got: 
{extracted_data['Total_Year_Posts']}" - ) - - # If we have expected data, compare against it - if "expected_data" in locals(): - # Compare each field - for key in required_keys: - if key in expected_data and key in extracted_data: - expected_val = normalize_text(expected_data[key]) - actual_val = normalize_text(extracted_data[key]) - - # For numeric fields, compare as integers - if ( - "Upvotes" in key - or "Comments" in key - or key == "Total_Year_Posts" - or key == "Total_Image_Posts_5Pages" - ): - try: - expected_int = int(expected_val) - actual_int = int(actual_val) - if expected_int != actual_int: - errors.append( - f"{key} mismatch: got {actual_int}, expected {expected_int}" - ) - except ValueError: - errors.append( - f"{key} should be numeric: got '{actual_val}'" - ) - else: - # For text fields, compare normalized text - if expected_val != actual_val: + for key in required_keys: + if key in expected_data and key in extracted_data: + expected_val = normalize_text(expected_data[key]) + actual_val = normalize_text(extracted_data[key]) + + # For numeric fields, compare as integers + if ( + "Upvotes" in key + or "Comments" in key + or key == "Total_Year_Posts" + or key == "Total_Image_Posts_5Pages" + ): + try: + expected_int = int(expected_val) + actual_int = int(actual_val) + if expected_int != actual_int: errors.append( - f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" + f"{key} mismatch: got {actual_int}, expected {expected_int}" ) - - else: - # If no expected data, just do basic validation - for key in required_keys: - if key not in extracted_data: - errors.append(f"Missing required key: {key}") - elif ( - not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]" - ): - errors.append(f"{key} was not filled in") + except ValueError: + errors.append( + f"{key} should be numeric: got '{actual_val}'" + ) + else: + # For text fields, compare normalized text + if expected_val != actual_val: + errors.append( + f"{key} mismatch: got 
'{actual_val}', expected '{expected_val}'" + ) if errors: print( diff --git a/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/description.md b/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/description.md index 76c4fe1a..fdfaa4dc 100644 --- a/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/description.md +++ b/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/description.md @@ -4,8 +4,7 @@ I'm conducting research on NBA player discussions in online sports communities. 1. Register a new account with username 'NBA_DataAnalyst_2024' and password 'Research#2024!' -2. Navigate to the sports forum and search for posts containing 'NBA' in their titles: - - Collect data from the 5 NBA-related posts with the most comments +2. Navigate to the sports forum and browse its posts to find posts whose titles contain 'NBA'. From these, collect the 5 with the most comments: - For each post, record: the exact post title, vote count, comment count, and the username of the person who submitted it 3. Visit the user profile of 'BCLetsRide69': @@ -16,7 +15,6 @@ I'm conducting research on NBA player discussions in online sports communities. 
- Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the colon, follow the markdown format): ``` -- Total_NBA_Posts|FILL_VALUE - Top1_Title|FILL_VALUE - Top1_Votes|FILL_VALUE - Top1_Comments|FILL_VALUE diff --git a/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/label.txt b/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/label.txt index 57b4ca8e..d6757351 100644 --- a/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/label.txt +++ b/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/label.txt @@ -1,4 +1,3 @@ -- Total_NBA_Posts|20 - Top1_Title|Hamby claims [WNBA Champ] Aces 'unprofessional' after trade - Top1_Votes|614 - Top1_Comments|170 @@ -7,7 +6,7 @@ - Top2_Votes|1266 - Top2_Comments|145 - Top2_Author|XXmynameisNeganXX -- Top3_Title|[ESPN] Announced attendance at the Alamodome tonight|68,323, a new single-game NBA record, in the Spurs' first game there since Game 4 of the 2002 Western Conference Semifinals. +- Top3_Title|[ESPN] Announced attendance at the Alamodome tonight: 68,323, a new single-game NBA record, in the Spurs' first game there since Game 4 of the 2002 Western Conference Semifinals. 
- Top3_Votes|1511 - Top3_Comments|101 - Top3_Author|dragon8811 diff --git a/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/verify.py b/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/verify.py index 43c5dad1..7490cc82 100644 --- a/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/verify.py +++ b/tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/verify.py @@ -23,12 +23,14 @@ def parse_key_value_format(text): lines = text.strip().split('\n') for line in lines: line = line.strip() - if not line or line.startswith('#'): + if not line: continue - + # Remove bullet point if present if line.startswith('- '): line = line[2:] + elif line.startswith('* '): + line = line[2:] elif line.startswith('β€’ '): line = line[2:] @@ -51,12 +53,8 @@ def normalize_text(text): if not isinstance(text, str): return str(text) - # Replace various quote styles with standard quotes - text = text.replace(""", "'").replace(""", "'") - text = text.replace('"', '"').replace('"', '"') - # Also normalize apostrophes - use unicode escapes to be safe - text = text.replace("\u2019", "'") # RIGHT SINGLE QUOTATION MARK (') - text = text.replace("\u2018", "'") # LEFT SINGLE QUOTATION MARK (') + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') # Normalize whitespace text = " ".join(text.split()) @@ -138,22 +136,22 @@ async def verify() -> bool: ".post-body", ".RichText", '[class*="RichText"]', - 'div:has(> p:has-text("Total_NBA_Posts"))', - 'div:has-text("Total_NBA_Posts"):has-text("Most_Popular_NBA_Author")', + 'div:has(> p:has-text("Top1_Title"))', + 'div:has-text("Top1_Title"):has-text("BCLetsRide69_Total_Posts")', ] for selector in selectors: content_element = page.locator(selector) if await content_element.count(): post_content = await content_element.first.inner_text() - if "Total_NBA_Posts" in post_content: + if "Top1_Title" in post_content: print( f"Found submission content using 
selector: {selector}", file=sys.stderr, ) break - if not post_content or "Total_NBA_Posts" not in post_content: + if not post_content or "Top1_Title" not in post_content: print( "Error: Could not find submission body with required format", file=sys.stderr, @@ -167,17 +165,21 @@ async def verify() -> bool: extracted_data = parse_key_value_format(post_content) print(f"Extracted data: {extracted_data}", file=sys.stderr) - # Load expected values from label.txt + # Load expected values from label.txt β€” hard fail if missing label_path = Path(__file__).parent / "label.txt" - if label_path.exists(): - with open(label_path, "r") as f: - expected_text = f.read().strip() - expected_data = parse_key_value_format(expected_text) - print("Loaded expected values from label.txt", file=sys.stderr) + if not label_path.exists(): + print( + f"Error: Ground-truth file not found at {label_path}", + file=sys.stderr, + ) + return False + with open(label_path, "r") as f: + expected_text = f.read().strip() + expected_data = parse_key_value_format(expected_text) + print("Loaded expected values from label.txt", file=sys.stderr) # Verify all required keys are present required_keys = [ - "Total_NBA_Posts", "Top1_Title", "Top1_Votes", "Top1_Comments", @@ -213,69 +215,36 @@ async def verify() -> bool: ) return False - # Validate data format and content + # Compare each field against expected_data errors = [] - - # Check Total_NBA_Posts is a number and matches expected - try: - total_posts = int(extracted_data["Total_NBA_Posts"]) - if "expected_data" in locals() and "Total_NBA_Posts" in expected_data: - expected_total = int(expected_data["Total_NBA_Posts"]) - if total_posts != expected_total: - errors.append( - f"Total_NBA_Posts mismatch: got {total_posts}, expected {expected_total}" - ) - elif ( - total_posts < 5 - ): # Should be at least 5 since we're collecting top 5 - errors.append(f"Total_NBA_Posts seems too low: {total_posts}") - except ValueError: - errors.append( - f"Total_NBA_Posts must 
be a number, got: {extracted_data['Total_NBA_Posts']}" - ) - - # If we have expected data, compare against it - if "expected_data" in locals(): - # Compare each field - for key in required_keys: - if key in expected_data and key in extracted_data: - expected_val = normalize_text(expected_data[key]) - actual_val = normalize_text(extracted_data[key]) - - # For numeric fields, compare as integers - if ( - "Votes" in key - or "Comments" in key - or key == "Total_NBA_Posts" - or key == "BCLetsRide69_Total_Posts" - ): - try: - expected_int = int(expected_val) - actual_int = int(actual_val) - if expected_int != actual_int: - errors.append( - f"{key} mismatch: got {actual_int}, expected {expected_int}" - ) - except ValueError: - errors.append( - f"{key} should be numeric: got '{actual_val}'" - ) - else: - # For text fields, compare normalized text - if expected_val != actual_val: + for key in required_keys: + if key in expected_data and key in extracted_data: + expected_val = normalize_text(expected_data[key]) + actual_val = normalize_text(extracted_data[key]) + + # For numeric fields, compare as integers + if ( + "Votes" in key + or "Comments" in key + or key == "BCLetsRide69_Total_Posts" + ): + try: + expected_int = int(expected_val) + actual_int = int(actual_val) + if expected_int != actual_int: errors.append( - f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" + f"{key} mismatch: got {actual_int}, expected {expected_int}" ) - - else: - # If no expected data, just do basic validation - for key in required_keys: - if key not in extracted_data: - errors.append(f"Missing required key: {key}") - elif ( - not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]" - ): - errors.append(f"{key} was not filled in") + except ValueError: + errors.append( + f"{key} should be numeric: got '{actual_val}'" + ) + else: + # For text fields, compare normalized text + if expected_val != actual_val: + errors.append( + f"{key} mismatch: got '{actual_val}', expected 
'{expected_val}'" + ) if errors: print( @@ -292,9 +261,6 @@ async def verify() -> bool: print( "- Submission 'Statistical Analysis: NBA Content Engagement on This Forum' found" ) - print( - f"- Total NBA-related posts analyzed: {extracted_data['Total_NBA_Posts']}" - ) print("- Top 5 posts identified and documented") print( f"- BCLetsRide69's total posts: {extracted_data['BCLetsRide69_Total_Posts']}" diff --git a/tasks/playwright_webarena/standard/reddit/routine_tracker_forum/verify.py b/tasks/playwright_webarena/standard/reddit/routine_tracker_forum/verify.py index ea4da6eb..ffe0ebb8 100644 --- a/tasks/playwright_webarena/standard/reddit/routine_tracker_forum/verify.py +++ b/tasks/playwright_webarena/standard/reddit/routine_tracker_forum/verify.py @@ -11,6 +11,22 @@ BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + async def verify() -> bool: """ Verifies that the daily routine tracking setup has been completed correctly on the forum. 
@@ -77,7 +93,7 @@ async def verify() -> bool: # Check if the content exists in the page content_found = False article_content = await page.locator("article").text_content() - if article_content and expected_content in article_content: + if article_content and normalize_text(expected_content) in normalize_text(article_content): content_found = True if not content_found: diff --git a/tasks/playwright_webarena/standard/shopping/advanced_product_analysis/verify.py b/tasks/playwright_webarena/standard/shopping/advanced_product_analysis/verify.py index 1d9a94a8..7474a66e 100644 --- a/tasks/playwright_webarena/standard/shopping/advanced_product_analysis/verify.py +++ b/tasks/playwright_webarena/standard/shopping/advanced_product_analysis/verify.py @@ -12,7 +12,7 @@ def get_model_response(): Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -26,6 +26,7 @@ def get_model_response(): if ( message.get("role") == "assistant" and message.get("status") == "completed" + and message.get("type") == "message" ): content = message.get("content", []) for item in content: @@ -39,6 +40,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the xxx format from the agent's output. 
@@ -65,7 +82,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -83,7 +100,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -120,20 +137,13 @@ def compare_answers(model_answer, expected_answer): ) elif key == "CartTotal": - # For price fields, only support $XX.XX format - # Check if model value has correct format - if not model_value.startswith("$"): + # Compare amount only β€” strip $ and , so format variations don't fail a correct value + expected_clean = expected_value.replace("$", "").replace(",", "") + model_clean = model_value.replace("$", "").replace(",", "") + if expected_clean != model_clean: mismatches.append( - f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" + f"{key}: expected '{expected_value}', got '{model_value}'" ) - else: - # Normalize and compare values - expected_clean = expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - if expected_clean != model_clean: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) elif key == "ReviewCount": # Check review count matches @@ -143,8 +153,8 @@ def compare_answers(model_answer, expected_answer): ) elif key == "LatestReviewer": - # Check reviewer name (allow partial match for names) - if expected_value.lower() not in model_value.lower() and model_value.lower() not in expected_value.lower(): + # Case-insensitive exact match + if expected_value.lower() != model_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) diff --git 
a/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/description.md b/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/description.md index e9a8f9fa..5ed1894b 100644 --- a/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/description.md +++ b/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/description.md @@ -2,7 +2,7 @@ **Task Requirements:** -1. In Video Games category, count products with customer rating 70% or higher in the first 2 pages +1. In Video Games category, count products with customer rating 70% or higher in the first 2 pages (products without any rating do not count) 2. Sort products by price (ascending) and identify the cheapest product that has customer reviews diff --git a/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/label.txt b/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/label.txt index a1a48694..3aa3a989 100644 --- a/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/label.txt +++ b/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/label.txt @@ -1,4 +1,4 @@ -Products70Plus|7 +Products70Plus|6 CheapestReviewedSKU|B014HDAUAA CheapestReviewedPrice|$0.99 ComparisonCount|2 diff --git a/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/verify.py b/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/verify.py index 0bd2d9e6..a38c8605 100644 --- a/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/verify.py +++ b/tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/verify.py @@ -12,7 +12,7 @@ def get_model_response(): Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -40,6 +40,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. @@ -66,7 +82,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -84,7 +100,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -107,42 +123,30 @@ def compare_answers(model_answer, expected_answer): # Special handling for different types of values if key in ["CheapestReviewedPrice", "N64Subtotal"]: - # For price fields, only support $XX.XX format - # Check if model value has correct format - if not model_value.startswith("$"): + # Compare amount only β€” strip $ and , so format variations don't fail a correct value + expected_clean = expected_value.replace("$", "").replace(",", "") + model_clean = model_value.replace("$", "").replace(",", "") + if expected_clean != model_clean: mismatches.append( - f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" + f"{key}: expected '{expected_value}', got '{model_value}'" ) - else: - # Normalize and compare values - expected_clean = 
expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - if expected_clean != model_clean: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) - elif key == "CheckoutEmail": - # Email should match exactly (case-insensitive) + elif key in ["CheckoutEmail", "ShippingState"]: + # Case-insensitive exact match if model_value.lower() != expected_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) - elif key == "Products70Plus": - # For count fields, allow some flexibility (products might change) - # But still check if it's a reasonable number + elif key in ["Products70Plus", "ComparisonCount", "ShippingMethods"]: try: - model_count = int(model_value) - expected_count = int(expected_value) - # Allow up to 2 products difference (in case of dynamic content) - if abs(model_count - expected_count) > 2: + if int(model_value) != int(expected_value): mismatches.append( - f"{key}: expected around '{expected_value}', got '{model_value}'" + f"{key}: expected '{expected_value}', got '{model_value}'" ) except ValueError: mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" + f"{key} should be numeric: got '{model_value}'" ) else: diff --git a/tasks/playwright_webarena/standard/shopping/health_routine_optimization/verify.py b/tasks/playwright_webarena/standard/shopping/health_routine_optimization/verify.py index 18f68632..cd219cc9 100644 --- a/tasks/playwright_webarena/standard/shopping/health_routine_optimization/verify.py +++ b/tasks/playwright_webarena/standard/shopping/health_routine_optimization/verify.py @@ -13,7 +13,7 @@ def get_model_response(): Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -40,6 +40,22 @@ def get_model_response(): print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. @@ -66,7 +82,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -93,7 +109,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -115,20 +131,26 @@ def compare_answers(model_answer, expected_answer): # Special handling for different types of values if key in ["Battery1Price", "Battery2Price", "InitialSubtotal", "FinalSubtotal"]: - # For price fields, only support $XX.XX format - # Check if model value has correct format - if not model_value.startswith("$"): + # Compare amount only β€” strip $ and , so format variations don't fail a correct value + expected_clean = expected_value.replace("$", "").replace(",", "") + model_clean = model_value.replace("$", "").replace(",", "") + if expected_clean != model_clean: mismatches.append( - f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" + f"{key}: expected 
'{expected_value}', got '{model_value}'" ) - else: - # Normalize and compare values - expected_clean = expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - if expected_clean != model_clean: + + elif key in ["AdvancedSearchResults", "ComparisonCount", "TeaReviews", + "CartUniqueProducts", "CartTotalQuantity", "TeaRating"]: + # Strip % so "95" and "95%" both compare as 95 + try: + if int(model_value.replace("%", "")) != int(expected_value.replace("%", "")): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) + except ValueError: + mismatches.append( + f"{key} should be numeric: got '{model_value}'" + ) else: # Exact match for other fields diff --git a/tasks/playwright_webarena/standard/shopping/holiday_baking_competition/description.md b/tasks/playwright_webarena/standard/shopping/holiday_baking_competition/description.md index b03d3fe0..c1a0bb48 100644 --- a/tasks/playwright_webarena/standard/shopping/holiday_baking_competition/description.md +++ b/tasks/playwright_webarena/standard/shopping/holiday_baking_competition/description.md @@ -20,7 +20,7 @@ 4. In cart: - Update cookie quantity from 2 to 5 - - Record cart subtotal and total items count + - Record cart subtotal and total items count (sum of all product quantities, not the number of distinct products) 5. Search 'gingerbread', go to page 2: - Find third product on page 2 diff --git a/tasks/playwright_webarena/standard/shopping/holiday_baking_competition/verify.py b/tasks/playwright_webarena/standard/shopping/holiday_baking_competition/verify.py index e2620a29..7b6b4d47 100644 --- a/tasks/playwright_webarena/standard/shopping/holiday_baking_competition/verify.py +++ b/tasks/playwright_webarena/standard/shopping/holiday_baking_competition/verify.py @@ -12,7 +12,7 @@ def get_model_response(): Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -40,6 +40,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. @@ -66,7 +82,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -84,7 +100,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -114,35 +130,29 @@ def compare_answers(model_answer, expected_answer): ) elif key in ["CartSubtotalAfterUpdate"]: - # For price fields, only support $XX.XX format - # Check if model value has correct format - if not model_value.startswith("$"): - mismatches.append( - f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" - ) - else: - # Normalize and compare values - expected_clean = expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - # Allow some tolerance for price calculations (within $0.01) - try: - expected_float = float(expected_clean) - model_float = float(model_clean) - if abs(expected_float - model_float) > 0.01: - mismatches.append( - f"{key}: expected '{expected_value}', got 
'{model_value}'" - ) - except ValueError: - if expected_value != model_value: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + # Strip $ and , then compare as floats (exact equality β€” no tolerance) + expected_clean = expected_value.replace("$", "").replace(",", "") + model_clean = model_value.replace("$", "").replace(",", "") + try: + if float(expected_clean) != float(model_clean): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: + if expected_value != model_value: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) elif key in ["TotalCartItems"]: - # Should be a number - if model_value != expected_value: + try: + if int(model_value) != int(expected_value): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" + f"{key} should be numeric: got '{model_value}'" ) elif key in ["HighestRatedCookieSKURating", "CheapestChocolatePriceReviews", "Page2ThirdProductSKUPrice"]: @@ -153,49 +163,49 @@ def compare_answers(model_answer, expected_answer): if len(expected_parts) == 2 and len(model_parts) == 2: # For price fields, normalize the price part if key == "CheapestChocolatePriceReviews": - # Check if price part has correct format ($XX.XX) - if not model_parts[0].startswith("$"): - mismatches.append( - f"{key}: incorrect format - price part should start with '$', got '{model_value}'" - ) - else: - expected_price = expected_parts[0].replace("$", "").replace(",", "") - model_price = model_parts[0].replace("$", "").replace(",", "") - try: - if abs(float(expected_price) - float(model_price)) > 0.01 or expected_parts[1] != model_parts[1]: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) - except ValueError: - if expected_value != model_value: - mismatches.append( - f"{key}: expected 
'{expected_value}', got '{model_value}'" - ) + # Price as float (exact, no tolerance), reviews as int + expected_price = expected_parts[0].replace("$", "").replace(",", "") + model_price = model_parts[0].replace("$", "").replace(",", "") + try: + if (float(expected_price) != float(model_price) + or int(expected_parts[1]) != int(model_parts[1])): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: + if expected_value != model_value: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) elif key == "Page2ThirdProductSKUPrice": - # Check if price part has correct format ($XX.XX) - if not model_parts[1].startswith("$"): - mismatches.append( - f"{key}: incorrect format - price part should start with '$', got '{model_value}'" - ) - else: - expected_price = expected_parts[1].replace("$", "").replace(",", "") - model_price = model_parts[1].replace("$", "").replace(",", "") - try: - if expected_parts[0] != model_parts[0] or abs(float(expected_price) - float(model_price)) > 0.01: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) - except ValueError: - if expected_value != model_value: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + # SKU exact (case-insensitive), price as float (exact, no tolerance) + expected_price = expected_parts[1].replace("$", "").replace(",", "") + model_price = model_parts[1].replace("$", "").replace(",", "") + try: + if (expected_parts[0].upper() != model_parts[0].upper() + or float(expected_price) != float(model_price)): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: + if expected_value != model_value: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) else: - # For rating fields, exact match - if expected_value != model_value: - mismatches.append( - f"{key}: expected '{expected_value}', got 
'{model_value}'" - ) + # HighestRatedCookieSKURating: SKU exact (case-insensitive) + rating as int (strip %) + try: + if (expected_parts[0].upper() != model_parts[0].upper() + or int(expected_parts[1].replace("%", "")) + != int(model_parts[1].replace("%", ""))): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: + if expected_value != model_value: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) else: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" diff --git a/tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/description.md b/tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/description.md index 465d62be..25864e27 100644 --- a/tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/description.md +++ b/tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/description.md @@ -1,4 +1,4 @@ -Complete the following tasks on One Stop Market (http://localhost:7770): +Complete the following tasks on One Stop Market: **Task Requirements:** @@ -6,8 +6,8 @@ Complete the following tasks on One Stop Market (http://localhost:7770): - Record price and SKU of first 3 products 2. Search for 'tabletop' with price range $100.00-$200.00: - - Find the cheapest tabletop that has the highest review rating with at least 3 reviews. - Record search results count + - Among tabletops with at least 3 reviews, find the one with the highest rating % (if tied, choose the cheapest) - Record price of required tabletop 3. In "Computers & Accessories" subcategory with price filter $0.00-$9,999.99: @@ -27,7 +27,7 @@ Complete the following tasks on One Stop Market (http://localhost:7770): 6. 
Calculate: - Sum of 3 chocolate product prices - - Price difference: cheapest tabletop minus cheapest computer accessory + - Price difference: selected tabletop (from step 2) minus cheapest computer accessory (from step 3) - Whether sum of 3 comparison items < $60 **Output Format:** diff --git a/tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/verify.py b/tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/verify.py index 273ad9c4..827e1bd8 100644 --- a/tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/verify.py +++ b/tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/verify.py @@ -12,7 +12,7 @@ def get_model_response(): Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -40,6 +40,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. 
@@ -66,7 +82,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -84,7 +100,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -120,11 +136,16 @@ def compare_answers(model_answer, expected_answer): if len(exp_parts) != 2 or len(mod_parts) != 2: mismatches.append(f"{key}: product {i+1} format error - expected 'price:SKU'") else: - # Check price format (should start with $) - if not mod_parts[0].startswith("$"): - mismatches.append(f"{key}: product {i+1} price format error - expected '$XX.XX' format, got '{mod_parts[0]}'") - elif exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]: - mismatches.append(f"{key}: product {i+1} mismatch - expected '{exp}', got '{mod}'") + # Price as float (exact, no tolerance), SKU case-insensitive + exp_price = exp_parts[0].replace("$", "").replace(",", "") + mod_price = mod_parts[0].replace("$", "").replace(",", "") + try: + if (float(exp_price) != float(mod_price) + or exp_parts[1].upper() != mod_parts[1].upper()): + mismatches.append(f"{key}: product {i+1} mismatch - expected '{exp}', got '{mod}'") + except ValueError: + if exp != mod: + mismatches.append(f"{key}: product {i+1} mismatch - expected '{exp}', got '{mod}'") elif key == "tabletop_product": # Parse and compare tabletop product with price:SKU format @@ -133,11 +154,16 @@ def compare_answers(model_answer, expected_answer): if len(exp_parts) != 2 or len(mod_parts) != 2: mismatches.append(f"{key}: format error - expected 'price:SKU', got '{model_value}'") else: - # Check price format (should start with $) - if not mod_parts[0].startswith("$"): - mismatches.append(f"{key}: price format error - expected '$XX.XX' format, 
got '{mod_parts[0]}'") - elif exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]: - mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'") + # Price as float (exact, no tolerance), SKU case-insensitive + exp_price = exp_parts[0].replace("$", "").replace(",", "") + mod_price = mod_parts[0].replace("$", "").replace(",", "") + try: + if (float(exp_price) != float(mod_price) + or exp_parts[1].upper() != mod_parts[1].upper()): + mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'") + except ValueError: + if expected_value != model_value: + mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'") elif key == "tabletop_reviews": # Parse and compare tabletop reviews with NumberOfReviews:Rating format @@ -146,22 +172,26 @@ def compare_answers(model_answer, expected_answer): if len(exp_parts) != 2 or len(mod_parts) != 2: mismatches.append(f"{key}: format error - expected 'NumberOfReviews:Rating', got '{model_value}'") else: - # Check if both parts match - if exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]: - mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'") + # Reviews as int, rating as int (strip %) + try: + if (int(exp_parts[0]) != int(mod_parts[0]) + or int(exp_parts[1].replace("%", "")) != int(mod_parts[1].replace("%", ""))): + mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'") + except ValueError: + if expected_value != model_value: + mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'") elif key in ["chocolate_sum", "price_difference", "cart_subtotal", "cheapest_computer_accessory"]: - # For price fields, only support $XX.XX format - # Check if model value has correct format - if not model_value.startswith("$"): - mismatches.append( - f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" - ) - else: - # Normalize and 
compare values - expected_clean = expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - if expected_clean != model_clean: + # Strip $ and , then compare as floats (exact equality β€” no tolerance) + expected_clean = expected_value.replace("$", "").replace(",", "") + model_clean = model_value.replace("$", "").replace(",", "") + try: + if float(expected_clean) != float(model_clean): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: + if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) @@ -172,10 +202,14 @@ def compare_answers(model_answer, expected_answer): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key in ["tabletop_search_count", "comparison_count", "cart_item_count"]: - # Numeric fields - exact match - if model_value != expected_value: + try: + if int(model_value) != int(expected_value): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" + f"{key} should be numeric: got '{model_value}'" ) else: diff --git a/tasks/playwright_webarena/standard/shopping/printer_keyboard_search/description.md b/tasks/playwright_webarena/standard/shopping/printer_keyboard_search/description.md index 269487b8..68455699 100644 --- a/tasks/playwright_webarena/standard/shopping/printer_keyboard_search/description.md +++ b/tasks/playwright_webarena/standard/shopping/printer_keyboard_search/description.md @@ -1,12 +1,13 @@ 1. Search for a `printer capable of reducing blue light` that: - - Is pink or purple (must be stated in product details, not from image) + - Price between $240.00-$260.00 + - Is pink or purple - Manufactured in Asia Record SKU ID and price 2. 
Find a keyboard with: - - Bluetooth mode (must be stated either stated in details or title) + - Bluetooth mode - Price between $50.00-$100.00 - Highest review rating among matching products Record SKU ID, price, number of reviews, and review rating diff --git a/tasks/playwright_webarena/standard/shopping/printer_keyboard_search/verify.py b/tasks/playwright_webarena/standard/shopping/printer_keyboard_search/verify.py index 826e14cb..50f44c01 100644 --- a/tasks/playwright_webarena/standard/shopping/printer_keyboard_search/verify.py +++ b/tasks/playwright_webarena/standard/shopping/printer_keyboard_search/verify.py @@ -12,7 +12,7 @@ def get_model_response(): Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -40,6 +40,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. 
@@ -66,7 +82,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -84,7 +100,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -107,17 +123,16 @@ def compare_answers(model_answer, expected_answer): # Special handling for different types of values if key in ["PrinterPrice", "KeyboardPrice"]: - # For price fields, only support $XX.XX format - # Check if model value has correct format - if not model_value.startswith("$"): - mismatches.append( - f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" - ) - else: - # Normalize and compare values - expected_clean = expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - if expected_clean != model_clean: + # Strip $ and , then compare as floats (exact equality β€” no tolerance) + expected_clean = expected_value.replace("$", "").replace(",", "") + model_clean = model_value.replace("$", "").replace(",", "") + try: + if float(expected_clean) != float(model_clean): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: + if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) @@ -130,17 +145,26 @@ def compare_answers(model_answer, expected_answer): ) elif key == "KeyboardReviews": - # Number of reviews should match exactly - if model_value != expected_value: + try: + if int(model_value) != int(expected_value): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" 
+ f"{key} should be numeric: got '{model_value}'" ) elif key == "KeyboardRating": - # Rating should match exactly (including % sign) - if model_value != expected_value: + # Strip % and compare as int + try: + if int(model_value.replace("%", "")) != int(expected_value.replace("%", "")): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" + f"{key} should be numeric: got '{model_value}'" ) else: diff --git a/tasks/playwright_webarena/standard/shopping/running_shoes_purchase/verify.py b/tasks/playwright_webarena/standard/shopping/running_shoes_purchase/verify.py index fa540ed7..8b1ef066 100644 --- a/tasks/playwright_webarena/standard/shopping/running_shoes_purchase/verify.py +++ b/tasks/playwright_webarena/standard/shopping/running_shoes_purchase/verify.py @@ -12,7 +12,7 @@ def get_model_response(): Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -40,6 +40,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. 
@@ -66,7 +82,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -84,7 +100,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -107,30 +123,19 @@ def compare_answers(model_answer, expected_answer): # Special handling for different types of values if key in ["Price", "Subtotal"]: - # For price fields, only support $XX.XX format - # Check if model value has correct format - if not model_value.startswith("$"): - mismatches.append( - f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" - ) - else: - # Normalize and compare values - expected_clean = expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - - # Allow small tolerance for price calculations (within $0.01) - try: - expected_float = float(expected_clean) - model_float = float(model_clean) - if abs(expected_float - model_float) > 0.01: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) - except ValueError: - if expected_clean != model_clean: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + # Strip $ and , then compare as floats (exact equality β€” no tolerance) + expected_clean = expected_value.replace("$", "").replace(",", "") + model_clean = model_value.replace("$", "").replace(",", "") + try: + if float(expected_clean) != float(model_clean): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: + if expected_value != model_value: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) elif key == "SKUID": # SKU should match 
exactly (case-insensitive) @@ -140,17 +145,26 @@ def compare_answers(model_answer, expected_answer): ) elif key == "NumberOfReviews": - # Number of reviews should match exactly - if model_value != expected_value: + try: + if int(model_value) != int(expected_value): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" + f"{key} should be numeric: got '{model_value}'" ) elif key == "ReviewRating": - # Rating should match exactly (including % sign) - if model_value != expected_value: + # Strip % and compare as int + try: + if int(model_value.replace("%", "")) != int(expected_value.replace("%", "")): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" + f"{key} should be numeric: got '{model_value}'" ) else: diff --git a/tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/verify.py b/tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/verify.py index dd6897b8..453378a9 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/verify.py +++ b/tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/verify.py @@ -19,7 +19,7 @@ def get_model_response(): Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -33,6 +33,7 @@ def get_model_response(): if ( message.get("role") == "assistant" and message.get("status") == "completed" + and message.get("type") == "message" ): content = message.get("content", []) for item in content: @@ -46,6 +47,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. @@ -72,7 +89,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -90,7 +107,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -111,11 +128,28 @@ def compare_answers(model_answer, expected_answer): for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") - # Exact match for all fields - if model_value != expected_value: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + if key in ["InitialGroups", "FinalGroups", "InitialCustomers", "FinalCustomers"]: + try: + if int(model_value) != int(expected_value): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except 
ValueError: + mismatches.append( + f"{key} should be numeric: got '{model_value}'" + ) + elif key == "LastOrderCustomer": + # Case-insensitive exact match + if model_value.lower() != expected_value.lower(): + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + else: + # Exact match for other fields + if model_value != expected_value: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) @@ -165,12 +199,10 @@ async def verify() -> bool: "Warning: Could not parse answer format from model response", file=sys.stderr, ) - print("Will proceed with browser verification only", file=sys.stderr) + return False else: - print( - "No model response found, proceeding with browser verification", - file=sys.stderr, - ) + print("No model response found", file=sys.stderr) + return False # Browser verification for actual state print("\n=== Starting Browser Verification ===", file=sys.stderr) @@ -227,27 +259,20 @@ async def verify() -> bool: ) else: print( - f"Warning: Premium Europe tax class is '{tax_class_text}'", + f"βœ— Premium Europe tax class is '{tax_class_text}', expected 'Retail Customer'", file=sys.stderr, ) + return False + else: + print( + "βœ— Could not locate 'Premium Europe' row to verify tax class", + file=sys.stderr, + ) + return False else: print("βœ— 'Premium Europe' customer group not found", file=sys.stderr) return False - # Check total groups count - records_found = page.locator("text=records found").first - if await records_found.count() > 0: - count_text = await records_found.inner_text() - print(f"Customer Groups count: {count_text}", file=sys.stderr) - - # Extract number - import re - - match = re.search(r"(\d+)\s+records found", count_text) - if match: - groups_count = int(match.group(1)) - print(f"βœ“ Customer groups count is {groups_count}", file=sys.stderr) - # 2. 
Verify Customer print("\nVerifying Customer Isabella Romano...", file=sys.stderr) await page.goto( @@ -256,35 +281,6 @@ async def verify() -> bool: ) await page.wait_for_timeout(3000) # Wait for grid to load - # Check total customers count - customer_records = page.locator("text=records found").first - if await customer_records.count() > 0: - count_text = await customer_records.inner_text() - print(f"Customers count: {count_text}", file=sys.stderr) - - # Extract number - match = re.search(r"(\d+)\s+records found", count_text) - if match: - customers_count = int(match.group(1)) - print( - f"βœ“ Total customers count is {customers_count}", file=sys.stderr - ) - - # Verify against expected answer if available - if expected_answer and "FinalCustomers" in expected_answer: - expected_final = int(expected_answer["FinalCustomers"]) - if customers_count == expected_final: - print( - f"βœ“ Customer count matches expected: {customers_count}", - file=sys.stderr, - ) - else: - print( - f"βœ— Customer count mismatch: Expected {expected_final} customers, found {customers_count}", - file=sys.stderr, - ) - return False - # Wait for the customer grid to load properly await page.wait_for_timeout(5000) @@ -325,17 +321,9 @@ async def verify() -> bool: isabella_exists = ( await page.locator("text=isabella.romano@premium.eu").count() > 0 ) - - # Also check for "No records found" message - no_records = await page.locator("text=We couldn't find any records., text=No records found").count() > 0 - if no_records: - print( - "βœ— Customer 'isabella.romano@premium.eu' not found - search returned no results", - file=sys.stderr, - ) - return False except Exception as e: print(f"βœ— Search failed: {str(e)}", file=sys.stderr) + return False if isabella_exists: print( @@ -349,58 +337,6 @@ async def verify() -> bool: ) return False - # 3. 
Verify Dashboard Last Orders - print("\nVerifying Dashboard Last Orders...", file=sys.stderr) - await page.goto( - f"{BASE_URL}/admin/dashboard/", - wait_until="networkidle", - ) - await page.wait_for_timeout(2000) - - # Check for Last Orders section - last_orders_exists = await page.locator("text=Last Orders").count() > 0 - if last_orders_exists: - print("βœ“ Found 'Last Orders' section on dashboard", file=sys.stderr) - - # Find the first customer in the table - # Look for the table after "Last Orders" heading - orders_table = ( - page.locator("text=Last Orders") - .locator("..") - .locator("table") - .first - ) - if await orders_table.count() > 0: - # Get the last row in tbody - last_row = orders_table.locator("tbody tr").last - if await last_row.count() > 0: - last_customer = await last_row.locator( - "td" - ).first.inner_text() - print( - f"βœ“ Last customer in Last Orders: {last_customer}", - file=sys.stderr, - ) - - # Verify against expected answer if available - if expected_answer and "LastOrderCustomer" in expected_answer: - if last_customer == expected_answer["LastOrderCustomer"]: - print( - f"βœ“ Last Order Customer matches expected: {last_customer}", - file=sys.stderr, - ) - else: - print( - f"βœ— Last Order Customer mismatch: Expected '{expected_answer['LastOrderCustomer']}' but actual is '{last_customer}'", - file=sys.stderr, - ) - return False - else: - print( - "Warning: 'Last Orders' section not found on dashboard", - file=sys.stderr, - ) - # Summary of verification - only print if we reach this point (all checks passed) print("\n=== Browser Verification Summary ===", file=sys.stderr) print("βœ“ Magento Admin login successful", file=sys.stderr) @@ -409,8 +345,6 @@ async def verify() -> bool: file=sys.stderr, ) print("βœ“ Customer 'isabella.romano@premium.eu' found in system", file=sys.stderr) - print("βœ“ Customer counts verified", file=sys.stderr) - print("βœ“ Dashboard Last Orders section accessible", file=sys.stderr) return True diff --git 
a/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/description.md b/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/description.md index 4a371231..5b2a148b 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/description.md +++ b/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/description.md @@ -5,19 +5,17 @@ Our marketing team is planning a new promotion for our bestselling fitness produ 1. If need to login, login with username 'admin' and password 'admin1234' 2. Start by checking our current bestsellers: - - Identify the top 3 bestselling products based on their Price and Quantity - record their names, prices, and quantities sold - - Note the total Revenue amount displayed - - Check if any of these bestsellers appear in the Top Search Terms table - if yes, record the search term and its usage count, else output 'No:0' + - Identify the top 3 bestselling products based on their Price and Quantity - record their names, prices, and quantities sold + - Check if any of these bestsellers appear in the Top Search Terms table - if yes, record the search term and its usage count, else output 'None:0' 3. Investigate these bestselling products in detail: - - For each of the top 3 bestsellers identified, search for them by name and record: + - For each of the top 3 bestsellers, open the product page linked from the Bestsellers row and record: - Their SKU - - Current inventory quantity + - Salable Quantity - Whether they are 'Enabled' or 'Disabled' -4. Check if we have existing promotions for these products: - - Look for any active rules that might apply to fitness/yoga products - - Find if there's a rule offering percentage discount - record the rule name and discount percentage +4. 
Check existing Cart Price Rules (Marketing → Promotions → Cart Price Rules): + - Find the rule that offers a percentage discount on the entire order (not tied to a specific product) - record the rule name and discount percentage - Count total number of active rules 5. Analyze customer purchasing patterns: @@ -27,16 +25,15 @@ Our marketing team is planning a new promotion for our bestselling fitness produ 6. Review our top customers who might be interested: - Find the customer who appears in the Last Orders section of the dashboard with the highest total - Look up this customer in the All Customers list and record his email and customer group - - Count how many other customers are in the same group + - Count how many customers are in the same group 7. Compile your findings and output them in the following exact format: ``` -Bestseller1|name:price:quantity:sku:inventory:status -Bestseller2|name:price:quantity:sku:inventory:status -Bestseller3|name:price:quantity:sku:inventory:status -TotalRevenue|amount +Bestseller1|name:price:quantity:sku:salable_quantity:status +Bestseller2|name:price:quantity:sku:salable_quantity:status +Bestseller3|name:price:quantity:sku:salable_quantity:status BestsellerInSearch|term:count PercentageDiscountRule|name:percentage ActiveRulesCount|count @@ -53,13 +50,12 @@ SameGroupCustomers|count Bestseller1|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled Bestseller2|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled Bestseller3|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled -TotalRevenue|$XX.XX BestsellerInSearch|Term:X or None:0 PercentageDiscountRule|Rule Name:XX% ActiveRulesCount|X TotalOrders|X -MostRecentOrderID|X or None -TopCustomer|Customer Name:email@example.com:Group Name +MostRecentOrderID|X +TopCustomer|Customer Name::Group Name SameGroupCustomers|X ``` diff --git a/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/label.txt
b/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/label.txt index 5fd088d2..dd4cfaf7 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/label.txt +++ b/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/label.txt @@ -1,11 +1,10 @@ -Bestseller1|Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:100:Enabled -Bestseller2|Quest Lumaflex™ Band:$19.00:6:24-UG01:100:Enabled -Bestseller3|Sprite Yoga Strap 6 foot:$14.00:6:24-WG085:100:Enabled -TotalRevenue|$0.00 -BestsellerInSearch|No:0 +Bestseller1|Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:93:Enabled +Bestseller2|Quest Lumaflex™ Band:$19.00:6:24-UG01:93:Enabled +Bestseller3|Sprite Yoga Strap 6 foot:$14.00:6:24-WG085:88:Enabled +BestsellerInSearch|None:0 PercentageDiscountRule|20% OFF Ever $200-plus purchase!*:20% ActiveRulesCount|4 TotalOrders|308 MostRecentOrderID|000000299 -TopCustomer|Sarah Miller:sarah.miller@example.com:General +TopCustomer|Sarah Miller:helloworld@yahoo.com:General SameGroupCustomers|70 \ No newline at end of file diff --git a/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/verify.py b/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/verify.py index e4835e4b..ea526d85 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/verify.py +++ b/tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/verify.py @@ -11,7 +11,7 @@ def get_model_response(): Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -22,7 +22,11 @@ def get_model_response(): # Find the last assistant message for message in reversed(messages): - if message.get('role') == 'assistant' and message.get('status') == 'completed': + if ( + message.get('role') == 'assistant' + and message.get('status') == 'completed' + and message.get('type') == 'message' + ): content = message.get('content', []) for item in content: if item.get('type') == 'output_text': @@ -34,6 +38,22 @@ def get_model_response(): print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. @@ -41,28 +61,28 @@ def parse_answer_format(text): """ if not text: return None - + # Look for ... 
pattern match = re.search(r'(.*?)', text, re.IGNORECASE | re.DOTALL) if not match: return None - + answer_content = match.group(1).strip() - + # Parse each line result = {} lines = answer_content.split('\n') - + # Skip the check for exact number of lines - just parse what we have # if len(lines) != 13: # print(f"Error: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr) # return None - + for line in lines: if '|' in line: key, value = line.split('|', 1) - result[key.strip()] = value.strip() - + result[key.strip()] = normalize_text(value.strip()) + return result def load_expected_answer(label_path): @@ -73,18 +93,43 @@ def load_expected_answer(label_path): try: with open(label_path, 'r') as f: lines = f.read().strip().split('\n') - + expected = {} for line in lines: if '|' in line: key, value = line.split('|', 1) - expected[key.strip()] = value.strip() - + expected[key.strip()] = normalize_text(value.strip()) + return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None +def _normalize_bestseller(value): + """ + Parse a Bestseller line "name:price:quantity:sku:salable_quantity:status" + into a normalized tuple so the three lines can be compared as a set + (order-independent). Returns None if the value is malformed. + """ + if ':' not in value: + return None + parts = value.split(':') + if len(parts) != 6: + return None + name, price, qty, sku, salable, status = parts + try: + return ( + name.replace('™', 'β„’').strip().lower(), + float(price.replace('$', '').replace(',', '').strip()), + int(qty.strip()), + sku.strip().lower(), + float(salable.replace(',', '').strip()), + status.strip().lower(), + ) + except ValueError: + return None + + def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. 
@@ -92,63 +137,38 @@ def compare_answers(model_answer, expected_answer): """ if not model_answer or not expected_answer: return False - - # Check each expected key + + bestseller_keys = ['Bestseller1', 'Bestseller2', 'Bestseller3'] mismatches = [] + + # Bestseller1/2/3 are compared as a set β€” model may list the three lines + # in any order + expected_bs_raw = [expected_answer.get(k, '') for k in bestseller_keys if k in expected_answer] + if expected_bs_raw: + model_bs_raw = [model_answer.get(k, '') for k in bestseller_keys if k in model_answer] + expected_bs = [_normalize_bestseller(v) for v in expected_bs_raw] + model_bs = [_normalize_bestseller(v) for v in model_bs_raw] + if None in expected_bs: + mismatches.append(f"Bestseller (label): malformed line in label.txt: {expected_bs_raw}") + elif None in model_bs: + bad = [r for r, n in zip(model_bs_raw, model_bs) if n is None] + mismatches.append(f"Bestseller: malformed or non-numeric line(s): {bad}") + else: + expected_set = set(expected_bs) + model_set = set(model_bs) + missing = expected_set - model_set + extra = model_set - expected_set + if missing or extra: + mismatches.append( + f"Bestseller set mismatch β€” missing: {sorted(missing)}; extra: {sorted(extra)}" + ) + for key, expected_value in expected_answer.items(): + if key in bestseller_keys: + continue # already handled above model_value = model_answer.get(key, '') - - # Special handling for different types of values - if key in ['Bestseller1', 'Bestseller2', 'Bestseller3']: - # Check if all parts match (name:price:quantity:sku:inventory:status) - if ':' in expected_value and ':' in model_value: - expected_parts = expected_value.split(':') - model_parts = model_value.split(':') - if len(expected_parts) == 6 and len(model_parts) == 6: - # Compare each part - for i, (exp, mod) in enumerate(zip(expected_parts, model_parts)): - if i == 1: # Price field - exp_clean = exp.replace('$', '').replace(',', '') - mod_clean = mod.replace('$', '').replace(',', '') - if 
exp_clean != mod_clean: - mismatches.append(f"{key} price: expected '{exp}', got '{mod}'") - elif i == 4: # Inventory field (may have decimal places) - exp_float = float(exp.replace(',', '')) - mod_float = float(mod.replace(',', '')) - if abs(exp_float - mod_float) > 0.0001: - mismatches.append(f"{key} inventory: expected '{exp}', got '{mod}'") - else: - if exp.lower() != mod.lower(): - mismatches.append(f"{key} part {i}: expected '{exp}', got '{mod}'") - else: - mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'") - else: - if expected_value != model_value: - mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - - elif key == 'LowestInventoryProduct': - # Check product name and inventory - if ':' in expected_value and ':' in model_value: - expected_name, expected_inv = expected_value.rsplit(':', 1) - model_name, model_inv = model_value.rsplit(':', 1) - if expected_name.lower() != model_name.lower(): - mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'") - exp_float = float(expected_inv.replace(',', '')) - mod_float = float(model_inv.replace(',', '')) - if abs(exp_float - mod_float) > 0.0001: - mismatches.append(f"{key} inventory: expected '{expected_inv}', got '{model_inv}'") - else: - if expected_value != model_value: - mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - - elif key in ['TotalRevenue', 'MinimumPurchaseRule']: - # For price/amount fields, normalize format - expected_clean = expected_value.replace('$', '').replace(',', '') - model_clean = model_value.replace('$', '').replace(',', '') - if expected_clean != model_clean: - mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - - elif key == 'BestsellerInSearch': + + if key == 'BestsellerInSearch': # Check search term and count if expected_value.lower() != model_value.lower(): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") 
@@ -158,13 +178,16 @@ def compare_answers(model_answer, expected_answer): if ':' in expected_value and ':' in model_value: expected_name, expected_pct = expected_value.rsplit(':', 1) model_name, model_pct = model_value.rsplit(':', 1) - if expected_name != model_name: + if expected_name.lower() != model_name.lower(): mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'") - # Normalize percentage (20% vs 20 vs 0.20) + # Normalize percentage (20 vs 20% vs 20.0) exp_pct_clean = expected_pct.replace('%', '').strip() mod_pct_clean = model_pct.replace('%', '').strip() - if exp_pct_clean != mod_pct_clean: - mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'") + try: + if float(exp_pct_clean) != float(mod_pct_clean): + mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'") + except ValueError: + mismatches.append(f"{key} percentage should be numeric: got '{model_pct}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") @@ -177,7 +200,7 @@ def compare_answers(model_answer, expected_answer): if len(expected_parts) == 3 and len(model_parts) == 3: exp_name, exp_email, exp_group = expected_parts mod_name, mod_email, mod_group = model_parts - if exp_name != mod_name: + if exp_name.lower() != mod_name.lower(): mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'") if exp_email.lower() != mod_email.lower(): mismatches.append(f"{key} email: expected '{exp_email}', got '{mod_email}'") @@ -189,16 +212,17 @@ def compare_answers(model_answer, expected_answer): if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - elif key == 'MostRecentOrderDate': - # Date format may vary, do flexible comparison - if expected_value.lower() == 'none' and model_value.lower() == 'none': - continue - elif expected_value != model_value: - # Could add more flexible date parsing 
here if needed - mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - + elif key in ['ActiveRulesCount', 'TotalOrders', 'SameGroupCustomers']: + # Numeric counts: compare as int so "04" vs "4" doesn't fail + try: + if int(model_value) != int(expected_value): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") + else: - # Exact match for other fields (counts, etc.) + # Exact string match for IDs and other text fields (e.g. MostRecentOrderID + # has leading zeros like "000000299" that must be preserved) if str(model_value) != str(expected_value): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") diff --git a/tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/description.md b/tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/description.md index 513914f0..4b7b9093 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/description.md +++ b/tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/description.md @@ -44,7 +44,7 @@ Perform a comprehensive marketing and customer analysis workflow in the Magento 6. Finally, let's review overall business performance metrics from the main dashboard: Go to Dashboard and identify: - - The names and sales quantities of the products that are both the best-selling and most expensive + - Among the products tied for the highest sales quantity in the Bestsellers table, identify the one with the highest price - record its name and sales quantity - The total revenue displayed on the dashboard 7. 
Compile all your findings and must output them in the following exact format at last: diff --git a/tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/verify.py b/tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/verify.py index 940ba995..3a95f38f 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/verify.py +++ b/tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/verify.py @@ -19,7 +19,7 @@ def get_model_response(): Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -33,6 +33,7 @@ def get_model_response(): if ( message.get("role") == "assistant" and message.get("status") == "completed" + and message.get("type") == "message" ): content = message.get("content", []) for item in content: @@ -46,6 +47,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the new multi-line xxx format from the agent's output. 
@@ -72,7 +89,7 @@ def parse_answer_format(text): for line in lines: if "|" in line: key, value = line.split("|", 1) - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) return result @@ -90,7 +107,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -111,45 +128,124 @@ def compare_answers(model_answer, expected_answer): for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") - # Special handling for different types of values if key == "Top2SearchTerms": - # Check if both search terms are present with correct counts - expected_terms = expected_value.split(",") - model_terms = model_value.split(",") - if set(expected_terms) != set(model_terms): + # Two "term:count" entries separated by ',' β€” order-independent; + # term case-insensitive, count compared as int + expected_terms = set() + for item in expected_value.split(','): + item = item.strip() + term, count = item.rsplit(':', 1) + expected_terms.add((term.strip().lower(), int(count.strip()))) + model_terms = set() + for item in model_value.split(','): + item = item.strip() + if ':' not in item: + mismatches.append(f"{key}: malformed entry '{item}'") + continue + term, count = item.rsplit(':', 1) + try: + model_terms.add((term.strip().lower(), int(count.strip()))) + except ValueError: + mismatches.append(f"{key}: non-numeric count in '{item}'") + if expected_terms != model_terms: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) + elif key == "ZeroResultTerm": + # Single "term:count" β€” term case-insensitive, count as int + exp_term, exp_count = expected_value.rsplit(':', 1) + expected_pair = (exp_term.strip().lower(), int(exp_count.strip())) + if ':' not in model_value: + mismatches.append(f"{key}: malformed value 
'{model_value}'") + else: + mod_term, mod_count = model_value.rsplit(':', 1) + try: + model_pair = (mod_term.strip().lower(), int(mod_count.strip())) + if expected_pair != model_pair: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + except ValueError: + mismatches.append(f"{key}: non-numeric count in '{model_value}'") + elif key == "EmailVerification": - # Check email verification status - expected_emails = dict( - item.split(":") for item in expected_value.split(",") - ) - model_emails = dict( - item.split(":") for item in model_value.split(",") if ":" in item - ) + # "email:yes/no" entries separated by ',' β€” email & status case-insensitive + expected_emails = {} + for item in expected_value.split(','): + email, status = item.rsplit(':', 1) + expected_emails[email.strip().lower()] = status.strip().lower() + model_emails = {} + for item in model_value.split(','): + item = item.strip() + if ':' not in item: + mismatches.append(f"{key}: malformed entry '{item}'") + continue + email, status = item.rsplit(':', 1) + model_emails[email.strip().lower()] = status.strip().lower() if expected_emails != model_emails: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "CouponCodes": - # Check if coupon code and rule name are present - if "H20" not in model_value or "Luma water bottle" not in model_value: + # "code:rule_name" entries separated by ',' β€” code case-sensitive + # (coupon codes are typically uppercase tokens), rule name case-insensitive + expected_coupons = set() + for item in expected_value.split(','): + code, rule = item.split(':', 1) + expected_coupons.add((code.strip(), rule.strip().lower())) + model_coupons = set() + for item in model_value.split(','): + item = item.strip() + if ':' not in item: + mismatches.append(f"{key}: malformed entry '{item}'") + continue + code, rule = item.split(':', 1) + model_coupons.add((code.strip(), rule.strip().lower())) + if expected_coupons != 
model_coupons: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "TopProduct": - # Check if product name and quantity match - if expected_value != model_value: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + # "name:quantity" β€” name case-insensitive, qty as int + if ':' in expected_value and ':' in model_value: + exp_name, exp_qty = expected_value.rsplit(':', 1) + mod_name, mod_qty = model_value.rsplit(':', 1) + if exp_name.strip().lower() != mod_name.strip().lower(): + mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'") + try: + if int(exp_qty.strip()) != int(mod_qty.strip()): + mismatches.append(f"{key} quantity: expected '{exp_qty}', got '{mod_qty}'") + except ValueError: + mismatches.append(f"{key} quantity should be numeric: got '{mod_qty}'") + else: + if expected_value != model_value: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + + elif key in ("TotalSearchTerms", "ActiveRulesCount", "SubscribedCount"): + # Numeric counts: compare as int so "04" vs "4" doesn't fail + try: + if int(model_value) != int(expected_value): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") + + elif key == "TotalRevenue": + # Strip $ and , then compare as float so "$0.00" matches "0" / "0.00" + exp_clean = expected_value.replace('$', '').replace(',', '').strip() + mod_clean = model_value.replace('$', '').replace(',', '').strip() + try: + if float(exp_clean) != float(mod_clean): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") else: - # Exact match for other fields + # Fallback exact match for any unrecognized key if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', 
got '{model_value}'" @@ -183,32 +279,29 @@ async def verify() -> bool: # Get model's response from MCP_MESSAGES model_response = get_model_response() - if model_response: - print("Found model response, parsing answer format...", file=sys.stderr) - model_answer = parse_answer_format(model_response) - - if model_answer: - print("\n=== Model Answer Parsed ===", file=sys.stderr) - for key, value in model_answer.items(): - print(f"{key}: {value}", file=sys.stderr) - - # Compare answers - answer_match = compare_answers(model_answer, expected_answer) - if not answer_match: - print("\nModel answer does not match expected answer", file=sys.stderr) - return False - print("\nβœ“ Model answer matches expected answer", file=sys.stderr) - else: - print( - "Warning: Could not parse answer format from model response", - file=sys.stderr, - ) - print("Will proceed with browser verification only", file=sys.stderr) - else: + if not model_response: + print("No model response found", file=sys.stderr) + return False + + print("Found model response, parsing answer format...", file=sys.stderr) + model_answer = parse_answer_format(model_response) + if not model_answer: print( - "No model response found, proceeding with browser verification", + "Could not parse answer format from model response", file=sys.stderr, ) + return False + + print("\n=== Model Answer Parsed ===", file=sys.stderr) + for key, value in model_answer.items(): + print(f"{key}: {value}", file=sys.stderr) + + # Compare answers + answer_match = compare_answers(model_answer, expected_answer) + if not answer_match: + print("\nModel answer does not match expected answer", file=sys.stderr) + return False + print("\nβœ“ Model answer matches expected answer", file=sys.stderr) # Browser verification - only check customer creation (the critical task requirement) print("\n=== Starting Browser Verification ===", file=sys.stderr) diff --git a/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/description.md 
b/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/description.md index dbe8f41e..3432bbf3 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/description.md +++ b/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/description.md @@ -11,13 +11,13 @@ Our company is planning to expand sales operations to New York state and needs a 3. Since we're expanding to New York, we need check tax: - Find and record the exact tax rate for New York state - - Compare it with California's tax rate - record which state has a higher rate + - Also record California's tax rate as a reference - Count how many different US states currently have tax configurations -4. You need to understand our order status of stores processing for the NY market: - - Filter orders to show only statuses that are 'Visible On Storefront = Yes' - - Among these visible statuses, identify if exists one has the status code 'processing' (Yes or No), - - Check if this 'processing' status is set as a 'Default Status' (Yes or No) +4. Review the order status configuration for the NY market: + In Stores β†’ Settings β†’ Order Status, focus on rows where 'Visible On Storefront' is 'Yes': + - Identify if any of them has the status code 'processing' (Yes or No) + - For that 'processing' status, record whether 'Default Status' is Yes or No 5. Since New York orders might need special handling, check all stores: @@ -25,7 +25,7 @@ Our company is planning to expand sales operations to New York state and needs a - Record the store code for the first Main Website Store 6. For inventory planning, check the sources of it: - - Check if the Default Source is currently 'Enabled' or shows as 'Disabled' for Pickup Location + - For the Default Source, check whether its 'Pickup Location' column is 'Enabled' or 'Disabled' - Click the 'Edit' link for the Default Source and check if there's a 'State/Province' field (Yes or No) 7. 
Finally, return to the Dashboard and examine the revenue metrics: @@ -38,19 +38,18 @@ Our company is planning to expand sales operations to New York state and needs a Lifetime_Sales_Amount|amount Cheap_Bestseller_Name|name -Second_Bestseller_Price|price -Second_Bestseller_Quantity|quantity +Cheap_Bestseller_Price|price +Cheap_Bestseller_Quantity|quantity Product_In_Last_Orders|yes_or_no NY_Tax_Rate|rate CA_Tax_Rate|rate -Higher_Tax_State|state Total_States_With_Tax|count Processing_Visible_Storefront|Yes_or_No Processing_Default_Status|Yes_or_No Number_Of_Websites|count Main_Store_Code|code Default_Source_Pickup_Status|status -Default_Source_State|state_or_none +Default_Source_State|yes_or_no Dashboard_Revenue|amount Tax_Shipping_Zero|yes_or_no @@ -61,19 +60,18 @@ Tax_Shipping_Zero|yes_or_no Lifetime_Sales_Amount|$XX.XX Cheap_Bestseller_Name|Product Name Here -Second_Bestseller_Price|$XX.XX -Second_Bestseller_Quantity|XX +Cheap_Bestseller_Price|$XX.XX +Cheap_Bestseller_Quantity|XX Product_In_Last_Orders|Yes/No NY_Tax_Rate|X.XXXX CA_Tax_Rate|X.XXXX -Higher_Tax_State|XX Total_States_With_Tax|XX Processing_Visible_Storefront|Yes/No Processing_Default_Status|Yes/No Number_Of_Websites|X Main_Store_Code|code_here Default_Source_Pickup_Status|Enabled/Disabled -Default_Source_State|State or None +Default_Source_State|Yes/No Dashboard_Revenue|$XX.XX Tax_Shipping_Zero|Yes/No diff --git a/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/label.txt b/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/label.txt index ceb1678c..b8ed7ba8 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/label.txt +++ b/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/label.txt @@ -1,17 +1,16 @@ Lifetime_Sales_Amount|$0.00 Cheap_Bestseller_Name|Sprite Yoga Strap 6 foot -Second_Bestseller_Price|$14.00 -Second_Bestseller_Quantity|6 +Cheap_Bestseller_Price|$14.00 +Cheap_Bestseller_Quantity|6 
Product_In_Last_Orders|No NY_Tax_Rate|8.3750 CA_Tax_Rate|8.2500 -Higher_Tax_State|NY -Total_States_With_Tax|2 +Total_States_With_Tax|3 Processing_Visible_Storefront|Yes Processing_Default_Status|Yes Number_Of_Websites|1 Main_Store_Code|main_website_store -Default_Source_Pickup_Status|Enabled -Default_Source_State|No +Default_Source_Pickup_Status|Disabled +Default_Source_State|Yes Dashboard_Revenue|$0.00 Tax_Shipping_Zero|Yes \ No newline at end of file diff --git a/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/verify.py b/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/verify.py index f010fe84..a5aa191c 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/verify.py +++ b/tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/verify.py @@ -11,7 +11,7 @@ def get_model_response(): Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("ERROR: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -39,7 +39,11 @@ def get_model_response(): # Find the last assistant message for message in reversed(messages): - if message.get('role') == 'assistant' and message.get('status') == 'completed': + if ( + message.get('role') == 'assistant' + and message.get('status') == 'completed' + and message.get('type') == 'message' + ): content = message.get('content', []) if not content: print("WARNING: Assistant message has empty content", file=sys.stderr) @@ -62,6 +66,22 @@ def get_model_response(): print(f"ERROR: Unexpected error reading messages file: {str(e)}", file=sys.stderr) return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. 
+ """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. @@ -90,9 +110,9 @@ def parse_answer_format(text): # Expected keys that should be present expected_keys = [ - 'Lifetime_Sales_Amount', 'Cheap_Bestseller_Name', 'Second_Bestseller_Price', - 'Second_Bestseller_Quantity', 'Product_In_Last_Orders', 'NY_Tax_Rate', - 'CA_Tax_Rate', 'Higher_Tax_State', 'Total_States_With_Tax', + 'Lifetime_Sales_Amount', 'Cheap_Bestseller_Name', 'Cheap_Bestseller_Price', + 'Cheap_Bestseller_Quantity', 'Product_In_Last_Orders', 'NY_Tax_Rate', + 'CA_Tax_Rate', 'Total_States_With_Tax', 'Processing_Visible_Storefront', 'Processing_Default_Status', 'Number_Of_Websites', 'Main_Store_Code', 'Default_Source_Pickup_Status', 'Default_Source_State', 'Dashboard_Revenue', 'Tax_Shipping_Zero' @@ -115,7 +135,7 @@ def parse_answer_format(text): key, value = parts key = key.strip() - value = value.strip() + value = normalize_text(value.strip()) if not key: print(f"ERROR: Empty key in line: {line}", file=sys.stderr) @@ -153,7 +173,7 @@ def load_expected_answer(label_path): for line in lines: if '|' in line: key, value = line.split('|', 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -172,60 +192,54 @@ def compare_answers(model_answer, expected_answer): mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, '') - - # Special handling for different types of values - if key in ['Lifetime_Sales_Amount', 'Second_Bestseller_Price', 'Dashboard_Revenue']: - # For price/amount fields, normalize format - expected_clean = expected_value.replace('$', '').replace(',', '') - model_clean = 
model_value.replace('$', '').replace(',', '') - if expected_clean != model_clean: - mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - + + if key in ['Lifetime_Sales_Amount', 'Cheap_Bestseller_Price', 'Dashboard_Revenue']: + # Price/amount fields: strip $ and , then compare as float so + # "$0.00" matches "0" / "0.00" + expected_clean = expected_value.replace('$', '').replace(',', '').strip() + model_clean = model_value.replace('$', '').replace(',', '').strip() + try: + if float(expected_clean) != float(model_clean): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") + elif key in ['NY_Tax_Rate', 'CA_Tax_Rate']: - # Tax rates - allow different decimal formats + # Tax rates: strip % then compare as float expected_clean = expected_value.replace('%', '').strip() model_clean = model_value.replace('%', '').strip() - # Convert to float for comparison try: if float(expected_clean) != float(model_clean): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") except ValueError: - if expected_clean != model_clean: - mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - - elif key in ['Product_In_Last_Orders', 'Processing_Visible_Storefront', 'Processing_Default_Status', 'Tax_Shipping_Zero']: - # Yes/No fields - case insensitive + mismatches.append(f"{key} should be numeric: got '{model_value}'") + + elif key in [ + 'Product_In_Last_Orders', + 'Processing_Visible_Storefront', + 'Processing_Default_Status', + 'Tax_Shipping_Zero', + 'Default_Source_State', + ]: + # Yes/No fields β€” case-insensitive if model_value.lower() != expected_value.lower(): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - - elif key == 'Empty_Rows_Yes_Effect': - # Allow flexible descriptions for this field - # Just check if model provided some reasonable description - if not 
model_value or len(model_value) < 5: - mismatches.append(f"{key}: expected meaningful description, got '{model_value}'") - - elif key == 'Order_Status_Options': - # Check if main options are mentioned - expected_options = set(opt.strip() for opt in expected_value.split(',')) - model_options = set(opt.strip() for opt in model_value.split(',')) - if expected_options != model_options: - mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - - elif key == 'Chart_Disabled_Message': - # Allow some flexibility in message text - # Check for key words - if 'disabled' not in model_value.lower() and 'enable' not in model_value.lower(): - mismatches.append(f"{key}: expected message about chart being disabled, got '{model_value}'") - - elif key == 'Default_Source_State': - # Handle 'None' or empty state - expected_normalized = expected_value.lower() if expected_value.lower() != 'none' else '' - model_normalized = model_value.lower() if model_value.lower() != 'none' else '' - if expected_normalized != model_normalized: + + elif key in ['Cheap_Bestseller_Quantity', 'Total_States_With_Tax', 'Number_Of_Websites']: + # Numeric counts: compare as int so "04" / "4" / "4.0" don't fail + try: + if int(float(model_value)) != int(float(expected_value)): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") + + elif key in ['Cheap_Bestseller_Name', 'Default_Source_Pickup_Status', 'Main_Store_Code']: + # String fields β€” case-insensitive + if model_value.strip().lower() != expected_value.strip().lower(): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") - + else: - # Exact match for other fields + # Fallback exact match for any unrecognized key if model_value != expected_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") diff --git 
a/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/description.md b/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/description.md index b160bdff..fe650cb9 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/description.md +++ b/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/description.md @@ -5,16 +5,16 @@ Perform a comprehensive products and sales analysis in the Magento Admin panel t 1. if need to login, login with username 'admin' and password 'admin1234' 2. Analyze product inventory and catalog details, perform the following: - - Search for all products containing 'Yoga' in their name - count the exact number of results - - Clear the search and find the product with SKU 'WH11' - record its exact price + - Filter by Name containing 'Yoga' - count the exact number of results + - Clear all filters and find the product with SKU 'WH11' - record its exact price - Apply a filter to show only products with Quantity = 0.0000 - count how many products match 3. To identify top-selling products and revenue metrics, navigate to the Dashboard and from the Bestsellers table: - - Identify the product with lowest price and lowest quantity - record the product name and quantity sold + - Among the products tied for the lowest sales quantity, identify the one with the lowest price - record its name and sales quantity - Find the second cheapest product in the table - record its exact quantity sold - Note the total Revenue amount displayed in the dashboard -4. Father all customers' information and demographics: +4. 
Gather all customers' information and demographics: - Find customer 'Sarah Miller' - record her exact email address - Count the total number of customers shown in the grid @@ -30,7 +30,7 @@ YogaProducts|count WH11Price|price ZeroQuantityProducts|count LowestProduct|name:quantity -QuestLumaflexQuantity|quantity +SecondCheapestQuantity|quantity DashboardRevenue|amount SarahMillerEmail|email TotalCustomers|count @@ -46,9 +46,9 @@ YogaProducts|XX WH11Price|$XX.XX ZeroQuantityProducts|XX LowestProduct|Product Name Here:XX -QuestLumaflexQuantity|XX +SecondCheapestQuantity|XX DashboardRevenue|$XX.XX -SarahMillerEmail|email@example.com +SarahMillerEmail| TotalCustomers|XX PendingOrders|X GraceNguyenOrderID|00000XXXX diff --git a/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/label.txt b/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/label.txt index ce32c2e7..fd8ff909 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/label.txt +++ b/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/label.txt @@ -1,10 +1,10 @@ -YogaProducts|171 +YogaProducts|172 WH11Price|$54.00 ZeroQuantityProducts|150 -LowestProduct|Sprite Stasis Ball 55 cm foot:5 -QuestLumaflexQuantity|6 +LowestProduct|Sprite Stasis Ball 55 cm:5 +SecondCheapestQuantity|6 DashboardRevenue|$0.00 SarahMillerEmail|helloworld@yahoo.com -TotalCustomers|72 +TotalCustomers|70 PendingOrders|10 GraceNguyenOrderID|000000189 \ No newline at end of file diff --git a/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/verify.py b/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/verify.py index 8bc33fb4..03ca3a5f 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/verify.py +++ b/tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/verify.py @@ -12,7 +12,7 @@ def get_model_response(): Returns the last assistant message 
text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -26,6 +26,7 @@ def get_model_response(): if ( message.get("role") == "assistant" and message.get("status") == "completed" + and message.get("type") == "message" ): content = message.get("content", []) for item in content: @@ -39,6 +40,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. @@ -71,7 +88,7 @@ def parse_answer_format(text): # Expected keys for validation expected_keys = [ "YogaProducts", "WH11Price", "ZeroQuantityProducts", "LowestProduct", - "QuestLumaflexQuantity", "DashboardRevenue", "SarahMillerEmail", + "SecondCheapestQuantity", "DashboardRevenue", "SarahMillerEmail", "TotalCustomers", "PendingOrders", "GraceNguyenOrderID" ] @@ -85,7 +102,7 @@ def parse_answer_format(text): print(f"Error: Invalid line format: {line}", file=sys.stderr) return None - key, value = parts[0].strip(), parts[1].strip() + key, value = parts[0].strip(), normalize_text(parts[1].strip()) if not key or not value: print(f"Error: Empty key or value in line: {line}", file=sys.stderr) @@ -115,7 +132,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -136,16 +153,18 @@ def compare_answers(model_answer, expected_answer): 
for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") - # Special handling for different types of values if key == "LowestProduct": - # Check if product name and quantity match (format: "Product Name:quantity") + # "name:quantity" β€” name case-insensitive, qty as int if ":" in expected_value and ":" in model_value: - expected_name, expected_qty = expected_value.rsplit(":", 1) - model_name, model_qty = model_value.rsplit(":", 1) - if expected_name != model_name or expected_qty != model_qty: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + exp_name, exp_qty = expected_value.rsplit(":", 1) + mod_name, mod_qty = model_value.rsplit(":", 1) + if exp_name.strip().lower() != mod_name.strip().lower(): + mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'") + try: + if int(exp_qty.strip()) != int(mod_qty.strip()): + mismatches.append(f"{key} quantity: expected '{exp_qty}', got '{mod_qty}'") + except ValueError: + mismatches.append(f"{key} quantity should be numeric: got '{mod_qty}'") else: if expected_value != model_value: mismatches.append( @@ -153,23 +172,41 @@ def compare_answers(model_answer, expected_answer): ) elif key in ["WH11Price", "DashboardRevenue"]: - # For price/amount fields, normalize format - expected_clean = expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - if expected_clean != model_clean: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + # Price/amount fields: strip $ and , then compare as float so + # "$54.00" matches "54" / "54.0" + expected_clean = expected_value.replace("$", "").replace(",", "").strip() + model_clean = model_value.replace("$", "").replace(",", "").strip() + try: + if float(expected_clean) != float(model_clean): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} 
should be numeric: got '{model_value}'") + + elif key in [ + "YogaProducts", + "ZeroQuantityProducts", + "SecondCheapestQuantity", + "TotalCustomers", + "PendingOrders", + ]: + # Numeric counts: compare as int so "04" / "4" / "4.0" don't fail + try: + if int(float(model_value)) != int(float(expected_value)): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") elif key == "SarahMillerEmail": - # Email should match exactly + # Email β€” case-insensitive if model_value.lower() != expected_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: - # Exact match for other fields + # Exact string match for IDs and other text fields (e.g. + # GraceNguyenOrderID has leading zeros like "000000189" that must + # be preserved) if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" diff --git a/tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/description.md b/tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/description.md index 06a89dca..54c29c92 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/description.md +++ b/tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/description.md @@ -6,22 +6,22 @@ Perform a comprehensive sales and inventory analysis by extracting specific metr 2. To analyze product inventory and identify key items, check all products: - Search for all products containing 'Sprite' in their name - count the exact number of results - - Clear the search and filter products by Quantity = 100.0000 - count how many products match + - Clear all filters and filter products by Quantity = 100.0000 - count how many products match - Find the product with SKU 'WS12' - record its exact name and price 3. 
To understand sales performance and order status, we need check all orders: - Search for all orders with 'Pending' status - count the total number - - Find Grace Nguyen's Complete and the most cheap order - record the order ID (starts with "000") + - Find Grace Nguyen's order with Complete status and the lowest price - record its order ID (starts with "000") - Find the order with the highest Grand Total - record the customer name and amount 4. To examine bestselling products and search trends, from the main page: - - In the Bestsellers table, identify the product with most quantity but and lowest price - record its name and quantity sold + - Among the products tied for the highest sales quantity in the Bestsellers table, identify the one with the lowest price - record its name and sales quantity - Find 'Overnight Duffle' and record its exact price - In the Top Search Terms table, find 'hollister' and record its position number (1st, 2nd, etc.) 5. To analyze customer demographics and account information, go to All Customers: - - Search for customers with its email address containing 'costello' - count the results - - Find Sarah Miller's customer record - record her Group and extract Customer Since date + - Search for customers whose email address contains 'costello' - count the results + - Find Sarah Miller's customer record - record her Group and the full Customer Since timestamp exactly as shown in Magento (e.g. `Jan 7, 2022 10:15:42 AM`) 6. 
To review payment status and billing information, navigate to Invoices: - Find all invoices with 'Paid' status - count them @@ -60,7 +60,7 @@ CheapProduct|Product Name:XX OvernightDufflePrice|$XX.XX HollisterPosition|Xth CostelloCustomers|X -SarahMillerInfo|Group Name:MMM DD, YYYY +SarahMillerInfo|Group Name:MMM DD, YYYY H:MM:SS AM/PM PaidInvoices|X Invoice002BillTo|Customer Name diff --git a/tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/verify.py b/tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/verify.py index f42172e5..fbe17b27 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/verify.py +++ b/tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/verify.py @@ -12,7 +12,7 @@ def get_model_response(): Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") - print(f"MCP_MESSAGES: {messages_path}") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None @@ -41,6 +41,22 @@ def get_model_response(): return None +def normalize_text(text): + """ + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("β€˜", "'").replace("’", "'") + text = text.replace("β€œ", '"').replace("”", '"') + + # Normalize whitespace + text = " ".join(text.split()) + + return text.strip() + + def parse_answer_format(text): """ Parse the ... format from the agent's output. 
@@ -88,7 +104,7 @@ def parse_answer_format(text): return None key, value = parts - result[key.strip()] = value.strip() + result[key.strip()] = normalize_text(value.strip()) # Check if all expected keys are present missing_keys = set(expected_keys) - set(result.keys()) @@ -118,7 +134,7 @@ def load_expected_answer(label_path): for line in lines: if "|" in line: key, value = line.split("|", 1) - expected[key.strip()] = value.strip() + expected[key.strip()] = normalize_text(value.strip()) return expected except Exception as e: @@ -139,71 +155,58 @@ def compare_answers(model_answer, expected_answer): for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") - # Special handling for different types of values if key == "WS12Info": - # Check if product name and price match (format: name:price) + # "name:price" — name case-insensitive, price as float if ":" in expected_value and ":" in model_value: - expected_name, expected_price = expected_value.rsplit(":", 1) - model_name, model_price = model_value.rsplit(":", 1) - # Normalize price format - expected_price_clean = expected_price.replace("$", "").replace(",", "") - model_price_clean = model_price.replace("$", "").replace(",", "") - if ( - expected_name != model_name - or expected_price_clean != model_price_clean - ): - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + exp_name, exp_price = expected_value.rsplit(":", 1) + mod_name, mod_price = model_value.rsplit(":", 1) + if exp_name.strip().lower() != mod_name.strip().lower(): + mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'") + exp_price_clean = exp_price.replace("$", "").replace(",", "").strip() + mod_price_clean = mod_price.replace("$", "").replace(",", "").strip() + try: + if float(exp_price_clean) != float(mod_price_clean): + mismatches.append(f"{key} price: expected '{exp_price}', got '{mod_price}'") + except ValueError: + mismatches.append(f"{key} price should be 
numeric: got '{mod_price}'") else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) - elif key == "GraceOrderID": - # Order ID should start with "000" and match exactly - if not model_value.startswith("000"): - mismatches.append( - f"{key}: expected to start with '000', got '{model_value}'" - ) - elif model_value != expected_value: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) - elif key == "HighestOrderInfo": - # Check format customer:amount + # "customer:amount" — customer case-insensitive, amount as float if ":" in expected_value and ":" in model_value: - expected_customer, expected_amount = expected_value.rsplit(":", 1) - model_customer, model_amount = model_value.rsplit(":", 1) - # Normalize amount format - expected_amount_clean = expected_amount.replace("$", "").replace( - ",", "" - ) - model_amount_clean = model_amount.replace("$", "").replace(",", "") - if ( - expected_customer != model_customer - or expected_amount_clean != model_amount_clean - ): - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + exp_customer, exp_amount = expected_value.rsplit(":", 1) + mod_customer, mod_amount = model_value.rsplit(":", 1) + if exp_customer.strip().lower() != mod_customer.strip().lower(): + mismatches.append(f"{key} customer: expected '{exp_customer}', got '{mod_customer}'") + exp_amount_clean = exp_amount.replace("$", "").replace(",", "").strip() + mod_amount_clean = mod_amount.replace("$", "").replace(",", "").strip() + try: + if float(exp_amount_clean) != float(mod_amount_clean): + mismatches.append(f"{key} amount: expected '{exp_amount}', got '{mod_amount}'") + except ValueError: + mismatches.append(f"{key} amount should be numeric: got '{mod_amount}'") else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) - elif key == "Position2Product": - # Check if product 
name and quantity match + elif key == "CheapProduct": + # "name:quantity" — name case-insensitive, qty as int if ":" in expected_value and ":" in model_value: - expected_name, expected_qty = expected_value.rsplit(":", 1) - model_name, model_qty = model_value.rsplit(":", 1) - if expected_name != model_name or expected_qty != model_qty: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + exp_name, exp_qty = expected_value.rsplit(":", 1) + mod_name, mod_qty = model_value.rsplit(":", 1) + if exp_name.strip().lower() != mod_name.strip().lower(): + mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'") + try: + if int(exp_qty.strip()) != int(mod_qty.strip()): + mismatches.append(f"{key} quantity: expected '{exp_qty}', got '{mod_qty}'") + except ValueError: + mismatches.append(f"{key} quantity should be numeric: got '{mod_qty}'") else: if expected_value != model_value: mismatches.append( @@ -211,36 +214,32 @@ def compare_answers(model_answer, expected_answer): ) elif key == "OvernightDufflePrice": - # Normalize price format - expected_clean = expected_value.replace("$", "").replace(",", "") - model_clean = model_value.replace("$", "").replace(",", "") - if expected_clean != model_clean: - mismatches.append( - f"{key}: expected '{expected_value}', got '{model_value}'" - ) + # Strip $ and , then compare as float + expected_clean = expected_value.replace("$", "").replace(",", "").strip() + model_clean = model_value.replace("$", "").replace(",", "").strip() + try: + if float(expected_clean) != float(model_clean): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") elif key == "HollisterPosition": - # Position format (1st, 2nd, 3rd, etc.) - if model_value.lower() != expected_value.lower(): + # Position format (1st, 2nd, 3rd, etc.) 
— case-insensitive + if model_value.strip().lower() != expected_value.strip().lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "SarahMillerInfo": - # Format: group:date + # "group:date" — both case-insensitive exact match + # (split on first ':' only; the date itself contains ':' for the time) if ":" in expected_value and ":" in model_value: - expected_group, expected_date = expected_value.split(":", 1) - model_group, model_date = model_value.split(":", 1) - # Allow some flexibility in date format - if expected_group != model_group: - mismatches.append( - f"{key}: expected group '{expected_group}', got '{model_group}'" - ) - # For date, check if key parts match - if not (expected_date in model_date or model_date in expected_date): - mismatches.append( - f"{key}: expected date '{expected_date}', got '{model_date}'" - ) + exp_group, exp_date = expected_value.split(":", 1) + mod_group, mod_date = model_value.split(":", 1) + if exp_group.strip().lower() != mod_group.strip().lower(): + mismatches.append(f"{key} group: expected '{exp_group}', got '{mod_group}'") + if exp_date.strip().lower() != mod_date.strip().lower(): + mismatches.append(f"{key} date: expected '{exp_date}', got '{mod_date}'") else: if expected_value != model_value: mismatches.append( @@ -248,14 +247,30 @@ def compare_answers(model_answer, expected_answer): ) elif key == "Invoice002BillTo": - # Name should match exactly - if model_value != expected_value: + # Customer name — case-insensitive + if model_value.strip().lower() != expected_value.strip().lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) + elif key in [ + "SpriteProducts", + "Quantity100Products", + "PendingOrders", + "CostelloCustomers", + "PaidInvoices", + ]: + # Numeric counts: compare as int so "04" / "4" / "4.0" don't fail + try: + if int(float(model_value)) != int(float(expected_value)): + mismatches.append(f"{key}: expected 
'{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") + else: - # Exact match for count fields and other numeric values + # Exact string match for IDs and other text fields (e.g. + # GraceOrderID has leading zeros like "000000114" that must be + # preserved) if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" diff --git a/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/description.md b/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/description.md index 958a80c5..e81c09d0 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/description.md +++ b/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/description.md @@ -7,23 +7,22 @@ Perform comprehensive search and filtering operations in the Magento Admin panel 2. To analyze search behavior and term effectiveness, check the Search Terms of Marketing and perform complex filtering: - Search for all terms containing 'tank' in their name - count the exact number of results - Clear filters and find terms with exactly 0 results - count how many such terms exist - - Apply a filter to show only terms with more than 10 uses - record the term with highest uses and its count (You need to see how many there are and record them all.) - - Find the search term that has results between 20-30 - record its name and exact result count + - Apply a filter to show only terms with more than 10 uses - record the term with highest uses and its count + - Find the search terms that have results between 20-30 - record their names and exact result counts (You need to see how many there are and record them all.) 3. 
To gather detailed marketing insights from search data, go to Search Terms in Reports: - Apply filter for terms with more than 15 hits - count total filtered results - - Find the term with ID between 10-15 that has the most results - record term name and result count (You need to see how many there are and record them all.) + - Find the term with ID between 10-15 that has the most results - record term name and result count - Filter to show only terms from "Default Store View" - count total results 4. To examine real-time search trends and top performers, from the Dashboard, perform targeted searches: - - In the 'Top Search Terms' table, find the term with exactly 1 result - record its name and uses - - In the 'Last Search Terms' table, identify the term with the both the highest number of results and uses - record name and the number of results - - In the 'Bestsellers' tab, find the product at position #3 - record name and quantity + - In the 'Top Search Terms' table, find the terms with exactly 1 result - record their names and uses (You need to see how many there are and record them all.) + - In the 'Last Search Terms' table, identify the term with both the highest number of results and uses - record its name and number of results -5. To identify patterns in search usage and results, navigate to Search Terms (main grid) in step 2: +5. To identify patterns in search usage and results, return to the Search Terms grid (Marketing → Search Terms): - Sort by 'Uses' column (descending) - record the top term and its uses count - - Sort by 'Results' column (ascending) - record the first non-zero result term and its count - - Count total number of unique search terms in the system + - Sort by 'Results' column (ascending), then by 'Uses' (ascending) - record the first non-zero result term and its count + - Count the total number of search terms in the grid 6. 
To provide a comprehensive report of all gathered data, compile all findings and output in the following exact format: @@ -32,13 +31,12 @@ Perform comprehensive search and filtering operations in the Magento Admin panel TankSearchCount|count ZeroResultsCount|count HighestUseTerm|term:uses -Results20to30Term|term1:results1|term2:result2|term3:result3|... +Results20to30Term|term1:results1;term2:result2;term3:result3;... Hits15PlusCount|count ID10to15MaxResults|term:results DefaultStoreViewCount|count -OneResultTerm|term1:uses1|term2:uses2|term3:uses3|... +OneResultTerm|term1:uses1;term2:uses2;term3:uses3;... HighestResultLastSearch|term:results -Position3Bestseller|product:quantity TopUseTerm|term:uses FirstNonZeroResult|term:results TotalUniqueTerms|count @@ -51,13 +49,12 @@ TotalUniqueTerms|count TankSearchCount|X ZeroResultsCount|X HighestUseTerm|search_term:XX -Results20to30Term|search_term1:XX1|search_term2:XX2|search_term3:XX3|... +Results20to30Term|search_term1:XX1;search_term2:XX2;search_term3:XX3;... Hits15PlusCount|X ID10to15MaxResults|Product Name:XX DefaultStoreViewCount|X -OneResultTerm|search_term1:XX1|search_term2:XX2|search_term3:XX3|... +OneResultTerm|search_term1:XX1;search_term2:XX2;search_term3:XX3;... 
HighestResultLastSearch|search_term:XX -Position3Bestseller|Product Name:X TopUseTerm|search_term:XX FirstNonZeroResult|search_term:X TotalUniqueTerms|X @@ -72,5 +69,5 @@ TotalUniqueTerms|X - Navigated between different report views - Extracted data from filtered and sorted results - Counted records accurately after applying filters -- Output answer in exact format with 13 data lines +- Output answer in exact format with 12 data lines - Answer wrapped in tags \ No newline at end of file diff --git a/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/label.txt b/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/label.txt index 28a9606e..9892e48e 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/label.txt +++ b/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/label.txt @@ -1,13 +1,12 @@ TankSearchCount|2 ZeroResultsCount|1 HighestUseTerm|hollister:19 -Results20to30Term|Antonia Racer Tank:23|tanks:23 +Results20to30Term|Antonia Racer Tank:23;tanks:23 Hits15PlusCount|1 ID10to15MaxResults|Antonia Racer Tank:23 DefaultStoreViewCount|7 -OneResultTerm|hollister:19|WP10:1 +OneResultTerm|hollister:19;WP10:1 HighestResultLastSearch|Antonia Racer Tank:23 -Position3Bestseller|Sprite Stasis Ball 65 cm:6 TopUseTerm|hollister:19 FirstNonZeroResult|WP10:1 TotalUniqueTerms|7 \ No newline at end of file diff --git a/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/verify.py b/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/verify.py index 980357fd..dc66826a 100644 --- a/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/verify.py +++ b/tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/verify.py @@ -1,322 +1,275 @@ +import asyncio +import sys import re -import json import os -import sys +import json +from pathlib import Path + + +def 
get_model_response(): + """ + Get the model's response from the MCP_MESSAGES environment variable. + Returns the last assistant message text. + """ + messages_path = os.getenv("MCP_MESSAGES") + print(f"MCP_MESSAGES: {messages_path}", file=sys.stderr) + if not messages_path: + print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) + return None + + try: + with open(messages_path, "r") as f: + messages = json.load(f) + + # Find the last assistant message with status='completed', type='message' + for message in reversed(messages): + if ( + message.get("role") == "assistant" + and message.get("status") == "completed" + and message.get("type") == "message" + ): + content = message.get("content", []) + if isinstance(content, list): + for item in content: + if isinstance(item, dict) and item.get("type") in ["text", "output_text"]: + return item.get("text", "") + elif isinstance(content, str): + return content + + print("Warning: No assistant response found in messages", file=sys.stderr) + return None + except Exception as e: + print(f"Error reading messages file: {str(e)}", file=sys.stderr) + return None -def verify(messages): +def normalize_text(text): """ - Verify that the agent has successfully performed complex search and filtering operations - in the Magento Admin panel and extracted all required information correctly. + Normalize text for comparison by collapsing whitespace. + """ + if not isinstance(text, str): + return str(text) + + text = text.replace("‘", "'").replace("’", "'") + text = text.replace("“", '"').replace("”", '"') - Args: - messages: List of message dictionaries containing the conversation + # Normalize whitespace + text = " ".join(text.split()) - Returns: - Dictionary with 'valid' boolean and 'reason' string + return text.strip() + + +def parse_answer_format(text): + """ + Parse the <answer>...</answer> format from the agent's output. + Returns a dictionary with the parsed values. 
""" + if not text: + print("Error: No text provided to parse", file=sys.stderr) + return None - # Find the last assistant message with status "completed" and type "message" - answer_content = None - for message in reversed(messages): - if ( - message.get("role") == "assistant" - and message.get("status") == "completed" - and message.get("type") == "message" - and message.get("content") - ): - # Extract text from content structure - content = message["content"] - if isinstance(content, list): - for item in content: - if isinstance(item, dict) and item.get("type") == "output_text": - text = item.get("text", "") - # Look for answer tags with case-insensitive search - answer_match = re.search( - r"<answer>(.*?)</answer>", text, re.DOTALL | re.IGNORECASE - ) - if answer_match: - answer_content = answer_match.group(1).strip() - break - elif isinstance(content, str): - # Look for answer tags in string content - answer_match = re.search(r"<answer>(.*?)</answer>", content, re.DOTALL | re.IGNORECASE) - if answer_match: - answer_content = answer_match.group(1).strip() - break - - if answer_content: - break + match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) + if not match: + print("Error: No <answer>...</answer> 
tags found in response", file=sys.stderr) + return None + answer_content = match.group(1).strip() if not answer_content: - return {"valid": False, "reason": "No answer found in tags"} + print("Error: Empty answer content", file=sys.stderr) + return None - # Expected format - each line should have a key|value pair - expected_keys = [ - "TankSearchCount", - "ZeroResultsCount", - "HighestUseTerm", - "Results20to30Term", - "Hits15PlusCount", - "ID10to15MaxResults", - "DefaultStoreViewCount", - "OneResultTerm", - "HighestResultLastSearch", - "Position3Bestseller", - "TopUseTerm", - "FirstNonZeroResult", - "TotalUniqueTerms", - ] + lines = [line.strip() for line in answer_content.split("\n") if line.strip()] - # Parse the answer - lines = answer_content.strip().split("\n") + if len(lines) != 12: + print(f"Error: Expected 12 lines in answer, got {len(lines)}", file=sys.stderr) + print(f"Lines found: {lines}", file=sys.stderr) + return None - # Check if we have exactly 13 lines - if len(lines) != 13: - return {"valid": False, "reason": f"Expected 13 data lines, found {len(lines)}"} + expected_keys = [ + "TankSearchCount", "ZeroResultsCount", "HighestUseTerm", + "Results20to30Term", "Hits15PlusCount", "ID10to15MaxResults", + "DefaultStoreViewCount", "OneResultTerm", "HighestResultLastSearch", + "TopUseTerm", "FirstNonZeroResult", "TotalUniqueTerms", + ] - # Parse each line and validate format - extracted_data = {} + result = {} for line in lines: if "|" not in line: - return { - "valid": False, - "reason": f"Invalid format in line: {line}. 
Expected 'key|value' format", - } - + print(f"Error: Line missing '|' separator: {line}", file=sys.stderr) + return None parts = line.split("|", 1) if len(parts) != 2: - return {"valid": False, "reason": f"Invalid format in line: {line}"} - - key, value = parts - extracted_data[key] = value - - # Check all required keys are present - missing_keys = set(expected_keys) - set(extracted_data.keys()) + print(f"Error: Invalid line format: {line}", file=sys.stderr) + return None + key, value = parts[0].strip(), normalize_text(parts[1].strip()) + if not key or not value: + print(f"Error: Empty key or value in line: {line}", file=sys.stderr) + return None + result[key] = value + + missing_keys = set(expected_keys) - set(result.keys()) if missing_keys: - return { - "valid": False, - "reason": f"Missing required keys: {', '.join(missing_keys)}", - } - - # Validate specific data formats and expected values based on the current data - - # 1. TankSearchCount should be a number (2 terms containing 'tank') - if not extracted_data["TankSearchCount"].isdigit(): - return { - "valid": False, - "reason": f"TankSearchCount should be a number, got: {extracted_data['TankSearchCount']}", - } - - # Expected: "Antonia Racer Tank" and "tanks" contain 'tank' - if extracted_data["TankSearchCount"] != "2": - return { - "valid": False, - "reason": f"TankSearchCount should be '2', got: {extracted_data['TankSearchCount']}", - } - - # 2. ZeroResultsCount should be a number (nike has 0 results) - if not extracted_data["ZeroResultsCount"].isdigit(): - return { - "valid": False, - "reason": f"ZeroResultsCount should be a number, got: {extracted_data['ZeroResultsCount']}", - } - - if extracted_data["ZeroResultsCount"] != "1": - return { - "valid": False, - "reason": f"ZeroResultsCount should be '1', got: {extracted_data['ZeroResultsCount']}", - } - - # 3. 
HighestUseTerm should be in format "term:uses" - if ":" not in extracted_data["HighestUseTerm"]: - return { - "valid": False, - "reason": f"HighestUseTerm should be in format 'term:uses', got: {extracted_data['HighestUseTerm']}", - } - - # hollister has 19 uses (highest among terms with > 10 uses) - if extracted_data["HighestUseTerm"] != "hollister:19": - return { - "valid": False, - "reason": f"HighestUseTerm should be 'hollister:19', got: {extracted_data['HighestUseTerm']}", - } - - # 4. Results20to30Term should be in format "term:results" - if ":" not in extracted_data["Results20to30Term"]: - return { - "valid": False, - "reason": f"Results20to30Term should be in format 'term:results', got: {extracted_data['Results20to30Term']}", - } - - # Both "tanks" and "Antonia Racer Tank" have 23 results (between 20-30) - valid_results20to30 = ["tanks:23", "Antonia Racer Tank:23"] - # Check if answer contains one of the valid values or both separated by | - if not any( - val in extracted_data["Results20to30Term"] for val in valid_results20to30 - ): - return { - "valid": False, - "reason": f"Results20to30Term should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['Results20to30Term']}", - } - - # 5. Hits15PlusCount should be a number (only hollister has 19 hits > 15) - if not extracted_data["Hits15PlusCount"].isdigit(): - return { - "valid": False, - "reason": f"Hits15PlusCount should be a number, got: {extracted_data['Hits15PlusCount']}", - } - - if extracted_data["Hits15PlusCount"] != "1": - return { - "valid": False, - "reason": f"Hits15PlusCount should be '1', got: {extracted_data['Hits15PlusCount']}", - } - - # 6. 
ID10to15MaxResults should be in format "term:results" - if ":" not in extracted_data["ID10to15MaxResults"]: - return { - "valid": False, - "reason": f"ID10to15MaxResults should be in format 'term:results', got: {extracted_data['ID10to15MaxResults']}", - } - - # ID 11 is hollister (1 result), ID 13 is Antonia Racer Tank (23 results) - if extracted_data["ID10to15MaxResults"] != "Antonia Racer Tank:23": - return { - "valid": False, - "reason": f"ID10to15MaxResults should be 'Antonia Racer Tank:23', got: {extracted_data['ID10to15MaxResults']}", - } - - # 7. DefaultStoreViewCount should be a number (all 7 terms are from Default Store View) - if not extracted_data["DefaultStoreViewCount"].isdigit(): - return { - "valid": False, - "reason": f"DefaultStoreViewCount should be a number, got: {extracted_data['DefaultStoreViewCount']}", - } - - if extracted_data["DefaultStoreViewCount"] != "7": - return { - "valid": False, - "reason": f"DefaultStoreViewCount should be '7', got: {extracted_data['DefaultStoreViewCount']}", - } - - # 8. OneResultTerm should be in format "term:uses" - if ":" not in extracted_data["OneResultTerm"]: - return { - "valid": False, - "reason": f"OneResultTerm should be in format 'term:uses', got: {extracted_data['OneResultTerm']}", - } - - # Both hollister and WP10 have exactly 1 result - valid_one_result = ["hollister:19", "WP10:1"] - if not any(val in extracted_data["OneResultTerm"] for val in valid_one_result): - return { - "valid": False, - "reason": f"OneResultTerm should contain 'hollister:19' or 'WP10:1', got: {extracted_data['OneResultTerm']}", - } - - # 9. 
HighestResultLastSearch should be in format "term:results" - if ":" not in extracted_data["HighestResultLastSearch"]: - return { - "valid": False, - "reason": f"HighestResultLastSearch should be in format 'term:results', got: {extracted_data['HighestResultLastSearch']}", - } - - # In Last Search Terms: tanks and Antonia Racer Tank both have 23 results (highest) - valid_highest_last = ["tanks:23", "Antonia Racer Tank:23"] - if not any( - val in extracted_data["HighestResultLastSearch"] for val in valid_highest_last - ): - return { - "valid": False, - "reason": f"HighestResultLastSearch should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['HighestResultLastSearch']}", - } - - # 10. Position3Bestseller should be in format "product:quantity" - if ":" not in extracted_data["Position3Bestseller"]: - return { - "valid": False, - "reason": f"Position3Bestseller should be in format 'product:quantity', got: {extracted_data['Position3Bestseller']}", - } - - # Position 3 in Bestsellers is "Sprite Stasis Ball 65 cm" with quantity 6 - if extracted_data["Position3Bestseller"] != "Sprite Stasis Ball 65 cm:6": - return { - "valid": False, - "reason": f"Position3Bestseller should be 'Sprite Stasis Ball 65 cm:6', got: {extracted_data['Position3Bestseller']}", - } - - # 11. TopUseTerm should be in format "term:uses" - if ":" not in extracted_data["TopUseTerm"]: - return { - "valid": False, - "reason": f"TopUseTerm should be in format 'term:uses', got: {extracted_data['TopUseTerm']}", - } - - # hollister has 19 uses (highest) - if extracted_data["TopUseTerm"] != "hollister:19": - return { - "valid": False, - "reason": f"TopUseTerm should be 'hollister:19', got: {extracted_data['TopUseTerm']}", - } - - # 12. 
FirstNonZeroResult should be in format "term:results" - if ":" not in extracted_data["FirstNonZeroResult"]: - return { - "valid": False, - "reason": f"FirstNonZeroResult should be in format 'term:results', got: {extracted_data['FirstNonZeroResult']}", - } - - # When sorted by results ascending, first non-zero is WP10 (has 1 result) - if extracted_data["FirstNonZeroResult"] != "WP10:1": - return { - "valid": False, - "reason": f"FirstNonZeroResult should be 'WP10:1', got: {extracted_data['FirstNonZeroResult']}", - } - - # 13. TotalUniqueTerms should be a number - if not extracted_data["TotalUniqueTerms"].isdigit(): - return { - "valid": False, - "reason": f"TotalUniqueTerms should be a number, got: {extracted_data['TotalUniqueTerms']}", - } - - # There are 7 unique search terms in the system - if extracted_data["TotalUniqueTerms"] != "7": - return { - "valid": False, - "reason": f"TotalUniqueTerms should be '7', got: {extracted_data['TotalUniqueTerms']}", - } - - # All validations passed - return { - "valid": True, - "reason": "All complex search and filtering operations completed successfully", - } + print(f"Error: Missing required keys: {missing_keys}", file=sys.stderr) + return None + return result -if __name__ == "__main__": - # Load messages from environment variable - messages_path = os.getenv("MCP_MESSAGES") - if not messages_path: - print( - json.dumps( - {"valid": False, "reason": "MCP_MESSAGES environment variable not set"} - ) - ) - exit(1) +def load_expected_answer(label_path): + """ + Load the expected answer from label.txt file. + Returns a dictionary with the expected values. + + Each line is "Key|value..."; split only on the first '|' so values that + themselves contain '|' (e.g. "term1:c1|term2:c2") are preserved. 
+ """ try: - with open(messages_path, "r") as f: - messages = json.load(f) + with open(label_path, "r") as f: + lines = f.read().strip().split("\n") + + expected = {} + for line in lines: + if "|" in line: + key, value = line.split("|", 1) + expected[key.strip()] = normalize_text(value.strip()) + + return expected except Exception as e: - print( - json.dumps({"valid": False, "reason": f"Failed to load messages: {str(e)}"}) - ) - exit(1) - - # Run verification - result = verify(messages) - print(json.dumps(result)) - # Exit with appropriate code based on verification result - sys.exit(0 if result["valid"] else 1) + print(f"Error reading label file: {str(e)}", file=sys.stderr) + return None + + +def compare_answers(model_answer, expected_answer): + """ + Compare the model's answer with the expected answer. + Returns True if all key information matches, False otherwise. + """ + if not model_answer or not expected_answer: + return False + + mismatches = [] + for key, expected_value in expected_answer.items(): + model_value = model_answer.get(key, "") + + if key in [ + "TankSearchCount", + "ZeroResultsCount", + "Hits15PlusCount", + "DefaultStoreViewCount", + "TotalUniqueTerms", + ]: + # Numeric counts: compare as int so "04" / "4" / "4.0" don't fail + try: + if int(float(model_value)) != int(float(expected_value)): + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + except ValueError: + mismatches.append(f"{key} should be numeric: got '{model_value}'") + + elif key in [ + "HighestUseTerm", + "ID10to15MaxResults", + "HighestResultLastSearch", + "TopUseTerm", + "FirstNonZeroResult", + ]: + # Single "term:count" β€” term case-insensitive, count as int + if ":" not in expected_value or ":" not in model_value: + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + continue + exp_term, exp_count = expected_value.rsplit(":", 1) + mod_term, mod_count = model_value.rsplit(":", 1) + if exp_term.strip().lower() != 
mod_term.strip().lower(): + mismatches.append(f"{key} term: expected '{exp_term}', got '{mod_term}'") + try: + if int(exp_count.strip()) != int(mod_count.strip()): + mismatches.append(f"{key} count: expected '{exp_count}', got '{mod_count}'") + except ValueError: + mismatches.append(f"{key} count should be numeric: got '{mod_count}'") + + elif key in ["Results20to30Term", "OneResultTerm"]: + # Multi-entry "term1:count1;term2:count2;..." — order-independent; + # term case-insensitive, count as int + expected_entries = set() + for item in expected_value.split(";"): + item = item.strip() + if ":" not in item: + continue + term, count = item.rsplit(":", 1) + expected_entries.add((term.strip().lower(), int(count.strip()))) + model_entries = set() + for item in model_value.split(";"): + item = item.strip() + if ":" not in item: + mismatches.append(f"{key}: malformed entry '{item}'") + continue + term, count = item.rsplit(":", 1) + try: + model_entries.add((term.strip().lower(), int(count.strip()))) + except ValueError: + mismatches.append(f"{key}: non-numeric count in '{item}'") + if expected_entries != model_entries: + mismatches.append( + f"{key}: expected '{expected_value}', got '{model_value}'" + ) + + else: + # Fallback exact match for any unrecognized key + if model_value != expected_value: + mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") + + if mismatches: + print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) + for mismatch in mismatches: + print(f"✗ {mismatch}", file=sys.stderr) + return False + + print("\n=== Answer Comparison ===", file=sys.stderr) + print("✓ All key information matches the expected answer", file=sys.stderr) + return True + + +async def verify() -> bool: + """ + Verify the search and filtering operations task by comparing the model's + answer against the expected label. 
+ """ + label_path = Path(__file__).parent / "label.txt" + + expected_answer = load_expected_answer(label_path) + if not expected_answer: + print("Error: Could not load expected answer from label.txt", file=sys.stderr) + return False + + model_response = get_model_response() + if not model_response: + print("No model response found", file=sys.stderr) + return False + + print("Found model response, parsing answer format...", file=sys.stderr) + model_answer = parse_answer_format(model_response) + if not model_answer: + print("Could not parse answer format from model response", file=sys.stderr) + return False + + print("\n=== Model Answer Parsed ===", file=sys.stderr) + for key, value in model_answer.items(): + print(f"{key}: {value}", file=sys.stderr) + + answer_match = compare_answers(model_answer, expected_answer) + if not answer_match: + print("\nModel answer does not match expected answer", file=sys.stderr) + return False + print("\n✓ Model answer matches expected answer", file=sys.stderr) + return True + + +def main(): + """ + Executes the verification process and exits with a status code. + """ + result = asyncio.run(verify()) + sys.exit(0 if result else 1) + + +if __name__ == "__main__": + main() diff --git a/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md b/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md index 1d617e1a..af93d156 100644 --- a/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md +++ b/tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md @@ -136,4 +136,6 @@ Debug and fix the query to produce accurate results. Then create a table with yo 2. **Create a table** called `customer_analysis_fixed` in the `public` schema with your corrected query results. The table should have the same columns as the original query output. -**Important**: The business logic and output columns should remain the same - only fix the data accuracy issues. 
\ No newline at end of file +**Important**: The business logic and output columns should remain the same - only fix the data accuracy issues. + +**Note on payments**: A rental's revenue is determined by the rental itself — every payment recorded against a rental counts toward the customer who made that rental, regardless of which `customer_id` happens to appear on the payment row. \ No newline at end of file diff --git a/tasks/postgres/standard/employees/employee_demographics_report/description.md b/tasks/postgres/standard/employees/employee_demographics_report/description.md index c79cb6e2..380121b3 100644 --- a/tasks/postgres/standard/employees/employee_demographics_report/description.md +++ b/tasks/postgres/standard/employees/employee_demographics_report/description.md @@ -12,7 +12,7 @@ Generate a comprehensive employee demographics and basic statistics report for t * `age_group` (varchar) — age range ('20-29', '30-39', '40-49', '50-59', '60+') * `employee_count` (integer) — number of current employees in age group * `avg_salary` (decimal) — average current salary for age group - * `avg_tenure_days` (decimal) — average days of service + * `avg_tenure_days` (decimal) — average days of service as of the reference date `2002-08-01` 3. **Create the birth month distribution table** — build a table called `birth_month_distribution` in the `employees` schema with: * `birth_month` (integer) — month number (1-12) @@ -26,7 +26,7 @@ Generate a comprehensive employee demographics and basic statistics report for t * `still_employed` (integer) — how many from that year are still employed * `retention_rate` (decimal) — percentage still employed (still_employed/employees_hired * 100) -5. **Apply age group classification** based on current age: +5. **Apply age group classification** based on each employee's age as of the reference date `2002-08-01`.
Only include age groups that contain at least one current employee β€” empty buckets must not appear in `age_group_analysis`. * **20-29**: Ages 20-29 * **30-39**: Ages 30-39 * **40-49**: Ages 40-49 diff --git a/tasks/postgres/standard/employees/employee_demographics_report/verify.py b/tasks/postgres/standard/employees/employee_demographics_report/verify.py index ef059874..b039208c 100644 --- a/tasks/postgres/standard/employees/employee_demographics_report/verify.py +++ b/tasks/postgres/standard/employees/employee_demographics_report/verify.py @@ -119,7 +119,7 @@ def verify_age_group_results(conn) -> bool: SELECT e.id AS employee_id, e.hire_date, - EXTRACT(YEAR FROM AGE(CURRENT_DATE, e.birth_date))::INT AS age_years + EXTRACT(YEAR FROM AGE(DATE '2002-08-01', e.birth_date))::INT AS age_years FROM employees.employee e WHERE e.birth_date IS NOT NULL ) @@ -133,7 +133,7 @@ def verify_age_group_results(conn) -> bool: END AS age_group, COUNT(*)::INT AS employee_count, AVG(cs.amount) AS avg_salary, - AVG((CURRENT_DATE - a.hire_date)::INT) AS avg_tenure_days + AVG((DATE '2002-08-01' - a.hire_date)::INT) AS avg_tenure_days FROM emp_age a JOIN current_salary cs ON cs.employee_id = a.employee_id WHERE a.age_years >= 20 diff --git a/tasks/postgres/standard/employees/employee_performance_analysis/description.md b/tasks/postgres/standard/employees/employee_performance_analysis/description.md index fff8496b..2877f5c5 100644 --- a/tasks/postgres/standard/employees/employee_performance_analysis/description.md +++ b/tasks/postgres/standard/employees/employee_performance_analysis/description.md @@ -24,4 +24,8 @@ Create a comprehensive employee performance evaluation system that analyzes care 5. **Calculate salary equity metrics** β€” populate the department table with current salary statistics for active employees only to identify potential pay equity issues across departments. 
-The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies. \ No newline at end of file +The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies. + +### Important Notes + +- Do NOT use ROUND functions - keep the full precision of calculated values \ No newline at end of file diff --git a/tasks/postgres/standard/employees/employee_retention_analysis/description.md b/tasks/postgres/standard/employees/employee_retention_analysis/description.md index 3d5f70ae..05b17863 100644 --- a/tasks/postgres/standard/employees/employee_retention_analysis/description.md +++ b/tasks/postgres/standard/employees/employee_retention_analysis/description.md @@ -13,7 +13,7 @@ Analyze employee retention patterns and identify factors contributing to turnove * `employee_id` (bigint) — the employee's ID * `full_name` (varchar) — concatenated first and last name * `current_department` (varchar) — current department name - * `tenure_days` (integer) — days with the company + * `tenure_days` (integer) — days with the company as of the reference date `2002-08-01` * `current_salary` (integer) — current salary amount * `risk_category` (varchar) — risk level ('high_risk', 'medium_risk', 'low_risk') @@ -25,7 +25,7 @@ Analyze employee retention patterns and identify factors contributing to turnove * `avg_tenure_days` (decimal) — average tenure in days for employees who left that year * `avg_final_salary` (decimal) — average final salary of departed employees that year -4.
**Apply risk assessment criteria** for current employees (measure tenure as of the reference date `2002-08-01`): * **High risk**: Employees in departments with retention rate < 80% AND tenure < 1095 days (3 years) * **Medium risk**: Employees in departments with retention rate < 85% AND tenure < 1825 days (5 years) * **Low risk**: All other current employees diff --git a/tasks/postgres/standard/employees/employee_retention_analysis/verify.py b/tasks/postgres/standard/employees/employee_retention_analysis/verify.py index 744cc33f..913f5be0 100644 --- a/tasks/postgres/standard/employees/employee_retention_analysis/verify.py +++ b/tasks/postgres/standard/employees/employee_retention_analysis/verify.py @@ -136,11 +136,11 @@ def verify_high_risk_results(conn) -> bool: e.id AS employee_id, CONCAT(e.first_name, ' ', e.last_name) AS full_name, d.dept_name AS current_department, - (CURRENT_DATE - e.hire_date)::INTEGER AS tenure_days, + (DATE '2002-08-01' - e.hire_date)::INTEGER AS tenure_days, cs.current_amount::INTEGER AS current_salary, CASE - WHEN dr.retention_rate < 80 AND (CURRENT_DATE - e.hire_date) < 1095 THEN 'high_risk' - WHEN dr.retention_rate < 85 AND (CURRENT_DATE - e.hire_date) < 1825 THEN 'medium_risk' + WHEN dr.retention_rate < 80 AND (DATE '2002-08-01' - e.hire_date) < 1095 THEN 'high_risk' + WHEN dr.retention_rate < 85 AND (DATE '2002-08-01' - e.hire_date) < 1825 THEN 'medium_risk' ELSE 'low_risk' END AS risk_category FROM employees.employee e diff --git a/tasks/postgres/standard/employees/executive_dashboard_automation/description.md b/tasks/postgres/standard/employees/executive_dashboard_automation/description.md index 490cdc41..4f00e5d4 100644 --- a/tasks/postgres/standard/employees/executive_dashboard_automation/description.md +++ b/tasks/postgres/standard/employees/executive_dashboard_automation/description.md @@ -8,7 +8,7 @@ Design a comprehensive reporting and automation system for executive dashboard a * `department_name` (varchar) β€” department 
name * `total_employees` (integer) β€” current active employee count * `avg_salary` (decimal) β€” average current salary - * `total_payroll` (bigint) β€” total monthly payroll cost + * `total_payroll` (bigint) β€” sum of current salary amounts for active employees in the department * `manager_name` (varchar) β€” current department manager name **View 2: `exec_hiring_trends`** diff --git a/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py b/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py index 1a34fcc9..3988358b 100644 --- a/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py +++ b/tasks/postgres/standard/employees/executive_dashboard_automation/verify.py @@ -339,12 +339,15 @@ def verify_materialized_views(conn) -> bool: def verify_stored_procedures(conn) -> bool: """Verify that stored procedure was created.""" with conn.cursor() as cur: - # Check if procedure exists + # Check if the routine exists in pg_proc. pg_proc lists both + # FUNCTION and PROCEDURE entries, so we don't have to filter on + # type β€” accepts either form of "stored procedure". cur.execute(""" - SELECT routine_name FROM information_schema.routines - WHERE routine_schema = 'employees' - AND routine_type = 'FUNCTION' - AND routine_name = 'generate_monthly_report' + SELECT p.proname + FROM pg_proc p + JOIN pg_namespace n ON n.oid = p.pronamespace + WHERE n.nspname = 'employees' + AND p.proname = 'generate_monthly_report' """) procedures = [row[0] for row in cur.fetchall()] diff --git a/tasks/postgres/standard/lego/consistency_enforcement/description.md b/tasks/postgres/standard/lego/consistency_enforcement/description.md index f02a21cc..5e6dc6f7 100644 --- a/tasks/postgres/standard/lego/consistency_enforcement/description.md +++ b/tasks/postgres/standard/lego/consistency_enforcement/description.md @@ -4,7 +4,7 @@ Implement a data consistency enforcement system for the LEGO database. 
The syste For any given `set_num`, the following invariant must be maintained: `lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false -**Important**: If a set has no inventory records, the consistency check should be skipped. +**Important**: If a set has no inventory records (or no non-spare parts in its latest inventory), treat the actual part count as `0`. The consistency check still applies β€” `num_parts` must equal `0` for such sets. # Your Tasks: @@ -15,7 +15,7 @@ Write a single `SELECT` query to find all sets where the stored `num_parts` does 1. **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table. 2. **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false. -3. **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum. +3. **Compare and Filter**: `LEFT JOIN` this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum, using `COALESCE(actual_parts, 0)` so that sets without inventory are also surfaced when their `num_parts` is non-zero. ## Task 2: Fix Existing Inconsistencies diff --git a/tasks/postgres/standard/lego/consistency_enforcement/verify.py b/tasks/postgres/standard/lego/consistency_enforcement/verify.py index d0222cc4..c1aaa23d 100644 --- a/tasks/postgres/standard/lego/consistency_enforcement/verify.py +++ b/tasks/postgres/standard/lego/consistency_enforcement/verify.py @@ -1,6 +1,5 @@ """ Verification script for PostgreSQL LEGO Task 1: Parts Consistency Fix & Constraints -Version 2.1: Relaxed consistency check to allow for one known corner case mismatch. 
""" import os @@ -82,17 +81,15 @@ def get_mismatch_count(cur) -> int: def verify_data_consistency(conn) -> bool: """ TASK 1 VERIFICATION: Checks if the initial data fix was successful. - (Relaxed: Allows for one corner-case mismatch). """ - print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --") + print("\n-- Verifying Task 1: Data Consistency Fix --") with conn.cursor() as cur: count = get_mismatch_count(cur) - # RELAXED CONDITION: Allow 0 or 1 mismatch to pass. - if count > 1: - print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.") + if count > 0: + print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 after fix.") return False - - print("βœ… PASS: Data consistency check passed (allowing for one known mismatch).") + + print("βœ… PASS: All sets have consistent part counts.") return True diff --git a/tasks/postgres/standard/lego/database_security_policies/verify.py b/tasks/postgres/standard/lego/database_security_policies/verify.py index 0a019c95..16384485 100644 --- a/tasks/postgres/standard/lego/database_security_policies/verify.py +++ b/tasks/postgres/standard/lego/database_security_policies/verify.py @@ -183,16 +183,19 @@ def test_theme_analyst_access(conn) -> bool: return False print("βœ… PASS: Reference tables appear to be accessible.") - # Test 4 & 5: Check related tables + # Test 4 & 5: Check related tables β€” counts must match exactly + # what is reachable through theme_id=18 (Star Wars: 65081-1 + K8008-1). 
cur.execute("SELECT COUNT(*) FROM lego_inventories;") - if cur.fetchone()[0] == 0: - print("❌ FAIL: No inventories are visible for the allowed sets.") + inv_count = cur.fetchone()[0] + if inv_count != 2: + print(f"❌ FAIL: Expected 2 inventories for Star Wars sets, got {inv_count}.") cur.execute("RESET ROLE;") return False - + cur.execute("SELECT COUNT(*) FROM lego_inventory_parts;") - if cur.fetchone()[0] == 0: - print("❌ FAIL: No inventory parts are visible for the allowed sets.") + parts_count = cur.fetchone()[0] + if parts_count != 3: + print(f"❌ FAIL: Expected 3 inventory parts for Star Wars sets, got {parts_count}.") cur.execute("RESET ROLE;") return False print("βœ… PASS: Related tables (inventories, inventory_parts) are correctly filtered.") diff --git a/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md b/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md index 9b5deda9..126ff5bb 100644 --- a/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md +++ b/tasks/postgres/standard/lego/transactional_inventory_transfer/description.md @@ -36,9 +36,11 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set - Verify both inventory IDs exist in `lego_inventories` table - Verify part exists in `lego_parts` table - Verify color exists in `lego_colors` table - - Check source has sufficient quantity (including spare parts) + - Check the source's non-spare row for this `(part_num, color_id)` has sufficient quantity - Prevent self-transfers (source and target cannot be the same) + *Note: The function operates on non-spare rows only (`is_spare = false`).* + **Validation B: Business Rules** - Maximum transfer quantity is 500 parts per operation - Minimum transfer quantity is 1 part @@ -52,26 +54,22 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set - Calculate transfer feasibility **Step B: Source Inventory Update** - - Decrease quantity in 
source inventory + - Decrease quantity on the source's non-spare row - If quantity becomes zero, delete the row - - Handle spare parts appropriately (maintain `is_spare` flag) **Step C: Target Inventory Update** - - Check if part exists in target inventory + - Check if a non-spare row for `(part_num, color_id)` exists in target inventory - If exists: increase quantity - - If not exists: insert new record - - Handle spare parts appropriately + - If not exists: insert a new non-spare row (`is_spare = false`) **Step D: Audit Logging** - Log successful transfers with details - - Log failed transfers with error messages - Include transfer reason and status 5. **Error handling requirements**: - Use `RAISE EXCEPTION` with descriptive error messages - Handle all validation failures gracefully - Ensure complete rollback on any failure - - Log all attempts (successful and failed) 6. **Return value**: - Return success message: `'Successfully transferred {quantity} parts ({part_num}, color_id: {color_id}) from inventory {source_id} to inventory {target_id}. 
Reason: {reason}'` @@ -81,7 +79,7 @@ Create a PostgreSQL function to handle inventory part transfers between LEGO set - **Transaction Safety**: All operations wrapped in transaction block - **Data Integrity**: No partial updates possible -- **Audit Trail**: Complete logging of all transfer attempts +- **Audit Trail**: Logging of successful transfer attempts - **Validation**: Comprehensive input and business rule validation - **Error Recovery**: Failed transfers leave database unchanged - **Performance**: Use appropriate locking to prevent race conditions @@ -105,8 +103,7 @@ SELECT transfer_parts(14469, 14469, '3024', 15, 10, 'self_transfer'); ## Verification Criteria: - Function handles all validation rules correctly -- Audit logging captures all transfer attempts -- Failed transfers are properly logged with error details +- Audit logging captures successful transfer attempts - Self-transfers are prevented - Quantity limits are enforced - Database state remains consistent after failures \ No newline at end of file diff --git a/tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py b/tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py index 933e1a87..49a2fbf8 100644 --- a/tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py +++ b/tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py @@ -293,7 +293,26 @@ def verify_business_rule_validation(conn) -> bool: finally: conn.rollback() # Rollback after third test - return test1_passed and test2_passed and test3_passed + # Test 4: Negative transfer quantity (should fail) + print("Test 4: Negative transfer quantity (should fail)") + test4_passed = False + try: + with conn.cursor() as cur: + cur.execute( + "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", + (14469, 14686, '3024', 15, -5, 'negative_transfer') + ) + result = cur.fetchone() + print(f"❌ FAIL: Negative transfer should have failed but succeeded: {result[0]}") + except psycopg2.Error: + 
print(f"βœ… PASS: Negative transfer correctly failed") + test4_passed = True + except Exception as e: + print(f"❌ FAIL: Negative transfer test failed with unexpected error: {e}") + finally: + conn.rollback() + + return test1_passed and test2_passed and test3_passed and test4_passed def verify_insufficient_quantity_error(conn) -> bool: @@ -341,49 +360,43 @@ def verify_insufficient_quantity_error(conn) -> bool: def verify_invalid_inventory_error(conn) -> bool: - """Test that transfer fails with invalid inventory IDs.""" - print("\n-- Verifying Invalid Inventory Error --") - passed = False - try: - source_id = 99999 # Non-existent inventory - target_id = 14686 - part_num = '3024' - color_id = 15 - transfer_qty = 10 - reason = 'invalid_test' - - target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id) - - with conn.cursor() as cur: - try: + """Test that transfer fails when any of inventory_id / part_num / color_id is invalid.""" + print("\n-- Verifying Invalid Reference Errors --") + + # Each case: (label, source_id, target_id, part_num, color_id) + cases = [ + ("invalid source inventory", 99999, 14686, '3024', 15), + ("invalid target inventory", 14469, 99999, '3024', 15), + ("invalid part_num", 14469, 14686, 'NOT_A_REAL_PART', 15), + ("invalid color_id", 14469, 14686, '3024', 99999), + ] + transfer_qty = 10 + reason = 'invalid_test' + all_passed = True + for label, source_id, target_id, part_num, color_id in cases: + try: + with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, transfer_qty, reason) ) result = cur.fetchone() - print(f"❌ FAIL: Transfer should have failed but succeeded: {result[0]}") - except psycopg2.Error as e: - print(f"βœ… PASS: Transfer correctly failed with an exception.") - # Rollback the aborted transaction - conn.rollback() - - target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id) - if target_final != target_initial: - 
print(f"❌ FAIL: Target quantity changed from {target_initial} to {target_final}") - else: - print("βœ… PASS: Database state unchanged after invalid inventory error") - passed = True - finally: - conn.rollback() - return passed + print(f"❌ FAIL ({label}): Transfer should have failed but succeeded: {result[0]}") + all_passed = False + except psycopg2.Error: + print(f"βœ… PASS ({label}): Transfer correctly failed with an exception.") + finally: + conn.rollback() + return all_passed def verify_audit_logging(conn) -> bool: """ - Test that audit logging captures both successful and failed transfers. - This function uses commits to separate test cases and work around the - transactional paradox of logging a failure within a transaction that - is about to be rolled back by the client. + Test audit logging behavior: + - Part 1: a successful transfer must produce a log row within the transaction. + - Part 2: if the function raises (e.g., self-transfer), the whole transaction + rolls back β€” any log row the function may have written disappears too. + This is standard PostgreSQL transaction semantics for RAISE EXCEPTION. """ print("\n-- Verifying Audit Logging --") @@ -432,9 +445,7 @@ def verify_audit_logging(conn) -> bool: "SELECT transfer_parts(14469, 14469, '3024', 15, 5, 'audit_test_fail')" ) except psycopg2.Error: - # This is the expected failure path. - # The function should have logged the failure before raising the error. - # Now, we check the log table. + # Expected: self-transfer raises an exception, aborting the transaction. pass # The transaction is now in an aborted state. We must rollback to issue new commands. 
@@ -465,13 +476,13 @@ def verify_exact_quantity_transfer(conn) -> bool: target_id = 14686 # Use a fixed target inventory try: - # Find a part with a small quantity that doesn't conflict with the target inventory + # Find a non-spare part with a small quantity that doesn't conflict with the target inventory with conn.cursor() as cur: cur.execute( """ SELECT inventory_id, part_num, color_id, quantity FROM public.lego_inventory_parts - WHERE quantity BETWEEN 5 AND 20 AND inventory_id != %s + WHERE quantity BETWEEN 5 AND 20 AND inventory_id != %s AND is_spare = false LIMIT 1 """, (target_id,) diff --git a/tasks/postgres/standard/security/rls_business_access/description.md b/tasks/postgres/standard/security/rls_business_access/description.md index be00f6b7..8e49b5d5 100644 --- a/tasks/postgres/standard/security/rls_business_access/description.md +++ b/tasks/postgres/standard/security/rls_business_access/description.md @@ -18,13 +18,13 @@ Build RLS policies for a social platform where users create posts and comments i - **DELETE**: Only channel owners can delete channels ### 3. Posts Table Access Rules: -- **SELECT**: Users can read all posts in channels they have access to +- **SELECT**: Users can read all posts in channels they have access to. (Authors do NOT get a separate read privilege β€” visibility is determined solely by channel accessibility.) - **INSERT**: Authenticated users can create posts in any channel - **UPDATE**: Post authors OR channel moderators OR channel owners can edit posts - **DELETE**: Post authors OR channel moderators OR channel owners can delete posts ### 4. Comments Table Access Rules: -- **SELECT**: Users can read comments on posts they can access +- **SELECT**: Users can read comments on posts they can access. (Comment authors do NOT get a separate read privilege β€” visibility follows the post's channel accessibility only.) 
- **INSERT**: Authenticated users can comment on posts they can see - **UPDATE**: Comment authors OR post authors OR channel moderators OR channel owners can edit comments - **DELETE**: Comment authors OR post authors OR channel moderators OR channel owners can delete comments @@ -36,7 +36,7 @@ Build RLS policies for a social platform where users create posts and comments i ## Session Context: -Use `current_setting('app.current_user_id')` to get the current user ID from session context. +The session sets `app.current_user_id` to the user's UUID, or `''` for anonymous users. Use the pre-created helper `app_current_user_id()` in your policies — it returns the UUID or `NULL` for anonymous (a raw `::UUID` cast on the empty string would error). ## Schema Requirements: @@ -48,11 +48,12 @@ Use `current_setting('app.current_user_id')` to get the current user ID from ses 1. **Enable RLS** on all five tables 2. **Create policies** for SELECT, INSERT, UPDATE, DELETE operations on each table -3. **Helper functions** to check permissions efficiently: +3. **Helper functions are pre-created** — use them in your policies: + - `app_current_user_id()` — returns the current user UUID (or `NULL` for anonymous) - `is_channel_owner(channel_id, user_id)` - `is_channel_moderator(channel_id, user_id)` - `can_moderate_channel(channel_id, user_id)` -4. **Proper indexing** to ensure RLS policies perform well +4. **Performance indexes are pre-created** — focus on writing correct, efficient policies.
## Test Scenarios: diff --git a/tasks/postgres/standard/security/rls_business_access/prepare_environment.py b/tasks/postgres/standard/security/rls_business_access/prepare_environment.py index 645fc608..3c3cecf8 100644 --- a/tasks/postgres/standard/security/rls_business_access/prepare_environment.py +++ b/tasks/postgres/standard/security/rls_business_access/prepare_environment.py @@ -25,6 +25,18 @@ def setup_rls_environment(): conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() + # Defensive cleanup: ROLEs are cluster-level objects and survive + # `DROP DATABASE` of a previous task run. Drop stale `test_user` + # (created by verify.py) so re-runs don't trip over it. + try: + cur.execute("DROP OWNED BY test_user CASCADE;") + except psycopg2.Error: + pass # role may not exist or own nothing + try: + cur.execute("DROP ROLE IF EXISTS test_user;") + except psycopg2.Error as e: + print(f"⚠ Could not drop stale role test_user: {e}") + # 1. Users Table (with correct field name for verification) cur.execute(""" CREATE TABLE IF NOT EXISTS users ( diff --git a/tasks/postgres/standard/security/rls_business_access/verify.py b/tasks/postgres/standard/security/rls_business_access/verify.py index 8fef4527..9bd71a7f 100644 --- a/tasks/postgres/standard/security/rls_business_access/verify.py +++ b/tasks/postgres/standard/security/rls_business_access/verify.py @@ -94,7 +94,10 @@ def verify_rls_implementation(): SET email = 'alice.updated@example.com' WHERE id = '11111111-1111-1111-1111-111111111111' """) - test_results.append("βœ“ Users can update their own profile") + if cur.rowcount > 0: + test_results.append("βœ“ Users can update their own profile") + else: + test_results.append("βœ— User update affected 0 rows (RLS too restrictive on own profile)") except Exception as e: test_results.append(f"βœ— User cannot update own profile: {e}") @@ -125,7 +128,10 @@ def verify_rls_implementation(): SET description = 'Updated by Alice' WHERE id = 
'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' """) - test_results.append("βœ“ Channel owners can update their channels") + if cur.rowcount > 0: + test_results.append("βœ“ Channel owners can update their channels") + else: + test_results.append("βœ— Channel owner update affected 0 rows (RLS too restrictive on own channel)") except Exception as e: test_results.append(f"βœ— Channel owner cannot update channel: {e}") @@ -156,7 +162,10 @@ def verify_rls_implementation(): SET title = 'Updated by Alice' WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd' """) - test_results.append("βœ“ Post authors can update their posts") + if cur.rowcount > 0: + test_results.append("βœ“ Post authors can update their posts") + else: + test_results.append("βœ— Post author update affected 0 rows (RLS too restrictive on own post)") except Exception as e: test_results.append(f"βœ— Post author cannot update post: {e}") @@ -168,7 +177,10 @@ def verify_rls_implementation(): SET content = 'Moderated by Bob' WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd' """) - test_results.append("βœ“ Channel moderators can update posts in their channels") + if cur.rowcount > 0: + test_results.append("βœ“ Channel moderators can update posts in their channels") + else: + test_results.append("βœ— Moderator update affected 0 rows (RLS too restrictive on moderator path)") except Exception as e: test_results.append(f"βœ— Channel moderator cannot update post: {e}") @@ -199,7 +211,10 @@ def verify_rls_implementation(): SET content = 'Updated by Bob himself' WHERE id = '99999999-9999-9999-9999-999999999999' """) - test_results.append("βœ“ Comment authors can update their comments") + if cur.rowcount > 0: + test_results.append("βœ“ Comment authors can update their comments") + else: + test_results.append("βœ— Comment author update affected 0 rows (RLS too restrictive on own comment)") except Exception as e: test_results.append(f"βœ— Comment author cannot update comment: {e}") @@ -211,7 +226,10 @@ def 
verify_rls_implementation(): SET content = 'Moderated by post author Alice' WHERE id = '99999999-9999-9999-9999-999999999999' """) - test_results.append("βœ“ Post authors can moderate comments on their posts") + if cur.rowcount > 0: + test_results.append("βœ“ Post authors can moderate comments on their posts") + else: + test_results.append("βœ— Post author moderation affected 0 rows (RLS too restrictive on post-author path)") except Exception as e: test_results.append(f"βœ— Post author cannot moderate comment: {e}") @@ -247,20 +265,29 @@ def verify_rls_implementation(): # Test 7: Content visibility based on user context print("\n7. Testing content visibility...") - # Count posts visible to Alice + # Count posts visible to Alice (general+tech-talk public channels: 3 posts) cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice cur.execute("SELECT COUNT(*) FROM posts;") alice_posts = cur.fetchone()[0] - # Count posts visible to Eve + # Count posts visible to Eve (same two public channels: 3 posts; she does NOT see her own + # post in the private 'random' channel since she's not owner/moderator there) cur.execute("SET app.current_user_id = '55555555-5555-5555-5555-555555555555';") # Eve cur.execute("SELECT COUNT(*) FROM posts;") eve_posts = cur.fetchone()[0] - if alice_posts >= 2 and eve_posts >= 1: # Alice should see posts in channels she has access to + # Count posts visible to Charlie (owner of private 'random' channel: 3 public + 1 own = 4) + cur.execute("SET app.current_user_id = '33333333-3333-3333-3333-333333333333';") # Charlie + cur.execute("SELECT COUNT(*) FROM posts;") + charlie_posts = cur.fetchone()[0] + + if alice_posts == 3 and eve_posts == 3 and charlie_posts == 4: test_results.append("βœ“ Content visibility varies correctly based on user context") else: - test_results.append(f"βœ— Content visibility issue: Alice sees {alice_posts}, Eve sees {eve_posts}") + test_results.append( + f"βœ— Content visibility issue: Alice 
sees {alice_posts} (expected 3), " + f"Eve sees {eve_posts} (expected 3), Charlie sees {charlie_posts} (expected 4)" + ) # Test 8: Anonymous user access print("\n8. Testing anonymous user restrictions...") @@ -271,9 +298,10 @@ def verify_rls_implementation(): anon_users = cur.fetchone()[0] # Anonymous users should be able to see public user profiles per requirements - # Count public users that should be visible + # Count public users that should be visible (re-counted under anon's view; with + # correct RLS this equals the total number of public users) cur.execute("SELECT COUNT(*) FROM users WHERE is_public = true;") - public_users = cur.fetchone()[0] if cur.rowcount > 0 else 0 + public_users = cur.fetchone()[0] if anon_users == public_users and anon_users > 0: test_results.append(f"βœ“ Anonymous users can see {anon_users} public user profiles (correct)") @@ -282,7 +310,9 @@ def verify_rls_implementation(): else: test_results.append(f"βœ— Anonymous users can see {anon_users} users but expected {public_users} public users") except Exception as e: - test_results.append("βœ“ Anonymous users properly restricted") + # An exception here usually means the policy failed to handle empty/NULL session + # context (e.g., `current_setting('app.current_user_id')::UUID` on empty string). + test_results.append(f"βœ— Anonymous-user query raised an exception: {e}") # Print results print("\n" + "="*60) diff --git a/tasks/postgres/standard/security/user_permission_audit/description.md b/tasks/postgres/standard/security/user_permission_audit/description.md index 59a566bb..d0bd751a 100644 --- a/tasks/postgres/standard/security/user_permission_audit/description.md +++ b/tasks/postgres/standard/security/user_permission_audit/description.md @@ -10,9 +10,9 @@ You've been hired as a security consultant to audit the PostgreSQL database perm 2. **Catalog all database users and roles**: Use `pg_user`, `pg_roles`, and `pg_auth_members` to find all accounts 3. 
**Analyze current permissions**: Use `information_schema.table_privileges` to map permissions 4. **Identify security issues**: - - **Dangling users**: Inactive accounts that should be removed + - **Dangling users**: A *dangling user* is a database role that has been granted privileges on one or more business tables but is **not** assigned to any of the expected business roles in `USER_ROLE` below. (This definition naturally excludes PostgreSQL system roles such as `postgres`, `pg_read_all_data`, etc., since they aren't granted on business tables directly.) - **Missing permissions**: Users lacking permissions required for their business role - - **Excessive permissions**: Users with unnecessary permissions that should be revoked + - **Excessive permissions**: Any privilege that does not belong to the user's expected business role. **This includes every grant currently held by a dangling user** — each grant must still be reported as a separate `EXCESSIVE_PERMISSION` row in addition to the per-user `DANGLING_USER` row. 
## Expected permissions by role (what they SHOULD have) @@ -95,8 +95,8 @@ CREATE TABLE security_audit_results ( audit_id SERIAL PRIMARY KEY, audit_type VARCHAR(50) NOT NULL, -- 'DANGLING_USERS', 'MISSING_PERMISSIONS', 'EXCESSIVE_PERMISSIONS' total_issues INTEGER NOT NULL, - users_affected INTEGER NOT NULL, - tables_affected INTEGER NOT NULL + users_affected INTEGER NOT NULL, -- COUNT(DISTINCT username) for this audit_type + tables_affected INTEGER NOT NULL -- COUNT(DISTINCT table_name) for this audit_type; NULL table_name does not count (so DANGLING_USERS is 0) ); ``` @@ -108,7 +108,7 @@ CREATE TABLE security_audit_details ( issue_type VARCHAR(50) NOT NULL, -- 'DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION' table_name VARCHAR(50), -- NULL for dangling users permission_type VARCHAR(20), -- 'SELECT', 'INSERT', 'UPDATE', 'DELETE', NULL for dangling users - expected_access BOOLEAN NOT NULL -- TRUE if user should have access, FALSE if should not + expected_access BOOLEAN NOT NULL -- TRUE if user should have access (MISSING_PERMISSION); FALSE otherwise (EXCESSIVE_PERMISSION, DANGLING_USER) ); ``` @@ -118,18 +118,4 @@ Your audit should populate both tables with: - **Summary data**: High-level counts of different types of security issues - **Detailed findings**: Specific permission gaps for each user and table combination -## Business Role Expectations - -Analyze usernames and infer their intended business roles based on naming patterns: - -- **analytics_user** → Analytics Team (needs user behavior and statistics data) -- **marketing_user** → Marketing Department (needs customer and product data for campaigns) -- **customer_service** → Customer Service (needs user profiles and order management) -- **finance_user** → Finance Team (needs financial and order data) -- **product_manager** → Product Management (needs full product catalog access) -- **security_auditor** → Security Team (needs audit logs and credential data) -- **developer_user** → 
Development Team (needs limited access for testing) -- **backup_user** → Backup Service (needs read-only access to all business data) -- **temp_contractor, old_employee, test_account** → Inactive/Temporary (should have NO permissions) - The verification process will check that your findings correctly identify the actual permission gaps in the system by comparing against expected results. diff --git a/tasks/postgres/standard/security/user_permission_audit/verify.py b/tasks/postgres/standard/security/user_permission_audit/verify.py index 13f1c2af..3bb5d908 100644 --- a/tasks/postgres/standard/security/user_permission_audit/verify.py +++ b/tasks/postgres/standard/security/user_permission_audit/verify.py @@ -97,6 +97,34 @@ def verify_security_audit(): } } + # Validate structure first — if the schema is wrong, abort before parsing. + structure_valid = True + for i, finding in enumerate(findings): + if len(finding) != 6: + print(f"| FAIL: Finding {i + 1} has wrong number of columns (expected 6, got {len(finding)})") + structure_valid = False + continue + + _detail_id, username, issue_type, _table_name, _permission_type, expected_access = finding + + if not username: + print(f"| FAIL: Finding {i + 1} missing username") + structure_valid = False + + if issue_type not in ['DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION']: + print(f"| FAIL: Finding {i + 1} invalid issue_type: {issue_type}") + structure_valid = False + + if expected_access not in [True, False]: + print(f"| FAIL: Finding {i + 1} invalid expected_access: {expected_access}") + structure_valid = False + + if not structure_valid: + print("| FAIL: aborting further checks because finding rows have invalid structure") + return False + + print("| βœ“ structure is valid") + found_dangling = set() found_missing_permissions = set() found_excessive_permissions = set() @@ -130,54 +158,42 @@ def verify_security_audit(): missing_excessive_perms = expected_findings['excessive_permissions'] - 
found_excessive_permissions extra_excessive_perms = found_excessive_permissions - expected_findings['excessive_permissions'] - # Validate structure - structure_valid = True - for i, finding in enumerate(findings): - if len(finding) != 6: # Should have 6 columns - print(f"| FAIL: Finding {i + 1} has wrong number of columns (expected 6, got {len(finding)})") - structure_valid = False - continue - - detail_id, username, issue_type, table_name, permission_type, expected_access = finding - - if not username: - print(f"| FAIL: Finding {i + 1} missing username") - structure_valid = False - - if issue_type not in ['DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION']: - print(f"| FAIL: Finding {i + 1} invalid issue_type: {issue_type}") - structure_valid = False - - if expected_access not in [True, False]: - print(f"| FAIL: Finding {i + 1} invalid expected_access: {expected_access}") - structure_valid = False - - if structure_valid: - print(f"| βœ“ structure is valid") - # Check for missing findings all_correct = True print(f"| Expected dangling users: {expected_findings['dangling_users']} Found: {found_dangling}") if missing_dangling: - print(f"| Missing dangling users: {missing_dangling}") + print(f"| FAIL: Missing dangling users (not reported): {missing_dangling}") + all_correct = False + if extra_dangling: + print(f"| FAIL: Unexpected dangling users (extra): {extra_dangling}") all_correct = False print( - f"| Expected missing permissions: {len(expected_findings['missing_permissions'])} Found: {len(found_missing_permissions)} Missing: {len(missing_missing_perms)}") + f"| Expected missing permissions: {len(expected_findings['missing_permissions'])} Found: {len(found_missing_permissions)} Not reported: {len(missing_missing_perms)} Extra: {len(extra_missing_perms)}") if missing_missing_perms: - print(f"| Missing 'missing permission' findings:") + print(f"| FAIL: Missing 'missing permission' findings:") for perm in sorted(missing_missing_perms): print(f"| - 
{perm[0]} should be granted {perm[2]} on {perm[1]}") all_correct = False + if extra_missing_perms: + print(f"| FAIL: Unexpected 'missing permission' findings:") + for perm in sorted(extra_missing_perms): + print(f"| - {perm[0]} / {perm[1]} / {perm[2]} (not expected as missing)") + all_correct = False print( - f"| Expected excessive permissions: {len(expected_findings['excessive_permissions'])} Found: {len(found_excessive_permissions)} Missing: {len(missing_excessive_perms)}") + f"| Expected excessive permissions: {len(expected_findings['excessive_permissions'])} Found: {len(found_excessive_permissions)} Not reported: {len(missing_excessive_perms)} Extra: {len(extra_excessive_perms)}") if missing_excessive_perms: - print(f"| Missing 'excessive permission' findings:") + print(f"| FAIL: Missing 'excessive permission' findings:") for perm in sorted(missing_excessive_perms): print(f"| - {perm[0]} should have {perm[2]} revoked on {perm[1]}") all_correct = False + if extra_excessive_perms: + print(f"| FAIL: Unexpected 'excessive permission' findings:") + for perm in sorted(extra_excessive_perms): + print(f"| - {perm[0]} / {perm[1]} / {perm[2]} (not expected as excessive)") + all_correct = False # Check audit summary table cur.execute( @@ -204,13 +220,19 @@ def verify_security_audit(): else: print(f"| βœ“ {audit_type} summary matches expected values") - # Assert exact counts match expected - assert len(found_dangling) == 3, f"Expected 3 dangling users, found {len(found_dangling)}" - assert len(found_missing_permissions) == 13, f"Expected 13 missing permissions, found {len(found_missing_permissions)}" - assert len(found_excessive_permissions) == 13, f"Expected 13 excessive permissions, found {len(found_excessive_permissions)}" - - if all_correct and structure_valid and summary_correct: - print("| βœ“ All assertions passed") + # Exact-count sanity checks (catch e.g. duplicate findings that summary missed). 
+ count_correct = True + for label, found_set, expected_count in [ + ("dangling users", found_dangling, 3), + ("missing permissions", found_missing_permissions, 13), + ("excessive permissions", found_excessive_permissions, 13), + ]: + if len(found_set) != expected_count: + print(f"| FAIL: Expected {expected_count} {label}, found {len(found_set)}") + count_correct = False + + if all_correct and structure_valid and summary_correct and count_correct: + print("| βœ“ All checks passed") return True else: return False diff --git a/tasks/postgres/standard/vectors/dba_vector_analysis/description.md b/tasks/postgres/standard/vectors/dba_vector_analysis/description.md index 97936c0e..ccff2e0a 100644 --- a/tasks/postgres/standard/vectors/dba_vector_analysis/description.md +++ b/tasks/postgres/standard/vectors/dba_vector_analysis/description.md @@ -54,7 +54,7 @@ CREATE TABLE vector_analysis_columns ( column_name VARCHAR(100), dimensions INTEGER, data_type VARCHAR(50), - has_constraints BOOLEAN, + has_constraints BOOLEAN, -- true if the column has any non-default constraint (NOT NULL, CHECK, FK); false otherwise rows BIGINT ); ``` diff --git a/tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql b/tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql index 12995a3a..4e4a74cc 100644 --- a/tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql +++ b/tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql @@ -3,7 +3,7 @@ /* ================================================================================ -EXPECTED VECTOR DATABASE STRUCTURE (created by vectors_setup.py) +EXPECTED VECTOR DATABASE STRUCTURE (created by prepare_environment.py) ================================================================================ Tables with Vector Columns: diff --git a/tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py b/tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py index 
52a981cd..42ecbb58 100644 --- a/tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py +++ b/tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py @@ -1,26 +1,500 @@ """ Environment preparation script for Vector Database DBA Analysis task. -This script imports and uses the shared vector database setup utilities. +Sets up a PostgreSQL database with the pgvector extension, sample RAG-style +tables (documents / document_chunks / user_queries plus three non-vector +metadata tables), HNSW indexes on the vector columns, and sample data. """ -import sys +import os import logging -from pathlib import Path +import psycopg2 +import json +import random +import numpy as np +from typing import List -# Add the vectors directory to import the shared utilities -sys.path.append(str(Path(__file__).resolve().parents[1])) +logger = logging.getLogger(__name__) -from vectors_setup import prepare_vector_environment -logger = logging.getLogger(__name__) +def get_connection_params(): + """Get database connection parameters from environment variables.""" + return { + 'host': os.getenv('POSTGRES_HOST', 'localhost'), + 'port': os.getenv('POSTGRES_PORT', '5432'), + 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), + 'password': os.getenv('POSTGRES_PASSWORD', 'password'), + 'database': os.getenv('POSTGRES_DATABASE', 'postgres') + } + + +def generate_mock_embedding(dimensions: int = 1536) -> List[float]: + """Generate a mock embedding vector with specified dimensions.""" + # Generate random values between -1 and 1, then normalize + vector = np.random.uniform(-1, 1, dimensions) + # Normalize to unit vector (common practice for embeddings) + norm = np.linalg.norm(vector) + if norm > 0: + vector = vector / norm + return vector.tolist() + + +def create_vector_extension(): + """Create the pgvector extension.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + + with conn.cursor() as cur: + 
logger.info("Creating pgvector extension...") + cur.execute("CREATE EXTENSION IF NOT EXISTS vector;") + logger.info("pgvector extension created successfully") + + conn.close() + + except psycopg2.Error as e: + logger.error(f"Failed to create pgvector extension: {e}") + raise + + +def create_vector_tables(): + """Create sample tables with vector columns for RAG applications.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + + with conn.cursor() as cur: + logger.info("Creating vector database tables...") + + # Create documents table for document embeddings + cur.execute(""" + CREATE TABLE IF NOT EXISTS documents ( + id SERIAL PRIMARY KEY, + title TEXT NOT NULL, + content TEXT NOT NULL, + source_url TEXT, + document_type VARCHAR(50) DEFAULT 'article', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + word_count INTEGER, + embedding vector(1536) + ); + """) + + # Create chunks table for document chunks (common in RAG) + cur.execute(""" + CREATE TABLE IF NOT EXISTS document_chunks ( + id SERIAL PRIMARY KEY, + document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE, + chunk_index INTEGER NOT NULL, + chunk_text TEXT NOT NULL, + chunk_size INTEGER, + overlap_size INTEGER DEFAULT 0, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + embedding vector(1536) + ); + """) + + # Create queries table for storing user queries and their embeddings + cur.execute(""" + CREATE TABLE IF NOT EXISTS user_queries ( + id SERIAL PRIMARY KEY, + query_text TEXT NOT NULL, + user_id VARCHAR(100), + session_id VARCHAR(100), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + response_time_ms INTEGER, + embedding vector(1536) + ); + """) + + # Create embeddings metadata table + cur.execute(""" + CREATE TABLE IF NOT EXISTS embedding_models ( + id SERIAL PRIMARY KEY, + model_name VARCHAR(100) NOT NULL UNIQUE, + provider VARCHAR(50) NOT NULL, + dimensions INTEGER NOT NULL, 
+ max_tokens INTEGER, + cost_per_token DECIMAL(10, 8), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + is_active BOOLEAN DEFAULT TRUE + ); + """) + + # Create knowledge base table + cur.execute(""" + CREATE TABLE IF NOT EXISTS knowledge_base ( + id SERIAL PRIMARY KEY, + kb_name VARCHAR(100) NOT NULL, + description TEXT, + domain VARCHAR(50), + language VARCHAR(10) DEFAULT 'en', + total_documents INTEGER DEFAULT 0, + total_chunks INTEGER DEFAULT 0, + total_storage_mb DECIMAL(10, 2), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + """) + + # Create similarity search results cache + cur.execute(""" + CREATE TABLE IF NOT EXISTS search_cache ( + id SERIAL PRIMARY KEY, + query_hash VARCHAR(64) NOT NULL, + query_text TEXT NOT NULL, + results_json JSONB, + result_count INTEGER, + search_time_ms INTEGER, + similarity_threshold DECIMAL(4, 3), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + expires_at TIMESTAMP + ); + """) + + logger.info("Vector database tables created successfully") + + conn.close() + + except psycopg2.Error as e: + logger.error(f"Failed to create vector tables: {e}") + raise + + +def create_vector_indexes(): + """Create indexes for vector columns and other frequently queried fields.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + + with conn.cursor() as cur: + logger.info("Creating vector indexes...") + + # Vector indexes using HNSW (Hierarchical Navigable Small World) + indexes = [ + ("documents_embedding_idx", "documents", "embedding", "hnsw"), + ("chunks_embedding_idx", "document_chunks", "embedding", "hnsw"), + ("queries_embedding_idx", "user_queries", "embedding", "hnsw"), + ] + + for idx_name, table_name, column_name, method in indexes: + try: + if method == "hnsw": + cur.execute(f""" + CREATE INDEX IF NOT EXISTS {idx_name} + ON {table_name} USING hnsw ({column_name} vector_cosine_ops); + """) + else: + 
cur.execute(f""" + CREATE INDEX IF NOT EXISTS {idx_name} + ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); + """) + logger.info(f"Created index {idx_name} on {table_name}") + except psycopg2.Error as e: + logger.warning(f"Could not create {method} index {idx_name}: {e}") + # Try with IVFFlat as fallback + if method == "hnsw": + try: + cur.execute(f""" + CREATE INDEX IF NOT EXISTS {idx_name}_ivf + ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); + """) + logger.info(f"Created fallback IVFFlat index {idx_name}_ivf on {table_name}") + except psycopg2.Error as e2: + logger.warning(f"Could not create fallback index: {e2}") + + # Regular indexes for performance + regular_indexes = [ + ("documents_title_idx", "documents", "title"), + ("documents_type_idx", "documents", "document_type"), + ("documents_created_idx", "documents", "created_at"), + ("chunks_doc_id_idx", "document_chunks", "document_id"), + ("chunks_index_idx", "document_chunks", "chunk_index"), + ("queries_user_idx", "user_queries", "user_id"), + ("queries_created_idx", "user_queries", "created_at"), + ("cache_hash_idx", "search_cache", "query_hash"), + ("cache_expires_idx", "search_cache", "expires_at"), + ] + + for idx_name, table_name, column_name in regular_indexes: + try: + cur.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} ({column_name});") + logger.debug(f"Created regular index {idx_name}") + except psycopg2.Error as e: + logger.warning(f"Could not create regular index {idx_name}: {e}") + + logger.info("Vector indexes created successfully") + + conn.close() + + except psycopg2.Error as e: + logger.error(f"Failed to create vector indexes: {e}") + raise + + +def insert_sample_data(): + """Insert sample data into vector tables.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + conn.autocommit = True + + with conn.cursor() as cur: + logger.info("Inserting sample data...") + + 
# Insert embedding models + embedding_models = [ + ('text-embedding-3-small', 'OpenAI', 1536, 8192, 0.00000002, True), + ('text-embedding-3-large', 'OpenAI', 3072, 8192, 0.00000013, True), + ('text-embedding-ada-002', 'OpenAI', 1536, 8192, 0.00000010, False), + ('all-MiniLM-L6-v2', 'Sentence-Transformers', 384, 512, 0.0, True), + ('all-mpnet-base-v2', 'Sentence-Transformers', 768, 514, 0.0, True), + ] + + for model_data in embedding_models: + cur.execute(""" + INSERT INTO embedding_models (model_name, provider, dimensions, max_tokens, cost_per_token, is_active) + VALUES (%s, %s, %s, %s, %s, %s) + ON CONFLICT (model_name) DO NOTHING; + """, model_data) + + # Insert knowledge bases + knowledge_bases = [ + ('Technical Documentation', 'Software engineering and API documentation', 'technology'), + ('Research Papers', 'Academic papers and research publications', 'research'), + ('Customer Support', 'FAQ and troubleshooting guides', 'support'), + ('Product Catalog', 'Product descriptions and specifications', 'commerce'), + ('Legal Documents', 'Contracts, policies, and legal texts', 'legal'), + ] + + kb_ids = [] + for kb_data in knowledge_bases: + cur.execute(""" + INSERT INTO knowledge_base (kb_name, description, domain, total_documents, total_chunks, total_storage_mb) + VALUES (%s, %s, %s, %s, %s, %s) + RETURNING id; + """, kb_data + (random.randint(50, 500), random.randint(200, 2000), round(random.uniform(10.5, 250.8), 2))) + kb_ids.append(cur.fetchone()[0]) + + # Insert sample documents + sample_documents = [ + ("PostgreSQL Performance Tuning", "Comprehensive guide to optimizing PostgreSQL database performance including indexing strategies, query optimization, and configuration tuning.", "https://example.com/pg-performance", "technical_guide"), + ("Vector Similarity Search", "Understanding vector embeddings and similarity search algorithms for AI applications and recommendation systems.", "https://example.com/vector-search", "technical_guide"), + ("RAG Implementation 
Best Practices", "Best practices for implementing Retrieval-Augmented Generation systems using vector databases and large language models.", "https://example.com/rag-practices", "best_practices"), + ("Database Security Guidelines", "Security considerations and implementation guidelines for PostgreSQL databases in production environments.", "https://example.com/db-security", "security_guide"), + ("Machine Learning with SQL", "Integrating machine learning workflows with SQL databases and leveraging database extensions for AI applications.", "https://example.com/ml-sql", "tutorial"), + ("API Documentation Standards", "Standards and best practices for creating comprehensive and user-friendly API documentation.", "https://example.com/api-docs", "documentation"), + ("Microservices Architecture", "Design patterns and implementation strategies for microservices architecture in modern applications.", "https://example.com/microservices", "architecture_guide"), + ("Data Pipeline Optimization", "Optimizing data processing pipelines for scalability, reliability, and performance in enterprise environments.", "https://example.com/data-pipelines", "optimization_guide"), + ("Cloud Database Migration", "Step-by-step guide for migrating on-premises databases to cloud infrastructure with minimal downtime.", "https://example.com/cloud-migration", "migration_guide"), + ("NoSQL vs SQL Comparison", "Detailed comparison of NoSQL and SQL databases, including use cases, performance characteristics, and selection criteria.", "https://example.com/nosql-sql", "comparison_guide"), + ] + + doc_ids = [] + for title, content, url, doc_type in sample_documents: + embedding = generate_mock_embedding(1536) + word_count = len(content.split()) + + cur.execute(""" + INSERT INTO documents (title, content, source_url, document_type, word_count, embedding) + VALUES (%s, %s, %s, %s, %s, %s) + RETURNING id; + """, (title, content, url, doc_type, word_count, embedding)) + doc_ids.append(cur.fetchone()[0]) + + 
# Insert document chunks + chunk_count = 0 + for doc_id in doc_ids: + # Generate 3-7 chunks per document + num_chunks = random.randint(3, 7) + for chunk_idx in range(num_chunks): + chunk_text = f"This is chunk {chunk_idx + 1} of document {doc_id}. " + \ + "It contains relevant information that would be useful for similarity search and RAG applications. " + \ + "The content includes technical details, examples, and best practices." + chunk_size = len(chunk_text) + overlap_size = random.randint(20, 50) if chunk_idx > 0 else 0 + embedding = generate_mock_embedding(1536) + + cur.execute(""" + INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_size, overlap_size, embedding) + VALUES (%s, %s, %s, %s, %s, %s); + """, (doc_id, chunk_idx, chunk_text, chunk_size, overlap_size, embedding)) + chunk_count += 1 + + # Insert sample user queries + sample_queries = [ + ("How to optimize PostgreSQL performance?", "user123", "session_abc1"), + ("What are vector embeddings?", "user456", "session_def2"), + ("Best practices for RAG implementation", "user789", "session_ghi3"), + ("Database security checklist", "user123", "session_abc2"), + ("Machine learning with databases", "user456", "session_def3"), + ("API documentation examples", "user321", "session_jkl1"), + ("Microservices design patterns", "user654", "session_mno2"), + ("Data pipeline best practices", "user987", "session_pqr3"), + ("Cloud migration strategies", "user111", "session_stu4"), + ("NoSQL vs SQL databases", "user222", "session_vwx5"), + ] + + for query_text, user_id, session_id in sample_queries: + embedding = generate_mock_embedding(1536) + response_time = random.randint(50, 500) + + cur.execute(""" + INSERT INTO user_queries (query_text, user_id, session_id, response_time_ms, embedding) + VALUES (%s, %s, %s, %s, %s); + """, (query_text, user_id, session_id, response_time, embedding)) + + # Insert some search cache entries + for i in range(5): + query_hash = f"hash_{random.randint(100000, 
999999)}" + query_text = f"Sample cached query {i + 1}" + results = [{"doc_id": random.randint(1, len(doc_ids)), "similarity": round(random.uniform(0.7, 0.95), 3)} for _ in range(3)] + result_count = len(results) + search_time = random.randint(10, 100) + threshold = round(random.uniform(0.6, 0.8), 3) + + cur.execute(""" + INSERT INTO search_cache (query_hash, query_text, results_json, result_count, search_time_ms, similarity_threshold) + VALUES (%s, %s, %s, %s, %s, %s); + """, (query_hash, query_text, json.dumps(results), result_count, search_time, threshold)) + + logger.info(f"Sample data inserted successfully:") + logger.info(f" {len(sample_documents)} documents") + logger.info(f" {chunk_count} document chunks") + logger.info(f" {len(sample_queries)} user queries") + logger.info(f" {len(embedding_models)} embedding models") + logger.info(f" {len(knowledge_bases)} knowledge bases") + + conn.close() + + except psycopg2.Error as e: + logger.error(f"Failed to insert sample data: {e}") + raise + + +def verify_vector_setup(): + """Verify that the vector database was set up correctly.""" + conn_params = get_connection_params() + + try: + conn = psycopg2.connect(**conn_params) + + with conn.cursor() as cur: + logger.info("Verifying vector database setup...") + + # Check extension + cur.execute("SELECT extname FROM pg_extension WHERE extname = 'vector';") + if cur.fetchone(): + logger.info("pgvector extension is installed") + else: + logger.error("pgvector extension not found") + return False + + # Check tables and record counts + tables_to_check = [ + 'documents', 'document_chunks', 'user_queries', + 'embedding_models', 'knowledge_base', 'search_cache' + ] + + table_counts = {} + for table in tables_to_check: + cur.execute(f'SELECT COUNT(*) FROM {table}') + count = cur.fetchone()[0] + table_counts[table] = count + logger.info(f"Table {table}: {count} records") + + # Check vector columns + cur.execute(""" + SELECT table_name, column_name, data_type + FROM 
information_schema.columns + WHERE data_type = 'USER-DEFINED' + AND udt_name = 'vector' + ORDER BY table_name, column_name; + """) + + vector_columns = cur.fetchall() + logger.info(f"Found {len(vector_columns)} vector columns:") + for table, column, dtype in vector_columns: + logger.info(f" {table}.{column} ({dtype})") + + # Check indexes + cur.execute(""" + SELECT schemaname, tablename, indexname, indexdef + FROM pg_indexes + WHERE indexdef LIKE '%vector%' OR indexdef LIKE '%hnsw%' OR indexdef LIKE '%ivfflat%' + ORDER BY tablename, indexname; + """) + + vector_indexes = cur.fetchall() + logger.info(f"Found {len(vector_indexes)} vector indexes:") + for schema, table, index, definition in vector_indexes: + logger.info(f" {index} on {table}") + + # Test a simple vector similarity query + mock_embedding = generate_mock_embedding(1536) + cur.execute(""" + SELECT id, title, embedding <-> %s::vector as distance + FROM documents + ORDER BY embedding <-> %s::vector + LIMIT 3; + """, (mock_embedding, mock_embedding)) + + results = cur.fetchall() + logger.info(f"Vector similarity query returned {len(results)} results") + + conn.close() + logger.info("Vector database verification completed successfully") + return table_counts, vector_columns, vector_indexes + + except psycopg2.Error as e: + logger.error(f"Verification failed: {e}") + raise def prepare_environment(): """Main function to prepare the vector database environment.""" - prepare_vector_environment() + logger.info("Preparing vector database environment...") + + try: + # Create pgvector extension + create_vector_extension() + + # Create vector tables + create_vector_tables() + + # Insert sample data first + insert_sample_data() + + # Create indexes after data insertion for better performance + create_vector_indexes() + + # Verify the setup + table_counts, vector_columns, vector_indexes = verify_vector_setup() + + logger.info("Vector database environment prepared successfully!") + logger.info(f"Total tables created: 
{len(table_counts)}") + logger.info(f"Total vector columns: {len(vector_columns)}") + logger.info(f"Total vector indexes: {len(vector_indexes)}") + + return { + 'table_counts': table_counts, + 'vector_columns': vector_columns, + 'vector_indexes': vector_indexes + } + + except Exception as e: + logger.error(f"Failed to prepare vector environment: {e}") + raise if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - prepare_environment() \ No newline at end of file + prepare_environment() diff --git a/tasks/postgres/standard/vectors/dba_vector_analysis/verify.py b/tasks/postgres/standard/vectors/dba_vector_analysis/verify.py index bdf4e8eb..634d83b1 100644 --- a/tasks/postgres/standard/vectors/dba_vector_analysis/verify.py +++ b/tasks/postgres/standard/vectors/dba_vector_analysis/verify.py @@ -92,7 +92,25 @@ def verify_vector_analysis_columns(conn) -> Dict[str, Any]: if extra_vectors: results['issues'].append(f"Non-existing: {extra_vectors}") - if not missing and not extra and count > 0 and not missing_vectors and not extra_vectors: + # Verify analysis values: dimensions and rows must match reality. 
+ values_ok = True + cur.execute(""" + SELECT table_name, column_name, dimensions, rows + FROM vector_analysis_columns + ORDER BY table_name, column_name; + """) + for tbl, col, dims, rows in cur.fetchall(): + if dims != 1536: + results['issues'].append(f"{tbl}.{col}: dimensions={dims}, expected 1536") + values_ok = False + if (tbl, col) in actual_vector_columns: + cur.execute(f'SELECT COUNT(*) FROM "{tbl}"') + actual_rows = cur.fetchone()[0] + if rows != actual_rows: + results['issues'].append(f"{tbl}.{col}: rows={rows}, expected {actual_rows}") + values_ok = False + + if not missing and not extra and count > 0 and not missing_vectors and not extra_vectors and values_ok: results['passed'] = True except psycopg2.Error as e: @@ -167,7 +185,36 @@ def verify_vector_analysis_storage_consumption(conn) -> Dict[str, Any]: if extra_tables: results['issues'].append(f"Agent analyzed non-vector tables: {extra_tables}") - if not missing and not extra and count > 0 and not missing_tables and not extra_tables: + # Verify analysis values for each row: size/byte/pct/count sanity. 
+ values_ok = True + cur.execute(""" + SELECT table_name, total_size_bytes, vector_data_bytes, + regular_data_bytes, vector_storage_pct, row_count + FROM vector_analysis_storage_consumption + ORDER BY table_name; + """) + for tbl, total_b, vec_b, reg_b, pct, row_cnt in cur.fetchall(): + if tbl not in actual_vector_tables: + continue + cur.execute(f'SELECT COUNT(*) FROM "{tbl}"') + actual_rows = cur.fetchone()[0] + if row_cnt != actual_rows: + results['issues'].append(f"{tbl}: row_count={row_cnt}, expected {actual_rows}") + values_ok = False + if total_b is None or total_b <= 0: + results['issues'].append(f"{tbl}: total_size_bytes={total_b}, expected > 0") + values_ok = False + if vec_b is None or vec_b <= 0: + results['issues'].append(f"{tbl}: vector_data_bytes={vec_b}, expected > 0") + values_ok = False + if reg_b is None or reg_b < 0: + results['issues'].append(f"{tbl}: regular_data_bytes={reg_b}, expected >= 0") + values_ok = False + if pct is None or not (0 <= pct <= 100): + results['issues'].append(f"{tbl}: vector_storage_pct={pct}, expected within [0, 100]") + values_ok = False + + if not missing and not extra and count > 0 and not missing_tables and not extra_tables and values_ok: results['passed'] = True except psycopg2.Error as e: @@ -240,7 +287,28 @@ def verify_vector_analysis_indices(conn) -> Dict[str, Any]: # Allow agent to find more indexes than just vector ones (they might include related indexes) # but at least they should find the vector-specific ones - if not missing and not extra and count > 0 and not missing_indexes: + # Verify analysis values: column_name, index_type, and index_size_bytes. 
+ values_ok = True + actual_index_names = {ix for _s, _t, ix in actual_vector_indexes} + cur.execute(""" + SELECT index_name, column_name, index_type, index_size_bytes + FROM vector_analysis_indices + ORDER BY table_name, index_name; + """) + for idx_name, col_name, idx_type, idx_size in cur.fetchall(): + if idx_name not in actual_index_names: + continue # skip extra/unrelated indexes the agent may have added + if col_name != 'embedding': + results['issues'].append(f"{idx_name}: column_name={col_name!r}, expected 'embedding'") + values_ok = False + if (idx_type or '').lower() not in ('hnsw', 'ivfflat'): + results['issues'].append(f"{idx_name}: index_type={idx_type!r}, expected 'hnsw' or 'ivfflat'") + values_ok = False + if idx_size is None or idx_size <= 0: + results['issues'].append(f"{idx_name}: index_size_bytes={idx_size}, expected > 0") + values_ok = False + + if not missing and not extra and count > 0 and not missing_indexes and values_ok: results['passed'] = True except psycopg2.Error as e: @@ -315,8 +383,6 @@ def main(): print(f"Results: {passed_checks}/{total_checks} checks passed") if passed_checks == total_checks: sys.exit(0) - elif passed_checks >= total_checks * 0.75: - sys.exit(0) else: sys.exit(1) except psycopg2.Error as e: diff --git a/tasks/postgres/standard/vectors/vectors_setup.py b/tasks/postgres/standard/vectors/vectors_setup.py deleted file mode 100644 index c28dcf10..00000000 --- a/tasks/postgres/standard/vectors/vectors_setup.py +++ /dev/null @@ -1,500 +0,0 @@ -""" -Shared Vector Database Setup Utilities - -This module provides utilities for setting up a complete PostgreSQL database -with pgvector extension and sample RAG-related tables with vector data. -Used by all vector database tasks. 
-""" - -import os -import logging -import psycopg2 -import json -import random -import numpy as np -from typing import List - -logger = logging.getLogger(__name__) - -def get_connection_params(): - """Get database connection parameters from environment variables.""" - return { - 'host': os.getenv('POSTGRES_HOST', 'localhost'), - 'port': os.getenv('POSTGRES_PORT', '5432'), - 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), - 'password': os.getenv('POSTGRES_PASSWORD', 'password'), - 'database': os.getenv('POSTGRES_DATABASE', 'postgres') - } - - -def generate_mock_embedding(dimensions: int = 1536) -> List[float]: - """Generate a mock embedding vector with specified dimensions.""" - # Generate random values between -1 and 1, then normalize - vector = np.random.uniform(-1, 1, dimensions) - # Normalize to unit vector (common practice for embeddings) - norm = np.linalg.norm(vector) - if norm > 0: - vector = vector / norm - return vector.tolist() - - -def create_vector_extension(): - """Create the pgvector extension.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - conn.autocommit = True - - with conn.cursor() as cur: - logger.info("Creating pgvector extension...") - cur.execute("CREATE EXTENSION IF NOT EXISTS vector;") - logger.info("pgvector extension created successfully") - - conn.close() - - except psycopg2.Error as e: - logger.error(f"Failed to create pgvector extension: {e}") - raise - - -def create_vector_tables(): - """Create sample tables with vector columns for RAG applications.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - conn.autocommit = True - - with conn.cursor() as cur: - logger.info("Creating vector database tables...") - - # Create documents table for document embeddings - cur.execute(""" - CREATE TABLE IF NOT EXISTS documents ( - id SERIAL PRIMARY KEY, - title TEXT NOT NULL, - content TEXT NOT NULL, - source_url TEXT, - document_type VARCHAR(50) DEFAULT 
'article', - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - word_count INTEGER, - embedding vector(1536) - ); - """) - - # Create chunks table for document chunks (common in RAG) - cur.execute(""" - CREATE TABLE IF NOT EXISTS document_chunks ( - id SERIAL PRIMARY KEY, - document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE, - chunk_index INTEGER NOT NULL, - chunk_text TEXT NOT NULL, - chunk_size INTEGER, - overlap_size INTEGER DEFAULT 0, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - embedding vector(1536) - ); - """) - - # Create queries table for storing user queries and their embeddings - cur.execute(""" - CREATE TABLE IF NOT EXISTS user_queries ( - id SERIAL PRIMARY KEY, - query_text TEXT NOT NULL, - user_id VARCHAR(100), - session_id VARCHAR(100), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - response_time_ms INTEGER, - embedding vector(1536) - ); - """) - - # Create embeddings metadata table - cur.execute(""" - CREATE TABLE IF NOT EXISTS embedding_models ( - id SERIAL PRIMARY KEY, - model_name VARCHAR(100) NOT NULL UNIQUE, - provider VARCHAR(50) NOT NULL, - dimensions INTEGER NOT NULL, - max_tokens INTEGER, - cost_per_token DECIMAL(10, 8), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - is_active BOOLEAN DEFAULT TRUE - ); - """) - - # Create knowledge base table - cur.execute(""" - CREATE TABLE IF NOT EXISTS knowledge_base ( - id SERIAL PRIMARY KEY, - kb_name VARCHAR(100) NOT NULL, - description TEXT, - domain VARCHAR(50), - language VARCHAR(10) DEFAULT 'en', - total_documents INTEGER DEFAULT 0, - total_chunks INTEGER DEFAULT 0, - total_storage_mb DECIMAL(10, 2), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ); - """) - - # Create similarity search results cache - cur.execute(""" - CREATE TABLE IF NOT EXISTS search_cache ( - id SERIAL PRIMARY KEY, - query_hash VARCHAR(64) NOT NULL, - query_text TEXT NOT NULL, - results_json 
JSONB, - result_count INTEGER, - search_time_ms INTEGER, - similarity_threshold DECIMAL(4, 3), - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - expires_at TIMESTAMP - ); - """) - - logger.info("Vector database tables created successfully") - - conn.close() - - except psycopg2.Error as e: - logger.error(f"Failed to create vector tables: {e}") - raise - - -def create_vector_indexes(): - """Create indexes for vector columns and other frequently queried fields.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - conn.autocommit = True - - with conn.cursor() as cur: - logger.info("Creating vector indexes...") - - # Vector indexes using HNSW (Hierarchical Navigable Small World) - indexes = [ - ("documents_embedding_idx", "documents", "embedding", "hnsw"), - ("chunks_embedding_idx", "document_chunks", "embedding", "hnsw"), - ("queries_embedding_idx", "user_queries", "embedding", "hnsw"), - ] - - for idx_name, table_name, column_name, method in indexes: - try: - if method == "hnsw": - cur.execute(f""" - CREATE INDEX IF NOT EXISTS {idx_name} - ON {table_name} USING hnsw ({column_name} vector_cosine_ops); - """) - else: - cur.execute(f""" - CREATE INDEX IF NOT EXISTS {idx_name} - ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); - """) - logger.info(f"Created index {idx_name} on {table_name}") - except psycopg2.Error as e: - logger.warning(f"Could not create {method} index {idx_name}: {e}") - # Try with IVFFlat as fallback - if method == "hnsw": - try: - cur.execute(f""" - CREATE INDEX IF NOT EXISTS {idx_name}_ivf - ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); - """) - logger.info(f"Created fallback IVFFlat index {idx_name}_ivf on {table_name}") - except psycopg2.Error as e2: - logger.warning(f"Could not create fallback index: {e2}") - - # Regular indexes for performance - regular_indexes = [ - ("documents_title_idx", "documents", "title"), - 
("documents_type_idx", "documents", "document_type"), - ("documents_created_idx", "documents", "created_at"), - ("chunks_doc_id_idx", "document_chunks", "document_id"), - ("chunks_index_idx", "document_chunks", "chunk_index"), - ("queries_user_idx", "user_queries", "user_id"), - ("queries_created_idx", "user_queries", "created_at"), - ("cache_hash_idx", "search_cache", "query_hash"), - ("cache_expires_idx", "search_cache", "expires_at"), - ] - - for idx_name, table_name, column_name in regular_indexes: - try: - cur.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} ({column_name});") - logger.debug(f"Created regular index {idx_name}") - except psycopg2.Error as e: - logger.warning(f"Could not create regular index {idx_name}: {e}") - - logger.info("Vector indexes created successfully") - - conn.close() - - except psycopg2.Error as e: - logger.error(f"Failed to create vector indexes: {e}") - raise - - -def insert_sample_data(): - """Insert sample data into vector tables.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - conn.autocommit = True - - with conn.cursor() as cur: - logger.info("Inserting sample data...") - - # Insert embedding models - embedding_models = [ - ('text-embedding-3-small', 'OpenAI', 1536, 8192, 0.00000002, True), - ('text-embedding-3-large', 'OpenAI', 3072, 8192, 0.00000013, True), - ('text-embedding-ada-002', 'OpenAI', 1536, 8192, 0.00000010, False), - ('all-MiniLM-L6-v2', 'Sentence-Transformers', 384, 512, 0.0, True), - ('all-mpnet-base-v2', 'Sentence-Transformers', 768, 514, 0.0, True), - ] - - for model_data in embedding_models: - cur.execute(""" - INSERT INTO embedding_models (model_name, provider, dimensions, max_tokens, cost_per_token, is_active) - VALUES (%s, %s, %s, %s, %s, %s) - ON CONFLICT (model_name) DO NOTHING; - """, model_data) - - # Insert knowledge bases - knowledge_bases = [ - ('Technical Documentation', 'Software engineering and API documentation', 'technology'), - 
('Research Papers', 'Academic papers and research publications', 'research'), - ('Customer Support', 'FAQ and troubleshooting guides', 'support'), - ('Product Catalog', 'Product descriptions and specifications', 'commerce'), - ('Legal Documents', 'Contracts, policies, and legal texts', 'legal'), - ] - - kb_ids = [] - for kb_data in knowledge_bases: - cur.execute(""" - INSERT INTO knowledge_base (kb_name, description, domain, total_documents, total_chunks, total_storage_mb) - VALUES (%s, %s, %s, %s, %s, %s) - RETURNING id; - """, kb_data + (random.randint(50, 500), random.randint(200, 2000), round(random.uniform(10.5, 250.8), 2))) - kb_ids.append(cur.fetchone()[0]) - - # Insert sample documents - sample_documents = [ - ("PostgreSQL Performance Tuning", "Comprehensive guide to optimizing PostgreSQL database performance including indexing strategies, query optimization, and configuration tuning.", "https://example.com/pg-performance", "technical_guide"), - ("Vector Similarity Search", "Understanding vector embeddings and similarity search algorithms for AI applications and recommendation systems.", "https://example.com/vector-search", "technical_guide"), - ("RAG Implementation Best Practices", "Best practices for implementing Retrieval-Augmented Generation systems using vector databases and large language models.", "https://example.com/rag-practices", "best_practices"), - ("Database Security Guidelines", "Security considerations and implementation guidelines for PostgreSQL databases in production environments.", "https://example.com/db-security", "security_guide"), - ("Machine Learning with SQL", "Integrating machine learning workflows with SQL databases and leveraging database extensions for AI applications.", "https://example.com/ml-sql", "tutorial"), - ("API Documentation Standards", "Standards and best practices for creating comprehensive and user-friendly API documentation.", "https://example.com/api-docs", "documentation"), - ("Microservices Architecture", 
"Design patterns and implementation strategies for microservices architecture in modern applications.", "https://example.com/microservices", "architecture_guide"), - ("Data Pipeline Optimization", "Optimizing data processing pipelines for scalability, reliability, and performance in enterprise environments.", "https://example.com/data-pipelines", "optimization_guide"), - ("Cloud Database Migration", "Step-by-step guide for migrating on-premises databases to cloud infrastructure with minimal downtime.", "https://example.com/cloud-migration", "migration_guide"), - ("NoSQL vs SQL Comparison", "Detailed comparison of NoSQL and SQL databases, including use cases, performance characteristics, and selection criteria.", "https://example.com/nosql-sql", "comparison_guide"), - ] - - doc_ids = [] - for title, content, url, doc_type in sample_documents: - embedding = generate_mock_embedding(1536) - word_count = len(content.split()) - - cur.execute(""" - INSERT INTO documents (title, content, source_url, document_type, word_count, embedding) - VALUES (%s, %s, %s, %s, %s, %s) - RETURNING id; - """, (title, content, url, doc_type, word_count, embedding)) - doc_ids.append(cur.fetchone()[0]) - - # Insert document chunks - chunk_count = 0 - for doc_id in doc_ids: - # Generate 3-7 chunks per document - num_chunks = random.randint(3, 7) - for chunk_idx in range(num_chunks): - chunk_text = f"This is chunk {chunk_idx + 1} of document {doc_id}. " + \ - "It contains relevant information that would be useful for similarity search and RAG applications. " + \ - "The content includes technical details, examples, and best practices." 
- chunk_size = len(chunk_text) - overlap_size = random.randint(20, 50) if chunk_idx > 0 else 0 - embedding = generate_mock_embedding(1536) - - cur.execute(""" - INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_size, overlap_size, embedding) - VALUES (%s, %s, %s, %s, %s, %s); - """, (doc_id, chunk_idx, chunk_text, chunk_size, overlap_size, embedding)) - chunk_count += 1 - - # Insert sample user queries - sample_queries = [ - ("How to optimize PostgreSQL performance?", "user123", "session_abc1"), - ("What are vector embeddings?", "user456", "session_def2"), - ("Best practices for RAG implementation", "user789", "session_ghi3"), - ("Database security checklist", "user123", "session_abc2"), - ("Machine learning with databases", "user456", "session_def3"), - ("API documentation examples", "user321", "session_jkl1"), - ("Microservices design patterns", "user654", "session_mno2"), - ("Data pipeline best practices", "user987", "session_pqr3"), - ("Cloud migration strategies", "user111", "session_stu4"), - ("NoSQL vs SQL databases", "user222", "session_vwx5"), - ] - - for query_text, user_id, session_id in sample_queries: - embedding = generate_mock_embedding(1536) - response_time = random.randint(50, 500) - - cur.execute(""" - INSERT INTO user_queries (query_text, user_id, session_id, response_time_ms, embedding) - VALUES (%s, %s, %s, %s, %s); - """, (query_text, user_id, session_id, response_time, embedding)) - - # Insert some search cache entries - for i in range(5): - query_hash = f"hash_{random.randint(100000, 999999)}" - query_text = f"Sample cached query {i + 1}" - results = [{"doc_id": random.randint(1, len(doc_ids)), "similarity": round(random.uniform(0.7, 0.95), 3)} for _ in range(3)] - result_count = len(results) - search_time = random.randint(10, 100) - threshold = round(random.uniform(0.6, 0.8), 3) - - cur.execute(""" - INSERT INTO search_cache (query_hash, query_text, results_json, result_count, search_time_ms, similarity_threshold) - 
VALUES (%s, %s, %s, %s, %s, %s); - """, (query_hash, query_text, json.dumps(results), result_count, search_time, threshold)) - - logger.info(f"Sample data inserted successfully:") - logger.info(f" {len(sample_documents)} documents") - logger.info(f" {chunk_count} document chunks") - logger.info(f" {len(sample_queries)} user queries") - logger.info(f" {len(embedding_models)} embedding models") - logger.info(f" {len(knowledge_bases)} knowledge bases") - - conn.close() - - except psycopg2.Error as e: - logger.error(f"Failed to insert sample data: {e}") - raise - - -def verify_vector_setup(): - """Verify that the vector database was set up correctly.""" - conn_params = get_connection_params() - - try: - conn = psycopg2.connect(**conn_params) - - with conn.cursor() as cur: - logger.info("Verifying vector database setup...") - - # Check extension - cur.execute("SELECT extname FROM pg_extension WHERE extname = 'vector';") - if cur.fetchone(): - logger.info("pgvector extension is installed") - else: - logger.error("pgvector extension not found") - return False - - # Check tables and record counts - tables_to_check = [ - 'documents', 'document_chunks', 'user_queries', - 'embedding_models', 'knowledge_base', 'search_cache' - ] - - table_counts = {} - for table in tables_to_check: - cur.execute(f'SELECT COUNT(*) FROM {table}') - count = cur.fetchone()[0] - table_counts[table] = count - logger.info(f"Table {table}: {count} records") - - # Check vector columns - cur.execute(""" - SELECT table_name, column_name, data_type - FROM information_schema.columns - WHERE data_type = 'USER-DEFINED' - AND udt_name = 'vector' - ORDER BY table_name, column_name; - """) - - vector_columns = cur.fetchall() - logger.info(f"Found {len(vector_columns)} vector columns:") - for table, column, dtype in vector_columns: - logger.info(f" {table}.{column} ({dtype})") - - # Check indexes - cur.execute(""" - SELECT schemaname, tablename, indexname, indexdef - FROM pg_indexes - WHERE indexdef LIKE 
'%vector%' OR indexdef LIKE '%hnsw%' OR indexdef LIKE '%ivfflat%' - ORDER BY tablename, indexname; - """) - - vector_indexes = cur.fetchall() - logger.info(f"Found {len(vector_indexes)} vector indexes:") - for schema, table, index, definition in vector_indexes: - logger.info(f" {index} on {table}") - - # Test a simple vector similarity query - mock_embedding = generate_mock_embedding(1536) - cur.execute(""" - SELECT id, title, embedding <-> %s::vector as distance - FROM documents - ORDER BY embedding <-> %s::vector - LIMIT 3; - """, (mock_embedding, mock_embedding)) - - results = cur.fetchall() - logger.info(f"Vector similarity query returned {len(results)} results") - - conn.close() - logger.info("Vector database verification completed successfully") - return table_counts, vector_columns, vector_indexes - - except psycopg2.Error as e: - logger.error(f"Verification failed: {e}") - raise - - -def prepare_vector_environment(): - """Main function to prepare the vector database environment.""" - logger.info("Preparing vector database environment...") - - try: - # Create pgvector extension - create_vector_extension() - - # Create vector tables - create_vector_tables() - - # Insert sample data first - insert_sample_data() - - # Create indexes after data insertion for better performance - create_vector_indexes() - - # Verify the setup - table_counts, vector_columns, vector_indexes = verify_vector_setup() - - logger.info("Vector database environment prepared successfully!") - logger.info(f"Total tables created: {len(table_counts)}") - logger.info(f"Total vector columns: {len(vector_columns)}") - logger.info(f"Total vector indexes: {len(vector_indexes)}") - - return { - 'table_counts': table_counts, - 'vector_columns': vector_columns, - 'vector_indexes': vector_indexes - } - - except Exception as e: - logger.error(f"Failed to prepare vector environment: {e}") - raise - - -if __name__ == "__main__": - # Allow running this module directly for testing - 
logging.basicConfig(level=logging.INFO) - prepare_vector_environment()