eval-sys · zjwu0522 · Jun 12, 2026 · Jan 27, 2026 · Jan 27, 2026 · Apr 14, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -79,7 +79,7 @@ RUN python3 -m playwright install chromium && \
     npx -y playwright install chromium
 
 # Layer 9: Install PostgreSQL MCP server (Python, used via `pipx run postgres-mcp`)
-RUN pipx install postgres-mcp
+RUN pipx install postgres-mcp==0.3.0
 
 # Set working directory
 WORKDIR /app

diff --git a/README.md b/README.md
@@ -14,10 +14,13 @@ An evaluation suite for agentic models in real MCP tool environments (Notion / G
 
 MCPMark provides a reproducible, extensible benchmark for researchers and engineers: one-command tasks, isolated sandboxes, auto-resume for failures, unified metrics, and aggregated reports.
 
+> 🚀 **MCPMark Verified is out** — a version-pinned, stabilized subset of the standard tasks for reproducible evaluation. On the Verified set, `gpt-5.5` (xhigh) leads at **92.9%** and `kimi-k2.7` reaches **81.1%**. See [#264](https://github.com/eval-sys/mcpmark/pull/264).
+
 [![MCPMark](https://github.com/user-attachments/assets/dfc06a41-e387-45e3-bc98-db7097ffa3dc)](https://mcpmark.ai)
 
 ## News
 
+- 🚀 **12 Jun** — **MCPMark Verified** is out: a version-pinned, stabilized subset of the standard tasks. On the Verified set, `gpt-5.5` (xhigh) leads at **92.9%** and `kimi-k2.7` reaches **81.1%**. See [#264](https://github.com/eval-sys/mcpmark/pull/264).
 - 📣 **27 May** — The previous Notion Source Hub page is deprecated; please use the new link: [MCPMark Source Hub](https://gossamer-sawfish-47c.notion.site/MCPMark-Source-Hub-dc32b7e8cebd82b8959b81ae322df87a).
 - 📌 **21 Jan** — Pinned MCP server versions for reproducible benchmarks: GitHub MCP Server `v0.15.0` (switched to Docker for version control), Notion MCP Server `@1.9.1` (Notion released 2.0 but it has many bugs, not recommended). See [#246](https://github.com/eval-sys/mcpmark/pull/246).
 - 🔥 **13 Dec** — Added auto-compaction support (`--compaction-token`) to summarize long conversations and avoid context overflow during evaluation ([#236](https://github.com/eval-sys/mcpmark/pull/236])).

diff --git a/pipeline.py b/pipeline.py
@@ -113,7 +113,7 @@ def main():
     parser.add_argument(
         "--reasoning-effort",
         default="default",
-        choices=["default", "minimal", "low", "medium", "high"],
+        choices=["default", "minimal", "low", "medium", "high", "xhigh", "max"],
         help="Reasoning effort level for supported models (default: None)",
     )
 

diff --git a/src/agents/base_agent.py b/src/agents/base_agent.py
@@ -176,7 +176,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
                 raise ValueError("Notion API key required")
             return MCPStdioServer(
                 command="npx",
-                args=["-y", "@notionhq/notion-mcp-server"],
+                args=["-y", "@notionhq/notion-mcp-server@1.9.1"],
                 env={
                     "OPENAPI_MCP_HEADERS": (
                         '{"Authorization": "Bearer ' + notion_key + '", '
@@ -193,7 +193,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
                 command="npx",
                 args=[
                     "-y",
-                    "@modelcontextprotocol/server-filesystem",
+                    "@modelcontextprotocol/server-filesystem@2025.12.18",
                     str(test_directory),
                 ],
             )
@@ -204,7 +204,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
             viewport_width = self.service_config.get("viewport_width", 1280)
             viewport_height = self.service_config.get("viewport_height", 720)
 
-            args = ["-y", "@playwright/mcp@latest"]
+            args = ["-y", "@playwright/mcp@0.0.68"]
             if headless:
                 args.append("--headless")
             args.extend(
@@ -234,7 +234,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
             )
             return MCPStdioServer(
                 command="pipx",
-                args=["run", "postgres-mcp", "--access-mode=unrestricted"],
+                args=["run", "postgres-mcp==0.3.0", "--access-mode=unrestricted"],
                 env={"DATABASE_URI": database_url},
             )
 

diff --git a/src/agents/mcpmark_agent.py b/src/agents/mcpmark_agent.py
@@ -849,6 +849,10 @@ async def _execute_litellm_tool_loop(
                     "model": self.litellm_input_model_name,
                     "messages": messages,
                     "api_key": self.api_key,
+                    "max_tokens": 32768,
+                    "temperature": 1.0,
+                    "enforcer_mode": "on",
+                    "think_mode": "on",
                 }
 
                 # Always use tools format if available - LiteLLM will handle conversion
@@ -1131,7 +1135,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
                 command="npx",
                 args=[
                     "-y",
-                    "@modelcontextprotocol/server-filesystem",
+                    "@modelcontextprotocol/server-filesystem@2025.12.18",
                     str(test_directory),
                 ],
             )
@@ -1142,7 +1146,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
             viewport_width = self.service_config.get("viewport_width", 1280)
             viewport_height = self.service_config.get("viewport_height", 720)
 
-            args = ["-y", "@playwright/mcp@latest"]
+            args = ["-y", "@playwright/mcp@0.0.68"]
             if headless:
                 args.append("--headless")
             args.extend(
@@ -1176,7 +1180,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
 
             return MCPStdioServer(
                 command="pipx",
-                args=["run", "postgres-mcp", "--access-mode=unrestricted"],
+                args=["run", "postgres-mcp==0.3.0", "--access-mode=unrestricted"],
                 env={"DATABASE_URI": database_url},
             )
 

diff --git a/src/mcp_services/github/github_state_manager.py b/src/mcp_services/github/github_state_manager.py
@@ -255,7 +255,8 @@ def _push_repo(
 
         # Safety check: Prevent importing to public repositories
         # Public repos would send @ mention notifications to real users, causing spam
-        if not private:
+        # Exception: mcpmark-cicd needs to be public for GitHub Actions workflows to work properly
+        if not private and "mcpmark-cicd" not in template_dir.name:
             error_msg = (
                 "ERROR: Cannot import template to a public repository.\n\n"
                 "Reason: The template contains @ mentions of real GitHub users from the original\n"

diff --git a/src/model_config.py b/src/model_config.py
@@ -50,6 +50,11 @@ class ModelConfig:
             "api_key_var": "OPENAI_API_KEY",
             "litellm_input_model_name": "openai/gpt-5.2",
         },
+        "gpt-5.5": {
+            "provider": "openai",
+            "api_key_var": "OPENAI_API_KEY",
+            "litellm_input_model_name": "openai/gpt-5.5",
+        },
         "gpt-5": {
             "provider": "openai",
             "api_key_var": "OPENAI_API_KEY",

diff --git a/tasks/filesystem/standard/desktop/project_management/description.md b/tasks/filesystem/standard/desktop/project_management/description.md
@@ -1,6 +1,6 @@
 Please use FileSystem tools to finish the following task:
 
-1. **Create the main directory structure** in `desktop_2`:
+1. **Create the main directory structure** in `desktop`:
 
    - Create a new directory in main directory called `organized_projects`
    - Inside `organized_projects`, create 3 main subdirectories: `experiments`, `learning`, and `personal`

diff --git a/tasks/filesystem/standard/file_context/duplicates_searching/description.md b/tasks/filesystem/standard/file_context/duplicates_searching/description.md
@@ -6,7 +6,7 @@ You are given a directory containing multiple text files. Some files have identi
 
 ### Task Objectives
 
-1. **Scan all text files** in the test directory to identify groups with identical content
+1. **Find all duplicate files** (files with identical content) in the test directory, based on the directory’s initial state.
 2. **Create a 'duplicates' directory** in the test directory root
 3. **Move all duplicate files** into the 'duplicates' directory
 4. **Leave unique files** in their original location

diff --git a/tasks/filesystem/standard/file_context/file_merging/description.md b/tasks/filesystem/standard/file_context/file_merging/description.md
@@ -9,4 +9,4 @@ You are given a directory containing multiple text files of varying sizes. Your
 1. **Identify the 10 smallest .txt files** in the test directory
 2. **Sort the selected files alphabetically** by filename
 3. **Merge the content** of these files into a single file
-4. **Add file headers** (file name) before each file's content
+4. **Format of `merged_content.txt`**: For each file, write its full filename (e.g., "example.txt") on the first line. On the immediately following line(s), copy the entire content of the file. After the file content, insert exactly one empty line (unless it is the last file). Repeat this pattern for all 10 files in alphabetical order.
diff --git a/tasks/filesystem/standard/file_context/uppercase/verify.py b/tasks/filesystem/standard/file_context/uppercase/verify.py
@@ -64,7 +64,8 @@ def verify_uppercase_content(test_dir: Path) -> bool:
 
             # Check if uppercase content is the uppercase version of original
             expected_uppercase = original_content.upper()
-
+            uppercase_content = uppercase_content.strip()
+            expected_uppercase = expected_uppercase.strip()
             if uppercase_content != expected_uppercase:
                 print(f"| ❌ File '{filename}' content is not properly converted to uppercase")
                 return False

diff --git a/tasks/filesystem/standard/file_property/time_classification/description.md b/tasks/filesystem/standard/file_property/time_classification/description.md
@@ -2,13 +2,13 @@ Please use FileSystem tools to finish the following task:
 
 ### Task Description
 
-Analyze the creation time (ctime) of all files in the test directory and organize them into a hierarchical directory structure based on their creation dates.
+Analyze the last modified time (mtime) of all files in the test directory and organize them into a hierarchical directory structure based on their modification dates.
 
 ### Task Objectives
 
 1. **Read metadata** of all files in the test directory
-2. **Analyze creation times** (ctime) of all files (excluding .DS_Store)
-3. **Create directory structure** organized by month/day based on creation time
+2. **Analyze last modified times** (mtime) of all files (excluding .DS_Store), assuming China Standard Time (UTC+8)
+3. **Create directory structure** organized by month/day based on last modified time
 4. **Move files** to appropriate directories
 5. **Create metadata analysis files** in each directory
 
@@ -25,5 +25,5 @@ Create directories in the format: `MM/DD/` where:
 
 Create a file named `metadata_analyse.txt` in each directory containing exactly only two lines:
 
-- **Line 1**: Oldest filename and its creation time (excluding .DS_Store)
-- **Line 2**: Latest filename and its creation time (excluding .DS_Store)
+- **Line 1**: Oldest filename and its last modified time (excluding .DS_Store)
+- **Line 2**: Latest filename and its last modified time (excluding .DS_Store)
diff --git a/tasks/filesystem/standard/file_property/time_classification/verify.py b/tasks/filesystem/standard/file_property/time_classification/verify.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Verification script for File Organization by Creation Time Task
+Verification script for File Organization by Last Modification Time Task
 """
 
 import sys

diff --git a/tasks/filesystem/standard/folder_structure/structure_analysis/description.md b/tasks/filesystem/standard/folder_structure/structure_analysis/description.md
@@ -15,7 +15,7 @@ Do not try to use python code.
 Count the following information for the entire directory structure:
 
 - total number of files
-- total number of folders
+- total number of folders (exclude the folder named "complex_structure")
 - total size of the hole folder (in bytes, include .DS_Store only in this subtask)
 
 **Format (one item per line):**

diff --git a/tasks/filesystem/standard/papers/author_folders/description.md b/tasks/filesystem/standard/papers/author_folders/description.md
@@ -27,20 +27,20 @@ You are given a directory containing multiple paper files. You have a collection
 [given_task_folder]/
 ├── [original HTML files remain untouched]
 ├── frequent_authors/              # Authors with ≥4 papers total
-│   ├── smith_john/
+│   ├── john_smith/
 │   │   └── [copied papers]
-│   ├── johnson_sarah/
+│   ├── sarah_johnson/
 │   │   └── [copied papers]
 │   └── ...
 └── 2025_authors/                  # Authors with ≥3 papers in 2025
-    ├── williams_david/
+    ├── david_williams/
     │   └── [copied 2025 papers]
-    ├── brown_emily/
+    ├── emily_brown/
     │   └── [copied 2025 papers]
     └── ...
 ```
 
 #### Requirements:
-- Author folder names should be **lowercase** with underscores replacing spaces/commas (e.g., `smith_john`, `williams_david`)
+- Author folder names should be **lowercase** with underscores, using `firstname_lastname` format (e.g., `john_smith`, `david_williams`). Only the first given name and the last name are used; middle names are ignored.
 - Papers should be **copied** (not moved) to preserve originals
 - Author extraction should handle various name formats correctly
diff --git a/tasks/filesystem/standard/votenet/requirements_writing/description.md b/tasks/filesystem/standard/votenet/requirements_writing/description.md
@@ -10,7 +10,6 @@ The VoteNet project is a 3D object detection framework for point clouds. Your ta
 2. **Include all essential dependencies** needed to run the VoteNet codebase
 3. **Ensure the file format is correct** (one dependency per line)
 4. **Save the file as `requirements.txt`** in the current working directory
-5. **Not just** pip install or conda install, your answer should contain **every necessary dependencies in the hole process of VoteNet**.
 
 ### Requirements
 

diff --git a/tasks/filesystem/standard/votenet/requirements_writing/verify.py b/tasks/filesystem/standard/votenet/requirements_writing/verify.py
@@ -55,7 +55,6 @@ def verify_required_dependencies_present(test_dir: Path) -> bool:
             "opencv", 
             "plyfile",
             "trimesh",
-            "pointnet2",
             "networkx"
         ]
 

diff --git a/tasks/github/standard/claude-code/claude_collaboration_analysis/description.md b/tasks/github/standard/claude-code/claude_collaboration_analysis/description.md
@@ -1,10 +1,10 @@
 I need you to analyze the collaboration patterns between human developers and Claude (the AI assistant) in the repository by examining all available commit history, then create a comprehensive analysis report and submit it as a new file to the repository.
 
 **Step 1: Commit History Analysis**
-Analyze ALL commits in the repository to identify:
+Analyze all commits reachable from the default branch (`main`) to identify:
 
-1. **Claude Co-Authored Commits**: Find all commits that were co-authored by Claude (look for "Co-Authored-By: Claude <noreply@anthropic.com>" in commit messages)
-2. **Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude
+1. **Claude Co-Authored Commits**: Find all commits whose message contains a `Co-Authored-By: Claude <noreply@anthropic.com>` trailer. Match case-insensitively (both `Co-Authored-By` and `Co-authored-by` count). Count each commit at most once.
+2. **Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude.
 
 **Step 2: Create Collaboration Analysis Report**
 Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root with:
@@ -23,7 +23,7 @@ Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root w
 ```
 Include the top 3 developers by number of Claude collaborations.
 
-**Step 3: Commit Analysis to Repository**
+**Step 3: Commit the Analysis to Repository**
 Commit the `CLAUDE_COLLABORATION_ANALYSIS.md` file to the main branch with:
 - Commit message: "Add Claude AI collaboration analysis report"
-- Ensure all statistics are accurate based on actual commit data
+- Ensure all statistics are accurate based on actual commit data
diff --git a/tasks/github/standard/claude-code/feature_commit_tracking/verify.py b/tasks/github/standard/claude-code/feature_commit_tracking/verify.py
@@ -129,30 +129,39 @@ def verify_task() -> bool:
     }
 
     # Expected feature commits based on exploration
+    # For CHANGELOG Version 1.0.65, two valid answers exist:
+    # - 94dcaca5: merge commit that brought 1.0.65 into pr/2466-QwertyJack-main branch
+    # - 5faa082d: the actual commit that first added 1.0.65 content to CHANGELOG.md on main
     expected_features = {
-        "Shell Completion Scripts": "8a0febdd09bda32f38c351c0881784460d69997d",
-        "CHANGELOG Version 1.0.65": "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0",
-        "Rust Extraction Improvements": "50e58affdf1bfc7d875202bc040ebe0dcfb7d332",
+        "Shell Completion Scripts": ["8a0febdd09bda32f38c351c0881784460d69997d"],
+        "CHANGELOG Version 1.0.65": [
+            "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0",
+            "5faa082d6e4e5300485daafb94615fe133175055",
+        ],
+        "Rust Extraction Improvements": ["50e58affdf1bfc7d875202bc040ebe0dcfb7d332"],
     }
 
     # Expected authors for each commit
     expected_authors = {
         "8a0febdd09bda32f38c351c0881784460d69997d": "gitmpr",
         "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "QwertyJack",
+        "5faa082d6e4e5300485daafb94615fe133175055": "actions-user",
         "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "alokdangre",
     }
 
     # Expected commit messages for each commit
     expected_messages = {
         "8a0febdd09bda32f38c351c0881784460d69997d": "feat: add shell completions (bash, zsh, fish)",
         "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "Merge branch 'anthropics:main' into main",
+        "5faa082d6e4e5300485daafb94615fe133175055": "chore: Update CHANGELOG.md",
         "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "Enhance Rust extraction and output handling in workflows",
     }
 
     # Expected dates for each commit (YYYY-MM-DD format)
     expected_dates = {
         "8a0febdd09bda32f38c351c0881784460d69997d": "2025-08-01",
         "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "2025-08-02",
+        "5faa082d6e4e5300485daafb94615fe133175055": "2025-07-31",
         "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "2025-08-09",
     }
 
@@ -197,17 +206,17 @@ def verify_task() -> bool:
     for feature in features:
         found_features[feature["name"]] = feature["sha"]
 
-    for feature_name, expected_sha in expected_features.items():
+    for feature_name, expected_shas in expected_features.items():
         if feature_name not in found_features:
             print(
                 f"Error: Feature '{feature_name}' not found in table", file=sys.stderr
             )
             return False
 
         actual_sha = found_features[feature_name]
-        if actual_sha != expected_sha:
+        if actual_sha not in expected_shas:
             print(
-                f"Error: Wrong SHA for '{feature_name}'. Expected: {expected_sha}, Got: {actual_sha}",
+                f"Error: Wrong SHA for '{feature_name}'. Expected one of: {expected_shas}, Got: {actual_sha}",
                 file=sys.stderr,
             )
             return False
@@ -216,8 +225,12 @@ def verify_task() -> bool:
 
     # 5. Verify each commit exists and has correct author
     print("5. Verifying commit details...")
+    all_expected_shas = set()
+    for shas in expected_features.values():
+        all_expected_shas.update(shas)
+
     for feature in features:
-        if feature["sha"] in expected_features.values():
+        if feature["sha"] in all_expected_shas:
             success, commit_data = _verify_commit_exists(
                 feature["sha"], headers, github_org
             )