Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ RUN python3 -m playwright install chromium && \
npx -y playwright install chromium

# Layer 9: Install PostgreSQL MCP server (Python, used via `pipx run postgres-mcp`)
RUN pipx install postgres-mcp
RUN pipx install postgres-mcp==0.3.0

# Set working directory
WORKDIR /app
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@ An evaluation suite for agentic models in real MCP tool environments (Notion / G

MCPMark provides a reproducible, extensible benchmark for researchers and engineers: one-command tasks, isolated sandboxes, auto-resume for failures, unified metrics, and aggregated reports.

> 🚀 **MCPMark Verified is out** — a version-pinned, stabilized subset of the standard tasks for reproducible evaluation. On the Verified set, `gpt-5.5` (xhigh) leads at **92.9%** and `kimi-k2.7` reaches **81.1%**. See [#264](https://github.com/eval-sys/mcpmark/pull/264).

[![MCPMark](https://github.com/user-attachments/assets/dfc06a41-e387-45e3-bc98-db7097ffa3dc)](https://mcpmark.ai)

## News

- 🚀 **12 Jun** — **MCPMark Verified** is out: a version-pinned, stabilized subset of the standard tasks. On the Verified set, `gpt-5.5` (xhigh) leads at **92.9%** and `kimi-k2.7` reaches **81.1%**. See [#264](https://github.com/eval-sys/mcpmark/pull/264).
- 📣 **27 May** — The previous Notion Source Hub page is deprecated; please use the new link: [MCPMark Source Hub](https://gossamer-sawfish-47c.notion.site/MCPMark-Source-Hub-dc32b7e8cebd82b8959b81ae322df87a).
- 📌 **21 Jan** — Pinned MCP server versions for reproducible benchmarks: GitHub MCP Server `v0.15.0` (switched to Docker for version control), Notion MCP Server `@1.9.1` (Notion released 2.0 but it has many bugs, not recommended). See [#246](https://github.com/eval-sys/mcpmark/pull/246).
- 🔥 **13 Dec** — Added auto-compaction support (`--compaction-token`) to summarize long conversations and avoid context overflow during evaluation ([#236](https://github.com/eval-sys/mcpmark/pull/236)).
Expand Down
2 changes: 1 addition & 1 deletion pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def main():
parser.add_argument(
"--reasoning-effort",
default="default",
choices=["default", "minimal", "low", "medium", "high"],
choices=["default", "minimal", "low", "medium", "high", "xhigh", "max"],
help="Reasoning effort level for supported models (default: None)",
)

Expand Down
8 changes: 4 additions & 4 deletions src/agents/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
raise ValueError("Notion API key required")
return MCPStdioServer(
command="npx",
args=["-y", "@notionhq/notion-mcp-server"],
args=["-y", "@notionhq/notion-mcp-server@1.9.1"],
env={
"OPENAPI_MCP_HEADERS": (
'{"Authorization": "Bearer ' + notion_key + '", '
Expand All @@ -193,7 +193,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
command="npx",
args=[
"-y",
"@modelcontextprotocol/server-filesystem",
"@modelcontextprotocol/server-filesystem@2025.12.18",
str(test_directory),
],
)
Expand All @@ -204,7 +204,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
viewport_width = self.service_config.get("viewport_width", 1280)
viewport_height = self.service_config.get("viewport_height", 720)

args = ["-y", "@playwright/mcp@latest"]
args = ["-y", "@playwright/mcp@0.0.68"]
if headless:
args.append("--headless")
args.extend(
Expand Down Expand Up @@ -234,7 +234,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
)
return MCPStdioServer(
command="pipx",
args=["run", "postgres-mcp", "--access-mode=unrestricted"],
args=["run", "postgres-mcp==0.3.0", "--access-mode=unrestricted"],
env={"DATABASE_URI": database_url},
)

Expand Down
10 changes: 7 additions & 3 deletions src/agents/mcpmark_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,10 @@ async def _execute_litellm_tool_loop(
"model": self.litellm_input_model_name,
"messages": messages,
"api_key": self.api_key,
"max_tokens": 32768,
"temperature": 1.0,
"enforcer_mode": "on",
"think_mode": "on",
}

# Always use tools format if available - LiteLLM will handle conversion
Expand Down Expand Up @@ -1131,7 +1135,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
command="npx",
args=[
"-y",
"@modelcontextprotocol/server-filesystem",
"@modelcontextprotocol/server-filesystem@2025.12.18",
str(test_directory),
],
)
Expand All @@ -1142,7 +1146,7 @@ def _create_stdio_server(self) -> MCPStdioServer:
viewport_width = self.service_config.get("viewport_width", 1280)
viewport_height = self.service_config.get("viewport_height", 720)

args = ["-y", "@playwright/mcp@latest"]
args = ["-y", "@playwright/mcp@0.0.68"]
if headless:
args.append("--headless")
args.extend(
Expand Down Expand Up @@ -1176,7 +1180,7 @@ def _create_stdio_server(self) -> MCPStdioServer:

return MCPStdioServer(
command="pipx",
args=["run", "postgres-mcp", "--access-mode=unrestricted"],
args=["run", "postgres-mcp==0.3.0", "--access-mode=unrestricted"],
env={"DATABASE_URI": database_url},
)

Expand Down
3 changes: 2 additions & 1 deletion src/mcp_services/github/github_state_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,8 @@ def _push_repo(

# Safety check: Prevent importing to public repositories
# Public repos would send @ mention notifications to real users, causing spam
if not private:
# Exception: mcpmark-cicd needs to be public for GitHub Actions workflows to work properly
if not private and "mcpmark-cicd" not in template_dir.name:
error_msg = (
"ERROR: Cannot import template to a public repository.\n\n"
"Reason: The template contains @ mentions of real GitHub users from the original\n"
Expand Down
5 changes: 5 additions & 0 deletions src/model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ class ModelConfig:
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5.2",
},
"gpt-5.5": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5.5",
},
"gpt-5": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Please use FileSystem tools to finish the following task:

1. **Create the main directory structure** in `desktop_2`:
1. **Create the main directory structure** in `desktop`:

- Create a new directory in main directory called `organized_projects`
- Inside `organized_projects`, create 3 main subdirectories: `experiments`, `learning`, and `personal`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ You are given a directory containing multiple text files. Some files have identi

### Task Objectives

1. **Scan all text files** in the test directory to identify groups with identical content
1. **Find all duplicate files** (files with identical content) in the test directory, based on the directory’s initial state.
2. **Create a 'duplicates' directory** in the test directory root
3. **Move all duplicate files** into the 'duplicates' directory
4. **Leave unique files** in their original location
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ You are given a directory containing multiple text files of varying sizes. Your
1. **Identify the 10 smallest .txt files** in the test directory
2. **Sort the selected files alphabetically** by filename
3. **Merge the content** of these files into a single file
4. **Add file headers** (file name) before each file's content
4. **Format of merged_content.txt**: For each file, write its full filename (e.g., "example.txt") on the first line. On the immediately following line(s), copy the entire content of the file. After the file content, insert exactly one empty line (unless it is the last file). Repeat this pattern for all 10 files in alphabetical order.
3 changes: 2 additions & 1 deletion tasks/filesystem/standard/file_context/uppercase/verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ def verify_uppercase_content(test_dir: Path) -> bool:

# Check if uppercase content is the uppercase version of original
expected_uppercase = original_content.upper()

uppercase_content = uppercase_content.strip()
expected_uppercase = expected_uppercase.strip()
if uppercase_content != expected_uppercase:
print(f"| ❌ File '{filename}' content is not properly converted to uppercase")
return False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ Please use FileSystem tools to finish the following task:

### Task Description

Analyze the creation time (ctime) of all files in the test directory and organize them into a hierarchical directory structure based on their creation dates.
Analyze the last modified time (mtime) of all files in the test directory and organize them into a hierarchical directory structure based on their modification dates.

### Task Objectives

1. **Read metadata** of all files in the test directory
2. **Analyze creation times** (ctime) of all files (excluding .DS_Store)
3. **Create directory structure** organized by month/day based on creation time
2. **Analyze last modified times** (mtime) of all files (excluding .DS_Store), assuming China Standard Time (UTC+8)
3. **Create directory structure** organized by month/day based on last modified time
4. **Move files** to appropriate directories
5. **Create metadata analysis files** in each directory

Expand All @@ -25,5 +25,5 @@ Create directories in the format: `MM/DD/` where:

Create a file named `metadata_analyse.txt` in each directory containing exactly two lines:

- **Line 1**: Oldest filename and its creation time (excluding .DS_Store)
- **Line 2**: Latest filename and its creation time (excluding .DS_Store)
- **Line 1**: Oldest filename and its last modified time (excluding .DS_Store)
- **Line 2**: Latest filename and its last modified time (excluding .DS_Store)
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
Verification script for File Organization by Creation Time Task
Verification script for File Organization by Last Modification Time Task
"""

import sys
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Do not try to use python code.
Count the following information for the entire directory structure:

- total number of files
- total number of folders
- total number of folders (exclude the folder named "complex_structure")
- total size of the whole folder (in bytes, include .DS_Store only in this subtask)

**Format (one item per line):**
Expand Down
10 changes: 5 additions & 5 deletions tasks/filesystem/standard/papers/author_folders/description.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,20 @@ You are given a directory containing multiple paper files. You have a collection
[given_task_folder]/
├── [original HTML files remain untouched]
├── frequent_authors/ # Authors with ≥4 papers total
│ ├── smith_john/
│ ├── john_smith/
│ │ └── [copied papers]
│ ├── johnson_sarah/
│ ├── sarah_johnson/
│ │ └── [copied papers]
│ └── ...
└── 2025_authors/ # Authors with ≥3 papers in 2025
├── williams_david/
├── david_williams/
│ └── [copied 2025 papers]
├── brown_emily/
├── emily_brown/
│ └── [copied 2025 papers]
└── ...
```

#### Requirements:
- Author folder names should be **lowercase** with underscores replacing spaces/commas (e.g., `smith_john`, `williams_david`)
- Author folder names should be **lowercase** with underscores, using `firstname_lastname` format (e.g., `john_smith`, `david_williams`). Only the first given name and the last name are used; middle names are ignored.
- Papers should be **copied** (not moved) to preserve originals
- Author extraction should handle various name formats correctly
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ The VoteNet project is a 3D object detection framework for point clouds. Your ta
2. **Include all essential dependencies** needed to run the VoteNet codebase
3. **Ensure the file format is correct** (one dependency per line)
4. **Save the file as `requirements.txt`** in the current working directory
5. **Not just** pip install or conda install, your answer should contain **every necessary dependencies in the hole process of VoteNet**.

### Requirements

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def verify_required_dependencies_present(test_dir: Path) -> bool:
"opencv",
"plyfile",
"trimesh",
"pointnet2",
"networkx"
]

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
I need you to analyze the collaboration patterns between human developers and Claude (the AI assistant) in the repository by examining all available commit history, then create a comprehensive analysis report and submit it as a new file to the repository.

**Step 1: Commit History Analysis**
Analyze ALL commits in the repository to identify:
Analyze all commits reachable from the default branch (`main`) to identify:

1. **Claude Co-Authored Commits**: Find all commits that were co-authored by Claude (look for "Co-Authored-By: Claude <noreply@anthropic.com>" in commit messages)
2. **Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude
1. **Claude Co-Authored Commits**: Find all commits whose message contains a `Co-Authored-By: Claude <noreply@anthropic.com>` trailer. Match case-insensitively (both `Co-Authored-By` and `Co-authored-by` count). Count each commit at most once.
2. **Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude.

**Step 2: Create Collaboration Analysis Report**
Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root with:
Expand All @@ -23,7 +23,7 @@ Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root w
```
Include the top 3 developers by number of Claude collaborations.

**Step 3: Commit Analysis to Repository**
**Step 3: Commit the Analysis to Repository**
Commit the `CLAUDE_COLLABORATION_ANALYSIS.md` file to the main branch with:
- Commit message: "Add Claude AI collaboration analysis report"
- Ensure all statistics are accurate based on actual commit data
- Ensure all statistics are accurate based on actual commit data
Original file line number Diff line number Diff line change
Expand Up @@ -129,30 +129,39 @@ def verify_task() -> bool:
}

# Expected feature commits based on exploration
# For CHANGELOG Version 1.0.65, two valid answers exist:
# - 94dcaca5: merge commit that brought 1.0.65 into pr/2466-QwertyJack-main branch
# - 5faa082d: the actual commit that first added 1.0.65 content to CHANGELOG.md on main
expected_features = {
"Shell Completion Scripts": "8a0febdd09bda32f38c351c0881784460d69997d",
"CHANGELOG Version 1.0.65": "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0",
"Rust Extraction Improvements": "50e58affdf1bfc7d875202bc040ebe0dcfb7d332",
"Shell Completion Scripts": ["8a0febdd09bda32f38c351c0881784460d69997d"],
"CHANGELOG Version 1.0.65": [
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0",
"5faa082d6e4e5300485daafb94615fe133175055",
],
"Rust Extraction Improvements": ["50e58affdf1bfc7d875202bc040ebe0dcfb7d332"],
}

# Expected authors for each commit
expected_authors = {
"8a0febdd09bda32f38c351c0881784460d69997d": "gitmpr",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "QwertyJack",
"5faa082d6e4e5300485daafb94615fe133175055": "actions-user",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "alokdangre",
}

# Expected commit messages for each commit
expected_messages = {
"8a0febdd09bda32f38c351c0881784460d69997d": "feat: add shell completions (bash, zsh, fish)",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "Merge branch 'anthropics:main' into main",
"5faa082d6e4e5300485daafb94615fe133175055": "chore: Update CHANGELOG.md",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "Enhance Rust extraction and output handling in workflows",
}

# Expected dates for each commit (YYYY-MM-DD format)
expected_dates = {
"8a0febdd09bda32f38c351c0881784460d69997d": "2025-08-01",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "2025-08-02",
"5faa082d6e4e5300485daafb94615fe133175055": "2025-07-31",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "2025-08-09",
}

Expand Down Expand Up @@ -197,17 +206,17 @@ def verify_task() -> bool:
for feature in features:
found_features[feature["name"]] = feature["sha"]

for feature_name, expected_sha in expected_features.items():
for feature_name, expected_shas in expected_features.items():
if feature_name not in found_features:
print(
f"Error: Feature '{feature_name}' not found in table", file=sys.stderr
)
return False

actual_sha = found_features[feature_name]
if actual_sha != expected_sha:
if actual_sha not in expected_shas:
print(
f"Error: Wrong SHA for '{feature_name}'. Expected: {expected_sha}, Got: {actual_sha}",
f"Error: Wrong SHA for '{feature_name}'. Expected one of: {expected_shas}, Got: {actual_sha}",
file=sys.stderr,
)
return False
Expand All @@ -216,8 +225,12 @@ def verify_task() -> bool:

# 5. Verify each commit exists and has correct author
print("5. Verifying commit details...")
all_expected_shas = set()
for shas in expected_features.values():
all_expected_shas.update(shas)

for feature in features:
if feature["sha"] in expected_features.values():
if feature["sha"] in all_expected_shas:
success, commit_data = _verify_commit_exists(
feature["sha"], headers, github_org
)
Expand Down
Loading