Skip to content
Merged
21 changes: 15 additions & 6 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ steps:
export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT


# Set environment variables for extension
export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
Expand All @@ -84,18 +83,28 @@ steps:
# Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD

# Combine CI metadata with run config
cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
# Combine CI metadata with all available run configs.
#
# append_ci_metadata DIR
#   Appends DIR/ci_metadata.yaml to every DIR/*run_config.yaml.
#   Returns 0 when every append succeeded (or there was nothing to do) and
#   non-zero when the metadata file is missing or an append failed.
#
# Shell variables are deliberately lowercase, like the existing `$config`:
# uppercase `$NAME` references in this file are Cloud Build substitutions
# (shell variables such as `$$DB_PASSWORD` need the `$$` escape).
append_ci_metadata() {
  local evals_dir=$1
  local ci_metadata="$evals_dir/ci_metadata.yaml"
  local config
  local status=0

  for config in "$evals_dir"/*run_config.yaml; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -f "$config" ] || continue

    if [ ! -f "$ci_metadata" ]; then
      echo "CI metadata file not found: $ci_metadata" >&2
      return 1
    fi

    echo "Appending CI metadata to $config"
    # If the config does not end with a newline, add one first so the first
    # metadata key is not glued onto the config's last line.
    if [ -s "$config" ] && [ -n "$(tail -c 1 "$config")" ]; then
      echo >> "$config"
    fi
    cat "$ci_metadata" >> "$config" || status=1
  done

  return "$status"
}

append_ci_metadata /workspace/evals

# Substitute environment variables in model_config.yaml
# Substitute environment variables in all configs
python3 /workspace/evals/substitute_env.py

cd /evalbench
export PYTHONPATH=./evalbench:./evalbench/evalproto
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

echo "Launching Standalone Evaluation..."
python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
# Run evaluations for all available run configs.
#
# run_all_evals DIR
#   Launches evalbench once for every DIR/*run_config.yaml. Every config is
#   attempted even if an earlier one fails, and the function returns non-zero
#   if any run failed. Without this, a failed run followed by a successful
#   one would leave the loop (and the step) with exit status 0.
#
# Shell variables are deliberately lowercase, like the existing `$config`:
# uppercase `$NAME` references in this file are Cloud Build substitutions.
run_all_evals() {
  local evals_dir=$1
  local config
  local failed=0

  for config in "$evals_dir"/*run_config.yaml; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -f "$config" ] || continue

    echo "Launching Evaluation for config: $config"
    if ! python3 evalbench/evalbench.py --experiment_config="$config"; then
      echo "Evaluation failed for config: $config" >&2
      failed=1
    fi
  done

  return "$failed"
}

# Fail the build step when at least one evaluation run failed.
run_all_evals /workspace/evals || exit 1


availableSecrets:
Expand Down
38 changes: 38 additions & 0 deletions evals/claude_code_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model configuration for the claude_code generator; referenced as
# `model_config` by evals/claude_run_config.yaml.
# Every `${VAR}` placeholder below is replaced textually with the value of the
# environment variable VAR by evals/substitute_env.py before the evaluation is
# launched. A variable that is not set leaves its placeholder unchanged.
claude_code_version: "@anthropic-ai/claude-code@2.1.119"
generator: claude_code
model: "claude-opus-4-7"

# Vertex settings; the project is the one cloudbuild.yaml exports as
# GOOGLE_CLOUD_PROJECT.
use_vertex: true
vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
vertex_region: "global"

env:
# Global environment variables
CLOUD_ML_REGION: "global"
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"

# Cloud SQL PostgreSQL extension configuration
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
# Single-quoted so YAML applies no backslash escapes to the substituted value.
# NOTE(review): the substitution is plain text replacement, so a password
# containing a single quote would break this scalar - confirm that cannot occur.
CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"

setup:
# NOTE(review): claude_run_config.yaml points its skills_best_practices scorer
# at /workspace/cloud-sql-postgresql/skills, one level below this path -
# confirm both values are the ones evalbench expects.
skills_dir: "/workspace/cloud-sql-postgresql"
72 changes: 72 additions & 0 deletions evals/claude_dataset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{
"scenarios": [
{
"id": "cloud-sql-debug-instance",
"starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
"expected_trajectory": [
"list_instances.js",
"get_instance.js"
],
"expected_skills": [
"cloud-sql-postgres-admin"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
},
{
"id": "cloud-sql-schema-tables-explore",
"starting_prompt": "I want to understand the structure of my database.",
"conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.",
"expected_trajectory": [
"list_schemas.js",
"list_tables.js"
],
"expected_skills": [
"cloud-sql-postgres-data"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
},
{
"id": "cloud-sql-performance-check",
"starting_prompt": "Our database performance seems degraded.",
"conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.",
"expected_trajectory": [
"list_active_queries.js",
"list_locks.js"
],
"expected_skills": [
"cloud-sql-postgres-monitor"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
},
{
"id": "cloud-sql-metrics-cpu-investigation",
"starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.",
"conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
"expected_trajectory": [
"get_system_metrics.js",
"list_database_stats.js"
],
"expected_skills": [
"cloud-sql-postgres-monitor"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
}
]
}
49 changes: 49 additions & 0 deletions evals/claude_run_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run configuration for the Claude Code evaluation.
# cloudbuild.yaml picks this file up through its `*run_config.yaml` glob,
# appends evals/ci_metadata.yaml to it, and passes it to evalbench.py via
# --experiment_config.

# Dataset Related Configs
dataset_config: /workspace/evals/claude_dataset.json
dataset_format: agent-format

# Orchestrator Configuration
# model_config is the model under evaluation; the simulated user is driven by
# a separate model config.
orchestrator: agent
model_config: /workspace/evals/claude_code_model.yaml
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml

# Runner Related Configs
runners:
agent_runners: 1

# Scorer Related Configs
scorers:
# Qualitative (Judge-based)
# All three judge-based scorers use the same model config file as the
# simulated user.
goal_completion:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
behavioral_metrics:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
skills_best_practices:
model_config: /workspace/evals/gemini_2.5_pro_model.yaml
skills_dir: /workspace/cloud-sql-postgresql/skills

# Performance
# These scorers take no options, hence the empty mappings.
turn_count: {}
end_to_end_latency: {}
tool_call_latency: {}
token_consumption: {}
skills_trajectory: {}

# Reporting Related Configs
reporting:
bigquery:
# Replaced by evals/substitute_env.py with EVAL_REPORTING_PROJECT, which
# cloudbuild.yaml exports from the _EVAL_REPORTING_PROJECT substitution.
gcp_project_id: "${EVAL_REPORTING_PROJECT}"
33 changes: 33 additions & 0 deletions evals/gemini_cli_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model configuration for the gemini_cli generator; referenced as
# `model_config` by evals/gemini_run_config.yaml.
# Every `${VAR}` placeholder below is replaced textually with the value of the
# environment variable VAR by evals/substitute_env.py before the evaluation is
# launched. A variable that is not set leaves its placeholder unchanged.
# NOTE(review): `@latest` is unpinned, whereas claude_code_model.yaml pins an
# exact version - results may shift between runs; confirm this is intended.
gemini_cli_version: "@google/gemini-cli@latest"
generator: gemini_cli
env:
GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
GOOGLE_CLOUD_LOCATION: "global"
GOOGLE_GENAI_USE_VERTEXAI: "true"
GEMINI_CLI_TRUST_WORKSPACE: "true"
setup:
extensions:
# Points to the symlink created in cloudbuild.yaml to match the extension ID
"/workspace/cloud-sql-postgresql":
settings:
CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
# Single-quoted so YAML applies no backslash escapes to the substituted value.
# NOTE(review): the substitution is plain text replacement, so a password
# containing a single quote would break this scalar - confirm that cannot occur.
CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
14 changes: 13 additions & 1 deletion evals/dataset.json → evals/gemini_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
"list_instances",
"get_instance"
],
"expected_skills": [
"cloud-sql-postgres-admin"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
Expand All @@ -22,6 +25,9 @@
"list_schemas",
"list_tables"
],
"expected_skills": [
"cloud-sql-postgres-data"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
Expand All @@ -36,6 +42,9 @@
"list_active_queries",
"list_locks"
],
"expected_skills": [
"cloud-sql-postgres-monitor"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
Expand All @@ -50,11 +59,14 @@
"get_system_metrics",
"list_database_stats"
],
"expected_skills": [
"cloud-sql-postgres-monitor"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
}
]
}
}
5 changes: 2 additions & 3 deletions evals/run_config.yaml → evals/gemini_run_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

dataset_config: /workspace/evals/dataset.json
dataset_config: /workspace/evals/gemini_dataset.json
dataset_format: gemini-cli-format

orchestrator: geminicli
model_config: /workspace/evals/model_config.yaml
model_config: /workspace/evals/gemini_cli_model.yaml
simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml

scorers:
Expand All @@ -39,4 +39,3 @@ scorers:
reporting:
bigquery:
gcp_project_id: "${EVAL_REPORTING_PROJECT}"

27 changes: 16 additions & 11 deletions evals/substitute_env.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
import os
import re
import glob

def _substitute(text):
    """Replace ``${VAR}`` placeholders in ``text`` with environment values.

    Returns ``(new_text, missing)`` where ``missing`` is the set of variable
    names that are not set in the environment; their placeholders are left
    in the text unchanged.
    """
    missing = set()

    def resolve(match):
        name = match.group(1)
        if name in os.environ:
            return os.environ[name]
        missing.add(name)
        return match.group(0)

    return re.sub(r'\${(\w+)}', resolve, text), missing


def main(root='/workspace/evals'):
    """Substitute ``${VAR}`` placeholders in every YAML/JSON file under ``root``.

    All ``*.yaml`` and ``*.json`` files in ``root`` and its subdirectories are
    processed in place. Processing is best-effort: a file that cannot be read,
    decoded or written is reported and skipped, and the remaining files are
    still processed.

    Args:
        root: Directory to scan. Defaults to the Cloud Build workspace
            location of the eval configs.
    """
    paths = sorted(
        glob.glob(os.path.join(root, '**', '*.yaml'), recursive=True)
        + glob.glob(os.path.join(root, '**', '*.json'), recursive=True)
    )

    for path in paths:
        if not os.path.isfile(path):
            continue
        try:
            # Explicit UTF-8 so the result does not depend on the locale of
            # the build container.
            with open(path, 'r', encoding='utf-8') as f:
                original = f.read()
            content, missing = _substitute(original)
            # Leave files without any resolved placeholder untouched.
            if content != original:
                with open(path, 'w', encoding='utf-8') as f:
                    f.write(content)
            print(f"Successfully substituted environment variables in {path}")
        except (OSError, UnicodeError) as e:
            print(f"Error processing {path}: {e}")
            continue
        if missing:
            # An unset variable would otherwise reach the evaluation as a
            # literal "${VAR}" with no trace in the build log.
            names = ', '.join(sorted(missing))
            print(f"Warning: no environment value for {names} in {path}; placeholders left unchanged")

if __name__ == '__main__':
main()
Loading