diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 921717b..cceb631 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -72,7 +72,6 @@ steps:
         export GOOGLE_CLOUD_PROJECT=$PROJECT_ID
         export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT
 
-
         # Set environment variables for extension
         export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID
         export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE
@@ -84,18 +83,42 @@ steps:
         # Maps the decrypted DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
         export CLOUD_SQL_POSTGRES_PASSWORD=$$DB_PASSWORD
 
-        # Combine CI metadata with run config
-        cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml
+        # Combine CI metadata with all available run configs
+        for config in /workspace/evals/*run_config.yaml; do
+          if [ -f "$config" ]; then
+            echo "Appending CI metadata to $config"
+            cat /workspace/evals/ci_metadata.yaml >> "$config"
+          fi
+        done
 
-        # Substitute environment variables in model_config.yaml
+        # Substitute environment variables in all configs
         python3 /workspace/evals/substitute_env.py
 
         cd /evalbench
         export PYTHONPATH=./evalbench:./evalbench/evalproto
         export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 
-        echo "Launching Standalone Evaluation..."
-        python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml
+        # Run evaluations for all available run configs. Every config is attempted;
+        # the step fails if any evaluation fails or if no run config was found.
+        ran=0
+        failed=0
+        for config in /workspace/evals/*run_config.yaml; do
+          if [ -f "$config" ]; then
+            ran=1
+            echo "Launching Evaluation for config: $config"
+            if ! python3 evalbench/evalbench.py --experiment_config="$config"; then
+              echo "Evaluation failed for config: $config" >&2
+              failed=1
+            fi
+          fi
+        done
+        if [ "$ran" -eq 0 ]; then
+          echo "No run configs found in /workspace/evals" >&2
+          exit 1
+        fi
+        if [ "$failed" -ne 0 ]; then
+          exit 1
+        fi
 
 
 availableSecrets:
diff --git a/evals/claude_code_model.yaml b/evals/claude_code_model.yaml
new file mode 100644
index 0000000..d84c40a
--- /dev/null
+++ b/evals/claude_code_model.yaml
@@ -0,0 +1,38 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+claude_code_version: "@anthropic-ai/claude-code@2.1.119"
+generator: claude_code
+model: "claude-opus-4-7"
+
+use_vertex: true
+vertex_project_id: "${GOOGLE_CLOUD_PROJECT}"
+vertex_region: "global"
+
+env:
+  # Global environment variables
+  CLOUD_ML_REGION: "global"
+  GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
+
+  # Cloud SQL PostgreSQL extension configuration
+  CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
+  CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
+  CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
+  CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
+  CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
+  CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
+  CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
+
+setup:
+  skills_dir: "/workspace/cloud-sql-postgresql"
diff --git a/evals/claude_dataset.json b/evals/claude_dataset.json
new file mode 100644
index 0000000..acc8a19
--- /dev/null
+++ b/evals/claude_dataset.json
@@ -0,0 +1,72 @@
+{
+  "scenarios": [
+    {
+      "id": "cloud-sql-debug-instance",
+      "starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
+      "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
+      "expected_trajectory": [
+        "list_instances.js",
+        "get_instance.js"
+      ],
+      "expected_skills": [
+        "cloud-sql-postgres-admin"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    },
+    {
+      "id": "cloud-sql-schema-tables-explore",
+      "starting_prompt": "I want to understand the structure of my database.",
+      "conversation_plan": "First, ask the agent to list the schemas in the database. After the agent provides the schemas, ask it to list the tables specifically for the 'public' schema.",
+      "expected_trajectory": [
+        "list_schemas.js",
+        "list_tables.js"
+      ],
+      "expected_skills": [
+        "cloud-sql-postgres-data"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    },
+    {
+      "id": "cloud-sql-performance-check",
+      "starting_prompt": "Our database performance seems degraded.",
+      "conversation_plan": "Start by asking the agent to check for any active queries that are running for a long time (e.g., more than 10 seconds). After the agent responds, follow up by asking if there are any database locks that might be causing issues.",
+      "expected_trajectory": [
+        "list_active_queries.js",
+        "list_locks.js"
+      ],
+      "expected_skills": [
+        "cloud-sql-postgres-monitor"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    },
+    {
+      "id": "cloud-sql-metrics-cpu-investigation",
+      "starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.",
+      "conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.",
+      "expected_trajectory": [
+        "get_system_metrics.js",
+        "list_database_stats.js"
+      ],
+      "expected_skills": [
+        "cloud-sql-postgres-monitor"
+      ],
+      "env": {
+        "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
+      },
+      "kind": "tools",
+      "max_turns": 3
+    }
+  ]
+}
diff --git a/evals/claude_run_config.yaml b/evals/claude_run_config.yaml
new file mode 100644
index 0000000..5afd7c0
--- /dev/null
+++ b/evals/claude_run_config.yaml
@@ -0,0 +1,49 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Dataset Related Configs
+dataset_config: /workspace/evals/claude_dataset.json
+dataset_format: agent-format
+
+# Orchestrator Configuration
+orchestrator: agent
+model_config: /workspace/evals/claude_code_model.yaml
+simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+
+# Runner Related Configs
+runners:
+  agent_runners: 1
+
+# Scorer Related Configs
+scorers:
+  # Qualitative (Judge-based)
+  goal_completion:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  behavioral_metrics:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+  skills_best_practices:
+    model_config: /workspace/evals/gemini_2.5_pro_model.yaml
+    skills_dir: /workspace/cloud-sql-postgresql/skills
+
+  # Performance
+  turn_count: {}
+  end_to_end_latency: {}
+  tool_call_latency: {}
+  token_consumption: {}
+  skills_trajectory: {}
+
+# Reporting Related Configs
+reporting:
+  bigquery:
+    gcp_project_id: "${EVAL_REPORTING_PROJECT}"
diff --git a/evals/gemini_cli_model.yaml b/evals/gemini_cli_model.yaml
new file mode 100644
index 0000000..2973cb4
--- /dev/null
+++ b/evals/gemini_cli_model.yaml
@@ -0,0 +1,33 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+gemini_cli_version: "@google/gemini-cli@latest"
+generator: gemini_cli
+env:
+  GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
+  GOOGLE_CLOUD_LOCATION: "global"
+  GOOGLE_GENAI_USE_VERTEXAI: "true"
+  GEMINI_CLI_TRUST_WORKSPACE: "true"
+setup:
+  extensions:
+    # Points to the symlink created in cloudbuild.yaml to match the extension ID
+    "/workspace/cloud-sql-postgresql":
+      settings:
+        CLOUD_SQL_POSTGRES_PROJECT: "${CLOUD_SQL_POSTGRES_PROJECT}"
+        CLOUD_SQL_POSTGRES_INSTANCE: "${CLOUD_SQL_POSTGRES_INSTANCE}"
+        CLOUD_SQL_POSTGRES_REGION: "${CLOUD_SQL_POSTGRES_REGION}"
+        CLOUD_SQL_POSTGRES_DATABASE: "${CLOUD_SQL_POSTGRES_DATABASE}"
+        CLOUD_SQL_POSTGRES_USER: "${CLOUD_SQL_POSTGRES_USER}"
+        CLOUD_SQL_POSTGRES_PASSWORD: '${CLOUD_SQL_POSTGRES_PASSWORD}'
+        CLOUD_SQL_POSTGRES_IP_TYPE: "${CLOUD_SQL_POSTGRES_IP_TYPE}"
diff --git a/evals/dataset.json b/evals/gemini_dataset.json
similarity index 89%
rename from evals/dataset.json
rename to evals/gemini_dataset.json
index 654015f..7ceead7 100644
--- a/evals/dataset.json
+++ b/evals/gemini_dataset.json
@@ -8,6 +8,9 @@
         "list_instances",
         "get_instance"
       ],
+      "expected_skills": [
+        "cloud-sql-postgres-admin"
+      ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
@@ -22,6 +25,9 @@
         "list_schemas",
         "list_tables"
       ],
+      "expected_skills": [
+        "cloud-sql-postgres-data"
+      ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
@@ -36,6 +42,9 @@
         "list_active_queries",
         "list_locks"
       ],
+      "expected_skills": [
+        "cloud-sql-postgres-monitor"
+      ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
@@ -50,6 +59,9 @@
         "get_system_metrics",
         "list_database_stats"
       ],
+      "expected_skills": [
+        "cloud-sql-postgres-monitor"
+      ],
       "env": {
         "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
       },
@@ -57,4 +69,4 @@
       "max_turns": 3
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/evals/run_config.yaml b/evals/gemini_run_config.yaml
similarity index 92%
rename from evals/run_config.yaml
rename to evals/gemini_run_config.yaml
index 600bddd..bc0c7ab 100644
--- a/evals/run_config.yaml
+++ b/evals/gemini_run_config.yaml
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-dataset_config: /workspace/evals/dataset.json
+dataset_config: /workspace/evals/gemini_dataset.json
 dataset_format: gemini-cli-format
 
 orchestrator: geminicli
-model_config: /workspace/evals/model_config.yaml
+model_config: /workspace/evals/gemini_cli_model.yaml
 simulated_user_model_config: /workspace/evals/gemini_2.5_pro_model.yaml
 
 scorers:
@@ -39,4 +39,3 @@ scorers:
 reporting:
   bigquery:
     gcp_project_id: "${EVAL_REPORTING_PROJECT}"
-
diff --git a/evals/substitute_env.py b/evals/substitute_env.py
index cbe1a3a..f04200b 100644
--- a/evals/substitute_env.py
+++ b/evals/substitute_env.py
@@ -1,18 +1,31 @@
 import os
 import re
+import glob
+import sys
 
 def main():
-    yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json']
-    for yaml_path in yaml_paths:
-        if os.path.exists(yaml_path):
-            with open(yaml_path, 'r') as f:
-                content = f.read()
-            content = re.sub(r'\${(\w+)}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
-            with open(yaml_path, 'w') as f:
-                f.write(content)
-            print(f"Successfully substituted environment variables in {yaml_path}")
-        else:
-            print(f"File not found: {yaml_path}")
+    # Find all .yaml and .json files in /workspace/evals
+    paths = glob.glob('/workspace/evals/**/*.yaml', recursive=True) + glob.glob('/workspace/evals/**/*.json', recursive=True)
+
+    failed = []
+    for path in paths:
+        if os.path.isfile(path):
+            try:
+                with open(path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                # Substitute ${VAR} with environment variables; unknown names are left untouched
+                content = re.sub(r'\$\{(\w+)\}', lambda m: os.environ.get(m.group(1), m.group(0)), content)
+                with open(path, 'w', encoding='utf-8') as f:
+                    f.write(content)
+                print(f"Successfully substituted environment variables in {path}")
+            except Exception as e:
+                print(f"Error processing {path}: {e}", file=sys.stderr)
+                failed.append(path)
+
+    # Exit non-zero so the build stops instead of evaluating configs that
+    # still contain unsubstituted placeholders.
+    if failed:
+        sys.exit(1)
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()