From 8783f27a058fdb4562729aa8c65aa5c70931a054 Mon Sep 17 00:00:00 2001 From: MK Date: Sun, 14 Jun 2026 00:46:41 -0400 Subject: [PATCH 1/2] feat(audit): emit guardrail_check on every mask/block/warn with opt-in evidence capture (closes #155) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AuditGuardrail constant has been defined since FWS-7 but nothing ever emitted it — LibraryGuardrailEngine only logged redactions to the ops logger, so operators tailing the audit socket / stderr NDJSON saw no guardrail events at all. Docs claimed otherwise. This commit: - Extends GuardrailChecker (forge-core/runtime/guardrails.go) with a context.Context parameter so emissions can be routed through EmitFromContext and inherit correlation_id, task_id, sequence, and workflow-correlation tags from the request. - Wires *coreruntime.AuditLogger + GuardrailAuditConfig into LibraryGuardrailEngine. BuildGuardrailChecker now takes both, and Runner.Run() reorders construction so the audit logger exists before the guardrail engine is built. - Emits AuditGuardrail at all 9 mask/block/warn sites (inbound/outbound/tool_output × {mask, warn, blocked-enforce}). Fields shape: direction, decision (masked/warned/blocked), guardrail (Violation.Type), category (Violation.Category), violation_count, optional tool, optional evidence. - Adds GuardrailAuditConfig with CaptureEvidence (off by default — metadata-only posture matches the existing FWS-8 audit payload-capture default and the issue #130 OTel content-capture default), Redact (on by default, scrubs vendor-secret token shapes with the same patterns as the #130 work and the FWS-8 [REDACTED] marker), and MaxBytes (4 KiB soft cap, truncated via the existing TruncateForAudit + …[truncated:N] marker). - Env knobs: FORGE_GUARDRAIL_CAPTURE_EVIDENCE, FORGE_GUARDRAIL_REDACT, FORGE_GUARDRAIL_MAX_BYTES. - Updates all callers (3× runner.go A2A handlers, AfterToolExec hook, NoopGuardrailChecker, tests). 
- Docs: docs/security/guardrails.md grows a full Audit Events section with the event field table and opt-in evidence knobs; docs/security/audit-logging.md row + the forge skill audit table row are updated to match. --- .claude/skills/forge.md | 2 +- docs/security/audit-logging.md | 2 +- docs/security/guardrails.md | 79 ++++++++- forge-cli/runtime/guardrails_audit.go | 178 ++++++++++++++++++++ forge-cli/runtime/guardrails_engine.go | 55 +++++- forge-cli/runtime/guardrails_engine_test.go | 153 ++++++++++++++++- forge-cli/runtime/guardrails_loader.go | 18 +- forge-cli/runtime/runner.go | 28 +-- forge-core/runtime/guardrails.go | 17 +- forge-core/runtime/guardrails_test.go | 8 +- go.work.sum | 19 ++- 11 files changed, 513 insertions(+), 46 deletions(-) create mode 100644 forge-cli/runtime/guardrails_audit.go diff --git a/.claude/skills/forge.md b/.claude/skills/forge.md index 16ed2e7..1c31dd6 100644 --- a/.claude/skills/forge.md +++ b/.claude/skills/forge.md @@ -924,7 +924,7 @@ when OTel tracing is enabled (OTel v1 / Phase 4 / #105). Both use | `AuditEgressBlocked` | `egress_blocked` | Outbound request blocked | | `AuditLLMCall` | `llm_call` | LLM provider call complete; `model`, `provider`, `input_tokens`, `output_tokens`, `duration_ms`, `request_id` | | `AuditLLMCallCancelled` | `llm_call_cancelled` | Streaming call aborted mid-flight; partial usage counts | -| `AuditGuardrail` | `guardrail_check` | Guardrail evaluation result | +| `AuditGuardrail` | `guardrail_check` | Mask / block / warn decision. Fields: `direction` (`inbound` / `outbound` / `tool_output`), `decision` (`masked` / `warned` / `blocked`), `guardrail`, `category`, `violation_count`. Opt-in `evidence` (redacted + truncated triggering text) via `FORGE_GUARDRAIL_CAPTURE_EVIDENCE=true` | | `AuditScheduleFire` | `schedule_fire` | Cron task triggered | | `AuditScheduleComplete` | `schedule_complete` | Cron task finished | | `AuditScheduleSkip` | `schedule_skip` | Cron task skipped (e.g. 
agent busy) | diff --git a/docs/security/audit-logging.md b/docs/security/audit-logging.md index c8896d8..6fb05f9 100644 --- a/docs/security/audit-logging.md +++ b/docs/security/audit-logging.md @@ -21,7 +21,7 @@ All runtime security events are emitted as structured NDJSON to stderr with corr | `llm_call_cancelled` | Streaming LLM call cancelled mid-flight; carries partial token counts captured up to cancellation. | | `invocation_complete` | A2A invocation finished (auth → dispatch → engine → response). Carries `duration_ms` (wall-clock) plus aggregated `input_tokens_total` / `output_tokens_total` / `llm_call_count` / `model` / `provider`. | | `invocation_cancelled` | A2A invocation cancelled mid-flight via `tasks/cancel` (or internal cancellation like parent ctx deadline). Carries `fields.reason` (one of `workflow_failure` / `cost_limit_exceeded` / `timeout` / `external_signal`), `duration_ms` up to cancellation, and any partial token totals consumed before the signal. See [Cancellation](#cancellation). | -| `guardrail_check` | Guardrail evaluation result | +| `guardrail_check` | Guardrail mask / block / warn decision. Carries `fields.direction` (`inbound` / `outbound` / `tool_output`), `fields.decision` (`masked` / `warned` / `blocked`), `fields.guardrail` + `fields.category` from the triggering violation, and `fields.violation_count`. With `FORGE_GUARDRAIL_CAPTURE_EVIDENCE=true` operators also opt into `fields.evidence` carrying the redacted + truncated triggering text. See [Guardrails — Audit Events](guardrails.md#audit-events). | | `auth_verify` | Inbound request authenticated successfully (with `provider`, `user_id`, `org_id`, `token_kind`) | | `auth_fail` | Inbound request rejected (with `reason`, `token_kind`) | | `agent_card_published` | Agent Card finalized at startup or hot-reload (with `name`, `version`, `protocol_version`, `url`, `skill_count`, `capabilities`, `security_schemes`, `card_size_bytes`, `card_sha256`). 
See [Agent Card reference](../reference/a2a-agent-card.md). | diff --git a/docs/security/guardrails.md b/docs/security/guardrails.md index 79755c6..576f710 100644 --- a/docs/security/guardrails.md +++ b/docs/security/guardrails.md @@ -503,12 +503,81 @@ The `cli_execute` tool blocks arguments containing `file://` URLs (case-insensit ## Audit Events -Guardrail evaluations are logged as structured audit events: +Every mask / block / warn decision emits a `guardrail_check` audit +event through the configured Forge audit sink stack (stderr safety +net + the optional Unix socket / HTTP sink wired via +`FORGE_AUDIT_SOCKET` / `FORGE_AUDIT_HTTP_ENDPOINT`). The event +carries the per-invocation `correlation_id`, `task_id`, sequence +number, and workflow-correlation tags so consumers can join it to +the `session_start` / `llm_call` / `invocation_complete` rows for +the same request. + +Default shape (metadata-only): ```json -{"ts":"2026-02-28T10:00:00Z","event":"guardrail_check","correlation_id":"a1b2c3d4","fields":{"guardrail":"pii","direction":"inbound","result":"masked"}} +{ + "ts": "2026-06-14T10:00:00Z", + "event": "guardrail_check", + "schema_version": "1.0", + "seq": 2, + "correlation_id": "a1b2c3d4", + "task_id": "slack-...", + "fields": { + "direction": "inbound", + "decision": "masked", + "guardrail": "pii", + "category": "ssn", + "violation_count": 1 + } +} ``` -In DB mode, the guardrails library writes audit records to MongoDB automatically when `EnableAudit` is set. - -See [Security Overview](overview.md) for the full security architecture. 
+Field reference: + +| Field | Values | Meaning | +|-------|--------|---------| +| `direction` | `inbound` / `outbound` / `tool_output` | Which gate fired | +| `decision` | `masked` / `warned` / `blocked` | Library decision after policy resolution | +| `guardrail` | `pii` / `moderation` / `security` / `none` / … | First violation's `Type` (`none` when violations list is empty) | +| `category` | `ssn` / `email` / `hate_speech` / … | First violation's `Category`; omitted when empty | +| `violation_count` | integer ≥ 0 | Length of `result.Violations` | +| `tool` | string | Tool name; present only when `direction=tool_output` | +| `evidence` | string | Captured triggering text; present only when opt-in is on (see below) | + +### Evidence capture (opt-in) + +The default posture is **metadata-only**: the offending text never +travels through the audit pipeline. Operators who need it (false- +positive triage, compliance evidence, pattern tuning) opt in per- +deployment via: + +| Env var | Default | Meaning | +|---------|---------|---------| +| `FORGE_GUARDRAIL_CAPTURE_EVIDENCE` | `false` | Include `fields.evidence` in the emitted event | +| `FORGE_GUARDRAIL_REDACT` | `true` | Run a vendor-secret regex scrub over the captured evidence before emission | +| `FORGE_GUARDRAIL_MAX_BYTES` | `4096` | Per-event soft cap; overage is truncated with a `…[truncated:N]` marker | + +`Redact` is on whenever `CaptureEvidence` is on unless you explicitly +disable it. The scrub matches obvious vendor token shapes (Anthropic +`sk-ant-…`, OpenAI `sk-…`, GitHub `ghp_/gho_/ghs_/github_pat_…`, AWS +`AKIA…`, Slack `xox[bp]-…`, private-key PEM headers, Telegram bot +tokens) and replaces each match with `[REDACTED]`. It is defense- +in-depth — the guardrail library has usually already masked these, +but an unmasked input that hit a different rule (e.g. moderation) +would otherwise carry secrets through verbatim. 
+ +The size envelope and `[REDACTED]` marker match the OTel span +content-capture pipeline (issue #130) so the same string travels +through both pipelines under one contract. + +### Mode-specific behavior + +- **File mode** — every event flows through the Forge audit pipeline. +- **DB mode** — the guardrails library also writes audit records to + MongoDB when `EnableAudit` is set. Forge still emits the + `guardrail_check` event on its own audit sinks so SIEM consumers + reading the export socket see parity regardless of mode. + +See [Security Overview](overview.md) for the full security architecture +and [Audit Logging](audit-logging.md) for the sink stack and schema +contract. diff --git a/forge-cli/runtime/guardrails_audit.go b/forge-cli/runtime/guardrails_audit.go new file mode 100644 index 0000000..5b7ac1c --- /dev/null +++ b/forge-cli/runtime/guardrails_audit.go @@ -0,0 +1,178 @@ +package runtime + +import ( + "context" + "os" + "regexp" + "strconv" + + "github.com/initializ/guardrails" + + coreruntime "github.com/initializ/forge/forge-core/runtime" +) + +// GuardrailAuditConfig controls how the LibraryGuardrailEngine emits +// guardrail_check audit events. The default zero value preserves the +// pre-#155 metadata-only posture: an emitted event carries direction, +// decision, guardrail type, and violation count, but never the raw +// content that triggered the rule. +// +// Operators who need the offending text (to tune patterns, debug +// false positives, or satisfy compliance evidence requirements) opt +// in by flipping CaptureEvidence to true. The Redact knob is on by +// default and runs an obvious-secret scrub even on the captured +// evidence, so a leaked API key in a prompt does not get re-published +// into the audit stream verbatim. MaxBytes bounds the captured +// substring per event; zero falls back to DefaultGuardrailEvidenceCapBytes. 
+// +// Same posture as the #130 OTel content-capture work: default off, +// opt-in per-deployment, redact-then-truncate when on. +type GuardrailAuditConfig struct { + // CaptureEvidence includes the raw triggering content in the + // emitted guardrail_check event's `fields.evidence`. OFF by default. + CaptureEvidence bool + + // Redact runs a known-secret regex pass on the captured evidence + // before truncation. GuardrailAuditConfigFromEnv defaults it to ON; the struct zero value is OFF, so configs built as literals must set it explicitly. Disable only when consuming + // in an environment that has its own scrubbing layer (e.g. a + // platform-side SIEM normalizer). + Redact bool + + // MaxBytes is the soft cap on the captured evidence string. Zero + // uses DefaultGuardrailEvidenceCapBytes (4 KiB). + MaxBytes int +} + +// DefaultGuardrailEvidenceCapBytes is the per-event cap for captured +// evidence when GuardrailAuditConfig.MaxBytes is unset. 4 KiB matches +// the OTel span attribute soft cap so the same content travels through +// both pipelines under the same size envelope. +const DefaultGuardrailEvidenceCapBytes = 4 << 10 + +// Environment variable names mirror the existing audit/export pattern. +// The CLI surfaces these via run/serve flags or operators can set them +// directly on the agent process. +const ( + EnvGuardrailCaptureEvidence = "FORGE_GUARDRAIL_CAPTURE_EVIDENCE" + EnvGuardrailRedact = "FORGE_GUARDRAIL_REDACT" + EnvGuardrailMaxBytes = "FORGE_GUARDRAIL_MAX_BYTES" +) + +// GuardrailAuditConfigFromEnv reads the env vars and returns a populated +// config. Redact defaults to true so flipping CaptureEvidence on without +// touching Redact preserves the safer posture. 
+func GuardrailAuditConfigFromEnv() GuardrailAuditConfig { + cfg := GuardrailAuditConfig{Redact: true} + if v := os.Getenv(EnvGuardrailCaptureEvidence); v != "" { + if b, err := strconv.ParseBool(v); err == nil { + cfg.CaptureEvidence = b + } + } + if v := os.Getenv(EnvGuardrailRedact); v != "" { + if b, err := strconv.ParseBool(v); err == nil { + cfg.Redact = b + } + } + if v := os.Getenv(EnvGuardrailMaxBytes); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + cfg.MaxBytes = n + } + } + return cfg +} + +// secretRedactPatterns are the vendor token shapes scrubbed when +// GuardrailAuditConfig.Redact is on. Same set as the OTel content +// redaction pass (issue #130) so the audit and trace pipelines stay +// consistent. Defence-in-depth only: the guardrail library may already +// have masked these, but an unmasked input that hit a different rule +// (e.g. moderation) would otherwise carry secrets through verbatim. +var secretRedactPatterns = []*regexp.Regexp{ + regexp.MustCompile(`sk-ant-[A-Za-z0-9\-]{20,}`), + regexp.MustCompile(`sk-[A-Za-z0-9]{20,}`), + regexp.MustCompile(`ghp_[A-Za-z0-9]{36}`), + regexp.MustCompile(`gho_[A-Za-z0-9]{36}`), + regexp.MustCompile(`ghs_[A-Za-z0-9]{36}`), + regexp.MustCompile(`github_pat_[A-Za-z0-9_]{22,}`), + regexp.MustCompile(`AKIA[0-9A-Z]{16}`), + regexp.MustCompile(`xox[bp]-[0-9]{10,}-[A-Za-z0-9-]+`), + regexp.MustCompile(`-----BEGIN (RSA|EC|OPENSSH|PRIVATE) .*KEY-----`), + regexp.MustCompile(`[0-9]{8,10}:[A-Za-z0-9_-]{35,}`), +} + +// redactSecrets replaces any known secret-token shape with [REDACTED]. +// Mirrors the marker used by the FWS-8 capture path so audit consumers +// see one consistent token across both pipelines. +func redactSecrets(s string) string { + for _, re := range secretRedactPatterns { + s = re.ReplaceAllString(s, "[REDACTED]") + } + return s +} + +// prepareEvidence applies redact (if on) then byte-truncates to the +// configured cap. 
Returns "" when input is "" so callers can drop the +// field cleanly. +func prepareEvidence(s string, cfg GuardrailAuditConfig) string { + if s == "" { + return "" + } + if cfg.Redact { + s = redactSecrets(s) + } + cap := cfg.MaxBytes + if cap <= 0 { + cap = DefaultGuardrailEvidenceCapBytes + } + return coreruntime.TruncateForAudit(s, cap) +} + +// emitGuardrailEvent builds and emits a guardrail_check audit event for +// one mask/block/warn decision. Routed through EmitFromContext so the +// per-invocation correlation_id, task_id, sequence number, and workflow +// tags auto-attach from the request context. +// +// Behavior matrix: +// +// - audit logger nil → no-op (DB mode with platform-side audit only, +// or unit tests with no logger wired) +// - res nil → no-op (defensive; emit only when we have a +// guardrail Result to summarize) +// - CaptureEvidence on AND content non-empty → fields.evidence is +// set (redacted + truncated per cfg) +// - CaptureEvidence off → fields.evidence omitted entirely +func (e *LibraryGuardrailEngine) emitGuardrailEvent( + ctx context.Context, + direction, tool, content string, + decision string, + res *guardrails.Result, +) { + if e.auditLogger == nil || res == nil { + return + } + fields := map[string]any{ + "direction": direction, + "decision": decision, + "violation_count": len(res.Violations), + } + if len(res.Violations) > 0 { + fields["guardrail"] = res.Violations[0].Type + if cat := res.Violations[0].Category; cat != "" { + fields["category"] = cat + } + } else { + fields["guardrail"] = "none" + } + if tool != "" { + fields["tool"] = tool + } + if e.auditCfg.CaptureEvidence { + if ev := prepareEvidence(content, e.auditCfg); ev != "" { + fields["evidence"] = ev + } + } + e.auditLogger.EmitFromContext(ctx, coreruntime.AuditEvent{ + Event: coreruntime.AuditGuardrail, + Fields: fields, + }) +} diff --git a/forge-cli/runtime/guardrails_engine.go b/forge-cli/runtime/guardrails_engine.go index 61c131f..a622d50 100644 --- 
a/forge-cli/runtime/guardrails_engine.go +++ b/forge-cli/runtime/guardrails_engine.go @@ -15,10 +15,28 @@ import ( "go.mongodb.org/mongo-driver/mongo/options" ) +// Decision-string constants for the guardrail_check audit event's `fields.decision`. Operators +// group by these values in their SIEM pipeline; keep the set small and +// stable. Map onto library decisions: +// +// DecisionMask → "masked" +// DecisionBlock (warn mode) → "warned" +// DecisionBlock (enforce mode) → "blocked" +const ( + guardrailResultMasked = "masked" + guardrailResultWarned = "warned" + guardrailResultBlocked = "blocked" +) + // LibraryGuardrailEngine implements coreruntime.GuardrailChecker using the // github.com/initializ/guardrails library. It supports two modes: // - File mode: uses StructuredGuardrails loaded from guardrails.json // - DB mode: loads config from MongoDB (set via FORGE_GUARDRAILS_DB env) +// +// On every mask/block/warn decision the engine emits a guardrail_check audit +// event through auditLogger (when wired). The auditCfg knob controls +// whether the offending content is captured as evidence; default off. +// See issue #155. type LibraryGuardrailEngine struct { manager *guardrails.GuardrailManager structured *models.StructuredGuardrails @@ -28,6 +46,8 @@ type LibraryGuardrailEngine struct { orgID string configVersion int64 logger coreruntime.Logger + auditLogger *coreruntime.AuditLogger + auditCfg GuardrailAuditConfig } // NewFileGuardrailEngine creates a guardrail engine backed by a local @@ -80,6 +100,19 @@ func NewDBGuardrailEngine(mongoURI, agentID, orgID string, enforce bool, logger }, nil } +// WithAuditLogger wires an AuditLogger and capture config so the engine +// can emit guardrail_check events on every mask/block/warn decision. +// Returns the receiver for fluent construction. When auditLogger is nil +// the engine is silent on the audit pipeline (legacy behavior — only +// the ops logger sees the redaction line). 
Callers in the runner pass +// the same AuditLogger they hand to the A2A handlers so events share +// the configured sink stack. +func (e *LibraryGuardrailEngine) WithAuditLogger(al *coreruntime.AuditLogger, cfg GuardrailAuditConfig) *LibraryGuardrailEngine { + e.auditLogger = al + e.auditCfg = cfg + return e +} + // structuredIfFileMode returns the StructuredGuardrails pointer only in file // mode. In DB mode the library loads config from MongoDB automatically. func (e *LibraryGuardrailEngine) structuredIfFileMode() *models.StructuredGuardrails { @@ -90,13 +123,13 @@ func (e *LibraryGuardrailEngine) structuredIfFileMode() *models.StructuredGuardr } // CheckInbound validates an inbound (user) message via the library's InputGate. -func (e *LibraryGuardrailEngine) CheckInbound(msg *a2a.Message) error { +func (e *LibraryGuardrailEngine) CheckInbound(ctx context.Context, msg *a2a.Message) error { text := coreruntime.ExtractText(msg) if text == "" { return nil } - result, err := e.manager.InputGate(context.Background(), guardrails.InputRequest{ + result, err := e.manager.InputGate(ctx, guardrails.InputRequest{ Content: text, EntityID: e.agentID, OrgID: e.orgID, @@ -121,29 +154,33 @@ func (e *LibraryGuardrailEngine) CheckInbound(msg *a2a.Message) error { e.logger.Info("inbound guardrail redaction applied", map[string]any{ "direction": "inbound", }) + e.emitGuardrailEvent(ctx, "inbound", "", text, guardrailResultMasked, result) } case guardrails.DecisionBlock: desc := violationSummary(result) if e.enforce { + e.emitGuardrailEvent(ctx, "inbound", "", text, guardrailResultBlocked, result) return fmt.Errorf("input blocked: %s", desc) } e.logger.Warn("guardrail input violation (warn mode)", map[string]any{ "direction": "inbound", "detail": desc, }) + e.emitGuardrailEvent(ctx, "inbound", "", text, guardrailResultWarned, result) } return nil } // CheckOutbound validates an outbound (agent) message via the library's OutputGate. 
// Masked content is applied in-place; blocked content returns an error only in enforce mode. -func (e *LibraryGuardrailEngine) CheckOutbound(msg *a2a.Message) error { +func (e *LibraryGuardrailEngine) CheckOutbound(ctx context.Context, msg *a2a.Message) error { for i, p := range msg.Parts { if p.Kind != a2a.PartKindText || p.Text == "" { continue } - result, err := e.manager.OutputGate(context.Background(), guardrails.OutputRequest{ + original := p.Text + result, err := e.manager.OutputGate(ctx, guardrails.OutputRequest{ Content: p.Text, EntityID: e.agentID, OrgID: e.orgID, @@ -163,16 +200,19 @@ func (e *LibraryGuardrailEngine) CheckOutbound(msg *a2a.Message) error { e.logger.Warn("outbound guardrail redaction applied", map[string]any{ "direction": "outbound", }) + e.emitGuardrailEvent(ctx, "outbound", "", original, guardrailResultMasked, result) } case guardrails.DecisionBlock: desc := violationSummary(result) if e.enforce { + e.emitGuardrailEvent(ctx, "outbound", "", original, guardrailResultBlocked, result) return fmt.Errorf("output blocked: %s", desc) } e.logger.Warn("guardrail output violation (warn mode)", map[string]any{ "direction": "outbound", "detail": desc, }) + e.emitGuardrailEvent(ctx, "outbound", "", original, guardrailResultWarned, result) } } return nil @@ -180,12 +220,12 @@ func (e *LibraryGuardrailEngine) CheckOutbound(msg *a2a.Message) error { // CheckToolOutput scans tool output text via the library's OutputGate. // Returns the (possibly masked) text and any blocking error. 
-func (e *LibraryGuardrailEngine) CheckToolOutput(toolName, text string) (string, error) { +func (e *LibraryGuardrailEngine) CheckToolOutput(ctx context.Context, toolName, text string) (string, error) { if text == "" { return text, nil } - result, err := e.manager.OutputGate(context.Background(), guardrails.OutputRequest{ + result, err := e.manager.OutputGate(ctx, guardrails.OutputRequest{ Content: text, EntityID: e.agentID, OrgID: e.orgID, @@ -210,11 +250,13 @@ func (e *LibraryGuardrailEngine) CheckToolOutput(toolName, text string) (string, "tool": toolName, "detail": "content redacted", }) + e.emitGuardrailEvent(ctx, "tool_output", toolName, text, guardrailResultMasked, result) return result.MaskedContent, nil } case guardrails.DecisionBlock: desc := violationSummary(result) if e.enforce { + e.emitGuardrailEvent(ctx, "tool_output", toolName, text, guardrailResultBlocked, result) return "", fmt.Errorf("tool output blocked: %s", desc) } e.logger.Warn("guardrail tool output violation (warn mode)", map[string]any{ @@ -222,6 +264,7 @@ func (e *LibraryGuardrailEngine) CheckToolOutput(toolName, text string) (string, "tool": toolName, "detail": desc, }) + e.emitGuardrailEvent(ctx, "tool_output", toolName, text, guardrailResultWarned, result) } return text, nil diff --git a/forge-cli/runtime/guardrails_engine_test.go b/forge-cli/runtime/guardrails_engine_test.go index 607fd97..139e73d 100644 --- a/forge-cli/runtime/guardrails_engine_test.go +++ b/forge-cli/runtime/guardrails_engine_test.go @@ -1,9 +1,12 @@ package runtime import ( + "bytes" + "context" "encoding/json" "os" "path/filepath" + "strings" "testing" "github.com/initializ/guardrails/models" @@ -44,13 +47,14 @@ func TestFileGuardrailEngine_CheckInbound(t *testing.T) { Role: "user", Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: "Hello, how are you?"}}, } - if err := engine.CheckInbound(msg); err != nil { + ctx := context.Background() + if err := engine.CheckInbound(ctx, msg); err != nil { t.Errorf("normal 
message should pass inbound check: %v", err) } // Empty message should pass emptyMsg := &a2a.Message{Role: "user"} - if err := engine.CheckInbound(emptyMsg); err != nil { + if err := engine.CheckInbound(ctx, emptyMsg); err != nil { t.Errorf("empty message should pass inbound check: %v", err) } } @@ -68,7 +72,7 @@ func TestFileGuardrailEngine_CheckOutbound(t *testing.T) { Role: "agent", Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: "Here is the result."}}, } - if err := engine.CheckOutbound(msg); err != nil { + if err := engine.CheckOutbound(context.Background(), msg); err != nil { t.Errorf("normal message should pass outbound check: %v", err) } } @@ -81,8 +85,9 @@ func TestFileGuardrailEngine_CheckToolOutput(t *testing.T) { t.Fatalf("NewFileGuardrailEngine() error: %v", err) } + ctx := context.Background() // Normal text should pass through - out, err := engine.CheckToolOutput("some_tool", "some normal output") + out, err := engine.CheckToolOutput(ctx, "some_tool", "some normal output") if err != nil { t.Errorf("normal output should pass: %v", err) } @@ -91,7 +96,7 @@ func TestFileGuardrailEngine_CheckToolOutput(t *testing.T) { } // Empty text should pass through - out, err = engine.CheckToolOutput("some_tool", "") + out, err = engine.CheckToolOutput(ctx, "some_tool", "") if err != nil { t.Errorf("empty output should pass: %v", err) } @@ -103,7 +108,7 @@ func TestFileGuardrailEngine_CheckToolOutput(t *testing.T) { // TestBuildGuardrailChecker_FileMode tests the builder with file-based config. 
func TestBuildGuardrailChecker_FileMode(t *testing.T) { logger := &grTestLogger{} - checker := BuildGuardrailChecker(nil, "/nonexistent", false, logger) + checker := BuildGuardrailChecker(nil, "/nonexistent", false, logger, nil, GuardrailAuditConfig{}) if checker == nil { t.Fatal("BuildGuardrailChecker should return a non-nil checker") } @@ -113,11 +118,145 @@ func TestBuildGuardrailChecker_FileMode(t *testing.T) { Role: "user", Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: "hello"}}, } - if err := checker.CheckInbound(msg); err != nil { + if err := checker.CheckInbound(context.Background(), msg); err != nil { t.Errorf("default checker should pass normal message: %v", err) } } +// TestLibraryGuardrailEngine_EmitsAuditOnInboundMask verifies the engine +// emits a guardrail_check event on the configured audit logger when an +// inbound message triggers a mask decision, and that capturing evidence +// surfaces the offending text (redacted + truncated) in fields.evidence. +func TestLibraryGuardrailEngine_EmitsAuditOnInboundMask(t *testing.T) { + sg := DefaultStructuredGuardrails() + engine, err := NewFileGuardrailEngine(sg, false, &grTestLogger{}) + if err != nil { + t.Fatalf("NewFileGuardrailEngine: %v", err) + } + + var buf bytes.Buffer + al := coreruntime.NewAuditLogger(&buf) + engine.WithAuditLogger(al, GuardrailAuditConfig{CaptureEvidence: true, Redact: true}) + + msg := &a2a.Message{ + Role: "user", + Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: "my email is foo@example.com please verify"}}, + } + if err := engine.CheckInbound(context.Background(), msg); err != nil { + t.Fatalf("CheckInbound: %v", err) + } + + out := buf.String() + if !strings.Contains(out, `"event":"guardrail_check"`) { + t.Errorf("expected guardrail_check event, got: %s", out) + } + if !strings.Contains(out, `"direction":"inbound"`) { + t.Errorf("expected direction=inbound, got: %s", out) + } + if !strings.Contains(out, `"decision":"masked"`) { + t.Errorf("expected decision=masked, got: 
%s", out) + } + if !strings.Contains(out, `"evidence"`) { + t.Errorf("expected evidence field with CaptureEvidence=true, got: %s", out) + } + if !strings.Contains(out, "foo@example.com") { + t.Errorf("expected raw email in evidence, got: %s", out) + } +} + +// TestLibraryGuardrailEngine_OmitsEvidenceByDefault verifies the +// metadata-only posture: CaptureEvidence=false (the zero value) means +// fields.evidence is absent even when a mask fires. +func TestLibraryGuardrailEngine_OmitsEvidenceByDefault(t *testing.T) { + sg := DefaultStructuredGuardrails() + engine, err := NewFileGuardrailEngine(sg, false, &grTestLogger{}) + if err != nil { + t.Fatalf("NewFileGuardrailEngine: %v", err) + } + + var buf bytes.Buffer + al := coreruntime.NewAuditLogger(&buf) + engine.WithAuditLogger(al, GuardrailAuditConfig{}) // CaptureEvidence off + + msg := &a2a.Message{ + Role: "user", + Parts: []a2a.Part{{Kind: a2a.PartKindText, Text: "my email is foo@example.com"}}, + } + if err := engine.CheckInbound(context.Background(), msg); err != nil { + t.Fatalf("CheckInbound: %v", err) + } + + out := buf.String() + if !strings.Contains(out, `"event":"guardrail_check"`) { + t.Errorf("expected guardrail_check event, got: %s", out) + } + if strings.Contains(out, `"evidence"`) { + t.Errorf("evidence MUST be omitted when CaptureEvidence=false, got: %s", out) + } + if strings.Contains(out, "foo@example.com") { + t.Errorf("raw content MUST NOT leak when CaptureEvidence=false, got: %s", out) + } +} + +// TestPrepareEvidence verifies the redact + truncate pipeline that +// runs over captured evidence before it lands in fields.evidence. +// Exercises both knobs independently of the guardrails library decision +// path — that path is covered by the EmitsAuditOnInboundMask test. 
+func TestPrepareEvidence(t *testing.T) { + tests := []struct { + name string + in string + cfg GuardrailAuditConfig + want string + }{ + { + name: "empty input returns empty", + in: "", + cfg: GuardrailAuditConfig{Redact: true}, + want: "", + }, + { + name: "redact-off leaves anthropic token intact", + in: "leak: sk-ant-abcdefghijklmnopqrstuvwxyz123", + cfg: GuardrailAuditConfig{Redact: false}, + want: "leak: sk-ant-abcdefghijklmnopqrstuvwxyz123", + }, + { + name: "redact-on scrubs anthropic token to marker", + in: "leak: sk-ant-abcdefghijklmnopqrstuvwxyz123", + cfg: GuardrailAuditConfig{Redact: true}, + want: "leak: [REDACTED]", + }, + { + name: "redact-on scrubs github pat", + in: "leak: ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + cfg: GuardrailAuditConfig{Redact: true}, + want: "leak: [REDACTED]", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := prepareEvidence(tt.in, tt.cfg) + if got != tt.want { + t.Errorf("prepareEvidence(%q) = %q, want %q", tt.in, got, tt.want) + } + }) + } +} + +// TestPrepareEvidence_TruncatesAtCap verifies the byte cap activates +// when input exceeds MaxBytes; the truncation marker is appended. +func TestPrepareEvidence_TruncatesAtCap(t *testing.T) { + in := strings.Repeat("x", 200) + got := prepareEvidence(in, GuardrailAuditConfig{Redact: false, MaxBytes: 50}) + if len(got) >= 200 { + t.Errorf("expected truncated output, got length %d", len(got)) + } + if !strings.Contains(got, "[truncated:") { + t.Errorf("expected truncation marker, got: %q", got) + } +} + // TestLoadGuardrailsJSON tests parsing a guardrails.json file. 
func TestLoadGuardrailsJSON(t *testing.T) { dir := t.TempDir() diff --git a/forge-cli/runtime/guardrails_loader.go b/forge-cli/runtime/guardrails_loader.go index 6959deb..28331e5 100644 --- a/forge-cli/runtime/guardrails_loader.go +++ b/forge-cli/runtime/guardrails_loader.go @@ -40,7 +40,19 @@ func DefaultPolicyScaffold() *agentspec.PolicyScaffold { // BuildGuardrailChecker creates the guardrail engine based on configuration. // Priority: FORGE_GUARDRAILS_DB env → guardrails.json file → defaults. -func BuildGuardrailChecker(cfg *types.ForgeConfig, workDir string, enforce bool, logger coreruntime.Logger) coreruntime.GuardrailChecker { +// +// auditLogger and auditCfg are wired into the resulting engine so every +// mask/block/warn decision emits a guardrail_check event through the +// same sink stack the A2A handlers use. When auditLogger is nil the +// engine is silent on the audit pipeline (used by tests). +func BuildGuardrailChecker(cfg *types.ForgeConfig, workDir string, enforce bool, logger coreruntime.Logger, auditLogger *coreruntime.AuditLogger, auditCfg GuardrailAuditConfig) coreruntime.GuardrailChecker { + attach := func(e *LibraryGuardrailEngine) coreruntime.GuardrailChecker { + if auditLogger != nil { + e.WithAuditLogger(auditLogger, auditCfg) + } + return e + } + // DB mode: connect to MongoDB for config + audit if mongoURI := os.Getenv("FORGE_GUARDRAILS_DB"); mongoURI != "" { agentID := os.Getenv("FORGE_AGENT_ID") @@ -53,7 +65,7 @@ func BuildGuardrailChecker(cfg *types.ForgeConfig, workDir string, enforce bool, logger.Info("guardrails: using MongoDB-backed config", map[string]any{ "agent_id": agentID, }) - return engine + return attach(engine) } logger.Warn("failed to connect guardrails DB, falling back to file", map[string]any{ "error": err.Error(), @@ -73,7 +85,7 @@ func BuildGuardrailChecker(cfg *types.ForgeConfig, workDir string, enforce bool, }) return &coreruntime.NoopGuardrailChecker{} } - return engine + return attach(engine) } // 
LoadGuardrailsJSON reads guardrails.json from the project directory. diff --git a/forge-cli/runtime/runner.go b/forge-cli/runtime/runner.go index 6055c74..d8c91d0 100644 --- a/forge-cli/runtime/runner.go +++ b/forge-cli/runtime/runner.go @@ -280,10 +280,7 @@ func (r *Runner) Run(ctx context.Context) error { return err } - // 2. Build guardrail checker (DB mode → file mode → defaults) - guardrails := BuildGuardrailChecker(r.cfg.Config, r.cfg.WorkDir, r.cfg.EnforceGuardrails, r.logger) - - // Still load scaffold for SkillGuardrails (separate concern) + // 2. Still load scaffold for SkillGuardrails (separate concern) scaffold, err := LoadPolicyScaffold(r.cfg.WorkDir) if err != nil { r.logger.Warn("failed to load policy scaffold", map[string]any{"error": err.Error()}) @@ -311,6 +308,13 @@ func (r *Runner) Run(ctx context.Context) error { // pre-FWS-7 compatible. auditLogger := coreruntime.NewAuditLoggerFromConfig(r.cfg.AuditExport) auditLogger.SetOpsLogger(r.logger) + + // 4a. Build guardrail checker (DB mode → file mode → defaults) and + // wire the audit logger so every mask/block/warn decision lands on + // the configured audit sinks as a guardrail_check event. Capture- + // evidence posture comes from env (FORGE_GUARDRAIL_*), default + // metadata-only. See issue #155. + guardrails := BuildGuardrailChecker(r.cfg.Config, r.cfg.WorkDir, r.cfg.EnforceGuardrails, r.logger, auditLogger, GuardrailAuditConfigFromEnv()) // Periodic audit_export_status — one event every 60s with per-sink // health counters. Operators tail the audit stream to answer // "is my sidecar healthy?". 
The stop func blocks until the @@ -1163,7 +1167,7 @@ func (r *Runner) registerHandlers(srv *server.Server, executor coreruntime.Agent server.WriteSSEEvent(w, flusher, "status", task) //nolint:errcheck // Guardrail check inbound - if err := guardrails.CheckInbound(&params.Message); err != nil { + if err := guardrails.CheckInbound(ctx, &params.Message); err != nil { task.Status = a2a.TaskStatus{ State: a2a.TaskStateFailed, Message: &a2a.Message{ @@ -1233,7 +1237,7 @@ func (r *Runner) registerHandlers(srv *server.Server, executor coreruntime.Agent var finalState a2a.TaskState for respMsg := range ch { // Guardrail check outbound - if grErr := guardrails.CheckOutbound(respMsg); grErr != nil { + if grErr := guardrails.CheckOutbound(ctx, respMsg); grErr != nil { task.Status = a2a.TaskStatus{ State: a2a.TaskStateFailed, Message: &a2a.Message{ @@ -1400,7 +1404,7 @@ func (r *Runner) executeTask( auditLogger.EmitInvocationComplete(ctx, snap.InvocationDuration, fields) } - if err := guardrails.CheckInbound(&params.Message); err != nil { + if err := guardrails.CheckInbound(ctx, &params.Message); err != nil { task.Status = a2a.TaskStatus{ State: a2a.TaskStateFailed, Message: &a2a.Message{ @@ -1471,7 +1475,7 @@ func (r *Runner) executeTask( } if respMsg != nil { - if err := guardrails.CheckOutbound(respMsg); err != nil { + if err := guardrails.CheckOutbound(ctx, respMsg); err != nil { task.Status = a2a.TaskStatus{ State: a2a.TaskStateFailed, Message: &a2a.Message{ @@ -1662,7 +1666,7 @@ func (r *Runner) registerRESTHandlers(srv *server.Server, executor coreruntime.A store.Put(task) server.WriteSSEEvent(w, flusher, "status", task) //nolint:errcheck - if err := guardrails.CheckInbound(&params.Message); err != nil { + if err := guardrails.CheckInbound(ctx, &params.Message); err != nil { task.Status = a2a.TaskStatus{ State: a2a.TaskStateFailed, Message: &a2a.Message{ @@ -1726,7 +1730,7 @@ func (r *Runner) registerRESTHandlers(srv *server.Server, executor coreruntime.A var finalState a2a.TaskState for
respMsg := range ch { - if grErr := guardrails.CheckOutbound(respMsg); grErr != nil { + if grErr := guardrails.CheckOutbound(ctx, respMsg); grErr != nil { task.Status = a2a.TaskStatus{ State: a2a.TaskStateFailed, Message: &a2a.Message{ @@ -2035,11 +2039,11 @@ func (r *Runner) registerProgressHooks(hooks *coreruntime.HookRegistry) { // registerGuardrailHooks registers an AfterToolExec hook that scans tool output // for secrets and PII, redacting or blocking based on guardrail mode. func (r *Runner) registerGuardrailHooks(hooks *coreruntime.HookRegistry, guardrails coreruntime.GuardrailChecker) { - hooks.Register(coreruntime.AfterToolExec, func(_ context.Context, hctx *coreruntime.HookContext) error { + hooks.Register(coreruntime.AfterToolExec, func(ctx context.Context, hctx *coreruntime.HookContext) error { if hctx.ToolOutput == "" { return nil } - redacted, err := guardrails.CheckToolOutput(hctx.ToolName, hctx.ToolOutput) + redacted, err := guardrails.CheckToolOutput(ctx, hctx.ToolName, hctx.ToolOutput) if err != nil { return err } diff --git a/forge-core/runtime/guardrails.go b/forge-core/runtime/guardrails.go index 3a45830..98222c3 100644 --- a/forge-core/runtime/guardrails.go +++ b/forge-core/runtime/guardrails.go @@ -1,6 +1,7 @@ package runtime import ( + "context" "strings" "github.com/initializ/forge/forge-core/a2a" @@ -8,26 +9,30 @@ import ( // GuardrailChecker validates messages and tool output against guardrail policies. // Implementations may use file-based config, database-backed config, or no-op passthrough. +// +// All three Check methods accept a context so implementations can route audit +// emissions through AuditLogger.EmitFromContext and inherit correlation_id, +// task_id, and workflow-correlation tags from the inbound request scope. type GuardrailChecker interface { // CheckInbound validates an inbound (user) message against guardrails. 
- CheckInbound(msg *a2a.Message) error + CheckInbound(ctx context.Context, msg *a2a.Message) error // CheckOutbound validates an outbound (agent) message against guardrails. // Implementations should prefer redacting sensitive content over blocking. - CheckOutbound(msg *a2a.Message) error + CheckOutbound(ctx context.Context, msg *a2a.Message) error // CheckToolOutput scans tool output text against configured guardrails. // Returns the (possibly redacted) text and any blocking error. - CheckToolOutput(toolName, text string) (string, error) + CheckToolOutput(ctx context.Context, toolName, text string) (string, error) } // NoopGuardrailChecker is a passthrough implementation that performs no checks. // Used as a fallback when no guardrail configuration is available. type NoopGuardrailChecker struct{} -func (n *NoopGuardrailChecker) CheckInbound(_ *a2a.Message) error { return nil } -func (n *NoopGuardrailChecker) CheckOutbound(_ *a2a.Message) error { return nil } -func (n *NoopGuardrailChecker) CheckToolOutput(_ string, text string) (string, error) { +func (n *NoopGuardrailChecker) CheckInbound(_ context.Context, _ *a2a.Message) error { return nil } +func (n *NoopGuardrailChecker) CheckOutbound(_ context.Context, _ *a2a.Message) error { return nil } +func (n *NoopGuardrailChecker) CheckToolOutput(_ context.Context, _ string, text string) (string, error) { return text, nil } diff --git a/forge-core/runtime/guardrails_test.go b/forge-core/runtime/guardrails_test.go index 8a4a259..066a8c8 100644 --- a/forge-core/runtime/guardrails_test.go +++ b/forge-core/runtime/guardrails_test.go @@ -1,6 +1,7 @@ package runtime import ( + "context" "testing" "github.com/initializ/forge/forge-core/a2a" @@ -30,15 +31,16 @@ func TestNoopGuardrailChecker_ImplementsInterface(t *testing.T) { }, } - if err := checker.CheckInbound(msg); err != nil { + ctx := context.Background() + if err := checker.CheckInbound(ctx, msg); err != nil { t.Errorf("NoopGuardrailChecker.CheckInbound() unexpected 
error: %v", err) } - if err := checker.CheckOutbound(msg); err != nil { + if err := checker.CheckOutbound(ctx, msg); err != nil { t.Errorf("NoopGuardrailChecker.CheckOutbound() unexpected error: %v", err) } - out, err := checker.CheckToolOutput("some_tool", "some text") + out, err := checker.CheckToolOutput(ctx, "some_tool", "some text") if err != nil { t.Errorf("NoopGuardrailChecker.CheckToolOutput() unexpected error: %v", err) } diff --git a/go.work.sum b/go.work.sum index 8787c34..f54e550 100644 --- a/go.work.sum +++ b/go.work.sum @@ -1,23 +1,38 @@ +cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= +cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0/go.mod h1:P4WPRUkOhJC13W//jWpyfJNDAIpvRbAUIYLX/4jtlE0= github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE= +github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/aymanbagabas/go-udiff v0.3.1/go.mod h1:G0fsKmG+P6ylD0r6N/KgQD/nWzgfnl8ZBcNLgcbrw8E= github.com/bits-and-blooms/bitset v1.24.4/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/charmbracelet/harmonica v0.2.0/go.mod h1:KSri/1RMQOZLbw7AHqgcBycp8pgJnQMYYT8QZRqZ1Ao= github.com/charmbracelet/x/exp/golden v0.0.0-20241011142426-46044092ad91/go.mod h1:wDlXFlCrmJ8J+swcL/MnGUuYnqgQdW9rhSD61oNMb6U= +github.com/cncf/xds/go v0.0.0-20260202195803-dba9d589def2/go.mod h1:qwXFYgsP6T7XnJtbKlf1HP8AjxZZyzxMmc+Lq5GjlU4= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/envoyproxy/go-control-plane v0.14.0/go.mod h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU= +github.com/envoyproxy/go-control-plane/envoy v1.37.0/go.mod h1:DReE9MMrmecPy+YvQOAOHNYMALuowAnbjjEMkkWOi6A= +github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod 
h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= +github.com/envoyproxy/protoc-gen-validate v1.3.3/go.mod h1:TsndJ/ngyIdQRhMcVVGDDHINPLWB7C82oDArY51KfB0= +github.com/go-jose/go-jose/v4 v4.1.4/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08= +github.com/golang/glog v1.2.5/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwmO+w= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= +github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/sahilm/fuzzy v0.1.1/go.mod h1:VFvziUEIMCrT6A6tw2RFIXPXXmzXbOsSHF0DOI8ZK9Y= +github.com/spiffe/go-spiffe/v2 v2.6.0/go.mod h1:gm2SeUoMZEtpnzPNs2Csc0D/gX33k1xIx7lEzqblHEs= +go.opentelemetry.io/contrib/detectors/gcp v1.42.0/go.mod h1:W9zQ439utxymRrXsUOzZbFX4JhLxXU4+ZnCt8GG7yA8= golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= +golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/telemetry v0.0.0-20260109210033-bd525da824e2/go.mod h1:b7fPSJ0pKZ3ccUh8gnTONJxhn3c/PS6tyzQvyqw4iA8= +golang.org/x/telemetry v0.0.0-20260409153401-be6f6cb8b1fa/go.mod h1:kHjTxDEnAu6/Nl9lDkzjWpR+bmKfxeiRuSDlsMb70gE= golang.org/x/term v0.43.0 h1:S4RLU2sB31O/NCl+zFN9Aru9A/Cq2aqKpTZJ6B+DwT4= golang.org/x/term v0.43.0/go.mod 
h1:lrhlHNdQJHO+1qVYiHfFKVuVioJIheAc3fBSMFYEIsk= -golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= -golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= From e01ee82ef0a9f142c642a73805d2fdf2ed943494 Mon Sep 17 00:00:00 2001 From: MK Date: Sun, 14 Jun 2026 00:57:15 -0400 Subject: [PATCH 2/2] fix(audit): emit post-mask content as evidence on mask decisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous behavior stamped the raw pre-mask text into fields.evidence even when the library had just masked PII out of the prompt — so an inbound SSN ended up plain-text in the audit stream even though the LLM downstream only saw the masked form. CheckInbound / CheckOutbound / CheckToolOutput now pass result.MaskedContent (the post-library-mask payload) for DecisionMask. Block / warn decisions still emit the original content because the library never produces a masked variant in those paths and the operator wants to see what was rejected. Docs and the EmitsAuditOnInboundMask test are updated to assert the raw PII MUST NOT appear in evidence on a mask decision. --- docs/security/guardrails.md | 14 ++++++++++++++ forge-cli/runtime/guardrails_engine.go | 18 +++++++++++++++--- forge-cli/runtime/guardrails_engine_test.go | 16 ++++++++++++---- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/docs/security/guardrails.md b/docs/security/guardrails.md index 576f710..73f12a4 100644 --- a/docs/security/guardrails.md +++ b/docs/security/guardrails.md @@ -570,6 +570,20 @@ The size envelope and `[REDACTED]` marker match the OTel span content-capture pipeline (issue #130) so the same string travels through both pipelines under one contract. 
+#### What evidence actually contains + +| Decision | Evidence source | +|----------|-----------------| +| `masked` | The **post-mask** content (`Result.MaskedContent`) — the same payload that was forwarded downstream (to the LLM for inbound and tool-output masks, to the caller for outbound masks). PII the library already masked stays masked in the audit stream. | +| `warned` | The original triggering content. No mask was produced (the library only generates a masked variant for `mask` decisions). The redact pass still runs. | +| `blocked` | The original triggering content. Same rationale as `warned`. | + +This means a typical PII-mask event emits the redacted version of the +prompt as evidence, not the raw text. Operators auditing for "did our +agent ever see PII?" should treat `decision=warned` and `decision=blocked` rows as the +only ones that can carry plain-text PII through the stream, and gate +their export pipeline accordingly. + ### Mode-specific behavior - **File mode** — every event flows through the Forge audit pipeline. diff --git a/forge-cli/runtime/guardrails_engine.go b/forge-cli/runtime/guardrails_engine.go index a622d50..f1c5134 100644 --- a/forge-cli/runtime/guardrails_engine.go +++ b/forge-cli/runtime/guardrails_engine.go @@ -154,11 +154,19 @@ func (e *LibraryGuardrailEngine) CheckInbound(ctx context.Context, msg *a2a.Mess e.logger.Info("inbound guardrail redaction applied", map[string]any{ "direction": "inbound", }) - e.emitGuardrailEvent(ctx, "inbound", "", text, guardrailResultMasked, result) + // Evidence carries the post-library-mask content for mask + // decisions — same payload the LLM actually saw downstream. + // Stamping the pre-mask original here would defeat the very + // mask the library produced. + e.emitGuardrailEvent(ctx, "inbound", "", result.MaskedContent, guardrailResultMasked, result) } case guardrails.DecisionBlock: desc := violationSummary(result) if e.enforce { + // Block decisions have no MaskedContent — the message was + // rejected outright.
Evidence carries the original so the + // operator can see what they would have sent; redact pass + // still runs over it on the way out. e.emitGuardrailEvent(ctx, "inbound", "", text, guardrailResultBlocked, result) return fmt.Errorf("input blocked: %s", desc) } @@ -200,7 +208,9 @@ func (e *LibraryGuardrailEngine) CheckOutbound(ctx context.Context, msg *a2a.Mes e.logger.Warn("outbound guardrail redaction applied", map[string]any{ "direction": "outbound", }) - e.emitGuardrailEvent(ctx, "outbound", "", original, guardrailResultMasked, result) + // Evidence = post-mask content (same payload the user + // actually received). See CheckInbound for rationale. + e.emitGuardrailEvent(ctx, "outbound", "", result.MaskedContent, guardrailResultMasked, result) } case guardrails.DecisionBlock: desc := violationSummary(result) @@ -250,7 +260,9 @@ func (e *LibraryGuardrailEngine) CheckToolOutput(ctx context.Context, toolName, "tool": toolName, "detail": "content redacted", }) - e.emitGuardrailEvent(ctx, "tool_output", toolName, text, guardrailResultMasked, result) + // Evidence = post-mask content; matches what the loop sends + // to the LLM. See CheckInbound for rationale. + e.emitGuardrailEvent(ctx, "tool_output", toolName, result.MaskedContent, guardrailResultMasked, result) return result.MaskedContent, nil } case guardrails.DecisionBlock: diff --git a/forge-cli/runtime/guardrails_engine_test.go b/forge-cli/runtime/guardrails_engine_test.go index 139e73d..0ee0d1b 100644 --- a/forge-cli/runtime/guardrails_engine_test.go +++ b/forge-cli/runtime/guardrails_engine_test.go @@ -125,8 +125,10 @@ func TestBuildGuardrailChecker_FileMode(t *testing.T) { // TestLibraryGuardrailEngine_EmitsAuditOnInboundMask verifies the engine // emits a guardrail_check event on the configured audit logger when an -// inbound message triggers a mask decision, and that capturing evidence -// surfaces the offending text (redacted + truncated) in fields.evidence. 
+// inbound message triggers a mask decision, and that fields.evidence +// is populated with the POST-MASK content (never the raw original) — +// the library already redacted the PII, so the audit stream sees the +// same redacted payload the LLM saw downstream. func TestLibraryGuardrailEngine_EmitsAuditOnInboundMask(t *testing.T) { sg := DefaultStructuredGuardrails() engine, err := NewFileGuardrailEngine(sg, false, &grTestLogger{}) @@ -159,8 +161,14 @@ func TestLibraryGuardrailEngine_EmitsAuditOnInboundMask(t *testing.T) { if !strings.Contains(out, `"evidence"`) { t.Errorf("expected evidence field with CaptureEvidence=true, got: %s", out) } - if !strings.Contains(out, "foo@example.com") { - t.Errorf("expected raw email in evidence, got: %s", out) + // PII never lands in evidence — the post-mask content is what we + // emit, so the raw email MUST be absent. + if strings.Contains(out, "foo@example.com") { + t.Errorf("raw email MUST NOT appear in evidence on a mask decision, got: %s", out) + } + // The in-place mask MUST also have rewritten the message Part. + if strings.Contains(msg.Parts[0].Text, "foo@example.com") { + t.Errorf("message part should have been masked in-place, got: %q", msg.Parts[0].Text) } }