Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
f24d8cc
feat(data-retention): granular PII redaction stages (input + block ou…
TheodoreSpeaks Jun 29, 2026
36f2a3d
fix(data-retention): propagate block-output redaction into child work…
TheodoreSpeaks Jun 30, 2026
bb3a84b
fix(data-retention): close block-output redaction gaps on streaming +…
TheodoreSpeaks Jun 30, 2026
0b81fed
fix(data-retention): drain+mask streamed output, resolve PII policy u…
TheodoreSpeaks Jun 30, 2026
2d75987
test(testing): support leftJoin().where().limit() in shared db mock
TheodoreSpeaks Jun 30, 2026
eb6b25a
fix(data-retention): mask agent/Pi memory writes under block-output r…
TheodoreSpeaks Jun 30, 2026
324b04c
Merge remote-tracking branch 'origin/staging' into feat/pii-granular-…
TheodoreSpeaks Jun 30, 2026
d55b557
fix(data-retention): guard partial PII stages in GET normalize
TheodoreSpeaks Jun 30, 2026
83ffe4d
fix(data-retention): mask seeded memory messages under block-output r…
TheodoreSpeaks Jun 30, 2026
a911af8
fix(guardrails): fail closed on misaligned Presidio batch responses
TheodoreSpeaks Jun 30, 2026
31f2e3f
fix(data-retention): enabled stage with no entity types redacts all (…
TheodoreSpeaks Jun 30, 2026
437d2bb
fix(data-retention): reject enabled stage with no entity types; empty…
TheodoreSpeaks Jun 30, 2026
78b2c56
docs(data-retention): note resume remask covers inline values only
TheodoreSpeaks Jun 30, 2026
8f86d77
fix(data-retention): scrub offloaded large-value refs from logs when …
TheodoreSpeaks Jun 30, 2026
6e9587a
fix(data-retention): hydrate, mask, and re-store large-value refs in …
TheodoreSpeaks Jun 30, 2026
f0c71cc
fix(data-retention): always apply logs policy to large-value refs whe…
TheodoreSpeaks Jun 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 90 additions & 24 deletions apps/pii/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@
from typing import Any

from fastapi import FastAPI
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerResult
from presidio_analyzer import (
AnalyzerEngine,
BatchAnalyzerEngine,
Pattern,
PatternRecognizer,
RecognizerResult,
)
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.predefined_recognizers import (
AuAbnRecognizer,
Expand Down Expand Up @@ -133,6 +139,7 @@ def build_analyzer() -> AnalyzerEngine:


# Shared engine instances used by the request handlers in this file.
analyzer = build_analyzer()
# Batch front-end over the same analyzer engine; used by /analyze_batch.
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
# Applies masking operators to already-detected spans (see run_anonymize).
anonymizer = AnonymizerEngine()

# Propagates to uvicorn's root handler, so timing lands in the container log stream.
Expand All @@ -149,13 +156,65 @@ class AnalyzeRequest(BaseModel):
return_decision_process: bool = False


# Request body for POST /analyze_batch.
class AnalyzeBatchRequest(BaseModel):
    # Texts to analyze; the handler returns one result list per entry, in order.
    texts: list[str]
    # Language code forwarded to the batch analyzer.
    language: str = "en"
    # Entity types to look for; None or an empty list is forwarded as None.
    entities: list[str] | None = None
    # Forwarded unchanged to the batch analyzer as score_threshold.
    score_threshold: float | None = None


# Request body for POST /anonymize.
class AnonymizeRequest(BaseModel):
    # Text to mask.
    text: str
    # Detected spans as plain dicts; run_anonymize reads "entity_type",
    # "start" and "end" from each, plus an optional "score" (default 1.0).
    analyzer_results: list[dict[str, Any]] = []
    # Per-entity operator configs. `anonymizers` is used when it is non-empty,
    # otherwise `operators` is used; both carry the same shape.
    anonymizers: dict[str, dict[str, Any]] | None = None
    operators: dict[str, dict[str, Any]] | None = None


# One text plus its detected spans inside an /anonymize_batch request.
class AnonymizeBatchItem(BaseModel):
    # Text to mask.
    text: str
    # Detected spans as plain dicts; run_anonymize reads "entity_type",
    # "start" and "end" from each, plus an optional "score" (default 1.0).
    analyzer_results: list[dict[str, Any]] = []


# Request body for POST /anonymize_batch.
class AnonymizeBatchRequest(BaseModel):
    # Items to mask; the handler returns one masked text per item, in order.
    items: list[AnonymizeBatchItem] = []
    # Operator configs applied to every item. `anonymizers` is used when it is
    # non-empty, otherwise `operators` is used; both carry the same shape.
    anonymizers: dict[str, dict[str, Any]] | None = None
    operators: dict[str, dict[str, Any]] | None = None


def build_operators(
    raw_operators: dict[str, dict[str, Any]] | None,
) -> dict[str, OperatorConfig] | None:
    """Convert raw per-entity operator dicts into OperatorConfig objects.

    Each raw config may carry a "type" key naming the operator; when it is
    absent the operator name defaults to "replace". Every other key is passed
    through as the operator's parameters. The caller's dicts are not modified.

    Returns None when ``raw_operators`` is None or empty.
    """

    def to_config(raw_cfg: dict[str, Any]) -> OperatorConfig:
        # Split the operator name from its parameters without touching raw_cfg.
        params = {key: value for key, value in raw_cfg.items() if key != "type"}
        return OperatorConfig(raw_cfg.get("type", "replace"), params)

    if raw_operators:
        return {
            entity: to_config(raw_cfg)
            for entity, raw_cfg in raw_operators.items()
        }
    return None


def run_anonymize(
    text: str,
    raw_results: list[dict[str, Any]],
    operators: dict[str, OperatorConfig] | None,
):
    """Mask ``text`` using spans supplied as plain dicts.

    Each entry of ``raw_results`` must provide "entity_type", "start" and
    "end"; "score" is optional and defaults to 1.0. The dicts are converted
    to RecognizerResult objects and handed to the shared anonymizer together
    with ``operators``.

    Returns whatever ``anonymizer.anonymize`` returns.
    Raises KeyError if a span dict lacks a required key.
    """
    spans = []
    for raw in raw_results:
        spans.append(
            RecognizerResult(
                entity_type=raw["entity_type"],
                start=raw["start"],
                end=raw["end"],
                score=raw.get("score", 1.0),
            )
        )
    return anonymizer.anonymize(
        text=text,
        analyzer_results=spans,
        operators=operators,
    )


@app.get("/health")
def health() -> dict[str, str]:
    # Health endpoint: always answers with a constant "ok" status payload.
    return dict(status="ok")
Expand Down Expand Up @@ -186,35 +245,28 @@ def analyze(req: AnalyzeRequest) -> list[dict[str, Any]]:
return [r.to_dict() for r in results]


@app.post("/analyze_batch")
def analyze_batch(req: AnalyzeBatchRequest) -> list[list[dict[str, Any]]]:
    """Analyze many texts in one pass (spaCy nlp.pipe), returning one span list
    per input in request order — the batched counterpart to /analyze."""
    # An empty entity list is forwarded as None, the same as omitting it.
    wanted_entities = req.entities if req.entities else None
    per_text_spans = batch_analyzer.analyze_iterator(
        texts=req.texts,
        language=req.language,
        entities=wanted_entities,
        score_threshold=req.score_threshold,
    )
    # Serialize each span to a plain dict, keeping the per-text grouping.
    payload = []
    for spans in per_text_spans:
        payload.append([span.to_dict() for span in spans])
    return payload


@app.post("/anonymize")
def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
started = time.perf_counter()
analyzer_results = [
RecognizerResult(
entity_type=r["entity_type"],
start=r["start"],
end=r["end"],
score=r.get("score", 1.0),
)
for r in req.analyzer_results
]
raw_operators = req.anonymizers or req.operators
operators = None
if raw_operators:
operators = {}
for entity, raw_cfg in raw_operators.items():
op_cfg = dict(raw_cfg)
op_type = op_cfg.pop("type", "replace")
operators[entity] = OperatorConfig(op_type, op_cfg)
result = anonymizer.anonymize(
text=req.text,
analyzer_results=analyzer_results,
operators=operators,
)
operators = build_operators(req.anonymizers or req.operators)
result = run_anonymize(req.text, req.analyzer_results, operators)
logger.info(
"anonymize chars=%d spans=%d duration_ms=%.1f",
len(req.text),
len(analyzer_results),
len(req.analyzer_results),
(time.perf_counter() - started) * 1000,
)
return {
Expand All @@ -230,3 +282,17 @@ def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
for item in result.items
],
}


@app.post("/anonymize_batch")
def anonymize_batch(req: AnonymizeBatchRequest) -> dict[str, list[str]]:
    """Mask many texts in one pass, returning masked text per item in request
    order — the batched counterpart to /anonymize. Anonymization is pure string
    work (no NLP), so callers should send only items with detected spans."""
    # Operator configs are built once and reused for every item in the batch.
    shared_operators = build_operators(req.anonymizers or req.operators)
    masked_texts = []
    for item in req.items:
        outcome = run_anonymize(item.text, item.analyzer_results, shared_operators)
        masked_texts.append(outcome.text)
    return {"texts": masked_texts}
16 changes: 16 additions & 0 deletions apps/sim/app/api/organizations/[id]/data-retention/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,22 @@ function normalizeConfigured(
rules: settings.piiRedaction.rules.map((rule) => ({
...rule,
language: coercePiiLanguage(rule.language),
stages: rule.stages
? {
input: {
...rule.stages.input,
language: coercePiiLanguage(rule.stages.input?.language),
},
blockOutputs: {
...rule.stages.blockOutputs,
language: coercePiiLanguage(rule.stages.blockOutputs?.language),
},
logs: {
...rule.stages.logs,
language: coercePiiLanguage(rule.stages.logs?.language),
},
Comment thread
cursor[bot] marked this conversation as resolved.
}
: undefined,
})),
}
: null,
Expand Down
Loading
Loading