Skip to content
2 changes: 1 addition & 1 deletion fern/versions/latest/pages/concepts/seed-datasets.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ Directory-backed seed datasets expose these columns:

<Note>
Filesystem matching
`file_pattern` matches file names only, not relative paths. `recursive=True` is the default, so nested subdirectories are searched unless you turn it off.
`file_pattern` matches file names only, not relative paths. `recursive=True` is the default, so nested subdirectories are searched unless you turn it off. Relative local `path` values are resolved by the active filesystem provider when the seed is validated or read, not when the config object is constructed.
</Note>

### 📄 FileContentsSeedSource
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,11 @@ If you need more control, `FileSystemSeedReader` also lets you override:
- `on_attach(...)` for per-attachment setup
- `create_filesystem_context(...)` for custom rooted filesystem behavior

Most filesystem plugins do not need either hook.
For new non-local backends, prefer passing a `FileSystemProvider` to the reader
constructor. The default `create_filesystem_context(...)` implementation calls
the provider's existence preflight and then asks it to create the rooted
filesystem context. Overriding `create_filesystem_context(...)` remains supported
for existing plugins, but that override takes ownership of any backend-specific
existence checks.

Most filesystem plugins do not need these hooks.
Original file line number Diff line number Diff line change
Expand Up @@ -100,23 +100,21 @@ class FileSystemSeedSource(SeedSource, ABC):
``FileSystemSeedReader`` implementation.

Attributes:
path: Directory containing seed artifacts. Relative paths are resolved
from the current working directory when the config is loaded, not
from the config file location.
path: Directory containing seed artifacts. Relative local paths are
resolved by the active filesystem provider when the seed is
validated or read, not when the config object is constructed.
file_pattern: Case-sensitive filename pattern used to match files under
the provided directory. Patterns match basenames only, not relative
paths. Defaults to ``'*'``.
recursive: Whether to search nested subdirectories under the provided
directory for matching files. Defaults to ``True``.
"""

_runtime_path: str | None = PrivateAttr(default=None)

path: str = Field(
...,
description=(
"Directory containing seed artifacts. Relative paths are resolved from the current working "
"directory when the config is loaded, not from the config file location."
"Directory containing seed artifacts. Relative local paths are resolved by the active filesystem "
"provider when the seed is validated or read, not when the config object is constructed."
),
)
file_pattern: str = Field(
Expand All @@ -135,22 +133,23 @@ class FileSystemSeedSource(SeedSource, ABC):
def validate_path(cls, value: str | None) -> str | None:
# Signature is str | None because AgentRolloutSeedSource overrides path to str | None
# and inherited validators fire for all subclasses.
return _validate_filesystem_seed_source_path(value)

def model_post_init(self, __context: Any) -> None:
# None guard is exercised by AgentRolloutSeedSource (path: str | None) via inheritance.
self._runtime_path = None if self.path is None else _resolve_filesystem_runtime_path(self.path)

@property
def runtime_path(self) -> str:
if self._runtime_path is None:
self._runtime_path = _resolve_filesystem_runtime_path(self.path)
return self._runtime_path
if value is None:
return None
if not value.strip():
raise InvalidFilePathError("πŸ›‘ FileSystemSeedSource.path must be a non-empty string.")
return value

@field_validator("file_pattern", mode="after")
def validate_file_pattern(cls, value: str | None) -> str | None:
    """Validate ``file_pattern`` by delegating to the shared module-level helper.

    Args:
        value: The ``file_pattern`` field value, or ``None``.

    Returns:
        Whatever ``_validate_filesystem_seed_source_file_pattern`` returns for
        ``value``.
    """
    checked_pattern = _validate_filesystem_seed_source_file_pattern(value)
    return checked_pattern

@property
def runtime_path(self) -> str:
    """Return the configured ``path`` exactly as it was supplied.

    No resolution or existence check is performed here: that work is the
    filesystem provider's job at read time, not the config object's. Handing
    back the raw value preserves relative paths and avoids assuming a local
    filesystem.
    """
    raw_path = self.path
    return raw_path


class DirectorySeedSource(FileSystemSeedSource):
seed_type: Literal["directory"] = "directory"
Expand Down Expand Up @@ -202,15 +201,6 @@ def get_pi_coding_agent_default_path() -> str:
return str(Path("~/.pi/agent/sessions").expanduser())


def _validate_filesystem_seed_source_path(value: str | None) -> str | None:
    """Check that ``value`` names an existing directory and return it unchanged.

    Args:
        value: The path to check, or ``None`` when no path was given.

    Returns:
        ``None`` when ``value`` is ``None``; otherwise the original ``value``
        string, not the expanded/resolved form used for the check.

    Raises:
        InvalidFilePathError: If the expanded, resolved path is not a directory.
    """
    if value is None:
        return None
    # ``~`` expansion and resolution are used only for the directory check and
    # the error message; the caller's raw input is what gets returned.
    path = Path(value).expanduser().resolve()
    if not path.is_dir():
        raise InvalidFilePathError(f"🛑 Path {path} is not a directory.")
    return value


def _validate_filesystem_seed_source_file_pattern(value: str | None) -> str | None:
if value is None:
return None
Expand Down Expand Up @@ -259,8 +249,8 @@ class AgentRolloutSeedSource(FileSystemSeedSource):
"Claude Code defaults to ~/.claude/projects, Codex defaults to ~/.codex/sessions, "
"Hermes Agent defaults to ~/.hermes/sessions, "
"and Pi Coding Agent defaults to ~/.pi/agent/sessions. "
"Relative paths are resolved from the current working directory when the config is loaded, "
"not from the config file location."
"Relative local paths are resolved by the active filesystem provider when the seed is "
"validated or read, not when the config object is constructed."
),
)

Expand All @@ -282,14 +272,12 @@ def validate_runtime_path_source(self) -> Self:

@property
def runtime_path(self) -> str:
if self._runtime_path is not None:
return self._runtime_path
# Path resolution and existence checks happen in the filesystem provider at read
# time. When no explicit path is given, fall back to the format's default root.
if self.path is not None:
return self.path
default_path, _ = get_agent_rollout_format_defaults(self.format)
resolved_path = self.path if self.path is not None else default_path
if resolved_path is None:
raise ValueError(f"πŸ›‘ AgentRolloutSeedSource.path is required for format {self.format.value!r}.")
self._runtime_path = _resolve_filesystem_runtime_path(resolved_path)
return self._runtime_path
return default_path

@property
def resolved_file_pattern(self) -> str:
Expand Down
74 changes: 64 additions & 10 deletions packages/data-designer-config/tests/config/test_seed_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

from pathlib import Path
from typing import Literal

import pytest

Expand All @@ -15,6 +16,7 @@
AgentRolloutSeedSource,
DirectorySeedSource,
FileContentsSeedSource,
FileSystemSeedSource,
LocalFileSeedSource,
)
from data_designer.config.seed_source_dataframe import DataFrameSeedSource
Expand Down Expand Up @@ -95,12 +97,14 @@ def test_dataframe_seed_source_serialization() -> None:
assert serialized == {"seed_type": "df"}


def test_directory_seed_source_requires_directory(tmp_path: Path) -> None:
def test_directory_seed_source_defers_directory_existence_validation(tmp_path: Path) -> None:
file_path = tmp_path / "file.txt"
file_path.write_text("alpha", encoding="utf-8")

with pytest.raises(InvalidFilePathError, match="is not a directory"):
DirectorySeedSource(path=str(file_path))
source = DirectorySeedSource(path=str(file_path))

assert source.path == str(file_path)
assert source.runtime_path == str(file_path)


def test_directory_seed_source_preserves_relative_path_input(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
Expand Down Expand Up @@ -146,7 +150,7 @@ def test_file_contents_seed_source_preserves_relative_path_input(
pytest.param(FileContentsSeedSource, {"file_pattern": "*.txt"}, id="file-contents"),
],
)
def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes(
def test_filesystem_seed_sources_preserve_raw_runtime_path_across_cwd_changes(
source_type: type[DirectorySeedSource] | type[FileContentsSeedSource],
source_kwargs: dict[str, str],
tmp_path: Path,
Expand All @@ -160,12 +164,11 @@ def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes(

monkeypatch.chdir(initial_root)
source = source_type(path="seed-dir", **source_kwargs)
expected_runtime_path = str(initial_seed_dir.resolve())

monkeypatch.chdir(later_root)

assert source.path == "seed-dir"
assert source.runtime_path == expected_runtime_path
assert source.runtime_path == "seed-dir"
assert source.model_dump(mode="json")["path"] == "seed-dir"


Expand All @@ -176,10 +179,10 @@ def test_seed_source_path_descriptions_document_cwd_resolution() -> None:

assert "current working directory" in local_path_description
assert "config file location" in local_path_description
assert "current working directory" in directory_path_description
assert "config file location" in directory_path_description
assert "current working directory" in file_contents_path_description
assert "config file location" in file_contents_path_description
assert "active filesystem provider" in directory_path_description
assert "config object is constructed" in directory_path_description
assert "active filesystem provider" in file_contents_path_description
assert "config object is constructed" in file_contents_path_description


def test_file_contents_seed_source_parses_from_dict(tmp_path: Path) -> None:
Expand Down Expand Up @@ -223,6 +226,17 @@ def test_filesystem_seed_sources_reject_path_like_file_patterns(
source_type(path=str(tmp_path), file_pattern=file_pattern)


def test_filesystem_seed_source_subclass_inherits_runtime_path(tmp_path: Path) -> None:
    """A bare ``FileSystemSeedSource`` subclass gets ``runtime_path`` from the base.

    Plugin authors subclass ``FileSystemSeedSource`` directly and readers rely on
    ``source.runtime_path``, so the base must provide it without an override.
    """

    class PluginSeedSource(FileSystemSeedSource):
        seed_type: Literal["plugin-seed-source"] = "plugin-seed-source"

    directory = str(tmp_path)
    plugin_source = PluginSeedSource(path=directory)

    assert plugin_source.runtime_path == directory


@pytest.mark.parametrize(
("rollout_format", "file_pattern", "error_message"),
[
Expand Down Expand Up @@ -267,6 +281,46 @@ def test_agent_rollout_seed_source_requires_explicit_atif_path() -> None:
AgentRolloutSeedSource(format=AgentRolloutFormat.ATIF)


def test_agent_rollout_seed_source_defers_directory_existence_validation(tmp_path: Path) -> None:
    """Constructing the source with a nonexistent directory must not raise."""
    absent_path = str(tmp_path / "does-not-exist")

    rollout_source = AgentRolloutSeedSource(format=AgentRolloutFormat.ATIF, path=absent_path)

    # The raw input is exposed untouched through both attributes.
    assert rollout_source.path == absent_path
    assert rollout_source.runtime_path == absent_path


def test_agent_rollout_seed_source_preserves_raw_runtime_path_across_cwd_changes(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A relative ``path`` stays unresolved even after the working directory changes."""
    relative_dir = "seed-dir"
    first_cwd = tmp_path / "initial"
    second_cwd = tmp_path / "later"
    (first_cwd / relative_dir).mkdir(parents=True)
    second_cwd.mkdir()

    # Build the source in one working directory, then inspect it from another.
    monkeypatch.chdir(first_cwd)
    rollout_source = AgentRolloutSeedSource(path=relative_dir, format=AgentRolloutFormat.ATIF)
    monkeypatch.chdir(second_cwd)

    assert rollout_source.path == relative_dir
    assert rollout_source.runtime_path == relative_dir
    assert rollout_source.model_dump(mode="json")["path"] == relative_dir


def test_agent_rollout_seed_source_runtime_path_falls_back_to_format_default(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
) -> None:
    """With no explicit ``path``, ``runtime_path`` is the format's default root."""
    fake_home = str(tmp_path)
    # Home-directory expansion reads HOME on POSIX but USERPROFILE on Windows
    # (Python 3.8+), so redirect both to keep the test hermetic on every platform.
    monkeypatch.setenv("HOME", fake_home)
    monkeypatch.setenv("USERPROFILE", fake_home)

    source = AgentRolloutSeedSource(format=AgentRolloutFormat.CLAUDE_CODE)

    assert source.path is None
    assert source.runtime_path == str(tmp_path / ".claude" / "projects")


def test_agent_rollout_seed_source_uses_default_atif_file_pattern(tmp_path: Path) -> None:
trace_dir = tmp_path / "atif"
trace_dir.mkdir()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from data_designer.config.errors import InvalidConfigError
from data_designer.config.sampler_params import UUIDSamplerParams
from data_designer.engine.resources.resource_provider import ResourceProvider
from data_designer.engine.resources.seed_reader import SeedReader
from data_designer.engine.resources.seed_reader import SeedReader, SeedReaderConfigError
from data_designer.engine.validation import ViolationLevel, rich_print_violations, validate_data_designer_config

logger = logging.getLogger(__name__)
Expand All @@ -31,7 +31,10 @@ def _resolve_and_add_seed_columns(config: DataDesignerConfig, seed_reader: SeedR
if not seed_reader:
return

seed_col_names = seed_reader.get_column_names()
try:
seed_col_names = seed_reader.get_column_names()
except SeedReaderConfigError as error:
raise InvalidConfigError(str(error)) from error
existing_columns = {column.name for column in config.columns}
colliding_columns = {name for name in seed_col_names if name in existing_columns}
if colliding_columns:
Expand Down
Loading
Loading