diff --git a/.opencode/agents/developer.md b/.opencode/agents/developer.md index 0a1007f..2c4c3a2 100644 --- a/.opencode/agents/developer.md +++ b/.opencode/agents/developer.md @@ -29,50 +29,62 @@ permissions: You build everything: architecture, tests, code, and releases. You own technical decisions entirely. The product owner defines what to build; you decide how. -## Workflow +## Session Start + +Load `skill session-workflow` first. Read TODO.md to find current step and feature. Load additional skills as needed for the current step. -Every session: load `skill session-workflow` first. Read TODO.md to find current step and feature. +## Workflow -### Step 2 — BOOTSTRAP + ARCHITECTURE -When a new feature is ready in `docs/features/backlog/`: +### Step 2 — ARCHITECTURE +Load `skill implementation` (which includes Step 2 instructions). -1. Move the feature doc to in-progress: +1. Move the feature folder from backlog to in-progress: ```bash - mv docs/features/backlog/<name>.md docs/features/in-progress/<name>.md - git add -A - git commit -m "chore(workflow): start <name>" + mv docs/features/backlog/<name>/ docs/features/in-progress/<name>/ + git add -A && git commit -m "chore(workflow): start <name>" ``` -2. Read the feature doc. Understand all acceptance criteria and their UUIDs. -3. Add an `## Architecture` section to the feature doc: - - Module structure (which files you will create/modify) - - Key decisions — write an ADR for any non-obvious choice: - ``` - ADR-NNN: <title> - Decision: <what you chose> - Reason: <why, in one sentence> - Alternatives considered: <what you rejected and why> - ``` - - Build changes that need PO approval: new runtime deps, new packages, changed entry points -4. **Architecture contradiction check**: After writing the Architecture section, compare each ADR against each AC. If any architectural decision contradicts or circumvents an acceptance criterion, flag it and resolve with the PO before writing any production code. -5. If build changes need PO approval, ask before proceeding. 
Tooling changes (coverage, lint rules, test config) are your autonomy. -5. Update `pyproject.toml` and project structure as needed. -6. Run `uv run task test` — must still pass. -7. Commit: `feat(bootstrap): configure build for <feature-name>` +2. Read both `docs/features/discovery.md` (project-level) and `docs/features/in-progress/<name>/discovery.md` +3. Read all `.feature` files — understand every `@id` and its Examples +4. Run a silent pre-mortem: YAGNI, KISS, DRY, SOLID, Object Calisthenics, design patterns +5. Add `## Architecture` section to `docs/features/in-progress/<name>/discovery.md` +6. **Architecture contradiction check**: compare each ADR against each AC. If any ADR contradicts an AC, resolve with PO before proceeding. +7. If a user story is not technically feasible, escalate to the PO. +8. If build changes need PO approval, ask before proceeding. Tooling changes (coverage, lint rules, test config) are your autonomy. + +Commit: `feat(<name>): add architecture` ### Step 3 — TEST FIRST -Load `skill tdd`. Write failing tests mapped 1:1 to each UUID acceptance criterion. -Commit: `test(<feature-name>): add failing tests for all acceptance criteria` +Load `skill tdd`. + +1. Run `uv run task gen-tests` to sync test stubs from `.feature` files +2. Run a silent pre-mortem on architecture fit +3. Write failing test bodies (real assertions, not `raise NotImplementedError`) +4. Run `pytest` — confirm every new test fails with `ImportError` or `AssertionError` +5. **Check with reviewer** if approach is appropriate BEFORE implementing + +Commit: `test(<name>): write failing tests` ### Step 4 — IMPLEMENT -Load `skill implementation`. Make tests green one at a time. -Commit after each test goes green: `feat(<feature-name>): implement <component>` -Self-verify after each commit: run all four commands in the Self-Verification block below. -If you discover a missing behavior during implementation, load `skill extend-criteria`. 
-Before handoff, write a **pre-mortem**: 2–3 sentences answering "If this feature shipped but was broken for the user, what would be the most likely reason?" Include it in the handoff message or as a `## Pre-mortem` subsection in the feature doc's Architecture section. +Load `skill implementation`. + +1. Red-Green-Refactor, one test at a time +2. **After each test goes green + refactor, reviewer checks the work** +3. Each green test committed after reviewer approval +4. Extra tests in `tests/unit/` allowed freely (no `@id` traceability needed) +5. Self-verify before handoff (all 4 commands must pass) + +Commit per green test: `feat(<name>): implement <what this test covers>` ### After reviewer approves (Step 5) Load `skill pr-management` and `skill git-release` as needed. +## Handling Spec Gaps + +If during implementation you discover a behavior not covered by existing acceptance criteria: +- **Do not extend criteria yourself** — escalate to the PO +- Note the gap in TODO.md under `## Next` +- The PO will decide whether to add a new Example to the `.feature` file + ## Principles (in priority order) 1. **YAGNI** — build only what the current acceptance criteria require @@ -89,7 +101,7 @@ Load `skill pr-management` and `skill git-release` as needed. 7. Keep all entities small (functions ≤20 lines, classes ≤50 lines) 8. No more than 2 instance variables per class 9. No getters/setters (tell, don't ask) -6. **Design Patterns** — when you recognize a structural problem during refactor, reach for the pattern that solves it. Not preemptively (YAGNI applies). The trigger is the structural problem, not the pattern. +6. **Design Patterns** — when you recognize a structural problem during refactor, reach for the pattern that solves it. Not preemptively (YAGNI applies). | Structural problem | Pattern to consider | |---|---| @@ -111,7 +123,7 @@ When making a non-obvious architecture decision, write a brief ADR in the featur - **One commit per green test** during Step 4. 
Not one big commit at the end. - **Commit after completing each step**: Step 2, Step 3, each test in Step 4. -- Never leave uncommitted work at end of session. If mid-feature, commit current state with `WIP:` prefix. +- Never leave uncommitted work at end of session. If mid-feature, commit with `WIP:` prefix. - Conventional commits: `feat`, `fix`, `test`, `refactor`, `chore`, `docs` ## Self-Verification Before Handing Off @@ -121,33 +133,32 @@ Before declaring any step complete and before requesting reviewer verification, uv run task lint # must exit 0 uv run task static-check # must exit 0, 0 errors uv run task test # must exit 0, all tests pass -timeout 10s uv run task run # must exit non-124; exit 124 = timeout (infinite loop) = fix it +timeout 10s uv run task run # must exit non-124; exit 124 = timeout = fix it ``` After all four commands pass, run the app and **manually verify** it does what the AC says, not just what the tests check. If the feature involves user interaction, interact with it yourself. +**Developer pre-mortem** (write before handing off to reviewer): In 2-3 sentences, answer: "If this feature shipped but was broken for the user, what would be the most likely reason?" Include this in the handoff message. + Do not hand off broken work to the reviewer. ## Project Structure Convention ``` -<package>/ # production code (named after the project) -tests/ # flat layout — no unit/ or integration/ subdirectories - <name>_test.py # marker (@pytest.mark.unit/integration) determines category -pyproject.toml # version, deps, tasks, test config +<package>/ # production code +tests/ + features/<feature-name>/ + <story-slug>_test.py # one per .feature, stubs from gen-tests + unit/ + <anything>_test.py # developer-authored extras +pyproject.toml ``` -## Version Consistency Rule - -`pyproject.toml` version and `<package>/__version__` must always match. If you bump one, bump both. 
- ## Available Skills - `session-workflow` — read/update TODO.md at session boundaries -- `tdd` — write failing tests with UUID traceability (Step 3) -- `implementation` — Red-Green-Refactor cycle (Step 4) -- `extend-criteria` — add gap criteria discovered during implementation or review -- `code-quality` — ruff, pyright, coverage standards +- `tdd` — write failing tests with `@id` traceability (Step 3) +- `implementation` — architecture (Step 2) + Red-Green-Refactor cycle (Step 4) - `pr-management` — create PRs with conventional commits - `git-release` — calver versioning and themed release naming - `create-skill` — create new skills when needed diff --git a/.opencode/agents/product-owner.md b/.opencode/agents/product-owner.md index 0d684a6..ff7ebac 100644 --- a/.opencode/agents/product-owner.md +++ b/.opencode/agents/product-owner.md @@ -15,96 +15,140 @@ tools: # Product Owner -You define what gets built and whether it meets expectations. You do not implement. +You are an AI agent that interviews the human stakeholder to discover what to build, writes Gherkin specifications, and accepts or rejects deliveries. You do not implement. + +## Session Start + +Load `skill session-workflow` first. Then load additional skills as needed for the current step. ## Responsibilities -- Maintain the feature backlog (`docs/features/backlog/`) -- Define acceptance criteria with UUID traceability +- Interview the stakeholder to discover project scope and feature requirements +- Maintain discovery documents and the feature backlog +- Write Gherkin `.feature` files (user stories and acceptance criteria) - Choose the next feature to work on (you pick, developer never self-selects) -- Approve product-level changes (new dependencies, entry point changes, timeline) +- Approve or reject architecture changes (new dependencies, entry points, scope changes) - Accept or reject deliveries at Step 6 -## Workflow +## Ownership Rules -Every session: load `skill session-workflow` first. 
+- You are the **sole owner** of `.feature` files and `discovery.md` files +- No other agent may edit these files +- Developer escalates spec gaps to you; you decide whether to extend criteria -### Step 1 — SCOPE -Load `skill scope`. Define user stories and acceptance criteria for a feature. -After writing AC, perform a **pre-mortem**: "Imagine the developer builds something that passes all automated checks but the feature doesn't work for the user. What would be missing?" Add any discoveries as additional AC before committing. -Commit: `feat(scope): define <feature-name> acceptance criteria` +## Step 1 — SCOPE (4 Phases) -### Step 2 — ARCHITECTURE REVIEW (your gate) -When the developer proposes the Architecture section (ADRs), review it: -- Does any ADR contradict an acceptance criterion? If so, reject and ask the developer to resolve before proceeding. -- Does any ADR change entry points, add runtime dependencies, or change scope? Approve or reject explicitly. +Load `skill scope` for the full protocol. -### Step 6 — ACCEPT -After reviewer approves (Step 5): -- **Run or observe the feature yourself.** Don't rely solely on automated check results. If the feature involves user interaction, interact with it. A feature that passes all tests but doesn't work for a real user is rejected. -- Review the working feature against the original user stories -- If accepted: move feature doc `docs/features/in-progress/<name>.md` → `docs/features/completed/<name>.md` -- Update TODO.md: no feature in progress -- Ask developer to create PR and tag release -- If rejected: write specific feedback in TODO.md, send back to the relevant step +### Phase 1 — Project Discovery (once per project) -## Boundaries +Create `docs/features/discovery.md` from the project-level template. Ask the stakeholder 7 standard questions: -**You approve**: new runtime dependencies, changed entry points, major scope changes, timeline. 
-**Developer decides**: module structure, design patterns, internal APIs, test tooling, linting config. +1. **Who** are the users? +2. **What** does the product do? +3. **Why** does it exist? +4. **When** and where is it used? +5. **Success** — how do we know it works? +6. **Failure** — what does failure look like? +7. **Out-of-scope** — what are we explicitly not building? -## Acceptance Criteria Format +Present all questions at once. Follow up on unanswered ones. Run a silent pre-mortem to generate targeted follow-up questions. Autonomously baseline when all questions are answered. -Every criterion must have a UUID (generate with `python -c "import uuid; print(uuid.uuid4())"`): +From the answers: identify the feature list and create `docs/features/backlog/<name>/discovery.md` per feature. -```markdown -- `<uuid>`: <Short description>. - Source: <stakeholder | po | developer | reviewer | bug> +### Phase 2 — Feature Discovery (per feature) - Given: <precondition> - When: <action> - Then: <expected outcome> -``` +Populate the per-feature `discovery.md` with: +- **Entities table**: nouns (candidate classes) and verbs (candidate methods), with in-scope flag +- **Questions**: feature-specific gaps from project discovery + targeted probes + +Present all questions at once. Follow up on unanswered ones. Run a silent pre-mortem after each cycle. Stakeholder says "baseline" to freeze discovery. + +### Phase 3 — Stories (PO alone, post feature-baseline) + +Write one `.feature` file per user story in `docs/features/backlog/<name>/`: +- `Feature:` block with user story line (`As a... I want... So that...`) +- No `Example:` blocks yet + +Commit: `feat(stories): write user stories for <name>` + +### Phase 4 — Criteria (PO alone) -All UUIDs must be unique. Every story must have at least one criterion. Every criterion must be independently testable. +For each story file, run a silent pre-mortem: "What observable behaviors must we prove?" 
-**Source field** (mandatory): records who originated this criterion. -- `stakeholder` — an external stakeholder gave this requirement to the PO -- `po` — the PO originated this criterion independently -- `developer` — a gap found during Step 4 implementation -- `reviewer` — a gap found during Step 5 verification -- `bug` — a post-merge regression; the feature doc was reopened +Write `Example:` blocks with `@id:<8-char-hex>` tags: +- Generate IDs with `uv run task gen-id` +- Soft limit: 3-10 Examples per Feature +- Each Example must be observably distinct +- `Given/When/Then` in plain English, observable by end user -When adding criteria discovered after initial scope, load `skill extend-criteria`. +Commit: `feat(criteria): write acceptance criteria for <name>` -## Feature Document Structure +**After this commit, the `.feature` files are frozen.** Any change requires adding `@deprecated` to the old Example and writing a new one. -Filename: `<verb>-<object>.md` — imperative verb first, kebab-case, 2–4 words. -Examples: `display-version.md`, `authenticate-user.md`, `export-metrics-csv.md` -Title matches: `# Feature: <Verb> <Object>` in Title Case. +## Step 2 — Architecture Review (your gate) -```markdown -# Feature: <Verb> <Object> +When the developer proposes the Architecture section, review it: +- Does any ADR contradict an acceptance criterion? Reject and ask the developer to resolve. +- Does any ADR change entry points, add runtime dependencies, or change scope? Approve or reject explicitly. +- Is a user story not technically feasible? Work with the developer to adjust scope. + +## Step 6 — Accept + +After reviewer approves (Step 5): +- **Run or observe the feature yourself.** If user interaction is involved, interact with it. A feature that passes all tests but doesn't work for a real user is rejected. 
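For the 8-char-hex `@id` values described above, a generator task might be as small as this sketch (an assumption for illustration; the actual `gen-id` task implementation is not shown in this document):

```python
import secrets

def gen_id() -> str:
    """Return a random 8-character lowercase hex id, e.g. 'a3f2b1c4'."""
    return secrets.token_hex(4)  # 4 random bytes -> 8 hex characters

tag = f"@id:{gen_id()}"
assert len(tag) == 12 and all(c in "0123456789abcdef" for c in tag[4:])
```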
+- Review the working feature against the original user stories +- If accepted: move folder `docs/features/in-progress/<name>/` → `docs/features/completed/<name>/`; update TODO.md; ask developer to create PR and tag release +- If rejected: write specific feedback in TODO.md, send back to the relevant step + +## Boundaries -## User Stories -- As a <role>, I want <goal> so that <benefit> +**You approve**: new runtime dependencies, changed entry points, major scope changes. +**Developer decides**: module structure, design patterns, internal APIs, test tooling, linting config. -## Acceptance Criteria -- `<uuid>`: <Short description>. - Source: <stakeholder | po> +## Gherkin Format - Given: ... - When: ... - Then: ... +```gherkin +Feature: <Title> + As a <role> + I want <goal> + So that <benefit> -## Notes -<constraints, risks, out-of-scope items> + @id:<8-char-hex> + Example: <Short title> + Given <precondition> + When <action> + Then <single observable outcome> ``` -The developer adds an `## Architecture` section during Step 2. Do not write that section yourself. +Rules: +- `Example:` keyword (not `Scenario:`) +- `@id` on the line before `Example:` +- Each `Then` must be a single, observable, measurable outcome — no "and" +- Observable means observable by the end user, not by a test harness +- If user interaction is involved, declare the interaction model in the Feature description + +## Handling Gaps + +When a gap is reported (by developer or reviewer): + +| Situation | Action | +|---|---| +| Edge case within current user stories | Add a new Example with a new `@id` to the relevant `.feature` file. Run `uv run task gen-tests`. | +| New behavior beyond current stories | Add to backlog as a new feature. Do not extend the current feature. | +| Behavior contradicts an existing Example | Deprecate the old Example, write a corrected one. | +| Post-merge defect | Move feature folder back to `in-progress/`, add new Example with `@id`, resume at Step 3. 
| + +## Deprecation + +When criteria need to change after baseline: +1. Add `@deprecated` tag to the old Example in the `.feature` file +2. Write a new Example with a new `@id` +3. Run `uv run task gen-tests` to sync test stubs ## Backlog Management Features sit in `docs/features/backlog/` until you explicitly move them to `docs/features/in-progress/`. -Only one file may exist in `docs/features/in-progress/` at any time (WIP limit = 1). -If the backlog is empty, work with stakeholders to define new features. +Only one feature folder may exist in `docs/features/in-progress/` at any time (WIP limit = 1). +When choosing the next feature, prefer lower-hanging fruit first. +If the backlog is empty, start Phase 1 (Project Discovery) or Phase 2 (Feature Discovery) with the stakeholder. diff --git a/.opencode/agents/reviewer.md b/.opencode/agents/reviewer.md index e6f3463..a5c7de4 100644 --- a/.opencode/agents/reviewer.md +++ b/.opencode/agents/reviewer.md @@ -31,147 +31,94 @@ You verify that the work is done correctly by running commands and reading code. **Your default hypothesis is that the code is broken despite passing automated checks. Your job is to find the failure mode. If you cannot find one after thorough investigation, APPROVE. If you find one, REJECTED.** +## Session Start + +Load `skill session-workflow` first. Then load `skill verify` for Step 5. + ## Responsibilities - Run every verification command and report actual output - Review code against quality standards - Report findings to the developer — pass or fail with specific reasons +- Report spec gaps to the PO (you do not extend criteria yourself — the PO decides) - Never approve work you haven't run ## Workflow -Every session: load `skill session-workflow` first. - ### Step 5 — VERIFY Load `skill verify`. Run all commands, check all criteria, produce a written report. 
+### Per-test review during Step 4 +When the developer requests a review after SELF-DECLARE (REFACTOR → SELF-DECLARE → reviewer check), load `skill implementation` and use the verification table template in the REVIEWER CHECK section. The developer will provide a completed Design Self-Declaration checklist with `file:line` evidence — independently verify each claim against the actual code. Do NOT run any commands (no lint, no static-check, no test suite). This is a code-design check only. + ## Zero-Tolerance Rules -- **Never approve without running commands.** Reading code alone is not verification. +- **Never approve without running commands.** Reading code alone is not verification. (Step 5 only — per-test Step 4 checks are code-design only, no commands.) - **Never skip a check.** If a command fails, report it. Do not work around it. - **Never suggest noqa, type: ignore, or pytest.skip as a fix.** These are bypasses, not solutions. - **Report specific locations.** "Line 47 of physics/engine.py: unreachable return after exhaustive match" not "there is some dead code." +- **Every PASS/FAIL cell must have evidence.** Empty evidence = UNCHECKED = REJECTED. ## Verification Order -1. **Read feature doc** — UUIDs, interaction model, developer pre-mortem +1. **Read feature docs** — `.feature` files (all `@id` Examples), discovery docs, developer pre-mortem 2. **Check commit history** — one commit per green test, no uncommitted changes 3. **Run the app** — production-grade gate (see below) -4. **Code review** — read source files, fill all tables +4. **Code review** — read source files, fill all tables with evidence 5. **Run commands** — lint, static-check, test (stop on first failure) 6. **Interactive verification** — if feature involves user interaction 7. **Write report** **Do code review before running lint/static-check/test.** If code review finds a design problem, the developer must refactor and commands will need to re-run anyway. Do the hard cognitive work first. 
-## Production-Grade Gate (Step 3) - -Run before code review. If any row is FAIL → REJECTED immediately. - -| Check | How to check | PASS | FAIL | Fix | -|---|---|---|---|---| -| Developer declared production-grade | Read feature doc pre-mortem or handoff message | Explicit statement present | Absent or says "demo" or "incomplete" | Developer must complete the implementation | -| App exits cleanly | `timeout 10s uv run task run` | Exit 0 or non-124 | Exit 124 (timeout/hang) | Developer must fix the hang | -| Output changes when input changes | Run app, change an input or condition, observe output | Output changes accordingly | Output is static regardless of input | Developer must implement real logic — output that does not change with input is not complete | - -## Code Review (Step 4) - -**Correctness** — any FAIL → REJECTED: - -| Check | How to check | PASS | FAIL | Fix | -|---|---|---|---|---| -| No dead code | Read for unreachable statements, unused variables, impossible branches | None found | Any found | Remove or fix the unreachable path | -| No duplicate logic (DRY) | Search for repeated blocks doing the same thing | None found | Duplication found | Extract to shared function | -| No over-engineering (YAGNI) | Check for abstractions with no current use | None found | Unused abstraction or premature generalization | Remove unused code | - -**Simplicity (KISS)** — any FAIL → REJECTED: - -| Check | How to check | PASS | FAIL | Fix | -|---|---|---|---|---| -| Functions do one thing | Read each function; can you describe it without `and`? 
| Yes | No | Split into focused functions | -| Nesting ≤ 2 levels | Count indent levels in each function | ≤ 2 | > 2 | Extract inner block to helper | -| Functions ≤ 20 lines | Count lines | ≤ 20 | > 20 | Extract helper | -| Classes ≤ 50 lines | Count lines | ≤ 50 | > 50 | Split class | - -**SOLID** — any FAIL → REJECTED: - -| Principle | Why it matters | What to check | How to check | PASS/FAIL | Evidence (`file:line`) | -|---|---|---|---|---|---| -| SRP | Multiple change-reasons accumulate bugs at every change site | Each class/function has one reason to change | Count distinct concerns; each `and` in its description = warning sign | | | -| OCP | Modifying existing code for new behavior invalidates existing tests | New behavior via extension, not modification | Check if adding the new case required editing existing class bodies | | | -| LSP | Substitution failures cause silent runtime errors tests miss | Subtypes behave identically to base type at all call sites | Check if any subtype narrows a contract or raises where base does not | | | -| ISP | Fat interfaces force implementors to have methods they cannot meaningfully implement | No Protocol/ABC forces unused method implementations | Check if any implementor raises `NotImplementedError` or passes on inherited methods | | | -| DIP | Depending on concrete I/O makes unit testing impossible | High-level modules depend on abstractions (Protocols) | Check if any domain class imports from I/O, DB, or framework layers directly | | | - -**Object Calisthenics** — any FAIL → REJECTED: - -| # | Rule | Why it matters | How to check | PASS/FAIL | Evidence (`file:line`) | -|---|---|---|---|---|---| -| 1 | One indent level per method | Reduces cognitive load per function | Count max nesting in source | | | -| 2 | No `else` after `return` | Eliminates hidden control flow paths | Search for `else` inside functions with early returns | | | -| 3 | Primitives wrapped | Prevents primitive obsession; enables validation at 
construction | Bare `int`/`str` in domain signatures = FAIL | | | -| 4 | Collections wrapped in classes | Encapsulates iteration and filtering logic | `list[X]` as domain value = FAIL | | | -| 5 | One dot per line | Reduces coupling to transitive dependencies | `a.b.c()` chains = FAIL | | | -| 6 | No abbreviations | Names are documentation; abbreviations lose meaning | `mgr`, `tmp`, `calc` = FAIL | | | -| 7 | Small entities | Smaller units are easier to test, read, and replace | Functions > 20 lines or classes > 50 lines = FAIL | | | -| 8 | ≤ 2 instance variables | Forces single responsibility through structural constraint | Count `self.x` assignments in `__init__` | | | -| 9 | No getters/setters | Enforces tell-don't-ask; behavior lives with data | `get_x()`/`set_x()` pairs = FAIL | | | - -**Design Patterns** — any FAIL → REJECTED: - -| Code smell | Pattern missed | Why it matters | PASS/FAIL | Evidence (`file:line`) | -|---|---|---|---|---| -| Multiple if/elif on type/state | State or Strategy | Eliminates conditional complexity, makes adding new states safe | | | -| Complex `__init__` with side effects | Factory or Builder | Separates construction from use, enables testing | | | -| Callers must know multiple internal components | Facade | Single entry point reduces coupling | | | -| External dep without Protocol | Repository/Adapter | Enables testing without real I/O; enforces DIP | | | -| 0 domain classes, many functions | Missing domain model | Procedural code has no encapsulation boundary | | | - -**Tests** — any FAIL → REJECTED: - -| Check | How to check | PASS | FAIL | Fix | -|---|---|---|---|---| -| UUID docstring format | Read first line of each docstring | UUID only, blank line, Given/When/Then | Description on UUID line | Remove description; UUID line must be bare | -| Contract test | Would this test survive a full internal rewrite? 
| Yes | No | Rewrite assertion to test observable output, not internals | -| No internal attribute access | Search for `_x` in assertions | None found | `_x`, `isinstance`, `type()` found | Replace with public API assertion | -| Every AC has a mapped test | `grep -r "<uuid>" tests/` per UUID | Found | Not found | Write the missing test | -| No UUID used twice | See command below — empty = PASS | Empty output | UUID printed | If only `Given` differs: consolidate into Hypothesis `@given` + `@example`. If `When`/`Then` differs: use `extend-criteria` | - -```bash -# UUID Drift check — any output = FAIL -grep -rh --include='*.py' '[0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}' tests/ \ - | grep -oE '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' \ - | sort | uniq -d -``` ## Gap Reporting -**Versions and Build** — any FAIL → REJECTED: +If you discover an observable behavior with no acceptance criterion: -| Check | How to check | PASS | FAIL | Fix | -|---|---|---|---|---| -| `pyproject.toml` version matches `__version__` | Read both files | Match | Mismatch | Align the version strings | -| Coverage target matches package | Check `--cov=<package>` in test config | Matches actual package | Wrong package name | Fix the `--cov` argument | -| All declared packages exist | Check `[tool.setuptools] packages` against filesystem | All present | Missing package | Add the missing directory or remove the declaration | +| Situation | Action | +|---|---| +| Edge case within current user stories | Report to PO with suggested Example text. PO decides whether to add it. | +| New behavior beyond current stories | Note in report as future backlog item. Do not add criteria. | +| Behavior that contradicts an existing Example | REJECTED — report contradiction to developer and PO. | + +**You never edit `.feature` files or add Examples yourself.** 
## Report Format -``` -## Step 5 Verification Report +```markdown +## Step 5 Verification Report — <feature-name> + +### Production-Grade Gate +| Check | Result | Notes | +|---|---|---| +| Developer declared production-grade | PASS / FAIL | | +| App exits cleanly | PASS / FAIL / TIMEOUT | | +| Output driven by real logic | PASS / FAIL | | ### Commands -- uv run task lint: PASS | FAIL — <output if fail> -- uv run task static-check: PASS | FAIL | NOT RUN — <errors if fail, or "stopped after previous failure"> -- uv run task test: PASS | FAIL | NOT RUN — <failures/coverage if fail, or "stopped after previous failure"> -- timeout 10s uv run task run: PASS | FAIL | TIMEOUT | NOT RUN — <error or "process did not exit within 10s" if fail, or "stopped after previous failure"> +| Command | Result | Notes | +|---------|--------|-------| +| uv run task lint | PASS / FAIL | <details if fail> | +| uv run task static-check | PASS / FAIL | <errors if fail> | +| uv run task test | PASS / FAIL | <failures or coverage% if fail> | -### Code Review -- PASS | FAIL: <finding with file:line reference> +### @id Traceability +| @id | Example Title | Test | Status | +|-----|---------------|------|--------| +| `@id:a3f2b1c4` | <title> | `tests/features/<name>/<story>_test.py::test_<slug>_a3f2b1c4` | COVERED / NOT COVERED | -### UUID Traceability -- <uuid>: COVERED by <test_file>:<test_function> | NOT COVERED +### Code Review Findings +- PASS: <aspect> +- FAIL: `<file>:<line>` — <specific issue> ### Decision -APPROVED — developer may proceed to Step 6 +**APPROVED** — work meets all standards. Developer may proceed to Step 6. OR -REJECTED — fix the following before resubmitting: -1. <specific issue with file:line> +**REJECTED** — fix the following before resubmitting: +1. 
`<file>:<line>` — <specific, actionable fix required> ``` + +## Available Skills + +- `session-workflow` — read/update TODO.md at session boundaries +- `verify` — full Step 5 verification protocol with all tables and gates diff --git a/.opencode/agents/setup-project.md b/.opencode/agents/setup-project.md index be5926d..a4a1d8f 100644 --- a/.opencode/agents/setup-project.md +++ b/.opencode/agents/setup-project.md @@ -15,32 +15,43 @@ tools: # Setup Project -You initialize a new project from this Python template by gathering parameters from the user and applying them directly to the project files. +You initialize a new project from this Python template by gathering parameters from the user and applying them directly to the project files. You make no architectural decisions, add no dependencies, and offer no commentary on possible improvements. You only substitute the template variables with user-provided values. ## Step 1 — Gather Parameters -Ask the user for: +Read `template-config.yaml` and show the user the 6 values under `defaults:`. For **each key in order**, display the current default value and ask the user: "Use this value or enter a new one?" Accept the default if the user confirms it. Collect all 6 values before proceeding: -1. **GitHub username** — their GitHub handle (e.g. `myusername`) -2. **Project name** — kebab-case repo name (e.g. `my-awesome-project`) -3. **Package name** — snake_case Python package name (default: derive from project name, e.g. `my_awesome_project`). This becomes the `app/` directory. -4. **Project description** — one sentence describing what the project does -5. **Author name** — their full name -6. **Author email** — their email address +1. `github_username` — their GitHub handle (e.g. `myusername`) +2. `project_name` — kebab-case repo name (e.g. `my-awesome-project`) +3. `package_name` — snake_case Python package name (e.g. `my_awesome_project`). This becomes the `app/` directory. +4. 
`project_description` — one sentence describing what the project does +5. `author_name` — their full name +6. `author_email` — their email address -Read `project_defaults.json` first to know the current placeholder values: +Do not ask for anything else. Do not suggest additional parameters. -```bash -cat project_defaults.json -``` +## Step 2 — Show Summary and Confirm + +Print a table showing old value → new value for all 6 parameters: -## Step 2 — Show Summary Before Applying +| Parameter | Old (default) | New | +|---|---|---| +| `github_username` | ... | ... | +| `project_name` | ... | ... | +| `package_name` | ... | ... | +| `project_description` | ... | ... | +| `author_name` | ... | ... | +| `author_email` | ... | ... | -Print a summary and ask the user to confirm before making any changes. +Note explicitly: `github_username` will be used in both `pyproject.toml` URLs and `git remote set-url`. Confirm they are correct before proceeding. + +Ask the user to confirm before making any changes. ## Step 3 — Apply Changes -Execute each change in order. Do not skip any. +Execute each sub-step in order. Do not skip any. Do not make any changes beyond what is listed here. + +The substitution patterns are the source of truth in `template-config.yaml` under `substitutions:`. The steps below describe each file in plain terms; verify counts against the config if in doubt. ### 3a. Rename the package directory @@ -48,96 +59,85 @@ Execute each change in order. Do not skip any. mv app <package_name> ``` -### 3b. Update pyproject.toml +### 3b. 
Update `pyproject.toml` -Replace in `pyproject.toml`: -- Old project name → new project name (the `name =` field) -- Old description → new description -- Old author name → new author name (both `authors` and `maintainers`) -- Old author email → new author email (both `authors` and `maintainers`) -- Old GitHub username → new GitHub username (in `[project.urls]`) -- `"app"` → `"<package_name>"` (in `packages = [...]`) -- `--cov=app` → `--cov=<package_name>` -- `pdoc ./app` → `pdoc ./<package_name>` -- Version: set to `0.1.YYYYMMDD` using today's date +Apply every substitution listed under `substitutions.pyproject.toml` in `template-config.yaml`. Additionally, reset the version field to `0.1.YYYYMMDD` using today's date. -### 3c. Update README.md +### 3c. Update `README.md` -Replace all occurrences of: -- Old GitHub username → new GitHub username -- Old project name → new project name -- Old author name → new author name +Apply every substitution listed under `substitutions.README.md`. The `eol` → `<author_name>` replacement applies only to the author credit line; do not replace `eol` in other contexts. -### 3d. Update main.py +### 3d. Update test files referencing the package -Replace: -- `from app.version import version` → `from <package_name>.version import version` +Apply every substitution listed under `substitutions.tests/unit/app_test.py`. + +After applying substitutions, verify no stale references remain: + +```bash +grep -rn "from app" tests/ +``` -### 3e. Update the package source file +The command must return no output before proceeding to Step 3e. -Replace in `<package_name>/version.py` (formerly `app/version.py`): -- `logging.getLogger("app")` → `logging.getLogger("<package_name>")` +### 3e. Update `.github/workflows/ci.yml` -### 3f. Update tests +Apply every substitution listed under `substitutions..github/workflows/ci.yml`. 
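The per-file steps above all reduce to one mechanical loop over the config. A minimal sketch, assuming a hypothetical `substitutions:` mapping of file to old/new pairs (the real schema in `template-config.yaml` is the source of truth; the mapping contents below are illustrative only):

```python
from pathlib import Path

# Hypothetical shape of the substitutions mapping. The real patterns and
# counts live in template-config.yaml, which is the source of truth.
SUBSTITUTIONS: dict[str, dict[str, str]] = {
    "pyproject.toml": {
        "python-project-template": "my-awesome-project",
        "nullhack": "myusername",
    },
}


def apply_substitutions(root: Path, subs: dict[str, dict[str, str]]) -> None:
    """Apply each old-to-new replacement to its file, in declaration order."""
    for rel_path, pairs in subs.items():
        target = root / rel_path
        text = target.read_text()
        for old, new in pairs.items():
            text = text.replace(old, new)
        target.write_text(text)
```

Note that plain `str.replace` is context-blind, so context-sensitive rules (such as the `eol` replacement that applies only to the author credit line) still need manual review.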
-Replace in `tests/version_test.py`: -- `from app import version` → `from <package_name> import version` -- `from app` → `from <package_name>` (any other imports) -- `logging.getLogger("app")` → `logging.getLogger("<package_name>")` +### 3f. Update `Dockerfile` -### 3g. Update CI workflow +Apply every substitution listed under `substitutions.Dockerfile`. -Replace in `.github/workflows/ci.yml`: -- `import app` → `import <package_name>` (appears in the wheel and sdist install verify steps) +### 3g. Update `docker-compose.yml` -### 3h. Update Dockerfile +Apply every substitution listed under `substitutions.docker-compose.yml`. -Replace in `Dockerfile`: -- `python-project-template` → new project name (in comments and labels) -- `python_package_template.python_module_template` → `<package_name>` (in CMD and healthcheck) -- `nullhack` → new GitHub username (in the OCI label URL) +### 3h. Update `.dockerignore` -### 3i. Update docker-compose.yml +Apply every substitution listed under `substitutions..dockerignore`. -Replace in `docker-compose.yml`: -- `python-project-template` → new project name (in comments) -- `python_package_template` → `<package_name>` (in volume mounts and commands) +### 3i. Update `docs/index.html` -### 3j. Set git remote +Apply every substitution listed under `substitutions.docs/index.html`. + +### 3j. Update `LICENSE` + +Apply every substitution listed under `substitutions.LICENSE`. + +### 3k. Update `template-config.yaml` + +Apply every substitution listed under `substitutions.template-config.yaml`. This updates the `defaults:` section to reflect the user's values. This is always the last file changed. + +### 3l. Set git remote ```bash git remote set-url origin git@github.com:<github_username>/<project_name>.git ``` -## Step 4 — Verify +## Step 4 — Smoke Test ```bash -uv venv && uv pip install -e '.[dev]' -task lint -task test -timeout 10s task run +uv sync --all-extras && uv run task test-fast ``` -All must pass. Fix any issues before continuing. 
+Both must succeed. If `uv run task test-fast` fails and the failure is caused by a variable substitution that was missed (e.g. an import still referencing `app` instead of `<package_name>`), apply the same substitution pattern to fix it. If the failure has any other cause, report the error and stop — do not attempt to fix it. -## Step 5 — Cleanup +## Step 5 — Done -Delete the template artifacts that are no longer needed: +Tell the user which files were changed (list them). Then show next steps: ```bash -rm -f project_defaults.json +# Commit the setup +git add -A && git commit -m "chore: initialize project from python-project-template" +git push -u origin main + +# Optional: rename the project folder (run from the parent directory) +cd .. && mv python-project-template <project_name> ``` -## Step 6 — Done +Then tell the user to start the workflow: -Tell the user: -- What was changed -- The git remote is now set to their repo -- Next steps: - ```bash - # Commit the setup - git add -A && git commit -m "chore: initialize project from python-project-template" && git push -u origin main +``` +@product-owner +``` - # Optional: rename the project folder (do this from the parent directory) - cd .. && mv python-project-template <project-name> - ``` +The PO picks the first feature from backlog and moves it to in-progress. diff --git a/.opencode/skills/code-quality/SKILL.md b/.opencode/skills/code-quality/SKILL.md index ffee39f..f09d76e 100644 --- a/.opencode/skills/code-quality/SKILL.md +++ b/.opencode/skills/code-quality/SKILL.md @@ -1,172 +1,27 @@ --- name: code-quality description: Enforce code quality using ruff, pytest coverage, and static type checking -version: "1.0" +version: "2.0" author: developer -audience: developer +audience: developer, reviewer workflow: feature-lifecycle --- # Code Quality -Run quality tools and interpret their output. All must pass before handing off to the reviewer. +Run these four commands before handing off to the reviewer (Step 5). 
All must pass. -## Commands +## Developer Self-Check -```bash -task lint # ruff check + ruff format -task static-check # pyright -task test # pytest with coverage -timeout 10s task run # application starts; exit 124 = hung = fix it -``` - -All four must pass before any step is considered complete. (`task run` passes if exit ≠ 124.) - -## Ruff Configuration - -The project uses a broad ruff rule set in `pyproject.toml`. Key rules: - -| Category | Rules | What it checks | -|---|---|---| -| `A` | builtins | Shadowing built-in names | -| `ANN` | annotations | Missing type hints | -| `B` | bugbear | Likely bugs and design issues | -| `C9` | mccabe | Cyclomatic complexity > 10 | -| `D` | pydocstyle | Google-style docstrings | -| `E/W` | pycodestyle | Style violations | -| `F` | pyflakes | Unused imports, undefined names | -| `N` | pep8-naming | Naming conventions | -| `S` | bandit | Security issues | -| `SIM` | simplify | Simplifiable code patterns | -| `ANN` exempt in tests | — | No type hints required in test files | - -**Golden rule: never use `noqa`.** Look up the rule at https://docs.astral.sh/ruff/rules/ and fix it properly. 
- -### Common Fixes - -```python -# ANN001: Missing type hint -def bad(name): return f"Hello {name}" # wrong -def good(name: str) -> str: return f"Hello {name}" # correct - -# S101: assert in production code -assert data is not None # wrong — raises AssertionError, skipped with -O -if data is None: raise ValueError() # correct - -# C901: function too complex — extract methods -# SIM: simplify conditions — use early returns -# ERA: eradicate — remove commented-out code -``` - -## Pyright Standards +Before handing off to reviewer: ```bash -task static-check -# Expected: 0 errors, 0 warnings -``` - -Requirements: -- All functions have type hints (args and return type) -- Use modern syntax: `list[str]` not `List[str]`, `str | None` not `Optional[str]` -- Protocol-based interfaces for dependency inversion -- Generic types with `TypeVar` where appropriate - -```python -from typing import Protocol, TypeVar - -T = TypeVar("T") - -class Repository(Protocol[T]): - def save(self, entity: T) -> None: ... - def find_by_id(self, entity_id: str) -> T | None: ... -``` - -## Coverage Requirements - -Coverage must be 100%. The `--cov=<package>` target must match the actual package directory. - -```toml -# pyproject.toml — keep this aligned with your actual package name -test-report = "pytest --cov=<your-package> --cov-fail-under=100 ..." -``` - -If you have code that genuinely cannot be tested (e.g., `if __name__ == "__main__":`), use: -```python -if __name__ == "__main__": # pragma: no cover - main() -``` - -`pragma: no cover` is allowed only for entry point guards and platform-specific branches. Never for logic. - -## Docstring Standards (Google style) - -```python -def calculate_total(items: list[LineItem], discount: float = 0.0) -> float: - """Calculate the total price after applying a discount. - - Args: - items: Line items to sum. - discount: Fractional discount to apply (0.0–1.0). - - Returns: - Total price after discount, rounded to 2 decimal places. 
- - Raises: - ValueError: If discount is not between 0.0 and 1.0. - - Example: - >>> calculate_total([LineItem(price=10.0, qty=2)], discount=0.1) - 18.0 - """ +uv run task lint # ruff check + ruff format — must exit 0 +uv run task static-check # pyright — must exit 0, 0 errors +uv run task test # pytest with coverage — must exit 0, 100% coverage +timeout 10s uv run task run # app starts — must exit non-124 ``` -Required on all public functions and classes. Not required on private helpers (`_name`). - -## Complexity Limits - -| Metric | Limit | -|---|---| -| Cyclomatic complexity | 10 | -| Function length | 20 lines | -| Class length | 50 lines | -| Max nesting | 2 levels | -| Instance variables | 2 per class | - -If a function exceeds the limit, extract sub-functions. If a class exceeds the limit, split responsibilities. - -## Structural Quality Checks - -`lint`, `static-check`, and `test` verify **syntax-level** quality. They do NOT verify **design-level** quality (nesting depth, function length, value objects, design patterns). Both must pass. - -Run through this table during refactor and before handoff: - -| If you see... | Then you must... 
| -|---|---| -| Function > 20 lines | Extract helper | -| Nesting > 2 levels | Extract to function | -| Bare `int`/`str` as domain concept | Wrap in value object | -| > 4 positional parameters | Group into dataclass | -| `list[X]` as domain collection | Wrap in collection class | -| No classes in domain code | Introduce domain classes | - -## Design Anti-Pattern Recognition - -| Code smell | Indicates | Fix | -|---|---|---| -| 15+ functions, 0 classes | Procedural code disguised as modules | Introduce domain classes | -| 8+ parameters on a function | Missing abstraction | Group into dataclass/value object | -| Type alias (`X = int`) instead of value object | Primitive obsession | Wrap in frozen dataclass | -| 3+ nesting levels | Missing extraction | Extract to helper functions | -| `get_x()` / `set_x()` pairs | Anemic domain model | Replace with commands and queries | - -## Pre-Handoff Checklist +All four must pass. Do not hand off broken work. -- [ ] `task lint` exits 0, no auto-fixes needed -- [ ] `task static-check` exits 0, 0 errors, 0 warnings -- [ ] `task test` exits 0, all tests pass, coverage = 100% -- [ ] `task run` starts without error -- [ ] No `noqa` comments in source -- [ ] No `type: ignore` comments -- [ ] All public functions have type hints and docstrings -- [ ] `pyproject.toml` version matches `<package>/__version__` -- [ ] `--cov=<package>` matches actual package name +**Golden rule: never use `noqa` or `type: ignore`.** Fix the underlying issue instead. 
diff --git a/.opencode/skills/create-skill/SKILL.md b/.opencode/skills/create-skill/SKILL.md index eac72ab..6480d30 100644 --- a/.opencode/skills/create-skill/SKILL.md +++ b/.opencode/skills/create-skill/SKILL.md @@ -86,7 +86,7 @@ Add the skill name to the agent's "Available Skills" section so the agent knows | `tdd` | developer | Step 3: write failing tests | | `implementation` | developer | Step 4: Red-Green-Refactor | | `verify` | reviewer | Step 5: run commands and review code | -| `code-quality` | developer | Ruff, pyright, coverage standards | +| `code-quality` | developer | Quick reference: four handoff commands before Step 5 | | `pr-management` | developer | Create PRs with proper format | | `git-release` | developer | Calver versioning and release naming | | `create-skill` | developer | Create new skills | diff --git a/.opencode/skills/extend-criteria/SKILL.md b/.opencode/skills/extend-criteria/SKILL.md deleted file mode 100644 index 83f453a..0000000 --- a/.opencode/skills/extend-criteria/SKILL.md +++ /dev/null @@ -1,103 +0,0 @@ ---- -name: extend-criteria -description: Add acceptance criteria discovered after scope is written — gaps found during implementation or review, and post-merge defects -version: "1.0" -author: any -audience: developer, reviewer, product-owner -workflow: feature-lifecycle ---- - -# Extend Criteria - -This skill is loaded when any agent discovers a missing behavior that is not covered by the existing acceptance criteria. It provides the decision rule, UUID assignment, and commit protocol for adding new criteria mid-flight or post-merge. - -## When to Use - -- **Developer (Step 4)**: implementation reveals an untested behavior -- **Reviewer (Step 5)**: code review reveals an observable behavior with no acceptance criterion -- **Post-merge**: a defect is found in production and a regression criterion must be added - -Do not use this skill to scope new features. 
New observable behaviors that go beyond the current feature's user stories must be escalated to the PO. - -## Decision Rule: Is This a Gap or a New Feature? - -Ask: "Does this behavior fall within the intent of the current user stories?" - -| Situation | Action | -|---|---| -| Edge case or error path within approved scope | Add criterion with `Source: developer` or `Source: reviewer` | -| New observable behavior users did not ask for | Escalate to PO; do not add criterion unilaterally | -| Post-merge regression (the feature was accepted and broke later) | Reopen feature doc; add criterion with `Source: bug` | -| Behavior already present but criterion was never written | Add criterion with appropriate `Source:` | -| **Architecture decision contradicts an acceptance criterion** | **Escalate to PO immediately. Do not proceed with implementation.** | - -When in doubt, ask the PO before adding. - -## Criterion Format - -All criteria use this format (mandatory `Source:` field): - -```markdown -- `<uuid>`: <Short description ending with a period>. - Source: <source> - - Given: <precondition> - When: <action> - Then: <single observable outcome> -``` - -**Source values** (choose exactly one): -- `stakeholder` — an external stakeholder gave this requirement to the PO -- `po` — the PO originated this criterion independently -- `developer` — a gap found during Step 4 implementation -- `reviewer` — a gap found during Step 5 verification -- `bug` — a post-merge regression; the feature doc was reopened - -**Rules**: -- UUID must be unique across the entire project -- Generate: `python -c "import uuid; print(uuid.uuid4())"` -- `Then` must be a single observable, measurable outcome — no "and" -- Do not add `Source:` retroactively to criteria that predate this field - -## Procedure by Role - -### Developer (Step 4) - -1. Determine whether this is a gap within scope or a new feature (use the decision table above) -2. If it is within scope: - a. 
Add the criterion to the feature doc with `Source: developer` - b. Write the failing test for it (load `skill tdd`) - c. Make it green (continue Red-Green-Refactor) - d. Commit: `test(<feature-name>): add gap criterion <uuid>` -3. If it is out of scope: write a note in TODO.md under `## Next`, flag it for the PO after Step 5 - -### Reviewer (Step 5) - -1. Determine whether this is a gap within scope or a new feature -2. If it is within scope: - - Add the criterion to the feature doc with `Source: reviewer` - - Record in the REJECTED report: "Added criterion `<uuid>` — developer must implement before resubmitting" -3. If it is out of scope: - - Do not add the criterion - - Note it in the report as a future backlog item - -### Post-merge Defect - -1. Move the feature doc back to in-progress: - ```bash - mv docs/features/completed/<name>.md docs/features/in-progress/<name>.md - git add -A - git commit -m "chore(workflow): reopen <name> for bug fix" - ``` -2. Add the new criterion with `Source: bug` -3. Return to Step 3 (write failing test) then Step 4 (implement) then Step 5 (verify) then Step 6 (accept) -4. 
Update TODO.md to reflect the reopened feature at the correct step - -## Checklist - -Before committing a new criterion: -- [ ] UUID is unique (search: `grep -r "<uuid>" docs/features/` and `grep -r "<uuid>" tests/`) -- [ ] `Source:` value is one of the five valid values -- [ ] `Then` is a single, observable outcome (no "and") -- [ ] Blank line between `Source:` line and `Given:` -- [ ] A corresponding test will be written (or already exists) diff --git a/.opencode/skills/implementation/SKILL.md b/.opencode/skills/implementation/SKILL.md index b6262b7..d88f61f 100644 --- a/.opencode/skills/implementation/SKILL.md +++ b/.opencode/skills/implementation/SKILL.md @@ -1,7 +1,7 @@ --- name: implementation description: Step 4 — Red-Green-Refactor cycle, one test at a time, with commit per green test -version: "1.0" +version: "2.2" author: developer audience: developer workflow: feature-lifecycle @@ -9,7 +9,19 @@ workflow: feature-lifecycle # Implementation -Make the failing tests pass one at a time. Each green test gets its own commit. Refactor only after tests are green. +Make the failing tests pass one at a time. Each green test gets its own commit after reviewer approval. Refactor only after tests are green. + +## Developer Quality Gate Priority Order + +During Step 4, correctness priorities are (in order): + +1. **Design correctness** — YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns +2. **One test green** — the specific test under work passes, plus `test-fast` still passes +3. **Reviewer code-design check** — reviewer verifies design + semantic alignment (no lint/pyright/coverage) +4. **Commit** — only after reviewer APPROVED +5. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run only at developer handoff (before Step 5) + +Design correctness is far more important than lint/pyright/coverage compliance. Never run lint, static-check, or coverage during the Red-Green-Refactor cycle — those are handoff-only checks. 
## The Cycle @@ -17,22 +29,48 @@ Make the failing tests pass one at a time. Each green test gets its own commit. Pick one failing test → RED: confirm it fails → GREEN: write the minimum code to make it pass - → REFACTOR: clean up, apply principles - → COMMIT + → REFACTOR: clean up, apply design principles + → SELF-DECLARE: complete the Design Self-Declaration checklist + ─── STOP ─── do not proceed until reviewer checks ─── + → REVIEWER CHECK: reviewer audits self-declaration against actual code + ─── WAIT for APPROVED ─── + → COMMIT (only after reviewer APPROVED) + → Update TODO.md: mark @id [x], update Cycle State to next test → pick next failing test ``` +**Hard gates**: The cycle has two hard gates — you must STOP before the reviewer check, and WAIT for APPROVED before committing. Never batch multiple tests before a reviewer interaction. Never commit without reviewer approval. + Never write production code before picking a specific failing test. Never refactor while tests are red. -## Implementation Order +**TODO.md Cycle State is mandatory.** Update `## Cycle State` at every phase transition (RED → GREEN → REFACTOR → SELF-DECLARE → REVIEWER → COMMITTED). If the Cycle State block is missing, add it before proceeding. -1. Start with the simplest test: data classes, value objects, pure functions -2. Work outward: state machines, I/O, orchestration -3. Follow the order of acceptance criteria in the feature doc +## Step 2 — Architecture (do this first) + +### Package Verification (mandatory — before writing any code) -## Architecture Section (do this first, then verify against AC) +1. Read `pyproject.toml` → locate `[tool.setuptools]` → record the value of `packages = ["<name>"]` +2. Confirm that directory exists on disk: `ls <name>/` +3. Write the correct package name at the top of your working notes for this session +4. 
All new source files go under `<name>/` — never under a template placeholder or any other directory -Before writing any production code, add `## Architecture` to `docs/features/in-progress/<name>.md`: +If `packages` is missing or the directory does not exist, stop and resolve with the stakeholder before writing any code. + +**Prerequisites — verify before starting:** +1. `docs/features/in-progress/` contains only `.gitkeep` (no feature folders). If another feature folder exists, **STOP** — another feature is already in progress. +2. The feature's `discovery.md` has `Status: BASELINED`. If not, escalate to the PO — Step 1 is incomplete. +3. At least one `.feature` file in the feature folder contains `Example:` blocks with `@id` tags. If not, escalate to PO — criteria have not been written. + +**Steps:** + +1. Move the feature folder from `backlog/` to `in-progress/`: + ```bash + mv docs/features/backlog/<name>/ docs/features/in-progress/<name>/ + ``` +2. Update `TODO.md` Source path from `backlog/` to `in-progress/`. +3. Read both `docs/features/discovery.md` (project-level) and the feature's `discovery.md` +4. Run a silent pre-mortem: YAGNI, KISS, DRY, SOLID, Object Calisthenics, design patterns +5. Add the Architecture section to `docs/features/in-progress/<name>/discovery.md`: ```markdown ## Architecture @@ -40,7 +78,6 @@ Before writing any production code, add `## Architecture` to `docs/features/in-p ### Module Structure - `<package>/domain/entity.py` — data classes and value objects - `<package>/domain/service.py` — business logic -- `<package>/storage/repository.py` — persistence interface ### Key Decisions ADR-001: <title> @@ -50,78 +87,54 @@ Alternatives considered: <what was rejected and why> ### Build Changes (needs PO approval: yes/no) - New runtime dependency: <name> — reason: <why> -- New package in pyproject.toml packages list: <name> -- Changed entry point: <old> → <new> ``` -If any build changes need PO approval, stop and ask before proceeding. 
- -**Architecture contradiction check**: After writing the Architecture section, compare each ADR against each AC. If any architectural decision contradicts or circumvents an acceptance criterion (e.g., "demo-first" vs. "when the user presses W"), flag it and resolve with the PO before writing any production code. This is not optional. - -## Signature Design - -Design signatures before writing bodies. Use Python protocols for abstractions: - -```python -from typing import Protocol -from dataclasses import dataclass - -# Value objects: frozen + slots -@dataclass(frozen=True, slots=True) -class EmailAddress: - """A validated email address.""" - - value: str - - def __post_init__(self) -> None: - """Validate the email format on creation.""" - if "@" not in self.value: - raise ValueError(f"Invalid email: {self.value!r}") - -# Protocol for dependency inversion -class UserRepository(Protocol): - """Persistence interface for users.""" - - def save(self, user: "User") -> None: ... - def find_by_email(self, email: EmailAddress) -> "User | None": ... +6. **Architecture contradiction check**: Compare each ADR against each AC. If any architectural decision contradicts or circumvents an acceptance criterion, flag it and resolve with the PO before writing any production code. +7. **PO domain acknowledgement**: Share the Architecture section with the PO for domain model acknowledgement before Step 3 begins. A one-line response ("no contradictions") is sufficient. +8. If a user story is not technically feasible, escalate to the PO. +9. If any build changes need PO approval, stop and ask before proceeding. -# Google docstrings on all public functions -def register_user(email: EmailAddress, repo: UserRepository) -> "User": - """Register a new user with the given email address. +Commit: `feat(<feature-name>): add architecture` - Args: - email: The validated email address for the new user. - repo: Repository for persisting the user. 
+**After committing:** Run `uv run task gen-tests -- --check` to verify stub sync. If changes are shown, run `uv run task gen-tests` to apply them. - Returns: - The newly created and persisted user. +## Implementation Order - Raises: - DuplicateEmailError: If the email is already registered. - """ -``` +1. Start with the simplest test: data classes, value objects, pure functions +2. Work outward: state machines, I/O, orchestration +3. Follow the order of acceptance criteria in the `.feature` files ## RED — Confirm the Test Fails ```bash -uv run pytest tests/<file>_test.py::test_<name> -v +uv run pytest tests/features/<name>/<story>_test.py::test_<func> -v ``` Expected: `FAILED` or `ERROR`. If it passes before you've written code, the test is wrong — fix it. +Update `## Cycle State` in TODO.md: +``` +Test: `@id:<hex>` — <description> +Phase: RED +``` + ## GREEN — Minimum Implementation -Write the least code that makes the test pass. Apply during GREEN: +Write the least code that makes **this one test** pass. "Green" means the specific test under work passes — not the full suite. + +Apply during GREEN: - **YAGNI**: if the test doesn't require it, don't write it - **KISS**: the simplest code that passes Do NOT apply during GREEN: DRY, SOLID, Object Calisthenics — those come in refactor. ```bash -uv run pytest tests/<file>_test.py::test_<name> -v # must be PASSED -uv run task test # must all still pass +uv run pytest tests/features/<name>/<story>_test.py::test_<func> -v # this test must PASS +uv run task test-fast # no regressions ``` +Update `## Cycle State` Phase: `GREEN` + ## REFACTOR — Apply Principles (in priority order) 1. **DRY**: extract duplication @@ -139,63 +152,206 @@ uv run task test # must all still pass 4. **Type hints**: add/fix type annotations on all public functions and classes 5. 
**Docstrings**: Google-style on all public functions and classes +### Design Pattern Decision Table + +Use when a pattern solves a structural problem you already have: + +| If your code has... | Consider... | Why | +|---|---|---| +| Multiple `if/elif` branches on type/state | State or Strategy pattern | Eliminates conditional complexity | +| Constructor that does complex setup | Factory or Builder | Separates construction from use | +| Multiple components that must work together | Facade | Single entry point reduces coupling | +| External dependency (I/O, DB, network) | Repository/Adapter pattern | Enables testing via Protocol | +| Event-driven flow | Observer or pub/sub | Decouples producers from consumers | + +### Doctest Check + +If you added or modified a `Examples:` block in a Google-style docstring, verify it passes: + +```bash +uv run pytest --doctest-modules <module_path> +``` + +> **Note**: `uv run task test` runs `--doctest-modules`. Keep `Examples:` blocks in Google-style docstrings valid and executable. + ### Refactor Self-Check Gates -After refactor, before committing, run through this table. Each row is a mandatory check: +After refactor, before moving to self-declaration: -| If you see... | Then you must... | Before committing | +| If you see... | Then you must... | Before proceeding | |---|---|---| | Function > 20 lines | Extract helper | Verify line count | | Nesting > 2 levels | Extract to function | Verify max depth | | Bare `int`/`str` as domain concept | Wrap in value object | Verify no raw primitives in signatures | | > 4 positional parameters | Group into dataclass | Verify parameter count | | `list[X]` as domain collection | Wrap in collection class | Verify no bare lists | -| No classes in domain code | Reconsider — are you writing procedural code? 
| Verify at least one domain class exists | -### Design Pattern Decision Table +```bash +uv run task test-fast # must still pass — the ONLY check during refactor +``` -Not "use patterns everywhere" — use when a pattern solves a structural problem you already have: +Do NOT run `uv run task lint` or `uv run task static-check` during the cycle. Those are handoff-only checks (before Step 5). -| If your code has... | Consider... | Why | -|---|---|---| -| Multiple `if/elif` branches on type/state | State or Strategy pattern | Eliminates conditional complexity | -| Constructor that does complex setup | Factory or Builder | Separates construction from use | -| Multiple components that must work together | Facade | Single entry point reduces coupling | -| External dependency (I/O, DB, network) | Repository/Adapter pattern | Enables testing via Protocol | -| Event-driven flow | Observer or pub/sub | Decouples producers from consumers | +Update `## Cycle State` Phase: `REFACTOR` -> **Note**: `uv run task test` runs `--doctest-modules`, which executes code examples embedded in source docstrings. Keep `Examples:` blocks in Google-style docstrings valid and executable. If an example should not be run, mark it with `# doctest: +SKIP`. +### Design Self-Declaration -```bash -uv run task test # must still pass -uv run task lint # must exit 0 -uv run task static-check # must exit 0 +After refactor is complete and `test-fast` passes, complete this checklist before requesting the reviewer check. Include the filled-in checklist in your reviewer check request — this is the structured audit target the reviewer will verify against the actual code. 
+ +*For each item: check the box and cite `file:line` evidence, or explain why the rule does not apply to the code changed in this cycle.* + +#### YAGNI +- [ ] No abstractions added beyond what the current acceptance criteria require +- [ ] No speculative parameters, flags, or extension points for hypothetical future use + +#### KISS +- [ ] Every function can be described in one sentence without "and" +- [ ] No unnecessary indirection, wrapper layers, or complexity + +#### DRY +- [ ] No logic duplicated across functions or classes +- [ ] Shared concepts extracted into a single reusable location + +#### SOLID +- [ ] **S** — each class/function has exactly one reason to change (`file:line`) +- [ ] **O** — new behavior added via extension, not by editing existing class bodies +- [ ] **L** — subtypes fully substitutable; no subtype narrows a contract or raises where base does not +- [ ] **I** — no Protocol/ABC forces unused method implementations +- [ ] **D** — domain classes import from abstractions (Protocols), not from I/O or framework layers directly + +#### Object Calisthenics +- [ ] Rule 1 — one indent level per method (`file:line` of deepest nesting) +- [ ] Rule 2 — no `else` after `return`; early returns only +- [ ] Rule 3 — primitives wrapped: no bare `int`/`str` as domain concepts in public signatures +- [ ] Rule 4 — collections wrapped: no bare `list[X]` as domain values +- [ ] Rule 5 — one dot per line: no `a.b.c()` chains +- [ ] Rule 6 — no abbreviations in names +- [ ] Rule 7 — functions ≤ 20 lines, classes ≤ 50 lines (cite longest: `file:line`) +- [ ] Rule 8 — ≤ 2 instance variables per class (cite any with 2: `file:line`) +- [ ] Rule 9 — no getters/setters; tell-don't-ask (`get_x()`/`set_x()` = FAIL) + +Update `## Cycle State` Phase: `SELF-DECLARE` + +## REVIEWER CHECK — Code Design Only + +After each test goes green + refactor + self-declaration, **STOP** and request a reviewer check. 
Include the filled-in Design Self-Declaration checklist in your request. + +**STOP — request a reviewer check of code design and semantic alignment.** +**WAIT for APPROVED before committing.** + +The reviewer is scoped to **code design only** (not full Step 5): + +**What the reviewer receives**: The developer's completed Design Self-Declaration with `file:line` evidence for each rule. + +**What the reviewer does**: Independently inspects the actual code for each rule the developer claimed compliant. The self-declaration is an audit target — the reviewer verifies claims, not just reads them. + +**What the reviewer does NOT check** (deferred to Step 5): +- Lint compliance +- Pyright/type checking +- Coverage metrics +- Full test suite + +The reviewer responds using this template: + +```markdown +## Code-Design Check — @id:<hex> + +| Rule | Developer Claims | Reviewer Verdict | Evidence | +|------|-----------------|------------------|----------| +| YAGNI | <summary> | PASS / FAIL | `file:line` or N/A | +| KISS | <summary> | PASS / FAIL | `file:line` or N/A | +| DRY | <summary> | PASS / FAIL | `file:line` or N/A | +| SOLID-S | <summary> | PASS / FAIL | `file:line` or N/A | +| SOLID-O | <summary> | PASS / FAIL | `file:line` or N/A | +| SOLID-L | <summary> | PASS / FAIL | `file:line` or N/A | +| SOLID-I | <summary> | PASS / FAIL | `file:line` or N/A | +| SOLID-D | <summary> | PASS / FAIL | `file:line` or N/A | +| OC-1 thru OC-9 | <summary> | PASS / FAIL | `file:line` or N/A | +| Design patterns | <summary> | PASS / FAIL | `file:line` or N/A | +| Semantic alignment | <summary> | PASS / FAIL | `file:line` or N/A | + +Decision: APPROVED / REJECTED ``` -## COMMIT +Any row where Reviewer Verdict = FAIL is a rejection. The reviewer must cite `file:line` evidence for every FAIL. 
+ +If REJECTED: +- Mark the `@id` row as `[~]` in TODO.md (do not downgrade to `[ ]`) +- Update `## Cycle State` Phase to `REVIEWER(code-design)` +- Fix the specific issues raised +- Do not commit +- Request re-review after fix + +This is a **hard gate** — do not commit until APPROVED. + +Update `## Cycle State` Phase: `REVIEWER(code-design)` + +## COMMIT (after reviewer approval) ```bash git add -A git commit -m "feat(<feature-name>): implement <what this test covers>" ``` +Update TODO.md: +- Mark the `@id` row `[x]` with ` — reviewer(code-design) APPROVED` +- Update `## Cycle State` Phase to `COMMITTED` +- Update `## Next` to the next failing test + Then move to the next failing test. -## Self-Verification Before Handoff +## Handling Spec Gaps -After all tests are green, before telling the reviewer you are ready: +If during implementation you discover a behavior not covered by existing acceptance criteria: +- **Do not extend criteria yourself** — escalate to the PO +- Note the gap in TODO.md under `## Next` +- The PO will decide whether to add a new Example to the `.feature` file -```bash -uv run task lint # exit 0 -uv run task static-check # exit 0, 0 errors -uv run task test # exit 0, all pass, coverage 100% -timeout 10s uv run task run # exit non-124; exit 124 = hung process = fix it +Extra tests in `tests/unit/` are allowed freely (coverage, edge cases, etc.) — these do not need `@id` traceability. + +## Signature Design + +Design signatures before writing bodies. 
Use Python protocols for abstractions: + +```python +from typing import Protocol +from dataclasses import dataclass + +@dataclass(frozen=True, slots=True) +class EmailAddress: + """A validated email address.""" + + value: str + + def __post_init__(self) -> None: + """Validate the email format on creation.""" + if "@" not in self.value: + raise ValueError(f"Invalid email: {self.value!r}") + +class UserRepository(Protocol): + """Persistence interface for users.""" + + def save(self, user: "User") -> None: ... + def find_by_email(self, email: EmailAddress) -> "User | None": ... ``` -All four must pass. Do not hand off broken work. +## Self-Verification Before Handoff + +After all tests are green and every per-test cycle has been committed with reviewer approval, complete these final checks before handing off to the reviewer for full Step 5 verification. + +**Manual verification**: Run the app and verify it does what the AC says, not just what the tests check. -**Manual verification**: After all four commands pass, run the app and manually verify it does what the AC says, not just what the tests check. If the feature involves user interaction, interact with it yourself. +**Production-grade check**: If you change an input, does the output change accordingly? If any output is static regardless of input, the implementation is not complete. -**Production-grade check**: Before handing off, answer honestly: if you change an input, does the output change accordingly? If any output is static regardless of input, the implementation is not complete — fix it before handing off. The reviewer will verify this by running the app and changing an input. +**Developer pre-mortem**: In 2-3 sentences, answer: "If this feature shipped but was broken for the user, what would be the most likely reason?" Include this in the handoff message. 
+
+**Quality tooling** — run all four, all must pass:
+
+```bash
+uv run task lint              # exit 0
+uv run task static-check     # exit 0, 0 errors
+uv run task test             # exit 0, all pass, coverage 100%
+timeout 10s uv run task run  # exit non-124; exit 124 = hung process = fix it
+```
 
-**Developer pre-mortem** (write this before handing off to reviewer): In 2–3 sentences, answer: "If this feature shipped but was broken for the user, what would be the most likely reason?" Include this in the handoff message or as a `## Pre-mortem` subsection in the feature doc's Architecture section.
+Do not hand off broken work. These are the only commands that run at handoff — the Design Self-Declaration was already completed and verified per-test during each REFACTOR cycle.
diff --git a/.opencode/skills/pr-management/SKILL.md b/.opencode/skills/pr-management/SKILL.md
index 728ad00..172ebd5 100644
--- a/.opencode/skills/pr-management/SKILL.md
+++ b/.opencode/skills/pr-management/SKILL.md
@@ -52,8 +52,8 @@ gh pr create \
 
 - <What this PR does in 1-3 bullet points>
 
 ## Acceptance Criteria
-- [x] `<uuid>`: <description>
-- [x] `<uuid>`: <description>
+- [x] `@id:<hex>`: <description>
+- [x] `@id:<hex>`: <description>
 
 ## Testing
 - All tests pass: `task test`
@@ -75,7 +75,7 @@ EOF
 - [ ] `task static-check` exits 0
 - [ ] `task test` exits 0, coverage 100%
 - [ ] `timeout 10s task run` exits with code ≠ 124
-- [ ] PR description includes all UUID acceptance criteria
+- [ ] PR description includes all `@id` acceptance criteria
 
 ## Merging
 
diff --git a/.opencode/skills/scope/SKILL.md b/.opencode/skills/scope/SKILL.md
index ce2a70e..ac33fb7 100644
--- a/.opencode/skills/scope/SKILL.md
+++ b/.opencode/skills/scope/SKILL.md
@@ -1,7 +1,7 @@
 ---
 name: scope
-description: Step 1 — define user stories and acceptance criteria with UUID traceability
-version: "1.0"
+description: Step 1 — discover requirements through stakeholder interviews and write Gherkin acceptance criteria
+version: "2.0"
 author: product-owner
 audience: product-owner
 workflow: feature-lifecycle
@@ -9,41 +9,130 @@ workflow: feature-lifecycle
 
 # Scope
 
-This
skill guides the product owner through Step 1 of the feature lifecycle: defining what to build with enough precision that a developer can write tests without asking questions. +This skill guides the PO through Step 1 of the feature lifecycle: interviewing the stakeholder, discovering requirements, and writing Gherkin specifications precise enough for a developer to write tests without asking questions. ## When to Use -When the PO is starting a new feature. The output is a feature document in `docs/features/backlog/`. +When the PO is starting a new project or a new feature. The output is a set of discovery documents and `.feature` files in `docs/features/backlog/<name>/`. -## Step-by-Step +## Overview -### 1. Create the Feature Document +Step 1 has 4 phases: -Create `docs/features/backlog/<verb>-<object>.md`. Filename must be kebab-case, imperative verb first, 2–4 words. -Examples: `display-version.md`, `authenticate-user.md`, `export-metrics-csv.md` +| Phase | Who | Output | +|---|---|---| +| 1. Project Discovery | PO + stakeholder | `docs/features/discovery.md` + feature list | +| 2. Feature Discovery | PO + stakeholder | `docs/features/backlog/<name>/discovery.md` | +| 3. Stories | PO alone | `<story-slug>.feature` files (no Examples) | +| 4. Criteria | PO alone | `Example:` blocks with `@id` tags | -```markdown -# Feature: <Verb> <Object> +--- -## User Stories -- As a <role>, I want <goal> so that <benefit> +## Phase 1 — Project Discovery -## Acceptance Criteria +**When**: Once per project, before any features are scoped. -- `<uuid>`: <Short description>. - Source: <stakeholder | po | developer | reviewer | bug> +### 1.1 Create Project Discovery Document - Given: <precondition> - When: <action> - Then: <expected outcome> +Create `docs/features/discovery.md` with Status + Questions only (no Entities table). See the format in the "Discovery Document Formats" section below. 
-## Notes -<constraints, risks, out-of-scope items, dependencies> -``` +### 1.2 Ask the 7 Standard Questions + +Present all questions to the stakeholder at once: + +1. **Who** are the users of this product? +2. **What** does the product do at a high level? +3. **Why** does it exist — what problem does it solve? +4. **When** and **where** is it used (environment, platform, context)? +5. **Success** — how do we know it works? What does "done" look like? +6. **Failure** — what does failure look like? What must never happen? +7. **Out-of-scope** — what are we explicitly not building? + +### 1.3 Silent Pre-mortem + +After receiving answers, run this internally (do not show the stakeholder): + +> "Imagine we build exactly what the stakeholder described, ship it, and it fails. What was missing from their answers?" + +Generate targeted follow-up questions from this analysis. Add them to the Questions table in `discovery.md`. + +### 1.4 Follow Up + +Present all follow-up questions at once. Continue until all questions have status `ANSWERED`. + +### 1.5 Baseline + +When all questions are answered, autonomously set `Status: BASELINED` in `docs/features/discovery.md`. + +From the answers, identify the feature list. For each feature, create `docs/features/backlog/<name>/discovery.md` using the per-feature template (with Entities table). + +Commit: `feat(discovery): baseline project discovery` + +--- + +## Phase 2 — Feature Discovery + +**When**: Per feature, after project discovery is baselined. + +### 2.1 Derive Questions from Feature Entities + +Open `docs/features/backlog/<name>/discovery.md`. This step happens **before** any stakeholder interaction. -### 2. Write User Stories +1. **Populate the Entities table**: Extract nouns (candidate classes/models) and verbs (candidate methods/features) from project discovery answers relevant to this feature. Mark each as in-scope or not. +2. 
**Generate questions from entities**: For each in-scope entity, ask: + - What are its boundaries and edge cases? + - What happens when it's missing, invalid, or at its limits? + - How does it interact with other entities? +3. **Add questions from gaps**: Questions from areas not covered by project discovery, ambiguities specific to this feature, and boundary conditions. +4. **Silent pre-mortem** (before the first interview round): -Each story follows the format: "As a `<role>`, I want `<goal>` so that `<benefit>`." +> "Imagine the developer builds this feature exactly as described, all tests pass, but the feature doesn't work for the user. What would be missing?" + +Add any discoveries as new questions to the Questions table. + +### 2.2 Interview + +Present **all** questions to the stakeholder at once. After receiving answers: + +1. Mark answered questions as `ANSWERED` in the Questions table +2. Run a silent pre-mortem on the new answers — generate follow-up questions +3. Present follow-up questions to the stakeholder +4. Repeat until the stakeholder says **"baseline"** to freeze discovery + +### 2.3 Feature Decomposition Check + +Before moving to Phase 3, check: does this feature span **>2 distinct concerns** OR have **>8 candidate Examples**? If yes: + +1. Split into separate features in `backlog/` — each addressing a single cohesive concern +2. Create a new `discovery.md` for each split feature +3. Re-run Phase 2 for any split feature that needs its own discovery + +### 2.4 Baseline + +When the stakeholder says "baseline" (and decomposition check passes), set `Status: BASELINED` in the feature `discovery.md`. + +Commit: `feat(discovery): baseline <name> feature discovery` + +--- + +## Phase 3 — Stories + +**When**: After feature discovery is baselined. PO works alone. + +### 3.1 Write User Story Files + +Create one `.feature` file per user story in `docs/features/backlog/<name>/`. + +Filename: `<story-slug>.feature` — kebab-case, 2-4 words. 
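The filename rule above is mechanical enough to lint. A minimal check, as a sketch (the `story_slug_ok` helper and the exact regex reading of "2-4 words" are assumptions, not part of this skill):

```python
import re

# Kebab-case: 2-4 lowercase words joined by single hyphens
# (assumed reading of the "2-4 words" rule).
SLUG_RE = re.compile(r"[a-z]+(?:-[a-z]+){1,3}")


def story_slug_ok(filename: str) -> bool:
    """Return True if filename is a plausible <story-slug>.feature name."""
    stem, _, ext = filename.rpartition(".")
    return ext == "feature" and SLUG_RE.fullmatch(stem) is not None
```

`bounce-ball.feature` passes; `ball.feature` (one word) and `BounceBall.feature` (not kebab-case) do not.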
+ +Content (no Examples yet): + +```gherkin +Feature: <Title in natural language> + As a <role> + I want <goal> + So that <benefit> +``` Good stories are: - **Independent**: can be delivered without other stories @@ -55,80 +144,171 @@ Good stories are: Avoid: "As the system, I want..." (no business value). Break down stories that contain "and" into two stories. -### 3. Write Acceptance Criteria +### 3.2 INVEST Gate + +Before committing, verify every story passes: -Each criterion maps directly to one test. Write as many as needed — one per observable behavior. +| Letter | Question | FAIL action | +|---|---|---| +| **I**ndependent | Can this story be delivered without other stories? | Split or reorder dependencies | +| **N**egotiable | Are details open to discussion with the developer? | Remove over-specification | +| **V**aluable | Does it deliver something the end user cares about? | Reframe or drop | +| **E**stimable | Can a developer estimate the effort? | Split or add discovery questions | +| **S**mall | Completable in one feature cycle? | Split into smaller stories | +| **T**estable | Can it be verified with a concrete test? | Rewrite with observable outcomes | -**UUID generation**: +### 3.3 Review Checklist + +- [ ] Every story has a distinct user role and benefit +- [ ] No story duplicates another +- [ ] Stories collectively cover all entities marked in-scope in `discovery.md` +- [ ] Every story passes the INVEST gate + +Commit: `feat(stories): write user stories for <name>` + +--- + +## Phase 4 — Criteria + +**When**: After stories are written. PO works alone. + +### 4.1 Silent Pre-mortem Per Story + +For each `.feature` file, ask internally: + +> "What observable behaviors must we prove for this story to be complete?" + +### 4.2 Write Example Blocks + +Add `Example:` blocks to each `.feature` file. Each Example gets an `@id:<8-char-hex>` tag. 
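An 8-char hex id is just four random bytes rendered as hex; the `gen-id` task invoked below might wrap something as small as this (assumed implementation — the real task definition may differ):

```python
import secrets


def gen_id() -> str:
    """Return a fresh 8-character lowercase hex id for an @id: tag."""
    return secrets.token_hex(4)  # 4 random bytes -> 8 lowercase hex chars
```

`token_hex` emits only `[0-9a-f]`, so every generated id matches the `@id:[a-f0-9]{8}` pattern the tooling greps for.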
+ +**ID generation**: ```bash -python -c "import uuid; print(uuid.uuid4())" +uv run task gen-id ``` -**Format** (mandatory — exactly this structure): -```markdown -- `a1b2c3d4-e5f6-7890-abcd-ef1234567890`: Ball bounces off top wall. - Source: stakeholder +**Format** (mandatory): - Given: A ball moving upward reaches y=0 - When: The physics engine processes the next frame - Then: The ball velocity y-component becomes positive +```gherkin + @id:a3f2b1c4 + Example: Ball bounces off top wall + Given a ball moving upward reaches y=0 + When the physics engine processes the next frame + Then the ball velocity y-component becomes positive ``` -**Source values** (choose exactly one): -- `stakeholder` — an external stakeholder gave this requirement to the PO -- `po` — the PO originated this criterion independently -- `developer` — a gap found during Step 4 implementation -- `reviewer` — a gap found during Step 5 verification -- `bug` — a post-merge regression; the feature doc was reopened - **Rules**: -- UUID must be unique across the entire project, not just this feature -- First line: UUID + colon + short description ending with a period -- `Source:` on the next line, followed by a blank line, then Given/When/Then -- Use plain English, not technical jargon in Given/When/Then -- "Then" must be a single observable, measurable outcome — no "and" -- **Observable means observable by the end user, not by a test harness.** If the AC says "when the user presses W," a test that calls `update_player("W")` does not satisfy it. Either (a) the test must send input through the actual user-facing entry point, or (b) the AC must explicitly state the boundary ("when `update_player` receives 'W'") so the gap is visible. 
+- `@id` tag on the line before `Example:` +- `Example:` keyword (not `Scenario:`) +- `Given/When/Then` in plain English +- `Then` must be a single, observable, measurable outcome — no "and" +- **Observable means observable by the end user**, not by a test harness +- **Declarative, not imperative** — describe behavior, not UI steps +- Each Example must be observably distinct from every other +- A single `.feature` file must not span multiple concerns — split into separate `.feature` files if needed (a feature folder can contain multiple `.feature` files) +- If user interaction is involved, the Feature description must declare the interaction model + +**Declarative vs. imperative Gherkin**: + +| Imperative (wrong) | Declarative (correct) | +|---|---| +| Given I type "bob" in the username field | Given a registered user Bob | +| When I click the Login button | When Bob logs in | +| Then I see "Welcome, Bob" on the dashboard | Then Bob sees a personalized welcome | -**Interaction model declaration**: If the feature involves user interaction (CLI input, web forms, API calls), the Notes section must declare the interaction model: what input the user provides and how. This prevents a hardcoded demo from silently substituting for real interaction. +Write Examples that describe *what happens*, not *how the user clicks through the UI*. Imperative steps couple tests to specific UI layouts and break when the UI changes. + +**MoSCoW triage**: When a story spans multiple concerns or has many candidate Examples, ask for each one: is this a **Must** (required for the story to be correct), a **Should** (high value but deferrable), or a **Could** (nice-to-have edge case)? If the story spans >2 concerns or Musts alone exceed 8, the story needs splitting. 
**Common mistakes to avoid**: - "Then: It works correctly" (not measurable) -- "Then: The system updates the database and sends an email" (split into two criteria) -- Multiple behaviors in one criterion (split them) -- Criteria that test implementation details ("Then: the Strategy pattern is used") +- "Then: The system updates the database and sends an email" (split into two Examples) +- Multiple behaviors in one Example (split them) +- Examples that test implementation details ("Then: the Strategy pattern is used") +- Imperative UI steps instead of declarative behavior descriptions -### 4. Review Checklist +### 4.3 Review Checklist Before committing: -- [ ] Filename is `<verb>-<object>.md`, imperative verb first, 2–4 words -- [ ] Title matches filename: `# Feature: <Verb> <Object>` in Title Case -- [ ] Every user story has at least one acceptance criterion -- [ ] Every UUID is unique (check existing feature docs) -- [ ] Every criterion has a `Source:` field with one of the five valid values -- [ ] Every criterion has Given/When/Then -- [ ] Blank line between `Source:` and `Given:` -- [ ] "Then" is a single, observable, measurable outcome — observable by the end user -- [ ] No criterion tests implementation details -- [ ] Out-of-scope items are explicitly listed in Notes -- [ ] If the feature involves user interaction, the Notes section declares the interaction model - -**PO pre-mortem** (do this before committing): Imagine the developer builds exactly what the AC says, all automated tests pass, but the feature doesn't work for the user. What would be missing? Add any discoveries as additional acceptance criteria. - -### 5. 
Commit and Notify Developer
+- [ ] Every `.feature` file has at least one Example
+- [ ] Every `@id` is unique within this feature (check: `grep -rhoE "@id:[a-f0-9]{8}" docs/features/backlog/<name>/ | sort | uniq -d` must print nothing)
+- [ ] Every Example has `Given/When/Then`
+- [ ] Every `Then` is a single, observable, measurable outcome
+- [ ] No Example tests implementation details
+- [ ] If user interaction is involved, the interaction model is declared in the Feature description
+- [ ] Each Example is observably distinct from every other
+- [ ] No single `.feature` file spans multiple concerns (split if needed)
+
+### 4.4 Final Pre-mortem
+
+Before committing, one last check:
+
+> "Imagine the developer builds exactly what these Examples say, all automated tests pass, but the feature doesn't work for the user. What would be missing?"
+
+Add any discoveries as new Examples.
+
+### 4.5 Commit and Freeze
 
 ```bash
-git add docs/features/backlog/<feature-name>.md
-git commit -m "feat(scope): define <feature-name> acceptance criteria"
+git add docs/features/backlog/<name>/
+git commit -m "feat(criteria): write acceptance criteria for <name>"
+```
+
+**After this commit, the `.feature` files are frozen.** Any change requires:
+1. Add `@deprecated` tag to the old Example
+2. Write a new Example with a new `@id`
+3. Run `uv run task gen-tests` to sync test stubs
+
+---
+
+## Discovery Document Formats
+
+### Project-Level (`docs/features/discovery.md`)
+
+```markdown
+# Discovery: <project-name>
+
+## State
+Status: ELICITING | BASELINED
+
+## Questions
+| ID | Question | Answer | Status |
+|----|----------|--------|--------|
+| Q1 | Who are the users? | ... | OPEN / ANSWERED |
+```
-The developer moves the feature from `backlog/` to `in-progress/` as the first act of Step 2.
+No Entities table at project level.
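Tooling that needs to know whether a discovery document is frozen can key off the Status line; a minimal reader, as a sketch (`discovery_status` is a hypothetical helper, not part of any skill script):

```python
def discovery_status(text: str) -> str:
    """Return the Status value from a discovery.md body, or 'UNKNOWN'."""
    for line in text.splitlines():
        if line.startswith("Status:"):
            return line.removeprefix("Status:").strip()
    return "UNKNOWN"
```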
-## MoSCoW Prioritization +### Per-Feature (`docs/features/backlog/<name>/discovery.md`) + +```markdown +# Discovery: <feature-name> + +## State +Status: ELICITING | BASELINED + +## Entities +| Type | Name | Candidate Class/Method | In Scope | +|------|------|----------------------|----------| +| Noun | Ball | Ball | Yes | +| Verb | Bounce | Ball.bounce() | Yes | + +## Rules +Business rules that apply across multiple Examples. Each rule explains *why* a group of Examples exists. + +- <Rule description> + +## Constraints +Non-functional requirements specific to this feature (performance, security, usability, etc.). + +- <Constraint description> + +## Questions +| ID | Question | Answer | Status | +|----|----------|--------|--------| +| Q1 | ... | ... | OPEN / ANSWERED | +``` -When ordering multiple features in the backlog, use: -- **Must**: required for the product to work -- **Should**: high value, strong business case -- **Could**: nice to have, low risk to defer -- **Won't**: explicitly out of scope for now +The **Rules** section captures the business-rule layer from Example Mapping: each rule may generate multiple Examples, and identifying rules first prevents redundant or contradictory Examples. -Add a `Priority: Must | Should | Could | Won't` line to each feature doc's Notes section. +The **Constraints** section captures non-functional requirements. Testable constraints should become `Example:` blocks with `@id` tags. System-wide constraints belong in the project-level `discovery.md`. 
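The uniqueness check from the 4.3 checklist can also be scripted rather than eyeballed from grep output; a minimal sketch (`find_duplicate_ids` is a hypothetical helper, separate from the skill's own scripts):

```python
import re
from collections import Counter

ID_RE = re.compile(r"@id:([a-f0-9]{8})")


def find_duplicate_ids(feature_texts: list[str]) -> list[str]:
    """Return @id values that appear more than once across .feature contents."""
    counts = Counter(
        id_hex for text in feature_texts for id_hex in ID_RE.findall(text)
    )
    return sorted(id_hex for id_hex, n in counts.items() if n > 1)
```

An empty result means every Example id in the folder is unique.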
diff --git a/.opencode/skills/scope/discovery-template.md b/.opencode/skills/scope/discovery-template.md new file mode 100644 index 0000000..5079e99 --- /dev/null +++ b/.opencode/skills/scope/discovery-template.md @@ -0,0 +1,16 @@ +# Discovery: <feature-name> + +## State +Status: ELICITING + +## Entities +| Type | Name | Candidate Class/Method | In Scope | +|------|------|----------------------|----------| + +## Rules + +## Constraints + +## Questions +| ID | Question | Answer | Status | +|----|----------|--------|--------| diff --git a/.opencode/skills/session-workflow/SKILL.md b/.opencode/skills/session-workflow/SKILL.md index d67ae1a..f7c7afd 100644 --- a/.opencode/skills/session-workflow/SKILL.md +++ b/.opencode/skills/session-workflow/SKILL.md @@ -1,7 +1,7 @@ --- name: session-workflow description: Session start and end protocol — read TODO.md, continue from checkpoint, update and commit -version: "1.0" +version: "2.1" author: developer audience: all-agents workflow: session-management @@ -13,12 +13,15 @@ Every session starts by reading state. Every session ends by writing state. This ## Session Start -1. Read `TODO.md` — find current feature, current step, and the "Next" line -2. Read `docs/features/in-progress/<feature-name>.md` if a feature is active +1. Read `TODO.md` — find current feature, current step, and the "Next" line. + - If `TODO.md` does not exist, run `uv run task gen-todo` to create it, then read the result. +2. If a feature is active, read: + - `docs/features/in-progress/<name>/discovery.md` — feature discovery + - `docs/features/discovery.md` — project-level discovery (for context) 3. Run `git status` — understand what is committed vs. what is not 4. Confirm scope: you are working on exactly one step of one feature -If TODO.md says "No feature in progress", check `docs/features/backlog/`. If the backlog is empty, the PO needs to define the next feature. 
+If TODO.md says "No feature in progress", report to the PO that backlog features are waiting. **The developer never self-selects a feature from the backlog — only the PO picks.** ## Session End @@ -26,12 +29,25 @@ If TODO.md says "No feature in progress", check `docs/features/backlog/`. If the - Mark completed criteria `[x]` - Mark in-progress criteria `[~]` - Update the "Next" line with one concrete action -2. Commit any uncommitted work (even WIP): +2. Run `uv run task gen-todo` to sync any new @id rows from .feature files into TODO.md. +3. Commit any uncommitted work (even WIP): ```bash git add -A git commit -m "WIP(<feature-name>): <what was done>" ``` -3. If a step is fully complete, use the proper commit message instead of WIP. +4. If a step is fully complete, use the proper commit message instead of WIP. + +## Step Completion Protocol + +When a step completes within a session: + +1. Update TODO.md to reflect the completed step before doing any other work. +2. Commit the TODO.md update: + ```bash + git add TODO.md + git commit -m "chore: complete step <N> for <feature-name>" + ``` +3. Only then begin the next step (in a new session where possible — see Rule 4). ## TODO.md Format @@ -40,17 +56,22 @@ If TODO.md says "No feature in progress", check `docs/features/backlog/`. 
If the Feature: <name> Step: <1-6> (<step name>) -Source: docs/features/in-progress/<name>.md +Source: docs/features/in-progress/<name>/discovery.md ## Progress -- [x] `<uuid>`: <description> -- [~] `<uuid>`: <description> ← IN PROGRESS -- [ ] `<uuid>`: <description> +- [x] `@id:<hex>`: <description> +- [~] `@id:<hex>`: <description> ← IN PROGRESS +- [ ] `@id:<hex>`: <description> ## Next <One sentence: exactly what to do in the next session> ``` +**Source path by step:** +- Step 1: `Source: docs/features/backlog/<name>/discovery.md` +- Steps 2–5: `Source: docs/features/in-progress/<name>/discovery.md` +- Step 6: `Source: docs/features/completed/<name>/discovery.md` + Status markers: - `[ ]` — not started - `[~]` — in progress @@ -65,6 +86,53 @@ No feature in progress. Next: PO picks feature from docs/features/backlog/ and moves it to docs/features/in-progress/. ``` +## Step 4 Cycle-Aware TODO Format + +During Step 4 (Implementation), TODO.md **must** include a `## Cycle State` block to track Red-Green-Refactor-Review progress. This block is **mandatory** — missing it means the cycle is unverifiable. 
+ +```markdown +# Current Work + +Feature: <name> +Step: 4 (implement) +Source: docs/features/in-progress/<name>/discovery.md + +## Cycle State +Test: `@id:<hex>` — <description> +Phase: RED | GREEN | REFACTOR | SELF-DECLARE | REVIEWER(code-design) | COMMITTED + +## Progress +- [x] `@id:<hex>`: <description> — reviewer(code-design) APPROVED +- [~] `@id:<hex>`: <description> ← in progress (see Cycle State) +- [ ] `@id:<hex>`: <description> ← next + +## Next +<One actionable sentence> +``` + +### Reviewer Scope Legend + +When referencing reviewer interactions in TODO.md: +- `reviewer(code-design)` — per-test design check during Step 4 (YAGNI/KISS/DRY/SOLID/ObjCal/patterns + semantic alignment only) +- `reviewer(full-verify)` — Step 5 full verification (lint, pyright, coverage, semantic review, adversarial testing) + +## gen-todo Script + +`uv run task gen-todo` keeps TODO.md in sync with `.feature` files: + +```bash +uv run task gen-todo # merge-write: add missing @id rows, preserve existing status +uv run task gen-todo -- --check # dry run — report what would change +``` + +**Merge rules:** +- Adds any `@id` rows from in-progress `.feature` files that are missing in `## Progress` +- Never removes or downgrades existing `[x]`, `[~]`, `[-]` rows +- Preserves the `Step:` field and `## Next` line from the current TODO.md +- If no feature is in-progress, writes the "No feature in progress" format + +Run `gen-todo` at session start (after reading TODO.md) and at session end (before committing). + ## Rules 1. Never skip reading TODO.md at session start @@ -72,3 +140,5 @@ Next: PO picks feature from docs/features/backlog/ and moves it to docs/features 3. Never leave uncommitted changes — commit as WIP if needed 4. One step per session where possible; do not start Step N+1 in the same session as Step N 5. The "Next" line must be actionable enough that a fresh AI can execute it without asking questions +6. 
During Step 4, always update `## Cycle State` when transitioning between RED/GREEN/REFACTOR/SELF-DECLARE/REVIEWER phases +7. When a step completes, update TODO.md and commit **before** any further work diff --git a/.opencode/skills/session-workflow/scripts/gen_todo.py b/.opencode/skills/session-workflow/scripts/gen_todo.py new file mode 100644 index 0000000..b4b883c --- /dev/null +++ b/.opencode/skills/session-workflow/scripts/gen_todo.py @@ -0,0 +1,376 @@ +"""Generate and sync the TODO.md session bookmark from .feature files. + +Reads the in-progress feature folder (or backlog if no in-progress feature), +merges missing @id rows into the existing TODO.md, and writes the result. + +Modes: + uv run task gen-todo Merge-write TODO.md (default) + uv run task gen-todo -- --check Dry run — show what would change + +Merge rules: + - Adds @id rows that are in .feature files but missing from TODO.md + - Never removes or downgrades existing [x], [~], [-] rows + - Updates the Feature/Step/Source header from the in-progress folder + - If no feature is in-progress, writes the "No feature in progress" format +""" + +from __future__ import annotations + +import re +import sys +from dataclasses import dataclass +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parents[4] +FEATURES_DIR = PROJECT_ROOT / "docs" / "features" +TODO_PATH = PROJECT_ROOT / "TODO.md" + +PROGRESS_ROW_RE = re.compile(r"^- \[(?P<status>[x~\- ])\] `@id:(?P<id>[a-f0-9]{8})`") +ID_TAG_RE = re.compile(r"@id:([a-f0-9]{8})") +EXAMPLE_RE = re.compile(r"^\s*Example:\s*(.+)$") +DEPRECATED_TAG_RE = re.compile(r"@deprecated") + + +@dataclass(frozen=True, slots=True) +class Criterion: + """One acceptance criterion extracted from a .feature file.""" + + id_hex: str + title: str + deprecated: bool + + +def find_in_progress_feature() -> tuple[str, Path] | None: + """Find the single feature currently in docs/features/in-progress/. 
+ + Returns: + Tuple of (feature_name, feature_path) or None if nothing is in progress. + """ + in_progress = FEATURES_DIR / "in-progress" + if not in_progress.exists(): + return None + folders = [ + f + for f in in_progress.iterdir() + if f.is_dir() and f.name != ".gitkeep" and not f.name.startswith(".") + ] + if not folders: + return None + return folders[0].name, folders[0] + + +def find_backlog_features() -> list[str]: + """List feature names in docs/features/backlog/. + + Returns: + Sorted list of feature folder names. + """ + backlog = FEATURES_DIR / "backlog" + if not backlog.exists(): + return [] + return sorted( + f.name + for f in backlog.iterdir() + if f.is_dir() and f.name != ".gitkeep" and not f.name.startswith(".") + ) + + +def extract_criteria(feature_path: Path) -> list[Criterion]: + """Extract all @id-tagged Examples from .feature files in a feature folder. + + Args: + feature_path: Path to the feature folder. + + Returns: + Ordered list of Criterion objects (deprecated ones included). + """ + criteria: list[Criterion] = [] + for feature_file in sorted(feature_path.glob("*.feature")): + criteria.extend(_parse_feature_file(feature_file)) + return criteria + + +def _parse_feature_file(path: Path) -> list[Criterion]: + """Parse a single .feature file for @id-tagged Examples. + + Args: + path: Path to the .feature file. + + Returns: + List of Criterion objects found in this file. + """ + lines = path.read_text(encoding="utf-8").splitlines() + criteria: list[Criterion] = [] + i = 0 + while i < len(lines): + line = lines[i] + id_match = ID_TAG_RE.search(line) + if id_match: + id_hex = id_match.group(1) + deprecated = bool(DEPRECATED_TAG_RE.search(line)) + title = _find_example_title(lines, i + 1) + criteria.append( + Criterion(id_hex=id_hex, title=title, deprecated=deprecated) + ) + i += 1 + return criteria + + +def _find_example_title(lines: list[str], start: int) -> str: + """Scan forward from start to find the Example: title line. 
+ + Args: + lines: All lines from the .feature file. + start: Index to start scanning from. + + Returns: + The Example title string, or empty string if not found. + """ + for i in range(start, min(start + 5, len(lines))): + m = EXAMPLE_RE.match(lines[i]) + if m: + return m.group(1).strip() + return "" + + +def read_existing_progress(todo_text: str) -> dict[str, str]: + """Extract existing @id rows and their status from TODO.md content. + + Args: + todo_text: Full content of current TODO.md. + + Returns: + Dict mapping id_hex -> status character ('x', '~', '-', ' '). + """ + existing: dict[str, str] = {} + for line in todo_text.splitlines(): + m = PROGRESS_ROW_RE.match(line) + if m: + existing[m.group("id")] = m.group("status") + return existing + + +def build_progress_lines( + criteria: list[Criterion], + existing: dict[str, str], +) -> list[str]: + """Build the ## Progress section lines, merging new with existing. + + Args: + criteria: All criteria from .feature files (in order). + existing: Existing @id -> status mapping from current TODO.md. + + Returns: + List of progress row strings (without trailing newline). + """ + lines = [] + for c in criteria: + status = existing.get(c.id_hex, " ") + label = c.title or "(no title)" + suffix = " — DEPRECATED" if c.deprecated else "" + lines.append(f"- [{status}] `@id:{c.id_hex}`: {label}{suffix}") + return lines + + +def build_todo_content( + feature_name: str, + step: str, + source: str, + progress_lines: list[str], + next_action: str, +) -> str: + """Assemble the full TODO.md content. + + Args: + feature_name: Display name of the current feature. + step: Current step number and name, e.g. '4 (implement)'. + source: Path to discovery.md. + progress_lines: The ## Progress rows. + next_action: The ## Next one-liner. + + Returns: + Full TODO.md content string. 
+ """ + lines = [ + "# Current Work", + "", + f"Feature: {feature_name}", + f"Step: {step}", + f"Source: {source}", + "", + "## Progress", + *progress_lines, + "", + "## Next", + next_action, + "", + ] + return "\n".join(lines) + + +def build_empty_todo() -> str: + """Build the 'No feature in progress' TODO.md content. + + Returns: + Minimal TODO.md content string. + """ + return "\n".join( + [ + "# Current Work", + "", + "No feature in progress.", + "Next: PO picks feature from docs/features/backlog/ and moves it to" + " docs/features/in-progress/.", + "", + ] + ) + + +def _extract_header_field(todo_text: str, field: str) -> str: + """Extract a header field value from existing TODO.md. + + Args: + todo_text: Full TODO.md content. + field: Field name to look for (e.g. 'Step', 'Feature'). + + Returns: + The value string, or empty string if not found. + """ + pattern = re.compile(rf"^{field}:\s*(.+)$", re.MULTILINE) + m = pattern.search(todo_text) + return m.group(1).strip() if m else "" + + +def _extract_next_action(todo_text: str) -> str: + """Extract the ## Next line from existing TODO.md. + + Args: + todo_text: Full TODO.md content. + + Returns: + The Next action string, or a placeholder. + """ + lines = todo_text.splitlines() + for i, line in enumerate(lines): + if line.strip() == "## Next" and i + 1 < len(lines) and lines[i + 1].strip(): + return lines[i + 1].strip() + return "<fill in next action>" + + +def _sync_no_feature(*, check_only: bool) -> int: + """Handle sync when no feature is in progress. + + Args: + check_only: If True, report changes without writing. + + Returns: + Exit code: 0 = in sync or wrote successfully, 1 = changes needed (check mode). 
+ """ + new_content = build_empty_todo() + existing = TODO_PATH.read_text(encoding="utf-8") if TODO_PATH.exists() else "" + if existing.strip() == new_content.strip(): + print("TODO.md is in sync.") + return 0 + if check_only: + print("TODO.md would be updated: no feature in progress format.") + return 1 + TODO_PATH.write_text(new_content, encoding="utf-8") + print("TODO.md updated: no feature in progress.") + return 0 + + +def _write_or_report( + new_content: str, + new_ids: set[str], + criteria: list[Criterion], + *, + check_only: bool, +) -> int: + """Write updated TODO.md or report what would change. + + Args: + new_content: The new TODO.md content to write. + new_ids: Set of @id hex values that are new (not in existing TODO.md). + criteria: All criteria from .feature files. + check_only: If True, report changes without writing. + + Returns: + Exit code: 0 = wrote successfully, 1 = changes needed (check mode). + """ + if check_only: + if new_ids: + print(f"TODO.md would add {len(new_ids)} new @id row(s):") + for c in criteria: + if c.id_hex in new_ids: + print(f" [ ] @id:{c.id_hex}: {c.title}") + else: + print("TODO.md header or structure would be updated.") + return 1 + TODO_PATH.write_text(new_content, encoding="utf-8") + if new_ids: + print(f"TODO.md updated: added {len(new_ids)} new @id row(s).") + for c in criteria: + if c.id_hex in new_ids: + print(f" [ ] @id:{c.id_hex}: {c.title}") + else: + print("TODO.md updated.") + return 0 + + +def sync_todo(*, check_only: bool = False) -> int: + """Main sync logic: read feature state, merge TODO.md, write if changed. + + Args: + check_only: If True, report changes without writing. + + Returns: + Exit code: 0 = in sync or wrote successfully, 1 = changes needed (check mode). 
+ """ + result = find_in_progress_feature() + + if result is None: + return _sync_no_feature(check_only=check_only) + + feature_name, feature_path = result + criteria = extract_criteria(feature_path) + + existing_text = TODO_PATH.read_text(encoding="utf-8") if TODO_PATH.exists() else "" + existing_progress = read_existing_progress(existing_text) + + step = ( + _extract_header_field(existing_text, "Step") or "? (unknown — update manually)" + ) + source = f"docs/features/in-progress/{feature_name}/discovery.md" + next_action = _extract_next_action(existing_text) + + progress_lines = build_progress_lines(criteria, existing_progress) + new_content = build_todo_content( + feature_name=feature_name, + step=step, + source=source, + progress_lines=progress_lines, + next_action=next_action, + ) + + existing_ids = set(existing_progress.keys()) + new_ids = {c.id_hex for c in criteria} - existing_ids + + if existing_text.strip() == new_content.strip(): + print("TODO.md is in sync.") + return 0 + + return _write_or_report(new_content, new_ids, criteria, check_only=check_only) + + +def main() -> int: + """Entry point for the gen-todo command. + + Returns: + Exit code (0 = success, 1 = changes needed in check mode). 
+ """ + check_only = "--check" in sys.argv + return sync_todo(check_only=check_only) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.opencode/skills/tdd/SKILL.md b/.opencode/skills/tdd/SKILL.md index 20113b4..6a33e27 100644 --- a/.opencode/skills/tdd/SKILL.md +++ b/.opencode/skills/tdd/SKILL.md @@ -1,7 +1,7 @@ --- name: tdd -description: Step 3 — write failing tests mapped 1:1 to UUID acceptance criteria with proper markers and docstrings -version: "1.0" +description: Step 3 — write failing tests mapped 1:1 to @id acceptance criteria with proper markers and docstrings +version: "2.1" author: developer audience: developer workflow: feature-lifecycle @@ -9,45 +9,64 @@ workflow: feature-lifecycle # TDD — Test First -Write tests before writing any production code. Every test must fail when first run. Every test maps to exactly one UUID acceptance criterion. +Write tests before writing any production code. Every test must fail when first run. Every test maps to exactly one `@id` acceptance criterion from a `.feature` file. -## Test Tool Decision +## Step 3 Workflow -| Situation | Tool | -|---|---| -| Deterministic input/output, one scenario | Plain pytest | -| Pure function, many input combinations | Hypothesis `@given` | -| Stateful system with sequences of operations | Hypothesis stateful testing | +1. Run `uv run task gen-tests -- --check` to preview what will be created/updated. +2. Run `uv run task gen-tests` to generate/sync test stubs from `.feature` files. +3. Run a silent pre-mortem: does the architecture fit? Is this the minimal solution? +4. Write failing test bodies (real assertions, not `raise NotImplementedError`) +5. Run `pytest` — confirm every new test fails with `ImportError` or `AttributeError` +6. **STOP — request a reviewer check of test design and semantic alignment. WAIT for APPROVED before committing or implementing.** +7. 
Commit: `test(<feature-name>): write failing tests` -**Never use Hypothesis for**: I/O operations, side effects, network calls, database writes, or anything where the test environment matters. +## Test Stub Generation -## Test File Structure +```bash +uv run task gen-tests -- --check # dry run — review what would change +uv run task gen-tests # apply: sync all features +uv run task gen-tests -- --orphans # list orphaned tests +``` -File naming: `<descriptive-name>_test.py` — never `test_<name>.py`. All test files live directly in `tests/` (flat layout, no subdirectories). +Always run `--check` first to review planned changes before applying them. -| Source | Test | +The script reads `.feature` files from `docs/features/{backlog,in-progress,completed}/` and creates/updates test files in `tests/features/<feature-name>/`. + +| `.feature` state | Script action | |---|---| -| `<package>/module.py` | `tests/module_test.py` | -| `<package>/domain/service.py` | `tests/service_test.py` | -| `<package>/api/routes.py` | `tests/routes_test.py` | +| New `@id` Example | Create stub with `raise NotImplementedError` | +| Example title/Given/When/Then changed | Update docstring + rename function | +| `@deprecated` tag added | Add `@pytest.mark.deprecated` decorator | +| `@deprecated` tag removed | Remove `@pytest.mark.deprecated` decorator | +| Test `@id` matches no Example | Mark orphan: `@pytest.mark.skip(reason="orphan: ...")` | +| completed features | Only toggle `@deprecated` (no docstring changes) | +| Never | Touch function body | -## Test Function Naming +## Test File Structure ``` -test_<short_title> +tests/features/<feature-name>/<story-slug>_test.py ← one per .feature file +tests/unit/<anything>_test.py ← developer-authored extras ``` -Examples: -- `test_ball_bounces_off_top_wall` -- `test_email_requires_at_symbol` -- `test_empty_cart_returns_zero_total` +## Test Function Naming -## Docstring Format (mandatory) +Generated by `gen-tests`: ```python -def 
test_ball_bounces_off_top_wall(): - """a1b2c3d4-e5f6-7890-abcd-ef1234567890 +def test_<feature_slug>_<8char_hex>() -> None: +``` +- `feature_slug` = feature folder name with hyphens replaced by underscores +- `8char_hex` = the `@id` from the `.feature` file + +## Docstring Format (mandatory) + +```python +@pytest.mark.unit +def test_bounce_physics_a3f2b1c4() -> None: + """ Given: A ball moving upward reaches y=0 When: The physics engine processes the next frame Then: The ball velocity y-component becomes positive @@ -57,114 +76,86 @@ def test_ball_bounces_off_top_wall(): # When result = physics.update(ball) # Then - assert result.vy > 0 # Asserts observable behavior + assert result.vy > 0 ``` +**Rules**: +- Docstring contains `Given:/When:/Then:` on separate indented lines +- `# Given`, `# When`, `# Then` comments in the test body mirror the docstring +- No extra metadata in docstring — traceability comes from the function name `@id` suffix + **A test that looks correct but is wrong:** ```python -def test_ball_bounces_off_top_wall(): - """a1b2c3d4-e5f6-7890-abcd-ef1234567890 - - Given: A ball moving upward reaches y=0 - When: The physics engine processes the next frame - Then: The ball velocity y-component becomes positive - """ - # Given +def test_bounce_physics_a3f2b1c4() -> None: + """...""" ball = Ball(x=5, y=0, vy=-1) - # When physics.update(ball) - # Then assert ball._velocity_y > 0 # WRONG: tests internal attribute, not observable behavior - # This test would break if you rename _velocity_y, even though behavior is unchanged. ``` -The correct test (`result.vy > 0`) would still pass after a complete rewrite that preserves behavior. -The wrong test (`ball._velocity_y > 0`) would break if you rename the internal field. +The correct test asserts on the return value. The wrong test breaks if you rename an internal field. 
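To make the distinction concrete, here is a minimal, self-contained sketch — the `BallV1`/`BallV2` classes and `update_*` functions are hypothetical stand-ins, not the project's real API — showing that a behavior-level assertion survives a complete internal rewrite, while an assertion on `_velocity_y` could not even run against the second implementation:

```python
from dataclasses import dataclass


# Hypothetical stand-ins for the Ball/physics API above — illustrative only.
@dataclass
class BallV1:
    y: float
    vy: float


def update_v1(ball: BallV1) -> BallV1:
    # Bounce off the top wall: flip the y-velocity.
    if ball.y <= 0 and ball.vy < 0:
        return BallV1(y=ball.y, vy=-ball.vy)
    return ball


class BallV2:
    """Same observable behavior; internals rewritten to store a velocity tuple."""

    def __init__(self, y: float, vy: float) -> None:
        self.y = y
        self._vel = (0.0, vy)  # the old internal field no longer exists

    @property
    def vy(self) -> float:
        return self._vel[1]


def update_v2(ball: BallV2) -> BallV2:
    if ball.y <= 0 and ball.vy < 0:
        return BallV2(ball.y, -ball.vy)
    return ball


# The behavioral assertion holds for both implementations; an assertion on a
# private attribute would break against BallV2 even though behavior is identical.
for bounced in (update_v1(BallV1(y=0, vy=-1)), update_v2(BallV2(y=0, vy=-1))):
    assert bounced.vy > 0
```

The loop at the bottom is the point: the same `vy > 0` contract check passes against two unrelated internal designs.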
-**Rules**: -- First line: `<uuid>` only — no description -- Mandatory blank line between UUID and Given -- Given/When/Then on separate indented lines -- `# Given`, `# When`, `# Then` comments in the test body mirror the docstring -- UUID must exactly match the UUID on the criterion's first line in the feature doc +## Test Tool Decision -## Markers +| Situation | Tool | +|---|---| +| Deterministic input/output, one scenario | Plain pytest | +| Pure function, many input combinations | Hypothesis `@given` | +| Stateful system with sequences of operations | Hypothesis stateful testing | + +**Never use Hypothesis for**: I/O, side effects, network calls, database writes. + +## Markers (4 total) Every test gets exactly one of: - `@pytest.mark.unit` — isolated, no external state - `@pytest.mark.integration` — multiple components, external state -Slow tests additionally get `@pytest.mark.slow` (anything > 50ms: DB, network, Hypothesis, terminal I/O). +Additionally: +- `@pytest.mark.slow` — takes > 50ms (DB, network, Hypothesis, terminal I/O) +- `@pytest.mark.deprecated` — auto-skipped by conftest hook; added by `gen-tests` ```python @pytest.mark.unit -def test_ball_bounces_off_top_wall(): +def test_bounce_physics_a3f2b1c4() -> None: ... @pytest.mark.integration @pytest.mark.slow -def test_checkout_persists_order_to_database(): +def test_checkout_flow_b2c3d4e5() -> None: ... ``` -### Choosing a Marker - -| Marker | Use When | -|---|---| -| `unit` | One function or class in isolation; no external dependencies | -| `integration` | Multiple components working together; external state (DB, filesystem, network) | -| `slow` | Test takes > 50ms — add alongside `unit` or `integration`, never alone | - When in doubt, start with `unit`. Upgrade to `integration` if the implementation requires external state. ## Hypothesis Tests -Use `@given` with `@example` for known edge cases and `assume` for precondition filtering. Configure via `@settings`, not markers. 
- ```python -from hypothesis import given, example, assume, settings -from hypothesis import strategies as st - @pytest.mark.unit @pytest.mark.slow @given(x=st.floats(min_value=-100, max_value=100, allow_nan=False)) @example(x=0.0) -@example(x=-100.0) @settings(max_examples=200) -def test_compute_distance_always_non_negative(x: float) -> None: - """b2c3d4e5-f6a7-8901-bcde-f12345678901 - +def test_bounce_physics_c4d5e6f7(x: float) -> None: + """ Given: Any floating point input value When: compute_distance is called Then: The result is >= 0 """ - # Given assume(x != 0.0) - # When result = compute_distance(x) - # Then assert result >= 0 ``` ### Meaningful vs. Tautological Property Tests -A meaningful property test asserts an **invariant** — something that must be true regardless of input, that is NOT derived from the implementation itself: - | Tautological (useless) | Meaningful (tests the contract) | |---|---| | `assert Score(x).value == x` | `assert Score(x).value >= 0` | | `assert sorted(list) == sorted(list)` | `assert sorted(list) == sorted(list, key=...)` | | `assert EmailAddress(valid).value == valid` | `assert "@" in EmailAddress(valid).value` | -## Writing Failing Tests (Step 3 Checklist) - -1. For each UUID in the feature doc, create one test function -2. Write the full test body with real assertions (not `raise NotImplementedError`) -3. The test will fail because the production code does not exist yet — that is correct -4. Run `pytest` — confirm every new test fails with `ImportError` or `AttributeError`, not a logic failure -5. Commit: `test(<feature-name>): add failing tests for all acceptance criteria` - ## Integration Test Requirement For any feature with multiple components or user interaction, at least one `@pytest.mark.integration` test must exercise the public entry point with realistic input. This test must NOT call internal helpers directly — it must go through the same path a real user would. 
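As a sketch of what that looks like in practice — with a hypothetical `run_game` entry point standing in for the real one, since the actual feature's boundary will differ — the integration test feeds realistic input through the public path rather than calling internal helpers:

```python
import io


# Hypothetical minimal entry point so the sketch is self-contained;
# a real feature's public boundary (CLI main, event loop) will differ.
def run_game(commands: io.TextIOBase) -> dict:
    """Read one command per line from the input stream, return final state."""
    state = {"x": 0}
    for line in commands:
        if line.strip().upper() == "W":
            state["x"] += 1
    return state


# In the real suite this function would carry @pytest.mark.integration.
def test_move_player_a1b2c3d4() -> None:
    """
    Given: A running game awaiting input
    When: The user presses W twice
    Then: The player position advances by two
    """
    # Given / When — drive the same input path a user would (a text stream),
    # never an internal helper such as a private _apply_move()
    final = run_game(io.StringIO("W\nW\n"))
    # Then
    assert final["x"] == 2
```

Note the test never touches `state` mid-run or any private function: if `run_game` were rewritten internally, this test would still pass as long as pressing W twice moves the player by two.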
@@ -175,33 +166,16 @@ The test's Given/When/Then must operate at the **same abstraction level** as the | AC says | Test must do | |---|---| -| "When the user presses W" | Send `"W"` through the actual input mechanism (stdin, key event, CLI arg) | +| "When the user presses W" | Send `"W"` through the actual input mechanism | | "When `update_player` receives 'W'" | Call `update_player("W")` directly — the boundary is explicit | -If testing through the real entry point is infeasible, the developer must add a new AC via `skill extend-criteria` that explicitly describes the lower-level boundary. **Never silently shift abstraction levels.** - -## UUID Uniqueness - -One test function per UUID. One UUID per test function. If only `Given` varies across cases, that is a property by definition — use Hypothesis `@given` with `@example` for known edge cases. If `When` or `Then` would differ, that is a new criterion: use `extend-criteria`. - -## Property-Based Testing Decision Rule - -Use Hypothesis when a function has invariants — things that must always be true regardless of input: - -| Code has... | Write a property test for... | -|---|---| -| Ball position clamped to screen | Position always stays within bounds | -| Score accumulator | Score never goes negative | -| Sorted collection | Order is always preserved after insertion | -| Domain value object | Constructor always rejects invalid inputs | - -This catches "technically passes but doesn't work" failures that deterministic tests miss. +If testing through the real entry point is infeasible, escalate to the PO to adjust the AC boundary. **Never silently shift abstraction levels.** ## Quality Rules -- Write every test as if you cannot see the production code. The test describes what a caller observes, not how the code achieves it. If a complete internal rewrite would not break this test (while preserving behavior), it is correctly written. 
+- Write every test as if you cannot see the production code — test what a caller observes - No `isinstance()`, `type()`, or internal attribute (`_x`) checks in assertions -- One assertion concept per test (multiple `assert` statements are ok if they verify the same thing) +- One assertion concept per test (multiple `assert` ok if they verify the same thing) - No `pytest.skip` or `pytest.mark.xfail` without written justification in the docstring - Never use `noqa` — fix the underlying issue instead - Test data embedded directly in the test, not loaded from external files diff --git a/.opencode/skills/tdd/scripts/gen_test_stubs.py b/.opencode/skills/tdd/scripts/gen_test_stubs.py new file mode 100644 index 0000000..0137608 --- /dev/null +++ b/.opencode/skills/tdd/scripts/gen_test_stubs.py @@ -0,0 +1,643 @@ +"""Generate and sync pytest test stubs from Gherkin .feature files. + +Scans all feature folders under docs/features/{backlog,in-progress,completed}/ +and creates or updates test stubs in tests/features/<feature-name>/. 
+ +Modes: + uv run task gen-tests Sync all features (default) + uv run task gen-tests -- --check Dry run — report what would change + uv run task gen-tests -- --orphans List orphaned tests (no matching @id) + +Safety rules: + - backlog / in-progress: full write (create stubs, update docstrings, rename) + - completed: only toggle @pytest.mark.deprecated (no docstring changes) + - Never touches function bodies (code between # Given and end of function) +""" + +from __future__ import annotations + +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from gherkin import Parser as GherkinParser + +PROJECT_ROOT = Path(__file__).resolve().parents[4] +FEATURES_DIR = PROJECT_ROOT / "docs" / "features" +TESTS_DIR = PROJECT_ROOT / "tests" / "features" + +FEATURE_STAGES = ("backlog", "in-progress", "completed") + +ID_TAG_RE = re.compile(r"@id:([a-f0-9]{8})") + +TEST_FUNC_RE = re.compile(r"^def (test_\w+)\(.*\)") +TEST_ID_RE = re.compile(r"test_\w+_([a-f0-9]{8})\b") +DEPRECATED_MARKER_RE = re.compile(r"^@pytest\.mark\.deprecated$", re.MULTILINE) +ORPHAN_MARKER_RE = re.compile( + r'^@pytest\.mark\.skip\(reason="orphan: no matching @id in \.feature files"\)$', + re.MULTILINE, +) + + +@dataclass(frozen=True, slots=True) +class GherkinExample: + """A single Example block parsed from a .feature file.""" + + id_hex: str + title: str + given: str + when: str + then: str + deprecated: bool + source_file: str + + +@dataclass(frozen=True, slots=True) +class FeatureFile: + """A parsed .feature file with its examples.""" + + path: Path + feature_name: str + story_slug: str + examples: list[GherkinExample] + + +def slugify(name: str) -> str: + """Convert a feature folder name to a Python-safe slug. + + Args: + name: The feature folder name (kebab-case). + + Returns: + Underscore-separated lowercase string. 
+ """ + return name.replace("-", "_").lower() + + +def parse_feature_file(path: Path) -> FeatureFile | None: + """Parse a .feature file into structured data. + + Args: + path: Path to the .feature file. + + Returns: + FeatureFile if valid, None if no Feature: line found. + """ + text = path.read_text(encoding="utf-8") + doc = GherkinParser().parse(text) + feature: dict[str, Any] | None = doc.get("feature") + if not feature or not feature.get("name"): + return None + + story_slug = path.stem + examples = _extract_examples(feature, str(path)) + return FeatureFile( + path=path, + feature_name=feature["name"], + story_slug=story_slug, + examples=examples, + ) + + +def _extract_examples( + feature: dict[str, Any], source_file: str +) -> list[GherkinExample]: + """Extract all Example blocks from a parsed Gherkin feature AST. + + Args: + feature: The 'feature' dict from gherkin-official Parser output. + source_file: Path string for provenance tracking. + + Returns: + List of parsed GherkinExample objects. + """ + examples: list[GherkinExample] = [] + for child in feature.get("children", []): + scenario: dict[str, Any] | None = child.get("scenario") + if scenario is None: + continue + example = _scenario_to_example(scenario, source_file) + if example is not None: + examples.append(example) + return examples + + +def _scenario_to_example( + scenario: dict[str, Any], source_file: str +) -> GherkinExample | None: + """Convert a single parsed scenario dict to a GherkinExample. + + Skips scenarios without an @id tag. + + Args: + scenario: A scenario dict from the Gherkin AST. + source_file: Path string for provenance tracking. + + Returns: + GherkinExample if the scenario has an @id tag, None otherwise. 
+ """ + tags = scenario.get("tags", []) + id_hex = _extract_id_tag(tags) + if id_hex is None: + return None + + deprecated = any(t["name"] == "@deprecated" for t in tags) + given, when, then = _extract_steps(scenario.get("steps", [])) + return GherkinExample( + id_hex=id_hex, + title=scenario.get("name", ""), + given=given, + when=when, + then=then, + deprecated=deprecated, + source_file=source_file, + ) + + +def _extract_id_tag(tags: list[dict[str, Any]]) -> str | None: + """Find the @id:<hex> tag value from a list of AST tags. + + Args: + tags: List of tag dicts from the Gherkin AST. + + Returns: + The 8-char hex ID, or None if no @id tag is present. + """ + for tag in tags: + m = ID_TAG_RE.search(tag.get("name", "")) + if m: + return m.group(1) + return None + + +def _extract_steps(steps: list[dict[str, Any]]) -> tuple[str, str, str]: + """Extract Given/When/Then text from parsed Gherkin steps. + + Args: + steps: List of step dicts from the Gherkin AST. + + Returns: + Tuple of (given, when, then) step text strings. + """ + given = when = then = "" + for step in steps: + keyword_type = step.get("keywordType", "") + text = step.get("text", "") + if keyword_type == "Context": + given = text + elif keyword_type == "Action": + when = text + elif keyword_type == "Outcome": + then = text + return given, when, then + + +def generate_stub(feature_slug: str, example: GherkinExample) -> str: + """Generate a single test stub function. + + Args: + feature_slug: Underscored feature folder name. + example: The parsed Gherkin example. + + Returns: + Complete test function source code as a string. 
+ """ + func_name = f"test_{feature_slug}_{example.id_hex}" + markers = ["@pytest.mark.unit"] + if example.deprecated: + markers.append("@pytest.mark.deprecated") + + marker_lines = "\n".join(markers) + docstring = _build_docstring(example) + + lines = [ + marker_lines, + f"def {func_name}() -> None:", + *docstring, + " # Given", + "", + " # When", + "", + " # Then", + " raise NotImplementedError", + ] + return "\n".join(lines) + "\n" + + +def _build_docstring(example: GherkinExample) -> list[str]: + """Build properly indented docstring lines for a test stub. + + Args: + example: The parsed Gherkin example. + + Returns: + List of indented lines (each with 4-space prefix) including triple quotes. + """ + return [ + ' """', + f" Given: {example.given}", + f" When: {example.when}", + f" Then: {example.then}", + ' """', + ] + + +def generate_test_file( + feature_slug: str, story_slug: str, examples: list[GherkinExample] +) -> str: + """Generate a complete test file for one .feature file. + + Args: + feature_slug: Underscored feature folder name. + story_slug: The story file stem (becomes test file name). + examples: All examples from that .feature file. + + Returns: + Complete test module source code. + """ + header = ( + f'"""Tests for {story_slug.replace("_", " ")} story."""\n\nimport pytest\n\n\n' + ) + stubs = "\n\n".join(generate_stub(feature_slug, ex) for ex in examples) + return header + stubs + "\n" + + +def find_feature_folders() -> dict[str, list[tuple[Path, str]]]: + """Find all feature folders across all stages. + + Returns: + Dict mapping feature folder name to list of (feature_file_path, stage). 
+ """ + features: dict[str, list[tuple[Path, str]]] = {} + for stage in FEATURE_STAGES: + stage_dir = FEATURES_DIR / stage + if not stage_dir.exists(): + continue + for folder in sorted(stage_dir.iterdir()): + if not folder.is_dir(): + continue + feature_files = sorted(folder.glob("*.feature")) + if feature_files: + name = folder.name + features.setdefault(name, []) + for ff in feature_files: + features[name].append((ff, stage)) + return features + + +def read_existing_test_ids(test_file: Path) -> set[str]: + """Extract @id hex values from existing test function names. + + Args: + test_file: Path to existing test file. + + Returns: + Set of 8-char hex IDs found in test function names. + """ + if not test_file.exists(): + return set() + text = test_file.read_text(encoding="utf-8") + return set(TEST_ID_RE.findall(text)) + + +def sync_test_file( + feature_slug: str, + story_slug: str, + examples: list[GherkinExample], + test_file: Path, + stage: str, + *, + check_only: bool = False, +) -> list[str]: + """Sync a single test file with its .feature examples. + + Args: + feature_slug: Underscored feature folder name. + story_slug: The story file stem. + examples: Parsed examples from the .feature file. + test_file: Path to the test file to create/update. + stage: Feature stage (backlog, in-progress, completed). + check_only: If True, report changes without writing. + + Returns: + List of action descriptions taken/planned. 
+ """ + actions: list[str] = [] + example_ids = {ex.id_hex for ex in examples} + + if not test_file.exists(): + if stage == "completed": + return actions + content = generate_test_file(feature_slug, story_slug, examples) + actions.append(f"CREATE {test_file} ({len(examples)} stubs)") + if not check_only: + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text(content, encoding="utf-8") + return actions + + text = test_file.read_text(encoding="utf-8") + existing_ids = set(TEST_ID_RE.findall(text)) + + if stage == "completed": + actions.extend(_sync_deprecated_markers(examples, test_file, text, check_only)) + return actions + + actions.extend( + _sync_full( + feature_slug, + examples, + example_ids, + existing_ids, + test_file, + text, + check_only, + ) + ) + return actions + + +def _sync_deprecated_markers( + examples: list[GherkinExample], + test_file: Path, + text: str, + check_only: bool, +) -> list[str]: + """For completed features, only toggle @deprecated markers. + + Args: + examples: Parsed examples from the .feature file. + test_file: Path to the test file. + text: Current content of the test file. + check_only: If True, report without writing. + + Returns: + List of action descriptions. 
+ """ + actions: list[str] = [] + modified = text + for ex in examples: + func_pattern = re.compile( + rf"((?:@pytest\.mark\.\w+(?:\(.*?\))?\n)*)def test_\w+_{ex.id_hex}\b" + ) + match = func_pattern.search(modified) + if not match: + continue + decorators = match.group(1) + has_deprecated = "@pytest.mark.deprecated" in decorators + if ex.deprecated and not has_deprecated: + new_decorators = "@pytest.mark.deprecated\n" + decorators + modified = ( + modified[: match.start()] + + new_decorators + + match.group()[len(decorators) :] + + modified[match.end() :] + ) + actions.append(f"ADD @deprecated to test for {ex.id_hex}") + elif not ex.deprecated and has_deprecated: + new_decorators = decorators.replace("@pytest.mark.deprecated\n", "") + modified = ( + modified[: match.start()] + + new_decorators + + match.group()[len(decorators) :] + + modified[match.end() :] + ) + actions.append(f"REMOVE @deprecated from test for {ex.id_hex}") + if modified != text and not check_only: + test_file.write_text(modified, encoding="utf-8") + return actions + + +def _sync_full( + feature_slug: str, + examples: list[GherkinExample], + example_ids: set[str], + existing_ids: set[str], + test_file: Path, + text: str, + check_only: bool, +) -> list[str]: + """Full sync for backlog/in-progress features. + + Args: + feature_slug: Underscored feature folder name. + examples: Parsed examples. + example_ids: Set of IDs from .feature file. + existing_ids: Set of IDs found in existing test file. + test_file: Path to test file. + text: Current file content. + check_only: Dry run flag. + + Returns: + List of action descriptions. 
+ """ + actions: list[str] = [] + modified = text + + new_ids = example_ids - existing_ids + orphan_ids = existing_ids - example_ids + + for ex in examples: + if ex.id_hex in new_ids: + stub = "\n\n" + generate_stub(feature_slug, ex) + modified += stub + actions.append(f"ADD stub for @id:{ex.id_hex}") + elif ex.id_hex in existing_ids: + modified, doc_actions = _update_docstring(modified, feature_slug, ex) + actions.extend(doc_actions) + + for oid in orphan_ids: + orphan_marker = ( + '@pytest.mark.skip(reason="orphan: no matching @id in .feature files")' + ) + func_pattern = re.compile( + rf"((?:@pytest\.mark\.\w+(?:\(.*?\))?\n)*)def test_\w+_{oid}\b" + ) + match = func_pattern.search(modified) + if match and orphan_marker not in match.group(1): + decorators = match.group(1) + new_decorators = orphan_marker + "\n" + decorators + modified = ( + modified[: match.start()] + + new_decorators + + match.group()[len(decorators) :] + + modified[match.end() :] + ) + actions.append(f"MARK orphan: test with @id:{oid}") + + if modified != text and not check_only: + test_file.write_text(modified, encoding="utf-8") + return actions + + +def _update_docstring( + text: str, feature_slug: str, example: GherkinExample +) -> tuple[str, list[str]]: + """Update the docstring of an existing test to match the .feature file. + + Args: + text: Full test file content. + feature_slug: Underscored feature folder name. + example: The Gherkin example to match. + + Returns: + Tuple of (modified_text, list_of_actions). 
+ """ + actions: list[str] = [] + func_re = re.compile( + rf'(def test_\w+_{example.id_hex}\(.*?\).*?:\n\s+""")' + rf"(.*?)" + rf'(""")', + re.DOTALL, + ) + match = func_re.search(text) + if not match: + return text, actions + + new_docstring = ( + f"\n Given: {example.given}\n" + f" When: {example.when}\n" + f" Then: {example.then}\n " + ) + old_docstring = match.group(2) + if old_docstring.strip() != new_docstring.strip(): + text = text[: match.start(2)] + new_docstring + text[match.end(2) :] + actions.append(f"UPDATE docstring for @id:{example.id_hex}") + + old_func = re.search(rf"def (test_\w+_{example.id_hex})\b", text) + if old_func: + expected_name = f"test_{feature_slug}_{example.id_hex}" + if old_func.group(1) != expected_name: + text = text.replace(old_func.group(1), expected_name) + actions.append(f"RENAME {old_func.group(1)} -> {expected_name}") + return text, actions + + +def find_duplicate_ids() -> list[str]: + """Find @id hex values that appear in more than one .feature file. + + Args: + None. + + Returns: + List of warning strings describing each duplicate @id. + """ + id_sources: dict[str, list[str]] = {} + for name, files in find_feature_folders().items(): + for fpath, _stage in files: + parsed = parse_feature_file(fpath) + if not parsed: + continue + for ex in parsed.examples: + id_sources.setdefault(ex.id_hex, []).append(f"{name}/{fpath.name}") + + warnings: list[str] = [] + for id_hex, sources in sorted(id_sources.items()): + if len(sources) > 1: + locations = ", ".join(sources) + warnings.append(f"@id:{id_hex} appears in multiple features: {locations}") + return warnings + + +def find_orphaned_tests() -> list[str]: + """Find all test files with IDs that don't match any .feature file. + + Returns: + List of orphan descriptions. 
+ """ + all_feature_ids: set[str] = set() + features = find_feature_folders() + for _name, files in features.items(): + for fpath, _stage in files: + parsed = parse_feature_file(fpath) + if parsed: + all_feature_ids.update(ex.id_hex for ex in parsed.examples) + + orphans: list[str] = [] + if not TESTS_DIR.exists(): + return orphans + for test_file in TESTS_DIR.rglob("*_test.py"): + ids = read_existing_test_ids(test_file) + for tid in ids: + if tid not in all_feature_ids: + orphans.append(f"{test_file}: @id:{tid}") + return orphans + + +def _sync_all_features( + features: dict[str, list[tuple[Path, str]]], *, check_only: bool +) -> int: + """Sync test stubs for all feature folders. + + Args: + features: Mapping of feature name to list of (fpath, stage) tuples. + check_only: If True, report actions without writing files. + + Returns: + Exit code: 0 = success, 1 = changes needed in check mode. + """ + duplicates = find_duplicate_ids() + for warning in duplicates: + print(f"WARNING: {warning}") + + all_actions: list[str] = [] + for name, files in sorted(features.items()): + feature_slug = slugify(name) + for fpath, stage in files: + parsed = parse_feature_file(fpath) + if not parsed: + print(f"SKIP {fpath} — no Feature: line found") + continue + story_slug = slugify(parsed.story_slug) + test_file = TESTS_DIR / name / f"{story_slug}_test.py" + actions = sync_test_file( + feature_slug, + story_slug, + parsed.examples, + test_file, + stage, + check_only=check_only, + ) + all_actions.extend(actions) + + if all_actions: + mode = "Would" if check_only else "Did" + print(f"{mode} perform {len(all_actions)} action(s):") + for a in all_actions: + print(f" {a}") + return 1 if check_only else 0 + + print("All test stubs are in sync.") + return 0 + + +def main() -> int: + """Entry point for the gen-tests command. + + Returns: + Exit code (0 = success, 1 = changes needed in check mode). 
+ """ + check_only = "--check" in sys.argv + orphans_only = "--orphans" in sys.argv + + if orphans_only: + orphans = find_orphaned_tests() + if orphans: + print("Orphaned tests (no matching @id in .feature files):") + for o in orphans: + print(f" {o}") + return 1 + print("No orphaned tests found.") + return 0 + + features = find_feature_folders() + if not features: + print("No feature folders with .feature files found.") + return 0 + + return _sync_all_features(features, check_only=check_only) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.opencode/skills/verify/SKILL.md b/.opencode/skills/verify/SKILL.md index 3b055d2..4a9ea5d 100644 --- a/.opencode/skills/verify/SKILL.md +++ b/.opencode/skills/verify/SKILL.md @@ -1,7 +1,7 @@ --- name: verify -description: Step 5 — run all verification commands and review code quality, produce a written report -version: "1.0" +description: Step 5 — run all verification commands, review code quality, and produce a written report +version: "2.2" author: reviewer audience: reviewer workflow: feature-lifecycle @@ -13,18 +13,31 @@ This skill guides the reviewer through Step 5: independent verification that the **Your default hypothesis is that the code is broken despite passing automated checks. Your job is to find the failure mode. If you cannot find one after thorough investigation, APPROVE. If you find one, REJECTED.** -## When to Use +**Every PASS/FAIL cell must have evidence.** Empty evidence = UNCHECKED = REJECTED. -After the developer signals Step 4 is complete. Do not start verification until the developer has committed all work. +## Scope Guard — Step 4 vs. Step 5 + +If you are invoked for a **per-test code-design check during Step 4** (not a full Step 5 review): +- The developer will provide a completed **Design Self-Declaration** checklist with `file:line` evidence for each rule. 
+- **Independently verify each claim** against the actual code using sections 4a–4e (YAGNI, KISS, DRY, SOLID, Object Calisthenics, Design Patterns) and the semantic alignment check. +- Do **NOT** run any commands (no lint, no static-check, no test suite). +- Respond using the verification table template in `implementation/SKILL.md` — compare developer claims vs. your independent findings for each rule. + +This full skill applies only when the developer signals Step 4 is complete and hands off for Step 5. + +## When to Use (Step 5) + +After the developer signals Step 4 is complete and all self-verification checks pass. Do not start verification until the developer has committed all work. ## Step-by-Step -### 1. Read the Feature Doc +### 1. Read the Feature Docs -Read `docs/features/in-progress/<feature-name>.md`. Extract: -- All UUIDs and their descriptions -- The interaction model from Notes (if the feature involves user interaction) -- The developer's pre-mortem (if present in the Architecture section) +Read the feature folder `docs/features/in-progress/<name>/`. Extract: +- All `@id` tags and their Example titles from `.feature` files +- The interaction model (if the feature involves user interaction) +- The developer's pre-mortem (if present in the Architecture section of `discovery.md`) +- The Rules and Constraints sections from `discovery.md` ### 2. Check Commit History @@ -35,24 +48,26 @@ git status Verify: - There is a commit per green test (not one giant commit at the end) -- Every step has a commit (`bootstrap`, `failing tests`, per-feature-name commits) +- Every step has a commit (architecture, failing tests, per-feature-name commits) - No uncommitted changes: `git status` should be clean ### 3. Production-Grade Gate -Run before code review. If any row is FAIL → REJECTED immediately. +Run before code review. If any row is FAIL, stop immediately with REJECTED. 
| Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| | Developer declared production-grade | Read feature doc pre-mortem or handoff message | Explicit statement present | Absent or says "demo" or "incomplete" | Developer must complete the implementation | | App exits cleanly | `timeout 10s uv run task run` | Any exit code except 124 | Exit 124 (timeout/hang) | Developer must fix the hang | -| Output changes when input changes | Run app, change an input or condition, observe output | Output changes accordingly | Output is static regardless of input | Developer must implement real logic — output that does not change with input is not complete | +| Output changes when input changes | Run app, change an input or condition, observe output | Output changes accordingly | Output is static regardless of input | Developer must implement real logic | ### 4. Code Review Read the source files changed in this feature. **Do this before running lint/static-check/test** — if code review finds a design problem, commands will need to re-run after the fix anyway. -**Correctness** — any FAIL → REJECTED: +**Stop on first failure category — do not accumulate issues.** When a category FAILs, stop code review, write the report, and send REJECTED. In the report, mark all skipped sections as `NOT CHECKED (stopped at <category>)` — this is valid evidence of a deliberate stop, not an unchecked cell. + +#### 4a. Correctness — any FAIL → REJECTED | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| | No duplicate logic (DRY) | Search for repeated blocks doing the same thing | None found | Duplication found | Extract to shared function | | No over-engineering (YAGNI) | Check for abstractions with no current use | None found | Unused abstraction or premature generalization | Remove unused code | -**Simplicity (KISS)** — any FAIL → REJECTED: +#### 4b.
Simplicity (KISS) — any FAIL → REJECTED | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| @@ -69,7 +84,7 @@ Read the source files changed in this feature. **Do this before running lint/sta | Functions ≤ 20 lines | Count lines | ≤ 20 | > 20 | Extract helper | | Classes ≤ 50 lines | Count lines | ≤ 50 | > 50 | Split class | -**SOLID** — any FAIL → REJECTED: +#### 4c. SOLID — any FAIL → REJECTED | Principle | Why it matters | What to check | How to check | PASS/FAIL | Evidence (`file:line`) | |---|---|---|---|---|---| @@ -79,71 +94,72 @@ Read the source files changed in this feature. **Do this before running lint/sta | ISP | Fat interfaces force implementors to have methods they cannot meaningfully implement | No Protocol/ABC forces unused method implementations | Check if any implementor raises `NotImplementedError` or passes on inherited methods | | | | DIP | Depending on concrete I/O makes unit testing impossible | High-level modules depend on abstractions (Protocols) | Check if any domain class imports from I/O, DB, or framework layers directly | | | -**Object Calisthenics** — any FAIL → REJECTED: +#### 4d. 
Object Calisthenics — any FAIL → REJECTED | # | Rule | Why it matters | How to check | PASS/FAIL | Evidence (`file:line`) | |---|---|---|---|---|---| | 1 | One indent level per method | Reduces cognitive load per function | Count max nesting in source | | | | 2 | No `else` after `return` | Eliminates hidden control flow paths | Search for `else` inside functions with early returns | | | | 3 | Primitives wrapped | Prevents primitive obsession; enables validation at construction | Bare `int`/`str` in domain signatures = FAIL | | | -| 4 | Collections wrapped | Encapsulates iteration and filtering logic | `list[X]` as domain value = FAIL | | | -| 5 | One dot per line | Reduces coupling to transitive dependencies | `a.b.c()` = FAIL | | | +| 4 | Collections wrapped in classes | Encapsulates iteration and filtering logic | `list[X]` as domain value = FAIL | | | +| 5 | One dot per line | Reduces coupling to transitive dependencies | `a.b.c()` chains = FAIL | | | | 6 | No abbreviations | Names are documentation; abbreviations lose meaning | `mgr`, `tmp`, `calc` = FAIL | | | | 7 | Small entities | Smaller units are easier to test, read, and replace | Functions > 20 lines or classes > 50 lines = FAIL | | | | 8 | ≤ 2 instance variables | Forces single responsibility through structural constraint | Count `self.x` assignments in `__init__` | | | | 9 | No getters/setters | Enforces tell-don't-ask; behavior lives with data | `get_x()`/`set_x()` pairs = FAIL | | | -**Design Patterns** — any FAIL → REJECTED: +#### 4e. 
Design Patterns — any FAIL → REJECTED | Code smell | Pattern missed | Why it matters | How to check | PASS/FAIL | Evidence (`file:line`) | |---|---|---|---|---|---| -| Multiple if/elif on type/state | State or Strategy | Eliminates conditional complexity, makes adding new states safe | Search for chains of `isinstance` or string-based dispatch | | | -| Complex `__init__` with side effects | Factory or Builder | Separates construction from use, enables testing | Check `__init__` line count and side effects | | | +| Multiple if/elif on type/state | State or Strategy | Eliminates conditional complexity | Search for chains of `isinstance` or string-based dispatch | | | +| Complex `__init__` with side effects | Factory or Builder | Separates construction from use | Check `__init__` line count and side effects | | | | Callers must know multiple internal components | Facade | Single entry point reduces coupling | Check how callers interact with the subsystem | | | | External dep without Protocol | Repository/Adapter | Enables testing without real I/O; enforces DIP | Check if the dep is injected via abstraction | | | | 0 domain classes, many functions | Missing domain model | Procedural code has no encapsulation boundary | Count classes vs functions in domain code | | | -**Tests** — any FAIL → REJECTED: +#### 4f. Tests — any FAIL → REJECTED | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| -| UUID docstring format | Read first line of each docstring | UUID only, blank line, Given/When/Then | Description on UUID line | Remove description; UUID line must be bare | +| Docstring format | Read each test docstring | Given/When/Then lines only (no UUID) | Extra metadata or missing G/W/T | Fix docstring to match canonical format | | Contract test | Would this test survive a full internal rewrite? 
| Yes | No | Rewrite assertion to test observable output, not internals | | No internal attribute access | Search for `_x` in assertions | None found | `_x`, `isinstance`, `type()` found | Replace with public API assertion | -| Every AC has a mapped test | `grep -r "<uuid>" tests/` per UUID | Found | Not found | Write the missing test | -| No UUID used twice | See command below — empty = PASS | Empty output | UUID printed | If only `Given` differs: consolidate into Hypothesis `@given` + `@example`. If `When`/`Then` differs: use `extend-criteria` | +| Every `@id` has a mapped test | Match `@id` tags in `.feature` files to test functions | All mapped | Missing test | Write the missing test | +| No `@id` used by two functions | Check for duplicate `@id` hex in test function names | None | Duplicate found | Consolidate into Hypothesis `@given` + `@example` or escalate to PO | +| Function naming | Test names match `test_<feature_slug>_<8char_hex>` | All match | Mismatch | Rename function | -```bash -# UUID Drift check — any output = FAIL -grep -rh --include='*.py' '[0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}' tests/ \ - | grep -oE '[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}' \ - | sort | uniq -d -``` - -**Versions and Build** — any FAIL → REJECTED: +#### 4g. 
Code Quality — any FAIL → REJECTED | Check | How to check | PASS | FAIL | Fix | |---|---|---|---|---| -| `pyproject.toml` version matches `__version__` | Read both files | Match | Mismatch | Align the version strings | -| Coverage target matches package | Check `--cov=<package>` in test config | Matches actual package | Wrong package name | Fix the `--cov` argument | -| All declared packages exist | Check `[tool.setuptools] packages` against filesystem | All present | Missing package | Add the missing directory or remove the declaration | -| No `noqa` comments | `grep -r "noqa" src/` | None found | Any found | Fix the underlying issue | -| No `type: ignore` comments | `grep -r "type: ignore" src/` | None found | Any found | Fix the underlying type error | +| No `noqa` comments | `grep -r "noqa" <package>/` | None found | Any found | Fix the underlying issue | +| No `type: ignore` comments | `grep -r "type: ignore" <package>/` | None found | Any found | Fix the underlying type error | +| All public functions have type hints | Read signatures | All annotated | Missing hints | Add type annotations | +| All public functions have docstrings | Read source | Google-style present | Missing docstring | Add docstring | +| Coverage target matches package | Check `--cov=<package>` in test config matches `[tool.setuptools] packages` in `pyproject.toml` | Matches | Wrong package name | Fix the `--cov` argument | +| All declared packages exist on disk | Check `[tool.setuptools] packages` in `pyproject.toml` against filesystem | All directories present | Missing directory | Add directory or remove declaration | +| Imports use correct package name | Search production code and tests for import statements; confirm they match `[tool.setuptools] packages`, not a template placeholder | All match | Any import from wrong package | Fix imports and move misplaced source files | ### 5. 
Run Verification Commands (in order, stop on first failure) ```bash +uv run task gen-tests -- --orphans # exit 1 (orphans listed) = FAIL uv run task lint uv run task static-check uv run task test ``` -Expected for each: exit 0, no errors. Record exact output on failure. +Expected for each: exit 0 and no reported errors. Record exact output on failure. + +If a command fails, stop and REJECT immediately. Do not run subsequent commands. ### 6. Interactive Verification If the feature involves user interaction: run the app, provide real input, verify the output changes in response. An app that produces the same output regardless of input is NOT verified. +Record what input was given and what output was observed. + ### 7. Write the Report ```markdown @@ -159,18 +175,24 @@ If the feature involves user interaction: run the app, provide real input, verif ### Commands | Command | Result | Notes | |---------|--------|-------| +| uv run task gen-tests -- --orphans | PASS / FAIL | <orphans listed if fail> | | uv run task lint | PASS / FAIL | <details if fail> | | uv run task static-check | PASS / FAIL | <errors if fail> | | uv run task test | PASS / FAIL | <failures or coverage% if fail> | +| Interactive run (if user interaction involved) | PASS / SKIP (no UI) / FAIL | <what was tested> | -### UUID Traceability -| UUID | Description | Test | Status | -|------|-------------|------|--------| -| `<uuid>` | <description> | `tests/<file>:<function>` | COVERED / NOT COVERED | +### @id Traceability +| @id | Example Title | Test | Status | +|-----|---------------|------|--------| +| `@id:a3f2b1c4` | <title> | `tests/features/<name>/<story>_test.py::test_<slug>_a3f2b1c4` | COVERED / NOT COVERED | ### Code Review Findings - PASS: <aspect> - FAIL: `<file>:<line>` — <specific issue> +- NOT CHECKED (stopped at <category>): <sections skipped> + +### Gap Report (if any) +- `<suggested Example text>` — reported to PO for decision ### Decision **APPROVED** — work meets all standards.
Developer may proceed to Step 6. @@ -190,11 +212,13 @@ OR | Class length | ≤ 50 lines | | Max nesting | 2 levels | | Instance variables | ≤ 2 per class | -| Uncovered UUIDs | 0 | +| Uncovered `@id` tags | 0 | | `noqa` comments | 0 | | `type: ignore` | 0 | | Semantic alignment mismatches | 0 | | SOLID FAIL rows | 0 | | ObjCal FAIL rows | 0 | | Design pattern FAIL rows | 0 | -| Duplicate UUIDs | 0 | +| Duplicate `@id` in tests | 0 | +| Empty evidence cells | 0 | +| Orphaned tests | 0 | diff --git a/AGENTS.md b/AGENTS.md index 74eb2e1..de6a04c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -5,27 +5,34 @@ A Python template to quickstart any project with a production-ready workflow, qu ## Workflow Overview Features flow through 6 steps with a WIP limit of 1 feature at a time. The filesystem enforces WIP: -- `docs/features/backlog/` — features waiting to be worked on -- `docs/features/in-progress/` — exactly one feature being built right now -- `docs/features/completed/` — accepted and shipped features +- `docs/features/backlog/<feature-name>/` — features waiting to be worked on +- `docs/features/in-progress/<feature-name>/` — exactly one feature being built right now +- `docs/features/completed/<feature-name>/` — accepted and shipped features ``` -STEP 1: SCOPE (product-owner) → define user stories + acceptance criteria -STEP 2: BOOTSTRAP+ARCH (developer) → set up build, design module structure -STEP 3: TEST FIRST (developer) → write failing tests mapped to UUIDs +STEP 1: SCOPE (product-owner) → discovery + Gherkin stories + criteria +STEP 2: ARCH (developer) → design module structure, get PO approval +STEP 3: TEST FIRST (developer) → sync stubs, write failing tests STEP 4: IMPLEMENT (developer) → Red-Green-Refactor, commit per green test STEP 5: VERIFY (reviewer) → run all commands, review code -STEP 6: ACCEPT (product-owner) → demo, validate, merge, tag +STEP 6: ACCEPT (product-owner) → demo, validate, move folder to completed/ ``` **PO picks the next feature from backlog. 
Developer never self-selects.** **Verification is adversarial.** The reviewer's job is to try to break the feature, not to confirm it works. The default hypothesis is "it might be broken despite green checks; prove otherwise." +## Roles + +- **Product Owner (PO)** — AI agent. Interviews the stakeholder, writes discovery docs, Gherkin features, and acceptance criteria. Accepts or rejects deliveries. +- **Stakeholder** — Human. Answers PO's questions, provides domain knowledge, says "baseline" when discovery is complete. +- **Developer** — AI agent. Architecture, test bodies, implementation, git. Never edits `.feature` files. Escalates spec gaps to PO. +- **Reviewer** — AI agent. Adversarial verification. Reports spec gaps to PO. + ## Agents -- **product-owner** — defines scope, acceptance criteria, picks features, accepts deliveries -- **developer** — architecture, tests, code, git, releases (Steps 2–4 + release) +- **product-owner** — defines scope (4 phases), picks features, accepts deliveries +- **developer** — architecture, tests, code, git, releases (Steps 2-4 + release) - **reviewer** — runs commands and reviews code at Step 5, produces APPROVED/REJECTED report - **setup-project** — one-time setup to initialize a new project from this template @@ -38,12 +45,126 @@ STEP 6: ACCEPT (product-owner) → demo, validate, merge, tag | `tdd` | developer | 3 | | `implementation` | developer | 4 | | `verify` | reviewer | 5 | -| `code-quality` | developer | pre-handoff | +| `code-quality` | developer | pre-handoff (redirects to `verify`) | | `pr-management` | developer | 6 | | `git-release` | developer | 6 | -| `extend-criteria` | any agent | when a gap is found | | `create-skill` | developer | meta | +**Session protocol**: Every agent loads `skill session-workflow` at session start. Load additional skills as needed for the current step. + +## Step 1 — SCOPE (4 Phases) + +### Phase 1 — Project Discovery (once per project) +PO creates `docs/features/discovery.md`. 
Asks stakeholder 7 standard questions (Who/What/Why/When/Success/Failure/Out-of-scope). Silent pre-mortem generates follow-up questions. All questions presented at once. Autonomous baseline when all questions are answered. PO identifies feature list and creates `backlog/<name>/discovery.md` per feature. + +### Phase 2 — Feature Discovery (per feature) +PO derives targeted questions from feature entities: extract nouns/verbs from project discovery, populate the Entities table, then generate questions from gaps, ambiguities, and boundary conditions. Silent pre-mortem before the first interview round. Present all questions to the stakeholder at once; iterate with follow-up rounds (pre-mortem after each) until stakeholder says "baseline" to freeze discovery. + +### Phase 3 — Stories (PO alone) +One `.feature` file per user story. `Feature:` block with user story header only — no `Example:` blocks yet. Commit: `feat(stories): write user stories for <name>` + +### Phase 4 — Criteria (PO alone) +Silent pre-mortem per story. Write `Example:` blocks with `@id:<8-char-hex>` tags. Each Example must be observably distinct; if a single `.feature` file spans multiple concerns, split into separate `.feature` files (a feature folder can contain multiple `.feature` files). Commit: `feat(criteria): write acceptance criteria for <name>` + +### Feature Decomposition Threshold +Before moving to Phase 3, check: does this feature span **>2 distinct concerns** OR have **>8 candidate Examples**? If yes, split into separate features in `backlog/` before writing stories. Each feature should address a single cohesive concern. + +**Baseline is frozen**: no `.feature` changes after criteria are written. Change = `@deprecated` tag + new Example. 
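+Each `@id` tag carries an 8-char hex value produced by `uv run task gen-id`. The task's actual implementation is not part of this diff; a minimal sketch of an equivalent generator (hypothetical, for illustration only):

```python
# gen_id.py — hypothetical sketch; the real gen-id task may differ.
import secrets


def generate_id() -> str:
    """Return an 8-character lowercase hex id such as 'a3f2b1c4'."""
    return secrets.token_hex(4)  # 4 random bytes -> 8 hex characters


if __name__ == "__main__":
    print(generate_id())
```

Four random bytes give roughly 4.3 billion possible ids, so collisions within one project are unlikely; the duplicate-`@id` warning in `gen-tests` catches any that do occur.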
+ +## Filesystem Structure + +``` +docs/features/ + discovery.md ← project-level (Status + Questions only) + backlog/<feature-name>/ + discovery.md ← Status + Entities + Rules + Constraints + Questions + <story-slug>.feature ← one per user story (Gherkin) + in-progress/<feature-name>/ ← whole folder moves here at Step 2 + completed/<feature-name>/ ← whole folder moves here at Step 6 + +tests/ + features/<feature-name>/ + <story-slug>_test.py ← one per .feature, stubs from gen-tests + unit/ + <anything>_test.py ← developer-authored extras +``` + +## Gherkin Format + +```gherkin +Feature: Bounce physics + As a game engine + I want balls to bounce off walls + So that gameplay feels physical + + @id:a3f2b1c4 + Example: Ball bounces off top wall + Given a ball moving upward reaches y=0 + When the physics engine processes the next frame + Then the ball velocity y-component becomes positive + + @deprecated @id:b5c6d7e8 + Example: Old behavior no longer needed + Given ... + When ... + Then ... +``` + +- `@id:<8-char-hex>` — generated with `uv run task gen-id` +- `@deprecated` — marks superseded criteria; `gen-tests` adds `@pytest.mark.deprecated` to the mapped test +- `Example:` keyword (not `Scenario:`) +- Each Example must be observably distinct from every other + +## Test Conventions + +### Test Stub Generation + +```bash +uv run task gen-tests # sync all features +uv run task gen-tests -- --check # dry run +uv run task gen-tests -- --orphans # list orphaned tests +``` + +- backlog / in-progress: full write (create stubs, update docstrings, rename functions) +- completed: only toggle `@pytest.mark.deprecated` (no docstring changes) +- Orphaned tests (no matching `@id`) get `@pytest.mark.skip(reason="orphan: ...")` + +### Test File Layout + +``` +tests/features/<feature-name>/<story-slug>_test.py +``` + +### Function Naming + +```python +def test_<feature_slug>_<8char_hex>() -> None: +``` + +### Docstring Format (mandatory) + +```python +@pytest.mark.unit +def 
test_bounce_physics_a3f2b1c4() -> None: + """ + Given: A ball moving upward reaches y=0 + When: The physics engine processes the next frame + Then: The ball velocity y-component becomes positive + """ + # Given + # When + # Then + raise NotImplementedError +``` + +### Markers (4 total) +- `@pytest.mark.unit` — isolated, one function/class, no external state +- `@pytest.mark.integration` — multiple components, external state +- `@pytest.mark.slow` — takes > 50ms; additionally applied alongside `unit` or `integration` +- `@pytest.mark.deprecated` — auto-skipped by conftest hook; added by `gen-tests` + +Every test gets exactly one of `unit` or `integration`. Slow tests additionally get `slow`. + ## Development Commands ```bash @@ -53,8 +174,7 @@ uv sync --all-extras # Run the application (for humans) uv run task run -# Run the application with timeout (for agents — prevents hanging on infinite loops) -# Exit code 124 means the process was killed; treat as FAIL +# Run the application with timeout (for agents — prevents hanging) timeout 10s uv run task run # Run tests (fast, no coverage) @@ -72,108 +192,55 @@ uv run task lint # Type checking uv run task static-check -# Serve documentation -uv run task doc-serve -``` +# Generate an 8-char hex ID +uv run task gen-id -## Test Conventions - -### Markers (3 only) -- `@pytest.mark.unit` — isolated, one function/class, no external state -- `@pytest.mark.integration` — multiple components, external state (DB, network, filesystem) -- `@pytest.mark.slow` — takes > 50ms; additionally applied to DB, Hypothesis, and terminal I/O tests +# Sync test stubs from .feature files +uv run task gen-tests -Every test gets exactly one of `unit` or `integration`. Slow tests additionally get `slow`. 
- -### File and Function Naming -``` -<descriptive-group-name>_test.py # file name -test_<short_title> # function name -``` - -### Docstring Format (mandatory) -```python -def test_email_requires_at_symbol(): - """a1b2c3d4-e5f6-7890-abcd-ef1234567890 - - Given: An email address without an @ symbol - When: EmailAddress is constructed - Then: A ValueError is raised with a descriptive message - """ - # Given - invalid = "not-an-email" - # When - # Then - with pytest.raises(ValueError): - EmailAddress(invalid) +# Serve documentation +uv run task doc-serve ``` -Rules: -- First line: `<uuid>` only — no description -- Mandatory blank line between UUID and Given -- `# Given`, `# When`, `# Then` comments in the test body -- Assert behavior, not structure — no `isinstance()`, `type()`, or internal attributes -- Never use `noqa` or `type: ignore` -- Never use `pytest.skip` or `pytest.mark.xfail` without written justification in the docstring - ## Code Quality Standards - **Principles (in priority order)**: YAGNI > KISS > DRY > SOLID > Object Calisthenics - **Linting**: ruff, Google docstring convention, `noqa` forbidden - **Type checking**: pyright, 0 errors required -- **Coverage**: 100% (measured against your actual package, not `app` unless that is your package) +- **Coverage**: 100% (measured against your actual package) - **Function length**: ≤ 20 lines - **Class length**: ≤ 50 lines - **Max nesting**: 2 levels - **Instance variables**: ≤ 2 per class -- **Semantic alignment**: tests must operate at the same abstraction level as the acceptance criteria they cover. If the AC says "when the user presses W," the test must send W through the actual input mechanism, not call an internal helper. -- **Integration tests**: multi-component features and features involving user interaction require at least one `@pytest.mark.integration` test that exercises the public entry point. 
- -## Verification Philosophy - -- **Automated checks** (lint, typecheck, coverage) verify **syntax-level** correctness — the code is well-formed. -- **Human review** (semantic alignment, code review, manual testing) verifies **semantic-level** correctness — the code does what the user needs. -- Both are required. All-green automated checks are necessary but not sufficient for APPROVED. - -## Feature Document Format - -One file per feature, lives in `docs/features/`. PO writes the top sections; developer adds `## Architecture`. +- **Semantic alignment**: tests must operate at the same abstraction level as the acceptance criteria they cover +- **Integration tests**: multi-component features require at least one `@pytest.mark.integration` test exercising the public entry point -**Naming:** `<verb>-<object>.md` — imperative verb first, kebab-case, 2–4 words. -Examples: `display-version.md`, `authenticate-user.md`, `export-metrics-csv.md` -Title matches: `# Feature: <Verb> <Object>` in Title Case. - -```markdown -# Feature: <Verb> <Object> +### Developer Quality Gate Priority Order -## User Stories -- As a <role>, I want <goal> so that <benefit> +During Step 4 (Implementation), correctness priorities are: -## Acceptance Criteria -- `<uuid>`: <Short description ending with a period>. - Source: <stakeholder | po | developer | reviewer | bug> +1. **Design correctness** — YAGNI > KISS > DRY > SOLID > Object Calisthenics > appropriate design patterns +2. **One test green** — the specific test under work passes, plus `test-fast` still passes +3. **Reviewer code-design check** — reviewer verifies design + semantic alignment (no lint/pyright/coverage) +4. **Commit** — only after reviewer APPROVED +5. **Quality tooling** — `lint`, `static-check`, full `test` with coverage run only at developer handoff (before Step 5) - Given: <precondition> - When: <action> - Then: <single observable outcome> +Design correctness is far more important than lint/pyright/coverage compliance. 
A well-designed codebase with minor lint issues is better than a lint-clean codebase with poor design. -## Notes -<constraints, risks, out-of-scope items> +## Verification Philosophy -## Architecture ← Developer adds this in Step 2 -### Module Structure -### Key Decisions (ADRs) -### Build Changes (needs PO approval: yes/no) -``` +- **Automated checks** (lint, typecheck, coverage) verify **syntax-level** correctness — the code is well-formed. +- **Human review** (semantic alignment, code review, manual testing) verifies **semantic-level** correctness — the code does what the user needs. +- Both are required. All-green automated checks are necessary but not sufficient for APPROVED. +- Reviewer defaults to REJECTED unless correctness is proven. -**Source field values:** -- `stakeholder` — an external stakeholder gave this requirement to the PO -- `po` — the PO originated this criterion independently -- `developer` — a gap found during Step 4 implementation -- `reviewer` — a gap found during Step 5 verification -- `bug` — a post-merge regression; the feature doc was reopened +## Deprecation Process -**Gaps and Defects:** When any agent finds a missing behavior, load `skill extend-criteria`. It provides the decision rule (gap within scope vs. new feature), UUID assignment, and commit protocol. For post-merge defects, the feature doc moves from `completed/` back to `in-progress/`. +1. PO adds `@deprecated` tag to Example in `.feature` file +2. Run `uv run task gen-tests` — script adds `@pytest.mark.deprecated` to mapped test +3. Deprecated tests auto-skip via conftest hook +4. Feature is done when all non-deprecated tests pass +5. 
No special folder — features move to `completed/` normally ## Release Management @@ -181,7 +248,7 @@ Version format: `v{major}.{minor}.{YYYYMMDD}` - Minor bump for new features; major bump for breaking changes - Same-day second release: increment minor, keep same date -- Each release gets a unique adjective-animal name generated from the PR/commit content +- Each release gets a unique adjective-animal name Use `@developer /skill git-release` for the full release process. @@ -195,13 +262,13 @@ Every session: load `skill session-workflow`. Read `TODO.md` first, update it at Feature: <name> Step: <1-6> (<step name>) -Source: docs/features/in-progress/<name>.md +Source: docs/features/in-progress/<name>/discovery.md ## Progress -- [x] `<uuid>`: <description> ← done -- [~] `<uuid>`: <description> ← in progress -- [ ] `<uuid>`: <description> ← next -- [-] `<uuid>`: <description> ← cancelled +- [x] `<@id:hex>`: <description> ← done +- [~] `<@id:hex>`: <description> ← in progress +- [ ] `<@id:hex>`: <description> ← next +- [-] `<@id:hex>`: <description> ← cancelled ## Next <One actionable sentence> diff --git a/Dockerfile b/Dockerfile index 26a4260..e6645ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,10 +39,10 @@ EXPOSE 8000 8080 5678 # Health check HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD python -m python_package_template.python_module_template || exit 1 + CMD python -m app || exit 1 # Default command -CMD ["python", "-m", "python_package_template.python_module_template"] +CMD ["python", "-m", "app"] # Labels LABEL maintainer="eol" diff --git a/README.md b/README.md index 317963d..955fd5f 100644 --- a/README.md +++ b/README.md @@ -48,12 +48,12 @@ docs/features/completed/ ← accepted and shipped features | Step | Role | What happens | |------|------|-------------| -| 1. SCOPE | Product Owner | User stories + UUID acceptance criteria | -| 2. BOOTSTRAP + ARCH | Developer | Build system, module structure, ADRs | -| 3. 
TEST FIRST | Developer | Failing tests mapped 1:1 to UUID criteria | +| 1. SCOPE | Product Owner | Discovery + Gherkin stories + `@id` criteria | +| 2. ARCH | Developer | Design module structure, get PO approval | +| 3. TEST FIRST | Developer | Sync stubs, write failing tests mapped to `@id` | | 4. IMPLEMENT | Developer | Red→Green→Refactor, commit per green test | -| 5. VERIFY | Reviewer | Run all commands, code review, UUID traceability | -| 6. ACCEPT | Product Owner | Demo, validate, merge, tag | +| 5. VERIFY | Reviewer | Run all commands, code review, `@id` traceability | +| 6. ACCEPT | Product Owner | Demo, validate, move folder to completed/ | ### AI Agents @@ -71,7 +71,7 @@ docs/features/completed/ ← accepted and shipped features /skill scope # Write user stories + acceptance criteria /skill tdd # TDD: file naming, docstring format, markers /skill implementation # Red-Green-Refactor, architecture, ADRs -/skill code-quality # ruff, pyright, coverage, complexity limits +/skill code-quality # redirects to verify (quick reference) /skill verify # Step 5 verification checklist /skill pr-management # Branch naming, PR template, squash merge /skill git-release # Hybrid calver versioning, themed naming @@ -88,6 +88,8 @@ uv run task test-fast # Tests without coverage (faster iteration) uv run task test-slow # Only slow tests uv run task lint # ruff check + format uv run task static-check # pyright type checking +uv run task gen-id # Generate an 8-char hex ID for @id tags +uv run task gen-tests # Sync test stubs from .feature files uv run task doc-build # Generate API docs + coverage + test reports uv run task doc-publish # Publish unified docs site to GitHub Pages uv run task doc-serve # Live API doc server at localhost:8080 @@ -108,12 +110,12 @@ uv run task doc-serve # Live API doc server at localhost:8080 ## Test Conventions ```python -def test_<short_title>(): - """a1b2c3d4-e5f6-7890-abcd-ef1234567890 - - Given: precondition - When: action - Then: single 
observable outcome +@pytest.mark.unit +def test_bounce_physics_a3f2b1c4() -> None: + """ + Given: A ball moving upward reaches y=0 + When: The physics engine processes the next frame + Then: The ball velocity y-component becomes positive """ # Given ... @@ -123,7 +125,7 @@ def test_<short_title>(): ... ``` -**Markers**: `@pytest.mark.unit` · `@pytest.mark.integration` · `@pytest.mark.slow` +**Markers**: `@pytest.mark.unit` · `@pytest.mark.integration` · `@pytest.mark.slow` · `@pytest.mark.deprecated` ## Technology Stack diff --git a/app/__main__.py b/app/__main__.py new file mode 100644 index 0000000..a200610 --- /dev/null +++ b/app/__main__.py @@ -0,0 +1,24 @@ +"""Entry point for running the application as a module.""" + +import logging + +import fire + +logger = logging.getLogger(__name__) + + +def main(verbosity: str = "INFO") -> None: + """Run the application. + + Args: + verbosity: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL). + """ + logging.basicConfig( + level=getattr(logging, verbosity.upper(), logging.INFO), + format="%(levelname)s - %(name)s: %(message)s", + ) + logger.info("Ready.") + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/app/version.py b/app/version.py deleted file mode 100644 index bf008ec..0000000 --- a/app/version.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Module Docstring.""" - -import logging -import tomllib -from pathlib import Path - -logger = logging.getLogger("app") - - -def version() -> str: - """Log version at INFO level. - - Returns: - Version string from pyproject.toml. - - Examples: - >>> result = version() # doctest: +ELLIPSIS - >>> isinstance(result, str) - True - >>> len(result) > 0 - True - >>> '.' 
in result # Version should contain dots - True - - """ - pyproject_path = Path(__file__).parent.parent / "pyproject.toml" - - with Path(pyproject_path).open("rb") as f: - data = tomllib.load(f) - - version_str = data["project"]["version"] - logger.info("Version: %s", version_str) - return version_str - - -if __name__ == "__main__": - version() diff --git a/docker-compose.yml b/docker-compose.yml index 806af12..f8708f8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,7 +12,7 @@ services: container_name: python-template-app volumes: # Hot reload: mount source code - - ./python_package_template:/app/python_package_template + - ./app:/app/app - ./tests:/app/tests - ./pyproject.toml:/app/pyproject.toml:ro ports: @@ -23,7 +23,7 @@ services: - PYTHONPATH=/app - PYTHONUNBUFFERED=1 - DEVELOPMENT=true - command: python -m python_package_template.python_module_template + command: python -m app restart: unless-stopped # ============================================================================= @@ -52,7 +52,7 @@ services: dockerfile: Dockerfile container_name: python-template-docs volumes: - - ./python_package_template:/app/python_package_template:ro + - ./app:/app/app:ro - ./pyproject.toml:/app/pyproject.toml:ro ports: - "8080:8080" diff --git a/docs/academic_research.md b/docs/academic_research.md index 1de562d..56dd27d 100644 --- a/docs/academic_research.md +++ b/docs/academic_research.md @@ -24,7 +24,7 @@ This document explains the cognitive and social-science mechanisms that justify | **Source** | Gollwitzer, P. M. (1999). Implementation intentions: Strong effects of simple planning aids. *American Journal of Preventive Medicine*, 16(4), 257–276. | | **Core finding** | "If X then Y" plans are 2–3x more likely to execute than general intentions. | | **Mechanism** | If-then plans create automatic cue-response links in memory. The brain processes "if function > 20 lines then extract helper" as an action trigger, not a suggestion to consider. 
| -| **Where used** | Refactor Self-Check Gates in `implementation/SKILL.md`, Structural Quality Checks in `code-quality/SKILL.md`. | +| **Where used** | Refactor Self-Check Gates in `implementation/SKILL.md`, Code Quality checks in `verify/SKILL.md`. | --- @@ -173,6 +173,63 @@ This document explains the cognitive and social-science mechanisms that justify --- +### 16. Cost of Change Curve (Shift Left) + +| | | +|---|---| +| **Source** | Boehm, B. W. (1981). *Software Engineering Economics*. Prentice-Hall. | +| **Alternative** | Boehm, B., & Papaccio, P. N. (1988). Understanding and controlling software costs. *IEEE Transactions on Software Engineering*, 14(10), 1462–1477. | +| **Core finding** | The cost to fix a defect multiplies by roughly 10x per SDLC phase: requirements (1x) → design (5x) → coding (10x) → testing (20x) → production (200x). A defect caught during requirements costs 200x less than the same defect found after release. | +| **Mechanism** | Defects compound downstream: a wrong requirement becomes a wrong design, which becomes wrong code, which becomes wrong tests, all of which must be unwound. Catching errors at the source eliminates the entire cascade. This is the empirical foundation for "shift left" — investing earlier in quality always dominates fixing later. | +| **Where used** | Justifies the multi-session PO elicitation model: every acceptance criterion clarified at scope prevents 10–200x rework downstream. Also justifies the adversarial pre-mortem at the end of each elicitation cycle, and the adversarial mandate in `verify/SKILL.md`. The entire 6-step pipeline is ordered to surface defects at the earliest (cheapest) phase. | + +--- + +### 17. INVEST Criteria for User Stories + +| | | +|---|---| +| **Source** | Wake, B. (2003). *INVEST in Good Stories, and SMART Tasks*. XP123.com. | +| **Alternative** | Cohn, M. (2004). *User Stories Applied: For Agile Software Development*. Addison-Wesley. 
| +| **Core finding** | Stories that are Independent, Negotiable, Valuable, Estimable, Small, and Testable produce fewer downstream defects and smoother development cycles. Stories that fail INVEST — especially "Testable" and "Small" — are the leading cause of scope creep and unbounded iteration. | +| **Mechanism** | INVEST serves as a quality gate before stories enter development. "Testable" forces the PO to express observable outcomes (directly enabling Given/When/Then). "Small" forces decomposition, which reduces cognitive load and makes estimation feasible. "Independent" prevents hidden ordering dependencies between stories. | +| **Where used** | INVEST gate in Phase 3 of `scope/SKILL.md`. PO verifies every story against all 6 letters before committing. | + +--- + +### 18. Example Mapping (Rules Layer) + +| | | +|---|---| +| **Source** | Wynne, M. (2015). *Introducing Example Mapping*. Cucumber Blog. https://cucumber.io/blog/bdd/example-mapping-introduction/ | +| **Core finding** | Inserting a "rules" layer between stories and examples prevents redundant or contradictory acceptance criteria. A story with many rules needs splitting; a story with many open questions is not ready for development. | +| **Mechanism** | Example Mapping uses four card types: Story (yellow), Rules (blue), Examples (green), Questions (red). The rules layer groups related examples under the business rule they illustrate. Without this layer, POs jump from story directly to examples and lose the reasoning that connects them. Red cards (unanswered questions) are a first-class signal to stop and investigate rather than assume. | +| **Where used** | `## Rules` section in per-feature `discovery.md` (Phase 2). PO identifies business rules before writing Examples in Phase 4, making the reasoning behind Example clusters visible and reviewable. | + +--- + +### 19. Declarative Gherkin + +| | | +|---|---| +| **Source** | Cucumber Team. (2024). *Better Gherkin*. Cucumber Documentation. 
https://cucumber.io/docs/bdd/better-gherkin/ | +| **Core finding** | Declarative Gherkin ("When Bob logs in") produces specifications that survive UI changes. Imperative Gherkin ("When I click the Login button") couples specs to implementation details and breaks on every UI redesign. | +| **Mechanism** | Declarative steps describe *what happens* at the business level. Imperative steps describe *how the user interacts with a specific UI*. The distinction maps to the abstraction level: declarative = behavior contract, imperative = interaction script. AI agents are especially prone to writing imperative Gherkin because they mirror literal steps. | +| **Where used** | Declarative vs. imperative table in Phase 4 of `scope/SKILL.md`. PO is explicitly instructed to write behavior descriptions, not UI interaction scripts. | + +--- + +### 20. MoSCoW Prioritization (Within-Story Triage) + +| | | +|---|---| +| **Source** | Clegg, D., & Barker, R. (1994). *Case Method Fast-Track: A RAD Approach*. Addison-Wesley (DSDM origin). | +| **Core finding** | Classifying requirements as Must/Should/Could/Won't forces explicit negotiation about what is essential vs. desired. When applied *within* a single story (not just across a backlog), it reveals bloated stories that should be split. | +| **Mechanism** | DSDM mandates that Musts cannot exceed 60% of total effort. At the story level: if a story has 12 Examples and only 3 are Musts, the remaining 9 can be deferred or split into a follow-up story. This prevents gold-plating and keeps stories small. | +| **Where used** | MoSCoW triage in Phase 4 of `scope/SKILL.md`. PO applies Must/Should/Could when a story exceeds 5 Examples. | + +--- + ## Bibliography 1. Cialdini, R. B. (2001). *Influence: The Psychology of Persuasion* (rev. ed.). HarperBusiness. @@ -190,3 +247,10 @@ This document explains the cognitive and social-science mechanisms that justify 13. Google Testing Blog. (2013). Testing on the Toilet: Test Behavior, Not Implementation. 
14. Martin, R. C. (2017). First-Class Tests. *Clean Coder Blog*. 15. MacIver, D. R. (2016). What is Property Based Testing? *Hypothesis*. https://hypothesis.works/articles/what-is-property-based-testing/ +16. Boehm, B. W. (1981). *Software Engineering Economics*. Prentice-Hall. +17. Boehm, B., & Papaccio, P. N. (1988). Understanding and controlling software costs. *IEEE Transactions on Software Engineering*, 14(10), 1462–1477. +18. Wake, B. (2003). INVEST in Good Stories, and SMART Tasks. *XP123.com*. +19. Cohn, M. (2004). *User Stories Applied: For Agile Software Development*. Addison-Wesley. +20. Wynne, M. (2015). Introducing Example Mapping. *Cucumber Blog*. https://cucumber.io/blog/bdd/example-mapping-introduction/ +21. Cucumber Team. (2024). Better Gherkin. *Cucumber Documentation*. https://cucumber.io/docs/bdd/better-gherkin/ +22. Clegg, D., & Barker, R. (1994). *Case Method Fast-Track: A RAD Approach*. Addison-Wesley. diff --git a/docs/features/completed/display-version.md b/docs/features/completed/display-version.md deleted file mode 100644 index 99c70cf..0000000 --- a/docs/features/completed/display-version.md +++ /dev/null @@ -1,67 +0,0 @@ -# Feature: Display Version - -## User Stories -- As a developer, I want to retrieve the application version programmatically so that I can display or log it at runtime. -- As a developer, I want to control log verbosity via a parameter so that I can tune output for different environments. - -## Acceptance Criteria -- `3f2a1b4c-d5e6-7890-abcd-ef1234567890`: Version string is read from pyproject.toml. - Source: po - - Given: pyproject.toml exists with a version field - When: version() is called - Then: The returned string matches the version in pyproject.toml - -- `7a8b9c0d-e1f2-3456-bcde-f12345678901`: Version call emits a log message. 
- Source: po - - Given: pyproject.toml exists with a version field - When: version() is called - Then: An INFO log message in the format "Version: <version>" is emitted - -- `a1b2c3d4-e5f6-7890-abcd-ef1234567890`: Version appears in logs at DEBUG and INFO verbosity. - Source: po - - Given: A verbosity level of DEBUG or INFO is passed to main() - When: main() is called - Then: The version string appears in the log output - -- `b2c3d4e5-f6a7-8901-bcde-f12345678901`: Version is absent from logs at WARNING and above. - Source: po - - Given: A verbosity level of WARNING, ERROR, or CRITICAL is passed to main() - When: main() is called - Then: The version string does not appear in the log output - -- `e5f6a7b8-c9d0-1234-defa-012345678903`: Invalid verbosity raises a descriptive error. - Source: po - - Given: An invalid verbosity string is passed to main() - When: main() is called - Then: A ValueError is raised with the invalid value and valid options listed - -## Notes -- This is the template example feature shipped with the project skeleton. -- Tests live in `tests/version_test.py`. -- No out-of-scope items; this feature is complete and serves as a reference implementation. 
- -## Architecture - -### Module Structure -- `app/version.py` — `version()` function; reads `pyproject.toml` via `tomllib` -- `main.py` — `main(verbosity)` entry point; configures logging, calls `version()` - -### Key Decisions (ADRs) - -ADR-001: Read version from pyproject.toml at runtime -Decision: Use `tomllib` to read the version field from `pyproject.toml` at runtime -Reason: Avoids duplicating the version between `pyproject.toml` and a `__version__` constant -Alternatives considered: Hardcoded `__version__` in `app/__init__.py` — rejected to keep a single source of truth - -ADR-002: Enforce verbosity via Literal type alias -Decision: Define `ValidVerbosity` as a `Literal` type alias for the five standard log level strings -Reason: Catches invalid verbosity values at the type-checker level before runtime -Alternatives considered: Accepting a plain `str` and validating at runtime only — rejected because it defers errors that the type checker can catch earlier - -### Build Changes (needs PO approval: yes/no) -no diff --git a/docs/features/completed/display-version/discovery.md b/docs/features/completed/display-version/discovery.md new file mode 100644 index 0000000..3fc335c --- /dev/null +++ b/docs/features/completed/display-version/discovery.md @@ -0,0 +1,24 @@ +# Feature Discovery: display-version + +## Status +completed + +## Entities + +**Nouns**: version string, pyproject.toml, log output, verbosity level, entry point +**Verbs**: retrieve, display, log, configure, validate + +## Rules +- Version is read from `pyproject.toml` at runtime using `tomllib` +- Log verbosity is controlled by a `ValidVerbosity` parameter passed to `main()` +- Valid verbosity levels are: DEBUG, INFO, WARNING, ERROR, CRITICAL +- An invalid verbosity value raises a `ValueError` with the invalid value and the list of valid options +- The version string is logged at INFO level; it is visible at DEBUG and INFO but not at WARNING or above + +## Constraints +- No hardcoded `__version__` 
constant — `pyproject.toml` is the single source of truth +- Entry point: `app/__main__.py` (`main(verbosity)` function) +- Version logic: `app/version.py` (`version()` function) + +## Questions +All questions answered. Discovery frozen. diff --git a/docs/features/completed/display-version/verbosity-control.feature b/docs/features/completed/display-version/verbosity-control.feature new file mode 100644 index 0000000..4a16c05 --- /dev/null +++ b/docs/features/completed/display-version/verbosity-control.feature @@ -0,0 +1,22 @@ +Feature: Verbosity control + As a developer + I want to control log verbosity via a parameter + So that I can tune output for different environments + + @id:a1b2c3d4 + Example: Version appears in logs at DEBUG and INFO verbosity + Given a verbosity level of DEBUG or INFO is passed to main() + When main() is called + Then the version string appears in the log output + + @id:b2c3d4e5 + Example: Version is absent from logs at WARNING and above + Given a verbosity level of WARNING, ERROR, or CRITICAL is passed to main() + When main() is called + Then the version string does not appear in the log output + + @id:e5f6a7b8 + Example: Invalid verbosity raises a descriptive error + Given an invalid verbosity string is passed to main() + When main() is called + Then a ValueError is raised with the invalid value and valid options listed diff --git a/docs/features/completed/display-version/version-retrieval.feature b/docs/features/completed/display-version/version-retrieval.feature new file mode 100644 index 0000000..9150195 --- /dev/null +++ b/docs/features/completed/display-version/version-retrieval.feature @@ -0,0 +1,16 @@ +Feature: Version retrieval + As a developer + I want to retrieve the application version programmatically + So that I can display or log it at runtime + + @id:3f2a1b4c + Example: Version string is read from pyproject.toml + Given pyproject.toml exists with a version field + When version() is called + Then the returned string matches 
the version in pyproject.toml
+
+  @id:7a8b9c0d
+  Example: Version call emits an INFO log message
+    Given pyproject.toml exists with a version field
+    When version() is called
+    Then an INFO log message in the format "Version: <version>" is emitted
diff --git a/docs/features/discovery.md b/docs/features/discovery.md
new file mode 100644
index 0000000..16ef362
--- /dev/null
+++ b/docs/features/discovery.md
@@ -0,0 +1,8 @@
+# Discovery: <project-name>
+
+## State
+Status: ELICITING
+
+## Questions
+| ID | Question | Answer | Status |
+|----|----------|--------|--------|
diff --git a/docs/post-mortem/2026-04-14-ping-pong-cli-workflow-gaps.md b/docs/post-mortem/2026-04-14-ping-pong-cli-workflow-gaps.md
new file mode 100644
index 0000000..7f1d054
--- /dev/null
+++ b/docs/post-mortem/2026-04-14-ping-pong-cli-workflow-gaps.md
@@ -0,0 +1,176 @@
+# Post-Mortem: ping-pong-cli — Workflow Gaps (v3.1)
+
+## Release Details
+
+| Field | Value |
+|-------|-------|
+| Version | v3.1.20260414 |
+| Date | April 14, 2026 |
+| Feature | ping-pong-cli |
+| Status | APPROVED and shipped |
+| Broken | Yes — game doesn't work |
+
+---
+
+## What Was Shipped
+
+`ping_pong_cli/game.py` — 240 lines:
+
+- 15 top-level functions, zero classes
+- No keyboard input (`get_input()` always returns `""`)
+- Runs a hardcoded 100-frame demo then exits
+- Uses raw `int` and `tuple[int,int]` — no value objects
+- `render_game` has 3 levels of nesting
+- 8-parameter function signatures
+
+Yet it passed: lint, typecheck, 100% coverage, 31 tests, reviewer APPROVED.
+
+---
+
+## What Failed
+
+The acceptance criteria said:
+> Given: The game is running and waiting for input
+> When: The W key is pressed
+> Then: The paddle moves
+
+The implementation maps this to a unit test of `update_player("W")`. That test proves the function works in isolation. No test verifies that keyboard input actually reaches `update_player`.
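For contrast, the missing kind of test can be sketched as follows. This is an illustrative pattern only, not code from the shipped suite: the inline child script stands in for the real game process, and wiring it to the actual entry point (e.g. `[sys.executable, "-m", "ping_pong_cli"]`) is an assumption about the real project.

```python
import subprocess
import sys

# Stand-in for the real game binary: a tiny loop whose paddle position
# responds to 'W' keystrokes read from stdin, until 'q' quits. In the
# real suite this command would be the actual entry point, e.g.
# [sys.executable, "-m", "ping_pong_cli"] (hypothetical name).
CHILD_SCRIPT = """
pos = 10
for key in iter(input, 'q'):
    if key == 'W':
        pos -= 1
    print(f'paddle={pos}')
"""


def paddle_positions(keys: list[str]) -> list[int]:
    """Run the app, feed keystrokes via stdin, parse paddle positions."""
    proc = subprocess.run(
        [sys.executable, "-c", CHILD_SCRIPT],
        input="\n".join([*keys, "q"]) + "\n",
        capture_output=True,
        text=True,
        timeout=10,
        check=True,
    )
    return [int(line.split("=")[1]) for line in proc.stdout.splitlines()]


def test_paddle_responds_to_keypress() -> None:
    # End-to-end: input crosses the process boundary and the observable
    # output changes in response, which is exactly what the unit test
    # of update_player("W") never proved.
    assert paddle_positions(["W", "W"]) == [9, 8]
```

In a real suite this test would carry `@pytest.mark.integration`; the point is only that the assertion observes behavior through the public entry point rather than by calling an internal helper.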
+ +The game shipped with the acceptance criterion satisfied in a narrow technical sense ("paddle moves when 'W' is passed to the function") but broken in the broad user sense ("paddle doesn't move when I press W in the running game"). + +--- + +## Gap 1: Acceptance Criteria Don't Require End-to-End Verification + +### Problem + +The `scope` skill defines "Then must be a single observable, measurable outcome" but doesn't define **observable by whom**. The developer interpreted this as "observable in a unit test" — test calls `update_player("W")` returns expected result. + +### Fix + +In `scope` skill, add: + +> **Observable means observable by the end user.** If the criterion says "When the user presses W", the test must verify that pressing W in the running app produces the expected result — not just that calling `update_player("W")` returns the right number. If end-to-end testing isn't feasible, the criterion must explicitly state the boundary (e.g., "When update_player receives 'W'") so the gap is visible. + +In `verify` skill, add: + +> **Acceptance Criteria vs. Reality Check** +> +> For each criterion whose Given/When/Then describes user-facing behavior: +> - Read the test that covers it +> - If the test only exercises an internal function without going through the actual user-facing entry point, flag it as **COVERED BUT NOT VERIFIED** +> - A criterion that says "When the user presses W" is NOT verified by `test_update_player("W")` — it's verified by a test or manual check that sends W to the running app +> +> Any COVERED BUT NOT VERIFIED criterion → REJECTED + +--- + +## Gap 2: Object Calisthenics Listed But Not Enforced by Reviewer + +### Problem + +The `verify` skill listed all 9 Object Calisthenics rules. 
The reviewer read them but approved code with: + +| # | Rule | Violation in shipped code | +|---|------|--------------------------| +| 3 | Wrap primitives | `PlayerPosition = int`, `BallState = tuple[int,int]` are type aliases, not value objects | +| 4 | First-class collections | No collection classes | +| 7 | Small entities | `run_game_loop` is ~40 lines | +| 8 | ≤ 2 instance vars | No classes at all, but 8-parameter function signatures | + +The skill didn't say **what to do when violations are found**. Violations were treated as observations, not blockers. + +### Fix + +In `verify` skill, replace ObjCal prose with a structured table: + +> **Object Calisthenics — ANY violation is a REJECT** +> +> | # | Rule | How to check | PASS/FAIL | +> |---|------|-------------|-----------| +> | 1 | One level of indentation | Check nest depth in source | +> | 2 | No `else` after return | Search for `else` inside functions | +> | 3 | Wrap primitives | Bare `int`, `str` as domain concepts = FAIL | +> | 4 | First-class collections | `list[Type]` not wrapped = FAIL | +> | 5 | One dot per line | `a.b.c()` = FAIL | +> | 6 | No abbreviations | `calc`, `mgr` = FAIL | +> | 7 | Small entities | Lines per function >20 or class >50 = FAIL | +> | 8 | ≤ 2 instance vars | More than 2 per class = FAIL | +> | 9 | No getters/setters | `get_x()`, `set_x()` = FAIL | + +--- + +## Gap 3: REFACTOR Step Has No Verification Gate + +### Problem + +The `implementation` skill says to apply DRY, SOLID, Object Calisthenics during REFACTOR, but when done, it only runs `task test`, `task lint`, `task static-check`. None of those tools check nesting depth, function length, or value objects. The developer skips the self-check, runs the three commands, they all pass. + +### Fix + +In `implementation` skill, add after REFACTOR section: + +> **REFACTOR Self-Check (MANDATORY before commit)** +> +> 1. Count lines per function you changed. Any >20 → extract helper +> 2. Check nesting. 
Any >2 levels → extract function +> 3. Check bare primitives as domain concepts. `int` for paddle position → value object +> 4. Check parameters per function. >4 positional → group into dataclass +> +> If you skip this step, the reviewer WILL reject your code. + +--- + +## Gap 4: `timeout 10s uv run task run` Is Not a Playability Test + +### Problem + +The `verify` skill said: "check that startup completes without error before the timeout." The demo ran for 1.6 seconds and exited cleanly — startup completed, no error. The app passed without being interactive at all. + +### Fix + +In `verify` skill, replace the timeout check with: + +> **For apps with user interaction** (games, CLIs with prompts, web servers): +> - Run the app, provide sample input via stdin/subprocess +> - Verify output changes in response to input +> - A hardcoded demo that auto-plays without input is NOT a playability test +> +> If the app doesn't respond to user input → REJECTED + +--- + +## Gap 5: Tests Verify Functions, Not Behavior + +### Problem + +The `tdd` skill produces unit tests. Every test calls an isolated function. No test sends input to the running game. No test verifies the game loop integrates these functions correctly. 31 tests pass with 100% coverage but none test the actual gameplay loop. 
+ +### Fix + +In `tdd` skill, add: + +> **Integration Test Requirement** +> +> For features with multiple components (game loops, handlers, pipelines): +> - Add at least ONE `@pytest.mark.integration` test +> - Test must exercise the full path from entry point to observable outcome +> - Must NOT call internal helpers directly — use the public entry point + +--- + +## Summary + +| Gap | Skill | Problem | Fix | +|-----|-------|---------|-----| +| 1 | scope + verify | "Observable" undefined = unit test passes | Define user-observable; add COVERED BUT NOT VERIFIED | +| 2 | verify | Object Calisthenics listed = suggestions | Any rule FAIL = REJECTED (table) | +| 3 | implementation | REFACTOR has no self-check gate | Add mandatory line/nesting check | +| 4 | verify | `timeout` = "doesn't hang" not "works" | Must accept and respond to input | +| 5 | tdd | All unit, no integration | Require one integration test | + +--- + +## Root Cause + +The skills already contained the right standards. The problem is that violations were treated as observations, not blockers. Each check needs a clear **FAIL = REJECTED** consequence with a structured table to fill in — so violations can't be glossed over in prose. diff --git a/docs/post-mortem/2026-04-16-ping-pong-cli-package-and-design-review.md b/docs/post-mortem/2026-04-16-ping-pong-cli-package-and-design-review.md new file mode 100644 index 0000000..d9b6995 --- /dev/null +++ b/docs/post-mortem/2026-04-16-ping-pong-cli-package-and-design-review.md @@ -0,0 +1,108 @@ +# Post-Mortem: ping-pong-cli — Package Directory and Design Review Gaps + +## Context + +| Field | Value | +|-------|-------| +| Date | April 16, 2026 | +| Feature | ping-pong-cli (follow-up run after v3.1 workflow fixes) | +| Branch | feat/po-workflow-redesign-v4 | + +This post-mortem was conducted after a second ping-pong-cli test run on the updated v3.1 workflow. Two systemic failures were identified that the v3.1 fixes did not address. 
+ +--- + +## Failure 1: Code Created in Wrong Package Directory + +### What Happened + +The developer created production code under `python_project_template/` (the template's own package) instead of `ping_pong_cli/` (the feature's package). The correct package name was visible in `pyproject.toml` under `[tool.setuptools] packages`, but no step in the workflow required the developer to read it before writing code. + +### Why It Happened + +The `implementation` skill's Step 2 (Architecture) listed prerequisites and module structure instructions, but contained no explicit step to: +1. Read `pyproject.toml` to determine the correct package name +2. Confirm the package directory exists on disk +3. Record the package name as a hard constraint before writing any files + +Without this verification, the developer defaulted to a plausible-looking name rather than the actual configured name. + +### Impact + +All production code was placed in the wrong directory. The feature appeared to work during development (imports resolved within the wrong package) but would have failed on any fresh install or CI run. + +### Fix Applied + +Added a **Package Verification** block at the top of Step 2 in `implementation/SKILL.md` (before prerequisites): + +``` +1. Read pyproject.toml → [tool.setuptools] → record packages = ["<name>"] +2. Confirm that directory exists on disk: ls <name>/ +3. Write the correct package name at the top of working notes +4. All new source files go under <name>/ — never under a template placeholder +``` + +Added a corresponding check row to `verify/SKILL.md` section 4g: + +> `Imports use correct package name` — confirm all imports match `[tool.setuptools] packages`, not a template placeholder + +--- + +## Failure 2: Design Principle Violations Not Caught in Review + +### What Happened + +The reviewer approved code containing getters and setters (`get_x()` / `set_x()` pairs), violating Object Calisthenics Rule 9. 
The violation was visible in the code but was not caught because the review process had no structured mechanism for the developer to declare their own compliance before asking for review. + +### Why It Happened + +The per-test reviewer check asked the reviewer to verify YAGNI > KISS > DRY > SOLID > ObjCal, but provided no structured checklist or required evidence format. The reviewer was scanning for violations rather than verifying explicit claims. When a reviewer is reading unfamiliar code for the first time, getter/setter patterns can be overlooked if they are not explicitly flagged. + +Additionally, the reviewer had no "audit target" — there was nothing the developer had committed to that the reviewer could directly compare against the code. + +### Impact + +OC Rule 9 (tell-don't-ask) was violated. The design choice propagated into the committed codebase, requiring a later refactor. + +### Fix Applied + +Added a **Design Self-Declaration** step between REFACTOR and REVIEWER CHECK in `implementation/SKILL.md`: + +- Developer fills a checklist covering YAGNI, KISS, DRY, SOLID (all 5 principles), and OC Rules 1–9 +- Each item requires `file:line` evidence or an explicit "does not apply" note +- The filled checklist is sent to the reviewer as the audit target + +Updated the **REVIEWER CHECK** response template from a 3-line compact format to an 11-row structured comparison table (YAGNI, KISS, DRY, SOLID-S/O/L/I/D, OC-1-9, Design patterns, Semantic alignment): + +- Developer Claims column (what the developer declared) +- Reviewer Verdict column (independent verification) +- Evidence column (`file:line` required for every FAIL) +- Any FAIL row = rejection + +Updated the Cycle State phases to include `SELF-DECLARE` between REFACTOR and REVIEWER: + +``` +RED → GREEN → REFACTOR → SELF-DECLARE → REVIEWER(code-design) → COMMITTED +``` + +Updated `session-workflow/SKILL.md` Cycle State phase list and Rule 6 to include `SELF-DECLARE`. 
+ +Updated `reviewer.md` per-test Step 4 section to reference the structured table and load `skill implementation` for the full protocol. + +--- + +## Summary + +| Failure | Root Cause | Fix | +|---------|-----------|-----| +| Code in wrong package | No package verification step before writing code | Package Verification block added to Step 2 | +| OC Rule 9 violation approved | No structured self-declaration; reviewer had no audit target | Design Self-Declaration checklist per test; 11-row verification table | + +--- + +## Systemic Pattern + +Both failures share the same root cause: **the workflow relied on agents noticing problems rather than proving compliance**. The fixes shift the burden: + +- Package verification: developer must prove the package name is correct before writing the first line +- Design self-declaration: developer must prove each principle is satisfied before asking for review; reviewer verifies claims rather than scanning from scratch diff --git a/main.py b/main.py deleted file mode 100644 index d6175e0..0000000 --- a/main.py +++ /dev/null @@ -1,41 +0,0 @@ -"""Test main file.""" - -import logging -from typing import Literal - -import fire - -from app.version import version - -logger = logging.getLogger(__name__) - -LOGGER_LEVELS = { - "DEBUG": logging.DEBUG, - "INFO": logging.INFO, - "WARNING": logging.WARNING, - "ERROR": logging.ERROR, - "CRITICAL": logging.CRITICAL, -} - -ValidVerbosity = Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] - - -def main(verbosity: ValidVerbosity = "INFO") -> None: - """Run with --verbosity=LEVEL (DEBUG, INFO, WARNING, ERROR, CRITICAL).""" - # Validate verbosity at runtime - verbosity_upper = verbosity.upper() - if verbosity_upper not in LOGGER_LEVELS: - valid_levels = ", ".join(LOGGER_LEVELS.keys()) - raise ValueError( - f"Invalid verbosity level '{verbosity}'. 
Valid options: {valid_levels}" - ) - - logging.basicConfig( - level=LOGGER_LEVELS[verbosity_upper], - format="%(levelname)s - %(name)s: %(message)s", - ) - version() - - -if __name__ == "__main__": - fire.Fire(main) diff --git a/project_defaults.json b/project_defaults.json deleted file mode 100644 index 1e16c33..0000000 --- a/project_defaults.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "github_username": "nullhack", - "project_name": "python-project-template", - "project_description": "Python template with some awesome tools to quickstart any Python project", - "author_name": "eol", - "author_email": "nullhack@users.noreply.github.com" -} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 81e3c6d..2fc66e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ pydocstyle.convention = "google" [tool.ruff.lint.per-file-ignores] "tests/**" = ["S101", "ANN", "D205", "D212", "D415", "D100", "D103"] +".opencode/skills/**/scripts/*.py" = ["T20"] [tool.pytest.ini_options] minversion = "6.0" @@ -81,6 +82,7 @@ markers = [ "slow: marks tests as slow (deselect with '-m \"not slow\"')", "unit: isolated tests for a single function or class", "integration: tests covering multiple components together", + "deprecated: marks tests for deprecated AC; automatically skipped (deselect with '-m \"not deprecated\"')", ] addopts = """ --maxfail=10 \ @@ -141,8 +143,12 @@ pytest \ """ doc-publish = "task doc-build && ghp-import -n -p -f docs" static-check = "pyright" +gen-id = "python -c \"import uuid; [print(uuid.uuid4().hex[:8]) for _ in range(20)]\"" +gen-tests = "python .opencode/skills/tdd/scripts/gen_test_stubs.py" +gen-todo = "python .opencode/skills/session-workflow/scripts/gen_todo.py" [dependency-groups] dev = [ + "gherkin-official>=39.0.0", "safety>=3.7.0", ] diff --git a/template-config.yaml b/template-config.yaml new file mode 100644 index 0000000..c3d5160 --- /dev/null +++ b/template-config.yaml @@ -0,0 +1,148 @@ +# Template configuration — 
single source of truth for project setup. +# Read by the setup-project agent. The "defaults" section reflects the current +# template values. The "substitutions" section maps every literal string in every +# template file to its replacement pattern, using {variable} tokens. +# +# After running @setup-project the defaults section is updated with the values +# the user provided. +# +# Parameter descriptions: +# github_username — GitHub handle used in URLs and git remote +# project_name — kebab-case repository name (e.g. my-awesome-project) +# package_name — snake_case Python package directory (e.g. my_awesome_project) +# project_description — one sentence describing what the project does +# author_name — author's full name +# author_email — author's email address + +defaults: + github_username: nullhack + project_name: python-project-template + package_name: app + project_description: "Python template with some awesome tools to quickstart any Python project" + author_name: eol + author_email: nullhack@users.noreply.github.com + +# Substitution map — every file the setup agent must edit. 
+# Each entry has: +# old: literal string currently in the file +# new: replacement string with {variable} tokens +# count: expected number of replacements (for agent verification) + +substitutions: + pyproject.toml: + - old: 'name = "python-project-template"' + new: 'name = "{project_name}"' + count: 1 + - old: '"Python template with some awesome tools to quickstart any Python project"' + new: '"{project_description}"' + count: 1 + - old: '{ name = "eol", email = "nullhack@users.noreply.github.com" }' + new: '{{ name = "{author_name}", email = "{author_email}" }}' + count: 2 + - old: "https://github.com/nullhack/python-project-template" + new: "https://github.com/{github_username}/{project_name}" + count: 2 + - old: 'packages = ["app"]' + new: 'packages = ["{package_name}"]' + count: 1 + - old: "python -m app" + new: "python -m {package_name}" + count: 1 + - old: "--cov=app" + new: "--cov={package_name}" + count: 2 + - old: "pdoc ./app" + new: "pdoc ./{package_name}" + count: 2 + # Version: set to 0.1.YYYYMMDD using today's date (agent action, not text substitution) + + README.md: + - old: "nullhack" + new: "{github_username}" + count: many + - old: "python-project-template" + new: "{project_name}" + count: many + - old: "eol" + new: "{author_name}" + count: 1 + note: "only the author credit line — do not replace occurrences in other contexts" + + .github/workflows/ci.yml: + - old: "import app" + new: "import {package_name}" + count: 2 + + Dockerfile: + - old: "# Simplified Dockerfile for python-project-template" + new: "# Simplified Dockerfile for {project_name}" + count: 1 + - old: "CMD python -m app || exit 1" + new: "CMD python -m {package_name} || exit 1" + count: 1 + - old: 'CMD ["python", "-m", "app"]' + new: 'CMD ["python", "-m", "{package_name}"]' + count: 1 + - old: 'LABEL maintainer="eol"' + new: 'LABEL maintainer="{author_name}"' + count: 1 + - old: '"Python template with some awesome tools to quickstart any Python project"' + new: 
'"{project_description}"' + count: 1 + - old: "https://github.com/nullhack/python-project-template" + new: "https://github.com/{github_username}/{project_name}" + count: 1 + + docker-compose.yml: + - old: "# Docker Compose for python-project-template" + new: "# Docker Compose for {project_name}" + count: 1 + - old: "./app:/app/app" + new: "./{package_name}:/app/{package_name}" + count: 1 + - old: "python -m app" + new: "python -m {package_name}" + count: 1 + - old: "./app:/app/app:ro" + new: "./{package_name}:/app/{package_name}:ro" + count: 1 + + .dockerignore: + - old: "# Docker ignore file for python-project-template" + new: "# Docker ignore file for {project_name}" + count: 1 + + docs/index.html: + - old: 'href="api/app.html"' + new: 'href="api/{package_name}.html"' + count: 1 + + LICENSE: + - old: "Copyright (c) 2026, eol" + new: "Copyright (c) 2026, {author_name}" + count: 1 + + tests/unit/app_test.py: + - old: "from app.__main__ import" + new: "from {package_name}.__main__ import" + count: 1 + + template-config.yaml: + - old: "github_username: nullhack" + new: "github_username: {github_username}" + count: 1 + - old: "project_name: python-project-template" + new: "project_name: {project_name}" + count: 1 + - old: "package_name: app" + new: "package_name: {package_name}" + count: 1 + - old: '"Python template with some awesome tools to quickstart any Python project"' + new: '"{project_description}"' + count: 1 + - old: "author_name: eol" + new: "author_name: {author_name}" + count: 1 + - old: "author_email: nullhack@users.noreply.github.com" + new: "author_email: {author_email}" + count: 1 diff --git a/tests/conftest.py b/tests/conftest.py index a5c8f50..9a606f7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -21,3 +21,10 @@ def pytest_html_results_table_header(cells): def pytest_html_results_table_row(report, cells): docstring = getattr(report, "docstrings", "") or "" cells.insert(2, f"<td style='white-space: pre-wrap;'>{docstring}</td>") + + +def 
pytest_collection_modifyitems(items): + """Automatically skip tests marked as deprecated.""" + for item in items: + if item.get_closest_marker("deprecated"): + item.add_marker(pytest.mark.skip(reason="deprecated")) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..e0310a0 --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1 @@ +"""Unit tests.""" diff --git a/tests/unit/app_test.py b/tests/unit/app_test.py new file mode 100644 index 0000000..9c52e92 --- /dev/null +++ b/tests/unit/app_test.py @@ -0,0 +1,19 @@ +"""Unit tests for the application entry point.""" + +import pytest +from hypothesis import example, given +from hypothesis import strategies as st + +from app.__main__ import main + + +@pytest.mark.unit +@given(verbosity=st.sampled_from(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"])) +@example(verbosity="INFO") +def test_app_main_runs_with_valid_verbosity(verbosity: str) -> None: + """ + Given: A valid verbosity level string + When: main() is called with that verbosity + Then: It completes without raising an exception + """ + main(verbosity) diff --git a/tests/version_test.py b/tests/version_test.py deleted file mode 100644 index d6447fe..0000000 --- a/tests/version_test.py +++ /dev/null @@ -1,150 +0,0 @@ -"""This file contains examples of how to write tests using pytest.""" - -import logging -import tomllib -from io import StringIO -from pathlib import Path -from typing import cast -from unittest.mock import patch - -import pytest -from hypothesis import example, given -from hypothesis import strategies as st - -from app import version as m -from main import ValidVerbosity, main - - -@pytest.mark.unit -def test_version_returns_string_from_pyproject() -> None: - """3f2a1b4c-d5e6-7890-abcd-ef1234567890 - - Given: pyproject.toml exists with a version field - When: version() is called - Then: The returned string matches the version in pyproject.toml - """ - # Given - pyproject_path = Path(__file__).parent.parent 
/ "pyproject.toml" - with Path(pyproject_path).open("rb") as f: - expected = tomllib.load(f)["project"]["version"] - # When - result = m.version() - # Then - assert result == expected - - -@pytest.mark.unit -def test_version_logs_correct_message(caplog) -> None: - """7a8b9c0d-e1f2-3456-bcde-f12345678901 - - Given: pyproject.toml exists with a version field - When: version() is called - Then: An INFO log message in the format "Version: <version>" is emitted - """ - # Given - pyproject_path = Path(__file__).parent.parent / "pyproject.toml" - with Path(pyproject_path).open("rb") as f: - expected_version = tomllib.load(f)["project"]["version"] - # When - with caplog.at_level(logging.INFO): - m.version() - # Then - assert f"Version: {expected_version}" in caplog.text - - -@pytest.mark.integration -@pytest.mark.slow -@example(verbosity="DEBUG") -@example(verbosity="INFO") -@given(verbosity=st.sampled_from(["DEBUG", "INFO"])) -def test_version_appears_in_logs_at_debug_and_info( - verbosity: str, -) -> None: - """a1b2c3d4-e5f6-7890-abcd-ef1234567890 - - Given: A verbosity level of DEBUG or INFO is passed to main() - When: main() is called - Then: The version string appears in the log output - """ - # Given - pyproject_path = Path(__file__).parent.parent / "pyproject.toml" - with Path(pyproject_path).open("rb") as f: - expected_version = tomllib.load(f)["project"]["version"] - expected_level = getattr(logging, verbosity.upper()) - log_stream = StringIO() - handler = logging.StreamHandler(log_stream) - handler.setLevel(expected_level) - - def mock_basic_config(**kwargs): - logger = logging.getLogger("app") - logger.handlers.clear() - logger.addHandler(handler) - logger.setLevel(kwargs.get("level", logging.INFO)) - - # When - with patch("main.logging.basicConfig", side_effect=mock_basic_config): - main(cast(ValidVerbosity, verbosity)) - # Then - log_output = log_stream.getvalue() - assert f"Version: {expected_version}" in log_output, ( - f"Expected version message at 
{verbosity} level, but got output: {log_output!r}" - ) - - -@pytest.mark.integration -@pytest.mark.slow -@example(verbosity="WARNING") -@example(verbosity="ERROR") -@given(verbosity=st.sampled_from(["WARNING", "ERROR", "CRITICAL"])) -def test_version_absent_from_logs_at_warning_and_above( - verbosity: str, -) -> None: - """b2c3d4e5-f6a7-8901-bcde-f12345678901 - - Given: A verbosity level of WARNING, ERROR, or CRITICAL is passed to main() - When: main() is called - Then: The version string does not appear in the log output - """ - # Given - pyproject_path = Path(__file__).parent.parent / "pyproject.toml" - with Path(pyproject_path).open("rb") as f: - expected_version = tomllib.load(f)["project"]["version"] - expected_level = getattr(logging, verbosity.upper()) - log_stream = StringIO() - handler = logging.StreamHandler(log_stream) - handler.setLevel(expected_level) - - def mock_basic_config(**kwargs): - logger = logging.getLogger("app") - logger.handlers.clear() - logger.addHandler(handler) - logger.setLevel(kwargs.get("level", logging.INFO)) - - # When - with patch("main.logging.basicConfig", side_effect=mock_basic_config): - main(cast(ValidVerbosity, verbosity)) - # Then - log_output = log_stream.getvalue() - assert f"Version: {expected_version}" not in log_output, ( - f"Expected no version messages at {verbosity} level, " - f"but got output: {log_output!r}" - ) - - -@pytest.mark.unit -def test_invalid_verbosity_raises_value_error() -> None: - """e5f6a7b8-c9d0-1234-defa-012345678903 - - Given: An invalid verbosity string is passed to main() - When: main() is called - Then: A ValueError is raised with the invalid value and valid options listed - """ - # Given - invalid_verbosity = "INVALID_LEVEL" - # When - with pytest.raises(ValueError, match=r"Invalid verbosity level") as exc_info: - main(cast(ValidVerbosity, invalid_verbosity)) - # Then - error_message = str(exc_info.value) - assert "Invalid verbosity level 'INVALID_LEVEL'" in error_message - assert "Valid 
options: DEBUG, INFO, WARNING, ERROR, CRITICAL" in error_message diff --git a/uv.lock b/uv.lock index 2a5d713..76b8ade 100644 --- a/uv.lock +++ b/uv.lock @@ -295,6 +295,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/4c/93d0f85318da65923e4b91c1c2ff03d8a458cbefebe3bc612a6693c7906d/fire-0.7.1-py3-none-any.whl", hash = "sha256:e43fd8a5033a9001e7e2973bab96070694b9f12f2e0ecf96d4683971b5ab1882", size = 115945, upload-time = "2025-08-16T20:20:22.87Z" }, ] +[[package]] +name = "gherkin-official" +version = "39.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2f/cf/8c0f7ec0e041c12ab59fae0c01b95ac69113a2fecb45618780525f8ca5ee/gherkin_official-39.0.0.tar.gz", hash = "sha256:675b9c6c0c342b0ec44bddf927de923adbd79879277816ce96bf248533677060", size = 33683, upload-time = "2026-03-01T16:46:42.382Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/b3/743f97b16ef781283cde3c7b06a95b309a75ae2f4003a6611d35abc3c613/gherkin_official-39.0.0-py3-none-any.whl", hash = "sha256:1fd9b8709c00d946c0fd617a9834d4cb2af026213a2e8e7822fe24dd5064fe22", size = 38471, upload-time = "2026-03-01T16:46:43.308Z" }, +] + [[package]] name = "ghp-import" version = "2.1.0" @@ -745,6 +757,7 @@ dev = [ [package.dev-dependencies] dev = [ + { name = "gherkin-official" }, { name = "safety" }, ] @@ -765,7 +778,10 @@ requires-dist = [ provides-extras = ["dev"] [package.metadata.requires-dev] -dev = [{ name = "safety", specifier = ">=3.7.0" }] +dev = [ + { name = "gherkin-official", specifier = ">=39.0.0" }, + { name = "safety", specifier = ">=3.7.0" }, +] [[package]] name = "regex"