Merge: t-paliad-190 — Fristen Phase 3 Slice 10 (rule_id backfill + orphan staging)

This commit is contained in:
mAi
2026-05-15 01:38:47 +02:00
4 changed files with 441 additions and 0 deletions

View File

@@ -0,0 +1,9 @@
-- t-paliad-190 down: undoes 089_deadline_rule_backfill_orphans.up.sql.
--
-- Ordering: the down-migration of mig 090 has to run BEFORE this one,
-- because it still deletes from the staging table. Rolling back in
-- reverse numeric order (090 first, then 089) satisfies that.
--
-- The policy and both indexes are removed explicitly, then the table.
DROP POLICY IF EXISTS deadline_rule_backfill_orphans_select
    ON paliad.deadline_rule_backfill_orphans;

DROP INDEX IF EXISTS
    paliad.deadline_rule_backfill_orphans_deadline_id_idx,
    paliad.deadline_rule_backfill_orphans_unresolved_idx;

DROP TABLE IF EXISTS paliad.deadline_rule_backfill_orphans;

View File

@@ -0,0 +1,82 @@
-- t-paliad-190 / Fristen Phase 3 Slice 10 — staging table for the
-- fuzzy-match orphans produced by mig 090. Per design §3.I + m's Q10
-- ruling: legacy paliad.deadlines rows whose title can't be uniquely
-- bound to a deadline_rule via fuzzy matching are NOT silently left
-- NULL — they're logged here so a legal-review pass can hand-link
-- the ambiguous tail.
--
-- Mig 089 ships the table; mig 090 does the actual backfill +
-- populates this table. Numbering reflects the dependency order
-- (the backfill SELECTs into this table, so the table must exist
-- first).
--
-- Schema notes:
-- - deadline_id is the FK to paliad.deadlines.id with ON DELETE
-- CASCADE so a hand-deletion of an orphan deadline cleans up
-- its staging row too. (Deadlines are normally archived, not
-- deleted; the cascade is defensive.)
-- - project_id stays denormalised so the admin orphan-review UI
-- can group orphans by project without re-joining deadlines.
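-- - reason is a text discriminator restricted by a CHECK constraint.
-- Mig 090 writes 'no_match' | 'ambiguous' today; the constraint
-- already admits 'no_project' and 'manual_unbound' as well, for
-- later writers such as the editor in Slice 11. Any further value
-- needs the constraint widened first.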
-- - resolved_at + resolved_rule_id are NULL on insert; the admin
-- orphan-review UI sets them when an editor hand-links the row,
-- so the table doubles as an audit trail of the legal-review
-- pass. The matching paliad.deadlines.rule_id is updated at the
-- same time (the UPDATE on deadlines fires its own audit row
-- once an audit trigger lives on that table; today no trigger,
-- so the staging row is the audit artefact).
--
-- RLS: admin-only read. The orphan list contains real deadline titles
-- + project ids, so non-admins should not see it. The Slice 11 rule
-- editor surface gates this further.
-- Staging table: one row per legacy deadline that the backfill could
-- not bind to exactly one deadline_rule.
CREATE TABLE IF NOT EXISTS paliad.deadline_rule_backfill_orphans (
    id                  uuid        PRIMARY KEY DEFAULT gen_random_uuid(),

    -- The orphaned deadline. Deleting the deadline removes this row too.
    deadline_id         uuid        NOT NULL
                                    REFERENCES paliad.deadlines (id) ON DELETE CASCADE,

    -- Copies of the deadline context taken at insert time. project_id
    -- carries no FK here; it is a plain denormalised value.
    title               text        NOT NULL,
    project_id          uuid,
    proceeding_code     text,

    -- Why the row is an orphan. Only the listed values are accepted.
    reason              text        NOT NULL
                                    CHECK (reason IN ('no_match', 'ambiguous', 'no_project', 'manual_unbound')),

    -- What the matcher found: how many rules, and which ones.
    candidate_count     integer     NOT NULL DEFAULT 0,
    candidate_rule_ids  uuid[]      NOT NULL DEFAULT '{}',

    -- Both NULL on insert; filled in when the row is resolved. If the
    -- linked rule is deleted later, resolved_rule_id falls back to NULL.
    resolved_at         timestamptz,
    resolved_rule_id    uuid        REFERENCES paliad.deadline_rules (id) ON DELETE SET NULL,

    created_at          timestamptz NOT NULL DEFAULT now()
);
-- Lookup of staging rows by the deadline they belong to.
CREATE INDEX IF NOT EXISTS deadline_rule_backfill_orphans_deadline_id_idx
    ON paliad.deadline_rule_backfill_orphans
    USING btree (deadline_id);

-- Partial index: covers only rows that are still unresolved, ordered
-- newest first by created_at.
CREATE INDEX IF NOT EXISTS deadline_rule_backfill_orphans_unresolved_idx
    ON paliad.deadline_rule_backfill_orphans
    USING btree (created_at DESC)
    WHERE resolved_at IS NULL;
-- Catalog description of the staging table. The adjacent literals are
-- concatenated by the parser into one string; the split points are
-- layout only.
COMMENT ON TABLE paliad.deadline_rule_backfill_orphans IS
    'Slice 10 (mig 089/090, t-paliad-190): staging for legacy paliad.deadlines '
    'rows that the fuzzy-match backfill could not uniquely bind to a '
    'deadline_rule. '
    'Each row holds the deadline context + the candidate rule IDs the matcher '
    'found (0 → ''no_match''; ≥2 → ''ambiguous'') so a legal-review pass can '
    'hand-link without rerunning the match. '
    'resolved_at + resolved_rule_id flip when the admin orphan-review UI binds '
    'the row.';
-- Row-level security: reads are limited to global admins.
-- Only a SELECT policy is defined in this migration.
ALTER TABLE paliad.deadline_rule_backfill_orphans
    ENABLE ROW LEVEL SECURITY;

-- Drop-then-create so the migration can be re-applied without
-- tripping over an existing policy of the same name.
DROP POLICY IF EXISTS deadline_rule_backfill_orphans_select
    ON paliad.deadline_rule_backfill_orphans;

CREATE POLICY deadline_rule_backfill_orphans_select
    ON paliad.deadline_rule_backfill_orphans
    FOR SELECT
    USING (
        -- Visible only when the calling user (auth.uid()) has a
        -- paliad.users row whose global_role is 'global_admin'.
        EXISTS (
            SELECT 1
            FROM paliad.users AS admin_user
            WHERE admin_user.global_role = 'global_admin'
              AND admin_user.id = auth.uid()
        )
    );

View File

@@ -0,0 +1,30 @@
-- t-paliad-190 down — reverses 090_backfill_deadline_rule_id.up.sql.
--
-- What this does, in order:
--   1. Restores rule_id from the pre-mig snapshot
--      (paliad.deadlines_pre_089). The snapshot only holds rows that
--      had rule_id IS NULL, so "restore" means setting rule_id back
--      to NULL on the deadlines the backfill bound.
--   2. Deletes the UNRESOLVED orphan rows mig 090 wrote.
--   3. Drops the snapshot table.
--
-- What this deliberately keeps:
--   - Resolved orphan rows: they record legal-review work that should
--     not disappear on a code rollback.
--   - The rule_id of deadlines whose current rule_id is the one a
--     resolved orphan row recorded. Otherwise the surviving staging
--     row would claim a hand-link that step 1 had just wiped.
--
-- Robustness: the snapshot table may already be gone (the up-migration
-- notes that Slice 11 drops it once orphan resolution finishes). In
-- that case step 1 is skipped with a WARNING instead of aborting the
-- whole rollback on a missing relation; steps 2 and 3 still run.
--
-- This is a defensive rollback path; the migration itself is one-time
-- + idempotent, so re-running 090 after a down + up is safe.
SELECT set_config(
    'paliad.audit_reason',
    'rollback 090: NULL rule_id on deadlines mig 090 touched + drop pre-089 backup',
    true);

-- Step 1: restore rule_id from the snapshot, when the snapshot exists.
-- The UPDATE lives inside the IF so it is only planned when the
-- relation is actually there.
DO $$
BEGIN
    IF to_regclass('paliad.deadlines_pre_089') IS NULL THEN
        RAISE WARNING 'rollback 090: paliad.deadlines_pre_089 is missing; skipping rule_id restore (no snapshot to restore from)';
    ELSE
        UPDATE paliad.deadlines d
        SET rule_id = b.rule_id
        FROM paliad.deadlines_pre_089 b
        WHERE d.id = b.id
          -- Skip rows that already carry the snapshot value.
          AND d.rule_id IS DISTINCT FROM b.rule_id
          -- Keep hand-links: the current rule_id is the one a resolved
          -- staging row recorded.
          AND NOT EXISTS (
              SELECT 1
              FROM paliad.deadline_rule_backfill_orphans o
              WHERE o.deadline_id = d.id
                AND o.resolved_at IS NOT NULL
                AND o.resolved_rule_id = d.rule_id
          );
    END IF;
END $$;

-- Step 2: drop the unresolved orphan rows mig 090 wrote. Resolved rows
-- stay — a legal-review hand-link is real work that survives a code
-- rollback.
DELETE FROM paliad.deadline_rule_backfill_orphans
WHERE resolved_at IS NULL;

-- Step 3: the snapshot has served its purpose.
DROP TABLE IF EXISTS paliad.deadlines_pre_089;

View File

@@ -0,0 +1,320 @@
-- t-paliad-190 / Fristen Phase 3 Slice 10 — one-time fuzzy-match
-- backfill of paliad.deadlines.rule_id per design §3.I + m's Q10
-- ruling. Restores SmartTimeline's "anchor real deadlines into
-- projection" affordance on legacy data (1 of 26 deadlines currently
-- has rule_id populated; the SmartTimeline anchor flow needs the FK
-- to thread predicted dates off actuals).
--
-- Matching strategies (in priority order; first unique hit wins):
--
-- 1. rule_code-prefix extraction from title. Titles like
-- "RoP.023 — Klageerwiderung" carry the rule citation in the
-- prefix; we extract the leading citation token and JOIN on
-- deadline_rules.rule_code = extracted. When the rule_code
-- resolves to multiple rules (e.g. RoP.023 → 2 rules — DE
-- Klageerwiderung + EN Statement of Defence), the remaining
-- title fragment narrows by name ILIKE.
--
-- 2. exact title match against rule.name OR rule.name_en (LOWER).
-- Mostly hits common Pipeline-A names ("Antrag auf
-- Schadensbemessung" → 1 unique rule); ambiguous for shared
-- names like "Klageerwiderung" (8 rules across proceedings).
--
-- 3. deadline_concepts.aliases match. Each concept carries a
-- text[] of canonical aliases; if LOWER(d.title) is in the
-- aliases array, we pick the rules with that concept_id. Today
-- the alias coverage is thin (no aliases for "Schutzschrift"
-- etc.), but the strategy is shaped so a future seed lights
-- it up.
--
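-- For each deadline, we collect all candidates across the strategies
-- (strategy 1 runs as two matchers: code + title tail, and code only)
-- and:
-- - the highest-priority strategy that yields exactly 1 distinct
-- rule wins → UPDATE rule_id (matched), even if a wider strategy
-- found additional rules.
-- - no strategy is unique and there are 0 candidates overall →
-- orphan with reason='no_match'.
-- - no strategy is unique and there are ≥2 candidates overall
-- (deduped by rule.id) → orphan with reason='ambiguous',
-- candidate_rule_ids populated so a legal-review pass can hand-pick.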
--
-- Per-project narrowing by proceeding_type_id is the design's primary
-- discriminator. In the live corpus today all 11 projects have
-- proceeding_type_id IS NULL (Slice 5 retired litigation codes from
-- project-binding; the fristenrechner-side rebinding hasn't happened),
-- so this slice can't use proceeding-narrowing on production data.
-- The targets CTE carries proceeding_type_id along, but none of the
-- matchers filters on it yet: the narrowing predicate still has to
-- be added once proceeding_type_id starts getting populated. The
-- orphan INSERT does record the proceeding code (when there is one)
-- so reviewers can see it.
--
-- Defensive backup: paliad.deadlines is snapshotted to
-- paliad.deadlines_pre_089 before the UPDATE so an operator can
-- restore individual rule_id values if a hand-link goes wrong post
-- mig. The table is dropped in the down-migration; Slice 11 (rule
-- editor) can drop it once orphan resolution finishes in prod.
--
-- Idempotency: WHERE d.rule_id IS NULL on the UPDATE; the orphan
-- INSERT gets ON CONFLICT DO NOTHING-like behaviour from a NOT EXISTS
-- guard on unresolved rows. A real ON CONFLICT clause is not used:
-- there is no unique constraint on deadline_id alone, because a
-- deadline may legitimately get re-orphaned after a resolution
-- rollback. Re-running 090 on the same corpus must still not
-- duplicate orphan rows for unresolved deadlines.
--
-- Hard assertion at end: SUM(matched) + SUM(orphans for current
-- unresolved deadlines) ≥ COUNT(deadlines processed). Strict equality
-- doesn't hold cleanly on a re-run (the orphan table may already
-- carry prior rows from a partial run), so the assertion is "at
-- least one row exists per unresolved deadline".
-- Label this migration's writes with an audit reason. The final
-- argument (is_local = true) limits the setting to the current
-- transaction. NOTE(review): the consumer of paliad.audit_reason is
-- not visible in this file — presumably an audit trigger; confirm.
SELECT set_config(
    'paliad.audit_reason',
    'mig 090: one-time fuzzy-match backfill of deadlines.rule_id per design §3.I / Q10',
    true  -- is_local
);
-- =============================================================================
-- 1. Snapshot first, mutate later.
--    Copies every deadline that is about to be considered by the
--    backfill (project-bound, rule_id still NULL). IF NOT EXISTS means a
--    re-run keeps the snapshot from the first run instead of
--    overwriting it.
-- =============================================================================
CREATE TABLE IF NOT EXISTS paliad.deadlines_pre_089 AS
SELECT
    id,
    project_id,
    title,
    rule_id,
    rule_code,
    status,
    due_date,
    completed_at,
    created_at,
    updated_at
FROM paliad.deadlines
WHERE project_id IS NOT NULL
  AND rule_id IS NULL;

COMMENT ON TABLE paliad.deadlines_pre_089 IS
    'Snapshot of paliad.deadlines (id, rule_id-relevant columns) taken '
    'before mig 090 ran the fuzzy-match backfill. Lets an operator '
    'restore individual rule_id values if a hand-link goes wrong. '
    'Slice 11 (rule editor) drops this once orphan resolution finishes.';
-- =============================================================================
-- 2. Build the candidate set in a temp table so the per-deadline
--    aggregation + UPDATE + orphan INSERT can share the work without
--    re-evaluating the matchers.
--
--    Output: one row per (deadline_id, rule_id, strategy). The same
--    rule can appear under several strategies for one deadline; the
--    UNION only removes exact duplicate triples.
--
--    ON COMMIT DROP: the table lives until the surrounding transaction
--    commits, so the migration is expected to run inside one.
-- =============================================================================
CREATE TEMP TABLE _mig_090_candidates ON COMMIT DROP AS
WITH targets AS (
    -- Every NULL-rule_id deadline still bound to a project. project_id
    -- is required because we want at least the SmartTimeline anchor
    -- flow to work; un-bound deadlines (rare) are out of scope.
    --
    -- proceeding_type_id is carried along but no matcher below filters
    -- on it.
    SELECT d.id AS deadline_id,
           d.title AS title,
           d.project_id,
           p.proceeding_type_id,
           -- Leading citation token like "RoP.023" or "R.49".
           -- substring(... FROM pattern) returns the first capturing
           -- group when the whole title matches, NULL otherwise:
           --   "RoP.023 — Klageerwiderung" → "RoP.023"
           --   "RoP.029.a"                 → "RoP.029.a"
           --   "Klageerwiderung"           → NULL
           -- (The earlier NULLIF(regexp_replace(...), d.title) form
           -- returned NULL for bare-code titles, because the
           -- replacement result equals the title itself, so the
           -- code-only matcher never saw them.)
           substring(d.title FROM '^\s*((?:RoP|R|Art|§)\.?\s*[0-9]+(?:\.[a-z0-9]+)*)\s*(?:[—–-].*)?$') AS code_token,
           -- Strip the leading citation + separator to surface the
           -- title's name fragment. "RoP.023 — Klageerwiderung" →
           -- "Klageerwiderung"; "RoP.029.a" (no suffix) → NULL; plain
           -- "Klageerwiderung" → "Klageerwiderung" unchanged.
           NULLIF(trim(regexp_replace(d.title, '^\s*(?:RoP|R|Art|§)\.?\s*[0-9]+(?:\.[a-z0-9]+)*\s*[—–-]?\s*', '')), '') AS title_tail
    FROM paliad.deadlines d
    LEFT JOIN paliad.projects p ON p.id = d.project_id
    WHERE d.rule_id IS NULL
      AND d.project_id IS NOT NULL
),
by_code_and_tail AS (
    -- Strategy 1a (narrowest): rule_code AND name (DE or EN) matches
    -- the title's tail fragment. Handles "RoP.023 — Klageerwiderung"
    -- where the bare code matches 2 rules (DE Klageerwiderung +
    -- EN Statement of Defence); the tail picks the DE one.
    SELECT t.deadline_id, dr.id AS rule_id, 'rule_code_and_tail' AS strategy
    FROM targets t
    JOIN paliad.deadline_rules dr
      ON dr.rule_code = trim(t.code_token)
     AND dr.is_active = true
     AND (LOWER(dr.name) = LOWER(t.title_tail)
          OR LOWER(dr.name_en) = LOWER(t.title_tail))
    WHERE t.code_token IS NOT NULL
      AND t.title_tail IS NOT NULL
),
by_code AS (
    -- Strategy 1b: rule_code prefix only. Handles bare-code titles
    -- ("RoP.029.a" maps to 1 unique rule regardless of suffix) and
    -- the fallback when by_code_and_tail returns 0 (suffix doesn't
    -- match — e.g. "RoP.029.a — Replik" where the suffix "Replik"
    -- doesn't appear in any RoP.029.a rule's name; pick the
    -- code-only match anyway).
    SELECT t.deadline_id, dr.id AS rule_id, 'rule_code' AS strategy
    FROM targets t
    JOIN paliad.deadline_rules dr
      ON dr.rule_code = trim(t.code_token)
     AND dr.is_active = true
    WHERE t.code_token IS NOT NULL
),
by_name AS (
    -- Strategy 2: exact (case-insensitive) match of the FULL title
    -- against rule.name or rule.name_en. The widest matcher; for
    -- shared names like "Klageerwiderung" (8 rules across
    -- proceedings) this is ambiguous, but for unique titles like
    -- "Antrag auf Schadensbemessung" (1 rule) it nails the match.
    SELECT t.deadline_id, dr.id AS rule_id, 'name_exact' AS strategy
    FROM targets t
    JOIN paliad.deadline_rules dr
      ON (LOWER(dr.name) = LOWER(t.title)
          OR LOWER(dr.name_en) = LOWER(t.title))
     AND dr.is_active = true
),
by_alias AS (
    -- Strategy 3: concept aliases. deadline_concepts.aliases is a
    -- text[] of canonical synonyms; if the deadline title appears
    -- in that array (case-insensitively), every active rule on the
    -- concept is a candidate. Today's alias coverage is thin (the
    -- seed for Slice 12 is the expected source of new aliases), but
    -- the strategy is in place so future seeds light it up without
    -- a migration.
    SELECT t.deadline_id, dr.id AS rule_id, 'concept_alias' AS strategy
    FROM targets t
    JOIN paliad.deadline_concepts dc
      ON LOWER(t.title) = ANY(SELECT LOWER(a) FROM unnest(dc.aliases) a)
    JOIN paliad.deadline_rules dr
      ON dr.concept_id = dc.id
     AND dr.is_active = true
)
SELECT deadline_id, rule_id, strategy
FROM by_code_and_tail
UNION
SELECT deadline_id, rule_id, strategy
FROM by_code
UNION
SELECT deadline_id, rule_id, strategy
FROM by_name
UNION
SELECT deadline_id, rule_id, strategy
FROM by_alias;
-- =============================================================================
-- 3. Per-deadline roll-ups of the candidate set.
--
--    _mig_090_strategy_counts: for each (deadline, strategy), how many
--      distinct rules that strategy proposed, plus the smallest rule id
--      (as text) — which IS the rule whenever the count is 1.
--    _mig_090_chosen: per deadline, the rule from the highest-priority
--      strategy that proposed exactly one rule. Priority, narrowest
--      first: rule_code_and_tail > rule_code > name_exact >
--      concept_alias. Deadlines with no single-rule strategy get no row.
--    _mig_090_aggregated: per deadline, every distinct rule any strategy
--      proposed — the wide view an editor wants when reviewing an
--      orphan.
-- =============================================================================
CREATE TEMP TABLE _mig_090_strategy_counts ON COMMIT DROP AS
SELECT cand.deadline_id,
       cand.strategy,
       count(DISTINCT cand.rule_id) AS n,
       MIN(cand.rule_id::text)      AS first_rule_text
FROM _mig_090_candidates AS cand
GROUP BY cand.deadline_id, cand.strategy;

CREATE TEMP TABLE _mig_090_chosen ON COMMIT DROP AS
SELECT ranked.deadline_id,
       ranked.rule_id,
       ranked.chosen_strategy
FROM (
    SELECT sc.deadline_id,
           sc.first_rule_text::uuid AS rule_id,
           sc.strategy              AS chosen_strategy,
           -- 1 = the best-ranked single-rule strategy for this deadline.
           row_number() OVER (
               PARTITION BY sc.deadline_id
               ORDER BY COALESCE(prio.priority, 5)
           ) AS pick
    FROM _mig_090_strategy_counts AS sc
    -- Strategy → priority map; anything not listed ranks last (5).
    LEFT JOIN (
        VALUES ('rule_code_and_tail', 1),
               ('rule_code',          2),
               ('name_exact',         3),
               ('concept_alias',      4)
    ) AS prio (strategy, priority)
      ON prio.strategy = sc.strategy
    WHERE sc.n = 1
) AS ranked
WHERE ranked.pick = 1;

CREATE TEMP TABLE _mig_090_aggregated ON COMMIT DROP AS
SELECT cand.deadline_id,
       count(DISTINCT cand.rule_id)     AS n_candidates,
       array_agg(DISTINCT cand.rule_id) AS all_rule_ids
FROM _mig_090_candidates AS cand
GROUP BY cand.deadline_id;
-- =============================================================================
-- 4. Bind the winners: write the chosen rule onto each deadline.
--    The rule_id IS NULL predicate keeps already-bound deadlines
--    untouched, which also makes a re-run a no-op for them.
-- =============================================================================
UPDATE paliad.deadlines AS dl
SET rule_id = pick.rule_id
FROM _mig_090_chosen AS pick
WHERE dl.rule_id IS NULL
  AND dl.id = pick.deadline_id;
-- =============================================================================
-- 5. Stage the leftovers: every project-bound deadline that still has
--    rule_id IS NULL after step 4 becomes an orphan row.
--    Re-run guard: a deadline that already has an unresolved orphan row
--    is skipped — that existing row stays the source of truth until
--    the admin UI sets resolved_at.
-- =============================================================================
WITH unmatched AS (
    -- Deadlines the UPDATE above did not bind, with their project's
    -- proceeding type (NULL when the project has none).
    SELECT d.id AS deadline_id,
           d.title,
           d.project_id,
           p.proceeding_type_id
    FROM paliad.deadlines d
    LEFT JOIN paliad.projects p ON p.id = d.project_id
    WHERE d.project_id IS NOT NULL
      AND d.rule_id IS NULL
)
INSERT INTO paliad.deadline_rule_backfill_orphans
    (deadline_id, title, project_id, proceeding_code, reason,
     candidate_count, candidate_rule_ids)
SELECT u.deadline_id,
       u.title,
       u.project_id,
       pt.code AS proceeding_code,
       -- No aggregate row (or a zero count) means nothing matched;
       -- more than one candidate means the matchers could not decide.
       CASE
           WHEN COALESCE(agg.n_candidates, 0) = 0 THEN 'no_match'
           WHEN agg.n_candidates > 1 THEN 'ambiguous'
       END AS reason,
       COALESCE(agg.n_candidates, 0),
       COALESCE(agg.all_rule_ids, ARRAY[]::uuid[])
FROM unmatched u
LEFT JOIN paliad.proceeding_types pt ON pt.id = u.proceeding_type_id
LEFT JOIN _mig_090_aggregated agg ON agg.deadline_id = u.deadline_id
WHERE NOT EXISTS (
    SELECT 1
    FROM paliad.deadline_rule_backfill_orphans existing
    WHERE existing.resolved_at IS NULL
      AND existing.deadline_id = u.deadline_id
);
-- =============================================================================
-- 6. Hard assertion: every NULL-rule_id deadline (with project) is
--    either resolved (rule_id IS NOT NULL post-mig) or carries an
--    UNRESOLVED orphan row.
--
--    The unaccounted check requires resolved_at IS NULL on the orphan
--    row. A deadline whose only staging rows are resolved but whose
--    rule_id is NULL again is NOT accounted for: step 5 is supposed to
--    have written a fresh unresolved row for it, and accepting any
--    orphan row would hide a failure of that step.
--
--    The three other counters are informational only (NOTICE output).
-- =============================================================================
DO $$
DECLARE
    n_processed   int;  -- project-bound deadlines that are bound or staged
    n_matched     int;  -- snapshot rows that now carry a rule_id
    n_orphaned    int;  -- distinct deadlines with an unresolved orphan row
    n_unaccounted int;  -- unbound deadlines with no unresolved orphan row
BEGIN
    SELECT count(*) INTO n_processed
    FROM paliad.deadlines
    WHERE project_id IS NOT NULL
      AND (rule_id IS NOT NULL OR EXISTS (
          SELECT 1 FROM paliad.deadline_rule_backfill_orphans o
          WHERE o.deadline_id = paliad.deadlines.id
      ));

    SELECT count(*) INTO n_matched
    FROM paliad.deadlines d
    JOIN paliad.deadlines_pre_089 b ON b.id = d.id
    WHERE d.rule_id IS NOT NULL;

    SELECT count(DISTINCT deadline_id) INTO n_orphaned
    FROM paliad.deadline_rule_backfill_orphans
    WHERE resolved_at IS NULL;

    SELECT count(*) INTO n_unaccounted
    FROM paliad.deadlines d
    WHERE d.rule_id IS NULL
      AND d.project_id IS NOT NULL
      AND NOT EXISTS (
          SELECT 1 FROM paliad.deadline_rule_backfill_orphans o
          WHERE o.deadline_id = d.id
            AND o.resolved_at IS NULL
      );

    RAISE NOTICE 'mig 090: processed=% matched=% orphaned=% unaccounted=%',
        n_processed, n_matched, n_orphaned, n_unaccounted;

    -- Abort (and roll back) the migration if anything slipped through.
    IF n_unaccounted > 0 THEN
        RAISE EXCEPTION 'mig 090: % deadlines have rule_id IS NULL and no unresolved orphan row — '
            'matcher missed them. Investigate the candidate query.',
            n_unaccounted;
    END IF;
END $$;