-- t-paliad-190 / Fristen Phase 3 Slice 10 — one-time fuzzy-match
-- backfill of paliad.deadlines.rule_id per design §3.I + m's Q10
-- ruling. Restores SmartTimeline's "anchor real deadlines into
-- projection" affordance on legacy data (1 of 26 deadlines currently
-- has rule_id populated; the SmartTimeline anchor flow needs the FK
-- to thread predicted dates off actuals).
--
-- Matching strategies (in priority order; first unique hit wins):
--
--   1. rule_code-prefix extraction from title. Titles like
--      "RoP.023 — Klageerwiderung" carry the rule citation in the
--      prefix; we extract the leading citation token and JOIN on
--      deadline_rules.rule_code = extracted. When the rule_code
--      resolves to multiple rules (e.g. RoP.023 → 2 rules — DE
--      Klageerwiderung + EN Statement of Defence), the remaining
--      title fragment narrows by exact (case-insensitive) name match.
--
--   2. exact title match against rule.name OR rule.name_en (LOWER).
--      Mostly hits common Pipeline-A names ("Antrag auf
--      Schadensbemessung" → 1 unique rule); ambiguous for shared
--      names like "Klageerwiderung" (8 rules across proceedings).
--
--   3. deadline_concepts.aliases match. Each concept carries a
--      text[] of canonical aliases; if LOWER(d.title) is in the
--      aliases array, we pick the rules with that concept_id. Today
--      the alias coverage is thin (no aliases for "Schutzschrift"
--      etc.), but the strategy is shaped so a future seed lights
--      it up.
--
-- For each deadline, we collect all candidates across the
-- strategies, dedupe by rule.id, and:
--   - the highest-priority strategy that yields exactly 1 distinct
--     candidate → UPDATE rule_id (matched), even when wider
--     strategies found additional rules.
--   - no strategy is unique and 0 candidates overall → orphan with
--     reason='no_match'.
--   - no strategy is unique and ≥2 candidates overall → orphan with
--     reason='ambiguous', candidate_rule_ids populated so a
--     legal-review pass can hand-pick.
--
-- Per-project narrowing by proceeding_type_id is the design's primary
-- discriminator. In the live corpus today all 11 projects have
-- proceeding_type_id IS NULL (Slice 5 retired litigation codes from
-- project-binding; the fristenrechner-side rebinding hasn't happened),
-- so this slice can't use proceeding-narrowing on production data.
-- NOTE(review): the targets CTE carries proceeding_type_id per
-- deadline, but none of the matchers in this file filters on it, so
-- the migration does NOT self-tune once proceeding_type_id starts
-- getting populated — a narrowing predicate would have to be added.
--
-- Defensive backup: the NULL-rule_id, project-bound rows of
-- paliad.deadlines are snapshotted to paliad.deadlines_pre_089 before
-- the UPDATE so an operator can restore individual rule_id values if
-- a hand-link goes wrong post mig. The table is dropped in the
-- down-migration; Slice 11 (rule editor) can drop it once orphan
-- resolution finishes in prod.
-- NOTE(review): the snapshot name says 089 while this migration
-- identifies itself as 090 — confirm the name is intentional.
--
-- Idempotency: WHERE d.rule_id IS NULL on the UPDATE; the orphan
-- INSERT is protected by a NOT EXISTS guard rather than ON CONFLICT
-- (no unique constraint on deadline_id alone — a deadline may
-- legitimately get re-orphaned after a resolution rollback; but
-- re-running 090 on the same corpus must not duplicate orphan rows
-- for unresolved deadlines).
--
-- Hard assertion at end: SUM(matched) + SUM(orphans for current
-- unresolved deadlines) ≥ COUNT(deadlines processed). Strict equality
-- doesn't hold cleanly on a re-run (the orphan table may already
-- carry prior rows from a partial run), so the assertion is "at
-- least one row exists per unresolved deadline".

-- Tag the work in this migration with a human-readable reason.
-- The third argument (is_local = true) limits the setting to the
-- current transaction. Presumably an audit trigger reads
-- paliad.audit_reason — verify against the audit machinery.
SELECT set_config(
    'paliad.audit_reason',
    'mig 090: one-time fuzzy-match backfill of deadlines.rule_id per design §3.I / Q10',
    true);

-- =============================================================================
-- 1. Defensive backup before any UPDATE.
-- =============================================================================

-- Snapshot only the rows this migration can touch: project-bound
-- deadlines whose rule_id is still NULL. IF NOT EXISTS means a re-run
-- keeps the snapshot from the first run instead of overwriting it.
CREATE TABLE IF NOT EXISTS paliad.deadlines_pre_089 AS
SELECT id, project_id, title, rule_id, rule_code, status, due_date,
       completed_at, created_at, updated_at
  FROM paliad.deadlines
 WHERE rule_id IS NULL
   AND project_id IS NOT NULL;

-- The adjacent string literals below are concatenated by the lexer;
-- that only works while each literal stays on its own line.
COMMENT ON TABLE paliad.deadlines_pre_089 IS
    'Snapshot of paliad.deadlines (id, rule_id-relevant columns) taken '
    'before mig 090 ran the fuzzy-match backfill. Lets an operator '
    'restore individual rule_id values if a hand-link goes wrong. '
    'Slice 11 (rule editor) drops this once orphan resolution finishes.';

-- =============================================================================
-- 2. Build the candidate set in a temp table so the per-deadline
--    aggregation + UPDATE + orphan INSERT can share the work without
--    re-evaluating the matchers.
--
--    Output: one row per (deadline_id, rule_id, strategy).
-- =============================================================================

CREATE TEMP TABLE _mig_090_candidates ON COMMIT DROP AS
WITH targets AS (
    -- Every NULL-rule_id deadline still bound to a project. project_id
    -- is required because we want at least the SmartTimeline anchor
    -- flow to work; un-bound deadlines (rare) are out of scope.
    SELECT d.id              AS deadline_id,
           d.title           AS title,
           d.project_id,
           p.proceeding_type_id,
           -- Extract a leading citation token like "RoP.023" or
           -- "R.49" from the title. Captures the rule_code prefix
           -- on titles that carry one ("RoP.023 — Klageerwiderung")
           -- AND on bare-code titles ("RoP.029.a"); NULL on plain
           -- titles.
           --
           -- substring(... FROM pattern) returns the first capture
           -- group on a match and NULL otherwise. The previous
           -- NULLIF(regexp_replace(title, pattern, '\1'), title) form
           -- could not tell "no match" from "the title IS the code":
           -- for a bare-code title the replacement equals the title,
           -- so the token was nulled and strategy 1b never fired.
           substring(d.title FROM
               '^\s*((?:RoP|R|Art|§)\.?\s*[0-9]+(?:\.[a-z0-9]+)*)\s*(?:[—–-].*)?$')
                             AS code_token,
           -- Strip the leading citation + separator to surface the
           -- title's name fragment. "RoP.023 — Klageerwiderung" →
           -- "Klageerwiderung"; "RoP.029.a" (no suffix) → NULL (the
           -- empty remainder is nulled); plain "Klageerwiderung" →
           -- "Klageerwiderung" unchanged.
           NULLIF(trim(regexp_replace(d.title,
               '^\s*(?:RoP|R|Art|§)\.?\s*[0-9]+(?:\.[a-z0-9]+)*\s*[—–-]?\s*',
               '')), '')     AS title_tail
      FROM paliad.deadlines d
      LEFT JOIN paliad.projects p ON p.id = d.project_id
     WHERE d.rule_id IS NULL
       AND d.project_id IS NOT NULL
),
by_code_and_tail AS (
    -- Strategy 1a (narrowest): rule_code AND name (DE or EN) matches
    -- the title's tail fragment. Handles "RoP.023 — Klageerwiderung"
    -- where the bare code matches 2 rules (DE Klageerwiderung +
    -- EN Statement of Defence); the tail picks the DE one.
    SELECT t.deadline_id,
           dr.id                AS rule_id,
           'rule_code_and_tail' AS strategy
      FROM targets t
      JOIN paliad.deadline_rules dr
        ON dr.rule_code = trim(t.code_token)
       AND dr.is_active = true
       AND (LOWER(dr.name) = LOWER(t.title_tail)
            OR LOWER(dr.name_en) = LOWER(t.title_tail))
     WHERE t.code_token IS NOT NULL
       AND t.title_tail IS NOT NULL
),
by_code AS (
    -- Strategy 1b: rule_code prefix only. Handles bare-code titles
    -- ("RoP.029.a" maps to 1 unique rule regardless of suffix) and
    -- the fallback when by_code_and_tail returns 0 (suffix doesn't
    -- match — e.g. "RoP.029.a — Replik" where the suffix "Replik"
    -- doesn't appear in any RoP.029.a rule's name; pick the
    -- code-only match anyway).
    SELECT t.deadline_id,
           dr.id       AS rule_id,
           'rule_code' AS strategy
      FROM targets t
      JOIN paliad.deadline_rules dr
        ON dr.rule_code = trim(t.code_token)
       AND dr.is_active = true
     WHERE t.code_token IS NOT NULL
),
by_name AS (
    -- Strategy 2: exact title match against rule.name or rule.name_en.
    -- The widest matcher; for shared names like "Klageerwiderung"
    -- (8 rules across proceedings) this is ambiguous, but for
    -- unique titles like "Antrag auf Schadensbemessung" (1 rule) it
    -- nails the match.
    SELECT t.deadline_id,
           dr.id        AS rule_id,
           'name_exact' AS strategy
      FROM targets t
      JOIN paliad.deadline_rules dr
        ON (LOWER(dr.name) = LOWER(t.title)
            OR LOWER(dr.name_en) = LOWER(t.title))
       AND dr.is_active = true
),
by_alias AS (
    -- Strategy 3: concept aliases. deadline_concepts.aliases is a
    -- text[] of canonical synonyms; if the deadline title appears
    -- in that array, every active rule on the concept is a candidate.
    -- Today's alias coverage is thin (the seed for Slice 12 is the
    -- expected source of new aliases), but the strategy is in place
    -- so future seeds light it up without a migration.
    SELECT t.deadline_id,
           dr.id           AS rule_id,
           'concept_alias' AS strategy
      FROM targets t
      JOIN paliad.deadline_concepts dc
        ON LOWER(t.title) = ANY(SELECT LOWER(a) FROM unnest(dc.aliases) a)
      JOIN paliad.deadline_rules dr
        ON dr.concept_id = dc.id
       AND dr.is_active = true
)
SELECT deadline_id, rule_id, strategy FROM by_code_and_tail
UNION
SELECT deadline_id, rule_id, strategy FROM by_code
UNION
SELECT deadline_id, rule_id, strategy FROM by_name
UNION
SELECT deadline_id, rule_id, strategy FROM by_alias;

-- =============================================================================
-- 3. Aggregate per-deadline candidate counts by strategy + pick the
--    narrowest-unique-match per deadline. Strategy priority (narrowest
--    first): rule_code_and_tail > rule_code > name_exact > concept_alias.
--    A deadline's "chosen" rule comes from the highest-priority strategy
--    that yields exactly 1 distinct candidate.
-- =============================================================================

-- Per (deadline, strategy): how many distinct rules that strategy
-- proposed, plus the smallest rule id rendered as text. The text
-- round-trip is only there so MIN() has something to aggregate; the
-- value is meaningful solely when n = 1.
CREATE TEMP TABLE _mig_090_strategy_counts ON COMMIT DROP AS
SELECT
    cand.deadline_id,
    cand.strategy,
    count(DISTINCT cand.rule_id) AS n,
    MIN(CAST(cand.rule_id AS text)) AS first_rule_text
FROM _mig_090_candidates AS cand
GROUP BY
    cand.deadline_id,
    cand.strategy;

-- One row per deadline: the rule proposed by the best-ranked strategy
-- that produced exactly one distinct rule. Strategies missing from
-- the ranking list sort after all listed ones (rank 5).
CREATE TEMP TABLE _mig_090_chosen ON COMMIT DROP AS
SELECT DISTINCT ON (sc.deadline_id)
    sc.deadline_id,
    CAST(sc.first_rule_text AS uuid) AS rule_id,
    sc.strategy AS chosen_strategy
FROM _mig_090_strategy_counts AS sc
LEFT JOIN (
    VALUES
        ('rule_code_and_tail', 1),
        ('rule_code', 2),
        ('name_exact', 3),
        ('concept_alias', 4)
) AS ranking (strategy, priority)
    ON ranking.strategy = sc.strategy
WHERE sc.n = 1
ORDER BY
    sc.deadline_id,
    COALESCE(ranking.priority, 5);

-- The widest view of each deadline: every distinct rule any strategy
-- proposed. Feeds the orphan log, where a reviewer wants to see all
-- plausible rules rather than only the narrowest strategy's result.
CREATE TEMP TABLE _mig_090_aggregated ON COMMIT DROP AS
SELECT
    cand.deadline_id,
    count(DISTINCT cand.rule_id) AS n_candidates,
    array_agg(DISTINCT cand.rule_id) AS all_rule_ids
FROM _mig_090_candidates AS cand
GROUP BY cand.deadline_id;

-- =============================================================================
-- 4. UPDATE deadlines.rule_id for the chosen set (narrowest-unique match).
-- =============================================================================

-- The rule_id IS NULL predicate keeps a re-run from overwriting a
-- link that was set in the meantime.
UPDATE paliad.deadlines AS dl
SET rule_id = pick.rule_id
FROM _mig_090_chosen AS pick
WHERE pick.deadline_id = dl.id
  AND dl.rule_id IS NULL;

-- =============================================================================
-- 5. Log every deadline that didn't get a unique match as an orphan.
--    Skip rows that already have a non-resolved orphan entry (re-run
--    guard) — the existing entry is the source-of-truth until the
--    admin UI flips resolved_at.
-- =============================================================================

-- Runs after the UPDATE, so "unmatched" is simply every project-bound
-- deadline whose rule_id is still NULL.
WITH unmatched AS (
    SELECT
        dl.id AS deadline_id,
        dl.title,
        dl.project_id,
        prj.proceeding_type_id
    FROM paliad.deadlines AS dl
    LEFT JOIN paliad.projects AS prj
        ON prj.id = dl.project_id
    WHERE dl.rule_id IS NULL
      AND dl.project_id IS NOT NULL
)
INSERT INTO paliad.deadline_rule_backfill_orphans
    (deadline_id, title, project_id, proceeding_code,
     reason, candidate_count, candidate_rule_ids)
SELECT
    u.deadline_id,
    u.title,
    u.project_id,
    ptype.code AS proceeding_code,
    -- No candidates at all → 'no_match'; several → 'ambiguous'.
    -- A count of exactly 1 falls through to NULL, as before; such a
    -- deadline is expected to have been linked by the UPDATE already.
    CASE
        WHEN COALESCE(agg.n_candidates, 0) = 0 THEN 'no_match'
        WHEN agg.n_candidates > 1 THEN 'ambiguous'
    END AS reason,
    COALESCE(agg.n_candidates, 0),
    COALESCE(agg.all_rule_ids, ARRAY[]::uuid[])
FROM unmatched AS u
LEFT JOIN _mig_090_aggregated AS agg
    ON agg.deadline_id = u.deadline_id
LEFT JOIN paliad.proceeding_types AS ptype
    ON ptype.id = u.proceeding_type_id
-- Re-run guard: a deadline that still has an open (unresolved) orphan
-- entry is left alone; one whose entries are all resolved gets a
-- fresh row.
WHERE NOT EXISTS (
    SELECT 1
    FROM paliad.deadline_rule_backfill_orphans AS open_orphan
    WHERE open_orphan.resolved_at IS NULL
      AND open_orphan.deadline_id = u.deadline_id
);

-- =============================================================================
-- 6. Hard assertion: every NULL-rule_id deadline (with project) is
--    either resolved (rule_id IS NOT NULL post-mig) or carries an
--    unresolved orphan row.
-- =============================================================================

-- Post-condition check. Emits a NOTICE with summary counters and
-- aborts the migration (RAISE EXCEPTION) when any project-bound
-- deadline is left with rule_id IS NULL and no open orphan entry.
DO $$
DECLARE
    n_processed   int;  -- project-bound deadlines that are linked or have any orphan row
    n_matched     int;  -- snapshot rows (NULL rule_id pre-mig) that now carry a rule_id
    n_orphaned    int;  -- distinct deadlines with an unresolved orphan row
    n_unaccounted int;  -- NULL-rule_id deadlines with no unresolved orphan row
BEGIN
    -- Informational only; includes deadlines that already had a
    -- rule_id before this migration ran.
    SELECT count(*) INTO n_processed
      FROM paliad.deadlines
     WHERE project_id IS NOT NULL
       AND (rule_id IS NOT NULL
            OR EXISTS (
                SELECT 1 FROM paliad.deadline_rule_backfill_orphans o
                 WHERE o.deadline_id = paliad.deadlines.id
            ));

    -- Rows in the pre-migration snapshot had rule_id IS NULL, so any
    -- of them that is linked now was linked after the snapshot.
    SELECT count(*) INTO n_matched
      FROM paliad.deadlines d
      JOIN paliad.deadlines_pre_089 b ON b.id = d.id
     WHERE d.rule_id IS NOT NULL;

    SELECT count(DISTINCT deadline_id) INTO n_orphaned
      FROM paliad.deadline_rule_backfill_orphans
     WHERE resolved_at IS NULL;

    -- Only an UNRESOLVED orphan row accounts for a still-unlinked
    -- deadline. A deadline whose orphan rows are all resolved while
    -- rule_id is NULL again (resolution rollback) must have received
    -- a fresh row from the orphan INSERT; without the resolved_at
    -- filter a stale resolved row would mask a missing one.
    SELECT count(*) INTO n_unaccounted
      FROM paliad.deadlines d
     WHERE d.rule_id IS NULL
       AND d.project_id IS NOT NULL
       AND NOT EXISTS (
           SELECT 1 FROM paliad.deadline_rule_backfill_orphans o
            WHERE o.deadline_id = d.id
              AND o.resolved_at IS NULL
       );

    RAISE NOTICE 'mig 090: processed=% matched=% orphaned=% unaccounted=%',
        n_processed, n_matched, n_orphaned, n_unaccounted;

    IF n_unaccounted > 0 THEN
        RAISE EXCEPTION
            'mig 090: % deadlines have rule_id IS NULL and no orphan row — '
            'matcher missed them. Investigate the candidate query.',
            n_unaccounted;
    END IF;
END $$;