From 09615ec48ee06a7d5fc91d6fb36ea4c272196690 Mon Sep 17 00:00:00 2001 From: mAi Date: Fri, 15 May 2026 01:37:57 +0200 Subject: [PATCH] =?UTF-8?q?feat(t-paliad-190):=20mig=20090=20=E2=80=94=20o?= =?UTF-8?q?ne-time=20fuzzy-match=20backfill?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 3 Slice 10 Step I (design §3.I + m's Q10 ruling). Binds legacy paliad.deadlines.rule_id to deadline_rules.id via priority-ordered fuzzy matching; ambiguous + no-match rows log to the orphan staging table (mig 089). Matching strategies (highest priority first; first unique hit wins): 1. rule_code_and_tail — title's leading citation token AND its post-separator name fragment match a rule. Handles "RoP.023 — Klageerwiderung" where the bare code matches 2 rules (DE Klageerwiderung + EN Statement of Defence); the tail picks the right one. 2. rule_code only — bare rule_code from the title prefix. Handles "RoP.029.a — Replik" where RoP.029.a maps to a single rule regardless of suffix (the title's "Replik" doesn't match the rule's actual name but the code is unique). 3. name_exact — full title equals rule.name or rule.name_en (LOWER). Catches "Antrag auf Schadensbemessung" (1 unique rule); ambiguous for shared names like Klageerwiderung (8 candidates). 4. concept_alias — title appears in deadline_concepts.aliases. Thin coverage today; Slice 12 orphan-seed will populate it. Per-deadline aggregation: - Strategy with n_candidates = 1 wins. Priority chain rule_code_and_tail > rule_code > name_exact > concept_alias. - Ambiguous (≥2 across all strategies) → orphan reason='ambiguous' with the full candidate_rule_ids list. - 0 candidates → orphan reason='no_match'. 
Predicted production outcome (verified via supabase MCP pre-write): - 3 of 25 deadlines (12%) get a unique match: "RoP.023 — Klageerwiderung" via rule_code_and_tail "RoP.029.a — Replik" via rule_code "Antrag auf Schadensbemessung" via name_exact - 15 of 25 deadlines (60%) → orphan reason='ambiguous' (common titles like Klageerwiderung × 4, Duplik × 4, Replik × 4 across multiple proceedings). - 7 of 25 deadlines (28%) → orphan reason='no_match' (free-text titles like "Call me", "Schutzschrift", "Validierungsfrist EP→DE", "Schriftsatz nach R.262 (Klageerwiderung)"). The 60% target the design § hinted at is unachievable on today's corpus because all 11 projects have proceeding_type_id IS NULL post- Slice-5 (the fristenrechner-side rebinding hasn't happened on production data yet) — proceeding-narrowing would cut the Klageerwiderung / Duplik / Replik ambiguity, but the column isn't populated. The orphan-review UI in Slice 11 is the real path to binding the long tail. Defensive backup: paliad.deadlines_pre_089 snapshot taken before any UPDATE. Down-migration restores rule_id from the snapshot + drops unresolved orphan rows (resolved rows survive a rollback — those are legal-review work that shouldn't disappear on a code revert). Idempotency: WHERE rule_id IS NULL on the UPDATE; orphan INSERT skips rows that already have an unresolved orphan entry. Re-running on the same corpus produces no new rows. Hard assertion: every NULL-rule_id deadline (with project) is either resolved post-mig OR has an unresolved orphan row. RAISE EXCEPTION on any unaccounted row — fails the migration loudly rather than silently leaving a deadline un-matched + un-orphaned. Audit-reason wrapper set; the mig 079 deadline_rules audit trigger doesn't fire here (UPDATEs touch paliad.deadlines, not deadline_rules), but the wrapper is the standard pattern. 
---
 .../090_backfill_deadline_rule_id.down.sql    |  30 ++
 .../090_backfill_deadline_rule_id.up.sql      | 320 ++++++++++++++++++
 2 files changed, 350 insertions(+)
 create mode 100644 internal/db/migrations/090_backfill_deadline_rule_id.down.sql
 create mode 100644 internal/db/migrations/090_backfill_deadline_rule_id.up.sql

diff --git a/internal/db/migrations/090_backfill_deadline_rule_id.down.sql b/internal/db/migrations/090_backfill_deadline_rule_id.down.sql
new file mode 100644
index 0000000..2943768
--- /dev/null
+++ b/internal/db/migrations/090_backfill_deadline_rule_id.down.sql
@@ -0,0 +1,30 @@
+-- t-paliad-190 down — reverses 090_backfill_deadline_rule_id.up.sql.
+--
+-- Restores rule_id from the pre-mig snapshot. Every snapshotted
+-- deadline had rule_id IS NULL, so this NULLs whatever the backfill
+-- wrote; rows that already equal the snapshot are skipped. Deletes
+-- the unresolved orphan rows (resolved rows stay — legal-review work
+-- shouldn't disappear on a code rollback) and drops the backup table.
+--
+-- This is a defensive rollback path; the migration itself is one-time
+-- + idempotent, so re-running 090 after a down + up is safe.
+
+SELECT set_config(
+    'paliad.audit_reason',
+    'rollback 090: NULL rule_id on deadlines mig 090 touched + drop pre-089 backup',
+    true);
+
+-- Restore rule_id from the backup table (the authoritative "before"
+-- snapshot); IS DISTINCT FROM skips rows the backfill never changed.
+UPDATE paliad.deadlines d
+   SET rule_id = b.rule_id
+  FROM paliad.deadlines_pre_089 b
+ WHERE d.id = b.id
+   AND d.rule_id IS DISTINCT FROM b.rule_id;
+
+-- Drop the unresolved orphan rows mig 090 wrote. Resolved rows stay —
+-- a legal-review hand-link is real work that survives a code rollback.
+DELETE FROM paliad.deadline_rule_backfill_orphans
+ WHERE resolved_at IS NULL;
+
+DROP TABLE IF EXISTS paliad.deadlines_pre_089;
diff --git a/internal/db/migrations/090_backfill_deadline_rule_id.up.sql b/internal/db/migrations/090_backfill_deadline_rule_id.up.sql
new file mode 100644
index 0000000..dbd6379
--- /dev/null
+++ b/internal/db/migrations/090_backfill_deadline_rule_id.up.sql
@@ -0,0 +1,320 @@
+-- t-paliad-190 / Fristen Phase 3 Slice 10 — one-time fuzzy-match
+-- backfill of paliad.deadlines.rule_id per design §3.I + m's Q10
+-- ruling. Restores SmartTimeline's "anchor real deadlines into
+-- projection" affordance on legacy data (1 of 26 deadlines currently
+-- has rule_id populated; the SmartTimeline anchor flow needs the FK
+-- to thread predicted dates off actuals).
+--
+-- Matching strategies (in priority order; first unique hit wins):
+--
+--   1. rule_code-prefix extraction from title. Titles like
+--      "RoP.023 — Klageerwiderung" carry the rule citation in the
+--      prefix; we extract the leading citation token and JOIN on
+--      deadline_rules.rule_code = extracted. When the rule_code
+--      resolves to multiple rules (e.g. RoP.023 → 2 rules — DE
+--      Klageerwiderung + EN Statement of Defence), the remaining
+--      title fragment narrows by LOWER() equality on name / name_en.
+--
+--   2. exact title match against rule.name OR rule.name_en (LOWER).
+--      Mostly hits common Pipeline-A names ("Antrag auf
+--      Schadensbemessung" → 1 unique rule); ambiguous for shared
+--      names like "Klageerwiderung" (8 rules across proceedings).
+--
+--   3. deadline_concepts.aliases match. Each concept carries a
+--      text[] of canonical aliases; if LOWER(d.title) is in the
+--      aliases array, we pick the rules with that concept_id. Today
+--      the alias coverage is thin (no aliases for "Schutzschrift"
+--      etc.), but the strategy is shaped so a future seed lights
+--      it up.
+--
+-- For each deadline, candidates are counted per strategy (strategy 1
+-- runs twice: code + title tail, then code alone), and:
+--   - highest-priority strategy with exactly 1 rule → UPDATE rule_id.
+--   - 0 candidates overall → orphan with reason='no_match'.
+--   - otherwise (no strategy is unique) → orphan with reason='ambiguous',
+--     candidate_rule_ids = every candidate, so legal review can hand-pick.
+--
+-- Per-project narrowing by proceeding_type_id is the design's primary
+-- discriminator. In the live corpus today all 11 projects have
+-- proceeding_type_id IS NULL (Slice 5 retired litigation codes from
+-- project-binding; the fristenrechner-side rebinding hasn't happened),
+-- so this slice can't use proceeding-narrowing on production data.
+-- The targets CTE carries proceeding_type_id through, but no matcher
+-- filters on it yet; narrowing needs a follow-up once it is populated.
+--
+-- Defensive backup: paliad.deadlines is snapshotted to
+-- paliad.deadlines_pre_089 before the UPDATE so an operator can
+-- restore individual rule_id values if a hand-link goes wrong post
+-- mig. The table is dropped in the down-migration; Slice 11 (rule
+-- editor) can drop it once orphan resolution finishes in prod.
+--
+-- Idempotency: WHERE d.rule_id IS NULL on the UPDATE; the orphan
+-- INSERT is guarded by NOT EXISTS rather than ON CONFLICT (there is no
+-- unique constraint on deadline_id alone — a deadline may legitimately
+-- get re-orphaned after a resolution rollback; but re-running 090 on
+-- the same corpus must not duplicate orphan rows for unresolved
+-- deadlines).
+--
+-- Hard assertion at end: counts in-scope deadlines that still have
+-- rule_id IS NULL and no orphan row, and raises if there are any.
+-- The matched / orphaned tallies are only reported via RAISE NOTICE;
+-- strict equality between them doesn't hold cleanly on a re-run (the
+-- orphan table may already carry prior rows from a partial run).
+
+SELECT set_config(
+    'paliad.audit_reason',
+    'mig 090: one-time fuzzy-match backfill of deadlines.rule_id per design §3.I / Q10',
+    true);
+
+-- =============================================================================
+-- 1. Defensive backup before any UPDATE.
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS paliad.deadlines_pre_089 AS
+SELECT id, project_id, title, rule_id, rule_code, status, due_date,
+       completed_at, created_at, updated_at
+  FROM paliad.deadlines
+ WHERE rule_id IS NULL
+   AND project_id IS NOT NULL;
+
+COMMENT ON TABLE paliad.deadlines_pre_089 IS
+    'Snapshot of paliad.deadlines (id, rule_id-relevant columns) taken '
+    'before mig 090 ran the fuzzy-match backfill. Lets an operator '
+    'restore individual rule_id values if a hand-link goes wrong. '
+    'Slice 11 (rule editor) drops this once orphan resolution finishes.';
+
+-- =============================================================================
+-- 2. Build the candidate set in a temp table so the per-deadline
+--    aggregation + UPDATE + orphan INSERT can share the work without
+--    re-evaluating the matchers.
+-- =============================================================================
+
+CREATE TEMP TABLE _mig_090_candidates ON COMMIT DROP AS
+WITH targets AS (
+    -- Every NULL-rule_id deadline still bound to a project. project_id
+    -- is required because we want at least the SmartTimeline anchor
+    -- flow to work; un-bound deadlines (rare) are out of scope.
+    SELECT d.id AS deadline_id,
+           d.title AS title,
+           d.project_id,
+           p.proceeding_type_id,
+           -- Extract a leading citation token like "RoP.023" or
+           -- "R.49". substring(... FROM pattern) returns capture group 1,
+           -- or NULL when the title has no citation prefix; unlike a
+           -- NULLIF(regexp_replace(..), title) it keeps bare-code titles.
+           substring(d.title FROM '^\s*((?:RoP|R|Art|§)\.?\s*[0-9]+(?:\.[a-z0-9]+)*)\s*(?:[—–-].*)?$') AS code_token,
+           -- Strip the leading citation + separator to surface the
+           -- title's name fragment. "RoP.023 — Klageerwiderung" →
+           -- "Klageerwiderung"; "RoP.029.a" (no suffix) → NULL; plain
+           -- "Klageerwiderung" → "Klageerwiderung" unchanged.
+           NULLIF(trim(regexp_replace(d.title, '^\s*(?:RoP|R|Art|§)\.?\s*[0-9]+(?:\.[a-z0-9]+)*\s*[—–-]?\s*', '')), '') AS title_tail
+      FROM paliad.deadlines d
+      LEFT JOIN paliad.projects p ON p.id = d.project_id
+     WHERE d.rule_id IS NULL
+       AND d.project_id IS NOT NULL
+),
+by_code_and_tail AS (
+    -- Strategy 1a (narrowest): rule_code AND name (DE or EN) matches
+    -- the title's tail fragment. Handles "RoP.023 — Klageerwiderung"
+    -- where the bare code matches 2 rules (DE Klageerwiderung +
+    -- EN Statement of Defence); the tail picks the DE one.
+    SELECT t.deadline_id, dr.id AS rule_id, 'rule_code_and_tail' AS strategy
+      FROM targets t
+      JOIN paliad.deadline_rules dr
+        ON dr.rule_code = trim(t.code_token)
+       AND dr.is_active = true
+       AND (LOWER(dr.name) = LOWER(t.title_tail)
+            OR LOWER(dr.name_en) = LOWER(t.title_tail))
+     WHERE t.code_token IS NOT NULL
+       AND t.title_tail IS NOT NULL
+),
+by_code AS (
+    -- Strategy 1b: rule_code prefix only. Handles bare-code titles
+    -- ("RoP.029.a" maps to 1 unique rule regardless of suffix) and
+    -- the fallback when by_code_and_tail returns 0 (suffix doesn't
+    -- match — e.g. "RoP.029.a — Replik" where the suffix "Replik"
+    -- doesn't appear in any RoP.029.a rule's name; pick the
+    -- code-only match anyway).
+    SELECT t.deadline_id, dr.id AS rule_id, 'rule_code' AS strategy
+      FROM targets t
+      JOIN paliad.deadline_rules dr
+        ON dr.rule_code = trim(t.code_token)
+       AND dr.is_active = true
+     WHERE t.code_token IS NOT NULL
+),
+by_name AS (
+    -- Strategy 2: exact title match against rule.name or rule.name_en.
+    -- The widest matcher; for shared names like "Klageerwiderung"
+    -- (8 rules across proceedings) this is ambiguous, but for
+    -- unique titles like "Antrag auf Schadensbemessung" (1 rule) it
+    -- nails the match.
+    SELECT t.deadline_id, dr.id AS rule_id, 'name_exact' AS strategy
+      FROM targets t
+      JOIN paliad.deadline_rules dr
+        ON (LOWER(dr.name) = LOWER(t.title)
+            OR LOWER(dr.name_en) = LOWER(t.title))
+       AND dr.is_active = true
+),
+by_alias AS (
+    -- Strategy 3: concept aliases. deadline_concepts.aliases is a
+    -- text[] of canonical synonyms; if the deadline title appears
+    -- in that array, every active rule on the concept is a candidate.
+    -- Today's alias coverage is thin (the seed for Slice 12 is the
+    -- expected source of new aliases), but the strategy is in place
+    -- so future seeds light it up without a migration.
+    SELECT t.deadline_id, dr.id AS rule_id, 'concept_alias' AS strategy
+      FROM targets t
+      JOIN paliad.deadline_concepts dc
+        ON LOWER(t.title) = ANY(SELECT LOWER(a) FROM unnest(dc.aliases) a)
+      JOIN paliad.deadline_rules dr
+        ON dr.concept_id = dc.id
+       AND dr.is_active = true
+)
+SELECT deadline_id, rule_id, strategy
+  FROM by_code_and_tail
+ UNION
+SELECT deadline_id, rule_id, strategy
+  FROM by_code
+ UNION
+SELECT deadline_id, rule_id, strategy
+  FROM by_name
+ UNION
+SELECT deadline_id, rule_id, strategy
+  FROM by_alias;
+
+-- =============================================================================
+-- 3. Aggregate per-deadline candidate counts by strategy + pick the
+--    narrowest-unique-match per deadline. Strategy priority (narrowest
+--    first): rule_code_and_tail > rule_code > name_exact > concept_alias.
+--    A deadline's "chosen" rule comes from the highest-priority strategy
+--    that yields exactly 1 distinct candidate.
+-- =============================================================================
+
+CREATE TEMP TABLE _mig_090_strategy_counts ON COMMIT DROP AS
+SELECT cand.deadline_id,
+       cand.strategy,
+       count(DISTINCT cand.rule_id) AS n,
+       MIN(cand.rule_id::text) AS first_rule_text
+  FROM _mig_090_candidates AS cand
+ GROUP BY cand.deadline_id, cand.strategy;
+
+-- Keep only unique-hit strategies (n = 1), rank by priority, take the best.
+CREATE TEMP TABLE _mig_090_chosen ON COMMIT DROP AS
+SELECT ranked.deadline_id,
+       ranked.first_rule_text::uuid AS rule_id,
+       ranked.strategy AS chosen_strategy
+  FROM (SELECT sc.deadline_id, sc.strategy, sc.first_rule_text,
+               row_number() OVER (PARTITION BY sc.deadline_id
+                                  ORDER BY CASE sc.strategy
+                                      WHEN 'rule_code_and_tail' THEN 1
+                                      WHEN 'rule_code' THEN 2
+                                      WHEN 'name_exact' THEN 3
+                                      WHEN 'concept_alias' THEN 4
+                                      ELSE 5 END) AS priority_rank
+          FROM _mig_090_strategy_counts AS sc
+         WHERE sc.n = 1) AS ranked
+ WHERE ranked.priority_rank = 1;
+
+-- Full candidate list per deadline (all strategies pooled) for orphan
+-- logging: a reviewer should see every plausible rule, not one strategy's.
+CREATE TEMP TABLE _mig_090_aggregated ON COMMIT DROP AS
+SELECT cand.deadline_id,
+       count(DISTINCT cand.rule_id) AS n_candidates,
+       array_agg(DISTINCT cand.rule_id) AS all_rule_ids
+  FROM _mig_090_candidates AS cand
+ GROUP BY cand.deadline_id;
+
+-- =============================================================================
+-- 4. Write the chosen rule onto each deadline that is still unlinked.
+-- =============================================================================
+
+UPDATE paliad.deadlines AS dl
+   SET rule_id = pick.rule_id
+  FROM _mig_090_chosen AS pick
+ WHERE dl.rule_id IS NULL
+   AND pick.deadline_id = dl.id;
+
+-- =============================================================================
+-- 5. Log every deadline that didn't get a unique match as an orphan.
+--    Rows that already have a non-resolved orphan entry are skipped
+--    (re-run guard) — the existing entry stays the source-of-truth
+--    until the admin UI flips resolved_at.
+-- =============================================================================
+
+WITH unmatched AS (
+    -- In-scope deadlines still lacking a rule after the step-4 UPDATE.
+    SELECT dl.id AS deadline_id, dl.title, dl.project_id, prj.proceeding_type_id
+      FROM paliad.deadlines AS dl
+      LEFT JOIN paliad.projects AS prj ON prj.id = dl.project_id
+     WHERE dl.project_id IS NOT NULL
+       AND dl.rule_id IS NULL
+)
+INSERT INTO paliad.deadline_rule_backfill_orphans
+    (deadline_id, title, project_id, proceeding_code, reason,
+     candidate_count, candidate_rule_ids)
+SELECT u.deadline_id,
+       u.title,
+       u.project_id,
+       pt.code AS proceeding_code,
+       CASE
+           WHEN COALESCE(agg.n_candidates, 0) = 0 THEN 'no_match'
+           WHEN agg.n_candidates > 1 THEN 'ambiguous'
+       END AS reason,
+       COALESCE(agg.n_candidates, 0),
+       COALESCE(agg.all_rule_ids, ARRAY[]::uuid[])
+  FROM unmatched AS u
+  LEFT JOIN _mig_090_aggregated AS agg ON agg.deadline_id = u.deadline_id
+  LEFT JOIN paliad.proceeding_types AS pt ON pt.id = u.proceeding_type_id
+ WHERE NOT EXISTS (SELECT 1
+                     FROM paliad.deadline_rule_backfill_orphans AS open_row
+                    WHERE open_row.resolved_at IS NULL
+                      AND open_row.deadline_id = u.deadline_id);
+
+-- =============================================================================
+-- 6. Hard assertion: every NULL-rule_id deadline (with project) is
+--    either resolved (rule_id IS NOT NULL post-mig) or carries an
+--    unresolved orphan row.
+-- =============================================================================
+
+DO $$
+DECLARE
+    n_processed   int;  -- project-bound deadlines with a rule_id or any orphan row
+    n_matched     int;  -- snapshotted deadlines that now carry a rule_id
+    n_orphaned    int;  -- distinct deadlines with an unresolved orphan row
+    n_unaccounted int;  -- in scope, rule_id NULL, no unresolved orphan row
+BEGIN
+    SELECT count(*) INTO n_processed
+      FROM paliad.deadlines d
+     WHERE d.project_id IS NOT NULL
+       AND (d.rule_id IS NOT NULL OR EXISTS (
+           SELECT 1 FROM paliad.deadline_rule_backfill_orphans o WHERE o.deadline_id = d.id));
+
+    SELECT count(*) INTO n_matched
+      FROM paliad.deadlines d
+      JOIN paliad.deadlines_pre_089 b ON b.id = d.id
+     WHERE d.rule_id IS NOT NULL;
+
+    SELECT count(DISTINCT deadline_id) INTO n_orphaned
+      FROM paliad.deadline_rule_backfill_orphans
+     WHERE resolved_at IS NULL;
+
+    -- Invariant check. Only unresolved orphan rows count: a resolved row
+    -- must not vouch for a deadline whose rule_id is NULL (again).
+    SELECT count(*) INTO n_unaccounted
+      FROM paliad.deadlines d
+     WHERE d.rule_id IS NULL
+       AND d.project_id IS NOT NULL
+       AND NOT EXISTS (
+           SELECT 1 FROM paliad.deadline_rule_backfill_orphans o
+            WHERE o.deadline_id = d.id
+              AND o.resolved_at IS NULL);
+
+    RAISE NOTICE 'mig 090: processed=% matched=% orphaned=% unaccounted=%',
+        n_processed, n_matched, n_orphaned, n_unaccounted;
+
+    IF n_unaccounted > 0 THEN
+        RAISE EXCEPTION 'mig 090: % deadlines have rule_id IS NULL and no unresolved orphan row — '
+            'matcher missed them. Investigate the candidate query.',
+            n_unaccounted;
+    END IF;
+END $$;