Compare commits
3 Commits
mai/artemi
...
mai/boltzm
| Author | SHA1 | Date | |
|---|---|---|---|
| 77a26471e6 | |||
| 72fde84e10 | |||
| af028e2bab |
@@ -3,20 +3,23 @@
|
||||
// Three checks against TEST_DATABASE_URL:
|
||||
//
|
||||
// 1. db.ApplyMigrations does not panic and returns nil.
|
||||
// 2. The migration tracker (public.paliad_schema_migrations) advances to
|
||||
// the highest *.up.sql version on disk — no migrations were silently
|
||||
// skipped, no "dirty=true" stragglers left behind.
|
||||
// 2. paliad.applied_migrations covers every on-disk *.up.sql — no
|
||||
// migration was silently skipped, no version is missing. The set
|
||||
// contract is stronger than the old single-counter check: applied
|
||||
// set must EQUAL on-disk set, not just reach the max version.
|
||||
// 3. The handler mux (with /healthz mounted) responds 200 to GET /healthz.
|
||||
//
|
||||
// This is the lightweight cousin of the migration dry-run gate
|
||||
// (internal/db/migrate_test.go): the dry-run catches per-migration syntax
|
||||
// errors before merge; this smoke confirms the apply+bind path the
|
||||
// container actually runs at boot. Together they cover the mig-098 /
|
||||
// mig-099 class of crash-loops end-to-end.
|
||||
// mig-099 class of crash-loops end-to-end, plus the mig-103 parallel-merge
|
||||
// skip-hole that t-paliad-218 closed (m/paliad#44).
|
||||
//
|
||||
// Skipped without TEST_DATABASE_URL — matches the rest of the live-DB tests.
|
||||
//
|
||||
// Design: docs/design-paliad-test-strategy-2026-05-19.md §5 Slice 1.
|
||||
// Design: docs/design-paliad-test-strategy-2026-05-19.md §5 Slice 1 and
|
||||
// docs/design-migration-runner-applied-set-2026-05-20.md §6.
|
||||
|
||||
package main
|
||||
|
||||
@@ -51,19 +54,23 @@ func TestBootSmoke(t *testing.T) {
|
||||
t.Fatalf("db.ApplyMigrations: %v", err)
|
||||
}
|
||||
|
||||
// (2) Assert the tracker advanced to the highest *.up.sql version we
|
||||
// embed. If a migration was silently skipped or the tracker is dirty,
|
||||
// the prod container would crash-loop — this turns that into a test
|
||||
// failure with a precise reason.
|
||||
expected := highestEmbeddedMigrationVersion(t)
|
||||
got, dirty := readTrackerVersion(t, url)
|
||||
if dirty {
|
||||
t.Errorf("tracker reports dirty=true at version %d — investigate before deploying", got)
|
||||
// (2) Assert the applied set equals the on-disk set. The new runner
|
||||
// tracks applied state per-migration; a silently-skipped version
|
||||
// would surface as a row missing from paliad.applied_migrations even
|
||||
// though max(version) matches. Comparing sets — not just max —
|
||||
// catches the failure mode the t-paliad-218 post-mortem documented.
|
||||
onDisk := embeddedMigrationVersions(t)
|
||||
applied := appliedMigrationVersions(t, url)
|
||||
|
||||
if missing := setDiff(onDisk, applied); len(missing) > 0 {
|
||||
t.Errorf("paliad.applied_migrations missing %d on-disk versions: %v "+
|
||||
"(a migration was skipped — investigate before deploying)",
|
||||
len(missing), missing)
|
||||
}
|
||||
if got != expected {
|
||||
t.Errorf("tracker at version %d; expected %d (highest *.up.sql on disk). "+
|
||||
"A migration was skipped or applied out of order.",
|
||||
got, expected)
|
||||
if extra := setDiff(applied, onDisk); len(extra) > 0 {
|
||||
t.Errorf("paliad.applied_migrations has %d versions with no on-disk file: %v "+
|
||||
"(orphan rows — either restore the file or DELETE the row)",
|
||||
len(extra), extra)
|
||||
}
|
||||
|
||||
// (3) Mount the public handlers (the same Register call main() makes,
|
||||
@@ -93,11 +100,16 @@ func TestBootSmoke(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// highestEmbeddedMigrationVersion finds max(N) over every NNN_*.up.sql
|
||||
// file in internal/db/migrations/ on disk. Used as the expected tracker
|
||||
// version after a clean apply. We read from disk (not the embed.FS in
|
||||
// the db package — it's unexported) since the test runs from the repo.
|
||||
func highestEmbeddedMigrationVersion(t *testing.T) int {
|
||||
// embeddedMigrationVersions returns every N where N_*.up.sql exists in
|
||||
// internal/db/migrations/ on disk. The boot smoke compares this set
|
||||
// against paliad.applied_migrations to detect skipped or orphan
|
||||
// migrations.
|
||||
//
|
||||
// Read from disk (not the embed.FS inside the db package — it's unexported)
|
||||
// since the test runs from the repo. The two views must agree for the
|
||||
// build to be self-consistent; if they diverge, the smoke test is the
|
||||
// wrong place to learn about it (the build is). We trust them to match.
|
||||
func embeddedMigrationVersions(t *testing.T) []int {
|
||||
t.Helper()
|
||||
root, err := repoRoot()
|
||||
if err != nil {
|
||||
@@ -129,24 +141,52 @@ func highestEmbeddedMigrationVersion(t *testing.T) int {
|
||||
t.Fatalf("no *.up.sql files found in %s", dir)
|
||||
}
|
||||
sort.Ints(versions)
|
||||
return versions[len(versions)-1]
|
||||
return versions
|
||||
}
|
||||
|
||||
// readTrackerVersion fetches the lone row from the tracker. golang-migrate
|
||||
// keeps exactly one row; if we ever see zero or more, that's the dirty-state
|
||||
// the test is designed to flag.
|
||||
func readTrackerVersion(t *testing.T, url string) (version int, dirty bool) {
|
||||
// appliedMigrationVersions reads paliad.applied_migrations and returns
|
||||
// the sorted list of versions. Fails the test if the table doesn't exist —
|
||||
// db.ApplyMigrations is supposed to have created it by this point.
|
||||
func appliedMigrationVersions(t *testing.T, url string) []int {
|
||||
t.Helper()
|
||||
conn, err := sql.Open("postgres", url)
|
||||
if err != nil {
|
||||
t.Fatalf("open: %v", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
row := conn.QueryRow(`SELECT version, dirty FROM public.paliad_schema_migrations LIMIT 1`)
|
||||
if err := row.Scan(&version, &dirty); err != nil {
|
||||
t.Fatalf("read tracker: %v", err)
|
||||
rows, err := conn.Query(`SELECT version FROM paliad.applied_migrations ORDER BY version`)
|
||||
if err != nil {
|
||||
t.Fatalf("read applied_migrations: %v", err)
|
||||
}
|
||||
return version, dirty
|
||||
defer rows.Close()
|
||||
var out []int
|
||||
for rows.Next() {
|
||||
var v int
|
||||
if err := rows.Scan(&v); err != nil {
|
||||
t.Fatalf("scan: %v", err)
|
||||
}
|
||||
out = append(out, v)
|
||||
}
|
||||
if err := rows.Err(); err != nil {
|
||||
t.Fatalf("rows: %v", err)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// setDiff returns the elements of a that are not in b. Inputs are sorted
|
||||
// ascending; output preserves that ordering.
|
||||
func setDiff(a, b []int) []int {
|
||||
bset := make(map[int]bool, len(b))
|
||||
for _, v := range b {
|
||||
bset[v] = true
|
||||
}
|
||||
var out []int
|
||||
for _, v := range a {
|
||||
if !bset[v] {
|
||||
out = append(out, v)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// repoRoot walks upward from the test binary's working directory until it
|
||||
|
||||
474
docs/design-migration-runner-applied-set-2026-05-20.md
Normal file
474
docs/design-migration-runner-applied-set-2026-05-20.md
Normal file
@@ -0,0 +1,474 @@
|
||||
# Design — gap-tolerant migration runner (applied-set tracker)
|
||||
|
||||
**Status:** inventor draft, awaiting m
|
||||
**Task:** t-paliad-218
|
||||
**Gitea issue:** m/paliad#44
|
||||
**Branch:** `mai/boltzmann/inventor-gap-tolerant`
|
||||
**Author:** boltzmann (inventor)
|
||||
**Date:** 2026-05-20
|
||||
|
||||
---
|
||||
|
||||
## §0 Live state verified (2026-05-20, 10:38)
|
||||
|
||||
Three things checked against the youpc Supabase before designing on top of any premise:
|
||||
|
||||
| Check | Result |
|
||||
|---|---|
|
||||
| Live tracker location | `paliad.paliad_schema_migrations` exists, `version=106, dirty=false` |
|
||||
| Stale tracker | `public.paliad_schema_migrations` exists, `version=2, dirty=true` — leftover from a prior outage; not read by anything today |
|
||||
| Mig 103 status on prod | Applied (cols `counter_payload`, `previous_request_id` present; CHECK constraint includes `'changes_requested'`) |
|
||||
| golang-migrate version | `v4.19.1` (per `go.mod`) |
|
||||
| Migration count on disk | 210 files (105 up + 105 down), highest version `106_add_madrid_office` |
|
||||
| `migrate.Down`/`m.Steps(-N)` call sites | **Zero**. No code path rolls migrations back. `.down.sql` files are reference-only. |
|
||||
|
||||
The `migrate.go:36` comment claims the tracker lives in `public` schema. That is misleading — the live tracker is in `paliad`. The `public.paliad_schema_migrations` row at v2 is a corpse from an earlier incident captured in memory `652b856f…` and `638694f8…`. **This is a doc bug** — `migrate.go`'s schema-routing rationale paragraph needs replacing with the truth: the runner connection's `search_path` includes `paliad`, so the tracker lands in `paliad`.
|
||||
|
||||
`main_smoke_test.go:145` and `migrate_test.go:135` both `SELECT FROM public.paliad_schema_migrations`. Against the production DB those tests would read `version=2, dirty=true` and fail. They pass today because they run against a separate scratch DB (`TEST_DATABASE_URL`) whose tracker landed in `public` historically. Both tests must be updated as part of this rollout (covered in §6).
|
||||
|
||||
---
|
||||
|
||||
## §1 The problem in one paragraph
|
||||
|
||||
`golang-migrate/migrate/v4` tracks applied state as a **single integer** in `paliad_schema_migrations(version int, dirty bool)`. Semantics: "every version ≤ current is considered applied; `migrate.Up()` runs only versions strictly greater than current." In a parallel-merge workflow where two workers' migration numbers race, whichever lands first claims the counter and the other gets **permanently skipped** with no visible failure. The 2026-05-20 hertz/fermi race did exactly that to mig 103; production was running approval-suggest-changes code against a schema that didn't have it. Recovery was manual SQL apply, which is invisible to migrate — `migrate.Down(1)` would skip mig 103's down too.
|
||||
|
||||
Convention-level mitigations (head reads `ls migrations/ | tail` before every merge; per the friction note in memory `8e4a2853…`) are real but brittle. The **runner itself should be the safety net**, not the convention.
|
||||
|
||||
---
|
||||
|
||||
## §2 Target shape — one paragraph
|
||||
|
||||
Replace the single-integer counter with a **set** of applied versions stored in `paliad.applied_migrations(version int PK, name text, applied_at timestamptz, checksum text NULL)`. On every deploy, the new runner scans the embedded migrations FS, computes `pending = on_disk \ applied`, and applies pending in ascending order — each one in its own transaction together with the `INSERT INTO applied_migrations`. "Applied" is set-membership, not counter-comparison; gaps in the version space are first-class. After `applied_migrations` exists, the old `paliad.paliad_schema_migrations` tracker becomes irrelevant and gets dropped (along with the stale `public.` one).
|
||||
|
||||
---
|
||||
|
||||
## §3 Schema — `paliad.applied_migrations`
|
||||
|
||||
```sql
|
||||
CREATE TABLE paliad.applied_migrations (
|
||||
version int NOT NULL PRIMARY KEY,
|
||||
name text NOT NULL,
|
||||
applied_at timestamptz NOT NULL DEFAULT now(),
|
||||
checksum text NULL
|
||||
);
|
||||
|
||||
-- One row per applied .up.sql. checksum is SHA-256 hex of the .up.sql file
|
||||
-- contents at apply time, or NULL for migrations applied by the legacy
|
||||
-- runner before checksum tracking existed (1..106 are NULL on backfill).
|
||||
```
|
||||
|
||||
**Why `version` as the PK** — guarantees set semantics: trying to apply the same version twice fails on PK violation (caught by the per-mig transaction and surfaced as a deploy error, not silent dup). One row per version, no soft state, no compound keys.
|
||||
|
||||
**Why `name` separate from version** — debuggability. `SELECT * FROM applied_migrations ORDER BY version` reads like a deploy log without needing to cross-reference the filename layout on disk. Also lets the runner verify name agreement between disk and DB (catches "I renamed mig 098_x to 098_y but kept the version" → mismatched name in DB triggers a hard-fail on next deploy).
|
||||
|
||||
**Why `applied_at` not `applied_at_ms`** — `timestamptz` is the project convention. Microsecond precision is not load-bearing; ordering is by `version`, not by time.
|
||||
|
||||
**Why `checksum` nullable** — backfilled rows (1..106) have no content provenance — those migrations applied via golang-migrate which didn't hash anything. Rather than fabricate a hash by re-reading the .up.sql today (which would give a false sense of "this is what was applied"), we admit ignorance and leave it NULL. The new runner populates it going forward.
|
||||
|
||||
---
|
||||
|
||||
## §4 Runner contract — `internal/db/migrate.go`
|
||||
|
||||
`ApplyMigrations(databaseURL string) error` keeps its current signature. The implementation changes underneath.
|
||||
|
||||
### §4.1 Algorithm
|
||||
|
||||
```
|
||||
1. open DB; ping; bootstrap: CREATE SCHEMA IF NOT EXISTS paliad
|
||||
2. acquire pg_advisory_lock(<paliad-namespace-int>); release on defer
|
||||
3. CREATE TABLE IF NOT EXISTS paliad.applied_migrations (...)
|
||||
4. if applied_migrations is empty:
|
||||
seed from paliad.paliad_schema_migrations: for every version v ≤ old.version,
|
||||
for the on-disk filename of v, INSERT (v, name, now(), NULL)
|
||||
— mig 103 is included because v ≤ 106; checksum left NULL
|
||||
— see §5 for the bootstrap details
|
||||
5. scan embed.FS for *.up.sql files
|
||||
6. parse into list of (version, name, filename)
|
||||
7. hard-fail if two files share a version (collision detection)
|
||||
8. hard-fail if any filename has a name in DB that doesn't match
|
||||
(rename detection — see §3 "Why `name` separate")
|
||||
9. pending = filter(on-disk where version NOT IN applied_migrations)
|
||||
10. sort pending by version ascending
|
||||
11. for each pending:
|
||||
BEGIN
|
||||
execute .up.sql contents
|
||||
INSERT INTO applied_migrations(version, name, applied_at, checksum)
|
||||
VALUES (v, n, now(), sha256(file_bytes))
|
||||
COMMIT — on any error, ROLLBACK + return the error
|
||||
12. release advisory lock
|
||||
```
|
||||
|
||||
Each `.up.sql + INSERT` is **one transaction**. All-or-nothing per migration: if the SQL fails, the INSERT doesn't happen and we re-try on the next deploy. Same fail-fast posture as today.
|
||||
|
||||
### §4.2 Pseudocode skeleton
|
||||
|
||||
```go
|
||||
func ApplyMigrations(databaseURL string) error {
|
||||
conn := openAndPing(databaseURL)
|
||||
defer conn.Close()
|
||||
if _, err := conn.Exec(`CREATE SCHEMA IF NOT EXISTS paliad`); err != nil { ... }
|
||||
|
||||
// Advisory lock — int picked from the hash of "paliad.applied_migrations".
|
||||
// pg_advisory_lock blocks until acquired; matches golang-migrate's own
|
||||
// lock pattern and survives connection death (released on disconnect).
|
||||
if _, err := conn.Exec(`SELECT pg_advisory_lock($1)`, advisoryLockID); err != nil { ... }
|
||||
defer conn.Exec(`SELECT pg_advisory_unlock($1)`, advisoryLockID)
|
||||
|
||||
if _, err := conn.Exec(createAppliedMigrationsSQL); err != nil { ... }
|
||||
if err := bootstrapFromLegacyTracker(conn); err != nil { ... }
|
||||
|
||||
onDisk, err := scanEmbedFS() // returns sorted []migration; fails on collision
|
||||
if err != nil { ... }
|
||||
applied, err := readAppliedMigrations(conn)
|
||||
if err != nil { ... }
|
||||
if err := checkNameAgreement(onDisk, applied); err != nil { ... } // rename detection
|
||||
|
||||
pending := diff(onDisk, applied)
|
||||
for _, m := range pending {
|
||||
if err := applyOne(conn, m); err != nil {
|
||||
return fmt.Errorf("migration %s: %w", m.filename, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func applyOne(conn *sql.DB, m migration) error {
|
||||
body, _ := migrationFS.ReadFile("migrations/" + m.filename)
|
||||
checksum := fmt.Sprintf("%x", sha256.Sum256(body))
|
||||
tx, err := conn.Begin()
|
||||
if err != nil { return err }
|
||||
defer tx.Rollback()
|
||||
if _, err := tx.Exec(string(body)); err != nil { return err }
|
||||
if _, err := tx.Exec(
|
||||
`INSERT INTO paliad.applied_migrations(version, name, applied_at, checksum)
|
||||
VALUES ($1, $2, now(), $3)`, m.version, m.name, checksum); err != nil { return err }
|
||||
return tx.Commit()
|
||||
}
|
||||
```
|
||||
|
||||
### §4.3 Advisory lock ID
|
||||
|
||||
`pg_advisory_lock(int8)` — pick a constant derived from `hash('paliad.applied_migrations')` truncated to int64. Documented in code. Standard pattern; matches what golang-migrate does internally.
|
||||
|
||||
### §4.4 Collision detection (§7 of issue acceptance)
|
||||
|
||||
`scanEmbedFS` groups by `version` prefix. If any version has ≥2 distinct `.up.sql` files, return a fatal error before any tx opens. Deploy fails fast with the filenames in the error message.
|
||||
|
||||
### §4.5 Rename detection (defensive, not in issue acceptance — recommended)
|
||||
|
||||
Pure git accidents — renaming `098_foo.up.sql` to `098_bar.up.sql` after merge — currently break tests but don't break prod because golang-migrate only cares about version. With `applied_migrations.name`, a rename would land a DB row whose name doesn't match the file. Catch it: if `(version, name)` on disk doesn't match `(version, name)` in DB for an already-applied version, hard-fail the deploy. Operator's recovery: revert the rename, or run a one-shot `UPDATE applied_migrations SET name = '<new>' WHERE version = N` if the rename is intentional.
|
||||
|
||||
---
|
||||
|
||||
## §5 Bootstrap — getting from old world to new
|
||||
|
||||
Two viable paths. Recommendation: **runner-bootstrap (option B)**, see §10/Q4.
|
||||
|
||||
### §5.1 Option A — mig 107 SQL file (issue body's proposal)
|
||||
|
||||
`107_backfill_applied_migrations.up.sql`:
|
||||
|
||||
```sql
|
||||
CREATE TABLE IF NOT EXISTS paliad.applied_migrations (
|
||||
version int NOT NULL PRIMARY KEY,
|
||||
name text NOT NULL,
|
||||
applied_at timestamptz NOT NULL DEFAULT now(),
|
||||
checksum text NULL
|
||||
);
|
||||
|
||||
-- One INSERT per historical version 001..106, hardcoded.
|
||||
-- ON CONFLICT DO NOTHING for re-run safety.
|
||||
INSERT INTO paliad.applied_migrations(version, name, applied_at) VALUES
|
||||
(1, 'paliad_schema', now()),
|
||||
(2, 'users', now()),
|
||||
...
|
||||
(103, 'approval_suggest_changes', now()),
|
||||
...
|
||||
(106, 'add_madrid_office', now()),
|
||||
(107, 'backfill_applied_migrations', now()) -- self-record
|
||||
ON CONFLICT (version) DO NOTHING;
|
||||
```
|
||||
|
||||
**Down step:**
|
||||
```sql
|
||||
DROP TABLE IF EXISTS paliad.applied_migrations;
|
||||
```
|
||||
|
||||
**Pros:** visible in `git log -- internal/db/migrations/`, explainable.
|
||||
**Cons:** the in-transition deploy still runs golang-migrate (to apply mig 107). Adds a step where two runners coexist for one deploy. Also: mig 107 has special "self-record" semantics that no other migration has, which is a minor footgun for future readers.
|
||||
|
||||
### §5.2 Option B — runner-bootstrap (recommended)
|
||||
|
||||
No mig 107 file. The new runner code path includes a one-shot bootstrap:
|
||||
|
||||
```go
|
||||
func bootstrapFromLegacyTracker(conn *sql.DB) error {
|
||||
var count int
|
||||
if err := conn.QueryRow(`SELECT count(*) FROM paliad.applied_migrations`).Scan(&count); err != nil {
|
||||
return err
|
||||
}
|
||||
if count > 0 {
|
||||
return nil // already bootstrapped
|
||||
}
|
||||
|
||||
var legacyVer int
|
||||
var legacyDirty bool
|
||||
err := conn.QueryRow(`SELECT version, dirty FROM paliad.paliad_schema_migrations LIMIT 1`).
|
||||
Scan(&legacyVer, &legacyDirty)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil // virgin DB, applied_migrations stays empty; runner will apply 001..N from scratch
|
||||
}
|
||||
if err != nil { return err }
|
||||
if legacyDirty {
|
||||
return fmt.Errorf("legacy tracker is dirty at version %d — recover manually before deploying", legacyVer)
|
||||
}
|
||||
|
||||
// Backfill: every version ≤ legacyVer that exists on disk. checksum NULL.
|
||||
for _, m := range loadAllOnDisk() {
|
||||
if m.version > legacyVer { continue }
|
||||
_, err := conn.Exec(
|
||||
`INSERT INTO paliad.applied_migrations(version, name, applied_at, checksum)
|
||||
VALUES ($1, $2, now(), NULL)
|
||||
ON CONFLICT (version) DO NOTHING`, m.version, m.name)
|
||||
if err != nil { return err }
|
||||
}
|
||||
return nil
|
||||
}
|
||||
```
|
||||
|
||||
**Pros:** idempotent (re-running is a no-op via the empty-check + ON CONFLICT). No SQL file with one-off semantics. One code path. Naturally handles "virgin DB during fresh test setup" (no legacy tracker → bootstrap is a no-op, runner applies 001..N normally).
|
||||
**Cons:** invisible in `git log -- migrations/`. Future devs see `applied_migrations` rows with NULL checksums and need to understand why.
|
||||
|
||||
### §5.3 What about mig 103 specifically?
|
||||
|
||||
Both paths put mig 103 in `applied_migrations` with `checksum=NULL`. That's the right answer:
|
||||
|
||||
- The mig 103 .up.sql DID run on prod (verified §0).
|
||||
- We don't know whether what ran on prod (manual SQL apply via Supabase MCP) was byte-identical to today's embedded `103_approval_suggest_changes.up.sql`. It almost certainly is, but "almost" doesn't earn a non-NULL checksum.
|
||||
- Leaving checksum NULL means: "this version is recorded as applied, but we cannot verify content fidelity." That's honest.
|
||||
|
||||
### §5.4 Deploy ordering — what happens in production
|
||||
|
||||
After this design ships:
|
||||
|
||||
1. **Deploy N (today):** old runner. `paliad.paliad_schema_migrations` at v106. No `paliad.applied_migrations`.
|
||||
2. **Deploy N+1 (this change):** new runner first deploy.
|
||||
- `bootstrapFromLegacyTracker` sees `applied_migrations` empty + legacy tracker at v106 + clean.
|
||||
- INSERTs rows for every on-disk version 1..106 with checksum=NULL.
|
||||
- `pending` = empty (every disk version is now in DB).
|
||||
- No migrations actually run. The deploy is a code-only change.
|
||||
3. **Deploy N+2 (first deploy with a new mig 107):** new runner.
|
||||
- `applied_migrations` has 106 rows from bootstrap.
|
||||
- Disk has 107 files (001..107).
|
||||
- Pending = {107}. Apply 107 in tx, INSERT row.
|
||||
4. **Cleanup (separate PR, after burn-in):** drop `paliad.paliad_schema_migrations` and `public.paliad_schema_migrations` via mig 108_drop_legacy_trackers.
|
||||
|
||||
---
|
||||
|
||||
## §6 Test updates
|
||||
|
||||
Two existing tests read `public.paliad_schema_migrations`:
|
||||
|
||||
1. **`internal/db/migrate_test.go:135`** — `TestMigrations_DryRun` reads the tracker version to compute "pending." Update to read `paliad.applied_migrations`:
|
||||
- `SELECT version FROM paliad.applied_migrations` — set of applied versions
|
||||
- `pending = on_disk \ applied`
|
||||
- Same per-migration BEGIN/ROLLBACK probe. Same skip-without-TEST_DATABASE_URL semantics.
|
||||
|
||||
2. **`cmd/server/main_smoke_test.go:145`** — `TestBootSmoke` reads `version, dirty FROM public.paliad_schema_migrations` after `ApplyMigrations`. Update to:
|
||||
- `SELECT max(version) FROM paliad.applied_migrations` — assert equals `highestEmbeddedMigrationVersion`
|
||||
- No dirty flag in the new schema (each row is fully committed or not present); drop the dirty check.
|
||||
- The boot-smoke contract gets stronger: "applied set after ApplyMigrations matches the on-disk set exactly." Failure mode is no longer "tracker dirty" but "row missing" — more direct.
|
||||
|
||||
A third test-impacting change: every service test that calls `db.ApplyMigrations(url)` (45 call sites per grep) continues to work because `ApplyMigrations` is the same function signature. The first call on a virgin scratch DB now creates `applied_migrations` and applies every embedded migration; the second call sees `applied = on_disk` and no-ops. Identical to today's behavior with golang-migrate. No service-test changes needed.
|
||||
|
||||
The CI dry-run gate (`make verify-migrations` or its equivalent) similarly continues working — `TestMigrations_DryRun` just reads from a different table.
|
||||
|
||||
---
|
||||
|
||||
## §7 Down-migration support
|
||||
|
||||
Today's reality:
|
||||
|
||||
- `migrate.Down`/`m.Steps(-N)` is called from **zero** places in the codebase (verified).
|
||||
- `.down.sql` files exist as reference material and ship in the embed.FS but never run via `db.ApplyMigrations`.
|
||||
|
||||
Three options for the new runner:
|
||||
|
||||
**A. Implement minimal `RollbackOne(databaseURL)` or `RollbackTo(databaseURL, version)`** — runs the highest-version `.down.sql` whose version is currently in `applied_migrations`, plus `DELETE FROM applied_migrations WHERE version = N`, in a tx. No `migrate.Force()`-style override. Adds ~30 LoC.
|
||||
|
||||
**B. Up-only runner.** Document in `migrate.go` comment that down is manual: operator runs `.down.sql` through psql, then `DELETE FROM paliad.applied_migrations WHERE version = N`. Recover-by-runbook. Adds 0 LoC, 0 risk.
|
||||
|
||||
**C. Drop `.down.sql` files entirely.** Aggressive. Not recommended — they're useful as reference even if never auto-applied.
|
||||
|
||||
Recommendation: **B**. Nobody calls down today; designing a Down method for hypothetical future use is premature. If down is needed later, add it as a focused PR. Issue body §"down-migration path still works" reads to me as "don't break the .down.sql file convention," not "build an auto-down runner" — but I want m to confirm (Q3 in §10).
|
||||
|
||||
---
|
||||
|
||||
## §8 Checksum drift detection — three pour-overs
|
||||
|
||||
The optional-v1 piece. Three positions on the dial:
|
||||
|
||||
**A. Full drift detection in v1.** Every deploy re-hashes embedded migrations; if a previously-applied version's checksum differs from DB, hard-fail with the diff. Catches "I edited a shipped migration" instantly. Backfilled rows (NULL checksum) skip the check.
|
||||
|
||||
**B. Populate-but-don't-verify.** Store the checksum on apply (cheap). Skip the verify step. We accumulate the data needed for future drift detection without burning deploys on "you edited a file that's already shipped" before m wants that behavior. Easy to flip on later (one if-block).
|
||||
|
||||
**C. Defer entirely.** No checksum column. Add it when we want drift detection.
|
||||
|
||||
Recommendation: **B**. The cost of storing the checksum is one `sha256` call per migration on apply (~µs). The cost of populating later is "go back through prod history and reconstruct what shipped" — not feasible. So store now, even if we don't verify yet. If m wants verification on day 1, A is the next step up — just flip the if-block.
|
||||
|
||||
---
|
||||
|
||||
## §9 Alternatives considered and rejected
|
||||
|
||||
### §9.1 Swap to goose or dbmate
|
||||
|
||||
OOS per issue body. Both are mature, both support gap-tolerant apply, both would require rewriting all 105 migration files into their preferred format (different up/down separator conventions, different tracker shapes). Not worth the migration overhead for what's structurally a small fix.
|
||||
|
||||
### §9.2 Wrap golang-migrate with a custom `Driver` that uses the applied-set table
|
||||
|
||||
Possible — golang-migrate's Driver interface is small. We'd implement a custom `database.Driver` whose `SetVersion()` becomes `INSERT INTO applied_migrations`, `Version()` becomes `MAX(version)`, etc. Then `migrate.Up()` would Just Work.
|
||||
|
||||
Why I rejected: golang-migrate's contract is **a counter**. `Version()` returns a single int. Inside golang-migrate the algorithm is `for v := current+1; v <= max; v++ { apply(v) }`. If we tell it `Version() = 102` while applied_migrations has {1..102, 104, 105, 106} (the parallel-merge state pre-recovery), it'll try to re-apply 104 — wrong. We'd have to lie about Version() in ways that defeat the library's own loop semantics. **A custom driver doesn't actually buy us much over hand-rolling**, because golang-migrate's algorithm fundamentally assumes the counter model.
|
||||
|
||||
Sticking with golang-migrate as a thin wrapper with the same `iofs.Source` but a different driver is technically feasible but adds indirection without value. Hand-roll is cleaner.
|
||||
|
||||
### §9.3 Per-row INSERT with `ON CONFLICT DO NOTHING` instead of advisory lock
|
||||
|
||||
Without a lock, two concurrent deploys could both decide "version 107 is pending" and both try to apply its SQL. One would commit, the other would get a PK violation on the INSERT and roll back — and ALSO leave half a CREATE TABLE / ALTER COLUMN on prod from the SQL that ran before the failing INSERT (if the .up.sql isn't transactional or uses CONCURRENTLY).
|
||||
|
||||
Advisory lock prevents that race entirely. Cost is negligible (Dokploy doesn't run concurrent deploys typically; the lock is belt-and-braces for the rolling-deploy edge case the issue calls out in OOS).
|
||||
|
||||
### §9.4 Keep `paliad.paliad_schema_migrations` updated as a compat view
|
||||
|
||||
A view over `applied_migrations` that exposes `max(version)` + `false` as `dirty` could keep the old shape readable for any consumer expecting it. Today there is no such consumer — no Grafana dashboard reads it, no alert. So this is gold-plating. Drop the old tables in mig 108.
|
||||
|
||||
### §9.5 Per-deploy snapshot of the applied set
|
||||
|
||||
Tempting: write a "deploy event" row each time the runner runs, with `(deploy_id, applied_at, applied_set jsonb)`. Useful audit trail. Out of scope for v1 — `applied_at` per migration already gives a chronological view.
|
||||
|
||||
---
|
||||
|
||||
## §10 Open questions for m
|
||||
|
||||
Eight decisions, batched as two AskUserQuestion calls (four each).
|
||||
|
||||
### Batch 1 — core design
|
||||
|
||||
**Q1 (Library)** — How should the new runner relate to golang-migrate?
|
||||
- A. **Hand-roll** over `embed.FS` (recommended). ~150 LoC in `internal/db/migrate.go`. Drop the `golang-migrate/migrate/v4` dependency. Cleanest contract.
|
||||
- B. Thin wrapper: keep `golang-migrate/migrate/v4` for source/file reading; replace the driver with one that writes `applied_migrations`. Same complexity, more indirection.
|
||||
- C. Don't change the library, just add `applied_migrations` as a secondary tracker the runner consults. Counterproductive — two trackers means two sources of truth, which is exactly the bug class we're trying to leave.
|
||||
|
||||
**Q2 (Checksum drift detection)** — When does the runner enforce content fidelity?
|
||||
- A. **Populate-but-don't-verify in v1** (recommended). Store `sha256(file_bytes)` on apply. No verify-on-deploy step. Backfilled rows have NULL. Verification ships as a separate small PR when m wants it.
|
||||
- B. Full drift detection in v1. Verify on every deploy; hard-fail if a previously-applied migration's content changed. Catches "you edited a shipped migration" immediately.
|
||||
- C. Defer entirely. No checksum column for now. Add it (and the verify) together when we want drift detection.
|
||||
|
||||
**Q3 (Down migrations)** — Does v1 support `migrate down`?
|
||||
- A. **Up-only runner** (recommended). `.down.sql` files stay as reference material. Operator runs them through psql + `DELETE FROM applied_migrations` manually if a roll-back is needed. Zero call sites today; YAGNI applies.
|
||||
- B. Implement minimal `RollbackOne(databaseURL)` — pops the highest applied version, runs its `.down.sql` + DELETE in a tx. Adds ~30 LoC.
|
||||
- C. Drop `.down.sql` files entirely from `embed.FS`. Aggressive.
|
||||
|
||||
**Q4 (Bootstrap path)** — How does production transition from old tracker to applied_migrations?
|
||||
- A. **Runner-bootstrap on first deploy** (recommended). The new runner detects "applied_migrations empty + legacy tracker at v106" and INSERTs rows 1..106 (checksum NULL). Idempotent. No SQL file. One code path.
|
||||
- B. SQL migration `107_backfill_applied_migrations.up.sql` that CREATE TABLE + INSERTs 1..107. The old runner applies it on the first deploy of the new code; from mig 108+ the new runner takes over. Visible in `git log`, but adds a one-off "self-recording" migration with special semantics.
|
||||
- C. Hybrid: write the mig 107 SQL file AND have the runner be defensive (detect either "table exists with rows from mig 107" or "table missing, bootstrap from legacy"). More code, more paths, no clear win.
|
||||
|
||||
### Batch 2 — operational / cleanup
|
||||
|
||||
**Q5 (Concurrent deploys)** — How do we prevent two rolling deploys racing on the apply loop?
|
||||
- A. **`pg_advisory_lock` around the apply loop** (recommended). Standard pattern, matches what golang-migrate does internally. Belt-and-braces against Dokploy's rolling deploy edge cases.
|
||||
- B. No lock; rely on per-mig INSERT with `ON CONFLICT DO NOTHING` to settle races. Risky — non-transactional DDL (CREATE INDEX CONCURRENTLY, e.g.) can land partial state if two deploys race on the same migration before either INSERTs.
|
||||
- C. Both. Lock + ON CONFLICT for double safety.
|
||||
|
||||
**Q6 (Old tracker tables)** — What happens to `paliad.paliad_schema_migrations` (v106 live) and `public.paliad_schema_migrations` (v2 dirty, stale)?
|
||||
- A. **Drop both in a follow-up mig 108** (recommended), after one or two deploys of burn-in on the new runner. Clean break, single source of truth.
|
||||
- B. Keep `paliad.paliad_schema_migrations` updated as a `MAX(version)` view over `applied_migrations` indefinitely. No consumer exists today; this is gold-plating for hypothetical compat.
|
||||
- C. Drop the stale `public.paliad_schema_migrations` immediately (mig 107) but leave `paliad.paliad_schema_migrations` untouched. Half-measure.
|
||||
|
||||
**Q7 (Tests — boot smoke + dry-run)** — How do `main_smoke_test.go` and `migrate_test.go` get updated?
|
||||
- A. **Both updated to read `paliad.applied_migrations`** (recommended). Boot-smoke asserts `max(applied.version) == highestEmbeddedMigrationVersion`. Dry-run computes `pending = on_disk \ applied`. The dirty-flag check disappears (rows are committed or absent, not "dirty").
|
||||
- B. Keep old tests reading the legacy tracker for a transition period; add new tests against `applied_migrations` alongside. Doubles the test surface during a window where both trackers must agree — adds flakiness risk.
|
||||
- C. Drop the old smoke test entirely; rely on the dry-run + a new "applied-set smoke" test only. Loses the bind-and-serve half of `TestBootSmoke`.
|
||||
|
||||
**Q8 (Collision detection — same version twice)** — How aggressively does the runner refuse on disk collisions?
|
||||
- A. **Hard-fail in `ApplyMigrations`** on startup (recommended). Scan `embed.FS`, group by version; if any group has >1 distinct file, return an error before any tx opens. Deploy fails fast.
|
||||
- B. Add a `go generate` check (e.g., `go run ./internal/db/migrations/check`) that runs during build. Catches it earlier (compile time) but adds tooling and a CI step.
|
||||
- C. Convention-only: rely on PR review + `ls migrations/ | tail`. Status quo. Doesn't address the post-mortem's root cause.
|
||||
|
||||
---
|
||||
|
||||
## §11 Implementation slicing (for the eventual coder)
|
||||
|
||||
Not for m to decide — included so the head can scope.
|
||||
|
||||
1. **Slice 1 — schema + runner + bootstrap + tests.** New runner replaces `ApplyMigrations` body. `bootstrapFromLegacyTracker` runs on first deploy. Tests updated. Existing `migrate.go` comment about `public` schema rewritten with the §0 truth. CI dry-run gate still passes.
|
||||
2. **Slice 2 — drop legacy trackers.** `mig 108_drop_legacy_schema_migrations.up.sql`: `DROP TABLE paliad.paliad_schema_migrations; DROP TABLE public.paliad_schema_migrations;`. Ships after one or two deploys of burn-in on Slice 1.
|
||||
3. **(Optional) Slice 3 — drift detection verify.** Flip on the checksum verify if m picks Q2=A in v1 and wants A→B promotion later. ~10 LoC + a smoke test.
|
||||
|
||||
Slice 1 is ~250 LoC counting tests; one focused PR. Slice 2 is two SQL files. Pattern-fluent Sonnet coder is the right pick — substrate is well-trodden (`internal/db/migrate.go`, `embed.FS`, `sql.Tx`), no novel libraries.
|
||||
|
||||
---
|
||||
|
||||
## §12 Risks called out
|
||||
|
||||
1. **Bootstrap race on first deploy of the new runner.** If Deploy N+1 (new code) starts in two pods simultaneously, both might see `applied_migrations` empty + legacy at v106 and both try to insert. The advisory lock (Q5=A) prevents this — second pod waits for the first.
|
||||
2. **Mig 103 content drift between manual SQL apply and today's embedded file.** Verified §0 that the prod schema matches what `103_approval_suggest_changes.up.sql` produces (cols, CHECK). If they're not byte-identical, future drift detection (when we turn on Q2's verify) would flag 103 — but Q5/Q2 leave 103's checksum NULL, so the verify skips it. Safe.
|
||||
3. **Schema-search-path drift.** The `migrate.go` connection uses default search_path. The current `paliad.paliad_schema_migrations` table exists because some prior connection had `search_path=paliad,public`. The new `applied_migrations` is created with the schema explicit (`paliad.applied_migrations`) — no search_path dependency. Removes a class of latent bug.
|
||||
4. **`paliad_schema_migrations` (live tracker) being read by anything we don't know about.** Searched: no Go code, no migrations file, no skill, no doc. If a Grafana dashboard or monitoring alert reads it, the Slice 2 drop will surprise it. Low probability given paliad is a small app, but worth a `mai instruct head` heads-up before Slice 2 lands.
|
||||
5. **The 210-file embed.FS scan on every deploy boot.** O(N) for N=210; ~µs. Not a concern at this scale. If migrations grow to thousands (won't, given application size), revisit.
|
||||
|
||||
---
|
||||
|
||||
## §13 What this design does NOT change
|
||||
|
||||
- The `NNN_description.up.sql` / `.down.sql` filename convention. Stays.
|
||||
- The `ApplyMigrations(databaseURL string) error` signature. Stays.
|
||||
- The 45 service-test call sites of `db.ApplyMigrations`. Untouched.
|
||||
- The CI dry-run pattern (per-mig BEGIN/ROLLBACK against scratch DB). Stays; just reads from the new tracker.
|
||||
- The `paliad` schema and every existing migration's behavior. Untouched.
|
||||
- The Dokploy auto-deploy hook. Untouched.
|
||||
|
||||
The change is **localized to `internal/db/migrate.go` + two test files + one mig 108 to drop legacy trackers** (after burn-in). Coder LoC estimate: ~250 net new + ~40 net deleted. One PR for Slice 1 should clear review in a single pass.
|
||||
|
||||
---
|
||||
|
||||
## §14 m's decisions (2026-05-20)
|
||||
|
||||
All eight picks matched the inventor recommendation. No tip-against, no reasoning addendum needed.
|
||||
|
||||
- **Q1 (Library):** Hand-roll over `embed.FS`. Drop the `golang-migrate/migrate/v4` dependency.
|
||||
- **Q2 (Drift detect):** Populate-but-don't-verify in v1. Store `sha256(file_bytes)` on apply; rows 1..106 stay NULL on backfill. Verify ships as a separate small PR if/when we want it.
|
||||
- **Q3 (Down migs):** Up-only runner. `.down.sql` files stay as reference. No `RollbackOne` in v1.
|
||||
- **Q4 (Bootstrap):** Runner-bootstrap on first deploy. No mig 107 SQL file. Runner detects empty `applied_migrations` + legacy tracker at v106 and INSERTs rows 1..106 (checksum NULL) with `ON CONFLICT DO NOTHING`.
|
||||
- **Q5 (Locking):** `pg_advisory_lock` around the apply loop. Released on `defer`. Lock ID derived from `hash('paliad.applied_migrations')` truncated to int64.
|
||||
- **Q6 (Old trackers):** Drop both `paliad.paliad_schema_migrations` and `public.paliad_schema_migrations` in a follow-up mig 108, after one or two deploys of burn-in.
|
||||
- **Q7 (Tests):** Both `TestBootSmoke` and `TestMigrations_DryRun` updated to read `paliad.applied_migrations`. Dirty-flag check removed. Smoke asserts `max(applied.version) == highestEmbeddedMigrationVersion`.
|
||||
- **Q8 (Collisions):** Hard-fail in `ApplyMigrations` on startup. Scan groups by version; ≥2 distinct files at the same version → error before any tx opens, both filenames in the message.
|
||||
|
||||
### Locked scope for the coder shift (Slice 1)
|
||||
|
||||
- New `internal/db/migrate.go`: hand-rolled `ApplyMigrations` that
|
||||
- opens DB, ensures `paliad` schema,
|
||||
- acquires `pg_advisory_lock(<paliad-namespace-int>)`,
|
||||
- creates `paliad.applied_migrations` if missing,
|
||||
- bootstraps from `paliad.paliad_schema_migrations` if `applied_migrations` is empty,
|
||||
- scans `embed.FS`, hard-fails on version collisions,
|
||||
- hard-fails on rename mismatch (DB name vs disk name for an already-applied version — §4.5),
|
||||
- applies pending in ascending order, each `.up.sql + INSERT` in one tx with `checksum=sha256(file_bytes)`.
|
||||
- Drop `github.com/golang-migrate/migrate/v4` from `go.mod`.
|
||||
- Rewrite the misleading `public` schema comment block at the top of `migrate.go` with the §0 truth.
|
||||
- Update `internal/db/migrate_test.go:135` and `cmd/server/main_smoke_test.go:145` to read `paliad.applied_migrations`. Drop dirty-flag check from the smoke test.
|
||||
- All 45 service-test call sites of `db.ApplyMigrations` keep working unchanged.
|
||||
|
||||
### Deferred to follow-up PRs (NOT in Slice 1)
|
||||
|
||||
- Mig 108: drop legacy tracker tables. After Slice 1 has burned in on one or two deploys.
|
||||
- Optional drift-detection verify (flip Q2 from "populate" to "populate + verify"). Single if-block, single test.
|
||||
- Optional `RollbackOne` if a real call site materializes.
|
||||
|
||||
### Recommended implementer
|
||||
|
||||
Pattern-fluent Sonnet coder. Substrate is well-trodden (`embed.FS`, `sql.Tx`, `crypto/sha256`, advisory locks). No novel libraries. ~250 net new LoC counting tests; ~40 deleted (`migrate.go` body + golang-migrate imports). One focused PR for Slice 1 should clear review in a single pass.
|
||||
|
||||
Branch convention for the coder shift: `mai/<coder>/migration-runner-applied-set` (separate worker — inventor's branch is design-only per project gate protocol).
|
||||
5
go.mod
5
go.mod
@@ -4,18 +4,19 @@ go 1.24.0
|
||||
|
||||
require (
|
||||
github.com/golang-jwt/jwt/v5 v5.3.1
|
||||
github.com/golang-migrate/migrate/v4 v4.19.1
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/jmoiron/sqlx v1.4.0
|
||||
github.com/lib/pq v1.12.3
|
||||
github.com/xuri/excelize/v2 v2.10.1
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/richardlehane/mscfb v1.0.6 // indirect
|
||||
github.com/richardlehane/msoleps v1.0.6 // indirect
|
||||
github.com/tiendc/go-deepcopy v1.7.2 // indirect
|
||||
github.com/xuri/efp v0.0.1 // indirect
|
||||
github.com/xuri/excelize/v2 v2.10.1 // indirect
|
||||
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect
|
||||
golang.org/x/crypto v0.48.0 // indirect
|
||||
golang.org/x/net v0.50.0 // indirect
|
||||
|
||||
58
go.sum
58
go.sum
@@ -1,39 +1,11 @@
|
||||
filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA=
|
||||
filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
|
||||
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
|
||||
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
|
||||
github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI=
|
||||
github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M=
|
||||
github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE=
|
||||
github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dhui/dktest v0.4.6 h1:+DPKyScKSEp3VLtbMDHcUq6V5Lm5zfZZVb0Sk7Ahom4=
|
||||
github.com/dhui/dktest v0.4.6/go.mod h1:JHTSYDtKkvFNFHJKqCzVzqXecyv+tKt8EzceOmQOgbU=
|
||||
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
|
||||
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
|
||||
github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI=
|
||||
github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
|
||||
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
|
||||
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
|
||||
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
|
||||
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
|
||||
github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
|
||||
github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
|
||||
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
|
||||
github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y=
|
||||
github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg=
|
||||
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
|
||||
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
|
||||
github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY=
|
||||
github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
|
||||
github.com/golang-migrate/migrate/v4 v4.19.1 h1:OCyb44lFuQfYXYLx1SCxPZQGU7mcaZ7gH9yH4jSFbBA=
|
||||
github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjYXycRz/g5+RWDuYgPrE=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
|
||||
@@ -43,26 +15,14 @@ github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ=
|
||||
github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA=
|
||||
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
|
||||
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
|
||||
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
|
||||
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
|
||||
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
|
||||
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
|
||||
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
|
||||
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
|
||||
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
|
||||
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
|
||||
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/richardlehane/mscfb v1.0.6 h1:eN3bvvZCp00bs7Zf52bxNwAx5lJDBK1tCuH19qq5aC8=
|
||||
github.com/richardlehane/mscfb v1.0.6/go.mod h1:pe0+IUIc0AHh0+teNzBlJCtSyZdFOGgV4ZK9bsoV+Jo=
|
||||
github.com/richardlehane/msoleps v1.0.6 h1:9BvkpjvD+iUBalUY4esMwv6uBkfOip/Lzvd93jvR9gg=
|
||||
github.com/richardlehane/msoleps v1.0.6/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg=
|
||||
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/tiendc/go-deepcopy v1.7.2 h1:Ut2yYR7W9tWjTQitganoIue4UGxZwCcJy3orjrrIj44=
|
||||
github.com/tiendc/go-deepcopy v1.7.2/go.mod h1:4bKjNC2r7boYOkD2IOuZpYjmlDdzjbpTRyCx+goBCJQ=
|
||||
github.com/xuri/efp v0.0.1 h1:fws5Rv3myXyYni8uwj2qKjVaRP30PdjeYe2Y6FDsCL8=
|
||||
@@ -71,22 +31,12 @@ github.com/xuri/excelize/v2 v2.10.1 h1:V62UlqopMqha3kOpnlHy2CcRVw1V8E63jFoWUmMzx
|
||||
github.com/xuri/excelize/v2 v2.10.1/go.mod h1:iG5tARpgaEeIhTqt3/fgXCGoBRt4hNXgCp3tfXKoOIc=
|
||||
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 h1:+C0TIdyyYmzadGaL/HBLbf3WdLgC29pgyhTjAT/0nuE=
|
||||
github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ=
|
||||
go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA=
|
||||
go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
|
||||
go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ=
|
||||
go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I=
|
||||
go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE=
|
||||
go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E=
|
||||
go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4=
|
||||
go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0=
|
||||
golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts=
|
||||
golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos=
|
||||
golang.org/x/image v0.25.0 h1:Y6uW6rH1y5y/LK1J8BPWZtr6yZ7hrsy6hFrXjgsc2fQ=
|
||||
golang.org/x/image v0.25.0/go.mod h1:tCAmOEGthTtkalusGp1g3xa2gke8J6c2N565dTyl9Rs=
|
||||
golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60=
|
||||
golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM=
|
||||
golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
|
||||
golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
|
||||
golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk=
|
||||
golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
|
||||
@@ -1,46 +1,78 @@
|
||||
// Package db owns the Paliad Postgres connection and embedded schema migrations.
|
||||
//
|
||||
// Migrations are golang-migrate format (NNN_description.up.sql / .down.sql) and
|
||||
// live in the migrations/ subdirectory, embedded into the binary so a single
|
||||
// artifact ships with its schema. The server applies pending migrations at
|
||||
// startup before binding the HTTP listener.
|
||||
// Migrations are NNN_description.up.sql / .down.sql files in the migrations/
|
||||
// subdirectory, embedded into the binary so a single artifact ships with its
|
||||
// schema. The server applies pending migrations at startup before binding
|
||||
// the HTTP listener.
|
||||
//
|
||||
// The runner tracks applied state as a set, not a counter: every applied
|
||||
// migration gets its own row in paliad.applied_migrations(version PK, name,
|
||||
// applied_at, checksum). On every deploy, pending = on_disk \ applied, in
|
||||
// ascending version order. Gaps in the version space are first-class — a
|
||||
// version that's missing from applied_migrations runs on the next deploy,
|
||||
// regardless of which higher versions are already applied.
|
||||
//
|
||||
// This is what closes the parallel-merge skip-hole that the single-counter
|
||||
// tracker (golang-migrate) silently fell into on 2026-05-20 (m/paliad#44).
|
||||
// Background and design: docs/design-migration-runner-applied-set-2026-05-20.md.
|
||||
//
|
||||
// .down.sql files ship in the embedded FS as reference material but are not
|
||||
// auto-applied — there are no call sites for rolling back, and operator
|
||||
// recovery (psql .down.sql + DELETE FROM paliad.applied_migrations WHERE
|
||||
// version=N) is the documented path. If a real call site for auto-rollback
|
||||
// materializes later, add it as a focused follow-up.
|
||||
package db
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"database/sql"
|
||||
"embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"hash/fnv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/golang-migrate/migrate/v4"
|
||||
"github.com/golang-migrate/migrate/v4/database/postgres"
|
||||
"github.com/golang-migrate/migrate/v4/source/iofs"
|
||||
_ "github.com/lib/pq"
|
||||
)
|
||||
|
||||
//go:embed migrations/*.sql
|
||||
var migrationFS embed.FS
|
||||
|
||||
// migrationsTable is the name of the golang-migrate tracking table. We use a
|
||||
// uniquely-named table (not the default "schema_migrations") because the
|
||||
// production Supabase instance hosts multiple apps in the `public` schema,
|
||||
// and a differently-shaped `public.schema_migrations` already exists there.
|
||||
// Using "paliad_schema_migrations" prevents collision at startup.
|
||||
// advisoryLockID is the Postgres advisory-lock id the runner takes around
|
||||
// the apply loop. Derived once from the table name so the value is stable
|
||||
// across processes — two concurrent deploys (rolling Dokploy update, dev
|
||||
// laptop hitting the same scratch DB as CI) serialize on this id rather
|
||||
// than racing on the pending set.
|
||||
//
|
||||
// The table lives in the `public` schema (golang-migrate's default) rather
|
||||
// than `paliad`. Rationale: migration 001's down-step is
|
||||
// DROP SCHEMA IF EXISTS paliad CASCADE
|
||||
// which would take the tracking table with it — breaking any subsequent
|
||||
// migrate.Up() call. Keeping the tracker in `public` makes the down-path
|
||||
// safe and idempotent.
|
||||
const migrationsTable = "paliad_schema_migrations"
|
||||
// FNV-1a-64 is good enough: the id only has to be a stable int64, not
|
||||
// cryptographically uniform. Process-wide constant.
|
||||
var advisoryLockID = func() int64 {
|
||||
h := fnv.New64a()
|
||||
_, _ = h.Write([]byte("paliad.applied_migrations"))
|
||||
return int64(h.Sum64())
|
||||
}()
|
||||
|
||||
// ApplyMigrations runs all pending up-migrations against the given database
|
||||
// URL. Returns nil if no migrations were pending. Safe to call repeatedly.
|
||||
// migration is one *.up.sql file from the embedded FS.
|
||||
type migration struct {
|
||||
version int
|
||||
name string
|
||||
filename string
|
||||
}
|
||||
|
||||
// ApplyMigrations applies every pending up-migration to the given database.
|
||||
//
|
||||
// Pre-creates the `paliad` schema before invoking golang-migrate because the
|
||||
// first migration creates it and golang-migrate's tracking table would
|
||||
// otherwise be created in whatever `current_schema()` happens to be.
|
||||
// Safe to call repeatedly; a fully-applied tree is a no-op. Returns the
|
||||
// first error encountered (with the offending migration filename wrapped
|
||||
// in the message) and leaves the rest of pending unapplied — same fail-fast
|
||||
// posture as the previous golang-migrate runner.
|
||||
//
|
||||
// On first deploy of this code path against a database that still has the
|
||||
// legacy paliad.paliad_schema_migrations counter at version N, the runner
|
||||
// seeds paliad.applied_migrations with rows 1..N (checksum NULL) before
|
||||
// applying anything new. The first deploy is therefore effectively a
|
||||
// no-op against the schema — the bootstrap just relabels existing state.
|
||||
func ApplyMigrations(databaseURL string) error {
|
||||
if databaseURL == "" {
|
||||
return errors.New("database URL is empty")
|
||||
@@ -51,39 +83,250 @@ func ApplyMigrations(databaseURL string) error {
|
||||
return fmt.Errorf("open database: %w", err)
|
||||
}
|
||||
defer conn.Close()
|
||||
|
||||
if err := conn.Ping(); err != nil {
|
||||
return fmt.Errorf("ping database: %w", err)
|
||||
}
|
||||
|
||||
// Bootstrap the paliad schema so later migrations can target it cleanly.
|
||||
// This duplicates migration 001, but is idempotent via IF NOT EXISTS and
|
||||
// ensures the schema exists before golang-migrate touches the DB.
|
||||
// Ensure the paliad schema exists. Mig 001 also creates it; the
|
||||
// applied_migrations table lives in paliad.* and gets created before
|
||||
// any migrations run, so the schema must exist first.
|
||||
if _, err := conn.Exec(`CREATE SCHEMA IF NOT EXISTS paliad`); err != nil {
|
||||
return fmt.Errorf("ensure paliad schema: %w", err)
|
||||
}
|
||||
|
||||
source, err := iofs.New(migrationFS, "migrations")
|
||||
if err != nil {
|
||||
return fmt.Errorf("open migration source: %w", err)
|
||||
if _, err := conn.Exec(`SELECT pg_advisory_lock($1)`, advisoryLockID); err != nil {
|
||||
return fmt.Errorf("acquire advisory lock: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
_, _ = conn.Exec(`SELECT pg_advisory_unlock($1)`, advisoryLockID)
|
||||
}()
|
||||
|
||||
if _, err := conn.Exec(`
|
||||
CREATE TABLE IF NOT EXISTS paliad.applied_migrations (
|
||||
version int NOT NULL PRIMARY KEY,
|
||||
name text NOT NULL,
|
||||
applied_at timestamptz NOT NULL DEFAULT now(),
|
||||
checksum text NULL
|
||||
)
|
||||
`); err != nil {
|
||||
return fmt.Errorf("create applied_migrations: %w", err)
|
||||
}
|
||||
|
||||
driver, err := postgres.WithInstance(conn, &postgres.Config{
|
||||
// Unique tracking-table name avoids collision with pre-existing
|
||||
// public.schema_migrations owned by other apps on this Postgres.
|
||||
MigrationsTable: migrationsTable,
|
||||
})
|
||||
onDisk, err := scanEmbeddedMigrations()
|
||||
if err != nil {
|
||||
return fmt.Errorf("create migration driver: %w", err)
|
||||
return fmt.Errorf("scan embedded migrations: %w", err)
|
||||
}
|
||||
|
||||
m, err := migrate.NewWithInstance("iofs", source, "postgres", driver)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create migrator: %w", err)
|
||||
if err := bootstrapFromLegacyTracker(conn, onDisk); err != nil {
|
||||
return fmt.Errorf("bootstrap from legacy tracker: %w", err)
|
||||
}
|
||||
|
||||
if err := m.Up(); err != nil && !errors.Is(err, migrate.ErrNoChange) {
|
||||
return fmt.Errorf("apply migrations: %w", err)
|
||||
applied, err := readAppliedMigrations(conn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read applied_migrations: %w", err)
|
||||
}
|
||||
|
||||
if err := checkNameAgreement(onDisk, applied); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, m := range onDisk {
|
||||
if _, ok := applied[m.version]; ok {
|
||||
continue
|
||||
}
|
||||
if err := applyOne(conn, m); err != nil {
|
||||
return fmt.Errorf("apply %s: %w", m.filename, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// scanEmbeddedMigrations returns every NNN_*.up.sql in the embedded FS,
|
||||
// sorted by version ascending. Hard-fails on two files sharing the same
|
||||
// version prefix — that's the failure mode the parallel-merge incident
|
||||
// exposed, and the runner refuses to start rather than silently picking one.
|
||||
func scanEmbeddedMigrations() ([]migration, error) {
|
||||
entries, err := migrationFS.ReadDir("migrations")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read migrations dir: %w", err)
|
||||
}
|
||||
seen := map[int]string{}
|
||||
var out []migration
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if !strings.HasSuffix(name, ".up.sql") {
|
||||
continue
|
||||
}
|
||||
v, n, ok := parseMigrationFilename(name)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("unparseable migration filename %q "+
|
||||
"(expected NNN_description.up.sql)", name)
|
||||
}
|
||||
if prior, dup := seen[v]; dup {
|
||||
return nil, fmt.Errorf("two migrations at version %d: %q and %q — "+
|
||||
"rename one and redeploy", v, prior, name)
|
||||
}
|
||||
seen[v] = name
|
||||
out = append(out, migration{version: v, name: n, filename: name})
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool { return out[i].version < out[j].version })
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// parseMigrationFilename splits "NNN_description.up.sql" into (NNN, description).
|
||||
// Returns ok=false on any deviation from that shape.
|
||||
func parseMigrationFilename(filename string) (version int, name string, ok bool) {
|
||||
base := strings.TrimSuffix(filename, ".up.sql")
|
||||
if base == filename {
|
||||
return 0, "", false
|
||||
}
|
||||
underscore := strings.IndexByte(base, '_')
|
||||
if underscore <= 0 {
|
||||
return 0, "", false
|
||||
}
|
||||
v, err := strconv.Atoi(base[:underscore])
|
||||
if err != nil {
|
||||
return 0, "", false
|
||||
}
|
||||
return v, base[underscore+1:], true
|
||||
}
|
||||
|
||||
// readAppliedMigrations returns a map version → name from
|
||||
// paliad.applied_migrations. Returns an empty map (no error) if the table
|
||||
// is missing — that's the fresh-DB path before the CREATE TABLE in
|
||||
// ApplyMigrations runs against it.
|
||||
func readAppliedMigrations(conn *sql.DB) (map[int]string, error) {
|
||||
rows, err := conn.Query(`SELECT version, name FROM paliad.applied_migrations`)
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "does not exist") {
|
||||
return map[int]string{}, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
out := map[int]string{}
|
||||
for rows.Next() {
|
||||
var v int
|
||||
var n string
|
||||
if err := rows.Scan(&v, &n); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
out[v] = n
|
||||
}
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
// bootstrapFromLegacyTracker seeds paliad.applied_migrations from
|
||||
// paliad.paliad_schema_migrations on the first deploy of the new runner
|
||||
// against a DB that previously ran golang-migrate.
|
||||
//
|
||||
// Behavior:
|
||||
// - applied_migrations already has rows → no-op (idempotent).
|
||||
// - applied_migrations empty AND legacy tracker missing → no-op
|
||||
// (virgin DB; the apply loop will run everything from scratch).
|
||||
// - applied_migrations empty AND legacy tracker present, clean, version N
|
||||
// → INSERT rows for every on-disk version ≤ N with checksum NULL.
|
||||
// - applied_migrations empty AND legacy tracker dirty → hard-fail.
|
||||
// The operator must recover the legacy tracker first (it being dirty
|
||||
// means a prior golang-migrate run crashed mid-flight); we will not
|
||||
// paper over an unknown state by guessing what landed.
|
||||
//
|
||||
// Backfilled rows have checksum NULL because the legacy runner didn't hash
|
||||
// anything — we can't fabricate a provenance hash today without falsely
|
||||
// claiming we know the byte-identity of what shipped historically.
|
||||
func bootstrapFromLegacyTracker(conn *sql.DB, onDisk []migration) error {
|
||||
var count int
|
||||
if err := conn.QueryRow(`SELECT count(*) FROM paliad.applied_migrations`).Scan(&count); err != nil {
|
||||
return fmt.Errorf("count applied_migrations: %w", err)
|
||||
}
|
||||
if count > 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var legacyVer int
|
||||
var legacyDirty bool
|
||||
err := conn.QueryRow(`SELECT version, dirty FROM paliad.paliad_schema_migrations LIMIT 1`).
|
||||
Scan(&legacyVer, &legacyDirty)
|
||||
if errors.Is(err, sql.ErrNoRows) {
|
||||
return nil
|
||||
}
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "does not exist") {
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("read legacy tracker: %w", err)
|
||||
}
|
||||
if legacyDirty {
|
||||
return fmt.Errorf("legacy paliad.paliad_schema_migrations is dirty at version %d — "+
|
||||
"recover manually before deploying", legacyVer)
|
||||
}
|
||||
|
||||
for _, m := range onDisk {
|
||||
if m.version > legacyVer {
|
||||
continue
|
||||
}
|
||||
if _, err := conn.Exec(`
|
||||
INSERT INTO paliad.applied_migrations(version, name, applied_at, checksum)
|
||||
VALUES ($1, $2, now(), NULL)
|
||||
ON CONFLICT (version) DO NOTHING
|
||||
`, m.version, m.name); err != nil {
|
||||
return fmt.Errorf("backfill version %d: %w", m.version, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// checkNameAgreement hard-fails if a version that's already applied has a
|
||||
// different name on disk than in the DB. Catches the post-merge rename
|
||||
// accident where someone renames `098_foo.up.sql` to `098_bar.up.sql` —
|
||||
// the SQL has already run on prod with the old name, so the rename is a
|
||||
// lie about history. Operator recovery: revert the rename, or update the
|
||||
// DB row if the rename is intentional.
|
||||
//
|
||||
// Backfilled rows have a name pulled from the on-disk filename, so an
|
||||
// out-of-the-box backfill never trips this check.
|
||||
func checkNameAgreement(onDisk []migration, applied map[int]string) error {
|
||||
for _, m := range onDisk {
|
||||
dbName, ok := applied[m.version]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if dbName != m.name {
|
||||
return fmt.Errorf("migration %d: disk name %q != DB name %q "+
|
||||
"(renamed after apply? revert the rename, or UPDATE paliad.applied_migrations "+
|
||||
"SET name=%q WHERE version=%d if the rename is intentional)",
|
||||
m.version, m.name, dbName, m.name, m.version)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// applyOne runs one migration's .up.sql plus its INSERT row in a single
|
||||
// transaction. All-or-nothing per migration: if the SQL fails, the row
|
||||
// isn't inserted and the next deploy re-tries from the same point. If
|
||||
// the INSERT fails (e.g. PK violation because the lock wasn't held), the
|
||||
// SQL rolls back too.
|
||||
func applyOne(conn *sql.DB, m migration) error {
|
||||
body, err := migrationFS.ReadFile("migrations/" + m.filename)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read %s: %w", m.filename, err)
|
||||
}
|
||||
checksum := fmt.Sprintf("%x", sha256.Sum256(body))
|
||||
|
||||
tx, err := conn.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("begin tx: %w", err)
|
||||
}
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
if _, err := tx.Exec(string(body)); err != nil {
|
||||
return fmt.Errorf("exec sql: %w", err)
|
||||
}
|
||||
if _, err := tx.Exec(`
|
||||
INSERT INTO paliad.applied_migrations(version, name, applied_at, checksum)
|
||||
VALUES ($1, $2, now(), $3)
|
||||
`, m.version, m.name, checksum); err != nil {
|
||||
return fmt.Errorf("record applied: %w", err)
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
@@ -1,60 +1,49 @@
|
||||
// Package db tests — migration dry-run gate.
|
||||
//
|
||||
// This is the test that catches mig-N crash-loops before they reach prod.
|
||||
// The convention since t-paliad-098/099 is that paliad migrations land in
|
||||
// numeric order on a single trunk; the next deploy runs whichever ones are
|
||||
// pending against the live `public.paliad_schema_migrations` tracker. A
|
||||
// migration that compiles cleanly but fails on apply (typo, missing column,
|
||||
// wrong CHECK shape) crashes the Dokploy container loop before paliad.de
|
||||
// finishes binding :8080, and the only way to learn about it today is to
|
||||
// watch the deploy log.
|
||||
// The new runner tracks applied state as a set in paliad.applied_migrations
|
||||
// (one row per migration; see migrate.go). A migration that compiles cleanly
|
||||
// but fails on apply (typo, missing column, wrong CHECK shape) crashes the
|
||||
// Dokploy container loop before paliad.de finishes binding :8080, and the
|
||||
// only way to learn about it today is to watch the deploy log.
|
||||
//
|
||||
// TestMigrations_DryRun closes that gap: for every *.up.sql in this
|
||||
// directory whose version is greater than the scratch DB's current tracker
|
||||
// version, it opens a transaction, runs the SQL, and ROLLBACKs. Any error
|
||||
// fails the test with the file name + Postgres error. Always non-destructive
|
||||
// — the ROLLBACK runs even on success, so the scratch DB stays at its
|
||||
// starting version.
|
||||
// directory whose version is NOT present in paliad.applied_migrations on
|
||||
// the scratch DB, it opens a transaction, runs the SQL, and ROLLBACKs.
|
||||
// Any error fails the test with the file name + Postgres error. Always
|
||||
// non-destructive — the ROLLBACK runs even on success, so the scratch DB
|
||||
// stays at its starting set.
|
||||
//
|
||||
// "Pending" means: a version that's on disk but not in applied_migrations.
|
||||
// In CI against a fresh scratch DB (where applied_migrations either
|
||||
// doesn't exist or is empty), every migration is pending and gets
|
||||
// verified. On a developer laptop whose scratch DB is already at HEAD,
|
||||
// no migrations are pending and the test logs and passes — the protection
|
||||
// only kicks in the moment a new *.up.sql lands in the tree before the
|
||||
// developer runs `db.ApplyMigrations` against the same scratch DB.
|
||||
//
|
||||
// Requires TEST_DATABASE_URL (same pattern as the rest of the live-DB
|
||||
// tests). Skipped without it.
|
||||
//
|
||||
// Design: docs/design-paliad-test-strategy-2026-05-19.md §5 Slice 1.
|
||||
// Design: docs/design-paliad-test-strategy-2026-05-19.md §5 Slice 1 and
|
||||
// docs/design-migration-runner-applied-set-2026-05-20.md §6.
|
||||
|
||||
package db
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
_ "github.com/lib/pq"
|
||||
)
|
||||
|
||||
// migration is one *.up.sql file from the embedded migrations FS.
|
||||
type migration struct {
|
||||
version int
|
||||
name string
|
||||
filename string
|
||||
}
|
||||
|
||||
// TestMigrations_DryRun walks every pending *.up.sql in numeric order,
|
||||
// applies each inside its own BEGIN/ROLLBACK against the scratch DB, and
|
||||
// fails the test on the first SQL error. Reports per-file as a sub-test so
|
||||
// `go test -v` shows which migration failed.
|
||||
//
|
||||
// What "pending" means: greater than the scratch DB's current tracker
|
||||
// version (or 0 if the tracker doesn't exist yet). In CI against a fresh
|
||||
// scratch DB, every migration is pending and gets verified. On a developer
|
||||
// laptop whose scratch DB is already at HEAD, no migrations are pending and
|
||||
// the test logs the start version and passes — the protection only kicks in
|
||||
// the moment a new *.up.sql lands in the tree before the developer runs
|
||||
// `db.ApplyMigrations` against the same scratch DB.
|
||||
func TestMigrations_DryRun(t *testing.T) {
|
||||
url := os.Getenv("TEST_DATABASE_URL")
|
||||
if url == "" {
|
||||
@@ -79,28 +68,32 @@ func TestMigrations_DryRun(t *testing.T) {
|
||||
t.Fatalf("ensure paliad schema: %v", err)
|
||||
}
|
||||
|
||||
startVersion, dirty, err := currentTrackerVersion(conn)
|
||||
applied, err := readAppliedVersions(conn)
|
||||
if err != nil {
|
||||
t.Fatalf("read tracker: %v", err)
|
||||
t.Fatalf("read applied_migrations: %v", err)
|
||||
}
|
||||
if dirty {
|
||||
t.Fatalf("tracker is dirty at version %d — fix that first (DROP the tracker row "+
|
||||
"or restore from backup); the dry-run cannot trust a dirty starting state",
|
||||
startVersion)
|
||||
}
|
||||
t.Logf("scratch DB tracker at version %d; walking pending migrations from %d upward",
|
||||
startVersion, startVersion+1)
|
||||
|
||||
migs, err := loadPendingMigrations(startVersion)
|
||||
onDisk, err := scanEmbeddedMigrations()
|
||||
if err != nil {
|
||||
t.Fatalf("load migrations: %v", err)
|
||||
t.Fatalf("scan embedded migrations: %v", err)
|
||||
}
|
||||
if len(migs) == 0 {
|
||||
t.Logf("no pending migrations — scratch DB is at HEAD (%d)", startVersion)
|
||||
|
||||
var pending []migration
|
||||
for _, m := range onDisk {
|
||||
if !applied[m.version] {
|
||||
pending = append(pending, m)
|
||||
}
|
||||
}
|
||||
|
||||
if len(pending) == 0 {
|
||||
t.Logf("no pending migrations — scratch DB applied set covers every on-disk version (%d total)",
|
||||
len(onDisk))
|
||||
return
|
||||
}
|
||||
t.Logf("scratch DB has %d/%d on-disk migrations applied; walking %d pending",
|
||||
len(applied), len(onDisk), len(pending))
|
||||
|
||||
for _, m := range migs {
|
||||
for _, m := range pending {
|
||||
t.Run(fmt.Sprintf("%03d_%s", m.version, m.name), func(t *testing.T) {
|
||||
body, err := migrationFS.ReadFile("migrations/" + m.filename)
|
||||
if err != nil {
|
||||
@@ -110,10 +103,10 @@ func TestMigrations_DryRun(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("begin: %v", err)
|
||||
}
|
||||
// Always rollback; the dry-run must not leave the scratch DB
|
||||
// at a different version than where it started. Rollback is
|
||||
// safe to call even after a failed Exec — Postgres aborts the
|
||||
// transaction internally on the first error.
|
||||
// Always rollback; the dry-run must not leave the scratch
|
||||
// DB at a different applied set than where it started.
|
||||
// Rollback is safe after a failed Exec — Postgres aborts
|
||||
// the transaction internally on the first error.
|
||||
defer func() { _ = tx.Rollback() }()
|
||||
|
||||
if _, err := tx.Exec(string(body)); err != nil {
|
||||
@@ -123,76 +116,30 @@ func TestMigrations_DryRun(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// currentTrackerVersion reads the latest version + dirty flag from the
|
||||
// `public.paliad_schema_migrations` tracker. Returns (0, false, nil) when the
|
||||
// tracker doesn't exist yet — that's the "fresh scratch DB" path.
|
||||
// readAppliedVersions returns the set of versions present in
|
||||
// paliad.applied_migrations on the scratch DB. Missing table → empty set
|
||||
// (fresh-DB path; the table only exists after the runner has been called).
|
||||
//
|
||||
// We don't use golang-migrate's API to read this because golang-migrate's
|
||||
// driver locks the tracker row on read; a test runner that calls this while
|
||||
// the developer has paliad running locally would race. A plain SELECT is
|
||||
// race-safe and matches what `psql` would show.
|
||||
func currentTrackerVersion(conn *sql.DB) (version int, dirty bool, err error) {
|
||||
const q = `SELECT version, dirty FROM public.paliad_schema_migrations LIMIT 1`
|
||||
row := conn.QueryRow(q)
|
||||
if scanErr := row.Scan(&version, &dirty); scanErr != nil {
|
||||
// Missing table → fresh DB → start at 0. lib/pq surfaces this
|
||||
// as `pq.Error.Code = "42P01"` (undefined_table); the simpler
|
||||
// sql.ErrNoRows fires if the table exists but is empty (also
|
||||
// fresh-DB-shaped).
|
||||
if errors.Is(scanErr, sql.ErrNoRows) {
|
||||
return 0, false, nil
|
||||
}
|
||||
if strings.Contains(scanErr.Error(), "does not exist") {
|
||||
return 0, false, nil
|
||||
}
|
||||
return 0, false, scanErr
|
||||
}
|
||||
return version, dirty, nil
|
||||
}
|
||||
|
||||
// loadPendingMigrations returns every *.up.sql in the embedded FS whose
|
||||
// version is greater than startVersion, sorted by version ascending. A
|
||||
// filename like "098_submission_codes_prefix_and_rename.up.sql" yields
|
||||
// version=98, name="submission_codes_prefix_and_rename".
|
||||
func loadPendingMigrations(startVersion int) ([]migration, error) {
|
||||
entries, err := migrationFS.ReadDir("migrations")
|
||||
// We don't pre-create the table here because the dry-run is supposed to be
|
||||
// a passive observer — it must not mutate the scratch DB outside of its
|
||||
// own per-mig BEGIN/ROLLBACK probes. A "table doesn't exist" outcome is
|
||||
// the right read against a virgin scratch DB.
|
||||
func readAppliedVersions(conn *sql.DB) (map[int]bool, error) {
|
||||
rows, err := conn.Query(`SELECT version FROM paliad.applied_migrations`)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("read migrations dir: %w", err)
|
||||
if strings.Contains(err.Error(), "does not exist") {
|
||||
return map[int]bool{}, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
var out []migration
|
||||
for _, e := range entries {
|
||||
name := e.Name()
|
||||
if !strings.HasSuffix(name, ".up.sql") {
|
||||
continue
|
||||
defer rows.Close()
|
||||
out := map[int]bool{}
|
||||
for rows.Next() {
|
||||
var v int
|
||||
if err := rows.Scan(&v); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
v, n, ok := parseMigrationName(name)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("unparseable migration filename: %s "+
|
||||
"(expected NNN_description.up.sql)", name)
|
||||
}
|
||||
if v <= startVersion {
|
||||
continue
|
||||
}
|
||||
out = append(out, migration{version: v, name: n, filename: name})
|
||||
out[v] = true
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool { return out[i].version < out[j].version })
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// parseMigrationName splits "NNN_description.up.sql" into (NNN, description).
|
||||
// Returns ok=false on any deviation from that shape.
|
||||
func parseMigrationName(filename string) (version int, name string, ok bool) {
|
||||
base := strings.TrimSuffix(filename, ".up.sql")
|
||||
if base == filename { // suffix wasn't present
|
||||
return 0, "", false
|
||||
}
|
||||
underscore := strings.IndexByte(base, '_')
|
||||
if underscore <= 0 {
|
||||
return 0, "", false
|
||||
}
|
||||
v, err := strconv.Atoi(base[:underscore])
|
||||
if err != nil {
|
||||
return 0, "", false
|
||||
}
|
||||
return v, base[underscore+1:], true
|
||||
return out, rows.Err()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user