diff --git a/cmd/server/main_smoke_test.go b/cmd/server/main_smoke_test.go index 061fdb0..0d123ba 100644 --- a/cmd/server/main_smoke_test.go +++ b/cmd/server/main_smoke_test.go @@ -3,20 +3,23 @@ // Three checks against TEST_DATABASE_URL: // // 1. db.ApplyMigrations does not panic and returns nil. -// 2. The migration tracker (public.paliad_schema_migrations) advances to -// the highest *.up.sql version on disk — no migrations were silently -// skipped, no "dirty=true" stragglers left behind. +// 2. paliad.applied_migrations covers every on-disk *.up.sql — no +// migration was silently skipped, no version is missing. The set +// contract is stronger than the old single-counter check: applied +// set must EQUAL on-disk set, not just reach the max version. // 3. The handler mux (with /healthz mounted) responds 200 to GET /healthz. // // This is the lightweight cousin of the migration dry-run gate // (internal/db/migrate_test.go): the dry-run catches per-migration syntax // errors before merge; this smoke confirms the apply+bind path the // container actually runs at boot. Together they cover the mig-098 / -// mig-099 class of crash-loops end-to-end. +// mig-099 class of crash-loops end-to-end, plus the mig-103 parallel-merge +// skip-hole that t-paliad-218 closed (m/paliad#44). // // Skipped without TEST_DATABASE_URL — matches the rest of the live-DB tests. // -// Design: docs/design-paliad-test-strategy-2026-05-19.md §5 Slice 1. +// Design: docs/design-paliad-test-strategy-2026-05-19.md §5 Slice 1 and +// docs/design-migration-runner-applied-set-2026-05-20.md §6. package main @@ -51,19 +54,23 @@ func TestBootSmoke(t *testing.T) { t.Fatalf("db.ApplyMigrations: %v", err) } - // (2) Assert the tracker advanced to the highest *.up.sql version we - // embed. If a migration was silently skipped or the tracker is dirty, - // the prod container would crash-loop — this turns that into a test - // failure with a precise reason. - expected := highestEmbeddedMigrationVersion(t) - got, dirty := readTrackerVersion(t, url) - if dirty { - t.Errorf("tracker reports dirty=true at version %d — investigate before deploying", got) + // (2) Assert the applied set equals the on-disk set. The new runner + // tracks applied state per-migration; a silently-skipped version + // would surface as a row missing from paliad.applied_migrations even + // though max(version) matches. Comparing sets — not just max — + // catches the failure mode the t-paliad-218 post-mortem documented. + onDisk := embeddedMigrationVersions(t) + applied := appliedMigrationVersions(t, url) + + if missing := setDiff(onDisk, applied); len(missing) > 0 { + t.Errorf("paliad.applied_migrations missing %d on-disk versions: %v "+ + "(a migration was skipped — investigate before deploying)", + len(missing), missing) } - if got != expected { - t.Errorf("tracker at version %d; expected %d (highest *.up.sql on disk). "+ - "A migration was skipped or applied out of order.", - got, expected) + if extra := setDiff(applied, onDisk); len(extra) > 0 { + t.Errorf("paliad.applied_migrations has %d versions with no on-disk file: %v "+ + "(orphan rows — either restore the file or DELETE the row)", + len(extra), extra) } // (3) Mount the public handlers (the same Register call main() makes, @@ -93,11 +100,16 @@ func TestBootSmoke(t *testing.T) { } } -// highestEmbeddedMigrationVersion finds max(N) over every NNN_*.up.sql -// file in internal/db/migrations/ on disk. Used as the expected tracker -// version after a clean apply. We read from disk (not the embed.FS in -// the db package — it's unexported) since the test runs from the repo. -func highestEmbeddedMigrationVersion(t *testing.T) int { +// embeddedMigrationVersions returns every N where N_*.up.sql exists in +// internal/db/migrations/ on disk. The boot smoke compares this set +// against paliad.applied_migrations to detect skipped or orphan +// migrations. +// +// Read from disk (not the embed.FS inside the db package — it's unexported) +// since the test runs from the repo. The two views must agree for the +// build to be self-consistent; if they diverge, the smoke test is the +// wrong place to learn about it (the build is). We trust them to match. +func embeddedMigrationVersions(t *testing.T) []int { t.Helper() root, err := repoRoot() if err != nil { @@ -129,24 +141,52 @@ func highestEmbeddedMigrationVersion(t *testing.T) int { t.Fatalf("no *.up.sql files found in %s", dir) } sort.Ints(versions) - return versions[len(versions)-1] + return versions } -// readTrackerVersion fetches the lone row from the tracker. golang-migrate -// keeps exactly one row; if we ever see zero or more, that's the dirty-state -// the test is designed to flag. -func readTrackerVersion(t *testing.T, url string) (version int, dirty bool) { +// appliedMigrationVersions reads paliad.applied_migrations and returns +// the sorted list of versions. Fails the test if the table doesn't exist — +// db.ApplyMigrations is supposed to have created it by this point. +func appliedMigrationVersions(t *testing.T, url string) []int { t.Helper() conn, err := sql.Open("postgres", url) if err != nil { t.Fatalf("open: %v", err) } defer conn.Close() - row := conn.QueryRow(`SELECT version, dirty FROM public.paliad_schema_migrations LIMIT 1`) - if err := row.Scan(&version, &dirty); err != nil { - t.Fatalf("read tracker: %v", err) + rows, err := conn.Query(`SELECT version FROM paliad.applied_migrations ORDER BY version`) + if err != nil { + t.Fatalf("read applied_migrations: %v", err) } - return version, dirty + defer rows.Close() + var out []int + for rows.Next() { + var v int + if err := rows.Scan(&v); err != nil { + t.Fatalf("scan: %v", err) + } + out = append(out, v) + } + if err := rows.Err(); err != nil { + t.Fatalf("rows: %v", err) + } + return out +} + +// setDiff returns the elements of a that are not in b. Inputs are sorted +// ascending; output preserves that ordering. +func setDiff(a, b []int) []int { + bset := make(map[int]bool, len(b)) + for _, v := range b { + bset[v] = true + } + var out []int + for _, v := range a { + if !bset[v] { + out = append(out, v) + } + } + return out } // repoRoot walks upward from the test binary's working directory until it diff --git a/go.mod b/go.mod index 6bb4f63..dbb1c9e 100644 --- a/go.mod +++ b/go.mod @@ -4,18 +4,19 @@ go 1.24.0 require ( github.com/golang-jwt/jwt/v5 v5.3.1 - github.com/golang-migrate/migrate/v4 v4.19.1 github.com/google/uuid v1.6.0 github.com/jmoiron/sqlx v1.4.0 github.com/lib/pq v1.12.3 + github.com/xuri/excelize/v2 v2.10.1 ) require ( + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/richardlehane/mscfb v1.0.6 // indirect github.com/richardlehane/msoleps v1.0.6 // indirect github.com/tiendc/go-deepcopy v1.7.2 // indirect github.com/xuri/efp v0.0.1 // indirect - github.com/xuri/excelize/v2 v2.10.1 // indirect github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect golang.org/x/crypto v0.48.0 // indirect golang.org/x/net v0.50.0 // indirect diff --git a/go.sum b/go.sum index 2d12ed6..be9aa52 100644 --- a/go.sum +++ b/go.sum @@ -1,39 +1,11 @@ filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= -github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= -github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= -github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= -github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= -github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= -github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= -github.com/containerd/errdefs/pkg v0.3.0/go.mod h1:NJw6s9HwNuRhnjJhM7pylWwMyAkmCQvQ4GpJHEqRLVk= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dhui/dktest v0.4.6 h1:+DPKyScKSEp3VLtbMDHcUq6V5Lm5zfZZVb0Sk7Ahom4= -github.com/dhui/dktest v0.4.6/go.mod h1:JHTSYDtKkvFNFHJKqCzVzqXecyv+tKt8EzceOmQOgbU= -github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= -github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= -github.com/docker/docker v28.3.3+incompatible h1:Dypm25kh4rmk49v1eiVbsAtpAsYURjYkaKubwuBdxEI= -github.com/docker/docker v28.3.3+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= -github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c= -github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc= -github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= -github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= -github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= -github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= -github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= -github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= -github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= -github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= -github.com/golang-migrate/migrate/v4 v4.19.1 h1:OCyb44lFuQfYXYLx1SCxPZQGU7mcaZ7gH9yH4jSFbBA= -github.com/golang-migrate/migrate/v4 v4.19.1/go.mod h1:CTcgfjxhaUtsLipnLoQRWCrjYXycRz/g5+RWDuYgPrE= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= @@ -43,26 +15,14 @@ github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= -github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= -github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= -github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= -github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= -github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= -github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= -github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= -github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= -github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM= -github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= -github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/richardlehane/mscfb v1.0.6 h1:eN3bvvZCp00bs7Zf52bxNwAx5lJDBK1tCuH19qq5aC8= github.com/richardlehane/mscfb v1.0.6/go.mod h1:pe0+IUIc0AHh0+teNzBlJCtSyZdFOGgV4ZK9bsoV+Jo= github.com/richardlehane/msoleps v1.0.6 h1:9BvkpjvD+iUBalUY4esMwv6uBkfOip/Lzvd93jvR9gg= github.com/richardlehane/msoleps v1.0.6/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/tiendc/go-deepcopy v1.7.2 h1:Ut2yYR7W9tWjTQitganoIue4UGxZwCcJy3orjrrIj44= github.com/tiendc/go-deepcopy v1.7.2/go.mod h1:4bKjNC2r7boYOkD2IOuZpYjmlDdzjbpTRyCx+goBCJQ= github.com/xuri/efp v0.0.1 h1:fws5Rv3myXyYni8uwj2qKjVaRP30PdjeYe2Y6FDsCL8= @@ -71,22 +31,12 @@ github.com/xuri/excelize/v2 v2.10.1 h1:V62UlqopMqha3kOpnlHy2CcRVw1V8E63jFoWUmMzx github.com/xuri/excelize/v2 v2.10.1/go.mod h1:iG5tARpgaEeIhTqt3/fgXCGoBRt4hNXgCp3tfXKoOIc= github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 h1:+C0TIdyyYmzadGaL/HBLbf3WdLgC29pgyhTjAT/0nuE= github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ= -go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= -go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= -go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= -go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= -go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= -go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= -go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= -go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= +golang.org/x/image v0.25.0 h1:Y6uW6rH1y5y/LK1J8BPWZtr6yZ7hrsy6hFrXjgsc2fQ= +golang.org/x/image v0.25.0/go.mod h1:tCAmOEGthTtkalusGp1g3xa2gke8J6c2N565dTyl9Rs= golang.org/x/net v0.50.0 h1:ucWh9eiCGyDR3vtzso0WMQinm2Dnt8cFMuQa9K33J60= golang.org/x/net v0.50.0/go.mod h1:UgoSli3F/pBgdJBHCTc+tp3gmrU4XswgGRgtnwWTfyM= -golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= -golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/internal/db/migrate.go b/internal/db/migrate.go index f187f3d..68b2fe7 100644 --- a/internal/db/migrate.go +++ b/internal/db/migrate.go @@ -1,46 +1,78 @@ // Package db owns the Paliad Postgres connection and embedded schema migrations. // -// Migrations are golang-migrate format (NNN_description.up.sql / .down.sql) and -// live in the migrations/ subdirectory, embedded into the binary so a single -// artifact ships with its schema. The server applies pending migrations at -// startup before binding the HTTP listener. +// Migrations are NNN_description.up.sql / .down.sql files in the migrations/ +// subdirectory, embedded into the binary so a single artifact ships with its +// schema. The server applies pending migrations at startup before binding +// the HTTP listener. +// +// The runner tracks applied state as a set, not a counter: every applied +// migration gets its own row in paliad.applied_migrations(version PK, name, +// applied_at, checksum). On every deploy, pending = on_disk \ applied, in +// ascending version order. Gaps in the version space are first-class — a +// version that's missing from applied_migrations runs on the next deploy, +// regardless of which higher versions are already applied. +// +// This is what closes the parallel-merge skip-hole that the single-counter +// tracker (golang-migrate) silently fell into on 2026-05-20 (m/paliad#44). +// Background and design: docs/design-migration-runner-applied-set-2026-05-20.md. +// +// .down.sql files ship in the embedded FS as reference material but are not +// auto-applied — there are no call sites for rolling back, and operator +// recovery (psql .down.sql + DELETE FROM paliad.applied_migrations WHERE +// version=N) is the documented path. If a real call site for auto-rollback +// materializes later, add it as a focused follow-up. package db import ( + "crypto/sha256" "database/sql" "embed" "errors" "fmt" + "hash/fnv" + "sort" + "strconv" + "strings" - "github.com/golang-migrate/migrate/v4" - "github.com/golang-migrate/migrate/v4/database/postgres" - "github.com/golang-migrate/migrate/v4/source/iofs" _ "github.com/lib/pq" ) //go:embed migrations/*.sql var migrationFS embed.FS -// migrationsTable is the name of the golang-migrate tracking table. We use a -// uniquely-named table (not the default "schema_migrations") because the -// production Supabase instance hosts multiple apps in the `public` schema, -// and a differently-shaped `public.schema_migrations` already exists there. -// Using "paliad_schema_migrations" prevents collision at startup. +// advisoryLockID is the Postgres advisory-lock id the runner takes around +// the apply loop. Derived once from the table name so the value is stable +// across processes — two concurrent deploys (rolling Dokploy update, dev +// laptop hitting the same scratch DB as CI) serialize on this id rather +// than racing on the pending set. // -// The table lives in the `public` schema (golang-migrate's default) rather -// than `paliad`. Rationale: migration 001's down-step is -// DROP SCHEMA IF EXISTS paliad CASCADE -// which would take the tracking table with it — breaking any subsequent -// migrate.Up() call. Keeping the tracker in `public` makes the down-path -// safe and idempotent. -const migrationsTable = "paliad_schema_migrations" +// FNV-1a-64 is good enough: the id only has to be a stable int64, not +// cryptographically uniform. Process-wide constant. +var advisoryLockID = func() int64 { + h := fnv.New64a() + _, _ = h.Write([]byte("paliad.applied_migrations")) + return int64(h.Sum64()) +}() -// ApplyMigrations runs all pending up-migrations against the given database -// URL. Returns nil if no migrations were pending. Safe to call repeatedly. +// migration is one *.up.sql file from the embedded FS. +type migration struct { + version int + name string + filename string +} + +// ApplyMigrations applies every pending up-migration to the given database. // -// Pre-creates the `paliad` schema before invoking golang-migrate because the -// first migration creates it and golang-migrate's tracking table would -// otherwise be created in whatever `current_schema()` happens to be. +// Safe to call repeatedly; a fully-applied tree is a no-op. Returns the +// first error encountered (with the offending migration filename wrapped +// in the message) and leaves the rest of pending unapplied — same fail-fast +// posture as the previous golang-migrate runner. +// +// On first deploy of this code path against a database that still has the +// legacy paliad.paliad_schema_migrations counter at version N, the runner +// seeds paliad.applied_migrations with rows 1..N (checksum NULL) before +// applying anything new. The first deploy is therefore effectively a +// no-op against the schema — the bootstrap just relabels existing state. func ApplyMigrations(databaseURL string) error { if databaseURL == "" { return errors.New("database URL is empty") @@ -51,39 +83,250 @@ func ApplyMigrations(databaseURL string) error { return fmt.Errorf("open database: %w", err) } defer conn.Close() - if err := conn.Ping(); err != nil { return fmt.Errorf("ping database: %w", err) } - // Bootstrap the paliad schema so later migrations can target it cleanly. - // This duplicates migration 001, but is idempotent via IF NOT EXISTS and - // ensures the schema exists before golang-migrate touches the DB. + // Ensure the paliad schema exists. Mig 001 also creates it; the + // applied_migrations table lives in paliad.* and gets created before + // any migrations run, so the schema must exist first. if _, err := conn.Exec(`CREATE SCHEMA IF NOT EXISTS paliad`); err != nil { return fmt.Errorf("ensure paliad schema: %w", err) } - source, err := iofs.New(migrationFS, "migrations") - if err != nil { - return fmt.Errorf("open migration source: %w", err) + if _, err := conn.Exec(`SELECT pg_advisory_lock($1)`, advisoryLockID); err != nil { + return fmt.Errorf("acquire advisory lock: %w", err) + } + defer func() { + _, _ = conn.Exec(`SELECT pg_advisory_unlock($1)`, advisoryLockID) + }() + + if _, err := conn.Exec(` + CREATE TABLE IF NOT EXISTS paliad.applied_migrations ( + version int NOT NULL PRIMARY KEY, + name text NOT NULL, + applied_at timestamptz NOT NULL DEFAULT now(), + checksum text NULL + ) + `); err != nil { + return fmt.Errorf("create applied_migrations: %w", err) } - driver, err := postgres.WithInstance(conn, &postgres.Config{ - // Unique tracking-table name avoids collision with pre-existing - // public.schema_migrations owned by other apps on this Postgres. - MigrationsTable: migrationsTable, - }) + onDisk, err := scanEmbeddedMigrations() if err != nil { - return fmt.Errorf("create migration driver: %w", err) + return fmt.Errorf("scan embedded migrations: %w", err) } - m, err := migrate.NewWithInstance("iofs", source, "postgres", driver) - if err != nil { - return fmt.Errorf("create migrator: %w", err) + if err := bootstrapFromLegacyTracker(conn, onDisk); err != nil { + return fmt.Errorf("bootstrap from legacy tracker: %w", err) } - if err := m.Up(); err != nil && !errors.Is(err, migrate.ErrNoChange) { - return fmt.Errorf("apply migrations: %w", err) + applied, err := readAppliedMigrations(conn) + if err != nil { + return fmt.Errorf("read applied_migrations: %w", err) + } + + if err := checkNameAgreement(onDisk, applied); err != nil { + return err + } + + for _, m := range onDisk { + if _, ok := applied[m.version]; ok { + continue + } + if err := applyOne(conn, m); err != nil { + return fmt.Errorf("apply %s: %w", m.filename, err) + } } return nil } + +// scanEmbeddedMigrations returns every NNN_*.up.sql in the embedded FS, +// sorted by version ascending. Hard-fails on two files sharing the same +// version prefix — that's the failure mode the parallel-merge incident +// exposed, and the runner refuses to start rather than silently picking one. +func scanEmbeddedMigrations() ([]migration, error) { + entries, err := migrationFS.ReadDir("migrations") + if err != nil { + return nil, fmt.Errorf("read migrations dir: %w", err) + } + seen := map[int]string{} + var out []migration + for _, e := range entries { + name := e.Name() + if !strings.HasSuffix(name, ".up.sql") { + continue + } + v, n, ok := parseMigrationFilename(name) + if !ok { + return nil, fmt.Errorf("unparseable migration filename %q "+ + "(expected NNN_description.up.sql)", name) + } + if prior, dup := seen[v]; dup { + return nil, fmt.Errorf("two migrations at version %d: %q and %q — "+ + "rename one and redeploy", v, prior, name) + } + seen[v] = name + out = append(out, migration{version: v, name: n, filename: name}) + } + sort.Slice(out, func(i, j int) bool { return out[i].version < out[j].version }) + return out, nil +} + +// parseMigrationFilename splits "NNN_description.up.sql" into (NNN, description). +// Returns ok=false on any deviation from that shape. +func parseMigrationFilename(filename string) (version int, name string, ok bool) { + base := strings.TrimSuffix(filename, ".up.sql") + if base == filename { + return 0, "", false + } + underscore := strings.IndexByte(base, '_') + if underscore <= 0 { + return 0, "", false + } + v, err := strconv.Atoi(base[:underscore]) + if err != nil { + return 0, "", false + } + return v, base[underscore+1:], true +} + +// readAppliedMigrations returns a map version → name from +// paliad.applied_migrations. Returns an empty map (no error) if the table +// is missing — that's the fresh-DB path before the CREATE TABLE in +// ApplyMigrations runs against it. +func readAppliedMigrations(conn *sql.DB) (map[int]string, error) { + rows, err := conn.Query(`SELECT version, name FROM paliad.applied_migrations`) + if err != nil { + if strings.Contains(err.Error(), "does not exist") { + return map[int]string{}, nil + } + return nil, err + } + defer rows.Close() + out := map[int]string{} + for rows.Next() { + var v int + var n string + if err := rows.Scan(&v, &n); err != nil { + return nil, err + } + out[v] = n + } + return out, rows.Err() +} + +// bootstrapFromLegacyTracker seeds paliad.applied_migrations from +// paliad.paliad_schema_migrations on the first deploy of the new runner +// against a DB that previously ran golang-migrate. +// +// Behavior: +// - applied_migrations already has rows → no-op (idempotent). +// - applied_migrations empty AND legacy tracker missing → no-op +// (virgin DB; the apply loop will run everything from scratch). +// - applied_migrations empty AND legacy tracker present, clean, version N +// → INSERT rows for every on-disk version ≤ N with checksum NULL. +// - applied_migrations empty AND legacy tracker dirty → hard-fail. +// The operator must recover the legacy tracker first (it being dirty +// means a prior golang-migrate run crashed mid-flight); we will not +// paper over an unknown state by guessing what landed. +// +// Backfilled rows have checksum NULL because the legacy runner didn't hash +// anything — we can't fabricate a provenance hash today without falsely +// claiming we know the byte-identity of what shipped historically. +func bootstrapFromLegacyTracker(conn *sql.DB, onDisk []migration) error { + var count int + if err := conn.QueryRow(`SELECT count(*) FROM paliad.applied_migrations`).Scan(&count); err != nil { + return fmt.Errorf("count applied_migrations: %w", err) + } + if count > 0 { + return nil + } + + var legacyVer int + var legacyDirty bool + err := conn.QueryRow(`SELECT version, dirty FROM paliad.paliad_schema_migrations LIMIT 1`). + Scan(&legacyVer, &legacyDirty) + if errors.Is(err, sql.ErrNoRows) { + return nil + } + if err != nil { + if strings.Contains(err.Error(), "does not exist") { + return nil + } + return fmt.Errorf("read legacy tracker: %w", err) + } + if legacyDirty { + return fmt.Errorf("legacy paliad.paliad_schema_migrations is dirty at version %d — "+ + "recover manually before deploying", legacyVer) + } + + for _, m := range onDisk { + if m.version > legacyVer { + continue + } + if _, err := conn.Exec(` + INSERT INTO paliad.applied_migrations(version, name, applied_at, checksum) + VALUES ($1, $2, now(), NULL) + ON CONFLICT (version) DO NOTHING + `, m.version, m.name); err != nil { + return fmt.Errorf("backfill version %d: %w", m.version, err) + } + } + return nil +} + +// checkNameAgreement hard-fails if a version that's already applied has a +// different name on disk than in the DB. Catches the post-merge rename +// accident where someone renames `098_foo.up.sql` to `098_bar.up.sql` — +// the SQL has already run on prod with the old name, so the rename is a +// lie about history. Operator recovery: revert the rename, or update the +// DB row if the rename is intentional. +// +// Backfilled rows have a name pulled from the on-disk filename, so an +// out-of-the-box backfill never trips this check. +func checkNameAgreement(onDisk []migration, applied map[int]string) error { + for _, m := range onDisk { + dbName, ok := applied[m.version] + if !ok { + continue + } + if dbName != m.name { + return fmt.Errorf("migration %d: disk name %q != DB name %q "+ + "(renamed after apply? revert the rename, or UPDATE paliad.applied_migrations "+ + "SET name=%q WHERE version=%d if the rename is intentional)", + m.version, m.name, dbName, m.name, m.version) + } + } + return nil +} + +// applyOne runs one migration's .up.sql plus its INSERT row in a single +// transaction. All-or-nothing per migration: if the SQL fails, the row +// isn't inserted and the next deploy re-tries from the same point. If +// the INSERT fails (e.g. PK violation because the lock wasn't held), the +// SQL rolls back too. +func applyOne(conn *sql.DB, m migration) error { + body, err := migrationFS.ReadFile("migrations/" + m.filename) + if err != nil { + return fmt.Errorf("read %s: %w", m.filename, err) + } + checksum := fmt.Sprintf("%x", sha256.Sum256(body)) + + tx, err := conn.Begin() + if err != nil { + return fmt.Errorf("begin tx: %w", err) + } + defer func() { _ = tx.Rollback() }() + + if _, err := tx.Exec(string(body)); err != nil { + return fmt.Errorf("exec sql: %w", err) + } + if _, err := tx.Exec(` + INSERT INTO paliad.applied_migrations(version, name, applied_at, checksum) + VALUES ($1, $2, now(), $3) + `, m.version, m.name, checksum); err != nil { + return fmt.Errorf("record applied: %w", err) + } + return tx.Commit() +} diff --git a/internal/db/migrate_test.go b/internal/db/migrate_test.go index 6b7b774..a1743d3 100644 --- a/internal/db/migrate_test.go +++ b/internal/db/migrate_test.go @@ -1,60 +1,49 @@ // Package db tests — migration dry-run gate. // // This is the test that catches mig-N crash-loops before they reach prod. -// The convention since t-paliad-098/099 is that paliad migrations land in -// numeric order on a single trunk; the next deploy runs whichever ones are -// pending against the live `public.paliad_schema_migrations` tracker. A -// migration that compiles cleanly but fails on apply (typo, missing column, -// wrong CHECK shape) crashes the Dokploy container loop before paliad.de -// finishes binding :8080, and the only way to learn about it today is to -// watch the deploy log. +// The new runner tracks applied state as a set in paliad.applied_migrations +// (one row per migration; see migrate.go). A migration that compiles cleanly +// but fails on apply (typo, missing column, wrong CHECK shape) crashes the +// Dokploy container loop before paliad.de finishes binding :8080, and the +// only way to learn about it today is to watch the deploy log. // // TestMigrations_DryRun closes that gap: for every *.up.sql in this -// directory whose version is greater than the scratch DB's current tracker -// version, it opens a transaction, runs the SQL, and ROLLBACKs. Any error -// fails the test with the file name + Postgres error. Always non-destructive -// — the ROLLBACK runs even on success, so the scratch DB stays at its -// starting version. +// directory whose version is NOT present in paliad.applied_migrations on +// the scratch DB, it opens a transaction, runs the SQL, and ROLLBACKs. +// Any error fails the test with the file name + Postgres error. Always +// non-destructive — the ROLLBACK runs even on success, so the scratch DB +// stays at its starting set. +// +// "Pending" means: a version that's on disk but not in applied_migrations. +// In CI against a fresh scratch DB (where applied_migrations either +// doesn't exist or is empty), every migration is pending and gets +// verified. On a developer laptop whose scratch DB is already at HEAD, +// no migrations are pending and the test logs and passes — the protection +// only kicks in the moment a new *.up.sql lands in the tree before the +// developer runs `db.ApplyMigrations` against the same scratch DB. // // Requires TEST_DATABASE_URL (same pattern as the rest of the live-DB // tests). Skipped without it. // -// Design: docs/design-paliad-test-strategy-2026-05-19.md §5 Slice 1. +// Design: docs/design-paliad-test-strategy-2026-05-19.md §5 Slice 1 and +// docs/design-migration-runner-applied-set-2026-05-20.md §6. package db import ( "database/sql" - "errors" "fmt" "os" - "sort" - "strconv" "strings" "testing" _ "github.com/lib/pq" ) -// migration is one *.up.sql file from the embedded migrations FS. -type migration struct { - version int - name string - filename string -} - // TestMigrations_DryRun walks every pending *.up.sql in numeric order, // applies each inside its own BEGIN/ROLLBACK against the scratch DB, and // fails the test on the first SQL error. Reports per-file as a sub-test so // `go test -v` shows which migration failed. -// -// What "pending" means: greater than the scratch DB's current tracker -// version (or 0 if the tracker doesn't exist yet). In CI against a fresh -// scratch DB, every migration is pending and gets verified. On a developer -// laptop whose scratch DB is already at HEAD, no migrations are pending and -// the test logs the start version and passes — the protection only kicks in -// the moment a new *.up.sql lands in the tree before the developer runs -// `db.ApplyMigrations` against the same scratch DB. func TestMigrations_DryRun(t *testing.T) { url := os.Getenv("TEST_DATABASE_URL") if url == "" { @@ -79,28 +68,32 @@ func TestMigrations_DryRun(t *testing.T) { t.Fatalf("ensure paliad schema: %v", err) } - startVersion, dirty, err := currentTrackerVersion(conn) + applied, err := readAppliedVersions(conn) if err != nil { - t.Fatalf("read tracker: %v", err) + t.Fatalf("read applied_migrations: %v", err) } - if dirty { - t.Fatalf("tracker is dirty at version %d — fix that first (DROP the tracker row "+ - "or restore from backup); the dry-run cannot trust a dirty starting state", - startVersion) - } - t.Logf("scratch DB tracker at version %d; walking pending migrations from %d upward", - startVersion, startVersion+1) - migs, err := loadPendingMigrations(startVersion) + onDisk, err := scanEmbeddedMigrations() if err != nil { - t.Fatalf("load migrations: %v", err) + t.Fatalf("scan embedded migrations: %v", err) } - if len(migs) == 0 { - t.Logf("no pending migrations — scratch DB is at HEAD (%d)", startVersion) + + var pending []migration + for _, m := range onDisk { + if !applied[m.version] { + pending = append(pending, m) + } + } + + if len(pending) == 0 { + t.Logf("no pending migrations — scratch DB applied set covers every on-disk version (%d total)", + len(onDisk)) return } + t.Logf("scratch DB has %d/%d on-disk migrations applied; walking %d pending", + len(applied), len(onDisk), len(pending)) - for _, m := range migs { + for _, m := range pending { t.Run(fmt.Sprintf("%03d_%s", m.version, m.name), func(t *testing.T) { body, err := migrationFS.ReadFile("migrations/" + m.filename) if err != nil { @@ -110,10 +103,10 @@ func TestMigrations_DryRun(t *testing.T) { if err != nil { t.Fatalf("begin: %v", err) } - // Always rollback; the dry-run must not leave the scratch DB - // at a different version than where it started. Rollback is - // safe to call even after a failed Exec — Postgres aborts the - // transaction internally on the first error. + // Always rollback; the dry-run must not leave the scratch + // DB at a different applied set than where it started. + // Rollback is safe after a failed Exec — Postgres aborts + // the transaction internally on the first error. defer func() { _ = tx.Rollback() }() if _, err := tx.Exec(string(body)); err != nil { @@ -123,76 +116,30 @@ func TestMigrations_DryRun(t *testing.T) { } } -// currentTrackerVersion reads the latest version + dirty flag from the -// `public.paliad_schema_migrations` tracker. Returns (0, false, nil) when the -// tracker doesn't exist yet — that's the "fresh scratch DB" path. +// readAppliedVersions returns the set of versions present in +// paliad.applied_migrations on the scratch DB. Missing table → empty set +// (fresh-DB path; the table only exists after the runner has been called). // -// We don't use golang-migrate's API to read this because golang-migrate's -// driver locks the tracker row on read; a test runner that calls this while -// the developer has paliad running locally would race. A plain SELECT is -// race-safe and matches what `psql` would show. -func currentTrackerVersion(conn *sql.DB) (version int, dirty bool, err error) { - const q = `SELECT version, dirty FROM public.paliad_schema_migrations LIMIT 1` - row := conn.QueryRow(q) - if scanErr := row.Scan(&version, &dirty); scanErr != nil { - // Missing table → fresh DB → start at 0. lib/pq surfaces this - // as `pq.Error.Code = "42P01"` (undefined_table); the simpler - // sql.ErrNoRows fires if the table exists but is empty (also - // fresh-DB-shaped). - if errors.Is(scanErr, sql.ErrNoRows) { - return 0, false, nil - } - if strings.Contains(scanErr.Error(), "does not exist") { - return 0, false, nil - } - return 0, false, scanErr - } - return version, dirty, nil -} - -// loadPendingMigrations returns every *.up.sql in the embedded FS whose -// version is greater than startVersion, sorted by version ascending. A -// filename like "098_submission_codes_prefix_and_rename.up.sql" yields -// version=98, name="submission_codes_prefix_and_rename". -func loadPendingMigrations(startVersion int) ([]migration, error) { - entries, err := migrationFS.ReadDir("migrations") +// We don't pre-create the table here because the dry-run is supposed to be +// a passive observer — it must not mutate the scratch DB outside of its +// own per-mig BEGIN/ROLLBACK probes. A "table doesn't exist" outcome is +// the right read against a virgin scratch DB. +func readAppliedVersions(conn *sql.DB) (map[int]bool, error) { + rows, err := conn.Query(`SELECT version FROM paliad.applied_migrations`) if err != nil { - return nil, fmt.Errorf("read migrations dir: %w", err) + if strings.Contains(err.Error(), "does not exist") { + return map[int]bool{}, nil + } + return nil, err } - var out []migration - for _, e := range entries { - name := e.Name() - if !strings.HasSuffix(name, ".up.sql") { - continue + defer rows.Close() + out := map[int]bool{} + for rows.Next() { + var v int + if err := rows.Scan(&v); err != nil { + return nil, err } - v, n, ok := parseMigrationName(name) - if !ok { - return nil, fmt.Errorf("unparseable migration filename: %s "+ - "(expected NNN_description.up.sql)", name) - } - if v <= startVersion { - continue - } - out = append(out, migration{version: v, name: n, filename: name}) + out[v] = true } - sort.Slice(out, func(i, j int) bool { return out[i].version < out[j].version }) - return out, nil -} - -// parseMigrationName splits "NNN_description.up.sql" into (NNN, description). -// Returns ok=false on any deviation from that shape. -func parseMigrationName(filename string) (version int, name string, ok bool) { - base := strings.TrimSuffix(filename, ".up.sql") - if base == filename { // suffix wasn't present - return 0, "", false - } - underscore := strings.IndexByte(base, '_') - if underscore <= 0 { - return 0, "", false - } - v, err := strconv.Atoi(base[:underscore]) - if err != nil { - return 0, "", false - } - return v, base[underscore+1:], true + return out, rows.Err() }