Merge: t-paliad-282 Slice A — CI/CD pre-deploy gate + snapshot-based migration smoke (m/paliad#114)
Some checks failed
Paliad CI gate / build (push) Has been cancelled
Paliad CI gate / test-go (push) Has been cancelled
Paliad CI gate / deploy (push) Has been cancelled

This commit is contained in:
mAi
2026-05-25 17:42:51 +02:00
11 changed files with 7006 additions and 27 deletions

242
.gitea/workflows/test.yaml Normal file
View File

@@ -0,0 +1,242 @@
# Paliad CI gate (t-paliad-282 / m/paliad#114).
#
# Single workflow, two purposes:
#
# - On every push: gate tier — build + unit + migration smoke. Red gate
# means no further work and (on main) no deploy.
# - On push to main with gate green: deploy step — calls the Dokploy
# compose-deploy API for paliad's compose Zx147ycurfYagKRl_Zzyo, then
# polls /health/ready until the new container reports 200.
#
# The deploy step REPLACES the previous Gitea-push → Dokploy webhook path
# (per m's Q11.4 pick: soft-launch with both alive for ~1 week, then
# disable the Dokploy auto-deploy toggle). Soft-launch leaves Dokploy's
# autoDeploy=true intact today — the workflow's deploy step is additive
# and idempotent (Dokploy's deploy is itself idempotent).
#
# Catches the three failure classes from 2026-05-25:
#
# - brunel slot collision (~13:20) — TestMigrations_NoDuplicateSlot,
# pure unit, no DB needed.
# - hermes dropped-col refs (~16:05) — TestBootSmoke, applies all NEW
# migrations (those not in the snapshot) end-to-end against a
# scratch DB restored from internal/db/testdata/prod-snapshot.sql.
# - mig 129 42501 ownership (~14:56→) — TestMigrations_EndToEndAsAppRole,
# applies new migrations as the prod-shaped `postgres` role (which
# is NOT a superuser on supabase/postgres — same shape as
# youpc-supabase prod, see internal/db/testdata/README.md).
#
# Snapshot approach: dump paliad schema + applied_migrations rows from
# prod, commit them. CI restores → ApplyMigrations sees existing migs as
# applied, only runs NEW migs (the ones this PR adds). This sidesteps the
# fresh-DB idempotence requirement on historical migrations (some of
# which use raw COMMIT or pre-installed extensions and can't be replayed
# from scratch). To refresh: `make refresh-snapshot`.
#
# Design: docs/design-cicd-pre-deploy-gate-2026-05-25.md (cronus inventor
# shift, t-paliad-282).
name: Paliad CI gate
on:
push:
branches:
- main
- 'mai/**'
pull_request:
branches: [main]
env:
GO_VERSION: '1.24'
BUN_VERSION: '1.2'
jobs:
# Gate job 1 — pure build. Catches go/bun build breakage that local
# `go build` would catch but which a worker might have skipped before
# pushing. Fast (~60 s) so a red here surfaces immediately.
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# Go toolchain version comes from the workflow-level GO_VERSION env.
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: ${{ env.GO_VERSION }}
cache: true
# Compile every package first so plain build errors surface before vet.
- name: go build
run: go build ./...
- name: go vet
run: go vet ./...
# Bun version comes from the workflow-level BUN_VERSION env.
- name: Set up Bun
uses: oven-sh/setup-bun@v2
with:
bun-version: ${{ env.BUN_VERSION }}
# Frontend install + build run inside ./frontend (working-directory).
- name: bun install + build
working-directory: frontend
run: |
bun install --frozen-lockfile
bun run build
# Gate job 2 — Go test suite + migration smoke against snapshot-restored
# scratch DB.
#
# The Postgres service container uses the same supabase/postgres image
# as youpc-supabase prod. The CI scratch DB starts empty; a setup step
# installs pg_trgm + restores the snapshot. After restore, paliad
# schema is at HEAD-of-snapshot and applied_migrations covers every
# migration up to (and including) the snapshot's max version.
#
# ApplyMigrations called in TestBootSmoke / TestMigrations_EndToEndAsAppRole
# sees the snapshot's applied set, finds whatever NEW migrations this
# PR added on top, and applies only those. The role-split smoke runs as
# `postgres` (which is NOT a superuser on supabase/postgres, matching
# the prod role topology) — any new migration that needs supabase_admin
# privilege fails here as it would in prod.
test-go:
runs-on: ubuntu-latest
services:
# supabase/postgres baked-in auth schema + supabase role topology
# matches youpc-supabase prod. `postgres` here is NOT a superuser
# (verified live: \du postgres shows "Create role, Create DB,
# Replication, Bypass RLS" — no Superuser). This is the prod-shaped
# role the deploy uses.
#
# NOTE(review): every step below reaches this service via
# localhost:5432 (the published port). If the runner executes the job
# inside a container, the service is presumably reachable under its
# service name (`postgres`) rather than localhost — confirm against the
# runner that actually picks up this job.
postgres:
image: supabase/postgres:15.8.1.060
env:
POSTGRES_PASSWORD: ci
POSTGRES_DB: paliad_scratch
ports:
- 5432:5432
# Container health check: pg_isready every 5 s, 5 s timeout, up to
# 30 retries.
options: >-
--health-cmd "pg_isready -U postgres"
--health-interval 5s
--health-timeout 5s
--health-retries 30
steps:
- uses: actions/checkout@v4
- name: Set up Go
uses: actions/setup-go@v5
with:
go-version: ${{ env.GO_VERSION }}
cache: true
# psql is needed by the snapshot restore step below.
# NOTE(review): apt-get runs without sudo — assumes the job user is
# root (e.g. a container-backed runner); confirm for host-mode runners.
- name: Install postgresql-client
run: |
apt-get update -qq && apt-get install -y -qq postgresql-client
# Snapshot restore. Two prep steps as supabase_admin (the actual
# superuser): GRANT CREATE so the `postgres` role can later create
# schemas if a new mig needs it; install pg_trgm so the snapshot's
# trigram indexes restore. Snapshot itself loads as `postgres`.
#
# NOTE(review): the snapshot is restored by the same `postgres` role
# the migration smoke later connects as, so the restored objects are
# presumably owned by that role — confirm this still reproduces the
# "must be owner" (42501) failure class the role-split smoke targets.
- name: Provision + restore snapshot
env:
PGPASSWORD: ci
run: |
set -euo pipefail
psql -h localhost -U supabase_admin -d paliad_scratch -v ON_ERROR_STOP=1 \
-c "GRANT CREATE ON DATABASE paliad_scratch TO postgres;" \
-c "CREATE EXTENSION IF NOT EXISTS pg_trgm;"
psql -h localhost -U postgres -d paliad_scratch -v ON_ERROR_STOP=1 \
-f internal/db/testdata/prod-snapshot.sql
# Pre-flight: catches brunel slot collision in seconds, no DB
# contact (still useful even though the test-go job has Postgres
# running, because the failure mode is independent).
- name: Migration coordination check
run: go test -count=1 -run TestMigrations_NoDuplicateSlot ./internal/db/
# Role-split end-to-end apply. Connects as `postgres` (NOT a
# superuser on supabase/postgres) and runs ApplyMigrations against
# the snapshot-restored DB. Existing migs are skipped (already in
# applied_migrations); NEW migs in this PR apply here. If a new
# migration assumes supabase_admin privilege, fails with the same
# 42501 error class that took paliad.de offline on 2026-05-25.
- name: Migration end-to-end (deploy role)
env:
TEST_APP_DATABASE_URL: postgres://postgres:ci@localhost:5432/paliad_scratch?sslmode=disable
run: go test -count=1 -run TestMigrations_EndToEndAsAppRole ./internal/db/
# Boot smoke. Confirms ApplyMigrations succeeds + applied set
# matches on-disk set + /healthz returns 200 + /health/ready
# returns 200 (the live-pool variant via TestHealthReady_Live).
- name: Boot smoke + readiness
env:
TEST_DATABASE_URL: postgres://postgres:ci@localhost:5432/paliad_scratch?sslmode=disable
run: go test -count=1 -run 'TestBootSmoke|TestHealthReady_Live' ./cmd/server/
# Full Go test suite WITHOUT TEST_DATABASE_URL so live-DB service
# tests skip (same shape as a developer laptop without a scratch
# DB). Live-DB tests in internal/services/* will be activated by a
# follow-up shift once the snapshot is verified stable across
# multiple PRs — they need investigation against supabase/postgres
# 15.8 (parameter type inference differs subtly from youpc-supabase).
# This step sets no env block, so neither TEST_DATABASE_URL nor
# TEST_APP_DATABASE_URL is defined here.
- name: go test ./... (pure + skip-on-no-DB)
run: go test -count=1 ./internal/... ./cmd/...
# Deploy step. Only runs on push to main and only after both gate jobs
# are green. Calls Dokploy's compose.deploy with the paliad compose ID
# (Zx147ycurfYagKRl_Zzyo) and polls /health/ready until it returns 200
# or times out.
#
# Skipped on PR / feature branch pushes — those run the gate tier as
# a status check but don't trigger a prod deploy. Dokploy's existing
# autoDeploy=true webhook continues to fire during the soft-launch
# window (per Q11.4); it can be disabled in the Dokploy UI once this
# workflow has gated ≥5 successful green deploys.
deploy:
  runs-on: ubuntu-latest
  needs: [build, test-go]
  if: github.ref == 'refs/heads/main' && github.event_name == 'push'
  steps:
    # Step 1 — ask Dokploy to redeploy the compose. The HTTP status is
    # captured explicitly: plain `curl -sS` exits 0 on a 4xx/5xx, which
    # would let a rejected deploy (bad token, wrong compose ID) fall
    # through to the health poll and be reported as a success.
    - name: Trigger Dokploy compose deploy
      env:
        DOKPLOY_KEY: ${{ secrets.DOKPLOY_TOKEN }}
        DOKPLOY_API: http://100.99.98.201:3000/api/trpc
        COMPOSE_ID: Zx147ycurfYagKRl_Zzyo
      run: |
        set -euo pipefail
        if [ -z "${DOKPLOY_KEY:-}" ]; then
          echo "ERROR: DOKPLOY_TOKEN secret is not configured."
          echo " Set the secret in Gitea repo settings before this step can deploy."
          exit 2
        fi
        echo "==> POST compose.deploy"
        resp=$(mktemp)
        trap 'rm -f "$resp"' EXIT
        # Body goes to a temp file, status code to stdout. A transport
        # failure makes curl exit non-zero, which aborts via `set -e`.
        code=$(curl -sS --connect-timeout 5 --max-time 30 \
          -o "$resp" -w '%{http_code}' \
          -X POST \
          -H "x-api-key: $DOKPLOY_KEY" \
          -H "Content-Type: application/json" \
          -d "{\"json\":{\"composeId\":\"$COMPOSE_ID\"}}" \
          "$DOKPLOY_API/compose.deploy")
        # Always log the response body — it is the only diagnostic for
        # auth / compose-id problems.
        cat "$resp"
        echo
        case "$code" in
          2??) ;;
          *)
            echo "ERROR: compose.deploy returned HTTP $code — deploy was not accepted."
            exit 1
            ;;
        esac
    # Step 2 — poll the public readiness endpoint.
    #
    # NOTE(review): this polls the public URL right after the trigger, so
    # a 200 may presumably come from the previous container that is still
    # serving — confirm whether a version/commit check is needed to prove
    # the NEW container is the one answering.
    - name: Wait for /health/ready
      run: |
        set -euo pipefail
        echo "==> polling https://paliad.de/health/ready"
        # 60 polls with a 5 s sleep between them (~5 minutes; each poll
        # can additionally block up to --max-time). paliad's cold-start
        # is normally ≤30 s; the longer budget covers slow image pulls +
        # migration apply.
        for i in $(seq 1 60); do
          # curl already prints "000" via -w when the request fails, so
          # only swallow the exit status here; appending another "000"
          # would log a bogus status of "000000".
          status=$(curl -sS --connect-timeout 3 --max-time 5 \
            -o /dev/null -w '%{http_code}' \
            https://paliad.de/health/ready || true)
          status=${status:-000}
          if [ "$status" = "200" ]; then
            echo "ready after ${i} poll(s)"
            exit 0
          fi
          echo " [$i/60] status=$status — sleeping 5s"
          sleep 5
        done
        echo "ERROR: /health/ready did not return 200 within 5 minutes."
        echo " The deploy fired but the new container is not serving."
        echo " Investigate: ssh mlake 'docker logs --tail 50 compose-transmit-multi-byte-driver-v7jth9-web-1'"
        exit 1

View File

@@ -21,18 +21,24 @@
# the test runner's working dirs. None of them touch internal/db/migrations/
# files.
.PHONY: help verify-migrations verify-mig test test-go
.PHONY: help verify-migrations verify-mig verify-mig-app test test-go test-frontend refresh-snapshot
help:
@echo "Paliad — developer targets"
@echo ""
@echo " verify-migrations Dry-run pending migrations + boot smoke (needs TEST_DATABASE_URL)"
@echo " verify-mig Alias for verify-migrations"
@echo " verify-mig-app End-to-end migration smoke as non-superuser role"
@echo " (needs TEST_APP_DATABASE_URL — t-paliad-282 / m/paliad#114)"
@echo " test Short test pass — covers gate tier"
@echo " test-go Full Go suite with race detector"
@echo " test-frontend Frontend bun:test suite"
@echo ""
@echo "Set TEST_DATABASE_URL to enable live-DB tests. Example:"
@echo " export TEST_DATABASE_URL=postgres://paliad:...@localhost:11833/paliad_test"
@echo ""
@echo "Set TEST_APP_DATABASE_URL to enable the role-split smoke. Example:"
@echo " export TEST_APP_DATABASE_URL=postgres://paliad_app:...@localhost:5432/paliad_scratch"
# Gate target — the test that would have caught mig 098 / mig 099 before
# deploy. Combines:
@@ -71,3 +77,67 @@ test:
# (full suite, not per-PR).
test-go:
go test -race ./...
# Frontend bun:test suite. Runs the 4 existing pure-TS tests today; will
# grow as mendel's Slice 3 (frontend test infill) lands.
test-frontend:
cd frontend && bun test
# Role-split end-to-end migration smoke — the catch for the mig 129 42501
# ownership class (m/paliad#114). Runs ApplyMigrations as a non-superuser
# role against TEST_APP_DATABASE_URL. Fails the build if any migration
# assumes more privilege than the deploy role has.
#
# Developer setup (local):
# psql -c "CREATE ROLE paliad_app LOGIN PASSWORD 'ci' NOSUPERUSER;"
# psql -c "CREATE DATABASE paliad_scratch OWNER paliad_app;"
# export TEST_APP_DATABASE_URL=postgres://paliad_app:ci@localhost:5432/paliad_scratch
verify-mig-app:
@if [ -z "$$TEST_APP_DATABASE_URL" ]; then \
echo "ERROR: TEST_APP_DATABASE_URL is not set."; \
echo " The role-split migration smoke cannot run without a non-superuser scratch DB."; \
echo " See Makefile comments above this target for setup."; \
exit 2; \
fi
go test -count=1 -run TestMigrations_EndToEndAsAppRole ./internal/db/
# Refresh the prod schema snapshot used by CI's migration smoke
# (t-paliad-282 / m/paliad#114). Connects to youpc-supabase prod, dumps
# the paliad schema + applied_migrations rows, strips rows beyond the
# current branch's max on-disk version, and writes
# internal/db/testdata/prod-snapshot.sql.
#
# When to refresh:
# - After merging a PR that added a new migration to main.
# - When CI's migration smoke starts spuriously failing because the
# snapshot's applied set diverges from on-disk by more than this
# branch's worth of new migs.
#
# Requires PALIAD_PROD_DATABASE_URL env var (a Postgres URL with
# pg_dump rights on youpc-supabase). Example:
# export PALIAD_PROD_DATABASE_URL='postgres://postgres:PW@100.99.98.201:11833/postgres'
refresh-snapshot:
@if [ -z "$$PALIAD_PROD_DATABASE_URL" ]; then \
echo "ERROR: PALIAD_PROD_DATABASE_URL is not set."; \
echo " Refresh requires read access to youpc-supabase prod."; \
exit 2; \
fi
@echo "==> dumping paliad schema (no owner, no privs)..."
@pg_dump --schema-only --schema=paliad --no-owner --no-privileges \
--no-publications --no-subscriptions \
"$$PALIAD_PROD_DATABASE_URL" > internal/db/testdata/prod-snapshot.sql.tmp
@echo "==> appending applied_migrations rows..."
@pg_dump --data-only --table=paliad.applied_migrations \
--no-owner --no-privileges \
"$$PALIAD_PROD_DATABASE_URL" >> internal/db/testdata/prod-snapshot.sql.tmp
@echo "==> stripping pg16 \\restrict / \\unrestrict commands for pg15 compat..."
@sed -i.bak '/^\\restrict /d; /^\\unrestrict /d' internal/db/testdata/prod-snapshot.sql.tmp
@rm -f internal/db/testdata/prod-snapshot.sql.tmp.bak
@echo "==> stripping applied_migrations rows beyond branch's max on-disk version..."
@MAX_VER=$$(ls internal/db/migrations/*.up.sql | xargs -I{} basename {} | sed 's/_.*//' | sort -n | tail -1); \
awk -v max=$$MAX_VER ' \
/^[0-9]+\t/ { split($$0, a, "\t"); if (a[1]+0 > max) next; } \
{ print } \
' internal/db/testdata/prod-snapshot.sql.tmp > internal/db/testdata/prod-snapshot.sql
@rm internal/db/testdata/prod-snapshot.sql.tmp
@wc -l internal/db/testdata/prod-snapshot.sql

View File

@@ -165,6 +165,7 @@ func main() {
sysAuditSvc := services.NewSystemAuditLogService(pool)
checklistTemplateSvc := services.NewChecklistTemplateService(pool, checklistCatalogSvc, sysAuditSvc, users)
svcBundle = &handlers.Services{
Pool: pool,
Project: projectSvc,
Team: teamSvc,
PartnerUnit: partnerUnitSvc,

View File

@@ -98,6 +98,51 @@ func TestBootSmoke(t *testing.T) {
if body := strings.TrimSpace(rec.Body.String()); body != "ok" {
t.Errorf("GET /healthz: body=%q; want \"ok\"", body)
}
// (4) Readiness probe. With a nil Services bundle the endpoint MUST
// report 503 — that's the contract documented in handlers/handlers.go.
// A separate svc-with-Pool case is exercised in TestHealthReady_Live.
rec = httptest.NewRecorder()
req = httptest.NewRequest(http.MethodGet, "/health/ready", nil)
mux.ServeHTTP(rec, req)
if rec.Code != http.StatusServiceUnavailable {
t.Errorf("GET /health/ready (nil svc): status=%d; want 503", rec.Code)
}
}
// TestHealthReady_Live asserts that GET /health/ready answers 200 with the
// body "ready" when the Services bundle carries a reachable pool. Requires
// TEST_DATABASE_URL; the test is skipped when it is unset.
//
// The complementary 503 case (nil Services bundle) is asserted by
// TestBootSmoke, which registers the handlers with svc=nil to keep its
// setup minimal. The pool-reachable path needs the pool wired in through
// svc.Pool, hence this separate test.
func TestHealthReady_Live(t *testing.T) {
	url := os.Getenv("TEST_DATABASE_URL")
	if url == "" {
		t.Skip("TEST_DATABASE_URL not set — skipping live readiness probe")
	}

	// Bring the scratch DB up to the on-disk migration set before probing.
	if err := db.ApplyMigrations(url); err != nil {
		t.Fatalf("db.ApplyMigrations: %v", err)
	}

	pool, err := db.OpenPool(url)
	if err != nil {
		t.Fatalf("open pool: %v", err)
	}
	// Release the pool's connections when the test ends so repeated runs
	// against the same scratch DB do not accumulate open connections.
	// NOTE(review): assumes the pool type exposes Close() — confirm
	// against db.OpenPool's return type.
	defer pool.Close()

	mux := http.NewServeMux()
	authClient := auth.NewClient("https://test.invalid", "anon-key", []byte("test-secret"))
	handlers.Register(mux, authClient, "", &handlers.Services{Pool: pool})

	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/health/ready", nil)
	mux.ServeHTTP(rec, req)

	if rec.Code != http.StatusOK {
		t.Errorf("GET /health/ready (live pool): status=%d, body=%q; want 200", rec.Code, rec.Body.String())
	}
	if body := strings.TrimSpace(rec.Body.String()); body != "ready" {
		t.Errorf("GET /health/ready (live pool): body=%q; want \"ready\"", body)
	}
}
// embeddedMigrationVersions returns every N where N_*.up.sql exists in

View File

@@ -0,0 +1,181 @@
# CI/CD runner setup — paliad
**Companion to:** `docs/design-cicd-pre-deploy-gate-2026-05-25.md` (Slice A, t-paliad-282 / m/paliad#114)
**Date:** 2026-05-25
**Audience:** mlake / mriver admin (m or head)
Slice A's `.gitea/workflows/test.yaml` requires (a) at least one online Gitea Actions runner and (b) a Dokploy API token wired as a repo secret. Both are one-time setup actions that paliad's source tree cannot perform itself — they live on infra-side. This doc lists them so the workflow can go green on its first run.
---
## 0. Pre-flight: what already exists
Verified live (2026-05-25 cronus inventor shift):
- Gitea 1.24.4 on `mgit.msbls.de`, `has_actions: true` on `m/paliad`.
- `/api/v1/admin/actions/runners` reports **2 runners** registered. They are likely the shared runners used by `m/mGreen` and `m/mGeo` (both have `.gitea/workflows/deploy.yml` with `runs-on: self-hosted`).
- `m/paliad/actions/tasks` reports `total_count=0` — paliad has never run a workflow yet.
The existing runners may already be capable of running paliad's workflow without further setup. The verification step (§3) below tells you whether they are.
---
## 1. Runner placement decision (m's Q11.1)
m's pick: **mriver**.
Rationale: mriver hosts the mai worker fleet but workers spend most of their time waiting on Anthropic. mlake's Dokploy + Swarm workload is more contended. A new runner on mriver adds the least pressure to either box.
If mriver is offline or saturated when CI first fires, fall back to the existing mlake-side runners (they're already registered; no provisioning needed).
---
## 2. One-time setup (admin steps)
### 2.1 Register a new Gitea Actions runner on mriver
```bash
# On mriver, as m:
# 1. Download the act_runner binary (matching Gitea 1.24.x)
curl -L -o /usr/local/bin/act_runner \
https://gitea.com/gitea/act_runner/releases/download/v0.2.13/act_runner-0.2.13-linux-amd64
chmod +x /usr/local/bin/act_runner
# 2. Get a runner registration token. In the Gitea UI:
# /admin → Actions → Runners → "Create new Runner"
# (or repo-scope: /m/paliad/settings/actions/runners)
# Copy the token.
# 3. Register
mkdir -p ~/act_runner && cd ~/act_runner
act_runner register --no-interactive \
--instance https://mgit.msbls.de \
--token <REGISTRATION_TOKEN> \
--name mriver-paliad-1 \
--labels ubuntu-latest:docker://node:20-bookworm
# 4. Run as a systemd unit (preferred) or as a session daemon
# Systemd unit example: /etc/systemd/system/act_runner.service
# [Unit]
# Description=Gitea Actions runner
# After=network.target
# [Service]
# User=m
# WorkingDirectory=/home/m/act_runner
# ExecStart=/usr/local/bin/act_runner daemon
# Restart=on-failure
# [Install]
# WantedBy=multi-user.target
sudo systemctl enable --now act_runner
sudo systemctl status act_runner
```
**Why `ubuntu-latest:docker://node:20-bookworm` for the label?** Gitea Actions' `runs-on: ubuntu-latest` resolves via the runner's label map. Mapping it to a Docker image gives the workflow a sandbox with Docker available — required for our Postgres service container in `test.yaml`. mriver should have Docker (for `paliadin-shim`); if not, install it.
### 2.2 Register the Dokploy API token as a repo secret
The workflow's `deploy` job needs `secrets.DOKPLOY_TOKEN`. Use the existing project-wide Dokploy API key (the one stored in `~/.claude/skills/mai-dokploy/SKILL.md`).
In the Gitea UI:
- Navigate to `https://mgit.msbls.de/m/paliad/settings/actions/secrets`
- Click "Add secret"
- Name: `DOKPLOY_TOKEN`
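- Value: `<DOKPLOY_API_KEY>` — paste the key from your local secret store. Never commit the key itself to the repository; the key that was previously written out in this document must be treated as leaked and rotated in Dokploy.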
Or via API (mAi identity):
```bash
curl --netrc-file ~/.netrc-mai -sS -X POST \
-H "Content-Type: application/json" \
https://mgit.msbls.de/api/v1/repos/m/paliad/actions/secrets/DOKPLOY_TOKEN \
-d '{"data":"<DOKPLOY_API_KEY>"}'
```
(Requires repo-owner permission. If mAi lacks it, m runs it.)
---
## 3. Verify the runner sees the workflow
After (2.1) + (2.2):
```bash
# Push the Slice A branch (the one this doc lives on)
git push origin mai/cronus/coder-cicd-slice-a
# Confirm the runner picked up the job
curl --netrc-file ~/.netrc-mai -sS \
"https://mgit.msbls.de/api/v1/repos/m/paliad/actions/tasks?limit=5" | jq '.'
```
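A new task per job should appear (build, test-go). If `total_count` stays 0, the runner labels don't match the workflow's `runs-on`. Re-register with `--labels ubuntu-latest` (no `docker://` suffix) so the jobs run in shell mode; the existing runners on mlake can then pick them up. Caveat: as noted in §2.1, `test-go` relies on a Postgres service container, so that job still needs a runner with Docker available — shell mode alone is only expected to get `build` green.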
---
## 4. Soft-launch (m's Q11.4)
m's pick: **keep both Dokploy auto-deploy and the workflow's deploy step alive for ~1 week. After ≥5 successful green deploys via the workflow, disable Dokploy's autoDeploy in the Dokploy UI for the paliad compose.**
While both are live, every push to main fires:
1. Dokploy webhook (existing path) → deploys immediately, no gate.
2. Gitea workflow → on green, ALSO calls `compose.deploy`.
The second call is idempotent — if Dokploy already deployed the same commit, this is a no-op. The workflow's value during soft-launch is the **gate signal**: a red workflow on a green main = the bad migration shipped via the unguarded webhook and broke prod, and the workflow is shouting about it.
After confidence builds:
1. In the Dokploy UI, navigate to the paliad compose → Settings.
2. Toggle "Auto Deploy" off.
3. Save.
From this point, the only path to deploy is the workflow's deploy job. Red workflow = no deploy.
---
## 5. What Slice A catches today — and what it doesn't
After this branch (`mai/cronus/coder-cicd-slice-a`) merges to main:
### Catches (active in CI)
- **Build breakage** — `go build`, `go vet`, `bun run build`. Red gate, no deploy.
- **Slot collisions** — `TestMigrations_NoDuplicateSlot` runs without a DB. A PR adding migration N when version N already exists fails at gate time. This is the brunel-class catch (m/paliad#114 ~13:20 outage).
- **New-migration shape errors (hermes class)** — `TestBootSmoke` runs `ApplyMigrations` against the snapshot-restored DB. New migs from this PR get applied for real; any column/relation/syntax error fails the gate before merge.
- **New-migration ownership errors (mig 129 42501 class)** — `TestMigrations_EndToEndAsAppRole` runs `ApplyMigrations` connected as `postgres` (NON-superuser on `supabase/postgres:15.8.1.060`, same role topology as youpc-supabase prod). Any migration that assumes supabase_admin privilege fails with the same `42501 must be owner` error class that took paliad.de offline on 2026-05-25.
- **Readiness probe regressions** — `TestHealthReady_Live` confirms `/health/ready` returns 200 (body `ready`) against a live pool; `TestBootSmoke` asserts it returns 503 when the Services bundle is nil.
- **Pure-Go test regressions** — `go test ./internal/... ./cmd/...` runs without `TEST_DATABASE_URL` (live-DB service tests skip the same way they do on a developer laptop without a scratch DB).
### Mechanism — the snapshot approach
CI's scratch DB starts from a `pg_dump` of youpc-supabase paliad schema +
`paliad.applied_migrations` rows, committed to `internal/db/testdata/prod-snapshot.sql`. After restore, the scratch DB is at "paliad HEAD of snapshot" and `ApplyMigrations` sees only this PR's new migrations as pending.
This sidesteps the fresh-DB idempotence problem: several historical migrations (notably mig 037's missing `CREATE EXTENSION pg_trgm`, mig 051's inner `COMMIT;`) can't be replayed from scratch against `supabase/postgres:15.8.1.060`. The snapshot pins everything that's already applied in prod and lets CI focus on what's new — which is what we actually care about for outage prevention.
Snapshot refresh: `make refresh-snapshot` with `PALIAD_PROD_DATABASE_URL` set (see `internal/db/testdata/README.md`).
### Known gap — live-DB service tests don't run in CI
`internal/services/*_test.go` tests with `TEST_DATABASE_URL` set fail against `supabase/postgres:15.8.1.060` with `42P08 inconsistent types deduced for parameter` errors on some INSERT bind paths. The same tests pass against youpc-supabase prod. Cause is unconfirmed — likely subtle differences in type inference between the dockerized image and the prod cluster's configuration. CI today runs `go test ./...` without `TEST_DATABASE_URL` so these tests skip. Not blocking outage prevention; tracked as a follow-up for the post-Slice-A coder.
### Migration cleanup also bundled in this PR
Two surgical migration improvements that surfaced during snapshot debugging — kept here because they're small and harmless:
- **mig 024 + 027** — `ALTER INDEX` / `ALTER POLICY` exception handlers now catch `undefined_object` OR `undefined_table` OR `duplicate_object`. Old handler caught only `undefined_object`; Postgres raises `undefined_table` when the source object never existed and `duplicate_object` when the destination already exists. The expanded handler makes the migrations truly idempotent across the three plausible states: source-still-German (rename succeeds), already-renamed (catches duplicate_object), and fresh-DB-never-had-German (catches undefined_table).
Other migration history bugs (mig 037 missing pg_trgm, mig 051 inner COMMIT) are tracked as a separate cleanup task — not blocking, because the snapshot bypasses them.
### Verification checklist (after Slice A merges)
1. **Workflow green on its first PR run?** Check `/m/paliad/actions`. If not, fix before merging.
2. **Dokploy `compose.deploy` call succeeds?** The workflow's `deploy` job logs the POST response. A successful response is a Dokploy job ID; a 4xx is an auth or compose-id problem.
3. **`/health/ready` returns 200 within 5 minutes after a green deploy?** The workflow polls this. If it times out, the migration may have failed silently inside the new container — check `docker logs --tail 50 compose-transmit-multi-byte-driver-v7jth9-web-1` on mlake.
4. **Reproduce the slot-collision catch locally:** rename `131_…up.sql` to `129_…` (duplicate slot) → workflow MUST fail at `Migration coordination check`. Revert before pushing.
5. **Reproduce the role-split catch locally:** add a no-op migration `132_test_supersedes.up.sql` containing `REINDEX SYSTEM paliad_scratch;` (requires superuser). Workflow MUST fail at `Migration end-to-end (deploy role)`. Revert before pushing.
---
## 6. Future polish (Slice D, m's Q4 R-pick)
`mai-test` post-merge shift: once Slice A is stable, wire a Gitea webhook on push-to-main that fires `/mai-test` as a follow-up shift. It runs the broader smoke + integration suite and posts results as a Gitea commit status. Not blocking; the gate doesn't depend on it.
Implementation belongs in `m/mAi` (the mai webhook handler), not in paliad. Out of scope for Slice A.

View File

@@ -116,6 +116,57 @@ func TestMigrations_DryRun(t *testing.T) {
}
}
// TestMigrations_NoDuplicateSlot fails when scanEmbeddedMigrations returns
// an error for the embedded migration tree. It targets the
// brunel-slot-collision outage class (m/paliad#114, 2026-05-25 ~13:20):
// two branches each add a *.up.sql file at the same NNN slot, both merge,
// and the runner then refuses to start.
//
// The test reads no environment variables and opens no DB connection
// itself, so it also runs on developer laptops without TEST_DATABASE_URL
// and lets the colliding PR fail in CI instead of at the next deploy.
//
// NOTE(review): the failure message attributes every scan error to a
// duplicate slot — presumably that is the only way scanEmbeddedMigrations
// fails; confirm against its implementation.
func TestMigrations_NoDuplicateSlot(t *testing.T) {
	_, err := scanEmbeddedMigrations()
	if err == nil {
		return
	}
	t.Fatalf("scanEmbeddedMigrations: %v (two migrations share the same NNN slot — "+
		"coordinate with head and rename one of them before merging)", err)
}
// TestMigrations_EndToEndAsAppRole runs ApplyMigrations against the
// database named by TEST_APP_DATABASE_URL and fails if it returns an
// error. The URL is expected to log in as a NON-SUPERUSER role, so the
// run has the deploy role's privileges rather than a superuser's.
//
// This is the prod-shape smoke for the mig-129-class outage
// (m/paliad#114, 2026-05-25 ~14:56 — pq: must be owner of table
// project_event_choices, SQLSTATE 42501), where a migration assumes
// ownership the deploy role doesn't have. The per-migration
// BEGIN/ROLLBACK dry-run (TestMigrations_DryRun) is described as unable
// to reproduce that class.
//
// Setup: see the Makefile target `verify-mig-app` for the local role +
// scratch-DB preamble, and docs/design-cicd-pre-deploy-gate-2026-05-25.md
// §6.2(a) for the two-role model (m's Q11.2 pick).
//
// NOTE(review): the CI workflow restores the snapshot as the same role
// this test connects as, so that role presumably owns the restored
// objects — confirm the "must be owner" class is still reproducible there.
//
// Skipped when TEST_APP_DATABASE_URL is unset, which keeps `go test ./...`
// green on machines without the role split.
func TestMigrations_EndToEndAsAppRole(t *testing.T) {
	dsn := os.Getenv("TEST_APP_DATABASE_URL")
	if dsn == "" {
		t.Skip("TEST_APP_DATABASE_URL not set — skipping role-split end-to-end migration smoke")
	}

	// Remediation hint appended to the failure message.
	const hint = "(a migration assumes more privilege than the deploy role has — " +
		"common cases: ALTER TABLE on a schema-owner table, CREATE EXTENSION " +
		"without grants, SET ROLE without permission. Fix the migration to " +
		"work as the deploy role, or arrange for the schema to be owned by " +
		"the deploy role)"

	err := ApplyMigrations(dsn)
	if err == nil {
		return
	}
	t.Fatalf("ApplyMigrations as app role failed: %v "+hint, err)
}
// readAppliedVersions returns the set of versions present in
// paliad.applied_migrations on the scratch DB. Missing table → empty set
// (fresh-DB path; the table only exists after the runner has been called).

View File

@@ -26,24 +26,24 @@ DO $$ BEGIN ALTER TABLE paliad.department_members RENAME COLUMN dezernat_id TO d
-- Constraints (primary key + foreign keys + check). Renaming a pkey
-- constraint also renames the underlying index of the same name.
-- ---------------------------------------------------------------------------
DO $$ BEGIN ALTER TABLE paliad.departments RENAME CONSTRAINT dezernate_pkey TO departments_pkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.departments RENAME CONSTRAINT dezernate_lead_user_id_fkey TO departments_lead_user_id_fkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.departments RENAME CONSTRAINT dezernate_office_check TO departments_office_check; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.department_members RENAME CONSTRAINT dezernat_mitglieder_pkey TO department_members_pkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.department_members RENAME CONSTRAINT dezernat_mitglieder_dezernat_id_fkey TO department_members_department_id_fkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.department_members RENAME CONSTRAINT dezernat_mitglieder_user_id_fkey TO department_members_user_id_fkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.departments RENAME CONSTRAINT dezernate_pkey TO departments_pkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.departments RENAME CONSTRAINT dezernate_lead_user_id_fkey TO departments_lead_user_id_fkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.departments RENAME CONSTRAINT dezernate_office_check TO departments_office_check; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.department_members RENAME CONSTRAINT dezernat_mitglieder_pkey TO department_members_pkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.department_members RENAME CONSTRAINT dezernat_mitglieder_dezernat_id_fkey TO department_members_department_id_fkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.department_members RENAME CONSTRAINT dezernat_mitglieder_user_id_fkey TO department_members_user_id_fkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
-- ---------------------------------------------------------------------------
-- Standalone indexes (non-pkey).
-- ---------------------------------------------------------------------------
DO $$ BEGIN ALTER INDEX paliad.dezernate_office_idx RENAME TO departments_office_idx; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER INDEX paliad.dezernate_lead_idx RENAME TO departments_lead_idx; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER INDEX paliad.dezernat_mitglieder_user_idx RENAME TO department_members_user_idx; EXCEPTION WHEN undefined_object THEN NULL; END $$;
-- Each rename runs in its own DO block whose handler turns
-- undefined_object, undefined_table and duplicate_object into a no-op,
-- so those conditions do not abort the migration.
-- NOTE(review): PostgreSQL reports a clashing relation name on
-- ALTER INDEX ... RENAME as duplicate_table (42P07), which is not in the
-- list below — confirm whether the "target name already exists" case
-- needs handling here.
DO $$ BEGIN ALTER INDEX paliad.dezernate_office_idx RENAME TO departments_office_idx; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER INDEX paliad.dezernate_lead_idx RENAME TO departments_lead_idx; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER INDEX paliad.dezernat_mitglieder_user_idx RENAME TO department_members_user_idx; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
-- ---------------------------------------------------------------------------
-- RLS policies
-- ---------------------------------------------------------------------------
DO $$ BEGIN ALTER POLICY dezernate_select ON paliad.departments RENAME TO departments_select; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY dezernate_write ON paliad.departments RENAME TO departments_write; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY dezernat_mitglieder_select ON paliad.department_members RENAME TO department_members_select; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY dezernat_mitglieder_write ON paliad.department_members RENAME TO department_members_write; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY dezernate_select ON paliad.departments RENAME TO departments_select; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY dezernate_write ON paliad.departments RENAME TO departments_write; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY dezernat_mitglieder_select ON paliad.department_members RENAME TO department_members_select; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY dezernat_mitglieder_write ON paliad.department_members RENAME TO department_members_write; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;

View File

@@ -63,27 +63,27 @@ ALTER TABLE paliad.partner_unit_members RENAME COLUMN department_id TO partner_u
-- 5. Rename constraints. Postgres auto-renames the underlying index for
-- pkey/uniq constraints; standalone indexes are renamed in step 6.
-- ---------------------------------------------------------------------------
DO $$ BEGIN ALTER TABLE paliad.partner_units RENAME CONSTRAINT departments_pkey TO partner_units_pkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_units RENAME CONSTRAINT departments_lead_user_id_fkey TO partner_units_lead_user_id_fkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_units RENAME CONSTRAINT departments_office_check TO partner_units_office_check; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_unit_members RENAME CONSTRAINT department_members_pkey TO partner_unit_members_pkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_unit_members RENAME CONSTRAINT department_members_department_id_fkey TO partner_unit_members_partner_unit_id_fkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_unit_members RENAME CONSTRAINT department_members_user_id_fkey TO partner_unit_members_user_id_fkey; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_units RENAME CONSTRAINT departments_pkey TO partner_units_pkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_units RENAME CONSTRAINT departments_lead_user_id_fkey TO partner_units_lead_user_id_fkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_units RENAME CONSTRAINT departments_office_check TO partner_units_office_check; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_unit_members RENAME CONSTRAINT department_members_pkey TO partner_unit_members_pkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_unit_members RENAME CONSTRAINT department_members_department_id_fkey TO partner_unit_members_partner_unit_id_fkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER TABLE paliad.partner_unit_members RENAME CONSTRAINT department_members_user_id_fkey TO partner_unit_members_user_id_fkey; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
-- ---------------------------------------------------------------------------
-- 6. Rename non-pkey indexes.
-- ---------------------------------------------------------------------------
DO $$ BEGIN ALTER INDEX paliad.departments_office_idx RENAME TO partner_units_office_idx; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER INDEX paliad.departments_lead_idx RENAME TO partner_units_lead_idx; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER INDEX paliad.department_members_user_idx RENAME TO partner_unit_members_user_idx; EXCEPTION WHEN undefined_object THEN NULL; END $$;
-- Each rename runs in its own DO block whose handler turns
-- undefined_object, undefined_table and duplicate_object into a no-op,
-- so those conditions do not abort the migration.
-- NOTE(review): PostgreSQL reports a clashing relation name on
-- ALTER INDEX ... RENAME as duplicate_table (42P07), which is not in the
-- list below — confirm whether the "target name already exists" case
-- needs handling here.
DO $$ BEGIN ALTER INDEX paliad.departments_office_idx RENAME TO partner_units_office_idx; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER INDEX paliad.departments_lead_idx RENAME TO partner_units_lead_idx; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER INDEX paliad.department_members_user_idx RENAME TO partner_unit_members_user_idx; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
-- ---------------------------------------------------------------------------
-- 7. Rename RLS policies.
-- ---------------------------------------------------------------------------
DO $$ BEGIN ALTER POLICY departments_select ON paliad.partner_units RENAME TO partner_units_select; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY departments_write ON paliad.partner_units RENAME TO partner_units_write; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY department_members_select ON paliad.partner_unit_members RENAME TO partner_unit_members_select; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY department_members_write ON paliad.partner_unit_members RENAME TO partner_unit_members_write; EXCEPTION WHEN undefined_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY departments_select ON paliad.partner_units RENAME TO partner_units_select; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY departments_write ON paliad.partner_units RENAME TO partner_units_write; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY department_members_select ON paliad.partner_unit_members RENAME TO partner_unit_members_select; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
DO $$ BEGIN ALTER POLICY department_members_write ON paliad.partner_unit_members RENAME TO partner_unit_members_write; EXCEPTION WHEN undefined_object OR undefined_table OR duplicate_object THEN NULL; END $$;
-- ---------------------------------------------------------------------------
-- 8. Audit table for partner-unit events. Mutations on partner_units +

69
internal/db/testdata/README.md vendored Normal file
View File

@@ -0,0 +1,69 @@
# `internal/db/testdata/` — CI snapshot
## `prod-snapshot.sql`
Schema-only `pg_dump` of paliad's prod DB (youpc-supabase paliad schema)
plus the rows of `paliad.applied_migrations` that match this branch's
on-disk migration set.
**Purpose.** Lets CI's migration smoke (`.gitea/workflows/test.yaml`)
restore a Postgres scratch DB to "paliad at HEAD-of-snapshot" without
having to replay 131 migrations from scratch. ApplyMigrations on the
restored DB sees the applied set and only runs whatever NEW migrations
this PR adds — exactly the integration shape we want to test, and the
same shape prod sees on every deploy.
**Why a snapshot at all.** Running ApplyMigrations from scratch against a
fresh `supabase/postgres:15.8.1.060` surfaces multiple fresh-DB
idempotence bugs in historical migrations (raw `COMMIT;` in mig 051,
missing `CREATE EXTENSION pg_trgm` for mig 037, ALTER POLICY
exception-handler gaps in mig 024/027 — the last is fixed in this PR).
Fixing them all is a separate cleanup. The snapshot sidesteps them by
starting CI from a state where every historical migration is already
applied as it was in prod.
**Schema scope.** `--schema=paliad` only. Auth schema comes baked into
`supabase/postgres`; CI's setup step installs `pg_trgm` before restoring.
**Ownership.** `--no-owner --no-privileges` keeps the dump portable
across role topologies (CI's supabase_admin / postgres / authenticated /
anon don't have to match prod's exact role layout). The role-split smoke
relies on `postgres` being a non-superuser, which is true on
supabase/postgres by default.
**Refresh.** Run `make refresh-snapshot` with `PALIAD_PROD_DATABASE_URL`
set to a Postgres URL with `pg_dump` rights on youpc-supabase. The
target appends data rows for `paliad.applied_migrations`, strips
`\restrict` / `\unrestrict` commands (pg 16 dump → pg 15 restore), and
filters out applied-migrations rows for versions beyond the branch's
local max. The CI workflow consumes the resulting file verbatim.
**Verify a refresh.** Boot a local scratch:
```bash
docker run -d --rm --name paliad-snap \
-e POSTGRES_PASSWORD=ci -e POSTGRES_DB=paliad_scratch \
-p 15433:5432 supabase/postgres:15.8.1.060
sleep 5
docker exec -e PGPASSWORD=ci paliad-snap psql -h localhost -U supabase_admin -d paliad_scratch \
-c "GRANT CREATE ON DATABASE paliad_scratch TO postgres;" \
-c "CREATE EXTENSION IF NOT EXISTS pg_trgm;"
cat internal/db/testdata/prod-snapshot.sql | docker exec -i -e PGPASSWORD=ci paliad-snap \
psql -h localhost -U postgres -d paliad_scratch -v ON_ERROR_STOP=1
TEST_DATABASE_URL="postgres://postgres:ci@localhost:15433/paliad_scratch?sslmode=disable" \
TEST_APP_DATABASE_URL="postgres://postgres:ci@localhost:15433/paliad_scratch?sslmode=disable" \
go test -count=1 -run 'TestMigrations|TestBootSmoke|TestHealthReady_Live' ./internal/db/ ./cmd/server/
docker stop paliad-snap
```
Every test selected by the `-run` pattern above must pass. If any fails
after a refresh, investigate before merging — the usual cause is a new
migration that was applied to prod but is not on disk in this branch yet.
**Why is the snapshot not gzipped?** It is small enough (~200 KB) that the
diff stays human-readable in `git diff` reviews. If it grows past ~1 MB,
gzip it and decompress it on restore in CI.
**Privacy.** Schema-only dump, no row data from any paliad table (except
`paliad.applied_migrations`, which contains migration filenames +
checksums — public info already in the repo).

6278
internal/db/testdata/prod-snapshot.sql vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,9 +1,13 @@
package handlers
import (
"context"
"encoding/json"
"net/http"
"strings"
"time"
"github.com/jmoiron/sqlx"
"mgit.msbls.de/m/paliad/internal/auth"
"mgit.msbls.de/m/paliad/internal/services"
@@ -50,6 +54,12 @@ func noCachePages(h http.Handler) http.Handler {
// Services bundles the Phase B + C database-backed services. Pass nil if
// DATABASE_URL was unset; the matter-management endpoints will return 503.
type Services struct {
// Pool is the raw connection pool. Held so the readiness probe
// (/health/ready) can ping it without going through any individual
// service. nil when DATABASE_URL was unset — in that case
// /health/ready returns 503.
Pool *sqlx.DB
Project *services.ProjectService
Team *services.TeamService
PartnerUnit *services.PartnerUnitService
@@ -188,6 +198,38 @@ func Register(mux *http.ServeMux, client *auth.Client, giteaAPIToken string, svc
_, _ = w.Write([]byte("ok\n"))
})
// Readiness probe. Public, no auth. Distinct from /healthz: this
// returns 200 only when the DB pool is reachable. Reaching Register
// at all implies db.ApplyMigrations succeeded (cmd/server/main.go
// calls it before constructing svc), so a 200 here means "migrations
// applied AND pool responsive" — the contract Dokploy / Traefik should
// gate on, not the bind-and-serve check that /healthz answers.
//
// Three outcomes:
// - svc == nil OR svc.Pool == nil → 503 (DB-less knowledge-platform
// deployments report not-ready so an external orchestrator can
// distinguish them from a full prod boot).
// - PingContext fails within 2 s → 503 (pool unreachable).
// - PingContext succeeds → 200 "ready".
//
// Used by docker-compose.yml's healthcheck (Slice B) and by the
// post-deploy verification step in .gitea/workflows/test.yaml.
mux.HandleFunc("GET /health/ready", func(w http.ResponseWriter, r *http.Request) {
	// A probe answer describes this instant only; forbid caching it.
	w.Header().Set("Cache-Control", "no-store")
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")

	// No pool at all: report not-ready instead of pretending to be healthy.
	if svc == nil || svc.Pool == nil {
		// http.Error appends its own trailing newline, so the message
		// carries none (a "\n" here would end the body with a blank line).
		http.Error(w, "db not configured", http.StatusServiceUnavailable)
		return
	}

	// Bound the ping at 2 s, and inherit the request context so a client
	// that gives up early cancels the ping as well.
	ctx, cancel := context.WithTimeout(r.Context(), 2*time.Second)
	defer cancel()
	if err := svc.Pool.PingContext(ctx); err != nil {
		http.Error(w, "db unreachable", http.StatusServiceUnavailable)
		return
	}

	// Success path writes the body directly; the implicit status is 200.
	// The write error is ignored on purpose: the client has gone away and
	// there is nothing useful left to do for a probe response.
	_, _ = w.Write([]byte("ready\n"))
})
// API endpoints (JSON, public)
mux.HandleFunc("POST /api/login", handleAPILogin)
mux.HandleFunc("POST /api/register", handleAPIRegister)