// paliad/internal/services/project_code.go

package services

import (
	"context"
	"fmt"
	"regexp"
	"strings"
	"unicode"

	"github.com/google/uuid"
	"github.com/jmoiron/sqlx"
	"github.com/lib/pq"
	"golang.org/x/text/runes"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"

	"mgit.msbls.de/m/paliad/internal/models"
)

// Project codes — t-paliad-222 / m/paliad#50.
//
// BuildProjectCode assembles a dotted code from the ancestor chain of
// a project. Each ancestor contributes one segment derived from its
// type-specific metadata. Missing segments (NULL ancestor field,
// unfilled opponent_code, etc.) are skipped silently — there is no
// placeholder.
//
//   client     → reference if set, else slug(title), capped at 8 chars
//   litigation → opponent_code (the slug the user typed at litigation
//                creation), empty → skipped
//   patent     → last 3 digits of patent_number (full digit-stream when
//                shorter), empty → skipped
//   case       → uppercase tail of proceeding_types.code (jurisdiction
//                segment dropped), empty → skipped
//   project    → "" (generic projects don't contribute a segment)
//
// Custom override: if the target row's `reference` column is non-empty,
// it wins outright — the helper returns the literal `reference` string
// without walking the ancestor chain.
//
// Example: Client EXMPL → Litigation OPNT → Patent EP3456789 → Case
// `upc.inf.cfi` → "EXMPL.OPNT.789.INF.CFI".
//
// Collision handling: codes are display-only (no uniqueness
// constraint). Two cases that derive to the same code both return the
// same string. v1 contract — users disambiguate via `reference` when it
// matters.

// projectChainRow is one row of the ancestor walk. Includes only the
// columns BuildProjectCode needs; trimmed for cheap projection.
//
// The pointer-typed fields are nil-checked before use by
// projectCodeSegment / assembleProjectCode; a nil value means the row
// contributes nothing for that field (presumably the column was NULL).
type projectChainRow struct {
	// ID is the project's primary key (projects.id).
	ID               uuid.UUID `db:"id"`
	// Type selects the segment rule in projectCodeSegment: "client",
	// "litigation", "patent" or "case"; any other value yields no segment.
	Type             string    `db:"type"`
	// Title is the fallback source for a client segment when Reference
	// produces nothing.
	Title            string    `db:"title"`
	// Reference is the verbatim override when set on the target row, and
	// the preferred source for a client segment.
	Reference        *string   `db:"reference"`
	// OpponentCode is the litigation segment (whitespace-trimmed).
	OpponentCode     *string   `db:"opponent_code"`
	// PatentNumber feeds patentLast3 for patent rows.
	PatentNumber     *string   `db:"patent_number"`
	// ProceedingTypeID is selected alongside the other columns but is not
	// read by the assembly helpers in this file.
	ProceedingTypeID *int      `db:"proceeding_type_id"`
	// ProceedingCode is proceeding_types.code from the LEFT JOIN; feeds
	// proceedingTail for case rows.
	ProceedingCode   *string   `db:"proceeding_code"`
}

// BuildProjectCode loads the ancestor chain of projectID in a single
// query and returns the dotted project code assembled from it.
//
// The chain is derived from paliad.projects.path, which is stored as
// TEXT (dot-separated UUIDs) rather than as an ltree value. The query
// therefore decomposes it with string_to_array(path, '.')::uuid[] — the
// same pattern the canonical visibility predicate (can_see_project)
// uses. ltree operators such as @> or nlevel would fail at runtime with
// "operator does not exist: text @> text".
//
// Rows come back ordered by their position inside the path, so the
// target row is last. An unknown projectID (no rows) yields "" and a
// nil error; a query failure is wrapped and returned.
//
// One round-trip per call makes this suitable for single-project
// projections. List endpoints should prefer PopulateProjectCodes; if
// profiling ever flags code assembly as a hotspot, a materialised view
// is the planned escape hatch (design doc §3.2 Q8).
func BuildProjectCode(ctx context.Context, db sqlx.QueryerContext, projectID uuid.UUID) (string, error) {
	const chainSQL = `
		SELECT p.id, p.type, p.title, p.reference, p.opponent_code,
		       p.patent_number, p.proceeding_type_id,
		       pt.code AS proceeding_code
		  FROM paliad.projects target
		  JOIN paliad.projects p
		    ON p.id = ANY(string_to_array(target.path, '.')::uuid[])
		  LEFT JOIN paliad.proceeding_types pt ON pt.id = p.proceeding_type_id
		 WHERE target.id = $1
		 ORDER BY array_position(string_to_array(target.path, '.')::uuid[], p.id)
	`

	var chain []projectChainRow
	err := sqlx.SelectContext(ctx, db, &chain, chainSQL, projectID)

	switch {
	case err != nil:
		return "", fmt.Errorf("build project code: load chain: %w", err)
	case len(chain) == 0:
		// Unknown project: no code, but not an error either.
		return "", nil
	default:
		return assembleProjectCode(chain), nil
	}
}

// PopulateProjectCodes fills in .Code for every element of targets
// using one bulk query, so list-style projections (List, ListChildren,
// ListAncestors) avoid an N+1 series of BuildProjectCode calls.
//
// An empty slice is a no-op. A target for which the query returns no
// chain rows (e.g. an orphaned row) receives an empty code rather than
// an error. A query failure is wrapped and returned; in that case no
// element of targets has been modified.
func PopulateProjectCodes(ctx context.Context, db sqlx.QueryerContext, targets []models.Project) error {
	if len(targets) == 0 {
		return nil
	}

	idArgs := make(pq.StringArray, 0, len(targets))
	for i := range targets {
		idArgs = append(idArgs, targets[i].ID.String())
	}

	// Single statement: every target is joined to each member of its
	// ancestor chain (plus the proceeding type), sorted by target and
	// then by position in the chain so grouping in Go preserves order.
	//
	// paliad.projects.path is TEXT, hence the
	// string_to_array(path, '.')::uuid[] decomposition — the same shape
	// can_see_project uses. ltree operators (@>, nlevel) would fail with
	// "operator does not exist: text @> text".
	const bulkSQL = `
		WITH targets AS (
		    SELECT id, path
		      FROM paliad.projects
		     WHERE id = ANY($1::uuid[])
		)
		SELECT t.id AS target_id,
		       p.id, p.type, p.title, p.reference, p.opponent_code,
		       p.patent_number, p.proceeding_type_id,
		       pt.code AS proceeding_code,
		       array_position(string_to_array(t.path, '.')::uuid[], p.id) AS chain_level
		  FROM targets t
		  JOIN paliad.projects p
		    ON p.id = ANY(string_to_array(t.path, '.')::uuid[])
		  LEFT JOIN paliad.proceeding_types pt ON pt.id = p.proceeding_type_id
		 ORDER BY t.id, chain_level
	`

	// chainHit is one (target, ancestor) pair as returned by bulkSQL.
	type chainHit struct {
		TargetID uuid.UUID `db:"target_id"`
		projectChainRow
		ChainLevel int `db:"chain_level"`
	}

	var hits []chainHit
	if err := sqlx.SelectContext(ctx, db, &hits, bulkSQL, idArgs); err != nil {
		return fmt.Errorf("populate project codes: bulk fetch: %w", err)
	}

	// Group the flat result by target; append keeps the SQL ordering, so
	// each chain ends with the target row itself.
	byTarget := make(map[uuid.UUID][]projectChainRow, len(targets))
	for i := range hits {
		owner := hits[i].TargetID
		byTarget[owner] = append(byTarget[owner], hits[i].projectChainRow)
	}

	for i := range targets {
		project := &targets[i]
		project.Code = assembleProjectCode(byTarget[project.ID])
	}
	return nil
}

// assembleProjectCode turns an ordered ancestor chain (root first,
// target last) into the dotted project code. It is pure — no DB access
// — so it can be table-tested without fixtures.
//
// An empty chain yields "". If the target row (the last element) has a
// `reference` that is non-empty after trimming whitespace, that trimmed
// value is returned as-is and no segments are computed. Otherwise each
// row's segment is derived via projectCodeSegment, empty segments are
// dropped, and the rest are joined with ".".
func assembleProjectCode(chain []projectChainRow) string {
	n := len(chain)
	if n == 0 {
		return ""
	}

	// Custom override on the target row wins outright.
	if ref := chain[n-1].Reference; ref != nil {
		if override := strings.TrimSpace(*ref); override != "" {
			return override
		}
	}

	var code strings.Builder
	for i := range chain {
		segment := projectCodeSegment(chain[i])
		if segment == "" {
			continue
		}
		// Only non-empty segments are written, so a non-zero length
		// means at least one segment precedes this one.
		if code.Len() > 0 {
			code.WriteByte('.')
		}
		code.WriteString(segment)
	}
	return code.String()
}

// projectCodeSegment derives the single code segment contributed by
// one chain row. An empty result means the row contributes nothing and
// the assembler skips it. Pure; never touches the DB.
//
// Per type:
//
//   client     → sanitizeClientShort(reference), falling back to
//                sanitizeClientShort(title) when that is empty
//   litigation → opponent_code, whitespace-trimmed
//   patent     → patentLast3(patent_number)
//   case       → proceedingTail(proceeding code)
//   other      → "" (generic 'project' rows and any future types)
//
// A nil source field always yields "".
func projectCodeSegment(p projectChainRow) string {
	// Clients are the only type with a two-step fallback, so they are
	// handled separately from the single-field types below.
	if p.Type == "client" {
		if ref := p.Reference; ref != nil {
			if short := sanitizeClientShort(*ref); short != "" {
				return short
			}
		}
		return sanitizeClientShort(p.Title)
	}

	var (
		field  *string
		derive func(string) string
	)
	switch p.Type {
	case "litigation":
		field, derive = p.OpponentCode, strings.TrimSpace
	case "patent":
		field, derive = p.PatentNumber, patentLast3
	case "case":
		field, derive = p.ProceedingCode, proceedingTail
	}

	// Unknown type (field never assigned) or unset source field.
	if field == nil {
		return ""
	}
	return derive(*field)
}

// sanitizeClientShort produces an uppercase slug of at most 8
// characters from a client reference / title. It strips diacritics,
// drops every rune that is not a letter or digit, and uppercases the
// rest. Empty or all-punctuation input → "".
//
// The cap is counted in runes, not bytes: letters outside ASCII that
// survive diacritic stripping (e.g. CJK or Cyrillic) occupy several
// bytes each, and a byte-based cut could split one in the middle and
// emit invalid UTF-8.
//
// Examples:
//   "EXMPL"        → "EXMPL"
//   "Example Co."  → "EXAMPLEC"
//   "Müller GmbH"  → "MULLERGM"
//   "株式会社東京電力ホールディングス" → "株式会社東京電力"
//   "  "           → ""
func sanitizeClientShort(s string) string {
	const maxRunes = 8

	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}

	// Strip diacritics: NFD-decompose, drop combining marks, NFC-recompose.
	// The chain is built per call rather than shared at package level
	// (NOTE(review): transformers carry state; confirm before hoisting).
	t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
	stripped, _, err := transform.String(t, s)
	if err != nil {
		// Best effort: fall back to the untransformed input.
		stripped = s
	}

	var b strings.Builder
	kept := 0
	for _, r := range stripped {
		if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
			continue
		}
		b.WriteRune(unicode.ToUpper(r))
		kept++
		if kept == maxRunes {
			break
		}
	}
	return b.String()
}

// patentDigitsPattern matches a run of digits inside a patent number.
// Pre-compiled once to avoid per-call regex compilation cost.
var patentDigitsPattern = regexp.MustCompile(`\d+`)

// patentKindCodeSuffix matches a candidate trailing kind code on a
// patent publication number (A1, A2, B1, B2, C, T3, etc.): one
// uppercase letter, an optional digit, optional trailing whitespace,
// anchored at end-of-string.
//
// On its own the pattern cannot tell a kind code from the tail of a
// country prefix followed by a one-digit number ("DE1" also matches as
// "E1"); patentLast3 adds the "must follow a digit" check.
var patentKindCodeSuffix = regexp.MustCompile(`[A-Z][0-9]?\s*$`)

// patentLast3 extracts the last 3 digits of a patent number, returning
// the full digit-stream if the patent has fewer than 3 digits total.
//
// A trailing kind-code suffix (A1, B2, C, T3 …) is stripped first so
// its optional digit doesn't pollute the result — but only when the
// suffix directly follows a digit (optionally separated by whitespace).
// Without that guard, the only digit of a very short number would be
// mistaken for a kind-code digit and dropped ("DE1" → ""). All digit
// runs in the remainder are then collapsed to handle spaced / slashed
// formats. Input is case-insensitive. Examples:
//
//   "EP1234567"        → "567"
//   "EP 1 234 567"     → "567"
//   "EP3456789A1"      → "789"
//   "EP1234567 B1"     → "567"
//   "WO2020/123456A1"  → "456"
//   "DE12"             → "12"
//   "DE1"              → "1"
//   "EP"               → ""
//   ""                 → ""
func patentLast3(s string) string {
	s = strings.ToUpper(strings.TrimSpace(s))
	if s == "" {
		return ""
	}

	// Strip the trailing kind code, but only if what precedes it ends in
	// a digit — i.e. it really trails a number rather than being part of
	// the country prefix.
	if loc := patentKindCodeSuffix.FindStringIndex(s); loc != nil {
		head := strings.TrimSpace(s[:loc[0]])
		if n := len(head); n > 0 && head[n-1] >= '0' && head[n-1] <= '9' {
			s = head
		}
	}

	// Joining a nil match list yields "", which covers digit-free input.
	digits := strings.Join(patentDigitsPattern.FindAllString(s, -1), "")
	if len(digits) > 3 {
		return digits[len(digits)-3:]
	}
	return digits
}

// proceedingTail converts a proceeding_types.code such as "upc.inf.cfi"
// into the uppercase remainder after the first dot. The leading
// segment is the jurisdiction, which the ancestor client / patent
// context already implies, so it is left out of the project code.
//
// Surrounding whitespace is ignored. A code without any dot has no
// tail and yields "".
//
//   "upc.inf.cfi"     → "INF.CFI"
//   "upc.rev.cfi"     → "REV.CFI"
//   "upc.apl.merits"  → "APL.MERITS"
//   "de.inf.lg"       → "INF.LG"
//   "de.inf.olg"      → "INF.OLG"
//   "single"          → ""        (nothing follows the only segment)
//   ""                → ""
func proceedingTail(code string) string {
	trimmed := strings.TrimSpace(code)

	// Everything up to and including the first dot is the jurisdiction.
	firstDot := strings.IndexByte(trimmed, '.')
	if firstDot < 0 {
		return ""
	}
	// Upper-casing leaves the remaining dots untouched, so the tail can
	// be converted in one go.
	return strings.ToUpper(trimmed[firstDot+1:])
}