paliad/internal/services/submission_render.go

package services

// Submission template renderer — in-house engine for the submission
// generator (t-paliad-215, design doc
// docs/design-submission-generator-2026-05-19.md §6).
//
// Design choice — why not lukasjarosch/go-docx:
// The library's "nested placeholder" guard treats sibling placeholders
// inside the same <w:t> run (e.g. "{{a}} ./. {{b}}") as nested and
// refuses to replace either. Patent submissions routinely have multiple
// placeholders per paragraph (party blocks especially), so the library
// is a non-starter without a custom fork. The in-house renderer below
// is ~150 LoC and handles both the single-run common case and the
// cross-run case (where Word may split a placeholder across runs after
// editing).
//
// Placeholder grammar: {{[A-Za-z][A-Za-z0-9_.]*}} with optional
// whitespace inside braces ({{  project.case_number  }} ≡
// {{project.case_number}}).
//
// Missing-value behaviour: when a placeholder has no binding in the
// PlaceholderMap, the renderer emits a marker token so the lawyer sees
// the gap in Word rather than failing the request. See §6.3 of the
// design doc.

import (
	"archive/zip"
	"bytes"
	"fmt"
	"io"
	"regexp"
	"strings"
)

// PlaceholderMap is the variable bag built by SubmissionVarsService.
// Keys are dotted paths without braces (e.g. "project.case_number").
// Values are the substituted text — already locale-aware, pretty-
// printed, and sanitised by the caller.
//
// Values are plain text, not XML: the renderer escapes them itself
// when it writes them back into a <w:t> node, so callers must not
// pre-escape. A key that is absent from the map renders the
// missing-value marker, whereas a key bound to "" renders as empty
// text.
type PlaceholderMap map[string]string

// MissingPlaceholderFn maps the key of an unbound placeholder to the
// marker text that is written into the document in its place. The
// stock implementation comes from DefaultMissingMarker and yields
// "[KEIN WERT: <key>]" or "[NO VALUE: <key>]" depending on lang.
type MissingPlaceholderFn func(key string) string

// DefaultMissingMarker builds the standard missing-value marker
// function for a UI language. "en" (compared case-insensitively)
// selects the English wording; every other value, including the empty
// string, selects the German wording.
func DefaultMissingMarker(lang string) MissingPlaceholderFn {
	label := "KEIN WERT"
	if strings.EqualFold(lang, "en") {
		label = "NO VALUE"
	}
	// The opening part of the marker is fixed per language, so build
	// it once instead of on every call of the returned function.
	opening := "[" + label + ": "
	return func(key string) string {
		return opening + key + "]"
	}
}

// placeholderRegex matches a single placeholder. Capture group 1
// extracts the key name without braces or surrounding whitespace; the
// \s* on either side of the group is what makes "{{  key  }}" and
// "{{key}}" equivalent.
//
// Restricted to [A-Za-z][A-Za-z0-9_.]* so that stray "{{" sequences in
// legal prose (extremely rare in DE/EN court briefs but possible)
// don't get mistaken for placeholders. A genuine placeholder always
// starts with an ASCII letter. Anything between braces that does not
// fit this grammar (e.g. "{{1st}}" or "{{a b}}") is not matched and is
// therefore left in the text unchanged.
var placeholderRegex = regexp.MustCompile(`\{\{\s*([A-Za-z][A-Za-z0-9_.]*)\s*\}\}`)

// SubmissionRenderer turns a .docx template into a finished .docx by
// replacing {{placeholder}} tokens with the values of a PlaceholderMap.
// The type carries no fields, so one instance holds no state and can
// be shared between goroutines.
type SubmissionRenderer struct{}

// NewSubmissionRenderer returns a ready-to-use renderer.
func NewSubmissionRenderer() *SubmissionRenderer {
	return new(SubmissionRenderer)
}

// Render reads the .docx template held in templateBytes, substitutes
// every placeholder from vars (or emits the missing-marker token), and
// returns the resulting .docx as a byte slice. Unknown placeholders
// never fail the render — the lawyer sees the marker in Word and fixes
// it.
//
// Parameters:
//   - templateBytes: the complete template file (a zip archive).
//   - vars: placeholder bindings; may be nil, in which case every
//     placeholder renders the missing marker.
//   - missing: marker function for unbound keys; nil selects
//     DefaultMissingMarker("de").
//
// Every archive entry is copied to the output in its original order.
// Only the entries accepted by isWordXMLEntry are rewritten; all
// others are passed through byte-for-byte. Of each entry's header only
// the name, compression method and modification time are carried over.
//
// An error is returned when the template is not a readable zip archive
// or when an entry cannot be read or written; the returned slice is
// nil in that case.
func (r *SubmissionRenderer) Render(templateBytes []byte, vars PlaceholderMap, missing MissingPlaceholderFn) ([]byte, error) {
	if missing == nil {
		missing = DefaultMissingMarker("de")
	}
	zr, err := zip.NewReader(bytes.NewReader(templateBytes), int64(len(templateBytes)))
	if err != nil {
		return nil, fmt.Errorf("submission template: open zip: %w", err)
	}

	// The writer targets an in-memory buffer, so there is nothing to
	// release on the error paths and no deferred Close is needed. The
	// single explicit Close below is the only one: it is what writes
	// the central directory, and its error must be checked.
	var out bytes.Buffer
	zw := zip.NewWriter(&out)

	for _, entry := range zr.File {
		body, err := readZipEntry(entry)
		if err != nil {
			return nil, fmt.Errorf("submission template: read %s: %w", entry.Name, err)
		}
		if isWordXMLEntry(entry.Name) {
			body = substituteInDocumentXML(body, vars, missing)
		}
		w, err := zw.CreateHeader(&zip.FileHeader{
			Name:     entry.Name,
			Method:   entry.Method,
			Modified: entry.Modified,
		})
		if err != nil {
			return nil, fmt.Errorf("submission template: write header %s: %w", entry.Name, err)
		}
		if _, err := w.Write(body); err != nil {
			return nil, fmt.Errorf("submission template: write %s: %w", entry.Name, err)
		}
	}
	if err := zw.Close(); err != nil {
		return nil, fmt.Errorf("submission template: finalise zip: %w", err)
	}
	return out.Bytes(), nil
}

// isWordXMLEntry reports whether the archive entry called name is one
// of the .docx parts the renderer rewrites: the main document
// (word/document.xml) and any header or footer part
// (word/header*.xml, word/footer*.xml — templates may put firm
// letterhead in a header). Everything else — styles, theme, settings,
// comments, footnotes — is skipped, since none of those should carry
// merge placeholders in a well-formed template.
func isWordXMLEntry(name string) bool {
	if name == "word/document.xml" {
		return true
	}
	// Header and footer parts share the ".xml" suffix; checking it
	// first also rules out relationship files such as
	// "word/_rels/header1.xml.rels".
	if !strings.HasSuffix(name, ".xml") {
		return false
	}
	return strings.HasPrefix(name, "word/header") || strings.HasPrefix(name, "word/footer")
}

// readZipEntry returns the complete (decompressed) contents of one zip
// entry. The entry's reader is closed before returning; an error from
// opening or reading the entry is passed back unchanged.
func readZipEntry(f *zip.File) ([]byte, error) {
	src, openErr := f.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer src.Close()

	data, readErr := io.ReadAll(src)
	return data, readErr
}

// substituteInDocumentXML replaces every {{placeholder}} found in the
// <w:t> text nodes of one WordprocessingML part and returns the
// rewritten XML.
//
// The work is split into two passes:
//
//  1. Node-local pass (substituteInTextNodes): resolves placeholders
//     that sit entirely inside a single <w:t>…</w:t>. This covers
//     freshly authored templates and keeps all run-level formatting
//     (bold, italic, font runs) intact.
//  2. Paragraph pass (substituteAcrossRuns): runs only when text nodes
//     still contain "{{" or "}}" after pass 1, which is what a
//     placeholder looks like once Word's autocorrect or manual editing
//     has split it across runs. Affected paragraphs are collapsed into
//     a single run, so they lose their intra-paragraph formatting;
//     paragraphs without a fragmented placeholder are not touched.
func substituteInDocumentXML(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
	afterNodePass := substituteInTextNodes(body, vars, missing)
	if needsCrossRunMerge(afterNodePass) {
		return substituteAcrossRuns(afterNodePass, vars, missing)
	}
	return afterNodePass
}

// wTextNodeRegex matches one <w:t …>contents</w:t> element.
//
// Capture groups:
//   - group 1: the attribute string including its leading whitespace
//     (e.g. ` xml:space="preserve"`); empty when the tag has none.
//   - group 2: the raw text contents, still XML-escaped.
//
// Attributes on <w:t> (xml:space="preserve") are preserved because the
// entire match is rewritten from these two groups. After "<w:t" the
// pattern requires either whitespace or ">", so other elements whose
// names merely start with "w:t" (<w:tab/>, <w:tbl>, <w:tc>) never
// match. Contents are [^<]*, i.e. a node is only matched when it
// contains no child elements.
var wTextNodeRegex = regexp.MustCompile(`<w:t(\s[^>]*)?>([^<]*)</w:t>`)

// substituteInTextNodes is pass 1 of the renderer: it resolves
// placeholders inside each <w:t> text node on its own, without looking
// at neighbouring nodes. Because only the text of the node changes,
// the formatting of the surrounding run is preserved.
//
// Nodes in which nothing was replaced are returned byte-for-byte, so
// untouched text is never re-escaped.
func substituteInTextNodes(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
	rewriteNode := func(node []byte) []byte {
		groups := wTextNodeRegex.FindSubmatch(node)
		tagAttrs := string(groups[1])
		before := xmlDecode(string(groups[2]))
		after := replacePlaceholders(before, vars, missing)
		if after == before {
			return node
		}
		// Without xml:space="preserve" Word collapses leading and
		// trailing spaces. A substituted value can introduce such a
		// space even though the template text had none, so declare the
		// attribute when the tag does not already carry it.
		edgeSpace := strings.HasPrefix(after, " ") || strings.HasSuffix(after, " ")
		if edgeSpace && !strings.Contains(tagAttrs, "xml:space") {
			tagAttrs += ` xml:space="preserve"`
		}
		return []byte("<w:t" + tagAttrs + ">" + xmlEncode(after) + "</w:t>")
	}
	return wTextNodeRegex.ReplaceAllFunc(body, rewriteNode)
}

// needsCrossRunMerge reports whether any <w:t> text node in body still
// contains "{{" or "}}" — the sign that pass 1 left something behind,
// typically a placeholder that Word fragmented across runs.
//
// This is a cheap presence check, not a balance check: a single
// marker in any text node is enough to request pass 2, which then
// decides per paragraph whether anything actually needs rewriting.
// Markers outside text nodes (attributes, other XML) are ignored.
func needsCrossRunMerge(body []byte) bool {
	nodes := wTextNodeRegex.FindAllSubmatch(body, -1)
	for i := range nodes {
		text := string(nodes[i][2])
		hasOpen := strings.Contains(text, "{{")
		hasClose := strings.Contains(text, "}}")
		if hasOpen || hasClose {
			return true
		}
	}
	return false
}

// wParagraphRegex matches one <w:p>…</w:p> paragraph block, from its
// opening tag up to the nearest following </w:p> (the inner match is
// lazy).
//
// The opening-tag part deliberately rejects self-closing paragraphs
// (<w:p/>, <w:p w:rsidR="…"/>). Such an element has no closing tag of
// its own, so accepting it as an opener would let the lazy match run
// on to the </w:p> of the NEXT paragraph and treat both as one block.
// Empty paragraphs carry no text, so skipping them loses nothing.
// After "<w:p" the pattern requires whitespace or ">", which keeps
// <w:pPr>, <w:pict> and similar elements from matching.
//
// NOTE(review): the lazy match assumes <w:p> elements do not nest. A
// paragraph nested inside another one (e.g. via text-box content)
// would cut the outer match short at the inner </w:p> — confirm that
// templates do not rely on such constructs.
var wParagraphRegex = regexp.MustCompile(`(?s)<w:p(?:\s(?:[^>/]|/[^>])*)?>.*?</w:p>`)

// wRunPropsRegex pulls the first <w:rPr>…</w:rPr> block from the bytes
// it is applied to, so it can be reused as the formatting of the
// merged run. The match is lazy and ends at the first </w:rPr>.
//
// The pattern cannot tell a run's <w:rPr> apart from the one that may
// be nested inside a paragraph's <w:pPr>; it simply returns whichever
// comes first in the searched range. Callers that specifically want
// run formatting must choose that range accordingly.
var wRunPropsRegex = regexp.MustCompile(`(?s)<w:rPr>.*?</w:rPr>`)

// wParagraphPropsRegex pulls the optional <w:pPr>…</w:pPr> that sits
// at the top of a paragraph (alignment, spacing, etc.). Preserved.
// The match is lazy and ends at the first </w:pPr>; a self-closing
// <w:pPr/> is not matched.
//
// NOTE(review): if a paragraph's properties contain a nested <w:pPr>
// (as tracked paragraph-property changes presumably do), the lazy
// match would stop at the inner closing tag — confirm templates are
// saved without tracked changes.
var wParagraphPropsRegex = regexp.MustCompile(`(?s)<w:pPr>.*?</w:pPr>`)

// substituteAcrossRuns is pass 2: for any paragraph that still has a
// split placeholder, concatenate every text node, run replacement on
// the merged text, and rewrite the paragraph as a single run.
//
// A paragraph is left untouched when it has no text nodes, when its
// merged text contains no "{{", or when replacement changes nothing —
// so run-level formatting survives wherever pass 1 already resolved
// the placeholders.
//
// A rewritten paragraph consists of:
//   - the original <w:pPr> block, if any (alignment, spacing);
//   - one run carrying the first <w:rPr> found AFTER the <w:pPr>
//     block, i.e. the formatting of the first run that declares any;
//   - one <w:t xml:space="preserve"> holding the substituted text.
//
// Everything else in the paragraph is dropped: attributes on <w:p>,
// the formatting of later runs, and any content that is not <w:t>
// text.
func substituteAcrossRuns(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
	return wParagraphRegex.ReplaceAllFunc(body, func(para []byte) []byte {
		textNodes := wTextNodeRegex.FindAllSubmatch(para, -1)
		if len(textNodes) == 0 {
			return para
		}
		var merged strings.Builder
		for _, m := range textNodes {
			merged.WriteString(xmlDecode(string(m[2])))
		}
		original := merged.String()
		if !strings.Contains(original, "{{") {
			// No fragmented placeholder in this paragraph; leave it
			// alone so pass 1's run-level edits survive.
			return para
		}
		replaced := replacePlaceholders(original, vars, missing)
		if replaced == original {
			return para
		}

		// Split the paragraph into its properties block and the rest.
		// The run properties must be looked up in the rest only:
		// <w:pPr> may itself contain a <w:rPr> (the formatting of the
		// paragraph mark), and searching the whole paragraph would
		// pick that one up instead of a run's formatting.
		var pPr []byte
		runs := para
		if loc := wParagraphPropsRegex.FindIndex(para); loc != nil {
			pPr = para[loc[0]:loc[1]]
			runs = para[loc[1]:]
		}
		rPr := wRunPropsRegex.Find(runs)

		var rebuilt bytes.Buffer
		rebuilt.WriteString(`<w:p>`)
		if pPr != nil {
			rebuilt.Write(pPr)
		}
		rebuilt.WriteString(`<w:r>`)
		if rPr != nil {
			rebuilt.Write(rPr)
		}
		rebuilt.WriteString(`<w:t xml:space="preserve">`)
		rebuilt.WriteString(xmlEncode(replaced))
		rebuilt.WriteString(`</w:t></w:r></w:p>`)
		return rebuilt.Bytes()
	})
}

// replacePlaceholders substitutes every placeholder in the plain
// (already XML-decoded) string s. A key bound in vars is replaced by
// its value — including the empty string — and an unbound key is
// replaced by missing(key). Text that does not match the placeholder
// grammar is returned unchanged.
func replacePlaceholders(s string, vars PlaceholderMap, missing MissingPlaceholderFn) string {
	resolve := func(token string) string {
		groups := placeholderRegex.FindStringSubmatch(token)
		if len(groups) < 2 {
			// Defensive: token always comes from a successful match,
			// so the key group is expected to be present.
			return token
		}
		name := groups[1]
		bound, found := vars[name]
		if !found {
			return missing(name)
		}
		return bound
	}
	return placeholderRegex.ReplaceAllStringFunc(s, resolve)
}

// xmlDecode turns the escaped character data of a <w:t> node back into
// plain text. We don't need a full XML parser for this; a single
// left-to-right scan over the references is enough.
//
// Recognised references:
//   - the five predefined entities: &lt; &gt; &quot; &apos; &amp;
//   - numeric character references, decimal (&#8211;) or hexadecimal
//     (&#x2013;), as long as they denote a character XML 1.0 permits
//
// Anything else that starts with '&' is copied through unchanged.
//
// Numeric references have to be decoded here because a rewritten node
// is re-escaped with xmlEncode: a reference left in its encoded form
// would have its '&' escaped again and show up in Word as literal
// "&#8211;" text.
//
// The scan is single-pass, so decoded output is never re-interpreted:
// "&amp;lt;" yields the text "&lt;", not "<".
func xmlDecode(s string) string {
	if !strings.Contains(s, "&") {
		return s
	}
	var b strings.Builder
	b.Grow(len(s))
	for {
		amp := strings.IndexByte(s, '&')
		if amp < 0 {
			b.WriteString(s)
			return b.String()
		}
		b.WriteString(s[:amp])
		text, width := decodeXMLReference(s[amp:])
		b.WriteString(text)
		s = s[amp+width:]
	}
}

// decodeXMLReference decodes the reference at the start of s, which
// must begin with '&'. It returns the decoded text and the number of
// bytes of s it stands for. An unrecognised reference yields ("&", 1),
// so the caller copies the ampersand and carries on behind it.
func decodeXMLReference(s string) (string, int) {
	switch {
	case strings.HasPrefix(s, "&lt;"):
		return "<", len("&lt;")
	case strings.HasPrefix(s, "&gt;"):
		return ">", len("&gt;")
	case strings.HasPrefix(s, "&quot;"):
		return `"`, len("&quot;")
	case strings.HasPrefix(s, "&apos;"):
		return "'", len("&apos;")
	case strings.HasPrefix(s, "&amp;"):
		return "&", len("&amp;")
	case strings.HasPrefix(s, "&#"):
		if r, width, ok := parseXMLCharRef(s); ok {
			return string(r), width
		}
	}
	return "&", 1
}

// parseXMLCharRef parses the numeric character reference at the start
// of s, which must begin with "&#". It returns the referenced
// character and the byte length of the whole reference including the
// closing ';'. ok is false when the reference is malformed (no digits,
// an invalid digit, no ';') or denotes a code point outside the XML
// 1.0 Char production.
func parseXMLCharRef(s string) (r rune, width int, ok bool) {
	i := len("&#")
	base := rune(10)
	// XML only accepts a lowercase 'x' as the hexadecimal marker.
	if i < len(s) && s[i] == 'x' {
		base = 16
		i++
	}
	first := i
	var code rune
	for ; i < len(s) && s[i] != ';'; i++ {
		var digit rune
		switch c := s[i]; {
		case c >= '0' && c <= '9':
			digit = rune(c - '0')
		case base == 16 && c >= 'a' && c <= 'f':
			digit = rune(c-'a') + 10
		case base == 16 && c >= 'A' && c <= 'F':
			digit = rune(c-'A') + 10
		default:
			return 0, 0, false
		}
		code = code*base + digit
		// Bail out early; this also keeps code from overflowing on
		// absurdly long digit strings.
		if code > 0x10FFFF {
			return 0, 0, false
		}
	}
	if i == first || i == len(s) {
		// No digits at all, or the closing ';' is missing.
		return 0, 0, false
	}
	// XML 1.0 Char: #x9 | #xA | #xD | [#x20-#xD7FF] |
	// [#xE000-#xFFFD] | [#x10000-#x10FFFF].
	valid := code == 0x9 || code == 0xA || code == 0xD ||
		(code >= 0x20 && code <= 0xD7FF) ||
		(code >= 0xE000 && code <= 0xFFFD) ||
		code >= 0x10000
	if !valid {
		return 0, 0, false
	}
	return code, i + 1, true
}

// xmlEncode escapes plain text for insertion into a WordprocessingML
// text node: & < > " ' become &amp; &lt; &gt; &quot; &apos;.
//
// The text is escaped in one pass over its bytes, so every input
// character is looked at exactly once and the '&' of an entity that
// was just written can never be escaped a second time. All five
// characters are ASCII, which makes the byte-wise scan safe for
// multi-byte UTF-8 sequences — they are copied through untouched.
func xmlEncode(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); i++ {
		switch c := s[i]; c {
		case '&':
			b.WriteString("&amp;")
		case '<':
			b.WriteString("&lt;")
		case '>':
			b.WriteString("&gt;")
		case '"':
			b.WriteString("&quot;")
		case '\'':
			b.WriteString("&apos;")
		default:
			b.WriteByte(c)
		}
	}
	return b.String()
}