Files
paliad/internal/services/submission_render.go
mAi 8ea3509b98 feat(submissions): t-paliad-215 Slice 1 — in-house .docx render engine
Pure-Go {{path.dot.notation}} placeholder engine + unit tests
(t-paliad-215, design docs/design-submission-generator-2026-05-19.md
§6). Chosen over github.com/lukasjarosch/go-docx because that library
treats sibling placeholders inside one <w:t> run as nested and
refuses to replace them — patent submissions routinely carry multiple
placeholders per paragraph (party blocks especially), so the library
is a non-starter.

Two-pass strategy preserves run-level formatting on the common path:

 1. Pass 1: regex replace inside each <w:t>…</w:t> independently —
    no format loss for the 99% case where placeholders are intact.
 2. Pass 2: paragraph-level merge for paragraphs that still contain
    orphan "{{" or "}}" markers (Word fragmented the placeholder
    across runs).

Missing placeholders render [KEIN WERT: <key>] / [NO VALUE: <key>]
markers so the lawyer sees the gap in Word rather than getting a 400.

Tests cover: single-run, multi-per-run (the go-docx failure mode),
cross-run merge, missing-marker (DE+EN), XML escaping of special
chars, non-document zip entries preserved, placeholder regex
grammar.
2026-05-19 13:42:51 +02:00

316 lines
12 KiB
Go

package services
// Submission template renderer — in-house engine for the submission
// generator (t-paliad-215, design doc
// docs/design-submission-generator-2026-05-19.md §6).
//
// Design choice — why not lukasjarosch/go-docx:
// The library's "nested placeholder" guard treats sibling placeholders
// inside the same <w:t> run (e.g. "{{a}} ./. {{b}}") as nested and
// refuses to replace either. Patent submissions routinely have multiple
// placeholders per paragraph (party blocks especially), so the library
// is a non-starter without a custom fork. The in-house renderer below
// is ~150 LoC and handles both the single-run common case and the
// cross-run case (where Word may split a placeholder across runs after
// editing).
//
// Placeholder grammar: {{[A-Za-z][A-Za-z0-9_.]*}} with optional
// whitespace inside braces ({{ project.case_number }} ≡
// {{project.case_number}}).
//
// Missing-value behaviour: when a placeholder has no binding in the
// PlaceholderMap, the renderer emits a marker token so the lawyer sees
// the gap in Word rather than failing the request. See §6.3 of the
// design doc.
import (
"archive/zip"
"bytes"
"fmt"
"io"
"regexp"
"strings"
)
// PlaceholderMap is the variable bag built by SubmissionVarsService.
// Keys are dotted paths without braces (e.g. "project.case_number").
// Values are the substituted text — already locale-aware, pretty-
// printed, and sanitised by the caller.
type PlaceholderMap map[string]string
// MissingPlaceholderFn translates an unbound placeholder key into the
// in-document marker token. The default in DefaultMissingMarker is
// "[KEIN WERT: <key>]" / "[NO VALUE: <key>]" depending on lang.
type MissingPlaceholderFn func(key string) string
// DefaultMissingMarker returns the standard missing-value marker for
// the given UI language.
func DefaultMissingMarker(lang string) MissingPlaceholderFn {
prefix := "KEIN WERT"
if strings.EqualFold(lang, "en") {
prefix = "NO VALUE"
}
return func(key string) string {
return "[" + prefix + ": " + key + "]"
}
}
// placeholderRegex matches a single placeholder. The capture group
// extracts the key name without braces or surrounding whitespace.
//
// Restricted to [A-Za-z][A-Za-z0-9_.]* so that stray "{{" sequences in
// legal prose (extremely rare in DE/EN court briefs but possible)
// don't get mistaken for placeholders. A genuine placeholder always
// starts with an ASCII letter.
var placeholderRegex = regexp.MustCompile(`\{\{\s*([A-Za-z][A-Za-z0-9_.]*)\s*\}\}`)
// SubmissionRenderer renders a .docx template into a .docx output by
// substituting {{placeholder}} tokens with values from a PlaceholderMap.
// Stateless; safe for concurrent use.
type SubmissionRenderer struct{}
// NewSubmissionRenderer constructs the renderer.
func NewSubmissionRenderer() *SubmissionRenderer {
return &SubmissionRenderer{}
}
// Render reads the .docx template at templateBytes, substitutes every
// placeholder from vars (or emits the missing-marker token), and writes
// the result to the returned byte slice. Unknown placeholders never
// fail the render — the lawyer sees the marker in Word and fixes it.
func (r *SubmissionRenderer) Render(templateBytes []byte, vars PlaceholderMap, missing MissingPlaceholderFn) ([]byte, error) {
if missing == nil {
missing = DefaultMissingMarker("de")
}
zr, err := zip.NewReader(bytes.NewReader(templateBytes), int64(len(templateBytes)))
if err != nil {
return nil, fmt.Errorf("submission template: open zip: %w", err)
}
var out bytes.Buffer
zw := zip.NewWriter(&out)
defer zw.Close()
for _, entry := range zr.File {
body, err := readZipEntry(entry)
if err != nil {
return nil, fmt.Errorf("submission template: read %s: %w", entry.Name, err)
}
if isWordXMLEntry(entry.Name) {
body = substituteInDocumentXML(body, vars, missing)
}
w, err := zw.CreateHeader(&zip.FileHeader{
Name: entry.Name,
Method: entry.Method,
Modified: entry.Modified,
})
if err != nil {
return nil, fmt.Errorf("submission template: write header %s: %w", entry.Name, err)
}
if _, err := w.Write(body); err != nil {
return nil, fmt.Errorf("submission template: write %s: %w", entry.Name, err)
}
}
if err := zw.Close(); err != nil {
return nil, fmt.Errorf("submission template: finalise zip: %w", err)
}
return out.Bytes(), nil
}
// isWordXMLEntry returns true for the .docx parts that contain
// substitutable text. We touch document.xml plus header*.xml and
// footer*.xml (templates may put firm letterhead in a header) but
// skip styles, theme, settings, comments, footnotes — none of which
// should carry merge placeholders in a well-formed template.
func isWordXMLEntry(name string) bool {
switch {
case name == "word/document.xml":
return true
case strings.HasPrefix(name, "word/header") && strings.HasSuffix(name, ".xml"):
return true
case strings.HasPrefix(name, "word/footer") && strings.HasSuffix(name, ".xml"):
return true
}
return false
}
// readZipEntry slurps a zip entry's bytes.
func readZipEntry(f *zip.File) ([]byte, error) {
rc, err := f.Open()
if err != nil {
return nil, err
}
defer rc.Close()
return io.ReadAll(rc)
}
// substituteInDocumentXML walks document XML and replaces every
// {{placeholder}} occurrence inside <w:t> text nodes. Handles both
// single-run placeholders (the common case for freshly authored
// templates) and cross-run placeholders (where Word's autocorrect or
// manual editing has split a placeholder across runs).
//
// Two-pass strategy:
//
// 1. Pass 1: replace placeholders that fit entirely within one
// <w:t>…</w:t>. This is the 99% case and preserves all run-level
// formatting (bold, italic, font runs).
// 2. Pass 2: for paragraphs that still contain orphan "{{" or "}}"
// markers after pass 1, merge the text of every <w:t> inside the
// paragraph, run the replacement on the merged text, and rewrite
// the paragraph's runs as a single <w:r><w:t>…</w:t></w:r> using
// the formatting properties of the first run. Loses intra-paragraph
// formatting on the affected paragraph — but only on paragraphs
// where Word genuinely fragmented a placeholder.
func substituteInDocumentXML(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
replaced := substituteInTextNodes(body, vars, missing)
if !needsCrossRunMerge(replaced) {
return replaced
}
return substituteAcrossRuns(replaced, vars, missing)
}
// wTextNodeRegex matches one <w:t …>contents</w:t> element, capturing
// the contents. Attributes on <w:t> (xml:space="preserve") are preserved
// because the entire match is rewritten.
var wTextNodeRegex = regexp.MustCompile(`<w:t(\s[^>]*)?>([^<]*)</w:t>`)
// substituteInTextNodes runs the placeholder replacement inside each
// <w:t> text node independently. Format-preserving for single-run
// placeholders.
func substituteInTextNodes(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
return wTextNodeRegex.ReplaceAllFunc(body, func(match []byte) []byte {
sub := wTextNodeRegex.FindSubmatch(match)
attrs := string(sub[1])
contents := xmlDecode(string(sub[2]))
replaced := replacePlaceholders(contents, vars, missing)
if replaced == contents {
return match
}
// xml:space="preserve" stays attached whenever the original
// content had leading/trailing whitespace; ensure it's still
// declared after replacement to avoid Word collapsing spaces.
if !strings.Contains(attrs, "xml:space") &&
(strings.HasPrefix(replaced, " ") || strings.HasSuffix(replaced, " ")) {
attrs += ` xml:space="preserve"`
}
return []byte(`<w:t` + attrs + `>` + xmlEncode(replaced) + `</w:t>`)
})
}
// needsCrossRunMerge returns true when the body still contains an
// unmatched "{{" or "}}" after pass 1 — a sign that Word fragmented
// the placeholder across runs and pass 1 couldn't touch it.
func needsCrossRunMerge(body []byte) bool {
// Cheap heuristic: count "{{" vs "}}" inside <w:t> nodes. If we have
// either marker present in the text-node space, pass 2 will handle
// it. (Inside attributes or other XML, the markers don't matter.)
for _, m := range wTextNodeRegex.FindAllSubmatch(body, -1) {
t := string(m[2])
if strings.Contains(t, "{{") || strings.Contains(t, "}}") {
return true
}
}
return false
}
// wParagraphRegex matches one <w:p>…</w:p> paragraph block. Greedy
// inner-content match is safe here because <w:p> elements do not nest
// in WordprocessingML — a paragraph is the leaf container for text.
var wParagraphRegex = regexp.MustCompile(`(?s)<w:p\b[^>]*>.*?</w:p>`)
// wRunPropsRegex pulls the first <w:rPr>…</w:rPr> block from a
// paragraph so we can reuse it as the formatting of the merged run.
var wRunPropsRegex = regexp.MustCompile(`(?s)<w:rPr>.*?</w:rPr>`)
// wParagraphPropsRegex pulls the optional <w:pPr>…</w:pPr> that sits
// at the top of a paragraph (alignment, spacing, etc.). Preserved.
var wParagraphPropsRegex = regexp.MustCompile(`(?s)<w:pPr>.*?</w:pPr>`)
// substituteAcrossRuns is pass 2: for any paragraph that still has a
// split placeholder, concatenate every text node, run replacement, and
// rewrite the paragraph as a single run using the first run's
// properties. Paragraphs without orphan markers are left untouched so
// run-level formatting survives wherever pass 1 already resolved the
// placeholders.
func substituteAcrossRuns(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
return wParagraphRegex.ReplaceAllFunc(body, func(para []byte) []byte {
textNodes := wTextNodeRegex.FindAllSubmatch(para, -1)
if len(textNodes) == 0 {
return para
}
var merged strings.Builder
for _, m := range textNodes {
merged.WriteString(xmlDecode(string(m[2])))
}
original := merged.String()
if !strings.Contains(original, "{{") {
// No fragmented placeholder in this paragraph; leave it
// alone so pass 1's run-level edits survive.
return para
}
replaced := replacePlaceholders(original, vars, missing)
if replaced == original {
return para
}
// Preserve paragraph properties (alignment, spacing) and the
// first run's properties (font, bold/italic).
pPr := wParagraphPropsRegex.Find(para)
rPr := wRunPropsRegex.Find(para)
var rebuilt bytes.Buffer
rebuilt.WriteString(`<w:p>`)
if pPr != nil {
rebuilt.Write(pPr)
}
rebuilt.WriteString(`<w:r>`)
if rPr != nil {
rebuilt.Write(rPr)
}
rebuilt.WriteString(`<w:t xml:space="preserve">`)
rebuilt.WriteString(xmlEncode(replaced))
rebuilt.WriteString(`</w:t></w:r></w:p>`)
return rebuilt.Bytes()
})
}
// replacePlaceholders performs the actual substitution on a plain
// string. Unbound placeholders render the missing marker.
func replacePlaceholders(s string, vars PlaceholderMap, missing MissingPlaceholderFn) string {
return placeholderRegex.ReplaceAllStringFunc(s, func(match string) string {
sub := placeholderRegex.FindStringSubmatch(match)
if len(sub) < 2 {
return match
}
key := sub[1]
if value, ok := vars[key]; ok {
return value
}
return missing(key)
})
}
// xmlDecode reverses the small set of escapes used in WordprocessingML
// text content. We don't need a full XML parser — text nodes carry only
// the standard five entities, and Word never emits numeric-character
// references inside <w:t> for printable content.
func xmlDecode(s string) string {
s = strings.ReplaceAll(s, "&lt;", "<")
s = strings.ReplaceAll(s, "&gt;", ">")
s = strings.ReplaceAll(s, "&quot;", `"`)
s = strings.ReplaceAll(s, "&apos;", "'")
s = strings.ReplaceAll(s, "&amp;", "&")
return s
}
// xmlEncode escapes a substituted value for safe insertion back into a
// WordprocessingML text node. & must be replaced first to avoid double
// encoding the entity prefixes we introduce on the other characters.
func xmlEncode(s string) string {
s = strings.ReplaceAll(s, "&", "&amp;")
s = strings.ReplaceAll(s, "<", "&lt;")
s = strings.ReplaceAll(s, ">", "&gt;")
s = strings.ReplaceAll(s, `"`, "&quot;")
s = strings.ReplaceAll(s, "'", "&apos;")
return s
}