Pure-Go {{path.dot.notation}} placeholder engine + unit tests
(t-paliad-215, design docs/design-submission-generator-2026-05-19.md
§6). Chosen over github.com/lukasjarosch/go-docx because that library
treats sibling placeholders inside one <w:t> run as nested and
refuses to replace them — patent submissions routinely carry multiple
placeholders per paragraph (party blocks especially), so the library
is a non-starter.
Two-pass strategy preserves run-level formatting on the common path:
1. Pass 1: regex replace inside each <w:t>…</w:t> independently —
no format loss for the 99% case where placeholders are intact.
2. Pass 2: paragraph-level merge for paragraphs that still contain
orphan "{{" or "}}" markers (Word fragmented the placeholder
across runs).
Missing placeholders render [KEIN WERT: <key>] / [NO VALUE: <key>]
markers so the lawyer sees the gap in Word rather than getting a 400.
Tests cover: single-run, multi-per-run (the go-docx failure mode),
cross-run merge, missing-marker (DE+EN), XML escaping of special
chars, non-document zip entries preserved, placeholder regex
grammar.
316 lines
12 KiB
Go
316 lines
12 KiB
Go
package services
|
|
|
|
// Submission template renderer — in-house engine for the submission
// generator (t-paliad-215, design doc
// docs/design-submission-generator-2026-05-19.md §6).
//
// Design choice — why not lukasjarosch/go-docx:
// The library's "nested placeholder" guard treats sibling placeholders
// inside the same <w:t> run (e.g. "{{a}} ./. {{b}}") as nested and
// refuses to replace either. Patent submissions routinely have multiple
// placeholders per paragraph (party blocks especially), so the library
// is a non-starter without a custom fork. The in-house renderer below
// is ~150 LoC and handles both the single-run common case and the
// cross-run case (where Word may split a placeholder across runs after
// editing).
//
// Placeholder grammar: {{[A-Za-z][A-Za-z0-9_.]*}} with optional
// whitespace inside braces ({{ project.case_number }} ≡
// {{project.case_number}}).
//
// Missing-value behaviour: when a placeholder has no binding in the
// PlaceholderMap, the renderer emits a marker token so the lawyer sees
// the gap in Word rather than failing the request. See §6.3 of the
// design doc.
|
|
|
|
import (
|
|
"archive/zip"
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// PlaceholderMap is the variable bag built by SubmissionVarsService.
// Keys are dotted paths without braces (e.g. "project.case_number").
// Values are the substituted text — already locale-aware, pretty-
// printed, and sanitised by the caller. Values are plain text, not XML:
// the renderer escapes them before writing them into the document.
type PlaceholderMap map[string]string
|
|
|
|
// MissingPlaceholderFn translates an unbound placeholder key into the
// in-document marker token. The default built by DefaultMissingMarker is
// "[KEIN WERT: <key>]" / "[NO VALUE: <key>]" depending on lang.
type MissingPlaceholderFn func(key string) string

// DefaultMissingMarker returns the standard missing-value marker for
// the given UI language.
//
// lang is compared case-insensitively against "en"; that yields the
// English "NO VALUE" marker. Every other value (including "") yields
// the German "KEIN WERT" marker.
func DefaultMissingMarker(lang string) MissingPlaceholderFn {
	prefix := "KEIN WERT"
	if strings.EqualFold(lang, "en") {
		prefix = "NO VALUE"
	}
	return func(key string) string {
		return "[" + prefix + ": " + key + "]"
	}
}
|
|
|
|
// placeholderRegex matches a single placeholder. The capture group
// extracts the key name without braces or surrounding whitespace.
//
// Restricted to [A-Za-z][A-Za-z0-9_.]* so that stray "{{" sequences in
// legal prose (extremely rare in DE/EN court briefs but possible)
// don't get mistaken for placeholders. A genuine placeholder always
// starts with an ASCII letter.
//
// Note that the grammar does not validate the dotted-path structure:
// keys with a trailing dot or consecutive dots ("a.", "a..b") also
// match and are simply looked up verbatim.
var placeholderRegex = regexp.MustCompile(`\{\{\s*([A-Za-z][A-Za-z0-9_.]*)\s*\}\}`)
|
|
|
|
// SubmissionRenderer renders a .docx template into a .docx output by
// substituting {{placeholder}} tokens with values from a PlaceholderMap.
// Stateless (the struct has no fields); safe for concurrent use.
type SubmissionRenderer struct{}

// NewSubmissionRenderer constructs the renderer.
func NewSubmissionRenderer() *SubmissionRenderer {
	return &SubmissionRenderer{}
}
|
|
|
|
// Render reads the .docx template at templateBytes, substitutes every
|
|
// placeholder from vars (or emits the missing-marker token), and writes
|
|
// the result to the returned byte slice. Unknown placeholders never
|
|
// fail the render — the lawyer sees the marker in Word and fixes it.
|
|
func (r *SubmissionRenderer) Render(templateBytes []byte, vars PlaceholderMap, missing MissingPlaceholderFn) ([]byte, error) {
|
|
if missing == nil {
|
|
missing = DefaultMissingMarker("de")
|
|
}
|
|
zr, err := zip.NewReader(bytes.NewReader(templateBytes), int64(len(templateBytes)))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("submission template: open zip: %w", err)
|
|
}
|
|
|
|
var out bytes.Buffer
|
|
zw := zip.NewWriter(&out)
|
|
defer zw.Close()
|
|
|
|
for _, entry := range zr.File {
|
|
body, err := readZipEntry(entry)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("submission template: read %s: %w", entry.Name, err)
|
|
}
|
|
if isWordXMLEntry(entry.Name) {
|
|
body = substituteInDocumentXML(body, vars, missing)
|
|
}
|
|
w, err := zw.CreateHeader(&zip.FileHeader{
|
|
Name: entry.Name,
|
|
Method: entry.Method,
|
|
Modified: entry.Modified,
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("submission template: write header %s: %w", entry.Name, err)
|
|
}
|
|
if _, err := w.Write(body); err != nil {
|
|
return nil, fmt.Errorf("submission template: write %s: %w", entry.Name, err)
|
|
}
|
|
}
|
|
if err := zw.Close(); err != nil {
|
|
return nil, fmt.Errorf("submission template: finalise zip: %w", err)
|
|
}
|
|
return out.Bytes(), nil
|
|
}
|
|
|
|
// isWordXMLEntry returns true for the .docx parts that contain
// substitutable text. We touch document.xml plus header*.xml and
// footer*.xml (templates may put firm letterhead in a header) but
// skip styles, theme, settings, comments, footnotes — none of which
// should carry merge placeholders in a well-formed template.
//
// name is the zip entry path; matching is case-sensitive and based on
// the "word/header" / "word/footer" prefix plus the ".xml" suffix.
func isWordXMLEntry(name string) bool {
	switch {
	case name == "word/document.xml":
		return true
	case strings.HasPrefix(name, "word/header") && strings.HasSuffix(name, ".xml"):
		return true
	case strings.HasPrefix(name, "word/footer") && strings.HasSuffix(name, ".xml"):
		return true
	}
	return false
}
|
|
|
|
// readZipEntry slurps a zip entry's bytes.
//
// It opens f, reads the decompressed contents to EOF and closes the
// reader again. Any error from opening or reading the entry is returned
// unchanged so the caller can wrap it with the entry name.
func readZipEntry(f *zip.File) ([]byte, error) {
	rc, err := f.Open()
	if err != nil {
		return nil, err
	}
	defer rc.Close()
	return io.ReadAll(rc)
}
|
|
|
|
// substituteInDocumentXML walks document XML and replaces every
|
|
// {{placeholder}} occurrence inside <w:t> text nodes. Handles both
|
|
// single-run placeholders (the common case for freshly authored
|
|
// templates) and cross-run placeholders (where Word's autocorrect or
|
|
// manual editing has split a placeholder across runs).
|
|
//
|
|
// Two-pass strategy:
|
|
//
|
|
// 1. Pass 1: replace placeholders that fit entirely within one
|
|
// <w:t>…</w:t>. This is the 99% case and preserves all run-level
|
|
// formatting (bold, italic, font runs).
|
|
// 2. Pass 2: for paragraphs that still contain orphan "{{" or "}}"
|
|
// markers after pass 1, merge the text of every <w:t> inside the
|
|
// paragraph, run the replacement on the merged text, and rewrite
|
|
// the paragraph's runs as a single <w:r><w:t>…</w:t></w:r> using
|
|
// the formatting properties of the first run. Loses intra-paragraph
|
|
// formatting on the affected paragraph — but only on paragraphs
|
|
// where Word genuinely fragmented a placeholder.
|
|
func substituteInDocumentXML(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
|
|
replaced := substituteInTextNodes(body, vars, missing)
|
|
if !needsCrossRunMerge(replaced) {
|
|
return replaced
|
|
}
|
|
return substituteAcrossRuns(replaced, vars, missing)
|
|
}
|
|
|
|
// wTextNodeRegex matches one <w:t …>contents</w:t> element, capturing
|
|
// the contents. Attributes on <w:t> (xml:space="preserve") are preserved
|
|
// because the entire match is rewritten.
|
|
var wTextNodeRegex = regexp.MustCompile(`<w:t(\s[^>]*)?>([^<]*)</w:t>`)
|
|
|
|
// substituteInTextNodes runs the placeholder replacement inside each
|
|
// <w:t> text node independently. Format-preserving for single-run
|
|
// placeholders.
|
|
func substituteInTextNodes(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
|
|
return wTextNodeRegex.ReplaceAllFunc(body, func(match []byte) []byte {
|
|
sub := wTextNodeRegex.FindSubmatch(match)
|
|
attrs := string(sub[1])
|
|
contents := xmlDecode(string(sub[2]))
|
|
replaced := replacePlaceholders(contents, vars, missing)
|
|
if replaced == contents {
|
|
return match
|
|
}
|
|
// xml:space="preserve" stays attached whenever the original
|
|
// content had leading/trailing whitespace; ensure it's still
|
|
// declared after replacement to avoid Word collapsing spaces.
|
|
if !strings.Contains(attrs, "xml:space") &&
|
|
(strings.HasPrefix(replaced, " ") || strings.HasSuffix(replaced, " ")) {
|
|
attrs += ` xml:space="preserve"`
|
|
}
|
|
return []byte(`<w:t` + attrs + `>` + xmlEncode(replaced) + `</w:t>`)
|
|
})
|
|
}
|
|
|
|
// needsCrossRunMerge returns true when the body still contains an
|
|
// unmatched "{{" or "}}" after pass 1 — a sign that Word fragmented
|
|
// the placeholder across runs and pass 1 couldn't touch it.
|
|
func needsCrossRunMerge(body []byte) bool {
|
|
// Cheap heuristic: count "{{" vs "}}" inside <w:t> nodes. If we have
|
|
// either marker present in the text-node space, pass 2 will handle
|
|
// it. (Inside attributes or other XML, the markers don't matter.)
|
|
for _, m := range wTextNodeRegex.FindAllSubmatch(body, -1) {
|
|
t := string(m[2])
|
|
if strings.Contains(t, "{{") || strings.Contains(t, "}}") {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// wParagraphRegex matches one <w:p …>…</w:p> paragraph block. The inner
// content match is non-greedy, so a match ends at the first </w:p>
// after the opening tag.
//
// The opening-tag pattern deliberately rejects self-closing empty
// paragraphs (<w:p/>, <w:p w:rsidR="…"/>): they have no end tag, so
// accepting them would make the match run on into the following
// paragraph and swallow everything in between. It also rejects other
// elements whose name starts with "w:p" (e.g. <w:pPr>).
//
// NOTE(review): paragraphs nested inside another paragraph (e.g. text
// box content) are not handled specially — the non-greedy match would
// stop at the inner </w:p>. Confirm templates do not rely on that.
var wParagraphRegex = regexp.MustCompile(`(?s)<w:p(?:\s+(?:[^>]*[^/>])?)?>.*?</w:p>`)

// wRunPropsRegex pulls the first <w:rPr>…</w:rPr> block from the bytes
// it is applied to so it can be reused as the formatting of the merged
// run.
var wRunPropsRegex = regexp.MustCompile(`(?s)<w:rPr>.*?</w:rPr>`)

// wParagraphPropsRegex pulls the optional <w:pPr>…</w:pPr> that sits
// at the top of a paragraph (alignment, spacing, etc.). Preserved.
var wParagraphPropsRegex = regexp.MustCompile(`(?s)<w:pPr>.*?</w:pPr>`)
|
|
|
|
// substituteAcrossRuns is pass 2: for any paragraph that still has a
|
|
// split placeholder, concatenate every text node, run replacement, and
|
|
// rewrite the paragraph as a single run using the first run's
|
|
// properties. Paragraphs without orphan markers are left untouched so
|
|
// run-level formatting survives wherever pass 1 already resolved the
|
|
// placeholders.
|
|
func substituteAcrossRuns(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn) []byte {
|
|
return wParagraphRegex.ReplaceAllFunc(body, func(para []byte) []byte {
|
|
textNodes := wTextNodeRegex.FindAllSubmatch(para, -1)
|
|
if len(textNodes) == 0 {
|
|
return para
|
|
}
|
|
var merged strings.Builder
|
|
for _, m := range textNodes {
|
|
merged.WriteString(xmlDecode(string(m[2])))
|
|
}
|
|
original := merged.String()
|
|
if !strings.Contains(original, "{{") {
|
|
// No fragmented placeholder in this paragraph; leave it
|
|
// alone so pass 1's run-level edits survive.
|
|
return para
|
|
}
|
|
replaced := replacePlaceholders(original, vars, missing)
|
|
if replaced == original {
|
|
return para
|
|
}
|
|
// Preserve paragraph properties (alignment, spacing) and the
|
|
// first run's properties (font, bold/italic).
|
|
pPr := wParagraphPropsRegex.Find(para)
|
|
rPr := wRunPropsRegex.Find(para)
|
|
var rebuilt bytes.Buffer
|
|
rebuilt.WriteString(`<w:p>`)
|
|
if pPr != nil {
|
|
rebuilt.Write(pPr)
|
|
}
|
|
rebuilt.WriteString(`<w:r>`)
|
|
if rPr != nil {
|
|
rebuilt.Write(rPr)
|
|
}
|
|
rebuilt.WriteString(`<w:t xml:space="preserve">`)
|
|
rebuilt.WriteString(xmlEncode(replaced))
|
|
rebuilt.WriteString(`</w:t></w:r></w:p>`)
|
|
return rebuilt.Bytes()
|
|
})
|
|
}
|
|
|
|
// replacePlaceholders performs the actual substitution on a plain
|
|
// string. Unbound placeholders render the missing marker.
|
|
func replacePlaceholders(s string, vars PlaceholderMap, missing MissingPlaceholderFn) string {
|
|
return placeholderRegex.ReplaceAllStringFunc(s, func(match string) string {
|
|
sub := placeholderRegex.FindStringSubmatch(match)
|
|
if len(sub) < 2 {
|
|
return match
|
|
}
|
|
key := sub[1]
|
|
if value, ok := vars[key]; ok {
|
|
return value
|
|
}
|
|
return missing(key)
|
|
})
|
|
}
|
|
|
|
// xmlTextUnescaper maps the five predefined XML entities back to their
// characters in a single left-to-right pass. Because matches never
// overlap and replaced text is not rescanned, "&amp;lt;" decodes to the
// literal text "&lt;" rather than to "<".
var xmlTextUnescaper = strings.NewReplacer(
	"&lt;", "<",
	"&gt;", ">",
	"&quot;", `"`,
	"&apos;", "'",
	"&amp;", "&",
)

// xmlDecode reverses the small set of escapes used in WordprocessingML
// text content. We don't need a full XML parser — text nodes carry only
// the standard five entities, and Word never emits numeric-character
// references inside <w:t> for printable content.
//
// Any other "&…;" sequence (including numeric character references) is
// passed through unchanged.
func xmlDecode(s string) string {
	return xmlTextUnescaper.Replace(s)
}
|
|
|
|
// xmlTextEscaper replaces the five XML-special characters with their
// predefined entities in a single pass. Doing it in one pass means the
// "&" of an entity we introduce is never escaped a second time.
var xmlTextEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&quot;",
	"'", "&apos;",
)

// xmlEncode escapes a substituted value for safe insertion back into a
// WordprocessingML text node: & < > " ' become &amp; &lt; &gt; &quot;
// &apos;. Input that already looks like an entity is treated as plain
// text and escaped again ("&lt;" becomes "&amp;lt;"), so xmlDecode of
// the result yields the original string.
func xmlEncode(s string) string {
	return xmlTextEscaper.Replace(s)
}
|