Files
paliad/pkg/docforge/docx/merge.go
mAi 78a30a7ee0 refactor(docforge): slice 1 — extract .docx engine to pkg/docforge/docx (t-paliad-349)
Relocate the in-house OOXML machinery out of internal/services into the
first docforge adapter, with zero behaviour change:

  submission_merge.go  -> pkg/docforge/docx/merge.go     (placeholder
                          substitution renderer + preview-HTML emitter)
  submission_md.go     -> pkg/docforge/docx/markdown.go  (Markdown->OOXML
                          walker incl. the b78a984 underscore-fix)
  submission_render.go -> pkg/docforge/docx/dotm.go      (.dotm->.docx)
  + their _test.go files (git-tracked renames, 84-99% identical)

internal/services keeps thin type-alias + forwarder shims
(docforge_shims.go) so every caller in services/handlers/main compiles
and behaves identically: PlaceholderMap, MissingPlaceholderFn,
SubmissionRenderer, HyperlinkAllocator (aliases); NewSubmissionRenderer,
DefaultMissingMarker, RenderMarkdownToOOXML[WithStyles], ConvertDotmToDocx,
SanitiseSubmissionFileName (forwarders). docx.XMLAttrEscape is exported so
submission_compose.go's hyperlink-rels inserts reuse the walker's escaping.

Three mis-filed pretty-printer tests (legalSourcePretty, ourSideDE/EN,
patentNumberUPC) that exercise the vars layer move back to
internal/services/submission_vars_pretty_test.go.

Placeholder grammar + PlaceholderMap stay co-located with the renderer in
docx for now; slice 3 hoists the format-neutral grammar to the docforge
root with the VariableResolver interface.

Verification: go build ./... clean, go vet clean, full module test green
(the byte-exact OOXML golden tests in merge/compose/render pass unchanged
= behaviour preserved). gofmt drift on the moved files is pre-existing
(72/169 services files already drift; no gofmt gate).

m/paliad#157
2026-05-29 14:51:59 +02:00

532 lines
19 KiB
Go

package docx
// Submission template renderer — in-house engine for the submission
// draft editor (t-paliad-238, design doc
// docs/design-submission-page-2026-05-22.md §3 / §6.2).
//
// Resurrected from commit 8ea3509 (the original t-paliad-215 Slice 1
// "in-house .docx render engine"). Kept in a separate file from the
// format-only converter (dotm.go, formerly submission_render.go) so the
// t-paliad-230 /generate one-click path stays unchanged and the merge
// engine doesn't have to share zip-helper names with it.
//
// Why not lukasjarosch/go-docx: the library's "nested placeholder" guard
// treats sibling placeholders inside the same <w:t> run (e.g.
// "{{a}} ./. {{b}}") as nested and refuses to replace either. Patent
// submissions routinely have multiple placeholders per paragraph (party
// blocks especially), so the library is a non-starter. This renderer
// handles single-run placeholders (preserving run-level formatting) AND
// cross-run placeholders (rewriting the paragraph as one run when Word
// has fragmented the placeholder across runs).
//
// Placeholder grammar: {{[A-Za-z][A-Za-z0-9_.]*}} with optional
// whitespace inside braces ({{ project.case_number }} ≡
// {{project.case_number}}).
//
// Missing-value behaviour: when a placeholder has no binding in the
// PlaceholderMap, the renderer emits a marker token so the lawyer sees
// the gap in Word rather than failing the request.
import (
"archive/zip"
"bytes"
"fmt"
"io"
"regexp"
"strings"
)
// PlaceholderMap maps a placeholder key to the text substituted for it.
// Keys are dotted paths without braces (e.g. "project.case_number"),
// i.e. exactly what placeholderRegex captures from "{{ key }}".
// Values are inserted as plain text: the renderer XML-/HTML-escapes them
// on output but does not otherwise transform them. Per the original
// author's note the map is built by SubmissionVarsService and values
// arrive already locale-aware, pretty-printed, and sanitised — that
// producer is not visible from this file.
type PlaceholderMap map[string]string
// MissingPlaceholderFn translates an unbound placeholder key into the
// in-document marker token that is emitted in place of the placeholder.
// It receives the key without braces and returns the replacement text.
// DefaultMissingMarker builds the standard implementation, producing
// "[KEIN WERT: <key>]" or, for lang "en", "[NO VALUE: <key>]".
type MissingPlaceholderFn func(key string) string
// valueWrapperFn decorates a substituted value so a later stage can
// tell which placeholder key produced it. It receives the key and the
// text about to be inserted and returns the text that is inserted
// instead.
//
// RenderHTML passes htmlPreviewWrapper so each substituted value can be
// turned into a clickable <span class="draft-var" …> (t-paliad-261,
// click-variable-in-preview → jump-to-field). Render passes nil, which
// means "no wrapping", so the .docx export contains the bare values.
// replacePlaceholders invokes the wrapper for resolved values and for
// missing-marker text alike, so a missing placeholder is clickable too.
type valueWrapperFn func(key, value string) string
// Private-Use-Area sentinels for the HTML preview wrap. PUA characters
// are valid in XML 1.0 content, never appear in legitimate template
// text, pass unchanged through xmlEncode/xmlDecode/htmlEscape, and are
// stripped by emitTextWithDraftVars when the preview HTML is assembled.
const (
previewVarBegin = ""
previewVarMid = ""
previewVarEnd = ""
)
// htmlPreviewWrapper frames a substituted value with the preview
// sentinels so emitTextWithDraftVars can later recover both the key and
// the value. The result is laid out as
//
//	previewVarBegin key previewVarMid value previewVarEnd
//
// It is passed as the valueWrapperFn by RenderHTML only; Render passes
// nil, so exported .docx files never contain the sentinels.
func htmlPreviewWrapper(key, value string) string {
	var framed strings.Builder
	framed.Grow(len(previewVarBegin) + len(key) + len(previewVarMid) + len(value) + len(previewVarEnd))
	framed.WriteString(previewVarBegin)
	framed.WriteString(key)
	framed.WriteString(previewVarMid)
	framed.WriteString(value)
	framed.WriteString(previewVarEnd)
	return framed.String()
}
// DefaultMissingMarker builds the standard MissingPlaceholderFn for a UI
// language. For lang "en" (compared case-insensitively) the marker reads
// "[NO VALUE: <key>]"; every other value, including the empty string,
// yields the German "[KEIN WERT: <key>]".
func DefaultMissingMarker(lang string) MissingPlaceholderFn {
	label := "NO VALUE"
	if !strings.EqualFold(lang, "en") {
		label = "KEIN WERT"
	}
	// The part in front of the key never changes, so build it once.
	opening := "[" + label + ": "
	return func(key string) string {
		return opening + key + "]"
	}
}
// placeholderRegex matches a single placeholder of the form
// "{{ key }}": two opening braces, optional whitespace, the key,
// optional whitespace, two closing braces. Capture group 1 is the key
// without braces or surrounding whitespace.
//
// The key is restricted to [A-Za-z][A-Za-z0-9_.]* so that stray "{{"
// sequences in legal prose don't get mistaken for placeholders. A
// genuine placeholder always starts with an ASCII letter.
var placeholderRegex = regexp.MustCompile(`\{\{\s*([A-Za-z][A-Za-z0-9_.]*)\s*\}\}`)
// SubmissionRenderer merges a .docx template with a PlaceholderMap,
// replacing each {{placeholder}} token with its bound value. The type
// has no fields, so a value carries no per-render state and one
// instance can be shared freely.
type SubmissionRenderer struct{}

// NewSubmissionRenderer returns a ready-to-use renderer. There is
// nothing to configure.
func NewSubmissionRenderer() *SubmissionRenderer {
	return new(SubmissionRenderer)
}
// Render merges the template in templateBytes with vars and returns the
// bytes of the resulting .docx archive.
//
// Steps:
//
//  1. ConvertDotmToDocx (defined elsewhere in this package) is applied
//     to the input. Per the original note it downgrades a macro-bearing
//     .dotm to a plain .docx and is idempotent on plain .docx input.
//  2. Every entry of the resulting zip is copied into a fresh archive,
//     keeping only its name, compression method and modification time.
//  3. Entries accepted by isWordXMLEntry have their {{placeholder}}
//     tokens substituted on the way through.
//
// A placeholder with no binding in vars never fails the render; the
// text returned by missing is written instead so the gap is visible in
// Word. A nil missing selects DefaultMissingMarker("de").
//
// Errors are returned only for a failing pre-pass, an unreadable zip,
// or a failed read/write of an individual entry; each is wrapped with a
// "submission render: …" prefix.
func (r *SubmissionRenderer) Render(templateBytes []byte, vars PlaceholderMap, missing MissingPlaceholderFn) ([]byte, error) {
	markerFor := missing
	if markerFor == nil {
		markerFor = DefaultMissingMarker("de")
	}

	docxBytes, err := ConvertDotmToDocx(templateBytes)
	if err != nil {
		return nil, fmt.Errorf("submission render: pre-pass convert: %w", err)
	}

	archive, err := zip.NewReader(bytes.NewReader(docxBytes), int64(len(docxBytes)))
	if err != nil {
		return nil, fmt.Errorf("submission render: open zip: %w", err)
	}

	merged := new(bytes.Buffer)
	writer := zip.NewWriter(merged)
	for i := range archive.File {
		part := archive.File[i]

		content, readErr := readMergeZipEntry(part)
		if readErr != nil {
			return nil, fmt.Errorf("submission render: read %s: %w", part.Name, readErr)
		}
		// Only text-bearing Word parts are rewritten; everything else is
		// copied through untouched. A nil wrapper keeps values bare.
		if isWordXMLEntry(part.Name) {
			content = substituteInDocumentXML(content, vars, markerFor, nil)
		}

		header := zip.FileHeader{
			Name:     part.Name,
			Method:   part.Method,
			Modified: part.Modified,
		}
		dst, headerErr := writer.CreateHeader(&header)
		if headerErr != nil {
			return nil, fmt.Errorf("submission render: write header %s: %w", part.Name, headerErr)
		}
		if _, writeErr := dst.Write(content); writeErr != nil {
			return nil, fmt.Errorf("submission render: write %s: %w", part.Name, writeErr)
		}
	}

	if closeErr := writer.Close(); closeErr != nil {
		return nil, fmt.Errorf("submission render: finalise zip: %w", closeErr)
	}
	return merged.Bytes(), nil
}
// RenderHTML returns a read-only HTML fragment of the merged document
// body for the draft-editor preview pane.
//
// It runs the same pre-pass (ConvertDotmToDocx) and the same
// placeholder substitution as Render, but only on word/document.xml,
// and with htmlPreviewWrapper so every substituted value can be emitted
// as a clickable span. The merged XML is then handed to docXMLToHTML,
// which emits one <p> per <w:p> with <strong>/<em> around runs carrying
// <w:b>/<w:i>. Tables, lists and complex formatting collapse to plain
// paragraphs — the preview is a fidelity guide, not a WYSIWYG editor.
//
// Only the body fragment is produced; wrapping it in an outer container
// and injecting it (e.g. via innerHTML) is the caller's job. Text
// content is HTML-escaped by the emitter.
//
// A nil missing selects DefaultMissingMarker("de"). An error is
// returned when the pre-pass fails, the zip cannot be opened, or
// word/document.xml is absent or unreadable.
func (r *SubmissionRenderer) RenderHTML(templateBytes []byte, vars PlaceholderMap, missing MissingPlaceholderFn) (string, error) {
	markerFor := missing
	if markerFor == nil {
		markerFor = DefaultMissingMarker("de")
	}

	docxBytes, err := ConvertDotmToDocx(templateBytes)
	if err != nil {
		return "", fmt.Errorf("submission render html: pre-pass convert: %w", err)
	}

	archive, err := zip.NewReader(bytes.NewReader(docxBytes), int64(len(docxBytes)))
	if err != nil {
		return "", fmt.Errorf("submission render html: open zip: %w", err)
	}

	// Locate the main document part; the first entry with that name wins.
	var documentPart *zip.File
	for _, candidate := range archive.File {
		if candidate.Name == "word/document.xml" {
			documentPart = candidate
			break
		}
	}

	var docXML []byte
	if documentPart != nil {
		docXML, err = readMergeZipEntry(documentPart)
		if err != nil {
			return "", fmt.Errorf("submission render html: read document.xml: %w", err)
		}
	}
	if docXML == nil {
		return "", fmt.Errorf("submission render html: word/document.xml missing")
	}

	return docXMLToHTML(substituteInDocumentXML(docXML, vars, markerFor, htmlPreviewWrapper)), nil
}
// isWordXMLEntry reports whether the zip entry called name is one of
// the .docx parts the merge rewrites: word/document.xml itself, or any
// entry whose name starts with "word/header" or "word/footer" and ends
// in ".xml" (templates may put firm letterhead in a header). All other
// parts — styles, theme, settings, comments, footnotes — are left
// alone; a well-formed template should not carry merge placeholders
// there.
func isWordXMLEntry(name string) bool {
	if name == "word/document.xml" {
		return true
	}
	if !strings.HasSuffix(name, ".xml") {
		return false
	}
	return strings.HasPrefix(name, "word/header") || strings.HasPrefix(name, "word/footer")
}
// readMergeZipEntry opens the zip entry f and returns its complete
// decompressed contents. The entry reader is closed before returning.
// An error from opening yields (nil, err); an error while reading is
// returned together with whatever bytes io.ReadAll had collected.
//
// Per the original note this duplicates a helper of the format-only
// converter (readZipFile) under a different name so this file stays
// self-contained.
func readMergeZipEntry(f *zip.File) ([]byte, error) {
	src, openErr := f.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer src.Close()

	content, readErr := io.ReadAll(src)
	return content, readErr
}
// substituteInDocumentXML replaces every {{placeholder}} found in the
// <w:t> text nodes of body and returns the rewritten XML. wrap, when
// non-nil, decorates each inserted value (see valueWrapperFn).
//
// The work is split into two passes:
//
//  1. substituteInTextNodes handles placeholders that sit entirely
//     inside one <w:t>…</w:t>. This is the common case for freshly
//     authored templates and leaves all run-level formatting intact.
//  2. If any text node still shows "{{" or "}}" afterwards
//     (needsCrossRunMerge), the placeholder was split across runs —
//     e.g. by autocorrect or manual editing in Word. substituteAcrossRuns
//     then merges the text of each affected paragraph, substitutes on
//     the merged text, and rewrites the paragraph as a single run.
func substituteInDocumentXML(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn, wrap valueWrapperFn) []byte {
	afterSingleRun := substituteInTextNodes(body, vars, missing, wrap)
	if needsCrossRunMerge(afterSingleRun) {
		return substituteAcrossRuns(afterSingleRun, vars, missing, wrap)
	}
	return afterSingleRun
}
// wTextNodeRegex matches one <w:t …>contents</w:t> element.
// Capture group 1 is the attribute text including its leading
// whitespace (empty when the tag has no attributes); capture group 2 is
// the still XML-encoded contents. Because group 1 must start with
// whitespace, other elements whose names merely begin with "w:t" (such
// as <w:tab/> or <w:tbl>) are not matched. Contents may not contain
// '<', so an element with child markup is not matched either.
var wTextNodeRegex = regexp.MustCompile(`<w:t(\s[^>]*)?>([^<]*)</w:t>`)
// substituteInTextNodes is pass 1 of the merge: it substitutes
// placeholders inside each <w:t> text node on its own, so the
// surrounding run (and its formatting) is never touched.
//
// For every node the contents are XML-decoded, run through
// replacePlaceholders, and re-encoded. Nodes in which nothing was
// replaced are returned byte-for-byte. When the new text begins or ends
// with a space and the tag does not already mention xml:space,
// xml:space="preserve" is appended to the tag's attributes so the edge
// whitespace is kept.
func substituteInTextNodes(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn, wrap valueWrapperFn) []byte {
	rewriteNode := func(node []byte) []byte {
		groups := wTextNodeRegex.FindSubmatch(node)
		tagAttrs := string(groups[1])
		before := xmlDecode(string(groups[2]))

		after := replacePlaceholders(before, vars, missing, wrap)
		if after == before {
			return node
		}

		edgeSpace := strings.HasPrefix(after, " ") || strings.HasSuffix(after, " ")
		if edgeSpace && !strings.Contains(tagAttrs, "xml:space") {
			tagAttrs += ` xml:space="preserve"`
		}

		var node2 strings.Builder
		node2.WriteString(`<w:t`)
		node2.WriteString(tagAttrs)
		node2.WriteString(`>`)
		node2.WriteString(xmlEncode(after))
		node2.WriteString(`</w:t>`)
		return []byte(node2.String())
	}
	return wTextNodeRegex.ReplaceAllFunc(body, rewriteNode)
}
// needsCrossRunMerge reports whether any <w:t> text node in body still
// contains a "{{" or "}}" sequence. After pass 1 such a leftover means
// a placeholder was not resolved within a single node, so the
// cross-run pass is worth running.
func needsCrossRunMerge(body []byte) bool {
	openBraces, closeBraces := []byte("{{"), []byte("}}")
	for _, node := range wTextNodeRegex.FindAllSubmatch(body, -1) {
		contents := node[2]
		if bytes.Contains(contents, openBraces) || bytes.Contains(contents, closeBraces) {
			return true
		}
	}
	return false
}
// wParagraphRegex matches one paragraph element: either a self-closing
// <w:p …/> (an empty paragraph) or a <w:p …>…</w:p> block.
//
// The self-closing alternative comes first on purpose. Without it the
// opening-tag part of the second alternative would accept "<w:p/>" as a
// start tag and the body match would then run on to the closing tag of
// the NEXT paragraph, fusing two paragraphs into one match.
//
// The body match is lazy (.*?), so it stops at the first </w:p>. That
// is only correct while <w:p> elements do not nest inside the matched
// span. The \b keeps <w:pPr> and other longer names from being taken
// for a paragraph start.
var wParagraphRegex = regexp.MustCompile(`(?s)<w:p\b[^>]*/>|<w:p\b[^>]*>.*?</w:p>`)
// wRunPropsRegex matches one attribute-less <w:rPr>…</w:rPr> block
// (lazy, so it ends at the first </w:rPr>). Find returns the first such
// block in whatever slice it is given: applied to a single run that is
// the run's own properties, but applied to a whole paragraph it may be
// an <w:rPr> nested inside <w:pPr> if one appears before the first run.
var wRunPropsRegex = regexp.MustCompile(`(?s)<w:rPr>.*?</w:rPr>`)
// wParagraphPropsRegex matches the optional paragraph-properties block
// <w:pPr>…</w:pPr>. The match is lazy and ends at the first </w:pPr>;
// a <w:pPr> tag carrying attributes is not matched.
var wParagraphPropsRegex = regexp.MustCompile(`(?s)<w:pPr>.*?</w:pPr>`)
// substituteAcrossRuns is pass 2 of the merge. It repairs placeholders
// that are split across several runs of one paragraph.
//
// For every paragraph matched by wParagraphRegex:
//
//   - the decoded contents of all its <w:t> nodes are concatenated;
//   - if the merged text has no "{{", or substitution changes nothing,
//     the paragraph is returned untouched;
//   - otherwise the paragraph is rebuilt as
//     <w:p>[pPr]<w:r>[rPr]<w:t xml:space="preserve">text</w:t></w:r></w:p>.
//
// The rebuilt paragraph keeps the original <w:pPr> (if any) and the
// first run-level <w:rPr> found AFTER the paragraph properties. The
// search deliberately skips <w:pPr>: an <w:rPr> nested there formats
// the paragraph mark, not the text, and may carry revision children
// that are not valid inside a run's properties. Everything else in the
// paragraph (other runs' formatting, attributes on <w:p>, non-text
// children) is dropped — the documented price of the cross-run repair.
func substituteAcrossRuns(body []byte, vars PlaceholderMap, missing MissingPlaceholderFn, wrap valueWrapperFn) []byte {
	return wParagraphRegex.ReplaceAllFunc(body, func(para []byte) []byte {
		textNodes := wTextNodeRegex.FindAllSubmatch(para, -1)
		if len(textNodes) == 0 {
			return para
		}

		var merged strings.Builder
		for _, m := range textNodes {
			merged.WriteString(xmlDecode(string(m[2])))
		}
		original := merged.String()
		if !strings.Contains(original, "{{") {
			return para
		}
		replaced := replacePlaceholders(original, vars, missing, wrap)
		if replaced == original {
			return para
		}

		// Split the paragraph into its properties block and the part
		// that follows, so run properties are looked up only among the
		// runs and never inside <w:pPr>.
		var pPr []byte
		runArea := para
		if loc := wParagraphPropsRegex.FindIndex(para); loc != nil {
			pPr = para[loc[0]:loc[1]]
			runArea = para[loc[1]:]
		}
		rPr := wRunPropsRegex.Find(runArea)

		var rebuilt bytes.Buffer
		rebuilt.WriteString(`<w:p>`)
		if pPr != nil {
			rebuilt.Write(pPr)
		}
		rebuilt.WriteString(`<w:r>`)
		if rPr != nil {
			rebuilt.Write(rPr)
		}
		// xml:space="preserve" is always set: the merged text may begin
		// or end with whitespace that must survive.
		rebuilt.WriteString(`<w:t xml:space="preserve">`)
		rebuilt.WriteString(xmlEncode(replaced))
		rebuilt.WriteString(`</w:t></w:r></w:p>`)
		return rebuilt.Bytes()
	})
}
// replacePlaceholders substitutes every placeholder token in the plain
// (already XML-decoded) string s and returns the result.
//
// For each token the key is looked up in vars; when it is not bound,
// missing(key) supplies the marker text instead. If wrap is non-nil the
// chosen text — resolved value or missing marker alike — is passed
// through wrap(key, text) before insertion. The HTML preview relies on
// this to make every substituted placeholder clickable, including the
// missing ones. Text outside placeholders is returned unchanged.
func replacePlaceholders(s string, vars PlaceholderMap, missing MissingPlaceholderFn, wrap valueWrapperFn) string {
	resolve := func(token string) string {
		groups := placeholderRegex.FindStringSubmatch(token)
		if len(groups) < 2 {
			// Cannot happen for a token the regex just matched; keep the
			// text as-is rather than index out of range.
			return token
		}
		key := groups[1]

		text, bound := vars[key]
		if !bound {
			text = missing(key)
		}
		if wrap == nil {
			return text
		}
		return wrap(key, text)
	}
	return placeholderRegex.ReplaceAllStringFunc(s, resolve)
}
// xmlDecode turns the five predefined XML entities (&lt; &gt; &quot;
// &apos; &amp;) back into their characters. It scans s once from left
// to right, so an escaped ampersand followed by entity-like text
// ("&amp;lt;") decodes to "&lt;" and is not decoded a second time.
// Numeric character references and any other "&…;" sequence are left
// as they are.
func xmlDecode(s string) string {
	if !strings.Contains(s, "&") {
		return s
	}
	var out strings.Builder
	out.Grow(len(s))
	for i := 0; i < len(s); {
		if s[i] != '&' {
			out.WriteByte(s[i])
			i++
			continue
		}
		tail := s[i:]
		switch {
		case strings.HasPrefix(tail, "&lt;"):
			out.WriteByte('<')
			i += len("&lt;")
		case strings.HasPrefix(tail, "&gt;"):
			out.WriteByte('>')
			i += len("&gt;")
		case strings.HasPrefix(tail, "&quot;"):
			out.WriteByte('"')
			i += len("&quot;")
		case strings.HasPrefix(tail, "&apos;"):
			out.WriteByte('\'')
			i += len("&apos;")
		case strings.HasPrefix(tail, "&amp;"):
			out.WriteByte('&')
			i += len("&amp;")
		default:
			// A bare '&' that starts no known entity is copied verbatim.
			out.WriteByte('&')
			i++
		}
	}
	return out.String()
}
// xmlEncode escapes s for insertion into <w:t> content: & < > " and '
// become &amp; &lt; &gt; &quot; and &apos;. Every other byte is copied
// through unchanged. Each input byte is mapped exactly once, so an
// ampersand already present in s is escaped while the ampersands this
// function writes are never escaped again.
func xmlEncode(s string) string {
	if !strings.ContainsAny(s, `&<>"'`) {
		return s
	}
	var out strings.Builder
	out.Grow(len(s) + 8)
	for i := 0; i < len(s); i++ {
		switch c := s[i]; c {
		case '&':
			out.WriteString("&amp;")
		case '<':
			out.WriteString("&lt;")
		case '>':
			out.WriteString("&gt;")
		case '"':
			out.WriteString("&quot;")
		case '\'':
			out.WriteString("&apos;")
		default:
			out.WriteByte(c)
		}
	}
	return out.String()
}
// docXMLToHTML converts post-merge document XML into the HTML fragment
// shown in the preview pane. Each paragraph matched by wParagraphRegex
// becomes one "<p>…</p>" line whose inner HTML comes from
// paragraphToHTML; anything outside a paragraph match is ignored, so
// tables, lists and images survive only as the text of the paragraphs
// they contain. A document without any paragraph yields "<p></p>".
func docXMLToHTML(docXML []byte) string {
	paras := wParagraphRegex.FindAll(docXML, -1)
	if len(paras) == 0 {
		return "<p></p>"
	}
	var html strings.Builder
	for i := range paras {
		html.WriteString("<p>")
		html.WriteString(paragraphToHTML(paras[i]))
		html.WriteString("</p>\n")
	}
	return html.String()
}
// wRunRegex matches one <w:r …>…</w:r> run. The body match is lazy
// (.*?), ending at the first </w:r>, which is correct as long as <w:r>
// elements do not nest. The \b after "w:r" keeps longer element names
// such as <w:rPr> from being taken for a run start.
var wRunRegex = regexp.MustCompile(`(?s)<w:r\b[^>]*>.*?</w:r>`)
// wBoldRegex / wItalicRegex detect the bold/italic toggle elements
// inside a run's <w:rPr>. Each matches the opening (or self-closing)
// tag with any attributes, so both <w:b/> and <w:b w:val="true"/> are
// covered. The \b after the element name means longer names that merely
// start with "w:b" / "w:i" (e.g. <w:bCs/>, <w:iCs/>) do not match.
// A match alone does not mean "on": the tag may carry an explicit
// "off" value, which isFalseFlag checks separately.
var (
wBoldRegex = regexp.MustCompile(`<w:b\b[^>]*/?>`)
wItalicRegex = regexp.MustCompile(`<w:i\b[^>]*/?>`)
)
// paragraphToHTML renders the runs of one paragraph as inline HTML.
// For each <w:r> it collects the run's text, skips the run when that
// text is empty, and otherwise writes the escaped text (with preview
// sentinels turned into draft-var spans by emitTextWithDraftVars).
// A run whose own <w:rPr> switches bold or italic on is wrapped in
// <strong> and/or <em>, with <em> nested inside <strong>. A paragraph
// without runs yields the empty string.
func paragraphToHTML(para []byte) string {
	var html strings.Builder
	for _, run := range wRunRegex.FindAll(para, -1) {
		text := extractRunText(run)
		if text == "" {
			continue
		}

		// A toggle counts as "on" when the run has properties, the tag
		// is present, and it is not explicitly switched off.
		props := wRunPropsRegex.Find(run)
		enabled := func(rx *regexp.Regexp) bool {
			return props != nil && rx.Match(props) && !isFalseFlag(props, rx)
		}
		bold, italic := enabled(wBoldRegex), enabled(wItalicRegex)

		opening, closing := "", ""
		if bold {
			opening += "<strong>"
			closing = "</strong>"
		}
		if italic {
			opening += "<em>"
			closing = "</em>" + closing
		}

		html.WriteString(opening)
		html.WriteString(emitTextWithDraftVars(text))
		html.WriteString(closing)
	}
	return html.String()
}
// emitTextWithDraftVars HTML-escapes run text while converting the
// preview-only sentinels written by htmlPreviewWrapper into
// <span class="draft-var" data-var="<key>">…</span>.
//
// Each well-formed group "Begin key Mid value End" becomes one span.
// Text before, between and after groups is passed through htmlEscape,
// and so is the value. Text without any Begin sentinel (template text
// outside placeholders, or the Render path's output) goes straight
// through htmlEscape.
//
// The key is HTML-escaped as well. Keys produced by replacePlaceholders
// match [A-Za-z][A-Za-z0-9_.]*, for which escaping is a no-op, so the
// output for genuine placeholders is unchanged. But the sentinels are
// ordinary characters: if they reach this function through template
// text or through a substituted value, the "key" between Begin and Mid
// is arbitrary text that never passed placeholderRegex. Escaping it
// keeps such input from breaking out of the data-var attribute in HTML
// that callers inject via innerHTML.
//
// A Begin sentinel that is not followed by Mid and then End is treated
// as malformed: it is emitted as escaped text and scanning resumes
// right after it, so the rest of the run still renders.
//
// t-paliad-261: makes substituted variables clickable in the preview
// pane so the user can jump to the matching input in the sidebar.
func emitTextWithDraftVars(text string) string {
	if !strings.Contains(text, previewVarBegin) {
		return htmlEscape(text)
	}
	var out strings.Builder
	rest := text
	for {
		i := strings.Index(rest, previewVarBegin)
		if i < 0 {
			out.WriteString(htmlEscape(rest))
			return out.String()
		}
		out.WriteString(htmlEscape(rest[:i]))
		body := rest[i+len(previewVarBegin):]
		mid := strings.Index(body, previewVarMid)
		end := strings.Index(body, previewVarEnd)
		if mid < 0 || end < 0 || mid > end {
			// Malformed sentinel — emit the marker as plain escaped
			// text and continue past it so the rest of the run still
			// renders.
			out.WriteString(htmlEscape(previewVarBegin))
			rest = body
			continue
		}
		key := body[:mid]
		value := body[mid+len(previewVarMid) : end]
		out.WriteString(`<span class="draft-var" data-var="`)
		// Escaped defensively: see the function comment.
		out.WriteString(htmlEscape(key))
		out.WriteString(`">`)
		out.WriteString(htmlEscape(value))
		out.WriteString(`</span>`)
		rest = body[end+len(previewVarEnd):]
	}
}
// extractRunText returns the text of a run: the XML-decoded contents of
// every <w:t> node inside it, joined in document order with nothing in
// between. A run without text nodes yields the empty string.
func extractRunText(run []byte) string {
	nodes := wTextNodeRegex.FindAllSubmatch(run, -1)
	if len(nodes) == 0 {
		return ""
	}
	var text strings.Builder
	for i := range nodes {
		text.WriteString(xmlDecode(string(nodes[i][2])))
	}
	return text.String()
}
// isFalseFlag reports whether the first tag matched by rx inside rPr
// explicitly switches its toggle off — Word's way of turning off an
// inherited format. The tag counts as "off" when it carries
// w:val="false", w:val="0" or w:val="off", the three "off" spellings of
// the OOXML on/off value type. A tag without w:val, or with any other
// value (e.g. <w:b/> or <w:b w:val="true"/>), is "on", and so false is
// returned. False is also returned when rx does not match at all.
func isFalseFlag(rPr []byte, rx *regexp.Regexp) bool {
	match := rx.Find(rPr)
	if match == nil {
		return false
	}
	tag := string(match)
	for _, off := range [...]string{`w:val="false"`, `w:val="0"`, `w:val="off"`} {
		if strings.Contains(tag, off) {
			return true
		}
	}
	return false
}
// htmlEscape makes s safe to place in HTML text or a double-quoted
// attribute by replacing the five significant characters: & < > " and '
// become &amp; &lt; &gt; &quot; and &#39;. All other bytes are copied
// unchanged, and each input character is mapped exactly once.
func htmlEscape(s string) string {
	const significant = `&<>"'`
	if !strings.ContainsAny(s, significant) {
		return s
	}
	var out strings.Builder
	out.Grow(len(s) + 8)
	for {
		i := strings.IndexAny(s, significant)
		if i < 0 {
			out.WriteString(s)
			return out.String()
		}
		out.WriteString(s[:i])
		switch s[i] {
		case '&':
			out.WriteString("&amp;")
		case '<':
			out.WriteString("&lt;")
		case '>':
			out.WriteString("&gt;")
		case '"':
			out.WriteString("&quot;")
		case '\'':
			out.WriteString("&#39;")
		}
		s = s[i+1:]
	}
}