Move the full compose pipeline (anchor-pair splicing, append-before-sectPr,
hyperlink-rels patching, zip split/repack, final placeholder pass) into
pkg/docforge/docx/compose.go, decoupled from paliad's DB row types. The
engine now owns the entire .docx assembly.
New neutral types in docx:
- Carrier{Bytes, Stylemap} — the opaque base .docx, preserved
byte-for-byte outside the spliced regions (the lossless docforge
carrier for .docx).
- Section{Key, OrderIndex, Included, ContentMDDE, ContentMDEN} — the
format-neutral content input.
- Composer / NewComposer / ComposeOptions on those neutral types.
internal/services keeps SubmissionComposer + ComposeOptions as a thin
mapping wrapper (SubmissionSection -> docx.Section; Base.SectionSpec.Stylemap
+ BaseBytes -> docx.Carrier). The handlers and the comprehensive compose_test
are unchanged; the test drives the wrapper end-to-end and its byte-exact OOXML
assertions still pass, i.e. behaviour is preserved.
Retired the slice-1 docx.XMLAttrEscape wrapper + its services forwarder:
compose now calls the local xmlAttrEscape inside the docx package.
Sequencing note: the paragraph-level neutral model (Document/Block/Slot the
PRD §3.2 sketches) is deferred to slice 6, where the authoring importer +
format exporters consume it. Building it now, ahead of any consumer, would
be speculative and risk the byte-identical guarantee for no gain (PRD §4 B3
principle). Carrier is the part of the model that earns its keep this cycle.
Verification: go build ./... clean, go vet clean, full module test green.
m/paliad#157
504 lines
16 KiB
Go
504 lines
16 KiB
Go
package docx
|
|
|
|
// Markdown → OOXML walker for Composer section content (t-paliad-313
// Slice B, design doc §9.2; extended by t-paliad-316 Slice D).
//
// Slice B's brief covered paragraphs + inline bold/italic only. Slice D's
// rich-prose pass added headings, lists, blockquotes and links. The walker
// is still intentionally minimal — every Markdown construct it doesn't
// recognise is rendered as a plain paragraph so the lawyer's prose
// round-trips losslessly even when they hit Markdown the walker doesn't
// yet understand.
//
// The output uses the base's stylemap.paragraph entry for the
// <w:pStyle> on each paragraph so the styling matches the base's
// typography (HLpat-Body-B0 on the HLC base, Normal on the neutral
// base, etc.).
//
// Placeholders ({{path.dot.notation}}) are preserved verbatim — they
// pass through the walker untouched and get substituted by the v1
// SubmissionRenderer's placeholder pass after the composer assembly.
//
// Grammar supported:
//
//   - Blank line → paragraph break
//   - `**bold**` or `__bold__` → <w:r><w:rPr><w:b/></w:rPr><w:t>…</w:t></w:r>
//   - `*italic*` or `_italic_` → <w:r><w:rPr><w:i/></w:rPr>…</w:r>
//   - `# ` / `## ` / `### ` line prefix → heading paragraph (levels 1-3)
//   - `- ` / `* ` line prefix → bullet list paragraph
//   - `N. ` / `N) ` line prefix → numbered list paragraph
//   - `> ` line prefix → blockquote paragraph
//   - `[label](url)` → <w:hyperlink> (when a HyperlinkAllocator is supplied)
//   - Otherwise → plain text run
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// HyperlinkAllocator maps the external URL of a `[label](url)` inline
// link to the relationship id (`rId…`) that the emitted
// `<w:hyperlink r:id="…">` element references. The composer's post-pass
// uses these allocations to mutate `word/_rels/document.xml.rels` so the
// hyperlinks resolve.
//
// A nil allocator, or an allocator that returns "" for a URL, makes the
// walker render the link as plain text: the label survives, the URL is
// dropped.
//
// t-paliad-316 Slice D.
type HyperlinkAllocator func(target string) string
|
|
|
|
// RenderMarkdownToOOXML renders the given Markdown source into OOXML
|
|
// paragraph elements (`<w:p>…</w:p>`), suitable for splicing into a
|
|
// .docx body. Each paragraph carries `<w:pStyle w:val="<paragraphStyle>"/>`
|
|
// when paragraphStyle is non-empty.
|
|
//
|
|
// Slice B shipped paragraphs + bold/italic. Slice D extends to
|
|
// headings (h1/h2/h3), bullet/numbered lists, blockquote, and inline
|
|
// hyperlinks via the optional HyperlinkAllocator.
|
|
//
|
|
// stylemap supplies the paragraph-style names for each kind:
|
|
// stylemap["paragraph"] — default body
|
|
// stylemap["heading_1/2/3"] — heading levels
|
|
// stylemap["list_bullet"] — bullet list paragraph style
|
|
// stylemap["list_numbered"] — numbered list paragraph style
|
|
// stylemap["blockquote"] — blockquote
|
|
// Missing entries fall back to the "paragraph" style.
|
|
//
|
|
// Empty input renders one empty paragraph so the splice site is
|
|
// well-formed even when the lawyer hasn't typed anything in this
|
|
// section.
|
|
func RenderMarkdownToOOXML(md, paragraphStyle string) string {
|
|
return RenderMarkdownToOOXMLWithStyles(md, map[string]string{"paragraph": paragraphStyle}, nil)
|
|
}
|
|
|
|
// RenderMarkdownToOOXMLWithStyles is the full Slice-D-aware entry
|
|
// point. Slice B's RenderMarkdownToOOXML is a wrapper for back-compat.
|
|
func RenderMarkdownToOOXMLWithStyles(md string, stylemap map[string]string, links HyperlinkAllocator) string {
|
|
defaultStyle := stylemap["paragraph"]
|
|
if md == "" {
|
|
return emptyParagraph(defaultStyle)
|
|
}
|
|
blocks := splitMarkdownBlocks(md)
|
|
if len(blocks) == 0 {
|
|
return emptyParagraph(defaultStyle)
|
|
}
|
|
// Numbered-list counter resets on every non-numbered block so
|
|
// "1. A\n2. B\n\n1. C" renders as 1./2./1. (the lawyer's input
|
|
// determined the ordinal, the walker just renders).
|
|
numberedCounter := 0
|
|
var b strings.Builder
|
|
for _, blk := range blocks {
|
|
style := stylemap[blk.styleKey]
|
|
if style == "" {
|
|
style = defaultStyle
|
|
}
|
|
if blk.styleKey == "list_numbered" {
|
|
numberedCounter++
|
|
} else {
|
|
numberedCounter = 0
|
|
}
|
|
b.WriteString(renderBlockParagraph(blk, style, links, numberedCounter))
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
// mdBlock is one block of parsed Markdown, rendered as exactly one
// paragraph. Block markers (heading hashes, list markers, blockquote
// `> `) have already been stripped from the text.
type mdBlock struct {
	// styleKey is the block kind and doubles as the stylemap lookup key:
	// "paragraph" | "heading_1" | "heading_2" | "heading_3" |
	// "list_bullet" | "list_numbered" | "blockquote".
	styleKey string
	// text is the inline content still to be parsed for bold / italic /
	// links. Empty for spacer paragraphs.
	text string
}
|
|
|
|
// splitMarkdownBlocks parses the source into a sequence of blocks,
|
|
// detecting heading / list / blockquote prefixes line-by-line. Blank
|
|
// lines split paragraph runs (same semantics as splitMarkdownParagraphs)
|
|
// but each line is also tagged with its block kind.
|
|
//
|
|
// Lines that look like block markers don't merge with their neighbours
|
|
// even across blank lines — every list / heading / blockquote line is
|
|
// its own block in the output. A run of unmarked lines collapses into
|
|
// one "paragraph" block (so soft line breaks inside a paragraph still
|
|
// concatenate).
|
|
//
|
|
// CRLF normalised to LF before parsing.
|
|
func splitMarkdownBlocks(md string) []mdBlock {
|
|
normalised := strings.ReplaceAll(md, "\r\n", "\n")
|
|
lines := strings.Split(normalised, "\n")
|
|
var blocks []mdBlock
|
|
var pendingPara []string
|
|
blankRun := 0
|
|
|
|
flushPara := func() {
|
|
if len(pendingPara) > 0 {
|
|
blocks = append(blocks, mdBlock{styleKey: "paragraph", text: strings.Join(pendingPara, "\n")})
|
|
pendingPara = nil
|
|
}
|
|
}
|
|
|
|
for _, raw := range lines {
|
|
line := raw
|
|
if strings.TrimSpace(line) == "" {
|
|
if len(pendingPara) > 0 {
|
|
flushPara()
|
|
blankRun = 1
|
|
continue
|
|
}
|
|
blankRun++
|
|
continue
|
|
}
|
|
// Detect heading / list / blockquote markers BEFORE we accumulate
|
|
// into the paragraph buffer.
|
|
kind, payload, ok := detectBlockMarker(line)
|
|
if ok {
|
|
flushPara()
|
|
// Emit spacing paragraphs equivalent to (blankRun - 1) extra.
|
|
for i := 1; i < blankRun; i++ {
|
|
blocks = append(blocks, mdBlock{styleKey: "paragraph", text: ""})
|
|
}
|
|
blankRun = 0
|
|
blocks = append(blocks, mdBlock{styleKey: kind, text: payload})
|
|
continue
|
|
}
|
|
// Plain paragraph line.
|
|
if len(pendingPara) == 0 {
|
|
// Starting a new paragraph after a blank run — emit
|
|
// (blankRun-1) extra empty paragraphs for vertical spacing.
|
|
for i := 1; i < blankRun; i++ {
|
|
blocks = append(blocks, mdBlock{styleKey: "paragraph", text: ""})
|
|
}
|
|
}
|
|
blankRun = 0
|
|
pendingPara = append(pendingPara, line)
|
|
}
|
|
flushPara()
|
|
return blocks
|
|
}
|
|
|
|
// detectBlockMarker classifies a single line. Returns (styleKey,
|
|
// payload-with-marker-stripped, true) for recognised markers; false
|
|
// for plain paragraph lines.
|
|
//
|
|
// Recognised markers (Slice D):
|
|
// # Heading → heading_1
|
|
// ## Heading → heading_2
|
|
// ### Heading → heading_3
|
|
// - item / * item → list_bullet
|
|
// 1. item / 2. item ... → list_numbered (any positive integer)
|
|
// > quote → blockquote
|
|
//
|
|
// Leading whitespace inside the line is tolerated up to 3 spaces (per
|
|
// CommonMark) so the lawyer's contentEditable indentation doesn't
|
|
// hide the marker.
|
|
func detectBlockMarker(line string) (string, string, bool) {
|
|
trimmed := strings.TrimLeft(line, " ")
|
|
// Cap to 3 spaces of leading indent — beyond that, treat as a
|
|
// regular paragraph line (matches CommonMark).
|
|
if len(line)-len(trimmed) > 3 {
|
|
return "", "", false
|
|
}
|
|
if strings.HasPrefix(trimmed, "### ") {
|
|
return "heading_3", strings.TrimSpace(trimmed[4:]), true
|
|
}
|
|
if strings.HasPrefix(trimmed, "## ") {
|
|
return "heading_2", strings.TrimSpace(trimmed[3:]), true
|
|
}
|
|
if strings.HasPrefix(trimmed, "# ") {
|
|
return "heading_1", strings.TrimSpace(trimmed[2:]), true
|
|
}
|
|
if strings.HasPrefix(trimmed, "> ") {
|
|
return "blockquote", strings.TrimSpace(trimmed[2:]), true
|
|
}
|
|
if strings.HasPrefix(trimmed, "- ") || strings.HasPrefix(trimmed, "* ") {
|
|
return "list_bullet", strings.TrimSpace(trimmed[2:]), true
|
|
}
|
|
// Numbered: "N. " where N is one or more digits.
|
|
if i := indexOfNumberedMarker(trimmed); i > 0 {
|
|
return "list_numbered", strings.TrimSpace(trimmed[i:]), true
|
|
}
|
|
return "", "", false
|
|
}
|
|
|
|
// indexOfNumberedMarker reports whether s starts with a numbered-list
// marker: one or more ASCII digits, then "." or ")", then a single space
// ("1. ", "12) ", …). It returns the byte index just past the marker,
// or -1 when s does not start with one.
func indexOfNumberedMarker(s string) int {
	digits := 0
	for digits < len(s) && '0' <= s[digits] && s[digits] <= '9' {
		digits++
	}
	// A marker needs at least one digit plus the delimiter and the space.
	if digits == 0 || digits+1 >= len(s) {
		return -1
	}
	switch s[digits] {
	case '.', ')':
		if s[digits+1] == ' ' {
			return digits + 2
		}
	}
	return -1
}
|
|
|
|
// renderBlockParagraph emits one `<w:p>` for a block. List blocks
|
|
// keep the same paragraph style as a default paragraph (the Slice D
|
|
// design's contract — list styles come from the base's stylemap and
|
|
// Word's numbering.xml is honoured by adding a leading bullet/number
|
|
// prefix in the rendered text). This keeps the composer free of
|
|
// numbering.xml mutations.
|
|
func renderBlockParagraph(blk mdBlock, paragraphStyle string, links HyperlinkAllocator, numberedOrdinal int) string {
|
|
var b strings.Builder
|
|
b.WriteString(`<w:p>`)
|
|
if paragraphStyle != "" {
|
|
b.WriteString(`<w:pPr><w:pStyle w:val="`)
|
|
b.WriteString(xmlAttrEscape(paragraphStyle))
|
|
b.WriteString(`"/></w:pPr>`)
|
|
}
|
|
if blk.text == "" {
|
|
b.WriteString(`<w:r><w:t xml:space="preserve"></w:t></w:r>`)
|
|
b.WriteString(`</w:p>`)
|
|
return b.String()
|
|
}
|
|
text := blk.text
|
|
// List blocks emit a visible "• " / "N. " prefix run. The
|
|
// stylemap entry handles paragraph indentation if the base
|
|
// defines a list paragraph style; otherwise the prefix at least
|
|
// surfaces the structure in plain Word. Lawyers who want Word's
|
|
// auto-numbering reapply a list style post-export.
|
|
switch blk.styleKey {
|
|
case "list_bullet":
|
|
b.WriteString(`<w:r><w:t xml:space="preserve">• </w:t></w:r>`)
|
|
case "list_numbered":
|
|
ordinal := numberedOrdinal
|
|
if ordinal <= 0 {
|
|
ordinal = 1
|
|
}
|
|
b.WriteString(`<w:r><w:t xml:space="preserve">`)
|
|
b.WriteString(fmt.Sprintf("%d. ", ordinal))
|
|
b.WriteString(`</w:t></w:r>`)
|
|
}
|
|
for _, run := range parseInlineRuns(text, links) {
|
|
b.WriteString(run)
|
|
}
|
|
b.WriteString(`</w:p>`)
|
|
return b.String()
|
|
}
|
|
|
|
// parseInlineRuns extracts inline spans + hyperlink runs and serialises
|
|
// each to OOXML. Hyperlinks become `<w:hyperlink r:id="RID">…runs…</w:hyperlink>`
|
|
// where RID comes from the HyperlinkAllocator.
|
|
func parseInlineRuns(text string, links HyperlinkAllocator) []string {
|
|
// Phase 1: find all hyperlink spans `[label](url)` and split the
|
|
// text around them.
|
|
type segment struct {
|
|
text string
|
|
isLink bool
|
|
url string
|
|
}
|
|
var segs []segment
|
|
rest := text
|
|
for {
|
|
idx := strings.Index(rest, "[")
|
|
if idx < 0 {
|
|
if rest != "" {
|
|
segs = append(segs, segment{text: rest})
|
|
}
|
|
break
|
|
}
|
|
// Find matching closing bracket, then a "(" right after.
|
|
closeBracket := strings.Index(rest[idx:], "](")
|
|
if closeBracket < 0 {
|
|
segs = append(segs, segment{text: rest})
|
|
break
|
|
}
|
|
closeParen := strings.Index(rest[idx+closeBracket:], ")")
|
|
if closeParen < 0 {
|
|
segs = append(segs, segment{text: rest})
|
|
break
|
|
}
|
|
// idx = start of "["
|
|
// idx+closeBracket = position of "]"
|
|
// idx+closeBracket+1 = position of "("
|
|
// idx+closeBracket+closeParen = position of ")"
|
|
label := rest[idx+1 : idx+closeBracket]
|
|
url := rest[idx+closeBracket+2 : idx+closeBracket+closeParen]
|
|
if idx > 0 {
|
|
segs = append(segs, segment{text: rest[:idx]})
|
|
}
|
|
segs = append(segs, segment{text: label, isLink: true, url: url})
|
|
rest = rest[idx+closeBracket+closeParen+1:]
|
|
}
|
|
|
|
var runs []string
|
|
for _, seg := range segs {
|
|
if seg.isLink && links != nil {
|
|
rid := links(seg.url)
|
|
if rid != "" {
|
|
var hb strings.Builder
|
|
hb.WriteString(`<w:hyperlink r:id="`)
|
|
hb.WriteString(xmlAttrEscape(rid))
|
|
hb.WriteString(`">`)
|
|
for _, span := range parseInlineSpans(seg.text) {
|
|
hb.WriteString(renderRunWithLinkStyle(span))
|
|
}
|
|
hb.WriteString(`</w:hyperlink>`)
|
|
runs = append(runs, hb.String())
|
|
continue
|
|
}
|
|
}
|
|
for _, span := range parseInlineSpans(seg.text) {
|
|
runs = append(runs, renderRun(span))
|
|
}
|
|
}
|
|
return runs
|
|
}
|
|
|
|
// renderRunWithLinkStyle emits a hyperlink child run. Same B/I support
|
|
// as renderRun, but additionally tags the run with the "Hyperlink"
|
|
// character style (Word's built-in) so the link renders in the
|
|
// document's hyperlink colour + underline.
|
|
func renderRunWithLinkStyle(span inlineSpan) string {
|
|
var b strings.Builder
|
|
b.WriteString(`<w:r><w:rPr><w:rStyle w:val="Hyperlink"/>`)
|
|
if span.Bold {
|
|
b.WriteString(`<w:b/>`)
|
|
}
|
|
if span.Italic {
|
|
b.WriteString(`<w:i/>`)
|
|
}
|
|
b.WriteString(`</w:rPr><w:t xml:space="preserve">`)
|
|
b.WriteString(xmlTextEscape(span.Text))
|
|
b.WriteString(`</w:t></w:r>`)
|
|
return b.String()
|
|
}
|
|
|
|
// inlineSpan is one piece of inline content: a text payload plus its
// formatting flags. The flags are independent of each other, so
// `***both***` yields a single span with Bold and Italic both set.
type inlineSpan struct {
	// Text is the raw (not yet XML-escaped) text of the span.
	Text string
	// Bold and Italic select <w:b/> and <w:i/> on the rendered run.
	Bold, Italic bool
}
|
|
|
|
// parseInlineSpans tokenises Markdown inline formatting into runs of
|
|
// (text, bold, italic). The grammar is intentionally narrow:
|
|
//
|
|
// - `**…**` → bold
|
|
// - `__…__` → bold (Markdown alternate)
|
|
// - `*…*` → italic
|
|
// - `_…_` → italic (Markdown alternate)
|
|
// - Anything else flows through as plain text.
|
|
//
|
|
// Unbalanced delimiters fall through as literal characters — the
|
|
// walker never errors on malformed Markdown. Nested formatting (e.g.
|
|
// `**bold *bold-italic* bold**`) toggles flags as it walks.
|
|
func parseInlineSpans(text string) []inlineSpan {
|
|
var out []inlineSpan
|
|
var cur strings.Builder
|
|
bold := false
|
|
italic := false
|
|
flush := func() {
|
|
if cur.Len() == 0 {
|
|
return
|
|
}
|
|
out = append(out, inlineSpan{Text: cur.String(), Bold: bold, Italic: italic})
|
|
cur.Reset()
|
|
}
|
|
i := 0
|
|
n := len(text)
|
|
for i < n {
|
|
// Preserve {{...}} placeholders verbatim. Underscores and
|
|
// other Markdown-significant chars inside a placeholder key
|
|
// (e.g. {{project.case_number}}) must not be interpreted as
|
|
// bold/italic delimiters — otherwise the key gets stripped of
|
|
// its underscores and the v1 placeholder pass looks up the
|
|
// wrong key, surfacing [KEIN WERT: project.casenumber] in the
|
|
// preview.
|
|
if i+1 < n && text[i] == '{' && text[i+1] == '{' {
|
|
rel := strings.Index(text[i+2:], "}}")
|
|
if rel >= 0 {
|
|
end := i + 2 + rel + 2
|
|
cur.WriteString(text[i:end])
|
|
i = end
|
|
continue
|
|
}
|
|
// Unmatched {{ — fall through to plain character handling.
|
|
}
|
|
// Bold delimiters first (longer match wins over italic).
|
|
if i+1 < n && (text[i:i+2] == "**" || text[i:i+2] == "__") {
|
|
flush()
|
|
bold = !bold
|
|
i += 2
|
|
continue
|
|
}
|
|
if text[i] == '*' || text[i] == '_' {
|
|
flush()
|
|
italic = !italic
|
|
i++
|
|
continue
|
|
}
|
|
cur.WriteByte(text[i])
|
|
i++
|
|
}
|
|
flush()
|
|
if len(out) == 0 {
|
|
out = append(out, inlineSpan{Text: ""})
|
|
}
|
|
return out
|
|
}
|
|
|
|
// renderRun emits one `<w:r>` element for an inline span. Empty text
|
|
// spans render as empty runs (Word accepts them; they're harmless).
|
|
func renderRun(span inlineSpan) string {
|
|
var b strings.Builder
|
|
b.WriteString(`<w:r>`)
|
|
if span.Bold || span.Italic {
|
|
b.WriteString(`<w:rPr>`)
|
|
if span.Bold {
|
|
b.WriteString(`<w:b/>`)
|
|
}
|
|
if span.Italic {
|
|
b.WriteString(`<w:i/>`)
|
|
}
|
|
b.WriteString(`</w:rPr>`)
|
|
}
|
|
b.WriteString(`<w:t xml:space="preserve">`)
|
|
b.WriteString(xmlTextEscape(span.Text))
|
|
b.WriteString(`</w:t></w:r>`)
|
|
return b.String()
|
|
}
|
|
|
|
// emptyParagraph returns one empty `<w:p>` with the given style. Used
|
|
// when a section's content_md is empty so the splice site stays
|
|
// well-formed.
|
|
func emptyParagraph(paragraphStyle string) string {
|
|
var b strings.Builder
|
|
b.WriteString(`<w:p>`)
|
|
if paragraphStyle != "" {
|
|
b.WriteString(`<w:pPr><w:pStyle w:val="`)
|
|
b.WriteString(xmlAttrEscape(paragraphStyle))
|
|
b.WriteString(`"/></w:pPr>`)
|
|
}
|
|
b.WriteString(`<w:r><w:t xml:space="preserve"></w:t></w:r></w:p>`)
|
|
return b.String()
|
|
}
|
|
|
|
// xmlTextEscape escapes the characters that are significant inside XML
// element content — `&`, `<` and `>` — for safe insertion into <w:t>.
// `&` is replaced first so the entities produced for `<` and `>` are not
// themselves re-escaped. Quotes and apostrophes are legal inside element
// text content and are left untouched.
func xmlTextEscape(s string) string {
	s = strings.ReplaceAll(s, "&", "&amp;")
	s = strings.ReplaceAll(s, "<", "&lt;")
	s = strings.ReplaceAll(s, ">", "&gt;")
	return s
}
|
|
|
|
// xmlAttrEscape escapes s for safe insertion into a double-quoted XML
// attribute value (e.g. `<w:pStyle w:val="…"/>`): `&`, `<`, `>` and `"`
// become their predefined entities. `&` is replaced first so the
// entities produced by the later replacements are not re-escaped.
// Apostrophes are left as-is, which is safe because callers delimit
// attribute values with double quotes.
func xmlAttrEscape(s string) string {
	s = strings.ReplaceAll(s, "&", "&amp;")
	s = strings.ReplaceAll(s, "<", "&lt;")
	s = strings.ReplaceAll(s, ">", "&gt;")
	s = strings.ReplaceAll(s, `"`, "&quot;")
	return s
}
|