package docx // Composer render pipeline — t-paliad-313 Slice B (design doc §9.1 + // §9.2). Assembles a base .docx and a draft's section rows into a // merged .docx ready for export. // // Pipeline (high-level): // // 1. ConvertDotmToDocx pre-pass on the base bytes (idempotent on .docx). // 2. Locate `word/document.xml` inside the zip; pull the body XML. // 3. For each section in the draft (order_index ASC, included=true): // render content_md_ → OOXML via RenderMarkdownToOOXML using // base.section_spec.stylemap.paragraph. // 4. Splice the rendered OOXML into the base body. Two splice modes: // - Anchor mode: when the body carries `{{#section:KEY}}` / // `{{/section:KEY}}` marker pairs, replace the slot's content // (including the anchor paragraphs themselves) with the rendered // section. // - Append mode: when no anchor pair is found for a section, the // rendered OOXML appends at the end of the body, just before any // `` element. Sections with `included=false` are // dropped silently. // 5. Strip any leftover unmatched anchor paragraphs. // 6. Re-pack the document.xml into the zip, leaving every other part // untouched. // 7. Run the v1 SubmissionRenderer placeholder pass over the assembly // so `{{path}}` placeholders inside section content (and inside // the base's untouched chrome) get substituted by the merged bag. // Cross-run merge in pass 2 handles autocorrect-fragmented // placeholders the same as v1. // // Result: a fully-merged .docx. No new third-party Go dep — reuses // archive/zip + the existing SubmissionRenderer. import ( "archive/zip" "bytes" "context" "fmt" "io" "regexp" "sort" "strings" "time" "mgit.msbls.de/m/paliad/pkg/docforge" ) // Composer assembles base + sections into a final .docx. // Stateless; safe for concurrent use. type Composer struct { renderer *SubmissionRenderer } // NewComposer wires the composer. The renderer is required — // a nil renderer is a programmer error and the composer panics at // construction. 
func NewComposer(renderer *SubmissionRenderer) *Composer { if renderer == nil { panic("submission composer: renderer required") } return &Composer{renderer: renderer} } // Carrier is the opaque base document the composer splices rendered // content into. Its bytes are preserved verbatim outside the regions the // splice touches — the {{#section:KEY}} anchor paragraphs and the // {{placeholder}} tokens — so the firm's letterhead, styles, headers, and // footers survive a compose byte-for-byte. This is the docforge "carrier" // for the .docx format: the lossless host for editable content. type Carrier struct { // Bytes is the raw base .docx. May be a .dotm/.docm/.dotx; Compose // runs ConvertDotmToDocx on it first (idempotent on a plain .docx). Bytes []byte // Stylemap maps a logical block kind (paragraph, heading_1/2/3, // list_bullet, list_numbered, blockquote) to the Word paragraph // style name the base defines for it. Drives the Markdown walker's // . Missing entries fall back to the "paragraph" style. Stylemap map[string]string } // Section is one editable content block the composer renders and splices. // It is the format-neutral input the docforge engine consumes; the // consuming application maps its own row type onto it (paliad maps // SubmissionSection → Section). type Section struct { // Key matches a {{#section:KEY}} anchor in the carrier, or — when no // anchor matches — marks an append-mode section. Key string // OrderIndex sets append-mode ordering (ascending). OrderIndex int // Included=false drops the section entirely. Included bool // ContentMDDE / ContentMDEN are the bilingual Markdown sources; Lang // selects which one renders. ContentMDDE string ContentMDEN string } // ComposeOptions carries the per-call composition inputs. type ComposeOptions struct { // Sections are the draft's section rows in display order. The // composer renders included sections; excluded rows are dropped. 
// Caller is responsible for visibility — by the time the composer // runs, the section rows have already been gated by the caller. Sections []Section // Carrier is the base .docx chrome plus its stylemap. Required. Carrier Carrier // Lang ('de' or 'en') selects which content_md_* column the // composer reads per section. Defaults to 'de' if empty. Lang string // Vars is the merged placeholder bag the v1 renderer pass // substitutes after the composer assembly. Passed straight through // to SubmissionRenderer.Render. Vars docforge.PlaceholderMap // Missing translates an unbound placeholder key into the marker // the lawyer sees in Word. Passed straight to the renderer. Missing docforge.MissingPlaceholderFn } // Compose runs the full pipeline and returns the merged .docx bytes. func (c *Composer) Compose(ctx context.Context, opts ComposeOptions) ([]byte, error) { _ = ctx // reserved for cancellation propagation in later slices sections := opts.Sections // Pre-pass: strip macros so the base reads as a plain .docx zip. cleanBytes, err := ConvertDotmToDocx(opts.Carrier.Bytes) if err != nil { return nil, fmt.Errorf("submission compose: convert base: %w", err) } // Locate + extract word/document.xml so we can splice in-place. documentXML, otherParts, err := splitBaseZip(cleanBytes) if err != nil { return nil, err } // Per-compose hyperlink allocator. Each unique URL gets a fresh // rId outside the base's existing namespace. The post-pass // (patchDocumentXMLRels) writes the matching Relationship rows // before the zip is repacked. Slice D adds inline `[label](url)` // hyperlink support. linkAlloc := newComposerLinkAllocator() // Build the rendered-section map: section_key → OOXML span. 
stylemap := opts.Carrier.Stylemap rendered := make(map[string]string, len(sections)) keptSections := make([]Section, 0, len(sections)) for _, sec := range sections { if !sec.Included { continue } md := sec.ContentMDDE if strings.EqualFold(opts.Lang, "en") { md = sec.ContentMDEN } rendered[sec.Key] = RenderMarkdownToOOXMLWithStyles(md, stylemap, linkAlloc.Alloc) keptSections = append(keptSections, sec) } // Stable order — already sorted ascending by ListForDraft, but // belt-and-braces in case the caller swaps the ordering policy // later. sort.SliceStable(keptSections, func(i, j int) bool { return keptSections[i].OrderIndex < keptSections[j].OrderIndex }) assembledBody := spliceSections(documentXML, rendered, keptSections, sections) // Slice D hyperlink patch: when the walker emitted hyperlink rIds // for inline `[label](url)` links, the base's // word/_rels/document.xml.rels needs matching // entries so Word can resolve the rIds. Mutates one zip part in // otherParts (or appends if missing). if linkAlloc.HasLinks() { updatedParts, err := patchDocumentXMLRels(otherParts, linkAlloc.Pairs()) if err != nil { return nil, err } otherParts = updatedParts } // Re-pack into a zip with the assembled document.xml. All other // parts (styles, fonts, headers, footers, theme, settings) pass // through bit-for-bit at their original mtime + compression. repacked, err := repackBaseZip(otherParts, assembledBody) if err != nil { return nil, err } // Final pass: substitute placeholders against the merged bag. The // existing renderer handles cross-run fragmentation, the `{{rule.X}}` // alias contract, and the missing-marker emission. Reusing it // guarantees v1's placeholder grammar stays intact inside section // content + base chrome. 
merged, err := c.renderer.Render(repacked, opts.Vars, opts.Missing) if err != nil { return nil, fmt.Errorf("submission compose: placeholder pass: %w", err) } return merged, nil } // ───────────────────────────────────────────────────────────────────── // Section splicing // ───────────────────────────────────────────────────────────────────── // Anchor markers as they appear inside a text node. We don't // need a full XML parse — finding the marker text inside the body is // sufficient because: // - {{ and }} are never legitimate document content (placeholders // follow the same convention everywhere else in paliad). // - The anchor key grammar [A-Za-z0-9_]+ rules out any HTML/XML // special characters. // - Each anchor lives in exactly one ..., which lives in // exactly one ..., which lives in exactly one // .... We expand from the marker outward to find the // enclosing span and drop the entire paragraph as part of // the splice. // // RE2 has no lookahead, so the "find enclosing " logic is // implemented as manual byte-index search around the marker hit // (anchorParagraphSpan below) rather than a single regex pattern. const ( anchorOpenPrefix = "{{#section:" anchorClosePrefix = "{{/section:" anchorSuffix = "}}" ) // anchorKeyRegex validates that the captured anchor key is a clean // identifier. Keys that include other characters (which can't actually // appear in our authored .docx) are treated as no match. var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`) // anchorPair records the byte span of one matched anchor pair inside // the body — from the start of the opening anchor's element // through the end of the closing anchor's . type anchorPair struct { key string openStart int // start of for the opening anchor closeEnd int // index just past for the closing anchor } // findAllAnchorPairs scans the body for matched open/close anchor // pairs. Unbalanced markers (open without close, or vice versa) are // dropped from the result. 
Returns pairs in body-order; each pair's // span is non-overlapping. func findAllAnchorPairs(body string) []anchorPair { type marker struct { key string paraStart int paraEnd int isOpen bool } var markers []marker collect := func(prefix string, isOpen bool) { offset := 0 for { idx := strings.Index(body[offset:], prefix) if idx < 0 { return } start := offset + idx suffixIdx := strings.Index(body[start+len(prefix):], anchorSuffix) if suffixIdx < 0 { return } key := body[start+len(prefix) : start+len(prefix)+suffixIdx] if !anchorKeyRegex.MatchString(key) { offset = start + len(prefix) continue } markerEnd := start + len(prefix) + suffixIdx + len(anchorSuffix) pStart, pEnd, ok := paragraphSpanAround(body, start, markerEnd) if !ok { offset = markerEnd continue } markers = append(markers, marker{key: key, paraStart: pStart, paraEnd: pEnd, isOpen: isOpen}) offset = pEnd } } collect(anchorOpenPrefix, true) collect(anchorClosePrefix, false) // Walk markers in body-order, matching each open with the next // close that carries the same key. sort.SliceStable(markers, func(i, j int) bool { return markers[i].paraStart < markers[j].paraStart }) var pairs []anchorPair openStack := map[string]marker{} for _, m := range markers { if m.isOpen { openStack[m.key] = m continue } o, ok := openStack[m.key] if !ok { continue } pairs = append(pairs, anchorPair{ key: m.key, openStart: o.paraStart, closeEnd: m.paraEnd, }) delete(openStack, m.key) } return pairs } // paragraphSpanAround returns the byte span of the smallest `...` // element that fully contains the byte range [markerStart, markerEnd). // Returns false when the byte range doesn't sit inside a single // paragraph (which would mean the marker survived a cross-paragraph // edit — defensive guard, shouldn't happen in well-formed input). func paragraphSpanAround(body string, markerStart, markerEnd int) (int, int, bool) { // Walk backwards to find the nearest unclosed opening. 
// Since doesn't nest, the nearest 0 { idx := strings.LastIndex(body[:cursor], "). if idx+4 <= len(body) { after := body[idx+4] if after == ' ' || after == '>' || after == '/' { // or ; not . close := strings.Index(body[idx:], ">") if close < 0 { return 0, 0, false } pStart = idx break } } cursor = idx } if pStart < 0 { return 0, 0, false } // Walk forward to find the matching . doesn't nest so // the next after the marker is the close. pEndIdx := strings.Index(body[markerEnd:], "") if pEndIdx < 0 { return 0, 0, false } pEnd := markerEnd + pEndIdx + len("") return pStart, pEnd, true } // spliceSections replaces anchor slots with rendered sections and // appends any unanchored sections before sectPr. Returns the assembled // document.xml body. func spliceSections(documentXML []byte, rendered map[string]string, kept []Section, all []Section) []byte { body := string(documentXML) pairs := findAllAnchorPairs(body) // Build a lookup of kept section keys for quick membership tests. keptByKey := map[string]int{} for i, sec := range kept { keptByKey[sec.Key] = i } allByKey := map[string]int{} for i, sec := range all { allByKey[sec.Key] = i } matchedKeys := map[string]bool{} // Walk pairs in REVERSE body-order so slice mutations don't shift // later offsets. sort.SliceStable(pairs, func(i, j int) bool { return pairs[i].openStart > pairs[j].openStart }) for _, p := range pairs { replacement := "" if idx, ok := keptByKey[p.key]; ok { replacement = rendered[p.key] matchedKeys[p.key] = true _ = idx } else if _, isOnDraft := allByKey[p.key]; isOnDraft { // Anchor matches an excluded section on the draft — drop // the entire slot. replacement = "" } else { // Anchor doesn't match any section on this draft — drop // to leave the base's chrome unbroken. replacement = "" } body = body[:p.openStart] + replacement + body[p.closeEnd:] } // Append unanchored sections before sectPr in order_index ASC. 
var unanchored strings.Builder for _, sec := range kept { if matchedKeys[sec.Key] { continue } unanchored.WriteString(rendered[sec.Key]) } if unanchored.Len() > 0 { body = appendBeforeSectPr(body, unanchored.String()) } return []byte(body) } // appendBeforeSectPr inserts content immediately before the first // `` if present, else at // the very end. idx := strings.LastIndex(body, "") if idx < 0 { return body + content } return body[:idx] + content + body[idx:] } return body[:loc[0]] + content + body[loc[0]:] } // ───────────────────────────────────────────────────────────────────── // Zip plumbing // ───────────────────────────────────────────────────────────────────── // baseZipPart captures one zip entry we kept aside while extracting // document.xml. type baseZipPart struct { name string method uint16 modTime int64 // wall seconds; converted back to time.Time on repack body []byte } // splitBaseZip extracts document.xml and returns it alongside every // other zip entry, ready for repacking. 
func splitBaseZip(cleanBytes []byte) ([]byte, []baseZipPart, error) { zr, err := zip.NewReader(bytes.NewReader(cleanBytes), int64(len(cleanBytes))) if err != nil { return nil, nil, fmt.Errorf("submission compose: open base zip: %w", err) } var documentXML []byte parts := make([]baseZipPart, 0, len(zr.File)) for _, f := range zr.File { body, err := readZipEntry(f) if err != nil { return nil, nil, fmt.Errorf("submission compose: read %s: %w", f.Name, err) } if f.Name == "word/document.xml" { documentXML = body parts = append(parts, baseZipPart{name: f.Name, method: f.Method, modTime: f.Modified.Unix(), body: nil}) continue } parts = append(parts, baseZipPart{name: f.Name, method: f.Method, modTime: f.Modified.Unix(), body: body}) } if documentXML == nil { return nil, nil, fmt.Errorf("submission compose: base zip missing word/document.xml") } return documentXML, parts, nil } // repackBaseZip rebuilds the zip, swapping document.xml for the // assembled body and leaving every other part untouched. 
func repackBaseZip(parts []baseZipPart, assembledBody []byte) ([]byte, error) { var out bytes.Buffer zw := zip.NewWriter(&out) for _, p := range parts { hdr := &zip.FileHeader{ Name: p.name, Method: p.method, } if p.modTime > 0 { hdr.Modified = time.Unix(p.modTime, 0) } w, err := zw.CreateHeader(hdr) if err != nil { return nil, fmt.Errorf("submission compose: write header %s: %w", p.name, err) } body := p.body if p.name == "word/document.xml" { body = assembledBody } if _, err := w.Write(body); err != nil { return nil, fmt.Errorf("submission compose: write body %s: %w", p.name, err) } } if err := zw.Close(); err != nil { return nil, fmt.Errorf("submission compose: finalise zip: %w", err) } return out.Bytes(), nil } func readZipEntry(f *zip.File) ([]byte, error) { rc, err := f.Open() if err != nil { return nil, err } defer rc.Close() return io.ReadAll(rc) } // ───────────────────────────────────────────────────────────────────── // Slice D — hyperlink wiring // ───────────────────────────────────────────────────────────────────── // composerLinkAllocator hands out fresh rIds for inline hyperlink // targets discovered by the MD walker. Each unique URL gets one rId // (deduped — repeated links to the same URL share one Relationship). // Allocations land outside the base's rId namespace by prefixing with // "rIdComposer" so they can't collide with existing relationships. type composerLinkAllocator struct { next int byURL map[string]string order []string // URLs in allocation order } func newComposerLinkAllocator() *composerLinkAllocator { return &composerLinkAllocator{byURL: map[string]string{}} } // Alloc returns the rId for url, allocating one on first sight. func (a *composerLinkAllocator) Alloc(url string) string { if rid, ok := a.byURL[url]; ok { return rid } a.next++ rid := fmt.Sprintf("rIdComposer%d", a.next) a.byURL[url] = rid a.order = append(a.order, url) return rid } // HasLinks reports whether any links were allocated during this compose. 
func (a *composerLinkAllocator) HasLinks() bool {
	return len(a.order) != 0
}

// Pairs returns the (rId, URL) pairs in allocation order. The
// document.xml.rels patcher consumes this to emit the relationship
// elements.
func (a *composerLinkAllocator) Pairs() [][2]string {
	out := make([][2]string, len(a.order))
	for i, url := range a.order {
		out[i] = [2]string{a.byURL[url], url}
	}
	return out
}

// patchDocumentXMLRels mutates the word/_rels/document.xml.rels entry
// in `parts` to append the given (rId, URL) pairs as hyperlink
// relationships. If the rels part doesn't exist (some bases omit it
// when the body has no relationships), this function appends a fresh
// part with the minimal Relationships wrapper.
//
// Idempotent on (rId, URL) pairs already present (e.g. when a base
// already references the URL for some other reason).
//
// Returns the (possibly extended) parts slice — callers must overwrite
// their reference because the append in the no-rels-yet case grows the
// backing array.
func patchDocumentXMLRels(parts []baseZipPart, pairs [][2]string) ([]baseZipPart, error) { const path = "word/_rels/document.xml.rels" const hyperlinkType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" existingIdx := -1 for i := range parts { if parts[i].name == path { existingIdx = i break } } var body string if existingIdx >= 0 { body = string(parts[existingIdx].body) } else { body = `` + `` } var inserts strings.Builder for _, p := range pairs { rid := p[0] url := p[1] if strings.Contains(body, `Id="`+rid+`"`) { continue } inserts.WriteString(``) } if inserts.Len() == 0 { return parts, nil } closeIdx := strings.LastIndex(body, "") if closeIdx < 0 { return parts, fmt.Errorf("submission compose: malformed document.xml.rels (no closing tag)") } patched := body[:closeIdx] + inserts.String() + body[closeIdx:] if existingIdx >= 0 { parts[existingIdx].body = []byte(patched) return parts, nil } parts = append(parts, baseZipPart{ name: path, method: zip.Deflate, modTime: time.Now().Unix(), body: []byte(patched), }) return parts, nil }