From f8067c2fe56b82b0f1716729d5cbaee56d8246c3 Mon Sep 17 00:00:00 2001 From: mAi Date: Fri, 29 May 2026 14:57:34 +0200 Subject: [PATCH] =?UTF-8?q?refactor(docforge):=20slice=202=20=E2=80=94=20c?= =?UTF-8?q?omposer=20to=20pkg/docforge/docx=20+=20Carrier=20(t-paliad-349)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the full compose pipeline (anchor-pair splicing, append-before-sectPr, hyperlink-rels patching, zip split/repack, final placeholder pass) into pkg/docforge/docx/compose.go, decoupled from paliad's DB row types. The engine now owns the entire .docx assembly. New neutral types in docx: - Carrier{Bytes, Stylemap} — the opaque base .docx, preserved byte-for-byte outside the spliced regions (the lossless docforge carrier for .docx). - Section{Key, OrderIndex, Included, ContentMDDE, ContentMDEN} — the format-neutral content input. - Composer / NewComposer / ComposeOptions on those neutral types. internal/services keeps SubmissionComposer + ComposeOptions as a thin mapping wrapper (SubmissionSection -> docx.Section, Base.SectionSpec.Stylemap + BaseBytes -> docx.Carrier). handlers + the comprehensive compose_test are unchanged; the test drives the wrapper end-to-end and its byte-exact OOXML assertions pass = behaviour preserved. Retired the slice-1 docx.XMLAttrEscape wrapper + its services forwarder: compose now calls the local xmlAttrEscape inside the docx package. Sequencing note: the paragraph-level neutral model (Document/Block/Slot the PRD §3.2 sketches) is deferred to slice 6, where the authoring importer + format exporters consume it. Building it now, ahead of any consumer, would be speculative and risk the byte-identical guarantee for no gain (PRD §4 B3 principle). Carrier is the part of the model that earns its keep this cycle. Verification: go build ./... clean, go vet clean, full module test green. 
m/paliad#157 --- internal/services/docforge_shims.go | 6 - internal/services/submission_compose.go | 624 +++-------------------- pkg/docforge/docx/compose.go | 634 ++++++++++++++++++++++++ pkg/docforge/docx/markdown.go | 8 - 4 files changed, 692 insertions(+), 580 deletions(-) create mode 100644 pkg/docforge/docx/compose.go diff --git a/internal/services/docforge_shims.go b/internal/services/docforge_shims.go index fe6f38c..3599ce5 100644 --- a/internal/services/docforge_shims.go +++ b/internal/services/docforge_shims.go @@ -57,9 +57,3 @@ func ConvertDotmToDocx(dotmBytes []byte) ([]byte, error) { return docx.ConvertDo // SanitiseSubmissionFileName cleans a string for use inside a download // filename (strips path separators / quotes, ASCII-folds DE umlauts). func SanitiseSubmissionFileName(s string) string { return docx.SanitiseSubmissionFileName(s) } - -// xmlAttrEscape forwards to docx.XMLAttrEscape so submission_compose.go's -// hyperlink-rels inserts reuse the walker's exact attribute escaping -// without importing the docx package directly. Retires when the composer -// splice folds into pkg/docforge/docx (slice 2). -func xmlAttrEscape(s string) string { return docx.XMLAttrEscape(s) } diff --git a/internal/services/submission_compose.go b/internal/services/submission_compose.go index 9a92548..4a0340d 100644 --- a/internal/services/submission_compose.go +++ b/internal/services/submission_compose.go @@ -1,93 +1,73 @@ package services -// Composer render pipeline — t-paliad-313 Slice B (design doc §9.1 + -// §9.2). Assembles a base .docx and a draft's section rows into a -// merged .docx ready for export. +// Composer wrapper — bridges paliad's submission draft model +// (SubmissionSection + SubmissionBase) to the format-neutral docforge +// .docx composer (pkg/docforge/docx), extracted in slice 2 of the +// docforge train (t-paliad-349 / m/paliad#157). 
// -// Pipeline (high-level): +// The full splice/assembly pipeline now lives in pkg/docforge/docx +// (compose.go): macro pre-pass, anchor-pair splicing, append-before-sectPr, +// hyperlink-rels patching, zip repack, and the final placeholder pass. This +// wrapper does the one thing the engine must not know about — mapping +// paliad's DB row types onto the neutral docx.Section / docx.Carrier +// inputs. Behaviour is byte-identical to the pre-extraction composer; the +// in-package compose_test still drives this wrapper end-to-end. // -// 1. ConvertDotmToDocx pre-pass on the base bytes (idempotent on .docx). -// 2. Locate `word/document.xml` inside the zip; pull the body XML. -// 3. For each section in the draft (order_index ASC, included=true): -// render content_md_ → OOXML via RenderMarkdownToOOXML using -// base.section_spec.stylemap.paragraph. -// 4. Splice the rendered OOXML into the base body. Two splice modes: -// - Anchor mode: when the body carries `{{#section:KEY}}` / -// `{{/section:KEY}}` marker pairs, replace the slot's content -// (including the anchor paragraphs themselves) with the rendered -// section. -// - Append mode: when no anchor pair is found for a section, the -// rendered OOXML appends at the end of the body, just before any -// `` element. Sections with `included=false` are -// dropped silently. -// 5. Strip any leftover unmatched anchor paragraphs. -// 6. Re-pack the document.xml into the zip, leaving every other part -// untouched. -// 7. Run the v1 SubmissionRenderer placeholder pass over the assembly -// so `{{path}}` placeholders inside section content (and inside -// the base's untouched chrome) get substituted by the merged bag. -// Cross-run merge in pass 2 handles autocorrect-fragmented -// placeholders the same as v1. -// -// Result: a fully-merged .docx. No new third-party Go dep — reuses -// archive/zip + the existing SubmissionRenderer. 
+// Slice note: the paragraph-level neutral document model (Document / Block +// / Slot) the PRD §3.2 sketches lands in slice 6, where the authoring +// importer and the format exporters actually consume it. Building it now, +// ahead of any consumer, would be speculative and would put the +// byte-identical guarantee at risk for no gain (PRD §4 B3 principle: +// extractions earn their keep this cycle). import ( - "archive/zip" - "bytes" "context" "fmt" - "io" - "regexp" - "sort" - "strings" - "time" + + "mgit.msbls.de/m/paliad/pkg/docforge/docx" ) -// SubmissionComposer assembles base + sections into a final .docx. -// Stateless; safe for concurrent use. +// SubmissionComposer assembles a base + a draft's sections into a final +// .docx. Stateless; safe for concurrent use. type SubmissionComposer struct { - renderer *SubmissionRenderer + inner *docx.Composer } -// NewSubmissionComposer wires the composer. The renderer is required — -// a nil renderer is a programmer error and the composer panics at +// NewSubmissionComposer wires the composer. The renderer is required — a +// nil renderer is a programmer error and the composer panics at // construction. func NewSubmissionComposer(renderer *SubmissionRenderer) *SubmissionComposer { - if renderer == nil { - panic("submission composer: renderer required") - } - return &SubmissionComposer{renderer: renderer} + return &SubmissionComposer{inner: docx.NewComposer(renderer)} } -// ComposeOptions carries the per-call composition inputs. +// ComposeOptions carries the per-call composition inputs in paliad's own +// terms (SubmissionSection rows + the SubmissionBase chrome). type ComposeOptions struct { - // Sections are the draft's section rows in display order. The - // composer renders included sections; excluded rows are dropped. - // Caller is responsible for visibility — by the time the composer - // runs, the section rows have already been gated through - // SubmissionDraftService.Get + can_see_project. 
+ // Sections are the draft's section rows in display order. Included + // sections render; excluded rows are dropped. The caller is + // responsible for visibility — by the time the composer runs the rows + // have already been gated through SubmissionDraftService.Get + + // can_see_project. Sections []SubmissionSection - // Base supplies the document chrome (.docx body host) plus the - // stylemap for the MD walker. Must not be nil. + // Base supplies the document chrome plus the stylemap for the MD + // walker. Must not be nil. Base *SubmissionBase - // BaseBytes is the raw .docx bytes for the base. Typically fetched + // BaseBytes is the raw .docx bytes for the base, typically fetched // from Gitea via the existing template cache. BaseBytes []byte - // Lang ('de' or 'en') selects which content_md_* column the - // composer reads per section. Defaults to 'de' if empty. + // Lang ('de' or 'en') selects which content_md_* column the composer + // reads per section. Defaults to 'de' if empty. Lang string - // Vars is the merged placeholder bag the v1 renderer pass - // substitutes after the composer assembly. Passed straight through - // to SubmissionRenderer.Render. + // Vars is the merged placeholder bag the renderer pass substitutes + // after assembly. Vars PlaceholderMap - // Missing translates an unbound placeholder key into the marker - // the lawyer sees in Word. Passed straight to the renderer. + // Missing translates an unbound placeholder key into the marker the + // lawyer sees in Word. Missing MissingPlaceholderFn } @@ -96,512 +76,24 @@ func (c *SubmissionComposer) Compose(ctx context.Context, opts ComposeOptions) ( if opts.Base == nil { return nil, fmt.Errorf("submission compose: base required") } - _ = ctx // reserved for cancellation propagation in later slices - sections := opts.Sections - - // Pre-pass: strip macros so the base reads as a plain .docx zip. 
- cleanBytes, err := ConvertDotmToDocx(opts.BaseBytes) - if err != nil { - return nil, fmt.Errorf("submission compose: convert base: %w", err) - } - - // Locate + extract word/document.xml so we can splice in-place. - documentXML, otherParts, err := splitBaseZip(cleanBytes) - if err != nil { - return nil, err - } - - // Per-compose hyperlink allocator. Each unique URL gets a fresh - // rId outside the base's existing namespace. The post-pass - // (patchDocumentXMLRels) writes the matching Relationship rows - // before the zip is repacked. Slice D adds inline `[label](url)` - // hyperlink support. - linkAlloc := newComposerLinkAllocator() - - // Build the rendered-section map: section_key → OOXML span. - stylemap := opts.Base.SectionSpec.Stylemap - rendered := make(map[string]string, len(sections)) - keptSections := make([]SubmissionSection, 0, len(sections)) - for _, sec := range sections { - if !sec.Included { - continue + secs := make([]docx.Section, len(opts.Sections)) + for i, s := range opts.Sections { + secs[i] = docx.Section{ + Key: s.SectionKey, + OrderIndex: s.OrderIndex, + Included: s.Included, + ContentMDDE: s.ContentMDDE, + ContentMDEN: s.ContentMDEN, } - md := sec.ContentMDDE - if strings.EqualFold(opts.Lang, "en") { - md = sec.ContentMDEN - } - rendered[sec.SectionKey] = RenderMarkdownToOOXMLWithStyles(md, stylemap, linkAlloc.Alloc) - keptSections = append(keptSections, sec) } - // Stable order — already sorted ascending by ListForDraft, but - // belt-and-braces in case the caller swaps the ordering policy - // later. 
- sort.SliceStable(keptSections, func(i, j int) bool { - return keptSections[i].OrderIndex < keptSections[j].OrderIndex + return c.inner.Compose(ctx, docx.ComposeOptions{ + Sections: secs, + Carrier: docx.Carrier{ + Bytes: opts.BaseBytes, + Stylemap: opts.Base.SectionSpec.Stylemap, + }, + Lang: opts.Lang, + Vars: opts.Vars, + Missing: opts.Missing, }) - - assembledBody := spliceSections(documentXML, rendered, keptSections, sections) - - // Slice D hyperlink patch: when the walker emitted hyperlink rIds - // for inline `[label](url)` links, the base's - // word/_rels/document.xml.rels needs matching - // entries so Word can resolve the rIds. Mutates one zip part in - // otherParts (or appends if missing). - if linkAlloc.HasLinks() { - updatedParts, err := patchDocumentXMLRels(otherParts, linkAlloc.Pairs()) - if err != nil { - return nil, err - } - otherParts = updatedParts - } - - // Re-pack into a zip with the assembled document.xml. All other - // parts (styles, fonts, headers, footers, theme, settings) pass - // through bit-for-bit at their original mtime + compression. - repacked, err := repackBaseZip(otherParts, assembledBody) - if err != nil { - return nil, err - } - - // Final pass: substitute placeholders against the merged bag. The - // existing renderer handles cross-run fragmentation, the `{{rule.X}}` - // alias contract, and the missing-marker emission. Reusing it - // guarantees v1's placeholder grammar stays intact inside section - // content + base chrome. - merged, err := c.renderer.Render(repacked, opts.Vars, opts.Missing) - if err != nil { - return nil, fmt.Errorf("submission compose: placeholder pass: %w", err) - } - return merged, nil -} - -// ───────────────────────────────────────────────────────────────────── -// Section splicing -// ───────────────────────────────────────────────────────────────────── - -// Anchor markers as they appear inside a text node. 
We don't -// need a full XML parse — finding the marker text inside the body is -// sufficient because: -// - {{ and }} are never legitimate document content (placeholders -// follow the same convention everywhere else in paliad). -// - The anchor key grammar [A-Za-z0-9_]+ rules out any HTML/XML -// special characters. -// - Each anchor lives in exactly one ..., which lives in -// exactly one ..., which lives in exactly one -// .... We expand from the marker outward to find the -// enclosing span and drop the entire paragraph as part of -// the splice. -// -// RE2 has no lookahead, so the "find enclosing " logic is -// implemented as manual byte-index search around the marker hit -// (anchorParagraphSpan below) rather than a single regex pattern. - -const ( - anchorOpenPrefix = "{{#section:" - anchorClosePrefix = "{{/section:" - anchorSuffix = "}}" -) - -// anchorKeyRegex validates that the captured anchor key is a clean -// identifier. Keys that include other characters (which can't actually -// appear in our authored .docx) are treated as no match. -var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`) - -// anchorPair records the byte span of one matched anchor pair inside -// the body — from the start of the opening anchor's element -// through the end of the closing anchor's . -type anchorPair struct { - key string - openStart int // start of for the opening anchor - closeEnd int // index just past for the closing anchor -} - -// findAllAnchorPairs scans the body for matched open/close anchor -// pairs. Unbalanced markers (open without close, or vice versa) are -// dropped from the result. Returns pairs in body-order; each pair's -// span is non-overlapping. 
-func findAllAnchorPairs(body string) []anchorPair { - type marker struct { - key string - paraStart int - paraEnd int - isOpen bool - } - var markers []marker - - collect := func(prefix string, isOpen bool) { - offset := 0 - for { - idx := strings.Index(body[offset:], prefix) - if idx < 0 { - return - } - start := offset + idx - suffixIdx := strings.Index(body[start+len(prefix):], anchorSuffix) - if suffixIdx < 0 { - return - } - key := body[start+len(prefix) : start+len(prefix)+suffixIdx] - if !anchorKeyRegex.MatchString(key) { - offset = start + len(prefix) - continue - } - markerEnd := start + len(prefix) + suffixIdx + len(anchorSuffix) - pStart, pEnd, ok := paragraphSpanAround(body, start, markerEnd) - if !ok { - offset = markerEnd - continue - } - markers = append(markers, marker{key: key, paraStart: pStart, paraEnd: pEnd, isOpen: isOpen}) - offset = pEnd - } - } - collect(anchorOpenPrefix, true) - collect(anchorClosePrefix, false) - - // Walk markers in body-order, matching each open with the next - // close that carries the same key. - sort.SliceStable(markers, func(i, j int) bool { - return markers[i].paraStart < markers[j].paraStart - }) - var pairs []anchorPair - openStack := map[string]marker{} - for _, m := range markers { - if m.isOpen { - openStack[m.key] = m - continue - } - o, ok := openStack[m.key] - if !ok { - continue - } - pairs = append(pairs, anchorPair{ - key: m.key, - openStart: o.paraStart, - closeEnd: m.paraEnd, - }) - delete(openStack, m.key) - } - return pairs -} - -// paragraphSpanAround returns the byte span of the smallest `...` -// element that fully contains the byte range [markerStart, markerEnd). -// Returns false when the byte range doesn't sit inside a single -// paragraph (which would mean the marker survived a cross-paragraph -// edit — defensive guard, shouldn't happen in well-formed input). 
-func paragraphSpanAround(body string, markerStart, markerEnd int) (int, int, bool) { - // Walk backwards to find the nearest unclosed opening. - // Since doesn't nest, the nearest 0 { - idx := strings.LastIndex(body[:cursor], "). - if idx+4 <= len(body) { - after := body[idx+4] - if after == ' ' || after == '>' || after == '/' { - // or ; not . - close := strings.Index(body[idx:], ">") - if close < 0 { - return 0, 0, false - } - pStart = idx - break - } - } - cursor = idx - } - if pStart < 0 { - return 0, 0, false - } - // Walk forward to find the matching . doesn't nest so - // the next after the marker is the close. - pEndIdx := strings.Index(body[markerEnd:], "") - if pEndIdx < 0 { - return 0, 0, false - } - pEnd := markerEnd + pEndIdx + len("") - return pStart, pEnd, true -} - -// spliceSections replaces anchor slots with rendered sections and -// appends any unanchored sections before sectPr. Returns the assembled -// document.xml body. -func spliceSections(documentXML []byte, rendered map[string]string, kept []SubmissionSection, all []SubmissionSection) []byte { - body := string(documentXML) - pairs := findAllAnchorPairs(body) - - // Build a lookup of kept section keys for quick membership tests. - keptByKey := map[string]int{} - for i, sec := range kept { - keptByKey[sec.SectionKey] = i - } - allByKey := map[string]int{} - for i, sec := range all { - allByKey[sec.SectionKey] = i - } - - matchedKeys := map[string]bool{} - - // Walk pairs in REVERSE body-order so slice mutations don't shift - // later offsets. - sort.SliceStable(pairs, func(i, j int) bool { - return pairs[i].openStart > pairs[j].openStart - }) - for _, p := range pairs { - replacement := "" - if idx, ok := keptByKey[p.key]; ok { - replacement = rendered[p.key] - matchedKeys[p.key] = true - _ = idx - } else if _, isOnDraft := allByKey[p.key]; isOnDraft { - // Anchor matches an excluded section on the draft — drop - // the entire slot. 
- replacement = "" - } else { - // Anchor doesn't match any section on this draft — drop - // to leave the base's chrome unbroken. - replacement = "" - } - body = body[:p.openStart] + replacement + body[p.closeEnd:] - } - - // Append unanchored sections before sectPr in order_index ASC. - var unanchored strings.Builder - for _, sec := range kept { - if matchedKeys[sec.SectionKey] { - continue - } - unanchored.WriteString(rendered[sec.SectionKey]) - } - if unanchored.Len() > 0 { - body = appendBeforeSectPr(body, unanchored.String()) - } - - return []byte(body) -} - -// appendBeforeSectPr inserts content immediately before the first -// `` if present, else at - // the very end. - idx := strings.LastIndex(body, "") - if idx < 0 { - return body + content - } - return body[:idx] + content + body[idx:] - } - return body[:loc[0]] + content + body[loc[0]:] -} - -// ───────────────────────────────────────────────────────────────────── -// Zip plumbing -// ───────────────────────────────────────────────────────────────────── - -// baseZipPart captures one zip entry we kept aside while extracting -// document.xml. -type baseZipPart struct { - name string - method uint16 - modTime int64 // wall seconds; converted back to time.Time on repack - body []byte -} - -// splitBaseZip extracts document.xml and returns it alongside every -// other zip entry, ready for repacking. 
-func splitBaseZip(cleanBytes []byte) ([]byte, []baseZipPart, error) { - zr, err := zip.NewReader(bytes.NewReader(cleanBytes), int64(len(cleanBytes))) - if err != nil { - return nil, nil, fmt.Errorf("submission compose: open base zip: %w", err) - } - var documentXML []byte - parts := make([]baseZipPart, 0, len(zr.File)) - for _, f := range zr.File { - body, err := readZipEntry(f) - if err != nil { - return nil, nil, fmt.Errorf("submission compose: read %s: %w", f.Name, err) - } - if f.Name == "word/document.xml" { - documentXML = body - parts = append(parts, baseZipPart{name: f.Name, method: f.Method, modTime: f.Modified.Unix(), body: nil}) - continue - } - parts = append(parts, baseZipPart{name: f.Name, method: f.Method, modTime: f.Modified.Unix(), body: body}) - } - if documentXML == nil { - return nil, nil, fmt.Errorf("submission compose: base zip missing word/document.xml") - } - return documentXML, parts, nil -} - -// repackBaseZip rebuilds the zip, swapping document.xml for the -// assembled body and leaving every other part untouched. 
-func repackBaseZip(parts []baseZipPart, assembledBody []byte) ([]byte, error) { - var out bytes.Buffer - zw := zip.NewWriter(&out) - for _, p := range parts { - hdr := &zip.FileHeader{ - Name: p.name, - Method: p.method, - } - if p.modTime > 0 { - hdr.Modified = time.Unix(p.modTime, 0) - } - w, err := zw.CreateHeader(hdr) - if err != nil { - return nil, fmt.Errorf("submission compose: write header %s: %w", p.name, err) - } - body := p.body - if p.name == "word/document.xml" { - body = assembledBody - } - if _, err := w.Write(body); err != nil { - return nil, fmt.Errorf("submission compose: write body %s: %w", p.name, err) - } - } - if err := zw.Close(); err != nil { - return nil, fmt.Errorf("submission compose: finalise zip: %w", err) - } - return out.Bytes(), nil -} - -func readZipEntry(f *zip.File) ([]byte, error) { - rc, err := f.Open() - if err != nil { - return nil, err - } - defer rc.Close() - return io.ReadAll(rc) -} - -// ───────────────────────────────────────────────────────────────────── -// Slice D — hyperlink wiring -// ───────────────────────────────────────────────────────────────────── - -// composerLinkAllocator hands out fresh rIds for inline hyperlink -// targets discovered by the MD walker. Each unique URL gets one rId -// (deduped — repeated links to the same URL share one Relationship). -// Allocations land outside the base's rId namespace by prefixing with -// "rIdComposer" so they can't collide with existing relationships. -type composerLinkAllocator struct { - next int - byURL map[string]string - order []string // URLs in allocation order -} - -func newComposerLinkAllocator() *composerLinkAllocator { - return &composerLinkAllocator{byURL: map[string]string{}} -} - -// Alloc returns the rId for url, allocating one on first sight. 
-func (a *composerLinkAllocator) Alloc(url string) string { - if rid, ok := a.byURL[url]; ok { - return rid - } - a.next++ - rid := fmt.Sprintf("rIdComposer%d", a.next) - a.byURL[url] = rid - a.order = append(a.order, url) - return rid -} - -// HasLinks reports whether any links were allocated during this compose. -func (a *composerLinkAllocator) HasLinks() bool { - return len(a.order) > 0 -} - -// Pairs returns the (rId, URL) pairs in allocation order. The -// document.xml.rels patcher consumes this to emit -// elements. -func (a *composerLinkAllocator) Pairs() [][2]string { - pairs := make([][2]string, 0, len(a.order)) - for _, url := range a.order { - pairs = append(pairs, [2]string{a.byURL[url], url}) - } - return pairs -} - -// patchDocumentXMLRels mutates the word/_rels/document.xml.rels entry -// in `parts` to append the given (rId, URL) pairs as hyperlink -// relationships. If the rels part doesn't exist (some bases omit it -// when the body has no relationships), this function appends a fresh -// part with the minimal Relationships wrapper. -// -// Idempotent on (rId, URL) pairs already present (e.g. when a base -// already references the URL for some other reason). -// -// Returns the (possibly extended) parts slice — callers must overwrite -// their reference because the append in the no-rels-yet case grows the -// backing array. 
-func patchDocumentXMLRels(parts []baseZipPart, pairs [][2]string) ([]baseZipPart, error) { - const path = "word/_rels/document.xml.rels" - const hyperlinkType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" - - existingIdx := -1 - for i := range parts { - if parts[i].name == path { - existingIdx = i - break - } - } - - var body string - if existingIdx >= 0 { - body = string(parts[existingIdx].body) - } else { - body = `` + - `` - } - - var inserts strings.Builder - for _, p := range pairs { - rid := p[0] - url := p[1] - if strings.Contains(body, `Id="`+rid+`"`) { - continue - } - inserts.WriteString(``) - } - - if inserts.Len() == 0 { - return parts, nil - } - - closeIdx := strings.LastIndex(body, "") - if closeIdx < 0 { - return parts, fmt.Errorf("submission compose: malformed document.xml.rels (no closing tag)") - } - patched := body[:closeIdx] + inserts.String() + body[closeIdx:] - - if existingIdx >= 0 { - parts[existingIdx].body = []byte(patched) - return parts, nil - } - parts = append(parts, baseZipPart{ - name: path, - method: zip.Deflate, - modTime: time.Now().Unix(), - body: []byte(patched), - }) - return parts, nil } diff --git a/pkg/docforge/docx/compose.go b/pkg/docforge/docx/compose.go new file mode 100644 index 0000000..1b3f4b2 --- /dev/null +++ b/pkg/docforge/docx/compose.go @@ -0,0 +1,634 @@ +package docx + +// Composer render pipeline — t-paliad-313 Slice B (design doc §9.1 + +// §9.2). Assembles a base .docx and a draft's section rows into a +// merged .docx ready for export. +// +// Pipeline (high-level): +// +// 1. ConvertDotmToDocx pre-pass on the base bytes (idempotent on .docx). +// 2. Locate `word/document.xml` inside the zip; pull the body XML. +// 3. For each section in the draft (order_index ASC, included=true): +// render content_md_ → OOXML via RenderMarkdownToOOXML using +// base.section_spec.stylemap.paragraph. +// 4. Splice the rendered OOXML into the base body. 
Two splice modes: +// - Anchor mode: when the body carries `{{#section:KEY}}` / +// `{{/section:KEY}}` marker pairs, replace the slot's content +// (including the anchor paragraphs themselves) with the rendered +// section. +// - Append mode: when no anchor pair is found for a section, the +// rendered OOXML appends at the end of the body, just before any +// `<w:sectPr>` element. Sections with `included=false` are +// dropped silently. +// 5. Strip any leftover unmatched anchor paragraphs. +// 6. Re-pack the document.xml into the zip, leaving every other part +// untouched. +// 7. Run the v1 SubmissionRenderer placeholder pass over the assembly +// so `{{path}}` placeholders inside section content (and inside +// the base's untouched chrome) get substituted by the merged bag. +// Cross-run merge in pass 2 handles autocorrect-fragmented +// placeholders the same as v1. +// +// Result: a fully-merged .docx. No new third-party Go dep — reuses +// archive/zip + the existing SubmissionRenderer. + +import ( + "archive/zip" + "bytes" + "context" + "fmt" + "io" + "regexp" + "sort" + "strings" + "time" +) + +// Composer assembles base + sections into a final .docx. +// Stateless; safe for concurrent use. +type Composer struct { + renderer *SubmissionRenderer +} + +// NewComposer wires the composer. The renderer is required — +// a nil renderer is a programmer error and the composer panics at +// construction. +func NewComposer(renderer *SubmissionRenderer) *Composer { + if renderer == nil { + panic("submission composer: renderer required") + } + return &Composer{renderer: renderer} +} + +// Carrier is the opaque base document the composer splices rendered +// content into. Its bytes are preserved verbatim outside the regions the +// splice touches — the {{#section:KEY}} anchor paragraphs and the +// {{placeholder}} tokens — so the firm's letterhead, styles, headers, and +// footers survive a compose byte-for-byte.
This is the docforge "carrier" +// for the .docx format: the lossless host for editable content. +type Carrier struct { + // Bytes is the raw base .docx. May be a .dotm/.docm/.dotx; Compose + // runs ConvertDotmToDocx on it first (idempotent on a plain .docx). + Bytes []byte + + // Stylemap maps a logical block kind (paragraph, heading_1/2/3, + // list_bullet, list_numbered, blockquote) to the Word paragraph + // style name the base defines for it. Drives the Markdown walker's + // . Missing entries fall back to the "paragraph" style. + Stylemap map[string]string +} + +// Section is one editable content block the composer renders and splices. +// It is the format-neutral input the docforge engine consumes; the +// consuming application maps its own row type onto it (paliad maps +// SubmissionSection → Section). +type Section struct { + // Key matches a {{#section:KEY}} anchor in the carrier, or — when no + // anchor matches — marks an append-mode section. + Key string + // OrderIndex sets append-mode ordering (ascending). + OrderIndex int + // Included=false drops the section entirely. + Included bool + // ContentMDDE / ContentMDEN are the bilingual Markdown sources; Lang + // selects which one renders. + ContentMDDE string + ContentMDEN string +} + +// ComposeOptions carries the per-call composition inputs. +type ComposeOptions struct { + // Sections are the draft's section rows in display order. The + // composer renders included sections; excluded rows are dropped. + // Caller is responsible for visibility — by the time the composer + // runs, the section rows have already been gated by the caller. + Sections []Section + + // Carrier is the base .docx chrome plus its stylemap. Required. + Carrier Carrier + + // Lang ('de' or 'en') selects which content_md_* column the + // composer reads per section. Defaults to 'de' if empty. + Lang string + + // Vars is the merged placeholder bag the v1 renderer pass + // substitutes after the composer assembly. 
Passed straight through + // to SubmissionRenderer.Render. + Vars PlaceholderMap + + // Missing translates an unbound placeholder key into the marker + // the lawyer sees in Word. Passed straight to the renderer. + Missing MissingPlaceholderFn +} + +// Compose runs the full pipeline and returns the merged .docx bytes. +func (c *Composer) Compose(ctx context.Context, opts ComposeOptions) ([]byte, error) { + _ = ctx // reserved for cancellation propagation in later slices + sections := opts.Sections + + // Pre-pass: strip macros so the base reads as a plain .docx zip. + cleanBytes, err := ConvertDotmToDocx(opts.Carrier.Bytes) + if err != nil { + return nil, fmt.Errorf("submission compose: convert base: %w", err) + } + + // Locate + extract word/document.xml so we can splice in-place. + documentXML, otherParts, err := splitBaseZip(cleanBytes) + if err != nil { + return nil, err + } + + // Per-compose hyperlink allocator. Each unique URL gets a fresh + // rId outside the base's existing namespace. The post-pass + // (patchDocumentXMLRels) writes the matching Relationship rows + // before the zip is repacked. Slice D adds inline `[label](url)` + // hyperlink support. + linkAlloc := newComposerLinkAllocator() + + // Build the rendered-section map: section_key → OOXML span. + stylemap := opts.Carrier.Stylemap + rendered := make(map[string]string, len(sections)) + keptSections := make([]Section, 0, len(sections)) + for _, sec := range sections { + if !sec.Included { + continue + } + md := sec.ContentMDDE + if strings.EqualFold(opts.Lang, "en") { + md = sec.ContentMDEN + } + rendered[sec.Key] = RenderMarkdownToOOXMLWithStyles(md, stylemap, linkAlloc.Alloc) + keptSections = append(keptSections, sec) + } + // Stable order — already sorted ascending by ListForDraft, but + // belt-and-braces in case the caller swaps the ordering policy + // later. 
+ sort.SliceStable(keptSections, func(i, j int) bool { + return keptSections[i].OrderIndex < keptSections[j].OrderIndex + }) + + assembledBody := spliceSections(documentXML, rendered, keptSections, sections) + + // Slice D hyperlink patch: when the walker emitted hyperlink rIds + // for inline `[label](url)` links, the base's + // word/_rels/document.xml.rels needs matching + // entries so Word can resolve the rIds. Mutates one zip part in + // otherParts (or appends if missing). + if linkAlloc.HasLinks() { + updatedParts, err := patchDocumentXMLRels(otherParts, linkAlloc.Pairs()) + if err != nil { + return nil, err + } + otherParts = updatedParts + } + + // Re-pack into a zip with the assembled document.xml. All other + // parts (styles, fonts, headers, footers, theme, settings) pass + // through bit-for-bit at their original mtime + compression. + repacked, err := repackBaseZip(otherParts, assembledBody) + if err != nil { + return nil, err + } + + // Final pass: substitute placeholders against the merged bag. The + // existing renderer handles cross-run fragmentation, the `{{rule.X}}` + // alias contract, and the missing-marker emission. Reusing it + // guarantees v1's placeholder grammar stays intact inside section + // content + base chrome. + merged, err := c.renderer.Render(repacked, opts.Vars, opts.Missing) + if err != nil { + return nil, fmt.Errorf("submission compose: placeholder pass: %w", err) + } + return merged, nil +} + +// ───────────────────────────────────────────────────────────────────── +// Section splicing +// ───────────────────────────────────────────────────────────────────── + +// Anchor markers as they appear inside a text node. We don't +// need a full XML parse — finding the marker text inside the body is +// sufficient because: +// - {{ and }} are never legitimate document content (placeholders +// follow the same convention everywhere else in paliad). 
//   - The anchor key grammar [A-Za-z0-9_]+ rules out any HTML/XML
//     special characters.
//   - Each anchor lives in exactly one <w:t>...</w:t>, which lives in
//     exactly one <w:r>...</w:r>, which lives in exactly one
//     <w:p>...</w:p>. We expand from the marker outward to find the
//     enclosing <w:p> span and drop the entire paragraph as part of
//     the splice.
//
// RE2 has no lookahead, so the "find enclosing <w:p>" logic is
// implemented as manual byte-index search around the marker hit
// (paragraphSpanAround below) rather than a single regex pattern.

const (
	// anchorOpenPrefix starts an opening marker: {{#section:KEY}}.
	anchorOpenPrefix = "{{#section:"
	// anchorClosePrefix starts a closing marker: {{/section:KEY}}.
	anchorClosePrefix = "{{/section:"
	// anchorSuffix terminates both marker kinds.
	anchorSuffix = "}}"
)

// anchorKeyRegex validates that the captured anchor key is a clean
// identifier (ASCII letters, digits and underscore only). Keys that
// include other characters (which can't actually appear in our
// authored .docx) are treated as no match.
var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`)

// anchorPair records the byte span of one matched anchor pair inside
// the body — from the start of the opening anchor's <w:p> element
// through the end of the closing anchor's </w:p>.
type anchorPair struct {
	key       string
	openStart int // start of <w:p> for the opening anchor
	closeEnd  int // index just past </w:p> for the closing anchor
}

// findAllAnchorPairs scans the body for matched open/close anchor
// pairs. Unbalanced markers (open without close, or vice versa) are
// dropped from the result. Pairs are reported by position in the body,
// and their spans are meant to be non-overlapping: spliceSections
// rewrites the body back-to-front using these offsets and depends on
// that property.
+func findAllAnchorPairs(body string) []anchorPair { + type marker struct { + key string + paraStart int + paraEnd int + isOpen bool + } + var markers []marker + + collect := func(prefix string, isOpen bool) { + offset := 0 + for { + idx := strings.Index(body[offset:], prefix) + if idx < 0 { + return + } + start := offset + idx + suffixIdx := strings.Index(body[start+len(prefix):], anchorSuffix) + if suffixIdx < 0 { + return + } + key := body[start+len(prefix) : start+len(prefix)+suffixIdx] + if !anchorKeyRegex.MatchString(key) { + offset = start + len(prefix) + continue + } + markerEnd := start + len(prefix) + suffixIdx + len(anchorSuffix) + pStart, pEnd, ok := paragraphSpanAround(body, start, markerEnd) + if !ok { + offset = markerEnd + continue + } + markers = append(markers, marker{key: key, paraStart: pStart, paraEnd: pEnd, isOpen: isOpen}) + offset = pEnd + } + } + collect(anchorOpenPrefix, true) + collect(anchorClosePrefix, false) + + // Walk markers in body-order, matching each open with the next + // close that carries the same key. + sort.SliceStable(markers, func(i, j int) bool { + return markers[i].paraStart < markers[j].paraStart + }) + var pairs []anchorPair + openStack := map[string]marker{} + for _, m := range markers { + if m.isOpen { + openStack[m.key] = m + continue + } + o, ok := openStack[m.key] + if !ok { + continue + } + pairs = append(pairs, anchorPair{ + key: m.key, + openStart: o.paraStart, + closeEnd: m.paraEnd, + }) + delete(openStack, m.key) + } + return pairs +} + +// paragraphSpanAround returns the byte span of the smallest `...` +// element that fully contains the byte range [markerStart, markerEnd). +// Returns false when the byte range doesn't sit inside a single +// paragraph (which would mean the marker survived a cross-paragraph +// edit — defensive guard, shouldn't happen in well-formed input). 
+func paragraphSpanAround(body string, markerStart, markerEnd int) (int, int, bool) { + // Walk backwards to find the nearest unclosed opening. + // Since doesn't nest, the nearest 0 { + idx := strings.LastIndex(body[:cursor], "). + if idx+4 <= len(body) { + after := body[idx+4] + if after == ' ' || after == '>' || after == '/' { + // or ; not . + close := strings.Index(body[idx:], ">") + if close < 0 { + return 0, 0, false + } + pStart = idx + break + } + } + cursor = idx + } + if pStart < 0 { + return 0, 0, false + } + // Walk forward to find the matching . doesn't nest so + // the next after the marker is the close. + pEndIdx := strings.Index(body[markerEnd:], "") + if pEndIdx < 0 { + return 0, 0, false + } + pEnd := markerEnd + pEndIdx + len("") + return pStart, pEnd, true +} + +// spliceSections replaces anchor slots with rendered sections and +// appends any unanchored sections before sectPr. Returns the assembled +// document.xml body. +func spliceSections(documentXML []byte, rendered map[string]string, kept []Section, all []Section) []byte { + body := string(documentXML) + pairs := findAllAnchorPairs(body) + + // Build a lookup of kept section keys for quick membership tests. + keptByKey := map[string]int{} + for i, sec := range kept { + keptByKey[sec.Key] = i + } + allByKey := map[string]int{} + for i, sec := range all { + allByKey[sec.Key] = i + } + + matchedKeys := map[string]bool{} + + // Walk pairs in REVERSE body-order so slice mutations don't shift + // later offsets. + sort.SliceStable(pairs, func(i, j int) bool { + return pairs[i].openStart > pairs[j].openStart + }) + for _, p := range pairs { + replacement := "" + if idx, ok := keptByKey[p.key]; ok { + replacement = rendered[p.key] + matchedKeys[p.key] = true + _ = idx + } else if _, isOnDraft := allByKey[p.key]; isOnDraft { + // Anchor matches an excluded section on the draft — drop + // the entire slot. 
+ replacement = "" + } else { + // Anchor doesn't match any section on this draft — drop + // to leave the base's chrome unbroken. + replacement = "" + } + body = body[:p.openStart] + replacement + body[p.closeEnd:] + } + + // Append unanchored sections before sectPr in order_index ASC. + var unanchored strings.Builder + for _, sec := range kept { + if matchedKeys[sec.Key] { + continue + } + unanchored.WriteString(rendered[sec.Key]) + } + if unanchored.Len() > 0 { + body = appendBeforeSectPr(body, unanchored.String()) + } + + return []byte(body) +} + +// appendBeforeSectPr inserts content immediately before the first +// `` if present, else at + // the very end. + idx := strings.LastIndex(body, "") + if idx < 0 { + return body + content + } + return body[:idx] + content + body[idx:] + } + return body[:loc[0]] + content + body[loc[0]:] +} + +// ───────────────────────────────────────────────────────────────────── +// Zip plumbing +// ───────────────────────────────────────────────────────────────────── + +// baseZipPart captures one zip entry we kept aside while extracting +// document.xml. +type baseZipPart struct { + name string + method uint16 + modTime int64 // wall seconds; converted back to time.Time on repack + body []byte +} + +// splitBaseZip extracts document.xml and returns it alongside every +// other zip entry, ready for repacking. 
+func splitBaseZip(cleanBytes []byte) ([]byte, []baseZipPart, error) { + zr, err := zip.NewReader(bytes.NewReader(cleanBytes), int64(len(cleanBytes))) + if err != nil { + return nil, nil, fmt.Errorf("submission compose: open base zip: %w", err) + } + var documentXML []byte + parts := make([]baseZipPart, 0, len(zr.File)) + for _, f := range zr.File { + body, err := readZipEntry(f) + if err != nil { + return nil, nil, fmt.Errorf("submission compose: read %s: %w", f.Name, err) + } + if f.Name == "word/document.xml" { + documentXML = body + parts = append(parts, baseZipPart{name: f.Name, method: f.Method, modTime: f.Modified.Unix(), body: nil}) + continue + } + parts = append(parts, baseZipPart{name: f.Name, method: f.Method, modTime: f.Modified.Unix(), body: body}) + } + if documentXML == nil { + return nil, nil, fmt.Errorf("submission compose: base zip missing word/document.xml") + } + return documentXML, parts, nil +} + +// repackBaseZip rebuilds the zip, swapping document.xml for the +// assembled body and leaving every other part untouched. 
+func repackBaseZip(parts []baseZipPart, assembledBody []byte) ([]byte, error) { + var out bytes.Buffer + zw := zip.NewWriter(&out) + for _, p := range parts { + hdr := &zip.FileHeader{ + Name: p.name, + Method: p.method, + } + if p.modTime > 0 { + hdr.Modified = time.Unix(p.modTime, 0) + } + w, err := zw.CreateHeader(hdr) + if err != nil { + return nil, fmt.Errorf("submission compose: write header %s: %w", p.name, err) + } + body := p.body + if p.name == "word/document.xml" { + body = assembledBody + } + if _, err := w.Write(body); err != nil { + return nil, fmt.Errorf("submission compose: write body %s: %w", p.name, err) + } + } + if err := zw.Close(); err != nil { + return nil, fmt.Errorf("submission compose: finalise zip: %w", err) + } + return out.Bytes(), nil +} + +func readZipEntry(f *zip.File) ([]byte, error) { + rc, err := f.Open() + if err != nil { + return nil, err + } + defer rc.Close() + return io.ReadAll(rc) +} + +// ───────────────────────────────────────────────────────────────────── +// Slice D — hyperlink wiring +// ───────────────────────────────────────────────────────────────────── + +// composerLinkAllocator hands out fresh rIds for inline hyperlink +// targets discovered by the MD walker. Each unique URL gets one rId +// (deduped — repeated links to the same URL share one Relationship). +// Allocations land outside the base's rId namespace by prefixing with +// "rIdComposer" so they can't collide with existing relationships. +type composerLinkAllocator struct { + next int + byURL map[string]string + order []string // URLs in allocation order +} + +func newComposerLinkAllocator() *composerLinkAllocator { + return &composerLinkAllocator{byURL: map[string]string{}} +} + +// Alloc returns the rId for url, allocating one on first sight. 
+func (a *composerLinkAllocator) Alloc(url string) string { + if rid, ok := a.byURL[url]; ok { + return rid + } + a.next++ + rid := fmt.Sprintf("rIdComposer%d", a.next) + a.byURL[url] = rid + a.order = append(a.order, url) + return rid +} + +// HasLinks reports whether any links were allocated during this compose. +func (a *composerLinkAllocator) HasLinks() bool { + return len(a.order) > 0 +} + +// Pairs returns the (rId, URL) pairs in allocation order. The +// document.xml.rels patcher consumes this to emit +// elements. +func (a *composerLinkAllocator) Pairs() [][2]string { + pairs := make([][2]string, 0, len(a.order)) + for _, url := range a.order { + pairs = append(pairs, [2]string{a.byURL[url], url}) + } + return pairs +} + +// patchDocumentXMLRels mutates the word/_rels/document.xml.rels entry +// in `parts` to append the given (rId, URL) pairs as hyperlink +// relationships. If the rels part doesn't exist (some bases omit it +// when the body has no relationships), this function appends a fresh +// part with the minimal Relationships wrapper. +// +// Idempotent on (rId, URL) pairs already present (e.g. when a base +// already references the URL for some other reason). +// +// Returns the (possibly extended) parts slice — callers must overwrite +// their reference because the append in the no-rels-yet case grows the +// backing array. 
+func patchDocumentXMLRels(parts []baseZipPart, pairs [][2]string) ([]baseZipPart, error) { + const path = "word/_rels/document.xml.rels" + const hyperlinkType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" + + existingIdx := -1 + for i := range parts { + if parts[i].name == path { + existingIdx = i + break + } + } + + var body string + if existingIdx >= 0 { + body = string(parts[existingIdx].body) + } else { + body = `` + + `` + } + + var inserts strings.Builder + for _, p := range pairs { + rid := p[0] + url := p[1] + if strings.Contains(body, `Id="`+rid+`"`) { + continue + } + inserts.WriteString(``) + } + + if inserts.Len() == 0 { + return parts, nil + } + + closeIdx := strings.LastIndex(body, "") + if closeIdx < 0 { + return parts, fmt.Errorf("submission compose: malformed document.xml.rels (no closing tag)") + } + patched := body[:closeIdx] + inserts.String() + body[closeIdx:] + + if existingIdx >= 0 { + parts[existingIdx].body = []byte(patched) + return parts, nil + } + parts = append(parts, baseZipPart{ + name: path, + method: zip.Deflate, + modTime: time.Now().Unix(), + body: []byte(patched), + }) + return parts, nil +} diff --git a/pkg/docforge/docx/markdown.go b/pkg/docforge/docx/markdown.go index 120ffc9..812b01a 100644 --- a/pkg/docforge/docx/markdown.go +++ b/pkg/docforge/docx/markdown.go @@ -492,14 +492,6 @@ func xmlTextEscape(s string) string { return s } -// XMLAttrEscape is the exported form of xmlAttrEscape, used by the -// paliad-side composer (submission_compose.go) when it builds hyperlink -// relationship inserts. It exists so the composer can reuse the exact -// attribute-escaping the walker applies without reaching across the -// package boundary for an unexported helper. Slice 2 folds the -// composer's splice into this package, after which the wrapper retires. -func XMLAttrEscape(s string) string { return xmlAttrEscape(s) } - // xmlAttrEscape escapes for safe insertion into an attribute value // (e.g. 
``). func xmlAttrEscape(s string) string {