paliad/pkg/docforge/docx/compose.go

package docx

// Composer render pipeline — t-paliad-313 Slice B (design doc §9.1 +
// §9.2). Assembles a base .docx and a draft's section rows into a
// merged .docx ready for export.
//
// Pipeline (high-level):
//
//  1. ConvertDotmToDocx pre-pass on the base bytes (idempotent on .docx).
//  2. Locate `word/document.xml` inside the zip; pull the body XML.
//  3. For each section in the draft (order_index ASC, included=true):
//     render content_md_<lang> → OOXML via
//     RenderMarkdownToOOXMLWithStyles using the carrier's stylemap.
//  4. Splice the rendered OOXML into the base body. Two splice modes:
//     - Anchor mode: when the body carries `{{#section:KEY}}` /
//       `{{/section:KEY}}` marker pairs, replace the slot's content
//       (including the anchor paragraphs themselves) with the rendered
//       section.
//     - Append mode: when no anchor pair is found for a section, the
//       rendered OOXML appends at the end of the body, just before any
//       `<w:sectPr>` element. Sections with `included=false` are
//       dropped silently.
//  5. Strip any leftover unmatched anchor paragraphs.
//  6. Re-pack the document.xml into the zip, leaving every other part
//     untouched.
//  7. Run the v1 SubmissionRenderer placeholder pass over the assembly
//     so `{{path}}` placeholders inside section content (and inside
//     the base's untouched chrome) get substituted by the merged bag.
//     Cross-run merge in pass 2 handles autocorrect-fragmented
//     placeholders the same as v1.
//
// Result: a fully-merged .docx. No new third-party Go dep — reuses
// archive/zip + the existing SubmissionRenderer.

import (
	"archive/zip"
	"bytes"
	"context"
	"fmt"
	"io"
	"regexp"
	"sort"
	"strings"
	"time"

	"mgit.msbls.de/m/paliad/pkg/docforge"
)

// Composer merges a carrier document with rendered sections and emits
// the final .docx. It keeps no per-call state: its only field is the
// placeholder renderer supplied at construction. Stateless; intended to
// be safe for concurrent use (this also relies on the renderer being
// safe to share, which is not visible from this file).
type Composer struct {
	renderer *SubmissionRenderer
}

// NewComposer builds a Composer around renderer. The renderer is
// mandatory: handing in nil is a programmer error, so construction
// panics rather than returning a Composer that would fail later.
func NewComposer(renderer *SubmissionRenderer) *Composer {
	if renderer != nil {
		return &Composer{renderer: renderer}
	}
	panic("submission composer: renderer required")
}

// Carrier is the opaque base document the composer splices rendered
// content into. Its bytes are preserved verbatim outside the regions the
// splice touches — the {{#section:KEY}} anchor paragraphs and the
// {{placeholder}} tokens — so the firm's letterhead, styles, headers, and
// footers survive a compose byte-for-byte. This is the docforge "carrier"
// for the .docx format: the lossless host for editable content.
//
// Both fields are read by Compose only; the composer never writes back
// into a Carrier value.
type Carrier struct {
	// Bytes is the raw base .docx. May be a .dotm/.docm/.dotx; Compose
	// runs ConvertDotmToDocx on it first (idempotent on a plain .docx).
	// The converted bytes must be a zip containing word/document.xml,
	// otherwise Compose returns an error.
	Bytes []byte

	// Stylemap maps a logical block kind (paragraph, heading_1/2/3,
	// list_bullet, list_numbered, blockquote) to the Word paragraph
	// style name the base defines for it. Drives the Markdown walker's
	// <w:pStyle>. Missing entries fall back to the "paragraph" style.
	// Compose hands the map unchanged to RenderMarkdownToOOXMLWithStyles;
	// the fallback itself is implemented by that renderer, not here.
	Stylemap map[string]string
}

// Section is one editable content block the composer renders and splices.
// It is the format-neutral input the docforge engine consumes; the
// consuming application maps its own row type onto it (paliad maps
// SubmissionSection → Section).
type Section struct {
	// Key matches a {{#section:KEY}} anchor in the carrier, or — when no
	// anchor matches — marks an append-mode section. Compose stores the
	// rendered output per key, so two included sections sharing a key
	// end up with the content of whichever was rendered last.
	Key string
	// OrderIndex sets append-mode ordering (ascending). Compose
	// stable-sorts the included sections by this value, so equal
	// indexes keep their input order.
	OrderIndex int
	// Included=false drops the section entirely: it is never rendered,
	// and an anchor slot carrying its key is removed from the body.
	Included bool
	// ContentMDDE / ContentMDEN are the bilingual Markdown sources;
	// ComposeOptions.Lang selects which one renders ("en",
	// case-insensitive, picks ContentMDEN; anything else ContentMDDE).
	ContentMDDE string
	ContentMDEN string
}

// ComposeOptions carries the per-call composition inputs.
type ComposeOptions struct {
	// Sections are the draft's section rows in display order. The
	// composer renders included sections; excluded rows are dropped.
	// Caller is responsible for visibility — by the time the composer
	// runs, the section rows have already been gated by the caller.
	// Compose re-sorts the included rows by OrderIndex (stable), so
	// the input order only breaks ties.
	Sections []Section

	// Carrier is the base .docx chrome plus its stylemap. Required.
	Carrier Carrier

	// Lang ('de' or 'en') selects which content_md_* column the
	// composer reads per section. Only "en" (compared
	// case-insensitively) selects the English column; every other
	// value, including the empty string, renders the German one.
	Lang string

	// Vars is the merged placeholder bag the v1 renderer pass
	// substitutes after the composer assembly. Passed straight through
	// to SubmissionRenderer.Render.
	Vars docforge.PlaceholderMap

	// Missing translates an unbound placeholder key into the marker
	// the lawyer sees in Word. Passed straight to the renderer.
	Missing docforge.MissingPlaceholderFn
}

// Compose executes the whole pipeline for one draft and returns the
// merged .docx bytes.
//
// Stages, in order: macro-strip the carrier, split word/document.xml
// from the other zip parts, render every included section to OOXML,
// splice the rendered sections into the body, register hyperlink
// relationships (only when links were emitted), repack the zip, and
// finally run the placeholder renderer over the result. Any stage
// error aborts the compose and is returned with a nil byte slice.
func (c *Composer) Compose(ctx context.Context, opts ComposeOptions) ([]byte, error) {
	_ = ctx // reserved for cancellation propagation in later slices

	// Pre-pass: strip macros so the carrier reads as a plain .docx zip.
	plainDocx, err := ConvertDotmToDocx(opts.Carrier.Bytes)
	if err != nil {
		return nil, fmt.Errorf("submission compose: convert base: %w", err)
	}

	// Pull word/document.xml out so it can be spliced; every other
	// part is set aside for the repack.
	docXML, parts, err := splitBaseZip(plainDocx)
	if err != nil {
		return nil, err
	}

	// One hyperlink allocator per compose: each distinct URL the
	// Markdown walker meets gets its own relationship id, and the rels
	// patch below writes the matching Relationship rows.
	links := newComposerLinkAllocator()

	// Render each included section once, keyed by section key. Only
	// "en" (any casing) switches to the English source.
	useEnglish := strings.EqualFold(opts.Lang, "en")
	ooxmlByKey := make(map[string]string, len(opts.Sections))
	included := make([]Section, 0, len(opts.Sections))
	for i := range opts.Sections {
		sec := opts.Sections[i]
		if !sec.Included {
			continue
		}
		source := sec.ContentMDDE
		if useEnglish {
			source = sec.ContentMDEN
		}
		ooxmlByKey[sec.Key] = RenderMarkdownToOOXMLWithStyles(source, opts.Carrier.Stylemap, links.Alloc)
		included = append(included, sec)
	}

	// Callers normally deliver the rows already ordered; sorting here
	// keeps append-mode output stable even if that policy changes.
	sort.SliceStable(included, func(a, b int) bool {
		return included[a].OrderIndex < included[b].OrderIndex
	})

	assembled := spliceSections(docXML, ooxmlByKey, included, opts.Sections)

	// Inline links reference relationship ids that must exist in
	// word/_rels/document.xml.rels, so patch (or create) that part
	// before the zip is rebuilt.
	if links.HasLinks() {
		if parts, err = patchDocumentXMLRels(parts, links.Pairs()); err != nil {
			return nil, err
		}
	}

	// Rebuild the zip around the assembled document.xml; the other
	// parts are written back with their recorded name, method and
	// modification time.
	repacked, err := repackBaseZip(parts, assembled)
	if err != nil {
		return nil, err
	}

	// Last pass: the v1 renderer substitutes {{path}} placeholders in
	// both the spliced section content and the carrier's own chrome,
	// which keeps the placeholder grammar identical to v1.
	merged, err := c.renderer.Render(repacked, opts.Vars, opts.Missing)
	if err != nil {
		return nil, fmt.Errorf("submission compose: placeholder pass: %w", err)
	}
	return merged, nil
}

// ─────────────────────────────────────────────────────────────────────
// Section splicing
// ─────────────────────────────────────────────────────────────────────

// Anchor markers as they appear inside a <w:t> text node. We don't
// need a full XML parse — finding the marker text inside the body is
// sufficient because:
//   - {{ and }} are never legitimate document content (placeholders
//     follow the same convention everywhere else in paliad).
//   - The anchor key grammar [A-Za-z0-9_]+ rules out any HTML/XML
//     special characters.
//   - Each anchor lives in exactly one <w:t>...</w:t>, which lives in
//     exactly one <w:r>...</w:r>, which lives in exactly one
//     <w:p>...</w:p>. We expand from the marker outward to find the
//     enclosing <w:p> span and drop the entire paragraph as part of
//     the splice.
//
// RE2 has no lookahead, so the "find enclosing <w:p>" logic is
// implemented as manual byte-index search around the marker hit
// (anchorParagraphSpan below) rather than a single regex pattern.

// Literal pieces of a section anchor as typed into the carrier: an
// opening anchor reads {{#section:KEY}}, a closing one {{/section:KEY}}.
const (
	anchorOpenPrefix  = "{{#section:"
	anchorClosePrefix = "{{/section:"
	anchorSuffix      = "}}"
)

// anchorKeyRegex validates that the captured anchor key is a clean
// identifier. Keys that include other characters (which can't actually
// appear in our authored .docx) are treated as no match. The pattern is
// anchored at both ends, so the whole captured key — and therefore a
// non-empty key — must consist of ASCII letters, digits or underscores.
var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`)

// anchorPair records the byte span of one matched anchor pair inside
// the body — from the start of the opening anchor's <w:p> element
// through the end of the closing anchor's </w:p>. Offsets index into
// the body string the pair was found in and are only valid until that
// string is modified.
type anchorPair struct {
	key       string // section key shared by the opening and closing anchor
	openStart int    // start of <w:p> for the opening anchor
	closeEnd  int    // index just past </w:p> for the closing anchor
}

// findAllAnchorPairs scans the body for matched open/close anchor
// pairs. Unbalanced markers (open without close, or vice versa) are
// dropped from the result. Returns pairs in body-order; each pair's
// span is non-overlapping.
//
// Matching is per key: every closing anchor pairs with the most recent
// still-unmatched opening anchor of the same key. When anchors of
// different keys nest or cross, the resulting spans would overlap; in
// that case only the pair that starts first is returned and the pairs
// it overlaps are discarded, so callers can splice every returned span
// independently. Returns nil when the body holds no complete pair.
func findAllAnchorPairs(body string) []anchorPair {
	type marker struct {
		key       string
		paraStart int
		paraEnd   int
		isOpen    bool
	}
	var markers []marker

	// collect records every well-formed marker that starts with prefix,
	// together with the span of the paragraph that encloses it.
	collect := func(prefix string, isOpen bool) {
		offset := 0
		for {
			idx := strings.Index(body[offset:], prefix)
			if idx < 0 {
				return
			}
			start := offset + idx
			keyStart := start + len(prefix)
			suffixIdx := strings.Index(body[keyStart:], anchorSuffix)
			if suffixIdx < 0 {
				// No terminator anywhere after this prefix, so no
				// later marker can be complete either.
				return
			}
			key := body[keyStart : keyStart+suffixIdx]
			if !anchorKeyRegex.MatchString(key) {
				// Not an anchor (e.g. the "}}" belongs to something
				// further on); resume right after the prefix.
				offset = keyStart
				continue
			}
			markerEnd := keyStart + suffixIdx + len(anchorSuffix)
			pStart, pEnd, ok := paragraphSpanAround(body, start, markerEnd)
			if !ok {
				offset = markerEnd
				continue
			}
			markers = append(markers, marker{key: key, paraStart: pStart, paraEnd: pEnd, isOpen: isOpen})
			// Continue after the paragraph: it is consumed as a whole,
			// so further markers inside it are irrelevant.
			offset = pEnd
		}
	}
	collect(anchorOpenPrefix, true)
	collect(anchorClosePrefix, false)

	// Walk markers in body-order, matching each close with the latest
	// pending open that carries the same key. The sort is stable, so an
	// open and a close sharing one paragraph keep open-before-close.
	sort.SliceStable(markers, func(i, j int) bool {
		return markers[i].paraStart < markers[j].paraStart
	})
	var pairs []anchorPair
	pendingOpen := map[string]marker{}
	for _, m := range markers {
		if m.isOpen {
			pendingOpen[m.key] = m
			continue
		}
		o, ok := pendingOpen[m.key]
		if !ok {
			continue
		}
		pairs = append(pairs, anchorPair{
			key:       m.key,
			openStart: o.paraStart,
			closeEnd:  m.paraEnd,
		})
		delete(pendingOpen, m.key)
	}

	// Pairs were appended in closing-anchor order, which differs from
	// body-order when anchors nest. Restore body-order and drop every
	// pair that overlaps one already kept, so the documented
	// "non-overlapping" contract holds even for malformed templates.
	sort.SliceStable(pairs, func(i, j int) bool {
		return pairs[i].openStart < pairs[j].openStart
	})
	kept := pairs[:0]
	lastEnd := 0
	for _, p := range pairs {
		if p.openStart < lastEnd {
			continue
		}
		kept = append(kept, p)
		lastEnd = p.closeEnd
	}
	return kept
}

// paragraphSpanAround returns the byte span of the smallest `<w:p>...</w:p>`
// element that fully contains the byte range [markerStart, markerEnd).
//
// The first two results are the index of the paragraph's `<w:p` and the
// index just past its `</w:p>`. The third result is false when no
// enclosing paragraph can be identified: the range is out of bounds,
// no paragraph open tag precedes the marker, the open tag is truncated,
// or no `</w:p>` follows the marker (defensive guards — none of these
// happen in well-formed input).
func paragraphSpanAround(body string, markerStart, markerEnd int) (int, int, bool) {
	const (
		openTag  = "<w:p"
		closeTag = "</w:p>"
	)
	if markerStart < 0 || markerEnd < markerStart || markerEnd > len(body) {
		return 0, 0, false
	}

	// Walk backwards to the nearest paragraph open tag. <w:p> doesn't
	// nest, so the nearest real open before the marker is the enclosing
	// paragraph's.
	pStart := -1
	for cursor := markerStart; cursor > 0; {
		idx := strings.LastIndex(body[:cursor], openTag)
		if idx < 0 {
			break
		}
		cursor = idx
		next := idx + len(openTag)
		if next >= len(body) {
			// "<w:p" is the very end of the input: nothing to inspect.
			continue
		}
		switch body[next] {
		case ' ', '\t', '\n', '\r', '>', '/':
			// <w:p>, <w:p attr...> or <w:p/>.
		default:
			// A different w:p-prefixed tag such as <w:pPr> or <w:pStyle>.
			continue
		}
		tagEnd := strings.IndexByte(body[idx:], '>')
		if tagEnd < 0 {
			return 0, 0, false
		}
		if body[idx+tagEnd-1] == '/' {
			// Self-closing empty paragraph: it cannot contain the
			// marker, so keep looking further back.
			continue
		}
		pStart = idx
		break
	}
	if pStart < 0 {
		return 0, 0, false
	}

	// Walk forward to the matching </w:p>. <w:p> doesn't nest, so the
	// next close after the marker ends the enclosing paragraph.
	rel := strings.Index(body[markerEnd:], closeTag)
	if rel < 0 {
		return 0, 0, false
	}
	return pStart, markerEnd + rel + len(closeTag), true
}

// spliceSections replaces anchor slots with rendered sections and
// appends any unanchored sections before sectPr. Returns the assembled
// document.xml body.
//
// Parameters:
//   - documentXML: the carrier's word/document.xml.
//   - rendered: OOXML per section key, for the kept sections.
//   - kept: the included sections, already in append order.
//   - all: every section on the draft. Retained for signature
//     compatibility only — a slot whose key is not in kept is removed
//     whether the key belongs to an excluded section or to no section
//     at all, so the distinction is never needed.
//
// Every anchor slot (both anchor paragraphs and everything between
// them) is replaced: by the rendered section when its key is kept,
// by nothing otherwise. Kept sections that found no slot are appended
// in the order of kept.
func spliceSections(documentXML []byte, rendered map[string]string, kept []Section, all []Section) []byte {
	body := string(documentXML)
	pairs := findAllAnchorPairs(body)

	keptKeys := make(map[string]struct{}, len(kept))
	for _, sec := range kept {
		keptKeys[sec.Key] = struct{}{}
	}

	// Rebuild the body in a single forward pass over the slots. Working
	// from original offsets only (never from an already-edited string)
	// means a malformed template with overlapping slots cannot produce
	// stale offsets: an overlapping or out-of-range slot is skipped.
	sort.SliceStable(pairs, func(i, j int) bool {
		return pairs[i].openStart < pairs[j].openStart
	})
	matchedKeys := make(map[string]bool, len(pairs))
	var out strings.Builder
	out.Grow(len(body))
	cursor := 0
	for _, p := range pairs {
		if p.openStart < cursor || p.closeEnd < p.openStart || p.closeEnd > len(body) {
			continue
		}
		out.WriteString(body[cursor:p.openStart])
		if _, ok := keptKeys[p.key]; ok {
			out.WriteString(rendered[p.key])
			matchedKeys[p.key] = true
		}
		cursor = p.closeEnd
	}
	out.WriteString(body[cursor:])
	body = out.String()

	// Append unanchored sections before sectPr in order_index ASC.
	var unanchored strings.Builder
	for _, sec := range kept {
		if matchedKeys[sec.Key] {
			continue
		}
		unanchored.WriteString(rendered[sec.Key])
	}
	if unanchored.Len() > 0 {
		body = appendBeforeSectPr(body, unanchored.String())
	}

	return []byte(body)
}

var (
	// sectPrRegex matches the start of a <w:sectPr> open tag. The word
	// boundary keeps it from matching <w:sectPrChange>.
	sectPrRegex = regexp.MustCompile(`<w:sectPr\b`)

	// sectPrTagRegex matches a complete <w:sectPr ...> open tag
	// (self-closing or not) or a </w:sectPr> close tag.
	sectPrTagRegex = regexp.MustCompile(`<w:sectPr\b[^>]*>|</w:sectPr>`)
)

// appendBeforeSectPr inserts content immediately before the body-level
// `<w:sectPr>` element — the one that closes the body and describes the
// final section's page setup — so appended sections show up on the
// actual pages. When the body has no such trailing element, content
// goes right before `</w:body>`, or at the very end if that tag is
// missing too.
//
// Only the trailing sectPr is a valid insertion point: in a
// multi-section document the earlier `<w:sectPr>` elements sit inside
// a paragraph's `<w:pPr>`, and inserting paragraphs there would corrupt
// the document.
func appendBeforeSectPr(body, content string) string {
	end := strings.LastIndex(body, "</w:body>")
	if end < 0 {
		end = len(body)
	}
	if sectPrRegex.MatchString(body[:end]) {
		if at, ok := trailingSectPrStart(body[:end]); ok {
			return body[:at] + content + body[at:]
		}
	}
	return body[:end] + content + body[end:]
}

// trailingSectPrStart reports the start offset of the last outermost
// <w:sectPr> element in s, provided nothing but whitespace follows that
// element. Nested sectPr elements (as found inside <w:sectPrChange>)
// are tracked by depth so they are never mistaken for the outer one.
func trailingSectPrStart(s string) (int, bool) {
	depth, start, stop := 0, -1, -1
	for _, loc := range sectPrTagRegex.FindAllStringIndex(s, -1) {
		tag := s[loc[0]:loc[1]]
		switch {
		case strings.HasPrefix(tag, "</"):
			if depth > 0 {
				depth--
				if depth == 0 {
					stop = loc[1]
				}
			}
		case strings.HasSuffix(tag, "/>"):
			if depth == 0 {
				start, stop = loc[0], loc[1]
			}
		default:
			if depth == 0 {
				start, stop = loc[0], -1
			}
			depth++
		}
	}
	if start < 0 || stop < 0 || depth != 0 {
		return 0, false
	}
	if strings.TrimSpace(s[stop:]) != "" {
		// Something follows the element, so it is not the body's
		// closing sectPr (e.g. it lives inside a paragraph's <w:pPr>).
		return 0, false
	}
	return start, true
}

// ─────────────────────────────────────────────────────────────────────
// Zip plumbing
// ─────────────────────────────────────────────────────────────────────

// baseZipPart captures one zip entry we kept aside while extracting
// document.xml. splitBaseZip fills these from the carrier's zip and
// repackBaseZip writes them back out in the same order.
type baseZipPart struct {
	// name is the entry's path inside the zip (e.g. word/styles.xml).
	name    string
	// method is the zip compression method the entry was stored with.
	method  uint16
	modTime int64 // wall seconds; converted back to time.Time on repack
	// body is the entry's uncompressed content. It is nil for the
	// word/document.xml placeholder, whose content is supplied
	// separately at repack time.
	body    []byte
}

// splitBaseZip opens cleanBytes as a zip archive and separates
// word/document.xml from everything else.
//
// It returns the document.xml content plus one baseZipPart per archive
// entry, in archive order. The document.xml entry stays in that list
// as a body-less placeholder so a later repack writes it at its
// original position. An error is returned when the bytes are not a
// readable zip, when any entry fails to read, or when the archive has
// no word/document.xml.
func splitBaseZip(cleanBytes []byte) ([]byte, []baseZipPart, error) {
	archive, err := zip.NewReader(bytes.NewReader(cleanBytes), int64(len(cleanBytes)))
	if err != nil {
		return nil, nil, fmt.Errorf("submission compose: open base zip: %w", err)
	}

	const documentPath = "word/document.xml"
	var docXML []byte
	kept := make([]baseZipPart, 0, len(archive.File))
	for _, entry := range archive.File {
		content, readErr := readZipEntry(entry)
		if readErr != nil {
			return nil, nil, fmt.Errorf("submission compose: read %s: %w", entry.Name, readErr)
		}
		part := baseZipPart{
			name:    entry.Name,
			method:  entry.Method,
			modTime: entry.Modified.Unix(),
			body:    content,
		}
		if entry.Name == documentPath {
			// Hand the content back separately; the list keeps only a
			// position marker for it.
			docXML = content
			part.body = nil
		}
		kept = append(kept, part)
	}
	if docXML == nil {
		return nil, nil, fmt.Errorf("submission compose: base zip missing word/document.xml")
	}
	return docXML, kept, nil
}

// repackBaseZip writes parts into a fresh zip archive, in order, and
// returns the archive bytes. The entry named word/document.xml gets
// assembledBody as its content; every other entry gets the body
// recorded in its part. Each entry keeps its recorded name and
// compression method, and its modification time when one was recorded
// (modTime > 0).
func repackBaseZip(parts []baseZipPart, assembledBody []byte) ([]byte, error) {
	var archive bytes.Buffer
	writer := zip.NewWriter(&archive)

	for i := range parts {
		part := &parts[i]

		header := zip.FileHeader{Name: part.name, Method: part.method}
		if part.modTime > 0 {
			header.Modified = time.Unix(part.modTime, 0)
		}
		entry, err := writer.CreateHeader(&header)
		if err != nil {
			return nil, fmt.Errorf("submission compose: write header %s: %w", part.name, err)
		}

		payload := part.body
		if part.name == "word/document.xml" {
			payload = assembledBody
		}
		if _, err := entry.Write(payload); err != nil {
			return nil, fmt.Errorf("submission compose: write body %s: %w", part.name, err)
		}
	}

	// Close flushes the central directory; without it the archive is
	// unreadable.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("submission compose: finalise zip: %w", err)
	}
	return archive.Bytes(), nil
}

// readZipEntry returns the full uncompressed content of one zip entry.
// The entry's reader is closed before returning; an error from opening
// or reading the entry is passed through unchanged.
func readZipEntry(f *zip.File) ([]byte, error) {
	entry, openErr := f.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer entry.Close()

	content, readErr := io.ReadAll(entry)
	return content, readErr
}

// ─────────────────────────────────────────────────────────────────────
// Slice D — hyperlink wiring
// ─────────────────────────────────────────────────────────────────────

// composerLinkAllocator issues relationship ids for the inline
// hyperlink targets the Markdown walker encounters. A URL is assigned
// an id the first time it is seen and keeps it afterwards, so repeated
// links to one URL share a single Relationship. Ids carry the
// "rIdComposer" prefix to keep them apart from the ids the base
// document already uses.
type composerLinkAllocator struct {
	next  int               // number of ids handed out so far
	byURL map[string]string // URL → id
	order []string          // URLs in allocation order
}

// newComposerLinkAllocator returns an empty allocator ready for use.
func newComposerLinkAllocator() *composerLinkAllocator {
	alloc := new(composerLinkAllocator)
	alloc.byURL = make(map[string]string)
	return alloc
}

// Alloc returns the id for url, creating the next "rIdComposerN" id
// (N counting from 1) when url has not been seen before.
func (a *composerLinkAllocator) Alloc(url string) string {
	rid, seen := a.byURL[url]
	if !seen {
		a.next++
		rid = fmt.Sprintf("rIdComposer%d", a.next)
		a.byURL[url] = rid
		a.order = append(a.order, url)
	}
	return rid
}

// HasLinks reports whether at least one id has been allocated.
func (a *composerLinkAllocator) HasLinks() bool {
	return len(a.order) != 0
}

// Pairs lists every allocation as an (id, URL) pair, oldest first. The
// document.xml.rels patcher turns each pair into a <Relationship>
// element.
func (a *composerLinkAllocator) Pairs() [][2]string {
	out := make([][2]string, len(a.order))
	for i, url := range a.order {
		out[i] = [2]string{a.byURL[url], url}
	}
	return out
}

// patchDocumentXMLRels adds the given (rId, URL) pairs as external
// hyperlink relationships to word/_rels/document.xml.rels.
//
// When parts already holds that entry, its body is rewritten in place
// with the new <Relationship> elements inserted before the closing
// </Relationships> tag. When the entry is absent, a new part holding a
// minimal Relationships document is appended to parts.
//
// A pair is skipped when the current rels text already contains an
// Id="<rId>" attribute for it; if every pair is skipped, parts is
// returned untouched. A rels document without a closing
// </Relationships> tag yields an error.
//
// Callers must use the returned slice: appending the new part may
// reallocate the backing array.
func patchDocumentXMLRels(parts []baseZipPart, pairs [][2]string) ([]baseZipPart, error) {
	const (
		path          = "word/_rels/document.xml.rels"
		hyperlinkType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
		emptyRels     = `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>` +
			`<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"></Relationships>`
		closingTag = "</Relationships>"
	)

	// Locate the existing rels part, if any.
	relsAt := -1
	for i := range parts {
		if parts[i].name == path {
			relsAt = i
			break
		}
	}

	current := emptyRels
	if relsAt >= 0 {
		current = string(parts[relsAt].body)
	}

	// Serialise one <Relationship> per pair that is not present yet.
	var added strings.Builder
	for _, pair := range pairs {
		rid, target := pair[0], pair[1]
		if strings.Contains(current, `Id="`+rid+`"`) {
			continue
		}
		added.WriteString(`<Relationship Id="`)
		added.WriteString(xmlAttrEscape(rid))
		added.WriteString(`" Type="`)
		added.WriteString(hyperlinkType)
		added.WriteString(`" Target="`)
		added.WriteString(xmlAttrEscape(target))
		added.WriteString(`" TargetMode="External"/>`)
	}
	if added.Len() == 0 {
		return parts, nil
	}

	at := strings.LastIndex(current, closingTag)
	if at < 0 {
		return parts, fmt.Errorf("submission compose: malformed document.xml.rels (no closing tag)")
	}
	patched := []byte(current[:at] + added.String() + current[at:])

	if relsAt < 0 {
		return append(parts, baseZipPart{
			name:    path,
			method:  zip.Deflate,
			modTime: time.Now().Unix(),
			body:    patched,
		}), nil
	}
	parts[relsAt].body = patched
	return parts, nil
}