// paliad/pkg/docforge/docx/markdown.go

package docx

// Markdown → OOXML walker for Composer section content (t-paliad-313
// Slice B, design doc §9.2).
//
// Scope: the head's Slice B brief covered paragraphs + inline
// bold/italic only; Slice D's rich-prose pass (t-paliad-316) added
// headings, lists, blockquote and links. This walker is intentionally
// minimal — every Markdown construct it doesn't recognise is rendered
// as a plain paragraph so the lawyer's prose round-trips losslessly
// even when they hit Markdown the walker doesn't yet understand.
//
// The output uses the base's stylemap.paragraph entry for the
// <w:pStyle> on each paragraph so the styling matches the base's
// typography (HLpat-Body-B0 on the HLC base, Normal on the neutral
// base, etc.).
//
// Placeholders ({{path.dot.notation}}) are preserved verbatim — they
// pass through the walker untouched and get substituted by the v1
// SubmissionRenderer's placeholder pass after the composer assembly.
//
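// Grammar supported:
//
//   - Blank line                → paragraph break
//   - `# ` / `## ` / `### `     → heading_1 / heading_2 / heading_3 paragraph
//   - `- item` or `* item`      → bullet list paragraph ("• " prefix run)
//   - `1. item` or `1) item`    → numbered list paragraph ("N. " prefix run)
//   - `> quote`                 → blockquote paragraph
//   - `[label](url)`            → <w:hyperlink r:id="…">…</w:hyperlink>
//   - `**bold**` or `__bold__`  → <w:r><w:rPr><w:b/></w:rPr><w:t>…</w:t></w:r>
//   - `*italic*` or `_italic_`  → <w:r><w:rPr><w:i/></w:rPr>…</w:r>
//   - Otherwise                 → plain text run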

import (
	"fmt"
	"strings"
)

// HyperlinkAllocator hands the walker a `rId` for each external URL
// it encounters in `[label](url)` inline links. The composer's
// post-pass uses these allocations to mutate
// `word/_rels/document.xml.rels` so the emitted `<w:hyperlink
// r:id="…">` elements resolve correctly. Pass nil to drop links to
// plain text (the label survives, the URL doesn't render).
//
// The allocator receives the raw text found between the parentheses
// of the link and returns the relationship id to reference. Returning
// an empty string has the same effect as a nil allocator for that one
// link: the label is rendered as ordinary runs and no hyperlink
// element is emitted.
//
// t-paliad-316 Slice D.
type HyperlinkAllocator func(url string) string

// RenderMarkdownToOOXML renders the given Markdown source into OOXML
// paragraph elements (`<w:p>…</w:p>`), suitable for splicing into a
// .docx body. Each paragraph carries `<w:pStyle w:val="<paragraphStyle>"/>`
// when paragraphStyle is non-empty.
//
// This is the Slice B entry point, kept for back-compat. It delegates
// to RenderMarkdownToOOXMLWithStyles with a stylemap that holds only
// the "paragraph" key and with a nil HyperlinkAllocator. Consequently:
//
//   - every block kind the full entry point distinguishes
//     ("heading_1", "heading_2", "heading_3", "list_bullet",
//     "list_numbered", "blockquote") falls back to paragraphStyle,
//     because a missing stylemap entry resolves to the "paragraph"
//     style;
//   - `[label](url)` links are rendered as their label text only.
//
// Empty input renders one empty paragraph so the splice site is
// well-formed even when the lawyer hasn't typed anything in this
// section.
func RenderMarkdownToOOXML(md, paragraphStyle string) string {
	bodyOnly := map[string]string{"paragraph": paragraphStyle}
	return RenderMarkdownToOOXMLWithStyles(md, bodyOnly, nil)
}

// RenderMarkdownToOOXMLWithStyles is the full Slice-D-aware entry
// point. Slice B's RenderMarkdownToOOXML is a wrapper for back-compat.
//
// stylemap supplies the paragraph-style name for each block kind:
//
//	stylemap["paragraph"]      — default body
//	stylemap["heading_1/2/3"]  — heading levels
//	stylemap["list_bullet"]    — bullet list paragraph style
//	stylemap["list_numbered"]  — numbered list paragraph style
//	stylemap["blockquote"]     — blockquote
//
// A missing or empty entry falls back to the "paragraph" style; a nil
// stylemap therefore yields paragraphs without any <w:pStyle>.
//
// links allocates relationship ids for `[label](url)` links; nil
// renders links as their label text.
//
// Input that is empty, or that contains no non-blank line, renders
// exactly one empty paragraph so the splice site stays well-formed.
func RenderMarkdownToOOXMLWithStyles(md string, stylemap map[string]string, links HyperlinkAllocator) string {
	fallback := stylemap["paragraph"]

	var blocks []mdBlock
	if md != "" {
		blocks = splitMarkdownBlocks(md)
	}
	if len(blocks) == 0 {
		return emptyParagraph(fallback)
	}

	// The ordinal printed in front of a numbered item is the item's
	// position within the current run of consecutive "list_numbered"
	// blocks; the digits typed in the source are not consulted. Any
	// other block — including the empty spacing paragraphs produced by
	// two or more consecutive blank lines — restarts the count. A
	// single blank line produces no block, so "1. A\n2. B\n\n1. C"
	// renders as 1./2./3.
	ordinal := 0
	var out strings.Builder
	for i := range blocks {
		blk := blocks[i]

		style, found := stylemap[blk.styleKey]
		if !found || style == "" {
			style = fallback
		}

		switch blk.styleKey {
		case "list_numbered":
			ordinal++
		default:
			ordinal = 0
		}

		out.WriteString(renderBlockParagraph(blk, style, links, ordinal))
	}
	return out.String()
}

// mdBlock is one rendered paragraph: a kind (paragraph / heading_*
// / list_bullet / list_numbered / blockquote) and the inline content
// text. List markers, heading hashes, blockquote `> ` etc. are
// stripped from the text before storage.
//
// A "paragraph" block with empty text is also what splitMarkdownBlocks
// emits as a vertical-spacing placeholder for extra blank lines.
type mdBlock struct {
	// styleKey doubles as the stylemap lookup key for the block's
	// paragraph style.
	styleKey string // "paragraph" | "heading_1" | "heading_2" | "heading_3" | "list_bullet" | "list_numbered" | "blockquote"
	// text is the inline Markdown payload; for "paragraph" blocks the
	// source lines are joined with "\n".
	text     string
}

// splitMarkdownBlocks turns Markdown source into an ordered list of
// blocks, classifying every non-blank line with detectBlockMarker.
//
// Rules:
//
//   - CRLF is normalised to LF before the source is split into lines.
//   - A line that is empty after trimming whitespace ends the current
//     paragraph and is counted; it never becomes a block by itself.
//   - A marker line (heading / list item / blockquote) always forms
//     its own block and also ends the current paragraph, so marker
//     lines never merge with their neighbours.
//   - Consecutive unmarked lines collapse into a single "paragraph"
//     block whose text is the lines joined by "\n" (soft line breaks
//     stay inside one paragraph).
//   - A run of k blank lines in front of a block contributes k-1 empty
//     "paragraph" blocks as vertical spacing. Blank lines at the very
//     end of the input contribute nothing.
//
// The result is nil when the input holds no non-blank line.
func splitMarkdownBlocks(md string) []mdBlock {
	var (
		out    []mdBlock
		para   []string // lines of the paragraph being accumulated
		blanks int      // blank lines seen since the last emitted content
	)

	closePara := func() {
		if len(para) == 0 {
			return
		}
		out = append(out, mdBlock{styleKey: "paragraph", text: strings.Join(para, "\n")})
		para = nil
	}
	// padSpacing converts the pending blank-line count into spacer
	// paragraphs (one fewer than the count) and clears the count.
	padSpacing := func() {
		for ; blanks > 1; blanks-- {
			out = append(out, mdBlock{styleKey: "paragraph", text: ""})
		}
		blanks = 0
	}

	for _, line := range strings.Split(strings.ReplaceAll(md, "\r\n", "\n"), "\n") {
		if strings.TrimSpace(line) == "" {
			// While a paragraph is open the blank count is always zero,
			// so closing it and counting this line leaves the count at 1.
			closePara()
			blanks++
			continue
		}

		// Markers are checked before the line may join a paragraph.
		if kind, payload, isMarker := detectBlockMarker(line); isMarker {
			closePara()
			padSpacing()
			out = append(out, mdBlock{styleKey: kind, text: payload})
			continue
		}

		// Plain line: spacing is only owed when it opens a new paragraph.
		if len(para) == 0 {
			padSpacing()
		}
		para = append(para, line)
	}

	closePara()
	return out
}

// detectBlockMarker classifies a single line. For a recognised marker
// it returns the block's style key, the line content with the marker
// removed and surrounding whitespace trimmed, and true. For any other
// line it returns ("", "", false).
//
// Recognised markers (each must be followed by a space):
//
//	# Heading            → heading_1
//	## Heading           → heading_2
//	### Heading          → heading_3
//	> quote              → blockquote
//	- item / * item      → list_bullet
//	N. item / N) item    → list_numbered (N is one or more ASCII digits)
//
// Up to three leading spaces are tolerated (per CommonMark) so the
// lawyer's contentEditable indentation doesn't hide the marker; with
// four or more the line is treated as ordinary paragraph text. Only
// space characters count as indentation here — tabs are not stripped.
func detectBlockMarker(line string) (string, string, bool) {
	body := strings.TrimLeft(line, " ")
	if indent := len(line) - len(body); indent > 3 {
		return "", "", false
	}

	// Fixed-text markers, longest heading first.
	fixed := [...]struct{ marker, kind string }{
		{"### ", "heading_3"},
		{"## ", "heading_2"},
		{"# ", "heading_1"},
		{"> ", "blockquote"},
		{"- ", "list_bullet"},
		{"* ", "list_bullet"},
	}
	for _, f := range fixed {
		if strings.HasPrefix(body, f.marker) {
			return f.kind, strings.TrimSpace(body[len(f.marker):]), true
		}
	}

	// Numbered items have a variable-width marker.
	if end := indexOfNumberedMarker(body); end > 0 {
		return "list_numbered", strings.TrimSpace(body[end:]), true
	}
	return "", "", false
}

// indexOfNumberedMarker reports where the item text begins when s
// opens with a numbered-list marker: one or more ASCII digits, then
// "." or ")", then a single space. The return value is the byte index
// immediately after that space. It returns -1 when s does not start
// with such a marker.
func indexOfNumberedMarker(s string) int {
	digits := 0
	for digits < len(s) && '0' <= s[digits] && s[digits] <= '9' {
		digits++
	}
	// At least one digit is required, and two more bytes (delimiter
	// and space) must follow the digits.
	if digits == 0 || len(s) < digits+2 {
		return -1
	}
	switch s[digits] {
	case '.', ')':
		if s[digits+1] == ' ' {
			return digits + 2
		}
	}
	return -1
}

// renderBlockParagraph emits exactly one `<w:p>` for a block.
//
// paragraphStyle, when non-empty, is written as the paragraph's
// <w:pStyle>. A block with empty text becomes a paragraph holding a
// single empty run — no list prefix is written in that case.
//
// List structure is surfaced as literal text rather than through
// Word's numbering.xml, which keeps the composer free of numbering.xml
// mutations: "list_bullet" blocks get a leading "• " run and
// "list_numbered" blocks a leading "N. " run, where N is
// numberedOrdinal (values below 1 are rendered as 1). Indentation is
// left to whatever paragraph style the stylemap supplied. Lawyers who
// want Word's auto-numbering reapply a list style post-export.
//
// The block text itself is rendered through parseInlineRuns, so inline
// bold/italic and links apply to every block kind.
func renderBlockParagraph(blk mdBlock, paragraphStyle string, links HyperlinkAllocator, numberedOrdinal int) string {
	var out strings.Builder
	out.WriteString(`<w:p>`)
	if paragraphStyle != "" {
		out.WriteString(`<w:pPr><w:pStyle w:val="` + xmlAttrEscape(paragraphStyle) + `"/></w:pPr>`)
	}

	if blk.text == "" {
		out.WriteString(`<w:r><w:t xml:space="preserve"></w:t></w:r></w:p>`)
		return out.String()
	}

	if blk.styleKey == "list_bullet" {
		out.WriteString(`<w:r><w:t xml:space="preserve">• </w:t></w:r>`)
	} else if blk.styleKey == "list_numbered" {
		n := numberedOrdinal
		if n < 1 {
			n = 1
		}
		fmt.Fprintf(&out, `<w:r><w:t xml:space="preserve">%d. </w:t></w:r>`, n)
	}

	out.WriteString(strings.Join(parseInlineRuns(blk.text, links), ""))
	out.WriteString(`</w:p>`)
	return out.String()
}

// parseInlineRuns serialises a block's inline text to OOXML runs.
//
// The text is first cut into plain segments and `[label](url)` link
// segments. A link is recognised at each "](" for which
//
//   - a ")" follows somewhere later (the URL is everything up to the
//     first one), and
//   - a balancing "[" precedes it (nested bracket pairs inside the
//     label are skipped).
//
// Brackets that do not belong to a link — a citation such as "[1]"
// ahead of a real link, for instance — stay literal text instead of
// being absorbed into the next link's label.
//
// Each segment is then tokenised on its own by parseInlineSpans:
//
//   - A link segment becomes `<w:hyperlink r:id="RID">…runs…</w:hyperlink>`
//     as one element of the result, where RID is what links returned
//     for the URL.
//   - When links is nil or returns "", the label is rendered as
//     ordinary runs and the URL is dropped.
//   - Plain segments contribute one `<w:r>` per span.
//
// The result is nil for empty text.
func parseInlineRuns(text string, links HyperlinkAllocator) []string {
	type segment struct {
		text   string
		isLink bool
		url    string
	}

	// Phase 1: split the text around hyperlink spans.
	var segs []segment
	rest := text
	searchFrom := 0
	for {
		rel := strings.Index(rest[searchFrom:], "](")
		if rel < 0 {
			break
		}
		closeBracket := searchFrom + rel
		urlStart := closeBracket + 2
		urlLen := strings.IndexByte(rest[urlStart:], ')')
		if urlLen < 0 {
			// No ")" after this point, so no later "](" can close either.
			break
		}
		open := matchingOpenBracket(rest, closeBracket)
		if open < 0 {
			// Stray "](" with no opening bracket: keep it literal and
			// look for the next candidate.
			searchFrom = urlStart
			continue
		}
		if open > 0 {
			segs = append(segs, segment{text: rest[:open]})
		}
		segs = append(segs, segment{
			text:   rest[open+1 : closeBracket],
			isLink: true,
			url:    rest[urlStart : urlStart+urlLen],
		})
		rest = rest[urlStart+urlLen+1:]
		searchFrom = 0
	}
	if rest != "" {
		segs = append(segs, segment{text: rest})
	}

	// Phase 2: render each segment.
	var runs []string
	for _, seg := range segs {
		if seg.isLink && links != nil {
			if rid := links(seg.url); rid != "" {
				var hb strings.Builder
				hb.WriteString(`<w:hyperlink r:id="`)
				hb.WriteString(xmlAttrEscape(rid))
				hb.WriteString(`">`)
				for _, span := range parseInlineSpans(seg.text) {
					hb.WriteString(renderRunWithLinkStyle(span))
				}
				hb.WriteString(`</w:hyperlink>`)
				runs = append(runs, hb.String())
				continue
			}
		}
		// Plain text, or a link degraded to its label.
		for _, span := range parseInlineSpans(seg.text) {
			runs = append(runs, renderRun(span))
		}
	}
	return runs
}

// matchingOpenBracket returns the index of the "[" that balances the
// "]" located at s[closeIdx], scanning backwards and skipping nested
// bracket pairs. It returns -1 when no balancing "[" exists.
func matchingOpenBracket(s string, closeIdx int) int {
	depth := 0
	for i := closeIdx; i >= 0; i-- {
		switch s[i] {
		case ']':
			depth++
		case '[':
			depth--
			if depth == 0 {
				return i
			}
		}
	}
	return -1
}

// renderRunWithLinkStyle emits one `<w:r>` meant to sit inside a
// `<w:hyperlink>`. It always writes a run-properties element that
// references the "Hyperlink" character style (Word's built-in) so the
// link renders in the document's hyperlink colour + underline,
// followed by <w:b/> and/or <w:i/> when the span carries those flags.
// The span text is escaped with xmlTextEscape and whitespace is
// preserved via xml:space="preserve".
func renderRunWithLinkStyle(span inlineSpan) string {
	props := `<w:rStyle w:val="Hyperlink"/>`
	if span.Bold {
		props += `<w:b/>`
	}
	if span.Italic {
		props += `<w:i/>`
	}
	return `<w:r><w:rPr>` + props + `</w:rPr>` +
		`<w:t xml:space="preserve">` + xmlTextEscape(span.Text) + `</w:t></w:r>`
}

// inlineSpan is one piece of inline content: a text payload plus
// formatting flags. Bold and italic are independent — `***both***`
// produces one span with both flags set.
type inlineSpan struct {
	Text   string
	Bold   bool
	Italic bool
}

// parseInlineSpans tokenises Markdown inline formatting into runs of
// (text, bold, italic). The grammar is intentionally narrow:
//
//   - `**…**` → bold
//   - `__…__` → bold (Markdown alternate)
//   - `*…*`   → italic
//   - `_…_`   → italic (Markdown alternate)
//   - Anything else flows through as plain text.
//
// Delimiters of one kind (bold or italic) pair up in order of
// appearance — first with second, third with fourth, … — and each
// pair toggles its flag, so nested formatting such as
// `**bold *bold-italic* bold**` works. `*` and `_` are interchangeable
// within a kind.
//
// Unbalanced delimiters fall through as literal characters: when a
// kind occurs an odd number of times, its last occurrence has no
// partner and is copied to the output unchanged. "5 * 3 = 15",
// "case_number" and "**never closed" therefore survive verbatim
// instead of losing the delimiter and formatting the rest of the
// text. The walker never errors on malformed Markdown.
//
// `{{…}}` placeholders are copied verbatim; delimiter characters
// inside them are neither interpreted nor counted.
//
// The result always holds at least one span; text that yields no
// characters produces a single span with empty Text.
func parseInlineSpans(text string) []inlineSpan {
	type tokenKind uint8
	const (
		literalTok tokenKind = iota
		boldTok
		italicTok
	)
	type token struct {
		kind tokenKind
		src  string // exact source text of the token
	}

	// Pass 1: cut the text into literal chunks and delimiter tokens,
	// counting the delimiters of each kind. Tokenisation depends only
	// on the characters, never on the current bold/italic state.
	var toks []token
	boldTotal, italicTotal := 0, 0
	n := len(text)
	litStart := 0
	emitLiteral := func(end int) {
		if end > litStart {
			toks = append(toks, token{kind: literalTok, src: text[litStart:end]})
		}
	}
	for i := 0; i < n; {
		// Preserve {{...}} placeholders verbatim. Underscores and
		// other Markdown-significant chars inside a placeholder key
		// (e.g. {{project.case_number}}) must not be interpreted as
		// bold/italic delimiters — otherwise the key gets stripped of
		// its underscores and the v1 placeholder pass looks up the
		// wrong key.
		if strings.HasPrefix(text[i:], "{{") {
			if rel := strings.Index(text[i+2:], "}}"); rel >= 0 {
				i += 2 + rel + 2
				continue
			}
			// Unmatched {{ — fall through to plain character handling.
		}

		width, kind := 0, literalTok
		switch {
		// Bold delimiters first (longer match wins over italic).
		case strings.HasPrefix(text[i:], "**"), strings.HasPrefix(text[i:], "__"):
			width, kind = 2, boldTok
			boldTotal++
		case text[i] == '*' || text[i] == '_':
			width, kind = 1, italicTok
			italicTotal++
		default:
			i++
			continue
		}
		emitLiteral(i)
		toks = append(toks, token{kind: kind, src: text[i : i+width]})
		i += width
		litStart = i
	}
	emitLiteral(n)

	// Pass 2: replay the tokens, toggling flags at paired delimiters
	// and copying an unpaired trailing delimiter as literal text.
	var out []inlineSpan
	var cur strings.Builder
	bold, italic := false, false
	flush := func() {
		if cur.Len() == 0 {
			return
		}
		out = append(out, inlineSpan{Text: cur.String(), Bold: bold, Italic: italic})
		cur.Reset()
	}
	boldSeen, italicSeen := 0, 0
	for _, tok := range toks {
		switch tok.kind {
		case boldTok:
			boldSeen++
			if boldSeen == boldTotal && boldTotal%2 == 1 {
				cur.WriteString(tok.src)
				continue
			}
			flush()
			bold = !bold
		case italicTok:
			italicSeen++
			if italicSeen == italicTotal && italicTotal%2 == 1 {
				cur.WriteString(tok.src)
				continue
			}
			flush()
			italic = !italic
		default:
			cur.WriteString(tok.src)
		}
	}
	flush()
	if len(out) == 0 {
		out = append(out, inlineSpan{Text: ""})
	}
	return out
}

// renderRun emits one `<w:r>` element for an inline span. A
// run-properties element is written only when the span is bold and/or
// italic (<w:b/> before <w:i/>). The text is escaped with
// xmlTextEscape and carries xml:space="preserve". Empty text spans
// render as empty runs (Word accepts them; they're harmless).
func renderRun(span inlineSpan) string {
	var props string
	if span.Bold {
		props += `<w:b/>`
	}
	if span.Italic {
		props += `<w:i/>`
	}
	if props != "" {
		props = `<w:rPr>` + props + `</w:rPr>`
	}
	return `<w:r>` + props + `<w:t xml:space="preserve">` + xmlTextEscape(span.Text) + `</w:t></w:r>`
}

// emptyParagraph returns one empty `<w:p>` with the given style. Used
// when a section's content_md is empty so the splice site stays
// well-formed.
func emptyParagraph(paragraphStyle string) string {
	var b strings.Builder
	b.WriteString(`<w:p>`)
	if paragraphStyle != "" {
		b.WriteString(`<w:pPr><w:pStyle w:val="`)
		b.WriteString(xmlAttrEscape(paragraphStyle))
		b.WriteString(`"/></w:pPr>`)
	}
	b.WriteString(`<w:r><w:t xml:space="preserve"></w:t></w:r></w:p>`)
	return b.String()
}

// xmlTextEscape prepares s for insertion into <w:t> element content.
//
// It escapes the three characters that are significant inside element
// text — "&", "<" and ">". Quotes and apostrophes are legal in text
// content and are left alone.
//
// Characters that XML 1.0 does not allow in a document at all (C0
// control characters other than tab, LF and CR — e.g. the vertical
// tab or form feed that pasted text can carry — plus U+FFFE/U+FFFF)
// are replaced with U+FFFD, as are bytes that are not valid UTF-8.
// Emitting them unchanged would make document.xml malformed. This
// mirrors how encoding/xml.EscapeText treats invalid characters.
func xmlTextEscape(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for _, r := range s {
		switch {
		case r == '&':
			b.WriteString("&amp;")
		case r == '<':
			b.WriteString("&lt;")
		case r == '>':
			b.WriteString("&gt;")
		// The XML 1.0 "Char" production.
		case r == '\t', r == '\n', r == '\r',
			r >= 0x20 && r <= 0xD7FF,
			r >= 0xE000 && r <= 0xFFFD,
			r >= 0x10000 && r <= 0x10FFFF:
			b.WriteRune(r)
		default:
			b.WriteRune('\uFFFD')
		}
	}
	return b.String()
}

// xmlAttrEscape makes s safe to place inside a double-quoted XML
// attribute value (e.g. `<w:pStyle w:val="…"/>`). It rewrites "&",
// "<", ">" and the double quote to their entity references in a
// single pass, so text that already looks like an entity has its "&"
// escaped exactly once. Apostrophes are left as they are.
func xmlAttrEscape(s string) string {
	return strings.NewReplacer(
		"&", "&amp;",
		"<", "&lt;",
		">", "&gt;",
		`"`, "&quot;",
	).Replace(s)
}