The final slice: land the format-neutral document model with REAL consumers
and unify the Markdown parser — no duplication, byte-identical output.
Neutral model (pkg/docforge/model.go): Document / Block / InlineSpan.
BlockKind values are the stylemap keys. A hyperlink is a span with Link set
+ Children (the label's spans), preserving link boundaries so adjacent
same-URL links stay distinct — byte-exact with the pre-model walker.
Markdown importer (pkg/docforge/markdown): Import(md) → Document. The SINGLE
Markdown parser for docforge — block split, marker detection, inline
bold/italic/link tokenisation, {{placeholder}} pass-through (the b78a984
fix). Relocated out of the docx walker.
docx renderer (pkg/docforge/docx/markdown.go): now RENDERS a Document →
OOXML (RenderDocumentToOOXML); RenderMarkdownToOOXML[WithStyles] = render(
markdown.Import(md)). The shipped submission walker routes through the model,
so there is one parser, not two. The comprehensive byte-exact render tests
(RenderMarkdownToOOXML_*) all PASS unchanged = output identical.
Exporter interface (pkg/docforge/exporter.go, PRD §4 B4): Exporter{Format,
MIMEType, RenderBody(Document)} with the .docx impl (pkg/docforge/docx/
exporter.go). The seam a future PDF/HTML exporter slots into.
Tests: parser tests relocated to the markdown pkg (parseSpans/detectBlockMarker)
+ new importer Document tests + exporter conformance test.
Verification: go build/vet clean; gofmt clean; full NO-DB test suite GREEN
(authoritative — proves no regression); docforge byte-exact render oracle
PASS; composer live test renders through the rewired walker (PASS); bun build
+ bun test 274/274. The shared-DB live run fails ~85 tests across unrelated
services. Those failures come from a harness seeding quirk (pq-42P08 on a
$1 parameter type) and a stale deadline_rules test; they are
systemic/environmental (the no-DB run is clean) and are not caused by this
change.
docforge train complete: 8 slices, the engine extracted + cleaned + a working
author→generate→export loop on uploaded templates, plus the neutral model +
importer + exporter seam for future formats/consumers.
m/paliad#157
231 lines
6.7 KiB
Go
231 lines
6.7 KiB
Go
// Package markdown imports Markdown source into the neutral
// docforge.Document model (PRD §3.2 / §4 P4 — Markdown is the primary
// input format). It is the single Markdown parser for docforge: the .docx
// renderer consumes the Document this produces, so block-splitting and
// inline tokenisation live here, not in the format adapter.
//
// Grammar (intentionally narrow — unrecognised syntax flows through as a
// plain paragraph, so lawyer prose never errors):
//
//	blank line            → paragraph break
//	# / ## / ### Heading  → heading_1 / 2 / 3
//	- item / * item       → bullet list item
//	N. item / N) item     → numbered list item
//	> quote               → blockquote
//	**x** / __x__         → bold
//	*x* / _x_             → italic
//	[label](url)          → hyperlink
//	{{key}}               → preserved verbatim (substituted downstream)
package markdown
|
|
|
|
import (
|
|
"strings"
|
|
|
|
"mgit.msbls.de/m/paliad/pkg/docforge"
|
|
)
|
|
|
|
// Import parses Markdown into a Document. Empty (or all-blank) input yields
|
|
// a single empty paragraph so a splice site stays well-formed.
|
|
func Import(md string) docforge.Document {
|
|
blocks := splitBlocks(md)
|
|
if len(blocks) == 0 {
|
|
return docforge.Document{Blocks: []docforge.Block{{Kind: docforge.KindParagraph}}}
|
|
}
|
|
out := make([]docforge.Block, 0, len(blocks))
|
|
for _, blk := range blocks {
|
|
b := docforge.Block{Kind: docforge.BlockKind(blk.kind)}
|
|
// An empty-text block is an intentional empty paragraph: leave
|
|
// Spans nil so the exporter emits a single empty run.
|
|
if blk.text != "" {
|
|
b.Spans = parseInline(blk.text)
|
|
}
|
|
out = append(out, b)
|
|
}
|
|
return docforge.Document{Blocks: out}
|
|
}
|
|
|
|
// rawBlock is the intermediate form of a block between block splitting and
// inline parsing.
//
// kind is the block kind as a plain string; its values match the string
// values of docforge.BlockKind. text is the block content with the block
// marker already stripped, and is empty for an intentionally empty block.
type rawBlock struct {
	kind, text string
}
|
|
|
|
// splitBlocks parses the source into a sequence of (kind, text) blocks,
|
|
// detecting heading / list / blockquote prefixes line-by-line. A run of
|
|
// unmarked lines collapses into one paragraph block (soft line breaks
|
|
// inside a paragraph concatenate); each marked line is its own block.
|
|
// Blank-run spacing emits extra empty paragraph blocks. CRLF normalised.
|
|
func splitBlocks(md string) []rawBlock {
|
|
normalised := strings.ReplaceAll(md, "\r\n", "\n")
|
|
lines := strings.Split(normalised, "\n")
|
|
var blocks []rawBlock
|
|
var pendingPara []string
|
|
blankRun := 0
|
|
|
|
flushPara := func() {
|
|
if len(pendingPara) > 0 {
|
|
blocks = append(blocks, rawBlock{kind: "paragraph", text: strings.Join(pendingPara, "\n")})
|
|
pendingPara = nil
|
|
}
|
|
}
|
|
|
|
for _, line := range lines {
|
|
if strings.TrimSpace(line) == "" {
|
|
if len(pendingPara) > 0 {
|
|
flushPara()
|
|
blankRun = 1
|
|
continue
|
|
}
|
|
blankRun++
|
|
continue
|
|
}
|
|
if kind, payload, ok := detectBlockMarker(line); ok {
|
|
flushPara()
|
|
for i := 1; i < blankRun; i++ {
|
|
blocks = append(blocks, rawBlock{kind: "paragraph", text: ""})
|
|
}
|
|
blankRun = 0
|
|
blocks = append(blocks, rawBlock{kind: kind, text: payload})
|
|
continue
|
|
}
|
|
if len(pendingPara) == 0 {
|
|
for i := 1; i < blankRun; i++ {
|
|
blocks = append(blocks, rawBlock{kind: "paragraph", text: ""})
|
|
}
|
|
}
|
|
blankRun = 0
|
|
pendingPara = append(pendingPara, line)
|
|
}
|
|
flushPara()
|
|
return blocks
|
|
}
|
|
|
|
// detectBlockMarker classifies a single line. Tolerates up to 3 leading
|
|
// spaces (CommonMark) before treating the line as a plain paragraph.
|
|
func detectBlockMarker(line string) (kind, payload string, ok bool) {
|
|
trimmed := strings.TrimLeft(line, " ")
|
|
if len(line)-len(trimmed) > 3 {
|
|
return "", "", false
|
|
}
|
|
switch {
|
|
case strings.HasPrefix(trimmed, "### "):
|
|
return "heading_3", strings.TrimSpace(trimmed[4:]), true
|
|
case strings.HasPrefix(trimmed, "## "):
|
|
return "heading_2", strings.TrimSpace(trimmed[3:]), true
|
|
case strings.HasPrefix(trimmed, "# "):
|
|
return "heading_1", strings.TrimSpace(trimmed[2:]), true
|
|
case strings.HasPrefix(trimmed, "> "):
|
|
return "blockquote", strings.TrimSpace(trimmed[2:]), true
|
|
case strings.HasPrefix(trimmed, "- "), strings.HasPrefix(trimmed, "* "):
|
|
return "list_bullet", strings.TrimSpace(trimmed[2:]), true
|
|
}
|
|
if i := indexOfNumberedMarker(trimmed); i > 0 {
|
|
return "list_numbered", strings.TrimSpace(trimmed[i:]), true
|
|
}
|
|
return "", "", false
|
|
}
|
|
|
|
// indexOfNumberedMarker looks for a numbered-list marker at the very start
// of s: one or more ASCII digits, then '.' or ')', then a single space.
// It returns the byte index of the first character after that space, or -1
// when s does not begin with such a marker.
func indexOfNumberedMarker(s string) int {
	digits := 0
	for digits < len(s) && '0' <= s[digits] && s[digits] <= '9' {
		digits++
	}

	// Need at least one digit plus room for the delimiter and the space.
	if digits == 0 || digits+1 >= len(s) {
		return -1
	}

	switch s[digits] {
	case '.', ')':
		if s[digits+1] == ' ' {
			return digits + 2
		}
	}
	return -1
}
|
|
|
|
// parseInline splits text around [label](url) hyperlinks and tokenises the
|
|
// rest into bold/italic spans. Hyperlinks become a span with Link set and
|
|
// the label's spans as Children, preserving link boundaries.
|
|
func parseInline(text string) []docforge.InlineSpan {
|
|
var out []docforge.InlineSpan
|
|
rest := text
|
|
for {
|
|
idx := strings.Index(rest, "[")
|
|
if idx < 0 {
|
|
if rest != "" {
|
|
out = append(out, parseSpans(rest)...)
|
|
}
|
|
break
|
|
}
|
|
closeBracket := strings.Index(rest[idx:], "](")
|
|
if closeBracket < 0 {
|
|
out = append(out, parseSpans(rest)...)
|
|
break
|
|
}
|
|
closeParen := strings.Index(rest[idx+closeBracket:], ")")
|
|
if closeParen < 0 {
|
|
out = append(out, parseSpans(rest)...)
|
|
break
|
|
}
|
|
label := rest[idx+1 : idx+closeBracket]
|
|
url := rest[idx+closeBracket+2 : idx+closeBracket+closeParen]
|
|
if idx > 0 {
|
|
out = append(out, parseSpans(rest[:idx])...)
|
|
}
|
|
out = append(out, docforge.InlineSpan{Link: url, Children: parseSpans(label)})
|
|
rest = rest[idx+closeBracket+closeParen+1:]
|
|
}
|
|
return out
|
|
}
|
|
|
|
// parseSpans tokenises Markdown inline bold/italic into spans, preserving
|
|
// {{...}} placeholders verbatim (the b78a984 fix — underscores in a
|
|
// placeholder key must not be read as italic delimiters). Empty input
|
|
// yields one empty span.
|
|
func parseSpans(text string) []docforge.InlineSpan {
|
|
var out []docforge.InlineSpan
|
|
var cur strings.Builder
|
|
bold := false
|
|
italic := false
|
|
flush := func() {
|
|
if cur.Len() == 0 {
|
|
return
|
|
}
|
|
out = append(out, docforge.InlineSpan{Text: cur.String(), Bold: bold, Italic: italic})
|
|
cur.Reset()
|
|
}
|
|
i := 0
|
|
n := len(text)
|
|
for i < n {
|
|
if i+1 < n && text[i] == '{' && text[i+1] == '{' {
|
|
if rel := strings.Index(text[i+2:], "}}"); rel >= 0 {
|
|
end := i + 2 + rel + 2
|
|
cur.WriteString(text[i:end])
|
|
i = end
|
|
continue
|
|
}
|
|
}
|
|
if i+1 < n && (text[i:i+2] == "**" || text[i:i+2] == "__") {
|
|
flush()
|
|
bold = !bold
|
|
i += 2
|
|
continue
|
|
}
|
|
if text[i] == '*' || text[i] == '_' {
|
|
flush()
|
|
italic = !italic
|
|
i++
|
|
continue
|
|
}
|
|
cur.WriteByte(text[i])
|
|
i++
|
|
}
|
|
flush()
|
|
if len(out) == 0 {
|
|
out = append(out, docforge.InlineSpan{Text: ""})
|
|
}
|
|
return out
|
|
}
|