Files
paliad/pkg/docforge/docx/dotm.go
mAi 78a30a7ee0 refactor(docforge): slice 1 — extract .docx engine to pkg/docforge/docx (t-paliad-349)
Relocate the in-house OOXML machinery out of internal/services into the
first docforge adapter, with zero behaviour change:

  submission_merge.go  -> pkg/docforge/docx/merge.go     (placeholder
                          substitution renderer + preview-HTML emitter)
  submission_md.go     -> pkg/docforge/docx/markdown.go  (Markdown->OOXML
                          walker incl. the b78a984 underscore-fix)
  submission_render.go -> pkg/docforge/docx/dotm.go      (.dotm->.docx)
  + their _test.go files (git-tracked renames, 84-99% identical)

internal/services keeps thin type-alias + forwarder shims
(docforge_shims.go) so every caller in services/handlers/main compiles
and behaves identically: PlaceholderMap, MissingPlaceholderFn,
SubmissionRenderer, HyperlinkAllocator (aliases); NewSubmissionRenderer,
DefaultMissingMarker, RenderMarkdownToOOXML[WithStyles], ConvertDotmToDocx,
SanitiseSubmissionFileName (forwarders). docx.XMLAttrEscape is exported so
submission_compose.go's hyperlink-rels inserts reuse the walker's escaping.

Three mis-filed pretty-printer tests (legalSourcePretty, ourSideDE/EN,
patentNumberUPC) that exercise the vars layer move back to
internal/services/submission_vars_pretty_test.go.

Placeholder grammar + PlaceholderMap stay co-located with the renderer in
docx for now; slice 3 hoists the format-neutral grammar to the docforge
root with the VariableResolver interface.

Verification: go build ./... clean, go vet clean, full module test green
(the byte-exact OOXML golden tests in merge/compose/render pass unchanged
= behaviour preserved). gofmt drift on the moved files is pre-existing
(72/169 services files already drift; no gofmt gate).

m/paliad#157
2026-05-29 14:51:59 +02:00

205 lines
7.6 KiB
Go

package docx
// Submission .dotm → .docx converter (t-paliad-230, "format-only" scope
// reduction of the original t-paliad-215 submission generator).
//
// Word .dotm (macro-enabled template), .docm (macro-enabled document),
// .dotx (template, no macros), and .docx (document, no macros) are all
// OOXML zip containers. The macro-bearing variants carry an extra set
// of parts:
//
// word/vbaProject.bin — the VBA project binary
// word/_rels/vbaProject.bin.rels — auxiliary relationships
// word/vbaData.xml — VBA support data
// word/customizations.xml — keyMapCustomizations
//
// plus a Content-Types override for each of those, a Default extension
// declaring all .bin files as vbaProject, and a different "main" content
// type for word/document.xml itself.
//
// ConvertDotmToDocx walks the zip, drops the macro parts, rewrites
// [Content_Types].xml and word/_rels/document.xml.rels to remove every
// reference to them, and switches the main document content type to
// the plain .docx form. Every other part — styles, fonts, theme,
// settings, document body, header/footer/numbering, glossary, custom
// XML — keeps its uncompressed content bit-for-bit and is re-written
// with the original compression method and modification time.
//
// No variable substitution. Today's slice hands the lawyer the firm
// style template as a clean .docx so they can edit and save under
// their own filename. The merge-engine slice is deferred.
import (
"archive/zip"
"bytes"
"fmt"
"io"
"regexp"
"strings"
)
// The four OOXML "main" content types we may see on word/document.xml.
// Anything other than docxMainContentType gets rewritten so the output
// reads as a plain document: rewriteContentTypes replaces every
// occurrence of the first three strings with the fourth.
const (
// Main content type of a macro-enabled template (.dotm).
dotmMainContentType = "application/vnd.ms-word.template.macroEnabledTemplate.main+xml"
// Main content type of a macro-enabled document (.docm).
docmMainContentType = "application/vnd.ms-word.document.macroEnabled.main+xml"
// Main content type of a macro-free template (.dotx).
dotxMainContentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.template.main+xml"
// Main content type of a plain document (.docx); the rewrite target.
docxMainContentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"
)
// macroParts is the set of macro-related parts dropped wholesale from
// the output zip. Keys are zip entry names (no leading slash) and are
// compared verbatim against each entry's Name by ConvertDotmToDocx:
// the VBA project binary, its auxiliary relationships file, the VBA
// support data, and the key-map customizations part.
var macroParts = map[string]bool{
"word/vbaProject.bin": true,
"word/_rels/vbaProject.bin.rels": true,
"word/vbaData.xml": true,
"word/customizations.xml": true,
}
// Zip entry names of the two manifest parts whose bodies
// ConvertDotmToDocx rewrites instead of copying through as read.
const (
// contentTypesPath names the content-types manifest; its body is
// passed through rewriteContentTypes.
contentTypesPath = "[Content_Types].xml"
// documentRelsPath names the main document's relationships part; its
// body is passed through rewriteDocumentRels.
documentRelsPath = "word/_rels/document.xml.rels"
)
// vbaDefaultExtensionRegex matches the self-closing `<Default>` row in
// [Content_Types].xml that declares every .bin part as a vbaProject,
// together with any whitespace in front of it. After vbaProject.bin is
// dropped, the Default is dead weight (and Word will flag the file as
// macro-bearing if it survives).
//
// XML attribute order carries no meaning, so the row is matched with
// Extension="bin" either before or after the ContentType attribute.
// Go's regexp has no lookahead, hence the two spelled-out alternatives.
// A `<Default Extension="bin">` row with any other content type is left
// alone.
var vbaDefaultExtensionRegex = regexp.MustCompile(
	`\s*<Default\b(?:` +
		// Extension first, ContentType second.
		`[^>]*\bExtension\s*=\s*"bin"[^>]*\bContentType\s*=\s*"application/vnd\.ms-office\.vbaProject"` +
		`|` +
		// ContentType first, Extension second.
		`[^>]*\bContentType\s*=\s*"application/vnd\.ms-office\.vbaProject"[^>]*\bExtension\s*=\s*"bin"` +
		`)[^>]*/>`,
)
// macroOverridePartRegex matches any self-closing <Override PartName="…"/>
// element whose PartName is one of the dropped macro parts, together
// with any whitespace in front of it. The /word/ prefix is the OOXML
// convention for the absolute part path in [Content_Types].xml — file
// paths in the zip itself omit the leading slash.
//
// Only PartName is inspected, so the position of the other attributes
// inside the element does not matter.
//
// NOTE(review): three of the four macroParts entries are listed here;
// word/_rels/vbaProject.bin.rels is not. Presumably .rels parts get
// their content type from a Default extension row rather than an
// Override — confirm against a real .dotm.
var macroOverridePartRegex = regexp.MustCompile(
`\s*<Override\b[^>]*\bPartName\s*=\s*"/word/(?:vbaProject\.bin|vbaData\.xml|customizations\.xml)"[^>]*/>`,
)
// macroRelTypeRegex matches the two macro-related relationship Types
// in word/_rels/document.xml.rels: vbaProject (binds to vbaProject.bin)
// and keyMapCustomizations (binds to customizations.xml). After both
// targets are dropped, leaving the relationships in would make Word
// flag the file as corrupt.
//
// The match covers the whole self-closing <Relationship …/> element
// plus any whitespace in front of it. Only the Type attribute is
// inspected, so Id and Target may appear in any position.
var macroRelTypeRegex = regexp.MustCompile(
`\s*<Relationship\b[^>]*\bType\s*=\s*"http://schemas\.microsoft\.com/office/2006/relationships/(?:vbaProject|keyMapCustomizations)"[^>]*/>`,
)
// ConvertDotmToDocx takes the bytes of a .dotm (or .docm / .dotx) zip
// package and returns the bytes of a clean .docx zip package.
//
// Entries named in macroParts are skipped. [Content_Types].xml and
// word/_rels/document.xml.rels are run through rewriteContentTypes and
// rewriteDocumentRels respectively; every other entry's content is
// written out exactly as read. Each output entry gets a fresh header
// carrying only the source entry's name, compression method and
// modification time.
//
// An input that is already a plain .docx has no macro parts to drop, so
// the call is idempotent on such a package. If the input cannot be
// opened as a zip, or an entry cannot be read or written, the result is
// nil and the error is wrapped with a "dotm→docx:" prefix.
func ConvertDotmToDocx(dotmBytes []byte) ([]byte, error) {
	src, err := zip.NewReader(bytes.NewReader(dotmBytes), int64(len(dotmBytes)))
	if err != nil {
		return nil, fmt.Errorf("dotm→docx: open zip: %w", err)
	}

	var result bytes.Buffer
	dst := zip.NewWriter(&result)

	for _, part := range src.File {
		name := part.Name
		if macroParts[name] {
			// Macro payload: leave it out of the output entirely.
			continue
		}

		payload, err := readZipFile(part)
		if err != nil {
			return nil, fmt.Errorf("dotm→docx: read %s: %w", name, err)
		}

		// Only the two manifests need their references to the dropped
		// parts scrubbed; everything else is copied as read.
		if name == contentTypesPath {
			payload = rewriteContentTypes(payload)
		} else if name == documentRelsPath {
			payload = rewriteDocumentRels(payload)
		}

		header := zip.FileHeader{
			Name:     name,
			Method:   part.Method,
			Modified: part.Modified,
		}
		sink, err := dst.CreateHeader(&header)
		if err != nil {
			return nil, fmt.Errorf("dotm→docx: write header %s: %w", name, err)
		}
		if _, err = sink.Write(payload); err != nil {
			return nil, fmt.Errorf("dotm→docx: write body %s: %w", name, err)
		}
	}

	// Close flushes the central directory; the buffer is not a complete
	// zip until it succeeds.
	if err = dst.Close(); err != nil {
		return nil, fmt.Errorf("dotm→docx: finalise zip: %w", err)
	}
	return result.Bytes(), nil
}
// rewriteContentTypes demotes any of the three non-docx "main" content
// types to plain docx, drops the bin Default-Extension entry, and
// drops every Override that targeted a dropped macro part.
//
// String-level substitution rather than encoding/xml: round-tripping
// through Go's XML marshaller would re-emit the document with
// canonical namespace declarations on every child, which Word reads
// but which makes the binary diff unnecessarily large. Direct
// substitution preserves the file's original shape.
func rewriteContentTypes(body []byte) []byte {
body = bytes.ReplaceAll(body, []byte(dotmMainContentType), []byte(docxMainContentType))
body = bytes.ReplaceAll(body, []byte(docmMainContentType), []byte(docxMainContentType))
body = bytes.ReplaceAll(body, []byte(dotxMainContentType), []byte(docxMainContentType))
body = vbaDefaultExtensionRegex.ReplaceAll(body, nil)
body = macroOverridePartRegex.ReplaceAll(body, nil)
return body
}
// rewriteDocumentRels cleans the body of word/_rels/document.xml.rels
// by deleting every element matched by macroRelTypeRegex — the
// vbaProject and keyMapCustomizations relationships — so the manifest
// stops pointing at parts the output zip no longer contains. All other
// relationships (styles, settings, numbering, theme, headers/footers,
// customXml) are returned untouched.
func rewriteDocumentRels(body []byte) []byte {
	cleaned := macroRelTypeRegex.ReplaceAll(body, nil)
	return cleaned
}
// readZipFile opens the given zip entry and returns its full
// decompressed content. The entry's reader is closed before returning.
// An error from opening the entry yields a nil slice; an error while
// reading is returned alongside whatever bytes were read so far.
func readZipFile(f *zip.File) ([]byte, error) {
	src, openErr := f.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer src.Close()

	data, readErr := io.ReadAll(src)
	return data, readErr
}
// SanitiseSubmissionFileName cleans a string for use inside a download
// filename. In order, it:
//
//   - ASCII-folds the German umlaut letters and ß (see umlautFolder) so
//     names such as "Berufungsbegründung" land cleanly on legacy SMB
//     shares whose layer is still cp1252;
//   - replaces the path separators '/' and '\' with '_';
//   - drops the quote characters '"' and '\'', which would break
//     Content-Disposition or confuse browsers across OSes;
//   - drops ASCII control characters (U+0000–U+001F and U+007F), which
//     have no place in a header value or a filename;
//   - trims leading and trailing whitespace.
//
// Trimming runs last on purpose: removing a quote or control character
// at either end can expose whitespace that an up-front trim would miss
// (`" draft "` must come out as `draft`, not ` draft `).
//
// Other Unicode is preserved so non-DE/EN names still produce a
// recognisable file. The result may be empty if nothing survives.
func SanitiseSubmissionFileName(s string) string {
	s = umlautFolder.Replace(s)
	s = strings.Map(func(r rune) rune {
		switch {
		case r == '/' || r == '\\':
			return '_'
		case r == '"' || r == '\'':
			return -1
		case r < 0x20 || r == 0x7f:
			return -1
		}
		return r
	}, s)
	return strings.TrimSpace(s)
}

// umlautFolder turns the three DE umlaut vowels (ä, ö, ü — both cases)
// into ASCII digraphs, and ß into ss.
var umlautFolder = strings.NewReplacer(
	"ä", "ae", "ö", "oe", "ü", "ue",
	"Ä", "Ae", "Ö", "Oe", "Ü", "Ue",
	"ß", "ss",
)