package services // Submission .dotm → .docx converter (t-paliad-230, "format-only" scope // reduction of the original t-paliad-215 submission generator). // // Word .dotm (macro-enabled template), .docm (macro-enabled document), // .dotx (template, no macros), and .docx (document, no macros) are all // OOXML zip containers. The macro-bearing variants carry an extra set // of parts: // // word/vbaProject.bin — the VBA project binary // word/_rels/vbaProject.bin.rels — auxiliary relationships // word/vbaData.xml — VBA support data // word/customizations.xml — keyMapCustomizations // // plus a Content-Types override for each of those, a Default extension // declaring all .bin files as vbaProject, and a different "main" content // type for word/document.xml itself. // // ConvertDotmToDocx walks the zip, drops the macro parts, rewrites // [Content_Types].xml and word/_rels/document.xml.rels to remove every // reference to them, and switches the main document content type to // the plain .docx form. Every other part — styles, fonts, theme, // settings, document body, header/footer/numbering, glossary, custom // XML — passes through bit-for-bit at the original compression method // and modification time. // // No variable substitution. Today's slice hands the lawyer the firm // style template as a clean .docx so they can edit and save under // their own filename. The merge-engine slice is deferred. import ( "archive/zip" "bytes" "fmt" "io" "regexp" "strings" ) // The four OOXML "main" content types we may see on word/document.xml. // Anything other than docxMainContentType gets rewritten so the output // reads as a plain document. 
const (
	dotmMainContentType = "application/vnd.ms-word.template.macroEnabledTemplate.main+xml"
	docmMainContentType = "application/vnd.ms-word.document.macroEnabled.main+xml"
	dotxMainContentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.template.main+xml"
	docxMainContentType = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"
)

// macroParts lists, by zip entry name, the macro-related parts that are
// dropped wholesale from the output zip.
var macroParts = map[string]bool{
	"word/vbaProject.bin":            true,
	"word/_rels/vbaProject.bin.rels": true,
	"word/vbaData.xml":               true,
	"word/customizations.xml":        true,
}

// Zip entry names of the two manifests that are rewritten rather than
// copied verbatim.
const (
	contentTypesPath = "[Content_Types].xml"
	documentRelsPath = "word/_rels/document.xml.rels"
)

// vbaDefaultExtensionRegex matches the
// `<Default Extension="bin" ContentType="application/vnd.ms-office.vbaProject"/>`
// row in [Content_Types].xml, together with any whitespace in front of it.
// After vbaProject.bin is dropped, the Default is dead weight (and Word will
// flag the file as macro-bearing if it survives).
//
// The pattern is anchored on the opening `<Default` so the whole element is
// removed; matching from the attribute onwards would leave a dangling,
// unterminated tag behind. It expects double-quoted attributes with
// Extension preceding ContentType.
var vbaDefaultExtensionRegex = regexp.MustCompile(
	`\s*<Default\b[^>]*\bExtension\s*=\s*"bin"[^>]*\bContentType\s*=\s*"application/vnd\.ms-office\.vbaProject"[^>]*/>`,
)

// macroOverridePartRegex matches any `<Override PartName="..." .../>` element
// whose PartName is one of the dropped macro parts, together with any
// whitespace in front of it. The /word/ prefix is the OOXML convention for
// the absolute part path in [Content_Types].xml — file paths in the zip
// itself omit the leading slash.
//
// As with vbaDefaultExtensionRegex, the match starts at the opening
// `<Override` so the element is removed in full.
var macroOverridePartRegex = regexp.MustCompile(
	`\s*<Override\b[^>]*\bPartName\s*=\s*"/word/(?:vbaProject\.bin|vbaData\.xml|customizations\.xml)"[^>]*/>`,
)

// macroRelTypeRegex matches the two macro-related relationship Types
// in word/_rels/document.xml.rels: vbaProject (binds to vbaProject.bin)
// and keyMapCustomizations (binds to customizations.xml). After both
// targets are dropped, leaving the relationships in would make Word
// flag the file as corrupt.
var macroRelTypeRegex = regexp.MustCompile( `\s*]*\bType\s*=\s*"http://schemas\.microsoft\.com/office/2006/relationships/(?:vbaProject|keyMapCustomizations)"[^>]*/>`, ) // ConvertDotmToDocx rewrites a .dotm (or .docm, or .dotx) zip into a // clean .docx zip. Idempotent on a zip that is already a plain .docx. // Returns an error if the input is not a valid zip. func ConvertDotmToDocx(dotmBytes []byte) ([]byte, error) { zr, err := zip.NewReader(bytes.NewReader(dotmBytes), int64(len(dotmBytes))) if err != nil { return nil, fmt.Errorf("dotm→docx: open zip: %w", err) } var out bytes.Buffer zw := zip.NewWriter(&out) for _, entry := range zr.File { if macroParts[entry.Name] { continue } body, err := readZipFile(entry) if err != nil { return nil, fmt.Errorf("dotm→docx: read %s: %w", entry.Name, err) } switch entry.Name { case contentTypesPath: body = rewriteContentTypes(body) case documentRelsPath: body = rewriteDocumentRels(body) } w, err := zw.CreateHeader(&zip.FileHeader{ Name: entry.Name, Method: entry.Method, Modified: entry.Modified, }) if err != nil { return nil, fmt.Errorf("dotm→docx: write header %s: %w", entry.Name, err) } if _, err := w.Write(body); err != nil { return nil, fmt.Errorf("dotm→docx: write body %s: %w", entry.Name, err) } } if err := zw.Close(); err != nil { return nil, fmt.Errorf("dotm→docx: finalise zip: %w", err) } return out.Bytes(), nil } // rewriteContentTypes demotes any of the three non-docx "main" content // types to plain docx, drops the bin Default-Extension entry, and // drops every Override that targeted a dropped macro part. // // String-level substitution rather than encoding/xml: round-tripping // through Go's XML marshaller would re-emit the document with // canonical namespace declarations on every child, which Word reads // but which makes the binary diff unnecessarily large. Direct // substitution preserves the file's original shape. 
func rewriteContentTypes(body []byte) []byte { body = bytes.ReplaceAll(body, []byte(dotmMainContentType), []byte(docxMainContentType)) body = bytes.ReplaceAll(body, []byte(docmMainContentType), []byte(docxMainContentType)) body = bytes.ReplaceAll(body, []byte(dotxMainContentType), []byte(docxMainContentType)) body = vbaDefaultExtensionRegex.ReplaceAll(body, nil) body = macroOverridePartRegex.ReplaceAll(body, nil) return body } // rewriteDocumentRels drops the two macro-related relationships from // word/_rels/document.xml.rels (vbaProject + keyMapCustomizations) so // the manifest no longer points at parts the zip no longer carries. // Every other relationship — styles, settings, numbering, theme, // headers/footers, customXml — passes through untouched. func rewriteDocumentRels(body []byte) []byte { return macroRelTypeRegex.ReplaceAll(body, nil) } // readZipFile slurps a zip entry's bytes. func readZipFile(f *zip.File) ([]byte, error) { rc, err := f.Open() if err != nil { return nil, err } defer rc.Close() return io.ReadAll(rc) } // SanitiseSubmissionFileName cleans a string for use inside a download // filename — strips path separators and quote characters that would // break Content-Disposition or confuse browsers across OSes. ASCII-folds // the small set of German umlaut letters that show up in submission // names today (Klageerwiderung, Berufungsbegründung, …) so the file // lands cleanly on legacy SMB shares whose layer is still cp1252. // Other Unicode is preserved so non-DE/EN names still produce a // recognisable file. func SanitiseSubmissionFileName(s string) string { s = strings.TrimSpace(s) s = umlautFolder.Replace(s) s = strings.Map(func(r rune) rune { switch r { case '/', '\\': return '_' case '"', '\'': return -1 } return r }, s) return s } // umlautFolder turns the four DE umlaut letters (both cases) into ASCII // digraphs; ß → ss. var umlautFolder = strings.NewReplacer( "ä", "ae", "ö", "oe", "ü", "ue", "Ä", "Ae", "Ö", "Oe", "Ü", "Ue", "ß", "ss", )