All checks were successful
Deploy to VPS / deploy (push) Successful in 1m10s
pdf-parse v2 depends on @napi-rs/canvas (a native module), which fails in Next.js standalone Docker builds: native binaries aren't traced/copied to the standalone output, causing a "DOMMatrix is not defined" error at runtime.

Replaced pdf-parse entirely with the pdfjs-dist legacy build, which works natively in Node.js without canvas or DOM API dependencies:
- New src/lib/pdf.ts: extractTextFromPdf() using pdfjs-dist/legacy/build
- Worker file explicitly imported so the Next.js file tracer includes it
- Updated all call sites: documents, norms/parse, contracts
- Removed pdf-parse from dependencies, added pdfjs-dist directly
- Changed serverExternalPackages from pdf-parse to pdfjs-dist

Verified: build succeeds, both pdf.mjs and pdf.worker.mjs are present in .next/standalone, and text extraction works in the standalone context.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
41 lines
1.3 KiB
TypeScript
41 lines
1.3 KiB
TypeScript
// PDF text extraction using pdfjs-dist legacy build (Node.js compatible, no canvas/DOMMatrix)
|
|
|
|
// Force Next.js file tracer to include the worker file in standalone builds
|
|
import 'pdfjs-dist/legacy/build/pdf.worker.mjs';
|
|
|
|
/**
 * Extract all text from a PDF buffer.
 *
 * Uses the pdfjs-dist legacy build, which works in Node.js without canvas or
 * DOM APIs. Text items on a page are joined with a single space; pages are
 * joined with a newline.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The concatenated text of every page, in page order.
 * @throws Whatever pdfjs-dist rejects with when the document cannot be loaded
 *   or a page cannot be read. The loading task is destroyed before the error
 *   propagates.
 */
export async function extractTextFromPdf(buffer: Buffer): Promise<string> {
  const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');

  // Resolve the worker path at runtime so pdfjs can find it in standalone builds.
  // Named `localRequire` to avoid shadowing the CommonJS `require` global.
  const { createRequire } = await import('module');
  const localRequire = createRequire(import.meta.url ?? __filename);
  pdfjsLib.GlobalWorkerOptions.workerSrc = localRequire.resolve(
    'pdfjs-dist/legacy/build/pdf.worker.mjs',
  );

  // Hand pdfjs a plain Uint8Array copy rather than the Node Buffer itself.
  const data = new Uint8Array(buffer);
  const loadingTask = pdfjsLib.getDocument({
    data,
    useSystemFonts: true,
    isEvalSupported: false,
  });

  try {
    const doc = await loadingTask.promise;

    const pages: string[] = [];
    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const content = await page.getTextContent();

      // `items` also contains marked-content entries that carry no `str`;
      // the `in` check narrows to real text items without a type assertion.
      const strings: string[] = [];
      for (const item of content.items) {
        if ('str' in item) {
          strings.push(item.str);
        }
      }
      pages.push(strings.join(' '));
    }

    return pages.join('\n');
  } finally {
    // Always release the document and worker resources, including when loading
    // or a page read fails; await so cleanup errors are not left unhandled.
    await loadingTask.destroy();
  }
}
|