fix: replace pdf-parse with direct pdfjs-dist to fix DOMMatrix error in production
All checks were successful
Deploy to VPS / deploy (push) Successful in 1m10s
All checks were successful
Deploy to VPS / deploy (push) Successful in 1m10s
pdf-parse v2 depends on @napi-rs/canvas (a native module), which fails in Next.js standalone Docker builds: native binaries aren't traced/copied to the standalone output, causing a "DOMMatrix is not defined" error at runtime.

Replaced pdf-parse entirely with the pdfjs-dist legacy build, which works natively in Node.js without canvas or DOM API dependencies:
- New src/lib/pdf.ts: extractTextFromPdf() using pdfjs-dist/legacy/build
- Worker file explicitly imported so the Next.js file tracer includes it
- Updated all call sites: documents, norms/parse, contracts
- Removed pdf-parse from dependencies, added pdfjs-dist directly
- Changed serverExternalPackages from pdf-parse to pdfjs-dist

Verified: build succeeds, both pdf.mjs and pdf.worker.mjs are present in .next/standalone, and text extraction works in the standalone context.

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -11,7 +11,7 @@ const commitHash = (() => {
|
||||
|
||||
const nextConfig: NextConfig = {
|
||||
output: "standalone",
|
||||
serverExternalPackages: ["pdf-parse", "drizzle-orm", "pg"],
|
||||
serverExternalPackages: ["pdfjs-dist", "drizzle-orm", "pg"],
|
||||
env: {
|
||||
NEXT_PUBLIC_BUILD_HASH: commitHash,
|
||||
},
|
||||
|
||||
23
package-lock.json
generated
23
package-lock.json
generated
@@ -17,7 +17,7 @@
|
||||
"mammoth": "^1.12.0",
|
||||
"next": "16.2.3",
|
||||
"next-auth": "^4.24.13",
|
||||
"pdf-parse": "^2.4.5",
|
||||
"pdfjs-dist": "^5.4.296",
|
||||
"pg": "^8.20.0",
|
||||
"react": "19.2.4",
|
||||
"react-dom": "19.2.4"
|
||||
@@ -2058,6 +2058,7 @@
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.80.tgz",
|
||||
"integrity": "sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"workspaces": [
|
||||
"e2e/*"
|
||||
],
|
||||
@@ -7148,26 +7149,6 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/pdf-parse": {
|
||||
"version": "2.4.5",
|
||||
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.4.5.tgz",
|
||||
"integrity": "sha512-mHU89HGh7v+4u2ubfnevJ03lmPgQ5WU4CxAVmTSh/sxVTEDYd1er/dKS/A6vg77NX47KTEoihq8jZBLr8Cxuwg==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@napi-rs/canvas": "0.1.80",
|
||||
"pdfjs-dist": "5.4.296"
|
||||
},
|
||||
"bin": {
|
||||
"pdf-parse": "bin/cli.mjs"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20.16.0 <21 || >=22.3.0"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/mehmet-kozan"
|
||||
}
|
||||
},
|
||||
"node_modules/pdfjs-dist": {
|
||||
"version": "5.4.296",
|
||||
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz",
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
"mammoth": "^1.12.0",
|
||||
"next": "16.2.3",
|
||||
"next-auth": "^4.24.13",
|
||||
"pdf-parse": "^2.4.5",
|
||||
"pdfjs-dist": "^5.4.296",
|
||||
"pg": "^8.20.0",
|
||||
"react": "19.2.4",
|
||||
"react-dom": "19.2.4"
|
||||
|
||||
@@ -39,12 +39,9 @@ Antworte NUR mit einem JSON-Array. Kein erklaerener Text, kein Markdown, nur das
|
||||
}
|
||||
]`;
|
||||
|
||||
async function extractTextFromPdf(buffer: Buffer): Promise<string> {
|
||||
const { PDFParse } = await import('pdf-parse');
|
||||
const parser = new PDFParse({ data: buffer });
|
||||
const data = await parser.getText();
|
||||
await parser.destroy();
|
||||
return data.text;
|
||||
async function extractTextFromPdfBuffer(buffer: Buffer): Promise<string> {
|
||||
const { extractTextFromPdf } = await import('@/lib/pdf');
|
||||
return extractTextFromPdf(buffer);
|
||||
}
|
||||
|
||||
const CHUNK_CHAR_LIMIT = 10_000;
|
||||
@@ -100,7 +97,7 @@ export async function POST(request: Request) {
|
||||
|
||||
if (file.type === 'application/pdf' || file.name.endsWith('.pdf')) {
|
||||
try {
|
||||
text = await extractTextFromPdf(buffer);
|
||||
text = await extractTextFromPdfBuffer(buffer);
|
||||
} catch (err) {
|
||||
console.error('PDF parse error:', err);
|
||||
return Response.json(
|
||||
|
||||
@@ -137,11 +137,8 @@ export async function extractDocumentText(tenantId: string, documentId: string):
|
||||
let text: string;
|
||||
|
||||
if (doc.mimeType === 'application/pdf') {
|
||||
const { PDFParse } = await import('pdf-parse');
|
||||
const parser = new PDFParse({ data: fileBuffer });
|
||||
const pdfData = await parser.getText();
|
||||
await parser.destroy();
|
||||
text = pdfData.text;
|
||||
const { extractTextFromPdf } = await import('@/lib/pdf');
|
||||
text = await extractTextFromPdf(fileBuffer);
|
||||
} else {
|
||||
// DOCX — use mammoth for extraction
|
||||
const mammoth = await import('mammoth');
|
||||
|
||||
@@ -133,12 +133,9 @@ export async function extractDocumentText(tenantId: string, documentId: string):
|
||||
let text: string;
|
||||
|
||||
if (doc.mimeType === 'application/pdf') {
|
||||
const { PDFParse } = await import('pdf-parse');
|
||||
let pdfData;
|
||||
const { extractTextFromPdf } = await import('@/lib/pdf');
|
||||
try {
|
||||
const parser = new PDFParse({ data: fileBuffer });
|
||||
pdfData = await parser.getText();
|
||||
await parser.destroy();
|
||||
text = await extractTextFromPdf(fileBuffer);
|
||||
} catch (pdfErr) {
|
||||
const pdfMessage = pdfErr instanceof Error ? pdfErr.message : String(pdfErr);
|
||||
if (pdfMessage.includes('encrypted') || pdfMessage.includes('password')) {
|
||||
@@ -146,7 +143,6 @@ export async function extractDocumentText(tenantId: string, documentId: string):
|
||||
}
|
||||
throw new Error(`PDF konnte nicht gelesen werden: ${pdfMessage}`);
|
||||
}
|
||||
text = pdfData.text;
|
||||
|
||||
// Detect scanned PDFs with no text layer
|
||||
if (!text || text.trim().length === 0) {
|
||||
|
||||
40
src/lib/pdf.ts
Normal file
40
src/lib/pdf.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
// PDF text extraction using pdfjs-dist legacy build (Node.js compatible, no canvas/DOMMatrix)
|
||||
|
||||
// Force Next.js file tracer to include the worker file in standalone builds
|
||||
import 'pdfjs-dist/legacy/build/pdf.worker.mjs';
|
||||
|
||||
/**
 * Extract all text from a PDF buffer.
 *
 * Uses the pdfjs-dist legacy build, which works in Node.js without canvas or
 * DOM APIs. Text items of a page are joined with a single space and pages are
 * joined with a newline; line breaks inside a page are not reconstructed.
 *
 * @param buffer - Raw bytes of the PDF file.
 * @returns The concatenated text of all pages (may be empty, e.g. when the
 *   PDF has no text layer).
 * @throws Re-throws whatever pdfjs-dist rejects with when the document cannot
 *   be opened or a page cannot be read. The loaded document is released
 *   before the error propagates.
 */
export async function extractTextFromPdf(buffer: Buffer): Promise<string> {
  const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');

  // Resolve the worker path at runtime so pdfjs can find it in standalone builds.
  const { createRequire } = await import('module');
  const require = createRequire(import.meta.url ?? __filename);
  pdfjsLib.GlobalWorkerOptions.workerSrc = require.resolve(
    'pdfjs-dist/legacy/build/pdf.worker.mjs',
  );

  // Hand pdfjs its own Uint8Array copy; the caller's Buffer is left untouched.
  const data = new Uint8Array(buffer);
  const doc = await pdfjsLib.getDocument({
    data,
    useSystemFonts: true,
    isEvalSupported: false,
  }).promise;

  try {
    const pages: string[] = [];

    for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
      const page = await doc.getPage(pageNumber);
      const content = await page.getTextContent();

      // Marked-content entries carry no `str`; only real text items are kept.
      const parts: string[] = [];
      for (const item of content.items) {
        if ('str' in item) {
          parts.push(item.str);
        }
      }
      pages.push(parts.join(' '));
    }

    return pages.join('\n');
  } finally {
    // Always release the document, even when a page fails to load or parse.
    // Awaited because destroy() is asynchronous in upstream pdfjs-dist
    // (NOTE(review): the local type shim declares it as `void` — confirm).
    await doc.destroy();
  }
}
|
||||
52
src/types/pdf-parse.d.ts
vendored
52
src/types/pdf-parse.d.ts
vendored
@@ -1,19 +1,45 @@
|
||||
declare module 'pdf-parse' {
|
||||
interface TextResult {
|
||||
text: string;
|
||||
total: number;
|
||||
pages: Array<{ page: number; text: string }>;
|
||||
// Worker module — imported for side-effect (file tracer) only
|
||||
declare module 'pdfjs-dist/legacy/build/pdf.worker.mjs' {}
|
||||
|
||||
// pdfjs-dist legacy build type shim for dynamic import
|
||||
declare module 'pdfjs-dist/legacy/build/pdf.mjs' {
|
||||
export const GlobalWorkerOptions: {
|
||||
workerSrc: string;
|
||||
};
|
||||
|
||||
export function getDocument(params: {
|
||||
data: Uint8Array;
|
||||
useSystemFonts?: boolean;
|
||||
isEvalSupported?: boolean;
|
||||
disableAutoFetch?: boolean;
|
||||
}): { promise: Promise<PDFDocumentProxy> };
|
||||
|
||||
interface PDFDocumentProxy {
|
||||
numPages: number;
|
||||
getPage(pageNumber: number): Promise<PDFPageProxy>;
|
||||
destroy(): void;
|
||||
}
|
||||
|
||||
interface PDFParseOptions {
|
||||
data?: Buffer | ArrayBuffer | Uint8Array;
|
||||
url?: string;
|
||||
interface PDFPageProxy {
|
||||
getTextContent(): Promise<TextContent>;
|
||||
}
|
||||
|
||||
export class PDFParse {
|
||||
constructor(options: PDFParseOptions);
|
||||
getText(options?: { partial?: number[] }): Promise<TextResult>;
|
||||
getInfo(options?: { parsePageInfo?: boolean }): Promise<Record<string, unknown>>;
|
||||
destroy(): Promise<void>;
|
||||
interface TextContent {
|
||||
items: Array<TextItem | TextMarkedContent>;
|
||||
}
|
||||
|
||||
interface TextItem {
|
||||
str: string;
|
||||
dir: string;
|
||||
width: number;
|
||||
height: number;
|
||||
transform: number[];
|
||||
fontName: string;
|
||||
hasEOL: boolean;
|
||||
}
|
||||
|
||||
interface TextMarkedContent {
|
||||
type: string;
|
||||
id: string;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user