diff --git a/next.config.ts b/next.config.ts index 2068f1f..eaf1744 100644 --- a/next.config.ts +++ b/next.config.ts @@ -11,7 +11,7 @@ const commitHash = (() => { const nextConfig: NextConfig = { output: "standalone", - serverExternalPackages: ["pdf-parse", "drizzle-orm", "pg"], + serverExternalPackages: ["pdfjs-dist", "drizzle-orm", "pg"], env: { NEXT_PUBLIC_BUILD_HASH: commitHash, }, diff --git a/package-lock.json b/package-lock.json index 6dc4a4b..a6a39c8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "mammoth": "^1.12.0", "next": "16.2.3", "next-auth": "^4.24.13", - "pdf-parse": "^2.4.5", + "pdfjs-dist": "^5.4.296", "pg": "^8.20.0", "react": "19.2.4", "react-dom": "19.2.4" @@ -2058,6 +2058,7 @@ "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.80.tgz", "integrity": "sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==", "license": "MIT", + "optional": true, "workspaces": [ "e2e/*" ], @@ -7148,26 +7149,6 @@ "dev": true, "license": "MIT" }, - "node_modules/pdf-parse": { - "version": "2.4.5", - "resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.4.5.tgz", - "integrity": "sha512-mHU89HGh7v+4u2ubfnevJ03lmPgQ5WU4CxAVmTSh/sxVTEDYd1er/dKS/A6vg77NX47KTEoihq8jZBLr8Cxuwg==", - "license": "Apache-2.0", - "dependencies": { - "@napi-rs/canvas": "0.1.80", - "pdfjs-dist": "5.4.296" - }, - "bin": { - "pdf-parse": "bin/cli.mjs" - }, - "engines": { - "node": ">=20.16.0 <21 || >=22.3.0" - }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/mehmet-kozan" - } - }, "node_modules/pdfjs-dist": { "version": "5.4.296", "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz", diff --git a/package.json b/package.json index fe8b6bf..bc1debe 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,7 @@ "mammoth": "^1.12.0", "next": "16.2.3", "next-auth": "^4.24.13", - "pdf-parse": "^2.4.5", + "pdfjs-dist": "^5.4.296", "pg": "^8.20.0", "react": "19.2.4", "react-dom": "19.2.4" diff --git a/src/app/api/norms/parse/route.ts b/src/app/api/norms/parse/route.ts index f53a539..d308133 100644 --- a/src/app/api/norms/parse/route.ts +++ b/src/app/api/norms/parse/route.ts @@ -39,12 +39,9 @@ Antworte NUR mit einem JSON-Array. Kein erklaerener Text, kein Markdown, nur das } ]`; -async function extractTextFromPdf(buffer: Buffer): Promise { - const { PDFParse } = await import('pdf-parse'); - const parser = new PDFParse({ data: buffer }); - const data = await parser.getText(); - await parser.destroy(); - return data.text; +async function extractTextFromPdfBuffer(buffer: Buffer): Promise { + const { extractTextFromPdf } = await import('@/lib/pdf'); + return extractTextFromPdf(buffer); } const CHUNK_CHAR_LIMIT = 10_000; @@ -100,7 +97,7 @@ export async function POST(request: Request) { if (file.type === 'application/pdf' || file.name.endsWith('.pdf')) { try { - text = await extractTextFromPdf(buffer); + text = await extractTextFromPdfBuffer(buffer); } catch (err) { console.error('PDF parse error:', err); return Response.json( diff --git a/src/lib/contracts/index.ts b/src/lib/contracts/index.ts index 2b075eb..81cf91b 100644 --- a/src/lib/contracts/index.ts +++ b/src/lib/contracts/index.ts @@ -137,11 +137,8 @@ export async function extractDocumentText(tenantId: string, documentId: string): let text: string; if (doc.mimeType === 'application/pdf') { - const { PDFParse } = await import('pdf-parse'); - const parser = new PDFParse({ data: fileBuffer }); - const pdfData = await parser.getText(); - await parser.destroy(); - text = pdfData.text; + const { extractTextFromPdf } = await import('@/lib/pdf'); + text = await extractTextFromPdf(fileBuffer); } else { // DOCX — use mammoth for extraction const mammoth = await import('mammoth'); diff --git a/src/lib/documents/index.ts b/src/lib/documents/index.ts index 7a494f5..87ea7cd 100644 --- a/src/lib/documents/index.ts +++ b/src/lib/documents/index.ts @@ -133,12 +133,9 @@ export async function extractDocumentText(tenantId: string, documentId: string): let text: string; if (doc.mimeType === 'application/pdf') { - const { PDFParse } = await import('pdf-parse'); - let pdfData; + const { extractTextFromPdf } = await import('@/lib/pdf'); try { - const parser = new PDFParse({ data: fileBuffer }); - pdfData = await parser.getText(); - await parser.destroy(); + text = await extractTextFromPdf(fileBuffer); } catch (pdfErr) { const pdfMessage = pdfErr instanceof Error ? pdfErr.message : String(pdfErr); if (pdfMessage.includes('encrypted') || pdfMessage.includes('password')) { @@ -146,7 +143,6 @@ export async function extractDocumentText(tenantId: string, documentId: string): } throw new Error(`PDF konnte nicht gelesen werden: ${pdfMessage}`); } - text = pdfData.text; // Detect scanned PDFs with no text layer if (!text || text.trim().length === 0) { diff --git a/src/lib/pdf.ts b/src/lib/pdf.ts new file mode 100644 index 0000000..8111db1 --- /dev/null +++ b/src/lib/pdf.ts @@ -0,0 +1,40 @@ +// PDF text extraction using pdfjs-dist legacy build (Node.js compatible, no canvas/DOMMatrix) + +// Force Next.js file tracer to include the worker file in standalone builds +import 'pdfjs-dist/legacy/build/pdf.worker.mjs'; + +/** + * Extract all text from a PDF buffer. + * Uses pdfjs-dist legacy build which works in Node.js without canvas or DOM APIs. + */ +export async function extractTextFromPdf(buffer: Buffer): Promise { + const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs'); + + // Resolve the worker path at runtime so pdfjs can find it in standalone builds + const { createRequire } = await import('module'); + const require = createRequire(import.meta.url ?? __filename); + pdfjsLib.GlobalWorkerOptions.workerSrc = require.resolve( + 'pdfjs-dist/legacy/build/pdf.worker.mjs', + ); + + const data = new Uint8Array(buffer); + const doc = await pdfjsLib.getDocument({ + data, + useSystemFonts: true, + isEvalSupported: false, + }).promise; + + const pages: string[] = []; + for (let i = 1; i <= doc.numPages; i++) { + const page = await doc.getPage(i); + const content = await page.getTextContent(); + const pageText = content.items + .filter((item) => 'str' in item) + .map((item) => (item as { str: string }).str) + .join(' '); + pages.push(pageText); + } + + doc.destroy(); + return pages.join('\n'); +} diff --git a/src/types/pdf-parse.d.ts b/src/types/pdf-parse.d.ts index 12553c1..24a5671 100644 --- a/src/types/pdf-parse.d.ts +++ b/src/types/pdf-parse.d.ts @@ -1,19 +1,45 @@ -declare module 'pdf-parse' { - interface TextResult { - text: string; - total: number; - pages: Array<{ page: number; text: string }>; +// Worker module — imported for side-effect (file tracer) only +declare module 'pdfjs-dist/legacy/build/pdf.worker.mjs' {} + +// pdfjs-dist legacy build type shim for dynamic import +declare module 'pdfjs-dist/legacy/build/pdf.mjs' { + export const GlobalWorkerOptions: { + workerSrc: string; + }; + + export function getDocument(params: { + data: Uint8Array; + useSystemFonts?: boolean; + isEvalSupported?: boolean; + disableAutoFetch?: boolean; + }): { promise: Promise }; + + interface PDFDocumentProxy { + numPages: number; + getPage(pageNumber: number): Promise; + destroy(): void; } - interface PDFParseOptions { - data?: Buffer | ArrayBuffer | Uint8Array; - url?: string; + interface PDFPageProxy { + getTextContent(): Promise; } - export class PDFParse { - constructor(options: PDFParseOptions); - getText(options?: { partial?: number[] }): Promise; - getInfo(options?: { parsePageInfo?: boolean }): Promise>; - destroy(): Promise; + interface TextContent { + items: Array; + } + + interface TextItem { + str: string; + dir: string; + width: number; + height: number; + transform: number[]; + fontName: string; + hasEOL: boolean; + } + + interface TextMarkedContent { + type: string; + id: string; } }