From 94b89cb1e279a783202db5139693c60159404a3b Mon Sep 17 00:00:00 2001 From: CTO Date: Fri, 10 Apr 2026 19:43:28 +0000 Subject: [PATCH] fix: improve document ingestion robustness and add progress/debug UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix PDF extraction: detect scanned documents (no text layer), encrypted PDFs, empty files, and missing files with clear German error messages - Add error logging to extraction pipeline (was silently swallowed) - Return errorMessage in document list API so UI can display failure reasons - Add GET /api/documents/[id] endpoint for status polling - Rewrite DokumentUpload component with: - Auto-polling every 2s while documents are processing - Visual step-by-step progress indicator (Hochgeladen → Extrahiere Text → Fertig) - Error message display when extraction fails - Debug toggle showing document ID, MIME type, size, timestamps Co-Authored-By: Paperclip --- src/app/api/documents/[id]/route.ts | 34 ++++ src/app/api/documents/route.ts | 4 +- src/components/documents/dokument-upload.tsx | 159 ++++++++++++++++--- src/lib/documents/index.ts | 36 ++++- 4 files changed, 208 insertions(+), 25 deletions(-) create mode 100644 src/app/api/documents/[id]/route.ts diff --git a/src/app/api/documents/[id]/route.ts b/src/app/api/documents/[id]/route.ts new file mode 100644 index 0000000..5108e92 --- /dev/null +++ b/src/app/api/documents/[id]/route.ts @@ -0,0 +1,34 @@ +// GET /api/documents/:id — get document status and metadata (used for polling) + +import { type NextRequest } from 'next/server'; +import { getDocument } from '@/lib/documents'; +import { requirePermission } from '@/lib/auth/rbac'; + +export async function GET( + request: NextRequest, + { params }: { params: Promise<{ id: string }> }, +) { + const auth = await requirePermission('cases:read'); + if ('response' in auth) return auth.response; + const { ctx } = auth; + + const { id } = await params; + + const doc = await getDocument(ctx.tenantId, id); + + if (!doc) { + return Response.json({ error: 'Dokument nicht gefunden.' }, { status: 404 }); + } + + return Response.json({ + id: doc.id, + filename: doc.filename, + mimeType: doc.mimeType, + fileSizeBytes: doc.fileSizeBytes, + category: doc.category, + status: doc.status, + errorMessage: doc.errorMessage, + createdAt: doc.createdAt, + updatedAt: doc.updatedAt, + }); +} diff --git a/src/app/api/documents/route.ts b/src/app/api/documents/route.ts index 7010d70..389bf10 100644 --- a/src/app/api/documents/route.ts +++ b/src/app/api/documents/route.ts @@ -61,8 +61,8 @@ export async function POST(request: NextRequest) { ); // Trigger text extraction asynchronously (fire-and-forget) - extractDocumentText(ctx.tenantId, result.documentId).catch(() => { - // Extraction errors are stored in the document record + extractDocumentText(ctx.tenantId, result.documentId).catch((err) => { + console.error(`[documents] Text extraction failed for ${result.documentId}:`, err); }); return Response.json(result, { status: 201 }); diff --git a/src/components/documents/dokument-upload.tsx b/src/components/documents/dokument-upload.tsx index 44299f5..e304620 100644 --- a/src/components/documents/dokument-upload.tsx +++ b/src/components/documents/dokument-upload.tsx @@ -18,12 +18,13 @@ interface DocumentItem { mimeType: string; fileSizeBytes: number; status: string; + errorMessage: string | null; createdAt: string; } const STATUS_LABELS: Record = { uploaded: 'Hochgeladen', - extracting: 'Text wird extrahiert...', + extracting: 'Extrahiere Text...', extracted: 'Extrahiert', failed: 'Fehlgeschlagen', }; @@ -35,12 +36,83 @@ const STATUS_COLORS: Record = { failed: 'bg-red-500/10 text-red-700', }; +const STEP_LABELS = [ + { key: 'uploaded', label: 'Hochgeladen' }, + { key: 'extracting', label: 'Extrahiere Text' }, + { key: 'extracted', label: 'Fertig' }, +]; + function formatFileSize(bytes: number): string { if (bytes < 1024) return `${bytes} B`; if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(0)} KB`; return `${(bytes / 1024 / 1024).toFixed(1)} MB`; } +function getStepIndex(status: string): number { + if (status === 'uploaded') return 0; + if (status === 'extracting') return 1; + if (status === 'extracted') return 2; + return -1; // failed +} + +function IngestionProgress({ doc, debug }: { doc: DocumentItem; debug: boolean }) { + const stepIdx = getStepIndex(doc.status); + const isFailed = doc.status === 'failed'; + + return ( +
+ {/* Step indicators */} +
+ {STEP_LABELS.map((step, i) => { + const isActive = i === stepIdx; + const isComplete = i < stepIdx; + const isCurrent = isActive && !isFailed; + + let dotClass = 'w-2.5 h-2.5 rounded-full shrink-0 transition-colors'; + if (isComplete) dotClass += ' bg-green-500'; + else if (isCurrent) dotClass += ' bg-yellow-500 animate-pulse'; + else if (isFailed && i === 1) dotClass += ' bg-red-500'; + else dotClass += ' bg-gray-300'; + + let lineClass = 'flex-1 h-0.5 transition-colors'; + if (isComplete) lineClass += ' bg-green-500'; + else lineClass += ' bg-gray-200'; + + return ( +
+
+ + {step.label} + + {i < STEP_LABELS.length - 1 &&
} +
+ ); + })} +
+ + {/* Error display */} + {isFailed && doc.errorMessage && ( +
+ Fehler: + {doc.errorMessage} +
+ )} + + {/* Debug info */} + {debug && ( +
+
ID: {doc.id}
+
Status: {doc.status}
+
MIME: {doc.mimeType}
+
Groesse: {formatFileSize(doc.fileSizeBytes)}
+
Hochgeladen: {new Date(doc.createdAt).toLocaleString('de-DE')}
+ {doc.errorMessage &&
Fehler: {doc.errorMessage}
} +
+ )} +
+ ); +} + export default function DokumentUpload({ category, caseId, @@ -53,7 +125,9 @@ export default function DokumentUpload({ const [success, setSuccess] = useState(''); const [documents, setDocuments] = useState([]); const [dragging, setDragging] = useState(false); + const [debug, setDebug] = useState(false); const fileRef = useRef(null); + const pollRef = useRef | null>(null); const fetchDocuments = useCallback(async () => { const params = new URLSearchParams({ category }); @@ -72,6 +146,27 @@ export default function DokumentUpload({ } }, [category, caseId, decisionId, normInstrumentId]); + // Determine if any documents need polling (in-progress states) + const hasPending = documents.some( + (d) => d.status === 'uploaded' || d.status === 'extracting', + ); + + // Poll for status updates when documents are being processed + useEffect(() => { + if (hasPending) { + pollRef.current = setInterval(fetchDocuments, 2000); + } else if (pollRef.current) { + clearInterval(pollRef.current); + pollRef.current = null; + } + return () => { + if (pollRef.current) { + clearInterval(pollRef.current); + pollRef.current = null; + } + }; + }, [hasPending, fetchDocuments]); + useEffect(() => { fetchDocuments(); }, [fetchDocuments]); @@ -99,7 +194,7 @@ export default function DokumentUpload({ throw new Error(data.error || 'Upload fehlgeschlagen'); } - setSuccess(`"${file.name}" erfolgreich hochgeladen.`); + setSuccess(`"${file.name}" erfolgreich hochgeladen. Textextraktion laeuft...`); if (fileRef.current) fileRef.current.value = ''; fetchDocuments(); } catch (err) { @@ -171,31 +266,51 @@ export default function DokumentUpload({ {documents.length > 0 && (
-

- Dokumente ({documents.length}) -

-
+
+

+ Dokumente ({documents.length}) +

+ +
+
{documents.map((doc) => (
-
-

- {doc.filename} -

-

- {formatFileSize(doc.fileSizeBytes)} ·{' '} - {new Date(doc.createdAt).toLocaleDateString('de-DE')} -

+
+
+

+ {doc.filename} +

+

+ {formatFileSize(doc.fileSizeBytes)} ·{' '} + {new Date(doc.createdAt).toLocaleDateString('de-DE')} +

+
+ + {STATUS_LABELS[doc.status] ?? doc.status} +
- - {STATUS_LABELS[doc.status] ?? doc.status} - + + {/* Show progress for non-extracted documents or if debug is on */} + {(doc.status !== 'extracted' || debug) && ( + + )}
))}
diff --git a/src/lib/documents/index.ts b/src/lib/documents/index.ts index aee2e3e..878e560 100644 --- a/src/lib/documents/index.ts +++ b/src/lib/documents/index.ts @@ -116,18 +116,50 @@ export async function extractDocumentText(tenantId: string, documentId: string): try { const fs = await import('node:fs/promises'); + + // Verify file exists before attempting extraction + try { + await fs.access(doc.storagePath); + } catch { + throw new Error(`Datei nicht gefunden: ${doc.storagePath}`); + } + const fileBuffer = await fs.readFile(doc.storagePath); + if (fileBuffer.length === 0) { + throw new Error('Datei ist leer (0 Bytes).'); + } + let text: string; if (doc.mimeType === 'application/pdf') { const pdfParse = (await import('pdf-parse')).default; - const pdfData = await pdfParse(fileBuffer); + let pdfData; + try { + pdfData = await pdfParse(fileBuffer); + } catch (pdfErr) { + const pdfMessage = pdfErr instanceof Error ? pdfErr.message : String(pdfErr); + if (pdfMessage.includes('encrypted') || pdfMessage.includes('password')) { + throw new Error('PDF ist passwortgeschuetzt oder verschluesselt. Bitte ungeschuetzte Version hochladen.'); + } + throw new Error(`PDF konnte nicht gelesen werden: ${pdfMessage}`); + } text = pdfData.text; + + // Detect scanned PDFs with no text layer + if (!text || text.trim().length === 0) { + throw new Error( + 'PDF enthaelt keinen extrahierbaren Text. Moeglicherweise handelt es sich um ein gescanntes Dokument ohne Textebene (OCR erforderlich).', + ); + } } else { const mammoth = await import('mammoth'); const result = await mammoth.extractRawText({ buffer: fileBuffer }); text = result.value; + + if (!text || text.trim().length === 0) { + throw new Error('DOCX enthaelt keinen extrahierbaren Text.'); + } } await withTenantDb(tenantId, async (tdb) => { @@ -144,6 +176,7 @@ export async function extractDocumentText(tenantId: string, documentId: string): return text; } catch (err) { const message = err instanceof Error ? err.message : 'Textextraktion fehlgeschlagen'; + console.error(`[extractDocumentText] Document ${documentId} failed:`, message); await withTenantDb(tenantId, async (tdb) => { await tdb .update(documents) @@ -196,6 +229,7 @@ export async function listDocuments( fileSizeBytes: documents.fileSizeBytes, category: documents.category, status: documents.status, + errorMessage: documents.errorMessage, caseId: documents.caseId, decisionId: documents.decisionId, normInstrumentId: documents.normInstrumentId,