fix: improve document ingestion robustness and add progress/debug UI
Some checks failed
Deploy to VPS / deploy (push) Failing after 2s

- Fix PDF extraction: detect scanned documents (no text layer), encrypted PDFs,
  empty files, and missing files with clear German error messages
- Add error logging to the extraction pipeline (errors were previously swallowed silently)
- Return errorMessage in document list API so UI can display failure reasons
- Add GET /api/documents/[id] endpoint for status polling
- Rewrite DokumentUpload component with:
  - Auto-polling every 2s while documents are processing
  - Visual step-by-step progress indicator (Hochgeladen → Extrahiere Text → Fertig)
  - Error message display when extraction fails
  - Debug toggle showing document ID, MIME type, size, timestamps

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
CTO
2026-04-10 19:43:28 +00:00
parent fe838d5916
commit 94b89cb1e2
4 changed files with 208 additions and 25 deletions

View File

@@ -0,0 +1,34 @@
// GET /api/documents/:id — get document status and metadata (used for polling)
import { type NextRequest } from 'next/server';
import { getDocument } from '@/lib/documents';
import { requirePermission } from '@/lib/auth/rbac';
/**
 * Route handler for GET /api/documents/:id.
 *
 * Requires the `cases:read` permission, looks the document up within the
 * caller's tenant, and responds with its status and metadata. Responds with
 * a 404 JSON error when `getDocument` yields nothing for the given id.
 *
 * @param request - Incoming request (not inspected by this handler).
 * @param params - Promise resolving to the dynamic route segment `{ id }`.
 * @returns The permission helper's response when access is denied, otherwise
 *          a JSON response with the document metadata or a 404 error.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ id: string }> },
) {
  const auth = await requirePermission('cases:read');
  if ('response' in auth) {
    // The permission check already produced the response to send back.
    return auth.response;
  }

  const tenantId = auth.ctx.tenantId;
  const documentId = (await params).id;
  const doc = await getDocument(tenantId, documentId);

  if (doc) {
    // Expose only the fields needed for status polling.
    return Response.json({
      id: doc.id,
      filename: doc.filename,
      mimeType: doc.mimeType,
      fileSizeBytes: doc.fileSizeBytes,
      category: doc.category,
      status: doc.status,
      errorMessage: doc.errorMessage,
      createdAt: doc.createdAt,
      updatedAt: doc.updatedAt,
    });
  }

  return Response.json({ error: 'Dokument nicht gefunden.' }, { status: 404 });
}

View File

@@ -61,8 +61,8 @@ export async function POST(request: NextRequest) {
);
// Trigger text extraction asynchronously (fire-and-forget)
extractDocumentText(ctx.tenantId, result.documentId).catch(() => {
// Extraction errors are stored in the document record
extractDocumentText(ctx.tenantId, result.documentId).catch((err) => {
console.error(`[documents] Text extraction failed for ${result.documentId}:`, err);
});
return Response.json(result, { status: 201 });

View File

@@ -18,12 +18,13 @@ interface DocumentItem {
mimeType: string;
fileSizeBytes: number;
status: string;
errorMessage: string | null;
createdAt: string;
}
const STATUS_LABELS: Record<string, string> = {
uploaded: 'Hochgeladen',
extracting: 'Text wird extrahiert...',
extracting: 'Extrahiere Text...',
extracted: 'Extrahiert',
failed: 'Fehlgeschlagen',
};
@@ -35,12 +36,83 @@ const STATUS_COLORS: Record<string, string> = {
failed: 'bg-red-500/10 text-red-700',
};
// Ordered steps shown by the ingestion progress indicator. Each `key` is a
// document status and its array position matches the index that
// getStepIndex() returns for that status. IngestionProgress also colours the
// dot at index 1 red for failed documents, so reordering these entries
// requires updating both of those places.
const STEP_LABELS = [
{ key: 'uploaded', label: 'Hochgeladen' },
{ key: 'extracting', label: 'Extrahiere Text' },
{ key: 'extracted', label: 'Fertig' },
];
/**
 * Formats a byte count as a short human-readable size string.
 *
 * - below 1024 bytes: exact byte count, e.g. `"512 B"`
 * - below 1 MiB: whole kibibytes, e.g. `"12 KB"`
 * - otherwise: mebibytes with one decimal, e.g. `"3.4 MB"`
 *
 * @param bytes - Size in bytes.
 * @returns The formatted size with its unit suffix.
 */
function formatFileSize(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;
  if (bytes < 1024 * 1024) {
    const kilobytes = (bytes / 1024).toFixed(0);
    // Sizes just under 1 MiB round up to "1024"; let those fall through so
    // they are shown as "1.0 MB" instead of the odd-looking "1024 KB".
    if (kilobytes !== '1024') return `${kilobytes} KB`;
  }
  return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
}
/**
 * Maps a document status to its position in the progress indicator.
 *
 * @param status - Document status string.
 * @returns 0 for `uploaded`, 1 for `extracting`, 2 for `extracted`, and -1
 *          for every other value (including `failed`).
 */
function getStepIndex(status: string): number {
  switch (status) {
    case 'uploaded':
      return 0;
    case 'extracting':
      return 1;
    case 'extracted':
      return 2;
    default:
      // 'failed' and any unrecognised status have no step of their own.
      return -1;
  }
}
/**
 * Per-document ingestion panel: a step indicator built from STEP_LABELS, the
 * failure reason for failed documents, and an optional block of raw metadata.
 *
 * @param doc - Document whose `status` drives the step indicator.
 * @param debug - When true, the metadata block (ID, status, MIME type, size,
 *                upload time, error) is rendered as well.
 */
function IngestionProgress({ doc, debug }: { doc: DocumentItem; debug: boolean }) {
  const failed = doc.status === 'failed';
  // -1 when the status has no step of its own (e.g. 'failed').
  const currentStep = getStepIndex(doc.status);
  const lastStep = STEP_LABELS.length - 1;

  // Colour suffix for a step's dot: green = completed, pulsing yellow = in
  // progress, red = the extraction step of a failed document, grey = the rest.
  const dotTone = (position: number): string => {
    if (position < currentStep) return ' bg-green-500';
    if (position === currentStep && !failed) return ' bg-yellow-500 animate-pulse';
    if (failed && position === 1) return ' bg-red-500';
    return ' bg-gray-300';
  };

  const stepNodes = STEP_LABELS.map(({ key, label }, position) => {
    const done = position < currentStep;
    const dotClass = `w-2.5 h-2.5 rounded-full shrink-0 transition-colors${dotTone(position)}`;
    const lineClass = `flex-1 h-0.5 transition-colors${done ? ' bg-green-500' : ' bg-gray-200'}`;
    // Current and completed steps get the emphasised label style.
    const labelTone = position <= currentStep ? 'text-foreground font-medium' : 'text-muted';

    return (
      <div key={key} className="flex items-center gap-1 flex-1">
        <div className={dotClass} />
        <span className={`text-[10px] ${labelTone}`}>{label}</span>
        {/* Connector line after every step except the last one */}
        {position < lastStep && <div className={lineClass} />}
      </div>
    );
  });

  return (
    <div className="mt-2 space-y-2">
      {/* Step indicators */}
      <div className="flex items-center gap-1">{stepNodes}</div>

      {/* Error display */}
      {failed && doc.errorMessage && (
        <div className="bg-red-50 border border-red-200 rounded-lg p-2.5 text-xs text-red-700">
          <span className="font-medium">Fehler: </span>
          {doc.errorMessage}
        </div>
      )}

      {/* Debug info */}
      {debug && (
        <div className="bg-gray-50 border border-gray-200 rounded-lg p-2.5 text-[11px] font-mono text-gray-600 space-y-0.5">
          <div>ID: {doc.id}</div>
          <div>Status: {doc.status}</div>
          <div>MIME: {doc.mimeType}</div>
          <div>Groesse: {formatFileSize(doc.fileSizeBytes)}</div>
          <div>Hochgeladen: {new Date(doc.createdAt).toLocaleString('de-DE')}</div>
          {doc.errorMessage && <div className="text-red-600">Fehler: {doc.errorMessage}</div>}
        </div>
      )}
    </div>
  );
}
export default function DokumentUpload({
category,
caseId,
@@ -53,7 +125,9 @@ export default function DokumentUpload({
const [success, setSuccess] = useState('');
const [documents, setDocuments] = useState<DocumentItem[]>([]);
const [dragging, setDragging] = useState(false);
const [debug, setDebug] = useState(false);
const fileRef = useRef<HTMLInputElement>(null);
const pollRef = useRef<ReturnType<typeof setInterval> | null>(null);
const fetchDocuments = useCallback(async () => {
const params = new URLSearchParams({ category });
@@ -72,6 +146,27 @@ export default function DokumentUpload({
}
}, [category, caseId, decisionId, normInstrumentId]);
// Determine if any documents need polling (in-progress states)
const hasPending = documents.some(
(d) => d.status === 'uploaded' || d.status === 'extracting',
);
// Poll for status updates when documents are being processed
useEffect(() => {
if (hasPending) {
pollRef.current = setInterval(fetchDocuments, 2000);
} else if (pollRef.current) {
clearInterval(pollRef.current);
pollRef.current = null;
}
return () => {
if (pollRef.current) {
clearInterval(pollRef.current);
pollRef.current = null;
}
};
}, [hasPending, fetchDocuments]);
useEffect(() => {
fetchDocuments();
}, [fetchDocuments]);
@@ -99,7 +194,7 @@ export default function DokumentUpload({
throw new Error(data.error || 'Upload fehlgeschlagen');
}
setSuccess(`"${file.name}" erfolgreich hochgeladen.`);
setSuccess(`"${file.name}" erfolgreich hochgeladen. Textextraktion laeuft...`);
if (fileRef.current) fileRef.current.value = '';
fetchDocuments();
} catch (err) {
@@ -171,31 +266,51 @@ export default function DokumentUpload({
{documents.length > 0 && (
<div className="bg-card-bg border border-card-border rounded-xl p-5">
<h3 className="text-sm font-semibold text-foreground mb-3">
Dokumente ({documents.length})
</h3>
<div className="space-y-2">
<div className="flex items-center justify-between mb-3">
<h3 className="text-sm font-semibold text-foreground">
Dokumente ({documents.length})
</h3>
<button
type="button"
onClick={() => setDebug((prev) => !prev)}
className={`text-[11px] px-2 py-0.5 rounded-full border transition-colors ${
debug
? 'border-primary bg-primary/10 text-primary font-medium'
: 'border-card-border text-muted hover:text-foreground'
}`}
>
Debug {debug ? 'an' : 'aus'}
</button>
</div>
<div className="space-y-3">
{documents.map((doc) => (
<div
key={doc.id}
className="flex items-center justify-between p-3 rounded-lg border border-card-border"
className="p-3 rounded-lg border border-card-border"
>
<div className="min-w-0 flex-1">
<p className="text-sm font-medium text-foreground truncate">
{doc.filename}
</p>
<p className="text-xs text-muted">
{formatFileSize(doc.fileSizeBytes)} &middot;{' '}
{new Date(doc.createdAt).toLocaleDateString('de-DE')}
</p>
<div className="flex items-center justify-between">
<div className="min-w-0 flex-1">
<p className="text-sm font-medium text-foreground truncate">
{doc.filename}
</p>
<p className="text-xs text-muted">
{formatFileSize(doc.fileSizeBytes)} &middot;{' '}
{new Date(doc.createdAt).toLocaleDateString('de-DE')}
</p>
</div>
<span
className={`text-xs px-2 py-0.5 rounded-full font-medium shrink-0 ml-3 ${
STATUS_COLORS[doc.status] ?? 'bg-gray-500/10 text-gray-600'
}`}
>
{STATUS_LABELS[doc.status] ?? doc.status}
</span>
</div>
<span
className={`text-xs px-2 py-0.5 rounded-full font-medium shrink-0 ml-3 ${
STATUS_COLORS[doc.status] ?? 'bg-gray-500/10 text-gray-600'
}`}
>
{STATUS_LABELS[doc.status] ?? doc.status}
</span>
{/* Show progress for non-extracted documents or if debug is on */}
{(doc.status !== 'extracted' || debug) && (
<IngestionProgress doc={doc} debug={debug} />
)}
</div>
))}
</div>

View File

@@ -116,18 +116,50 @@ export async function extractDocumentText(tenantId: string, documentId: string):
try {
const fs = await import('node:fs/promises');
// Verify file exists before attempting extraction
try {
await fs.access(doc.storagePath);
} catch {
throw new Error(`Datei nicht gefunden: ${doc.storagePath}`);
}
const fileBuffer = await fs.readFile(doc.storagePath);
if (fileBuffer.length === 0) {
throw new Error('Datei ist leer (0 Bytes).');
}
let text: string;
if (doc.mimeType === 'application/pdf') {
const pdfParse = (await import('pdf-parse')).default;
const pdfData = await pdfParse(fileBuffer);
let pdfData;
try {
pdfData = await pdfParse(fileBuffer);
} catch (pdfErr) {
const pdfMessage = pdfErr instanceof Error ? pdfErr.message : String(pdfErr);
if (pdfMessage.includes('encrypted') || pdfMessage.includes('password')) {
throw new Error('PDF ist passwortgeschuetzt oder verschluesselt. Bitte ungeschuetzte Version hochladen.');
}
throw new Error(`PDF konnte nicht gelesen werden: ${pdfMessage}`);
}
text = pdfData.text;
// Detect scanned PDFs with no text layer
if (!text || text.trim().length === 0) {
throw new Error(
'PDF enthaelt keinen extrahierbaren Text. Moeglicherweise handelt es sich um ein gescanntes Dokument ohne Textebene (OCR erforderlich).',
);
}
} else {
const mammoth = await import('mammoth');
const result = await mammoth.extractRawText({ buffer: fileBuffer });
text = result.value;
if (!text || text.trim().length === 0) {
throw new Error('DOCX enthaelt keinen extrahierbaren Text.');
}
}
await withTenantDb(tenantId, async (tdb) => {
@@ -144,6 +176,7 @@ export async function extractDocumentText(tenantId: string, documentId: string):
return text;
} catch (err) {
const message = err instanceof Error ? err.message : 'Textextraktion fehlgeschlagen';
console.error(`[extractDocumentText] Document ${documentId} failed:`, message);
await withTenantDb(tenantId, async (tdb) => {
await tdb
.update(documents)
@@ -196,6 +229,7 @@ export async function listDocuments(
fileSizeBytes: documents.fileSizeBytes,
category: documents.category,
status: documents.status,
errorMessage: documents.errorMessage,
caseId: documents.caseId,
decisionId: documents.decisionId,
normInstrumentId: documents.normInstrumentId,