Compare commits
10 Commits
feat/aiia-
...
b4ad27ad02
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b4ad27ad02 | ||
| e5d9d3cef3 | |||
|
|
79191c3810 | ||
|
|
17c1b6587a | ||
|
|
4e74e4b5c9 | ||
|
|
3e0efd10e9 | ||
|
|
1e431145dd | ||
|
|
5ff2347aac | ||
| a89bf8380d | |||
|
|
94b89cb1e2 |
@@ -9,16 +9,15 @@ jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Deploy via SSH
|
||||
uses: appleboy/ssh-action@v1
|
||||
with:
|
||||
host: ${{ secrets.VPS_HOST }}
|
||||
username: ${{ secrets.VPS_USER }}
|
||||
key: ${{ secrets.VPS_SSH_KEY }}
|
||||
port: ${{ secrets.VPS_PORT || 22 }}
|
||||
script: |
|
||||
cd ${{ secrets.VPS_PROJECT_PATH || '/opt/legalai' }}
|
||||
git pull origin master
|
||||
docker compose build app
|
||||
docker compose up -d app
|
||||
echo "Deployed commit: $(git rev-parse --short HEAD)"
|
||||
- name: Pull latest code
|
||||
run: |
|
||||
cd /home/remmer/StageAI
|
||||
git pull origin master
|
||||
|
||||
- name: Build and deploy
|
||||
run: |
|
||||
cd /home/remmer/StageAI
|
||||
export COMMIT_HASH=$(git rev-parse --short HEAD)
|
||||
docker compose build app
|
||||
docker compose up -d app
|
||||
echo "Deployed commit: $COMMIT_HASH"
|
||||
|
||||
@@ -7,6 +7,8 @@ RUN npm ci
|
||||
|
||||
FROM base AS builder
|
||||
WORKDIR /app
|
||||
ARG COMMIT_HASH=dev
|
||||
ENV NEXT_PUBLIC_BUILD_HASH=$COMMIT_HASH
|
||||
COPY --from=deps /app/node_modules ./node_modules
|
||||
COPY . .
|
||||
RUN npm run build
|
||||
|
||||
@@ -38,8 +38,9 @@ git reset --hard "origin/$BRANCH"
|
||||
echo "Updated to: $(git log --oneline -1)"
|
||||
|
||||
# Rebuild containers
|
||||
export COMMIT_HASH=$(git rev-parse --short HEAD)
|
||||
echo ""
|
||||
echo "Rebuilding containers..."
|
||||
echo "Rebuilding containers (commit: $COMMIT_HASH)..."
|
||||
docker compose -p "$PROJECT_NAME" build --no-cache app
|
||||
|
||||
if [[ "$BUILD_ONLY" == true ]]; then
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
services:
|
||||
app:
|
||||
build: .
|
||||
build:
|
||||
context: .
|
||||
args:
|
||||
COMMIT_HASH: ${COMMIT_HASH:-dev}
|
||||
ports:
|
||||
- "3002:3000"
|
||||
environment:
|
||||
|
||||
@@ -11,7 +11,7 @@ const commitHash = (() => {
|
||||
|
||||
const nextConfig: NextConfig = {
|
||||
output: "standalone",
|
||||
serverExternalPackages: ["pdf-parse", "drizzle-orm", "pg"],
|
||||
serverExternalPackages: ["pdfjs-dist", "drizzle-orm", "pg"],
|
||||
env: {
|
||||
NEXT_PUBLIC_BUILD_HASH: commitHash,
|
||||
},
|
||||
|
||||
23
package-lock.json
generated
23
package-lock.json
generated
@@ -17,7 +17,7 @@
|
||||
"mammoth": "^1.12.0",
|
||||
"next": "16.2.3",
|
||||
"next-auth": "^4.24.13",
|
||||
"pdf-parse": "^2.4.5",
|
||||
"pdfjs-dist": "^5.4.296",
|
||||
"pg": "^8.20.0",
|
||||
"react": "19.2.4",
|
||||
"react-dom": "19.2.4"
|
||||
@@ -2058,6 +2058,7 @@
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.80.tgz",
|
||||
"integrity": "sha512-DxuT1ClnIPts1kQx8FBmkk4BQDTfI5kIzywAaMjQSXfNnra5UFU9PwurXrl+Je3bJ6BGsp/zmshVVFbCmyI+ww==",
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"workspaces": [
|
||||
"e2e/*"
|
||||
],
|
||||
@@ -7148,26 +7149,6 @@
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/pdf-parse": {
|
||||
"version": "2.4.5",
|
||||
"resolved": "https://registry.npmjs.org/pdf-parse/-/pdf-parse-2.4.5.tgz",
|
||||
"integrity": "sha512-mHU89HGh7v+4u2ubfnevJ03lmPgQ5WU4CxAVmTSh/sxVTEDYd1er/dKS/A6vg77NX47KTEoihq8jZBLr8Cxuwg==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"@napi-rs/canvas": "0.1.80",
|
||||
"pdfjs-dist": "5.4.296"
|
||||
},
|
||||
"bin": {
|
||||
"pdf-parse": "bin/cli.mjs"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20.16.0 <21 || >=22.3.0"
|
||||
},
|
||||
"funding": {
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/mehmet-kozan"
|
||||
}
|
||||
},
|
||||
"node_modules/pdfjs-dist": {
|
||||
"version": "5.4.296",
|
||||
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.296.tgz",
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
"mammoth": "^1.12.0",
|
||||
"next": "16.2.3",
|
||||
"next-auth": "^4.24.13",
|
||||
"pdf-parse": "^2.4.5",
|
||||
"pdfjs-dist": "^5.4.296",
|
||||
"pg": "^8.20.0",
|
||||
"react": "19.2.4",
|
||||
"react-dom": "19.2.4"
|
||||
|
||||
@@ -22,6 +22,9 @@ export default async function DashboardLayout({
|
||||
<main className="flex-1 p-8 overflow-auto">
|
||||
{children}
|
||||
</main>
|
||||
<footer className="px-8 py-3 text-xs text-gray-400 border-t border-gray-200">
|
||||
Build {process.env.NEXT_PUBLIC_BUILD_HASH || 'dev'}
|
||||
</footer>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
|
||||
64
src/app/api/documents/[id]/route.ts
Normal file
64
src/app/api/documents/[id]/route.ts
Normal file
@@ -0,0 +1,64 @@
|
||||
// GET /api/documents/:id — get document status and metadata (used for polling)
|
||||
// DELETE /api/documents/:id — delete a document and its stored file
|
||||
|
||||
import { type NextRequest } from 'next/server';
|
||||
import { getDocument, deleteDocument } from '@/lib/documents';
|
||||
import { logAuditEvent } from '@/lib/auth/audit';
|
||||
import { requirePermission } from '@/lib/auth/rbac';
|
||||
|
||||
/**
 * GET /api/documents/:id — returns status and metadata of one document.
 *
 * Requires the `cases:read` permission; when the permission check yields a
 * response object, that response is returned unchanged. Replies with 404 when
 * `getDocument` returns nothing for the caller's tenant and the given id.
 *
 * @param request - Incoming request (not read by this handler).
 * @param params - Route params promise carrying the document `id`.
 * @returns JSON with the document's metadata fields, or an error response.
 */
export async function GET(
  request: NextRequest,
  { params }: { params: Promise<{ id: string }> },
) {
  const auth = await requirePermission('cases:read');
  if ('response' in auth) {
    return auth.response;
  }

  const documentId = (await params).id;
  const doc = await getDocument(auth.ctx.tenantId, documentId);

  if (!doc) {
    return Response.json({ error: 'Dokument nicht gefunden.' }, { status: 404 });
  }

  // Expose only the metadata fields listed here rather than the whole row.
  const {
    id,
    filename,
    mimeType,
    fileSizeBytes,
    category,
    status,
    errorMessage,
    createdAt,
    updatedAt,
  } = doc;

  return Response.json({
    id,
    filename,
    mimeType,
    fileSizeBytes,
    category,
    status,
    errorMessage,
    createdAt,
    updatedAt,
  });
}
|
||||
|
||||
/**
 * DELETE /api/documents/:id — deletes a document and records an audit event.
 *
 * Requires the `cases:edit` permission. Replies with 404 when `deleteDocument`
 * returns a falsy value; otherwise logs a `delete` audit event containing the
 * deleted document's filename and the caller's IP (if one can be derived from
 * the `x-forwarded-for` or `x-real-ip` headers).
 *
 * @param request - Incoming request; only its headers are read.
 * @param params - Route params promise carrying the document `id`.
 * @returns `{ deleted: true }` on success, or an error response.
 */
export async function DELETE(
  request: NextRequest,
  { params }: { params: Promise<{ id: string }> },
) {
  const auth = await requirePermission('cases:edit');
  if ('response' in auth) {
    return auth.response;
  }

  const documentId = (await params).id;
  const removed = await deleteDocument(auth.ctx.tenantId, documentId);

  if (!removed) {
    const notFound = { error: 'Dokument nicht gefunden.' };
    return Response.json(notFound, { status: 404 });
  }

  // First entry of x-forwarded-for wins; x-real-ip is only consulted as fallback.
  const firstForwardedHop = request.headers
    .get('x-forwarded-for')
    ?.split(',')[0]
    ?.trim();
  const clientIp = firstForwardedHop ?? request.headers.get('x-real-ip') ?? undefined;

  await logAuditEvent(
    auth.ctx,
    'delete',
    'document',
    documentId,
    { filename: removed.filename },
    clientIp,
  );

  return Response.json({ deleted: true });
}
|
||||
@@ -66,8 +66,8 @@ export async function POST(request: NextRequest) {
|
||||
);
|
||||
|
||||
// Trigger text extraction asynchronously (fire-and-forget)
|
||||
extractDocumentText(ctx.tenantId, result.documentId).catch(() => {
|
||||
// Extraction errors are stored in the document record
|
||||
extractDocumentText(ctx.tenantId, result.documentId).catch((err) => {
|
||||
console.error(`[documents] Text extraction failed for ${result.documentId}:`, err);
|
||||
});
|
||||
|
||||
return Response.json(result, { status: 201 });
|
||||
|
||||
@@ -39,10 +39,9 @@ Antworte NUR mit einem JSON-Array. Kein erklaerener Text, kein Markdown, nur das
|
||||
}
|
||||
]`;
|
||||
|
||||
async function extractTextFromPdf(buffer: Buffer): Promise<string> {
|
||||
const pdfParse = (await import('pdf-parse')).default;
|
||||
const data = await pdfParse(buffer);
|
||||
return data.text;
|
||||
/**
 * Extracts the text of a PDF held in memory.
 *
 * Thin wrapper that loads `@/lib/pdf` on demand and delegates to its
 * `extractTextFromPdf`.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The text produced by `extractTextFromPdf`.
 */
async function extractTextFromPdfBuffer(buffer: Buffer): Promise<string> {
  const pdfModule = await import('@/lib/pdf');
  return pdfModule.extractTextFromPdf(buffer);
}
|
||||
|
||||
const CHUNK_CHAR_LIMIT = 10_000;
|
||||
@@ -98,7 +97,7 @@ export async function POST(request: Request) {
|
||||
|
||||
if (file.type === 'application/pdf' || file.name.endsWith('.pdf')) {
|
||||
try {
|
||||
text = await extractTextFromPdf(buffer);
|
||||
text = await extractTextFromPdfBuffer(buffer);
|
||||
} catch (err) {
|
||||
console.error('PDF parse error:', err);
|
||||
return Response.json(
|
||||
|
||||
@@ -20,12 +20,13 @@ interface DocumentItem {
|
||||
mimeType: string;
|
||||
fileSizeBytes: number;
|
||||
status: string;
|
||||
errorMessage: string | null;
|
||||
createdAt: string;
|
||||
}
|
||||
|
||||
const STATUS_LABELS: Record<string, string> = {
|
||||
uploaded: 'Hochgeladen',
|
||||
extracting: 'Text wird extrahiert...',
|
||||
extracting: 'Extrahiere Text...',
|
||||
extracted: 'Extrahiert',
|
||||
failed: 'Fehlgeschlagen',
|
||||
};
|
||||
@@ -37,12 +38,83 @@ const STATUS_COLORS: Record<string, string> = {
|
||||
failed: 'bg-red-500/10 text-red-700',
|
||||
};
|
||||
|
||||
/**
 * Ordered ingestion steps shown by the progress indicator.
 * `key` is the document status the step corresponds to; `label` is the
 * user-facing text. The array position is the step's index.
 */
const STEP_LABELS = [
  {
    key: 'uploaded',
    label: 'Hochgeladen',
  },
  {
    key: 'extracting',
    label: 'Extrahiere Text',
  },
  {
    key: 'extracted',
    label: 'Fertig',
  },
];
|
||||
|
||||
/**
 * Formats a byte count as a short human-readable size.
 *
 * - below 1024 bytes: exact byte count, e.g. `512 B`
 * - below 1024 KB (after rounding): whole kilobytes, e.g. `12 KB`
 * - otherwise: megabytes with one decimal, e.g. `3.4 MB`
 *
 * @param bytes - Size in bytes.
 * @returns The formatted size string.
 */
function formatFileSize(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;

  // Round first and compare afterwards: values just under 1 MiB would
  // otherwise round up and be displayed as "1024 KB".
  const roundedKb = Math.round(bytes / 1024);
  if (roundedKb < 1024) return `${roundedKb} KB`;

  return `${(bytes / 1024 / 1024).toFixed(1)} MB`;
}
|
||||
|
||||
/**
 * Maps a document status to the index of its ingestion step.
 *
 * @param status - Document status string.
 * @returns 0 for `uploaded`, 1 for `extracting`, 2 for `extracted`, and -1
 *   for anything else (e.g. `failed`).
 */
function getStepIndex(status: string): number {
  switch (status) {
    case 'uploaded':
      return 0;
    case 'extracting':
      return 1;
    case 'extracted':
      return 2;
    default:
      // failed (or any unrecognised status)
      return -1;
  }
}
|
||||
|
||||
/**
 * Renders the ingestion progress of a single document: a row of step dots
 * with labels, the error message for failed documents, and an optional
 * block of raw debug details.
 *
 * @param doc - The document whose status is visualised.
 * @param debug - When true, the debug details block is shown.
 */
function IngestionProgress({ doc, debug }: { doc: DocumentItem; debug: boolean }) {
  const currentStep = getStepIndex(doc.status);
  const failed = doc.status === 'failed';

  // Colour suffix for the dot of the step at `index`.
  const dotColor = (index: number): string => {
    if (index < currentStep) return ' bg-green-500';
    if (index === currentStep && !failed) return ' bg-yellow-500 animate-pulse';
    // A failed document marks the second step ("extracting") in red.
    if (failed && index === 1) return ' bg-red-500';
    return ' bg-gray-300';
  };

  // Colour suffix for the connector line that follows the step at `index`.
  const lineColor = (index: number): string =>
    index < currentStep ? ' bg-green-500' : ' bg-gray-200';

  const lastStep = STEP_LABELS.length - 1;

  return (
    <div className="mt-2 space-y-2">
      {/* Step indicators */}
      <div className="flex items-center gap-1">
        {STEP_LABELS.map((step, index) => {
          const dotClass = 'w-2.5 h-2.5 rounded-full shrink-0 transition-colors' + dotColor(index);
          const lineClass = 'flex-1 h-0.5 transition-colors' + lineColor(index);
          // Current and already completed steps get the emphasised label style.
          const labelTone = index <= currentStep ? 'text-foreground font-medium' : 'text-muted';

          return (
            <div key={step.key} className="flex items-center gap-1 flex-1">
              <div className={dotClass} />
              <span className={`text-[10px] ${labelTone}`}>
                {step.label}
              </span>
              {index < lastStep && <div className={lineClass} />}
            </div>
          );
        })}
      </div>

      {/* Error display */}
      {failed && doc.errorMessage && (
        <div className="bg-red-50 border border-red-200 rounded-lg p-2.5 text-xs text-red-700">
          <span className="font-medium">Fehler: </span>
          {doc.errorMessage}
        </div>
      )}

      {/* Debug info */}
      {debug && (
        <div className="bg-gray-50 border border-gray-200 rounded-lg p-2.5 text-[11px] font-mono text-gray-600 space-y-0.5">
          <div>ID: {doc.id}</div>
          <div>Status: {doc.status}</div>
          <div>MIME: {doc.mimeType}</div>
          <div>Groesse: {formatFileSize(doc.fileSizeBytes)}</div>
          <div>Hochgeladen: {new Date(doc.createdAt).toLocaleString('de-DE')}</div>
          {doc.errorMessage && <div className="text-red-600">Fehler: {doc.errorMessage}</div>}
        </div>
      )}
    </div>
  );
}
|
||||
|
||||
export default function DokumentUpload({
|
||||
category,
|
||||
sourceScope,
|
||||
@@ -52,11 +124,14 @@ export default function DokumentUpload({
|
||||
label = 'Dokument hochladen',
|
||||
}: DokumentUploadProps) {
|
||||
const [uploading, setUploading] = useState(false);
|
||||
const [deleting, setDeleting] = useState<string | null>(null);
|
||||
const [error, setError] = useState('');
|
||||
const [success, setSuccess] = useState('');
|
||||
const [documents, setDocuments] = useState<DocumentItem[]>([]);
|
||||
const [dragging, setDragging] = useState(false);
|
||||
const [debug, setDebug] = useState(false);
|
||||
const fileRef = useRef<HTMLInputElement>(null);
|
||||
const pollRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
||||
|
||||
const fetchDocuments = useCallback(async () => {
|
||||
const params = new URLSearchParams({ category });
|
||||
@@ -75,6 +150,27 @@ export default function DokumentUpload({
|
||||
}
|
||||
}, [category, caseId, decisionId, normInstrumentId]);
|
||||
|
||||
// Determine if any documents need polling (in-progress states)
|
||||
const hasPending = documents.some(
|
||||
(d) => d.status === 'uploaded' || d.status === 'extracting',
|
||||
);
|
||||
|
||||
// Poll for status updates when documents are being processed
|
||||
useEffect(() => {
|
||||
if (hasPending) {
|
||||
pollRef.current = setInterval(fetchDocuments, 2000);
|
||||
} else if (pollRef.current) {
|
||||
clearInterval(pollRef.current);
|
||||
pollRef.current = null;
|
||||
}
|
||||
return () => {
|
||||
if (pollRef.current) {
|
||||
clearInterval(pollRef.current);
|
||||
pollRef.current = null;
|
||||
}
|
||||
};
|
||||
}, [hasPending, fetchDocuments]);
|
||||
|
||||
useEffect(() => {
|
||||
fetchDocuments();
|
||||
}, [fetchDocuments]);
|
||||
@@ -103,7 +199,7 @@ export default function DokumentUpload({
|
||||
throw new Error(data.error || 'Upload fehlgeschlagen');
|
||||
}
|
||||
|
||||
setSuccess(`"${file.name}" erfolgreich hochgeladen.`);
|
||||
setSuccess(`"${file.name}" erfolgreich hochgeladen. Textextraktion laeuft...`);
|
||||
if (fileRef.current) fileRef.current.value = '';
|
||||
fetchDocuments();
|
||||
} catch (err) {
|
||||
@@ -113,6 +209,28 @@ export default function DokumentUpload({
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Deletes a document after the user confirms, then refreshes the list.
 *
 * Clears previous status messages, marks the document as "deleting" while the
 * request is in flight, and reports the outcome via the success/error state.
 *
 * @param docId - ID of the document to delete.
 * @param filename - Filename shown in the confirmation and status messages.
 */
async function handleDelete(docId: string, filename: string) {
  if (!confirm(`"${filename}" wirklich loeschen?`)) return;

  setError('');
  setSuccess('');
  setDeleting(docId);

  try {
    // Encode the id so it can never alter the request path.
    const res = await fetch(`/api/documents/${encodeURIComponent(docId)}`, {
      method: 'DELETE',
    });

    if (!res.ok) {
      // Error responses are not guaranteed to carry a JSON body (e.g. an HTML
      // error page from a proxy); fall back to the generic message instead of
      // surfacing a JSON parser error to the user.
      const data = (await res.json().catch(() => null)) as { error?: string } | null;
      throw new Error(data?.error || 'Loeschen fehlgeschlagen');
    }

    setSuccess(`"${filename}" wurde geloescht.`);
    fetchDocuments();
  } catch (err) {
    setError(err instanceof Error ? err.message : 'Ein Fehler ist aufgetreten');
  } finally {
    setDeleting(null);
  }
}
|
||||
|
||||
function handleSubmit(e: React.FormEvent) {
|
||||
e.preventDefault();
|
||||
const file = fileRef.current?.files?.[0];
|
||||
@@ -175,31 +293,60 @@ export default function DokumentUpload({
|
||||
|
||||
{documents.length > 0 && (
|
||||
<div className="bg-card-bg border border-card-border rounded-xl p-5">
|
||||
<h3 className="text-sm font-semibold text-foreground mb-3">
|
||||
Dokumente ({documents.length})
|
||||
</h3>
|
||||
<div className="space-y-2">
|
||||
<div className="flex items-center justify-between mb-3">
|
||||
<h3 className="text-sm font-semibold text-foreground">
|
||||
Dokumente ({documents.length})
|
||||
</h3>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setDebug((prev) => !prev)}
|
||||
className={`text-[11px] px-2 py-0.5 rounded-full border transition-colors ${
|
||||
debug
|
||||
? 'border-primary bg-primary/10 text-primary font-medium'
|
||||
: 'border-card-border text-muted hover:text-foreground'
|
||||
}`}
|
||||
>
|
||||
Debug {debug ? 'an' : 'aus'}
|
||||
</button>
|
||||
</div>
|
||||
<div className="space-y-3">
|
||||
{documents.map((doc) => (
|
||||
<div
|
||||
key={doc.id}
|
||||
className="flex items-center justify-between p-3 rounded-lg border border-card-border"
|
||||
className="p-3 rounded-lg border border-card-border"
|
||||
>
|
||||
<div className="min-w-0 flex-1">
|
||||
<p className="text-sm font-medium text-foreground truncate">
|
||||
{doc.filename}
|
||||
</p>
|
||||
<p className="text-xs text-muted">
|
||||
{formatFileSize(doc.fileSizeBytes)} ·{' '}
|
||||
{new Date(doc.createdAt).toLocaleDateString('de-DE')}
|
||||
</p>
|
||||
<div className="flex items-center justify-between">
|
||||
<div className="min-w-0 flex-1">
|
||||
<p className="text-sm font-medium text-foreground truncate">
|
||||
{doc.filename}
|
||||
</p>
|
||||
<p className="text-xs text-muted">
|
||||
{formatFileSize(doc.fileSizeBytes)} ·{' '}
|
||||
{new Date(doc.createdAt).toLocaleDateString('de-DE')}
|
||||
</p>
|
||||
</div>
|
||||
<span
|
||||
className={`text-xs px-2 py-0.5 rounded-full font-medium shrink-0 ml-3 ${
|
||||
STATUS_COLORS[doc.status] ?? 'bg-gray-500/10 text-gray-600'
|
||||
}`}
|
||||
>
|
||||
{STATUS_LABELS[doc.status] ?? doc.status}
|
||||
</span>
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => handleDelete(doc.id, doc.filename)}
|
||||
disabled={deleting === doc.id}
|
||||
className="ml-2 text-xs text-danger hover:text-danger/80 transition-colors disabled:opacity-50 shrink-0"
|
||||
title="Dokument loeschen"
|
||||
>
|
||||
{deleting === doc.id ? '...' : 'Loeschen'}
|
||||
</button>
|
||||
</div>
|
||||
<span
|
||||
className={`text-xs px-2 py-0.5 rounded-full font-medium shrink-0 ml-3 ${
|
||||
STATUS_COLORS[doc.status] ?? 'bg-gray-500/10 text-gray-600'
|
||||
}`}
|
||||
>
|
||||
{STATUS_LABELS[doc.status] ?? doc.status}
|
||||
</span>
|
||||
|
||||
{/* Show progress for non-extracted documents or if debug is on */}
|
||||
{(doc.status !== 'extracted' || debug) && (
|
||||
<IngestionProgress doc={doc} debug={debug} />
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
|
||||
@@ -137,10 +137,8 @@ export async function extractDocumentText(tenantId: string, documentId: string):
|
||||
let text: string;
|
||||
|
||||
if (doc.mimeType === 'application/pdf') {
|
||||
// Dynamic import for pdf-parse (optional dependency)
|
||||
const pdfParse = (await import('pdf-parse')).default;
|
||||
const pdfData = await pdfParse(fileBuffer);
|
||||
text = pdfData.text;
|
||||
const { extractTextFromPdf } = await import('@/lib/pdf');
|
||||
text = await extractTextFromPdf(fileBuffer);
|
||||
} else {
|
||||
// DOCX — use mammoth for extraction
|
||||
const mammoth = await import('mammoth');
|
||||
|
||||
@@ -121,18 +121,48 @@ export async function extractDocumentText(tenantId: string, documentId: string):
|
||||
|
||||
try {
|
||||
const fs = await import('node:fs/promises');
|
||||
|
||||
// Verify file exists before attempting extraction
|
||||
try {
|
||||
await fs.access(doc.storagePath);
|
||||
} catch {
|
||||
throw new Error(`Datei nicht gefunden: ${doc.storagePath}`);
|
||||
}
|
||||
|
||||
const fileBuffer = await fs.readFile(doc.storagePath);
|
||||
|
||||
if (fileBuffer.length === 0) {
|
||||
throw new Error('Datei ist leer (0 Bytes).');
|
||||
}
|
||||
|
||||
let text: string;
|
||||
|
||||
if (doc.mimeType === 'application/pdf') {
|
||||
const pdfParse = (await import('pdf-parse')).default;
|
||||
const pdfData = await pdfParse(fileBuffer);
|
||||
text = pdfData.text;
|
||||
const { extractTextFromPdf } = await import('@/lib/pdf');
|
||||
try {
|
||||
text = await extractTextFromPdf(fileBuffer);
|
||||
} catch (pdfErr) {
|
||||
const pdfMessage = pdfErr instanceof Error ? pdfErr.message : String(pdfErr);
|
||||
if (pdfMessage.includes('encrypted') || pdfMessage.includes('password')) {
|
||||
throw new Error('PDF ist passwortgeschuetzt oder verschluesselt. Bitte ungeschuetzte Version hochladen.');
|
||||
}
|
||||
throw new Error(`PDF konnte nicht gelesen werden: ${pdfMessage}`);
|
||||
}
|
||||
|
||||
// Detect scanned PDFs with no text layer
|
||||
if (!text || text.trim().length === 0) {
|
||||
throw new Error(
|
||||
'PDF enthaelt keinen extrahierbaren Text. Moeglicherweise handelt es sich um ein gescanntes Dokument ohne Textebene (OCR erforderlich).',
|
||||
);
|
||||
}
|
||||
} else {
|
||||
const mammoth = await import('mammoth');
|
||||
const result = await mammoth.extractRawText({ buffer: fileBuffer });
|
||||
text = result.value;
|
||||
|
||||
if (!text || text.trim().length === 0) {
|
||||
throw new Error('DOCX enthaelt keinen extrahierbaren Text.');
|
||||
}
|
||||
}
|
||||
|
||||
await withTenantDb(tenantId, async (tdb) => {
|
||||
@@ -149,6 +179,7 @@ export async function extractDocumentText(tenantId: string, documentId: string):
|
||||
return text;
|
||||
} catch (err) {
|
||||
const message = err instanceof Error ? err.message : 'Textextraktion fehlgeschlagen';
|
||||
console.error(`[extractDocumentText] Document ${documentId} failed:`, message);
|
||||
await withTenantDb(tenantId, async (tdb) => {
|
||||
await tdb
|
||||
.update(documents)
|
||||
@@ -206,6 +237,7 @@ export async function listDocuments(
|
||||
category: documents.category,
|
||||
sourceScope: documents.sourceScope,
|
||||
status: documents.status,
|
||||
errorMessage: documents.errorMessage,
|
||||
caseId: documents.caseId,
|
||||
decisionId: documents.decisionId,
|
||||
normInstrumentId: documents.normInstrumentId,
|
||||
@@ -232,3 +264,26 @@ export async function getDocument(tenantId: string, documentId: string) {
|
||||
return doc ?? null;
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Delete a document by ID. Removes the DB record and then the stored file
 * from disk.
 *
 * File removal is best-effort: a file that is already gone is ignored, any
 * other failure is logged but does not fail the call, because the DB record
 * has already been deleted at that point.
 *
 * NOTE(review): the query filters on the document id only; tenant isolation
 * presumably comes from `withTenantDb` — confirm.
 *
 * @param tenantId - Tenant whose database context is used.
 * @param documentId - ID of the document to delete.
 * @returns The deleted document row, or null if not found.
 */
export async function deleteDocument(tenantId: string, documentId: string) {
  const deleted = await withTenantDb(tenantId, async (tdb) => {
    const [row] = await tdb
      .delete(documents)
      .where(eq(documents.id, documentId))
      .returning();
    return row ?? null;
  });

  if (deleted?.storagePath) {
    const fs = await import('node:fs/promises');
    try {
      await fs.unlink(deleted.storagePath);
    } catch (err) {
      const code =
        typeof err === 'object' && err !== null && 'code' in err
          ? (err as { code?: unknown }).code
          : undefined;
      // ENOENT: file was already removed — nothing left to clean up.
      // Anything else (permissions, I/O) leaves an orphaned file; make it visible.
      if (code !== 'ENOENT') {
        console.error(
          `[deleteDocument] Could not remove stored file for document ${documentId}:`,
          err,
        );
      }
    }
  }

  return deleted;
}
|
||||
|
||||
40
src/lib/pdf.ts
Normal file
40
src/lib/pdf.ts
Normal file
@@ -0,0 +1,40 @@
|
||||
// PDF text extraction using pdfjs-dist legacy build (Node.js compatible, no canvas/DOMMatrix)
|
||||
|
||||
// Force Next.js file tracer to include the worker file in standalone builds
|
||||
import 'pdfjs-dist/legacy/build/pdf.worker.mjs';
|
||||
|
||||
/**
 * Extract all text from a PDF buffer.
 * Uses the pdfjs-dist legacy build, loaded on demand.
 *
 * Text items of each page are joined with a single space; pages are joined
 * with a newline. The loaded document is always destroyed, even when reading
 * a page fails.
 *
 * @param buffer - Raw PDF bytes.
 * @returns The concatenated text of all pages.
 * @throws Whatever pdfjs-dist rejects with while loading or reading the PDF.
 */
export async function extractTextFromPdf(buffer: Buffer): Promise<string> {
  const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');

  // Resolve the worker path at runtime so pdfjs can find it in standalone builds
  const { createRequire } = await import('module');
  const require = createRequire(import.meta.url ?? __filename);
  pdfjsLib.GlobalWorkerOptions.workerSrc = require.resolve(
    'pdfjs-dist/legacy/build/pdf.worker.mjs',
  );

  const data = new Uint8Array(buffer);
  const doc = await pdfjsLib.getDocument({
    data,
    useSystemFonts: true,
    isEvalSupported: false,
  }).promise;

  try {
    const pages: string[] = [];
    for (let i = 1; i <= doc.numPages; i++) {
      const page = await doc.getPage(i);
      const content = await page.getTextContent();
      const pageText = content.items
        // Only items that carry a `str` property contribute text.
        .filter((item) => 'str' in item)
        .map((item) => (item as { str: string }).str)
        .join(' ');
      pages.push(pageText);
    }

    return pages.join('\n');
  } finally {
    // Release the document on the error path as well; previously a failing
    // page skipped this call and leaked the loaded document.
    doc.destroy();
  }
}
|
||||
51
src/types/pdf-parse.d.ts
vendored
51
src/types/pdf-parse.d.ts
vendored
@@ -1,14 +1,45 @@
|
||||
declare module 'pdf-parse' {
|
||||
interface PdfData {
|
||||
numpages: number;
|
||||
numrender: number;
|
||||
info: Record<string, unknown>;
|
||||
metadata: Record<string, unknown>;
|
||||
text: string;
|
||||
version: string;
|
||||
// Worker module — imported for side-effect (file tracer) only
|
||||
declare module 'pdfjs-dist/legacy/build/pdf.worker.mjs' {}
|
||||
|
||||
// pdfjs-dist legacy build type shim for dynamic import
|
||||
declare module 'pdfjs-dist/legacy/build/pdf.mjs' {
|
||||
export const GlobalWorkerOptions: {
|
||||
workerSrc: string;
|
||||
};
|
||||
|
||||
export function getDocument(params: {
|
||||
data: Uint8Array;
|
||||
useSystemFonts?: boolean;
|
||||
isEvalSupported?: boolean;
|
||||
disableAutoFetch?: boolean;
|
||||
}): { promise: Promise<PDFDocumentProxy> };
|
||||
|
||||
interface PDFDocumentProxy {
|
||||
numPages: number;
|
||||
getPage(pageNumber: number): Promise<PDFPageProxy>;
|
||||
destroy(): void;
|
||||
}
|
||||
|
||||
function pdfParse(dataBuffer: Buffer, options?: Record<string, unknown>): Promise<PdfData>;
|
||||
interface PDFPageProxy {
|
||||
getTextContent(): Promise<TextContent>;
|
||||
}
|
||||
|
||||
export default pdfParse;
|
||||
interface TextContent {
|
||||
items: Array<TextItem | TextMarkedContent>;
|
||||
}
|
||||
|
||||
interface TextItem {
|
||||
str: string;
|
||||
dir: string;
|
||||
width: number;
|
||||
height: number;
|
||||
transform: number[];
|
||||
fontName: string;
|
||||
hasEOL: boolean;
|
||||
}
|
||||
|
||||
interface TextMarkedContent {
|
||||
type: string;
|
||||
id: string;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user