fix: chunk large law texts to prevent truncated AI JSON output

Texts longer than 10k chars are now split at § boundaries, parsed in
separate AI calls, and the results merged. This prevents the
maxOutputTokens truncation that caused "AI returned invalid JSON" on
large imports (~50k+ chars).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
CTO (LegalAI)
2026-04-09 16:20:35 +00:00
parent 1493b84787
commit 2509b907ae

View File

@@ -45,6 +45,37 @@ async function extractTextFromPdf(buffer: Buffer): Promise<string> {
return data.text;
}
/** Target maximum number of characters per chunk. */
const CHUNK_CHAR_LIMIT = 10_000;

/**
 * Zero-width boundary in front of a "§ <digit>" marker that begins a line
 * (optionally indented by spaces or tabs). Markers in running text, such as
 * the cross-reference in "gemäß § 5 Abs. 2", do not match.
 */
const SECTION_HEADING_BOUNDARY = /(?=^[ \t]*§\s*\d)/m;

/** Zero-width boundary in front of any "§ <digit>" marker, wherever it occurs. */
const SECTION_MARKER_BOUNDARY = /(?=§\s*\d)/;

/**
 * Split a large law text into chunks at paragraph (§) boundaries.
 *
 * Texts of at most CHUNK_CHAR_LIMIT characters are returned as a single chunk.
 * Longer texts are first cut in front of every "§ <digit>" marker that starts
 * a line, so that a cross-reference inside a provision's body is not mistaken
 * for the start of a new provision. Any resulting section that is still longer
 * than the limit is cut again in front of every "§ <digit>" marker, which also
 * covers input whose headings do not start a line. Sections are then packed
 * greedily, in order, into chunks of at most CHUNK_CHAR_LIMIT characters.
 *
 * A section that cannot be cut any further and exceeds the limit becomes its
 * own, oversized chunk. No characters are added or dropped: joining the
 * returned chunks yields the original text.
 *
 * @param text The full law text.
 * @returns The chunks, in document order.
 */
function splitTextIntoChunks(text: string): string[] {
  if (text.length <= CHUNK_CHAR_LIMIT) return [text];

  // The lookahead keeps each "§" with the section that follows it.
  const sections = text
    .split(SECTION_HEADING_BOUNDARY)
    .flatMap((section) =>
      section.length > CHUNK_CHAR_LIMIT ? section.split(SECTION_MARKER_BOUNDARY) : [section],
    );

  const chunks: string[] = [];
  let current = '';
  for (const section of sections) {
    // Start a new chunk only when the current one is non-empty; an oversized
    // section therefore ends up alone in its chunk instead of being dropped.
    if (current.length > 0 && current.length + section.length > CHUNK_CHAR_LIMIT) {
      chunks.push(current);
      current = section;
    } else {
      current += section;
    }
  }
  if (current.length > 0) {
    chunks.push(current);
  }
  return chunks;
}
export async function POST(request: Request) {
const auth = await requirePermission('norms:write');
if ('response' in auth) return auth.response;
@@ -106,50 +137,58 @@ export async function POST(request: Request) {
try {
const { model } = await getModelForTenant(ctx.tenantId);
const result = await generateText({
model,
system: PARSE_SYSTEM_PROMPT,
messages: [
{
role: 'user',
content: `Bitte zerlege den folgenden Gesetzestext in einzelne Paragraphen:\n\n${text}`,
},
],
maxOutputTokens: 16384,
});
const chunks = splitTextIntoChunks(text);
const allProvisions: Array<{ paragraph: string; title: string; body: string }> = [];
// Parse the JSON response from the AI
const responseText = result.text.trim();
for (const chunk of chunks) {
const result = await generateText({
model,
system: PARSE_SYSTEM_PROMPT,
messages: [
{
role: 'user',
content: `Bitte zerlege den folgenden Gesetzestext in einzelne Paragraphen:\n\n${chunk}`,
},
],
maxOutputTokens: 16384,
});
// Extract JSON array — handle possible markdown code fences
let jsonStr = responseText;
const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
if (fenceMatch) {
jsonStr = fenceMatch[1].trim();
}
const responseText = result.text.trim();
let provisions: Array<{ paragraph: string; title: string; body: string }>;
try {
provisions = JSON.parse(jsonStr);
} catch {
return Response.json(
{
error: 'AI returned invalid JSON. Please try again.',
rawResponse: responseText.substring(0, 500),
},
{ status: 502 },
);
}
// Extract JSON array — handle possible markdown code fences
let jsonStr = responseText;
const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
if (fenceMatch) {
jsonStr = fenceMatch[1].trim();
}
if (!Array.isArray(provisions)) {
return Response.json(
{ error: 'AI did not return an array of provisions.' },
{ status: 502 },
);
let provisions: Array<{ paragraph: string; title: string; body: string }>;
try {
provisions = JSON.parse(jsonStr);
} catch {
return Response.json(
{
error: 'AI returned invalid JSON. Please try again.',
rawResponse: responseText.substring(0, 500),
failedChunk: chunks.indexOf(chunk) + 1,
totalChunks: chunks.length,
},
{ status: 502 },
);
}
if (!Array.isArray(provisions)) {
return Response.json(
{ error: 'AI did not return an array of provisions.' },
{ status: 502 },
);
}
allProvisions.push(...provisions);
}
// Validate and clean provisions
const cleaned = provisions
const cleaned = allProvisions
.filter(
(p) =>
p &&