fix: chunk large law texts to prevent truncated AI JSON output
Texts >10k chars are now split at § boundaries and parsed in separate AI calls, then merged. This prevents maxOutputTokens truncation that caused "AI returned invalid JSON" on large imports (~50k+ chars). Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
@@ -45,6 +45,37 @@ async function extractTextFromPdf(buffer: Buffer): Promise<string> {
|
||||
return data.text;
|
||||
}
|
||||
|
||||
const CHUNK_CHAR_LIMIT = 10_000;

/**
 * Split a large law text into chunks at paragraph (§) boundaries.
 *
 * Boundaries are chosen in two passes:
 * 1. The text is cut before every § heading that starts a line
 *    (e.g. "§ 12 ..." or a range heading such as "§§ 3 bis 5 ...").
 *    Inline cross-references like "nach § 3 Abs. 1" are not boundaries here,
 *    so a chunk does not end in the middle of a sentence.
 * 2. Only a section that is still longer than `limit` is cut further, before
 *    any § reference. This is a heuristic fallback for texts whose headings
 *    do not start on their own line.
 *
 * Sections are then packed greedily: each chunk stays at or under `limit`
 * characters when possible. A single section that cannot be cut any further
 * and exceeds the limit becomes its own (oversized) chunk. Concatenating the
 * returned chunks always reproduces `text` exactly.
 *
 * @param text  The full law text.
 * @param limit Maximum chunk length in characters; defaults to CHUNK_CHAR_LIMIT.
 * @returns The chunks in document order; `[text]` when the text fits the limit.
 * @throws RangeError if `limit` is NaN or smaller than 1.
 */
function splitTextIntoChunks(text: string, limit = CHUNK_CHAR_LIMIT): string[] {
  // `!(limit >= 1)` also rejects NaN, which would otherwise disable chunking silently.
  if (!(limit >= 1)) {
    throw new RangeError(`limit must be a number >= 1, got ${limit}`);
  }
  if (text.length <= limit) return [text];

  // Preferred boundary: a § heading at the start of a line (optional indentation,
  // one or two § signs). The lookahead keeps the heading with the section it opens.
  const headingBoundary = /(?=^[ \t]*§{1,2}\s*\d)/m;
  // Fallback boundary: any § reference. The lookbehind prevents cutting between
  // the two signs of a "§§" range.
  const anySectionBoundary = /(?<!§)(?=§{1,2}\s*\d)/;

  const sections = text
    .split(headingBoundary)
    .flatMap((section) =>
      section.length > limit ? section.split(anySectionBoundary) : [section],
    );

  const chunks: string[] = [];
  let current = '';

  for (const section of sections) {
    const overflows = current.length + section.length > limit;
    // Never flush a whitespace-only buffer: it would become a chunk with
    // nothing to parse. It is carried into the next section instead.
    if (overflows && current.trim().length > 0) {
      chunks.push(current);
      current = section;
    } else {
      current += section;
    }
  }

  if (current.length > 0) {
    chunks.push(current);
  }

  return chunks;
}
|
||||
|
||||
export async function POST(request: Request) {
|
||||
const auth = await requirePermission('norms:write');
|
||||
if ('response' in auth) return auth.response;
|
||||
@@ -106,50 +137,58 @@ export async function POST(request: Request) {
|
||||
try {
|
||||
const { model } = await getModelForTenant(ctx.tenantId);
|
||||
|
||||
const result = await generateText({
|
||||
model,
|
||||
system: PARSE_SYSTEM_PROMPT,
|
||||
messages: [
|
||||
{
|
||||
role: 'user',
|
||||
content: `Bitte zerlege den folgenden Gesetzestext in einzelne Paragraphen:\n\n${text}`,
|
||||
},
|
||||
],
|
||||
maxOutputTokens: 16384,
|
||||
});
|
||||
const chunks = splitTextIntoChunks(text);
|
||||
const allProvisions: Array<{ paragraph: string; title: string; body: string }> = [];
|
||||
|
||||
// Parse the JSON response from the AI
|
||||
const responseText = result.text.trim();
|
||||
for (const chunk of chunks) {
|
||||
const result = await generateText({
|
||||
model,
|
||||
system: PARSE_SYSTEM_PROMPT,
|
||||
messages: [
|
||||
{
|
||||
role: 'user',
|
||||
content: `Bitte zerlege den folgenden Gesetzestext in einzelne Paragraphen:\n\n${chunk}`,
|
||||
},
|
||||
],
|
||||
maxOutputTokens: 16384,
|
||||
});
|
||||
|
||||
// Extract JSON array — handle possible markdown code fences
|
||||
let jsonStr = responseText;
|
||||
const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
|
||||
if (fenceMatch) {
|
||||
jsonStr = fenceMatch[1].trim();
|
||||
}
|
||||
const responseText = result.text.trim();
|
||||
|
||||
let provisions: Array<{ paragraph: string; title: string; body: string }>;
|
||||
try {
|
||||
provisions = JSON.parse(jsonStr);
|
||||
} catch {
|
||||
return Response.json(
|
||||
{
|
||||
error: 'AI returned invalid JSON. Please try again.',
|
||||
rawResponse: responseText.substring(0, 500),
|
||||
},
|
||||
{ status: 502 },
|
||||
);
|
||||
}
|
||||
// Extract JSON array — handle possible markdown code fences
|
||||
let jsonStr = responseText;
|
||||
const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
|
||||
if (fenceMatch) {
|
||||
jsonStr = fenceMatch[1].trim();
|
||||
}
|
||||
|
||||
if (!Array.isArray(provisions)) {
|
||||
return Response.json(
|
||||
{ error: 'AI did not return an array of provisions.' },
|
||||
{ status: 502 },
|
||||
);
|
||||
let provisions: Array<{ paragraph: string; title: string; body: string }>;
|
||||
try {
|
||||
provisions = JSON.parse(jsonStr);
|
||||
} catch {
|
||||
return Response.json(
|
||||
{
|
||||
error: 'AI returned invalid JSON. Please try again.',
|
||||
rawResponse: responseText.substring(0, 500),
|
||||
failedChunk: chunks.indexOf(chunk) + 1,
|
||||
totalChunks: chunks.length,
|
||||
},
|
||||
{ status: 502 },
|
||||
);
|
||||
}
|
||||
|
||||
if (!Array.isArray(provisions)) {
|
||||
return Response.json(
|
||||
{ error: 'AI did not return an array of provisions.' },
|
||||
{ status: 502 },
|
||||
);
|
||||
}
|
||||
|
||||
allProvisions.push(...provisions);
|
||||
}
|
||||
|
||||
// Validate and clean provisions
|
||||
const cleaned = provisions
|
||||
const cleaned = allProvisions
|
||||
.filter(
|
||||
(p) =>
|
||||
p &&
|
||||
|
||||
Reference in New Issue
Block a user