fix: chunk large law texts to prevent truncated AI JSON output

Texts longer than 10k chars are now split at § boundaries, parsed in separate AI calls, and the results merged. This prevents the maxOutputTokens truncation that caused "AI returned invalid JSON" errors on large imports (~50k+ chars).

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-09 16:20:35 +00:00
parent 1493b84787
commit 2509b907ae
1 changed file with 76 additions and 37 deletions
--- a/src/app/api/norms/parse/route.ts
+++ b/src/app/api/norms/parse/route.ts
@@ -45,6 +45,37 @@ async function extractTextFromPdf(buffer: Buffer): Promise<string> {
  return data.text;
 }

+const CHUNK_CHAR_LIMIT = 10_000; // soft per-chunk character budget used by splitTextIntoChunks
+
+/**
+ * Split a law text into ordered chunks, cutting only where "§", optional whitespace and a digit begin.
+ * Sections are packed greedily while a chunk stays within CHUNK_CHAR_LIMIT; text within the limit is returned as [text].
+ * A section longer than the limit is never cut, so it becomes its own chunk and may exceed the limit.
+ */
+function splitTextIntoChunks(text: string): string[] {
+  if (text.length <= CHUNK_CHAR_LIMIT) return [text]; // small input: no splitting needed
+
+  // Zero-width lookahead keeps each "§ <digit>" at the start of its section. NOTE(review): inline references such as "§ 5" match too.
+  const sections = text.split(/(?=§\s*\d)/);
+
+  const chunks: string[] = [];
+  let current = '';
+
+  for (const section of sections) {
+    if (current.length + section.length > CHUNK_CHAR_LIMIT && current.length > 0) {
+      chunks.push(current); // adding this section would exceed the limit: close the chunk
+      current = section; // ...and start the next chunk with this section
+    } else {
+      current += section; // still fits, or the chunk is empty (an oversized section is kept whole)
+    }
+  }
+  if (current.length > 0) { // flush the final, partially filled chunk
+    chunks.push(current);
+  }
+
+  return chunks;
+}
+
 export async function POST(request: Request) {
  const auth = await requirePermission('norms:write');
  if ('response' in auth) return auth.response;
@@ -106,50 +137,58 @@ export async function POST(request: Request) {
  try {
    const { model } = await getModelForTenant(ctx.tenantId);

-    const result = await generateText({
-      model,
-      system: PARSE_SYSTEM_PROMPT,
-      messages: [
-        {
-          role: 'user',
-          content: `Bitte zerlege den folgenden Gesetzestext in einzelne Paragraphen:\n\n${text}`,
-        },
-      ],
-      maxOutputTokens: 16384,
-    });
+    const chunks = splitTextIntoChunks(text);
+    const allProvisions: Array<{ paragraph: string; title: string; body: string }> = [];

-    // Parse the JSON response from the AI
-    const responseText = result.text.trim();
+    for (const chunk of chunks) {
+      const result = await generateText({
+        model,
+        system: PARSE_SYSTEM_PROMPT,
+        messages: [
+          {
+            role: 'user',
+            content: `Bitte zerlege den folgenden Gesetzestext in einzelne Paragraphen:\n\n${chunk}`,
+          },
+        ],
+        maxOutputTokens: 16384,
+      });

-    // Extract JSON array — handle possible markdown code fences
-    let jsonStr = responseText;
-    const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
-    if (fenceMatch) {
-      jsonStr = fenceMatch[1].trim();
-    }
+      const responseText = result.text.trim();

-    let provisions: Array<{ paragraph: string; title: string; body: string }>;
-    try {
-      provisions = JSON.parse(jsonStr);
-    } catch {
-      return Response.json(
-        {
-          error: 'AI returned invalid JSON. Please try again.',
-          rawResponse: responseText.substring(0, 500),
-        },
-        { status: 502 },
-      );
-    }
+      // Extract JSON array — handle possible markdown code fences
+      let jsonStr = responseText;
+      const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/);
+      if (fenceMatch) {
+        jsonStr = fenceMatch[1].trim();
+      }

-    if (!Array.isArray(provisions)) {
-      return Response.json(
-        { error: 'AI did not return an array of provisions.' },
-        { status: 502 },
-      );
+      let provisions: Array<{ paragraph: string; title: string; body: string }>;
+      try {
+        provisions = JSON.parse(jsonStr);
+      } catch {
+        return Response.json(
+          {
+            error: 'AI returned invalid JSON. Please try again.',
+            rawResponse: responseText.substring(0, 500),
+            failedChunk: chunks.indexOf(chunk) + 1,
+            totalChunks: chunks.length,
+          },
+          { status: 502 },
+        );
+      }
+
+      if (!Array.isArray(provisions)) {
+        return Response.json(
+          { error: 'AI did not return an array of provisions.' },
+          { status: 502 },
+        );
+      }
+
+      allProvisions.push(...provisions);
    }

    // Validate and clean provisions
-    const cleaned = provisions
+    const cleaned = allProvisions
      .filter(
        (p) =>
          p &&