/** Default maximum length of one chunk, measured in `String.length` units. */
const CHUNK_CHAR_LIMIT = 10_000;

/**
 * Split a large law text into chunks at section-sign (§) boundaries.
 *
 * The text is cut in front of every run of section signs that is followed by a digit
 * ("§ 12", "§12", "§§ 3"). This includes inline cross-references, not only headings.
 * The resulting pieces are then packed greedily, in order, into chunks of at most
 * `limit` characters.
 *
 * Guarantees that follow from the implementation:
 * - Text that already fits into `limit` is returned unchanged as a single chunk.
 * - Anything before the first section sign stays attached to the first chunk.
 * - A single piece longer than `limit` is never cut; it becomes its own, oversized chunk.
 * - Concatenating the returned chunks reproduces `text` exactly.
 *
 * @param text - The full text to split.
 * @param limit - Maximum chunk length in characters. Defaults to `CHUNK_CHAR_LIMIT`.
 * @returns The chunks in document order; always at least one element.
 */
function splitTextIntoChunks(text: string, limit: number = CHUNK_CHAR_LIMIT): string[] {
  if (text.length <= limit) return [text];

  // Zero-width split that keeps the delimiter with the following section. The lookbehind
  // stops the split from landing between the two signs of a "§§ 3" reference, which would
  // otherwise leave a dangling "§" at the end of the previous piece.
  const sections = text.split(/(?=(?<!§)§+\s*\d)/);

  const chunks: string[] = [];
  let current = '';

  for (const section of sections) {
    const wouldOverflow = current.length + section.length > limit;
    // Only flush when there is something to flush: an oversized first piece must be
    // accepted as-is rather than producing an empty chunk.
    if (wouldOverflow && current.length > 0) {
      chunks.push(current);
      current = section;
    } else {
      current += section;
    }
  }
  if (current.length > 0) {
    chunks.push(current);
  }

  return chunks;
}
content: `Bitte zerlege den folgenden Gesetzestext in einzelne Paragraphen:\n\n${chunk}`, + }, + ], + maxOutputTokens: 16384, + }); - // Extract JSON array — handle possible markdown code fences - let jsonStr = responseText; - const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/); - if (fenceMatch) { - jsonStr = fenceMatch[1].trim(); - } + const responseText = result.text.trim(); - let provisions: Array<{ paragraph: string; title: string; body: string }>; - try { - provisions = JSON.parse(jsonStr); - } catch { - return Response.json( - { - error: 'AI returned invalid JSON. Please try again.', - rawResponse: responseText.substring(0, 500), - }, - { status: 502 }, - ); - } + // Extract JSON array — handle possible markdown code fences + let jsonStr = responseText; + const fenceMatch = responseText.match(/```(?:json)?\s*([\s\S]*?)```/); + if (fenceMatch) { + jsonStr = fenceMatch[1].trim(); + } - if (!Array.isArray(provisions)) { - return Response.json( - { error: 'AI did not return an array of provisions.' }, - { status: 502 }, - ); + let provisions: Array<{ paragraph: string; title: string; body: string }>; + try { + provisions = JSON.parse(jsonStr); + } catch { + return Response.json( + { + error: 'AI returned invalid JSON. Please try again.', + rawResponse: responseText.substring(0, 500), + failedChunk: chunks.indexOf(chunk) + 1, + totalChunks: chunks.length, + }, + { status: 502 }, + ); + } + + if (!Array.isArray(provisions)) { + return Response.json( + { error: 'AI did not return an array of provisions.' }, + { status: 502 }, + ); + } + + allProvisions.push(...provisions); } // Validate and clean provisions - const cleaned = provisions + const cleaned = allProvisions .filter( (p) => p &&