Live SYSTEM_PROMPT on mDock had drifted heavily from the repo template
(detailed correspondent fuzzy-matching catalogue, full existing-names
list, refined title-generation rules). Reconciled by adopting the live
prompt as the new baseline in SYSTEM_PROMPT.txt and layering two fixes
on top:
1. Recipient rule (Rule 1): Matthias / Mathias Siebels and any address-
block variant ("Herr Siebels", "Empfaengeradresse Windscheidstr. 33")
must NEVER be set as correspondent — m is the recipient of nearly
every doc. Paul Siebels: also recipient by default, only correspondent
when nachweislich Autor (eigener Brief, Schadensmeldung von Paul).
Triggering misclassification (issue body): doc 280 (Vattenfall
Stromliefervertrag) was tagged correspondent="Matthias Siebels"
because the AI picked the recipient address block as sender.
2. Soften "Bevorzuge IMMER existierenden Correspondent" -> only when
semantic similarity is clear. Genuinely new senders (Versorger, Arzt,
Versicherer, Vermieter, ...) get a new correspondent rather than
being force-mapped to the nearest existing name. Fixes the
Vattenfall -> Telekom drift on docs 283/284 (also addressed by head
adding Vattenfall ID 257 manually).
Also migrated push_system_prompt.py from m/otto into this repo so the
deploy mechanism (render template -> push to /app/data/.env -> restart
paperless-ai) lives next to the template. Added RECIPIENT_EXCLUDE
filter so Matthias/Mathias Siebels are stripped from the rendered
correspondents list — defense in depth on top of the prompt rule.
Paperless correspondent records (IDs 3, 255) are preserved for the
historical doc assignments that still reference them.
Applied to live mDock paperless-ai (backup .env.bak.20260516T162255).
39 of 41 Siebels-correspondent doc assignments cleared + their
paperless-AI sqlite tracker rows (processed_documents,
history_documents, openai_metrics) deleted so they reclassify on the
next scan. Two kept (doc 117 Vollmacht from Paul, doc 130
Schadensmeldung filled by Paul — both genuine Paul-as-author cases per
the new rule).
Refs: m/mDMS#3
188 lines
6.2 KiB
Python
188 lines
6.2 KiB
Python
"""
|
|
Render SYSTEM_PROMPT.txt with the live correspondent list and push it to
|
|
the paperless-ai container's /app/data/.env on mDock.
|
|
|
|
The repo SYSTEM_PROMPT.txt is the template (with the placeholder
|
|
{{CORRESPONDENTS_LIST}}). This script:
|
|
|
|
1. Reads the current correspondents from the Paperless API.
|
|
2. Filters out names that must never appear as correspondent
|
|
(recipients of m's mail — see RECIPIENT_EXCLUDE).
|
|
3. Renders the prompt by substituting the placeholder.
|
|
4. Reads the live /app/data/.env from the paperless-ai container.
|
|
5. Replaces the SYSTEM_PROMPT=`…` block.
|
|
6. Backs up the old .env (.bak.<ts>) and writes the new one.
|
|
7. Restarts the paperless-ai container.
|
|
|
|
Dry-run is the default: prints the would-be rendered prompt without
|
|
writing.
|
|
|
|
Usage:
|
|
python3 push_system_prompt.py # dry run
|
|
python3 push_system_prompt.py --apply # write + restart
|
|
|
|
Migrated into m/mDMS from m/otto on 2026-05-16 (mDMS#3).
|
|
"""
|
|
import argparse
|
|
import datetime
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
|
|
|
|
# --- Remote targets ---------------------------------------------------------
# ssh destination and the docker container names used for every remote call.
PAPERLESS_HOST = "mdock"
PAPERLESS_AI_CONTAINER = "paperless-ai"
PAPERLESS_WEB_CONTAINER = "paperless-webserver-1"

# Location of the paperless-ai settings file inside its container.
ENV_PATH = "/app/data/.env"

# --- Local template ---------------------------------------------------------
# The template lives next to this script; PLACEHOLDER marks where the
# rendered correspondent list is substituted.
HERE = os.path.dirname(os.path.abspath(__file__))
TEMPLATE_PATH = os.path.join(HERE, "SYSTEM_PROMPT.txt")
PLACEHOLDER = "{{CORRESPONDENTS_LIST}}"

# --- Recipient filter -------------------------------------------------------
# m and his household are recipients of the mail, never its correspondents.
# Entries are lowercase and are matched as case-insensitive substrings of
# each correspondent name. The correspondent records themselves stay in
# Paperless (historical doc assignments still reference them); they are
# only hidden from the list of candidate senders shown to the LLM.
RECIPIENT_EXCLUDE = (
    "matthias siebels",
    "mathias siebels",
)
|
|
|
|
|
|
def get_token() -> str:
    """Read PAPERLESS_API_TOKEN from the paperless-ai container's .env.

    The value is extracted with ``grep``/``cut`` inside the container,
    reached via ``ssh`` + ``docker exec``.

    Returns:
        The token with surrounding whitespace stripped.

    Raises:
        RuntimeError: if the remote command fails or yields no token.
        subprocess.TimeoutExpired: if ssh does not finish within 15 s.
    """
    # Anchor on "KEY=" so longer keys sharing the prefix do not match, and
    # use -f2- so a value that itself contains "=" is not truncated.
    remote_cmd = (
        f"docker exec {PAPERLESS_AI_CONTAINER} sh -c "
        f"'grep ^PAPERLESS_API_TOKEN= {ENV_PATH} | cut -d= -f2-'"
    )
    out = subprocess.run(
        ["ssh", PAPERLESS_HOST, remote_cmd],
        capture_output=True, text=True, timeout=15,
    )
    if out.returncode != 0:
        raise RuntimeError(f"token lookup failed: {out.stderr}")

    token = out.stdout.strip()
    # The pipeline's exit status is cut's, which is 0 even when grep found
    # nothing, so an empty result has to be detected explicitly.
    if not token:
        raise RuntimeError(
            f"PAPERLESS_API_TOKEN not found in {ENV_PATH} "
            f"of container {PAPERLESS_AI_CONTAINER}"
        )
    return token
|
|
|
|
|
|
def fetch_correspondents(token: str) -> list[str]:
    """Return the live correspondent names, minus excluded recipients.

    Queries the Paperless correspondents endpoint with ``curl`` run inside
    the webserver container (via ``ssh`` + ``docker exec``), following the
    paginated ``next`` link until all pages are read. Names containing any
    RECIPIENT_EXCLUDE entry (case-insensitive substring) are dropped and
    reported on stdout.

    Args:
        token: Paperless API token sent in the ``Authorization`` header.

    Returns:
        The remaining names, sorted case-insensitively.

    Raises:
        RuntimeError: if the remote command fails or the response is not
            the expected JSON object with a ``results`` list.
        subprocess.TimeoutExpired: if a request exceeds 30 s.
    """
    names: list[str] = []
    page = 1
    while True:
        # The first request is issued without an explicit page parameter;
        # later pages are requested by number.
        page_arg = "" if page == 1 else f"&page={page}"
        cmd = (
            f"docker exec {PAPERLESS_WEB_CONTAINER} "
            f"curl -s -H 'Authorization: Token {token}' "
            f"'http://localhost:8000/api/correspondents/?page_size=500{page_arg}'"
        )
        out = subprocess.run(
            ["ssh", PAPERLESS_HOST, cmd],
            capture_output=True, text=True, timeout=30,
        )
        if out.returncode != 0:
            raise RuntimeError(f"fetch failed: {out.stderr}")

        # curl -s exits 0 on HTTP errors too, so validate the payload
        # instead of failing later with a bare KeyError.
        try:
            data = json.loads(out.stdout)
        except json.JSONDecodeError as err:
            raise RuntimeError(
                f"fetch returned non-JSON output: {out.stdout[:200]!r}"
            ) from err
        if not isinstance(data, dict) or "results" not in data:
            raise RuntimeError(
                f"unexpected API response: {out.stdout[:200]!r}"
            )

        names.extend(c["name"] for c in data["results"])
        if not data.get("next"):
            break
        page += 1

    filtered = [n for n in names
                if not any(x in n.lower() for x in RECIPIENT_EXCLUDE)]
    dropped = sorted(set(names) - set(filtered))
    if dropped:
        print(f"filtered out recipient-names: {dropped}")
    return sorted(filtered, key=lambda s: s.lower())
|
|
|
|
|
|
def render_prompt(template: str, names: list[str]) -> str:
    """Substitute the correspondents placeholder in *template*.

    Each name becomes one ``- name`` bullet line; the bullets are joined
    with newlines and replace every occurrence of PLACEHOLDER.
    """
    bullet_lines = map("- {}".format, names)
    correspondents_block = "\n".join(bullet_lines)
    return template.replace(PLACEHOLDER, correspondents_block)
|
|
|
|
|
|
def read_remote_env() -> str:
    """Return the current text of the paperless-ai container's .env.

    The file is read with ``cat`` inside the container, reached via
    ``ssh`` + ``docker exec``.

    Raises:
        RuntimeError: if the remote command exits non-zero.
        subprocess.TimeoutExpired: if ssh does not finish within 15 s.
    """
    remote_cmd = f"docker exec {PAPERLESS_AI_CONTAINER} cat {ENV_PATH}"
    result = subprocess.run(
        ["ssh", PAPERLESS_HOST, remote_cmd],
        capture_output=True,
        text=True,
        timeout=15,
    )
    if result.returncode == 0:
        return result.stdout
    raise RuntimeError(f"cat failed: {result.stderr}")
|
|
|
|
|
|
def replace_system_prompt(env: str, new_prompt: str) -> str:
    """Replace the SYSTEM_PROMPT=`…` block with the new one.

    Paperless-AI's .env uses backtick-delimited values for multi-line
    settings (JS .env loader convention; bash would not accept this).

    Args:
        env: Full text of the existing .env file.
        new_prompt: Prompt text to store; trailing whitespace is stripped
            before it is wrapped in backticks.

    Returns:
        The .env text with each SYSTEM_PROMPT entry replaced by a
        backtick-delimited entry holding *new_prompt*. All other lines
        are passed through unchanged.

    Raises:
        SystemExit: if *env* has no SYSTEM_PROMPT= line, or if
            *new_prompt* contains a backtick (it would terminate the
            delimited value early and corrupt the file).
    """
    key = "SYSTEM_PROMPT="
    if "`" in new_prompt:
        raise SystemExit(
            "rendered prompt contains a backtick; it cannot be stored in a "
            "backtick-delimited .env value"
        )

    out = []
    inside = False  # True while skipping continuation lines of the old value
    replaced = False
    for line in env.splitlines(keepends=True):
        if inside:
            # The old multi-line value ends on the line holding the closing
            # backtick; that line is dropped along with the rest.
            if "`" in line:
                inside = False
            continue
        if line.startswith(key):
            out.append(f"{key}`{new_prompt.rstrip()}`\n")
            replaced = True
            old_value = line[len(key):].rstrip("\n")
            # Only an opened-but-unclosed backtick value continues on the
            # following lines. A plain, empty, or already-closed value is a
            # single line, so nothing after it may be swallowed.
            inside = (old_value.lstrip().startswith("`")
                      and old_value.count("`") < 2)
            continue
        out.append(line)

    if not replaced:
        raise SystemExit("SYSTEM_PROMPT= line not found in .env")
    return "".join(out)
|
|
|
|
|
|
def main():
    """Render the prompt template and, with --apply, deploy it.

    Steps: read the local template, fetch the live correspondents, render
    the prompt, and splice it into the container's current .env text.
    Without ``--apply`` only a preview of the new SYSTEM_PROMPT line is
    printed. With ``--apply`` the old .env is copied to a timestamped
    backup, the new text is written, and the paperless-ai container is
    restarted. If the write fails, a restore from the backup is attempted
    before exiting.

    Exits via ``sys.exit`` with a message when the template lacks the
    placeholder, no API token could be read, or the write fails.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--apply", action="store_true",
                    help="Write new .env and restart paperless-ai")
    args = ap.parse_args()

    with open(TEMPLATE_PATH) as f:
        template = f.read()
    if PLACEHOLDER not in template:
        sys.exit(f"template missing placeholder {PLACEHOLDER}")

    token = get_token()
    # An empty token would only surface later as an opaque API error.
    if not token:
        sys.exit(f"no PAPERLESS_API_TOKEN could be read from {ENV_PATH}")
    names = fetch_correspondents(token)
    print(f"fetched {len(names)} live correspondents (after recipient filter)")
    rendered = render_prompt(template, names)
    print(f"rendered prompt: {len(rendered)} chars, {len(rendered.splitlines())} lines")

    env_before = read_remote_env()
    env_after = replace_system_prompt(env_before, rendered)
    if env_before == env_after:
        print("no change — live prompt already matches rendered template")
        return

    if not args.apply:
        print("--- new SYSTEM_PROMPT block ---")
        # Preview only the first line of the block, capped at 200 chars.
        for line in env_after.splitlines():
            if line.startswith("SYSTEM_PROMPT="):
                print(line[:200] + ("…" if len(line) > 200 else ""))
        print()
        print("DRY RUN — re-run with --apply to write + restart paperless-ai")
        return

    # Timezone-aware replacement for the deprecated utcnow(); the formatted
    # timestamp is identical.
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%S")
    backup = f"{ENV_PATH}.bak.{ts}"
    subprocess.run(
        ["ssh", PAPERLESS_HOST,
         f"docker exec {PAPERLESS_AI_CONTAINER} cp {ENV_PATH} {backup}"],
        check=True, timeout=15,
    )
    print(f"backup: {backup}")

    write_cmd = (
        f"docker exec -i {PAPERLESS_AI_CONTAINER} "
        f"sh -c 'cat > {ENV_PATH}'"
    )
    proc = subprocess.run(
        ["ssh", PAPERLESS_HOST, write_cmd],
        input=env_after, capture_output=True, text=True, timeout=30,
    )
    if proc.returncode != 0:
        # "cat >" truncates the target before writing, so a failed write
        # can leave a partial .env behind; put the backup back in place.
        try:
            restore = subprocess.run(
                ["ssh", PAPERLESS_HOST,
                 f"docker exec {PAPERLESS_AI_CONTAINER} cp {backup} {ENV_PATH}"],
                capture_output=True, text=True, timeout=15,
            )
            if restore.returncode == 0:
                status = f"restored {ENV_PATH} from {backup}"
            else:
                status = (f"restore from {backup} FAILED: {restore.stderr}")
        except (subprocess.SubprocessError, OSError) as err:
            status = f"restore from {backup} FAILED: {err}"
        sys.exit(f"write failed: {proc.stderr}\n{status}")
    # len() of a str counts characters, not bytes.
    print(f"wrote {len(env_after)} chars to {ENV_PATH}")

    subprocess.run(
        ["ssh", PAPERLESS_HOST, f"docker restart {PAPERLESS_AI_CONTAINER}"],
        check=True, timeout=60,
    )
    print(f"restarted {PAPERLESS_AI_CONTAINER}")


if __name__ == "__main__":
    main()
|