mAi: #10 - Anti-AI-Text-Lint im Build

tools/anti-ai-lint.py: Python-Linter (stdlib + yq) prueft jede build/<domain>/index.html gegen die Blacklist in tools/anti-ai-blacklist.yaml. HTML wird via html.parser auf sichtbaren Text reduziert (Skripte/Styles werden ignoriert), dann werden Vokabel-Substrings (DE+EN, case-insensitive) und Regex-Patterns gematcht. Severity warn = Build geht durch, fail = Build bricht ab. Whitelist-Mechanismen: - HTML-Kommentar im Markup: <!-- anti-ai-allow: term1, term2 --> - Per-Site in site.yaml: anti_ai_allow: [term1, term2] Integration in build.sh als Schritt 4/4, mit --skip-lint fuer Notfaelle. Dockerfile installiert python3 zusaetzlich; nur im Builder-Stage, kein Effekt aufs Caddy-Image. Tests via tools/test-anti-ai-lint.sh: synthetische AI-Fixture wird korrekt geflaggt, Whitelists unterdruecken Hits, fail-Severity triggert exit 1, neutraler Text exit 0. Initial-Lauf auf 59 bestehenden Sites: 2 warn (killusion.de "revolutionaer" in ironischem Kontext, kilofant.de "robust"), 0 fail. Cleanup ist Folge-Issue. README + docs/geo-seo-guideline.md aktualisiert mit der konkreten Tool-Position.
2026-04-30 02:50:50 +02:00
parent 156f156aa7
commit fdac496a6f
7 changed files with 547 additions and 6 deletions
--- a/tools/anti-ai-lint.py
+++ b/tools/anti-ai-lint.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""anti-ai-lint — flag AI-text fingerprints in built sites.
+
+Reads tools/anti-ai-blacklist.yaml, walks build/<domain>/index.html, prints
+findings. Exits 1 if any finding has severity=fail, else 0.
+
+Usage:
+    tools/anti-ai-lint.py [--blacklist PATH] [--sources sites/] [--quiet]
+                          [--json] [BUILD_DIR]
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+from html.parser import HTMLParser
+
+
+def _ansi(code: str) -> str:
+    """Return *code* when stdout is a terminal, otherwise an empty string."""
+    if not sys.stdout.isatty():
+        return ""
+    return code
+
+
+# ANSI escape sequences used for coloured output. Each constant is passed
+# through _ansi(), so all of them are "" when stdout is not a terminal.
+RED, YELLOW, GREEN, DIM, BOLD, RESET = map(
+    _ansi,
+    ("\033[31m", "\033[33m", "\033[32m", "\033[2m", "\033[1m", "\033[0m"),
+)
+
+
+class TextExtractor(HTMLParser):
+    """Collect visible text fragments and inline allow directives from HTML.
+
+    After ``feed()`` the instance exposes:
+
+    * ``fragments`` -- ``(line, text)`` tuples, one per non-blank text node
+      found outside of ``SKIP_TAGS`` elements; ``line`` is the line reported
+      by ``getpos()`` when the text node is handled.
+    * ``allows`` -- terms whitelisted through ``<!-- anti-ai-allow: ... -->``
+      comments, stored both verbatim and lower-cased.
+    * ``html_lang`` -- lower-cased primary subtag of the first
+      ``<html lang="...">`` attribute (``"de"`` for ``de-DE``), else ``None``.
+    """
+
+    # Elements whose text content is ignored when collecting fragments.
+    SKIP_TAGS = {"script", "style", "noscript", "template"}
+
+    # Allow directive inside a comment body. ``.`` does not match newlines,
+    # so only the remainder of the directive's own line is captured.
+    _ALLOW_DIRECTIVE = re.compile(r"anti-ai-allow\s*:\s*(.+)", re.IGNORECASE)
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        # Nesting depth inside SKIP_TAGS elements; text is kept only at 0.
+        self.skip_depth = 0
+        self.fragments: list[tuple[int, str]] = []
+        self.allows: set[str] = set()
+        self.html_lang: str | None = None
+
+    def handle_starttag(self, tag: str, attrs) -> None:
+        """Record the document language and track entry into skipped tags."""
+        if tag == "html" and self.html_lang is None:
+            for k, v in attrs:
+                if k == "lang" and v:
+                    # Keep only the primary subtag: "de-DE" -> "de".
+                    self.html_lang = v.lower().split("-")[0]
+                    break
+        if tag in self.SKIP_TAGS:
+            self.skip_depth += 1
+
+    def handle_startendtag(self, tag: str, attrs) -> None:
+        """Ignore self-closing tags.
+
+        Overriding the default (which would call ``handle_starttag`` and
+        ``handle_endtag``) means a self-closing tag never touches
+        ``skip_depth`` and carries no text.
+        """
+        pass
+
+    def handle_endtag(self, tag: str) -> None:
+        """Leave a skipped element; never lets ``skip_depth`` go negative."""
+        if tag in self.SKIP_TAGS and self.skip_depth > 0:
+            self.skip_depth -= 1
+
+    def handle_data(self, data: str) -> None:
+        """Store non-blank text that is not inside a skipped element."""
+        if self.skip_depth == 0 and data.strip():
+            line, _ = self.getpos()
+            self.fragments.append((line, data))
+
+    def handle_comment(self, data: str) -> None:
+        """Parse an ``anti-ai-allow: term1, term2`` directive from a comment.
+
+        Comma-separated entries are registered as whole phrases (inner
+        whitespace collapsed to single spaces) so that multi-word terms such
+        as ``delve into`` can be whitelisted. Every whitespace- or
+        comma-separated token is registered as well, which keeps the older
+        ``anti-ai-allow: term1 term2`` spelling working.
+        """
+        m = self._ALLOW_DIRECTIVE.search(data)
+        if not m:
+            return
+        directive = m.group(1)
+        phrases = [" ".join(part.split()) for part in directive.split(",")]
+        words = re.split(r"[,\s]+", directive)
+        for token in (*phrases, *words):
+            if token:
+                self.allows.add(token)
+                self.allows.add(token.lower())
+
+
+def load_blacklist(path: str) -> dict:
+    """Load the YAML blacklist at *path* and return it as a dict.
+
+    The file is converted to JSON by the external ``yq`` binary and then
+    parsed with the stdlib ``json`` module.
+
+    Raises:
+        SystemExit: with an ``ERROR: ...`` message when ``yq`` is not on
+            PATH, when ``yq`` exits non-zero, when its output is not valid
+            JSON (e.g. empty output), or when the top-level value is not a
+            mapping.
+    """
+    try:
+        out = subprocess.check_output(
+            ["yq", "-o=json", path],
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+    except FileNotFoundError:
+        sys.exit("ERROR: yq not found in PATH (required to parse YAML blacklist)")
+    except subprocess.CalledProcessError as e:
+        sys.exit(f"ERROR: yq failed to parse {path}: {e.stderr.strip()}")
+
+    try:
+        data = json.loads(out)
+    except json.JSONDecodeError as e:
+        sys.exit(f"ERROR: yq produced invalid JSON for {path}: {e}")
+    # Callers use dict methods on the result; reject null/list/scalar
+    # documents here with a clear message rather than failing later.
+    if not isinstance(data, dict):
+        sys.exit(
+            f"ERROR: blacklist {path} must be a YAML mapping, "
+            f"got {type(data).__name__}"
+        )
+    return data
+
+
+def site_allow_yaml(site_yaml: str) -> list[str]:
+    """Return the ``anti_ai_allow`` entries declared in *site_yaml*.
+
+    The list is read through ``yq``; every non-blank output line becomes one
+    stripped entry. A missing file or a non-zero ``yq`` exit yields ``[]``.
+    """
+    if os.path.isfile(site_yaml):
+        cmd = ["yq", "-r", "(.anti_ai_allow // []) | .[]", site_yaml]
+        try:
+            listing = subprocess.check_output(
+                cmd, stderr=subprocess.DEVNULL, text=True
+            )
+        except subprocess.CalledProcessError:
+            # Best effort: a site.yaml that yq rejects contributes nothing.
+            pass
+        else:
+            entries = (row.strip() for row in listing.splitlines())
+            return [entry for entry in entries if entry]
+    return []
+
+
+def lint_file(html_path: str, blacklist: dict, extra_allows: list[str]) -> list[dict]:
+    """Lint one built HTML file against the blacklist.
+
+    Args:
+        html_path: HTML file to check; read as UTF-8 with undecodable bytes
+            replaced.
+        blacklist: Parsed blacklist. ``vocab`` maps a language (``de``,
+            ``en``) to ``warn`` / ``fail`` term lists; ``patterns`` is a list
+            of mappings with ``regex`` and optional ``name``, ``severity``
+            (default ``warn``) and ``case_sensitive``.
+        extra_allows: Extra allowed vocab terms / pattern names, merged with
+            the allow directives found in the HTML itself.
+
+    Returns:
+        Finding dicts with the keys ``kind``, ``name``, ``severity``,
+        ``line`` and ``snippet`` (vocab findings also carry ``lang``). Each
+        vocab term and each pattern is reported at most once per file, at the
+        first text fragment that matches. Matching is done per text fragment,
+        so a term or pattern is only found when it lies within a single
+        fragment. Parser failures and unusable pattern entries are reported
+        as ``warn`` findings of kind ``parse`` / ``config`` with ``line`` 0.
+    """
+    with open(html_path, "r", encoding="utf-8", errors="replace") as f:
+        raw = f.read()
+
+    parser = TextExtractor()
+    try:
+        parser.feed(raw)
+        parser.close()
+    except Exception as e:
+        # Malformed HTML — record a single warning and skip.
+        return [{
+            "kind": "parse",
+            "name": "html-parse-error",
+            "severity": "warn",
+            "line": 0,
+            "snippet": str(e)[:120],
+        }]
+
+    allow_set = set(parser.allows)
+    for tok in extra_allows:
+        allow_set.add(tok)
+        allow_set.add(tok.lower())
+
+    # Lower-case every fragment once instead of once per blacklist term.
+    fragments_lc = [
+        (line_no, frag, frag.lower()) for line_no, frag in parser.fragments
+    ]
+
+    findings: list[dict] = []
+    seen: set[tuple[str, str]] = set()  # (kind, name) — one report per file
+
+    # Lint vocab in BOTH languages — sites may carry data-en attributes
+    # that surface translated text alongside the primary language.
+    # NOTE(review): only text nodes are collected by TextExtractor; attribute
+    # values themselves are not scanned here — confirm this is intended.
+    vocab = blacklist.get("vocab") or {}
+    for lang in ("de", "en"):
+        bucket = vocab.get(lang) or {}
+        for severity in ("warn", "fail"):
+            for term in bucket.get(severity) or []:
+                term_lc = term.lower()
+                key = ("vocab", term_lc)
+                if key in seen:
+                    continue
+                if term in allow_set or term_lc in allow_set:
+                    continue
+                for line_no, frag, frag_lc in fragments_lc:
+                    if term_lc in frag_lc:
+                        findings.append({
+                            "kind": "vocab",
+                            "lang": lang,
+                            "name": term,
+                            "severity": severity,
+                            "line": line_no,
+                            "snippet": frag.strip()[:120],
+                        })
+                        seen.add(key)
+                        break
+
+    # Patterns
+    for pat in blacklist.get("patterns") or []:
+        if not isinstance(pat, dict):
+            # A bare scalar/list entry is a config mistake, not a crash.
+            findings.append({
+                "kind": "config",
+                "name": str(pat)[:40],
+                "severity": "warn",
+                "line": 0,
+                "snippet": "pattern entry is not a mapping",
+            })
+            continue
+        regex = pat.get("regex")
+        name = pat.get("name") or (regex[:40] if isinstance(regex, str) else "")
+        key = ("pattern", name)
+        if key in seen:
+            continue
+        if name in allow_set or name.lower() in allow_set:
+            continue
+        if not isinstance(regex, str):
+            # Missing or null ``regex``: report it like an invalid regex
+            # instead of raising KeyError/TypeError and aborting the run.
+            findings.append({
+                "kind": "config",
+                "name": name or "(unnamed pattern)",
+                "severity": "warn",
+                "line": 0,
+                "snippet": "missing or non-string 'regex'",
+            })
+            continue
+        flags = re.MULTILINE
+        if not pat.get("case_sensitive"):
+            flags |= re.IGNORECASE
+        try:
+            rx = re.compile(regex, flags)
+        except re.error as e:
+            findings.append({
+                "kind": "config",
+                "name": name,
+                "severity": "warn",
+                "line": 0,
+                "snippet": f"invalid regex: {e}",
+            })
+            continue
+        for line_no, frag in parser.fragments:
+            m = rx.search(frag)
+            if m:
+                findings.append({
+                    "kind": "pattern",
+                    "name": name,
+                    "severity": pat.get("severity", "warn"),
+                    "line": line_no,
+                    "snippet": (frag.strip()[:120] or m.group(0)[:120]),
+                })
+                seen.add(key)
+                break
+
+    return findings
+
+
+def main() -> int:
+    """Command-line entry point; returns the process exit code.
+
+    Lints ``<build_dir>/<entry>/index.html`` for every entry of the build
+    directory and prints either a coloured text report or, with ``--json``,
+    a JSON document. Returns ``2`` when the build dir or the blacklist file
+    is missing, ``1`` when at least one finding has severity ``fail``, and
+    ``0`` otherwise.
+    """
+    tools_dir = os.path.dirname(os.path.abspath(__file__))
+    repo_root = os.path.dirname(tools_dir)
+
+    cli = argparse.ArgumentParser(description="Flag AI-text fingerprints in built sites.")
+    cli.add_argument("build_dir", nargs="?", default=os.path.join(repo_root, "build"))
+    cli.add_argument("--blacklist", default=os.path.join(tools_dir, "anti-ai-blacklist.yaml"))
+    cli.add_argument("--sources", default=os.path.join(repo_root, "sites"),
+                     help="sites/ root (for per-site site.yaml allow lists)")
+    cli.add_argument("--quiet", action="store_true",
+                     help="Suppress warnings; only show fails.")
+    cli.add_argument("--json", action="store_true", help="Emit JSON report.")
+    args = cli.parse_args()
+
+    if not os.path.isdir(args.build_dir):
+        print(f"ERROR: build dir not found: {args.build_dir}", file=sys.stderr)
+        return 2
+    if not os.path.isfile(args.blacklist):
+        print(f"ERROR: blacklist not found: {args.blacklist}", file=sys.stderr)
+        return 2
+
+    blacklist = load_blacklist(args.blacklist)
+
+    total_warn = total_fail = 0
+    sites_with_findings = sites_total = 0
+    json_sites: list[dict] = []
+
+    for entry in sorted(os.listdir(args.build_dir)):
+        index_html = os.path.join(args.build_dir, entry, "index.html")
+        if not os.path.isfile(index_html):
+            continue
+        sites_total += 1
+
+        allow_terms = site_allow_yaml(os.path.join(args.sources, entry, "site.yaml"))
+        findings = lint_file(index_html, blacklist, allow_terms)
+        if not findings:
+            continue
+
+        fails = [item for item in findings if item["severity"] == "fail"]
+        sites_with_findings += 1
+        total_fail += len(fails)
+        total_warn += sum(1 for item in findings if item["severity"] == "warn")
+
+        if args.json:
+            # JSON mode collects everything and prints once at the end.
+            json_sites.append({"site": entry, "findings": findings})
+            continue
+
+        visible = fails if args.quiet else findings
+        if not visible:
+            continue
+        print(f"{BOLD}{entry}{RESET}")
+        for item in visible:
+            is_fail = item["severity"] == "fail"
+            color = RED if is_fail else YELLOW
+            tag = "FAIL" if is_fail else "warn"
+            lang = f" ({item['lang']})" if "lang" in item else ""
+            print(
+                f"  {color}{tag}{RESET} {item['kind']}{lang}: "
+                f"{BOLD}{item['name']}{RESET}  "
+                f"{DIM}line {item['line']}: {item['snippet']}{RESET}"
+            )
+
+    exit_code = int(total_fail > 0)
+
+    if args.json:
+        report = {
+            "summary": {
+                "sites_total": sites_total,
+                "sites_with_findings": sites_with_findings,
+                "warn": total_warn,
+                "fail": total_fail,
+            },
+            "sites": json_sites,
+        }
+        json.dump(report, sys.stdout, indent=2, ensure_ascii=False)
+        print()
+        return exit_code
+
+    # Text mode: one-line summary, coloured by the worst severity seen.
+    tag, color = "OK", GREEN
+    if total_fail > 0:
+        tag, color = "FAIL", RED
+    elif total_warn > 0:
+        tag, color = "WARN", YELLOW
+    print(
+        f"\n{color}anti-ai-lint: {tag}{RESET} — "
+        f"{sites_with_findings}/{sites_total} sites flagged "
+        f"({total_fail} fail, {total_warn} warn)"
+    )
+    return exit_code
+
+
+if __name__ == "__main__":
+    # Use main()'s return value as the process exit status.
+    raise SystemExit(main())