Merge branch 'mai/artemis/issue-10-anti-ai-lint': Anti-AI-Lint im Build (#10)

2026-04-30 02:53:39 +02:00
parent d3a2bdce97 fdac496a6f
commit b12352473c
7 changed files with 547 additions and 6 deletions
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 FROM alpine:3.21 AS builder

-RUN apk add --no-cache bash yq coreutils findutils
+RUN apk add --no-cache bash yq coreutils findutils python3

 WORKDIR /src
 COPY . .
--- a/README.md
+++ b/README.md
@@ -31,10 +31,32 @@ build/           # Generated output (gitignored)
 ### Build

 ```bash
-./build.sh
+./build.sh              # build + anti-AI text lint
+./build.sh --skip-lint  # build only (emergencies)
 ```

-Requires `yq` for YAML parsing. Outputs to `build/` directory.
+Requires `yq` for YAML parsing and `python3` for the lint step. Outputs to `build/`.
+
+### Anti-AI text lint
+
+Every build runs `tools/anti-ai-lint.py` against `build/<domain>/index.html`,
+flagging text fingerprints typical of LLM-generated content (vocab and structure
+patterns from `tools/anti-ai-blacklist.yaml`). Severity `warn` prints a message;
+`fail` aborts the build.
+
+Whitelist a hit:
+
+- HTML comment in the affected page:
+  `<!-- anti-ai-allow: revolutionär, em-dash-3-bullet -->`
+- Per-site override in `site.yaml`:
+  ```yaml
+  anti_ai_allow:
+    - revolutionär
+    - em-dash-3-bullet
+  ```
+
+The blacklist source is `docs/geo-seo-guideline.md` §3.6. Test the linter with
+`tools/test-anti-ai-lint.sh`.

 ### Deploy

--- a/build.sh
+++ b/build.sh
@@ -5,6 +5,19 @@ set -euo pipefail
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 BUILD_DIR="$SCRIPT_DIR/build"

+skip_lint=0
+for arg in "$@"; do
+    case "$arg" in
+        --skip-lint) skip_lint=1 ;;
+        -h|--help)
+            echo "Usage: $0 [--skip-lint]"
+            echo "  --skip-lint   Skip the anti-AI text lint step (emergencies only)."
+            exit 0
+            ;;
+        *) echo "Unknown argument: $arg" >&2; exit 2 ;;
+    esac
+done
+
 echo "=== Onepager Build ==="

 # Clean build directory
@@ -50,6 +63,14 @@ echo "[3/3] Copying shared assets..."
 cp -r "$SCRIPT_DIR/shared" "$BUILD_DIR/shared"
 echo "  -> shared/ copied"

-# 4. Report
-echo "[4/4] Build complete"
+# 4. Anti-AI text lint
+if [ "$skip_lint" -eq 1 ]; then
+    echo "[4/4] Anti-AI lint skipped (--skip-lint)"
+elif ! command -v python3 >/dev/null 2>&1; then
+    echo "[4/4] python3 not found — skipping anti-AI lint"
+else
+    echo "[4/4] Anti-AI text lint..."
+    python3 "$SCRIPT_DIR/tools/anti-ai-lint.py" "$BUILD_DIR"
+fi
+
 echo "=== Build complete: $count sites ==="
--- a/docs/geo-seo-guideline.md
+++ b/docs/geo-seo-guideline.md
@@ -248,7 +248,7 @@ Mehrere dieser Wörter im selben Absatz sind das stärkste Tell.

 **Praktische Umsetzung:**

-- Lint-Skript im Build (`scripts/anti-ai-lint.sh`) das Vokabel-Blacklist über alle `index.html` und `*.md` läuft, mit Schwellenwert (z. B. mehr als 3 Marker pro 500 Wörter → Warnung).
+- Lint-Skript im Build: für onepager implementiert in `tools/anti-ai-lint.py` mit Vokabel-Blacklist in `tools/anti-ai-blacklist.yaml`. Läuft am Ende von `build.sh` über alle `build/<domain>/index.html`. Severity `warn` (Build geht durch) vs. `fail` (Build bricht ab). Whitelist via `<!-- anti-ai-allow: term -->`-Kommentar oder `anti_ai_allow:`-Liste in `site.yaml`. Notfall-Override: `build.sh --skip-lint`.
 - Bei AI-generierten Drafts: bewusst gegen die Blacklist redigieren.
 - Vor Veröffentlichung laut lesen. Wenn es klingt wie ein Pressemitteilungs-Bot, ist es einer.

--- a/tools/anti-ai-blacklist.yaml
+++ b/tools/anti-ai-blacklist.yaml
@@ -0,0 +1,97 @@
+# Anti-AI lint rules: textual fingerprints typical of LLM-generated content.
+#
+# Severity:
+#   warn — build proceeds, message printed
+#   fail — build aborts (exit 1) unless build.sh --skip-lint
+#
+# Whitelisting matches:
+#   In an HTML file:        <!-- anti-ai-allow: term -->
+#                            <!-- anti-ai-allow: term1, term2 -->
+#   Per site (site.yaml):    anti_ai_allow:
+#                              - leverage
+#                              - em-dash-3-bullet
+#
+# Vocab matches are case-insensitive substring matches within each visible text
+# node of the HTML (script/style/noscript/template and comments skipped). Pattern
+# matches are regex (Python re), case-insensitive by default, per text node too.
+#
+# Source: docs/geo-seo-guideline.md §3.6 (Wikipedia AI-content signals).
+
+vocab:
+  de:
+    warn:
+      - "nahtlos"
+      - "robust"
+      - "umfassend"
+      - "ganzheitlich"
+      - "fungiert als"
+      - "dient als Brücke"
+      - "Symbiose"
+      - "im Bereich der"
+      - "in der heutigen schnelllebigen"
+      - "ein Meilenstein"
+      - "ein Beweis für"
+      - "hat Spuren hinterlassen"
+      - "Es ist wichtig zu erwähnen"
+      - "Es ist wichtig zu beachten"
+      - "bahnbrechend"
+      - "revolutionär"
+    fail:
+      - "in der sich entwickelnden Landschaft"
+      - "Herausforderungen und Zukunftsaussichten"
+      - "Herausforderungen und Perspektiven"
+
+  en:
+    warn:
+      - "delve"
+      - "tapestry"
+      - "testament"
+      - "intricate"
+      - "garnered"
+      - "bolstered"
+      - "enduring"
+      - "robust"
+      - "comprehensive"
+      - "meticulous"
+      - "interplay"
+      - "pivotal"
+      - "underscore"
+      - "moreover"
+      - "furthermore"
+      - "additionally"
+      - "crucial"
+      - "showcasing"
+      - "highlighting"
+      - "leverage"
+      - "streamline"
+      - "holistic"
+      - "seamless"
+      - "unleash"
+      - "ecosystem"
+      - "in the realm of"
+      - "dive into"
+      - "It's important to note that"
+      - "It is important to note that"
+      - "In this article, we'll"
+    fail:
+      - "in today's evolving landscape"
+      - "in the ever-evolving landscape"
+      - "Challenges and Future Prospects"
+
+patterns:
+  - name: em-dash-3-bullet
+    description: |
+      Three "Word: text — Word: text — Word: …" segments in one block.
+      Classic AI bullet pattern.
+    regex: '(\w[\w\s]{0,30}:\s+[^—\n]{2,80}—\s*){2,}\w[\w\s]{0,30}:'
+    severity: warn
+
+  - name: not-only-but-also
+    description: '"not only X, but also Y" / "nicht nur X, sondern auch Y" filler.'
+    regex: '\b(?:not only|nicht nur)\b[^.,;\n]{1,80}\b(?:but also|sondern auch)\b'
+    severity: warn
+
+  - name: as-an-ai  # article is optional, but must be followed by whitespace
+    description: Leftover AI self-disclosure.
+    regex: '\b(?:as an? (?:AI|language model)|als (?:eine?\s+)?(?:KI|Sprachmodell))\b'
+    severity: fail
--- a/tools/anti-ai-lint.py
+++ b/tools/anti-ai-lint.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""anti-ai-lint — flag AI-text fingerprints in built sites.
+
+Reads tools/anti-ai-blacklist.yaml, walks build/<domain>/index.html, prints findings.
+Exits 1 on any severity=fail finding, 2 if BUILD_DIR or the blacklist is missing, else 0.
+
+Usage:
+    tools/anti-ai-lint.py [--blacklist PATH] [--sources sites/] [--quiet]
+                          [--json] [BUILD_DIR]
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+from html.parser import HTMLParser
+
+
+def _ansi(code: str) -> str:
+    """Return ``code`` when stdout is a terminal, otherwise an empty string."""
+    return "" if not sys.stdout.isatty() else code
+
+
+# Colour/format escapes; each collapses to "" when output is piped or captured.
+RED, YELLOW, GREEN = _ansi("\033[31m"), _ansi("\033[33m"), _ansi("\033[32m")
+DIM = _ansi("\033[2m")
+BOLD = _ansi("\033[1m")
+RESET = _ansi("\033[0m")
+
+
+class TextExtractor(HTMLParser):
+    """Collect visible text fragments, the <html lang> value and in-page allow directives."""
+
+    SKIP_TAGS = {"script", "style", "noscript", "template"}  # text inside is not collected
+
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=True)
+        self.skip_depth = 0  # > 0 while inside a SKIP_TAGS element
+        self.fragments: list[tuple[int, str]] = []  # (source line, text node)
+        self.allows: set[str] = set()  # whitelisted names, as written and lower-cased
+        self.html_lang: str | None = None  # first subtag of <html lang>, lower-cased
+
+    def handle_starttag(self, tag: str, attrs) -> None:
+        if tag == "html" and self.html_lang is None:
+            for k, v in attrs:
+                if k == "lang" and v:
+                    self.html_lang = v.lower().split("-")[0]
+                    break
+        if tag in self.SKIP_TAGS:
+            self.skip_depth += 1
+
+    def handle_startendtag(self, tag: str, attrs) -> None:
+        # Self-closing — never enters skip depth, no data either.
+        pass
+
+    def handle_endtag(self, tag: str) -> None:
+        if tag in self.SKIP_TAGS and self.skip_depth > 0:
+            self.skip_depth -= 1
+
+    def handle_data(self, data: str) -> None:
+        if self.skip_depth == 0 and data.strip():
+            line, _ = self.getpos()
+            self.fragments.append((line, data))
+
+    def handle_comment(self, data: str) -> None:
+        # "anti-ai-allow: a, b c" allows "a", the phrase "b c" and, as before, "b" and "c".
+        m = re.search(r"anti-ai-allow\s*:\s*(.+)", data, re.IGNORECASE)
+        for entry in (m.group(1).split(",") if m else ()):
+            # Whole comma-separated entry first: multi-word blacklist terms need it.
+            for token in (" ".join(entry.split()), *entry.split()):
+                if token:
+                    self.allows.update((token, token.lower()))
+
+
+def load_blacklist(path: str) -> dict:
+    """Parse the YAML blacklist via ``yq -o=json``; tool errors exit 2, never 1."""
+    try:
+        cmd = ["yq", "-o=json", path]
+        out = subprocess.check_output(cmd, stderr=subprocess.PIPE, text=True)
+    except FileNotFoundError:
+        # Exit 1 is reserved for "lint found a fail"; setup problems use 2 like main().
+        print("ERROR: yq not found in PATH (required to parse YAML blacklist)", file=sys.stderr)
+        sys.exit(2)
+    except subprocess.CalledProcessError as e:
+        print(f"ERROR: yq failed to parse {path}: {e.stderr.strip()}", file=sys.stderr)
+        sys.exit(2)
+    return json.loads(out or "null") or {}  # empty YAML -> no rules instead of a crash
+
+
+def site_allow_yaml(site_yaml: str) -> list[str]:
+    """Return the ``anti_ai_allow`` entries of ``site_yaml``; [] if missing or yq fails."""
+    if not os.path.isfile(site_yaml):
+        return []
+    query = "(.anti_ai_allow // []) | .[]"
+    try:
+        listing = subprocess.check_output(
+            ["yq", "-r", query, site_yaml], stderr=subprocess.DEVNULL, text=True
+        )
+    except subprocess.CalledProcessError:
+        return []
+    return [entry for entry in map(str.strip, listing.splitlines()) if entry]
+
+
+def lint_file(html_path: str, blacklist: dict, extra_allows: list[str]) -> list[dict]:  # findings for one page
+    with open(html_path, "r", encoding="utf-8", errors="replace") as f:  # undecodable bytes are replaced, not fatal
+        raw = f.read()
+
+    parser = TextExtractor()
+    try:
+        parser.feed(raw)
+        parser.close()
+    except Exception as e:
+        # Parser raised (malformed HTML) — return one warn-level finding, run no rules.
+        return [{
+            "kind": "parse",
+            "name": "html-parse-error",
+            "severity": "warn",
+            "line": 0,
+            "snippet": str(e)[:120],
+        }]
+
+    allow_set = set(parser.allows)  # names allowed via HTML comments in this page
+    for tok in extra_allows:  # plus caller-supplied names, as given and lower-cased
+        allow_set.add(tok)
+        allow_set.add(tok.lower())
+
+    findings: list[dict] = []
+    seen: set[tuple[str, str]] = set()  # (kind, name) — one report per file
+
+    # Lint vocab in BOTH languages regardless of <html lang>. NOTE(review): only text
+    # nodes are scanned; text held solely in data-en attributes is not — confirm intent.
+    vocab = blacklist.get("vocab") or {}
+    for lang in ("de", "en"):
+        bucket = vocab.get(lang) or {}
+        for severity in ("warn", "fail"):
+            for term in bucket.get(severity) or []:
+                key = ("vocab", term.lower())
+                if key in seen:  # same term already reported (e.g. listed under both languages)
+                    continue
+                if term in allow_set or term.lower() in allow_set:
+                    continue
+                term_lc = term.lower()
+                for line_no, frag in parser.fragments:
+                    if term_lc in frag.lower():  # case-insensitive substring match
+                        findings.append({
+                            "kind": "vocab",
+                            "lang": lang,
+                            "name": term,
+                            "severity": severity,
+                            "line": line_no,
+                            "snippet": frag.strip()[:120],
+                        })
+                        seen.add(key)
+                        break  # first matching fragment only
+
+    # Regex patterns: at most one finding per pattern (first matching fragment).
+    for pat in blacklist.get("patterns") or []:
+        name = pat.get("name") or pat.get("regex", "")[:40]  # unnamed: identify by regex prefix
+        key = ("pattern", name)
+        if key in seen:
+            continue
+        if name in allow_set or name.lower() in allow_set:
+            continue
+        flags = re.MULTILINE
+        if not pat.get("case_sensitive"):  # case-insensitive unless the rule opts out
+            flags |= re.IGNORECASE
+        try:
+            rx = re.compile(pat["regex"], flags)
+        except re.error as e:  # a bad regex in the blacklist is reported, not fatal
+            findings.append({
+                "kind": "config",
+                "name": name,
+                "severity": "warn",
+                "line": 0,
+                "snippet": f"invalid regex: {e}",
+            })
+            continue
+        for line_no, frag in parser.fragments:
+            m = rx.search(frag)
+            if m:
+                findings.append({
+                    "kind": "pattern",
+                    "name": name,
+                    "severity": pat.get("severity", "warn"),
+                    "line": line_no,
+                    "snippet": (frag.strip()[:120] or m.group(0)[:120]),
+                })
+                seen.add(key)
+                break
+
+    return findings
+
+
+def main() -> int:  # returns the exit status: 0 clean/warn-only, 1 fail finding, 2 bad paths
+    here = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
+    repo = os.path.dirname(here)  # its parent; base for the default build/ and sites/ paths
+
+    ap = argparse.ArgumentParser(description="Flag AI-text fingerprints in built sites.")
+    ap.add_argument("build_dir", nargs="?", default=os.path.join(repo, "build"))
+    ap.add_argument("--blacklist", default=os.path.join(here, "anti-ai-blacklist.yaml"))
+    ap.add_argument("--sources", default=os.path.join(repo, "sites"),
+                    help="sites/ root (for per-site site.yaml allow lists)")
+    ap.add_argument("--quiet", action="store_true",
+                    help="Suppress warnings; only show fails.")
+    ap.add_argument("--json", action="store_true", help="Emit JSON report.")
+    args = ap.parse_args()
+
+    if not os.path.isdir(args.build_dir):
+        print(f"ERROR: build dir not found: {args.build_dir}", file=sys.stderr)
+        return 2  # setup error, distinct from 1 = fail-level finding
+    if not os.path.isfile(args.blacklist):
+        print(f"ERROR: blacklist not found: {args.blacklist}", file=sys.stderr)
+        return 2
+
+    blacklist = load_blacklist(args.blacklist)
+
+    total_warn = 0
+    total_fail = 0
+    sites_with_findings = 0
+    sites_total = 0
+    json_sites: list[dict] = []  # per-site entries for the --json report
+
+    for entry in sorted(os.listdir(args.build_dir)):  # sorted for a stable report order
+        site_dir = os.path.join(args.build_dir, entry)
+        html = os.path.join(site_dir, "index.html")
+        if not os.path.isfile(html):  # only entries with an index.html count as sites
+            continue
+        sites_total += 1
+
+        site_yaml = os.path.join(args.sources, entry, "site.yaml")  # <sources>/<entry>/site.yaml
+        extra_allows = site_allow_yaml(site_yaml)
+
+        findings = lint_file(html, blacklist, extra_allows)
+        warns = [f for f in findings if f["severity"] == "warn"]
+        fails = [f for f in findings if f["severity"] == "fail"]
+
+        if findings:
+            sites_with_findings += 1
+            total_warn += len(warns)
+            total_fail += len(fails)
+            if args.json:
+                json_sites.append({"site": entry, "findings": findings})
+            else:
+                visible = fails if args.quiet else findings  # --quiet hides warn-level lines
+                if visible:
+                    print(f"{BOLD}{entry}{RESET}")
+                    for f in visible:
+                        if f["severity"] == "fail":
+                            color, tag = RED, "FAIL"
+                        else:
+                            color, tag = YELLOW, "warn"
+                        lang = f" ({f['lang']})" if "lang" in f else ""  # only vocab findings carry "lang"
+                        print(
+                            f"  {color}{tag}{RESET} {f['kind']}{lang}: "
+                            f"{BOLD}{f['name']}{RESET}  "
+                            f"{DIM}line {f['line']}: {f['snippet']}{RESET}"
+                        )
+
+    if args.json:
+        json.dump(
+            {
+                "summary": {
+                    "sites_total": sites_total,
+                    "sites_with_findings": sites_with_findings,
+                    "warn": total_warn,
+                    "fail": total_fail,
+                },
+                "sites": json_sites,
+            },
+            sys.stdout,
+            indent=2,
+            ensure_ascii=False,
+        )
+        print()  # json.dump writes no trailing newline
+    else:
+        if total_fail > 0:
+            tag, color = "FAIL", RED
+        elif total_warn > 0:
+            tag, color = "WARN", YELLOW
+        else:
+            tag, color = "OK", GREEN
+        print(
+            f"\n{color}anti-ai-lint: {tag}{RESET} — "
+            f"{sites_with_findings}/{sites_total} sites flagged "
+            f"({total_fail} fail, {total_warn} warn)"
+        )
+
+    return 1 if total_fail > 0 else 0  # warn-level findings alone do not fail the run
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tools/test-anti-ai-lint.sh
+++ b/tools/test-anti-ai-lint.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+# Self-test for tools/anti-ai-lint.py: writes a synthetic AI-text fixture to a
+# temp dir, asserts the linter flags it (exit 1), checks that an allow comment
+# suppresses the listed hits, and that a neutral page exits 0.
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+LINT="$SCRIPT_DIR/anti-ai-lint.py"  # linter under test, next to this script
+
+tmp=$(mktemp -d)
+trap 'rm -rf "$tmp"' EXIT  # remove the fixture tree when the script exits
+
+mkdir -p "$tmp/build/synthetic-ai.test"  # one site directory under the build root
+
+cat > "$tmp/build/synthetic-ai.test/index.html" <<'HTML'
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <title>Synthetic AI sample</title>
+  <style>.foo { color: red; } /* leverage in CSS comment must be ignored */</style>
+  <script>const x = "leverage"; // in JS, must be ignored</script>
+</head>
+<body>
+  <h1>In today's evolving landscape</h1>
+  <p>This is a comprehensive, robust, holistic solution that lets us leverage emerging trends.</p>
+  <p>We delve into the intricate tapestry of AI to navigate this pivotal moment.</p>
+  <h2>Challenges and Future Prospects</h2>
+  <ul>
+    <li>Effizienz: hoch — Skalierbarkeit: gut — Sicherheit: solide</li>
+  </ul>
+</body>
+</html>
+HTML
+
+expect_finding() {
+    # expect_finding <json-report> <name>
+    # Succeeds iff exactly one finding in the report carries that name.
+    python3 -c '
+import json, sys
+report, wanted = json.loads(sys.argv[1]), sys.argv[2]
+names = [f["name"] for s in report["sites"] for f in s["findings"]]
+found = names.count(wanted)
+if found != 1:
+    sys.exit(f"expected exactly 1 finding for {wanted!r}, got {found}")
+' "$1" "$2"
+}
+
+expect_no_finding() {
+    # expect_no_finding <json-report> <name>: succeeds iff no finding has that name.
+    python3 -c '
+import json, sys
+report, unwanted = json.loads(sys.argv[1]), sys.argv[2]
+hits = [f for s in report["sites"] for f in s["findings"] if f["name"] == unwanted]
+if not hits:
+    sys.exit(0)
+sys.exit(f"unexpected finding for {unwanted!r}: {hits}")
+' "$1" "$2"
+}
+
+echo "[1] expecting FAIL on synthetic AI fixture..."
+report=$(python3 "$LINT" --json "$tmp/build" 2>/dev/null) && rc=0 || rc=$?
+if [ "$rc" -ne 1 ]; then
+    echo "FAIL: expected exit 1, got $rc" >&2
+    echo "$report" >&2
+    exit 1
+fi
+for term in "in today's evolving landscape" "Challenges and Future Prospects" \
+            "leverage" "comprehensive" "delve" "em-dash-3-bullet"; do
+    expect_finding "$report" "$term" || exit 1
+done
+echo "  OK"
+
+echo "[2] expecting whitelist comment to suppress hits..."
+awk -v c='  <!-- anti-ai-allow: leverage, comprehensive, delve, em-dash-3-bullet -->' \
+    '{ print } NR == 4 { print c }' "$tmp/build/synthetic-ai.test/index.html" > "$tmp/page.new"
+mv "$tmp/page.new" "$tmp/build/synthetic-ai.test/index.html"  # portable stand-in for GNU `sed -i 4a`
+report=$(python3 "$LINT" --json "$tmp/build" 2>/dev/null) || true  # exit 1 expected: fail hits remain
+for term in "leverage" "comprehensive" "delve" "em-dash-3-bullet"; do
+    expect_no_finding "$report" "$term" || exit 1
+done
+# fail-level "in today's evolving landscape" should still be reported
+expect_finding "$report" "in today's evolving landscape" || exit 1
+echo "  OK"
+
+echo "[3] expecting fail-level hit still triggers exit 1..."
+python3 "$LINT" "$tmp/build" >/dev/null 2>&1 && rc=0 || rc=$?
+if [ "$rc" -ne 1 ]; then
+    echo "FAIL: expected exit 1, got $rc" >&2
+    exit 1
+fi
+echo "  OK"
+
+echo "[4] expecting clean exit on neutral fixture..."
+rm -rf "$tmp/build/synthetic-ai.test"
+mkdir -p "$tmp/build/clean.test"
+echo '<!DOCTYPE html><html lang="de"><body><p>Ein einfacher Satz ohne KI-Vokabular.</p></body></html>' \
+    > "$tmp/build/clean.test/index.html"
+out=$(python3 "$LINT" "$tmp/build" 2>&1) && rc=0 || rc=$?
+if [ "$rc" -ne 0 ]; then
+    echo "FAIL: clean fixture should exit 0, got $rc" >&2
+    echo "$out" >&2
+    exit 1
+fi
+echo "  OK"
+
+echo
+echo "all tests passed"