Files
onepager/tools/anti-ai-lint.py
mAi fdac496a6f mAi: #10 - Anti-AI-Text-Lint im Build
tools/anti-ai-lint.py: Python-Linter (stdlib + yq) prueft jede
build/<domain>/index.html gegen die Blacklist in
tools/anti-ai-blacklist.yaml. HTML wird via html.parser auf sichtbaren
Text reduziert (Skripte/Styles werden ignoriert), dann werden Vokabel-
Substrings (DE+EN, case-insensitive) und Regex-Patterns gematcht.
Severity warn = Build geht durch, fail = Build bricht ab.

Whitelist-Mechanismen:
- HTML-Kommentar im Markup: <!-- anti-ai-allow: term1, term2 -->
- Per-Site in site.yaml: anti_ai_allow: [term1, term2]

Integration in build.sh als Schritt 4/4, mit --skip-lint fuer
Notfaelle. Dockerfile installiert python3 zusaetzlich; nur im
Builder-Stage, kein Effekt aufs Caddy-Image.

Tests via tools/test-anti-ai-lint.sh: synthetische AI-Fixture wird
korrekt geflagged, Whitelists unterdruecken Hits, fail-Severity
triggert exit 1, neutraler Text exit 0.

Initial-Lauf auf 59 bestehenden Sites: 2 warn (killusion.de
"revolutionaer" in ironischem Kontext, kilofant.de "robust"),
0 fail. Cleanup ist Folge-Issue.

README + docs/geo-seo-guideline.md aktualisiert mit der konkreten
Tool-Position.
2026-04-30 02:50:50 +02:00

295 lines
9.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""anti-ai-lint — flag AI-text fingerprints in built sites.
Reads tools/anti-ai-blacklist.yaml, walks build/<domain>/index.html, prints
findings. Exits 1 if any finding has severity=fail (or yq cannot load the
blacklist), 2 if the build dir or blacklist file is missing, else 0.
Usage:
tools/anti-ai-lint.py [--blacklist PATH] [--sources sites/] [--quiet]
[--json] [BUILD_DIR]
"""
import argparse
import json
import os
import re
import subprocess
import sys
from html.parser import HTMLParser
def _ansi(code: str) -> str:
    """Return *code* when stdout is a terminal, otherwise an empty string."""
    if not sys.stdout.isatty():
        return ""
    return code
# ANSI escape sequences used for terminal output. Each one collapses to ""
# when stdout is not a TTY, so piped or captured output carries no control
# codes.
RED, YELLOW, GREEN, DIM, BOLD, RESET = (
    _ansi(seq)
    for seq in ("\033[31m", "\033[33m", "\033[32m", "\033[2m", "\033[1m", "\033[0m")
)
class TextExtractor(HTMLParser):
    """Extract visible text and per-page allow directives from HTML.

    After ``feed()`` / ``close()`` the instance exposes:

    * ``fragments`` - ``(line, text)`` tuples, one per non-blank text node
      found outside of ``SKIP_TAGS`` elements,
    * ``allows`` - terms collected from ``<!-- anti-ai-allow: ... -->``
      comments, each stored verbatim and lower-cased,
    * ``html_lang`` - lower-cased primary subtag of ``<html lang="...">``
      (``"de"`` for ``de-DE``), or ``None`` when no such attribute was seen.
    """

    SKIP_TAGS = {"script", "style", "noscript", "template"}

    # Directive inside a comment body. "." does not match newlines, so only
    # the remainder of the directive's own line is captured.
    _ALLOW_RE = re.compile(r"anti-ai-allow\s*:\s*(.+)", re.IGNORECASE)

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.skip_depth = 0
        self.fragments: list[tuple[int, str]] = []
        self.allows: set[str] = set()
        self.html_lang: str | None = None

    def handle_starttag(self, tag: str, attrs) -> None:
        # Only the first <html> element that carries a non-empty lang counts.
        if tag == "html" and self.html_lang is None:
            for k, v in attrs:
                if k == "lang" and v:
                    self.html_lang = v.lower().split("-")[0]
                    break
        if tag in self.SKIP_TAGS:
            self.skip_depth += 1

    def handle_startendtag(self, tag: str, attrs) -> None:
        # Self-closing — never enters skip depth, no data either.
        pass

    def handle_endtag(self, tag: str) -> None:
        # The > 0 guard keeps a stray closing tag from driving the depth
        # negative, which would otherwise hide all following text.
        if tag in self.SKIP_TAGS and self.skip_depth > 0:
            self.skip_depth -= 1

    def handle_data(self, data: str) -> None:
        if self.skip_depth == 0 and data.strip():
            line, _ = self.getpos()
            self.fragments.append((line, data))

    def handle_comment(self, data: str) -> None:
        """Collect allowed terms from an ``anti-ai-allow:`` comment.

        Comma-separated entries are stored as whole phrases (inner whitespace
        collapsed) so that multi-word blacklist terms can be allowed. Every
        comma/whitespace separated token is stored as well, which keeps
        space-separated single-word lists working.
        """
        match = self._ALLOW_RE.search(data)
        if match is None:
            return
        directive = match.group(1)
        candidates = [" ".join(part.split()) for part in directive.split(",")]
        candidates.extend(re.split(r"[,\s]+", directive))
        for candidate in candidates:
            candidate = candidate.strip()
            if candidate:
                self.allows.add(candidate)
                self.allows.add(candidate.lower())
def load_blacklist(path: str) -> dict:
    """Load the YAML blacklist at *path* and return it as a dict.

    The file is converted to JSON by the external ``yq`` binary and parsed
    with the stdlib ``json`` module.

    An empty document (no output, or a top-level ``null``) yields ``{}``.

    Exits the process with an ``ERROR:`` message when yq is not installed,
    yq fails, its output is not valid JSON, or the top-level value is not a
    mapping.
    """
    try:
        out = subprocess.check_output(
            ["yq", "-o=json", path],
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        sys.exit("ERROR: yq not found in PATH (required to parse YAML blacklist)")
    except subprocess.CalledProcessError as e:
        sys.exit(f"ERROR: yq failed to parse {path}: {(e.stderr or '').strip()}")

    if not out.strip():
        # Nothing emitted at all: treat like an empty blacklist.
        return {}
    try:
        data = json.loads(out)
    except json.JSONDecodeError as e:
        sys.exit(f"ERROR: yq produced invalid JSON for {path}: {e}")
    if data is None:
        return {}
    if not isinstance(data, dict):
        sys.exit(
            f"ERROR: blacklist {path} must be a YAML mapping, "
            f"got {type(data).__name__}"
        )
    return data
def site_allow_yaml(site_yaml: str) -> list[str]:
    """Return the ``anti_ai_allow`` entries of a site.yaml via yq.

    Best-effort: a missing file or a failing yq call yields an empty list.
    Each non-blank output line of yq becomes one stripped entry.
    """
    if not os.path.isfile(site_yaml):
        return []

    command = ["yq", "-r", "(.anti_ai_allow // []) | .[]", site_yaml]
    try:
        raw = subprocess.check_output(command, stderr=subprocess.DEVNULL, text=True)
    except subprocess.CalledProcessError:
        return []

    entries = map(str.strip, raw.splitlines())
    return list(filter(None, entries))
def _config_warning(name: str, message: str) -> dict:
    """Build a warn-level finding that reports a broken blacklist entry."""
    return {
        "kind": "config",
        "name": name,
        "severity": "warn",
        "line": 0,
        "snippet": message,
    }


def lint_file(html_path: str, blacklist: dict, extra_allows: list[str]) -> list[dict]:
    """Lint one built HTML file against the blacklist.

    Args:
        html_path: HTML file to check; read as UTF-8 with undecodable bytes
            replaced.
        blacklist: Parsed blacklist. Reads ``vocab[<lang>][<severity>]``
            (lists of terms for lang ``de``/``en`` and severity
            ``warn``/``fail``) and ``patterns`` (list of dicts with
            ``regex`` and optional ``name``, ``severity``,
            ``case_sensitive``).
        extra_allows: Additional allowed terms / pattern names; each is
            matched verbatim or lower-cased.

    Returns:
        Finding dicts with ``kind``, ``name``, ``severity``, ``line`` and
        ``snippet`` (vocab findings also carry ``lang``). At most one finding
        is reported per vocab term and per pattern name. Broken blacklist
        entries are reported as ``config`` warnings instead of raising.
    """
    with open(html_path, "r", encoding="utf-8", errors="replace") as f:
        raw = f.read()

    parser = TextExtractor()
    try:
        parser.feed(raw)
        parser.close()
    except Exception as e:
        # Malformed HTML — record a single warning and skip.
        return [{
            "kind": "parse",
            "name": "html-parse-error",
            "severity": "warn",
            "line": 0,
            "snippet": str(e)[:120],
        }]

    allow_set = set(parser.allows)
    for tok in extra_allows:
        allow_set.add(tok)
        allow_set.add(tok.lower())

    # Lower-case every fragment once instead of once per blacklist term.
    fragments_lc = [
        (line_no, frag, frag.lower()) for line_no, frag in parser.fragments
    ]

    findings: list[dict] = []
    seen: set[tuple[str, str]] = set()  # (kind, name) — one report per file

    # Lint vocab in BOTH languages — sites may carry data-en attributes
    # that surface translated text alongside the primary language.
    # A term listed in several buckets is reported for the first bucket (in
    # de -> en, warn -> fail order) in which it produces a hit.
    vocab = blacklist.get("vocab") or {}
    for lang in ("de", "en"):
        bucket = vocab.get(lang) or {}
        for severity in ("warn", "fail"):
            for term in bucket.get(severity) or []:
                if not isinstance(term, str) or not term.strip():
                    # A null/number entry would crash on .lower(); an empty
                    # string would match every fragment.
                    findings.append(_config_warning(
                        f"vocab.{lang}.{severity}",
                        f"ignored non-string or empty term: {term!r}",
                    ))
                    continue
                term_lc = term.lower()
                key = ("vocab", term_lc)
                if key in seen:
                    continue
                if term in allow_set or term_lc in allow_set:
                    continue
                for line_no, frag, frag_lc in fragments_lc:
                    if term_lc in frag_lc:
                        findings.append({
                            "kind": "vocab",
                            "lang": lang,
                            "name": term,
                            "severity": severity,
                            "line": line_no,
                            "snippet": frag.strip()[:120],
                        })
                        seen.add(key)
                        break

    # Patterns
    for pat in blacklist.get("patterns") or []:
        if not isinstance(pat, dict):
            findings.append(_config_warning(
                str(pat)[:40], "pattern entry is not a mapping",
            ))
            continue
        regex = pat.get("regex")
        name = str(
            pat.get("name") or (regex[:40] if isinstance(regex, str) else "")
        )
        key = ("pattern", name)
        if key in seen:
            continue
        if name in allow_set or name.lower() in allow_set:
            continue
        if not isinstance(regex, str) or not regex:
            findings.append(_config_warning(name, "missing or non-string regex"))
            continue
        flags = re.MULTILINE
        if not pat.get("case_sensitive"):
            flags |= re.IGNORECASE
        try:
            rx = re.compile(regex, flags)
        except re.error as e:
            findings.append(_config_warning(name, f"invalid regex: {e}"))
            continue
        for line_no, frag in parser.fragments:
            m = rx.search(frag)
            if m:
                findings.append({
                    "kind": "pattern",
                    "name": name,
                    "severity": pat.get("severity", "warn"),
                    "line": line_no,
                    "snippet": (frag.strip()[:120] or m.group(0)[:120]),
                })
                seen.add(key)
                break

    return findings
def main() -> int:
    """Lint every ``<build_dir>/<site>/index.html`` and print a report.

    Command-line arguments are read from ``sys.argv``. Per-site allow lists
    are looked up in ``<sources>/<site>/site.yaml``.

    Returns:
        The process exit code: 2 when the build dir or the blacklist file is
        missing, 1 when at least one finding has severity ``fail``, else 0.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    repo = os.path.dirname(here)

    ap = argparse.ArgumentParser(description="Flag AI-text fingerprints in built sites.")
    ap.add_argument("build_dir", nargs="?", default=os.path.join(repo, "build"))
    ap.add_argument("--blacklist", default=os.path.join(here, "anti-ai-blacklist.yaml"))
    ap.add_argument("--sources", default=os.path.join(repo, "sites"),
                    help="sites/ root (for per-site site.yaml allow lists)")
    ap.add_argument("--quiet", action="store_true",
                    help="Suppress warnings; only show fails.")
    ap.add_argument("--json", action="store_true", help="Emit JSON report.")
    args = ap.parse_args()

    if not os.path.isdir(args.build_dir):
        print(f"ERROR: build dir not found: {args.build_dir}", file=sys.stderr)
        return 2
    if not os.path.isfile(args.blacklist):
        print(f"ERROR: blacklist not found: {args.blacklist}", file=sys.stderr)
        return 2

    blacklist = load_blacklist(args.blacklist)

    total_warn = 0
    total_fail = 0
    sites_with_findings = 0
    sites_total = 0
    json_sites: list[dict] = []

    for entry in sorted(os.listdir(args.build_dir)):
        site_dir = os.path.join(args.build_dir, entry)
        html = os.path.join(site_dir, "index.html")
        if not os.path.isfile(html):
            continue
        sites_total += 1

        site_yaml = os.path.join(args.sources, entry, "site.yaml")
        extra_allows = site_allow_yaml(site_yaml)
        findings = lint_file(html, blacklist, extra_allows)

        fails = [f for f in findings if f["severity"] == "fail"]
        # Anything that is not a fail is printed as "warn" below, so it is
        # counted as a warning too; this keeps the totals and listing in sync.
        warns = [f for f in findings if f["severity"] != "fail"]
        if findings:
            sites_with_findings += 1
        total_warn += len(warns)
        total_fail += len(fails)

        if args.json:
            json_sites.append({"site": entry, "findings": findings})
        else:
            visible = fails if args.quiet else findings
            if visible:
                print(f"{BOLD}{entry}{RESET}")
                for f in visible:
                    if f["severity"] == "fail":
                        color, tag = RED, "FAIL"
                    else:
                        color, tag = YELLOW, "warn"
                    lang = f" ({f['lang']})" if "lang" in f else ""
                    print(
                        f" {color}{tag}{RESET} {f['kind']}{lang}: "
                        f"{BOLD}{f['name']}{RESET} "
                        f"{DIM}line {f['line']}: {f['snippet']}{RESET}"
                    )

    if args.json:
        json.dump(
            {
                "summary": {
                    "sites_total": sites_total,
                    "sites_with_findings": sites_with_findings,
                    "warn": total_warn,
                    "fail": total_fail,
                },
                "sites": json_sites,
            },
            sys.stdout,
            indent=2,
            ensure_ascii=False,
        )
        print()
    else:
        if total_fail > 0:
            tag, color = "FAIL", RED
        elif total_warn > 0:
            tag, color = "WARN", YELLOW
        else:
            tag, color = "OK", GREEN
        # The " - " separator keeps the status tag from running straight
        # into the site counts ("OK0/59 sites flagged").
        print(
            f"\n{color}anti-ai-lint: {tag}{RESET} - "
            f"{sites_with_findings}/{sites_total} sites flagged "
            f"({total_fail} fail, {total_warn} warn)"
        )

    return 1 if total_fail > 0 else 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())