mAi: #10 - Anti-AI-Text-Lint im Build
tools/anti-ai-lint.py: Python-Linter (stdlib + yq) prueft jede build/<domain>/index.html gegen die Blacklist in tools/anti-ai-blacklist.yaml. HTML wird via html.parser auf sichtbaren Text reduziert (Skripte/Styles werden ignoriert), dann werden Vokabel-Substrings (DE+EN, case-insensitive) und Regex-Patterns gematcht. Severity warn = Build geht durch, fail = Build bricht ab.

Whitelist-Mechanismen:
- HTML-Kommentar im Markup: <!-- anti-ai-allow: term1, term2 -->
- Per-Site in site.yaml: anti_ai_allow: [term1, term2]

Integration in build.sh als Schritt 4/4, mit --skip-lint fuer Notfaelle. Dockerfile installiert python3 zusaetzlich; nur im Builder-Stage, kein Effekt aufs Caddy-Image.

Tests via tools/test-anti-ai-lint.sh: synthetische AI-Fixture wird korrekt geflaggt, Whitelists unterdruecken Hits, fail-Severity triggert exit 1, neutraler Text exit 0.

Initial-Lauf auf 59 bestehenden Sites: 2 warn (killusion.de "revolutionaer" in ironischem Kontext, kilofant.de "robust"), 0 fail. Cleanup ist Folge-Issue.

README + docs/geo-seo-guideline.md aktualisiert mit der konkreten Tool-Position.
This commit is contained in:
294
tools/anti-ai-lint.py
Executable file
294
tools/anti-ai-lint.py
Executable file
@@ -0,0 +1,294 @@
|
||||
#!/usr/bin/env python3
|
||||
"""anti-ai-lint — flag AI-text fingerprints in built sites.
|
||||
|
||||
Reads tools/anti-ai-blacklist.yaml, walks build/<domain>/index.html, prints
|
||||
findings. Exits 1 if any finding has severity=fail, 2 if the build dir or blacklist file is missing, else 0.
|
||||
|
||||
Usage:
|
||||
tools/anti-ai-lint.py [--blacklist PATH] [--sources sites/] [--quiet]
|
||||
[--json] [BUILD_DIR]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from html.parser import HTMLParser
|
||||
|
||||
|
||||
def _ansi(code: str) -> str:
    """Return *code* when stdout is an interactive terminal, else ``""``.

    This runs at import time (it builds the module-level colour constants),
    so it must never raise: a missing stream (``sys.stdout is None``) or a
    closed one (``isatty()`` raises ``ValueError``) simply disables colour.
    """
    stream = sys.stdout
    try:
        interactive = stream is not None and stream.isatty()
    except (AttributeError, ValueError):
        interactive = False
    return code if interactive else ""
|
||||
|
||||
|
||||
# ANSI escape sequences used for terminal output. Each one is passed through
# _ansi(), so it is the empty string when stdout is not a terminal.
RED, YELLOW, GREEN, DIM, BOLD, RESET = map(
    _ansi,
    ("\033[31m", "\033[33m", "\033[32m", "\033[2m", "\033[1m", "\033[0m"),
)
|
||||
|
||||
|
||||
class TextExtractor(HTMLParser):
    """Collect visible text fragments and inline allow directives from HTML.

    After ``feed()`` / ``close()`` the instance exposes:

    * ``fragments`` - ``(line, text)`` tuples for every non-blank text node
      outside ``SKIP_TAGS``; ``line`` is what ``getpos()`` reports when the
      text node is delivered.
    * ``allows`` - terms taken from ``<!-- anti-ai-allow: a, b -->`` comments,
      stored both verbatim and lower-cased.
    * ``html_lang`` - primary subtag of the first non-empty ``lang`` attribute
      on an ``<html>`` tag (``"de-AT"`` -> ``"de"``), or ``None``.
    """

    # Elements whose text content is not treated as visible text.
    SKIP_TAGS = {"script", "style", "noscript", "template"}

    # Captures everything after "anti-ai-allow:" up to the end of that line.
    _ALLOW_RE = re.compile(r"anti-ai-allow\s*:\s*(.+)", re.IGNORECASE)

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.skip_depth = 0  # > 0 while inside at least one SKIP_TAGS element
        self.fragments: list[tuple[int, str]] = []
        self.allows: set[str] = set()
        self.html_lang: str | None = None

    def handle_starttag(self, tag: str, attrs) -> None:
        """Record the document language and enter skip regions."""
        if tag == "html" and self.html_lang is None:
            for k, v in attrs:
                if k == "lang" and v:
                    self.html_lang = v.lower().split("-")[0]
                    break
        if tag in self.SKIP_TAGS:
            self.skip_depth += 1

    def handle_startendtag(self, tag: str, attrs) -> None:
        """Ignore self-closing tags."""
        # Self-closing — never enters skip depth, no data either. Overriding
        # also keeps the base class from dispatching to handle_starttag /
        # handle_endtag for these tags.
        pass

    def handle_endtag(self, tag: str) -> None:
        """Leave a skip region; stray closing tags never drive the depth below 0."""
        if tag in self.SKIP_TAGS and self.skip_depth > 0:
            self.skip_depth -= 1

    def handle_data(self, data: str) -> None:
        """Store non-blank text that is not inside a skipped element."""
        if self.skip_depth == 0 and data.strip():
            line, _ = self.getpos()
            self.fragments.append((line, data))

    def handle_comment(self, data: str) -> None:
        """Parse an ``anti-ai-allow: term1, term2`` directive from a comment.

        Every comma-separated entry is registered as a whole phrase (inner
        whitespace collapsed), so multi-word blacklist terms can be allowed.
        For backward compatibility each whitespace-separated word is still
        registered on its own as well.
        """
        m = self._ALLOW_RE.search(data)
        if not m:
            return
        directive = m.group(1)
        for phrase in directive.split(","):
            phrase = " ".join(phrase.split())
            if phrase:
                self._allow(phrase)
        for token in re.split(r"[,\s]+", directive):
            if token:
                self._allow(token)

    def _allow(self, term: str) -> None:
        """Add *term* to the allow set, verbatim and lower-cased."""
        self.allows.add(term)
        self.allows.add(term.lower())
|
||||
|
||||
|
||||
def load_blacklist(path: str) -> dict:
    """Load the YAML blacklist at *path* and return it as a dict.

    The YAML is converted to JSON by the external ``yq`` binary and parsed
    with the stdlib ``json`` module. Empty output or a JSON ``null`` (e.g.
    from an empty YAML document) yields ``{}``.

    Exits the process via ``sys.exit`` with an error message when ``yq`` is
    not on PATH, when ``yq`` fails, when its output is not valid JSON, or
    when the top-level value is not a mapping.
    """
    try:
        out = subprocess.check_output(
            ["yq", "-o=json", path],
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        sys.exit("ERROR: yq not found in PATH (required to parse YAML blacklist)")
    except subprocess.CalledProcessError as e:
        sys.exit(f"ERROR: yq failed to parse {path}: {(e.stderr or '').strip()}")

    try:
        data = json.loads(out) if out.strip() else None
    except json.JSONDecodeError as e:
        sys.exit(f"ERROR: yq produced invalid JSON for {path}: {e}")

    if data is None:
        return {}
    if not isinstance(data, dict):
        # Callers use dict.get() on the result; reject lists/scalars up front.
        sys.exit(
            f"ERROR: blacklist {path} must be a YAML mapping, "
            f"got {type(data).__name__}"
        )
    return data
|
||||
|
||||
|
||||
def site_allow_yaml(site_yaml: str) -> list[str]:
    """Return the ``anti_ai_allow`` entries declared in a site's ``site.yaml``.

    Returns ``[]`` when the file does not exist or declares no allow list.
    When ``yq`` cannot evaluate the file, a warning is written to stderr and
    ``[]`` is returned, so linting continues without the per-site whitelist
    instead of silently hiding the problem.

    Exits via ``sys.exit`` when ``yq`` is not on PATH, matching
    ``load_blacklist``.
    """
    if not os.path.isfile(site_yaml):
        return []
    try:
        out = subprocess.check_output(
            ["yq", "-r", "(.anti_ai_allow // []) | .[]", site_yaml],
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        sys.exit("ERROR: yq not found in PATH (required to read site.yaml allow lists)")
    except subprocess.CalledProcessError as e:
        detail = (e.stderr or "").strip()
        print(
            f"WARNING: could not read anti_ai_allow from {site_yaml}: {detail}",
            file=sys.stderr,
        )
        return []
    # One entry per output line; blank lines are dropped.
    return [line.strip() for line in out.splitlines() if line.strip()]
|
||||
|
||||
|
||||
def _config_finding(name: str, message: str) -> dict:
    """Build a warn-level finding that reports a broken blacklist entry."""
    return {
        "kind": "config",
        "name": name,
        "severity": "warn",
        "line": 0,
        "snippet": message,
    }


def lint_file(html_path: str, blacklist: dict, extra_allows: list[str]) -> list[dict]:
    """Lint one HTML file against the blacklist and return its findings.

    Args:
        html_path: Path of the HTML file; read as UTF-8 with undecodable
            bytes replaced.
        blacklist: Parsed blacklist. Reads ``vocab.<de|en>.<warn|fail>``
            (lists of substrings, matched case-insensitively) and
            ``patterns`` (dicts with ``regex`` and optional ``name``,
            ``severity``, ``case_sensitive``).
        extra_allows: Additional allowed terms / pattern names, merged with
            the allow directives found in the file's HTML comments.

    Returns:
        A list of dicts with the keys ``kind``, ``name``, ``severity``,
        ``line`` and ``snippet`` (vocab findings also carry ``lang``). Each
        vocab term and each pattern name is reported at most once per file,
        for the first text fragment that matches. Broken blacklist entries
        (empty or non-string term, missing or invalid regex, unknown
        severity) are reported as ``kind="config"`` warnings instead of
        raising.
    """
    with open(html_path, "r", encoding="utf-8", errors="replace") as f:
        raw = f.read()

    parser = TextExtractor()
    try:
        parser.feed(raw)
        parser.close()
    except Exception as e:
        # Malformed HTML — record a single warning and skip.
        return [{
            "kind": "parse",
            "name": "html-parse-error",
            "severity": "warn",
            "line": 0,
            "snippet": str(e)[:120],
        }]

    allow_set = set(parser.allows)
    for tok in extra_allows or []:
        allow_set.add(tok)
        allow_set.add(tok.lower())

    blacklist = blacklist or {}
    findings: list[dict] = []
    seen: set[tuple[str, str]] = set()  # (kind, name) — one report per file

    # Lower-case every fragment once instead of once per vocab term.
    lowered = [(line_no, frag, frag.lower()) for line_no, frag in parser.fragments]

    # Lint vocab in BOTH languages — sites may carry data-en attributes
    # that surface translated text alongside the primary language.
    vocab = blacklist.get("vocab") or {}
    for lang in ("de", "en"):
        bucket = vocab.get(lang) or {}
        for severity in ("warn", "fail"):
            for term in bucket.get(severity) or []:
                if not isinstance(term, str) or not term.strip():
                    # An empty term is a substring of every fragment and
                    # would flag the whole page; report it as config error.
                    bad_key = ("config", repr(term))
                    if bad_key not in seen:
                        seen.add(bad_key)
                        findings.append(_config_finding(
                            repr(term)[:40],
                            f"vocab.{lang}.{severity}: term must be a non-empty string",
                        ))
                    continue
                term_lc = term.lower()
                key = ("vocab", term_lc)
                if key in seen:
                    continue
                if term in allow_set or term_lc in allow_set:
                    continue
                for line_no, frag, frag_lc in lowered:
                    if term_lc in frag_lc:
                        findings.append({
                            "kind": "vocab",
                            "lang": lang,
                            "name": term,
                            "severity": severity,
                            "line": line_no,
                            "snippet": frag.strip()[:120],
                        })
                        seen.add(key)
                        break

    # Patterns
    for pat in blacklist.get("patterns") or []:
        if not isinstance(pat, dict):
            findings.append(_config_finding(
                repr(pat)[:40], "pattern entry must be a mapping"))
            continue
        regex = pat.get("regex")
        if not isinstance(regex, str) or not regex:
            findings.append(_config_finding(
                str(pat.get("name") or "")[:40],
                "pattern entry has no 'regex' string",
            ))
            continue
        name = str(pat.get("name") or regex[:40])
        key = ("pattern", name)
        if key in seen:
            continue
        if name in allow_set or name.lower() in allow_set:
            continue
        flags = re.MULTILINE
        if not pat.get("case_sensitive"):
            flags |= re.IGNORECASE
        try:
            rx = re.compile(regex, flags)
        except re.error as e:
            findings.append(_config_finding(name, f"invalid regex: {e}"))
            continue
        # Normalise severity so "Fail"/"FAIL" still breaks the build; anything
        # else is reported and downgraded to warn rather than silently lost.
        raw_severity = pat.get("severity", "warn")
        severity = str(raw_severity).strip().lower()
        if severity not in ("warn", "fail"):
            findings.append(_config_finding(
                name, f"unknown severity {raw_severity!r}, treating as warn"))
            severity = "warn"
        for line_no, frag in parser.fragments:
            m = rx.search(frag)
            if m:
                findings.append({
                    "kind": "pattern",
                    "name": name,
                    "severity": severity,
                    "line": line_no,
                    "snippet": (frag.strip()[:120] or m.group(0)[:120]),
                })
                seen.add(key)
                break

    return findings
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: lint every ``<build_dir>/<site>/index.html``.

    Per site, the allow list from ``<sources>/<site>/site.yaml`` is merged in
    and the findings are either printed (coloured when stdout is a terminal)
    or collected into a JSON report written to stdout.

    Returns:
        The process exit status: ``2`` when the build directory or the
        blacklist file is missing, ``1`` when at least one finding has
        severity ``fail``, otherwise ``0``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    repo = os.path.dirname(here)

    ap = argparse.ArgumentParser(description="Flag AI-text fingerprints in built sites.")
    ap.add_argument("build_dir", nargs="?", default=os.path.join(repo, "build"))
    ap.add_argument("--blacklist", default=os.path.join(here, "anti-ai-blacklist.yaml"))
    ap.add_argument("--sources", default=os.path.join(repo, "sites"),
                    help="sites/ root (for per-site site.yaml allow lists)")
    ap.add_argument("--quiet", action="store_true",
                    help="Suppress warnings; only show fails.")
    ap.add_argument("--json", action="store_true", help="Emit JSON report.")
    args = ap.parse_args()

    if not os.path.isdir(args.build_dir):
        print(f"ERROR: build dir not found: {args.build_dir}", file=sys.stderr)
        return 2
    if not os.path.isfile(args.blacklist):
        print(f"ERROR: blacklist not found: {args.blacklist}", file=sys.stderr)
        return 2

    blacklist = load_blacklist(args.blacklist)

    total_warn = 0
    total_fail = 0
    sites_with_findings = 0
    sites_total = 0
    json_sites: list[dict] = []

    for entry in sorted(os.listdir(args.build_dir)):
        site_dir = os.path.join(args.build_dir, entry)
        html = os.path.join(site_dir, "index.html")
        if not os.path.isfile(html):
            continue  # not a built site directory
        sites_total += 1

        site_yaml = os.path.join(args.sources, entry, "site.yaml")
        extra_allows = site_allow_yaml(site_yaml)

        findings = lint_file(html, blacklist, extra_allows)
        # NOTE(review): clean sites are left out of the JSON "sites" list
        # here — confirm this matches the intended report shape.
        if not findings:
            continue

        fails = [f for f in findings if f["severity"] == "fail"]
        # Anything that is not a fail is rendered as "warn" below, so it is
        # counted as one too; totals and printed lines then always agree.
        warns = [f for f in findings if f["severity"] != "fail"]

        sites_with_findings += 1
        total_warn += len(warns)
        total_fail += len(fails)

        if args.json:
            json_sites.append({"site": entry, "findings": findings})
            continue

        visible = fails if args.quiet else findings
        if not visible:
            continue
        print(f"{BOLD}{entry}{RESET}")
        for f in visible:
            if f["severity"] == "fail":
                color, tag = RED, "FAIL"
            else:
                color, tag = YELLOW, "warn"
            lang = f" ({f['lang']})" if "lang" in f else ""
            print(
                f" {color}{tag}{RESET} {f['kind']}{lang}: "
                f"{BOLD}{f['name']}{RESET} "
                f"{DIM}line {f['line']}: {f['snippet']}{RESET}"
            )

    if args.json:
        json.dump(
            {
                "summary": {
                    "sites_total": sites_total,
                    "sites_with_findings": sites_with_findings,
                    "warn": total_warn,
                    "fail": total_fail,
                },
                "sites": json_sites,
            },
            sys.stdout,
            indent=2,
            ensure_ascii=False,
        )
        print()
    else:
        if total_fail > 0:
            tag, color = "FAIL", RED
        elif total_warn > 0:
            tag, color = "WARN", YELLOW
        else:
            tag, color = "OK", GREEN
        print(
            f"\n{color}anti-ai-lint: {tag}{RESET} — "
            f"{sites_with_findings}/{sites_total} sites flagged "
            f"({total_fail} fail, {total_warn} warn)"
        )

    return 1 if total_fail > 0 else 0
|
||||
|
||||
|
||||
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user