#!/usr/bin/env python3
"""anti-ai-lint — flag AI-text fingerprints in built sites.

Reads tools/anti-ai-blacklist.yaml, walks build/<site>/index.html, prints
findings.

Exit status: 1 if any finding has severity=fail, 2 if the build directory or
the blacklist file does not exist, else 0.

Usage:
    tools/anti-ai-lint.py [--blacklist PATH] [--sources sites/] [--quiet] [--json] [BUILD_DIR]
"""

import argparse
import json
import os
import re
import subprocess
import sys
from html.parser import HTMLParser


def _ansi(code: str) -> str:
    """Return *code* when stdout is a terminal, otherwise an empty string."""
    return code if sys.stdout.isatty() else ""


RED = _ansi("\033[31m")
YELLOW = _ansi("\033[33m")
GREEN = _ansi("\033[32m")
DIM = _ansi("\033[2m")
BOLD = _ansi("\033[1m")
RESET = _ansi("\033[0m")


class TextExtractor(HTMLParser):
    """Extract visible text and per-site allow directives.

    After feeding a document:

    * ``fragments`` holds ``(line, text)`` pairs for every non-blank text
      node outside ``SKIP_TAGS``; ``line`` is where the text node starts.
    * ``allows`` holds the tokens found in ``<!-- anti-ai-allow: a, b -->``
      comments, each in its original and lower-cased spelling.
    * ``html_lang`` holds the primary language subtag of the first
      ``<html lang="...">`` start tag, or ``None``.
    """

    SKIP_TAGS = {"script", "style", "noscript", "template"}

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.skip_depth = 0
        self.fragments: list[tuple[int, str]] = []
        self.allows: set[str] = set()
        self.html_lang: str | None = None

    def handle_starttag(self, tag: str, attrs) -> None:
        if tag == "html" and self.html_lang is None:
            for k, v in attrs:
                if k == "lang" and v:
                    # "de-DE" -> "de"
                    self.html_lang = v.lower().split("-")[0]
                    break
        if tag in self.SKIP_TAGS:
            self.skip_depth += 1

    def handle_startendtag(self, tag: str, attrs) -> None:
        # Self-closing — never enters skip depth, no data either.
        pass

    def handle_endtag(self, tag: str) -> None:
        # Guard against stray end tags driving the depth negative.
        if tag in self.SKIP_TAGS and self.skip_depth > 0:
            self.skip_depth -= 1

    def handle_data(self, data: str) -> None:
        if self.skip_depth == 0 and data.strip():
            line, _ = self.getpos()
            self.fragments.append((line, data))

    def handle_comment(self, data: str) -> None:
        m = re.search(r"anti-ai-allow\s*:\s*(.+)", data, re.IGNORECASE)
        if not m:
            return
        for token in re.split(r"[,\s]+", m.group(1)):
            if token:
                self.allows.add(token)
                self.allows.add(token.lower())


def load_blacklist(path: str) -> dict:
    """Convert YAML to JSON via yq, parse with stdlib json.

    Returns the top-level mapping of the blacklist file. An empty file (yq
    prints nothing or ``null``) yields an empty dict so callers can use
    ``.get`` unconditionally.

    Exits the process with an error message when yq is missing, when yq
    fails on the file, when its output is not valid JSON, or when the
    top-level value is not a mapping.
    """
    try:
        out = subprocess.check_output(
            ["yq", "-o=json", path],
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        sys.exit("ERROR: yq not found in PATH (required to parse YAML blacklist)")
    except subprocess.CalledProcessError as e:
        sys.exit(f"ERROR: yq failed to parse {path}: {e.stderr.strip()}")

    if not out.strip():
        return {}
    try:
        data = json.loads(out)
    except json.JSONDecodeError as e:
        sys.exit(f"ERROR: yq produced invalid JSON for {path}: {e}")
    if data is None:
        return {}
    if not isinstance(data, dict):
        sys.exit(f"ERROR: blacklist {path} must be a mapping at top level")
    return data


def site_allow_yaml(site_yaml: str) -> list[str]:
    """Return the ``anti_ai_allow`` entries of a site.yaml, best effort.

    Returns an empty list when the file does not exist or yq exits with an
    error; blank output lines are dropped.
    """
    if not os.path.isfile(site_yaml):
        return []
    try:
        out = subprocess.check_output(
            ["yq", "-r", "(.anti_ai_allow // []) | .[]", site_yaml],
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except subprocess.CalledProcessError:
        return []
    return [line.strip() for line in out.splitlines() if line.strip()]


def _line_at(start_line: int, text: str, offset: int) -> int:
    """Return the line of *offset* in *text*, where *text* begins on *start_line*."""
    return start_line + text.count("\n", 0, offset)


def _lint_vocab(
    fragments: list[tuple[int, str]],
    vocab: dict,
    allow_set: set[str],
) -> list[dict]:
    """Report each blacklisted vocabulary term at most once (first hit wins)."""
    findings: list[dict] = []
    seen: set[str] = set()
    # Lower-case every fragment once instead of once per term.
    lowered = [(line_no, frag, frag.lower()) for line_no, frag in fragments]

    # Lint vocab in BOTH languages — sites may carry data-en attributes
    # that surface translated text alongside the primary language.
    for lang in ("de", "en"):
        bucket = vocab.get(lang) or {}
        for severity in ("warn", "fail"):
            for term in bucket.get(severity) or []:
                # An empty term would match every fragment; a non-string
                # (e.g. a YAML number) cannot be searched for as text.
                if not isinstance(term, str) or not term.strip():
                    continue
                term_lc = term.lower()
                if term_lc in seen:
                    continue
                if term in allow_set or term_lc in allow_set:
                    continue
                for line_no, frag, frag_lc in lowered:
                    pos = frag_lc.find(term_lc)
                    if pos < 0:
                        continue
                    findings.append({
                        "kind": "vocab",
                        "lang": lang,
                        "name": term,
                        "severity": severity,
                        # Offsets refer to the lowered text, so count
                        # newlines there as well.
                        "line": _line_at(line_no, frag_lc, pos),
                        "snippet": frag.strip()[:120],
                    })
                    seen.add(term_lc)
                    break
    return findings


def _lint_patterns(
    fragments: list[tuple[int, str]],
    patterns: list,
    allow_set: set[str],
) -> list[dict]:
    """Report each blacklisted regex pattern at most once (first hit wins)."""
    findings: list[dict] = []
    seen: set[str] = set()

    for pat in patterns:
        regex = pat.get("regex")
        name = pat.get("name") or (regex if isinstance(regex, str) else "")[:40]
        if name in seen:
            continue
        if name in allow_set or name.lower() in allow_set:
            continue

        if not isinstance(regex, str) or not regex:
            # Misconfigured entry: surface it instead of crashing the run.
            findings.append({
                "kind": "config",
                "name": name,
                "severity": "warn",
                "line": 0,
                "snippet": "missing or empty regex",
            })
            continue

        flags = re.MULTILINE
        if not pat.get("case_sensitive"):
            flags |= re.IGNORECASE
        try:
            rx = re.compile(regex, flags)
        except re.error as e:
            findings.append({
                "kind": "config",
                "name": name,
                "severity": "warn",
                "line": 0,
                "snippet": f"invalid regex: {e}",
            })
            continue

        for line_no, frag in fragments:
            m = rx.search(frag)
            if m:
                findings.append({
                    "kind": "pattern",
                    "name": name,
                    "severity": pat.get("severity", "warn"),
                    "line": _line_at(line_no, frag, m.start()),
                    "snippet": (frag.strip()[:120] or m.group(0)[:120]),
                })
                seen.add(name)
                break
    return findings


def lint_file(html_path: str, blacklist: dict, extra_allows: list[str]) -> list[dict]:
    """Lint one HTML file against the blacklist.

    Args:
        html_path: Path of the HTML file; read as UTF-8 with undecodable
            bytes replaced.
        blacklist: Mapping with optional ``vocab`` (lang -> severity ->
            list of terms) and ``patterns`` (list of mappings with
            ``regex`` and optional ``name``, ``severity``,
            ``case_sensitive``) entries.
        extra_allows: Additional allow tokens (matched as given and
            lower-cased) on top of the file's own ``anti-ai-allow``
            comments.

    Returns:
        A list of finding dicts with ``kind``, ``name``, ``severity``,
        ``line`` and ``snippet`` keys (vocab findings also carry ``lang``).
        Vocab findings come first, then pattern findings; each term or
        pattern is reported at most once per file. ``line`` is the line of
        the first match, or 0 for parse/config findings.
    """
    with open(html_path, "r", encoding="utf-8", errors="replace") as f:
        raw = f.read()

    parser = TextExtractor()
    try:
        parser.feed(raw)
        parser.close()
    except Exception as e:
        # Malformed HTML — record a single warning and skip.
        return [{
            "kind": "parse",
            "name": "html-parse-error",
            "severity": "warn",
            "line": 0,
            "snippet": str(e)[:120],
        }]

    allow_set = set(parser.allows)
    for tok in extra_allows:
        allow_set.add(tok)
        allow_set.add(tok.lower())

    findings = _lint_vocab(parser.fragments, blacklist.get("vocab") or {}, allow_set)
    findings += _lint_patterns(
        parser.fragments, blacklist.get("patterns") or [], allow_set
    )
    return findings


def _print_site(entry: str, findings: list[dict]) -> None:
    """Print the human-readable report for one site."""
    print(f"{BOLD}{entry}{RESET}")
    for finding in findings:
        if finding["severity"] == "fail":
            color, tag = RED, "FAIL"
        else:
            color, tag = YELLOW, "warn"
        lang = f" ({finding['lang']})" if "lang" in finding else ""
        print(
            f" {color}{tag}{RESET} {finding['kind']}{lang}: "
            f"{BOLD}{finding['name']}{RESET} "
            f"{DIM}line {finding['line']}: {finding['snippet']}{RESET}"
        )


def main() -> int:
    """Run the linter over every ``<build_dir>/<site>/index.html``.

    Returns the process exit status: 2 when the build directory or the
    blacklist file is missing, 1 when any finding has severity ``fail``,
    otherwise 0.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    repo = os.path.dirname(here)

    ap = argparse.ArgumentParser(description="Flag AI-text fingerprints in built sites.")
    ap.add_argument("build_dir", nargs="?", default=os.path.join(repo, "build"))
    ap.add_argument("--blacklist", default=os.path.join(here, "anti-ai-blacklist.yaml"))
    ap.add_argument("--sources", default=os.path.join(repo, "sites"),
                    help="sites/ root (for per-site site.yaml allow lists)")
    ap.add_argument("--quiet", action="store_true",
                    help="Suppress warnings; only show fails.")
    ap.add_argument("--json", action="store_true", help="Emit JSON report.")
    args = ap.parse_args()

    if not os.path.isdir(args.build_dir):
        print(f"ERROR: build dir not found: {args.build_dir}", file=sys.stderr)
        return 2
    if not os.path.isfile(args.blacklist):
        print(f"ERROR: blacklist not found: {args.blacklist}", file=sys.stderr)
        return 2

    blacklist = load_blacklist(args.blacklist)

    total_warn = 0
    total_fail = 0
    sites_with_findings = 0
    sites_total = 0
    json_sites: list[dict] = []

    for entry in sorted(os.listdir(args.build_dir)):
        html = os.path.join(args.build_dir, entry, "index.html")
        if not os.path.isfile(html):
            continue
        sites_total += 1

        site_yaml = os.path.join(args.sources, entry, "site.yaml")
        findings = lint_file(html, blacklist, site_allow_yaml(site_yaml))

        warns = [f for f in findings if f["severity"] == "warn"]
        fails = [f for f in findings if f["severity"] == "fail"]
        if findings:
            sites_with_findings += 1
        total_warn += len(warns)
        total_fail += len(fails)

        if args.json:
            json_sites.append({"site": entry, "findings": findings})
            continue
        visible = fails if args.quiet else findings
        if visible:
            _print_site(entry, visible)

    if args.json:
        json.dump(
            {
                "summary": {
                    "sites_total": sites_total,
                    "sites_with_findings": sites_with_findings,
                    "warn": total_warn,
                    "fail": total_fail,
                },
                "sites": json_sites,
            },
            sys.stdout,
            indent=2,
            ensure_ascii=False,
        )
        print()
    else:
        if total_fail > 0:
            tag, color = "FAIL", RED
        elif total_warn > 0:
            tag, color = "WARN", YELLOW
        else:
            tag, color = "OK", GREEN
        print(
            f"\n{color}anti-ai-lint: {tag}{RESET} — "
            f"{sites_with_findings}/{sites_total} sites flagged "
            f"({total_fail} fail, {total_warn} warn)"
        )

    return 1 if total_fail > 0 else 0


if __name__ == "__main__":
    sys.exit(main())