#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#   "pymupdf>=1.24",
#   "Pillow>=10.0",
# ]
# ///
"""Strip blank pages from a PDF — used by mdms-mover before promoting to toprocess.

Usage:
    strip_blank_pages.py <input.pdf> <output.pdf>

Exit codes:
    0  output.pdf written (either stripped or copied unchanged)
    2  all pages would be dropped — output NOT written, caller should keep
       the original file in the inbox and log a warning
    1  error (input unreadable, write failed, etc.)

A page counts as "blank" iff BOTH of:
  * embedded text is empty / whitespace-only, AND
  * rendered thumbnail is >= MDMS_BLANK_THRESHOLD near-white pixels.
False-negatives are preferred over false-positives — borderline pages stay.

Env:
    MDMS_BLANK_THRESHOLD   near-white pixel ratio (0.0-1.0, default 0.97)
    MDMS_BLANK_NEAR_WHITE  near-white cutoff in 0-255 grayscale (default 240)
    MDMS_BLANK_DPI         thumbnail render DPI (default 50)

PyMuPDF is used instead of pdf2image+pikepdf+pypdf so the whole pipeline is
one self-contained wheel — no poppler-utils apt-install on mdock, no multiple
text-extraction libraries to keep in sync.
"""

from __future__ import annotations

import io
import os
import shutil
import sys
from pathlib import Path

import fitz  # PyMuPDF
from PIL import Image


def near_white_ratio(image: Image.Image, near_white: int) -> float:
    """Return the fraction of pixels whose grayscale value is >= *near_white*.

    The image is converted to 8-bit grayscale ("L") first unless it already
    is. An image with no pixels at all is reported as fully white (1.0).
    """
    gray = image.convert("L") if image.mode != "L" else image
    hist = gray.histogram()
    total = sum(hist)
    if total == 0:
        return 1.0
    # A negative cutoff would make the slice count from the END of the
    # histogram; clamp so "every pixel qualifies" really yields 1.0.
    cutoff = max(near_white, 0)
    return sum(hist[cutoff:]) / total


def page_is_blank(page: fitz.Page, threshold: float, near_white: int, dpi: int) -> bool:
    """Decide whether *page* is blank.

    A page is blank only when it has no non-whitespace embedded text AND its
    grayscale thumbnail (rendered at *dpi*) has a near-white pixel ratio of at
    least *threshold*. Any text at all short-circuits to "not blank" without
    rendering.
    """
    text = (page.get_text("text") or "").strip()
    if text:
        return False
    pix = page.get_pixmap(dpi=dpi, colorspace=fitz.csGRAY)
    image = Image.frombytes("L", (pix.width, pix.height), pix.samples)
    return near_white_ratio(image, near_white) >= threshold


def _contiguous_runs(indices: list[int]) -> list[tuple[int, int]]:
    """Collapse ascending page indices into inclusive (first, last) ranges."""
    runs: list[tuple[int, int]] = []
    for index in indices:
        if runs and index == runs[-1][1] + 1:
            runs[-1] = (runs[-1][0], index)
        else:
            runs.append((index, index))
    return runs


def main() -> int:
    """Run the command-line tool and return its process exit code.

    Returns 0 when the output file was written (stripped or copied unchanged),
    2 when every page was classified blank (nothing is written), and 1 on
    usage errors, bad MDMS_BLANK_* settings, or an unreadable input. Copy and
    save failures are not caught here; they propagate as exceptions.
    """
    if len(sys.argv) != 3:
        print(f"usage: {sys.argv[0]} <input.pdf> <output.pdf>", file=sys.stderr)
        return 1

    src = Path(sys.argv[1])
    dst = Path(sys.argv[2])

    try:
        threshold = float(os.environ.get("MDMS_BLANK_THRESHOLD", "0.97"))
        near_white = int(os.environ.get("MDMS_BLANK_NEAR_WHITE", "240"))
        dpi = int(os.environ.get("MDMS_BLANK_DPI", "50"))
    except ValueError as exc:
        print(f"invalid MDMS_BLANK_* setting: {exc}", file=sys.stderr)
        return 1

    try:
        doc = fitz.open(src)
    except Exception as exc:
        print(f"failed to open {src}: {exc}", file=sys.stderr)
        return 1

    try:
        page_count = doc.page_count
        # A document with zero or one page is never stripped: dropping the only
        # page would leave nothing, so it is passed through untouched.
        if page_count <= 1:
            shutil.copyfile(src, dst)
            return 0

        keep = [
            index
            for index, page in enumerate(doc)
            if not page_is_blank(page, threshold, near_white, dpi)
        ]

        if not keep:
            print(f"all pages blank in {src.name}", file=sys.stderr)
            return 2

        # Nothing to drop: copy the bytes rather than re-serialising the PDF.
        if len(keep) == page_count:
            shutil.copyfile(src, dst)
            return 0

        runs = _contiguous_runs(keep)
        out = fitz.open()
        try:
            for position, (first, last) in enumerate(runs, start=1):
                # final=0 keeps PyMuPDF's map of already-copied objects alive
                # between insertions from the same source, so shared fonts and
                # images are not duplicated; only the last call passes final=1.
                out.insert_pdf(
                    doc,
                    from_page=first,
                    to_page=last,
                    final=1 if position == len(runs) else 0,
                )
            out.save(dst)
        finally:
            out.close()

        dropped = page_count - len(keep)
        print(
            f"{src.name}: dropped {dropped}/{page_count} blank page(s)",
            file=sys.stderr,
        )
        return 0
    finally:
        doc.close()


if __name__ == "__main__":
    sys.exit(main())