random_corp/.github/scripts/preflight.py

#!/usr/bin/env python3
"""
MD-DDL Pre-Flight Check

Mechanical Level-1 syntax validation per the MD-DDL spec (1-Foundation.md).
Run against a domain folder before committing or publishing.

Usage:
    python preflight.py <domain-folder>
    python preflight.py domains/customer
    python preflight.py .          (run from within the domain folder)

Requires: Python 3.8+, pyyaml   (pip install pyyaml)

Exit codes:
    0  no findings
    1  one or more pre-flight failures
    2  usage or invocation error
"""

import re
import sys
import yaml
from dataclasses import dataclass
from pathlib import Path


# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------

@dataclass
class Finding:
    file: str
    line: int
    check: str
    message: str


# ---------------------------------------------------------------------------
# Mermaid diagram types recognised by the spec
# ---------------------------------------------------------------------------

MERMAID_DIAGRAM_TYPES = {
    "graph", "flowchart", "sequenceDiagram", "classDiagram",
    "stateDiagram", "stateDiagram-v2", "erDiagram", "gantt",
    "journey", "gitGraph", "pie", "quadrantChart", "requirementDiagram",
    "mindmap", "timeline", "block-beta", "packet-beta",
    "xychart-beta", "sankey-beta", "kanban", "architecture-beta",
}

# YAML keys in relationship and event blocks that must name a domain entity.
# 'actor' is deliberately excluded — event actors may be roles or external
# systems, not MD-DDL entities.
ENTITY_REF_KEYS = {"source", "target", "entity", "extends"}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _read(path: Path) -> str:
    return path.read_text(encoding="utf-8", errors="replace")


def _extract_code_blocks(text: str, lang: str) -> list[tuple[int, str]]:
    """Return (start_line_1indexed, content) for every fenced block of lang."""
    blocks: list[tuple[int, str]] = []
    lines = text.splitlines()
    fence = re.compile(rf"^```{re.escape(lang)}\s*$", re.IGNORECASE)
    close = re.compile(r"^```\s*$")
    in_block = False
    start = 0
    buf: list[str] = []
    for i, line in enumerate(lines, 1):
        if not in_block:
            if fence.match(line):
                in_block = True
                start = i + 1
                buf = []
        else:
            if close.match(line):
                blocks.append((start, "\n".join(buf)))
                in_block = False
                buf = []
            else:
                buf.append(line)
    return blocks


def _heading_slug(text: str) -> str:
    """GitHub-compatible anchor slug for a heading line."""
    # Strip inline markdown (bold, italic, backticks, links)
    text = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", text)
    text = re.sub(r"[`*_]", "", text)
    text = text.lower().strip()
    text = re.sub(r"[^\w\s-]", "", text)
    text = re.sub(r"[\s_]+", "-", text).strip("-")
    return text


def _get_headings(path: Path) -> set[str]:
    """All heading slugs in a markdown file."""
    slugs: set[str] = set()
    try:
        for line in _read(path).splitlines():
            m = re.match(r"^#{1,6}\s+(.+)$", line)
            if m:
                slugs.add(_heading_slug(m.group(1)))
    except OSError:
        pass
    return slugs


# ---------------------------------------------------------------------------
# Check 1 — YAML syntax
# ---------------------------------------------------------------------------

def check_yaml_syntax(md_file: Path) -> list[Finding]:
    findings: list[Finding] = []
    text = _read(md_file)
    for start, content in _extract_code_blocks(text, "yaml"):
        try:
            yaml.safe_load(content)
        except yaml.YAMLError as exc:
            line_num = start
            if hasattr(exc, "problem_mark") and exc.problem_mark:
                line_num = start + exc.problem_mark.line
            msg = exc.problem if hasattr(exc, "problem") else str(exc)
            findings.append(Finding(str(md_file), line_num, "yaml-syntax",
                                    f"YAML parse error: {msg}"))
    return findings


# ---------------------------------------------------------------------------
# Check 2 — Mermaid syntax
# ---------------------------------------------------------------------------

def check_mermaid_syntax(md_file: Path) -> list[Finding]:
    findings: list[Finding] = []
    text = _read(md_file)
    for start, content in _extract_code_blocks(text, "mermaid"):
        stripped = content.strip()
        if not stripped:
            findings.append(Finding(str(md_file), start, "mermaid-syntax",
                                    "Empty Mermaid block"))
            continue

        lines = stripped.splitlines()
        idx = 0

        # Skip optional YAML config block (---...---)
        if lines[0].strip() == "---":
            idx = 1
            while idx < len(lines) and lines[idx].strip() != "---":
                idx += 1
            idx += 1  # step past closing ---

        # Find first meaningful content line
        diagram_type = None
        while idx < len(lines):
            line = lines[idx].strip()
            if line and not line.startswith("%%"):
                diagram_type = line.split()[0].rstrip(":")
                break
            idx += 1

        if diagram_type is None:
            findings.append(Finding(str(md_file), start, "mermaid-syntax",
                                    "Mermaid block has no diagram type declaration"))
        elif diagram_type not in MERMAID_DIAGRAM_TYPES:
            findings.append(Finding(str(md_file), start + idx, "mermaid-syntax",
                                    f"Unrecognised Mermaid diagram type: '{diagram_type}'"))
    return findings


# ---------------------------------------------------------------------------
# Check 3 — Internal link integrity
# ---------------------------------------------------------------------------

# Patterns that carry URL/path values in MD-DDL files:
#   [text](url)          — standard markdown link (also catches ![alt](url) images)
#   href='url'           — HTML anchor inside Mermaid node labels (single quotes)
#   href="url"           — same, double-quote variant
#
# Not checked here (out of scope for domain preflight):
#   {{INCLUDE: path}}    — agent/skill file directive; not used in domain folders
#   reference-style links [text][ref] / [ref]: url — only found as external URLs in README

_MD_LINK_RE = re.compile(r"\[[^\]]*\]\(([^)]+)\)")
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']')


def _check_url(url: str, line_num: int, md_file: Path, findings: list[Finding]) -> None:
    """Validate a single URL extracted from md_file at line_num."""
    # Skip external links — not our concern
    if url.startswith(("http://", "https://", "mailto:")):
        return

    # Pure same-page anchor: #heading — verify heading exists in this file
    if url.startswith("#"):
        anchor = url[1:]
        if anchor and _heading_slug(anchor) not in _get_headings(md_file):
            findings.append(Finding(str(md_file), line_num, "internal-links",
                                    f"Broken same-page anchor: '#{anchor}' not found in this file"))
        return

    # File path, with optional anchor
    file_part, anchor = (url.rsplit("#", 1) if "#" in url else (url, None))
    if not file_part:
        return

    target = (md_file.parent / file_part).resolve()
    if not target.exists():
        findings.append(Finding(str(md_file), line_num, "internal-links",
                                f"Broken link: '{file_part}' does not exist"))
    elif anchor:
        if _heading_slug(anchor) not in _get_headings(target):
            findings.append(Finding(str(md_file), line_num, "internal-links",
                                    f"Broken anchor: '#{anchor}' not found in {file_part}"))


def check_internal_links(md_file: Path) -> list[Finding]:
    findings: list[Finding] = []
    lines = _read(md_file).splitlines()

    for line_num, line in enumerate(lines, 1):
        seen: set[str] = set()
        for m in _MD_LINK_RE.finditer(line):
            url = m.group(1).strip()
            if url not in seen:
                seen.add(url)
                _check_url(url, line_num, md_file, findings)
        for m in _HREF_RE.finditer(line):
            url = m.group(1).strip()
            if url not in seen:
                seen.add(url)
                _check_url(url, line_num, md_file, findings)

    return findings


# ---------------------------------------------------------------------------
# Check 4 — Entity reference consistency
# ---------------------------------------------------------------------------

def _entity_names_from_domain(domain_file: Path) -> set[str]:
    """Extract canonical entity names from the ## Entities table in domain.md."""
    names: set[str] = set()
    text = _read(domain_file)
    in_section = False
    in_table = False

    for line in text.splitlines():
        if re.match(r"^##\s+Entities\s*$", line):
            in_section = True
            in_table = False
            continue
        if re.match(r"^##\s+", line) and in_section:
            break
        if in_section:
            if re.match(r"^\s*Name\s*\|", line):
                in_table = True
                continue
            if in_table and "|" in line:
                # Strip separator rows (--- | --- | ...)
                cells = [c.strip() for c in line.split("|") if c.strip()]
                if not cells or re.match(r"^-+$", cells[0]):
                    continue
                name_cell = cells[0]
                lm = re.match(r"\[([^\]]+)\]\([^)]*\)", name_cell)
                name = lm.group(1).strip() if lm else name_cell
                if name:
                    names.add(name)
    return names


def check_entity_references(domain_file: Path) -> list[Finding]:
    """YAML source/target/actor/entity/extends values must name a domain entity."""
    findings: list[Finding] = []
    entity_names = _entity_names_from_domain(domain_file)
    if not entity_names:
        return []

    domain_root = domain_file.parent

    # Check entity and event detail files; skip sources/ (cross-domain references allowed)
    for md_file in domain_root.rglob("*.md"):
        rel = md_file.relative_to(domain_root)
        # Skip sources/ (cross-domain entity refs allowed) and products/
        # (the 'source' key there names a data source system, not an entity)
        if rel.parts and rel.parts[0] in {"sources", "products"}:
            continue

        text = _read(md_file)
        for start, content in _extract_code_blocks(text, "yaml"):
            try:
                data = yaml.safe_load(content)
            except yaml.YAMLError:
                continue  # caught by yaml-syntax check
            if not isinstance(data, dict):
                continue

            block_lines = content.splitlines()
            for key in ENTITY_REF_KEYS:
                value = data.get(key)
                if not isinstance(value, str):
                    continue
                if value in entity_names:
                    continue

                # Find the line number of the key within this block
                key_line = start
                for i, bl in enumerate(block_lines):
                    if re.match(rf"^{re.escape(key)}\s*:", bl):
                        key_line = start + i
                        break

                findings.append(Finding(
                    str(md_file), key_line, "entity-references",
                    f"'{key}: {value}' does not match any entity in domain.md",
                ))
    return findings


# ---------------------------------------------------------------------------
# Check 5 — Domain version field
# ---------------------------------------------------------------------------

def check_domain_version(domain_file: Path) -> list[Finding]:
    findings: list[Finding] = []
    text = _read(domain_file)
    lines = text.splitlines()

    in_metadata = False
    collecting = False
    buf: list[str] = []
    start = 1
    found_block = False

    for i, line in enumerate(lines, 1):
        if re.match(r"^##\s+Metadata\s*$", line):
            in_metadata = True
            continue
        if in_metadata and re.match(r"^##\s+", line):
            in_metadata = False
        if in_metadata and re.match(r"^```yaml\s*$", line):
            collecting = True
            start = i + 1
            buf = []
            continue
        if collecting:
            if re.match(r"^```\s*$", line):
                collecting = False
                in_metadata = False
                found_block = True
                content = "\n".join(buf)
                try:
                    data = yaml.safe_load(content)
                    if not isinstance(data, dict) or "version" not in data:
                        findings.append(Finding(
                            str(domain_file), start, "domain-version",
                            "Metadata YAML block is missing the 'version:' field",
                        ))
                except yaml.YAMLError:
                    pass  # caught by yaml-syntax check
                break
            buf.append(line)

    if not found_block:
        findings.append(Finding(
            str(domain_file), 1, "domain-version",
            "No YAML block found under '## Metadata' — 'version:' field cannot be verified",
        ))
    return findings


# ---------------------------------------------------------------------------
# Orchestration
# ---------------------------------------------------------------------------

def run_preflight(domain_folder: str) -> list[Finding]:
    domain_root = Path(domain_folder).resolve()
    if not domain_root.exists():
        print(f"error: path not found: {domain_folder}", file=sys.stderr)
        sys.exit(2)
    if not domain_root.is_dir():
        print(f"error: not a directory: {domain_folder}", file=sys.stderr)
        sys.exit(2)

    domain_file = domain_root / "domain.md"
    findings: list[Finding] = []

    # Checks 1–3: run across every .md file in the domain folder
    for md_file in sorted(domain_root.rglob("*.md")):
        findings += check_yaml_syntax(md_file)
        findings += check_mermaid_syntax(md_file)
        findings += check_internal_links(md_file)

    # Check 4: entity reference consistency (requires domain.md)
    if domain_file.exists():
        findings += check_entity_references(domain_file)

    # Check 5: domain version (requires domain.md)
    if domain_file.exists():
        findings += check_domain_version(domain_file)
    else:
        findings.append(Finding(
            str(domain_file), 0, "domain-version",
            "domain.md not found — is this a domain folder?",
        ))

    return findings


def main() -> None:
    if len(sys.argv) != 2:
        print("Usage: python preflight.py <domain-folder>", file=sys.stderr)
        sys.exit(2)

    findings = run_preflight(sys.argv[1])

    if not findings:
        print("Pre-flight passed. No findings.")
        sys.exit(0)

    # Group by check for readability
    by_check: dict[str, list[Finding]] = {}
    for f in findings:
        by_check.setdefault(f.check, []).append(f)

    print(f"Pre-flight: {len(findings)} finding(s)\n")
    for check, group in by_check.items():
        print(f"  [{check}]  {len(group)} finding(s)")
        for f in group:
            path = f.file
            print(f"    {path}:{f.line}")
            print(f"    {f.message}")
        print()

    sys.exit(1)


if __name__ == "__main__":
    main()