#!/usr/bin/env python3 """ MD-DDL Pre-Flight Check Mechanical Level-1 syntax validation per the MD-DDL spec (1-Foundation.md). Run against a domain folder before committing or publishing. Usage: python preflight.py python preflight.py domains/customer python preflight.py . (run from within the domain folder) Requires: Python 3.8+, pyyaml (pip install pyyaml) Exit codes: 0 no findings 1 one or more pre-flight failures 2 usage or invocation error """ import re import sys import yaml from dataclasses import dataclass from pathlib import Path # --------------------------------------------------------------------------- # Data types # --------------------------------------------------------------------------- @dataclass class Finding: file: str line: int check: str message: str # --------------------------------------------------------------------------- # Mermaid diagram types recognised by the spec # --------------------------------------------------------------------------- MERMAID_DIAGRAM_TYPES = { "graph", "flowchart", "sequenceDiagram", "classDiagram", "stateDiagram", "stateDiagram-v2", "erDiagram", "gantt", "journey", "gitGraph", "pie", "quadrantChart", "requirementDiagram", "mindmap", "timeline", "block-beta", "packet-beta", "xychart-beta", "sankey-beta", "kanban", "architecture-beta", } # YAML keys in relationship and event blocks that must name a domain entity. # 'actor' is deliberately excluded — event actors may be roles or external # systems, not MD-DDL entities. 
ENTITY_REF_KEYS = {"source", "target", "entity", "extends"}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _read(path: Path) -> str:
    """Return the text of *path*, decoded as UTF-8 with undecodable bytes replaced."""
    return path.read_text(encoding="utf-8", errors="replace")


def _extract_code_blocks(text: str, lang: str) -> list[tuple[int, str]]:
    """Return (start_line_1indexed, content) for every fenced block of lang.

    ``start_line_1indexed`` is the line of the first content line (the line
    after the opening fence). Only fences that start at column 0 are
    recognised; the language tag is matched case-insensitively. A block whose
    closing fence is missing is not returned.
    """
    blocks: list[tuple[int, str]] = []
    fence = re.compile(rf"^```{re.escape(lang)}\s*$", re.IGNORECASE)
    close = re.compile(r"^```\s*$")
    in_block = False
    start = 0
    buf: list[str] = []
    for i, line in enumerate(text.splitlines(), 1):
        if not in_block:
            if fence.match(line):
                in_block = True
                start = i + 1
                buf = []
        elif close.match(line):
            blocks.append((start, "\n".join(buf)))
            in_block = False
            buf = []
        else:
            buf.append(line)
    return blocks


def _heading_slug(text: str) -> str:
    """GitHub-compatible anchor slug for a heading line."""
    # Strip inline markdown (bold, italic, backticks, links)
    text = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", text)
    text = re.sub(r"[`*_]", "", text)
    text = text.lower().strip()
    text = re.sub(r"[^\w\s-]", "", text)
    text = re.sub(r"[\s_]+", "-", text).strip("-")
    return text


def _get_headings(path: Path) -> set[str]:
    """All heading slugs in a markdown file (empty set if it cannot be read)."""
    slugs: set[str] = set()
    try:
        for line in _read(path).splitlines():
            m = re.match(r"^#{1,6}\s+(.+)$", line)
            if m:
                slugs.add(_heading_slug(m.group(1)))
    except OSError:
        # Unreadable target (e.g. a directory): report no headings so that any
        # anchor pointing into it is flagged by the caller.
        pass
    return slugs


# ---------------------------------------------------------------------------
# Check 1 — YAML syntax
# ---------------------------------------------------------------------------

def check_yaml_syntax(md_file: Path) -> list[Finding]:
    """Report every fenced ``yaml`` block in *md_file* that fails to parse."""
    findings: list[Finding] = []
    for start, content in _extract_code_blocks(_read(md_file), "yaml"):
        try:
            yaml.safe_load(content)
        except yaml.YAMLError as exc:
            # problem_mark.line is 0-indexed within the block; start is the
            # file line of the block's first content line.
            mark = getattr(exc, "problem_mark", None)
            line_num = start + mark.line if mark else start
            # 'problem' exists on marked errors but may be None; fall back to
            # the full exception text rather than printing "None".
            msg = getattr(exc, "problem", None) or str(exc)
            findings.append(Finding(str(md_file), line_num, "yaml-syntax",
                                    f"YAML parse error: {msg}"))
    return findings


# ---------------------------------------------------------------------------
# Check 2 — Mermaid syntax
# ---------------------------------------------------------------------------

def check_mermaid_syntax(md_file: Path) -> list[Finding]:
    """Report Mermaid blocks that are empty or open with an unknown diagram type."""
    findings: list[Finding] = []
    for start, content in _extract_code_blocks(_read(md_file), "mermaid"):
        # Work on the unstripped lines so that idx stays a true offset from
        # 'start' even when the block opens with blank lines.
        lines = content.splitlines()
        idx = next((i for i, ln in enumerate(lines) if ln.strip()), None)
        if idx is None:
            findings.append(Finding(str(md_file), start, "mermaid-syntax",
                                    "Empty Mermaid block"))
            continue

        # Skip optional YAML config block (---...---)
        if lines[idx].strip() == "---":
            idx += 1
            while idx < len(lines) and lines[idx].strip() != "---":
                idx += 1
            idx += 1  # step past closing ---

        # Find first meaningful content line (not blank, not a %% comment)
        diagram_type = None
        while idx < len(lines):
            line = lines[idx].strip()
            if line and not line.startswith("%%"):
                diagram_type = line.split()[0].rstrip(":")
                break
            idx += 1

        if diagram_type is None:
            findings.append(Finding(str(md_file), start, "mermaid-syntax",
                                    "Mermaid block has no diagram type declaration"))
        elif diagram_type not in MERMAID_DIAGRAM_TYPES:
            findings.append(Finding(str(md_file), start + idx, "mermaid-syntax",
                                    f"Unrecognised Mermaid diagram type: '{diagram_type}'"))
    return findings


# ---------------------------------------------------------------------------
# Check 3 — Internal link integrity
# ---------------------------------------------------------------------------
# Patterns that carry URL/path values in MD-DDL files:
#   [text](url)   — standard markdown link (also catches ![alt](url) images)
#   href='url'    — HTML anchor inside Mermaid node labels (single quotes)
#   href="url"    — same, double-quote variant
#
# Not checked here (out of scope for domain preflight):
#   {{INCLUDE: path}} — agent/skill file directive; not used in domain folders
#   reference-style links [text][ref] / [ref]: url — only found as external URLs in README

_MD_LINK_RE = re.compile(r"\[[^\]]*\]\(([^)]+)\)")
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']')
# Trailing quoted link title: [text](path "Title") or [text](path 'Title')
_LINK_TITLE_RE = re.compile(r"""^(.*?)\s+(?:"[^"]*"|'[^']*')$""")
# Any URI scheme of two or more characters (http:, https:, mailto:, ftp:, tel:, ...)
_URL_SCHEME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9+.-]+:")


def _check_url(url: str, line_num: int, md_file: Path, findings: list[Finding]) -> None:
    """Validate a single URL extracted from md_file at line_num.

    External URLs are ignored. For local targets a finding is appended to
    *findings* when the file does not exist, or when the ``#anchor`` does not
    match any heading slug of the target file.
    """
    from urllib.parse import unquote

    # Drop an optional link title and an optional <...> wrapper so that only
    # the destination itself is checked.
    titled = _LINK_TITLE_RE.match(url)
    if titled:
        url = titled.group(1).strip()
    if url.startswith("<") and url.endswith(">"):
        url = url[1:-1].strip()
    if not url:
        return

    # Skip external links — not our concern
    if url.startswith("//") or _URL_SCHEME_RE.match(url):
        return

    # Pure same-page anchor: #heading — verify heading exists in this file
    if url.startswith("#"):
        anchor = url[1:]
        if anchor and _heading_slug(unquote(anchor)) not in _get_headings(md_file):
            findings.append(Finding(str(md_file), line_num, "internal-links",
                                    f"Broken same-page anchor: '#{anchor}' not found in this file"))
        return

    # File path, with optional anchor
    file_part, anchor = url.rsplit("#", 1) if "#" in url else (url, None)
    if not file_part:
        return

    base = md_file.parent
    target = (base / file_part).resolve()
    if not target.exists():
        # Links may percent-encode characters such as spaces (%20); accept the
        # decoded path when the literal one is absent.
        decoded = (base / unquote(file_part)).resolve()
        if not decoded.exists():
            findings.append(Finding(str(md_file), line_num, "internal-links",
                                    f"Broken link: '{file_part}' does not exist"))
            return
        target = decoded

    if anchor and _heading_slug(unquote(anchor)) not in _get_headings(target):
        findings.append(Finding(str(md_file), line_num, "internal-links",
                                f"Broken anchor: '#{anchor}' not found in {file_part}"))


def check_internal_links(md_file: Path) -> list[Finding]:
    """Check every markdown link and ``href`` attribute in *md_file*.

    Each distinct URL is validated at most once per line.
    """
    findings: list[Finding] = []
    for line_num, line in enumerate(_read(md_file).splitlines(), 1):
        seen: set[str] = set()
        # Markdown links first, then href attributes, matching report order.
        for pattern in (_MD_LINK_RE, _HREF_RE):
            for m in pattern.finditer(line):
                url = m.group(1).strip()
                if url in seen:
                    continue
                seen.add(url)
                _check_url(url, line_num, md_file, findings)
    return findings


# ---------------------------------------------------------------------------
# Check 4 — Entity reference consistency
# ---------------------------------------------------------------------------

def _entity_names_from_domain(domain_file: Path) -> set[str]:
    """Extract canonical entity names from the ## Entities table in domain.md.

    The table header must have ``Name`` as its first column; both the
    pipe-less form (``Name | ...``) and the leading-pipe form
    (``| Name | ...``) are accepted. A first cell written as a markdown link
    contributes its link text.
    """
    names: set[str] = set()
    in_section = False
    in_table = False
    for line in _read(domain_file).splitlines():
        if re.match(r"^##\s+Entities\s*$", line):
            in_section = True
            in_table = False
            continue
        if not in_section:
            continue
        if re.match(r"^##\s+", line):
            break  # next level-2 section ends the Entities section
        if re.match(r"^\s*\|?\s*Name\s*\|", line):
            in_table = True
            continue
        if not in_table or "|" not in line:
            continue

        cells = [c.strip() for c in line.split("|") if c.strip()]
        # Skip empty rows and separator rows (--- | :--- | ---: ...)
        if not cells or re.match(r"^:?-+:?$", cells[0]):
            continue
        name_cell = cells[0]
        lm = re.match(r"\[([^\]]+)\]\([^)]*\)", name_cell)
        name = lm.group(1).strip() if lm else name_cell
        if name:
            names.add(name)
    return names


def check_entity_references(domain_file: Path) -> list[Finding]:
    """YAML source/target/entity/extends values must name a domain entity.

    Scans every ``.md`` file under the folder containing *domain_file*, except
    those below ``sources/`` and ``products/``. Only top-level string values
    of the keys in ``ENTITY_REF_KEYS`` are checked. Returns no findings when
    domain.md lists no entities.
    """
    entity_names = _entity_names_from_domain(domain_file)
    if not entity_names:
        return []

    findings: list[Finding] = []
    domain_root = domain_file.parent
    # Sorted so findings come out in the same order on every run.
    ref_keys = sorted(ENTITY_REF_KEYS)

    for md_file in sorted(domain_root.rglob("*.md")):
        rel = md_file.relative_to(domain_root)
        # Skip sources/ (cross-domain entity refs allowed) and products/
        # (the 'source' key there names a data source system, not an entity)
        if rel.parts and rel.parts[0] in {"sources", "products"}:
            continue

        for start, content in _extract_code_blocks(_read(md_file), "yaml"):
            try:
                data = yaml.safe_load(content)
            except yaml.YAMLError:
                continue  # caught by yaml-syntax check
            if not isinstance(data, dict):
                continue

            block_lines = content.splitlines()
            for key in ref_keys:
                value = data.get(key)
                if not isinstance(value, str) or value in entity_names:
                    continue
                # Find the line number of the key within this block
                key_re = re.compile(rf"^{re.escape(key)}\s*:")
                key_line = start
                for i, bl in enumerate(block_lines):
                    if key_re.match(bl):
                        key_line = start + i
                        break
                findings.append(Finding(
                    str(md_file),
                    key_line,
                    "entity-references",
                    f"'{key}: {value}' does not match any entity in domain.md",
                ))
    return findings


# ---------------------------------------------------------------------------
# Check 5 — Domain version field
# ---------------------------------------------------------------------------

def check_domain_version(domain_file: Path) -> list[Finding]:
    """Require a ``version`` key in the first YAML block under ``## Metadata``.

    A finding is produced when no closed YAML block exists in that section, or
    when the block parses but is not a mapping containing ``version``. A block
    that fails to parse is left to the yaml-syntax check.
    """
    findings: list[Finding] = []
    # Case-insensitive to agree with _extract_code_blocks.
    yaml_fence = re.compile(r"^```yaml\s*$", re.IGNORECASE)
    in_metadata = False
    collecting = False
    buf: list[str] = []
    start = 1
    found_block = False

    for i, line in enumerate(_read(domain_file).splitlines(), 1):
        if re.match(r"^##\s+Metadata\s*$", line):
            in_metadata = True
            continue
        if in_metadata and re.match(r"^##\s+", line):
            in_metadata = False
        if in_metadata and yaml_fence.match(line):
            collecting = True
            start = i + 1
            buf = []
            continue
        if not collecting:
            continue
        if not re.match(r"^```\s*$", line):
            buf.append(line)
            continue

        # Closing fence: evaluate the block, then stop scanning.
        found_block = True
        try:
            data = yaml.safe_load("\n".join(buf))
        except yaml.YAMLError:
            pass  # caught by yaml-syntax check
        else:
            if not isinstance(data, dict) or "version" not in data:
                findings.append(Finding(
                    str(domain_file),
                    start,
                    "domain-version",
                    "Metadata YAML block is missing the 'version:' field",
                ))
        break

    if not found_block:
        findings.append(Finding(
            str(domain_file),
            1,
            "domain-version",
            "No YAML block found under '## Metadata' — 'version:' field cannot be verified",
        ))
    return findings


# ---------------------------------------------------------------------------
# Orchestration
# ---------------------------------------------------------------------------

def run_preflight(domain_folder: str) -> list[Finding]:
    """Run every check against *domain_folder* and return all findings.

    Exits the process with status 2 when the path does not exist or is not a
    directory.
    """
    domain_root = Path(domain_folder).resolve()
    if not domain_root.exists():
        print(f"error: path not found: {domain_folder}", file=sys.stderr)
        sys.exit(2)
    if not domain_root.is_dir():
        print(f"error: not a directory: {domain_folder}", file=sys.stderr)
        sys.exit(2)

    domain_file = domain_root / "domain.md"
    findings: list[Finding] = []

    # Checks 1–3: run across every .md file in the domain folder
    for md_file in sorted(domain_root.rglob("*.md")):
        findings += check_yaml_syntax(md_file)
        findings += check_mermaid_syntax(md_file)
        findings += check_internal_links(md_file)

    # Checks 4–5: entity reference consistency and domain version
    # (both require domain.md)
    if domain_file.exists():
        findings += check_entity_references(domain_file)
        findings += check_domain_version(domain_file)
    else:
        findings.append(Finding(
            str(domain_file),
            0,
            "domain-version",
            "domain.md not found — is this a domain folder?",
        ))
    return findings


def main() -> None:
    """Command-line entry point: run the checks and exit with 0, 1 or 2."""
    if len(sys.argv) != 2:
        print("Usage: python preflight.py <domain-folder>", file=sys.stderr)
        sys.exit(2)

    findings = run_preflight(sys.argv[1])
    if not findings:
        print("Pre-flight passed. No findings.")
        sys.exit(0)

    # Group by check for readability
    by_check: dict[str, list[Finding]] = {}
    for f in findings:
        by_check.setdefault(f.check, []).append(f)

    print(f"Pre-flight: {len(findings)} finding(s)\n")
    for check, group in by_check.items():
        print(f" [{check}] {len(group)} finding(s)")
        for f in group:
            print(f" {f.file}:{f.line}")
            print(f" {f.message}")
        print()
    sys.exit(1)


if __name__ == "__main__":
    main()