Initial commit: Financial Crime domain exemplar

This commit is contained in:
2026-06-01 21:18:19 +12:00
commit 2fc4dacd59
70 changed files with 5776 additions and 0 deletions
+86
View File
@@ -0,0 +1,86 @@
param(
    [string]$SpecDir = "md-ddl-specification",
    [string]$OutputFileName = "MD-DDL-Complete.md"
)

# Concatenates the numbered MD-DDL spec section files into a single document.
# Paths are resolved relative to the folder two levels above this script.
$repoRoot = (Resolve-Path (Join-Path $PSScriptRoot "..\..")).Path
$specPath = Join-Path $repoRoot $SpecDir
$outputPath = Join-Path $specPath $OutputFileName

function Remove-TrailingBlankLines {
    # Removes empty / whitespace-only entries from the end of the list, in place.
    param($Lines)
    while ($Lines.Count -gt 0 -and [string]::IsNullOrWhiteSpace($Lines[$Lines.Count - 1])) {
        $Lines.RemoveAt($Lines.Count - 1)
    }
}

# Deterministic section order for MD-DDL spec generation.
$sectionFiles = @(
    "1-Foundation.md",
    "2-Domains.md",
    "3-Entities.md",
    "4-Enumerations.md",
    "5-Relationships.md",
    "6-Events.md",
    "7-Sources.md",
    "8-Transformations.md",
    "9-Data-Products.md",
    "10-Adoption.md"
)

# Resolve every section first so a missing file aborts before anything is assembled.
$files = foreach ($name in $sectionFiles) {
    $fullPath = Join-Path $specPath $name
    if (Test-Path $fullPath) {
        Get-Item $fullPath
    }
    else {
        throw "Missing required spec section file: $name"
    }
}

$combined = New-Object System.Collections.Generic.List[string]
$isFirstSection = $true
foreach ($file in $files) {
    $lines = [System.IO.File]::ReadAllLines($file.FullName)

    # Files shorter than three lines contribute no body.
    $body = @()
    if ($lines.Length -ge 3) {
        # Keep one global H1 by dropping each section's first two lines in the body.
        $body = [System.Collections.Generic.List[string]]::new()
        for ($k = 2; $k -lt $lines.Length; $k++) {
            [void]$body.Add($lines[$k])
        }

        # Remove optional trailing navigation block: blank lines, '---', blank lines, '...next:' line.
        Remove-TrailingBlankLines $body
        if ($body.Count -gt 0 -and $body[$body.Count - 1] -like "...next:*") {
            $body.RemoveAt($body.Count - 1)
            Remove-TrailingBlankLines $body
            if ($body.Count -gt 0 -and $body[$body.Count - 1] -eq "---") {
                $body.RemoveAt($body.Count - 1)
            }
            Remove-TrailingBlankLines $body
        }
    }

    if ($isFirstSection) {
        # Preserve single top-level title from section 1.
        [void]$combined.Add($lines[0])
        [void]$combined.Add("")
        $isFirstSection = $false
    }
    else {
        # Blank separator line between consecutive sections.
        [void]$combined.Add("")
    }

    foreach ($line in $body) {
        [void]$combined.Add($line)
    }
}

# Ensure UTF-8 output (without BOM) so punctuation remains stable across toolchains.
$utf8NoBom = New-Object System.Text.UTF8Encoding($false)
[System.IO.File]::WriteAllLines($outputPath, $combined, $utf8NoBom)
Write-Output "Regenerated $OutputFileName from $($files.Count) files in $SpecDir"
+445
View File
@@ -0,0 +1,445 @@
#!/usr/bin/env python3
"""
MD-DDL Pre-Flight Check
Mechanical Level-1 syntax validation per the MD-DDL spec (1-Foundation.md).
Run against a domain folder before committing or publishing.
Usage:
python preflight.py <domain-folder>
python preflight.py domains/customer
python preflight.py . (run from within the domain folder)
Requires: Python 3.9+ (built-in generic annotations such as list[...]), pyyaml (pip install pyyaml)
Exit codes:
0 no findings
1 one or more pre-flight failures
2 usage or invocation error
"""
import re
import sys
import yaml
from dataclasses import dataclass
from pathlib import Path
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@dataclass
class Finding:
    """One problem reported by a pre-flight check."""

    # Path of the file the finding refers to (callers pass ``str(path)``).
    file: str
    # Line number attached to the finding.
    line: int
    # Identifier of the check that produced it, e.g. "yaml-syntax".
    check: str
    # Human-readable description printed in the report.
    message: str
# ---------------------------------------------------------------------------
# Mermaid diagram types recognised by the spec
# ---------------------------------------------------------------------------
# First-token keywords accepted as a Mermaid diagram declaration
# (listed alphabetically, one per line, to keep diffs small).
MERMAID_DIAGRAM_TYPES = {
    "architecture-beta",
    "block-beta",
    "classDiagram",
    "erDiagram",
    "flowchart",
    "gantt",
    "gitGraph",
    "graph",
    "journey",
    "kanban",
    "mindmap",
    "packet-beta",
    "pie",
    "quadrantChart",
    "requirementDiagram",
    "sankey-beta",
    "sequenceDiagram",
    "stateDiagram",
    "stateDiagram-v2",
    "timeline",
    "xychart-beta",
}
# Top-level YAML keys (in relationship and event blocks) whose value must be
# the name of an entity declared in the domain.
# 'actor' is intentionally absent: an event actor may be a role or an
# external system rather than an MD-DDL entity.
ENTITY_REF_KEYS = {
    "entity",
    "extends",
    "source",
    "target",
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _read(path: Path) -> str:
    """Return the text of ``path`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are replaced instead of raising, so a
    badly encoded file can still be scanned.
    """
    with path.open(encoding="utf-8", errors="replace") as handle:
        return handle.read()
def _extract_code_blocks(text: str, lang: str) -> list[tuple[int, str]]:
    """Collect every fenced code block in ``text`` that is tagged ``lang``.

    Each result is ``(first_content_line, body)``: the 1-indexed line number
    of the line right after the opening fence, and the block's lines joined
    with newlines. The language tag is matched case-insensitively. A block
    whose closing fence never appears is not returned.
    """
    opener = re.compile(rf"^```{re.escape(lang)}\s*$", re.IGNORECASE)
    closer = re.compile(r"^```\s*$")

    found: list[tuple[int, str]] = []
    body = None  # None while outside a block, otherwise the lines gathered so far
    first_line = 0
    for number, raw in enumerate(text.splitlines(), start=1):
        if body is None:
            if opener.match(raw):
                body = []
                first_line = number + 1
        elif closer.match(raw):
            found.append((first_line, "\n".join(body)))
            body = None
        else:
            body.append(raw)
    return found
def _heading_slug(text: str) -> str:
    """GitHub-compatible anchor slug for heading text (or a link fragment).

    Inline links are reduced to their visible text, emphasis/code markers
    (backtick, ``*``, ``_``) are dropped, the text is lower-cased, remaining
    punctuation is removed and every whitespace character becomes a hyphen.

    Whitespace runs are deliberately NOT collapsed: GitHub turns
    "Roles & Responsibilities" into ``roles--responsibilities`` (the removed
    ``&`` leaves two spaces behind), so collapsing would make a valid
    ``#roles--responsibilities`` link look broken.
    """
    # Reduce inline links to their visible text, then drop emphasis/code markers.
    text = re.sub(r"\[([^\]]*)\]\([^)]*\)", r"\1", text)
    text = re.sub(r"[`*_]", "", text)
    text = text.lower().strip()
    # Remove anything that is not a word character, whitespace or hyphen.
    text = re.sub(r"[^\w\s-]", "", text)
    # One hyphen per whitespace character (underscores were removed above).
    return re.sub(r"\s", "-", text).strip("-")
def _get_headings(path: Path) -> set[str]:
    """Return the slug of every ATX heading (``#`` to ``######``) in ``path``.

    If the file cannot be read (``OSError``) an empty set is returned, so
    every anchor looked up against it is treated as missing.
    """
    heading = re.compile(r"^#{1,6}\s+(.+)$")
    try:
        content = _read(path)
    except OSError:
        return set()
    matches = (heading.match(row) for row in content.splitlines())
    return {_heading_slug(hit.group(1)) for hit in matches if hit}
# ---------------------------------------------------------------------------
# Check 1 — YAML syntax
# ---------------------------------------------------------------------------
def check_yaml_syntax(md_file: Path) -> list[Finding]:
    """Parse every ```yaml block in ``md_file`` and report each parse failure.

    Returns one ``yaml-syntax`` finding per block that ``yaml.safe_load``
    rejects. When the exception carries a problem mark, the finding points at
    that line inside the block; otherwise at the block's first content line.
    """
    problems: list[Finding] = []
    for first_line, body in _extract_code_blocks(_read(md_file), "yaml"):
        try:
            yaml.safe_load(body)
        except yaml.YAMLError as exc:
            mark = getattr(exc, "problem_mark", None)
            # mark.line is an offset within the block, added to the block's start.
            where = first_line + mark.line if mark else first_line
            if hasattr(exc, "problem"):
                detail = exc.problem
            else:
                detail = str(exc)
            problems.append(
                Finding(str(md_file), where, "yaml-syntax",
                        f"YAML parse error: {detail}")
            )
    return problems
# ---------------------------------------------------------------------------
# Check 2 — Mermaid syntax
# ---------------------------------------------------------------------------
def check_mermaid_syntax(md_file: Path) -> list[Finding]:
    """Check that each ```mermaid block declares a recognised diagram type.

    For every Mermaid block in ``md_file`` this reports a ``mermaid-syntax``
    finding when the block is empty, when no diagram type line is found, or
    when the first token of the first meaningful line is not in
    ``MERMAID_DIAGRAM_TYPES``. An optional leading ``---`` ... ``---`` config
    section and ``%%`` comment/directive lines are skipped.

    Line offsets are taken from the block's unstripped lines so that leading
    blank lines inside the fence do not shift the reported line number.
    """
    findings: list[Finding] = []
    for start, content in _extract_code_blocks(_read(md_file), "mermaid"):
        lines = content.splitlines()

        # Index of the first non-blank line, or None for an empty block.
        idx = next((i for i, raw in enumerate(lines) if raw.strip()), None)
        if idx is None:
            findings.append(Finding(str(md_file), start, "mermaid-syntax",
                                    "Empty Mermaid block"))
            continue

        # Skip optional YAML config block (---...---).
        if lines[idx].strip() == "---":
            idx += 1
            while idx < len(lines) and lines[idx].strip() != "---":
                idx += 1
            idx += 1  # step past closing ---

        # Find the first meaningful content line; its first token is the type.
        diagram_type = None
        while idx < len(lines):
            line = lines[idx].strip()
            if line and not line.startswith("%%"):
                diagram_type = line.split()[0].rstrip(":")
                break
            idx += 1

        if diagram_type is None:
            findings.append(Finding(str(md_file), start, "mermaid-syntax",
                                    "Mermaid block has no diagram type declaration"))
        elif diagram_type not in MERMAID_DIAGRAM_TYPES:
            # start is the file line of lines[0], so start + idx is the offending line.
            findings.append(Finding(str(md_file), start + idx, "mermaid-syntax",
                                    f"Unrecognised Mermaid diagram type: '{diagram_type}'"))
    return findings
# ---------------------------------------------------------------------------
# Check 3 — Internal link integrity
# ---------------------------------------------------------------------------
# Patterns that carry URL/path values in MD-DDL files:
# [text](url) — standard markdown link (also catches ![alt](url) images)
# href='url' — HTML anchor inside Mermaid node labels (single quotes)
# href="url" — same, double-quote variant
#
# Not checked here (out of scope for domain preflight):
# {{INCLUDE: path}} — agent/skill file directive; not used in domain folders
# reference-style links [text][ref] / [ref]: url — only found as external URLs in README
#
# Group 1 is everything between the parentheses of an inline link, up to the
# first ')'.
_MD_LINK_RE = re.compile(r"\[[^\]]*\]\(([^)]+)\)")
# Group 1 is the href attribute value; the opening and closing quote are each
# matched as either ' or ".
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']')
def _check_url(url: str, line_num: int, md_file: Path, findings: list[Finding]) -> None:
    """Validate one link target found in ``md_file`` and record any problem.

    External targets (http, https, mailto) are ignored. A ``#anchor`` target
    must match a heading in ``md_file``. Any other target is resolved relative
    to ``md_file``'s folder: the file must exist and, if an anchor is given,
    that heading must exist in the target file. Problems are appended to
    ``findings`` as ``internal-links`` entries at ``line_num``.
    """

    def report(message: str) -> None:
        findings.append(Finding(str(md_file), line_num, "internal-links", message))

    # External links are not our concern.
    if url.startswith(("http://", "https://", "mailto:")):
        return

    # Same-page anchor: the heading must exist in this very file.
    if url.startswith("#"):
        anchor = url[1:]
        if anchor and _heading_slug(anchor) not in _get_headings(md_file):
            report(f"Broken same-page anchor: '#{anchor}' not found in this file")
        return

    # File path with an optional anchor after the last '#'.
    if "#" in url:
        file_part, _, anchor = url.rpartition("#")
    else:
        file_part, anchor = url, None
    if not file_part:
        return

    target = (md_file.parent / file_part).resolve()
    if not target.exists():
        report(f"Broken link: '{file_part}' does not exist")
        return
    if anchor and _heading_slug(anchor) not in _get_headings(target):
        report(f"Broken anchor: '#{anchor}' not found in {file_part}")
def check_internal_links(md_file: Path) -> list[Finding]:
    """Validate every markdown link and ``href`` attribute in ``md_file``.

    Lines are scanned one at a time; markdown links are handled before
    ``href`` values, and a target that occurs more than once on the same line
    is only checked once.
    """
    results: list[Finding] = []
    for number, text_line in enumerate(_read(md_file).splitlines(), start=1):
        already_checked: set[str] = set()
        for pattern in (_MD_LINK_RE, _HREF_RE):
            for hit in pattern.finditer(text_line):
                target = hit.group(1).strip()
                if target in already_checked:
                    continue
                already_checked.add(target)
                _check_url(target, number, md_file, results)
    return results
# ---------------------------------------------------------------------------
# Check 4 — Entity reference consistency
# ---------------------------------------------------------------------------
def _entity_names_from_domain(domain_file: Path) -> set[str]:
    """Extract canonical entity names from the ## Entities table in domain.md.

    Scanning starts at the ``## Entities`` heading and stops at the next
    ``##`` heading. Inside the section, the table starts at the header row
    whose first column is ``Name``; the first non-empty cell of each following
    row is taken as the entity name, with a ``[Name](link)`` cell reduced to
    ``Name``.

    Both table styles are accepted: rows with or without a leading ``|``.
    Separator rows are skipped, including alignment forms such as ``:---``
    and ``:---:``. Returns an empty set when no table is found.
    """
    names: set[str] = set()
    in_section = False
    in_table = False
    for line in _read(domain_file).splitlines():
        if re.match(r"^##\s+Entities\s*$", line):
            in_section = True
            in_table = False
            continue
        if not in_section:
            continue
        if re.match(r"^##\s+", line):
            break  # next level-2 section: the Entities section is over

        # Header row: "Name | ..." or "| Name | ...".
        if re.match(r"^\s*\|?\s*Name\s*\|", line):
            in_table = True
            continue
        if not (in_table and "|" in line):
            continue

        cells = [c.strip() for c in line.split("|") if c.strip()]
        # Strip separator rows (--- | --- | ...), with optional alignment colons.
        if not cells or re.match(r"^:?-+:?$", cells[0]):
            continue

        name_cell = cells[0]
        link = re.match(r"\[([^\]]+)\]\([^)]*\)", name_cell)
        name = link.group(1).strip() if link else name_cell
        if name:
            names.add(name)
    return names
def check_entity_references(domain_file: Path) -> list[Finding]:
    """YAML source/target/entity/extends values must name a domain entity.

    The keys checked are those in ``ENTITY_REF_KEYS`` ('actor' is not among
    them). Every ``.md`` file under the folder containing ``domain_file`` is
    scanned, except files under ``sources/`` and ``products/``. For each YAML
    block that parses to a mapping, a top-level string value under one of the
    keys must be a name from the Entities table of ``domain_file``; otherwise
    an ``entity-references`` finding is recorded.

    Returns an empty list when no entity names could be extracted. Files and
    keys are visited in sorted order so the findings come out in the same
    order on every run.
    """
    findings: list[Finding] = []
    entity_names = _entity_names_from_domain(domain_file)
    if not entity_names:
        return []

    domain_root = domain_file.parent
    # Sets have no stable iteration order; sort once for reproducible output.
    ref_keys = sorted(ENTITY_REF_KEYS)

    for md_file in sorted(domain_root.rglob("*.md")):
        rel = md_file.relative_to(domain_root)
        # Skip sources/ (cross-domain entity refs allowed) and products/
        # (the 'source' key there names a data source system, not an entity)
        if rel.parts and rel.parts[0] in {"sources", "products"}:
            continue

        for start, content in _extract_code_blocks(_read(md_file), "yaml"):
            try:
                data = yaml.safe_load(content)
            except yaml.YAMLError:
                continue  # caught by yaml-syntax check
            if not isinstance(data, dict):
                continue

            block_lines = content.splitlines()
            for key in ref_keys:
                value = data.get(key)
                if not isinstance(value, str) or value in entity_names:
                    continue
                # Point at the line holding the key; fall back to the block start.
                key_pattern = re.compile(rf"^{re.escape(key)}\s*:")
                offset = next(
                    (i for i, bl in enumerate(block_lines) if key_pattern.match(bl)),
                    0,
                )
                findings.append(Finding(
                    str(md_file), start + offset, "entity-references",
                    f"'{key}: {value}' does not match any entity in domain.md",
                ))
    return findings
# ---------------------------------------------------------------------------
# Check 5 — Domain version field
# ---------------------------------------------------------------------------
def check_domain_version(domain_file: Path) -> list[Finding]:
    """Check that the ## Metadata YAML block of ``domain_file`` has a version.

    The first ```yaml block found after the ``## Metadata`` heading (and
    before the next ``##`` heading) is parsed. A ``domain-version`` finding is
    returned when that block is not a mapping containing a ``version`` key, or
    when no closed YAML block is found under the heading. A block that fails
    to parse is left to the yaml-syntax check and produces no finding here.
    """
    findings: list[Finding] = []
    in_metadata = False
    collecting = False
    found_block = False
    buf: list[str] = []
    start = 1

    for i, line in enumerate(_read(domain_file).splitlines(), 1):
        if re.match(r"^##\s+Metadata\s*$", line):
            in_metadata = True
            continue
        if in_metadata and re.match(r"^##\s+", line):
            in_metadata = False
        # Case-insensitive, matching how fences are recognised elsewhere in this file.
        if in_metadata and re.match(r"^```yaml\s*$", line, re.IGNORECASE):
            collecting = True
            start = i + 1
            buf = []
            continue
        if collecting:
            if re.match(r"^```\s*$", line):
                collecting = False
                in_metadata = False
                found_block = True
                try:
                    data = yaml.safe_load("\n".join(buf))
                except yaml.YAMLError:
                    pass  # caught by yaml-syntax check
                else:
                    if not isinstance(data, dict) or "version" not in data:
                        findings.append(Finding(
                            str(domain_file), start, "domain-version",
                            "Metadata YAML block is missing the 'version:' field",
                        ))
                break  # only the first Metadata block is inspected
            buf.append(line)

    if not found_block:
        findings.append(Finding(
            str(domain_file), 1, "domain-version",
            "No YAML block found under '## Metadata' — 'version:' field cannot be verified",
        ))
    return findings
# ---------------------------------------------------------------------------
# Orchestration
# ---------------------------------------------------------------------------
def run_preflight(domain_folder: str) -> list[Finding]:
    """Run all pre-flight checks on ``domain_folder`` and return the findings.

    Exits the process with status 2 (after printing to stderr) when the path
    does not exist or is not a directory. A missing ``domain.md`` is not
    fatal: it is reported as a ``domain-version`` finding and the
    domain-level checks are skipped.
    """
    domain_root = Path(domain_folder).resolve()
    if not domain_root.exists():
        problem = f"error: path not found: {domain_folder}"
    elif not domain_root.is_dir():
        problem = f"error: not a directory: {domain_folder}"
    else:
        problem = None
    if problem is not None:
        print(problem, file=sys.stderr)
        sys.exit(2)

    findings: list[Finding] = []

    # Checks 1-3: run across every .md file in the domain folder.
    per_file_checks = (check_yaml_syntax, check_mermaid_syntax, check_internal_links)
    for md_file in sorted(domain_root.rglob("*.md")):
        for check in per_file_checks:
            findings.extend(check(md_file))

    # Checks 4 and 5 (entity references, domain version) both need domain.md.
    domain_file = domain_root / "domain.md"
    if domain_file.exists():
        findings.extend(check_entity_references(domain_file))
        findings.extend(check_domain_version(domain_file))
    else:
        findings.append(Finding(
            str(domain_file), 0, "domain-version",
            "domain.md not found — is this a domain folder?",
        ))
    return findings
def main() -> None:
    """Command-line entry point.

    Expects exactly one argument, the domain folder. Exits with status 2 on
    a usage error, 0 when no findings are reported, and 1 otherwise, after
    printing the findings grouped by check.
    """
    if len(sys.argv) != 2:
        print("Usage: python preflight.py <domain-folder>", file=sys.stderr)
        sys.exit(2)

    findings = run_preflight(sys.argv[1])
    if not findings:
        print("Pre-flight passed. No findings.")
        sys.exit(0)

    # Group by check for readability; groups keep first-seen order.
    grouped = {}
    for finding in findings:
        if finding.check not in grouped:
            grouped[finding.check] = []
        grouped[finding.check].append(finding)

    print(f"Pre-flight: {len(findings)} finding(s)\n")
    for check_name in grouped:
        members = grouped[check_name]
        print(f" [{check_name}] {len(members)} finding(s)")
        for item in members:
            print(f" {item.file}:{item.line}")
            print(f" {item.message}")
        print()
    sys.exit(1)


if __name__ == "__main__":
    main()