Auto schemas

This commit is contained in:
2026-05-26 22:34:28 +12:00
parent 35d70a7746
commit d4969172c2
7 changed files with 320 additions and 132 deletions
+116 -32
View File
@@ -9,8 +9,10 @@ returns metadata (no file contents are returned).
"""
from __future__ import annotations
import csv
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path
from typing import Any
@@ -153,23 +155,96 @@ async def update_config(reference: str, request: Request):
)
# ── Schema inference helpers ────────────────────────────────────────────────
_DATE_FORMATS = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y%m%d", "%d-%m-%Y"]
_DATETIME_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"]
def _infer_type(values: list[str]) -> str:
sample = [v.strip() for v in values if v.strip()][:30]
if not sample:
return "str"
try:
[int(v.replace(",", "")) for v in sample]
return "int"
except ValueError:
pass
try:
[float(v.replace(",", "")) for v in sample]
return "float"
except ValueError:
pass
for fmt in _DATETIME_FORMATS:
try:
[datetime.strptime(v, fmt) for v in sample]
return f"datetime('{fmt}')"
except ValueError:
pass
for fmt in _DATE_FORMATS:
try:
[datetime.strptime(v, fmt) for v in sample]
return f"date('{fmt}')"
except ValueError:
pass
return "str"
def _csv_schema(path: str, delimiter: str = ",", has_header: bool = True) -> dict[str, str]:
    """Infer a ``{column name: type label}`` mapping from the head of a CSV file.

    Only the first 31 rows are read (a header plus 30 data rows when
    ``has_header`` is true). Each column's values are classified with
    ``_infer_type``.

    Args:
        path: Filesystem path of the CSV file.
        delimiter: Field separator passed to ``csv.reader``.
        has_header: When true the first row supplies the column names;
            otherwise names are generated as ``col_0``, ``col_1``, ... from
            the width of the first row and every row is treated as data.

    Returns:
        Mapping of column name to inferred type label. Columns whose name is
        blank are omitted; if two columns share a name the later one wins.
        An empty dict is returned for an empty file or on any failure.
    """
    try:
        # "utf-8-sig" drops a leading byte-order mark (written by Excel and
        # other tools) that plain "utf-8" would leave glued to the first
        # column name as "\ufeff"; files without a BOM are read identically.
        with open(path, newline="", encoding="utf-8-sig", errors="replace") as f:
            reader = csv.reader(f, delimiter=delimiter)
            # zip() against range(31) stops after 31 rows without pulling a
            # 32nd row from the reader.
            rows = [r for _, r in zip(range(31), reader)]
        if not rows:
            return {}
        headers = rows[0] if has_header else [f"col_{i}" for i in range(len(rows[0]))]
        data_rows = rows[1:] if has_header else rows
        return {
            # Short (ragged) rows simply contribute nothing to columns they lack.
            col: _infer_type([r[i] for r in data_rows if i < len(r)])
            for i, col in enumerate(headers)
            if col.strip()
        }
    except Exception:
        # Deliberately best-effort: schema preview must never fail the caller,
        # so unreadable files, bad delimiters and parse errors all yield {}.
        return {}
def _xml_schema(path: str, xpathstr: str = "./*") -> dict[str, str]:
try:
tree = ET.parse(path)
root = tree.getroot()
elements = root.findall(xpathstr) or list(root)
if not elements:
return {}
schema: dict[str, str] = {}
for el in elements[:1]:
for child in el:
schema[child.tag] = "str"
for attr in el.attrib:
schema[f"@{attr}"] = "str"
return schema
except Exception:
return {}
# ── Test URL ────────────────────────────────────────────────────────────────
@router.post("/test-url")
async def test_url(request: Request):
"""Return metadata for a URL pattern — no file contents ever returned."""
"""Return file/DB metadata and inferred schema — no row data ever returned."""
from datetime import date as date_type
body = await request.json()
url: str = body.get("url", "")
as_at_date: str = body.get("as_at_date", datetime.now().strftime("%Y%m%d"))
csv_spec: dict = body.get("csv_spec", {})
# ── DB URL — schema inspection wired up per-connector when available ──────
if not url.startswith("file://"):
return {"type": "non-file", "message": "Only file:// URLs can be tested here. Database connections are validated at run time."}
scheme = url.split("://")[0] if "://" in url else url
return {"type": "db", "scheme": scheme, "schema": {}, "found": False}
# Resolve the path part (strip file://)
# ── file:// URL ──────────────────────────────────────────────────────────
raw_path = url[7:]
# Replace simple template variables for preview purposes
from datetime import date as date_type
try:
as_at = datetime.strptime(as_at_date, "%Y%m%d").date()
except ValueError:
@@ -183,40 +258,49 @@ async def test_url(request: Request):
.replace("{{today.strftime('%Y-%m-%d')}}", date_type.today().strftime("%Y-%m-%d"))
)
# Treat as glob pattern for anything still containing {
def _file_info(p: str) -> dict:
try:
st = os.stat(p)
return {"path": p, "size_bytes": st.st_size,
"modified": datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds")}
except OSError:
return {"path": p, "error": "Could not stat file"}
def _schema_for(p: str) -> dict[str, str]:
    """Infer a schema for the file at ``p``, choosing the parser by extension.

    Paths ending in ``.xml`` (case-insensitive) go to ``_xml_schema``; every
    other path is treated as CSV, using the ``delimiter`` and ``has_header``
    settings from the enclosing request's ``csv_spec``.
    """
    if not p.lower().endswith(".xml"):
        return _csv_schema(
            p,
            # A missing or empty delimiter falls back to a comma.
            delimiter=csv_spec.get("delimiter", ",") or ",",
            has_header=csv_spec.get("has_header", True),
        )
    return _xml_schema(p)
# Glob pattern
if "*" in resolved or "?" in resolved or "{" in resolved:
matches = glob.glob(resolved)
matches = sorted(glob.glob(resolved))
if not matches:
return {"type": "file", "resolved": resolved, "found": False, "message": "No files matched the pattern."}
file_infos = []
for p in sorted(matches)[:10]:
try:
st = os.stat(p)
file_infos.append({
"path": p,
"size_bytes": st.st_size,
"modified": datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds"),
})
except OSError:
file_infos.append({"path": p, "error": "Could not stat file"})
return {"type": "file", "resolved": resolved, "found": True, "matches": len(matches), "files": file_infos}
return {"type": "file", "resolved": resolved, "found": False,
"message": "No files matched the pattern.", "schema": {}}
schema = _schema_for(matches[0])
return {
"type": "file", "resolved": resolved, "found": True,
"matches": len(matches),
"files": [_file_info(p) for p in matches[:10]],
"schema": schema,
}
# Exact path
p = Path(resolved)
if not p.exists():
return {"type": "file", "resolved": resolved, "found": False, "message": f"File not found: {resolved}"}
return {"type": "file", "resolved": resolved, "found": False,
"message": f"File not found: {resolved}", "schema": {}}
try:
st = p.stat()
schema = _schema_for(str(p))
return {
"type": "file",
"resolved": resolved,
"found": True,
"type": "file", "resolved": resolved, "found": True,
"matches": 1,
"files": [{
"path": str(p),
"size_bytes": st.st_size,
"modified": datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds"),
}],
"files": [_file_info(str(p))],
"schema": schema,
}
except OSError as e:
return {"type": "file", "resolved": resolved, "found": False, "message": str(e)}
return {"type": "file", "resolved": resolved, "found": False,
"message": str(e), "schema": {}}