Auto schemas
This commit is contained in:
+116
-32
@@ -9,8 +9,10 @@ returns metadata (no file contents are returned).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import glob
|
||||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
@@ -153,23 +155,96 @@ async def update_config(reference: str, request: Request):
|
||||
)
|
||||
|
||||
|
||||
# ── Schema inference helpers ────────────────────────────────────────────────
|
||||
|
||||
_DATE_FORMATS = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y%m%d", "%d-%m-%Y"]
|
||||
_DATETIME_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"]
|
||||
|
||||
|
||||
def _infer_type(values: list[str]) -> str:
|
||||
sample = [v.strip() for v in values if v.strip()][:30]
|
||||
if not sample:
|
||||
return "str"
|
||||
try:
|
||||
[int(v.replace(",", "")) for v in sample]
|
||||
return "int"
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
[float(v.replace(",", "")) for v in sample]
|
||||
return "float"
|
||||
except ValueError:
|
||||
pass
|
||||
for fmt in _DATETIME_FORMATS:
|
||||
try:
|
||||
[datetime.strptime(v, fmt) for v in sample]
|
||||
return f"datetime('{fmt}')"
|
||||
except ValueError:
|
||||
pass
|
||||
for fmt in _DATE_FORMATS:
|
||||
try:
|
||||
[datetime.strptime(v, fmt) for v in sample]
|
||||
return f"date('{fmt}')"
|
||||
except ValueError:
|
||||
pass
|
||||
return "str"
|
||||
|
||||
|
||||
def _csv_schema(path: str, delimiter: str = ",", has_header: bool = True) -> dict[str, str]:
    """Infer a ``{column: type}`` mapping from the first rows of a CSV file.

    At most 31 rows are read.  With ``has_header`` the first row supplies the
    column names and the remaining rows are sampled; otherwise columns are
    named ``col_0``, ``col_1``, ... and every row read is sampled.  Columns
    whose header is blank are omitted.

    Args:
        path: Filesystem path of the CSV file.
        delimiter: Field separator handed to ``csv.reader``.
        has_header: Whether the first row holds column names.

    Returns:
        Mapping of column name to the type string produced by ``_infer_type``;
        an empty dict when the file has no rows or cannot be read or parsed.
    """
    from itertools import islice

    try:
        # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM,
        # which would otherwise be glued onto the first header name.
        with open(path, newline="", encoding="utf-8-sig", errors="replace") as f:
            rows = list(islice(csv.reader(f, delimiter=delimiter), 31))
        if not rows:
            return {}
        if has_header:
            headers, data_rows = rows[0], rows[1:]
        else:
            headers = [f"col_{i}" for i in range(len(rows[0]))]
            data_rows = rows
        return {
            # Short (ragged) rows simply contribute nothing to missing columns.
            col: _infer_type([r[i] for r in data_rows if i < len(r)])
            for i, col in enumerate(headers)
            if col.strip()
        }
    except Exception:
        # Best-effort preview: any I/O or parsing failure yields "no schema"
        # rather than failing the caller.
        return {}
|
||||
|
||||
|
||||
def _xml_schema(path: str, xpathstr: str = "./*") -> dict[str, str]:
|
||||
try:
|
||||
tree = ET.parse(path)
|
||||
root = tree.getroot()
|
||||
elements = root.findall(xpathstr) or list(root)
|
||||
if not elements:
|
||||
return {}
|
||||
schema: dict[str, str] = {}
|
||||
for el in elements[:1]:
|
||||
for child in el:
|
||||
schema[child.tag] = "str"
|
||||
for attr in el.attrib:
|
||||
schema[f"@{attr}"] = "str"
|
||||
return schema
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
# ── Test URL ────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/test-url")
|
||||
async def test_url(request: Request):
|
||||
"""Return metadata for a URL pattern — no file contents ever returned."""
|
||||
"""Return file/DB metadata and inferred schema — no row data ever returned."""
|
||||
from datetime import date as date_type
|
||||
|
||||
body = await request.json()
|
||||
url: str = body.get("url", "")
|
||||
as_at_date: str = body.get("as_at_date", datetime.now().strftime("%Y%m%d"))
|
||||
csv_spec: dict = body.get("csv_spec", {})
|
||||
|
||||
# ── DB URL — schema inspection wired up per-connector when available ──────
|
||||
if not url.startswith("file://"):
|
||||
return {"type": "non-file", "message": "Only file:// URLs can be tested here. Database connections are validated at run time."}
|
||||
scheme = url.split("://")[0] if "://" in url else url
|
||||
return {"type": "db", "scheme": scheme, "schema": {}, "found": False}
|
||||
|
||||
# Resolve the path part (strip file://)
|
||||
# ── file:// URL ──────────────────────────────────────────────────────────
|
||||
raw_path = url[7:]
|
||||
|
||||
# Replace simple template variables for preview purposes
|
||||
from datetime import date as date_type
|
||||
try:
|
||||
as_at = datetime.strptime(as_at_date, "%Y%m%d").date()
|
||||
except ValueError:
|
||||
@@ -183,40 +258,49 @@ async def test_url(request: Request):
|
||||
.replace("{{today.strftime('%Y-%m-%d')}}", date_type.today().strftime("%Y-%m-%d"))
|
||||
)
|
||||
|
||||
# Treat as glob pattern for anything still containing {
|
||||
def _file_info(p: str) -> dict:
|
||||
try:
|
||||
st = os.stat(p)
|
||||
return {"path": p, "size_bytes": st.st_size,
|
||||
"modified": datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds")}
|
||||
except OSError:
|
||||
return {"path": p, "error": "Could not stat file"}
|
||||
|
||||
def _schema_for(p: str) -> dict[str, str]:
    """Infer a schema for the file at ``p`` based on its extension.

    Paths ending in ``.xml`` (case-insensitive) go to ``_xml_schema``; all
    other paths are treated as CSV and go to ``_csv_schema``, using the
    ``delimiter`` and ``has_header`` options from the enclosing ``csv_spec``.
    """
    if p.lower().endswith(".xml"):
        return _xml_schema(p)
    return _csv_schema(
        p,
        # A missing or empty delimiter falls back to a comma.
        delimiter=csv_spec.get("delimiter", ",") or ",",
        has_header=csv_spec.get("has_header", True),
    )
|
||||
|
||||
# Glob pattern
|
||||
if "*" in resolved or "?" in resolved or "{" in resolved:
|
||||
matches = glob.glob(resolved)
|
||||
matches = sorted(glob.glob(resolved))
|
||||
if not matches:
|
||||
return {"type": "file", "resolved": resolved, "found": False, "message": "No files matched the pattern."}
|
||||
file_infos = []
|
||||
for p in sorted(matches)[:10]:
|
||||
try:
|
||||
st = os.stat(p)
|
||||
file_infos.append({
|
||||
"path": p,
|
||||
"size_bytes": st.st_size,
|
||||
"modified": datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds"),
|
||||
})
|
||||
except OSError:
|
||||
file_infos.append({"path": p, "error": "Could not stat file"})
|
||||
return {"type": "file", "resolved": resolved, "found": True, "matches": len(matches), "files": file_infos}
|
||||
return {"type": "file", "resolved": resolved, "found": False,
|
||||
"message": "No files matched the pattern.", "schema": {}}
|
||||
schema = _schema_for(matches[0])
|
||||
return {
|
||||
"type": "file", "resolved": resolved, "found": True,
|
||||
"matches": len(matches),
|
||||
"files": [_file_info(p) for p in matches[:10]],
|
||||
"schema": schema,
|
||||
}
|
||||
|
||||
# Exact path
|
||||
p = Path(resolved)
|
||||
if not p.exists():
|
||||
return {"type": "file", "resolved": resolved, "found": False, "message": f"File not found: {resolved}"}
|
||||
return {"type": "file", "resolved": resolved, "found": False,
|
||||
"message": f"File not found: {resolved}", "schema": {}}
|
||||
try:
|
||||
st = p.stat()
|
||||
schema = _schema_for(str(p))
|
||||
return {
|
||||
"type": "file",
|
||||
"resolved": resolved,
|
||||
"found": True,
|
||||
"type": "file", "resolved": resolved, "found": True,
|
||||
"matches": 1,
|
||||
"files": [{
|
||||
"path": str(p),
|
||||
"size_bytes": st.st_size,
|
||||
"modified": datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds"),
|
||||
}],
|
||||
"files": [_file_info(str(p))],
|
||||
"schema": schema,
|
||||
}
|
||||
except OSError as e:
|
||||
return {"type": "file", "resolved": resolved, "found": False, "message": str(e)}
|
||||
return {"type": "file", "resolved": resolved, "found": False,
|
||||
"message": str(e), "schema": {}}
|
||||
|
||||
Reference in New Issue
Block a user