Auto schemas

2026-05-26 22:34:28 +12:00
parent 35d70a7746
commit d4969172c2
7 changed files with 320 additions and 132 deletions
@@ -9,8 +9,10 @@ returns metadata (no file contents are returned).
 """
 from __future__ import annotations

+import csv
 import glob
 import os
+import xml.etree.ElementTree as ET
 from datetime import datetime
 from pathlib import Path
 from typing import Any
@@ -153,23 +155,96 @@ async def update_config(reference: str, request: Request):
    )


+# ── Schema inference helpers ────────────────────────────────────────────────
+
+# Formats are tried in list order and the first one that parses every sampled
+# value wins, so an ambiguous day/month layout resolves to the earlier entry.
+_DATE_FORMATS = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y%m%d", "%d-%m-%Y"]
+_DATETIME_FORMATS = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"]
+
+
+def _infer_type(values: list[str]) -> str:
+    """Guess a schema type name from the raw string values of one column.
+
+    Only the first 30 non-blank values (whitespace-stripped) are examined, and
+    a type is reported only when every one of them parses as that type.
+
+    Args:
+        values: Raw cell strings for a single column.
+
+    Returns:
+        ``"int"``, ``"float"``, ``"datetime('<fmt>')"``, ``"date('<fmt>')"`` or
+        ``"str"``.  ``"str"`` is also returned when there is nothing to sample.
+    """
+    sample = [v.strip() for v in values if v.strip()][:30]
+    if not sample:
+        return "str"
+
+    def _every(parse) -> bool:
+        # True when ``parse`` accepts every sampled value without ValueError.
+        try:
+            for v in sample:
+                parse(v)
+        except ValueError:
+            return False
+        return True
+
+    def _fits(fmt: str) -> bool:
+        return _every(lambda v: datetime.strptime(v, fmt))
+
+    # Separator-free date formats such as "%Y%m%d" are made of digits only, so
+    # int() accepts them as well.  They must be tested before the numeric
+    # checks or they could never be reported.  The fixed-width, digits-only
+    # guard keeps ordinary integers (e.g. "202411") from being read as dates.
+    for fmt in _DATE_FORMATS:
+        if not fmt.replace("%", "").isalpha():
+            continue  # format contains separators; cannot collide with int
+        width = len(datetime(2000, 1, 1).strftime(fmt))
+        fixed_width_digits = all(
+            len(v) == width and v.isascii() and v.isdigit() for v in sample
+        )
+        if fixed_width_digits and _fits(fmt):
+            return f"date('{fmt}')"
+
+    # Thousands separators are tolerated for both numeric types.
+    if _every(lambda v: int(v.replace(",", ""))):
+        return "int"
+    if _every(lambda v: float(v.replace(",", ""))):
+        return "float"
+
+    for fmt in _DATETIME_FORMATS:
+        if _fits(fmt):
+            return f"datetime('{fmt}')"
+    for fmt in _DATE_FORMATS:
+        if _fits(fmt):
+            return f"date('{fmt}')"
+    return "str"
+
+
+def _csv_schema(path: str, delimiter: str = ",", has_header: bool = True) -> dict[str, str]:
+    """Infer a ``{column name: type name}`` mapping from the head of a CSV file.
+
+    At most 31 rows are read: the header plus 30 data rows when ``has_header``
+    is true, otherwise 31 data rows with synthetic ``col_<i>`` names.  Columns
+    whose header is blank are omitted; if two columns share a header, the
+    later one wins.  Rows shorter than the header simply contribute no value
+    for the missing columns.
+
+    Args:
+        path: Filesystem path of the CSV file.
+        delimiter: Field separator passed to ``csv.reader``.
+        has_header: Whether the first row holds column names.
+
+    Returns:
+        The inferred schema, or ``{}`` for an empty file or on any failure
+        (this helper is deliberately best-effort and never raises).
+    """
+    try:
+        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but drops a
+        # leading byte-order mark, which would otherwise become part of the
+        # first column name (e.g. "\ufeffid").
+        with open(path, newline="", encoding="utf-8-sig", errors="replace") as f:
+            reader = csv.reader(f, delimiter=delimiter)
+            rows = [r for _, r in zip(range(31), reader)]
+        if not rows:
+            return {}
+        headers = rows[0] if has_header else [f"col_{i}" for i in range(len(rows[0]))]
+        data_rows = rows[1:] if has_header else rows
+        return {
+            col: _infer_type([r[i] for r in data_rows if i < len(r)])
+            for i, col in enumerate(headers)
+            if col.strip()
+        }
+    except Exception:
+        # Schema preview is optional metadata; report "unknown" rather than fail.
+        return {}
+
+
+def _xml_schema(path: str, xpathstr: str = "./*") -> dict[str, str]:
+    """Describe the fields of the first record element in an XML file.
+
+    Records are the elements matched by ``xpathstr`` relative to the document
+    root; when nothing matches, the root's direct children are used instead.
+    Only the first record is inspected: each child tag becomes a key, and each
+    attribute becomes a key prefixed with ``@``.  Every field is typed ``"str"``.
+
+    Returns:
+        The field mapping, or ``{}`` when there are no records or on any
+        failure (unreadable file, malformed XML, invalid path expression).
+    """
+    try:
+        # NOTE(review): ElementTree is not hardened against maliciously
+        # constructed XML — confirm the files read here are trusted.
+        root = ET.parse(path).getroot()
+        records = root.findall(xpathstr)
+        if not records:
+            records = list(root)
+        if not records:
+            return {}
+        first = records[0]
+        fields: dict[str, str] = {child.tag: "str" for child in first}
+        fields.update({f"@{name}": "str" for name in first.attrib})
+        return fields
+    except Exception:
+        return {}
+
+
 # ── Test URL ────────────────────────────────────────────────────────────────

@router.post("/test-url")
 async def test_url(request: Request):
-    """Return metadata for a URL pattern — no file contents ever returned."""
+    """Return file/DB metadata and inferred schema — no row data ever returned."""
+    from datetime import date as date_type
+
    body = await request.json()
    url: str = body.get("url", "")
    as_at_date: str = body.get("as_at_date", datetime.now().strftime("%Y%m%d"))
+    csv_spec: dict = body.get("csv_spec", {})

+    # ── DB URL — schema inspection wired up per-connector when available ──────
    if not url.startswith("file://"):
-        return {"type": "non-file", "message": "Only file:// URLs can be tested here. Database connections are validated at run time."}
+        scheme = url.split("://")[0] if "://" in url else url
+        return {"type": "db", "scheme": scheme, "schema": {}, "found": False}

-    # Resolve the path part (strip file://)
+    # ── file:// URL ──────────────────────────────────────────────────────────
    raw_path = url[7:]
-
-    # Replace simple template variables for preview purposes
-    from datetime import date as date_type
    try:
        as_at = datetime.strptime(as_at_date, "%Y%m%d").date()
    except ValueError:
@@ -183,40 +258,49 @@ async def test_url(request: Request):
        .replace("{{today.strftime('%Y-%m-%d')}}", date_type.today().strftime("%Y-%m-%d"))
    )

-    # Treat as glob pattern for anything still containing {
+    def _file_info(p: str) -> dict:
+        """Return path, size and last-modified time for *p*.
+
+        If the file cannot be stat'ed, an entry carrying the path and an
+        ``error`` message is returned instead of raising.
+        """
+        try:
+            stat_result = os.stat(p)
+            modified_at = datetime.fromtimestamp(stat_result.st_mtime)
+            return {
+                "path": p,
+                "size_bytes": stat_result.st_size,
+                "modified": modified_at.isoformat(timespec="seconds"),
+            }
+        except OSError:
+            return {"path": p, "error": "Could not stat file"}
+
+    def _schema_for(p: str) -> dict[str, str]:
+        """Infer a schema for the file at *p*, picking the parser by extension.
+
+        Paths ending in ``.xml`` (case-insensitive) go to ``_xml_schema``;
+        every other path is read as CSV, using ``delimiter`` and
+        ``has_header`` from the request's ``csv_spec`` when provided.
+        """
+        if p.lower().endswith(".xml"):
+            return _xml_schema(p)
+        # ``csv_spec`` comes straight from the request JSON, so it can be null
+        # or a non-object; fall back to the defaults instead of raising.
+        spec = csv_spec if isinstance(csv_spec, dict) else {}
+        delimiter = spec.get("delimiter", ",") or ","
+        has_header = spec.get("has_header", True)
+        return _csv_schema(p, delimiter=delimiter, has_header=has_header)
+
+    # Glob pattern
    if "*" in resolved or "?" in resolved or "{" in resolved:
-        matches = glob.glob(resolved)
+        matches = sorted(glob.glob(resolved))
        if not matches:
-            return {"type": "file", "resolved": resolved, "found": False, "message": "No files matched the pattern."}
-        file_infos = []
-        for p in sorted(matches)[:10]:
-            try:
-                st = os.stat(p)
-                file_infos.append({
-                    "path": p,
-                    "size_bytes": st.st_size,
-                    "modified": datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds"),
-                })
-            except OSError:
-                file_infos.append({"path": p, "error": "Could not stat file"})
-        return {"type": "file", "resolved": resolved, "found": True, "matches": len(matches), "files": file_infos}
+            return {"type": "file", "resolved": resolved, "found": False,
+                    "message": "No files matched the pattern.", "schema": {}}
+        schema = _schema_for(matches[0])
+        return {
+            "type": "file", "resolved": resolved, "found": True,
+            "matches": len(matches),
+            "files": [_file_info(p) for p in matches[:10]],
+            "schema": schema,
+        }

    # Exact path
    p = Path(resolved)
    if not p.exists():
-        return {"type": "file", "resolved": resolved, "found": False, "message": f"File not found: {resolved}"}
+        return {"type": "file", "resolved": resolved, "found": False,
+                "message": f"File not found: {resolved}", "schema": {}}
    try:
-        st = p.stat()
+        schema = _schema_for(str(p))
        return {
-            "type": "file",
-            "resolved": resolved,
-            "found": True,
+            "type": "file", "resolved": resolved, "found": True,
            "matches": 1,
-            "files": [{
-                "path": str(p),
-                "size_bytes": st.st_size,
-                "modified": datetime.fromtimestamp(st.st_mtime).isoformat(timespec="seconds"),
-            }],
+            "files": [_file_info(str(p))],
+            "schema": schema,
        }
    except OSError as e:
-        return {"type": "file", "resolved": resolved, "found": False, "message": str(e)}
+        return {"type": "file", "resolved": resolved, "found": False,
+                "message": str(e), "schema": {}}