demo + playground + docs

One-command demo so the gateway can be exercised end-to-end without a GPU or a real model download:

- demo/mock-ollama/ — tiny FastAPI service emulating Ollama (/api/tags, /api/chat + /api/generate NDJSON streaming with realistic prompt_eval_count and eval_count on the final frame, /api/embed, /api/show, /api/version). Non-root multi-stage Dockerfile, never published (internal network only).
- docker-compose.demo.yml — postgres + redis + mock-ollama + gateway, with PLAYGROUND_ENABLED=true and ./playground mounted read-only at /app/playground. Mirrors the prod posture (mock-ollama not exposed).
- demo.sh — brings the stack up, waits on /healthz, creates a demo tenant with allow_all_models and a fresh API key via the bootstrap CLI inside the container, then prints the key, the playground URL, and five ready-to-paste curl commands (SSE chat, NDJSON chat, /v1/models, a 401, a 403 /api/pull). ./demo.sh --down tears everything back down with volumes.
- playground/index.html — single-file dark-themed UI served same-origin by the gateway at /playground (CORS-free). Per-endpoint About card with method/auth/streaming badges, a real description, sample request body, sample response, and a footer note. Live SSE/NDJSON rendering of the response. A live, copyable curl box that mirrors exactly what Run sends. Run + Refresh are visibly gated until an API key is in the field; the Base URL is force-pinned to location.origin three times to defeat browser autofill.
- docs/ — API.md (full endpoint reference with curl, streaming formats, error model, SPEC §6.5 response headers), ARCHITECTURE.md (incl. §4.6 discovery + the request lifecycle), DEPLOYMENT.md (Ollama-never-exposed rule, pointing at a real Ollama backend, env reference), THREAT_MODEL.md (SPEC §3 table + the allow_all_models opt-in notes), OPERATIONS.md (key/budget/model/usage runbook + fail-closed table), PLAYGROUND.md. mkdocs.yml (Material theme) wires them together.
2026-05-26 20:52:33 +02:00
parent 844b02aade
commit b47a09db91
13 changed files with 2501 additions and 0 deletions
--- a/demo.sh
+++ b/demo.sh
@@ -0,0 +1,204 @@
+#!/usr/bin/env bash
+#
+# demo.sh — the neuronetz-gateway one-command presentation.
+#
+# Brings up the demo stack (postgres + redis + mock-ollama + gateway) with NO
+# GPU and NO model downloads, creates a demo tenant + API key via the bootstrap
+# CLI *inside the gateway container*, and prints a clean summary with the key,
+# the playground URL, and ready-to-paste curl commands.
+#
+# Usage:
+#   ./demo.sh          # build + start, bootstrap a tenant/key, print summary
+#   ./demo.sh --down   # tear the whole stack down (and remove volumes)
+#   ./demo.sh --help   # this help
+#
+# Re-runnable: an existing tenant is reused. The full API key is only ever
+# printed once at creation (SPEC §11), so every run of this script creates a
+# fresh, uniquely-named key and prints it.
+set -euo pipefail
+
+# ──────────────────────────────────────────────────────────────────────────
+# Configuration
+# ──────────────────────────────────────────────────────────────────────────
+# Absolute directory of this script, so the compose file is found no matter
+# which directory the demo is started from.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+COMPOSE_FILE="${SCRIPT_DIR}/docker-compose.demo.yml"
+# The compose invocation is an array so the file path survives word splitting.
+COMPOSE=(docker compose -f "${COMPOSE_FILE}")
+
+# Host-side URLs and the names used for the demo tenant and its API keys.
+GATEWAY_URL="http://localhost:8080"
+PLAYGROUND_URL="${GATEWAY_URL}/playground"
+TENANT_NAME="demo"
+KEY_NAME="demo-key"
+
+# ANSI styling. Everything defaults to empty and is only filled in when stdout
+# is a terminal, so piped or redirected output stays free of escape codes.
+BOLD=""; DIM=""; RESET=""; CYAN=""; GREEN=""; YELLOW=""
+if [ -t 1 ]; then
+  BOLD=$'\033[1m'
+  DIM=$'\033[2m'
+  RESET=$'\033[0m'
+  CYAN=$'\033[36m'
+  GREEN=$'\033[32m'
+  YELLOW=$'\033[33m'
+fi
+
+# log: progress message on stdout.
+log() {
+  printf '%s\n' "${CYAN}==>${RESET} ${BOLD}$*${RESET}"
+}
+
+# warn: non-fatal message on stderr.
+warn() {
+  printf '%s\n' "${YELLOW}!!${RESET} $*" >&2
+}
+
+# die: fatal message on stderr, then exit with status 1.
+die() {
+  printf '%s\n' "${YELLOW}xx${RESET} $*" >&2
+  exit 1
+}
+
+# ──────────────────────────────────────────────────────────────────────────
+# Subcommands
+# ──────────────────────────────────────────────────────────────────────────
+usage() {
+  sed -n '3,18p' "${BASH_SOURCE[0]}" | sed 's/^# \{0,1\}//'
+}
+
+# Tear the demo stack down with `docker compose down --volumes
+# --remove-orphans`, logging before and after.
+down() {
+  log "Tearing down the demo stack (containers + volumes)…"
+  "${COMPOSE[@]}" down --volumes --remove-orphans
+  log "Done. The demo stack is gone."
+}
+
+# Run the bootstrap CLI (`neuronetz-gateway`) inside the running gateway
+# container, forwarding all arguments unchanged.
+gw_cli() {
+  # -T: do not allocate a pseudo-TTY, so callers can capture the output.
+  "${COMPOSE[@]}" exec -T gateway neuronetz-gateway "$@"
+}
+
+wait_for_health() {
+  log "Waiting for the gateway to become healthy at ${GATEWAY_URL}/healthz …"
+  local deadline=$(( $(date +%s) + 180 ))
+  until curl -fsS "${GATEWAY_URL}/healthz" >/dev/null 2>&1; do
+    if [ "$(date +%s)" -ge "${deadline}" ]; then
+      warn "Gateway did not become healthy in time. Recent gateway logs:"
+      "${COMPOSE[@]}" logs --tail=50 gateway >&2 || true
+      die "Aborting."
+    fi
+    sleep 2
+  done
+  log "Gateway is up."
+}
+
+# Create the demo tenant if it does not already exist (idempotent).
+ensure_tenant() {
+  log "Creating demo tenant '${TENANT_NAME}' (allow-all-models) …"
+  local out
+  if out="$(gw_cli create-tenant --name "${TENANT_NAME}" --allow-all-models 2>&1)"; then
+    printf '%s\n' "${DIM}${out}${RESET}"
+  else
+    # Already-exists (or similar) is fine — surface it but keep going.
+    if printf '%s' "${out}" | grep -qiE 'exist|duplicate|unique'; then
+      log "Tenant '${TENANT_NAME}' already exists — reusing it."
+    else
+      warn "create-tenant reported:"
+      printf '%s\n' "${out}" >&2
+      warn "Continuing; the tenant may already be present."
+    fi
+  fi
+}
+
+# Mint a new API key for the demo tenant and write the key, and nothing else,
+# to stdout. Progress and error messages go to stderr so that callers can
+# capture the key with $(create_key). The key name gets a timestamp suffix so
+# that each run requests a new name.
+# Returns 1 when the CLI call fails or no key can be found in its output.
+create_key() {
+  local stamp
+  stamp="$(date +%Y%m%d-%H%M%S)"
+  local unique_name="${KEY_NAME}-${stamp}"
+  log "Creating API key '${unique_name}' for tenant '${TENANT_NAME}' …" >&2
+
+  local cli_output
+  if ! cli_output="$(gw_cli create-key --tenant "${TENANT_NAME}" --name "${unique_name}" 2>&1)"; then
+    warn "create-key failed:"
+    printf '%s\n' "${cli_output}" >&2
+    return 1
+  fi
+
+  # The CLI output contains the short key prefix as well as the full key, and
+  # both match nz_[A-Za-z0-9]+. The longest match is the full key.
+  local key
+  key="$(grep -oE 'nz_[A-Za-z0-9]+' <<<"${cli_output}" \
+    | awk 'length($0) > best_len { best_len = length($0); best = $0 } END { print best }' \
+    || true)"
+  if [ -z "${key}" ]; then
+    warn "Could not parse an API key from create-key output:"
+    printf '%s\n' "${cli_output}" >&2
+    return 1
+  fi
+  printf '%s' "${key}"
+}
+
+# Print the closing summary: gateway and playground URLs, the API key, the
+# mock backend's models and five ready-to-paste curl commands.
+# Arguments: $1 - the API key to show and to embed in the curl commands.
+# The heredoc is unquoted, so variables are expanded and each `\\` is emitted
+# as a single backslash (the line continuation in the printed commands).
+print_summary() {
+  local key="$1"
+  # Content-Type value used in the printed curl commands.
+  local cl='application/json'
+
+  cat <<EOF
+
+${GREEN}${BOLD}════════════════════════════════════════════════════════════════════════${RESET}
+${GREEN}${BOLD}  neuronetz-gateway demo is live${RESET}
+${GREEN}${BOLD}════════════════════════════════════════════════════════════════════════${RESET}
+
+  ${BOLD}API base URL${RESET}     ${CYAN}${GATEWAY_URL}${RESET}
+  ${BOLD}Playground${RESET}       ${CYAN}${PLAYGROUND_URL}${RESET}
+  ${BOLD}API key${RESET}          ${YELLOW}${key}${RESET}
+                   ${DIM}(printed once — copy it now; re-run ./demo.sh to mint another)${RESET}
+
+  ${BOLD}Model backend${RESET}    mock-ollama (internal network only, never published)
+  ${BOLD}Models${RESET}           llama3.1:8b · mistral:7b · qwen2.5:3b · nomic-embed-text
+
+${BOLD}── Ready-to-paste curl commands ───────────────────────────────────────${RESET}
+
+${DIM}# 1) Streaming chat — OpenAI-compatible SSE (data: {...}  …  data: [DONE])${RESET}
+curl -N ${GATEWAY_URL}/v1/chat/completions \\
+  -H "Authorization: Bearer ${key}" \\
+  -H "Content-Type: ${cl}" \\
+  -d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"Say hello in one sentence."}]}'
+
+${DIM}# 2) Streaming chat — native Ollama NDJSON (one JSON object per line)${RESET}
+curl -N ${GATEWAY_URL}/api/chat \\
+  -H "Authorization: Bearer ${key}" \\
+  -H "Content-Type: ${cl}" \\
+  -d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"Say hello in one sentence."}]}'
+
+${DIM}# 3) List models — the tenant's effective (live-discovered) set, OpenAI format${RESET}
+curl ${GATEWAY_URL}/v1/models \\
+  -H "Authorization: Bearer ${key}"
+
+${DIM}# 4) 401 Unauthorized — no/invalid key, fail-closed, no upstream details leaked${RESET}
+curl -i ${GATEWAY_URL}/v1/models \\
+  -H "Authorization: Bearer nz_invalidKEYdoesNotExist000000000000000000"
+
+${DIM}# 5) 403 Forbidden — model-mutating endpoint is hard-blocked (not configurable)${RESET}
+curl -i ${GATEWAY_URL}/api/pull \\
+  -H "Authorization: Bearer ${key}" \\
+  -H "Content-Type: ${cl}" \\
+  -d '{"model":"llama3.1:8b"}'
+
+${BOLD}───────────────────────────────────────────────────────────────────────${RESET}
+  Tear it all down with:  ${CYAN}./demo.sh --down${RESET}
+
+EOF
+}
+
+# Default action: check prerequisites, build and start the demo stack, wait
+# until the gateway is healthy, make sure the demo tenant exists, mint an API
+# key and print the summary. Any failed step aborts the script via die.
+up() {
+  command -v docker >/dev/null 2>&1 || die "docker is required but not found on PATH."
+  command -v curl   >/dev/null 2>&1 || die "curl is required but not found on PATH."
+  # Every stack operation goes through `docker compose` (the v2 plugin). A
+  # docker binary without the plugin would otherwise pass the check above and
+  # then fail with a raw docker error.
+  docker compose version >/dev/null 2>&1 \
+    || die "The Docker Compose v2 plugin ('docker compose') is required but not available."
+  [ -f "${COMPOSE_FILE}" ] || die "Missing ${COMPOSE_FILE}"
+
+  log "Building and starting the demo stack (postgres + redis + mock-ollama + gateway) …"
+  "${COMPOSE[@]}" up --build -d
+
+  wait_for_health
+  ensure_tenant
+
+  # create_key writes only the key to stdout; its diagnostics go to stderr.
+  local key
+  if ! key="$(create_key)"; then
+    die "Could not create/parse an API key. See logs above."
+  fi
+
+  print_summary "${key}"
+}
+
+# ──────────────────────────────────────────────────────────────────────────
+# Entry point
+# ──────────────────────────────────────────────────────────────────────────
+main() {
+  case "${1:-}" in
+    --down|-d|down)   down ;;
+    --help|-h|help)   usage ;;
+    "")               up ;;
+    *)                die "Unknown argument: $1 (try --help)" ;;
+  esac
+}
+
+main "$@"
--- a/demo/mock-ollama/Dockerfile
+++ b/demo/mock-ollama/Dockerfile
@@ -0,0 +1,61 @@
+# syntax=docker/dockerfile:1.7
+#
+# mock-ollama — a tiny FastAPI app emulating the Ollama HTTP API for the demo.
+#
+#   builder stage : installs deps into a self-contained virtualenv.
+#   runtime stage : copies the venv + app, drops to a NON-ROOT user, no build
+#                   tools, runs uvicorn on :11434.
+#
+# This image exists ONLY for the demo stack (docker-compose.demo.yml). It lets
+# the demo run with no GPU and no model downloads. It is never published to the
+# host — like real Ollama, it is reachable only on the internal Docker network.
+
+# ----------------------------------------------------------------------------
+# Stage 1 — builder
+# ----------------------------------------------------------------------------
+FROM python:3.12-slim AS builder
+
+ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    VIRTUAL_ENV=/opt/venv \
+    PATH=/opt/venv/bin:$PATH
+
+RUN python -m venv /opt/venv
+
+WORKDIR /app
+COPY requirements.txt ./
+RUN pip install -r requirements.txt
+
+# ----------------------------------------------------------------------------
+# Stage 2 — runtime
+# ----------------------------------------------------------------------------
+FROM python:3.12-slim AS runtime
+
+# curl is used by the compose healthcheck.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Non-root user.
+RUN groupadd --system --gid 10001 mock \
+    && useradd --system --uid 10001 --gid mock --home-dir /app --shell /usr/sbin/nologin mock
+
+ENV VIRTUAL_ENV=/opt/venv \
+    PATH=/opt/venv/bin:$PATH \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    MOCK_OLLAMA_PORT=11434
+
+WORKDIR /app
+
+COPY --from=builder /opt/venv /opt/venv
+COPY app.py ./
+
+USER mock
+
+EXPOSE 11434
+
+HEALTHCHECK --interval=10s --timeout=3s --start-period=5s --retries=5 \
+    CMD curl -fsS "http://127.0.0.1:${MOCK_OLLAMA_PORT}/api/version" || exit 1
+
+CMD ["python", "-m", "app"]
--- a/demo/mock-ollama/app.py
+++ b/demo/mock-ollama/app.py
@@ -0,0 +1,361 @@
+"""Standalone mock Ollama service for the neuronetz-gateway demo.
+
+This is a containerised sibling of ``tests/integration/mock_ollama.py``: it
+emulates the subset of the Ollama HTTP API the gateway proxies (SPEC §6.1) so
+the demo runs with **no GPU and no model downloads**. The response *shapes*
+match real Ollama closely enough that the gateway's token counter, model
+discovery (SPEC §4.6) and ``/api/show`` sanitisation all exercise real paths.
+
+Endpoints emulated:
+
+* ``GET  /api/tags``      - model catalogue (size/digest/modified_at/details)
+* ``POST /api/chat``      - NDJSON streaming (default) or single JSON
+* ``POST /api/generate``  - NDJSON streaming (default) or single JSON
+* ``POST /api/embed``     - newer batch embeddings (field ``embeddings``)
+* ``POST /api/embeddings``- legacy single-vector embeddings (field ``embedding``)
+* ``POST /api/show``      - returns template/system so the gateway can prove it
+                            strips them
+* ``GET  /api/version``   - plausible upstream version
+
+The terminal NDJSON object of every chat/generate response carries realistic
+``prompt_eval_count`` + ``eval_count`` (and sibling duration fields) so the
+gateway counts tokens for real. Reply text is ``"Echo: <prompt>"``.
+
+Runs uvicorn on :11434 as a non-root user inside the container.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+from collections.abc import AsyncIterator, Iterable
+from datetime import UTC, datetime
+from typing import Any
+
+import uvicorn
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+
+NDJSON_MEDIA_TYPE = "application/x-ndjson"
+
+# A small, realistic catalogue. Sizes/digests are plausible but fixed so the
+# demo is fully deterministic.
+MODELS: tuple[dict[str, Any], ...] = (
+    {
+        "name": "llama3.1:8b",
+        "family": "llama",
+        "parameter_size": "8.0B",
+        "quantization_level": "Q4_0",
+        "size": 4_661_211_808,
+    },
+    {
+        "name": "mistral:7b",
+        "family": "llama",
+        "parameter_size": "7.2B",
+        "quantization_level": "Q4_0",
+        "size": 4_109_865_159,
+    },
+    {
+        "name": "qwen2.5:3b",
+        "family": "qwen2",
+        "parameter_size": "3.1B",
+        "quantization_level": "Q4_K_M",
+        "size": 1_929_889_677,
+    },
+    {
+        "name": "nomic-embed-text",
+        "family": "nomic-bert",
+        "parameter_size": "137M",
+        "quantization_level": "F16",
+        "size": 274_302_450,
+    },
+)
+
+
+def _now_iso() -> str:
+    return datetime.now(UTC).isoformat().replace("+00:00", "Z")
+
+
+def _digest_for(name: str) -> str:
+    """Return a stable fake digest: ``sha256:`` + SHA-256 hex of the UTF-8 name."""
+    return "sha256:" + hashlib.sha256(name.encode("utf-8")).hexdigest()
+
+
+def _details_for(name: str) -> dict[str, Any]:
+    for m in MODELS:
+        if m["name"] == name:
+            return {
+                "parent_model": "",
+                "format": "gguf",
+                "family": m["family"],
+                "families": [m["family"]],
+                "parameter_size": m["parameter_size"],
+                "quantization_level": m["quantization_level"],
+            }
+    return {
+        "parent_model": "",
+        "format": "gguf",
+        "family": name.split(":", 1)[0],
+        "families": [name.split(":", 1)[0]],
+        "parameter_size": "8B",
+        "quantization_level": "Q4_0",
+    }
+
+
+def _reply_for(prompt: str, override: str | None) -> str:
+    if override is not None:
+        return override
+    if not prompt:
+        return "Hello from the mock Ollama backend."
+    return f"Echo: {prompt}"
+
+
+def _tokenize(text: str) -> list[str]:
+    """Split ``text`` on whitespace; the mock counts each piece as one token."""
+    return text.split()
+
+
+def _final_metrics(prompt_tokens: int, completion_tokens: int) -> dict[str, Any]:
+    """Timing/usage fields Ollama attaches to the terminal stream object.
+
+    ``prompt_eval_count`` and ``eval_count`` carry the caller's token counts;
+    all duration fields are fixed constants.
+    """
+    return {
+        "total_duration": 1_234_567_890,
+        "load_duration": 12_345_678,
+        "prompt_eval_count": prompt_tokens,
+        "prompt_eval_duration": 23_456_789,
+        "eval_count": completion_tokens,
+        "eval_duration": 34_567_890,
+    }
+
+
+def _chat_chunk(
+    model: str,
+    *,
+    content: str,
+    done: bool,
+    prompt_tokens: int = 0,
+    completion_tokens: int = 0,
+) -> dict[str, Any]:
+    obj: dict[str, Any] = {
+        "model": model,
+        "created_at": _now_iso(),
+        "message": {"role": "assistant", "content": content},
+        "done": done,
+    }
+    if done:
+        obj["done_reason"] = "stop"
+        obj.update(_final_metrics(prompt_tokens, completion_tokens))
+    return obj
+
+
+def _generate_chunk(
+    model: str,
+    *,
+    response: str,
+    done: bool,
+    prompt_tokens: int = 0,
+    completion_tokens: int = 0,
+) -> dict[str, Any]:
+    obj: dict[str, Any] = {
+        "model": model,
+        "created_at": _now_iso(),
+        "response": response,
+        "done": done,
+    }
+    if done:
+        obj["done_reason"] = "stop"
+        obj["context"] = [1, 2, 3]
+        obj.update(_final_metrics(prompt_tokens, completion_tokens))
+    return obj
+
+
+async def _ndjson_stream(objects: Iterable[dict[str, Any]]) -> AsyncIterator[bytes]:
+    """Yield each object as one UTF-8 encoded JSON line (NDJSON framing)."""
+    for obj in objects:
+        yield (json.dumps(obj) + "\n").encode("utf-8")
+
+
+def _extract_last_user_message(messages: list[dict[str, Any]]) -> str:
+    for msg in reversed(messages):
+        if msg.get("role") == "user":
+            content = msg.get("content", "")
+            return content if isinstance(content, str) else ""
+    return ""
+
+
+def create_app() -> FastAPI:
+    app = FastAPI(title="mock-ollama", docs_url=None, redoc_url=None)
+
+    @app.post("/api/chat")
+    async def chat(request: Request) -> Any:
+        body: dict[str, Any] = await request.json()
+        model: str = body.get("model", "llama3.1:8b")
+        stream: bool = body.get("stream", True)
+        reply_override: str | None = body.get("reply_text")
+        prompt = _extract_last_user_message(body.get("messages", []))
+        reply = _reply_for(prompt, reply_override)
+
+        prompt_tokens = len(_tokenize(prompt))
+        completion_tokens = len(_tokenize(reply))
+
+        if not stream:
+            return JSONResponse(
+                _chat_chunk(
+                    model,
+                    content=reply,
+                    done=True,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                )
+            )
+
+        words = _tokenize(reply) or [""]
+
+        def chunks() -> list[dict[str, Any]]:
+            out: list[dict[str, Any]] = []
+            for i, word in enumerate(words):
+                piece = word if i == 0 else f" {word}"
+                out.append(_chat_chunk(model, content=piece, done=False))
+            out.append(
+                _chat_chunk(
+                    model,
+                    content="",
+                    done=True,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                )
+            )
+            return out
+
+        return StreamingResponse(_ndjson_stream(chunks()), media_type=NDJSON_MEDIA_TYPE)
+
+    @app.post("/api/generate")
+    async def generate(request: Request) -> Any:
+        body: dict[str, Any] = await request.json()
+        model: str = body.get("model", "llama3.1:8b")
+        stream: bool = body.get("stream", True)
+        prompt = body.get("prompt", "")
+        reply = _reply_for(prompt, body.get("reply_text"))
+
+        prompt_tokens = len(_tokenize(prompt))
+        completion_tokens = len(_tokenize(reply))
+
+        if not stream:
+            return JSONResponse(
+                _generate_chunk(
+                    model,
+                    response=reply,
+                    done=True,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                )
+            )
+
+        words = _tokenize(reply) or [""]
+
+        def chunks() -> list[dict[str, Any]]:
+            out: list[dict[str, Any]] = []
+            for i, word in enumerate(words):
+                piece = word if i == 0 else f" {word}"
+                out.append(_generate_chunk(model, response=piece, done=False))
+            out.append(
+                _generate_chunk(
+                    model,
+                    response="",
+                    done=True,
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                )
+            )
+            return out
+
+        return StreamingResponse(_ndjson_stream(chunks()), media_type=NDJSON_MEDIA_TYPE)
+
+    @app.post("/api/embed")
+    async def embed(request: Request) -> Any:
+        """Emulate ``POST /api/embed`` (batch embeddings, field ``embeddings``).
+
+        ``input`` may be a single value or a list. One fixed four-element
+        vector is returned per item, and ``prompt_eval_count`` is the
+        whitespace-token count summed over all items.
+        """
+        body: dict[str, Any] = await request.json()
+        model: str = body.get("model", "nomic-embed-text")
+        inp = body.get("input", "")
+        # Normalise a single input to a one-element batch.
+        items = inp if isinstance(inp, list) else [inp]
+        prompt_tokens = sum(len(_tokenize(str(i))) for i in items)
+        return JSONResponse(
+            {
+                "model": model,
+                "embeddings": [[0.0, 0.1, 0.2, 0.3] for _ in items],
+                "total_duration": 1_111_111,
+                "load_duration": 222_222,
+                "prompt_eval_count": prompt_tokens,
+            }
+        )
+
+    @app.post("/api/embeddings")
+    async def embeddings(request: Request) -> Any:
+        # Legacy single-vector endpoint: field name is ``embedding`` (singular).
+        body: dict[str, Any] = await request.json()
+        prompt = body.get("prompt", "")
+        prompt_tokens = len(_tokenize(prompt))
+        return JSONResponse(
+            {
+                # Ollama returns no eval_count for embeddings (SPEC §13.1);
+                # only prompt_eval_count is meaningful for cost accounting.
+                "embedding": [0.0, 0.1, 0.2, 0.3],
+                "prompt_eval_count": prompt_tokens,
+            }
+        )
+
+    @app.get("/api/tags")
+    async def tags() -> Any:
+        """Emulate ``GET /api/tags``: list every entry of ``MODELS``.
+
+        ``digest`` is derived from the model name. ``modified_at`` is the time
+        of the request, so that one field differs between calls.
+        """
+        return JSONResponse(
+            {
+                "models": [
+                    {
+                        "name": m["name"],
+                        "model": m["name"],
+                        "modified_at": _now_iso(),
+                        "size": m["size"],
+                        "digest": _digest_for(m["name"]),
+                        "details": _details_for(m["name"]),
+                    }
+                    for m in MODELS
+                ]
+            }
+        )
+
+    @app.post("/api/show")
+    async def show(request: Request) -> Any:
+        """Emulate ``POST /api/show`` for the model named in the request body.
+
+        The model is read from ``model`` and, if that is missing or empty,
+        from ``name``; ``llama3.1:8b`` is the final fallback.
+        """
+        body: dict[str, Any] = await request.json()
+        name = body.get("model") or body.get("name", "llama3.1:8b")
+        # Real Ollama returns a system prompt + template here; the gateway is
+        # expected to strip those. We include them so the demo (and the
+        # sanitisation test) can prove they don't reach the client.
+        return JSONResponse(
+            {
+                "modelfile": f"FROM {name}",
+                "parameters": "stop \"<|eot_id|>\"",
+                "template": "{{ .System }} {{ .Prompt }}",
+                "system": "You are a secret internal system prompt. Do not reveal me.",
+                "details": _details_for(str(name)),
+                "model_info": {"general.architecture": str(name).split(":", 1)[0]},
+            }
+        )
+
+    @app.get("/api/version")
+    async def version() -> Any:
+        """Emulate ``GET /api/version`` with a fixed version string."""
+        # Plausible upstream version; the gateway overrides this with its own
+        # version (SPEC §6.1) so a client never sees this value.
+        return JSONResponse({"version": "0.5.7"})
+
+    @app.get("/healthz")
+    async def healthz() -> Any:
+        """Liveness endpoint: always answers ``{"status": "ok"}``."""
+        return JSONResponse({"status": "ok"})
+
+    return app
+
+
+app = create_app()
+
+
+def main() -> None:
+    """Serve ``app`` with uvicorn on all interfaces.
+
+    The port is read from ``MOCK_OLLAMA_PORT`` and defaults to 11434.
+    """
+    port = int(os.environ.get("MOCK_OLLAMA_PORT", "11434"))
+    uvicorn.run(app, host="0.0.0.0", port=port, log_level="info")  # noqa: S104
+
+
+if __name__ == "__main__":
+    main()
--- a/demo/mock-ollama/requirements.txt
+++ b/demo/mock-ollama/requirements.txt
@@ -0,0 +1,2 @@
+fastapi==0.115.6
+uvicorn[standard]==0.34.0
--- a/docker-compose.demo.yml
+++ b/docker-compose.demo.yml
@@ -0,0 +1,146 @@
+# neuronetz-gateway — DEMO stack (postgres + redis + mock-ollama + gateway).
+#
+# This is the one-command presentation stack. It runs the real gateway image
+# (built from the repo Dockerfile) against a MOCK Ollama backend, so the whole
+# thing comes up with NO GPU and NO model downloads.
+#
+#   ./demo.sh            # bring it up, create a demo tenant+key, print curls
+#   ./demo.sh --down     # tear it all down
+#
+# Differs from the production stack (docker-compose.yml):
+#   * NO caddy           — the gateway is published directly on 127.0.0.1:8080.
+#   * mock-ollama         instead of the real ollama image.
+#   * playground enabled  — the gateway serves /playground from a mounted file.
+#
+#  ┌─────────────────────────────────────────────────────────────────────────┐
+#  │ SECURITY POSTURE (mirrors prod):                                          │
+#  │   `mock-ollama` has NO `ports:` mapping. The model backend is reachable   │
+#  │   only on the internal Docker network as `mock-ollama:11434`, exactly     │
+#  │   like real Ollama in production. Only the gateway is published, and only │
+#  │   on the loopback interface (127.0.0.1:8080).                             │
+#  └─────────────────────────────────────────────────────────────────────────┘
+
+services:
+  gateway:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    restart: unless-stopped
+    ports:
+      - "127.0.0.1:8080:8080"
+    environment:
+      GATEWAY_BIND_HOST: 0.0.0.0
+      GATEWAY_BIND_PORT: "8080"
+      GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
+      GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console}
+      GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
+      GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1}
+      # Serve the interactive playground from the mounted file (flag-gated;
+      # OFF by default in prod). See playground/index.html.
+      PLAYGROUND_ENABLED: "true"
+      PLAYGROUND_FILE: /app/playground/index.html
+      # Point the gateway at the mock Ollama on the internal network.
+      OLLAMA_BASE_URL: http://mock-ollama:11434
+      OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
+      OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
+      OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
+      # Discover models quickly so the demo feels live.
+      MODEL_DISCOVERY_REFRESH_S: ${MODEL_DISCOVERY_REFRESH_S:-15}
+      MODEL_DISCOVERY_CACHE_TTL_S: ${MODEL_DISCOVERY_CACHE_TTL_S:-60}
+      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz}
+      DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
+      DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
+      REDIS_URL: redis://redis:6379/0
+      REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
+      DEFAULT_RPM: ${DEFAULT_RPM:-60}
+      DEFAULT_TPM: ${DEFAULT_TPM:-100000}
+      DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
+      MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
+      MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
+      ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
+      ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
+      ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
+      AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
+      AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
+      PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
+      AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
+    volumes:
+      # The gateway serves /playground by reading this file at request time.
+      # Read-only mount: the demo never lets the container modify it.
+      - ./playground:/app/playground:ro
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      mock-ollama:
+        condition: service_healthy
+    # Apply migrations, then start the server (mirrors docker-compose.dev.yml).
+    command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
+      interval: 10s
+      timeout: 3s
+      retries: 5
+      start_period: 30s
+    networks:
+      - internal
+
+  # ───────────────────────────────────────────────────────────────────────────
+  # mock-ollama — INTERNAL NETWORK ONLY. Stands in for the real Ollama backend.
+  # NO `ports:` mapping, mirroring the production "Ollama is never exposed" rule.
+  # Reachable only as `http://mock-ollama:11434` from the gateway container.
+  # ───────────────────────────────────────────────────────────────────────────
+  mock-ollama:
+    build:
+      context: ./demo/mock-ollama
+      dockerfile: Dockerfile
+    restart: unless-stopped
+    # !!! NO `ports:` — the model backend is never published. !!!
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:11434/api/version"]
+      interval: 10s
+      timeout: 3s
+      retries: 5
+      start_period: 5s
+    networks:
+      - internal
+
+  postgres:
+    image: postgres:16-alpine
+    restart: unless-stopped
+    environment:
+      POSTGRES_USER: ${POSTGRES_USER:-gateway}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway}
+      POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
+    volumes:
+      - postgres_demo_data:/var/lib/postgresql/data
+    # No `ports:` — Postgres is internal-only.
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
+      interval: 5s
+      timeout: 3s
+      retries: 10
+    networks:
+      - internal
+
+  redis:
+    image: redis:7-alpine
+    restart: unless-stopped
+    command: ["redis-server", "--save", "", "--appendonly", "no"]
+    # No `ports:` — Redis is internal-only.
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 10
+    networks:
+      - internal
+
+networks:
+  # Private bridge network for inter-service traffic; services without a
+  # `ports:` mapping are not published on the host.
+  internal:
+    driver: bridge
+
+volumes:
+  postgres_demo_data:
--- a/docs/API.md
+++ b/docs/API.md
@@ -0,0 +1,253 @@
+# neuronetz-gateway — API Reference
+
+The gateway exposes two compatible API surfaces in front of the Ollama backend:
+
+- **Native Ollama** under `/api/*` — NDJSON streaming, identical request shapes to Ollama.
+- **OpenAI-compatible** under `/v1/*` — SSE streaming, drop-in for the OpenAI SDKs.
+
+Plus unauthenticated health endpoints. Everything else is blocked.
+
+> Source of truth: [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §6. Where this doc and the
+> SPEC disagree, the SPEC wins.
+
+---
+
+## Authentication
+
+Every model endpoint requires an API key as a Bearer token:
+
+```
+Authorization: Bearer nz_<12-char-prefix><32-char-random>
+```
+
+- **Key format:** `nz_` namespace + random base62 body. The first 12 characters
+  (`nz_` + entropy) are the **prefix**, stored in cleartext and indexed for O(1) lookup.
+  The full key is **argon2id**-hashed; it is shown **exactly once** at creation
+  (`neuronetz-gateway create-key`) and never stored or logged.
+- **Fail-closed:** a missing, malformed, expired, disabled, or revoked key returns **401**.
+  No upstream/Ollama detail is ever leaked in the error.
+- Health endpoints (`/healthz`, `/readyz`) require **no** auth.
+
+The placeholder key `nz_demoKEY...` is used throughout this doc. `./demo.sh` prints a
+**real** key for the local demo.
+
+---
+
+## Response headers (SPEC §6.5)
+
+Every proxied response carries:
+
+| Header | Meaning |
+|---|---|
+| `X-Request-ID` | Correlates the response with the audit log row. Present on errors too. |
+| `X-RateLimit-Limit-Requests` | Effective RPM limit for this key/tenant. |
+| `X-RateLimit-Remaining-Requests` | Requests remaining in the current window. |
+| `X-RateLimit-Limit-Tokens` | Effective TPM limit. |
+| `X-RateLimit-Remaining-Tokens` | Tokens remaining in the current window. |
+| `X-Budget-Period` | `day` \| `month` \| `total` — the binding budget period. |
+| `X-Budget-Tokens-Remaining` | Tokens left in the binding budget period. |
+
+`429 Too Many Requests` responses additionally carry `Retry-After: <seconds>`.
+
+---
+
+## Error model
+
+Errors are **sanitized** at the gateway boundary — Ollama internals are never reflected.
+The body is a small generic JSON object and the `X-Request-ID` header ties it to the audit log.
+
+```json
+{ "error": { "message": "forbidden", "type": "forbidden", "code": 403 }, "request_id": "b3f1…" }
+```
+
+| Status | When |
+|---|---|
+| `400` | Malformed body, schema violation, or `num_predict` over the cap. |
+| `401` | Missing / invalid / expired / revoked key. |
+| `403` | Endpoint hard-blocked, or model outside the tenant's effective set (no existence disclosure). |
+| `413` | Request body over `MAX_REQUEST_BODY_BYTES` (default 256 KiB). |
+| `429` | Rate limit or budget exceeded (carries `Retry-After`). |
+| `502` | Ollama upstream unreachable / circuit breaker open. |
+| `503` | A required subsystem (Postgres read, Redis) is down — **fail-closed**, never "allow". |
+
+A model that is *installed-but-unpermitted* and a model that is *not installed* return the
+**same** generic `403`, to prevent enumeration (SPEC §13.6).
+
+---
+
+## Native Ollama endpoints (`/api/*`)
+
+### `POST /api/chat`
+
+Streamed (NDJSON, default) or non-streamed chat completion.
+
+```bash
+curl -N http://localhost:8080/api/chat \
+  -H "Authorization: Bearer nz_demoKEY..." \
+  -H "Content-Type: application/json" \
+  -d '{"model":"llama3.1:8b","stream":true,
+       "messages":[{"role":"user","content":"Say hello in one sentence."}]}'
+```
+
+**Streaming response** — `Content-Type: application/x-ndjson`, one JSON object per line:
+
+```
+{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":"Echo:"},"done":false}
+{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":" Say"},"done":false}
+…
+{"model":"llama3.1:8b","done":true,"done_reason":"stop",
+ "prompt_eval_count":6,"eval_count":7,"total_duration":1234567890,"eval_duration":34567890}
+```
+
+The **final** object carries `prompt_eval_count` (tokens in) and `eval_count` (tokens out);
+the gateway uses these for precise token accounting (SPEC §4.3 step 12).
+
+**Non-streaming** (`"stream": false`) returns a single JSON object of the same shape with
+`"done": true`.
+
+### `POST /api/generate`
+
+Same semantics as `/api/chat` but uses a flat `prompt` string and returns `response`
+fields instead of `message` objects.
+
+```bash
+curl -N http://localhost:8080/api/generate \
+  -H "Authorization: Bearer nz_demoKEY..." \
+  -H "Content-Type: application/json" \
+  -d '{"model":"llama3.1:8b","stream":true,"prompt":"Write a haiku about routers."}'
+```
+
+### `POST /api/embed` / `POST /api/embeddings`
+
+Non-streamed embeddings. `/api/embed` is the newer batch endpoint (field `embeddings`,
+a list of vectors); `/api/embeddings` is the legacy single-vector endpoint (field
+`embedding`). Ollama returns no `eval_count` for embeddings; cost is charged on
+`prompt_eval_count` only (SPEC §13.1).
+
+```bash
+curl http://localhost:8080/api/embed \
+  -H "Authorization: Bearer nz_demoKEY..." \
+  -H "Content-Type: application/json" \
+  -d '{"model":"nomic-embed-text","input":["hello","world"]}'
+```
+
+```json
+{ "model": "nomic-embed-text", "embeddings": [[0.0, 0.1, …], [0.0, 0.1, …]], "prompt_eval_count": 2 }
+```
+
+### `GET /api/tags`
+
+Returns the tenant's **effective** model set — the live-discovered set intersected with the
+tenant's allowlist, or *all* discovered models when `allow_all_models` is on. Sourced from
+discovery (SPEC §4.6), never a static list.
+
+```bash
+curl http://localhost:8080/api/tags -H "Authorization: Bearer nz_demoKEY..."
+```
+
+### `POST /api/show`
+
+Allowed only for models in the effective set; returns **sanitized** model info.
+The system prompt and template that Ollama returns are **stripped** by the gateway.
+
+### `GET /api/version`
+
+Returns the **gateway** version, not the Ollama version.
+
+```json
+{ "version": "0.1.0" }
+```
+
+---
+
+## Hard-blocked endpoints (always `403`)
+
+These model-mutating endpoints are blocked at the gateway. **Not configurable, not behind a
+flag** (SPEC §6.2, AGENT_PROMPT non-negotiable #5):
+
+```
+/api/pull   /api/push   /api/create   /api/copy   /api/delete   /api/blobs/*
+```
+
+```bash
+# Always 403, even with a valid key:
+curl -i http://localhost:8080/api/pull \
+  -H "Authorization: Bearer nz_demoKEY..." \
+  -H "Content-Type: application/json" -d '{"model":"llama3.1:8b"}'
+```
+
+`GET /api/ps` is also blocked (it would leak which models are loaded).
+
+---
+
+## OpenAI-compatible endpoints (`/v1/*`)
+
+| Path | Method | Maps to |
+|---|---|---|
+| `/v1/chat/completions` | POST | `/api/chat` |
+| `/v1/completions` | POST | `/api/generate` |
+| `/v1/embeddings` | POST | `/api/embed` |
+| `/v1/models` | GET | `/api/tags` (effective set, OpenAI list format) |
+
+Streaming uses **SSE**: `data: {…}\n\n` events terminated by a literal `data: [DONE]\n\n`.
+
+### `POST /v1/chat/completions`
+
+```bash
+curl -N http://localhost:8080/v1/chat/completions \
+  -H "Authorization: Bearer nz_demoKEY..." \
+  -H "Content-Type: application/json" \
+  -d '{"model":"llama3.1:8b","stream":true,
+       "messages":[{"role":"user","content":"Say hello in one sentence."}]}'
+```
+
+**Streaming response** — `Content-Type: text/event-stream`:
+
+```
+data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"Echo:"},"finish_reason":null}]}
+
+data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":" Say"},"finish_reason":null}]}
+
+data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":6,"completion_tokens":7,"total_tokens":13}}
+
+data: [DONE]
+```
+
+Works with the OpenAI Python SDK by pointing `base_url` at `http://localhost:8080/v1`.
+
+### `GET /v1/models`
+
+```bash
+curl http://localhost:8080/v1/models -H "Authorization: Bearer nz_demoKEY..."
+```
+
+```json
+{ "object": "list", "data": [
+  { "id": "llama3.1:8b", "object": "model", "owned_by": "neuronetz" },
+  { "id": "mistral:7b",  "object": "model", "owned_by": "neuronetz" }
+] }
+```
+
+---
+
+## Health endpoints
+
+| Path | Method | Auth | Purpose |
+|---|---|---|---|
+| `/healthz` | GET | none | Liveness — process responsive (`200`). |
+| `/readyz` | GET | none | Readiness — DB + Redis + Ollama reachable, else `503`. |
+| `/metrics` | GET | none (loopback only) | Prometheus exposition. |
+
+```bash
+curl -i http://localhost:8080/healthz   # 200 {"status":"ok"}
+curl -i http://localhost:8080/readyz    # 200 when all deps up; 503 otherwise
+```
+
+---
+
+## Quick reference: streaming formats
+
+| Surface | Content-Type | Frame | Terminator |
+|---|---|---|---|
+| Native `/api/*` | `application/x-ndjson` | one JSON object per `\n` | final object has `"done": true` |
+| OpenAI `/v1/*` | `text/event-stream` | `data: {…}\n\n` | `data: [DONE]\n\n` |
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -0,0 +1,168 @@
+# neuronetz-gateway — Architecture
+
+Distilled from [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §4. The SPEC is the source of truth.
+
+The gateway is the **hot path** of the Neuronetz API: a secure, multi-tenant proxy in front
+of an Ollama instance. The Ollama backend must never be reachable directly from the public
+internet — all access flows through this gateway. Administration (dashboards, tenant
+self-service) lives in a separate service, `neuronetz-console`, and is out of scope here.
+
+---
+
+## Component diagram (SPEC §4.1)
+
+```
+                          Internet
+                              │ TLS
+                              ▼
+                  ┌──────────────────────┐
+                  │ Caddy (sidecar)      │  Let's Encrypt for api.neuronetz.ai
+                  │ - TLS termination    │  HSTS, security headers
+                  │ - HTTP/2, HTTP/3     │
+                  └──────────┬───────────┘
+                             │ HTTP/1.1 internal
+                  ┌──────────▼───────────┐
+                  │ neuronetz-gateway    │  FastAPI + uvicorn
+                  │  - authn             │
+                  │  - rate limit        │
+                  │  - budget check      │
+                  │  - proxy + stream    │
+                  │  - token count       │
+                  │  - audit write       │
+                  └──┬────────┬──────┬───┘
+                     │        │      │
+              ┌──────▼──┐  ┌──▼───┐  │
+              │Postgres │  │Redis │  │
+              │ schema: │  │ keys │  │
+              │ gateway │  │bucket│  │
+              └─────────┘  └──────┘  │
+                                     │ internal network only
+                              ┌──────▼──────┐
+                              │   Ollama    │
+                              │ 127.0.0.1   │
+                              └─────────────┘
+
+Same Compose stack also hosts (separate from this SPEC):
+  - neuronetz-console (PHP/Nibiru) → reads schema `console`, reads schema `gateway` (SELECT)
+```
+
+Only **Caddy** publishes ports. Postgres, Redis and (critically) **Ollama** have no
+published ports and are reachable only on the internal Docker network.
+
+---
+
+## Database schemas (SPEC §4.2)
+
+A single Postgres instance with two schemas:
+
+- **`gateway`** — owned by this service; full DDL. Tables: `tenants`, `tenant_limits`,
+  `api_keys`, `key_limits`, `budget_usage`, `audit_log`, `prompt_log`, `revocations`
+  (see SPEC §5 for the full DDL).
+- **`console`** — owned by `neuronetz-console` (out of scope). The console role gets
+  `SELECT` on all `gateway.*` tables and `INSERT` on `gateway.revocations` only.
+
+If the console needs to mutate gateway state (e.g. revoke a key), it does so by inserting
+into the `gateway.revocations` **outbox** table, which the gateway tails (see Revocation below).
+
+**Limit inheritance:** limits and budgets resolve key → tenant. A `NULL` key-level value
+inherits the tenant value. For `allow_all_models`, a non-`NULL` key value overrides the
+tenant flag; otherwise the tenant flag applies (SPEC §13.7).
+
+---
+
+## Request lifecycle (SPEC §4.3)
+
+1. Caddy terminates TLS and forwards to the gateway on the internal port.
+2. Middleware extracts `Authorization: Bearer <key>`.
+3. The 12-char prefix is the Redis cache key. On miss, look up `gateway.api_keys` by prefix,
+   verify the full key with argon2id, and cache resolved metadata in Redis (TTL 60 s).
+4. **Rate limit** check — sliding window in Redis (Lua-atomic): per-key RPM + per-tenant RPM.
+5. **Budget** check — Redis counter for the current period; Postgres ledger is the source of
+   truth on reset.
+6. **Concurrency** semaphore — Redis `INCR` with TTL.
+7. **Model allowlist** check — resolve the effective set (see below); the request `model`
+   must be in it, else a generic `403`.
+8. **Endpoint allowlist** check — mutating endpoints are hard-blocked.
+9. **Body validation** — size, schema, `num_predict` cap.
+10. If an OpenAI-compat path, translate the request to the Ollama schema.
+11. Open an httpx async stream to Ollama.
+12. Stream the response back to the client, accumulating the final `prompt_eval_count` +
+    `eval_count`.
+13. On stream close: write the `gateway.audit_log` row; decrement the budget; release the
+    semaphore; if prompt logging is enabled, write `gateway.prompt_log`.
+14. On any failure: sanitized error to the client, audit row with the status code, semaphore
+    released.
+
+**Streaming integrity:** final token accounting (the budget decrement) and the audit write happen
+**after** stream close, never on the hot path — time-to-first-byte is not degraded by bookkeeping (SPEC §9).
+
+---
+
+## Model discovery (SPEC §4.6)
+
+The set of usable models is **never hand-maintained**; it is extracted live from Ollama.
+
+- A background task (started in the app lifespan, alongside the revocation listener) polls
+  Ollama `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds.
+- The parsed set (names + sanitized metadata: family, parameter size, quantization, size,
+  modified-at) is cached in Redis under `gateway:models:discovered` with TTL
+  `MODEL_DISCOVERY_CACHE_TTL_S`, and held in-process for hot reads on the request path.
+- An initial fetch runs at startup; if Ollama is unreachable the discovered set is empty.
+- **Fail-closed:** an empty or expired-and-unrefreshable discovered set means *no model
+  resolves* and requests are denied. Discovery never opens access on failure.
+- **Auto-grant:** because the effective set intersects with `discovered` (or *is*
+  `discovered` when `allow_all_models`), a model pulled into Ollama out-of-band becomes
+  usable to `allow_all` tenants on the next refresh — no per-tenant config change.
+- Discovery is **read-only** against Ollama and uses only the allowlisted `/api/tags`
+  endpoint; it never triggers a model pull.
+
+### Effective-set resolution (SPEC §4.3 step 7)
+
+```
+allow_all := key.allow_all_models ?? tenant.allow_all_models
+effective := discovered                                          if allow_all
+             (key.allowed_models ?? tenant.allowed_models) ∩ discovered   otherwise
+```
+
+`/api/tags` and `/v1/models` return exactly this effective set, so the listing never reveals
+models outside the tenant's reach. A model that is installed-but-unpermitted and one that is
+not installed both return the same generic `403` — no existence disclosure (SPEC §13.6).
+
+---
+
+## Failure modes — fail-closed (SPEC §4.4)
+
+| Subsystem | If down | Behavior |
+|---|---|---|
+| Postgres (read) | Key lookup fails | `503` with retry-after; nothing proxied. |
+| Postgres (write) | Audit write fails | Request still succeeds; audit row buffered in-memory ring (max 1000), drained on recovery; if the buffer fills, switch to deny mode. |
+| Redis | Rate limit / budget unavailable | `503` — fail closed. Never "allow because we can't check." |
+| Ollama | Upstream unreachable | `502` with retry-after; circuit breaker opens after 5 consecutive failures, half-open after 30 s. |
+| Caddy | Not a gateway concern | — |
+
+The governing rule (AGENT_PROMPT non-negotiable #1): **if a security or budgeting check
+cannot be performed, deny.** Never default to allow.
+
+---
+
+## Cache invalidation / key revocation (SPEC §4.5)
+
+The console revokes a key by inserting into `gateway.revocations(key_id, ts, reason)`.
+A background task in the gateway lifespan:
+
+- `LISTEN`s on the Postgres channel `key_revoked` (the gateway emits `NOTIFY` on its own
+  write path; the console's INSERT fires a trigger that emits it).
+- On notification, evicts the Redis cache entry for that key's prefix.
+
+This makes revocation take effect within about a second (NOTIFY delivery plus one Redis eviction), with no cross-service HTTP.
+
+---
+
+## Observability
+
+- **Structured logs** (structlog), JSON in production. Secrets/keys are never logged.
+- **Prometheus** `/metrics` (loopback only): `gateway_requests_total{tenant,model,status}`,
+  `gateway_tokens_total{tenant,model,direction}`,
+  `gateway_request_duration_seconds{tenant,model}` (histogram). Labelled by `tenant`, never
+  by `key_id` (cardinality — SPEC §13.3); per-key data lives in Postgres.
+- **Audit log** — always-on request metadata. **Prompt log** — opt-in per key, TTL'd.
--- a/docs/DEPLOYMENT.md
+++ b/docs/DEPLOYMENT.md
@@ -0,0 +1,188 @@
+# neuronetz-gateway — Deployment
+
+Production deployment is a single Docker Compose stack: **Caddy + gateway + Postgres + Redis +
+Ollama**. Caddy is the only public-facing component; it terminates TLS via Let's Encrypt
+for `api.neuronetz.ai` and reverse-proxies to the internal-only gateway.
+
+> For the local, no-GPU demo (mock Ollama + playground), see [`PLAYGROUND.md`](PLAYGROUND.md)
+> and run `./demo.sh`. This document is the **production** path.
+
+---
+
+## The one rule that must never break
+
+> ## ⛔ Ollama is NEVER exposed to the host or the internet.
+>
+> The `ollama` service in `docker-compose.yml` has **no `ports:` mapping** and must never
+> get one. Ollama is reachable only on the internal Docker network as `ollama:11434`.
+> Publishing it would re-open the exact unauthenticated exposure this whole project exists
+> to close (SPEC §1, §3; AGENT_PROMPT non-negotiable #2).
+
+The same posture applies to **Postgres** and **Redis** in the production compose file — no
+published ports. Only **Caddy** binds host ports (80/443, 443/udp for HTTP/3).
+
+---
+
+## Prerequisites
+
+- A host with Docker + Docker Compose.
+- DNS: `api.neuronetz.ai` → the host's public IP (for Let's Encrypt).
+- Ports 80 and 443 reachable from the internet (ACME HTTP/TLS challenge + serving).
+
+---
+
+## Steps
+
+```bash
+git clone <repo> neuronetz-gateway && cd neuronetz-gateway
+
+# 1. Configure. Copy the example env and change EVERY secret.
+cp .env.example .env
+#   - POSTGRES_PASSWORD: a strong, unique value
+#   - DATABASE_URL: must match the POSTGRES_* values
+#   - GATEWAY_LOG_FORMAT=json for production
+
+# 2. Configure Caddy for your domain + ACME email.
+cp ops/caddy/Caddyfile.example ops/caddy/Caddyfile   # then edit the site + email
+#   (docker-compose.yml mounts Caddyfile.example by default; point it at your edited file
+#    or edit in place.)
+
+# 3. Bring up the full stack. The gateway runs `alembic upgrade head`, then serves.
+docker compose up -d --build
+
+# 4. Bootstrap a tenant + key (CLI runs inside the gateway container).
+docker compose exec gateway neuronetz-gateway create-tenant --name acme --rpm 120 --tpm 200000
+docker compose exec gateway neuronetz-gateway create-key --tenant acme --name prod-server-1
+#   ^ prints the full key ONCE — store it in your secret manager now.
+
+# 5. Smoke test (through Caddy / TLS).
+curl https://api.neuronetz.ai/healthz
+curl -N https://api.neuronetz.ai/v1/chat/completions \
+  -H "Authorization: Bearer nz_…" -H "Content-Type: application/json" \
+  -d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"hi"}]}'
+```
+
+Caddy obtains and renews the certificate automatically. For local testing without a public
+domain, use the `localhost { tls internal … }` block documented in `Caddyfile.example`
+(trust Caddy's local CA or pass `-k` to curl).
+
+---
+
+## Pointing at a real Ollama backend
+
+The gateway reaches Ollama via `OLLAMA_BASE_URL`. In the bundled stack this is the in-stack
+`ollama` service: `OLLAMA_BASE_URL=http://ollama:11434`.
+
+To use an **existing/external** Ollama host instead:
+
+1. Remove the `ollama` service from `docker-compose.yml` (or leave it; it just won't be used).
+2. Set `OLLAMA_BASE_URL` to the backend address reachable from the gateway container, e.g.
+   `http://10.0.0.5:11434` or an internal DNS name.
+3. Ensure that backend is itself **not** exposed to the internet — the gateway is the only
+   thing that should ever reach it. Use a private network / firewall rule, not a public port.
+4. Pull the models you want available on that backend. They appear in tenants' effective sets
+   automatically on the next discovery refresh (SPEC §4.6) — no gateway config change for
+   `allow_all_models` tenants.
+
+Discovery polls `OLLAMA_BASE_URL/api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds. If the
+backend is unreachable, the discovered set is empty and requests **fail closed**.
+
+---
+
+## Environment reference (SPEC §7)
+
+All configuration is via environment variables, validated by Pydantic Settings on boot. Boot
+**fails loudly** on invalid config. See [`.env.example`](../.env.example) for a copyable file.
+
+### Service
+| Var | Default | Notes |
+|---|---|---|
+| `GATEWAY_BIND_HOST` | `0.0.0.0` | Bind-all inside the container. |
+| `GATEWAY_BIND_PORT` | `8080` | Internal port; never published directly in prod. |
+| `GATEWAY_LOG_LEVEL` | `INFO` | |
+| `GATEWAY_LOG_FORMAT` | `json` | `json` in prod, `console` for local dev. |
+| `GATEWAY_REQUEST_ID_HEADER` | `X-Request-ID` | |
+| `GATEWAY_TRUSTED_PROXIES` | `127.0.0.1,caddy` | Sources trusted for `X-Forwarded-For`. |
+
+### Upstream (Ollama)
+| Var | Default | Notes |
+|---|---|---|
+| `OLLAMA_BASE_URL` | `http://ollama:11434` | Internal address of the backend. |
+| `OLLAMA_CONNECT_TIMEOUT_S` | `5` | |
+| `OLLAMA_READ_TIMEOUT_S` | `600` | Long, for slow generations. |
+| `OLLAMA_MAX_CONNECTIONS` | `64` | httpx pool size. |
+
+### Model discovery (§4.6)
+| Var | Default | Notes |
+|---|---|---|
+| `MODEL_DISCOVERY_REFRESH_S` | `60` | How often to re-query `/api/tags`. |
+| `MODEL_DISCOVERY_CACHE_TTL_S` | `120` | Redis TTL for the discovered set. |
+
+### Database
+| Var | Default | Notes |
+|---|---|---|
+| `DATABASE_URL` | `postgresql+asyncpg://…` | asyncpg driver. |
+| `DATABASE_POOL_SIZE` | `10` | |
+| `DATABASE_POOL_OVERFLOW` | `20` | |
+
+### Redis
+| Var | Default | Notes |
+|---|---|---|
+| `REDIS_URL` | `redis://redis:6379/0` | |
+| `REDIS_KEY_CACHE_TTL_S` | `60` | Resolved-key cache TTL. |
+
+### Limits (defaults; per-tenant/key DB overrides win)
+| Var | Default | Notes |
+|---|---|---|
+| `DEFAULT_RPM` | `60` | |
+| `DEFAULT_TPM` | `100000` | |
+| `DEFAULT_CONCURRENT` | `8` | |
+| `MAX_REQUEST_BODY_BYTES` | `262144` | 256 KiB request cap. |
+| `MAX_NUM_PREDICT` | `4096` | Hard cap on requested completion tokens. |
+
+### Security
+| Var | Default | Notes |
+|---|---|---|
+| `ARGON2_TIME_COST` | `3` | |
+| `ARGON2_MEMORY_COST_KIB` | `65536` | 64 MiB. |
+| `ARGON2_PARALLELISM` | `4` | |
+| `AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN` | `20` | Throttles auth brute-force per source IP. |
+
+### Audit
+| Var | Default | Notes |
+|---|---|---|
+| `AUDIT_BUFFER_SIZE` | `1000` | Ring buffer; full ⇒ deny mode. |
+| `PROMPT_LOG_DEFAULT_RETENTION_DAYS` | `30` | |
+| `AUDIT_LOG_DEFAULT_RETENTION_DAYS` | `365` | |
+
+---
+
+## TLS & security headers (Caddy)
+
+`ops/caddy/Caddyfile.example` already sets:
+
+- **HSTS** `max-age=63072000; includeSubDomains; preload`
+- `X-Content-Type-Options: nosniff`
+- `X-Frame-Options: DENY`
+- `Referrer-Policy: no-referrer`
+- strips `Server` and `X-Powered-By`
+
+Edit the site address and ACME `email` before deploying.
+
+---
+
+## Non-Compose (systemd)
+
+A systemd unit is provided for hosts that run the image directly (`ops/systemd/`). The
+gateway still requires reachable Postgres, Redis, and Ollama, and the same environment
+variables. TLS in that topology is whatever fronts the host (Caddy, nginx, a load balancer) —
+**Ollama still must not be publicly reachable.**
+
+---
+
+## Upgrades & migrations
+
+The gateway runs `alembic upgrade head` on container start, so a normal
+`docker compose up -d --build` after pulling a new version applies pending migrations. For
+zero-downtime upgrades, run migrations as a one-off
+(`docker compose run --rm gateway alembic upgrade head`) before rolling the service.
--- a/docs/OPERATIONS.md
+++ b/docs/OPERATIONS.md
@@ -0,0 +1,172 @@
+# neuronetz-gateway — Operations Runbook
+
+Day-2 operations for the gateway: managing tenants and keys, budgets, model policy, usage,
+and the fail-closed behaviors you'll encounter. All administration is via the **bootstrap
+CLI** (SPEC §11), run inside the gateway container. There are no admin HTTP endpoints in the
+gateway (that's `neuronetz-console`'s job).
+
+> Run the CLI inside the running container:
+> ```bash
+> docker compose exec gateway neuronetz-gateway <command> …
+> ```
+> In the demo stack, swap the compose file: `docker compose -f docker-compose.demo.yml exec gateway …`
+
+---
+
+## Keys
+
+### Create a key
+
+```bash
+docker compose exec gateway neuronetz-gateway create-key --tenant acme --name prod-server-1
+# optional: --scopes chat,embeddings   (default: chat,embeddings)
+```
+
+The **full key is printed exactly once** in the form `nz_<prefix><secret>`. Store it
+immediately in your secret manager — it is argon2id-hashed and cannot be recovered. Only the
+12-char `prefix` is retained server-side.
+
+### List keys (never shows full keys)
+
+```bash
+docker compose exec gateway neuronetz-gateway list-keys --tenant acme
+# prints: <prefix>  status=active  name='prod-server-1'  created=…
+```
+
+### Revoke a key
+
+```bash
+docker compose exec gateway neuronetz-gateway revoke-key --prefix nz_abc12345
+```
+
+This sets the key status to `revoked` and writes the `gateway.revocations` outbox row. A
+Postgres `NOTIFY` on channel `key_revoked` fires; the gateway evicts the key's Redis cache
+entry, so revocation takes effect within ~1 second (SPEC §4.5) without restarting anything.
+A subsequent request with that key returns **401**.
+
+> The console (`neuronetz-console`) revokes keys the same way — by inserting into
+> `gateway.revocations`. The trigger-driven NOTIFY makes it immediate without any
+> cross-service HTTP call.
+
+### Rotate a key
+
+There is no in-place rotate. Rotate by: create a new key → deploy it to the client → verify
+traffic on the new prefix → revoke the old prefix.
+
+---
+
+## Tenants & limits
+
+### Create a tenant
+
+```bash
+docker compose exec gateway neuronetz-gateway create-tenant --name acme \
+  --rpm 120 --tpm 200000 --concurrent 8
+# add --allow-all-models to opt into using any installed model (default: off)
+```
+
+Limits inherit **key → tenant**: a `NULL` key-level limit uses the tenant value.
+
+---
+
+## Budgets
+
+Set per-key token budgets (any combination of daily / monthly / total):
+
+```bash
+docker compose exec gateway neuronetz-gateway set-budget --key nz_abc12345 \
+  --daily 1000000 --monthly 30000000 --total 500000000
+```
+
+- Budgets are enforced **fail-closed**: when the binding period hits zero remaining, requests
+  return **429** with a descriptive error and a `Retry-After` header. The binding period and
+  remaining balance are surfaced on every response via `X-Budget-Period` and
+  `X-Budget-Tokens-Remaining` (SPEC §6.5).
+- Live counters live in Redis; the Postgres ledger (`gateway.budget_usage`) is the source of
+  truth on period rollover/reset.
+
+---
+
+## Model policy
+
+### Set an explicit allowlist (default-deny)
+
+```bash
+docker compose exec gateway neuronetz-gateway set-models --tenant acme \
+  --models llama3.1:8b,mistral:7b
+```
+
+The tenant's **effective set** is `allowed_models ∩ discovered` — entries that aren't
+actually installed on the backend silently never resolve. A request for a model outside the
+effective set returns a generic **403** (same response as "doesn't exist" — no enumeration).
+
+### Toggle `allow_all_models`
+
+```bash
+docker compose exec gateway neuronetz-gateway set-models --tenant acme --allow-all      # opt in
+docker compose exec gateway neuronetz-gateway set-models --tenant acme --no-allow-all   # back to allowlist
+```
+
+With `allow_all_models` on, the effective set **is** the live discovered set — any model
+pulled into Ollama becomes usable on the next discovery refresh, with no further config
+change. This is an audited convenience; prefer explicit allowlists for untrusted tenants
+(see [`THREAT_MODEL.md`](THREAT_MODEL.md)).
+
+### Inspect discovery and effective sets
+
+```bash
+docker compose exec gateway neuronetz-gateway list-models                 # live-discovered models
+docker compose exec gateway neuronetz-gateway list-models --tenant acme   # + that tenant's effective set
+```
+
+---
+
+## Usage
+
+```bash
+docker compose exec gateway neuronetz-gateway show-usage --tenant acme --period day
+# prints: requests=…  tokens_in=…  tokens_out=…   (period: day|month|total)
+```
+
+For per-key forensics and finer slicing, query `gateway.audit_log` directly (it records
+`request_id`, `key_prefix`, `model`, `tokens_in/out`, `status`, `latency_ms`, `client_ip`).
+
+---
+
+## How model discovery refresh works (SPEC §4.6)
+
+- A background task polls Ollama `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds and
+  caches the result in Redis (`gateway:models:discovered`, TTL `MODEL_DISCOVERY_CACHE_TTL_S`)
+  plus an in-process copy for hot reads.
+- A model pulled into Ollama out-of-band appears in `allow_all_models` tenants' effective sets
+  within one refresh interval — no config change.
+- Discovery is **read-only** and uses only the allowlisted `/api/tags` endpoint; it never
+  triggers a pull.
+- To force a faster pickup, lower `MODEL_DISCOVERY_REFRESH_S` (the demo uses 15 s).
+
+---
+
+## Fail-closed behaviors to expect
+
+| Symptom | Cause | Correct behavior |
+|---|---|---|
+| `503` on every request | Redis or Postgres-read down | Fail-closed — rate-limit/budget/auth can't be checked, so deny. Restore the backend. |
+| `502` with retry-after | Ollama unreachable | Circuit breaker opens after 5 consecutive failures, half-opens after 30 s. Check the backend / `OLLAMA_BASE_URL`. |
+| `403` for a model you "know" exists | Model not in the tenant's effective set, **or** discovery cache empty/expired | Check `list-models --tenant …`; verify the backend is reachable and the model is installed. Empty discovery = deny by design. |
+| `429` with `Retry-After` | Rate limit or budget exhausted | Inspect headers (`X-RateLimit-*`, `X-Budget-*`); raise limits/budget or wait. |
+| `401` immediately after revoke | Working as intended | Revocation propagated via NOTIFY → Redis eviction. |
+
+`/readyz` returns `503` when **any** dependency (DB, Redis, Ollama) is unreachable; use it as
+the load-balancer health gate. `/healthz` only checks process liveness.
+
+---
+
+## Logs, metrics, audit
+
+- **Logs:** structured (structlog), JSON in production, to stdout. Keys/secrets are never
+  logged.
+- **Metrics:** Prometheus at `/metrics` (loopback only): `gateway_requests_total`,
+  `gateway_tokens_total`, `gateway_request_duration_seconds`, labelled by `tenant` and
+  `model` (never `key_id`).
+- **Audit log:** always-on in `gateway.audit_log`. **Prompt log** is opt-in per key and TTL'd
+  (`PROMPT_LOG_DEFAULT_RETENTION_DAYS`); a sweeper enforces retention.
--- a/docs/PLAYGROUND.md
+++ b/docs/PLAYGROUND.md
@@ -0,0 +1,113 @@
+# neuronetz-gateway — Demo & Playground
+
+The fastest way to see the gateway working end-to-end, with **no GPU and no model downloads**.
+`./demo.sh` brings up the gateway against a mock Ollama backend, mints a demo API key, and
+prints ready-to-paste curl commands and a link to an interactive browser playground.
+
+---
+
+## Launch the demo
+
+From the repo root:
+
+```bash
+./demo.sh
+```
+
+This will:
+
+1. Build and start the demo stack (`docker-compose.demo.yml`): **postgres + redis +
+   mock-ollama + gateway**. No Caddy; the gateway is published on `127.0.0.1:8080`.
+2. Wait for the gateway to report healthy at `/healthz`.
+3. Create a demo tenant (`--allow-all-models`) and an API key via the bootstrap CLI **inside
+   the gateway container**, capturing the key (which is printed exactly once).
+4. Print a summary: the **API key**, the **playground URL**
+   `http://localhost:8080/playground`, and five ready-to-paste curl commands —
+   - streaming `/v1/chat/completions` (OpenAI SSE),
+   - streaming `/api/chat` (native NDJSON),
+   - `GET /v1/models`,
+   - a **401** example (no/bad key),
+   - a **403** example (`POST /api/pull`, hard-blocked).
+
+The script is **re-runnable**: an existing tenant is reused, and each run mints a fresh,
+uniquely-named key (the full key only ever prints at creation).
+
+Tear everything down (containers + volumes):
+
+```bash
+./demo.sh --down
+```
+
+### What's running
+
+| Service | Exposed? | Notes |
+|---|---|---|
+| `gateway` | `127.0.0.1:8080` | The real gateway image, built from the repo `Dockerfile`. |
+| `mock-ollama` | **no** | Internal network only — mirrors the prod "Ollama is never exposed" rule. |
+| `postgres` | **no** | Internal only. |
+| `redis` | **no** | Internal only. |
+
+The mock backend (`demo/mock-ollama/`) emulates Ollama's API shapes — including realistic
+`prompt_eval_count` / `eval_count` on the final stream object — so token counting, model
+discovery, and `/api/show` sanitization all exercise real gateway code paths. It serves a
+small catalogue: `llama3.1:8b`, `mistral:7b`, `qwen2.5:3b`, `nomic-embed-text`.
+
+---
+
+## Use the playground
+
+Open **http://localhost:8080/playground** in a browser. It is a single self-contained HTML
+page, served **same-origin** by the gateway (so no CORS to worry about).
+
+1. **Base URL** is pre-filled with the current origin; leave it as is for the demo.
+2. Paste the **API key** from the `./demo.sh` output into the Bearer field. (Typing a key
+   auto-loads the model dropdown; you can also hit **↻ Refresh**.)
+3. Pick an **endpoint** tab: `/v1/chat/completions`, `/api/chat`, `/api/generate`,
+   `/v1/models`, `/api/tags`, `/healthz`, `/readyz`.
+4. Choose a **model** from the auto-populated dropdown, type a prompt, toggle **stream**.
+5. Hit **▶ Run**. The streamed output renders **live** — SSE `data:` deltas (incl. `[DONE]`)
+   for `/v1/*`, NDJSON lines for `/api/*`.
+6. The panel shows the **response status** and the rate-limit / budget **response headers**
+   (`X-Request-ID`, `X-RateLimit-*`, `X-Budget-*`; SPEC §6.5).
+7. The **Exact curl** box mirrors precisely what **Run** sends — copy it to reproduce in a
+   terminal.
+
+Try the 403 path too: there's no mutating-endpoint tab by design, but the printed `curl` for
+`POST /api/pull` shows the hard block, and an invalid key in the Bearer field demonstrates the
+401 fail-closed response.
+
+---
+
+## ⚠️ Security note: the playground is OFF by default in production
+
+The playground route is **flag-gated** and **disabled by default**. The demo stack turns it on
+explicitly:
+
+```yaml
+# docker-compose.demo.yml (gateway service)
+GATEWAY_PLAYGROUND_ENABLED: "true"
+GATEWAY_PLAYGROUND_FILE: /app/playground/index.html
+```
+
+with the file mounted read-only into the container:
+
+```yaml
+volumes:
+  - ./playground:/app/playground:ro
+```
+
+The production stack (`docker-compose.yml`) does **not** set `GATEWAY_PLAYGROUND_ENABLED`, so
+the route is absent. Do not enable it on a public deployment: it is a convenience for demos and
+local development, not a production surface. Leaving it off keeps the public attack surface to
+the documented API only.
+
+---
+
+## Files behind the demo
+
+| Path | What it is |
+|---|---|
+| `demo.sh` | The one-command entrypoint (up / `--down`). |
+| `docker-compose.demo.yml` | The demo stack definition. |
+| `demo/mock-ollama/` | The standalone mock Ollama service (FastAPI app + Dockerfile). |
+| `playground/index.html` | The self-contained browser playground served at `/playground`. |
--- a/docs/THREAT_MODEL.md
+++ b/docs/THREAT_MODEL.md
@@ -0,0 +1,77 @@
+# neuronetz-gateway — Threat Model
+
+From [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §3. The governing principle, in one line:
+
+> **Fail closed, always.** If a security or budgeting check cannot be performed (Redis down,
+> DB unreachable, ambiguous state), **deny** the request. Never default to allow.
+> (AGENT_PROMPT non-negotiable #1.)
+
+The gateway exists because the Ollama instance at `api.neuronetz.ai` was exposed without
+authentication — a standing security incident. Every defense below traces back to closing
+that gap and keeping it closed.
+
+---
+
+## Threats & mitigations (SPEC §3)
+
+| Threat | Mitigation |
+|---|---|
+| Internet scanners hitting Ollama directly | Ollama bound to the internal Docker network; **never published**. No `ports:` mapping in any shipped compose file. |
+| Unauthenticated API abuse | Mandatory Bearer token; **fail-closed** on auth errors (401). |
+| API key brute force | Argon2id hashing; constant-time compare; rate limit on auth failures per source IP (`AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN`). |
+| GPU/token exhaustion (cost attack) | Per-key TPM + token budget; per-tenant ceiling; concurrent-connection cap. |
+| Resource exhaustion via large payloads | Request body size limit (default 256 KiB); `num_predict` cap (default 4096). |
+| Model enumeration / training-data exfil via uncommon models | Model allowlist, **default-deny**. Discovery only exposes models actually installed; `/api/tags` and `/v1/models` never reveal models outside the tenant's effective set; "not allowed" and "doesn't exist" return the **same** generic response. |
+| Discovery backend unreachable | **Fail-closed:** an empty/stale-expired discovered set means no model resolves, so requests are denied — never "allow because we couldn't list models." |
+| Ollama mutation (model pull/delete) by attacker | Endpoint allowlist; mutating endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) **hard-blocked** at the gateway, not configurable. |
+| Information disclosure via error messages | Upstream errors **sanitized** at the boundary; Ollama internals never proxied to the client. Each error carries an `X-Request-ID` for correlation. |
+| Audit log tampering | Append-only at the app layer; DB role separation; optional WAL archiving. |
+| Prompt data leakage | Prompt logging **off by default**; opt-in per key; TTL'd retention; redaction hook. |
+| Redis outage causing "fail open" | **Fail-closed:** if the rate-limit/budget backend is unavailable, deny (503), not allow. |
+| Compromised admin token | There is **no admin endpoint** in the gateway. Admin lives in `neuronetz-console`; the gateway has nothing to compromise here. |
+
+---
+
+## Notes on selected defenses
+
+### `allow_all_models` is an audited opt-in
+
+`allow_all_models` lets a tenant use any currently-installed model, so models newly pulled
+into Ollama are auto-granted on the next discovery refresh. This is convenient but widens the
+attack surface for *that tenant*, so it is:
+
+- **opt-in per tenant** (default `false`), set explicitly via the CLI
+  (`create-tenant --allow-all-models` or `set-models --allow-all`);
+- **overridable per key** — a non-`NULL` key-level `allow_all_models` overrides the tenant
+  flag; otherwise the tenant flag applies (SPEC §13.7);
+- **audited** — every request records the model used in `gateway.audit_log`.
+
+Default-deny tenants instead see only `allowed_models ∩ discovered`. Either way the effective
+set is always intersected with the *live* discovered set, so stale or typo'd allowlist entries
+never resolve.
+
+### No existence disclosure
+
+A model that is installed-but-unpermitted and a model that is not installed both return the
+**same** generic `403`. An attacker cannot use the gateway to enumerate which models exist on
+the backend (SPEC §13.6).
+
+### Sanitized errors + request IDs
+
+Clients never receive Ollama's error text, stack traces, or internal hostnames. Errors are
+mapped to generic `4xx`/`5xx` JSON with a `request_id`. Operators correlate that ID with the
+audit log to investigate without leaking internals to callers (SPEC §4.3 step 14).
+
+### Streaming integrity is also a safety property
+
+Token counting and audit writes happen **after** stream close, never on the hot path. This
+keeps time-to-first-byte honest and ensures budget decrements and audit rows reflect the true
+final token counts reported by Ollama (`prompt_eval_count` + `eval_count`), not estimates.
+
+---
+
+## Out of scope (v0.1.0)
+
+Documented as future work, **not** mitigations present today: content moderation /
+prompt-injection filtering, response caching, multi-backend routing, billing, SSO/OAuth2 for
+admin, and any web admin UI (that lives in `neuronetz-console`).
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -0,0 +1,40 @@
+# mkdocs configuration for the neuronetz-gateway documentation.
+#
+#   pip install mkdocs-material
+#   mkdocs serve      # live preview at http://127.0.0.1:8000
+#   mkdocs build      # static site into ./site
+#
+# Docs live in docs/. This wires them into a single Material-themed site.
+site_name: neuronetz-gateway
+site_description: Secure, multi-tenant API gateway in front of Ollama.
+docs_dir: docs
+
+theme:
+  name: material
+  palette:
+    - scheme: slate
+      primary: indigo
+      accent: indigo
+  features:
+    - navigation.sections
+    - navigation.top
+    - content.code.copy
+    - content.code.annotate
+
+markdown_extensions:
+  - admonition
+  - tables
+  - toc:
+      permalink: true
+  - pymdownx.highlight:
+      anchor_linenums: true
+  - pymdownx.superfences
+  - pymdownx.inlinehilite
+
+nav:
+  - Architecture: ARCHITECTURE.md
+  - API Reference: API.md
+  - Deployment: DEPLOYMENT.md
+  - Threat Model: THREAT_MODEL.md
+  - Operations Runbook: OPERATIONS.md
+  - Demo & Playground: PLAYGROUND.md
--- a/playground/index.html
+++ b/playground/index.html
@@ -0,0 +1,716 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>neuronetz-gateway · playground</title>
+<style>
+  :root {
+    --bg:        #0a0e16;
+    --bg-2:      #10151f;
+    --panel:     #141b27;
+    --panel-2:   #1a2333;
+    --border:    #243047;
+    --text:      #e6edf6;
+    --muted:     #8b9bb4;
+    --accent:    #4f8cff;
+    --accent-2:  #7c5cff;
+    --good:      #3fcf8e;
+    --warn:      #f0b429;
+    --bad:       #ff5d6c;
+    --mono:      ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace;
+    --sans:      ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
+  }
+  * { box-sizing: border-box; }
+  html, body { margin: 0; height: 100%; }
+  body {
+    background:
+      radial-gradient(1200px 600px at 80% -10%, rgba(124,92,255,.10), transparent 60%),
+      radial-gradient(900px 500px at -10% 110%, rgba(79,140,255,.10), transparent 55%),
+      var(--bg);
+    color: var(--text);
+    font-family: var(--sans);
+    font-size: 14px;
+    line-height: 1.5;
+    -webkit-font-smoothing: antialiased;
+  }
+  a { color: var(--accent); }
+  header {
+    display: flex; align-items: center; gap: 14px;
+    padding: 18px 26px;
+    border-bottom: 1px solid var(--border);
+    background: linear-gradient(180deg, rgba(255,255,255,.02), transparent);
+    position: sticky; top: 0; z-index: 5;
+    backdrop-filter: blur(6px);
+  }
+  .logo {
+    width: 34px; height: 34px; border-radius: 9px;
+    background: linear-gradient(135deg, var(--accent), var(--accent-2));
+    display: grid; place-items: center;
+    font-weight: 800; color: #fff; letter-spacing: -1px;
+    box-shadow: 0 6px 20px rgba(79,140,255,.35);
+  }
+  header h1 { font-size: 16px; margin: 0; font-weight: 700; letter-spacing: .2px; }
+  header .sub { color: var(--muted); font-size: 12px; }
+  .grow { flex: 1; }
+  .pill {
+    font-size: 11px; color: var(--muted);
+    border: 1px solid var(--border); border-radius: 999px;
+    padding: 4px 10px; font-family: var(--mono);
+  }
+
+  main {
+    display: grid;
+    grid-template-columns: 380px 1fr;
+    gap: 18px;
+    padding: 18px 26px 40px;
+    max-width: 1400px; margin: 0 auto;
+  }
+  @media (max-width: 920px) { main { grid-template-columns: 1fr; } }
+
+  .panel {
+    background: var(--panel);
+    border: 1px solid var(--border);
+    border-radius: 14px;
+    padding: 16px;
+  }
+  .panel h2 {
+    font-size: 12px; text-transform: uppercase; letter-spacing: .12em;
+    color: var(--muted); margin: 0 0 12px;
+  }
+  label { display: block; font-size: 12px; color: var(--muted); margin: 12px 0 5px; }
+  label:first-of-type { margin-top: 0; }
+  input, select, textarea {
+    width: 100%; background: var(--bg-2); color: var(--text);
+    border: 1px solid var(--border); border-radius: 9px;
+    padding: 9px 11px; font-size: 13px; font-family: var(--sans);
+    outline: none; transition: border-color .15s, box-shadow .15s;
+  }
+  input:focus, select:focus, textarea:focus {
+    border-color: var(--accent);
+    box-shadow: 0 0 0 3px rgba(79,140,255,.18);
+  }
+  textarea { resize: vertical; min-height: 90px; font-family: var(--mono); font-size: 12.5px; }
+  .row { display: flex; gap: 8px; }
+  .row > * { flex: 1; }
+  .inline { display: flex; align-items: center; gap: 8px; }
+  .inline input[type=checkbox] { width: auto; }
+
+  .tabs { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 8px; }
+  .tab {
+    font-family: var(--mono); font-size: 11.5px;
+    padding: 6px 10px; border-radius: 8px; cursor: pointer;
+    border: 1px solid var(--border); background: var(--bg-2); color: var(--muted);
+    transition: all .12s;
+  }
+  .tab:hover { color: var(--text); border-color: #34425f; }
+  .tab.active {
+    color: #fff; border-color: transparent;
+    background: linear-gradient(135deg, var(--accent), var(--accent-2));
+  }
+
+  button.run {
+    margin-top: 14px; width: 100%;
+    background: linear-gradient(135deg, var(--accent), var(--accent-2));
+    color: #fff; border: none; border-radius: 10px;
+    padding: 12px; font-size: 14px; font-weight: 700; cursor: pointer;
+    box-shadow: 0 8px 22px rgba(79,140,255,.3);
+    transition: transform .08s, filter .15s;
+  }
+  button.run:hover { filter: brightness(1.07); }
+  button.run:active { transform: translateY(1px); }
+  button.run:disabled { filter: grayscale(.6) brightness(.8); cursor: progress; }
+
+  .ghost {
+    background: var(--panel-2); color: var(--muted);
+    border: 1px solid var(--border); border-radius: 8px;
+    padding: 7px 10px; font-size: 12px; cursor: pointer; transition: all .12s;
+  }
+  .ghost:hover { color: var(--text); border-color: #34425f; }
+
+  .field-with-btn { display: flex; gap: 8px; align-items: stretch; }
+  .field-with-btn select { flex: 1; }
+
+  .out-head { display: flex; align-items: center; gap: 10px; margin-bottom: 10px; }
+  .status {
+    font-family: var(--mono); font-size: 12px; padding: 3px 9px; border-radius: 7px;
+    border: 1px solid var(--border); color: var(--muted);
+  }
+  .status.s2 { color: var(--good); border-color: rgba(63,207,142,.4); background: rgba(63,207,142,.08); }
+  .status.s4 { color: var(--warn); border-color: rgba(240,180,41,.4); background: rgba(240,180,41,.08); }
+  .status.s5 { color: var(--bad);  border-color: rgba(255,93,108,.4); background: rgba(255,93,108,.08); }
+
+  pre, .codebox {
+    background: #0b0f17; border: 1px solid var(--border); border-radius: 10px;
+    padding: 13px; font-family: var(--mono); font-size: 12.5px;
+    white-space: pre-wrap; word-break: break-word; margin: 0;
+    max-height: 460px; overflow: auto;
+  }
+  .codebox.curl { color: #c9d6ea; }
+  .out-body { min-height: 120px; }
+
+  .headers {
+    margin-top: 12px; font-family: var(--mono); font-size: 11.5px;
+    border: 1px solid var(--border); border-radius: 10px; overflow: hidden;
+  }
+  .headers .hrow { display: flex; border-top: 1px solid var(--border); }
+  .headers .hrow:first-child { border-top: none; }
+  .headers .hk { width: 46%; padding: 6px 10px; color: var(--muted); background: var(--bg-2); }
+  .headers .hv { flex: 1; padding: 6px 10px; color: var(--text); word-break: break-all; }
+
+  .section-title {
+    display: flex; align-items: center; justify-content: space-between; margin: 0 0 8px;
+  }
+  .section-title .copy { font-size: 11px; }
+  .hint { color: var(--muted); font-size: 11.5px; margin-top: 6px; }
+  .stack { display: grid; gap: 16px; }
+
+  /* "About this endpoint" panel */
+  .ep-head { display: flex; align-items: center; gap: 8px; flex-wrap: wrap; margin-bottom: 8px; }
+  #endpointInfo h2 { font-family: ui-monospace, "JetBrains Mono", "Fira Code", monospace; font-size: 14px; letter-spacing: 0.2px; }
+  .summary { margin: 4px 0 12px; color: var(--text); font-size: 13.5px; line-height: 1.55; }
+  .sub-title { margin: 10px 0 6px; color: var(--muted); font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.8px; }
+  .codebox.sample { max-height: 200px; overflow: auto; font-size: 11.5px; color: #c9d6ea; }
+  .badge {
+    font-size: 10.5px; text-transform: uppercase; letter-spacing: 0.6px;
+    padding: 2px 7px; border-radius: 999px; border: 1px solid var(--border);
+    color: var(--muted); background: var(--bg-2);
+  }
+  .badge-post { color: #ffb84a; border-color: rgba(255,184,74,.35); background: rgba(255,184,74,.08); }
+  .badge-get  { color: #5fc8ff; border-color: rgba(95,200,255,.35); background: rgba(95,200,255,.08); }
+  .badge-auth { color: #c9b6ff; border-color: rgba(201,182,255,.35); background: rgba(201,182,255,.08); }
+  .badge-open { color: #3fcf8e; border-color: rgba(63,207,142,.35); background: rgba(63,207,142,.08); }
+  .blink { animation: blink 1s steps(2,start) infinite; }
+  @keyframes blink { to { opacity: .25; } }
+</style>
+</head>
+<body>
+<header>
+  <div class="logo">N</div>
+  <div>
+    <h1>neuronetz-gateway <span class="sub">· playground</span></h1>
+    <div class="sub">Authenticated, rate-limited, audited access to the model backend</div>
+  </div>
+  <div class="grow"></div>
+  <div class="pill" id="originPill">same-origin</div>
+</header>
+
+<main>
+  <!-- ── Left: request builder ─────────────────────────────────────────── -->
+  <section class="panel">
+    <h2>Request</h2>
+
+    <label for="baseUrl">Base URL</label>
+    <div class="field-with-btn">
+      <input id="baseUrl" type="text" spellcheck="false" autocomplete="off" autocapitalize="off" autocorrect="off" />
+      <button class="ghost" id="resetBase" title="Reset Base URL to this page's origin">⟳ This origin</button>
+    </div>
+
+    <label for="apiKey">API key (Bearer)</label>
+    <input id="apiKey" type="password" placeholder="nz_…" spellcheck="false" autocomplete="off" />
+    <div class="hint" id="keyHint">Created by <code>./demo.sh</code> and printed once in your terminal.</div>
+
+    <label>Endpoint</label>
+    <div class="tabs" id="tabs"></div>
+
+    <div id="modelWrap">
+      <label for="model">Model</label>
+      <div class="field-with-btn">
+        <select id="model"><option value="">— enter a key, then refresh —</option></select>
+        <button class="ghost" id="refreshModels" title="Load /v1/models with your key">↻ Refresh</button>
+      </div>
+    </div>
+
+    <div id="promptWrap">
+      <label for="prompt" id="promptLabel">Prompt</label>
+      <textarea id="prompt" spellcheck="false">Say hello in one sentence.</textarea>
+      <label class="inline" id="streamWrap" style="margin-top:10px">
+        <input id="stream" type="checkbox" checked /> Stream the response
+      </label>
+    </div>
+
+    <button class="run" id="run">▶ Run</button>
+    <div class="hint" id="methodHint"></div>
+  </section>
+
+  <!-- ── Right: about + response + curl ────────────────────────────────── -->
+  <div class="stack">
+    <section class="panel" id="endpointInfo">
+      <div class="ep-head">
+        <h2 id="epTitle" style="margin:0">POST /v1/chat/completions</h2>
+        <div class="grow"></div>
+        <span class="badge" id="epMethod">POST</span>
+        <span class="badge" id="epAuth">auth: bearer</span>
+        <span class="badge" id="epStream">streams · SSE</span>
+      </div>
+      <p class="summary" id="epSummary"></p>
+      <div class="sub-title">Sample request body</div>
+      <pre class="codebox sample" id="epSampleReq"></pre>
+      <div class="sub-title">Sample response</div>
+      <pre class="codebox sample" id="epSampleResp"></pre>
+      <div class="hint" id="epNote"></div>
+    </section>
+
+    <section class="panel">
+      <div class="out-head">
+        <h2 style="margin:0">Response</h2>
+        <div class="grow"></div>
+        <span class="status" id="status">idle</span>
+      </div>
+      <pre class="codebox out-body" id="output">Run a request to see the response stream here.</pre>
+      <div class="headers" id="headers" style="display:none"></div>
+    </section>
+
+    <section class="panel">
+      <div class="section-title">
+        <h2 style="margin:0">Exact curl</h2>
+        <button class="ghost copy" id="copyCurl">Copy</button>
+      </div>
+      <pre class="codebox curl" id="curl"></pre>
+      <div class="hint">This is exactly what <b>Run</b> sends — paste it into a terminal to reproduce.</div>
+    </section>
+  </div>
+</main>
+
+<script>
+"use strict";
+
+// ── Endpoint catalogue ──────────────────────────────────────────────────
+// Each entry carries: method, format ("sse" | "ndjson" | "json"), UI flags (canStream,
+// needsModel, needsPrompt, optional noAuth), a body(state) builder on POST entries, and the
+// "About this endpoint" content (summary, sampleRequest, sampleResponse, note). Mirrors SPEC §6.
+const ENDPOINTS = {
+  "/v1/chat/completions": {
+    method: "POST", canStream: true, format: "sse", needsModel: true, needsPrompt: true,
+    summary: "OpenAI-compatible Chat Completions — a drop-in replacement for OpenAI's endpoint. Point any OpenAI SDK at this gateway's base URL with your nz_ key and existing client code works unchanged. Streaming uses Server-Sent Events terminated by `data: [DONE]`.",
+    body: (s) => ({ model: s.model, stream: s.stream, messages: [{ role: "user", content: s.prompt }] }),
+    sampleRequest: { model: "llama3.1:8b", stream: true, messages: [{ role: "user", content: "Say hello in one sentence." }] },
+    sampleResponse:
+`data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","created":1779492441,"model":"llama3.1:8b","choices":[{"index":0,"delta":{"content":"Echo:"},"finish_reason":null}]}
+
+data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","created":1779492441,"model":"llama3.1:8b","choices":[{"index":0,"delta":{"content":" hi"},"finish_reason":null}]}
+
+data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":1,"completion_tokens":2,"total_tokens":3}}
+
+data: [DONE]`,
+    note: "Non-streaming (`stream: false`) returns one `chat.completion` JSON object — same shape as OpenAI.",
+  },
+  "/api/chat": {
+    method: "POST", canStream: true, format: "ndjson", needsModel: true, needsPrompt: true,
+    summary: "Native Ollama chat. Streams NDJSON — one JSON object per line; the final object carries `prompt_eval_count` + `eval_count` for exact token accounting in the audit log.",
+    body: (s) => ({ model: s.model, stream: s.stream, messages: [{ role: "user", content: s.prompt }] }),
+    sampleRequest: { model: "llama3.1:8b", stream: true, messages: [{ role: "user", content: "Say hello in one sentence." }] },
+    sampleResponse:
+`{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":"Echo:"},"done":false}
+{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":" hi"},"done":false}
+{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2,"total_duration":12345678}`,
+    note: "Errors are sanitized but every response carries an X-Request-ID; upstream internals never leak.",
+  },
+  "/api/generate": {
+    method: "POST", canStream: true, format: "ndjson", needsModel: true, needsPrompt: true,
+    summary: "Native Ollama text generation. Takes a plain `prompt` string (no chat message structure) and streams NDJSON `response` chunks plus a final done frame with token counts.",
+    body: (s) => ({ model: s.model, stream: s.stream, prompt: s.prompt }),
+    sampleRequest: { model: "mistral:7b", stream: true, prompt: "Say hello in one sentence." },
+    sampleResponse:
+`{"model":"mistral:7b","created_at":"…","response":"Echo:","done":false}
+{"model":"mistral:7b","created_at":"…","response":" hi","done":false}
+{"model":"mistral:7b","created_at":"…","response":"","done":true,"prompt_eval_count":1,"eval_count":2}`,
+    note: "Use this when you don't need chat-message structure; otherwise prefer `/api/chat` or `/v1/chat/completions`.",
+  },
+  "/v1/models": {
+    method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false,
+    summary: "Lists the tenant's effective model set in OpenAI format: (live-discovered ∩ allowed_models), or all discovered models when the tenant has allow_all_models enabled. There is no static list — discovery polls the Ollama backend in the background.",
+    sampleRequest: null,
+    sampleResponse:
+`{
+  "object": "list",
+  "data": [
+    {"id": "llama3.1:8b",      "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
+    {"id": "mistral:7b",       "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
+    {"id": "qwen2.5:3b",       "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
+    {"id": "nomic-embed-text", "object": "model", "created": 1779492441, "owned_by": "neuronetz"}
+  ]
+}`,
+    note: "Refreshed automatically every MODEL_DISCOVERY_REFRESH_S (default 60s). Cached fail-closed.",
+  },
+  "/api/tags": {
+    method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false,
+    summary: "Native Ollama model list, filtered to the tenant's effective set. Same data as /v1/models but in Ollama's `models` shape — includes size, digest, modified_at, family and quantization details.",
+    sampleRequest: null,
+    sampleResponse:
+`{
+  "models": [
+    {
+      "name": "llama3.1:8b",
+      "model": "llama3.1:8b",
+      "modified_at": "2026-04-01T12:00:00Z",
+      "size": 4920624384,
+      "digest": "sha256:…",
+      "details": {"family": "llama", "parameter_size": "8B", "quantization_level": "Q4_K_M"}
+    }
+  ]
+}`,
+    note: "",
+  },
+  "/healthz": {
+    method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false, noAuth: true,
+    summary: "Liveness probe. Returns 200 as long as the gateway process can respond — does NOT check downstream dependencies. Safe for load-balancer health checks. No authentication required.",
+    sampleRequest: null,
+    sampleResponse: `{"status": "ok"}`,
+    note: "",
+  },
+  "/readyz": {
+    method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false, noAuth: true,
+    summary: "Readiness probe. Returns 200 only when Postgres + Redis + the Ollama backend are all reachable; 503 otherwise with which dependencies are down. No authentication required.",
+    sampleRequest: null,
+    sampleResponse:
+`# 200 OK
+{"status": "ready", "checks": {"postgres": true, "redis": true, "ollama": true}}
+
+# 503 Service Unavailable
+{"status": "not_ready", "checks": {"postgres": true, "redis": true, "ollama": false}}`,
+    note: "In this demo, /readyz will return 200 — the mock Ollama is reachable. In dev-only stacks without an Ollama backend, /readyz fails closed.",
+  },
+};
+
+// Response headers worth surfacing (SPEC §6.5); renderHeaders lists the ones present, in this order.
+const SURFACE_HEADERS = [
+  "x-request-id",
+  "x-ratelimit-limit-requests", "x-ratelimit-remaining-requests",
+  "x-ratelimit-limit-tokens", "x-ratelimit-remaining-tokens",
+  "x-budget-period", "x-budget-tokens-remaining",
+  "retry-after", "content-type",
+];
+
+const $ = (id) => document.getElementById(id); // shorthand: look up a DOM element by its id
+let current = "/v1/chat/completions"; // ENDPOINTS key of the active tab; reassigned on tab click
+
+// ── State helpers ───────────────────────────────────────────────────────
+function state() {
+  return {
+    base: $("baseUrl").value.replace(/\/+$/, ""),
+    key: $("apiKey").value.trim(),
+    model: $("model").value,
+    prompt: $("prompt").value,
+    stream: $("stream").checked,
+  };
+}
+
+function buildTabs() {
+  const tabs = $("tabs");
+  tabs.innerHTML = "";
+  for (const path of Object.keys(ENDPOINTS)) {
+    const el = document.createElement("div");
+    el.className = "tab" + (path === current ? " active" : "");
+    el.textContent = path;
+    el.onclick = () => { current = path; buildTabs(); syncForm(); updateCurl(); };
+    tabs.appendChild(el);
+  }
+}
+
+function syncForm() {
+  const ep = ENDPOINTS[current];
+  $("modelWrap").style.display = ep.needsModel ? "" : "none";
+  $("promptWrap").style.display = ep.needsPrompt ? "" : "none";
+  $("streamWrap").style.display = ep.canStream ? "" : "none";
+  $("promptLabel").textContent = current === "/api/generate" ? "Prompt" : "Message";
+  $("methodHint").textContent = `${ep.method} · ${ep.canStream ? `streams ${ep.format.toUpperCase()}` : ep.format.toUpperCase()} · ${ep.noAuth ? "no auth" : "requires Bearer"}`;
+  renderEndpointInfo();
+  refreshGating();
+}
+
+// Populate the "About this endpoint" panel from the current endpoint's metadata.
+function renderEndpointInfo() {
+  const ep = ENDPOINTS[current];
+  $("epTitle").textContent = ep.method + " " + current;
+
+  const method = $("epMethod");
+  method.textContent = ep.method;
+  method.className = "badge badge-" + ep.method.toLowerCase();
+
+  const auth = $("epAuth");
+  auth.textContent = ep.noAuth ? "no auth" : "auth: bearer";
+  auth.className = "badge " + (ep.noAuth ? "badge-open" : "badge-auth");
+
+  const streamBadge = $("epStream");
+  if (ep.canStream) {
+    streamBadge.style.display = "";
+    streamBadge.textContent = "streams · " + (ep.format === "sse" ? "SSE" : "NDJSON");
+  } else {
+    streamBadge.style.display = "none";
+  }
+
+  $("epSummary").textContent = ep.summary;
+  $("epSampleReq").textContent = ep.sampleRequest
+    ? JSON.stringify(ep.sampleRequest, null, 2)
+    : "(no request body — GET)";
+  $("epSampleResp").textContent = ep.sampleResponse;
+
+  const note = $("epNote");
+  if (ep.note) { note.textContent = ep.note; note.style.display = ""; }
+  else         { note.style.display = "none"; }
+}
+
+// Visibly disable Run/Refresh when no key is present (most endpoints need auth)
+// and surface the reason RIGHT next to the API-key field — not just in the right
+// pane where it's easy to miss.
+function refreshGating() {
+  const ep = ENDPOINTS[current];
+  const hasKey = $("apiKey").value.trim().length > 0;
+  const needsKey = !ep.noAuth;
+  const run = $("run");
+  const refresh = $("refreshModels");
+  const blocked = needsKey && !hasKey;
+  run.disabled = blocked;
+  refresh.disabled = !hasKey; // refresh always needs a key
+  run.style.opacity = blocked ? "0.45" : "";
+  run.style.cursor = blocked ? "not-allowed" : "";
+  refresh.style.opacity = !hasKey ? "0.45" : "";
+  refresh.style.cursor = !hasKey ? "not-allowed" : "";
+  const hint = $("keyHint");
+  if (blocked) {
+    hint.innerHTML = "⚠ <b style=\"color:#ffb84a\">Paste your API key above</b> to enable Run and Refresh. Get one by running <code>./demo.sh</code>.";
+  } else {
+    hint.innerHTML = "Created by <code>./demo.sh</code> and printed once in your terminal.";
+  }
+}
+
+// ── curl preview (must match exactly what Run sends) ────────────────────
+function buildRequest() {
+  const s = state();
+  const ep = ENDPOINTS[current];
+  const url = (s.base || location.origin) + current;
+  const headers = {};
+  if (!ep.noAuth) headers["Authorization"] = "Bearer " + (s.key || "nz_YOUR_KEY");
+  let body = null;
+  if (ep.method === "POST") {
+    headers["Content-Type"] = "application/json";
+    body = JSON.stringify(ep.body(s));
+  }
+  return { url, method: ep.method, headers, body, ep };
+}
+
+function updateCurl() {
+  const r = buildRequest();
+  const parts = ["curl"];
+  if (r.ep.canStream && state().stream && r.method === "POST") parts.push("-N");
+  if (r.method === "GET") parts.push("-i");
+  parts.push(shellQuote(r.url));
+  for (const [k, v] of Object.entries(r.headers)) {
+    parts.push("\\\n  -H " + shellQuote(k + ": " + v));
+  }
+  if (r.body) parts.push("\\\n  -d " + shellQuote(r.body));
+  $("curl").textContent = parts.join(" ");
+}
+
+function shellQuote(s) {
+  if (/^[A-Za-z0-9_\-:/.@]+$/.test(s)) return s;
+  return "'" + s.replace(/'/g, "'\\''") + "'";
+}
+
+// ── Status + header rendering ───────────────────────────────────────────
+function setStatus(text, code) {
+  const el = $("status");
+  el.textContent = text;
+  el.className = "status" + (code ? " s" + String(code)[0] : "");
+}
+
+function renderHeaders(resp) {
+  const box = $("headers");
+  const rows = [];
+  for (const h of SURFACE_HEADERS) {
+    const v = resp.headers.get(h);
+    if (v != null) rows.push([h, v]);
+  }
+  if (!rows.length) { box.style.display = "none"; return; }
+  box.innerHTML = rows.map(([k, v]) =>
+    `<div class="hrow"><div class="hk">${k}</div><div class="hv">${escapeHtml(v)}</div></div>`
+  ).join("");
+  box.style.display = "";
+}
+
+function escapeHtml(s) {
+  return String(s).replace(/[&<>]/g, (c) => ({ "&": "&amp;", "<": "&lt;", ">": "&gt;" }[c]));
+}
+
+// ── Model dropdown population ───────────────────────────────────────────
+async function refreshModels() {
+  const s = state();
+  if (!s.key) { setOutput("Enter an API key first, then refresh models."); return; }
+  const sel = $("model");
+  const btn = $("refreshModels");
+  btn.disabled = true; btn.textContent = "…";
+  try {
+    const resp = await fetch((s.base || location.origin) + "/v1/models", {
+      headers: { "Authorization": "Bearer " + s.key },
+    });
+    if (!resp.ok) { setOutput("Could not load models: HTTP " + resp.status); return; }
+    const data = await resp.json();
+    const names = (data.data || []).map((m) => m.id).filter(Boolean);
+    const prev = sel.value;
+    sel.innerHTML = "";
+    if (!names.length) {
+      sel.innerHTML = '<option value="">(no models in your effective set)</option>';
+    } else {
+      for (const n of names) {
+        const o = document.createElement("option");
+        o.value = n; o.textContent = n;
+        sel.appendChild(o);
+      }
+      if (names.includes(prev)) sel.value = prev;
+    }
+    updateCurl();
+  } catch (e) {
+    setOutput("Network error loading models: " + e.message);
+  } finally {
+    btn.disabled = false; btn.textContent = "↻ Refresh";
+  }
+}
+
+function setOutput(text) { $("output").textContent = text; }
+function appendOutput(text) { $("output").textContent += text; }
+
+// ── Run ─────────────────────────────────────────────────────────────────
+let running = false;
+async function run() {
+  if (running) return;
+  running = true;
+  const btn = $("run");
+  btn.disabled = true;
+  setStatus("connecting…");
+  setOutput("");
+  $("headers").style.display = "none";
+
+  const r = buildRequest();
+  const willStream = r.ep.canStream && state().stream && r.method === "POST";
+
+  try {
+    const resp = await fetch(r.url, { method: r.method, headers: r.headers, body: r.body });
+    setStatus(resp.status + " " + resp.statusText, resp.status);
+    renderHeaders(resp);
+
+    if (willStream && resp.body && resp.ok) {
+      await consumeStream(resp, r.ep.format);
+    } else {
+      const text = await resp.text();
+      setOutput(prettyMaybeJson(text));
+    }
+  } catch (e) {
+    setStatus("network error", 5);
+    setOutput("Request failed: " + e.message + "\n\n(Check the Base URL and that the gateway is running.)");
+  } finally {
+    running = false;
+    btn.disabled = false;
+  }
+}
+
+function prettyMaybeJson(text) {
+  try { return JSON.stringify(JSON.parse(text), null, 2); } catch { return text || "(empty response)"; }
+}
+
+// Parse SSE (data: {...} … data: [DONE]) or NDJSON (one JSON object per line),
+// rendering text deltas live as they arrive.
+async function consumeStream(resp, format) {
+  const reader = resp.body.getReader();
+  const decoder = new TextDecoder();
+  let buffer = "";
+  setOutput("");
+  const cursor = "▌";
+  const render = (txt) => { $("output").textContent = txt + cursor; };
+  let acc = "";
+
+  while (true) {
+    const { value, done } = await reader.read();
+    if (done) break;
+    buffer += decoder.decode(value, { stream: true });
+
+    let idx;
+    // SSE events are separated by blank lines; NDJSON by single newlines.
+    const sep = format === "sse" ? "\n\n" : "\n";
+    while ((idx = buffer.indexOf(sep)) >= 0) {
+      const raw = buffer.slice(0, idx);
+      buffer = buffer.slice(idx + sep.length);
+      acc += handleEvent(raw, format);
+      render(acc);
+    }
+  }
+  if (buffer.trim()) acc += handleEvent(buffer, format);
+  $("output").textContent = acc || "(stream produced no text)";
+}
+
+// Returns the text delta extracted from one event/line.
+function handleEvent(raw, format) {
+  if (format === "sse") {
+    let out = "";
+    for (let line of raw.split("\n")) {
+      line = line.trim();
+      if (!line.startsWith("data:")) continue;
+      const payload = line.slice(5).trim();
+      if (payload === "[DONE]") continue;
+      try {
+        const obj = JSON.parse(payload);
+        const delta = obj.choices && obj.choices[0] && obj.choices[0].delta;
+        if (delta && typeof delta.content === "string") out += delta.content;
+      } catch { /* ignore keep-alives / partial */ }
+    }
+    return out;
+  }
+  // NDJSON
+  const line = raw.trim();
+  if (!line) return "";
+  try {
+    const obj = JSON.parse(line);
+    if (obj.message && typeof obj.message.content === "string") return obj.message.content; // /api/chat
+    if (typeof obj.response === "string") return obj.response;                              // /api/generate
+  } catch { /* partial line */ }
+  return "";
+}
+
+// ── Wiring ──────────────────────────────────────────────────────────────
+function init() {
+  // Set the base URL to this page's origin. Browsers love to autofill text
+  // inputs from history *after* the page scripts run, so we ALSO re-assert it on
+  // the next microtask and again after a short delay — that wins against
+  // chromium/firefox autofill, which can otherwise replace the value with a
+  // stale entry like https://api.neuronetz.ai.
+  const setOrigin = () => { $("baseUrl").value = location.origin; };
+  setOrigin();
+  $("originPill").textContent = location.origin;
+  queueMicrotask(setOrigin);
+  setTimeout(setOrigin, 250);
+
+  buildTabs();
+  syncForm();
+  updateCurl();
+  refreshGating();
+
+  for (const id of ["baseUrl", "apiKey", "model", "prompt"]) {
+    $(id).addEventListener("input", updateCurl);
+  }
+  $("apiKey").addEventListener("input", refreshGating);
+  $("stream").addEventListener("change", updateCurl);
+  $("run").addEventListener("click", run);
+  $("refreshModels").addEventListener("click", refreshModels);
+  $("resetBase").addEventListener("click", () => {
+    $("baseUrl").value = location.origin;
+    updateCurl();
+  });
+  $("copyCurl").addEventListener("click", async () => {
+    try {
+      await navigator.clipboard.writeText($("curl").textContent);
+      const b = $("copyCurl"); b.textContent = "Copied!"; setTimeout(() => (b.textContent = "Copy"), 1200);
+    } catch { /* clipboard may be blocked; ignore */ }
+  });
+
+  // Convenience: refresh models when a key is pasted/typed (debounced).
+  let t = null;
+  $("apiKey").addEventListener("input", () => {
+    clearTimeout(t);
+    if ($("apiKey").value.trim().length > 8) t = setTimeout(refreshModels, 500);
+  });
+}
+
+document.addEventListener("DOMContentLoaded", init);
+</script>
+</body>
+</html>