demo + playground + docs

One-command demo so the gateway can be exercised end-to-end without a GPU or a
real model download:

- demo/mock-ollama/ — tiny FastAPI service emulating Ollama (/api/tags,
  /api/chat + /api/generate NDJSON streaming with realistic prompt_eval_count
  and eval_count on the final frame, /api/embed, /api/show, /api/version).
  Non-root multi-stage Dockerfile, never published (internal network only).
- docker-compose.demo.yml — postgres + redis + mock-ollama + gateway, with
  PLAYGROUND_ENABLED=true and ./playground mounted read-only at /app/playground.
  Mirrors the prod posture (mock-ollama not exposed).
- demo.sh — brings the stack up, waits on /healthz, creates a demo tenant with
  allow_all_models and a fresh API key via the bootstrap CLI inside the
  container, then prints the key, the playground URL, and five ready-to-paste
  curl commands (SSE chat, NDJSON chat, /v1/models, a 401, a 403 /api/pull).
  ./demo.sh --down tears everything back down with volumes.
- playground/index.html — single-file dark-themed UI served same-origin by
  the gateway at /playground (CORS-free). Per-endpoint About card with method/
  auth/streaming badges, a real description, sample request body, sample
  response, and a footer note. Live SSE/NDJSON rendering of the response.
  A live, copyable curl box that mirrors exactly what Run sends. Run + Refresh
  are visibly gated until an API key is in the field; the Base URL is
  force-pinned to location.origin three times to defeat browser autofill.
- docs/ — API.md (full endpoint reference with curl, streaming formats, error
  model, SPEC §6.5 response headers), ARCHITECTURE.md (incl. §4.6 discovery
  + the request lifecycle), DEPLOYMENT.md (Ollama-never-exposed rule,
  pointing at a real Ollama backend, env reference), THREAT_MODEL.md
  (SPEC §3 table + the allow_all_models opt-in notes), OPERATIONS.md
  (key/budget/model/usage runbook + fail-closed table), PLAYGROUND.md.
  mkdocs.yml (Material theme) wires them together.
This commit is contained in:
Stephan Berbig
2026-05-26 20:52:33 +02:00
parent 844b02aade
commit b47a09db91
13 changed files with 2501 additions and 0 deletions

204
demo.sh Executable file
View File

@@ -0,0 +1,204 @@
#!/usr/bin/env bash
#
# demo.sh — the neuronetz-gateway one-command presentation.
#
# Brings up the demo stack (postgres + redis + mock-ollama + gateway) with NO
# GPU and NO model downloads, creates a demo tenant + API key via the bootstrap
# CLI *inside the gateway container*, and prints a clean summary with the key,
# the playground URL, and ready-to-paste curl commands.
#
# Usage:
# ./demo.sh # build + start, bootstrap a tenant/key, print summary
# ./demo.sh --down # tear the whole stack down (and remove volumes)
# ./demo.sh --help # this help
#
# Re-runnable: an existing tenant is reused. The full API key is only ever
# printed once at creation (SPEC §11), so every run (including re-runs) mints
# a fresh, uniquely-named key and prints it.
set -euo pipefail
# ──────────────────────────────────────────────────────────────────────────
# Configuration
# ──────────────────────────────────────────────────────────────────────────
# Directory containing this script; the compose file is resolved relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
COMPOSE_FILE="${SCRIPT_DIR}/docker-compose.demo.yml"
# Base compose invocation, kept as an array so the file path survives spaces.
COMPOSE=(docker compose -f "${COMPOSE_FILE}")
GATEWAY_URL="http://localhost:8080"
PLAYGROUND_URL="${GATEWAY_URL}/playground"
TENANT_NAME="demo"
KEY_NAME="demo-key"
# Colours: start with everything disabled, then switch the escape sequences on
# only when stdout is a TTY.
BOLD=""
DIM=""
RESET=""
CYAN=""
GREEN=""
YELLOW=""
if [[ -t 1 ]]; then
  BOLD=$'\033[1m'
  DIM=$'\033[2m'
  RESET=$'\033[0m'
  CYAN=$'\033[36m'
  GREEN=$'\033[32m'
  YELLOW=$'\033[33m'
fi
# Print a progress headline ("==> message") to stdout.
log() {
  printf '%s==>%s %s%s%s\n' "${CYAN}" "${RESET}" "${BOLD}" "$*" "${RESET}"
}
# Print a warning ("!! message") to stderr.
warn() {
  printf '%s!!%s %s\n' "${YELLOW}" "${RESET}" "$*" >&2
}
# Print a fatal error ("xx message") to stderr and terminate with status 1.
die() {
  printf '%sxx%s %s\n' "${YELLOW}" "${RESET}" "$*" >&2
  exit 1
}
# ──────────────────────────────────────────────────────────────────────────
# Subcommands
# ──────────────────────────────────────────────────────────────────────────
# Print the help text: lines 3-18 of this script with the leading "#" and
# one optional following space removed.
usage() {
  sed -n \
    -e '3,18{' \
    -e 's/^# \{0,1\}//' \
    -e 'p' \
    -e '}' \
    "${BASH_SOURCE[0]}"
}
# Stop the demo stack and remove its containers, volumes and orphan containers.
down() {
  local -a teardown_args=(down --volumes --remove-orphans)
  log "Tearing down the demo stack (containers + volumes)…"
  "${COMPOSE[@]}" "${teardown_args[@]}"
  log "Done. The demo stack is gone."
}
# Invoke the neuronetz-gateway bootstrap CLI inside the running `gateway`
# compose service (no TTY allocated), forwarding all arguments unchanged.
gw_cli() {
  local -a exec_cmd=("${COMPOSE[@]}" exec -T gateway neuronetz-gateway)
  "${exec_cmd[@]}" "$@"
}
# Poll the gateway's /healthz endpoint until it answers successfully.
# Globals:  GATEWAY_URL (read), COMPOSE (read)
# Outputs:  progress via log; on timeout a warning plus the last 50 gateway
#           log lines on stderr
# Returns:  0 once the gateway is healthy; exits via die when it has not
#           become healthy within 180 seconds
wait_for_health() {
  log "Waiting for the gateway to become healthy at ${GATEWAY_URL}/healthz …"
  local deadline
  deadline=$(( $(date +%s) + 180 ))
  # --max-time bounds every probe. Without it, a connection that is accepted
  # but never answered would block curl indefinitely and the deadline check
  # below would never run.
  until curl -fsS --max-time 5 "${GATEWAY_URL}/healthz" >/dev/null 2>&1; do
    if [ "$(date +%s)" -ge "${deadline}" ]; then
      warn "Gateway did not become healthy in time. Recent gateway logs:"
      # Best effort: failing to fetch logs must not mask the real error.
      "${COMPOSE[@]}" logs --tail=50 gateway >&2 || true
      die "Aborting."
    fi
    sleep 2
  done
  log "Gateway is up."
}
# Create the demo tenant if it does not already exist (idempotent).
# Globals:  TENANT_NAME, DIM, RESET (read)
# Outputs:  the CLI output (dimmed) on success; otherwise a log line when the
#           tenant already exists, or a warning plus the CLI output on stderr
# Returns:  0 in all cases — a create-tenant failure is surfaced but never
#           aborts the demo (deliberate best-effort)
ensure_tenant() {
  log "Creating demo tenant '${TENANT_NAME}' (allow-all-models) …"
  local out
  if out="$(gw_cli create-tenant --name "${TENANT_NAME}" --allow-all-models 2>&1)"; then
    printf '%s\n' "${DIM}${out}${RESET}"
    return 0
  fi
  # Already-exists (or similar) is fine — surface it but keep going.
  # A here-string is used instead of `printf | grep -q`: under `set -o pipefail`
  # grep -q may exit on the first match while the writer is still writing, the
  # writer then dies with SIGPIPE and the whole pipeline is reported as failed,
  # i.e. as "no match".
  if grep -qiE 'exist|duplicate|unique' <<<"${out}"; then
    log "Tenant '${TENANT_NAME}' already exists — reusing it."
  else
    warn "create-tenant reported:"
    printf '%s\n' "${out}" >&2
    warn "Continuing; the tenant may already be present."
  fi
}
# Mint a new API key for the demo tenant and emit it (and nothing else) on
# stdout. Every key gets a timestamped name, so repeated runs never collide
# with an earlier key and always produce a key that can be shown.
# Globals:  KEY_NAME, TENANT_NAME (read)
# Outputs:  the full key on stdout (no trailing newline); diagnostics on stderr
# Returns:  0 on success, 1 when the CLI fails or no key can be parsed
create_key() {
  local stamp unique_name out candidate
  local key=""
  stamp="$(date +%Y%m%d-%H%M%S)"
  unique_name="${KEY_NAME}-${stamp}"
  log "Creating API key '${unique_name}' for tenant '${TENANT_NAME}' …" >&2

  if ! out="$(gw_cli create-key --tenant "${TENANT_NAME}" --name "${unique_name}" 2>&1)"; then
    warn "create-key failed:" >&2
    printf '%s\n' "${out}" >&2
    return 1
  fi

  # The CLI output contains the 12-char prefix as well as the full key, and
  # both look like nz_<alphanumerics>. Keep the longest candidate (first one
  # wins on a tie): that is the full key, never the prefix.
  while IFS= read -r candidate; do
    if (( ${#candidate} > ${#key} )); then
      key="${candidate}"
    fi
  done < <(printf '%s' "${out}" | grep -oE 'nz_[A-Za-z0-9]+' || true)

  if [ -z "${key}" ]; then
    warn "Could not parse an API key from create-key output:" >&2
    printf '%s\n' "${out}" >&2
    return 1
  fi
  printf '%s' "${key}"
}
# Print the final demo summary to stdout: the API base URL, the playground URL,
# the API key, the model backend, and five ready-to-paste curl commands.
# Arguments: $1 - the full API key to show and to embed in the curl commands
# Globals:   GATEWAY_URL, PLAYGROUND_URL and the colour variables (read)
print_summary() {
local key="$1"
# Content-Type value shared by the curl examples that send a JSON body.
local cl='application/json'
# The EOF delimiter is unquoted on purpose: ${...} references are expanded and
# every "\\" is emitted as a single backslash, so the printed curl commands
# keep their line continuations.
cat <<EOF
${GREEN}${BOLD}════════════════════════════════════════════════════════════════════════${RESET}
${GREEN}${BOLD} neuronetz-gateway demo is live${RESET}
${GREEN}${BOLD}════════════════════════════════════════════════════════════════════════${RESET}
${BOLD}API base URL${RESET} ${CYAN}${GATEWAY_URL}${RESET}
${BOLD}Playground${RESET} ${CYAN}${PLAYGROUND_URL}${RESET}
${BOLD}API key${RESET} ${YELLOW}${key}${RESET}
${DIM}(printed once — copy it now; re-run ./demo.sh to mint another)${RESET}
${BOLD}Model backend${RESET} mock-ollama (internal network only, never published)
${BOLD}Models${RESET} llama3.1:8b · mistral:7b · qwen2.5:3b · nomic-embed-text
${BOLD}── Ready-to-paste curl commands ───────────────────────────────────────${RESET}
${DIM}# 1) Streaming chat — OpenAI-compatible SSE (data: {...} … data: [DONE])${RESET}
curl -N ${GATEWAY_URL}/v1/chat/completions \\
-H "Authorization: Bearer ${key}" \\
-H "Content-Type: ${cl}" \\
-d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"Say hello in one sentence."}]}'
${DIM}# 2) Streaming chat — native Ollama NDJSON (one JSON object per line)${RESET}
curl -N ${GATEWAY_URL}/api/chat \\
-H "Authorization: Bearer ${key}" \\
-H "Content-Type: ${cl}" \\
-d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"Say hello in one sentence."}]}'
${DIM}# 3) List models — the tenant's effective (live-discovered) set, OpenAI format${RESET}
curl ${GATEWAY_URL}/v1/models \\
-H "Authorization: Bearer ${key}"
${DIM}# 4) 401 Unauthorized — no/invalid key, fail-closed, no upstream details leaked${RESET}
curl -i ${GATEWAY_URL}/v1/models \\
-H "Authorization: Bearer nz_invalidKEYdoesNotExist000000000000000000"
${DIM}# 5) 403 Forbidden — model-mutating endpoint is hard-blocked (not configurable)${RESET}
curl -i ${GATEWAY_URL}/api/pull \\
-H "Authorization: Bearer ${key}" \\
-H "Content-Type: ${cl}" \\
-d '{"model":"llama3.1:8b"}'
${BOLD}───────────────────────────────────────────────────────────────────────${RESET}
Tear it all down with: ${CYAN}./demo.sh --down${RESET}
EOF
}
# Default action: verify prerequisites, build and start the demo stack, wait
# until the gateway is healthy, make sure the demo tenant exists, mint an API
# key and print the summary. Any unmet prerequisite or key failure ends the
# script through die.
up() {
  local tool key
  for tool in docker curl; do
    command -v "${tool}" >/dev/null 2>&1 \
      || die "${tool} is required but not found on PATH."
  done
  if [[ ! -f "${COMPOSE_FILE}" ]]; then
    die "Missing ${COMPOSE_FILE}"
  fi

  log "Building and starting the demo stack (postgres + redis + mock-ollama + gateway) …"
  "${COMPOSE[@]}" up --build -d
  wait_for_health
  ensure_tenant

  key="$(create_key)" || die "Could not create/parse an API key. See logs above."
  print_summary "${key}"
}
# ──────────────────────────────────────────────────────────────────────────
# Entry point
# ──────────────────────────────────────────────────────────────────────────
# Dispatch on the first command-line argument: no argument starts the demo,
# --down/-d/down tears it down, --help/-h/help prints usage, and anything else
# is a fatal error.
main() {
  local action="${1:-}"
  if [[ -z "${action}" ]]; then
    up
  elif [[ "${action}" == "--down" || "${action}" == "-d" || "${action}" == "down" ]]; then
    down
  elif [[ "${action}" == "--help" || "${action}" == "-h" || "${action}" == "help" ]]; then
    usage
  else
    die "Unknown argument: $1 (try --help)"
  fi
}
main "$@"

View File

@@ -0,0 +1,61 @@
# syntax=docker/dockerfile:1.7
#
# mock-ollama — a tiny FastAPI app emulating the Ollama HTTP API for the demo.
#
# builder stage : installs deps into a self-contained virtualenv.
# runtime stage : copies the venv + app, drops to a NON-ROOT user, no build
# tools, runs uvicorn on :11434.
#
# This image exists ONLY for the demo stack (docker-compose.demo.yml). It lets
# the demo run with no GPU and no model downloads. It is never published to the
# host — like real Ollama, it is reachable only on the internal Docker network.
# ----------------------------------------------------------------------------
# Stage 1 — builder
# ----------------------------------------------------------------------------
FROM python:3.12-slim AS builder
# /opt/venv/bin is put first on PATH, so once the venv exists the `pip` used
# below is the venv's pip and packages are installed into /opt/venv.
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
PIP_NO_CACHE_DIR=1 \
VIRTUAL_ENV=/opt/venv \
PATH=/opt/venv/bin:$PATH
RUN python -m venv /opt/venv
WORKDIR /app
# Only requirements.txt is copied into the builder; the application source is
# added in the runtime stage.
COPY requirements.txt ./
RUN pip install -r requirements.txt
# ----------------------------------------------------------------------------
# Stage 2 — runtime
# ----------------------------------------------------------------------------
FROM python:3.12-slim AS runtime
# curl is used by the compose healthcheck.
RUN apt-get update \
&& apt-get install -y --no-install-recommends curl \
&& rm -rf /var/lib/apt/lists/*
# Non-root user.
# Fixed uid/gid 10001, home directory /app, and a nologin shell.
RUN groupadd --system --gid 10001 mock \
&& useradd --system --uid 10001 --gid mock --home-dir /app --shell /usr/sbin/nologin mock
# MOCK_OLLAMA_PORT is read by app.py at start-up and by the HEALTHCHECK below.
# NOTE(review): EXPOSE and the compose healthcheck hard-code 11434, so
# overriding this variable alone would leave them out of sync.
ENV VIRTUAL_ENV=/opt/venv \
PATH=/opt/venv/bin:$PATH \
PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
MOCK_OLLAMA_PORT=11434
WORKDIR /app
# Bring over the fully installed virtualenv from the builder stage.
COPY --from=builder /opt/venv /opt/venv
COPY app.py ./
USER mock
EXPOSE 11434
# Shell-form CMD so ${MOCK_OLLAMA_PORT} is expanded when the check runs.
HEALTHCHECK --interval=10s --timeout=3s --start-period=5s --retries=5 \
CMD curl -fsS "http://127.0.0.1:${MOCK_OLLAMA_PORT}/api/version" || exit 1
# Runs app.py (copied into WORKDIR /app above) as a module.
CMD ["python", "-m", "app"]

361
demo/mock-ollama/app.py Normal file
View File

@@ -0,0 +1,361 @@
"""Standalone mock Ollama service for the neuronetz-gateway demo.
This is a containerised sibling of ``tests/integration/mock_ollama.py``: it
emulates the subset of the Ollama HTTP API the gateway proxies (SPEC §6.1) so
the demo runs with **no GPU and no model downloads**. The response *shapes*
match real Ollama closely enough that the gateway's token counter, model
discovery (SPEC §4.6) and ``/api/show`` sanitisation all exercise real paths.
Endpoints emulated:
* ``GET /api/tags`` - model catalogue (size/digest/modified_at/details)
* ``POST /api/chat`` - NDJSON streaming (default) or single JSON
* ``POST /api/generate`` - NDJSON streaming (default) or single JSON
* ``POST /api/embed`` - newer batch embeddings (field ``embeddings``)
* ``POST /api/embeddings``- legacy single-vector embeddings (field ``embedding``)
* ``POST /api/show`` - returns template/system so the gateway can prove it
strips them
* ``GET /api/version`` - plausible upstream version
The terminal NDJSON object of every chat/generate response carries realistic
``prompt_eval_count`` + ``eval_count`` (and sibling duration fields) so the
gateway counts tokens for real. Reply text is ``"Echo: <prompt>"``.
Runs uvicorn on :11434 as a non-root user inside the container.
"""
from __future__ import annotations
import hashlib
import json
import os
from collections.abc import AsyncIterator, Iterable
from datetime import UTC, datetime
from typing import Any
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
# Media type used for the streaming chat/generate responses.
NDJSON_MEDIA_TYPE = "application/x-ndjson"
# A small, realistic catalogue. Sizes/digests are plausible but fixed so the
# demo is fully deterministic.
# Each entry provides: name, family, parameter_size, quantization_level and
# size; the values are served by /api/tags and reused by _details_for().
MODELS: tuple[dict[str, Any], ...] = (
    {
        "name": "llama3.1:8b",
        "family": "llama",
        "parameter_size": "8.0B",
        "quantization_level": "Q4_0",
        "size": 4_661_211_808,
    },
    {
        "name": "mistral:7b",
        "family": "llama",
        "parameter_size": "7.2B",
        "quantization_level": "Q4_0",
        "size": 4_109_865_159,
    },
    {
        "name": "qwen2.5:3b",
        "family": "qwen2",
        "parameter_size": "3.1B",
        "quantization_level": "Q4_K_M",
        "size": 1_929_889_677,
    },
    {
        "name": "nomic-embed-text",
        "family": "nomic-bert",
        "parameter_size": "137M",
        "quantization_level": "F16",
        "size": 274_302_450,
    },
)
def _now_iso() -> str:
return datetime.now(UTC).isoformat().replace("+00:00", "Z")
def _digest_for(name: str) -> str:
return "sha256:" + hashlib.sha256(name.encode("utf-8")).hexdigest()
def _details_for(name: str) -> dict[str, Any]:
    """Build the ``details`` object for a model name.

    Catalogue models (``MODELS``) report their own family, parameter size and
    quantization level. Any other name gets a generic entry: the family is the
    part of the name before the first ``:``, with ``"8B"`` / ``"Q4_0"`` as
    parameter size / quantization level.
    """
    known = next((entry for entry in MODELS if entry["name"] == name), None)
    if known is None:
        family = name.split(":", 1)[0]
        parameter_size = "8B"
        quantization_level = "Q4_0"
    else:
        family = known["family"]
        parameter_size = known["parameter_size"]
        quantization_level = known["quantization_level"]
    return {
        "parent_model": "",
        "format": "gguf",
        "family": family,
        "families": [family],
        "parameter_size": parameter_size,
        "quantization_level": quantization_level,
    }
def _reply_for(prompt: str, override: str | None) -> str:
if override is not None:
return override
if not prompt:
return "Hello from the mock Ollama backend."
return f"Echo: {prompt}"
def _tokenize(text: str) -> list[str]:
return text.split()
def _final_metrics(prompt_tokens: int, completion_tokens: int) -> dict[str, Any]:
"""Timing/usage fields Ollama attaches to the terminal stream object."""
return {
"total_duration": 1_234_567_890,
"load_duration": 12_345_678,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": 23_456_789,
"eval_count": completion_tokens,
"eval_duration": 34_567_890,
}
def _chat_chunk(
    model: str,
    *,
    content: str,
    done: bool,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
) -> dict[str, Any]:
    """Build one ``/api/chat`` response object.

    Every object carries the model, a timestamp, an assistant message and the
    ``done`` flag. When ``done`` is true it additionally carries
    ``done_reason`` and the final usage/timing metrics.
    """
    chunk: dict[str, Any] = {
        "model": model,
        "created_at": _now_iso(),
        "message": {"role": "assistant", "content": content},
        "done": done,
    }
    if not done:
        return chunk
    return {
        **chunk,
        "done_reason": "stop",
        **_final_metrics(prompt_tokens, completion_tokens),
    }
def _generate_chunk(
    model: str,
    *,
    response: str,
    done: bool,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
) -> dict[str, Any]:
    """Build one ``/api/generate`` response object.

    Every object carries the model, a timestamp, the ``response`` text and the
    ``done`` flag. When ``done`` is true it additionally carries
    ``done_reason``, a fixed ``context`` list and the final usage/timing
    metrics.
    """
    chunk: dict[str, Any] = {
        "model": model,
        "created_at": _now_iso(),
        "response": response,
        "done": done,
    }
    if not done:
        return chunk
    return {
        **chunk,
        "done_reason": "stop",
        "context": [1, 2, 3],
        **_final_metrics(prompt_tokens, completion_tokens),
    }
async def _ndjson_stream(objects: Iterable[dict[str, Any]]) -> AsyncIterator[bytes]:
for obj in objects:
yield (json.dumps(obj) + "\n").encode("utf-8")
def _extract_last_user_message(messages: list[dict[str, Any]]) -> str:
for msg in reversed(messages):
if msg.get("role") == "user":
content = msg.get("content", "")
return content if isinstance(content, str) else ""
return ""
def create_app() -> FastAPI:
    """Build the mock-Ollama FastAPI application.

    Registers the emulated Ollama routes (``/api/chat``, ``/api/generate``,
    ``/api/embed``, ``/api/embeddings``, ``/api/tags``, ``/api/show``,
    ``/api/version``) plus ``/healthz``. The interactive documentation routes
    are disabled (``docs_url=None``, ``redoc_url=None``).

    Returns:
        The configured ``FastAPI`` instance.
    """
    app = FastAPI(title="mock-ollama", docs_url=None, redoc_url=None)

    def _text_or_empty(value: Any) -> str:
        # Request bodies are untrusted JSON: a ``null`` or non-string prompt is
        # treated as an empty prompt (as the chat path does for non-string
        # message content) instead of crashing the tokenizer with a 500.
        return value if isinstance(value, str) else ""

    @app.post("/api/chat")
    async def chat(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "llama3.1:8b")
        stream: bool = body.get("stream", True)
        reply_override: str | None = body.get("reply_text")
        # ``"messages": null`` (or any non-list) counts as an empty conversation.
        messages = body.get("messages", [])
        if not isinstance(messages, list):
            messages = []
        prompt = _extract_last_user_message(messages)
        reply = _reply_for(prompt, reply_override)
        prompt_tokens = len(_tokenize(prompt))
        completion_tokens = len(_tokenize(reply))
        if not stream:
            return JSONResponse(
                _chat_chunk(
                    model,
                    content=reply,
                    done=True,
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                )
            )
        # An empty reply still produces one (empty) content frame.
        words = _tokenize(reply) or [""]

        def chunks() -> list[dict[str, Any]]:
            out: list[dict[str, Any]] = []
            for i, word in enumerate(words):
                # Re-insert the separating space on every word but the first.
                piece = word if i == 0 else f" {word}"
                out.append(_chat_chunk(model, content=piece, done=False))
            # Terminal frame: empty content plus the usage/timing metrics.
            out.append(
                _chat_chunk(
                    model,
                    content="",
                    done=True,
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                )
            )
            return out

        return StreamingResponse(_ndjson_stream(chunks()), media_type=NDJSON_MEDIA_TYPE)

    @app.post("/api/generate")
    async def generate(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "llama3.1:8b")
        stream: bool = body.get("stream", True)
        prompt = _text_or_empty(body.get("prompt", ""))
        reply = _reply_for(prompt, body.get("reply_text"))
        prompt_tokens = len(_tokenize(prompt))
        completion_tokens = len(_tokenize(reply))
        if not stream:
            return JSONResponse(
                _generate_chunk(
                    model,
                    response=reply,
                    done=True,
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                )
            )
        # An empty reply still produces one (empty) response frame.
        words = _tokenize(reply) or [""]

        def chunks() -> list[dict[str, Any]]:
            out: list[dict[str, Any]] = []
            for i, word in enumerate(words):
                # Re-insert the separating space on every word but the first.
                piece = word if i == 0 else f" {word}"
                out.append(_generate_chunk(model, response=piece, done=False))
            # Terminal frame: empty response plus the usage/timing metrics.
            out.append(
                _generate_chunk(
                    model,
                    response="",
                    done=True,
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                )
            )
            return out

        return StreamingResponse(_ndjson_stream(chunks()), media_type=NDJSON_MEDIA_TYPE)

    @app.post("/api/embed")
    async def embed(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "nomic-embed-text")
        inp = body.get("input", "")
        # ``input`` may be a single value or a list; normalise to a list.
        items = inp if isinstance(inp, list) else [inp]
        prompt_tokens = sum(len(_tokenize(str(i))) for i in items)
        return JSONResponse(
            {
                "model": model,
                # One fixed vector per input item.
                "embeddings": [[0.0, 0.1, 0.2, 0.3] for _ in items],
                "total_duration": 1_111_111,
                "load_duration": 222_222,
                "prompt_eval_count": prompt_tokens,
            }
        )

    @app.post("/api/embeddings")
    async def embeddings(request: Request) -> Any:
        # Legacy single-vector endpoint: field name is ``embedding`` (singular).
        body: dict[str, Any] = await request.json()
        prompt = _text_or_empty(body.get("prompt", ""))
        prompt_tokens = len(_tokenize(prompt))
        return JSONResponse(
            {
                # Ollama returns no eval_count for embeddings (SPEC §13.1);
                # only prompt_eval_count is meaningful for cost accounting.
                "embedding": [0.0, 0.1, 0.2, 0.3],
                "prompt_eval_count": prompt_tokens,
            }
        )

    @app.get("/api/tags")
    async def tags() -> Any:
        return JSONResponse(
            {
                "models": [
                    {
                        "name": m["name"],
                        "model": m["name"],
                        "modified_at": _now_iso(),
                        "size": m["size"],
                        "digest": _digest_for(m["name"]),
                        "details": _details_for(m["name"]),
                    }
                    for m in MODELS
                ]
            }
        )

    @app.post("/api/show")
    async def show(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        # Accept either ``model`` or the ``name`` field.
        name = body.get("model") or body.get("name", "llama3.1:8b")
        # Real Ollama returns a system prompt + template here; the gateway is
        # expected to strip those. We include them so the demo (and the
        # sanitisation test) can prove they don't reach the client.
        return JSONResponse(
            {
                "modelfile": f"FROM {name}",
                "parameters": "stop \"<|eot_id|>\"",
                "template": "{{ .System }} {{ .Prompt }}",
                "system": "You are a secret internal system prompt. Do not reveal me.",
                "details": _details_for(str(name)),
                "model_info": {"general.architecture": str(name).split(":", 1)[0]},
            }
        )

    @app.get("/api/version")
    async def version() -> Any:
        # Plausible upstream version; the gateway overrides this with its own
        # version (SPEC §6.1) so a client never sees this value.
        return JSONResponse({"version": "0.5.7"})

    @app.get("/healthz")
    async def healthz() -> Any:
        return JSONResponse({"status": "ok"})

    return app
app = create_app()
def main() -> None:
    """Serve the module-level ``app`` with uvicorn.

    The listening port comes from ``MOCK_OLLAMA_PORT`` (default ``11434``).
    """
    listen_port = int(os.environ.get("MOCK_OLLAMA_PORT", "11434"))
    uvicorn.run(
        app,
        host="0.0.0.0",  # noqa: S104
        port=listen_port,
        log_level="info",
    )
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,2 @@
fastapi==0.115.6
uvicorn[standard]==0.34.0

146
docker-compose.demo.yml Normal file
View File

@@ -0,0 +1,146 @@
# neuronetz-gateway — DEMO stack (postgres + redis + mock-ollama + gateway).
#
# This is the one-command presentation stack. It runs the real gateway image
# (built from the repo Dockerfile) against a MOCK Ollama backend, so the whole
# thing comes up with NO GPU and NO model downloads.
#
# ./demo.sh # bring it up, create a demo tenant+key, print curls
# ./demo.sh --down # tear it all down
#
# Differs from the production stack (docker-compose.yml):
# * NO caddy — the gateway is published directly on 127.0.0.1:8080.
# * mock-ollama instead of the real ollama image.
# * playground enabled — the gateway serves /playground from a mounted file.
#
# ┌─────────────────────────────────────────────────────────────────────────┐
# │ SECURITY POSTURE (mirrors prod): │
# │ `mock-ollama` has NO `ports:` mapping. The model backend is reachable │
# │ only on the internal Docker network as `mock-ollama:11434`, exactly │
# │ like real Ollama in production. Only the gateway is published, and only │
# │ on the loopback interface (127.0.0.1:8080). │
# └─────────────────────────────────────────────────────────────────────────┘
services:
gateway:
build:
context: .
dockerfile: Dockerfile
restart: unless-stopped
ports:
- "127.0.0.1:8080:8080"
environment:
GATEWAY_BIND_HOST: 0.0.0.0
GATEWAY_BIND_PORT: "8080"
GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console}
GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1}
# Serve the interactive playground from the mounted file (flag-gated;
# OFF by default in prod). See playground/index.html.
PLAYGROUND_ENABLED: "true"
PLAYGROUND_FILE: /app/playground/index.html
# Point the gateway at the mock Ollama on the internal network.
OLLAMA_BASE_URL: http://mock-ollama:11434
OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
# Discover models quickly so the demo feels live.
MODEL_DISCOVERY_REFRESH_S: ${MODEL_DISCOVERY_REFRESH_S:-15}
MODEL_DISCOVERY_CACHE_TTL_S: ${MODEL_DISCOVERY_CACHE_TTL_S:-60}
DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz}
DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
REDIS_URL: redis://redis:6379/0
REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
DEFAULT_RPM: ${DEFAULT_RPM:-60}
DEFAULT_TPM: ${DEFAULT_TPM:-100000}
DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
volumes:
# The gateway serves /playground by reading this file at request time.
# Read-only mount: the demo never lets the container modify it.
- ./playground:/app/playground:ro
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
mock-ollama:
condition: service_healthy
# Apply migrations, then start the server (mirrors docker-compose.dev.yml).
command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
healthcheck:
test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
interval: 10s
timeout: 3s
retries: 5
start_period: 30s
networks:
- internal
# ───────────────────────────────────────────────────────────────────────────
# mock-ollama — INTERNAL NETWORK ONLY. Stands in for the real Ollama backend.
# NO `ports:` mapping, mirroring the production "Ollama is never exposed" rule.
# Reachable only as `http://mock-ollama:11434` from the gateway container.
# ───────────────────────────────────────────────────────────────────────────
mock-ollama:
build:
context: ./demo/mock-ollama
dockerfile: Dockerfile
restart: unless-stopped
# !!! NO `ports:` — the model backend is never published. !!!
healthcheck:
test: ["CMD", "curl", "-fsS", "http://127.0.0.1:11434/api/version"]
interval: 10s
timeout: 3s
retries: 5
start_period: 5s
networks:
- internal
postgres:
image: postgres:16-alpine
restart: unless-stopped
environment:
POSTGRES_USER: ${POSTGRES_USER:-gateway}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway}
POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
volumes:
- postgres_demo_data:/var/lib/postgresql/data
# No `ports:` — Postgres is internal-only.
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
interval: 5s
timeout: 3s
retries: 10
networks:
- internal
redis:
image: redis:7-alpine
restart: unless-stopped
command: ["redis-server", "--save", "", "--appendonly", "no"]
# No `ports:` — Redis is internal-only.
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 5s
timeout: 3s
retries: 10
networks:
- internal
networks:
# Private bridge network for inter-service traffic. Only services that declare
# a `ports:` mapping (here: the gateway, on 127.0.0.1:8080) are published on
# the host.
internal:
driver: bridge
volumes:
postgres_demo_data:

253
docs/API.md Normal file
View File

@@ -0,0 +1,253 @@
# neuronetz-gateway — API Reference
The gateway exposes two compatible API surfaces in front of the Ollama backend:
- **Native Ollama** under `/api/*` — NDJSON streaming, identical request shapes to Ollama.
- **OpenAI-compatible** under `/v1/*` — SSE streaming, drop-in for the OpenAI SDKs.
Plus unauthenticated health endpoints. Everything else is blocked.
> Source of truth: [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §6. Where this doc and the
> SPEC disagree, the SPEC wins.
---
## Authentication
Every model endpoint requires an API key as a Bearer token:
```
Authorization: Bearer <12-char prefix, beginning with nz_><32-char random>
```
- **Key format:** `nz_` namespace + random base62 body. The first 12 characters
(`nz_` + entropy) are the **prefix**, stored in cleartext and indexed for O(1) lookup.
The full key is **argon2id**-hashed; it is shown **exactly once** at creation
(`neuronetz-gateway create-key`) and never stored or logged.
- **Fail-closed:** a missing, malformed, expired, disabled, or revoked key returns **401**.
No upstream/Ollama detail is ever leaked in the error.
- Health endpoints (`/healthz`, `/readyz`) require **no** auth.
The placeholder key `nz_demoKEY...` is used throughout this doc. `./demo.sh` prints a
**real** key for the local demo.
---
## Response headers (SPEC §6.5)
Every proxied response carries:
| Header | Meaning |
|---|---|
| `X-Request-ID` | Correlates the response with the audit log row. Present on errors too. |
| `X-RateLimit-Limit-Requests` | Effective RPM limit for this key/tenant. |
| `X-RateLimit-Remaining-Requests` | Requests remaining in the current window. |
| `X-RateLimit-Limit-Tokens` | Effective TPM limit. |
| `X-RateLimit-Remaining-Tokens` | Tokens remaining in the current window. |
| `X-Budget-Period` | `day` \| `month` \| `total` — the binding budget period. |
| `X-Budget-Tokens-Remaining` | Tokens left in the binding budget period. |
`429 Too Many Requests` responses additionally carry `Retry-After: <seconds>`.
---
## Error model
Errors are **sanitized** at the gateway boundary — Ollama internals are never reflected.
The body is a small generic JSON object and the `X-Request-ID` header ties it to the audit log.
```json
{ "error": { "message": "forbidden", "type": "forbidden", "code": 403 }, "request_id": "b3f1…" }
```
| Status | When |
|---|---|
| `400` | Malformed body, schema violation, or `num_predict` over the cap. |
| `401` | Missing / invalid / expired / revoked key. |
| `403` | Endpoint hard-blocked, or model outside the tenant's effective set (no existence disclosure). |
| `413` | Request body over `MAX_REQUEST_BODY_BYTES` (default 256 KiB). |
| `429` | Rate limit or budget exceeded (carries `Retry-After`). |
| `502` | Ollama upstream unreachable / circuit breaker open. |
| `503` | A required subsystem (Postgres read, Redis) is down — **fail-closed**, never "allow". |
A model that is *installed-but-unpermitted* and a model that is *not installed* return the
**same** generic `403`, to prevent enumeration (SPEC §13.6).
---
## Native Ollama endpoints (`/api/*`)
### `POST /api/chat`
Streamed (NDJSON, default) or non-streamed chat completion.
```bash
curl -N http://localhost:8080/api/chat \
-H "Authorization: Bearer nz_demoKEY..." \
-H "Content-Type: application/json" \
-d '{"model":"llama3.1:8b","stream":true,
"messages":[{"role":"user","content":"Say hello in one sentence."}]}'
```
**Streaming response** — `Content-Type: application/x-ndjson`, one JSON object per line:
```
{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":"Echo:"},"done":false}
{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":" Say"},"done":false}
{"model":"llama3.1:8b","done":true,"done_reason":"stop",
"prompt_eval_count":6,"eval_count":7,"total_duration":1234567890,"eval_duration":34567890}
```
The **final** object carries `prompt_eval_count` (tokens in) and `eval_count` (tokens out);
the gateway uses these for precise token accounting (SPEC §4.3 step 12).
**Non-streaming** (`"stream": false`) returns a single JSON object of the same shape with
`"done": true`.
### `POST /api/generate`
Same semantics as `/api/chat` but uses a flat `prompt` string and returns `response`
fields instead of `message` objects.
```bash
curl -N http://localhost:8080/api/generate \
-H "Authorization: Bearer nz_demoKEY..." \
-H "Content-Type: application/json" \
-d '{"model":"llama3.1:8b","stream":true,"prompt":"Write a haiku about routers."}'
```
### `POST /api/embed` / `POST /api/embeddings`
Non-streamed embeddings. `/api/embed` is the newer batch endpoint (field `embeddings`,
a list of vectors); `/api/embeddings` is the legacy single-vector endpoint (field
`embedding`). Ollama returns no `eval_count` for embeddings; cost is charged on
`prompt_eval_count` only (SPEC §13.1).
```bash
curl http://localhost:8080/api/embed \
-H "Authorization: Bearer nz_demoKEY..." \
-H "Content-Type: application/json" \
-d '{"model":"nomic-embed-text","input":["hello","world"]}'
```
```json
{ "model": "nomic-embed-text", "embeddings": [[0.0, 0.1, …], [0.0, 0.1, …]], "prompt_eval_count": 2 }
```
### `GET /api/tags`
Returns the tenant's **effective** model set — the live-discovered set intersected with the
tenant's allowlist, or *all* discovered models when `allow_all_models` is on. Sourced from
discovery (SPEC §4.6), never a static list.
```bash
curl http://localhost:8080/api/tags -H "Authorization: Bearer nz_demoKEY..."
```
### `POST /api/show`
Allowed only for models in the effective set; returns **sanitized** model info.
The system prompt and template that Ollama returns are **stripped** by the gateway.
### `GET /api/version`
Returns the **gateway** version, not the Ollama version.
```json
{ "version": "0.1.0" }
```
---
## Hard-blocked endpoints (always `403`)
These model-mutating endpoints are blocked at the gateway. **Not configurable, not behind a
flag** (SPEC §6.2, AGENT_PROMPT non-negotiable #5):
```
/api/pull /api/push /api/create /api/copy /api/delete /api/blobs/*
```
```bash
# Always 403, even with a valid key:
curl -i http://localhost:8080/api/pull \
-H "Authorization: Bearer nz_demoKEY..." \
-H "Content-Type: application/json" -d '{"model":"llama3.1:8b"}'
```
`GET /api/ps` is also blocked (it would leak which models are loaded).
---
## OpenAI-compatible endpoints (`/v1/*`)
| Path | Method | Maps to |
|---|---|---|
| `/v1/chat/completions` | POST | `/api/chat` |
| `/v1/completions` | POST | `/api/generate` |
| `/v1/embeddings` | POST | `/api/embed` |
| `/v1/models` | GET | `/api/tags` (effective set, OpenAI list format) |
Streaming uses **SSE**: `data: {…}\n\n` events terminated by a literal `data: [DONE]\n\n`.
### `POST /v1/chat/completions`
```bash
curl -N http://localhost:8080/v1/chat/completions \
-H "Authorization: Bearer nz_demoKEY..." \
-H "Content-Type: application/json" \
-d '{"model":"llama3.1:8b","stream":true,
"messages":[{"role":"user","content":"Say hello in one sentence."}]}'
```
**Streaming response** — `Content-Type: text/event-stream`:
```
data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"Echo:"},"finish_reason":null}]}
data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":" Say"},"finish_reason":null}]}
data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":6,"completion_tokens":7,"total_tokens":13}}
data: [DONE]
```
Works with the OpenAI Python SDK by pointing `base_url` at `http://localhost:8080/v1`.
### `GET /v1/models`
```bash
curl http://localhost:8080/v1/models -H "Authorization: Bearer nz_demoKEY..."
```
```json
{ "object": "list", "data": [
{ "id": "llama3.1:8b", "object": "model", "owned_by": "neuronetz" },
{ "id": "mistral:7b", "object": "model", "owned_by": "neuronetz" }
] }
```
---
## Health endpoints
| Path | Method | Auth | Purpose |
|---|---|---|---|
| `/healthz` | GET | none | Liveness — process responsive (`200`). |
| `/readyz` | GET | none | Readiness — DB + Redis + Ollama reachable, else `503`. |
| `/metrics` | GET | none (loopback only) | Prometheus exposition. |
```bash
curl -i http://localhost:8080/healthz # 200 {"status":"ok"}
curl -i http://localhost:8080/readyz # 200 when all deps up; 503 otherwise
```
---
## Quick reference: streaming formats
| Surface | Content-Type | Frame | Terminator |
|---|---|---|---|
| Native `/api/*` | `application/x-ndjson` | one JSON object per `\n` | final object has `"done": true` |
| OpenAI `/v1/*` | `text/event-stream` | `data: {…}\n\n` | `data: [DONE]\n\n` |

168
docs/ARCHITECTURE.md Normal file
View File

@@ -0,0 +1,168 @@
# neuronetz-gateway — Architecture
Distilled from [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §4. The SPEC is the source of truth.
The gateway is the **hot path** of the Neuronetz API: a secure, multi-tenant proxy in front
of an Ollama instance. The Ollama backend must never be reachable directly from the public
internet — all access flows through this gateway. Administration (dashboards, tenant
self-service) lives in a separate service, `neuronetz-console`, and is out of scope here.
---
## Component diagram (SPEC §4.1)
```
           Internet
               │ TLS
    ┌──────────────────────┐
    │  Caddy (sidecar)     │  Let's Encrypt for api.neuronetz.ai
    │  - TLS termination   │  HSTS, security headers
    │  - HTTP/2, HTTP/3    │
    └──────────┬───────────┘
               │ HTTP/1.1 internal
    ┌──────────▼───────────┐
    │  neuronetz-gateway   │  FastAPI + uvicorn
    │  - authn             │
    │  - rate limit        │
    │  - budget check      │
    │  - proxy + stream    │
    │  - token count       │
    │  - audit write       │
    └──┬────────┬──────┬───┘
       │        │      │
┌──────▼──┐  ┌──▼───┐  │
│Postgres │  │Redis │  │
│ schema: │  │ keys │  │
│ gateway │  │bucket│  │
└─────────┘  └──────┘  │
                       │ internal network only
                ┌──────▼──────┐
                │   Ollama    │
                │  127.0.0.1  │
                └─────────────┘
Same Compose stack also hosts (separate from this SPEC):
- neuronetz-console (PHP/Nibiru) → reads schema `console`, reads schema `gateway` (SELECT)
```
Only **Caddy** publishes ports. Postgres, Redis and (critically) **Ollama** have no
published ports and are reachable only on the internal Docker network.
---
## Database schemas (SPEC §4.2)
A single Postgres instance with two schemas:
- **`gateway`** — owned by this service; full DDL. Tables: `tenants`, `tenant_limits`,
`api_keys`, `key_limits`, `budget_usage`, `audit_log`, `prompt_log`, `revocations`
(see SPEC §5 for the full DDL).
- **`console`** — owned by `neuronetz-console` (out of scope). The console role gets
`SELECT` on all `gateway.*` tables and `INSERT` on `gateway.revocations` only.
If the console needs to mutate gateway state (e.g. revoke a key), it does so by inserting
into the `gateway.revocations` **outbox** table, which the gateway tails (see Revocation below).
**Limit inheritance:** limits and budgets resolve key → tenant. A `NULL` key-level value
inherits the tenant value. For `allow_all_models`, a non-`NULL` key value overrides the
tenant flag; otherwise the tenant flag applies (SPEC §13.7).
---
## Request lifecycle (SPEC §4.3)
1. Caddy terminates TLS and forwards to the gateway on the internal port.
2. Middleware extracts `Authorization: Bearer <key>`.
3. The 12-char prefix is the Redis cache key. On miss, look up `gateway.api_keys` by prefix,
verify the full key with argon2id, and cache resolved metadata in Redis (TTL 60 s).
4. **Rate limit** check — sliding window in Redis (Lua-atomic): per-key RPM + per-tenant RPM.
5. **Budget** check — Redis counter for the current period; Postgres ledger is the source of
truth on reset.
6. **Concurrency** semaphore — Redis `INCR` with TTL.
7. **Model allowlist** check — resolve the effective set (see below); the request `model`
must be in it, else a generic `403`.
8. **Endpoint allowlist** check — mutating endpoints are hard-blocked.
9. **Body validation** — size, schema, `num_predict` cap.
10. If an OpenAI-compat path, translate the request to the Ollama schema.
11. Open an httpx async stream to Ollama.
12. Stream the response back to the client, accumulating the final `prompt_eval_count` +
`eval_count`.
13. On stream close: write the `gateway.audit_log` row; decrement the budget; release the
semaphore; if prompt logging is enabled, write `gateway.prompt_log`.
14. On any failure: sanitized error to the client, audit row with the status code, semaphore
released.
**Streaming integrity:** token counting and the audit write happen **after** stream close,
never on the hot path — time-to-first-byte is not degraded by bookkeeping (SPEC §9).
---
## Model discovery (SPEC §4.6)
The set of usable models is **never hand-maintained**; it is extracted live from Ollama.
- A background task (started in the app lifespan, alongside the revocation listener) polls
Ollama `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds.
- The parsed set (names + sanitized metadata: family, parameter size, quantization, size,
modified-at) is cached in Redis under `gateway:models:discovered` with TTL
`MODEL_DISCOVERY_CACHE_TTL_S`, and held in-process for hot reads on the request path.
- An initial fetch runs at startup; if Ollama is unreachable the discovered set is empty.
- **Fail-closed:** an empty or expired-and-unrefreshable discovered set means *no model
resolves* and requests are denied. Discovery never opens access on failure.
- **Auto-grant:** because the effective set intersects with `discovered` (or *is*
`discovered` when `allow_all_models`), a model pulled into Ollama out-of-band becomes
usable to `allow_all` tenants on the next refresh — no per-tenant config change.
- Discovery is **read-only** against Ollama and uses only the allowlisted `/api/tags`
endpoint; it never triggers a model pull.
### Effective-set resolution (SPEC §4.3 step 7)
```
allow_all := key.allow_all_models ?? tenant.allow_all_models
effective := discovered                                                   if allow_all
             (key.allowed_models ?? tenant.allowed_models) ∩ discovered   otherwise
```
`/api/tags` and `/v1/models` return exactly this effective set, so the listing never reveals
models outside the tenant's reach. A model that is installed-but-unpermitted and one that is
not installed both return the same generic `403` — no existence disclosure (SPEC §13.6).
---
## Failure modes — fail-closed (SPEC §4.4)
| Subsystem | If down | Behavior |
|---|---|---|
| Postgres (read) | Key lookup fails | `503` with retry-after; nothing proxied. |
| Postgres (write) | Audit write fails | Request still succeeds; the audit row is buffered in an in-memory ring (max 1000 entries) and drained on recovery; if the buffer fills, the gateway switches to deny mode. |
| Redis | Rate limit / budget unavailable | `503` — fail closed. Never "allow because we can't check." |
| Ollama | Upstream unreachable | `502` with retry-after; circuit breaker opens after 5 consecutive failures, half-open after 30 s. |
| Caddy | Not a gateway concern | — |
The governing rule (AGENT_PROMPT non-negotiable #1): **if a security or budgeting check
cannot be performed, deny.** Never default to allow.
---
## Cache invalidation / key revocation (SPEC §4.5)
The console revokes a key by inserting into `gateway.revocations(key_id, ts, reason)`.
A background task in the gateway lifespan:
- `LISTEN`s on the Postgres channel `key_revoked` (the gateway emits `NOTIFY` on its own
write path; the console's INSERT fires a trigger that emits it).
- On notification, evicts the Redis cache entry for that key's prefix.
This makes revocation effectively immediate (one Postgres `NOTIFY` delivery plus a Redis round trip) with no cross-service HTTP.
---
## Observability
- **Structured logs** (structlog), JSON in production. Secrets/keys are never logged.
- **Prometheus** `/metrics` (loopback only): `gateway_requests_total{tenant,model,status}`,
`gateway_tokens_total{tenant,model,direction}`,
`gateway_request_duration_seconds{tenant,model}` (histogram). Labelled by `tenant`, never
by `key_id` (cardinality — SPEC §13.3); per-key data lives in Postgres.
- **Audit log** — always-on request metadata. **Prompt log** — opt-in per key, TTL'd.

188
docs/DEPLOYMENT.md Normal file
View File

@@ -0,0 +1,188 @@
# neuronetz-gateway — Deployment
Production deployment is a single Docker Compose stack: **Caddy + gateway + Postgres + Redis
+ Ollama**. Caddy is the only public-facing component; it terminates TLS via Let's Encrypt
for `api.neuronetz.ai` and reverse-proxies to the internal-only gateway.
> For the local, no-GPU demo (mock Ollama + playground), see [`PLAYGROUND.md`](PLAYGROUND.md)
> and run `./demo.sh`. This document is the **production** path.
---
## The one rule that must never break
> ## ⛔ Ollama is NEVER exposed to the host or the internet.
>
> The `ollama` service in `docker-compose.yml` has **no `ports:` mapping** and must never
> get one. Ollama is reachable only on the internal Docker network as `ollama:11434`.
> Publishing it would re-open the exact unauthenticated exposure this whole project exists
> to close (SPEC §1, §3; AGENT_PROMPT non-negotiable #2).
The same posture applies to **Postgres** and **Redis** in the production compose file — no
published ports. Only **Caddy** binds host ports (80/443, 443/udp for HTTP/3).
---
## Prerequisites
- A host with Docker + Docker Compose.
- DNS: `api.neuronetz.ai` → the host's public IP (for Let's Encrypt).
- Ports 80 and 443 reachable from the internet (ACME HTTP/TLS challenge + serving).
---
## Steps
```bash
git clone <repo> neuronetz-gateway && cd neuronetz-gateway
# 1. Configure. Copy the example env and change EVERY secret.
cp .env.example .env
# - POSTGRES_PASSWORD: a strong, unique value
# - DATABASE_URL: must match the POSTGRES_* values
# - GATEWAY_LOG_FORMAT=json for production
# 2. Configure Caddy for your domain + ACME email.
cp ops/caddy/Caddyfile.example ops/caddy/Caddyfile # then edit the site + email
# (docker-compose.yml mounts Caddyfile.example by default; point it at your edited file
# or edit in place.)
# 3. Bring up the full stack. The gateway runs `alembic upgrade head`, then serves.
docker compose up -d --build
# 4. Bootstrap a tenant + key (CLI runs inside the gateway container).
docker compose exec gateway neuronetz-gateway create-tenant --name acme --rpm 120 --tpm 200000
docker compose exec gateway neuronetz-gateway create-key --tenant acme --name prod-server-1
# ^ prints the full key ONCE — store it in your secret manager now.
# 5. Smoke test (through Caddy / TLS).
curl https://api.neuronetz.ai/healthz
curl -N https://api.neuronetz.ai/v1/chat/completions \
-H "Authorization: Bearer nz_…" -H "Content-Type: application/json" \
-d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"hi"}]}'
```
Caddy obtains and renews the certificate automatically. For local testing without a public
domain, use the `localhost { tls internal … }` block documented in `Caddyfile.example`
(trust Caddy's local CA or pass `-k` to curl).
---
## Pointing at a real Ollama backend
The gateway reaches Ollama via `OLLAMA_BASE_URL`. In the bundled stack this is the in-stack
`ollama` service: `OLLAMA_BASE_URL=http://ollama:11434`.
To use an **existing/external** Ollama host instead:
1. Remove the `ollama` service from `docker-compose.yml` (or leave it; it just won't be used).
2. Set `OLLAMA_BASE_URL` to the backend address reachable from the gateway container, e.g.
`http://10.0.0.5:11434` or an internal DNS name.
3. Ensure that backend is itself **not** exposed to the internet — the gateway is the only
thing that should ever reach it. Use a private network / firewall rule, not a public port.
4. Pull the models you want available on that backend. They appear in tenants' effective sets
automatically on the next discovery refresh (SPEC §4.6) — no gateway config change for
`allow_all_models` tenants.
Discovery polls `OLLAMA_BASE_URL/api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds. If the
backend is unreachable, the discovered set is empty and requests **fail closed**.
---
## Environment reference (SPEC §7)
All configuration is via environment variables, validated by Pydantic Settings on boot. Boot
**fails loudly** on invalid config. See [`.env.example`](../.env.example) for a copyable file.
### Service
| Var | Default | Notes |
|---|---|---|
| `GATEWAY_BIND_HOST` | `0.0.0.0` | Bind-all inside the container. |
| `GATEWAY_BIND_PORT` | `8080` | Internal port; never published directly in prod. |
| `GATEWAY_LOG_LEVEL` | `INFO` | |
| `GATEWAY_LOG_FORMAT` | `json` | `json` in prod, `console` for local dev. |
| `GATEWAY_REQUEST_ID_HEADER` | `X-Request-ID` | |
| `GATEWAY_TRUSTED_PROXIES` | `127.0.0.1,caddy` | Sources trusted for `X-Forwarded-For`. |
### Upstream (Ollama)
| Var | Default | Notes |
|---|---|---|
| `OLLAMA_BASE_URL` | `http://ollama:11434` | Internal address of the backend. |
| `OLLAMA_CONNECT_TIMEOUT_S` | `5` | |
| `OLLAMA_READ_TIMEOUT_S` | `600` | Long, for slow generations. |
| `OLLAMA_MAX_CONNECTIONS` | `64` | httpx pool size. |
### Model discovery (§4.6)
| Var | Default | Notes |
|---|---|---|
| `MODEL_DISCOVERY_REFRESH_S` | `60` | How often to re-query `/api/tags`. |
| `MODEL_DISCOVERY_CACHE_TTL_S` | `120` | Redis TTL for the discovered set. |
### Database
| Var | Default | Notes |
|---|---|---|
| `DATABASE_URL` | `postgresql+asyncpg://…` | asyncpg driver. |
| `DATABASE_POOL_SIZE` | `10` | |
| `DATABASE_POOL_OVERFLOW` | `20` | |
### Redis
| Var | Default | Notes |
|---|---|---|
| `REDIS_URL` | `redis://redis:6379/0` | |
| `REDIS_KEY_CACHE_TTL_S` | `60` | Resolved-key cache TTL. |
### Limits (defaults; per-tenant/key DB overrides win)
| Var | Default | Notes |
|---|---|---|
| `DEFAULT_RPM` | `60` | |
| `DEFAULT_TPM` | `100000` | |
| `DEFAULT_CONCURRENT` | `8` | |
| `MAX_REQUEST_BODY_BYTES` | `262144` | 256 KiB request cap. |
| `MAX_NUM_PREDICT` | `4096` | Hard cap on requested completion tokens. |
### Security
| Var | Default | Notes |
|---|---|---|
| `ARGON2_TIME_COST` | `3` | |
| `ARGON2_MEMORY_COST_KIB` | `65536` | 64 MiB. |
| `ARGON2_PARALLELISM` | `4` | |
| `AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN` | `20` | Throttles auth brute-force per source IP. |
### Audit
| Var | Default | Notes |
|---|---|---|
| `AUDIT_BUFFER_SIZE` | `1000` | Ring buffer; full ⇒ deny mode. |
| `PROMPT_LOG_DEFAULT_RETENTION_DAYS` | `30` | |
| `AUDIT_LOG_DEFAULT_RETENTION_DAYS` | `365` | |
---
## TLS & security headers (Caddy)
`ops/caddy/Caddyfile.example` already sets:
- **HSTS** `max-age=63072000; includeSubDomains; preload`
- `X-Content-Type-Options: nosniff`
- `X-Frame-Options: DENY`
- `Referrer-Policy: no-referrer`
- strips `Server` and `X-Powered-By`
Edit the site address and ACME `email` before deploying.
---
## Non-Compose (systemd)
A systemd unit is provided for hosts that run the image directly (`ops/systemd/`). The
gateway still requires reachable Postgres, Redis, and Ollama, and the same environment
variables. TLS in that topology is whatever fronts the host (Caddy, nginx, a load balancer) —
**Ollama still must not be publicly reachable.**
---
## Upgrades & migrations
The gateway runs `alembic upgrade head` on container start, so a normal
`docker compose up -d --build` after pulling a new version applies pending migrations. For
zero-downtime upgrades, run migrations as a one-off
(`docker compose run --rm gateway alembic upgrade head`) before rolling the service.

172
docs/OPERATIONS.md Normal file
View File

@@ -0,0 +1,172 @@
# neuronetz-gateway — Operations Runbook
Day-2 operations for the gateway: managing tenants and keys, budgets, model policy, usage,
and the fail-closed behaviors you'll encounter. All administration is via the **bootstrap
CLI** (SPEC §11), run inside the gateway container. There are no admin HTTP endpoints in the
gateway (that's `neuronetz-console`'s job).
> Run the CLI inside the running container:
> ```bash
> docker compose exec gateway neuronetz-gateway <command> …
> ```
> In the demo stack, swap the compose file: `docker compose -f docker-compose.demo.yml exec gateway …`
---
## Keys
### Create a key
```bash
docker compose exec gateway neuronetz-gateway create-key --tenant acme --name prod-server-1
# optional: --scopes chat,embeddings (default: chat,embeddings)
```
The **full key is printed exactly once** in the form `nz_<prefix><secret>`. Store it
immediately in your secret manager — only its argon2id hash is stored, so it cannot be
recovered. Server-side, the gateway keeps just the 12-char `prefix` (for lookup) and that hash.
### List keys (never shows full keys)
```bash
docker compose exec gateway neuronetz-gateway list-keys --tenant acme
# prints: <prefix> status=active name='prod-server-1' created=…
```
### Revoke a key
```bash
docker compose exec gateway neuronetz-gateway revoke-key --prefix nz_abc12345
```
This sets the key status to `revoked` and writes the `gateway.revocations` outbox row. A
Postgres `NOTIFY` on channel `key_revoked` fires; the gateway evicts the key's Redis cache
entry, so revocation takes effect within ~1 second (SPEC §4.5) without restarting anything.
A subsequent request with that key returns **401**.
> The console (`neuronetz-console`) revokes keys the same way — by inserting into
> `gateway.revocations`. The trigger-driven NOTIFY makes it immediate without any
> cross-service HTTP call.
### Rotate a key
There is no in-place rotate. Rotate by: create a new key → deploy it to the client → verify
traffic on the new prefix → revoke the old prefix.
---
## Tenants & limits
### Create a tenant
```bash
docker compose exec gateway neuronetz-gateway create-tenant --name acme \
--rpm 120 --tpm 200000 --concurrent 8
# add --allow-all-models to opt into using any installed model (default: off)
```
Limits inherit **key → tenant**: a `NULL` key-level limit uses the tenant value.
---
## Budgets
Set per-key token budgets (any combination of daily / monthly / total):
```bash
docker compose exec gateway neuronetz-gateway set-budget --key nz_abc12345 \
--daily 1000000 --monthly 30000000 --total 500000000
```
- Budgets are enforced **fail-closed**: when the binding period hits zero remaining, requests
return **429** with a descriptive error and a `Retry-After` header. The binding period and
remaining balance are surfaced on every response via `X-Budget-Period` and
`X-Budget-Tokens-Remaining` (SPEC §6.5).
- Live counters live in Redis; the Postgres ledger (`gateway.budget_usage`) is the source of
truth on period rollover/reset.
---
## Model policy
### Set an explicit allowlist (default-deny)
```bash
docker compose exec gateway neuronetz-gateway set-models --tenant acme \
--models llama3.1:8b,mistral:7b
```
The tenant's **effective set** is `allowed_models ∩ discovered` — entries that aren't
actually installed on the backend silently never resolve. A request for a model outside the
effective set returns a generic **403** (same response as "doesn't exist" — no enumeration).
### Toggle `allow_all_models`
```bash
docker compose exec gateway neuronetz-gateway set-models --tenant acme --allow-all # opt in
docker compose exec gateway neuronetz-gateway set-models --tenant acme --no-allow-all # back to allowlist
```
With `allow_all_models` on, the effective set **is** the live discovered set — any model
pulled into Ollama becomes usable on the next discovery refresh, with no further config
change. This is an audited convenience; prefer explicit allowlists for untrusted tenants
(see [`THREAT_MODEL.md`](THREAT_MODEL.md)).
### Inspect discovery and effective sets
```bash
docker compose exec gateway neuronetz-gateway list-models # live-discovered models
docker compose exec gateway neuronetz-gateway list-models --tenant acme # + that tenant's effective set
```
---
## Usage
```bash
docker compose exec gateway neuronetz-gateway show-usage --tenant acme --period day
# prints: requests=… tokens_in=… tokens_out=… (period: day|month|total)
```
For per-key forensics and finer slicing, query `gateway.audit_log` directly (it records
`request_id`, `key_prefix`, `model`, `tokens_in/out`, `status`, `latency_ms`, `client_ip`).
---
## How model discovery refresh works (SPEC §4.6)
- A background task polls Ollama `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds and
caches the result in Redis (`gateway:models:discovered`, TTL `MODEL_DISCOVERY_CACHE_TTL_S`)
plus an in-process copy for hot reads.
- A model pulled into Ollama out-of-band appears in `allow_all_models` tenants' effective sets
within one refresh interval — no config change.
- Discovery is **read-only** and uses only the allowlisted `/api/tags` endpoint; it never
triggers a pull.
- To force a faster pickup, lower `MODEL_DISCOVERY_REFRESH_S` (the demo uses 15 s).
---
## Fail-closed behaviors to expect
| Symptom | Cause | Correct behavior |
|---|---|---|
| `503` on every request | Redis or Postgres-read down | Fail-closed — rate-limit/budget/auth can't be checked, so deny. Restore the backend. |
| `502` with retry-after | Ollama unreachable | Circuit breaker opens after 5 consecutive failures, half-opens after 30 s. Check the backend / `OLLAMA_BASE_URL`. |
| `403` for a model you "know" exists | Model not in the tenant's effective set, **or** discovery cache empty/expired | Check `list-models --tenant …`; verify the backend is reachable and the model is installed. Empty discovery = deny by design. |
| `429` with `Retry-After` | Rate limit or budget exhausted | Inspect headers (`X-RateLimit-*`, `X-Budget-*`); raise limits/budget or wait. |
| `401` immediately after revoke | Key was revoked; revocation propagated via NOTIFY → Redis eviction | Working as intended — no action needed. |
`/readyz` returns `503` when **any** dependency (DB, Redis, Ollama) is unreachable; use it as
the load-balancer health gate. `/healthz` only checks process liveness.
---
## Logs, metrics, audit
- **Logs:** structured (structlog), JSON in production, to stdout. Keys/secrets are never
logged.
- **Metrics:** Prometheus at `/metrics` (loopback only): `gateway_requests_total`,
`gateway_tokens_total`, `gateway_request_duration_seconds`, labelled by `tenant` and
`model` (never `key_id`).
- **Audit log:** always-on in `gateway.audit_log`. **Prompt log** is opt-in per key and TTL'd
(`PROMPT_LOG_DEFAULT_RETENTION_DAYS`); a sweeper enforces retention.

113
docs/PLAYGROUND.md Normal file
View File

@@ -0,0 +1,113 @@
# neuronetz-gateway — Demo & Playground
The fastest way to see the gateway working end-to-end, with **no GPU and no model downloads**.
`./demo.sh` brings up the gateway against a mock Ollama backend, mints a demo API key, and
prints ready-to-paste curl commands and a link to an interactive browser playground.
---
## Launch the demo
From the repo root:
```bash
./demo.sh
```
This will:
1. Build and start the demo stack (`docker-compose.demo.yml`): **postgres + redis +
mock-ollama + gateway**. No Caddy; the gateway is published on `127.0.0.1:8080`.
2. Wait for the gateway to report healthy at `/healthz`.
3. Create a demo tenant (`--allow-all-models`) and an API key via the bootstrap CLI **inside
the gateway container**, capturing the key (which is printed exactly once).
4. Print a summary: the **API key**, the **playground URL**
`http://localhost:8080/playground`, and five ready-to-paste curl commands —
- streaming `/v1/chat/completions` (OpenAI SSE),
- streaming `/api/chat` (native NDJSON),
- `GET /v1/models`,
- a **401** example (no/bad key),
- a **403** example (`POST /api/pull`, hard-blocked).
The script is **re-runnable**: an existing tenant is reused, and each run mints a fresh,
uniquely-named key (the full key only ever prints at creation).
Tear everything down (containers + volumes):
```bash
./demo.sh --down
```
### What's running
| Service | Exposed? | Notes |
|---|---|---|
| `gateway` | `127.0.0.1:8080` | The real gateway image, built from the repo `Dockerfile`. |
| `mock-ollama` | **no** | Internal network only — mirrors the prod "Ollama is never exposed" rule. |
| `postgres` | **no** | Internal only. |
| `redis` | **no** | Internal only. |
The mock backend (`demo/mock-ollama/`) emulates Ollama's API shapes — including realistic
`prompt_eval_count` / `eval_count` on the final stream object — so token counting, model
discovery, and `/api/show` sanitization all exercise real gateway code paths. It serves a
small catalogue: `llama3.1:8b`, `mistral:7b`, `qwen2.5:3b`, `nomic-embed-text`.
---
## Use the playground
Open **http://localhost:8080/playground** in a browser. It is a single self-contained HTML
page, served **same-origin** by the gateway (so no CORS to worry about).
1. **Base URL** is pre-filled with the current origin; leave it as is for the demo.
2. Paste the **API key** from the `./demo.sh` output into the Bearer field. (Typing a key
auto-loads the model dropdown; you can also hit **↻ Refresh**.)
3. Pick an **endpoint** tab: `/v1/chat/completions`, `/api/chat`, `/api/generate`,
`/v1/models`, `/api/tags`, `/healthz`, `/readyz`.
4. Choose a **model** from the auto-populated dropdown, type a prompt, toggle **stream**.
5. Hit **▶ Run**. The streamed output renders **live** — SSE `data:` deltas (incl. `[DONE]`)
for `/v1/*`, NDJSON lines for `/api/*`.
6. The panel shows the **response status** and the rate-limit / budget **response headers**
(`X-Request-ID`, `X-RateLimit-*`, `X-Budget-*`; SPEC §6.5).
7. The **Exact curl** box mirrors precisely what **Run** sends — copy it to reproduce in a
terminal.
Try the 403 path too: there's no mutating-endpoint tab by design, but the printed `curl` for
`POST /api/pull` shows the hard block, and an invalid key in the Bearer field demonstrates the
401 fail-closed response.
---
## ⚠️ Security note: the playground is OFF by default in production
The playground route is **flag-gated** and **disabled by default**. The demo stack turns it on
explicitly:
```yaml
# docker-compose.demo.yml (gateway service)
GATEWAY_PLAYGROUND_ENABLED: "true"
GATEWAY_PLAYGROUND_FILE: /app/playground/index.html
```
with the file mounted read-only into the container:
```yaml
volumes:
- ./playground:/app/playground:ro
```
The production stack (`docker-compose.yml`) does **not** set `GATEWAY_PLAYGROUND_ENABLED`, so
the route is absent. Do not enable it on a public deployment: it is a convenience for demos and
local development, not a production surface. Leaving it off keeps the public attack surface to
the documented API only.
---
## Files behind the demo
| Path | What it is |
|---|---|
| `demo.sh` | The one-command entrypoint (up / `--down`). |
| `docker-compose.demo.yml` | The demo stack definition. |
| `demo/mock-ollama/` | The standalone mock Ollama service (FastAPI app + Dockerfile). |
| `playground/index.html` | The self-contained browser playground served at `/playground`. |

77
docs/THREAT_MODEL.md Normal file
View File

@@ -0,0 +1,77 @@
# neuronetz-gateway — Threat Model
From [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §3. The governing principle, in one line:
> **Fail closed, always.** If a security or budgeting check cannot be performed (Redis down,
> DB unreachable, ambiguous state), **deny** the request. Never default to allow.
> (AGENT_PROMPT non-negotiable #1.)
The gateway exists because the Ollama instance at `api.neuronetz.ai` was exposed without
authentication — a standing security incident. Every defense below traces back to closing
that gap and keeping it closed.
---
## Threats & mitigations (SPEC §3)
| Threat | Mitigation |
|---|---|
| Internet scanners hitting Ollama directly | Ollama bound to the internal Docker network; **never published**. No `ports:` mapping in any shipped compose file. |
| Unauthenticated API abuse | Mandatory Bearer token; **fail-closed** on auth errors (401). |
| API key brute force | Argon2id hashing; constant-time compare; rate limit on auth failures per source IP (`AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN`). |
| GPU/token exhaustion (cost attack) | Per-key TPM + token budget; per-tenant ceiling; concurrent-connection cap. |
| Resource exhaustion via large payloads | Request body size limit (default 256 KiB); `num_predict` cap (default 4096). |
| Model enumeration / training-data exfil via uncommon models | Model allowlist, **default-deny**. Discovery only exposes models actually installed; `/api/tags` and `/v1/models` never reveal models outside the tenant's effective set; "not allowed" and "doesn't exist" return the **same** generic response. |
| Discovery backend unreachable | **Fail-closed:** an empty or expired discovered set means no model resolves, so requests are denied — never "allow because we couldn't list models." |
| Ollama mutation (model pull/delete) by attacker | Endpoint allowlist; mutating endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) **hard-blocked** at the gateway, not configurable. |
| Information disclosure via error messages | Upstream errors **sanitized** at the boundary; Ollama internals never proxied to the client. Each error carries an `X-Request-ID` for correlation. |
| Audit log tampering | Append-only at the app layer; DB role separation; optional WAL archiving. |
| Prompt data leakage | Prompt logging **off by default**; opt-in per key; TTL'd retention; redaction hook. |
| Redis outage causing "fail open" | **Fail-closed:** if the rate-limit/budget backend is unavailable, deny (503), not allow. |
| Compromised admin token | There is **no admin endpoint** in the gateway. Admin lives in `neuronetz-console`; the gateway has nothing to compromise here. |
---
## Notes on selected defenses
### `allow_all_models` is an audited opt-in
`allow_all_models` lets a tenant use any currently-installed model, so models newly pulled
into Ollama are auto-granted on the next discovery refresh. This is convenient but widens the
attack surface for *that tenant*, so it is:
- **opt-in per tenant** (default `false`), set explicitly via the CLI
(`create-tenant --allow-all-models` or `set-models --allow-all`);
- **overridable per key** — a non-`NULL` key-level `allow_all_models` overrides the tenant
flag; otherwise the tenant flag applies (SPEC §13.7);
- **audited** — every request records the model used in `gateway.audit_log`.
Default-deny tenants instead see only `allowed_models ∩ discovered`. Either way the effective
set is always a subset of the *live* discovered set, so stale or typo'd allowlist entries
never resolve.
### No existence disclosure
A model that is installed-but-unpermitted and a model that is not installed both return the
**same** generic `403`. An attacker cannot use the gateway to enumerate which models exist on
the backend (SPEC §13.6).
### Sanitized errors + request IDs
Clients never receive Ollama's error text, stack traces, or internal hostnames. Errors are
mapped to generic `4xx`/`5xx` JSON with a `request_id`. Operators correlate that ID with the
audit log to investigate without leaking internals to callers (SPEC §4.3 step 14).
### Streaming integrity is also a safety property
Token counting and audit writes happen **after** stream close, never on the hot path. This
keeps time-to-first-byte honest and ensures budget decrements and audit rows reflect the true
final token counts reported by Ollama (`prompt_eval_count` + `eval_count`), not estimates.
---
## Out of scope (v0.1.0)
Documented as future work, **not** mitigations present today: content moderation /
prompt-injection filtering, response caching, multi-backend routing, billing, SSO/OAuth2 for
admin, and any web admin UI (that lives in `neuronetz-console`).

40
mkdocs.yml Normal file
View File

@@ -0,0 +1,40 @@
# mkdocs configuration for the neuronetz-gateway documentation.
#
#   pip install mkdocs-material
#   mkdocs serve     # live preview at http://127.0.0.1:8000
#   mkdocs build     # static site into ./site
#
# Docs live in docs/. This wires them into a single Material-themed site.
site_name: neuronetz-gateway
site_description: Secure, multi-tenant API gateway in front of Ollama.
docs_dir: docs

# Theme settings must be nested under `theme:`; at top level they are ignored
# (or rejected) by MkDocs.
theme:
  name: material
  palette:
    - scheme: slate
      primary: indigo
      accent: indigo
  features:
    - navigation.sections
    - navigation.top
    - content.code.copy
    - content.code.annotate

# Extension options (e.g. `permalink`) belong to the list entry they configure,
# so they are indented beneath that entry.
markdown_extensions:
  - admonition
  - tables
  - toc:
      permalink: true
  - pymdownx.highlight:
      anchor_linenums: true
  - pymdownx.superfences
  - pymdownx.inlinehilite

# Page order in the site navigation; paths are relative to docs_dir.
nav:
  - Architecture: ARCHITECTURE.md
  - API Reference: API.md
  - Deployment: DEPLOYMENT.md
  - Threat Model: THREAT_MODEL.md
  - Operations Runbook: OPERATIONS.md
  - Demo & Playground: PLAYGROUND.md

716
playground/index.html Normal file
View File

@@ -0,0 +1,716 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>neuronetz-gateway · playground</title>
<style>
/* Playground stylesheet. Rule order matters (later rules override earlier
   ones of equal specificity), so keep sections in this order. */
/* Design tokens: colours and font stacks referenced via var(--…) below. */
:root {
--bg: #0a0e16;
--bg-2: #10151f;
--panel: #141b27;
--panel-2: #1a2333;
--border: #243047;
--text: #e6edf6;
--muted: #8b9bb4;
--accent: #4f8cff;
--accent-2: #7c5cff;
--good: #3fcf8e;
--warn: #f0b429;
--bad: #ff5d6c;
--mono: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace;
--sans: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
}
/* Page shell */
* { box-sizing: border-box; }
html, body { margin: 0; height: 100%; }
body {
background:
radial-gradient(1200px 600px at 80% -10%, rgba(124,92,255,.10), transparent 60%),
radial-gradient(900px 500px at -10% 110%, rgba(79,140,255,.10), transparent 55%),
var(--bg);
color: var(--text);
font-family: var(--sans);
font-size: 14px;
line-height: 1.5;
-webkit-font-smoothing: antialiased;
}
a { color: var(--accent); }
/* Sticky page header: logo, title, origin pill */
header {
display: flex; align-items: center; gap: 14px;
padding: 18px 26px;
border-bottom: 1px solid var(--border);
background: linear-gradient(180deg, rgba(255,255,255,.02), transparent);
position: sticky; top: 0; z-index: 5;
backdrop-filter: blur(6px);
}
.logo {
width: 34px; height: 34px; border-radius: 9px;
background: linear-gradient(135deg, var(--accent), var(--accent-2));
display: grid; place-items: center;
font-weight: 800; color: #fff; letter-spacing: -1px;
box-shadow: 0 6px 20px rgba(79,140,255,.35);
}
header h1 { font-size: 16px; margin: 0; font-weight: 700; letter-spacing: .2px; }
header .sub { color: var(--muted); font-size: 12px; }
.grow { flex: 1; }
.pill {
font-size: 11px; color: var(--muted);
border: 1px solid var(--border); border-radius: 999px;
padding: 4px 10px; font-family: var(--mono);
}
/* Two-column layout (380px request builder + flexible right column);
   collapses to a single column at 920px and below. */
main {
display: grid;
grid-template-columns: 380px 1fr;
gap: 18px;
padding: 18px 26px 40px;
max-width: 1400px; margin: 0 auto;
}
@media (max-width: 920px) { main { grid-template-columns: 1fr; } }
.panel {
background: var(--panel);
border: 1px solid var(--border);
border-radius: 14px;
padding: 16px;
}
.panel h2 {
font-size: 12px; text-transform: uppercase; letter-spacing: .12em;
color: var(--muted); margin: 0 0 12px;
}
/* Form controls */
label { display: block; font-size: 12px; color: var(--muted); margin: 12px 0 5px; }
label:first-of-type { margin-top: 0; }
input, select, textarea {
width: 100%; background: var(--bg-2); color: var(--text);
border: 1px solid var(--border); border-radius: 9px;
padding: 9px 11px; font-size: 13px; font-family: var(--sans);
outline: none; transition: border-color .15s, box-shadow .15s;
}
/* The default outline is removed above; this ring is the focus indicator. */
input:focus, select:focus, textarea:focus {
border-color: var(--accent);
box-shadow: 0 0 0 3px rgba(79,140,255,.18);
}
textarea { resize: vertical; min-height: 90px; font-family: var(--mono); font-size: 12.5px; }
.row { display: flex; gap: 8px; }
.row > * { flex: 1; }
.inline { display: flex; align-items: center; gap: 8px; }
.inline input[type=checkbox] { width: auto; }
/* Endpoint tabs (elements are created by buildTabs() in the script) */
.tabs { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 8px; }
.tab {
font-family: var(--mono); font-size: 11.5px;
padding: 6px 10px; border-radius: 8px; cursor: pointer;
border: 1px solid var(--border); background: var(--bg-2); color: var(--muted);
transition: all .12s;
}
.tab:hover { color: var(--text); border-color: #34425f; }
.tab.active {
color: #fff; border-color: transparent;
background: linear-gradient(135deg, var(--accent), var(--accent-2));
}
/* Primary Run button */
button.run {
margin-top: 14px; width: 100%;
background: linear-gradient(135deg, var(--accent), var(--accent-2));
color: #fff; border: none; border-radius: 10px;
padding: 12px; font-size: 14px; font-weight: 700; cursor: pointer;
box-shadow: 0 8px 22px rgba(79,140,255,.3);
transition: transform .08s, filter .15s;
}
button.run:hover { filter: brightness(1.07); }
button.run:active { transform: translateY(1px); }
/* refreshGating() sets an inline cursor when the button is blocked for a
   missing key; inline styles take precedence over this rule. */
button.run:disabled { filter: grayscale(.6) brightness(.8); cursor: progress; }
/* Secondary ("ghost") buttons: reset origin, refresh models, copy curl */
.ghost {
background: var(--panel-2); color: var(--muted);
border: 1px solid var(--border); border-radius: 8px;
padding: 7px 10px; font-size: 12px; cursor: pointer; transition: all .12s;
}
.ghost:hover { color: var(--text); border-color: #34425f; }
.field-with-btn { display: flex; gap: 8px; align-items: stretch; }
.field-with-btn select { flex: 1; }
/* Response panel. The .s2/.s4/.s5 modifiers are applied by setStatus(), which
   appends "s" + the first character of the status code. */
.out-head { display: flex; align-items: center; gap: 10px; margin-bottom: 10px; }
.status {
font-family: var(--mono); font-size: 12px; padding: 3px 9px; border-radius: 7px;
border: 1px solid var(--border); color: var(--muted);
}
.status.s2 { color: var(--good); border-color: rgba(63,207,142,.4); background: rgba(63,207,142,.08); }
.status.s4 { color: var(--warn); border-color: rgba(240,180,41,.4); background: rgba(240,180,41,.08); }
.status.s5 { color: var(--bad); border-color: rgba(255,93,108,.4); background: rgba(255,93,108,.08); }
/* Code boxes: response body, curl preview, endpoint samples */
pre, .codebox {
background: #0b0f17; border: 1px solid var(--border); border-radius: 10px;
padding: 13px; font-family: var(--mono); font-size: 12.5px;
white-space: pre-wrap; word-break: break-word; margin: 0;
max-height: 460px; overflow: auto;
}
.codebox.curl { color: #c9d6ea; }
.out-body { min-height: 120px; }
/* Surfaced response headers (rows are generated by renderHeaders()) */
.headers {
margin-top: 12px; font-family: var(--mono); font-size: 11.5px;
border: 1px solid var(--border); border-radius: 10px; overflow: hidden;
}
.headers .hrow { display: flex; border-top: 1px solid var(--border); }
.headers .hrow:first-child { border-top: none; }
.headers .hk { width: 46%; padding: 6px 10px; color: var(--muted); background: var(--bg-2); }
.headers .hv { flex: 1; padding: 6px 10px; color: var(--text); word-break: break-all; }
.section-title {
display: flex; align-items: center; justify-content: space-between; margin: 0 0 8px;
}
.section-title .copy { font-size: 11px; }
.hint { color: var(--muted); font-size: 11.5px; margin-top: 6px; }
.stack { display: grid; gap: 16px; }
/* "About this endpoint" panel */
.ep-head { display: flex; align-items: center; gap: 8px; flex-wrap: wrap; margin-bottom: 8px; }
#endpointInfo h2 { font-family: ui-monospace, "JetBrains Mono", "Fira Code", monospace; font-size: 14px; letter-spacing: 0.2px; }
.summary { margin: 4px 0 12px; color: var(--text); font-size: 13.5px; line-height: 1.55; }
.sub-title { margin: 10px 0 6px; color: var(--muted); font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.8px; }
.codebox.sample { max-height: 200px; overflow: auto; font-size: 11.5px; color: #c9d6ea; }
/* Badges. renderEndpointInfo() picks badge-post/badge-get from the HTTP method
   and badge-auth/badge-open from the endpoint's noAuth flag. */
.badge {
font-size: 10.5px; text-transform: uppercase; letter-spacing: 0.6px;
padding: 2px 7px; border-radius: 999px; border: 1px solid var(--border);
color: var(--muted); background: var(--bg-2);
}
.badge-post { color: #ffb84a; border-color: rgba(255,184,74,.35); background: rgba(255,184,74,.08); }
.badge-get { color: #5fc8ff; border-color: rgba(95,200,255,.35); background: rgba(95,200,255,.08); }
.badge-auth { color: #c9b6ff; border-color: rgba(201,182,255,.35); background: rgba(201,182,255,.08); }
.badge-open { color: #3fcf8e; border-color: rgba(63,207,142,.35); background: rgba(63,207,142,.08); }
/* NOTE(review): .blink is not referenced by the markup or script in this file
   as far as visible here — confirm before removing. */
.blink { animation: blink 1s steps(2,start) infinite; }
@keyframes blink { to { opacity: .25; } }
</style>
</head>
<body>
<header>
<div class="logo">N</div>
<div>
<h1>neuronetz-gateway <span class="sub">· playground</span></h1>
<div class="sub">Authenticated, rate-limited, audited access to the model backend</div>
</div>
<div class="grow"></div>
<div class="pill" id="originPill">same-origin</div>
</header>
<main>
<!-- ── Left: request builder ─────────────────────────────────────────── -->
<section class="panel">
<h2>Request</h2>
<label for="baseUrl">Base URL</label>
<div class="field-with-btn">
<input id="baseUrl" type="text" spellcheck="false" autocomplete="off" autocapitalize="off" autocorrect="off" />
<button class="ghost" id="resetBase" title="Reset Base URL to this page's origin">⟳ This origin</button>
</div>
<label for="apiKey">API key (Bearer)</label>
<input id="apiKey" type="password" placeholder="nz_…" spellcheck="false" autocomplete="off" />
<div class="hint" id="keyHint">Created by <code>./demo.sh</code> and printed once in your terminal.</div>
<label>Endpoint</label>
<div class="tabs" id="tabs"></div>
<div id="modelWrap">
<label for="model">Model</label>
<div class="field-with-btn">
<select id="model"><option value="">— enter a key, then refresh —</option></select>
<button class="ghost" id="refreshModels" title="Load /v1/models with your key">↻ Refresh</button>
</div>
</div>
<div id="promptWrap">
<label for="prompt" id="promptLabel">Prompt</label>
<textarea id="prompt" spellcheck="false">Say hello in one sentence.</textarea>
<label class="inline" id="streamWrap" style="margin-top:10px">
<input id="stream" type="checkbox" checked /> Stream the response
</label>
</div>
<button class="run" id="run">▶ Run</button>
<div class="hint" id="methodHint"></div>
</section>
<!-- ── Right: about + response + curl ────────────────────────────────── -->
<div class="stack">
<section class="panel" id="endpointInfo">
<div class="ep-head">
<h2 id="epTitle" style="margin:0">POST /v1/chat/completions</h2>
<div class="grow"></div>
<span class="badge" id="epMethod">POST</span>
<span class="badge" id="epAuth">auth: bearer</span>
<span class="badge" id="epStream">streams · SSE</span>
</div>
<p class="summary" id="epSummary"></p>
<div class="sub-title">Sample request body</div>
<pre class="codebox sample" id="epSampleReq"></pre>
<div class="sub-title">Sample response</div>
<pre class="codebox sample" id="epSampleResp"></pre>
<div class="hint" id="epNote"></div>
</section>
<section class="panel">
<div class="out-head">
<h2 style="margin:0">Response</h2>
<div class="grow"></div>
<span class="status" id="status">idle</span>
</div>
<pre class="codebox out-body" id="output">Run a request to see the response stream here.</pre>
<div class="headers" id="headers" style="display:none"></div>
</section>
<section class="panel">
<div class="section-title">
<h2 style="margin:0">Exact curl</h2>
<button class="ghost copy" id="copyCurl">Copy</button>
</div>
<pre class="codebox curl" id="curl"></pre>
<div class="hint">This is exactly what <b>Run</b> sends — paste it into a terminal to reproduce.</div>
</section>
</div>
</main>
<script>
"use strict";
// ── Endpoint catalogue ──────────────────────────────────────────────────
// Maps each gateway path shown as a tab to the metadata the playground needs to
// build the request and to render the "About this endpoint" panel. Mirrors SPEC §6.
//
// Fields read by the rest of this script:
//   method         "POST" or "GET"; POST requests get a JSON body and Content-Type.
//   canStream      whether the "Stream the response" checkbox applies.
//   format         "sse" | "ndjson" | "json"; selects how a streamed body is parsed.
//   needsModel     show the Model picker.
//   needsPrompt    show the prompt textarea.
//   noAuth         (optional) when true no Authorization header is sent and Run
//                  is not gated on an API key.
//   summary        description shown in the About panel.
//   body(s)        (POST only) builds the request payload from the form state.
//   sampleRequest  object pretty-printed in the About panel, or null for GET.
//   sampleResponse literal text shown in the About panel. These are template
//                  literals, so their line breaks and leading whitespace are
//                  displayed verbatim — do not re-indent them.
//   note           optional footer text; an empty string hides the footer.
const ENDPOINTS = {
"/v1/chat/completions": {
method: "POST", canStream: true, format: "sse", needsModel: true, needsPrompt: true,
summary: "OpenAI-compatible Chat Completions — a drop-in replacement for OpenAI's endpoint. Point any OpenAI SDK at this gateway's base URL with your nz_ key and existing client code works unchanged. Streaming uses Server-Sent Events terminated by `data: [DONE]`.",
body: (s) => ({ model: s.model, stream: s.stream, messages: [{ role: "user", content: s.prompt }] }),
sampleRequest: { model: "llama3.1:8b", stream: true, messages: [{ role: "user", content: "Say hello in one sentence." }] },
sampleResponse:
`data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","created":1779492441,"model":"llama3.1:8b","choices":[{"index":0,"delta":{"content":"Echo:"},"finish_reason":null}]}
data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","created":1779492441,"model":"llama3.1:8b","choices":[{"index":0,"delta":{"content":" hi"},"finish_reason":null}]}
data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":1,"completion_tokens":2,"total_tokens":3}}
data: [DONE]`,
note: "Non-streaming (`stream: false`) returns one `chat.completion` JSON object — same shape as OpenAI.",
},
"/api/chat": {
method: "POST", canStream: true, format: "ndjson", needsModel: true, needsPrompt: true,
summary: "Native Ollama chat. Streams NDJSON — one JSON object per line; the final object carries `prompt_eval_count` + `eval_count` for exact token accounting in the audit log.",
body: (s) => ({ model: s.model, stream: s.stream, messages: [{ role: "user", content: s.prompt }] }),
sampleRequest: { model: "llama3.1:8b", stream: true, messages: [{ role: "user", content: "Say hello in one sentence." }] },
sampleResponse:
`{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":"Echo:"},"done":false}
{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":" hi"},"done":false}
{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2,"total_duration":12345678}`,
note: "Errors are sanitized but every response carries an X-Request-ID; upstream internals never leak.",
},
"/api/generate": {
method: "POST", canStream: true, format: "ndjson", needsModel: true, needsPrompt: true,
summary: "Native Ollama text generation. Takes a plain `prompt` string (no chat message structure) and streams NDJSON `response` chunks plus a final done frame with token counts.",
body: (s) => ({ model: s.model, stream: s.stream, prompt: s.prompt }),
sampleRequest: { model: "mistral:7b", stream: true, prompt: "Say hello in one sentence." },
sampleResponse:
`{"model":"mistral:7b","created_at":"…","response":"Echo:","done":false}
{"model":"mistral:7b","created_at":"…","response":" hi","done":false}
{"model":"mistral:7b","created_at":"…","response":"","done":true,"prompt_eval_count":1,"eval_count":2}`,
note: "Use this when you don't need chat-message structure; otherwise prefer `/api/chat` or `/v1/chat/completions`.",
},
"/v1/models": {
method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false,
summary: "Lists the tenant's effective model set in OpenAI format: (live-discovered ∩ allowed_models), or all discovered models when the tenant has allow_all_models enabled. There is no static list — discovery polls the Ollama backend in the background.",
sampleRequest: null,
sampleResponse:
`{
"object": "list",
"data": [
{"id": "llama3.1:8b", "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
{"id": "mistral:7b", "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
{"id": "qwen2.5:3b", "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
{"id": "nomic-embed-text", "object": "model", "created": 1779492441, "owned_by": "neuronetz"}
]
}`,
note: "Refreshed automatically every MODEL_DISCOVERY_REFRESH_S (default 60s). Cached fail-closed.",
},
"/api/tags": {
method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false,
summary: "Native Ollama model list, filtered to the tenant's effective set. Same data as /v1/models but in Ollama's `models` shape — includes size, digest, modified_at, family and quantization details.",
sampleRequest: null,
sampleResponse:
`{
"models": [
{
"name": "llama3.1:8b",
"model": "llama3.1:8b",
"modified_at": "2026-04-01T12:00:00Z",
"size": 4920624384,
"digest": "sha256:…",
"details": {"family": "llama", "parameter_size": "8B", "quantization_level": "Q4_K_M"}
}
]
}`,
note: "",
},
"/healthz": {
method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false, noAuth: true,
summary: "Liveness probe. Returns 200 as long as the gateway process can respond — does NOT check downstream dependencies. Safe for load-balancer health checks. No authentication required.",
sampleRequest: null,
sampleResponse: `{"status": "ok"}`,
note: "",
},
"/readyz": {
method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false, noAuth: true,
summary: "Readiness probe. Returns 200 only when Postgres + Redis + the Ollama backend are all reachable; 503 otherwise with which dependencies are down. No authentication required.",
sampleRequest: null,
sampleResponse:
`# 200 OK
{"status": "ready", "checks": {"postgres": true, "redis": true, "ollama": true}}
# 503 Service Unavailable
{"status": "not_ready", "checks": {"postgres": true, "redis": true, "ollama": false}}`,
note: "In this demo, /readyz will return 200 — the mock Ollama is reachable. In dev-only stacks without an Ollama backend, /readyz fails closed.",
},
};
// Response headers shown under the response body when present (SPEC §6.5).
// renderHeaders() looks each name up on the fetch Response, in this order.
const SURFACE_HEADERS = [
  "x-request-id",
  "x-ratelimit-limit-requests",
  "x-ratelimit-remaining-requests",
  "x-ratelimit-limit-tokens",
  "x-ratelimit-remaining-tokens",
  "x-budget-period",
  "x-budget-tokens-remaining",
  "retry-after",
  "content-type",
];

// Shorthand for document.getElementById.
const $ = function (id) {
  return document.getElementById(id);
};

// Path of the currently selected endpoint tab (a key of ENDPOINTS).
let current = "/v1/chat/completions";
// ── State helpers ───────────────────────────────────────────────────────
// Snapshot the request-builder form as a plain object:
//   base   Base URL with trailing slashes removed
//   key    API key with surrounding whitespace removed
//   model  selected model id
//   prompt prompt/message text exactly as typed
//   stream whether "Stream the response" is ticked
function state() {
  const base = $("baseUrl").value.replace(/\/+$/, "");
  const key = $("apiKey").value.trim();
  const model = $("model").value;
  const prompt = $("prompt").value;
  const stream = $("stream").checked;
  return { base, key, model, prompt, stream };
}
// Rebuild the endpoint tab strip: one tab per ENDPOINTS key, with the currently
// selected path marked active.
function buildTabs() {
  const strip = $("tabs");
  strip.innerHTML = "";
  Object.keys(ENDPOINTS).forEach((path) => {
    const tab = document.createElement("div");
    tab.className = path === current ? "tab active" : "tab";
    tab.textContent = path;
    // Selecting a tab re-renders the strip, the form and the curl preview.
    tab.onclick = () => {
      current = path;
      buildTabs();
      syncForm();
      updateCurl();
    };
    strip.appendChild(tab);
  });
}
// Show only the form fields the selected endpoint uses, update the one-line
// method hint under the Run button, then refresh the About panel and key gating.
function syncForm() {
  const ep = ENDPOINTS[current];
  const toggle = (id, visible) => {
    $(id).style.display = visible ? "" : "none";
  };
  toggle("modelWrap", ep.needsModel);
  toggle("promptWrap", ep.needsPrompt);
  toggle("streamWrap", ep.canStream);

  $("promptLabel").textContent = current === "/api/generate" ? "Prompt" : "Message";

  const fmt = ep.format.toUpperCase();
  const transport = ep.canStream ? `streams ${fmt}` : fmt;
  const auth = ep.noAuth ? "no auth" : "requires Bearer";
  $("methodHint").textContent = `${ep.method} · ${transport} · ${auth}`;

  renderEndpointInfo();
  refreshGating();
}
// Fill the "About this endpoint" panel (title, badges, summary, samples, note)
// from the metadata of the currently selected endpoint.
function renderEndpointInfo() {
  const ep = ENDPOINTS[current];
  $("epTitle").textContent = ep.method + " " + current;

  const methodBadge = $("epMethod");
  methodBadge.textContent = ep.method;
  methodBadge.className = "badge badge-" + ep.method.toLowerCase();

  const authBadge = $("epAuth");
  authBadge.textContent = ep.noAuth ? "no auth" : "auth: bearer";
  authBadge.className = ep.noAuth ? "badge badge-open" : "badge badge-auth";

  // The streaming badge is hidden entirely for non-streaming endpoints.
  const streamBadge = $("epStream");
  streamBadge.style.display = ep.canStream ? "" : "none";
  if (ep.canStream) {
    streamBadge.textContent = "streams · " + (ep.format === "sse" ? "SSE" : "NDJSON");
  }

  $("epSummary").textContent = ep.summary;
  let sampleReq = "(no request body — GET)";
  if (ep.sampleRequest) sampleReq = JSON.stringify(ep.sampleRequest, null, 2);
  $("epSampleReq").textContent = sampleReq;
  $("epSampleResp").textContent = ep.sampleResponse;

  // An empty note hides the footer instead of leaving a blank line.
  const footer = $("epNote");
  footer.style.display = ep.note ? "" : "none";
  if (ep.note) footer.textContent = ep.note;
}
// Gate the Run and Refresh buttons on the presence of an API key.
//   - Run is blocked only when the selected endpoint requires auth and no key
//     has been entered.
//   - Refresh always needs a key, whatever endpoint is selected.
// When Run is blocked, the reason is shown directly under the API-key field so
// it is not missed.
function refreshGating() {
  const keyPresent = $("apiKey").value.trim().length > 0;
  const runBlocked = !ENDPOINTS[current].noAuth && !keyPresent;

  // Apply the disabled flag plus the inline "looks disabled" styling.
  const gate = (button, off) => {
    button.disabled = off;
    button.style.opacity = off ? "0.45" : "";
    button.style.cursor = off ? "not-allowed" : "";
  };
  gate($("run"), runBlocked);
  gate($("refreshModels"), !keyPresent);

  $("keyHint").innerHTML = runBlocked
    ? "⚠ <b style=\"color:#ffb84a\">Paste your API key above</b> to enable Run and Refresh. Get one by running <code>./demo.sh</code>."
    : "Created by <code>./demo.sh</code> and printed once in your terminal.";
}
// ── curl preview (must match exactly what Run sends) ────────────────────
// Describe the request for the selected endpoint as { url, method, headers,
// body, ep }. Both run() and updateCurl() consume this so they cannot drift.
// When the key field is empty a visible placeholder key is used instead.
function buildRequest() {
  const s = state();
  const ep = ENDPOINTS[current];
  const isPost = ep.method === "POST";

  const headers = {};
  if (!ep.noAuth) headers["Authorization"] = "Bearer " + (s.key || "nz_YOUR_KEY");
  if (isPost) headers["Content-Type"] = "application/json";

  return {
    url: (s.base || location.origin) + current,
    method: ep.method,
    headers,
    body: isPost ? JSON.stringify(ep.body(s)) : null,
    ep,
  };
}
// Render the current request as a copy-pasteable curl command in the curl box.
// -N is added for streaming POSTs and -i for GETs; each header and the body go
// on their own backslash-continued line.
function updateCurl() {
  const req = buildRequest();

  const flags = [];
  if (req.ep.canStream && state().stream && req.method === "POST") flags.push("-N");
  if (req.method === "GET") flags.push("-i");

  const headerArgs = Object.entries(req.headers).map(
    ([name, value]) => "\\\n -H " + shellQuote(name + ": " + value)
  );
  const bodyArgs = req.body ? ["\\\n -d " + shellQuote(req.body)] : [];

  const words = ["curl", ...flags, shellQuote(req.url), ...headerArgs, ...bodyArgs];
  $("curl").textContent = words.join(" ");
}
// Quote `s` for a POSIX shell. Strings made only of letters, digits and
// _ - : / . @ are returned untouched; anything else (including the empty
// string) is wrapped in single quotes, with embedded single quotes written
// as '\''.
function shellQuote(s) {
  const safe = /^[A-Za-z0-9_\-:/.@]+$/;
  if (safe.test(s)) {
    return s;
  }
  const escaped = s.split("'").join("'\\''");
  return `'${escaped}'`;
}
// ── Status + header rendering ───────────────────────────────────────────
// Show `text` in the status badge. When `code` is truthy its first character
// selects the colour class (e.g. 200 → "s2", 404 → "s4", 5 → "s5").
function setStatus(text, code) {
  const badge = $("status");
  badge.textContent = text;
  let cls = "status";
  if (code) {
    cls += " s" + String(code)[0];
  }
  badge.className = cls;
}
// Render the SURFACE_HEADERS that are present on `resp` as a key/value table
// below the response body; hide the table when none are present.
function renderHeaders(resp) {
  const box = $("headers");
  const present = SURFACE_HEADERS
    .map((name) => [name, resp.headers.get(name)])
    .filter(([, value]) => value != null);

  if (present.length === 0) {
    box.style.display = "none";
    return;
  }

  let html = "";
  for (const [name, value] of present) {
    // Header names come from the constant list above; values come from the
    // network and are escaped before being inserted as markup.
    html += `<div class="hrow"><div class="hk">${name}</div><div class="hv">${escapeHtml(value)}</div></div>`;
  }
  box.innerHTML = html;
  box.style.display = "";
}
// Escape `s` (coerced to a string) for insertion into HTML. Besides & < >,
// both quote characters are escaped so the result is also safe inside a
// quoted attribute value; in element content the rendered text is unchanged.
function escapeHtml(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return String(s).replace(/[&<>"']/g, (c) => entities[c]);
}
// ── Model dropdown population ───────────────────────────────────────────
// Load the model list from GET /v1/models with the key currently in the form
// and repopulate the Model <select>, keeping the previous selection when it is
// still offered. HTTP and network failures are reported in the response pane.
async function refreshModels() {
  const s = state();
  if (!s.key) { setOutput("Enter an API key first, then refresh models."); return; }
  const sel = $("model");
  const btn = $("refreshModels");
  btn.disabled = true;
  btn.textContent = "…";
  try {
    const resp = await fetch((s.base || location.origin) + "/v1/models", {
      headers: { "Authorization": "Bearer " + s.key },
    });
    if (!resp.ok) { setOutput("Could not load models: HTTP " + resp.status); return; }
    const data = await resp.json();
    const names = (data.data || []).map((m) => m.id).filter(Boolean);
    const prev = sel.value;
    sel.innerHTML = "";
    if (!names.length) {
      sel.innerHTML = '<option value="">(no models in your effective set)</option>';
    } else {
      for (const n of names) {
        const o = document.createElement("option");
        o.value = n;
        o.textContent = n;
        sel.appendChild(o);
      }
      if (names.includes(prev)) sel.value = prev;
    }
    // The selected model is part of the request body, so the preview changes.
    updateCurl();
  } catch (e) {
    setOutput("Network error loading models: " + e.message);
  } finally {
    btn.textContent = "↻ Refresh";
    // Re-apply the key gate rather than unconditionally re-enabling: the key
    // field may have been cleared while the request was in flight, in which
    // case Refresh must stay disabled.
    btn.disabled = $("apiKey").value.trim().length === 0;
  }
}
// Replace the contents of the response pane with `text`.
function setOutput(text) {
  const pane = $("output");
  pane.textContent = text;
}
// Append `text` to whatever the response pane currently shows.
function appendOutput(text) {
  const pane = $("output");
  pane.textContent = pane.textContent + text;
}
// ── Run ─────────────────────────────────────────────────────────────────
// Re-entrancy guard: true while a request started by run() is in flight.
let running = false;

// Send the request described by the form (the same one the curl box shows) and
// render its status line, surfaced headers and body. Successful streaming
// responses are rendered incrementally; everything else is read as text and
// pretty-printed when it is JSON. Network failures are reported in the pane.
async function run() {
  if (running) return;
  running = true;
  const btn = $("run");
  btn.disabled = true;
  setStatus("connecting…");
  setOutput("");
  $("headers").style.display = "none";
  const r = buildRequest();
  const willStream = r.ep.canStream && state().stream && r.method === "POST";
  try {
    const resp = await fetch(r.url, { method: r.method, headers: r.headers, body: r.body });
    setStatus(resp.status + " " + resp.statusText, resp.status);
    renderHeaders(resp);
    if (willStream && resp.body && resp.ok) {
      await consumeStream(resp, r.ep.format);
    } else {
      // Error responses are plain JSON even when streaming was requested.
      const text = await resp.text();
      setOutput(prettyMaybeJson(text));
    }
  } catch (e) {
    setStatus("network error", 5);
    setOutput("Request failed: " + e.message + "\n\n(Check the Base URL and that the gateway is running.)");
  } finally {
    running = false;
    // Re-apply the key gate for whichever endpoint is selected *now*. The tab
    // or the key field may have changed while the request was in flight, and
    // unconditionally re-enabling would leave Run clickable while it is styled
    // as blocked.
    const ep = ENDPOINTS[current];
    btn.disabled = !ep.noAuth && $("apiKey").value.trim().length === 0;
  }
}
// Pretty-print `text` with 2-space indentation when it parses as JSON;
// otherwise return it unchanged, or a placeholder when it is empty.
function prettyMaybeJson(text) {
  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    return text || "(empty response)";
  }
  return JSON.stringify(parsed, null, 2);
}
// Read a streaming response body and render text deltas live as they arrive.
//   resp   fetch Response whose body is a readable stream
//   format "sse" (events separated by a blank line, ending with data: [DONE])
//          or anything else for NDJSON (one JSON object per line)
// A block cursor is shown while the stream is open and removed when it closes.
async function consumeStream(resp, format) {
  const reader = resp.body.getReader();
  const decoder = new TextDecoder();
  // SSE events are separated by blank lines; NDJSON by single newlines.
  const sep = format === "sse" ? "\n\n" : "\n";
  const cursor = "▌";
  const pane = $("output");
  let buffer = "";
  let acc = "";
  setOutput("");

  // Hand every complete event/line in the buffer to handleEvent().
  const drain = () => {
    // SSE permits CRLF line endings; without normalising, "\r\n\r\n" never
    // matches the "\n\n" separator and nothing renders until the stream ends.
    // A lone trailing "\r" (CRLF split across chunks) stays in the buffer and
    // is normalised once its "\n" arrives.
    buffer = buffer.replace(/\r\n/g, "\n");
    let idx;
    while ((idx = buffer.indexOf(sep)) >= 0) {
      const raw = buffer.slice(0, idx);
      buffer = buffer.slice(idx + sep.length);
      acc += handleEvent(raw, format);
      pane.textContent = acc + cursor;
    }
  };

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });
    drain();
  }
  // Flush bytes the decoder was holding back for an incomplete character.
  buffer += decoder.decode();
  // A final event/line without a trailing separator still counts.
  if (buffer.trim()) acc += handleEvent(buffer, format);
  pane.textContent = acc || "(stream produced no text)";
}
// Extract the text delta carried by one SSE event or one NDJSON line.
//   raw    the event/line text, without its separator
//   format "sse" for Server-Sent Events; anything else is treated as NDJSON
// Returns the delta text, or "" when the input carries none (the [DONE]
// marker, non-data lines, final frames without text, or unparsable JSON).
function handleEvent(raw, format) {
  if (format !== "sse") {
    // NDJSON: a single JSON object per line.
    const text = raw.trim();
    if (text === "") return "";
    try {
      const frame = JSON.parse(text);
      const msg = frame.message;
      if (msg && typeof msg.content === "string") return msg.content; // /api/chat
      if (typeof frame.response === "string") return frame.response; // /api/generate
    } catch {
      /* partial line */
    }
    return "";
  }

  // SSE: an event may hold several lines; only "data:" lines carry payload.
  const pieces = [];
  raw.split("\n").forEach((rawLine) => {
    const line = rawLine.trim();
    if (!line.startsWith("data:")) return;
    const payload = line.slice(5).trim();
    if (payload === "[DONE]") return;
    try {
      const chunk = JSON.parse(payload);
      const delta = chunk.choices && chunk.choices[0] && chunk.choices[0].delta;
      if (delta && typeof delta.content === "string") pieces.push(delta.content);
    } catch {
      /* ignore keep-alives / partial */
    }
  });
  return pieces.join("");
}
// ── Wiring ──────────────────────────────────────────────────────────────
// One-time page setup: pin the Base URL to this origin, render the tabs and
// form for the default endpoint, and attach all event handlers.
function init() {
  // Set the base URL to this page's origin. Browsers love to autofill text
  // inputs from history *after* the page scripts run, so we ALSO re-assert it on
  // the next microtask and again after a short delay — that wins against
  // chromium/firefox autofill, which can otherwise replace the value with a
  // stale entry like https://api.neuronetz.ai.
  // Assigning .value from script does not fire an "input" event, so the curl
  // preview is refreshed explicitly; otherwise it could keep showing a URL
  // that autofill briefly put in the field.
  const pinOrigin = () => {
    $("baseUrl").value = location.origin;
    updateCurl();
  };
  pinOrigin();
  $("originPill").textContent = location.origin;
  queueMicrotask(pinOrigin);
  setTimeout(pinOrigin, 250);

  buildTabs();
  syncForm(); // also applies the key gating
  updateCurl();

  // Anything that feeds the request keeps the curl preview in sync.
  for (const id of ["baseUrl", "apiKey", "model", "prompt"]) {
    $(id).addEventListener("input", updateCurl);
  }
  $("apiKey").addEventListener("input", refreshGating);
  $("stream").addEventListener("change", updateCurl);
  $("run").addEventListener("click", run);
  $("refreshModels").addEventListener("click", refreshModels);
  $("resetBase").addEventListener("click", () => {
    $("baseUrl").value = location.origin;
    updateCurl();
  });
  $("copyCurl").addEventListener("click", async () => {
    try {
      await navigator.clipboard.writeText($("curl").textContent);
      const b = $("copyCurl");
      b.textContent = "Copied!";
      setTimeout(() => (b.textContent = "Copy"), 1200);
    } catch { /* clipboard may be blocked; ignore */ }
  });

  // Convenience: refresh models when a key is pasted/typed (debounced).
  let t = null;
  $("apiKey").addEventListener("input", () => {
    clearTimeout(t);
    if ($("apiKey").value.trim().length > 8) t = setTimeout(refreshModels, 500);
  });
}
document.addEventListener("DOMContentLoaded", init);
</script>
</body>
</html>