demo + playground + docs
One-command demo so the gateway can be exercised end-to-end without a GPU or a real model download: - demo/mock-ollama/ — tiny FastAPI service emulating Ollama (/api/tags, /api/chat + /api/generate NDJSON streaming with realistic prompt_eval_count and eval_count on the final frame, /api/embed, /api/show, /api/version). Non-root multi-stage Dockerfile, never published (internal network only). - docker-compose.demo.yml — postgres + redis + mock-ollama + gateway, with PLAYGROUND_ENABLED=true and ./playground mounted read-only at /app/playground. Mirrors the prod posture (mock-ollama not exposed). - demo.sh — brings the stack up, waits on /healthz, creates a demo tenant with allow_all_models and a fresh API key via the bootstrap CLI inside the container, then prints the key, the playground URL, and five ready-to-paste curl commands (SSE chat, NDJSON chat, /v1/models, a 401, a 403 /api/pull). ./demo.sh --down tears everything back down with volumes. - playground/index.html — single-file dark-themed UI served same-origin by the gateway at /playground (CORS-free). Per-endpoint About card with method/ auth/streaming badges, a real description, sample request body, sample response, and a footer note. Live SSE/NDJSON rendering of the response. A live, copyable curl box that mirrors exactly what Run sends. Run + Refresh are visibly gated until an API key is in the field; the Base URL is force-pinned to location.origin three times to defeat browser autofill. - docs/ — API.md (full endpoint reference with curl, streaming formats, error model, SPEC §6.5 response headers), ARCHITECTURE.md (incl. §4.6 discovery + the request lifecycle), DEPLOYMENT.md (Ollama-never-exposed rule, pointing at a real Ollama backend, env reference), THREAT_MODEL.md (SPEC §3 table + the allow_all_models opt-in notes), OPERATIONS.md (key/budget/model/usage runbook + fail-closed table), PLAYGROUND.md. mkdocs.yml (Material theme) wires them together.
This commit is contained in:
204
demo.sh
Executable file
204
demo.sh
Executable file
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env bash
|
||||
#
|
||||
# demo.sh — the neuronetz-gateway one-command presentation.
|
||||
#
|
||||
# Brings up the demo stack (postgres + redis + mock-ollama + gateway) with NO
|
||||
# GPU and NO model downloads, creates a demo tenant + API key via the bootstrap
|
||||
# CLI *inside the gateway container*, and prints a clean summary with the key,
|
||||
# the playground URL, and ready-to-paste curl commands.
|
||||
#
|
||||
# Usage:
|
||||
# ./demo.sh # build + start, bootstrap a tenant/key, print summary
|
||||
# ./demo.sh --down # tear the whole stack down (and remove volumes)
|
||||
# ./demo.sh --help # this help
|
||||
#
|
||||
# Re-runnable: existing tenant/key are handled gracefully. The full API key is
|
||||
# only ever printed once at creation (SPEC §11), so on a re-run where the key
|
||||
# already exists this script creates a fresh, uniquely-named key and prints it.
|
||||
set -euo pipefail
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# Configuration
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# Absolute directory holding this script, so the demo can be started from any
# working directory.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd
)"

# The compose invocation is kept as an array so a path containing spaces stays
# a single argument when expanded as "${COMPOSE[@]}".
COMPOSE_FILE="${SCRIPT_DIR}/docker-compose.demo.yml"
COMPOSE=(
  docker compose
  -f "${COMPOSE_FILE}"
)

# Gateway address used for the health probe and in the printed summary, plus
# the tenant name and key-name stem that get bootstrapped.
GATEWAY_URL="http://localhost:8080"
PLAYGROUND_URL="${GATEWAY_URL}/playground"
TENANT_NAME="demo"
KEY_NAME="demo-key"
|
||||
|
||||
# Colour escape sequences. Every variable starts out empty and is only filled
# in when stdout is a terminal, so piped/redirected output carries no control
# sequences.
BOLD=""; DIM=""; RESET=""; CYAN=""; GREEN=""; YELLOW=""
if [ -t 1 ]; then
  BOLD=$'\033[1m'
  DIM=$'\033[2m'
  RESET=$'\033[0m'
  CYAN=$'\033[36m'
  GREEN=$'\033[32m'
  YELLOW=$'\033[33m'
fi
|
||||
|
||||
# Print a progress line on stdout: a cyan "==>" marker followed by the
# arguments (joined into one string) in bold.
log() {
  local message="$*"
  printf '%s\n' "${CYAN}==>${RESET} ${BOLD}${message}${RESET}"
}
|
||||
# Print a warning on stderr: a yellow "!!" marker followed by the arguments.
warn() {
  local message="$*"
  printf '%s\n' "${YELLOW}!!${RESET} ${message}" 1>&2
}
|
||||
# Print a fatal message on stderr (yellow "xx" marker) and terminate the
# script with exit status 1.
die() {
  local message="$*"
  printf '%s\n' "${YELLOW}xx${RESET} ${message}" 1>&2
  exit 1
}
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# Subcommands
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# Print the help text: the script's leading comment block, starting at file
# line 3 (after the shebang and the first bare "#"), with the "#" prefix and
# one optional following space removed from each line.
usage() {
  # Stop at the first line that is not a comment instead of using a fixed
  # line range: a fixed range that overshoots the header prints script code
  # (the `set -euo pipefail` line) as part of the help, and silently goes
  # stale whenever the header grows or shrinks.
  awk 'NR < 3 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "${BASH_SOURCE[0]}"
}
|
||||
|
||||
# Stop and remove the demo stack, including its named volumes and any
# containers of services no longer present in the compose file.
down() {
  local -a teardown_flags=(--volumes --remove-orphans)

  log "Tearing down the demo stack (containers + volumes)…"
  "${COMPOSE[@]}" down "${teardown_flags[@]}"
  log "Done. The demo stack is gone."
}
|
||||
|
||||
# Invoke the `neuronetz-gateway` bootstrap CLI inside the running `gateway`
# service container, forwarding every argument unchanged. `-T` disables
# pseudo-TTY allocation so the output can be captured by command substitution.
gw_cli() {
  local -a cli=(neuronetz-gateway "$@")
  "${COMPOSE[@]}" exec -T gateway "${cli[@]}"
}
|
||||
|
||||
# Poll ${GATEWAY_URL}/healthz every 2 seconds until it answers successfully.
# After 180 seconds without success, print the last 50 gateway log lines to
# stderr and abort the script via die.
wait_for_health() {
  log "Waiting for the gateway to become healthy at ${GATEWAY_URL}/healthz …"
  local deadline
  deadline=$(( $(date +%s) + 180 ))

  # --max-time bounds each probe. Without it, a connection that is accepted
  # but never answered would block curl indefinitely and the deadline check
  # below would never get a chance to run.
  until curl -fsS --max-time 5 "${GATEWAY_URL}/healthz" >/dev/null 2>&1; do
    if [ "$(date +%s)" -ge "${deadline}" ]; then
      warn "Gateway did not become healthy in time. Recent gateway logs:"
      # Best effort: failing to fetch logs must not mask the real error.
      "${COMPOSE[@]}" logs --tail=50 gateway >&2 || true
      die "Aborting."
    fi
    sleep 2
  done
  log "Gateway is up."
}
|
||||
|
||||
# Create the demo tenant, tolerating a tenant left over from an earlier run.
# Never fails the script: a CLI error whose text mentions
# exist/duplicate/unique is reported as "reusing", and any other CLI error is
# printed as a warning before carrying on (deliberate best effort).
ensure_tenant() {
  log "Creating demo tenant '${TENANT_NAME}' (allow-all-models) …"
  local cli_output
  if cli_output="$(gw_cli create-tenant --name "${TENANT_NAME}" --allow-all-models 2>&1)"; then
    printf '%s\n' "${DIM}${cli_output}${RESET}"
    return 0
  fi

  # Feed grep from a here-string rather than `printf … | grep -q`: with
  # `set -o pipefail`, grep -q exiting on its first match can kill the writer
  # with SIGPIPE, which makes the whole pipeline look like "no match".
  if grep -qiE 'exist|duplicate|unique' <<<"${cli_output}"; then
    log "Tenant '${TENANT_NAME}' already exists — reusing it."
  else
    warn "create-tenant reported:"
    printf '%s\n' "${cli_output}" >&2
    warn "Continuing; the tenant may already be present."
  fi
}
|
||||
|
||||
# Mint a new API key for the demo tenant and write the key — and nothing
# else — to stdout. All progress and error output goes to stderr so callers
# can capture the key with command substitution. The key name carries a
# timestamp suffix so each run creates a separately named key.
# Returns 1 when the CLI call fails or no key can be found in its output.
create_key() {
  local unique_name="${KEY_NAME}-$(date +%Y%m%d-%H%M%S)"
  log "Creating API key '${unique_name}' for tenant '${TENANT_NAME}' …" >&2

  local cli_output
  if ! cli_output="$(gw_cli create-key --tenant "${TENANT_NAME}" --name "${unique_name}" 2>&1)"; then
    warn "create-key failed:" >&2
    printf '%s\n' "${cli_output}" >&2
    return 1
  fi

  # The CLI output contains the short key prefix as well as the full key, and
  # both match the pattern below. Walk over every match and keep the longest
  # one (the first wins on a tie) — that is the full key, never the prefix.
  local key_pattern='nz_[A-Za-z0-9]+'
  local remaining="${cli_output}"
  local candidate=""
  local key=""
  while [[ "${remaining}" =~ ${key_pattern} ]]; do
    candidate="${BASH_REMATCH[0]}"
    if (( ${#candidate} > ${#key} )); then
      key="${candidate}"
    fi
    # Drop everything up to and including this match, then search again.
    remaining="${remaining#*"${candidate}"}"
  done

  if [ -z "${key}" ]; then
    warn "Could not parse an API key from create-key output:" >&2
    printf '%s\n' "${cli_output}" >&2
    return 1
  fi
  printf '%s' "${key}"
}
|
||||
|
||||
# Print the end-of-run summary on stdout: gateway and playground URLs, the
# freshly created API key, the model list, and five ready-to-paste curl
# commands.
#   $1  the full API key to show and to embed in the curl examples
print_summary() {
  local api_key="$1"
  local content_type='application/json'

  # Unquoted heredoc: variables are expanded, and each "\\" is emitted as a
  # single backslash (the line-continuation shown in the curl examples).
  cat <<EOF

${GREEN}${BOLD}════════════════════════════════════════════════════════════════════════${RESET}
${GREEN}${BOLD} neuronetz-gateway demo is live${RESET}
${GREEN}${BOLD}════════════════════════════════════════════════════════════════════════${RESET}

${BOLD}API base URL${RESET} ${CYAN}${GATEWAY_URL}${RESET}
${BOLD}Playground${RESET} ${CYAN}${PLAYGROUND_URL}${RESET}
${BOLD}API key${RESET} ${YELLOW}${api_key}${RESET}
${DIM}(printed once — copy it now; re-run ./demo.sh to mint another)${RESET}

${BOLD}Model backend${RESET} mock-ollama (internal network only, never published)
${BOLD}Models${RESET} llama3.1:8b · mistral:7b · qwen2.5:3b · nomic-embed-text

${BOLD}── Ready-to-paste curl commands ───────────────────────────────────────${RESET}

${DIM}# 1) Streaming chat — OpenAI-compatible SSE (data: {...} … data: [DONE])${RESET}
curl -N ${GATEWAY_URL}/v1/chat/completions \\
-H "Authorization: Bearer ${api_key}" \\
-H "Content-Type: ${content_type}" \\
-d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"Say hello in one sentence."}]}'

${DIM}# 2) Streaming chat — native Ollama NDJSON (one JSON object per line)${RESET}
curl -N ${GATEWAY_URL}/api/chat \\
-H "Authorization: Bearer ${api_key}" \\
-H "Content-Type: ${content_type}" \\
-d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"Say hello in one sentence."}]}'

${DIM}# 3) List models — the tenant's effective (live-discovered) set, OpenAI format${RESET}
curl ${GATEWAY_URL}/v1/models \\
-H "Authorization: Bearer ${api_key}"

${DIM}# 4) 401 Unauthorized — no/invalid key, fail-closed, no upstream details leaked${RESET}
curl -i ${GATEWAY_URL}/v1/models \\
-H "Authorization: Bearer nz_invalidKEYdoesNotExist000000000000000000"

${DIM}# 5) 403 Forbidden — model-mutating endpoint is hard-blocked (not configurable)${RESET}
curl -i ${GATEWAY_URL}/api/pull \\
-H "Authorization: Bearer ${api_key}" \\
-H "Content-Type: ${content_type}" \\
-d '{"model":"llama3.1:8b"}'

${BOLD}───────────────────────────────────────────────────────────────────────${RESET}
Tear it all down with: ${CYAN}./demo.sh --down${RESET}

EOF
}
|
||||
|
||||
# Bring the demo up end to end: check prerequisites, build and start the
# compose stack in the background, wait for the gateway to report healthy,
# make sure the demo tenant exists, mint an API key and print the summary.
# Aborts via die when a prerequisite is missing or no key could be obtained.
up() {
  local tool
  for tool in docker curl; do
    if ! command -v "${tool}" >/dev/null 2>&1; then
      die "${tool} is required but not found on PATH."
    fi
  done
  if [ ! -f "${COMPOSE_FILE}" ]; then
    die "Missing ${COMPOSE_FILE}"
  fi

  log "Building and starting the demo stack (postgres + redis + mock-ollama + gateway) …"
  "${COMPOSE[@]}" up --build -d

  wait_for_health
  ensure_tenant

  # create_key writes only the key to stdout; its diagnostics go to stderr.
  local api_key
  api_key="$(create_key)" || die "Could not create/parse an API key. See logs above."

  print_summary "${api_key}"
}
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# Entry point
|
||||
# ──────────────────────────────────────────────────────────────────────────
|
||||
# Dispatch on the first command-line argument:
#   (none)               bring the demo up
#   --down | -d | down   tear the demo down
#   --help | -h | help   print the usage text
# Anything else aborts with an error message.
main() {
  local action="${1:-}"

  if [[ -z "${action}" ]]; then
    up
  elif [[ "${action}" == "--down" || "${action}" == "-d" || "${action}" == "down" ]]; then
    down
  elif [[ "${action}" == "--help" || "${action}" == "-h" || "${action}" == "help" ]]; then
    usage
  else
    die "Unknown argument: $1 (try --help)"
  fi
}
|
||||
|
||||
main "$@"
|
||||
61
demo/mock-ollama/Dockerfile
Normal file
61
demo/mock-ollama/Dockerfile
Normal file
@@ -0,0 +1,61 @@
|
||||
# syntax=docker/dockerfile:1.7
|
||||
#
|
||||
# mock-ollama — a tiny FastAPI app emulating the Ollama HTTP API for the demo.
|
||||
#
|
||||
# builder stage : installs deps into a self-contained virtualenv.
|
||||
# runtime stage : copies the venv + app, drops to a NON-ROOT user, no build
|
||||
# tools, runs uvicorn on :11434.
|
||||
#
|
||||
# This image exists ONLY for the demo stack (docker-compose.demo.yml). It lets
|
||||
# the demo run with no GPU and no model downloads. It is never published to the
|
||||
# host — like real Ollama, it is reachable only on the internal Docker network.
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Stage 1 — builder
|
||||
# ----------------------------------------------------------------------------
|
||||
FROM python:3.12-slim AS builder
|
||||
|
||||
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
VIRTUAL_ENV=/opt/venv \
|
||||
PATH=/opt/venv/bin:$PATH
|
||||
|
||||
RUN python -m venv /opt/venv
|
||||
|
||||
WORKDIR /app
|
||||
COPY requirements.txt ./
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Stage 2 — runtime
|
||||
# ----------------------------------------------------------------------------
|
||||
FROM python:3.12-slim AS runtime
|
||||
|
||||
# curl is used by the compose healthcheck.
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Non-root user.
|
||||
RUN groupadd --system --gid 10001 mock \
|
||||
&& useradd --system --uid 10001 --gid mock --home-dir /app --shell /usr/sbin/nologin mock
|
||||
|
||||
ENV VIRTUAL_ENV=/opt/venv \
|
||||
PATH=/opt/venv/bin:$PATH \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
MOCK_OLLAMA_PORT=11434
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
COPY app.py ./
|
||||
|
||||
USER mock
|
||||
|
||||
EXPOSE 11434
|
||||
|
||||
HEALTHCHECK --interval=10s --timeout=3s --start-period=5s --retries=5 \
|
||||
CMD curl -fsS "http://127.0.0.1:${MOCK_OLLAMA_PORT}/api/version" || exit 1
|
||||
|
||||
CMD ["python", "-m", "app"]
|
||||
361
demo/mock-ollama/app.py
Normal file
361
demo/mock-ollama/app.py
Normal file
@@ -0,0 +1,361 @@
|
||||
"""Standalone mock Ollama service for the neuronetz-gateway demo.
|
||||
|
||||
This is a containerised sibling of ``tests/integration/mock_ollama.py``: it
|
||||
emulates the subset of the Ollama HTTP API the gateway proxies (SPEC §6.1) so
|
||||
the demo runs with **no GPU and no model downloads**. The response *shapes*
|
||||
match real Ollama closely enough that the gateway's token counter, model
|
||||
discovery (SPEC §4.6) and ``/api/show`` sanitisation all exercise real paths.
|
||||
|
||||
Endpoints emulated:
|
||||
|
||||
* ``GET /api/tags`` - model catalogue (size/digest/modified_at/details)
|
||||
* ``POST /api/chat`` - NDJSON streaming (default) or single JSON
|
||||
* ``POST /api/generate`` - NDJSON streaming (default) or single JSON
|
||||
* ``POST /api/embed`` - newer batch embeddings (field ``embeddings``)
|
||||
* ``POST /api/embeddings``- legacy single-vector embeddings (field ``embedding``)
|
||||
* ``POST /api/show`` - returns template/system so the gateway can prove it
|
||||
strips them
|
||||
* ``GET /api/version`` - plausible upstream version
* ``GET /healthz`` - always answers ``{"status": "ok"}``
|
||||
|
||||
The terminal NDJSON object of every chat/generate response carries realistic
|
||||
``prompt_eval_count`` + ``eval_count`` (and sibling duration fields) so the
|
||||
gateway counts tokens for real. Reply text is ``"Echo: <prompt>"``.
|
||||
|
||||
Runs uvicorn on :11434 as a non-root user inside the container.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
from collections.abc import AsyncIterator, Iterable
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
|
||||
# Media type used for the newline-delimited JSON streaming responses.
NDJSON_MEDIA_TYPE = "application/x-ndjson"

# The fixed model catalogue. Every entry is a dict with the keys
# name / family / parameter_size / quantization_level / size, in that order.
# The values are constants, so the catalogue is identical on every run.
MODELS: tuple[dict[str, Any], ...] = tuple(
    dict(zip(("name", "family", "parameter_size", "quantization_level", "size"), row))
    for row in (
        ("llama3.1:8b", "llama", "8.0B", "Q4_0", 4_661_211_808),
        ("mistral:7b", "llama", "7.2B", "Q4_0", 4_109_865_159),
        ("qwen2.5:3b", "qwen2", "3.1B", "Q4_K_M", 1_929_889_677),
        ("nomic-embed-text", "nomic-bert", "137M", "F16", 274_302_450),
    )
)
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(UTC).isoformat().replace("+00:00", "Z")
|
||||
|
||||
|
||||
def _digest_for(name: str) -> str:
|
||||
return "sha256:" + hashlib.sha256(name.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _details_for(name: str) -> dict[str, Any]:
    """Build the ``details`` object reported for the model called ``name``.

    A model listed in ``MODELS`` gets its catalogue family, parameter size and
    quantization level. Any other name falls back to the part of the name
    before the first ``":"`` as family, with ``"8B"`` / ``"Q4_0"`` defaults.
    """
    known = next((entry for entry in MODELS if entry["name"] == name), None)
    if known is None:
        family = name.split(":", 1)[0]
        parameter_size = "8B"
        quantization_level = "Q4_0"
    else:
        family = known["family"]
        parameter_size = known["parameter_size"]
        quantization_level = known["quantization_level"]

    return {
        "parent_model": "",
        "format": "gguf",
        "family": family,
        "families": [family],
        "parameter_size": parameter_size,
        "quantization_level": quantization_level,
    }
|
||||
|
||||
|
||||
def _reply_for(prompt: str, override: str | None) -> str:
|
||||
if override is not None:
|
||||
return override
|
||||
if not prompt:
|
||||
return "Hello from the mock Ollama backend."
|
||||
return f"Echo: {prompt}"
|
||||
|
||||
|
||||
def _tokenize(text: str) -> list[str]:
|
||||
return text.split()
|
||||
|
||||
|
||||
def _final_metrics(prompt_tokens: int, completion_tokens: int) -> dict[str, Any]:
|
||||
"""Timing/usage fields Ollama attaches to the terminal stream object."""
|
||||
return {
|
||||
"total_duration": 1_234_567_890,
|
||||
"load_duration": 12_345_678,
|
||||
"prompt_eval_count": prompt_tokens,
|
||||
"prompt_eval_duration": 23_456_789,
|
||||
"eval_count": completion_tokens,
|
||||
"eval_duration": 34_567_890,
|
||||
}
|
||||
|
||||
|
||||
def _chat_chunk(
    model: str,
    *,
    content: str,
    done: bool,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
) -> dict[str, Any]:
    """Build one ``/api/chat`` response object.

    Every object carries the model name, a creation timestamp, an assistant
    message with ``content`` and the ``done`` flag. When ``done`` is true the
    object additionally gets ``done_reason="stop"`` and the usage/timing
    fields for the given token counts; the counts are ignored otherwise.
    """
    base: dict[str, Any] = {
        "model": model,
        "created_at": _now_iso(),
        "message": {"role": "assistant", "content": content},
        "done": done,
    }
    if not done:
        return base
    return {
        **base,
        "done_reason": "stop",
        **_final_metrics(prompt_tokens, completion_tokens),
    }
|
||||
|
||||
|
||||
def _generate_chunk(
    model: str,
    *,
    response: str,
    done: bool,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
) -> dict[str, Any]:
    """Build one ``/api/generate`` response object.

    Every object carries the model name, a creation timestamp, the
    ``response`` text and the ``done`` flag. When ``done`` is true the object
    additionally gets ``done_reason="stop"``, a fixed ``context`` list and the
    usage/timing fields for the given token counts.
    """
    base: dict[str, Any] = {
        "model": model,
        "created_at": _now_iso(),
        "response": response,
        "done": done,
    }
    if not done:
        return base
    return {
        **base,
        "done_reason": "stop",
        "context": [1, 2, 3],
        **_final_metrics(prompt_tokens, completion_tokens),
    }
|
||||
|
||||
|
||||
async def _ndjson_stream(objects: Iterable[dict[str, Any]]) -> AsyncIterator[bytes]:
|
||||
for obj in objects:
|
||||
yield (json.dumps(obj) + "\n").encode("utf-8")
|
||||
|
||||
|
||||
def _extract_last_user_message(messages: list[dict[str, Any]]) -> str:
|
||||
for msg in reversed(messages):
|
||||
if msg.get("role") == "user":
|
||||
content = msg.get("content", "")
|
||||
return content if isinstance(content, str) else ""
|
||||
return ""
|
||||
|
||||
|
||||
def _as_text(value: Any) -> str:
    """Return ``value`` unchanged when it is a string, otherwise ``""``."""
    return value if isinstance(value, str) else ""


def _completion_response(
    build_chunk: Any,
    text_field: str,
    model: str,
    *,
    stream: Any,
    prompt: str,
    reply: str,
) -> Any:
    """Build the answer shared by ``/api/chat`` and ``/api/generate``.

    Args:
        build_chunk: ``_chat_chunk`` or ``_generate_chunk``.
        text_field: Keyword under which ``build_chunk`` takes its text
            (``"content"`` or ``"response"``).
        model: Model name echoed in every object.
        stream: Falsy for one JSON object, truthy for an NDJSON stream.
        prompt: Prompt text, used only for the prompt token count.
        reply: Full reply text.

    Returns:
        A ``JSONResponse`` holding the complete reply in a single terminal
        object, or a ``StreamingResponse`` that sends one object per reply
        word followed by an empty terminal object. Only terminal objects carry
        the token counts.
    """
    usage = {
        "prompt_tokens": len(_tokenize(prompt)),
        "completion_tokens": len(_tokenize(reply)),
    }

    if not stream:
        return JSONResponse(build_chunk(model, done=True, **{text_field: reply}, **usage))

    # A reply without any words still produces one (empty) content frame.
    words = _tokenize(reply) or [""]
    frames: list[dict[str, Any]] = [
        # Every word after the first is sent with a leading space.
        build_chunk(model, done=False, **{text_field: word if index == 0 else f" {word}"})
        for index, word in enumerate(words)
    ]
    frames.append(build_chunk(model, done=True, **{text_field: ""}, **usage))
    return StreamingResponse(_ndjson_stream(frames), media_type=NDJSON_MEDIA_TYPE)


def create_app() -> FastAPI:
    """Create the mock-Ollama FastAPI application.

    Routes: ``POST /api/chat``, ``POST /api/generate``, ``POST /api/embed``,
    ``POST /api/embeddings``, ``GET /api/tags``, ``POST /api/show``,
    ``GET /api/version`` and ``GET /healthz``. The interactive docs pages are
    disabled.
    """
    app = FastAPI(title="mock-ollama", docs_url=None, redoc_url=None)

    @app.post("/api/chat")
    async def chat(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        # "messages": null (or any non-list) counts as "no messages".
        messages = body.get("messages")
        if not isinstance(messages, list):
            messages = []
        prompt = _extract_last_user_message(messages)
        return _completion_response(
            _chat_chunk,
            "content",
            body.get("model", "llama3.1:8b"),
            stream=body.get("stream", True),
            prompt=prompt,
            # "reply_text" lets a caller dictate the reply instead of the echo.
            reply=_reply_for(prompt, body.get("reply_text")),
        )

    @app.post("/api/generate")
    async def generate(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        # A missing, null or non-string prompt is treated as empty, matching
        # how /api/chat treats non-string message content.
        prompt = _as_text(body.get("prompt", ""))
        return _completion_response(
            _generate_chunk,
            "response",
            body.get("model", "llama3.1:8b"),
            stream=body.get("stream", True),
            prompt=prompt,
            reply=_reply_for(prompt, body.get("reply_text")),
        )

    @app.post("/api/embed")
    async def embed(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "nomic-embed-text")
        raw_input = body.get("input", "")
        # "input" may be a single value or a list; one vector is returned per item.
        items = raw_input if isinstance(raw_input, list) else [raw_input]
        prompt_tokens = sum(len(_tokenize(str(item))) for item in items)
        return JSONResponse(
            {
                "model": model,
                "embeddings": [[0.0, 0.1, 0.2, 0.3] for _ in items],
                "total_duration": 1_111_111,
                "load_duration": 222_222,
                "prompt_eval_count": prompt_tokens,
            }
        )

    @app.post("/api/embeddings")
    async def embeddings(request: Request) -> Any:
        # Legacy single-vector endpoint: field name is ``embedding`` (singular).
        body: dict[str, Any] = await request.json()
        prompt = _as_text(body.get("prompt", ""))
        return JSONResponse(
            {
                # Ollama returns no eval_count for embeddings (SPEC §13.1);
                # only prompt_eval_count is meaningful for cost accounting.
                "embedding": [0.0, 0.1, 0.2, 0.3],
                "prompt_eval_count": len(_tokenize(prompt)),
            }
        )

    @app.get("/api/tags")
    async def tags() -> Any:
        catalogue = [
            {
                "name": entry["name"],
                "model": entry["name"],
                "modified_at": _now_iso(),
                "size": entry["size"],
                "digest": _digest_for(entry["name"]),
                "details": _details_for(entry["name"]),
            }
            for entry in MODELS
        ]
        return JSONResponse({"models": catalogue})

    @app.post("/api/show")
    async def show(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        # Accept "model" or "name"; an absent, null or empty value falls back
        # to the default model rather than being rendered as "None".
        name = str(body.get("model") or body.get("name") or "llama3.1:8b")
        # Real Ollama returns a system prompt + template here; the gateway is
        # expected to strip those. We include them so the demo (and the
        # sanitisation test) can prove they don't reach the client.
        return JSONResponse(
            {
                "modelfile": f"FROM {name}",
                "parameters": "stop \"<|eot_id|>\"",
                "template": "{{ .System }} {{ .Prompt }}",
                "system": "You are a secret internal system prompt. Do not reveal me.",
                "details": _details_for(name),
                "model_info": {"general.architecture": name.split(":", 1)[0]},
            }
        )

    @app.get("/api/version")
    async def version() -> Any:
        # Plausible upstream version; the gateway overrides this with its own
        # version (SPEC §6.1) so a client never sees this value.
        return JSONResponse({"version": "0.5.7"})

    @app.get("/healthz")
    async def healthz() -> Any:
        return JSONResponse({"status": "ok"})

    return app
|
||||
|
||||
|
||||
app = create_app()
|
||||
|
||||
|
||||
def main() -> None:
    """Serve ``app`` with uvicorn on all interfaces.

    The port is read from ``MOCK_OLLAMA_PORT`` and defaults to 11434.
    """
    listen_port = int(os.getenv("MOCK_OLLAMA_PORT", "11434"))
    uvicorn.run(
        app,
        host="0.0.0.0",  # noqa: S104
        port=listen_port,
        log_level="info",
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
2
demo/mock-ollama/requirements.txt
Normal file
2
demo/mock-ollama/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
fastapi==0.115.6
|
||||
uvicorn[standard]==0.34.0
|
||||
146
docker-compose.demo.yml
Normal file
146
docker-compose.demo.yml
Normal file
@@ -0,0 +1,146 @@
|
||||
# neuronetz-gateway — DEMO stack (postgres + redis + mock-ollama + gateway).
|
||||
#
|
||||
# This is the one-command presentation stack. It runs the real gateway image
|
||||
# (built from the repo Dockerfile) against a MOCK Ollama backend, so the whole
|
||||
# thing comes up with NO GPU and NO model downloads.
|
||||
#
|
||||
# ./demo.sh # bring it up, create a demo tenant+key, print curls
|
||||
# ./demo.sh --down # tear it all down
|
||||
#
|
||||
# Differs from the production stack (docker-compose.yml):
|
||||
# * NO caddy — the gateway is published directly on 127.0.0.1:8080.
|
||||
# * mock-ollama instead of the real ollama image.
|
||||
# * playground enabled — the gateway serves /playground from a mounted file.
|
||||
#
|
||||
# ┌─────────────────────────────────────────────────────────────────────────┐
|
||||
# │ SECURITY POSTURE (mirrors prod): │
|
||||
# │ `mock-ollama` has NO `ports:` mapping. The model backend is reachable │
|
||||
# │ only on the internal Docker network as `mock-ollama:11434`, exactly │
|
||||
# │ like real Ollama in production. Only the gateway is published, and only │
|
||||
# │ on the loopback interface (127.0.0.1:8080). │
|
||||
# └─────────────────────────────────────────────────────────────────────────┘
|
||||
|
||||
services:
|
||||
gateway:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "127.0.0.1:8080:8080"
|
||||
environment:
|
||||
GATEWAY_BIND_HOST: 0.0.0.0
|
||||
GATEWAY_BIND_PORT: "8080"
|
||||
GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
|
||||
GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console}
|
||||
GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
|
||||
GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1}
|
||||
# Serve the interactive playground from the mounted file (flag-gated;
|
||||
# OFF by default in prod). See playground/index.html.
|
||||
PLAYGROUND_ENABLED: "true"
|
||||
PLAYGROUND_FILE: /app/playground/index.html
|
||||
# Point the gateway at the mock Ollama on the internal network.
|
||||
OLLAMA_BASE_URL: http://mock-ollama:11434
|
||||
OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
|
||||
OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
|
||||
OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
|
||||
# Discover models quickly so the demo feels live.
|
||||
MODEL_DISCOVERY_REFRESH_S: ${MODEL_DISCOVERY_REFRESH_S:-15}
|
||||
MODEL_DISCOVERY_CACHE_TTL_S: ${MODEL_DISCOVERY_CACHE_TTL_S:-60}
|
||||
DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz}
|
||||
DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
|
||||
DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
|
||||
REDIS_URL: redis://redis:6379/0
|
||||
REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
|
||||
DEFAULT_RPM: ${DEFAULT_RPM:-60}
|
||||
DEFAULT_TPM: ${DEFAULT_TPM:-100000}
|
||||
DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
|
||||
MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
|
||||
MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
|
||||
ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
|
||||
ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
|
||||
ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
|
||||
AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
|
||||
AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
|
||||
PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
|
||||
AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
|
||||
volumes:
|
||||
# The gateway serves /playground by reading this file at request time.
|
||||
# Read-only mount: the demo never lets the container modify it.
|
||||
- ./playground:/app/playground:ro
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
mock-ollama:
|
||||
condition: service_healthy
|
||||
# Apply migrations, then start the server (mirrors docker-compose.dev.yml).
|
||||
command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
|
||||
interval: 10s
|
||||
timeout: 3s
|
||||
retries: 5
|
||||
start_period: 30s
|
||||
networks:
|
||||
- internal
|
||||
|
||||
# ───────────────────────────────────────────────────────────────────────────
|
||||
# mock-ollama — INTERNAL NETWORK ONLY. Stands in for the real Ollama backend.
|
||||
# NO `ports:` mapping, mirroring the production "Ollama is never exposed" rule.
|
||||
# Reachable only as `http://mock-ollama:11434` from the gateway container.
|
||||
# ───────────────────────────────────────────────────────────────────────────
|
||||
mock-ollama:
|
||||
build:
|
||||
context: ./demo/mock-ollama
|
||||
dockerfile: Dockerfile
|
||||
restart: unless-stopped
|
||||
# !!! NO `ports:` — the model backend is never published. !!!
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-fsS", "http://127.0.0.1:11434/api/version"]
|
||||
interval: 10s
|
||||
timeout: 3s
|
||||
retries: 5
|
||||
start_period: 5s
|
||||
networks:
|
||||
- internal
|
||||
|
||||
postgres:
|
||||
image: postgres:16-alpine
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
POSTGRES_USER: ${POSTGRES_USER:-gateway}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway}
|
||||
POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
|
||||
volumes:
|
||||
- postgres_demo_data:/var/lib/postgresql/data
|
||||
# No `ports:` — Postgres is internal-only.
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
|
||||
interval: 5s
|
||||
timeout: 3s
|
||||
retries: 10
|
||||
networks:
|
||||
- internal
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
restart: unless-stopped
|
||||
command: ["redis-server", "--save", "", "--appendonly", "no"]
|
||||
# No `ports:` — Redis is internal-only.
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 5s
|
||||
timeout: 3s
|
||||
retries: 10
|
||||
networks:
|
||||
- internal
|
||||
|
||||
networks:
|
||||
# Private bridge network for inter-service traffic. Only the gateway publishes
# a port to the host (127.0.0.1:8080); no other service has a `ports:` mapping.
|
||||
internal:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
postgres_demo_data:
|
||||
253
docs/API.md
Normal file
253
docs/API.md
Normal file
@@ -0,0 +1,253 @@
|
||||
# neuronetz-gateway — API Reference
|
||||
|
||||
The gateway exposes two compatible API surfaces in front of the Ollama backend:
|
||||
|
||||
- **Native Ollama** under `/api/*` — NDJSON streaming, identical request shapes to Ollama.
|
||||
- **OpenAI-compatible** under `/v1/*` — SSE streaming, drop-in for the OpenAI SDKs.
|
||||
|
||||
Unauthenticated health endpoints are exposed as well. Everything else is blocked.
|
||||
|
||||
> Source of truth: [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §6. Where this doc and the
|
||||
> SPEC disagree, the SPEC wins.
|
||||
|
||||
---
|
||||
|
||||
## Authentication
|
||||
|
||||
Every model endpoint requires an API key as a Bearer token:
|
||||
|
||||
```
|
||||
Authorization: Bearer nz_<9-char-entropy><32-char-random>
|
||||
```
|
||||
|
||||
- **Key format:** `nz_` namespace + random base62 body. The first 12 characters
|
||||
(`nz_` + entropy) are the **prefix**, stored in cleartext and indexed for O(1) lookup.
|
||||
The full key is **argon2id**-hashed; it is shown **exactly once** at creation
|
||||
(`neuronetz-gateway create-key`) and never stored or logged.
|
||||
- **Fail-closed:** a missing, malformed, expired, disabled, or revoked key returns **401**.
|
||||
No upstream/Ollama detail is ever leaked in the error.
|
||||
- Health endpoints (`/healthz`, `/readyz`) require **no** auth.
|
||||
|
||||
The placeholder key `nz_demoKEY...` is used throughout this doc. `./demo.sh` prints a
|
||||
**real** key for the local demo.
|
||||
|
||||
---
|
||||
|
||||
## Response headers (SPEC §6.5)
|
||||
|
||||
Every proxied response carries:
|
||||
|
||||
| Header | Meaning |
|
||||
|---|---|
|
||||
| `X-Request-ID` | Correlates the response with the audit log row. Present on errors too. |
|
||||
| `X-RateLimit-Limit-Requests` | Effective RPM limit for this key/tenant. |
|
||||
| `X-RateLimit-Remaining-Requests` | Requests remaining in the current window. |
|
||||
| `X-RateLimit-Limit-Tokens` | Effective TPM limit. |
|
||||
| `X-RateLimit-Remaining-Tokens` | Tokens remaining in the current window. |
|
||||
| `X-Budget-Period` | `day` \| `month` \| `total` — the binding budget period. |
|
||||
| `X-Budget-Tokens-Remaining` | Tokens left in the binding budget period. |
|
||||
|
||||
`429 Too Many Requests` responses additionally carry `Retry-After: <seconds>`.
|
||||
|
||||
---
|
||||
|
||||
## Error model
|
||||
|
||||
Errors are **sanitized** at the gateway boundary — Ollama internals are never reflected.
|
||||
The body is a small generic JSON object and the `X-Request-ID` header ties it to the audit log.
|
||||
|
||||
```json
|
||||
{ "error": { "message": "forbidden", "type": "forbidden", "code": 403 }, "request_id": "b3f1…" }
|
||||
```
|
||||
|
||||
| Status | When |
|
||||
|---|---|
|
||||
| `400` | Malformed body, schema violation, or `num_predict` over the cap. |
|
||||
| `401` | Missing / invalid / expired / revoked key. |
|
||||
| `403` | Endpoint hard-blocked, or model outside the tenant's effective set (no existence disclosure). |
|
||||
| `413` | Request body over `MAX_REQUEST_BODY_BYTES` (default 256 KiB). |
|
||||
| `429` | Rate limit or budget exceeded (carries `Retry-After`). |
|
||||
| `502` | Ollama upstream unreachable / circuit breaker open. |
|
||||
| `503` | A required subsystem (Postgres read, Redis) is down — **fail-closed**, never "allow". |
|
||||
|
||||
A model that is *installed-but-unpermitted* and a model that is *not installed* return the
|
||||
**same** generic `403`, to prevent enumeration (SPEC §13.6).
|
||||
|
||||
---
|
||||
|
||||
## Native Ollama endpoints (`/api/*`)
|
||||
|
||||
### `POST /api/chat`
|
||||
|
||||
Streamed (NDJSON, default) or non-streamed chat completion.
|
||||
|
||||
```bash
|
||||
curl -N http://localhost:8080/api/chat \
|
||||
-H "Authorization: Bearer nz_demoKEY..." \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model":"llama3.1:8b","stream":true,
|
||||
"messages":[{"role":"user","content":"Say hello in one sentence."}]}'
|
||||
```
|
||||
|
||||
**Streaming response** — `Content-Type: application/x-ndjson`, one JSON object per line:
|
||||
|
||||
```
|
||||
{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":"Echo:"},"done":false}
|
||||
{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":" Say"},"done":false}
|
||||
…
|
||||
{"model":"llama3.1:8b","done":true,"done_reason":"stop",
|
||||
"prompt_eval_count":6,"eval_count":7,"total_duration":1234567890,"eval_duration":34567890}
|
||||
```
|
||||
|
||||
The **final** object carries `prompt_eval_count` (tokens in) and `eval_count` (tokens out);
|
||||
the gateway uses these for precise token accounting (SPEC §4.3 step 12).
|
||||
|
||||
**Non-streaming** (`"stream": false`) returns a single JSON object of the same shape with
|
||||
`"done": true`.
|
||||
|
||||
### `POST /api/generate`
|
||||
|
||||
Same semantics as `/api/chat` but uses a flat `prompt` string and returns `response`
|
||||
fields instead of `message` objects.
|
||||
|
||||
```bash
|
||||
curl -N http://localhost:8080/api/generate \
|
||||
-H "Authorization: Bearer nz_demoKEY..." \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model":"llama3.1:8b","stream":true,"prompt":"Write a haiku about routers."}'
|
||||
```
|
||||
|
||||
### `POST /api/embed` / `POST /api/embeddings`
|
||||
|
||||
Non-streamed embeddings. `/api/embed` is the newer batch endpoint (field `embeddings`,
|
||||
a list of vectors); `/api/embeddings` is the legacy single-vector endpoint (field
|
||||
`embedding`). Ollama returns no `eval_count` for embeddings; cost is charged on
|
||||
`prompt_eval_count` only (SPEC §13.1).
|
||||
|
||||
```bash
|
||||
curl http://localhost:8080/api/embed \
|
||||
-H "Authorization: Bearer nz_demoKEY..." \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model":"nomic-embed-text","input":["hello","world"]}'
|
||||
```
|
||||
|
||||
```json
|
||||
{ "model": "nomic-embed-text", "embeddings": [[0.0, 0.1, …], [0.0, 0.1, …]], "prompt_eval_count": 2 }
|
||||
```
|
||||
|
||||
### `GET /api/tags`
|
||||
|
||||
Returns the tenant's **effective** model set — the live-discovered set intersected with the
|
||||
tenant's allowlist, or *all* discovered models when `allow_all_models` is on. Sourced from
|
||||
discovery (SPEC §4.6), never a static list.
|
||||
|
||||
```bash
|
||||
curl http://localhost:8080/api/tags -H "Authorization: Bearer nz_demoKEY..."
|
||||
```
|
||||
|
||||
### `POST /api/show`
|
||||
|
||||
Allowed only for models in the effective set; returns **sanitized** model info.
|
||||
The system prompt and template that Ollama returns are **stripped** by the gateway.
|
||||
|
||||
### `GET /api/version`
|
||||
|
||||
Returns the **gateway** version, not the Ollama version.
|
||||
|
||||
```json
|
||||
{ "version": "0.1.0" }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Hard-blocked endpoints (always `403`)
|
||||
|
||||
These model-mutating endpoints are blocked at the gateway. **Not configurable, not behind a
|
||||
flag** (SPEC §6.2, AGENT_PROMPT non-negotiable #5):
|
||||
|
||||
```
|
||||
/api/pull /api/push /api/create /api/copy /api/delete /api/blobs/*
|
||||
```
|
||||
|
||||
```bash
|
||||
# Always 403, even with a valid key:
|
||||
curl -i http://localhost:8080/api/pull \
|
||||
-H "Authorization: Bearer nz_demoKEY..." \
|
||||
-H "Content-Type: application/json" -d '{"model":"llama3.1:8b"}'
|
||||
```
|
||||
|
||||
`GET /api/ps` is also blocked (it would leak which models are loaded).
|
||||
|
||||
---
|
||||
|
||||
## OpenAI-compatible endpoints (`/v1/*`)
|
||||
|
||||
| Path | Method | Maps to |
|
||||
|---|---|---|
|
||||
| `/v1/chat/completions` | POST | `/api/chat` |
|
||||
| `/v1/completions` | POST | `/api/generate` |
|
||||
| `/v1/embeddings` | POST | `/api/embed` |
|
||||
| `/v1/models` | GET | `/api/tags` (effective set, OpenAI list format) |
|
||||
|
||||
Streaming uses **SSE**: `data: {…}\n\n` events terminated by a literal `data: [DONE]\n\n`.
|
||||
|
||||
### `POST /v1/chat/completions`
|
||||
|
||||
```bash
|
||||
curl -N http://localhost:8080/v1/chat/completions \
|
||||
-H "Authorization: Bearer nz_demoKEY..." \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"model":"llama3.1:8b","stream":true,
|
||||
"messages":[{"role":"user","content":"Say hello in one sentence."}]}'
|
||||
```
|
||||
|
||||
**Streaming response** — `Content-Type: text/event-stream`:
|
||||
|
||||
```
|
||||
data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"Echo:"},"finish_reason":null}]}
|
||||
|
||||
data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":" Say"},"finish_reason":null}]}
|
||||
|
||||
data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":6,"completion_tokens":7,"total_tokens":13}}
|
||||
|
||||
data: [DONE]
|
||||
```
|
||||
|
||||
Works with the OpenAI Python SDK by pointing `base_url` at `http://localhost:8080/v1`.
|
||||
|
||||
### `GET /v1/models`
|
||||
|
||||
```bash
|
||||
curl http://localhost:8080/v1/models -H "Authorization: Bearer nz_demoKEY..."
|
||||
```
|
||||
|
||||
```json
|
||||
{ "object": "list", "data": [
|
||||
{ "id": "llama3.1:8b", "object": "model", "owned_by": "neuronetz" },
|
||||
{ "id": "mistral:7b", "object": "model", "owned_by": "neuronetz" }
|
||||
] }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Health endpoints
|
||||
|
||||
| Path | Method | Auth | Purpose |
|
||||
|---|---|---|---|
|
||||
| `/healthz` | GET | none | Liveness — process responsive (`200`). |
|
||||
| `/readyz` | GET | none | Readiness — DB + Redis + Ollama reachable, else `503`. |
|
||||
| `/metrics` | GET | none (loopback only) | Prometheus exposition. |
|
||||
|
||||
```bash
|
||||
curl -i http://localhost:8080/healthz # 200 {"status":"ok"}
|
||||
curl -i http://localhost:8080/readyz # 200 when all deps up; 503 otherwise
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quick reference: streaming formats
|
||||
|
||||
| Surface | Content-Type | Frame | Terminator |
|
||||
|---|---|---|---|
|
||||
| Native `/api/*` | `application/x-ndjson` | one JSON object per line (`\n`-delimited) | final object has `"done": true` |
|
||||
| OpenAI `/v1/*` | `text/event-stream` | `data: {…}\n\n` | `data: [DONE]\n\n` |
|
||||
168
docs/ARCHITECTURE.md
Normal file
168
docs/ARCHITECTURE.md
Normal file
@@ -0,0 +1,168 @@
|
||||
# neuronetz-gateway — Architecture
|
||||
|
||||
Distilled from [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §4. The SPEC is the source of truth.
|
||||
|
||||
The gateway is the **hot path** of the Neuronetz API: a secure, multi-tenant proxy in front
|
||||
of an Ollama instance. The Ollama backend must never be reachable directly from the public
|
||||
internet — all access flows through this gateway. Administration (dashboards, tenant
|
||||
self-service) lives in a separate service, `neuronetz-console`, and is out of scope here.
|
||||
|
||||
---
|
||||
|
||||
## Component diagram (SPEC §4.1)
|
||||
|
||||
```
|
||||
Internet
|
||||
│ TLS
|
||||
▼
|
||||
┌──────────────────────┐
|
||||
│ Caddy (sidecar) │ Let's Encrypt for api.neuronetz.ai
|
||||
│ - TLS termination │ HSTS, security headers
|
||||
│ - HTTP/2, HTTP/3 │
|
||||
└──────────┬───────────┘
|
||||
│ HTTP/1.1 internal
|
||||
┌──────────▼───────────┐
|
||||
│ neuronetz-gateway │ FastAPI + uvicorn
|
||||
│ - authn │
|
||||
│ - rate limit │
|
||||
│ - budget check │
|
||||
│ - proxy + stream │
|
||||
│ - token count │
|
||||
│ - audit write │
|
||||
└──┬────────┬──────┬───┘
|
||||
│ │ │
|
||||
┌──────▼──┐ ┌──▼───┐ │
|
||||
│Postgres │ │Redis │ │
|
||||
│ schema: │ │ keys │ │
|
||||
│ gateway │ │bucket│ │
|
||||
└─────────┘ └──────┘ │
|
||||
│ internal network only
|
||||
┌──────▼──────┐
|
||||
│ Ollama │
|
||||
│ 127.0.0.1 │
|
||||
└─────────────┘
|
||||
|
||||
Same Compose stack also hosts (separate from this SPEC):
|
||||
- neuronetz-console (PHP/Nibiru) → reads schema `console`, reads schema `gateway` (SELECT)
|
||||
```
|
||||
|
||||
Only **Caddy** publishes ports. Postgres, Redis and (critically) **Ollama** have no
|
||||
published ports and are reachable only on the internal Docker network.
|
||||
|
||||
---
|
||||
|
||||
## Database schemas (SPEC §4.2)
|
||||
|
||||
A single Postgres instance with two schemas:
|
||||
|
||||
- **`gateway`** — owned by this service; full DDL. Tables: `tenants`, `tenant_limits`,
|
||||
`api_keys`, `key_limits`, `budget_usage`, `audit_log`, `prompt_log`, `revocations`
|
||||
(see SPEC §5 for the full DDL).
|
||||
- **`console`** — owned by `neuronetz-console` (out of scope). The console role gets
|
||||
`SELECT` on all `gateway.*` tables and `INSERT` on `gateway.revocations` only.
|
||||
|
||||
If the console needs to mutate gateway state (e.g. revoke a key), it does so by inserting
|
||||
into the `gateway.revocations` **outbox** table, which the gateway tails (see Revocation below).
|
||||
|
||||
**Limit inheritance:** limits and budgets resolve key → tenant. A `NULL` key-level value
|
||||
inherits the tenant value. For `allow_all_models`, a non-`NULL` key value overrides the
|
||||
tenant flag; otherwise the tenant flag applies (SPEC §13.7).
|
||||
|
||||
---
|
||||
|
||||
## Request lifecycle (SPEC §4.3)
|
||||
|
||||
1. Caddy terminates TLS and forwards to the gateway on the internal port.
|
||||
2. Middleware extracts `Authorization: Bearer <key>`.
|
||||
3. The 12-char prefix is the Redis cache key. On miss, look up `gateway.api_keys` by prefix,
|
||||
verify the full key with argon2id, and cache resolved metadata in Redis (TTL 60 s).
|
||||
4. **Rate limit** check — sliding window in Redis (Lua-atomic): per-key RPM + per-tenant RPM.
|
||||
5. **Budget** check — Redis counter for the current period; Postgres ledger is the source of
|
||||
truth on reset.
|
||||
6. **Concurrency** semaphore — Redis `INCR` with TTL.
|
||||
7. **Model allowlist** check — resolve the effective set (see below); the request `model`
|
||||
must be in it, else a generic `403`.
|
||||
8. **Endpoint allowlist** check — mutating endpoints are hard-blocked.
|
||||
9. **Body validation** — size, schema, `num_predict` cap.
|
||||
10. If an OpenAI-compat path, translate the request to the Ollama schema.
|
||||
11. Open an httpx async stream to Ollama.
|
||||
12. Stream the response back to the client, accumulating the final `prompt_eval_count` +
|
||||
`eval_count`.
|
||||
13. On stream close: write the `gateway.audit_log` row; decrement the budget; release the
|
||||
semaphore; if prompt logging is enabled, write `gateway.prompt_log`.
|
||||
14. On any failure: sanitized error to the client, audit row with the status code, semaphore
|
||||
released.
|
||||
|
||||
**Streaming integrity:** token counting and the audit write happen **after** stream close,
|
||||
never on the hot path — time-to-first-byte is not degraded by bookkeeping (SPEC §9).
|
||||
|
||||
---
|
||||
|
||||
## Model discovery (SPEC §4.6)
|
||||
|
||||
The set of usable models is **never hand-maintained**; it is extracted live from Ollama.
|
||||
|
||||
- A background task (started in the app lifespan, alongside the revocation listener) polls
|
||||
Ollama `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds.
|
||||
- The parsed set (names + sanitized metadata: family, parameter size, quantization, size,
|
||||
modified-at) is cached in Redis under `gateway:models:discovered` with TTL
|
||||
`MODEL_DISCOVERY_CACHE_TTL_S`, and held in-process for hot reads on the request path.
|
||||
- An initial fetch runs at startup; if Ollama is unreachable the discovered set is empty.
|
||||
- **Fail-closed:** an empty or expired-and-unrefreshable discovered set means *no model
|
||||
resolves* and requests are denied. Discovery never opens access on failure.
|
||||
- **Auto-grant:** because the effective set intersects with `discovered` (or *is*
|
||||
`discovered` when `allow_all_models`), a model pulled into Ollama out-of-band becomes
|
||||
usable to `allow_all_models` tenants on the next refresh — no per-tenant config change.
|
||||
- Discovery is **read-only** against Ollama and uses only the allowlisted `/api/tags`
|
||||
endpoint; it never triggers a model pull.
|
||||
|
||||
### Effective-set resolution (SPEC §4.3 step 7)
|
||||
|
||||
```
|
||||
allow_all := key.allow_all_models ?? tenant.allow_all_models
|
||||
effective := discovered if allow_all
|
||||
(key.allowed_models ?? tenant.allowed_models) ∩ discovered otherwise
|
||||
```
|
||||
|
||||
`/api/tags` and `/v1/models` return exactly this effective set, so the listing never reveals
|
||||
models outside the tenant's reach. A model that is installed-but-unpermitted and one that is
|
||||
not installed both return the same generic `403` — no existence disclosure (SPEC §13.6).
|
||||
|
||||
---
|
||||
|
||||
## Failure modes — fail-closed (SPEC §4.4)
|
||||
|
||||
| Subsystem | If down | Behavior |
|
||||
|---|---|---|
|
||||
| Postgres (read) | Key lookup fails | `503` with retry-after; nothing proxied. |
|
||||
| Postgres (write) | Audit write fails | Request still succeeds; audit row buffered in-memory ring (max 1000), drained on recovery; if the buffer fills, switch to deny mode. |
|
||||
| Redis | Rate limit / budget unavailable | `503` — fail closed. Never "allow because we can't check." |
|
||||
| Ollama | Upstream unreachable | `502` with retry-after; circuit breaker opens after 5 consecutive failures, half-open after 30 s. |
|
||||
| Caddy | Not a gateway concern | — |
|
||||
|
||||
The governing rule (AGENT_PROMPT non-negotiable #1): **if a security or budgeting check
|
||||
cannot be performed, deny.** Never default to allow.
|
||||
|
||||
---
|
||||
|
||||
## Cache invalidation / key revocation (SPEC §4.5)
|
||||
|
||||
The console revokes a key by inserting into `gateway.revocations(key_id, ts, reason)`.
|
||||
A background task in the gateway lifespan:
|
||||
|
||||
- `LISTEN`s on the Postgres channel `key_revoked` (the gateway emits `NOTIFY` on its own
|
||||
write path; the console's INSERT fires a trigger that emits it).
|
||||
- On notification, evicts the Redis cache entry for that key's prefix.
|
||||
|
||||
This makes revocation effectively immediate (≤ Redis RTT) with no cross-service HTTP.
|
||||
|
||||
---
|
||||
|
||||
## Observability
|
||||
|
||||
- **Structured logs** (structlog), JSON in production. Secrets/keys are never logged.
|
||||
- **Prometheus** `/metrics` (loopback only): `gateway_requests_total{tenant,model,status}`,
|
||||
`gateway_tokens_total{tenant,model,direction}`,
|
||||
`gateway_request_duration_seconds{tenant,model}` (histogram). Labelled by `tenant`, never
|
||||
by `key_id` (cardinality — SPEC §13.3); per-key data lives in Postgres.
|
||||
- **Audit log** — always-on request metadata. **Prompt log** — opt-in per key, TTL'd.
|
||||
188
docs/DEPLOYMENT.md
Normal file
188
docs/DEPLOYMENT.md
Normal file
@@ -0,0 +1,188 @@
|
||||
# neuronetz-gateway — Deployment
|
||||
|
||||
Production deployment is a single Docker Compose stack: **Caddy + gateway + Postgres + Redis
|
||||
+ Ollama**. Caddy is the only public-facing component; it terminates TLS via Let's Encrypt
|
||||
for `api.neuronetz.ai` and reverse-proxies to the internal-only gateway.
|
||||
|
||||
> For the local, no-GPU demo (mock Ollama + playground), see [`PLAYGROUND.md`](PLAYGROUND.md)
|
||||
> and run `./demo.sh`. This document is the **production** path.
|
||||
|
||||
---
|
||||
|
||||
## The one rule that must never break
|
||||
|
||||
> ## ⛔ Ollama is NEVER exposed to the host or the internet.
|
||||
>
|
||||
> The `ollama` service in `docker-compose.yml` has **no `ports:` mapping** and must never
|
||||
> get one. Ollama is reachable only on the internal Docker network as `ollama:11434`.
|
||||
> Publishing it would re-open the exact unauthenticated exposure this whole project exists
|
||||
> to close (SPEC §1, §3; AGENT_PROMPT non-negotiable #2).
|
||||
|
||||
The same posture applies to **Postgres** and **Redis** in the production compose file — no
|
||||
published ports. Only **Caddy** binds host ports (80/443, 443/udp for HTTP/3).
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- A host with Docker + Docker Compose.
|
||||
- DNS: `api.neuronetz.ai` → the host's public IP (for Let's Encrypt).
|
||||
- Ports 80 and 443 reachable from the internet (ACME HTTP/TLS challenge + serving).
|
||||
|
||||
---
|
||||
|
||||
## Steps
|
||||
|
||||
```bash
|
||||
git clone <repo> neuronetz-gateway && cd neuronetz-gateway
|
||||
|
||||
# 1. Configure. Copy the example env and change EVERY secret.
|
||||
cp .env.example .env
|
||||
# - POSTGRES_PASSWORD: a strong, unique value
|
||||
# - DATABASE_URL: must match the POSTGRES_* values
|
||||
# - GATEWAY_LOG_FORMAT=json for production
|
||||
|
||||
# 2. Configure Caddy for your domain + ACME email.
|
||||
cp ops/caddy/Caddyfile.example ops/caddy/Caddyfile # then edit the site + email
|
||||
# (docker-compose.yml mounts Caddyfile.example by default; point it at your edited file
|
||||
# or edit in place.)
|
||||
|
||||
# 3. Bring up the full stack. The gateway runs `alembic upgrade head`, then serves.
|
||||
docker compose up -d --build
|
||||
|
||||
# 4. Bootstrap a tenant + key (CLI runs inside the gateway container).
|
||||
docker compose exec gateway neuronetz-gateway create-tenant --name acme --rpm 120 --tpm 200000
|
||||
docker compose exec gateway neuronetz-gateway create-key --tenant acme --name prod-server-1
|
||||
# ^ prints the full key ONCE — store it in your secret manager now.
|
||||
|
||||
# 5. Smoke test (through Caddy / TLS).
|
||||
curl https://api.neuronetz.ai/healthz
|
||||
curl -N https://api.neuronetz.ai/v1/chat/completions \
|
||||
-H "Authorization: Bearer nz_…" -H "Content-Type: application/json" \
|
||||
-d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"hi"}]}'
|
||||
```
|
||||
|
||||
Caddy obtains and renews the certificate automatically. For local testing without a public
|
||||
domain, use the `localhost { tls internal … }` block documented in `Caddyfile.example`
|
||||
(trust Caddy's local CA or pass `-k` to curl).
|
||||
|
||||
---
|
||||
|
||||
## Pointing at a real Ollama backend
|
||||
|
||||
The gateway reaches Ollama via `OLLAMA_BASE_URL`. In the bundled stack this is the in-stack
|
||||
`ollama` service: `OLLAMA_BASE_URL=http://ollama:11434`.
|
||||
|
||||
To use an **existing/external** Ollama host instead:
|
||||
|
||||
1. Remove the `ollama` service from `docker-compose.yml` (or leave it; it just won't be used).
|
||||
2. Set `OLLAMA_BASE_URL` to the backend address reachable from the gateway container, e.g.
|
||||
`http://10.0.0.5:11434` or an internal DNS name.
|
||||
3. Ensure that backend is itself **not** exposed to the internet — the gateway is the only
|
||||
thing that should ever reach it. Use a private network / firewall rule, not a public port.
|
||||
4. Pull the models you want available on that backend. They appear in tenants' effective sets
|
||||
automatically on the next discovery refresh (SPEC §4.6) — no gateway config change for
|
||||
`allow_all_models` tenants.
|
||||
|
||||
Discovery polls `OLLAMA_BASE_URL/api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds. If the
|
||||
backend is unreachable, the discovered set is empty and requests **fail closed**.
|
||||
|
||||
---
|
||||
|
||||
## Environment reference (SPEC §7)
|
||||
|
||||
All configuration is via environment variables, validated by Pydantic Settings on boot. Boot
|
||||
**fails loudly** on invalid config. See [`.env.example`](../.env.example) for a copyable file.
|
||||
|
||||
### Service
|
||||
| Var | Default | Notes |
|
||||
|---|---|---|
|
||||
| `GATEWAY_BIND_HOST` | `0.0.0.0` | Bind-all inside the container. |
|
||||
| `GATEWAY_BIND_PORT` | `8080` | Internal port; never published directly in prod. |
|
||||
| `GATEWAY_LOG_LEVEL` | `INFO` | |
|
||||
| `GATEWAY_LOG_FORMAT` | `json` | `json` in prod, `console` for local dev. |
|
||||
| `GATEWAY_REQUEST_ID_HEADER` | `X-Request-ID` | |
|
||||
| `GATEWAY_TRUSTED_PROXIES` | `127.0.0.1,caddy` | Sources trusted for `X-Forwarded-For`. |
|
||||
|
||||
### Upstream (Ollama)
|
||||
| Var | Default | Notes |
|
||||
|---|---|---|
|
||||
| `OLLAMA_BASE_URL` | `http://ollama:11434` | Internal address of the backend. |
|
||||
| `OLLAMA_CONNECT_TIMEOUT_S` | `5` | |
|
||||
| `OLLAMA_READ_TIMEOUT_S` | `600` | Long, for slow generations. |
|
||||
| `OLLAMA_MAX_CONNECTIONS` | `64` | httpx pool size. |
|
||||
|
||||
### Model discovery (§4.6)
|
||||
| Var | Default | Notes |
|
||||
|---|---|---|
|
||||
| `MODEL_DISCOVERY_REFRESH_S` | `60` | How often to re-query `/api/tags`. |
|
||||
| `MODEL_DISCOVERY_CACHE_TTL_S` | `120` | Redis TTL for the discovered set. |
|
||||
|
||||
### Database
|
||||
| Var | Default | Notes |
|
||||
|---|---|---|
|
||||
| `DATABASE_URL` | `postgresql+asyncpg://…` | asyncpg driver. |
|
||||
| `DATABASE_POOL_SIZE` | `10` | |
|
||||
| `DATABASE_POOL_OVERFLOW` | `20` | |
|
||||
|
||||
### Redis
|
||||
| Var | Default | Notes |
|
||||
|---|---|---|
|
||||
| `REDIS_URL` | `redis://redis:6379/0` | |
|
||||
| `REDIS_KEY_CACHE_TTL_S` | `60` | Resolved-key cache TTL. |
|
||||
|
||||
### Limits (defaults; per-tenant/key DB overrides win)
|
||||
| Var | Default | Notes |
|
||||
|---|---|---|
|
||||
| `DEFAULT_RPM` | `60` | |
|
||||
| `DEFAULT_TPM` | `100000` | |
|
||||
| `DEFAULT_CONCURRENT` | `8` | |
|
||||
| `MAX_REQUEST_BODY_BYTES` | `262144` | 256 KiB request cap. |
|
||||
| `MAX_NUM_PREDICT` | `4096` | Hard cap on requested completion tokens. |
|
||||
|
||||
### Security
|
||||
| Var | Default | Notes |
|
||||
|---|---|---|
|
||||
| `ARGON2_TIME_COST` | `3` | |
|
||||
| `ARGON2_MEMORY_COST_KIB` | `65536` | 64 MiB. |
|
||||
| `ARGON2_PARALLELISM` | `4` | |
|
||||
| `AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN` | `20` | Throttles auth brute-force per source IP. |
|
||||
|
||||
### Audit
|
||||
| Var | Default | Notes |
|
||||
|---|---|---|
|
||||
| `AUDIT_BUFFER_SIZE` | `1000` | Ring buffer; full ⇒ deny mode. |
|
||||
| `PROMPT_LOG_DEFAULT_RETENTION_DAYS` | `30` | |
|
||||
| `AUDIT_LOG_DEFAULT_RETENTION_DAYS` | `365` | |
|
||||
|
||||
---
|
||||
|
||||
## TLS & security headers (Caddy)
|
||||
|
||||
`ops/caddy/Caddyfile.example` already sets:
|
||||
|
||||
- **HSTS** `max-age=63072000; includeSubDomains; preload`
|
||||
- `X-Content-Type-Options: nosniff`
|
||||
- `X-Frame-Options: DENY`
|
||||
- `Referrer-Policy: no-referrer`
|
||||
- strips `Server` and `X-Powered-By`
|
||||
|
||||
Edit the site address and ACME `email` before deploying.
|
||||
|
||||
---
|
||||
|
||||
## Non-Compose (systemd)
|
||||
|
||||
A systemd unit is provided for hosts that run the image directly (`ops/systemd/`). The
|
||||
gateway still requires reachable Postgres, Redis, and Ollama, and the same environment
|
||||
variables. TLS in that topology is terminated by whatever fronts the host (Caddy, nginx, a load balancer) —
|
||||
**Ollama still must not be publicly reachable.**
|
||||
|
||||
---
|
||||
|
||||
## Upgrades & migrations
|
||||
|
||||
The gateway runs `alembic upgrade head` on container start, so a normal
|
||||
`docker compose up -d --build` after pulling a new version applies pending migrations. For
|
||||
zero-downtime upgrades, run migrations as a one-off
|
||||
(`docker compose run --rm gateway alembic upgrade head`) before rolling the service.
|
||||
172
docs/OPERATIONS.md
Normal file
172
docs/OPERATIONS.md
Normal file
@@ -0,0 +1,172 @@
|
||||
# neuronetz-gateway — Operations Runbook
|
||||
|
||||
Day-2 operations for the gateway: managing tenants and keys, budgets, model policy, usage,
|
||||
and the fail-closed behaviors you'll encounter. All administration is via the **bootstrap
|
||||
CLI** (SPEC §11), run inside the gateway container. There are no admin HTTP endpoints in the
|
||||
gateway (that's `neuronetz-console`'s job).
|
||||
|
||||
> Run the CLI inside the running container:
|
||||
> ```bash
|
||||
> docker compose exec gateway neuronetz-gateway <command> …
|
||||
> ```
|
||||
> In the demo stack, swap the compose file: `docker compose -f docker-compose.demo.yml exec gateway …`
|
||||
|
||||
---
|
||||
|
||||
## Keys
|
||||
|
||||
### Create a key
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway create-key --tenant acme --name prod-server-1
|
||||
# optional: --scopes chat,embeddings (default: chat,embeddings)
|
||||
```
|
||||
|
||||
The **full key is printed exactly once** in the form `<prefix><secret>` (the prefix starts with `nz_`). Store it
|
||||
immediately in your secret manager — it is argon2id-hashed and cannot be recovered. Only the
|
||||
12-char `prefix` is retained in cleartext server-side.
|
||||
|
||||
### List keys (never shows full keys)
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway list-keys --tenant acme
|
||||
# prints: <prefix> status=active name='prod-server-1' created=…
|
||||
```
|
||||
|
||||
### Revoke a key
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway revoke-key --prefix nz_abc123456
|
||||
```
|
||||
|
||||
This sets the key status to `revoked` and writes the `gateway.revocations` outbox row. A
|
||||
Postgres `NOTIFY` on channel `key_revoked` fires; the gateway evicts the key's Redis cache
|
||||
entry, so revocation takes effect within ~1 second (SPEC §4.5) without restarting anything.
|
||||
A subsequent request with that key returns **401**.
|
||||
|
||||
> The console (`neuronetz-console`) revokes keys the same way — by inserting into
|
||||
> `gateway.revocations`. The trigger-driven NOTIFY makes it immediate without any
|
||||
> cross-service HTTP call.
|
||||
|
||||
### Rotate a key
|
||||
|
||||
There is no in-place rotation. Rotate by: create a new key → deploy it to the client → verify
|
||||
traffic on the new prefix → revoke the old prefix.
|
||||
|
||||
---
|
||||
|
||||
## Tenants & limits
|
||||
|
||||
### Create a tenant
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway create-tenant --name acme \
|
||||
--rpm 120 --tpm 200000 --concurrent 8
|
||||
# add --allow-all-models to opt into using any installed model (default: off)
|
||||
```
|
||||
|
||||
Limits inherit **key → tenant**: a `NULL` key-level limit uses the tenant value.
|
||||
|
||||
---
|
||||
|
||||
## Budgets
|
||||
|
||||
Set per-key token budgets (any combination of daily / monthly / total):
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway set-budget --key nz_abc123456 \
|
||||
--daily 1000000 --monthly 30000000 --total 500000000
|
||||
```
|
||||
|
||||
- Budgets are enforced **fail-closed**: when the binding period hits zero remaining, requests
|
||||
return **429** with a descriptive error and a `Retry-After` header. The binding period and
|
||||
remaining balance are surfaced on every response via `X-Budget-Period` and
|
||||
`X-Budget-Tokens-Remaining` (SPEC §6.5).
|
||||
- Live counters live in Redis; the Postgres ledger (`gateway.budget_usage`) is the source of
|
||||
truth on period rollover/reset.
|
||||
|
||||
---
|
||||
|
||||
## Model policy
|
||||
|
||||
### Set an explicit allowlist (default-deny)
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway set-models --tenant acme \
|
||||
--models llama3.1:8b,mistral:7b
|
||||
```
|
||||
|
||||
The tenant's **effective set** is `allowed_models ∩ discovered` — entries that aren't
|
||||
actually installed on the backend silently never resolve. A request for a model outside the
|
||||
effective set returns a generic **403** (same response as "doesn't exist" — no enumeration).
|
||||
|
||||
### Toggle `allow_all_models`
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway set-models --tenant acme --allow-all # opt in
|
||||
docker compose exec gateway neuronetz-gateway set-models --tenant acme --no-allow-all # back to allowlist
|
||||
```
|
||||
|
||||
With `allow_all_models` on, the effective set **is** the live discovered set — any model
|
||||
pulled into Ollama becomes usable on the next discovery refresh, with no further config
|
||||
change. This is an audited convenience; prefer explicit allowlists for untrusted tenants
|
||||
(see [`THREAT_MODEL.md`](THREAT_MODEL.md)).
|
||||
|
||||
### Inspect discovery and effective sets
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway list-models # live-discovered models
|
||||
docker compose exec gateway neuronetz-gateway list-models --tenant acme # + that tenant's effective set
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
docker compose exec gateway neuronetz-gateway show-usage --tenant acme --period day
|
||||
# prints: requests=… tokens_in=… tokens_out=… (period: day|month|total)
|
||||
```
|
||||
|
||||
For per-key forensics and finer slicing, query `gateway.audit_log` directly (it records
|
||||
`request_id`, `key_prefix`, `model`, `tokens_in/out`, `status`, `latency_ms`, `client_ip`).
|
||||
|
||||
---
|
||||
|
||||
## How model discovery refresh works (SPEC §4.6)
|
||||
|
||||
- A background task polls Ollama `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds and
|
||||
caches the result in Redis (`gateway:models:discovered`, TTL `MODEL_DISCOVERY_CACHE_TTL_S`)
|
||||
plus an in-process copy for hot reads.
|
||||
- A model pulled into Ollama out-of-band appears in `allow_all_models` tenants' effective sets
|
||||
within one refresh interval — no config change.
|
||||
- Discovery is **read-only** and uses only the allowlisted `/api/tags` endpoint; it never
|
||||
triggers a pull.
|
||||
- To force a faster pickup, lower `MODEL_DISCOVERY_REFRESH_S` (the demo uses 15 s).
|
||||
|
||||
---
|
||||
|
||||
## Fail-closed behaviors to expect
|
||||
|
||||
| Symptom | Cause | Correct behavior |
|
||||
|---|---|---|
|
||||
| `503` on every request | Redis or Postgres-read down | Fail-closed — rate-limit/budget/auth can't be checked, so deny. Restore the backend. |
|
||||
| `502` with retry-after | Ollama unreachable | Circuit breaker opens after 5 consecutive failures, half-opens after 30 s. Check the backend / `OLLAMA_BASE_URL`. |
|
||||
| `403` for a model you "know" exists | Model not in the tenant's effective set, **or** discovery cache empty/expired | Check `list-models --tenant …`; verify the backend is reachable and the model is installed. Empty discovery = deny by design. |
|
||||
| `429` with `Retry-After` | Rate limit or budget exhausted | Inspect headers (`X-RateLimit-*`, `X-Budget-*`); raise limits/budget or wait. |
|
||||
| `401` immediately after revoke | The key was revoked | Working as intended — revocation propagated via NOTIFY → Redis eviction. |
|
||||
|
||||
`/readyz` returns `503` when **any** dependency (DB, Redis, Ollama) is unreachable; use it as
|
||||
the load-balancer health gate. `/healthz` only checks process liveness.
|
||||
|
||||
---
|
||||
|
||||
## Logs, metrics, audit
|
||||
|
||||
- **Logs:** structured (structlog), JSON in production, to stdout. Keys/secrets are never
|
||||
logged.
|
||||
- **Metrics:** Prometheus at `/metrics` (loopback only): `gateway_requests_total`,
|
||||
`gateway_tokens_total`, `gateway_request_duration_seconds`, labelled by `tenant` and
|
||||
`model` (never `key_id`).
|
||||
- **Audit log:** always-on in `gateway.audit_log`. **Prompt log** is opt-in per key and TTL'd
|
||||
(`PROMPT_LOG_DEFAULT_RETENTION_DAYS`); a sweeper enforces retention.
|
||||
113
docs/PLAYGROUND.md
Normal file
113
docs/PLAYGROUND.md
Normal file
@@ -0,0 +1,113 @@
|
||||
# neuronetz-gateway — Demo & Playground
|
||||
|
||||
The fastest way to see the gateway working end-to-end, with **no GPU and no model downloads**.
|
||||
`./demo.sh` brings up the gateway against a mock Ollama backend, mints a demo API key, and
|
||||
prints ready-to-paste curl commands and a link to an interactive browser playground.
|
||||
|
||||
---
|
||||
|
||||
## Launch the demo
|
||||
|
||||
From the repo root:
|
||||
|
||||
```bash
|
||||
./demo.sh
|
||||
```
|
||||
|
||||
This will:
|
||||
|
||||
1. Build and start the demo stack (`docker-compose.demo.yml`): **postgres + redis +
|
||||
mock-ollama + gateway**. No Caddy; the gateway is published on `127.0.0.1:8080`.
|
||||
2. Wait for the gateway to report healthy at `/healthz`.
|
||||
3. Create a demo tenant (`--allow-all-models`) and an API key via the bootstrap CLI **inside
|
||||
the gateway container**, capturing the key (which is printed exactly once).
|
||||
4. Print a summary: the **API key**, the **playground URL**
|
||||
`http://localhost:8080/playground`, and five ready-to-paste curl commands —
|
||||
- streaming `/v1/chat/completions` (OpenAI SSE),
|
||||
- streaming `/api/chat` (native NDJSON),
|
||||
- `GET /v1/models`,
|
||||
- a **401** example (no/bad key),
|
||||
- a **403** example (`POST /api/pull`, hard-blocked).
|
||||
|
||||
The script is **re-runnable**: an existing tenant is reused, and each run mints a fresh,
|
||||
uniquely-named key (the full key only ever prints at creation).
|
||||
|
||||
Tear everything down (containers + volumes):
|
||||
|
||||
```bash
|
||||
./demo.sh --down
|
||||
```
|
||||
|
||||
### What's running
|
||||
|
||||
| Service | Exposed? | Notes |
|
||||
|---|---|---|
|
||||
| `gateway` | `127.0.0.1:8080` | The real gateway image, built from the repo `Dockerfile`. |
|
||||
| `mock-ollama` | **no** | Internal network only — mirrors the prod "Ollama is never exposed" rule. |
|
||||
| `postgres` | **no** | Internal only. |
|
||||
| `redis` | **no** | Internal only. |
|
||||
|
||||
The mock backend (`demo/mock-ollama/`) emulates Ollama's API shapes — including realistic
|
||||
`prompt_eval_count` / `eval_count` on the final stream object — so token counting, model
|
||||
discovery, and `/api/show` sanitization all exercise real gateway code paths. It serves a
|
||||
small catalogue: `llama3.1:8b`, `mistral:7b`, `qwen2.5:3b`, `nomic-embed-text`.
|
||||
|
||||
---
|
||||
|
||||
## Use the playground
|
||||
|
||||
Open **http://localhost:8080/playground** in a browser. It is a single self-contained HTML
|
||||
page, served **same-origin** by the gateway (so no CORS to worry about).
|
||||
|
||||
1. **Base URL** is pre-filled with the current origin; leave it as is for the demo.
|
||||
2. Paste the **API key** from the `./demo.sh` output into the Bearer field. (Typing a key
|
||||
auto-loads the model dropdown; you can also hit **↻ Refresh**.)
|
||||
3. Pick an **endpoint** tab: `/v1/chat/completions`, `/api/chat`, `/api/generate`,
|
||||
`/v1/models`, `/api/tags`, `/healthz`, `/readyz`.
|
||||
4. Choose a **model** from the auto-populated dropdown, type a prompt, toggle **stream**.
|
||||
5. Hit **▶ Run**. The streamed output renders **live** — SSE `data:` deltas (incl. `[DONE]`)
|
||||
for `/v1/*`, NDJSON lines for `/api/*`.
|
||||
6. The panel shows the **response status** and the rate-limit / budget **response headers**
|
||||
(`X-Request-ID`, `X-RateLimit-*`, `X-Budget-*`; SPEC §6.5).
|
||||
7. The **Exact curl** box mirrors precisely what **Run** sends — copy it to reproduce in a
|
||||
terminal.
|
||||
|
||||
Try the 403 path too: there's no mutating-endpoint tab by design, but the printed `curl` for
|
||||
`POST /api/pull` shows the hard block, and an invalid key in the Bearer field demonstrates the
|
||||
401 fail-closed response.
|
||||
|
||||
---
|
||||
|
||||
## ⚠️ Security note: the playground is OFF by default in production
|
||||
|
||||
The playground route is **flag-gated** and **disabled by default**. The demo stack turns it on
|
||||
explicitly:
|
||||
|
||||
```yaml
|
||||
# docker-compose.demo.yml (gateway service)
|
||||
GATEWAY_PLAYGROUND_ENABLED: "true"
|
||||
GATEWAY_PLAYGROUND_FILE: /app/playground/index.html
|
||||
```
|
||||
|
||||
with the file mounted read-only into the container:
|
||||
|
||||
```yaml
|
||||
volumes:
|
||||
- ./playground:/app/playground:ro
|
||||
```
|
||||
|
||||
The production stack (`docker-compose.yml`) does **not** set `GATEWAY_PLAYGROUND_ENABLED`, so
|
||||
the route is absent. Do not enable it on a public deployment: it is a convenience for demos and
|
||||
local development, not a production surface. Leaving it off keeps the public attack surface to
|
||||
the documented API only.
|
||||
|
||||
---
|
||||
|
||||
## Files behind the demo
|
||||
|
||||
| Path | What it is |
|
||||
|---|---|
|
||||
| `demo.sh` | The one-command entrypoint (up / `--down`). |
|
||||
| `docker-compose.demo.yml` | The demo stack definition. |
|
||||
| `demo/mock-ollama/` | The standalone mock Ollama service (FastAPI app + Dockerfile). |
|
||||
| `playground/index.html` | The self-contained browser playground served at `/playground`. |
|
||||
77
docs/THREAT_MODEL.md
Normal file
77
docs/THREAT_MODEL.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# neuronetz-gateway — Threat Model
|
||||
|
||||
From [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §3. The governing principle, in one line:
|
||||
|
||||
> **Fail closed, always.** If a security or budgeting check cannot be performed (Redis down,
|
||||
> DB unreachable, ambiguous state), **deny** the request. Never default to allow.
|
||||
> (AGENT_PROMPT non-negotiable #1.)
|
||||
|
||||
The gateway exists because the Ollama instance at `api.neuronetz.ai` was exposed without
|
||||
authentication — a standing security incident. Every defense below traces back to closing
|
||||
that gap and keeping it closed.
|
||||
|
||||
---
|
||||
|
||||
## Threats & mitigations (SPEC §3)
|
||||
|
||||
| Threat | Mitigation |
|
||||
|---|---|
|
||||
| Internet scanners hitting Ollama directly | Ollama bound to the internal Docker network; **never published**. No `ports:` mapping in any shipped compose file. |
|
||||
| Unauthenticated API abuse | Mandatory Bearer token; **fail-closed** on auth errors (401). |
|
||||
| API key brute force | Argon2id hashing; constant-time compare; rate limit on auth failures per source IP (`AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN`). |
|
||||
| GPU/token exhaustion (cost attack) | Per-key tokens-per-minute (TPM) limit + token budget; per-tenant ceiling; concurrent-connection cap. |
|
||||
| Resource exhaustion via large payloads | Request body size limit (default 256 KiB); `num_predict` cap (default 4096). |
|
||||
| Model enumeration / training-data exfil via uncommon models | Model allowlist, **default-deny**. Discovery only exposes models actually installed; `/api/tags` and `/v1/models` never reveal models outside the tenant's effective set; "not allowed" and "doesn't exist" return the **same** generic response. |
|
||||
| Discovery backend unreachable | **Fail-closed:** an empty/stale-expired discovered set means no model resolves, so requests are denied — never "allow because we couldn't list models." |
|
||||
| Ollama mutation (model pull/delete) by attacker | Endpoint allowlist; mutating endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) **hard-blocked** at the gateway, not configurable. |
|
||||
| Information disclosure via error messages | Upstream errors **sanitized** at the boundary; Ollama internals never proxied to the client. Each error carries an `X-Request-ID` for correlation. |
|
||||
| Audit log tampering | Append-only at the app layer; DB role separation; optional WAL archiving. |
|
||||
| Prompt data leakage | Prompt logging **off by default**; opt-in per key; TTL'd retention; redaction hook. |
|
||||
| Redis outage causing "fail open" | **Fail-closed:** if the rate-limit/budget backend is unavailable, deny (503), not allow. |
|
||||
| Compromised admin token | There is **no admin endpoint** in the gateway. Admin lives in `neuronetz-console`; the gateway has nothing to compromise here. |
|
||||
|
||||
---
|
||||
|
||||
## Notes on selected defenses
|
||||
|
||||
### `allow_all_models` is an audited opt-in
|
||||
|
||||
`allow_all_models` lets a tenant use any currently-installed model, so models newly pulled
|
||||
into Ollama are auto-granted on the next discovery refresh. This is convenient but widens the
|
||||
attack surface for *that tenant*, so it is:
|
||||
|
||||
- **opt-in per tenant** (default `false`), set explicitly via the CLI
|
||||
(`create-tenant --allow-all-models` or `set-models --allow-all`);
|
||||
- **overridable per key** — a non-`NULL` key-level `allow_all_models` overrides the tenant
|
||||
flag; otherwise the tenant flag applies (SPEC §13.7);
|
||||
- **audited** — every request records the model used in `gateway.audit_log`.
|
||||
|
||||
Default-deny tenants instead see only `allowed_models ∩ discovered`. Either way the effective
|
||||
set is always intersected with the *live* discovered set, so stale or typo'd allowlist entries
|
||||
never resolve.
|
||||
|
||||
### No existence disclosure
|
||||
|
||||
A model that is installed-but-unpermitted and a model that is not installed both return the
|
||||
**same** generic `403`. An attacker cannot use the gateway to enumerate which models exist on
|
||||
the backend (SPEC §13.6).
|
||||
|
||||
### Sanitized errors + request IDs
|
||||
|
||||
Clients never receive Ollama's error text, stack traces, or internal hostnames. Errors are
|
||||
mapped to generic `4xx`/`5xx` JSON with a `request_id`. Operators correlate that ID with the
|
||||
audit log to investigate without leaking internals to callers (SPEC §4.3 step 14).
|
||||
|
||||
### Streaming integrity is also a safety property
|
||||
|
||||
Token counting and audit writes happen **after** stream close, never on the hot path. This
|
||||
keeps time-to-first-byte honest and ensures budget decrements and audit rows reflect the true
|
||||
final token counts reported by Ollama (`prompt_eval_count` + `eval_count`), not estimates.
|
||||
|
||||
---
|
||||
|
||||
## Out of scope (v0.1.0)
|
||||
|
||||
Documented as future work, **not** mitigations present today: content moderation /
|
||||
prompt-injection filtering, response caching, multi-backend routing, billing, SSO/OAuth2 for
|
||||
admin, and any web admin UI (that lives in `neuronetz-console`).
|
||||
40
mkdocs.yml
Normal file
40
mkdocs.yml
Normal file
@@ -0,0 +1,40 @@
|
||||
# mkdocs configuration for the neuronetz-gateway documentation.
|
||||
#
|
||||
# pip install mkdocs-material
|
||||
# mkdocs serve # live preview at http://127.0.0.1:8000
|
||||
# mkdocs build # static site into ./site
|
||||
#
|
||||
# Docs live in docs/. This wires them into a single Material-themed site.
|
||||
site_name: neuronetz-gateway
|
||||
site_description: Secure, multi-tenant API gateway in front of Ollama.
|
||||
docs_dir: docs
|
||||
|
||||
theme:
|
||||
name: material
|
||||
palette:
|
||||
- scheme: slate
|
||||
primary: indigo
|
||||
accent: indigo
|
||||
features:
|
||||
- navigation.sections
|
||||
- navigation.top
|
||||
- content.code.copy
|
||||
- content.code.annotate
|
||||
|
||||
markdown_extensions:
|
||||
- admonition
|
||||
- tables
|
||||
- toc:
|
||||
permalink: true
|
||||
- pymdownx.highlight:
|
||||
anchor_linenums: true
|
||||
- pymdownx.superfences
|
||||
- pymdownx.inlinehilite
|
||||
|
||||
nav:
|
||||
- Architecture: ARCHITECTURE.md
|
||||
- API Reference: API.md
|
||||
- Deployment: DEPLOYMENT.md
|
||||
- Threat Model: THREAT_MODEL.md
|
||||
- Operations Runbook: OPERATIONS.md
|
||||
- Demo & Playground: PLAYGROUND.md
|
||||
716
playground/index.html
Normal file
716
playground/index.html
Normal file
@@ -0,0 +1,716 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<title>neuronetz-gateway · playground</title>
|
||||
<style>
|
||||
:root {
|
||||
--bg: #0a0e16;
|
||||
--bg-2: #10151f;
|
||||
--panel: #141b27;
|
||||
--panel-2: #1a2333;
|
||||
--border: #243047;
|
||||
--text: #e6edf6;
|
||||
--muted: #8b9bb4;
|
||||
--accent: #4f8cff;
|
||||
--accent-2: #7c5cff;
|
||||
--good: #3fcf8e;
|
||||
--warn: #f0b429;
|
||||
--bad: #ff5d6c;
|
||||
--mono: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace;
|
||||
--sans: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
|
||||
}
|
||||
* { box-sizing: border-box; }
|
||||
html, body { margin: 0; height: 100%; }
|
||||
body {
|
||||
background:
|
||||
radial-gradient(1200px 600px at 80% -10%, rgba(124,92,255,.10), transparent 60%),
|
||||
radial-gradient(900px 500px at -10% 110%, rgba(79,140,255,.10), transparent 55%),
|
||||
var(--bg);
|
||||
color: var(--text);
|
||||
font-family: var(--sans);
|
||||
font-size: 14px;
|
||||
line-height: 1.5;
|
||||
-webkit-font-smoothing: antialiased;
|
||||
}
|
||||
a { color: var(--accent); }
|
||||
header {
|
||||
display: flex; align-items: center; gap: 14px;
|
||||
padding: 18px 26px;
|
||||
border-bottom: 1px solid var(--border);
|
||||
background: linear-gradient(180deg, rgba(255,255,255,.02), transparent);
|
||||
position: sticky; top: 0; z-index: 5;
|
||||
backdrop-filter: blur(6px);
|
||||
}
|
||||
.logo {
|
||||
width: 34px; height: 34px; border-radius: 9px;
|
||||
background: linear-gradient(135deg, var(--accent), var(--accent-2));
|
||||
display: grid; place-items: center;
|
||||
font-weight: 800; color: #fff; letter-spacing: -1px;
|
||||
box-shadow: 0 6px 20px rgba(79,140,255,.35);
|
||||
}
|
||||
header h1 { font-size: 16px; margin: 0; font-weight: 700; letter-spacing: .2px; }
|
||||
header .sub { color: var(--muted); font-size: 12px; }
|
||||
.grow { flex: 1; }
|
||||
.pill {
|
||||
font-size: 11px; color: var(--muted);
|
||||
border: 1px solid var(--border); border-radius: 999px;
|
||||
padding: 4px 10px; font-family: var(--mono);
|
||||
}
|
||||
|
||||
main {
|
||||
display: grid;
|
||||
grid-template-columns: 380px 1fr;
|
||||
gap: 18px;
|
||||
padding: 18px 26px 40px;
|
||||
max-width: 1400px; margin: 0 auto;
|
||||
}
|
||||
@media (max-width: 920px) { main { grid-template-columns: 1fr; } }
|
||||
|
||||
.panel {
|
||||
background: var(--panel);
|
||||
border: 1px solid var(--border);
|
||||
border-radius: 14px;
|
||||
padding: 16px;
|
||||
}
|
||||
.panel h2 {
|
||||
font-size: 12px; text-transform: uppercase; letter-spacing: .12em;
|
||||
color: var(--muted); margin: 0 0 12px;
|
||||
}
|
||||
label { display: block; font-size: 12px; color: var(--muted); margin: 12px 0 5px; }
|
||||
label:first-of-type { margin-top: 0; }
|
||||
input, select, textarea {
|
||||
width: 100%; background: var(--bg-2); color: var(--text);
|
||||
border: 1px solid var(--border); border-radius: 9px;
|
||||
padding: 9px 11px; font-size: 13px; font-family: var(--sans);
|
||||
outline: none; transition: border-color .15s, box-shadow .15s;
|
||||
}
|
||||
input:focus, select:focus, textarea:focus {
|
||||
border-color: var(--accent);
|
||||
box-shadow: 0 0 0 3px rgba(79,140,255,.18);
|
||||
}
|
||||
textarea { resize: vertical; min-height: 90px; font-family: var(--mono); font-size: 12.5px; }
|
||||
.row { display: flex; gap: 8px; }
|
||||
.row > * { flex: 1; }
|
||||
.inline { display: flex; align-items: center; gap: 8px; }
|
||||
.inline input[type=checkbox] { width: auto; }
|
||||
|
||||
.tabs { display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 8px; }
|
||||
.tab {
|
||||
font-family: var(--mono); font-size: 11.5px;
|
||||
padding: 6px 10px; border-radius: 8px; cursor: pointer;
|
||||
border: 1px solid var(--border); background: var(--bg-2); color: var(--muted);
|
||||
transition: all .12s;
|
||||
}
|
||||
.tab:hover { color: var(--text); border-color: #34425f; }
|
||||
.tab.active {
|
||||
color: #fff; border-color: transparent;
|
||||
background: linear-gradient(135deg, var(--accent), var(--accent-2));
|
||||
}
|
||||
|
||||
button.run {
|
||||
margin-top: 14px; width: 100%;
|
||||
background: linear-gradient(135deg, var(--accent), var(--accent-2));
|
||||
color: #fff; border: none; border-radius: 10px;
|
||||
padding: 12px; font-size: 14px; font-weight: 700; cursor: pointer;
|
||||
box-shadow: 0 8px 22px rgba(79,140,255,.3);
|
||||
transition: transform .08s, filter .15s;
|
||||
}
|
||||
button.run:hover { filter: brightness(1.07); }
|
||||
button.run:active { transform: translateY(1px); }
|
||||
button.run:disabled { filter: grayscale(.6) brightness(.8); cursor: progress; }
|
||||
|
||||
.ghost {
|
||||
background: var(--panel-2); color: var(--muted);
|
||||
border: 1px solid var(--border); border-radius: 8px;
|
||||
padding: 7px 10px; font-size: 12px; cursor: pointer; transition: all .12s;
|
||||
}
|
||||
.ghost:hover { color: var(--text); border-color: #34425f; }
|
||||
|
||||
.field-with-btn { display: flex; gap: 8px; align-items: stretch; }
|
||||
.field-with-btn select { flex: 1; }
|
||||
|
||||
.out-head { display: flex; align-items: center; gap: 10px; margin-bottom: 10px; }
|
||||
.status {
|
||||
font-family: var(--mono); font-size: 12px; padding: 3px 9px; border-radius: 7px;
|
||||
border: 1px solid var(--border); color: var(--muted);
|
||||
}
|
||||
.status.s2 { color: var(--good); border-color: rgba(63,207,142,.4); background: rgba(63,207,142,.08); }
|
||||
.status.s4 { color: var(--warn); border-color: rgba(240,180,41,.4); background: rgba(240,180,41,.08); }
|
||||
.status.s5 { color: var(--bad); border-color: rgba(255,93,108,.4); background: rgba(255,93,108,.08); }
|
||||
|
||||
pre, .codebox {
|
||||
background: #0b0f17; border: 1px solid var(--border); border-radius: 10px;
|
||||
padding: 13px; font-family: var(--mono); font-size: 12.5px;
|
||||
white-space: pre-wrap; word-break: break-word; margin: 0;
|
||||
max-height: 460px; overflow: auto;
|
||||
}
|
||||
.codebox.curl { color: #c9d6ea; }
|
||||
.out-body { min-height: 120px; }
|
||||
|
||||
.headers {
|
||||
margin-top: 12px; font-family: var(--mono); font-size: 11.5px;
|
||||
border: 1px solid var(--border); border-radius: 10px; overflow: hidden;
|
||||
}
|
||||
.headers .hrow { display: flex; border-top: 1px solid var(--border); }
|
||||
.headers .hrow:first-child { border-top: none; }
|
||||
.headers .hk { width: 46%; padding: 6px 10px; color: var(--muted); background: var(--bg-2); }
|
||||
.headers .hv { flex: 1; padding: 6px 10px; color: var(--text); word-break: break-all; }
|
||||
|
||||
.section-title {
|
||||
display: flex; align-items: center; justify-content: space-between; margin: 0 0 8px;
|
||||
}
|
||||
.section-title .copy { font-size: 11px; }
|
||||
.hint { color: var(--muted); font-size: 11.5px; margin-top: 6px; }
|
||||
.stack { display: grid; gap: 16px; }
|
||||
|
||||
/* "About this endpoint" panel */
|
||||
.ep-head { display: flex; align-items: center; gap: 8px; flex-wrap: wrap; margin-bottom: 8px; }
|
||||
#endpointInfo h2 { font-family: ui-monospace, "JetBrains Mono", "Fira Code", monospace; font-size: 14px; letter-spacing: 0.2px; }
|
||||
.summary { margin: 4px 0 12px; color: var(--text); font-size: 13.5px; line-height: 1.55; }
|
||||
.sub-title { margin: 10px 0 6px; color: var(--muted); font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.8px; }
|
||||
.codebox.sample { max-height: 200px; overflow: auto; font-size: 11.5px; color: #c9d6ea; }
|
||||
.badge {
|
||||
font-size: 10.5px; text-transform: uppercase; letter-spacing: 0.6px;
|
||||
padding: 2px 7px; border-radius: 999px; border: 1px solid var(--border);
|
||||
color: var(--muted); background: var(--bg-2);
|
||||
}
|
||||
.badge-post { color: #ffb84a; border-color: rgba(255,184,74,.35); background: rgba(255,184,74,.08); }
|
||||
.badge-get { color: #5fc8ff; border-color: rgba(95,200,255,.35); background: rgba(95,200,255,.08); }
|
||||
.badge-auth { color: #c9b6ff; border-color: rgba(201,182,255,.35); background: rgba(201,182,255,.08); }
|
||||
.badge-open { color: #3fcf8e; border-color: rgba(63,207,142,.35); background: rgba(63,207,142,.08); }
|
||||
.blink { animation: blink 1s steps(2,start) infinite; }
|
||||
@keyframes blink { to { opacity: .25; } }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<div class="logo">N</div>
|
||||
<div>
|
||||
<h1>neuronetz-gateway <span class="sub">· playground</span></h1>
|
||||
<div class="sub">Authenticated, rate-limited, audited access to the model backend</div>
|
||||
</div>
|
||||
<div class="grow"></div>
|
||||
<div class="pill" id="originPill">same-origin</div>
|
||||
</header>
|
||||
|
||||
<main>
|
||||
<!-- ── Left: request builder ─────────────────────────────────────────── -->
|
||||
<section class="panel">
|
||||
<h2>Request</h2>
|
||||
|
||||
<label for="baseUrl">Base URL</label>
|
||||
<div class="field-with-btn">
|
||||
<input id="baseUrl" type="text" spellcheck="false" autocomplete="off" autocapitalize="off" autocorrect="off" />
|
||||
<button class="ghost" id="resetBase" title="Reset Base URL to this page's origin">⟳ This origin</button>
|
||||
</div>
|
||||
|
||||
<label for="apiKey">API key (Bearer)</label>
|
||||
<input id="apiKey" type="password" placeholder="nz_…" spellcheck="false" autocomplete="off" />
|
||||
<div class="hint" id="keyHint">Created by <code>./demo.sh</code> and printed once in your terminal.</div>
|
||||
|
||||
<label>Endpoint</label>
|
||||
<div class="tabs" id="tabs"></div>
|
||||
|
||||
<div id="modelWrap">
|
||||
<label for="model">Model</label>
|
||||
<div class="field-with-btn">
|
||||
<select id="model"><option value="">— enter a key, then refresh —</option></select>
|
||||
<button class="ghost" id="refreshModels" title="Load /v1/models with your key">↻ Refresh</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="promptWrap">
|
||||
<label for="prompt" id="promptLabel">Prompt</label>
|
||||
<textarea id="prompt" spellcheck="false">Say hello in one sentence.</textarea>
|
||||
<label class="inline" id="streamWrap" style="margin-top:10px">
|
||||
<input id="stream" type="checkbox" checked /> Stream the response
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<button class="run" id="run">▶ Run</button>
|
||||
<div class="hint" id="methodHint"></div>
|
||||
</section>
|
||||
|
||||
<!-- ── Right: about + response + curl ────────────────────────────────── -->
|
||||
<div class="stack">
|
||||
<section class="panel" id="endpointInfo">
|
||||
<div class="ep-head">
|
||||
<h2 id="epTitle" style="margin:0">POST /v1/chat/completions</h2>
|
||||
<div class="grow"></div>
|
||||
<span class="badge" id="epMethod">POST</span>
|
||||
<span class="badge" id="epAuth">auth: bearer</span>
|
||||
<span class="badge" id="epStream">streams · SSE</span>
|
||||
</div>
|
||||
<p class="summary" id="epSummary"></p>
|
||||
<div class="sub-title">Sample request body</div>
|
||||
<pre class="codebox sample" id="epSampleReq"></pre>
|
||||
<div class="sub-title">Sample response</div>
|
||||
<pre class="codebox sample" id="epSampleResp"></pre>
|
||||
<div class="hint" id="epNote"></div>
|
||||
</section>
|
||||
|
||||
<section class="panel">
|
||||
<div class="out-head">
|
||||
<h2 style="margin:0">Response</h2>
|
||||
<div class="grow"></div>
|
||||
<span class="status" id="status">idle</span>
|
||||
</div>
|
||||
<pre class="codebox out-body" id="output">Run a request to see the response stream here.</pre>
|
||||
<div class="headers" id="headers" style="display:none"></div>
|
||||
</section>
|
||||
|
||||
<section class="panel">
|
||||
<div class="section-title">
|
||||
<h2 style="margin:0">Exact curl</h2>
|
||||
<button class="ghost copy" id="copyCurl">Copy</button>
|
||||
</div>
|
||||
<pre class="codebox curl" id="curl"></pre>
|
||||
<div class="hint">This is exactly what <b>Run</b> sends — paste it into a terminal to reproduce.</div>
|
||||
</section>
|
||||
</div>
|
||||
</main>
|
||||
|
||||
<script>
|
||||
"use strict";
|
||||
|
||||
// ── Endpoint catalogue ──────────────────────────────────────────────────
|
||||
// Each endpoint knows its method, format, body shape, and how to render itself
|
||||
// in the "About this endpoint" panel: summary, sample request, sample response,
|
||||
// and an optional note. Mirrors SPEC §6.
|
||||
const ENDPOINTS = {
|
||||
"/v1/chat/completions": {
|
||||
method: "POST", canStream: true, format: "sse", needsModel: true, needsPrompt: true,
|
||||
summary: "OpenAI-compatible Chat Completions — a drop-in replacement for OpenAI's endpoint. Point any OpenAI SDK at this gateway's base URL with your nz_ key and existing client code works unchanged. Streaming uses Server-Sent Events terminated by `data: [DONE]`.",
|
||||
body: (s) => ({ model: s.model, stream: s.stream, messages: [{ role: "user", content: s.prompt }] }),
|
||||
sampleRequest: { model: "llama3.1:8b", stream: true, messages: [{ role: "user", content: "Say hello in one sentence." }] },
|
||||
sampleResponse:
|
||||
`data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","created":1779492441,"model":"llama3.1:8b","choices":[{"index":0,"delta":{"content":"Echo:"},"finish_reason":null}]}
|
||||
|
||||
data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","created":1779492441,"model":"llama3.1:8b","choices":[{"index":0,"delta":{"content":" hi"},"finish_reason":null}]}
|
||||
|
||||
data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":1,"completion_tokens":2,"total_tokens":3}}
|
||||
|
||||
data: [DONE]`,
|
||||
note: "Non-streaming (`stream: false`) returns one `chat.completion` JSON object — same shape as OpenAI.",
|
||||
},
|
||||
"/api/chat": {
|
||||
method: "POST", canStream: true, format: "ndjson", needsModel: true, needsPrompt: true,
|
||||
summary: "Native Ollama chat. Streams NDJSON — one JSON object per line; the final object carries `prompt_eval_count` + `eval_count` for exact token accounting in the audit log.",
|
||||
body: (s) => ({ model: s.model, stream: s.stream, messages: [{ role: "user", content: s.prompt }] }),
|
||||
sampleRequest: { model: "llama3.1:8b", stream: true, messages: [{ role: "user", content: "Say hello in one sentence." }] },
|
||||
sampleResponse:
|
||||
`{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":"Echo:"},"done":false}
|
||||
{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":" hi"},"done":false}
|
||||
{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":1,"eval_count":2,"total_duration":12345678}`,
|
||||
note: "Errors are sanitized but every response carries an X-Request-ID; upstream internals never leak.",
|
||||
},
|
||||
"/api/generate": {
|
||||
method: "POST", canStream: true, format: "ndjson", needsModel: true, needsPrompt: true,
|
||||
summary: "Native Ollama text generation. Takes a plain `prompt` string (no chat message structure) and streams NDJSON `response` chunks plus a final done frame with token counts.",
|
||||
body: (s) => ({ model: s.model, stream: s.stream, prompt: s.prompt }),
|
||||
sampleRequest: { model: "mistral:7b", stream: true, prompt: "Say hello in one sentence." },
|
||||
sampleResponse:
|
||||
`{"model":"mistral:7b","created_at":"…","response":"Echo:","done":false}
|
||||
{"model":"mistral:7b","created_at":"…","response":" hi","done":false}
|
||||
{"model":"mistral:7b","created_at":"…","response":"","done":true,"prompt_eval_count":1,"eval_count":2}`,
|
||||
note: "Use this when you don't need chat-message structure; otherwise prefer `/api/chat` or `/v1/chat/completions`.",
|
||||
},
|
||||
"/v1/models": {
|
||||
method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false,
|
||||
summary: "Lists the tenant's effective model set in OpenAI format: (live-discovered ∩ allowed_models), or all discovered models when the tenant has allow_all_models enabled. There is no static list — discovery polls the Ollama backend in the background.",
|
||||
sampleRequest: null,
|
||||
sampleResponse:
|
||||
`{
|
||||
"object": "list",
|
||||
"data": [
|
||||
{"id": "llama3.1:8b", "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
|
||||
{"id": "mistral:7b", "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
|
||||
{"id": "qwen2.5:3b", "object": "model", "created": 1779492441, "owned_by": "neuronetz"},
|
||||
{"id": "nomic-embed-text", "object": "model", "created": 1779492441, "owned_by": "neuronetz"}
|
||||
]
|
||||
}`,
|
||||
note: "Refreshed automatically every MODEL_DISCOVERY_REFRESH_S (default 60s). Cached fail-closed.",
|
||||
},
|
||||
"/api/tags": {
|
||||
method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false,
|
||||
summary: "Native Ollama model list, filtered to the tenant's effective set. Same data as /v1/models but in Ollama's `models` shape — includes size, digest, modified_at, family and quantization details.",
|
||||
sampleRequest: null,
|
||||
sampleResponse:
|
||||
`{
|
||||
"models": [
|
||||
{
|
||||
"name": "llama3.1:8b",
|
||||
"model": "llama3.1:8b",
|
||||
"modified_at": "2026-04-01T12:00:00Z",
|
||||
"size": 4920624384,
|
||||
"digest": "sha256:…",
|
||||
"details": {"family": "llama", "parameter_size": "8B", "quantization_level": "Q4_K_M"}
|
||||
}
|
||||
]
|
||||
}`,
|
||||
note: "",
|
||||
},
|
||||
"/healthz": {
|
||||
method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false, noAuth: true,
|
||||
summary: "Liveness probe. Returns 200 as long as the gateway process can respond — does NOT check downstream dependencies. Safe for load-balancer health checks. No authentication required.",
|
||||
sampleRequest: null,
|
||||
sampleResponse: `{"status": "ok"}`,
|
||||
note: "",
|
||||
},
|
||||
"/readyz": {
|
||||
method: "GET", canStream: false, format: "json", needsModel: false, needsPrompt: false, noAuth: true,
|
||||
summary: "Readiness probe. Returns 200 only when Postgres + Redis + the Ollama backend are all reachable; 503 otherwise with which dependencies are down. No authentication required.",
|
||||
sampleRequest: null,
|
||||
sampleResponse:
|
||||
`# 200 OK
|
||||
{"status": "ready", "checks": {"postgres": true, "redis": true, "ollama": true}}
|
||||
|
||||
# 503 Service Unavailable
|
||||
{"status": "not_ready", "checks": {"postgres": true, "redis": true, "ollama": false}}`,
|
||||
note: "In this demo, /readyz will return 200 — the mock Ollama is reachable. In dev-only stacks without an Ollama backend, /readyz fails closed.",
|
||||
},
|
||||
};
|
||||
|
||||
// Lower-case names of the response headers this page singles out for display
// (SPEC §6.5), grouped by purpose.
const SURFACE_HEADERS = [
  // request correlation
  "x-request-id",
  // rate limiting: requests
  "x-ratelimit-limit-requests",
  "x-ratelimit-remaining-requests",
  // rate limiting: tokens
  "x-ratelimit-limit-tokens",
  "x-ratelimit-remaining-tokens",
  // budget
  "x-budget-period",
  "x-budget-tokens-remaining",
  // generic
  "retry-after",
  "content-type",
];

// Shorthand for document.getElementById.
const $ = (id) => {
  return document.getElementById(id);
};

// Path of the currently selected endpoint tab.
let current = "/v1/chat/completions";
|
||||
|
||||
// ── State helpers ───────────────────────────────────────────────────────
|
||||
/**
 * Snapshot the request-builder form as a plain object.
 *
 * Reads the Base URL, API key, model, prompt and stream controls through
 * `$` (getElementById) each time it is called; nothing is cached.
 *
 * @returns {{base: string, key: string, model: string, prompt: string, stream: boolean}}
 *   `base` has surrounding whitespace and any trailing slashes removed,
 *   `key` is trimmed, `prompt` is returned verbatim, and `stream` mirrors
 *   the checkbox state.
 */
function state() {
  // Trim BEFORE stripping slashes: a pasted value such as "http://host/ "
  // would otherwise keep both its trailing slash and the space (the regex is
  // anchored at the very end of the string). The API key is already trimmed
  // the same way.
  const base = $("baseUrl").value.trim().replace(/\/+$/, "");

  return {
    base,
    key: $("apiKey").value.trim(),
    model: $("model").value,
    prompt: $("prompt").value,
    stream: $("stream").checked,
  };
}
|
||||
|
||||
/**
 * Rebuild the endpoint tab strip.
 *
 * Empties the `#tabs` container and appends one `div.tab` per key of
 * ENDPOINTS, in key order; the tab whose path equals `current` also gets the
 * `active` class. Clicking a tab selects that path, then rebuilds the strip,
 * re-syncs the form and refreshes the curl box.
 */
function buildTabs() {
  const strip = $("tabs");
  strip.innerHTML = "";

  Object.keys(ENDPOINTS).forEach((route) => {
    const tab = document.createElement("div");
    tab.textContent = route;
    tab.className = route === current ? "tab active" : "tab";
    tab.onclick = () => {
      current = route;
      buildTabs();
      syncForm();
      updateCurl();
    };
    strip.appendChild(tab);
  });
}
|
||||
|
||||
/**
 * Bring the request form in line with the currently selected endpoint.
 *
 * Shows or hides the model, prompt and stream controls according to the
 * endpoint's `needsModel` / `needsPrompt` / `canStream` flags, relabels the
 * prompt field, writes the one-line method hint, then re-renders the
 * endpoint info panel and re-evaluates gating.
 */
function syncForm() {
  const { needsModel, needsPrompt, canStream, method, format, noAuth } = ENDPOINTS[current];

  // An empty display value falls back to the stylesheet; "none" hides the wrapper.
  const setVisible = (id, visible) => {
    $(id).style.display = visible ? "" : "none";
  };
  setVisible("modelWrap", needsModel);
  setVisible("promptWrap", needsPrompt);
  setVisible("streamWrap", canStream);

  // Only /api/generate takes a bare prompt; the other endpoints label it "Message".
  let promptCaption = "Message";
  if (current === "/api/generate") {
    promptCaption = "Prompt";
  }
  $("promptLabel").textContent = promptCaption;

  const formatName = format.toUpperCase();
  const transport = canStream ? `streams ${formatName}` : formatName;
  const authText = noAuth ? "no auth" : "requires Bearer";
  $("methodHint").textContent = `${method} · ${transport} · ${authText}`;

  renderEndpointInfo();
  refreshGating();
}
|
||||
|
||||
/**
 * Fill the "About this endpoint" panel from the selected endpoint's metadata:
 * title, method/auth/stream badges, summary, sample request and response, and
 * the optional footer note (hidden when the endpoint has none).
 */
function renderEndpointInfo() {
  const ep = ENDPOINTS[current];
  const setText = (id, text) => {
    $(id).textContent = text;
  };

  setText("epTitle", `${ep.method} ${current}`);

  const methodBadge = $("epMethod");
  methodBadge.textContent = ep.method;
  methodBadge.className = `badge badge-${ep.method.toLowerCase()}`;

  const authBadge = $("epAuth");
  if (ep.noAuth) {
    authBadge.textContent = "no auth";
    authBadge.className = "badge badge-open";
  } else {
    authBadge.textContent = "auth: bearer";
    authBadge.className = "badge badge-auth";
  }

  // The stream badge only exists for endpoints that can stream.
  const streamBadge = $("epStream");
  streamBadge.style.display = ep.canStream ? "" : "none";
  if (ep.canStream) {
    streamBadge.textContent = "streams · " + (ep.format === "sse" ? "SSE" : "NDJSON");
  }

  setText("epSummary", ep.summary);
  setText(
    "epSampleReq",
    ep.sampleRequest ? JSON.stringify(ep.sampleRequest, null, 2) : "(no request body — GET)",
  );
  setText("epSampleResp", ep.sampleResponse);

  const noteEl = $("epNote");
  if (!ep.note) {
    noteEl.style.display = "none";
    return;
  }
  noteEl.textContent = ep.note;
  noteEl.style.display = "";
}
|
||||
|
||||
/**
 * Enable or disable Run and Refresh based on whether an API key is present.
 *
 * Run is blocked only when the selected endpoint requires auth and the key
 * field is empty; Refresh is blocked whenever the key field is empty. Blocked
 * buttons are dimmed and get a not-allowed cursor, and the reason is shown in
 * the hint next to the API-key field rather than only in the right-hand pane.
 */
function refreshGating() {
  const ep = ENDPOINTS[current];
  const keyMissing = $("apiKey").value.trim().length === 0;
  const blocked = !ep.noAuth && keyMissing;

  const applyGate = (button, isOff) => {
    button.disabled = isOff;
    button.style.opacity = isOff ? "0.45" : "";
    button.style.cursor = isOff ? "not-allowed" : "";
  };
  applyGate($("run"), blocked);
  applyGate($("refreshModels"), keyMissing); // refresh always needs a key

  $("keyHint").innerHTML = blocked
    ? "⚠ <b style=\"color:#ffb84a\">Paste your API key above</b> to enable Run and Refresh. Get one by running <code>./demo.sh</code>."
    : "Created by <code>./demo.sh</code> and printed once in your terminal.";
}
|
||||
|
||||
// ── curl preview (must match exactly what Run sends) ────────────────────
|
||||
/**
 * Assemble the request for the selected endpoint from the current form state.
 *
 * Used by both the curl preview and Run so the two stay identical.
 *
 * @returns {{url: string, method: string, headers: Object<string, string>, body: (string|null), ep: Object}}
 *   `url` falls back to the page origin when the base field is empty. An
 *   Authorization header is added unless the endpoint is flagged `noAuth`
 *   (with the placeholder key "nz_YOUR_KEY" when the key field is empty).
 *   POST requests get a JSON Content-Type and a body serialized from
 *   `ep.body(state)`; other methods get a null body.
 */
function buildRequest() {
  const form = state();
  const ep = ENDPOINTS[current];
  const isPost = ep.method === "POST";

  const headers = {};
  if (!ep.noAuth) {
    headers["Authorization"] = "Bearer " + (form.key || "nz_YOUR_KEY");
  }
  if (isPost) {
    headers["Content-Type"] = "application/json";
  }

  return {
    url: (form.base || location.origin) + current,
    method: ep.method,
    headers,
    body: isPost ? JSON.stringify(ep.body(form)) : null,
    ep,
  };
}
|
||||
|
||||
/**
 * Render the curl command equivalent to what Run would send into the #curl box.
 *
 * Adds `-N` for a streaming POST and `-i` for GET; each header and the body
 * goes on its own backslash-continued line, shell-quoted where needed.
 */
function updateCurl() {
  const { url, method, headers, body, ep } = buildRequest();

  const flags = [];
  if (ep.canStream && state().stream && method === "POST") flags.push("-N");
  if (method === "GET") flags.push("-i");

  const headerArgs = Object.entries(headers).map(
    ([name, value]) => "\\\n -H " + shellQuote(name + ": " + value),
  );
  const bodyArgs = body ? ["\\\n -d " + shellQuote(body)] : [];

  const pieces = ["curl", ...flags, shellQuote(url), ...headerArgs, ...bodyArgs];
  $("curl").textContent = pieces.join(" ");
}
|
||||
|
||||
/**
 * Quote a string for pasting into a POSIX shell command line.
 *
 * @param {string} s - Raw argument text.
 * @returns {string} `s` unchanged when it consists only of letters, digits and
 *   `_ - : / . @`; otherwise `s` wrapped in single quotes, with every embedded
 *   single quote rewritten as `'\''`.
 */
function shellQuote(s) {
  const isBareWord = /^[A-Za-z0-9_\-:/.@]+$/.test(s);
  if (isBareWord) return s;
  const escaped = s.replace(/'/g, "'\\''");
  return `'${escaped}'`;
}
|
||||
|
||||
// ── Status + header rendering ───────────────────────────────────────────
|
||||
/**
 * Show a status line and colour it by status class.
 *
 * @param {string} text - Text to display in the #status element.
 * @param {number} [code] - Status code; when truthy, its first digit is
 *   appended as an extra class (e.g. 404 gives "status s4").
 */
function setStatus(text, code) {
  const statusEl = $("status");
  statusEl.textContent = text;

  let classes = "status";
  if (code) {
    classes += " s" + String(code)[0];
  }
  statusEl.className = classes;
}
|
||||
|
||||
/**
 * Show the response headers listed in SURFACE_HEADERS in the #headers box.
 *
 * Rows are built as DOM nodes and filled through `textContent`, so a header
 * value is always rendered as literal text and never parsed as markup, with
 * no dependence on a string escaper being correct.
 *
 * @param {Response} resp - Fetch response; only `resp.headers.get(name)` is used.
 *   Headers that are absent (null/undefined) are skipped. When none of the
 *   listed headers is present the box is hidden and left otherwise untouched.
 */
function renderHeaders(resp) {
  const box = $("headers");

  const present = [];
  for (const name of SURFACE_HEADERS) {
    const value = resp.headers.get(name);
    if (value != null) present.push([name, value]);
  }
  if (!present.length) {
    box.style.display = "none";
    return;
  }

  box.textContent = ""; // drop rows left over from the previous response
  for (const [name, value] of present) {
    const row = document.createElement("div");
    row.className = "hrow";

    const nameCell = document.createElement("div");
    nameCell.className = "hk";
    nameCell.textContent = name;

    const valueCell = document.createElement("div");
    valueCell.className = "hv";
    valueCell.textContent = value;

    row.appendChild(nameCell);
    row.appendChild(valueCell);
    box.appendChild(row);
  }
  box.style.display = "";
}
|
||||
|
||||
/**
 * Escape a value for safe insertion into HTML text or a quoted attribute.
 *
 * @param {*} s - Value to escape; coerced with String().
 * @returns {string} The text with `&`, `<`, `>`, `"` and `'` replaced by
 *   their HTML entities.
 */
function escapeHtml(s) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;",
  };
  return String(s).replace(/[&<>"']/g, (ch) => entities[ch]);
}
|
||||
|
||||
// ── Model dropdown population ───────────────────────────────────────────
|
||||
/**
 * Reload the model dropdown from GET /v1/models using the key in the form.
 *
 * Does nothing except print a hint when no API key is entered. While the
 * request is in flight the Refresh button is disabled and shows "…". On
 * success the dropdown is rebuilt from the `id` of each entry in the
 * response's `data` array (a missing or non-array `data`, or entries without
 * an id, are treated as "no models"), the previous selection is kept when it
 * is still offered, and the curl preview is refreshed. Failures are reported
 * in the output pane.
 *
 * On completion the button label is restored and its enabled state is handed
 * back to refreshGating(), so a key that was cleared while the request was in
 * flight does not leave the button enabled.
 */
async function refreshModels() {
  const s = state();
  if (!s.key) {
    setOutput("Enter an API key first, then refresh models.");
    return;
  }

  const sel = $("model");
  const btn = $("refreshModels");
  btn.disabled = true;
  btn.textContent = "…";

  try {
    const resp = await fetch((s.base || location.origin) + "/v1/models", {
      headers: { "Authorization": "Bearer " + s.key },
    });
    if (!resp.ok) {
      setOutput("Could not load models: HTTP " + resp.status);
      return;
    }

    const data = await resp.json();
    const list = data && Array.isArray(data.data) ? data.data : [];
    const names = list.map((m) => m && m.id).filter(Boolean);

    const prev = sel.value;
    sel.innerHTML = "";
    if (!names.length) {
      sel.innerHTML = '<option value="">(no models in your effective set)</option>';
    } else {
      for (const n of names) {
        const o = document.createElement("option");
        o.value = n;
        o.textContent = n;
        sel.appendChild(o);
      }
      if (names.includes(prev)) sel.value = prev;
    }
    updateCurl();
  } catch (e) {
    setOutput("Network error loading models: " + e.message);
  } finally {
    btn.textContent = "↻ Refresh";
    // Re-derive disabled/dimmed state from the key field instead of forcing "enabled".
    refreshGating();
  }
}
|
||||
|
||||
/** Replace the contents of the #output pane with `text`. */
function setOutput(text) {
  const pane = $("output");
  pane.textContent = text;
}
|
||||
/** Append `text` to whatever the #output pane already shows. */
function appendOutput(text) {
  const pane = $("output");
  pane.textContent = pane.textContent + text;
}
|
||||
|
||||
// ── Run ─────────────────────────────────────────────────────────────────
|
||||
// Re-entrancy guard: true while a request started by run() is still in flight.
let running = false;

/**
 * Send the request described by the form and render the result.
 *
 * Clears the previous status, output and header box, then fetches the request
 * built by buildRequest(). A successful response with a body is consumed as a
 * live stream when the endpoint can stream, the Stream box is ticked and the
 * method is POST; every other response is read in full and pretty-printed if
 * it is JSON. Any failure is shown as "network error" in the status line with
 * the message in the output pane.
 *
 * Request construction happens inside the try block so that an exception
 * there cannot leave `running` stuck at true. When the request ends, the Run
 * button's state is handed back to refreshGating() rather than forced to
 * "enabled", so the key gating still applies.
 */
async function run() {
  if (running) return;
  running = true;
  $("run").disabled = true;
  setStatus("connecting…");
  setOutput("");
  $("headers").style.display = "none";

  try {
    const r = buildRequest();
    const willStream = r.ep.canStream && state().stream && r.method === "POST";

    const resp = await fetch(r.url, { method: r.method, headers: r.headers, body: r.body });
    setStatus(resp.status + " " + resp.statusText, resp.status);
    renderHeaders(resp);

    if (willStream && resp.body && resp.ok) {
      await consumeStream(resp, r.ep.format);
    } else {
      const text = await resp.text();
      setOutput(prettyMaybeJson(text));
    }
  } catch (e) {
    setStatus("network error", 5);
    setOutput("Request failed: " + e.message + "\n\n(Check the Base URL and that the gateway is running.)");
  } finally {
    running = false;
    refreshGating();
  }
}
|
||||
|
||||
/**
 * Pretty-print a response body when it is JSON.
 *
 * @param {string} text - Raw response text.
 * @returns {string} The JSON re-serialized with 2-space indentation; if the
 *   text does not parse, the text itself, or "(empty response)" when it is empty.
 */
function prettyMaybeJson(text) {
  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch {
    return text || "(empty response)";
  }
  return JSON.stringify(parsed, null, 2);
}
|
||||
|
||||
/**
 * Read a streaming response body and render its text deltas live.
 *
 * Handles SSE (`data: {...}` events separated by a blank line, ending with
 * `data: [DONE]`) and NDJSON (one JSON object per line). Each complete
 * event/line is passed to handleEvent(); the accumulated text is shown in
 * #output followed by a block cursor until the stream ends.
 *
 * CRLF line endings are normalized to LF before splitting, so SSE events
 * delimited by "\r\n\r\n" are rendered as they arrive instead of only at the
 * end of the stream. The decoder is flushed once the stream is done so bytes
 * it was still holding are not dropped silently.
 *
 * @param {Response} resp - Fetch response whose `body` is a readable stream.
 * @param {string} format - "sse" for Server-Sent Events; anything else is
 *   treated as NDJSON.
 * @returns {Promise<void>} Resolves when the stream has been fully consumed.
 */
async function consumeStream(resp, format) {
  const reader = resp.body.getReader();
  const decoder = new TextDecoder();
  // SSE events are separated by blank lines; NDJSON by single newlines.
  const sep = format === "sse" ? "\n\n" : "\n";
  const cursor = "▌";
  let buffer = "";
  let acc = "";
  setOutput("");

  // Consume every complete event currently in the buffer; a partial tail stays put.
  const drain = () => {
    // A "\r" at the very end is kept until its "\n" arrives in the next chunk.
    buffer = buffer.replace(/\r\n/g, "\n");
    let idx;
    while ((idx = buffer.indexOf(sep)) >= 0) {
      const raw = buffer.slice(0, idx);
      buffer = buffer.slice(idx + sep.length);
      acc += handleEvent(raw, format);
      $("output").textContent = acc + cursor;
    }
  };

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });
    drain();
  }

  buffer += decoder.decode(); // flush any bytes the decoder was still holding
  drain();
  if (buffer.trim()) acc += handleEvent(buffer, format);
  $("output").textContent = acc || "(stream produced no text)";
}
|
||||
|
||||
/**
 * Extract the text delta carried by one streamed event or line.
 *
 * @param {string} raw - One SSE event (possibly several lines) or one NDJSON line.
 * @param {string} format - "sse" for Server-Sent Events; anything else is
 *   treated as NDJSON.
 * @returns {string} For SSE, the concatenated `choices[0].delta.content` of
 *   every `data:` line, skipping `[DONE]` and anything that does not parse.
 *   For NDJSON, `message.content` (/api/chat) or else `response`
 *   (/api/generate). An empty string when nothing usable is found.
 */
function handleEvent(raw, format) {
  if (format !== "sse") {
    // NDJSON: a single JSON object per line.
    const text = raw.trim();
    if (!text) return "";
    try {
      const frame = JSON.parse(text);
      if (frame.message && typeof frame.message.content === "string") {
        return frame.message.content; // /api/chat
      }
      if (typeof frame.response === "string") {
        return frame.response; // /api/generate
      }
    } catch {
      /* partial line */
    }
    return "";
  }

  const pieces = [];
  raw.split("\n").forEach((rawLine) => {
    const line = rawLine.trim();
    if (!line.startsWith("data:")) return;
    const payload = line.slice(5).trim();
    if (payload === "[DONE]") return;
    try {
      const chunk = JSON.parse(payload);
      const delta = chunk.choices && chunk.choices[0] && chunk.choices[0].delta;
      if (delta && typeof delta.content === "string") pieces.push(delta.content);
    } catch {
      /* ignore keep-alives / partial */
    }
  });
  return pieces.join("");
}
|
||||
|
||||
// ── Wiring ──────────────────────────────────────────────────────────────
|
||||
/**
 * Wire up the playground once the DOM is available: pin the Base URL to this
 * page's origin, build the tabs, form and curl preview, and attach all event
 * listeners.
 */
function init() {
  // Set the base URL to this page's origin. Browsers love to autofill text
  // inputs from history *after* the page scripts run, so we ALSO re-assert it on
  // the next microtask and again after a short delay — that wins against
  // chromium/firefox autofill, which can otherwise replace the value with a
  // stale entry like https://api.neuronetz.ai.
  const pinOrigin = () => {
    $("baseUrl").value = location.origin;
  };
  // The deferred re-pins also refresh the curl preview: if autofill changed the
  // field in the meantime, the preview may have been rendered from the stale value.
  const repinOrigin = () => {
    pinOrigin();
    updateCurl();
  };
  pinOrigin();
  $("originPill").textContent = location.origin;
  queueMicrotask(repinOrigin);
  setTimeout(repinOrigin, 250);

  buildTabs();
  syncForm();
  updateCurl();
  refreshGating();

  for (const id of ["baseUrl", "apiKey", "model", "prompt"]) {
    $(id).addEventListener("input", updateCurl);
  }
  $("apiKey").addEventListener("input", refreshGating);
  $("stream").addEventListener("change", updateCurl);
  $("run").addEventListener("click", run);
  $("refreshModels").addEventListener("click", refreshModels);
  $("resetBase").addEventListener("click", () => {
    $("baseUrl").value = location.origin;
    updateCurl();
  });
  $("copyCurl").addEventListener("click", async () => {
    try {
      await navigator.clipboard.writeText($("curl").textContent);
      const b = $("copyCurl");
      b.textContent = "Copied!";
      setTimeout(() => (b.textContent = "Copy"), 1200);
    } catch { /* clipboard may be blocked; ignore */ }
  });

  // Convenience: refresh models when a key is pasted/typed (debounced).
  let debounceTimer = null;
  $("apiKey").addEventListener("input", () => {
    clearTimeout(debounceTimer);
    if ($("apiKey").value.trim().length > 8) {
      debounceTimer = setTimeout(refreshModels, 500);
    }
  });
}

// Run init() once the DOM is parsed; if parsing has already finished by the
// time this script executes, DOMContentLoaded will not fire again, so call it now.
if (document.readyState === "loading") {
  document.addEventListener("DOMContentLoaded", init);
} else {
  init();
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user