diff --git a/demo.sh b/demo.sh new file mode 100755 index 0000000..c70b0a7 --- /dev/null +++ b/demo.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash +# +# demo.sh — the neuronetz-gateway one-command presentation. +# +# Brings up the demo stack (postgres + redis + mock-ollama + gateway) with NO +# GPU and NO model downloads, creates a demo tenant + API key via the bootstrap +# CLI *inside the gateway container*, and prints a clean summary with the key, +# the playground URL, and ready-to-paste curl commands. +# +# Usage: +# ./demo.sh # build + start, bootstrap a tenant/key, print summary +# ./demo.sh --down # tear the whole stack down (and remove volumes) +# ./demo.sh --help # this help +# +# Re-runnable: existing tenant/key are handled gracefully. The full API key is +# only ever printed once at creation (SPEC §11), so on a re-run where the key +# already exists this script creates a fresh, uniquely-named key and prints it. +set -euo pipefail + +# ────────────────────────────────────────────────────────────────────────── +# Configuration +# ────────────────────────────────────────────────────────────────────────── +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +COMPOSE_FILE="${SCRIPT_DIR}/docker-compose.demo.yml" +COMPOSE=(docker compose -f "${COMPOSE_FILE}") + +GATEWAY_URL="http://localhost:8080" +PLAYGROUND_URL="${GATEWAY_URL}/playground" +TENANT_NAME="demo" +KEY_NAME="demo-key" + +# Colours (disabled when stdout is not a TTY). 
+if [ -t 1 ]; then + BOLD="$(printf '\033[1m')"; DIM="$(printf '\033[2m')"; RESET="$(printf '\033[0m')" + CYAN="$(printf '\033[36m')"; GREEN="$(printf '\033[32m')"; YELLOW="$(printf '\033[33m')" +else + BOLD=""; DIM=""; RESET=""; CYAN=""; GREEN=""; YELLOW="" +fi + +log() { printf '%s\n' "${CYAN}==>${RESET} ${BOLD}$*${RESET}"; } +warn() { printf '%s\n' "${YELLOW}!!${RESET} $*" >&2; } +die() { printf '%s\n' "${YELLOW}xx${RESET} $*" >&2; exit 1; } + +# ────────────────────────────────────────────────────────────────────────── +# Subcommands +# ────────────────────────────────────────────────────────────────────────── +usage() { + awk 'NR < 3 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "${BASH_SOURCE[0]}" # header comment block: line 3 up to the first non-comment line +} + +down() { + log "Tearing down the demo stack (containers + volumes)…" + "${COMPOSE[@]}" down --volumes --remove-orphans + log "Done. The demo stack is gone." +} + +# Run the bootstrap CLI inside the running gateway container. +gw_cli() { + "${COMPOSE[@]}" exec -T gateway neuronetz-gateway "$@" +} + +wait_for_health() { + log "Waiting for the gateway to become healthy at ${GATEWAY_URL}/healthz …" + local deadline=$(( $(date +%s) + 180 )) + until curl -fsS "${GATEWAY_URL}/healthz" >/dev/null 2>&1; do + if [ "$(date +%s)" -ge "${deadline}" ]; then + warn "Gateway did not become healthy in time. Recent gateway logs:" + "${COMPOSE[@]}" logs --tail=50 gateway >&2 || true + die "Aborting." + fi + sleep 2 + done + log "Gateway is up." +} + +# Create the demo tenant if it does not already exist (idempotent). +ensure_tenant() { + log "Creating demo tenant '${TENANT_NAME}' (allow-all-models) …" + local out + if out="$(gw_cli create-tenant --name "${TENANT_NAME}" --allow-all-models 2>&1)"; then + printf '%s\n' "${DIM}${out}${RESET}" + else + # Already-exists (or similar) is fine — surface it but keep going. + if printf '%s' "${out}" | grep -qiE 'exist|duplicate|unique'; then + log "Tenant '${TENANT_NAME}' already exists — reusing it." 
+ else + warn "create-tenant reported:" + printf '%s\n' "${out}" >&2 + warn "Continuing; the tenant may already be present." + fi + fi +} + +# Create a fresh API key and capture the printed key. The key is printed once. +# We give each created key a unique name so re-runs always succeed and always +# yield a usable key to print. +create_key() { + local unique_name="${KEY_NAME}-$(date +%Y%m%d-%H%M%S)" + log "Creating API key '${unique_name}' for tenant '${TENANT_NAME}' …" >&2 + local out + if ! out="$(gw_cli create-key --tenant "${TENANT_NAME}" --name "${unique_name}" 2>&1)"; then + warn "create-key failed:" >&2 + printf '%s\n' "${out}" >&2 + return 1 + fi + # The CLI prints both the 12-char prefix (e.g. "prefix nz_abc12345Yz") AND the + # full key on a later line. Both match /nz_[A-Za-z0-9]+/, so pick the longest + # match — that's the full key (44 chars), never the prefix (12). + local key + key="$(printf '%s' "${out}" | grep -oE 'nz_[A-Za-z0-9]+' \ + | awk '{ if (length($0) > maxlen) { maxlen = length($0); k = $0 } } END { print k }' \ + || true)" + if [ -z "${key}" ]; then + warn "Could not parse an API key from create-key output:" >&2 + printf '%s\n' "${out}" >&2 + return 1 + fi + printf '%s' "${key}" +} + +print_summary() { + local key="$1" + local cl='application/json' + + cat </dev/null 2>&1 || die "docker is required but not found on PATH." + command -v curl >/dev/null 2>&1 || die "curl is required but not found on PATH." + [ -f "${COMPOSE_FILE}" ] || die "Missing ${COMPOSE_FILE}" + + log "Building and starting the demo stack (postgres + redis + mock-ollama + gateway) …" + "${COMPOSE[@]}" up --build -d + + wait_for_health + ensure_tenant + + local key + if ! key="$(create_key)"; then + die "Could not create/parse an API key. See logs above." 
+ fi + + print_summary "${key}" +} + +# ────────────────────────────────────────────────────────────────────────── +# Entry point +# ────────────────────────────────────────────────────────────────────────── +main() { + case "${1:-}" in + --down|-d|down) down ;; + --help|-h|help) usage ;; + "") up ;; + *) die "Unknown argument: $1 (try --help)" ;; + esac +} + +main "$@" diff --git a/demo/mock-ollama/Dockerfile b/demo/mock-ollama/Dockerfile new file mode 100644 index 0000000..1a7c54d --- /dev/null +++ b/demo/mock-ollama/Dockerfile @@ -0,0 +1,61 @@ +# syntax=docker/dockerfile:1.7 +# +# mock-ollama — a tiny FastAPI app emulating the Ollama HTTP API for the demo. +# +# builder stage : installs deps into a self-contained virtualenv. +# runtime stage : copies the venv + app, drops to a NON-ROOT user, no build +# tools, runs uvicorn on :11434. +# +# This image exists ONLY for the demo stack (docker-compose.demo.yml). It lets +# the demo run with no GPU and no model downloads. It is never published to the +# host — like real Ollama, it is reachable only on the internal Docker network. + +# ---------------------------------------------------------------------------- +# Stage 1 — builder +# ---------------------------------------------------------------------------- +FROM python:3.12-slim AS builder + +ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_NO_CACHE_DIR=1 \ + VIRTUAL_ENV=/opt/venv \ + PATH=/opt/venv/bin:$PATH + +RUN python -m venv /opt/venv + +WORKDIR /app +COPY requirements.txt ./ +RUN pip install -r requirements.txt + +# ---------------------------------------------------------------------------- +# Stage 2 — runtime +# ---------------------------------------------------------------------------- +FROM python:3.12-slim AS runtime + +# curl is used by the compose healthcheck. +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* + +# Non-root user. 
+RUN groupadd --system --gid 10001 mock \ + && useradd --system --uid 10001 --gid mock --home-dir /app --shell /usr/sbin/nologin mock + +ENV VIRTUAL_ENV=/opt/venv \ + PATH=/opt/venv/bin:$PATH \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + MOCK_OLLAMA_PORT=11434 + +WORKDIR /app + +COPY --from=builder /opt/venv /opt/venv +COPY app.py ./ + +USER mock + +EXPOSE 11434 + +HEALTHCHECK --interval=10s --timeout=3s --start-period=5s --retries=5 \ + CMD curl -fsS "http://127.0.0.1:${MOCK_OLLAMA_PORT}/api/version" || exit 1 + +CMD ["python", "-m", "app"] diff --git a/demo/mock-ollama/app.py b/demo/mock-ollama/app.py new file mode 100644 index 0000000..751651c --- /dev/null +++ b/demo/mock-ollama/app.py @@ -0,0 +1,361 @@ +"""Standalone mock Ollama service for the neuronetz-gateway demo. + +This is a containerised sibling of ``tests/integration/mock_ollama.py``: it +emulates the subset of the Ollama HTTP API the gateway proxies (SPEC §6.1) so +the demo runs with **no GPU and no model downloads**. The response *shapes* +match real Ollama closely enough that the gateway's token counter, model +discovery (SPEC §4.6) and ``/api/show`` sanitisation all exercise real paths. + +Endpoints emulated: + +* ``GET /api/tags`` - model catalogue (size/digest/modified_at/details) +* ``POST /api/chat`` - NDJSON streaming (default) or single JSON +* ``POST /api/generate`` - NDJSON streaming (default) or single JSON +* ``POST /api/embed`` - newer batch embeddings (field ``embeddings``) +* ``POST /api/embeddings``- legacy single-vector embeddings (field ``embedding``) +* ``POST /api/show`` - returns template/system so the gateway can prove it + strips them +* ``GET /api/version`` - plausible upstream version + +The terminal NDJSON object of every chat/generate response carries realistic +``prompt_eval_count`` + ``eval_count`` (and sibling duration fields) so the +gateway counts tokens for real. Reply text is ``"Echo: <prompt>"``. + +Runs uvicorn on :11434 as a non-root user inside the container. 
+""" + +from __future__ import annotations + +import hashlib +import json +import os +from collections.abc import AsyncIterator, Iterable +from datetime import UTC, datetime +from typing import Any + +import uvicorn +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, StreamingResponse + +NDJSON_MEDIA_TYPE = "application/x-ndjson" + +# A small, realistic catalogue. Sizes/digests are plausible but fixed so the +# demo is fully deterministic. +MODELS: tuple[dict[str, Any], ...] = ( + { + "name": "llama3.1:8b", + "family": "llama", + "parameter_size": "8.0B", + "quantization_level": "Q4_0", + "size": 4_661_211_808, + }, + { + "name": "mistral:7b", + "family": "llama", + "parameter_size": "7.2B", + "quantization_level": "Q4_0", + "size": 4_109_865_159, + }, + { + "name": "qwen2.5:3b", + "family": "qwen2", + "parameter_size": "3.1B", + "quantization_level": "Q4_K_M", + "size": 1_929_889_677, + }, + { + "name": "nomic-embed-text", + "family": "nomic-bert", + "parameter_size": "137M", + "quantization_level": "F16", + "size": 274_302_450, + }, +) + + +def _now_iso() -> str: + return datetime.now(UTC).isoformat().replace("+00:00", "Z") + + +def _digest_for(name: str) -> str: + return "sha256:" + hashlib.sha256(name.encode("utf-8")).hexdigest() + + +def _details_for(name: str) -> dict[str, Any]: + for m in MODELS: + if m["name"] == name: + return { + "parent_model": "", + "format": "gguf", + "family": m["family"], + "families": [m["family"]], + "parameter_size": m["parameter_size"], + "quantization_level": m["quantization_level"], + } + return { + "parent_model": "", + "format": "gguf", + "family": name.split(":", 1)[0], + "families": [name.split(":", 1)[0]], + "parameter_size": "8B", + "quantization_level": "Q4_0", + } + + +def _reply_for(prompt: str, override: str | None) -> str: + if override is not None: + return override + if not prompt: + return "Hello from the mock Ollama backend." 
+ return f"Echo: {prompt}" + + +def _tokenize(text: str) -> list[str]: + return text.split() + + +def _final_metrics(prompt_tokens: int, completion_tokens: int) -> dict[str, Any]: + """Timing/usage fields Ollama attaches to the terminal stream object.""" + return { + "total_duration": 1_234_567_890, + "load_duration": 12_345_678, + "prompt_eval_count": prompt_tokens, + "prompt_eval_duration": 23_456_789, + "eval_count": completion_tokens, + "eval_duration": 34_567_890, + } + + +def _chat_chunk( + model: str, + *, + content: str, + done: bool, + prompt_tokens: int = 0, + completion_tokens: int = 0, +) -> dict[str, Any]: + obj: dict[str, Any] = { + "model": model, + "created_at": _now_iso(), + "message": {"role": "assistant", "content": content}, + "done": done, + } + if done: + obj["done_reason"] = "stop" + obj.update(_final_metrics(prompt_tokens, completion_tokens)) + return obj + + +def _generate_chunk( + model: str, + *, + response: str, + done: bool, + prompt_tokens: int = 0, + completion_tokens: int = 0, +) -> dict[str, Any]: + obj: dict[str, Any] = { + "model": model, + "created_at": _now_iso(), + "response": response, + "done": done, + } + if done: + obj["done_reason"] = "stop" + obj["context"] = [1, 2, 3] + obj.update(_final_metrics(prompt_tokens, completion_tokens)) + return obj + + +async def _ndjson_stream(objects: Iterable[dict[str, Any]]) -> AsyncIterator[bytes]: + for obj in objects: + yield (json.dumps(obj) + "\n").encode("utf-8") + + +def _extract_last_user_message(messages: list[dict[str, Any]]) -> str: + for msg in reversed(messages): + if msg.get("role") == "user": + content = msg.get("content", "") + return content if isinstance(content, str) else "" + return "" + + +def create_app() -> FastAPI: + app = FastAPI(title="mock-ollama", docs_url=None, redoc_url=None) + + @app.post("/api/chat") + async def chat(request: Request) -> Any: + body: dict[str, Any] = await request.json() + model: str = body.get("model", "llama3.1:8b") + stream: bool = 
body.get("stream", True) + reply_override: str | None = body.get("reply_text") + prompt = _extract_last_user_message(body.get("messages", [])) + reply = _reply_for(prompt, reply_override) + + prompt_tokens = len(_tokenize(prompt)) + completion_tokens = len(_tokenize(reply)) + + if not stream: + return JSONResponse( + _chat_chunk( + model, + content=reply, + done=True, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + ) + + words = _tokenize(reply) or [""] + + def chunks() -> list[dict[str, Any]]: + out: list[dict[str, Any]] = [] + for i, word in enumerate(words): + piece = word if i == 0 else f" {word}" + out.append(_chat_chunk(model, content=piece, done=False)) + out.append( + _chat_chunk( + model, + content="", + done=True, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + ) + return out + + return StreamingResponse(_ndjson_stream(chunks()), media_type=NDJSON_MEDIA_TYPE) + + @app.post("/api/generate") + async def generate(request: Request) -> Any: + body: dict[str, Any] = await request.json() + model: str = body.get("model", "llama3.1:8b") + stream: bool = body.get("stream", True) + prompt = body.get("prompt", "") + reply = _reply_for(prompt, body.get("reply_text")) + + prompt_tokens = len(_tokenize(prompt)) + completion_tokens = len(_tokenize(reply)) + + if not stream: + return JSONResponse( + _generate_chunk( + model, + response=reply, + done=True, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + ) + + words = _tokenize(reply) or [""] + + def chunks() -> list[dict[str, Any]]: + out: list[dict[str, Any]] = [] + for i, word in enumerate(words): + piece = word if i == 0 else f" {word}" + out.append(_generate_chunk(model, response=piece, done=False)) + out.append( + _generate_chunk( + model, + response="", + done=True, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + ) + ) + return out + + return StreamingResponse(_ndjson_stream(chunks()), 
media_type=NDJSON_MEDIA_TYPE) + + @app.post("/api/embed") + async def embed(request: Request) -> Any: + body: dict[str, Any] = await request.json() + model: str = body.get("model", "nomic-embed-text") + inp = body.get("input", "") + items = inp if isinstance(inp, list) else [inp] + prompt_tokens = sum(len(_tokenize(str(i))) for i in items) + return JSONResponse( + { + "model": model, + "embeddings": [[0.0, 0.1, 0.2, 0.3] for _ in items], + "total_duration": 1_111_111, + "load_duration": 222_222, + "prompt_eval_count": prompt_tokens, + } + ) + + @app.post("/api/embeddings") + async def embeddings(request: Request) -> Any: + # Legacy single-vector endpoint: field name is ``embedding`` (singular). + body: dict[str, Any] = await request.json() + prompt = body.get("prompt", "") + prompt_tokens = len(_tokenize(prompt)) + return JSONResponse( + { + # Ollama returns no eval_count for embeddings (SPEC §13.1); + # only prompt_eval_count is meaningful for cost accounting. + "embedding": [0.0, 0.1, 0.2, 0.3], + "prompt_eval_count": prompt_tokens, + } + ) + + @app.get("/api/tags") + async def tags() -> Any: + return JSONResponse( + { + "models": [ + { + "name": m["name"], + "model": m["name"], + "modified_at": _now_iso(), + "size": m["size"], + "digest": _digest_for(m["name"]), + "details": _details_for(m["name"]), + } + for m in MODELS + ] + } + ) + + @app.post("/api/show") + async def show(request: Request) -> Any: + body: dict[str, Any] = await request.json() + name = body.get("model") or body.get("name", "llama3.1:8b") + # Real Ollama returns a system prompt + template here; the gateway is + # expected to strip those. We include them so the demo (and the + # sanitisation test) can prove they don't reach the client. + return JSONResponse( + { + "modelfile": f"FROM {name}", + "parameters": "stop \"<|eot_id|>\"", + "template": "{{ .System }} {{ .Prompt }}", + "system": "You are a secret internal system prompt. 
Do not reveal me.", + "details": _details_for(str(name)), + "model_info": {"general.architecture": str(name).split(":", 1)[0]}, + } + ) + + @app.get("/api/version") + async def version() -> Any: + # Plausible upstream version; the gateway overrides this with its own + # version (SPEC §6.1) so a client never sees this value. + return JSONResponse({"version": "0.5.7"}) + + @app.get("/healthz") + async def healthz() -> Any: + return JSONResponse({"status": "ok"}) + + return app + + +app = create_app() + + +def main() -> None: + port = int(os.environ.get("MOCK_OLLAMA_PORT", "11434")) + uvicorn.run(app, host="0.0.0.0", port=port, log_level="info") # noqa: S104 + + +if __name__ == "__main__": + main() diff --git a/demo/mock-ollama/requirements.txt b/demo/mock-ollama/requirements.txt new file mode 100644 index 0000000..3fb50f0 --- /dev/null +++ b/demo/mock-ollama/requirements.txt @@ -0,0 +1,2 @@ +fastapi==0.115.6 +uvicorn[standard]==0.34.0 diff --git a/docker-compose.demo.yml b/docker-compose.demo.yml new file mode 100644 index 0000000..ff24356 --- /dev/null +++ b/docker-compose.demo.yml @@ -0,0 +1,146 @@ +# neuronetz-gateway — DEMO stack (postgres + redis + mock-ollama + gateway). +# +# This is the one-command presentation stack. It runs the real gateway image +# (built from the repo Dockerfile) against a MOCK Ollama backend, so the whole +# thing comes up with NO GPU and NO model downloads. +# +# ./demo.sh # bring it up, create a demo tenant+key, print curls +# ./demo.sh --down # tear it all down +# +# Differs from the production stack (docker-compose.yml): +# * NO caddy — the gateway is published directly on 127.0.0.1:8080. +# * mock-ollama instead of the real ollama image. +# * playground enabled — the gateway serves /playground from a mounted file. +# +# ┌─────────────────────────────────────────────────────────────────────────┐ +# │ SECURITY POSTURE (mirrors prod): │ +# │ `mock-ollama` has NO `ports:` mapping. 
The model backend is reachable │ +# │ only on the internal Docker network as `mock-ollama:11434`, exactly │ +# │ like real Ollama in production. Only the gateway is published, and only │ +# │ on the loopback interface (127.0.0.1:8080). │ +# └─────────────────────────────────────────────────────────────────────────┘ + +services: + gateway: + build: + context: . + dockerfile: Dockerfile + restart: unless-stopped + ports: + - "127.0.0.1:8080:8080" + environment: + GATEWAY_BIND_HOST: 0.0.0.0 + GATEWAY_BIND_PORT: "8080" + GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO} + GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console} + GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID} + GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1} + # Serve the interactive playground from the mounted file (flag-gated; + # OFF by default in prod). See playground/index.html. + PLAYGROUND_ENABLED: "true" + PLAYGROUND_FILE: /app/playground/index.html + # Point the gateway at the mock Ollama on the internal network. + OLLAMA_BASE_URL: http://mock-ollama:11434 + OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5} + OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600} + OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64} + # Discover models quickly so the demo feels live. 
+ MODEL_DISCOVERY_REFRESH_S: ${MODEL_DISCOVERY_REFRESH_S:-15} + MODEL_DISCOVERY_CACHE_TTL_S: ${MODEL_DISCOVERY_CACHE_TTL_S:-60} + DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz} + DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10} + DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20} + REDIS_URL: redis://redis:6379/0 + REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60} + DEFAULT_RPM: ${DEFAULT_RPM:-60} + DEFAULT_TPM: ${DEFAULT_TPM:-100000} + DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8} + MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144} + MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096} + ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3} + ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536} + ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4} + AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20} + AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000} + PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30} + AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365} + volumes: + # The gateway serves /playground by reading this file at request time. + # Read-only mount: the demo never lets the container modify it. + - ./playground:/app/playground:ro + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + mock-ollama: + condition: service_healthy + # Apply migrations, then start the server (mirrors docker-compose.dev.yml). + command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"] + healthcheck: + test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"] + interval: 10s + timeout: 3s + retries: 5 + start_period: 30s + networks: + - internal + + # ─────────────────────────────────────────────────────────────────────────── + # mock-ollama — INTERNAL NETWORK ONLY. Stands in for the real Ollama backend. 
+ # NO `ports:` mapping, mirroring the production "Ollama is never exposed" rule. + # Reachable only as `http://mock-ollama:11434` from the gateway container. + # ─────────────────────────────────────────────────────────────────────────── + mock-ollama: + build: + context: ./demo/mock-ollama + dockerfile: Dockerfile + restart: unless-stopped + # !!! NO `ports:` — the model backend is never published. !!! + healthcheck: + test: ["CMD", "curl", "-fsS", "http://127.0.0.1:11434/api/version"] + interval: 10s + timeout: 3s + retries: 5 + start_period: 5s + networks: + - internal + + postgres: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_USER: ${POSTGRES_USER:-gateway} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway} + POSTGRES_DB: ${POSTGRES_DB:-neuronetz} + volumes: + - postgres_demo_data:/var/lib/postgresql/data + # No `ports:` — Postgres is internal-only. + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"] + interval: 5s + timeout: 3s + retries: 10 + networks: + - internal + + redis: + image: redis:7-alpine + restart: unless-stopped + command: ["redis-server", "--save", "", "--appendonly", "no"] + # No `ports:` — Redis is internal-only. + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 10 + networks: + - internal + +networks: + # Private network for inter-service traffic; not reachable from the host. + internal: + driver: bridge + +volumes: + postgres_demo_data: diff --git a/docs/API.md b/docs/API.md new file mode 100644 index 0000000..57a1159 --- /dev/null +++ b/docs/API.md @@ -0,0 +1,253 @@ +# neuronetz-gateway — API Reference + +The gateway exposes two compatible API surfaces in front of the Ollama backend: + +- **Native Ollama** under `/api/*` — NDJSON streaming, identical request shapes to Ollama. +- **OpenAI-compatible** under `/v1/*` — SSE streaming, drop-in for the OpenAI SDKs. + +Plus unauthenticated health endpoints. 
Everything else is blocked. + +> Source of truth: [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §6. Where this doc and the +> SPEC disagree, the SPEC wins. + +--- + +## Authentication + +Every model endpoint requires an API key as a Bearer token: + +``` +Authorization: Bearer nz_<12-char-prefix><32-char-random> +``` + +- **Key format:** `nz_` namespace + random base62 body. The first 12 characters + (`nz_` + entropy) are the **prefix**, stored in cleartext and indexed for O(1) lookup. + The full key is **argon2id**-hashed; it is shown **exactly once** at creation + (`neuronetz-gateway create-key`) and never stored or logged. +- **Fail-closed:** a missing, malformed, expired, disabled, or revoked key returns **401**. + No upstream/Ollama detail is ever leaked in the error. +- Health endpoints (`/healthz`, `/readyz`) require **no** auth. + +The placeholder key `nz_demoKEY...` is used throughout this doc. `./demo.sh` prints a +**real** key for the local demo. + +--- + +## Response headers (SPEC §6.5) + +Every proxied response carries: + +| Header | Meaning | +|---|---| +| `X-Request-ID` | Correlates the response with the audit log row. Present on errors too. | +| `X-RateLimit-Limit-Requests` | Effective RPM limit for this key/tenant. | +| `X-RateLimit-Remaining-Requests` | Requests remaining in the current window. | +| `X-RateLimit-Limit-Tokens` | Effective TPM limit. | +| `X-RateLimit-Remaining-Tokens` | Tokens remaining in the current window. | +| `X-Budget-Period` | `day` \| `month` \| `total` — the binding budget period. | +| `X-Budget-Tokens-Remaining` | Tokens left in the binding budget period. | + +`429 Too Many Requests` responses additionally carry `Retry-After: <seconds>`. + +--- + +## Error model + +Errors are **sanitized** at the gateway boundary — Ollama internals are never reflected. +The body is a small generic JSON object and the `X-Request-ID` header ties it to the audit log. 
+ +```json +{ "error": { "message": "forbidden", "type": "forbidden", "code": 403 }, "request_id": "b3f1…" } +``` + +| Status | When | +|---|---| +| `400` | Malformed body, schema violation, or `num_predict` over the cap. | +| `401` | Missing / invalid / expired / revoked key. | +| `403` | Endpoint hard-blocked, or model outside the tenant's effective set (no existence disclosure). | +| `413` | Request body over `MAX_REQUEST_BODY_BYTES` (default 256 KiB). | +| `429` | Rate limit or budget exceeded (carries `Retry-After`). | +| `502` | Ollama upstream unreachable / circuit breaker open. | +| `503` | A required subsystem (Postgres read, Redis) is down — **fail-closed**, never "allow". | + +A model that is *installed-but-unpermitted* and a model that is *not installed* return the +**same** generic `403`, to prevent enumeration (SPEC §13.6). + +--- + +## Native Ollama endpoints (`/api/*`) + +### `POST /api/chat` + +Streamed (NDJSON, default) or non-streamed chat completion. + +```bash +curl -N http://localhost:8080/api/chat \ + -H "Authorization: Bearer nz_demoKEY..." \ + -H "Content-Type: application/json" \ + -d '{"model":"llama3.1:8b","stream":true, + "messages":[{"role":"user","content":"Say hello in one sentence."}]}' +``` + +**Streaming response** — `Content-Type: application/x-ndjson`, one JSON object per line: + +``` +{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":"Echo:"},"done":false} +{"model":"llama3.1:8b","created_at":"…","message":{"role":"assistant","content":" Say"},"done":false} +… +{"model":"llama3.1:8b","done":true,"done_reason":"stop", + "prompt_eval_count":6,"eval_count":7,"total_duration":1234567890,"eval_duration":34567890} +``` + +The **final** object carries `prompt_eval_count` (tokens in) and `eval_count` (tokens out); +the gateway uses these for precise token accounting (SPEC §4.3 step 12). + +**Non-streaming** (`"stream": false`) returns a single JSON object of the same shape with +`"done": true`. 
+ +### `POST /api/generate` + +Same semantics as `/api/chat` but uses a flat `prompt` string and returns `response` +fields instead of `message` objects. + +```bash +curl -N http://localhost:8080/api/generate \ + -H "Authorization: Bearer nz_demoKEY..." \ + -H "Content-Type: application/json" \ + -d '{"model":"llama3.1:8b","stream":true,"prompt":"Write a haiku about routers."}' +``` + +### `POST /api/embed` / `POST /api/embeddings` + +Non-streamed embeddings. `/api/embed` is the newer batch endpoint (field `embeddings`, +a list of vectors); `/api/embeddings` is the legacy single-vector endpoint (field +`embedding`). Ollama returns no `eval_count` for embeddings; cost is charged on +`prompt_eval_count` only (SPEC §13.1). + +```bash +curl http://localhost:8080/api/embed \ + -H "Authorization: Bearer nz_demoKEY..." \ + -H "Content-Type: application/json" \ + -d '{"model":"nomic-embed-text","input":["hello","world"]}' +``` + +```json +{ "model": "nomic-embed-text", "embeddings": [[0.0, 0.1, …], [0.0, 0.1, …]], "prompt_eval_count": 2 } +``` + +### `GET /api/tags` + +Returns the tenant's **effective** model set — the live-discovered set intersected with the +tenant's allowlist, or *all* discovered models when `allow_all_models` is on. Sourced from +discovery (SPEC §4.6), never a static list. + +```bash +curl http://localhost:8080/api/tags -H "Authorization: Bearer nz_demoKEY..." +``` + +### `POST /api/show` + +Allowed only for models in the effective set; returns **sanitized** model info. +The system prompt and template that Ollama returns are **stripped** by the gateway. + +### `GET /api/version` + +Returns the **gateway** version, not the Ollama version. + +```json +{ "version": "0.1.0" } +``` + +--- + +## Hard-blocked endpoints (always `403`) + +These model-mutating endpoints are blocked at the gateway. 
**Not configurable, not behind a +flag** (SPEC §6.2, AGENT_PROMPT non-negotiable #5): + +``` +/api/pull /api/push /api/create /api/copy /api/delete /api/blobs/* +``` + +```bash +# Always 403, even with a valid key: +curl -i http://localhost:8080/api/pull \ + -H "Authorization: Bearer nz_demoKEY..." \ + -H "Content-Type: application/json" -d '{"model":"llama3.1:8b"}' +``` + +`GET /api/ps` is also blocked (it would leak which models are loaded). + +--- + +## OpenAI-compatible endpoints (`/v1/*`) + +| Path | Method | Maps to | +|---|---|---| +| `/v1/chat/completions` | POST | `/api/chat` | +| `/v1/completions` | POST | `/api/generate` | +| `/v1/embeddings` | POST | `/api/embed` | +| `/v1/models` | GET | `/api/tags` (effective set, OpenAI list format) | + +Streaming uses **SSE**: `data: {…}\n\n` events terminated by a literal `data: [DONE]\n\n`. + +### `POST /v1/chat/completions` + +```bash +curl -N http://localhost:8080/v1/chat/completions \ + -H "Authorization: Bearer nz_demoKEY..." \ + -H "Content-Type: application/json" \ + -d '{"model":"llama3.1:8b","stream":true, + "messages":[{"role":"user","content":"Say hello in one sentence."}]}' +``` + +**Streaming response** — `Content-Type: text/event-stream`: + +``` +data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":"Echo:"},"finish_reason":null}]} + +data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{"content":" Say"},"finish_reason":null}]} + +data: {"id":"chatcmpl-…","object":"chat.completion.chunk","choices":[{"index":0,"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":6,"completion_tokens":7,"total_tokens":13}} + +data: [DONE] +``` + +Works with the OpenAI Python SDK by pointing `base_url` at `http://localhost:8080/v1`. + +### `GET /v1/models` + +```bash +curl http://localhost:8080/v1/models -H "Authorization: Bearer nz_demoKEY..." 
+``` + +```json +{ "object": "list", "data": [ + { "id": "llama3.1:8b", "object": "model", "owned_by": "neuronetz" }, + { "id": "mistral:7b", "object": "model", "owned_by": "neuronetz" } +] } +``` + +--- + +## Health endpoints + +| Path | Method | Auth | Purpose | +|---|---|---|---| +| `/healthz` | GET | none | Liveness — process responsive (`200`). | +| `/readyz` | GET | none | Readiness — DB + Redis + Ollama reachable, else `503`. | +| `/metrics` | GET | none (loopback only) | Prometheus exposition. | + +```bash +curl -i http://localhost:8080/healthz # 200 {"status":"ok"} +curl -i http://localhost:8080/readyz # 200 when all deps up; 503 otherwise +``` + +--- + +## Quick reference: streaming formats + +| Surface | Content-Type | Frame | Terminator | +|---|---|---|---| +| Native `/api/*` | `application/x-ndjson` | one JSON object per `\n` | final object has `"done": true` | +| OpenAI `/v1/*` | `text/event-stream` | `data: {…}\n\n` | `data: [DONE]\n\n` | diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..2692e48 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,168 @@ +# neuronetz-gateway — Architecture + +Distilled from [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §4. The SPEC is the source of truth. + +The gateway is the **hot path** of the Neuronetz API: a secure, multi-tenant proxy in front +of an Ollama instance. The Ollama backend must never be reachable directly from the public +internet — all access flows through this gateway. Administration (dashboards, tenant +self-service) lives in a separate service, `neuronetz-console`, and is out of scope here. 
+ +--- + +## Component diagram (SPEC §4.1) + +``` + Internet + │ TLS + ▼ + ┌──────────────────────┐ + │ Caddy (sidecar) │ Let's Encrypt for api.neuronetz.ai + │ - TLS termination │ HSTS, security headers + │ - HTTP/2, HTTP/3 │ + └──────────┬───────────┘ + │ HTTP/1.1 internal + ┌──────────▼───────────┐ + │ neuronetz-gateway │ FastAPI + uvicorn + │ - authn │ + │ - rate limit │ + │ - budget check │ + │ - proxy + stream │ + │ - token count │ + │ - audit write │ + └──┬────────┬──────┬───┘ + │ │ │ + ┌──────▼──┐ ┌──▼───┐ │ + │Postgres │ │Redis │ │ + │ schema: │ │ keys │ │ + │ gateway │ │bucket│ │ + └─────────┘ └──────┘ │ + │ internal network only + ┌──────▼──────┐ + │ Ollama │ + │ 127.0.0.1 │ + └─────────────┘ + +Same Compose stack also hosts (separate from this SPEC): + - neuronetz-console (PHP/Nibiru) → reads schema `console`, reads schema `gateway` (SELECT) +``` + +Only **Caddy** publishes ports. Postgres, Redis and (critically) **Ollama** have no +published ports and are reachable only on the internal Docker network. + +--- + +## Database schemas (SPEC §4.2) + +A single Postgres instance with two schemas: + +- **`gateway`** — owned by this service; full DDL. Tables: `tenants`, `tenant_limits`, + `api_keys`, `key_limits`, `budget_usage`, `audit_log`, `prompt_log`, `revocations` + (see SPEC §5 for the full DDL). +- **`console`** — owned by `neuronetz-console` (out of scope). The console role gets + `SELECT` on all `gateway.*` tables and `INSERT` on `gateway.revocations` only. + +If the console needs to mutate gateway state (e.g. revoke a key), it does so by inserting +into the `gateway.revocations` **outbox** table, which the gateway tails (see Revocation below). + +**Limit inheritance:** limits and budgets resolve key → tenant. A `NULL` key-level value +inherits the tenant value. For `allow_all_models`, a non-`NULL` key value overrides the +tenant flag; otherwise the tenant flag applies (SPEC §13.7). + +--- + +## Request lifecycle (SPEC §4.3) + +1. 
Caddy terminates TLS and forwards to the gateway on the internal port. +2. Middleware extracts `Authorization: Bearer <key>`. +3. The 12-char prefix is the Redis cache key. On miss, look up `gateway.api_keys` by prefix, + verify the full key with argon2id, and cache resolved metadata in Redis (TTL 60 s). +4. **Rate limit** check — sliding window in Redis (Lua-atomic): per-key RPM + per-tenant RPM. +5. **Budget** check — Redis counter for the current period; Postgres ledger is the source of + truth on reset. +6. **Concurrency** semaphore — Redis `INCR` with TTL. +7. **Model allowlist** check — resolve the effective set (see below); the request `model` + must be in it, else a generic `403`. +8. **Endpoint allowlist** check — mutating endpoints are hard-blocked. +9. **Body validation** — size, schema, `num_predict` cap. +10. If an OpenAI-compat path, translate the request to the Ollama schema. +11. Open an httpx async stream to Ollama. +12. Stream the response back to the client, accumulating the final `prompt_eval_count` + + `eval_count`. +13. On stream close: write the `gateway.audit_log` row; decrement the budget; release the + semaphore; if prompt logging is enabled, write `gateway.prompt_log`. +14. On any failure: sanitized error to the client, audit row with the status code, semaphore + released. + +**Streaming integrity:** token counting and the audit write happen **after** stream close, +never on the hot path — time-to-first-byte is not degraded by bookkeeping (SPEC §9). + +--- + +## Model discovery (SPEC §4.6) + +The set of usable models is **never hand-maintained**; it is extracted live from Ollama. + +- A background task (started in the app lifespan, alongside the revocation listener) polls + Ollama `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds. 
+- The parsed set (names + sanitized metadata: family, parameter size, quantization, size, + modified-at) is cached in Redis under `gateway:models:discovered` with TTL + `MODEL_DISCOVERY_CACHE_TTL_S`, and held in-process for hot reads on the request path. +- An initial fetch runs at startup; if Ollama is unreachable the discovered set is empty. +- **Fail-closed:** an empty or expired-and-unrefreshable discovered set means *no model + resolves* and requests are denied. Discovery never opens access on failure. +- **Auto-grant:** because the effective set intersects with `discovered` (or *is* + `discovered` when `allow_all_models`), a model pulled into Ollama out-of-band becomes + usable to `allow_all` tenants on the next refresh — no per-tenant config change. +- Discovery is **read-only** against Ollama and uses only the allowlisted `/api/tags` + endpoint; it never triggers a model pull. + +### Effective-set resolution (SPEC §4.3 step 7) + +``` +allow_all := key.allow_all_models ?? tenant.allow_all_models +effective := discovered if allow_all + (key.allowed_models ?? tenant.allowed_models) ∩ discovered otherwise +``` + +`/api/tags` and `/v1/models` return exactly this effective set, so the listing never reveals +models outside the tenant's reach. A model that is installed-but-unpermitted and one that is +not installed both return the same generic `403` — no existence disclosure (SPEC §13.6). + +--- + +## Failure modes — fail-closed (SPEC §4.4) + +| Subsystem | If down | Behavior | +|---|---|---| +| Postgres (read) | Key lookup fails | `503` with retry-after; nothing proxied. | +| Postgres (write) | Audit write fails | Request still succeeds; audit row buffered in-memory ring (max 1000), drained on recovery; if the buffer fills, switch to deny mode. | +| Redis | Rate limit / budget unavailable | `503` — fail closed. Never "allow because we can't check." 
| +| Ollama | Upstream unreachable | `502` with retry-after; circuit breaker opens after 5 consecutive failures, half-open after 30 s. | +| Caddy | Not a gateway concern | — | + +The governing rule (AGENT_PROMPT non-negotiable #1): **if a security or budgeting check +cannot be performed, deny.** Never default to allow. + +--- + +## Cache invalidation / key revocation (SPEC §4.5) + +The console revokes a key by inserting into `gateway.revocations(key_id, ts, reason)`. +A background task in the gateway lifespan: + +- `LISTEN`s on the Postgres channel `key_revoked` (the gateway emits `NOTIFY` on its own + write path; the console's INSERT fires a trigger that emits it). +- On notification, evicts the Redis cache entry for that key's prefix. + +This makes revocation effectively immediate (≤ Redis RTT) with no cross-service HTTP. + +--- + +## Observability + +- **Structured logs** (structlog), JSON in production. Secrets/keys are never logged. +- **Prometheus** `/metrics` (loopback only): `gateway_requests_total{tenant,model,status}`, + `gateway_tokens_total{tenant,model,direction}`, + `gateway_request_duration_seconds{tenant,model}` (histogram). Labelled by `tenant`, never + by `key_id` (cardinality — SPEC §13.3); per-key data lives in Postgres. +- **Audit log** — always-on request metadata. **Prompt log** — opt-in per key, TTL'd. diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md new file mode 100644 index 0000000..55ac663 --- /dev/null +++ b/docs/DEPLOYMENT.md @@ -0,0 +1,188 @@ +# neuronetz-gateway — Deployment + +Production deployment is a single Docker Compose stack: **Caddy + gateway + Postgres + Redis ++ Ollama**. Caddy is the only public-facing component; it terminates TLS via Let's Encrypt +for `api.neuronetz.ai` and reverse-proxies to the internal-only gateway. + +> For the local, no-GPU demo (mock Ollama + playground), see [`PLAYGROUND.md`](PLAYGROUND.md) +> and run `./demo.sh`. This document is the **production** path. 
+ +--- + +## The one rule that must never break + +> ## ⛔ Ollama is NEVER exposed to the host or the internet. +> +> The `ollama` service in `docker-compose.yml` has **no `ports:` mapping** and must never +> get one. Ollama is reachable only on the internal Docker network as `ollama:11434`. +> Publishing it would re-open the exact unauthenticated exposure this whole project exists +> to close (SPEC §1, §3; AGENT_PROMPT non-negotiable #2). + +The same posture applies to **Postgres** and **Redis** in the production compose file — no +published ports. Only **Caddy** binds host ports (80/443, 443/udp for HTTP/3). + +--- + +## Prerequisites + +- A host with Docker + Docker Compose. +- DNS: `api.neuronetz.ai` → the host's public IP (for Let's Encrypt). +- Ports 80 and 443 reachable from the internet (ACME HTTP/TLS challenge + serving). + +--- + +## Steps + +```bash +git clone <repo-url> neuronetz-gateway && cd neuronetz-gateway + +# 1. Configure. Copy the example env and change EVERY secret. +cp .env.example .env +# - POSTGRES_PASSWORD: a strong, unique value +# - DATABASE_URL: must match the POSTGRES_* values +# - GATEWAY_LOG_FORMAT=json for production + +# 2. Configure Caddy for your domain + ACME email. +cp ops/caddy/Caddyfile.example ops/caddy/Caddyfile # then edit the site + email +# (docker-compose.yml mounts Caddyfile.example by default; point it at your edited file +# or edit in place.) + +# 3. Bring up the full stack. The gateway runs `alembic upgrade head`, then serves. +docker compose up -d --build + +# 4. Bootstrap a tenant + key (CLI runs inside the gateway container). +docker compose exec gateway neuronetz-gateway create-tenant --name acme --rpm 120 --tpm 200000 +docker compose exec gateway neuronetz-gateway create-key --tenant acme --name prod-server-1 +# ^ prints the full key ONCE — store it in your secret manager now. + +# 5. Smoke test (through Caddy / TLS). 
+curl https://api.neuronetz.ai/healthz +curl -N https://api.neuronetz.ai/v1/chat/completions \ + -H "Authorization: Bearer nz_…" -H "Content-Type: application/json" \ + -d '{"model":"llama3.1:8b","stream":true,"messages":[{"role":"user","content":"hi"}]}' +``` + +Caddy obtains and renews the certificate automatically. For local testing without a public +domain, use the `localhost { tls internal … }` block documented in `Caddyfile.example` +(trust Caddy's local CA or pass `-k` to curl). + +--- + +## Pointing at a real Ollama backend + +The gateway reaches Ollama via `OLLAMA_BASE_URL`. In the bundled stack this is the in-stack +`ollama` service: `OLLAMA_BASE_URL=http://ollama:11434`. + +To use an **existing/external** Ollama host instead: + +1. Remove the `ollama` service from `docker-compose.yml` (or leave it; it just won't be used). +2. Set `OLLAMA_BASE_URL` to the backend address reachable from the gateway container, e.g. + `http://10.0.0.5:11434` or an internal DNS name. +3. Ensure that backend is itself **not** exposed to the internet — the gateway is the only + thing that should ever reach it. Use a private network / firewall rule, not a public port. +4. Pull the models you want available on that backend. They appear in tenants' effective sets + automatically on the next discovery refresh (SPEC §4.6) — no gateway config change for + `allow_all_models` tenants. + +Discovery polls `OLLAMA_BASE_URL/api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds. If the +backend is unreachable, the discovered set is empty and requests **fail closed**. + +--- + +## Environment reference (SPEC §7) + +All configuration is via environment variables, validated by Pydantic Settings on boot. Boot +**fails loudly** on invalid config. See [`.env.example`](../.env.example) for a copyable file. + +### Service +| Var | Default | Notes | +|---|---|---| +| `GATEWAY_BIND_HOST` | `0.0.0.0` | Bind-all inside the container. 
| +| `GATEWAY_BIND_PORT` | `8080` | Internal port; never published directly in prod. | +| `GATEWAY_LOG_LEVEL` | `INFO` | | +| `GATEWAY_LOG_FORMAT` | `json` | `json` in prod, `console` for local dev. | +| `GATEWAY_REQUEST_ID_HEADER` | `X-Request-ID` | | +| `GATEWAY_TRUSTED_PROXIES` | `127.0.0.1,caddy` | Sources trusted for `X-Forwarded-For`. | + +### Upstream (Ollama) +| Var | Default | Notes | +|---|---|---| +| `OLLAMA_BASE_URL` | `http://ollama:11434` | Internal address of the backend. | +| `OLLAMA_CONNECT_TIMEOUT_S` | `5` | | +| `OLLAMA_READ_TIMEOUT_S` | `600` | Long, for slow generations. | +| `OLLAMA_MAX_CONNECTIONS` | `64` | httpx pool size. | + +### Model discovery (§4.6) +| Var | Default | Notes | +|---|---|---| +| `MODEL_DISCOVERY_REFRESH_S` | `60` | How often to re-query `/api/tags`. | +| `MODEL_DISCOVERY_CACHE_TTL_S` | `120` | Redis TTL for the discovered set. | + +### Database +| Var | Default | Notes | +|---|---|---| +| `DATABASE_URL` | `postgresql+asyncpg://…` | asyncpg driver. | +| `DATABASE_POOL_SIZE` | `10` | | +| `DATABASE_POOL_OVERFLOW` | `20` | | + +### Redis +| Var | Default | Notes | +|---|---|---| +| `REDIS_URL` | `redis://redis:6379/0` | | +| `REDIS_KEY_CACHE_TTL_S` | `60` | Resolved-key cache TTL. | + +### Limits (defaults; per-tenant/key DB overrides win) +| Var | Default | Notes | +|---|---|---| +| `DEFAULT_RPM` | `60` | | +| `DEFAULT_TPM` | `100000` | | +| `DEFAULT_CONCURRENT` | `8` | | +| `MAX_REQUEST_BODY_BYTES` | `262144` | 256 KiB request cap. | +| `MAX_NUM_PREDICT` | `4096` | Hard cap on requested completion tokens. | + +### Security +| Var | Default | Notes | +|---|---|---| +| `ARGON2_TIME_COST` | `3` | | +| `ARGON2_MEMORY_COST_KIB` | `65536` | 64 MiB. | +| `ARGON2_PARALLELISM` | `4` | | +| `AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN` | `20` | Throttles auth brute-force per source IP. | + +### Audit +| Var | Default | Notes | +|---|---|---| +| `AUDIT_BUFFER_SIZE` | `1000` | Ring buffer; full ⇒ deny mode. 
| +| `PROMPT_LOG_DEFAULT_RETENTION_DAYS` | `30` | | +| `AUDIT_LOG_DEFAULT_RETENTION_DAYS` | `365` | | + +--- + +## TLS & security headers (Caddy) + +`ops/caddy/Caddyfile.example` already sets: + +- **HSTS** `max-age=63072000; includeSubDomains; preload` +- `X-Content-Type-Options: nosniff` +- `X-Frame-Options: DENY` +- `Referrer-Policy: no-referrer` +- strips `Server` and `X-Powered-By` + +Edit the site address and ACME `email` before deploying. + +--- + +## Non-Compose (systemd) + +A systemd unit is provided for hosts that run the image directly (`ops/systemd/`). The +gateway still requires reachable Postgres, Redis, and Ollama, and the same environment +variables. TLS in that topology is whatever fronts the host (Caddy, nginx, a load balancer) — +**Ollama still must not be publicly reachable.** + +--- + +## Upgrades & migrations + +The gateway runs `alembic upgrade head` on container start, so a normal +`docker compose up -d --build` after pulling a new version applies pending migrations. For +zero-downtime upgrades, run migrations as a one-off +(`docker compose run --rm gateway alembic upgrade head`) before rolling the service. diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md new file mode 100644 index 0000000..2bbb852 --- /dev/null +++ b/docs/OPERATIONS.md @@ -0,0 +1,172 @@ +# neuronetz-gateway — Operations Runbook + +Day-2 operations for the gateway: managing tenants and keys, budgets, model policy, usage, +and the fail-closed behaviors you'll encounter. All administration is via the **bootstrap +CLI** (SPEC §11), run inside the gateway container. There are no admin HTTP endpoints in the +gateway (that's `neuronetz-console`'s job). 
+ +> Run the CLI inside the running container: +> ```bash +> docker compose exec gateway neuronetz-gateway … +> ``` +> In the demo stack, swap the compose file: `docker compose -f docker-compose.demo.yml exec gateway …` + +--- + +## Keys + +### Create a key + +```bash +docker compose exec gateway neuronetz-gateway create-key --tenant acme --name prod-server-1 +# optional: --scopes chat,embeddings (default: chat,embeddings) +``` + +The **full key is printed exactly once** in the form `nz_…`. Store it +immediately in your secret manager — it is argon2id-hashed and cannot be recovered. Only the +12-char `prefix` is retained server-side. + +### List keys (never shows full keys) + +```bash +docker compose exec gateway neuronetz-gateway list-keys --tenant acme +# prints: status=active name='prod-server-1' created=… +``` + +### Revoke a key + +```bash +docker compose exec gateway neuronetz-gateway revoke-key --prefix nz_abc12345 +``` + +This sets the key status to `revoked` and writes the `gateway.revocations` outbox row. A +Postgres `NOTIFY` on channel `key_revoked` fires; the gateway evicts the key's Redis cache +entry, so revocation takes effect within ~1 second (SPEC §4.5) without restarting anything. +A subsequent request with that key returns **401**. + +> The console (`neuronetz-console`) revokes keys the same way — by inserting into +> `gateway.revocations`. The trigger-driven NOTIFY makes it immediate without any +> cross-service HTTP call. + +### Rotate a key + +There is no in-place rotate. Rotate by: create a new key → deploy it to the client → verify +traffic on the new prefix → revoke the old prefix. + +--- + +## Tenants & limits + +### Create a tenant + +```bash +docker compose exec gateway neuronetz-gateway create-tenant --name acme \ + --rpm 120 --tpm 200000 --concurrent 8 +# add --allow-all-models to opt into using any installed model (default: off) +``` + +Limits inherit **key → tenant**: a `NULL` key-level limit uses the tenant value. 
+ +--- + +## Budgets + +Set per-key token budgets (any combination of daily / monthly / total): + +```bash +docker compose exec gateway neuronetz-gateway set-budget --key nz_abc12345 \ + --daily 1000000 --monthly 30000000 --total 500000000 +``` + +- Budgets are enforced **fail-closed**: when the binding period hits zero remaining, requests + return **429** with a descriptive error and a `Retry-After` header. The binding period and + remaining balance are surfaced on every response via `X-Budget-Period` and + `X-Budget-Tokens-Remaining` (SPEC §6.5). +- Live counters live in Redis; the Postgres ledger (`gateway.budget_usage`) is the source of + truth on period rollover/reset. + +--- + +## Model policy + +### Set an explicit allowlist (default-deny) + +```bash +docker compose exec gateway neuronetz-gateway set-models --tenant acme \ + --models llama3.1:8b,mistral:7b +``` + +The tenant's **effective set** is `allowed_models ∩ discovered` — entries that aren't +actually installed on the backend silently never resolve. A request for a model outside the +effective set returns a generic **403** (same response as "doesn't exist" — no enumeration). + +### Toggle `allow_all_models` + +```bash +docker compose exec gateway neuronetz-gateway set-models --tenant acme --allow-all # opt in +docker compose exec gateway neuronetz-gateway set-models --tenant acme --no-allow-all # back to allowlist +``` + +With `allow_all_models` on, the effective set **is** the live discovered set — any model +pulled into Ollama becomes usable on the next discovery refresh, with no further config +change. This is an audited convenience; prefer explicit allowlists for untrusted tenants +(see [`THREAT_MODEL.md`](THREAT_MODEL.md)). 
+ +### Inspect discovery and effective sets + +```bash +docker compose exec gateway neuronetz-gateway list-models # live-discovered models +docker compose exec gateway neuronetz-gateway list-models --tenant acme # + that tenant's effective set +``` + +--- + +## Usage + +```bash +docker compose exec gateway neuronetz-gateway show-usage --tenant acme --period day +# prints: requests=… tokens_in=… tokens_out=… (period: day|month|total) +``` + +For per-key forensics and finer slicing, query `gateway.audit_log` directly (it records +`request_id`, `key_prefix`, `model`, `tokens_in/out`, `status`, `latency_ms`, `client_ip`). + +--- + +## How model discovery refresh works (SPEC §4.6) + +- A background task polls Ollama `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds and + caches the result in Redis (`gateway:models:discovered`, TTL `MODEL_DISCOVERY_CACHE_TTL_S`) + plus an in-process copy for hot reads. +- A model pulled into Ollama out-of-band appears in `allow_all_models` tenants' effective sets + within one refresh interval — no config change. +- Discovery is **read-only** and uses only the allowlisted `/api/tags` endpoint; it never + triggers a pull. +- To force a faster pickup, lower `MODEL_DISCOVERY_REFRESH_S` (the demo uses 15 s). + +--- + +## Fail-closed behaviors to expect + +| Symptom | Cause | Correct behavior | +|---|---|---| +| `503` on every request | Redis or Postgres-read down | Fail-closed — rate-limit/budget/auth can't be checked, so deny. Restore the backend. | +| `502` with retry-after | Ollama unreachable | Circuit breaker opens after 5 consecutive failures, half-opens after 30 s. Check the backend / `OLLAMA_BASE_URL`. | +| `403` for a model you "know" exists | Model not in the tenant's effective set, **or** discovery cache empty/expired | Check `list-models --tenant …`; verify the backend is reachable and the model is installed. Empty discovery = deny by design. 
| +| `429` with `Retry-After` | Rate limit or budget exhausted | Inspect headers (`X-RateLimit-*`, `X-Budget-*`); raise limits/budget or wait. | +| `401` immediately after revoke | Working as intended | Revocation propagated via NOTIFY → Redis eviction. | + +`/readyz` returns `503` when **any** dependency (DB, Redis, Ollama) is unreachable; use it as +the load-balancer health gate. `/healthz` only checks process liveness. + +--- + +## Logs, metrics, audit + +- **Logs:** structured (structlog), JSON in production, to stdout. Keys/secrets are never + logged. +- **Metrics:** Prometheus at `/metrics` (loopback only): `gateway_requests_total`, + `gateway_tokens_total`, `gateway_request_duration_seconds`, labelled by `tenant` and + `model` (never `key_id`). +- **Audit log:** always-on in `gateway.audit_log`. **Prompt log** is opt-in per key and TTL'd + (`PROMPT_LOG_DEFAULT_RETENTION_DAYS`); a sweeper enforces retention. diff --git a/docs/PLAYGROUND.md b/docs/PLAYGROUND.md new file mode 100644 index 0000000..7f2a581 --- /dev/null +++ b/docs/PLAYGROUND.md @@ -0,0 +1,113 @@ +# neuronetz-gateway — Demo & Playground + +The fastest way to see the gateway working end-to-end, with **no GPU and no model downloads**. +`./demo.sh` brings up the gateway against a mock Ollama backend, mints a demo API key, and +prints ready-to-paste curl commands and a link to an interactive browser playground. + +--- + +## Launch the demo + +From the repo root: + +```bash +./demo.sh +``` + +This will: + +1. Build and start the demo stack (`docker-compose.demo.yml`): **postgres + redis + + mock-ollama + gateway**. No Caddy; the gateway is published on `127.0.0.1:8080`. +2. Wait for the gateway to report healthy at `/healthz`. +3. Create a demo tenant (`--allow-all-models`) and an API key via the bootstrap CLI **inside + the gateway container**, capturing the key (which is printed exactly once). +4. 
Print a summary: the **API key**, the **playground URL** + `http://localhost:8080/playground`, and five ready-to-paste curl commands — + - streaming `/v1/chat/completions` (OpenAI SSE), + - streaming `/api/chat` (native NDJSON), + - `GET /v1/models`, + - a **401** example (no/bad key), + - a **403** example (`POST /api/pull`, hard-blocked). + +The script is **re-runnable**: an existing tenant is reused, and each run mints a fresh, +uniquely-named key (the full key only ever prints at creation). + +Tear everything down (containers + volumes): + +```bash +./demo.sh --down +``` + +### What's running + +| Service | Exposed? | Notes | +|---|---|---| +| `gateway` | `127.0.0.1:8080` | The real gateway image, built from the repo `Dockerfile`. | +| `mock-ollama` | **no** | Internal network only — mirrors the prod "Ollama is never exposed" rule. | +| `postgres` | **no** | Internal only. | +| `redis` | **no** | Internal only. | + +The mock backend (`demo/mock-ollama/`) emulates Ollama's API shapes — including realistic +`prompt_eval_count` / `eval_count` on the final stream object — so token counting, model +discovery, and `/api/show` sanitization all exercise real gateway code paths. It serves a +small catalogue: `llama3.1:8b`, `mistral:7b`, `qwen2.5:3b`, `nomic-embed-text`. + +--- + +## Use the playground + +Open **http://localhost:8080/playground** in a browser. It is a single self-contained HTML +page, served **same-origin** by the gateway (so no CORS to worry about). + +1. **Base URL** is pre-filled with the current origin; leave it as is for the demo. +2. Paste the **API key** from the `./demo.sh` output into the Bearer field. (Typing a key + auto-loads the model dropdown; you can also hit **↻ Refresh**.) +3. Pick an **endpoint** tab: `/v1/chat/completions`, `/api/chat`, `/api/generate`, + `/v1/models`, `/api/tags`, `/healthz`, `/readyz`. +4. Choose a **model** from the auto-populated dropdown, type a prompt, toggle **stream**. +5. Hit **▶ Run**. 
The streamed output renders **live** — SSE `data:` deltas (incl. `[DONE]`) + for `/v1/*`, NDJSON lines for `/api/*`. +6. The panel shows the **response status** and the rate-limit / budget **response headers** + (`X-Request-ID`, `X-RateLimit-*`, `X-Budget-*`; SPEC §6.5). +7. The **Exact curl** box mirrors precisely what **Run** sends — copy it to reproduce in a + terminal. + +Try the 403 path too: there's no mutating-endpoint tab by design, but the printed `curl` for +`POST /api/pull` shows the hard block, and an invalid key in the Bearer field demonstrates the +401 fail-closed response. + +--- + +## ⚠️ Security note: the playground is OFF by default in production + +The playground route is **flag-gated** and **disabled by default**. The demo stack turns it on +explicitly: + +```yaml +# docker-compose.demo.yml (gateway service) +GATEWAY_PLAYGROUND_ENABLED: "true" +GATEWAY_PLAYGROUND_FILE: /app/playground/index.html +``` + +with the file mounted read-only into the container: + +```yaml +volumes: + - ./playground:/app/playground:ro +``` + +The production stack (`docker-compose.yml`) does **not** set `GATEWAY_PLAYGROUND_ENABLED`, so +the route is absent. Do not enable it on a public deployment: it is a convenience for demos and +local development, not a production surface. Leaving it off keeps the public attack surface to +the documented API only. + +--- + +## Files behind the demo + +| Path | What it is | +|---|---| +| `demo.sh` | The one-command entrypoint (up / `--down`). | +| `docker-compose.demo.yml` | The demo stack definition. | +| `demo/mock-ollama/` | The standalone mock Ollama service (FastAPI app + Dockerfile). | +| `playground/index.html` | The self-contained browser playground served at `/playground`. | diff --git a/docs/THREAT_MODEL.md b/docs/THREAT_MODEL.md new file mode 100644 index 0000000..e3be383 --- /dev/null +++ b/docs/THREAT_MODEL.md @@ -0,0 +1,77 @@ +# neuronetz-gateway — Threat Model + +From [`scope-docs/SPEC.md`](../scope-docs/SPEC.md) §3. 
The governing principle, in one line: + +> **Fail closed, always.** If a security or budgeting check cannot be performed (Redis down, +> DB unreachable, ambiguous state), **deny** the request. Never default to allow. +> (AGENT_PROMPT non-negotiable #1.) + +The gateway exists because the Ollama instance at `api.neuronetz.ai` was exposed without +authentication — a standing security incident. Every defense below traces back to closing +that gap and keeping it closed. + +--- + +## Threats & mitigations (SPEC §3) + +| Threat | Mitigation | +|---|---| +| Internet scanners hitting Ollama directly | Ollama bound to the internal Docker network; **never published**. No `ports:` mapping in any shipped compose file. | +| Unauthenticated API abuse | Mandatory Bearer token; **fail-closed** on auth errors (401). | +| API key brute force | Argon2id hashing; constant-time compare; rate limit on auth failures per source IP (`AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN`). | +| GPU/token exhaustion (cost attack) | Per-key TPM + token budget; per-tenant ceiling; concurrent-connection cap. | +| Resource exhaustion via large payloads | Request body size limit (default 256 KiB); `num_predict` cap (default 4096). | +| Model enumeration / training-data exfil via uncommon models | Model allowlist, **default-deny**. Discovery only exposes models actually installed; `/api/tags` and `/v1/models` never reveal models outside the tenant's effective set; "not allowed" and "doesn't exist" return the **same** generic response. | +| Discovery backend unreachable | **Fail-closed:** an empty/stale-expired discovered set means no model resolves, so requests are denied — never "allow because we couldn't list models." | +| Ollama mutation (model pull/delete) by attacker | Endpoint allowlist; mutating endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) **hard-blocked** at the gateway, not configurable. 
| +| Information disclosure via error messages | Upstream errors **sanitized** at the boundary; Ollama internals never proxied to the client. Each error carries an `X-Request-ID` for correlation. | +| Audit log tampering | Append-only at the app layer; DB role separation; optional WAL archiving. | +| Prompt data leakage | Prompt logging **off by default**; opt-in per key; TTL'd retention; redaction hook. | +| Redis outage causing "fail open" | **Fail-closed:** if the rate-limit/budget backend is unavailable, deny (503), not allow. | +| Compromised admin token | There is **no admin endpoint** in the gateway. Admin lives in `neuronetz-console`; the gateway has nothing to compromise here. | + +--- + +## Notes on selected defenses + +### `allow_all_models` is an audited opt-in + +`allow_all_models` lets a tenant use any currently-installed model, so models newly pulled +into Ollama are auto-granted on the next discovery refresh. This is convenient but widens the +attack surface for *that tenant*, so it is: + +- **opt-in per tenant** (default `false`), set explicitly via the CLI + (`create-tenant --allow-all-models` or `set-models --allow-all`); +- **overridable per key** — a non-`NULL` key-level `allow_all_models` overrides the tenant + flag; otherwise the tenant flag applies (SPEC §13.7); +- **audited** — every request records the model used in `gateway.audit_log`. + +Default-deny tenants instead see only `allowed_models ∩ discovered`. Either way the effective +set is always intersected with the *live* discovered set, so stale or typo'd allowlist entries +never resolve. + +### No existence disclosure + +A model that is installed-but-unpermitted and a model that is not installed both return the +**same** generic `403`. An attacker cannot use the gateway to enumerate which models exist on +the backend (SPEC §13.6). + +### Sanitized errors + request IDs + +Clients never receive Ollama's error text, stack traces, or internal hostnames. 
Errors are +mapped to generic `4xx`/`5xx` JSON with a `request_id`. Operators correlate that ID with the +audit log to investigate without leaking internals to callers (SPEC §4.3 step 14). + +### Streaming integrity is also a safety property + +Token counting and audit writes happen **after** stream close, never on the hot path. This +keeps time-to-first-byte honest and ensures budget decrements and audit rows reflect the true +final token counts reported by Ollama (`prompt_eval_count` + `eval_count`), not estimates. + +--- + +## Out of scope (v0.1.0) + +Documented as future work, **not** mitigations present today: content moderation / +prompt-injection filtering, response caching, multi-backend routing, billing, SSO/OAuth2 for +admin, and any web admin UI (that lives in `neuronetz-console`). diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..944c53c --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,40 @@ +# mkdocs configuration for the neuronetz-gateway documentation. +# +# pip install mkdocs-material +# mkdocs serve # live preview at http://127.0.0.1:8000 +# mkdocs build # static site into ./site +# +# Docs live in docs/. This wires them into a single Material-themed site. +site_name: neuronetz-gateway +site_description: Secure, multi-tenant API gateway in front of Ollama. 
+docs_dir: docs + +theme: + name: material + palette: + - scheme: slate + primary: indigo + accent: indigo + features: + - navigation.sections + - navigation.top + - content.code.copy + - content.code.annotate + +markdown_extensions: + - admonition + - tables + - toc: + permalink: true + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.superfences + - pymdownx.inlinehilite + +nav: + - Architecture: ARCHITECTURE.md + - API Reference: API.md + - Deployment: DEPLOYMENT.md + - Threat Model: THREAT_MODEL.md + - Operations Runbook: OPERATIONS.md + - Demo & Playground: PLAYGROUND.md diff --git a/playground/index.html b/playground/index.html new file mode 100644 index 0000000..da37760 --- /dev/null +++ b/playground/index.html @@ -0,0 +1,716 @@ + + + + + +neuronetz-gateway · playground + + + +
+ +
+

neuronetz-gateway · playground

+
Authenticated, rate-limited, audited access to the model backend
+
+
+
same-origin
+
+ +
+ +
+

Request

+ + +
+ + +
+ + + +
Created by ./demo.sh and printed once in your terminal.
+ + +
+ +
+ +
+ + +
+
+ +
+ + + +
+ + +
+
+ + +
+
+
+

POST /v1/chat/completions

+
+ POST + auth: bearer + streams · SSE +
+

+
Sample request body
+

+      
Sample response
+

+      
+
+ +
+
+

Response

+
+ idle +
+
Run a request to see the response stream here.
+ +
+ +
+
+

Exact curl

+ +
+

+      
This is exactly what Run sends — paste it into a terminal to reproduce.
+
+
+
+ + + +