demo + playground + docs

One-command demo so the gateway can be exercised end-to-end without a GPU or a
real model download:

- demo/mock-ollama/ — tiny FastAPI service emulating Ollama (/api/tags,
  /api/chat + /api/generate NDJSON streaming with realistic prompt_eval_count
  and eval_count on the final frame, /api/embed, /api/show, /api/version).
  Non-root multi-stage Dockerfile; its port is never published to the host
  (reachable on the internal Docker network only).
- docker-compose.demo.yml — postgres + redis + mock-ollama + gateway, with
  PLAYGROUND_ENABLED=true and ./playground mounted read-only at /app/playground.
  Mirrors the prod posture (mock-ollama not exposed).
- demo.sh — brings the stack up, waits on /healthz, creates a demo tenant with
  allow_all_models and a fresh API key via the bootstrap CLI inside the
  container, then prints the key, the playground URL, and five ready-to-paste
  curl commands (SSE chat, NDJSON chat, /v1/models, a 401, a 403 /api/pull).
  ./demo.sh --down tears everything back down with volumes.
- playground/index.html — single-file dark-themed UI served same-origin by
  the gateway at /playground (CORS-free). Per-endpoint About card with method/
  auth/streaming badges, a real description, sample request body, sample
  response, and a footer note. Live SSE/NDJSON rendering of the response.
  A live, copyable curl box that mirrors exactly what Run sends. Run + Refresh
  are visibly gated until an API key is in the field; the Base URL is
  force-pinned to location.origin three times to defeat browser autofill.
- docs/ — API.md (full endpoint reference with curl, streaming formats, error
  model, SPEC §6.5 response headers), ARCHITECTURE.md (incl. §4.6 discovery
  + the request lifecycle), DEPLOYMENT.md (Ollama-never-exposed rule,
  pointing at a real Ollama backend, env reference), THREAT_MODEL.md
  (SPEC §3 table + the allow_all_models opt-in notes), OPERATIONS.md
  (key/budget/model/usage runbook + fail-closed table), PLAYGROUND.md.
  mkdocs.yml (Material theme) wires them together.
This commit is contained in:
Stephan Berbig
2026-05-26 20:52:33 +02:00
parent 844b02aade
commit b47a09db91
13 changed files with 2501 additions and 0 deletions

View File

@@ -0,0 +1,61 @@
# syntax=docker/dockerfile:1.7
#
# mock-ollama — a tiny FastAPI app emulating the Ollama HTTP API for the demo.
#
# builder stage : installs deps into a self-contained virtualenv.
# runtime stage : copies the venv + app, drops to a NON-ROOT user, no build
# tools, runs uvicorn on :11434.
#
# This image exists ONLY for the demo stack (docker-compose.demo.yml). It lets
# the demo run with no GPU and no model downloads. It is never published to the
# host — like real Ollama, it is reachable only on the internal Docker network.
# ----------------------------------------------------------------------------
# Stage 1 — builder
# ----------------------------------------------------------------------------
FROM python:3.12-slim AS builder
# Quieten pip and put the virtualenv's bin directory first on PATH so the bare
# `pip` below installs into /opt/venv rather than the system interpreter.
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    VIRTUAL_ENV=/opt/venv \
    PATH=/opt/venv/bin:$PATH
# Create the virtualenv that the runtime stage later copies wholesale.
RUN python -m venv /opt/venv
WORKDIR /app
# Only requirements.txt is needed in this stage; app.py is copied in the
# runtime stage.
COPY requirements.txt ./
RUN pip install -r requirements.txt
# ----------------------------------------------------------------------------
# Stage 2 — runtime
# ----------------------------------------------------------------------------
FROM python:3.12-slim AS runtime
# curl is used by the compose healthcheck and by the HEALTHCHECK below.
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*
# Non-root user with a fixed UID/GID (10001), no login shell, home set to /app.
RUN groupadd --system --gid 10001 mock \
    && useradd --system --uid 10001 --gid mock --home-dir /app --shell /usr/sbin/nologin mock
# Same venv location and PATH as the builder stage so the copied virtualenv
# resolves. MOCK_OLLAMA_PORT is read by app.py and by the HEALTHCHECK below.
ENV VIRTUAL_ENV=/opt/venv \
    PATH=/opt/venv/bin:$PATH \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    MOCK_OLLAMA_PORT=11434
WORKDIR /app
# Bring over the installed dependencies, then the single application module.
COPY --from=builder /opt/venv /opt/venv
COPY app.py ./
# Switch to the unprivileged user before the server command is declared.
USER mock
EXPOSE 11434
# Shell-form CMD so ${MOCK_OLLAMA_PORT} is expanded when the check runs.
HEALTHCHECK --interval=10s --timeout=3s --start-period=5s --retries=5 \
    CMD curl -fsS "http://127.0.0.1:${MOCK_OLLAMA_PORT}/api/version" || exit 1
# Runs app.py as __main__, whose main() starts uvicorn.
CMD ["python", "-m", "app"]

361
demo/mock-ollama/app.py Normal file
View File

@@ -0,0 +1,361 @@
"""Standalone mock Ollama service for the neuronetz-gateway demo.
This is a containerised sibling of ``tests/integration/mock_ollama.py``: it
emulates the subset of the Ollama HTTP API the gateway proxies (SPEC §6.1) so
the demo runs with **no GPU and no model downloads**. The response *shapes*
match real Ollama closely enough that the gateway's token counter, model
discovery (SPEC §4.6) and ``/api/show`` sanitisation all exercise real paths.
Endpoints emulated:
* ``GET /api/tags`` - model catalogue (size/digest/modified_at/details)
* ``POST /api/chat`` - NDJSON streaming (default) or single JSON
* ``POST /api/generate`` - NDJSON streaming (default) or single JSON
* ``POST /api/embed`` - newer batch embeddings (field ``embeddings``)
* ``POST /api/embeddings``- legacy single-vector embeddings (field ``embedding``)
* ``POST /api/show`` - returns template/system so the gateway can prove it
strips them
* ``GET /api/version`` - plausible upstream version
* ``GET /healthz`` - static ``{"status": "ok"}`` response
The terminal NDJSON object of every chat/generate response carries realistic
``prompt_eval_count`` + ``eval_count`` (and sibling duration fields) so the
gateway counts tokens for real. Reply text is ``"Echo: <prompt>"``.
Runs uvicorn on :11434 as a non-root user inside the container.
"""
from __future__ import annotations
import hashlib
import json
import os
from collections.abc import AsyncIterator, Iterable
from datetime import UTC, datetime
from typing import Any
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
# Content type sent with every streaming chat/generate response.
NDJSON_MEDIA_TYPE = "application/x-ndjson"

# A small, realistic catalogue. Sizes/digests are plausible but fixed so the
# demo is fully deterministic.
#
# Keys of each entry and where this module uses them:
#   name                 - listed by /api/tags and matched by _details_for
#   family,
#   parameter_size,
#   quantization_level   - copied into the "details" object by _details_for
#   size                 - reported as "size" by /api/tags
MODELS: tuple[dict[str, Any], ...] = (
    {
        "name": "llama3.1:8b",
        "family": "llama",
        "parameter_size": "8.0B",
        "quantization_level": "Q4_0",
        "size": 4_661_211_808,
    },
    {
        "name": "mistral:7b",
        "family": "llama",
        "parameter_size": "7.2B",
        "quantization_level": "Q4_0",
        "size": 4_109_865_159,
    },
    {
        "name": "qwen2.5:3b",
        "family": "qwen2",
        "parameter_size": "3.1B",
        "quantization_level": "Q4_K_M",
        "size": 1_929_889_677,
    },
    {
        "name": "nomic-embed-text",
        "family": "nomic-bert",
        "parameter_size": "137M",
        "quantization_level": "F16",
        "size": 274_302_450,
    },
)
def _now_iso() -> str:
return datetime.now(UTC).isoformat().replace("+00:00", "Z")
def _digest_for(name: str) -> str:
return "sha256:" + hashlib.sha256(name.encode("utf-8")).hexdigest()
def _details_for(name: str) -> dict[str, Any]:
    """Build the ``details`` object for the model called *name*.

    A model found in ``MODELS`` uses its catalogue values. Any other name gets
    generic values, with the family taken from the part of *name* before the
    first ``:``.
    """
    entry = next((m for m in MODELS if m["name"] == name), None)
    if entry is None:
        family = name.split(":", 1)[0]
        parameter_size = "8B"
        quantization_level = "Q4_0"
    else:
        family = entry["family"]
        parameter_size = entry["parameter_size"]
        quantization_level = entry["quantization_level"]
    return {
        "parent_model": "",
        "format": "gguf",
        "family": family,
        "families": [family],
        "parameter_size": parameter_size,
        "quantization_level": quantization_level,
    }
def _reply_for(prompt: str, override: str | None) -> str:
if override is not None:
return override
if not prompt:
return "Hello from the mock Ollama backend."
return f"Echo: {prompt}"
def _tokenize(text: str) -> list[str]:
return text.split()
def _final_metrics(prompt_tokens: int, completion_tokens: int) -> dict[str, Any]:
"""Timing/usage fields Ollama attaches to the terminal stream object."""
return {
"total_duration": 1_234_567_890,
"load_duration": 12_345_678,
"prompt_eval_count": prompt_tokens,
"prompt_eval_duration": 23_456_789,
"eval_count": completion_tokens,
"eval_duration": 34_567_890,
}
def _chat_chunk(
    model: str,
    *,
    content: str,
    done: bool,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
) -> dict[str, Any]:
    """Build one ``/api/chat`` response object.

    Every object carries the model name, a timestamp, an assistant message
    holding *content*, and the ``done`` flag. When *done* is true the object
    also gets ``done_reason`` and the usage/timing metrics; the token counts
    are ignored otherwise.
    """
    frame: dict[str, Any] = {
        "model": model,
        "created_at": _now_iso(),
        "message": {"role": "assistant", "content": content},
        "done": done,
    }
    if not done:
        return frame
    frame["done_reason"] = "stop"
    for key, value in _final_metrics(prompt_tokens, completion_tokens).items():
        frame[key] = value
    return frame
def _generate_chunk(
    model: str,
    *,
    response: str,
    done: bool,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
) -> dict[str, Any]:
    """Build one ``/api/generate`` response object.

    Every object carries the model name, a timestamp, the *response* text and
    the ``done`` flag. When *done* is true the object also gets
    ``done_reason``, a fixed ``context`` list and the usage/timing metrics;
    the token counts are ignored otherwise.
    """
    frame: dict[str, Any] = {
        "model": model,
        "created_at": _now_iso(),
        "response": response,
        "done": done,
    }
    if not done:
        return frame
    frame["done_reason"] = "stop"
    frame["context"] = [1, 2, 3]
    for key, value in _final_metrics(prompt_tokens, completion_tokens).items():
        frame[key] = value
    return frame
async def _ndjson_stream(objects: Iterable[dict[str, Any]]) -> AsyncIterator[bytes]:
for obj in objects:
yield (json.dumps(obj) + "\n").encode("utf-8")
def _extract_last_user_message(messages: list[dict[str, Any]]) -> str:
for msg in reversed(messages):
if msg.get("role") == "user":
content = msg.get("content", "")
return content if isinstance(content, str) else ""
return ""
def _text_or_empty(value: Any) -> str:
    """Return *value* when it is a string, otherwise ``""``.

    Used for request fields that must be text, so that JSON ``null`` (or a
    value of the wrong type) is treated like an omitted field instead of
    crashing the handler when the text is tokenised.
    """
    return value if isinstance(value, str) else ""


def _stream_requested(body: dict[str, Any]) -> bool:
    """Return whether the request asks for a streamed response.

    A missing ``stream`` key and an explicit JSON ``null`` both select the
    default, which is to stream. Any other value is judged by truthiness.
    """
    flag = body.get("stream")
    return True if flag is None else bool(flag)


def _spaced_pieces(reply: str) -> list[str]:
    """Split *reply* into the per-frame text pieces of a streamed response.

    The first word is emitted as-is and every later word is prefixed with a
    single space, so concatenating the pieces yields the words joined by
    single spaces. A reply with no words yields one empty piece.
    """
    words = _tokenize(reply) or [""]
    return [word if i == 0 else f" {word}" for i, word in enumerate(words)]


def create_app() -> FastAPI:
    """Build the mock Ollama FastAPI application.

    Routes registered:

    * ``POST /api/chat`` and ``POST /api/generate`` - echo replies, streamed
      as NDJSON unless ``stream`` is falsy; the final object carries the
      token counts.
    * ``POST /api/embed`` and ``POST /api/embeddings`` - fixed vectors plus
      ``prompt_eval_count``.
    * ``GET /api/tags``, ``POST /api/show``, ``GET /api/version``,
      ``GET /healthz`` - static or catalogue-derived JSON.

    Explicit JSON ``null`` for ``stream``, ``prompt`` or ``messages`` is
    handled like an omitted field rather than changing the response mode or
    failing the request.

    Returns:
        The configured application, with the interactive docs disabled.
    """
    app = FastAPI(title="mock-ollama", docs_url=None, redoc_url=None)

    @app.post("/api/chat")
    async def chat(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "llama3.1:8b")
        reply_override: str | None = body.get("reply_text")
        # ``or []`` also covers an explicit ``"messages": null``.
        prompt = _extract_last_user_message(body.get("messages") or [])
        reply = _reply_for(prompt, reply_override)
        prompt_tokens = len(_tokenize(prompt))
        completion_tokens = len(_tokenize(reply))
        if not _stream_requested(body):
            return JSONResponse(
                _chat_chunk(
                    model,
                    content=reply,
                    done=True,
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                )
            )
        # One frame per word, then a terminal frame carrying the usage metrics.
        frames = [
            _chat_chunk(model, content=piece, done=False)
            for piece in _spaced_pieces(reply)
        ]
        frames.append(
            _chat_chunk(
                model,
                content="",
                done=True,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
            )
        )
        return StreamingResponse(_ndjson_stream(frames), media_type=NDJSON_MEDIA_TYPE)

    @app.post("/api/generate")
    async def generate(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "llama3.1:8b")
        # A null or non-string prompt is treated as an empty prompt.
        prompt = _text_or_empty(body.get("prompt"))
        reply = _reply_for(prompt, body.get("reply_text"))
        prompt_tokens = len(_tokenize(prompt))
        completion_tokens = len(_tokenize(reply))
        if not _stream_requested(body):
            return JSONResponse(
                _generate_chunk(
                    model,
                    response=reply,
                    done=True,
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                )
            )
        # One frame per word, then a terminal frame carrying the usage metrics.
        frames = [
            _generate_chunk(model, response=piece, done=False)
            for piece in _spaced_pieces(reply)
        ]
        frames.append(
            _generate_chunk(
                model,
                response="",
                done=True,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
            )
        )
        return StreamingResponse(_ndjson_stream(frames), media_type=NDJSON_MEDIA_TYPE)

    @app.post("/api/embed")
    async def embed(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "nomic-embed-text")
        inp = body.get("input", "")
        # ``input`` may be a single value or a list; normalise to a list.
        items = inp if isinstance(inp, list) else [inp]
        prompt_tokens = sum(len(_tokenize(str(i))) for i in items)
        return JSONResponse(
            {
                "model": model,
                "embeddings": [[0.0, 0.1, 0.2, 0.3] for _ in items],
                "total_duration": 1_111_111,
                "load_duration": 222_222,
                "prompt_eval_count": prompt_tokens,
            }
        )

    @app.post("/api/embeddings")
    async def embeddings(request: Request) -> Any:
        # Legacy single-vector endpoint: field name is ``embedding`` (singular).
        body: dict[str, Any] = await request.json()
        # A null or non-string prompt is treated as an empty prompt.
        prompt = _text_or_empty(body.get("prompt"))
        prompt_tokens = len(_tokenize(prompt))
        return JSONResponse(
            {
                # Ollama returns no eval_count for embeddings (SPEC §13.1);
                # only prompt_eval_count is meaningful for cost accounting.
                "embedding": [0.0, 0.1, 0.2, 0.3],
                "prompt_eval_count": prompt_tokens,
            }
        )

    @app.get("/api/tags")
    async def tags() -> Any:
        return JSONResponse(
            {
                "models": [
                    {
                        "name": m["name"],
                        "model": m["name"],
                        "modified_at": _now_iso(),
                        "size": m["size"],
                        "digest": _digest_for(m["name"]),
                        "details": _details_for(m["name"]),
                    }
                    for m in MODELS
                ]
            }
        )

    @app.post("/api/show")
    async def show(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        name = body.get("model") or body.get("name", "llama3.1:8b")
        # Real Ollama returns a system prompt + template here; the gateway is
        # expected to strip those. We include them so the demo (and the
        # sanitisation test) can prove they don't reach the client.
        return JSONResponse(
            {
                "modelfile": f"FROM {name}",
                "parameters": "stop \"<|eot_id|>\"",
                "template": "{{ .System }} {{ .Prompt }}",
                "system": "You are a secret internal system prompt. Do not reveal me.",
                "details": _details_for(str(name)),
                "model_info": {"general.architecture": str(name).split(":", 1)[0]},
            }
        )

    @app.get("/api/version")
    async def version() -> Any:
        # Plausible upstream version; the gateway overrides this with its own
        # version (SPEC §6.1) so a client never sees this value.
        return JSONResponse({"version": "0.5.7"})

    @app.get("/healthz")
    async def healthz() -> Any:
        return JSONResponse({"status": "ok"})

    return app
# Module-level application instance; main() hands it to uvicorn.run.
app = create_app()
def main() -> None:
    """Serve the module-level ``app`` with uvicorn on all interfaces.

    The port comes from the ``MOCK_OLLAMA_PORT`` environment variable and
    falls back to 11434 when it is unset.
    """
    listen_port = int(os.getenv("MOCK_OLLAMA_PORT", "11434"))
    uvicorn.run(app, host="0.0.0.0", port=listen_port, log_level="info")  # noqa: S104


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,2 @@
fastapi==0.115.6
uvicorn[standard]==0.34.0