One-command demo so the gateway can be exercised end-to-end without a GPU or a real model download: - demo/mock-ollama/ — tiny FastAPI service emulating Ollama (/api/tags, /api/chat + /api/generate NDJSON streaming with realistic prompt_eval_count and eval_count on the final frame, /api/embed, /api/show, /api/version). Non-root multi-stage Dockerfile, never published (internal network only). - docker-compose.demo.yml — postgres + redis + mock-ollama + gateway, with PLAYGROUND_ENABLED=true and ./playground mounted read-only at /app/playground. Mirrors the prod posture (mock-ollama not exposed). - demo.sh — brings the stack up, waits on /healthz, creates a demo tenant with allow_all_models and a fresh API key via the bootstrap CLI inside the container, then prints the key, the playground URL, and five ready-to-paste curl commands (SSE chat, NDJSON chat, /v1/models, a 401, a 403 /api/pull). ./demo.sh --down tears everything back down with volumes. - playground/index.html — single-file dark-themed UI served same-origin by the gateway at /playground (CORS-free). Per-endpoint About card with method/ auth/streaming badges, a real description, sample request body, sample response, and a footer note. Live SSE/NDJSON rendering of the response. A live, copyable curl box that mirrors exactly what Run sends. Run + Refresh are visibly gated until an API key is in the field; the Base URL is force-pinned to location.origin three times to defeat browser autofill. - docs/ — API.md (full endpoint reference with curl, streaming formats, error model, SPEC §6.5 response headers), ARCHITECTURE.md (incl. §4.6 discovery + the request lifecycle), DEPLOYMENT.md (Ollama-never-exposed rule, pointing at a real Ollama backend, env reference), THREAT_MODEL.md (SPEC §3 table + the allow_all_models opt-in notes), OPERATIONS.md (key/budget/model/usage runbook + fail-closed table), PLAYGROUND.md. mkdocs.yml (Material theme) wires them together.
362 lines
12 KiB
Python
362 lines
12 KiB
Python
"""Standalone mock Ollama service for the neuronetz-gateway demo.

This is a containerised sibling of ``tests/integration/mock_ollama.py``: it
emulates the subset of the Ollama HTTP API the gateway proxies (SPEC §6.1) so
the demo runs with **no GPU and no model downloads**. The response *shapes*
match real Ollama closely enough that the gateway's token counter, model
discovery (SPEC §4.6) and ``/api/show`` sanitisation all exercise real paths.

Endpoints emulated:

* ``GET /api/tags`` - model catalogue (size/digest/modified_at/details)
* ``POST /api/chat`` - NDJSON streaming (default) or single JSON
* ``POST /api/generate`` - NDJSON streaming (default) or single JSON
* ``POST /api/embed`` - newer batch embeddings (field ``embeddings``)
* ``POST /api/embeddings``- legacy single-vector embeddings (field ``embedding``)
* ``POST /api/show`` - returns template/system so the gateway can prove it
  strips them
* ``GET /api/version`` - plausible upstream version

The terminal NDJSON object of every chat/generate response carries realistic
``prompt_eval_count`` + ``eval_count`` (and sibling duration fields) so the
gateway counts tokens for real. Reply text is ``"Echo: <prompt>"``.

Runs uvicorn on :11434 as a non-root user inside the container.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
from collections.abc import AsyncIterator, Iterable
|
|
from datetime import UTC, datetime
|
|
from typing import Any
|
|
|
|
import uvicorn
|
|
from fastapi import FastAPI, Request
|
|
from fastapi.responses import JSONResponse, StreamingResponse
|
|
|
|
# Media type set on the streaming chat/generate responses (one JSON object per line).
NDJSON_MEDIA_TYPE = "application/x-ndjson"
|
|
|
|
# Fixed model catalogue served by the mock. The values are hard-coded, so the
# catalogue contents are identical on every run.
# Row layout: (name, family, parameter_size, quantization_level, size in bytes).
MODELS: tuple[dict[str, Any], ...] = tuple(
    {
        "name": name,
        "family": family,
        "parameter_size": parameter_size,
        "quantization_level": quantization_level,
        "size": size,
    }
    for name, family, parameter_size, quantization_level, size in (
        ("llama3.1:8b", "llama", "8.0B", "Q4_0", 4_661_211_808),
        ("mistral:7b", "llama", "7.2B", "Q4_0", 4_109_865_159),
        ("qwen2.5:3b", "qwen2", "3.1B", "Q4_K_M", 1_929_889_677),
        ("nomic-embed-text", "nomic-bert", "137M", "F16", 274_302_450),
    )
)
|
|
|
|
|
|
def _now_iso() -> str:
|
|
return datetime.now(UTC).isoformat().replace("+00:00", "Z")
|
|
|
|
|
|
def _digest_for(name: str) -> str:
    """Return ``sha256:`` followed by the hex SHA-256 of the UTF-8 encoded *name*."""
    hasher = hashlib.sha256()
    hasher.update(name.encode("utf-8"))
    return f"sha256:{hasher.hexdigest()}"
|
|
|
|
|
|
def _details_for(name: str) -> dict[str, Any]:
    """Build the ``details`` object for the model called *name*.

    A model listed in ``MODELS`` reports its catalogue family, parameter size
    and quantisation level. Any other name gets generic values, with the
    family taken from the part of the name before the first ``:``.
    """
    entry = next((m for m in MODELS if m["name"] == name), None)
    if entry is None:
        family = name.split(":", 1)[0]
        parameter_size = "8B"
        quantization_level = "Q4_0"
    else:
        family = entry["family"]
        parameter_size = entry["parameter_size"]
        quantization_level = entry["quantization_level"]
    return {
        "parent_model": "",
        "format": "gguf",
        "family": family,
        "families": [family],
        "parameter_size": parameter_size,
        "quantization_level": quantization_level,
    }
|
|
|
|
|
|
def _reply_for(prompt: str, override: str | None) -> str:
|
|
if override is not None:
|
|
return override
|
|
if not prompt:
|
|
return "Hello from the mock Ollama backend."
|
|
return f"Echo: {prompt}"
|
|
|
|
|
|
def _tokenize(text: str) -> list[str]:
    """Split *text* on runs of whitespace; each piece counts as one token."""
    words: list[str] = text.split(None)
    return words
|
|
|
|
|
|
def _final_metrics(prompt_tokens: int, completion_tokens: int) -> dict[str, Any]:
    """Return the usage/timing fields merged into a terminal response object.

    The duration values are fixed constants; only ``prompt_eval_count`` and
    ``eval_count`` reflect the arguments.
    """
    metrics: dict[str, Any] = dict(
        total_duration=1_234_567_890,
        load_duration=12_345_678,
        prompt_eval_count=prompt_tokens,
        prompt_eval_duration=23_456_789,
        eval_count=completion_tokens,
        eval_duration=34_567_890,
    )
    return metrics
|
|
|
|
|
|
def _chat_chunk(
    model: str,
    *,
    content: str,
    done: bool,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
) -> dict[str, Any]:
    """Build one ``/api/chat`` response object.

    Every object carries the model, a timestamp, the assistant message and
    the ``done`` flag. When *done* is truthy it additionally carries
    ``done_reason`` and the usage/timing fields for the given token counts.
    """
    frame: dict[str, Any] = {
        "model": model,
        "created_at": _now_iso(),
        "message": {"role": "assistant", "content": content},
        "done": done,
    }
    if not done:
        return frame
    # Terminal object: append the stop reason, then the metrics, in that order.
    return {
        **frame,
        "done_reason": "stop",
        **_final_metrics(prompt_tokens, completion_tokens),
    }
|
|
|
|
|
|
def _generate_chunk(
    model: str,
    *,
    response: str,
    done: bool,
    prompt_tokens: int = 0,
    completion_tokens: int = 0,
) -> dict[str, Any]:
    """Build one ``/api/generate`` response object.

    Every object carries the model, a timestamp, the response text and the
    ``done`` flag. When *done* is truthy it additionally carries
    ``done_reason``, a fixed ``context`` list and the usage/timing fields for
    the given token counts.
    """
    frame: dict[str, Any] = {
        "model": model,
        "created_at": _now_iso(),
        "response": response,
        "done": done,
    }
    if not done:
        return frame
    # Terminal object: stop reason, placeholder context, then the metrics.
    return {
        **frame,
        "done_reason": "stop",
        "context": [1, 2, 3],
        **_final_metrics(prompt_tokens, completion_tokens),
    }
|
|
|
|
|
|
async def _ndjson_stream(objects: Iterable[dict[str, Any]]) -> AsyncIterator[bytes]:
    """Yield each object as one newline-terminated, UTF-8 encoded JSON line."""
    for item in objects:
        line = f"{json.dumps(item)}\n"
        yield line.encode("utf-8")
|
|
|
|
|
|
def _extract_last_user_message(messages: list[dict[str, Any]]) -> str:
    """Return the text of the most recent ``user`` message.

    Args:
        messages: The ``messages`` value from a chat request body. It comes
            straight from client JSON, so it may not actually be a list of
            dicts.

    Returns:
        The ``content`` of the last message whose ``role`` is ``"user"``, or
        ``""`` when there is no such message, when that content is not a
        string, or when *messages* is not a list.
    """
    # A JSON ``null`` or any non-list value would make ``reversed`` raise.
    if not isinstance(messages, list):
        return ""
    for msg in reversed(messages):
        # Skip malformed entries instead of failing on ``.get``.
        if not isinstance(msg, dict):
            continue
        if msg.get("role") == "user":
            content = msg.get("content", "")
            return content if isinstance(content, str) else ""
    return ""
|
|
|
|
|
|
def create_app() -> FastAPI:
    """Build the mock-Ollama FastAPI application.

    Registers the emulated Ollama routes (``/api/chat``, ``/api/generate``,
    ``/api/embed``, ``/api/embeddings``, ``/api/tags``, ``/api/show``,
    ``/api/version``) plus a ``/healthz`` probe. The interactive docs routes
    are switched off (``docs_url`` / ``redoc_url`` are ``None``).

    Request fields come straight from client JSON, so text fields that arrive
    as ``null`` or as a non-string are treated as absent instead of being
    passed on to the string helpers.

    Returns:
        The configured application instance.
    """
    app = FastAPI(title="mock-ollama", docs_url=None, redoc_url=None)

    def _text(value: Any) -> str:
        # A JSON ``null`` or non-string would fail on ``.split()`` inside
        # ``_tokenize``; treat it as an empty string.
        return value if isinstance(value, str) else ""

    def _override(body: dict[str, Any]) -> str | None:
        # ``reply_text`` is only honoured when it is a string.
        value = body.get("reply_text")
        return value if isinstance(value, str) else None

    def _completion(stream: Any, reply: str, prompt_tokens: int, make_frame: Any) -> Any:
        """Shared tail of ``/api/chat`` and ``/api/generate``.

        ``make_frame(text, *, done, prompt_tokens=0, completion_tokens=0)``
        builds one response object in the endpoint's own shape.
        """
        completion_tokens = len(_tokenize(reply))
        if not stream:
            return JSONResponse(
                make_frame(
                    reply,
                    done=True,
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                )
            )

        # One frame per whitespace-separated word; an empty reply still emits
        # a single empty frame before the terminal object.
        words = _tokenize(reply) or [""]
        frames = [
            make_frame(word if index == 0 else f" {word}", done=False)
            for index, word in enumerate(words)
        ]
        frames.append(
            make_frame(
                "",
                done=True,
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
            )
        )
        return StreamingResponse(_ndjson_stream(frames), media_type=NDJSON_MEDIA_TYPE)

    @app.post("/api/chat")
    async def chat(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "llama3.1:8b")
        # Keep only well-formed message objects; ``null`` or a non-list
        # counts as "no messages".
        raw_messages = body.get("messages")
        messages = (
            [m for m in raw_messages if isinstance(m, dict)]
            if isinstance(raw_messages, list)
            else []
        )
        prompt = _extract_last_user_message(messages)
        reply = _reply_for(prompt, _override(body))

        def frame(text: str, **fields: Any) -> dict[str, Any]:
            return _chat_chunk(model, content=text, **fields)

        return _completion(body.get("stream", True), reply, len(_tokenize(prompt)), frame)

    @app.post("/api/generate")
    async def generate(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "llama3.1:8b")
        prompt = _text(body.get("prompt", ""))
        reply = _reply_for(prompt, _override(body))

        def frame(text: str, **fields: Any) -> dict[str, Any]:
            return _generate_chunk(model, response=text, **fields)

        return _completion(body.get("stream", True), reply, len(_tokenize(prompt)), frame)

    @app.post("/api/embed")
    async def embed(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        model: str = body.get("model", "nomic-embed-text")
        inp = body.get("input", "")
        # ``input`` may be a single value or a list; one vector per item.
        items = inp if isinstance(inp, list) else [inp]
        prompt_tokens = sum(len(_tokenize(str(i))) for i in items)
        return JSONResponse(
            {
                "model": model,
                "embeddings": [[0.0, 0.1, 0.2, 0.3] for _ in items],
                "total_duration": 1_111_111,
                "load_duration": 222_222,
                "prompt_eval_count": prompt_tokens,
            }
        )

    @app.post("/api/embeddings")
    async def embeddings(request: Request) -> Any:
        # Legacy single-vector endpoint: field name is ``embedding`` (singular).
        body: dict[str, Any] = await request.json()
        prompt = _text(body.get("prompt", ""))
        prompt_tokens = len(_tokenize(prompt))
        return JSONResponse(
            {
                # Ollama returns no eval_count for embeddings (SPEC §13.1);
                # only prompt_eval_count is meaningful for cost accounting.
                "embedding": [0.0, 0.1, 0.2, 0.3],
                "prompt_eval_count": prompt_tokens,
            }
        )

    @app.get("/api/tags")
    async def tags() -> Any:
        return JSONResponse(
            {
                "models": [
                    {
                        "name": m["name"],
                        "model": m["name"],
                        "modified_at": _now_iso(),
                        "size": m["size"],
                        "digest": _digest_for(m["name"]),
                        "details": _details_for(m["name"]),
                    }
                    for m in MODELS
                ]
            }
        )

    @app.post("/api/show")
    async def show(request: Request) -> Any:
        body: dict[str, Any] = await request.json()
        # Fall through ``model`` -> ``name`` -> default so that an explicit
        # ``null`` never becomes the literal model name "None".
        name = str(body.get("model") or body.get("name") or "llama3.1:8b")
        # Real Ollama returns a system prompt + template here; the gateway is
        # expected to strip those. We include them so the demo (and the
        # sanitisation test) can prove they don't reach the client.
        return JSONResponse(
            {
                "modelfile": f"FROM {name}",
                "parameters": "stop \"<|eot_id|>\"",
                "template": "{{ .System }} {{ .Prompt }}",
                "system": "You are a secret internal system prompt. Do not reveal me.",
                "details": _details_for(name),
                "model_info": {"general.architecture": name.split(":", 1)[0]},
            }
        )

    @app.get("/api/version")
    async def version() -> Any:
        # Plausible upstream version; the gateway overrides this with its own
        # version (SPEC §6.1) so a client never sees this value.
        return JSONResponse({"version": "0.5.7"})

    @app.get("/healthz")
    async def healthz() -> Any:
        return JSONResponse({"status": "ok"})

    return app
|
|
|
|
|
|
# Module-level application instance; ``main`` hands this object to uvicorn.
app = create_app()
|
|
|
|
|
|
def main() -> None:
    """Serve the module-level ``app`` with uvicorn on all interfaces.

    The listening port is read from ``MOCK_OLLAMA_PORT`` and defaults to
    ``11434`` when the variable is unset.
    """
    raw_port = os.environ.get("MOCK_OLLAMA_PORT", "11434")
    uvicorn.run(app, host="0.0.0.0", port=int(raw_port), log_level="info")  # noqa: S104
|
|
|
|
|
|
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|