One-command demo so the gateway can be exercised end-to-end without a GPU or a real model download:

- demo/mock-ollama/ — tiny FastAPI service emulating Ollama (/api/tags, /api/chat + /api/generate NDJSON streaming with realistic prompt_eval_count and eval_count on the final frame, /api/embed, /api/show, /api/version). Non-root multi-stage Dockerfile, never published (internal network only).
- docker-compose.demo.yml — postgres + redis + mock-ollama + gateway, with PLAYGROUND_ENABLED=true and ./playground mounted read-only at /app/playground. Mirrors the prod posture (mock-ollama not exposed).
- demo.sh — brings the stack up, waits on /healthz, creates a demo tenant with allow_all_models and a fresh API key via the bootstrap CLI inside the container, then prints the key, the playground URL, and five ready-to-paste curl commands (SSE chat, NDJSON chat, /v1/models, a 401, a 403 /api/pull). ./demo.sh --down tears everything back down with volumes.
- playground/index.html — single-file dark-themed UI served same-origin by the gateway at /playground (CORS-free). Per-endpoint About card with method/auth/streaming badges, a real description, sample request body, sample response, and a footer note. Live SSE/NDJSON rendering of the response. A live, copyable curl box that mirrors exactly what Run sends. Run + Refresh are visibly gated until an API key is in the field; the Base URL is force-pinned to location.origin three times to defeat browser autofill.
- docs/ — API.md (full endpoint reference with curl, streaming formats, error model, SPEC §6.5 response headers), ARCHITECTURE.md (incl. §4.6 discovery + the request lifecycle), DEPLOYMENT.md (Ollama-never-exposed rule, pointing at a real Ollama backend, env reference), THREAT_MODEL.md (SPEC §3 table + the allow_all_models opt-in notes), OPERATIONS.md (key/budget/model/usage runbook + fail-closed table), PLAYGROUND.md. mkdocs.yml (Material theme) wires them together.
147 lines
6.5 KiB
YAML
147 lines
6.5 KiB
YAML
# neuronetz-gateway — DEMO stack (postgres + redis + mock-ollama + gateway).
#
# This is the one-command presentation stack. It runs the real gateway image
# (built from the repo Dockerfile) against a MOCK Ollama backend, so the whole
# thing comes up with NO GPU and NO model downloads.
#
#   ./demo.sh          # bring it up, create a demo tenant+key, print curls
#   ./demo.sh --down   # tear it all down
#
# Differs from the production stack (docker-compose.yml):
#   * NO caddy — the gateway is published directly on 127.0.0.1:8080.
#   * mock-ollama instead of the real ollama image.
#   * playground enabled — the gateway serves /playground from a mounted file.
#
# ┌─────────────────────────────────────────────────────────────────────────┐
# │ SECURITY POSTURE (mirrors prod):                                        │
# │ `mock-ollama` has NO `ports:` mapping. The model backend is reachable   │
# │ only on the internal Docker network as `mock-ollama:11434`, exactly     │
# │ like real Ollama in production. Only the gateway is published, and only │
# │ on the loopback interface (127.0.0.1:8080).                             │
# └─────────────────────────────────────────────────────────────────────────┘
|
|
|
|
services:
  # ─────────────────────────────────────────────────────────────────────────
  # gateway — the only service with a `ports:` mapping. Built from the repo
  # Dockerfile and started only after postgres, redis and mock-ollama report
  # healthy.
  # ─────────────────────────────────────────────────────────────────────────
  gateway:
    build:
      context: .
      dockerfile: Dockerfile
    restart: unless-stopped
    networks:
      - internal
    ports:
      # Published on the loopback interface only (host 127.0.0.1:8080 ->
      # container 8080).
      - "127.0.0.1:8080:8080"
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      mock-ollama:
        condition: service_healthy
    volumes:
      # Read-only bind mount of ./playground; the gateway reads the playground
      # page from here at request time and can never modify it.
      - ./playground:/app/playground:ro
    # Bring the schema to the latest Alembic revision, then `exec` the gateway
    # so it replaces the shell (mirrors docker-compose.dev.yml).
    command:
      - "sh"
      - "-c"
      - "alembic upgrade head && exec python -m neuronetz_gateway"
    healthcheck:
      # Probes the gateway's own /healthz endpoint from inside the container.
      test:
        - "CMD"
        - "curl"
        - "-fsS"
        - "http://127.0.0.1:8080/healthz"
      interval: 10s
      timeout: 3s
      retries: 5
      start_period: 30s
    environment:
      # --- HTTP listener / logging -----------------------------------------
      GATEWAY_BIND_HOST: "0.0.0.0"
      GATEWAY_BIND_PORT: "8080"
      GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
      GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console}
      GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
      GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1}

      # --- Playground --------------------------------------------------------
      # Flag-gated interactive playground (OFF by default in prod), served
      # from the file mounted under /app/playground. See playground/index.html.
      PLAYGROUND_ENABLED: "true"
      PLAYGROUND_FILE: /app/playground/index.html

      # --- Model backend -----------------------------------------------------
      # The mock Ollama service, addressed by service name on the internal
      # network.
      OLLAMA_BASE_URL: http://mock-ollama:11434
      OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
      OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
      OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
      # Short discovery refresh so the demo feels live.
      MODEL_DISCOVERY_REFRESH_S: ${MODEL_DISCOVERY_REFRESH_S:-15}
      MODEL_DISCOVERY_CACHE_TTL_S: ${MODEL_DISCOVERY_CACHE_TTL_S:-60}

      # --- Postgres ----------------------------------------------------------
      # Credentials and database name use the same variables and defaults as
      # the postgres service in this file.
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz}
      DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
      DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}

      # --- Redis -------------------------------------------------------------
      REDIS_URL: redis://redis:6379/0
      REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}

      # --- Default limits ----------------------------------------------------
      DEFAULT_RPM: ${DEFAULT_RPM:-60}
      DEFAULT_TPM: ${DEFAULT_TPM:-100000}
      DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
      MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
      MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}

      # --- Key hashing / auth-failure throttling -----------------------------
      ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
      ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
      ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
      AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}

      # --- Audit / prompt-log retention --------------------------------------
      AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
      PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
      AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}

  # ─────────────────────────────────────────────────────────────────────────
  # mock-ollama — INTERNAL NETWORK ONLY. Stand-in for the real Ollama backend,
  # built from ./demo/mock-ollama.
  # Deliberately has NO `ports:` mapping, mirroring the production rule that
  # Ollama is never exposed; the gateway reaches it as
  # `http://mock-ollama:11434`.
  # ─────────────────────────────────────────────────────────────────────────
  mock-ollama:
    build:
      context: ./demo/mock-ollama
      dockerfile: Dockerfile
    restart: unless-stopped
    networks:
      - internal
    # !!! Do NOT add `ports:` here — the model backend is never published. !!!
    healthcheck:
      # Probes the mock's /api/version endpoint from inside the container.
      test:
        - "CMD"
        - "curl"
        - "-fsS"
        - "http://127.0.0.1:11434/api/version"
      interval: 10s
      timeout: 3s
      retries: 5
      start_period: 5s

  # Postgres — internal-only (no `ports:`); data lives in a named volume.
  postgres:
    image: postgres:16-alpine
    restart: unless-stopped
    networks:
      - internal
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-gateway}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway}
      POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
    volumes:
      - postgres_demo_data:/var/lib/postgresql/data
    healthcheck:
      # Runs pg_isready against the configured user and database.
      test:
        - "CMD-SHELL"
        - "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"
      interval: 5s
      timeout: 3s
      retries: 10

  # Redis — internal-only (no `ports:`).
  redis:
    image: redis:7-alpine
    restart: unless-stopped
    networks:
      - internal
    # Started with an empty `--save` schedule and `--appendonly no`.
    command:
      - "redis-server"
      - "--save"
      - ""
      - "--appendonly"
      - "no"
    healthcheck:
      test:
        - "CMD"
        - "redis-cli"
        - "ping"
      interval: 5s
      timeout: 3s
      retries: 10
|
|
|
|
networks:
  # Single bridge network joined by every service in this file. Containers
  # address each other by service name (mock-ollama:11434, postgres:5432,
  # redis:6379); only a service with its own `ports:` mapping is published
  # on the host.
  internal:
    driver: bridge
|
|
|
|
volumes:
  # Named volume mounted by the postgres service at /var/lib/postgresql/data.
  # Declared with no options (explicit empty mapping).
  postgres_demo_data: {}
|