deploy: upstream Ollama auth token + adoptable data volumes
Two production-hardening changes triggered by real issues found on the
first prod attempt against neuronetz-ai-01.
1. Upstream auth (the production Ollama is fronted by an auth proxy):
- New config: OLLAMA_AUTH_TOKEN (pydantic SecretStr — never appears in
repr/logs/errors), plus OLLAMA_AUTH_HEADER (default "Authorization")
and OLLAMA_AUTH_SCHEME (default "Bearer") for stacks that expect a
non-standard header like X-API-Key.
- lifespan._build_upstream_headers() injects the configured header into
the single shared httpx client used by both the proxy hot path AND
the discovery poller, so /api/tags + /api/chat both authenticate
against the upstream automatically.
- New CLI: `neuronetz-gateway probe-ollama` — uses the same client
config to GET /api/version and /api/tags, reports success/transport-
error/HTTP-status, lists the first few discovered models, exits 1 on
any failure. The token itself is never printed (only whether one
was attached). Lets ops verify upstream reachability before letting
real traffic through.
- docker-compose.yml passes OLLAMA_AUTH_TOKEN/HEADER/SCHEME through;
.env.example documents them with a leave-blank-for-internal-Ollama
default.
2. Volume adoption (don't lose existing model data on re-deploy):
- docker-compose.yml now pins absolute Docker volume NAMES for both
postgres_data and ollama_data, configurable via POSTGRES_DATA_VOLUME
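and OLLAMA_DATA_VOLUME. Defaults (neuro-gateway_postgres_data and
neuro-gateway_ollama_data) match the names compose generated for a project
called `neuro-gateway`, so deployments under that project name aren't
disturbed; stacks deployed under any other project name must set the
overrides to keep their data.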
- This addresses the scenario where deploying this compose under a new
project directory created fresh, empty volumes alongside an existing
`neuro-ollama_ollama-data` volume containing pre-pulled models (incl.
deepseek-r1:14b, qwen2.5:14b, gemma3:12b, ...). Setting
OLLAMA_DATA_VOLUME=neuro-ollama_ollama-data in .env tells the new
stack to mount the existing volume in place — no copy, no downtime.
- .env.example documents the override with the exact host's volume name
as an example.
Both changes are ruff + mypy --strict clean.
This commit is contained in:
22
.env.example
22
.env.example
@@ -18,11 +18,33 @@ GATEWAY_TRUSTED_PROXIES=127.0.0.1,nginx-proxy # for X-Forwarded-For
|
|||||||
GATEWAY_VIRTUAL_HOST=api.neuronetz.ai
|
GATEWAY_VIRTUAL_HOST=api.neuronetz.ai
|
||||||
LETSENCRYPT_EMAIL=admin@neuronetz.ai
|
LETSENCRYPT_EMAIL=admin@neuronetz.ai
|
||||||
|
|
||||||
|
# ──────────────────────── Volume adoption ────────────────────────
|
||||||
|
# Override the Docker volume names if an EXISTING volume on the host holds
|
||||||
|
# data this stack should adopt (e.g. models pulled by a previous Ollama
|
||||||
|
# deployment). Leave unset to use the default per-project names.
|
||||||
|
#
|
||||||
|
# Example (matches the neuronetz-ai-01 host):
|
||||||
|
# OLLAMA_DATA_VOLUME=neuro-ollama_ollama-data
|
||||||
|
# POSTGRES_DATA_VOLUME=neuro-gateway_postgres_data
|
||||||
|
OLLAMA_DATA_VOLUME=
|
||||||
|
POSTGRES_DATA_VOLUME=
|
||||||
|
|
||||||
# ──────────────────────────── Upstream ───────────────────────────
|
# ──────────────────────────── Upstream ───────────────────────────
|
||||||
OLLAMA_BASE_URL=http://ollama:11434
|
OLLAMA_BASE_URL=http://ollama:11434
|
||||||
OLLAMA_CONNECT_TIMEOUT_S=5
|
OLLAMA_CONNECT_TIMEOUT_S=5
|
||||||
OLLAMA_READ_TIMEOUT_S=600
|
OLLAMA_READ_TIMEOUT_S=600
|
||||||
OLLAMA_MAX_CONNECTIONS=64
|
OLLAMA_MAX_CONNECTIONS=64
|
||||||
|
# If you front Ollama with an auth proxy (e.g. an external host like
|
||||||
|
# https://ollama.neuronetz.ai requiring a Bearer token), set the token here.
|
||||||
|
# The value never appears in logs/errors — it's wrapped in pydantic SecretStr.
|
||||||
|
# Leave empty to send no Authorization header (the default for an in-stack
|
||||||
|
# ollama service on the private Docker network).
|
||||||
|
OLLAMA_AUTH_TOKEN=
|
||||||
|
# Override only if your auth proxy expects a non-standard header. For
|
||||||
|
# Authorization the scheme prefix (default: Bearer) is included; for any other
|
||||||
|
# header name the raw token is sent.
|
||||||
|
OLLAMA_AUTH_HEADER=Authorization
|
||||||
|
OLLAMA_AUTH_SCHEME=Bearer
|
||||||
|
|
||||||
# ──────────────────────── Model discovery (§4.6) ─────────────────
|
# ──────────────────────── Model discovery (§4.6) ─────────────────
|
||||||
MODEL_DISCOVERY_REFRESH_S=60
|
MODEL_DISCOVERY_REFRESH_S=60
|
||||||
|
|||||||
@@ -62,10 +62,15 @@ services:
|
|||||||
DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
|
DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
|
||||||
REDIS_URL: redis://redis:6379/0
|
REDIS_URL: redis://redis:6379/0
|
||||||
REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
|
REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
|
||||||
OLLAMA_BASE_URL: http://ollama:11434
|
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://ollama:11434}
|
||||||
OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
|
OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
|
||||||
OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
|
OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
|
||||||
OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
|
OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
|
||||||
|
# Optional Bearer token for an externally-fronted Ollama (default empty:
|
||||||
|
# the in-stack ollama service needs no auth on the private network).
|
||||||
|
OLLAMA_AUTH_TOKEN: ${OLLAMA_AUTH_TOKEN:-}
|
||||||
|
OLLAMA_AUTH_HEADER: ${OLLAMA_AUTH_HEADER:-Authorization}
|
||||||
|
OLLAMA_AUTH_SCHEME: ${OLLAMA_AUTH_SCHEME:-Bearer}
|
||||||
MODEL_DISCOVERY_REFRESH_S: ${MODEL_DISCOVERY_REFRESH_S:-60}
|
MODEL_DISCOVERY_REFRESH_S: ${MODEL_DISCOVERY_REFRESH_S:-60}
|
||||||
MODEL_DISCOVERY_CACHE_TTL_S: ${MODEL_DISCOVERY_CACHE_TTL_S:-120}
|
MODEL_DISCOVERY_CACHE_TTL_S: ${MODEL_DISCOVERY_CACHE_TTL_S:-120}
|
||||||
DEFAULT_RPM: ${DEFAULT_RPM:-60}
|
DEFAULT_RPM: ${DEFAULT_RPM:-60}
|
||||||
@@ -159,5 +164,18 @@ networks:
|
|||||||
driver: bridge
|
driver: bridge
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
|
# Pin absolute volume NAMES so the stack can ADOPT an existing volume that was
|
||||||
|
# created by a previous deployment under a different compose project. Without
|
||||||
|
# an explicit `name:`, compose namespaces volumes by project (directory) name,
|
||||||
|
# so a rename or re-clone silently creates fresh, empty volumes alongside the
|
||||||
|
# old data. We hit that the first time this stack was deployed (the original
|
||||||
|
# models lived in `neuro-ollama_ollama-data` and a fresh
|
||||||
|
# `neuro-gateway_ollama_data` was created next to them, leaving the models orphaned).
|
||||||
|
#
|
||||||
|
# Override via .env if your existing volumes are named differently:
|
||||||
|
# POSTGRES_DATA_VOLUME=neuro-api_postgres-data
|
||||||
|
# OLLAMA_DATA_VOLUME=neuro-ollama_ollama-data
|
||||||
postgres_data:
|
postgres_data:
|
||||||
|
name: ${POSTGRES_DATA_VOLUME:-neuro-gateway_postgres_data}
|
||||||
ollama_data:
|
ollama_data:
|
||||||
|
name: ${OLLAMA_DATA_VOLUME:-neuro-gateway_ollama_data}
|
||||||
|
|||||||
@@ -314,6 +314,90 @@ def list_models(
|
|||||||
_run(work)
|
_run(work)
|
||||||
|
|
||||||
|
|
||||||
|
@app.command("probe-ollama")
def probe_ollama(
    *,
    timeout: Annotated[float, typer.Option(help="Per-request timeout in seconds.")] = 10.0,
) -> None:
    """Probe the upstream Ollama: GET /api/version and /api/tags.

    Builds a throwaway httpx client from the gateway settings (base URL,
    connect timeout, and the headers from ``_build_upstream_headers``, which
    include the OLLAMA_AUTH_TOKEN header if set). The read/write/pool timeouts
    come from ``timeout`` rather than from the gateway's own read timeout, so
    the probe fails fast. The token itself is NEVER printed — only whether one
    was attached.

    A request counts as a failure when it raises an httpx transport error,
    answers with anything other than a 2xx status (httpx does not follow
    redirects by default, so a 3xx from an auth proxy is a failure too), or —
    for /api/tags — returns a body that is not a JSON object with a list of
    models.

    Args:
        timeout: Read, write and pool timeout in seconds for each request.

    Raises:
        typer.Exit: With exit code 1 if any of the probed endpoints failed.
    """
    import httpx

    from neuronetz_gateway.lifespan import _build_upstream_headers

    settings = get_settings()
    headers = _build_upstream_headers(settings)
    auth_header = settings.ollama_auth_header
    has_token = settings.ollama_auth_token is not None and bool(
        settings.ollama_auth_token.get_secret_value().strip()
    )

    auth_status = f"sending {auth_header}" if has_token else "no token (OLLAMA_AUTH_TOKEN unset)"
    typer.echo(f"target: {settings.ollama_base_url}")
    typer.echo(f"auth: {auth_status}")

    async def _go() -> int:
        """Run both probes and return the number of failed endpoints."""
        probe_timeout = httpx.Timeout(
            connect=settings.ollama_connect_timeout_s,
            read=timeout,
            write=timeout,
            pool=timeout,
        )
        async with httpx.AsyncClient(
            base_url=settings.ollama_base_url,
            timeout=probe_timeout,
            headers=headers,
        ) as client:
            errors = 0
            for path in ("/api/version", "/api/tags"):
                try:
                    resp = await client.get(path)
                except httpx.HTTPError as exc:
                    # Only the exception type is shown, never the message.
                    typer.secho(
                        f" GET {path} ✗ transport error: {type(exc).__name__}",
                        fg=typer.colors.RED,
                    )
                    errors += 1
                    continue

                status = resp.status_code
                # Only 2xx is success: a 3xx (e.g. a proxy redirecting to a
                # login page) must not be reported as a working upstream.
                if not 200 <= status < 300:
                    typer.secho(
                        f" GET {path} ✗ HTTP {status}",
                        fg=typer.colors.RED,
                    )
                    if status in (401, 403):
                        typer.echo(
                            " upstream rejected the credentials — check "
                            "OLLAMA_AUTH_TOKEN / header."
                        )
                    elif 300 <= status < 400:
                        typer.echo(
                            " upstream answered with a redirect — check "
                            "OLLAMA_BASE_URL and the auth settings."
                        )
                    errors += 1
                    continue

                if path == "/api/version":
                    typer.secho(f" GET {path} ✓ HTTP {status}", fg=typer.colors.GREEN)
                    continue

                # /api/tags: the payload must be a JSON object; anything else
                # (HTML login page, truncated body, ...) is a failed probe.
                try:
                    body = resp.json()
                except ValueError:
                    body = None
                # A missing or null "models" key is treated as "no models".
                models = (body.get("models") or []) if isinstance(body, dict) else None
                if not isinstance(models, list):
                    typer.secho(
                        f" GET {path} ✗ HTTP {status}, but the body is not a JSON model listing",
                        fg=typer.colors.RED,
                    )
                    errors += 1
                    continue

                n = len(models)
                typer.secho(
                    f" GET {path} ✓ HTTP {status}, {n} model(s) discovered",
                    fg=typer.colors.GREEN,
                )
                for m in models[:5]:
                    label = (m.get("name") or m.get("model")) if isinstance(m, dict) else m
                    typer.echo(f" · {label}")
                if n > 5:
                    typer.echo(f" … and {n - 5} more")
            return errors

    errors = asyncio.run(_go())
    if errors:
        raise typer.Exit(code=1)
    typer.secho("upstream reachable and authenticated.", fg=typer.colors.GREEN, bold=True)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
"""Console-script entry point."""
|
"""Console-script entry point."""
|
||||||
app()
|
app()
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
|
|
||||||
from pydantic import Field
|
from pydantic import Field, SecretStr
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
@@ -35,6 +35,16 @@ class Settings(BaseSettings):
|
|||||||
ollama_connect_timeout_s: int = Field(default=5)
|
ollama_connect_timeout_s: int = Field(default=5)
|
||||||
ollama_read_timeout_s: int = Field(default=600)
|
ollama_read_timeout_s: int = Field(default=600)
|
||||||
ollama_max_connections: int = Field(default=64)
|
ollama_max_connections: int = Field(default=64)
|
||||||
|
# Optional Bearer token sent to the upstream Ollama on EVERY request from the
|
||||||
|
# gateway (proxy hot path + the discovery poller). Use SecretStr so the value
|
||||||
|
# never appears in repr(), logs, or error messages. Empty/unset = no header.
|
||||||
|
ollama_auth_token: SecretStr | None = Field(default=None)
|
||||||
|
# If you front Ollama with an auth proxy that expects a non-standard header
|
||||||
|
# name (e.g. ``X-API-Key`` instead of ``Authorization``), override here.
|
||||||
|
# The scheme prefix (``Bearer ``) is dropped automatically when the header
|
||||||
|
# isn't ``Authorization``.
|
||||||
|
ollama_auth_header: str = Field(default="Authorization")
|
||||||
|
ollama_auth_scheme: str = Field(default="Bearer")
|
||||||
|
|
||||||
# --- Model discovery (SPEC §4.6) ---
|
# --- Model discovery (SPEC §4.6) ---
|
||||||
model_discovery_refresh_s: int = Field(default=60)
|
model_discovery_refresh_s: int = Field(default=60)
|
||||||
|
|||||||
@@ -32,6 +32,26 @@ if TYPE_CHECKING:
|
|||||||
_log = get_logger("lifespan")
|
_log = get_logger("lifespan")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_upstream_headers(settings: Settings) -> dict[str, str]:
    """Compose default headers for the upstream Ollama client.

    Always sets ``User-Agent``. If ``OLLAMA_AUTH_TOKEN`` is set to a non-blank
    value, the configured auth header is attached as well. The scheme prefix
    (``Bearer`` by default) is included only when the header is the standard
    ``Authorization`` (compared case-insensitively); for custom headers like
    ``X-API-Key`` the raw token is sent. The SecretStr is unwrapped only here,
    never logged.

    Args:
        settings: Gateway settings; reads ``ollama_auth_token``,
            ``ollama_auth_header`` and ``ollama_auth_scheme``.

    Returns:
        A new dict of header name to value for the upstream client.
    """
    headers: dict[str, str] = {"User-Agent": "neuronetz-gateway"}

    secret = settings.ollama_auth_token
    if secret is None:
        return headers
    token = secret.get_secret_value().strip()
    if not token:
        return headers

    # A blank header name is not a valid HTTP field name, so fall back to the
    # documented default — the same "empty means default" rule docker-compose
    # applies with ${OLLAMA_AUTH_HEADER:-Authorization}.
    name = settings.ollama_auth_header.strip() or "Authorization"
    if name.lower() == "authorization":
        # Strip the scheme so stray whitespace cannot produce "Bearer  <token>";
        # an empty scheme sends the bare token.
        scheme = settings.ollama_auth_scheme.strip()
        headers[name] = f"{scheme} {token}" if scheme else token
    else:
        headers[name] = token
    return headers
|
||||||
|
|
||||||
|
|
||||||
def _build_http_client(settings: Settings) -> httpx.AsyncClient:
|
def _build_http_client(settings: Settings) -> httpx.AsyncClient:
|
||||||
"""Construct the shared httpx client used to reach Ollama."""
|
"""Construct the shared httpx client used to reach Ollama."""
|
||||||
timeout = httpx.Timeout(
|
timeout = httpx.Timeout(
|
||||||
@@ -41,7 +61,12 @@ def _build_http_client(settings: Settings) -> httpx.AsyncClient:
|
|||||||
pool=settings.ollama_connect_timeout_s,
|
pool=settings.ollama_connect_timeout_s,
|
||||||
)
|
)
|
||||||
limits = httpx.Limits(max_connections=settings.ollama_max_connections)
|
limits = httpx.Limits(max_connections=settings.ollama_max_connections)
|
||||||
return httpx.AsyncClient(base_url=settings.ollama_base_url, timeout=timeout, limits=limits)
|
return httpx.AsyncClient(
|
||||||
|
base_url=settings.ollama_base_url,
|
||||||
|
timeout=timeout,
|
||||||
|
limits=limits,
|
||||||
|
headers=_build_upstream_headers(settings),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
|
|||||||
Reference in New Issue
Block a user