scaffold: project skeleton, schema, healthz/readyz, CI

Initial project structure for neuronetz-gateway per scope-docs/SPEC.md:

- Python 3.12 / FastAPI / SQLAlchemy 2.0 (async) / Redis / Postgres stack
  managed by uv. Multi-stage non-root Dockerfile, prod + dev compose files
  (ollama service is NEVER published in either), Caddyfile + systemd unit,
  justfile, GitHub Actions CI (ruff, mypy --strict, pytest, bandit, pip-audit).
- Pydantic-Settings config covering every env var from SPEC §7, including the
  MODEL_DISCOVERY_* keys for the dynamic-discovery feature (§4.6).
- Alembic 0001_initial creates the full gateway schema (8 tables, 3 enums,
  notify_key_revoked() trigger), incl. allow_all_models on tenant_limits and
  key_limits for the per-tenant auto-grant toggle.
- Working /healthz, /readyz (fail-closed when deps unreachable), and a
  Prometheus /metrics stub. Sanitizing error handlers that attach X-Request-ID
  to every response and never leak upstream internals.
- SPEC + AGENT_PROMPT included under scope-docs/ (source of truth).
This commit is contained in:
Stephan Berbig
2026-05-26 20:50:35 +02:00
commit d79f17b3bb
32 changed files with 3610 additions and 0 deletions

View File

@@ -0,0 +1,131 @@
"""Application lifespan: connect/dispose backends and run background tasks.
Startup connects Postgres + Redis + the upstream httpx client, builds the
argon2 hasher and the buffered audit writer, and launches the background tasks:
the model-discovery poller (SPEC §4.6) and the Postgres revocation NOTIFY
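listener (SPEC §4.5). Failures while initialising the Postgres engine or the
Redis client are logged and tolerated (the affected handle is stored as
``None``) so ``/healthz`` always serves; ``/readyz`` reports true readiness.
All handles live on ``app.state``.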
"""
from __future__ import annotations
import asyncio
import contextlib
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import TYPE_CHECKING
import httpx
import redis.asyncio as redis
from neuronetz_gateway.audit.writer import AuditWriter
from neuronetz_gateway.auth.hashing import build_hasher
from neuronetz_gateway.config import Settings, get_settings
from neuronetz_gateway.db.session import create_engine, create_session_factory
from neuronetz_gateway.observability.logging import get_logger
from neuronetz_gateway.proxy.discovery import DiscoveryCache, discovery_loop
from neuronetz_gateway.revocation import revocation_listener
if TYPE_CHECKING:
from fastapi import FastAPI
# Module-level structured logger for startup/shutdown events, named "lifespan".
_log = get_logger("lifespan")
def _build_http_client(settings: Settings) -> httpx.AsyncClient:
    """Create the shared async HTTP client that talks to Ollama.

    The client is rooted at ``settings.ollama_base_url``. The connect timeout
    is also used as the pool-acquire timeout, and the read timeout is also
    used as the write timeout. The connection pool is capped at
    ``settings.ollama_max_connections``.
    """
    connect_s = settings.ollama_connect_timeout_s
    read_s = settings.ollama_read_timeout_s
    return httpx.AsyncClient(
        base_url=settings.ollama_base_url,
        timeout=httpx.Timeout(
            connect=connect_s,
            read=read_s,
            write=read_s,
            pool=connect_s,
        ),
        limits=httpx.Limits(max_connections=settings.ollama_max_connections),
    )
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Bring backends and background tasks up, yield, then tear them down.

    Startup stores the settings, password hasher and discovery cache on
    ``app.state``, then tries to create the DB engine/session factory and the
    Redis client. If either of those raises, the error is logged and the
    corresponding ``app.state`` attributes are set to ``None`` instead of
    aborting startup. The HTTP client and audit writer are always created.

    The discovery loop is always scheduled; the revocation listener is only
    scheduled when both Redis and the session factory are available.
    ``_shutdown`` runs when the context exits, whether normally or with an
    exception.
    """
    cfg: Settings = get_settings()
    state = app.state
    state.settings = cfg
    state.hasher = build_hasher(cfg)
    state.discovery_cache = DiscoveryCache()

    # --- Postgres -----------------------------------------------------------
    try:
        db_engine = create_engine(cfg)
        state.db_engine = db_engine
        state.db_sessionmaker = create_session_factory(db_engine)
    except Exception as exc:  # noqa: BLE001 - tolerate so /healthz still serves
        _log.error("db_engine_init_failed", error=str(exc))
        state.db_engine = state.db_sessionmaker = None

    # --- Redis --------------------------------------------------------------
    try:
        state.redis = redis.from_url(cfg.redis_url, decode_responses=True)
    except Exception as exc:  # noqa: BLE001 - tolerate so /healthz still serves
        _log.error("redis_init_failed", error=str(exc))
        state.redis = None

    # --- Upstream HTTP client and audit writer ------------------------------
    state.http_client = _build_http_client(cfg)

    writer = AuditWriter(cfg.audit_buffer_size, state.db_sessionmaker)
    writer.start()
    state.audit_writer = writer

    # --- Background tasks (cancelled on shutdown) ---------------------------
    discovery_coro = discovery_loop(
        state.http_client, state.redis, state.discovery_cache, cfg
    )
    background: list[asyncio.Task[None]] = [asyncio.create_task(discovery_coro)]

    # The revocation listener needs both Redis and a DB session factory.
    if not (state.redis is None or state.db_sessionmaker is None):
        listener_coro = revocation_listener(cfg, state.redis, state.db_sessionmaker)
        background.append(asyncio.create_task(listener_coro))

    state.background_tasks = background
    _log.info("gateway_startup_complete")

    try:
        yield
    finally:
        await _shutdown(app, background, writer)
async def _shutdown(
    app: FastAPI, tasks: list[asyncio.Task[None]], audit_writer: AuditWriter
) -> None:
    """Cancel background tasks and dispose of all backend handles.

    Teardown is best-effort: each step is isolated so that a failure in one
    step never prevents the remaining handles from being released.

    Args:
        app: Application whose ``state`` may hold ``http_client``, ``redis``
            and ``db_engine``; attributes that are missing or ``None`` are
            skipped.
        tasks: Background tasks to cancel and await.
        audit_writer: Audit writer to stop after the tasks have finished.
    """
    # Request cancellation of every task first so they can wind down together.
    for task in tasks:
        task.cancel()

    for task in tasks:
        try:
            await task
        except asyncio.CancelledError:
            # Expected outcome of the cancel() above.
            pass
        except Exception as exc:  # noqa: BLE001 - keep tearing down the rest
            # A task that had already crashed re-raises its error when
            # awaited. Log it instead of letting it abort the shutdown and
            # leak the handles released below.
            _log.error("background_task_failed", task=task.get_name(), error=str(exc))

    with contextlib.suppress(Exception):
        await audit_writer.stop()

    http_client: httpx.AsyncClient | None = getattr(app.state, "http_client", None)
    if http_client is not None:
        with contextlib.suppress(Exception):
            await http_client.aclose()

    redis_client = getattr(app.state, "redis", None)
    if redis_client is not None:
        with contextlib.suppress(Exception):
            await redis_client.aclose()

    engine = getattr(app.state, "db_engine", None)
    if engine is not None:
        with contextlib.suppress(Exception):
            await engine.dispose()

    _log.info("gateway_shutdown_complete")
# Public API: only the ``lifespan`` context manager is exported.
__all__ = ["lifespan"]