The gateway can now aggregate models across SEVERAL Ollama backends and route each request to the correct one. Opt-in via OLLAMA_BACKENDS in .env — single-backend deployments are unaffected (effective_backends() synthesizes a single "default" backend from the legacy OLLAMA_BASE_URL / OLLAMA_AUTH_TOKEN fields when the list is empty). Behavior: - Discovery polls EVERY configured backend in parallel each tick; the cache stores per-backend model lists plus a model → backends priority list (config order = priority order). - /api/tags and /v1/models surface the DEDUPLICATED UNION of all backends' models. - A request's model is looked up in the priority list and proxied to the FIRST backend that hosts it. If that backend errors on the request, the pipeline transparently fails over to the next backend that has the same model (the streaming-failover probes the first chunk before releasing the response, so we never serve partial bytes from a dead backend). - No existence disclosure: a model not hosted by any backend yields the same generic 403 as "model not allowed" (SPEC §13.6 preserved). Components: - config.py: new BackendSpec model + ollama_backends list field + an effective_backends() helper. - proxy/router.py (new): BackendRouter (clients_for_with_failover), build_http_clients() builds one httpx client per backend with its own auth headers, build_backend_headers() exposes the per-backend header composition for the CLI probe. - proxy/discovery.py: DiscoveryCache.set_per_backend() + backends_for(), refresh_all_backends() polls all in parallel, discovery_loop_multi() replaces the single-backend loop in production; the legacy single-backend functions are kept for the dependency-override tests. - proxy/pipeline.py: Pipeline accepts an optional router; the four proxy methods now retry against each candidate backend in priority order on transport error. - lifespan.py: constructs the per-backend client dict, stores the router on app.state, launches discovery_loop_multi. 
- deps.py: get_backend_router provider + BackendRouterDep type alias; get_pipeline passes the router into Pipeline. - cli/manage.py: probe-ollama iterates every backend and reports per-backend status; list-models groups its output by backend and prints the union count + Redis cache size for sanity. - .env.example + docker-compose.yml: document and pass through OLLAMA_BACKENDS with a real example. Verified: ruff check (clean), mypy --strict src/ + tests/ (clean, 66 source files), pytest (60 passed + 39 skipped — same baseline as before this change; integration tests are Docker-socket-gated).
207 lines
8.0 KiB
Python
207 lines
8.0 KiB
Python
"""FastAPI dependency-injection providers.

Exposes typed accessors for the handles placed on ``app.state`` by the lifespan
(Redis, the upstream httpx client, the DB session factory, the discovery cache)
plus the request principal and the proxy client.

QA override contract
--------------------
Routes obtain the upstream proxy via :func:`get_ollama_client`. Tests override
the *Ollama backend* by overriding this provider::

    from neuronetz_gateway.deps import get_ollama_client
    from neuronetz_gateway.proxy.ollama import OllamaClient
    import httpx
    from tests.integration.mock_ollama import create_mock_ollama

    transport = httpx.ASGITransport(app=create_mock_ollama())
    mock_http = httpx.AsyncClient(transport=transport, base_url="http://ollama")
    app.dependency_overrides[get_ollama_client] = lambda: OllamaClient(mock_http)

Because ``get_ollama_client`` returns a fully-built :class:`OllamaClient`, an
override needs no access to ``app.state`` and can point at the in-process mock.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections.abc import AsyncIterator
|
|
from typing import Annotated
|
|
|
|
import httpx
|
|
import redis.asyncio as redis
|
|
from fastapi import Depends, Request
|
|
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
|
|
|
|
from neuronetz_gateway.audit.writer import AuditWriter
|
|
from neuronetz_gateway.auth.principal import Principal
|
|
from neuronetz_gateway.budget.counter import BudgetCounter
|
|
from neuronetz_gateway.config import Settings, get_settings
|
|
from neuronetz_gateway.errors import AuthenticationError, DependencyUnavailableError
|
|
from neuronetz_gateway.proxy.discovery import DiscoveryCache
|
|
from neuronetz_gateway.proxy.ollama import OllamaClient
|
|
from neuronetz_gateway.proxy.pipeline import Pipeline
|
|
from neuronetz_gateway.proxy.router import BackendRouter
|
|
from neuronetz_gateway.ratelimit.concurrency import ConcurrencyLimiter
|
|
from neuronetz_gateway.ratelimit.sliding_window import SlidingWindowLimiter
|
|
|
|
|
|
def get_config() -> Settings:
    """Return the cached application settings.

    Thin provider around :func:`get_settings` so routes can declare the
    settings as a FastAPI dependency.
    """
    settings = get_settings()
    return settings
|
|
|
|
|
|
def get_redis(request: Request) -> redis.Redis:
    """Return the shared Redis client stored on ``app.state.redis``.

    Raises:
        DependencyUnavailableError: when the attribute is missing or ``None``,
            so the request fails closed instead of proceeding without Redis.
    """
    handle: redis.Redis | None = getattr(request.app.state, "redis", None)
    if handle is not None:
        return handle
    raise DependencyUnavailableError(internal_detail="redis client not initialised")
|
|
|
|
|
|
def get_http_client(request: Request) -> httpx.AsyncClient:
    """Return the shared upstream httpx client stored on ``app.state.http_client``.

    Raises:
        DependencyUnavailableError: when the attribute is missing or ``None``.
    """
    upstream: httpx.AsyncClient | None = getattr(request.app.state, "http_client", None)
    if upstream is not None:
        return upstream
    raise DependencyUnavailableError(internal_detail="http client not initialised")
|
|
|
|
|
|
def get_ollama_client(request: Request) -> OllamaClient:
    """Build the upstream Ollama proxy client (the override target for tests).

    The client wraps the shared httpx client returned by
    :func:`get_http_client`, so it fails closed the same way when that handle
    is absent.

    In multi-backend mode this returns the FIRST backend's client (priority
    order = list order). The pipeline uses :func:`get_backend_router` for
    per-model routing + failover; this provider is kept for tests and for code
    paths that don't need routing.
    """
    upstream_http = get_http_client(request)
    return OllamaClient(upstream_http)
|
|
|
|
|
|
def get_backend_router(request: Request) -> BackendRouter:
    """Return the multi-backend router stored on ``app.state.backend_router``.

    Raises:
        DependencyUnavailableError: when the attribute is missing or ``None``.
    """
    backend_router: BackendRouter | None = getattr(
        request.app.state, "backend_router", None
    )
    if backend_router is not None:
        return backend_router
    raise DependencyUnavailableError(internal_detail="backend router not initialised")
|
|
|
|
|
|
def get_discovery_cache(request: Request) -> DiscoveryCache:
    """Return the in-process discovery cache stored on ``app.state``.

    Raises:
        DependencyUnavailableError: when ``app.state.discovery_cache`` is
            missing or ``None`` (fail closed).
    """
    discovery: DiscoveryCache | None = getattr(
        request.app.state, "discovery_cache", None
    )
    if discovery is not None:
        return discovery
    raise DependencyUnavailableError(internal_detail="discovery cache not initialised")
|
|
|
|
|
|
def get_principal(request: Request) -> Principal:
    """Return the authenticated principal found on ``request.state``.

    The auth middleware is expected to attach it before routing. If it is
    missing on a non-exempt route that is a programming error, so the request
    fails closed with an authentication error (401).

    Raises:
        AuthenticationError: when ``request.state.principal`` is missing or
            ``None``.
    """
    who: Principal | None = getattr(request.state, "principal", None)
    if who is not None:
        return who
    raise AuthenticationError(internal_detail="principal missing on authenticated route")
|
|
|
|
|
|
def get_audit_writer(request: Request) -> AuditWriter:
    """Return the shared buffered audit writer stored on ``app.state``.

    Raises:
        DependencyUnavailableError: when ``app.state.audit_writer`` is missing
            or ``None`` (fail closed).
    """
    audit_writer: AuditWriter | None = getattr(request.app.state, "audit_writer", None)
    if audit_writer is not None:
        return audit_writer
    raise DependencyUnavailableError(internal_detail="audit writer not initialised")
|
|
|
|
|
|
def get_pipeline(
    request: Request,
    principal: Annotated[Principal, Depends(get_principal)],
    settings: Annotated[Settings, Depends(get_config)],
    ollama: Annotated[OllamaClient, Depends(get_ollama_client)],
    discovery: Annotated[DiscoveryCache, Depends(get_discovery_cache)],
    redis_client: Annotated[redis.Redis, Depends(get_redis)],
    audit: Annotated[AuditWriter, Depends(get_audit_writer)],
) -> Pipeline:
    """Assemble the per-request enforcement + proxy pipeline.

    The pipeline owns all hot-path checks (rate limit, budget, concurrency,
    model/endpoint allowlist) and the streaming-with-bookkeeping contract.
    Audit deny-mode flips this to fail closed at the route layer.

    In multi-backend deployments the per-request backend selection is done by
    the pipeline using the :class:`BackendRouter` on ``app.state``; the
    ``ollama`` argument here is the fallback single-backend client (used when
    the router has no entry for a model, and as the override target for tests
    that don't care about routing).

    Both the DB session factory and the router are optional here: a missing
    ``app.state`` attribute is passed through to :class:`Pipeline` as ``None``
    rather than raising.
    """
    app_state = request.app.state
    session_factory: async_sessionmaker[AsyncSession] | None = getattr(
        app_state, "db_sessionmaker", None
    )
    backend_router: BackendRouter | None = getattr(app_state, "backend_router", None)

    # The three Redis-backed guards share the one client injected above.
    sliding_window = SlidingWindowLimiter(redis_client)
    in_flight = ConcurrencyLimiter(redis_client)
    spend = BudgetCounter(redis_client)

    return Pipeline(
        request=request,
        principal=principal,
        settings=settings,
        ollama=ollama,
        discovery=discovery,
        rate_limiter=sliding_window,
        concurrency=in_flight,
        budget=spend,
        audit=audit,
        sessionmaker=session_factory,
        router=backend_router,
    )
|
|
|
|
|
|
def _get_sessionmaker(request: Request) -> async_sessionmaker[AsyncSession]:
    """Return the DB session factory stored on ``app.state.db_sessionmaker``.

    Raises:
        DependencyUnavailableError: when the attribute is missing or ``None``
            (fail closed).
    """
    session_factory: async_sessionmaker[AsyncSession] | None = getattr(
        request.app.state, "db_sessionmaker", None
    )
    if session_factory is not None:
        return session_factory
    raise DependencyUnavailableError(internal_detail="db session factory not initialised")
|
|
|
|
|
|
async def get_db_session(request: Request) -> AsyncIterator[AsyncSession]:
    """Yield a request-scoped async DB session.

    The session is opened from the factory returned by
    :func:`_get_sessionmaker` and is closed by the ``async with`` block once
    the consumer is done with it.
    """
    session_factory = _get_sessionmaker(request)
    async with session_factory() as db_session:
        yield db_session
|
|
|
|
|
|
# Ready-made ``Annotated`` aliases so route signatures can declare a dependency
# by type alone (e.g. ``redis: RedisDep``).

# Settings and the handles read from ``app.state``.
ConfigDep = Annotated[Settings, Depends(get_config)]
RedisDep = Annotated[redis.Redis, Depends(get_redis)]
HttpClientDep = Annotated[httpx.AsyncClient, Depends(get_http_client)]

# Upstream proxy plumbing.
OllamaClientDep = Annotated[OllamaClient, Depends(get_ollama_client)]
BackendRouterDep = Annotated[BackendRouter, Depends(get_backend_router)]
DiscoveryCacheDep = Annotated[DiscoveryCache, Depends(get_discovery_cache)]

# Request principal, audit writer, pipeline and DB session.
PrincipalDep = Annotated[Principal, Depends(get_principal)]
AuditWriterDep = Annotated[AuditWriter, Depends(get_audit_writer)]
PipelineDep = Annotated[Pipeline, Depends(get_pipeline)]
DbSessionDep = Annotated[AsyncSession, Depends(get_db_session)]
|
|
|
|
|
|
# Public API of this module; entries and their order are unchanged.
__all__ = [
    # ``Annotated`` dependency aliases.
    "AuditWriterDep",
    "BackendRouterDep",
    "ConfigDep",
    "DbSessionDep",
    "DiscoveryCacheDep",
    "HttpClientDep",
    "OllamaClientDep",
    "PipelineDep",
    "PrincipalDep",
    "RedisDep",
    # Provider callables (the targets for ``app.dependency_overrides``).
    "get_audit_writer",
    "get_backend_router",
    "get_config",
    "get_db_session",
    "get_discovery_cache",
    "get_http_client",
    "get_ollama_client",
    "get_pipeline",
    "get_principal",
    "get_redis",
]
|