proxy: streaming, discovery, OpenAI-compat, rate-limit, budget, audit

The hot path. A single Pipeline class owns enforcement so the eight non-negotiables can be reviewed in one place. - Native /api/chat, /api/generate (NDJSON streaming + non-stream), /api/tags, /api/show (system-prompt + template stripped), /api/embed(dings), /api/version (returns gateway version, not Ollama's). Endpoint catch-all returns the same generic 403 for hard-blocked and unknown /api/* paths so attackers cannot enumerate which mutating endpoints exist. - OpenAI-compat /v1/chat/completions, /v1/completions, /v1/embeddings, /v1/models with SSE (`data: {...}` + final `data: [DONE]`); preserves streaming end-to-end. - Model discovery (SPEC §4.6): background poller against Ollama /api/tags; Redis + in-process cache (TTL = MODEL_DISCOVERY_CACHE_TTL_S, refresh = MODEL_DISCOVERY_REFRESH_S); fail-closed when the discovered set is empty. - Effective-set resolution in proxy/allowlist.py: allow_all = key.allow_all_models ?? tenant.allow_all_models effective = discovered if allow_all else (key.allowed_models ?? tenant.allowed_models) ∩ discovered A non-effective model returns the same generic 403 whether it's installed- but-unpermitted or doesn't exist at all (no enumeration leak). - Sliding-window rate limit (Redis Lua, single round-trip) for per-key + per-tenant RPM and per-key TPM. Redis-INCR/DECR concurrency semaphore with TTL guard. Token-budget counters per (key, period) with a Postgres ledger for reconciliation across resets. Headers per SPEC §6.5 on every response; 429 carries Retry-After; Redis outage → 503 (fail closed, never 200). - Token counting from the FINAL stream object (NDJSON `done` or the SSE chunk carrying `usage`); the audit row is written AFTER stream close so TTFB is never degraded by bookkeeping. - Audit writer: asyncio.Queue + bounded ring buffer; deny-mode flip on overflow. Optional prompt log per key (TTL'd). - Revocation listener: asyncpg LISTEN on key_revoked → evict the Redis cache entry within ~1s of the console writing to gateway.revocations. - Prometheus counters/histograms labeled by tenant only (per SPEC §13.3).
2026-05-26 20:52:33 +02:00
parent 6431b2f72c
commit 6a92bc8ce9
20 changed files with 2139 additions and 0 deletions
--- a/src/neuronetz_gateway/audit/init.py
+++ b/src/neuronetz_gateway/audit/init.py
@@ -0,0 +1,3 @@
 """Audit logging: buffered async audit writer and opt-in prompt log."""
 from __future__ import annotations
--- a/src/neuronetz_gateway/audit/prompt_log.py
+++ b/src/neuronetz_gateway/audit/prompt_log.py
@@ -0,0 +1,63 @@
 """Opt-in, TTL'd prompt logging (SPEC §2, §5).
 Disabled by default; enabled per key (or inherited from tenant via the resolved
 :class:`Principal.log_prompts`). Each row carries a ``retention_until`` deadline
 (swept in Phase 4). A redaction hook runs before persistence so sensitive spans
 can be scrubbed without changing call sites.
 """
 from __future__ import annotations
 import datetime
 from collections.abc import Callable
 from dataclasses import dataclass
 from uuid import UUID
 from sqlalchemy.ext.asyncio import AsyncSession
 from neuronetz_gateway.db.models import PromptLog
 # A redaction hook maps a request/response body to a sanitized copy. The default
 # is identity; operators can install a stricter hook.
 RedactionHook = Callable[[dict[str, object]], dict[str, object]]
 def _identity(body: dict[str, object]) -> dict[str, object]:
    return body
@dataclass(frozen=True, slots=True)
 class PromptRecord:
    """A captured request/response pair pending TTL'd persistence."""
    audit_id: int
    key_id: UUID
    request_body: dict[str, object]
    response_text: str | None
    retention_days: int
 class PromptLogWriter:
    """Persists opt-in prompt records with a retention deadline."""
    def __init__(self, session: AsyncSession, redact: RedactionHook | None = None) -> None:
        self._session = session
        self._redact = redact or _identity
    async def write(self, record: PromptRecord) -> None:
        """Persist a prompt record to ``gateway.prompt_log``."""
        retention_until = datetime.datetime.now(datetime.UTC) + datetime.timedelta(
            days=record.retention_days
        )
        row = PromptLog(
            audit_id=record.audit_id,
            key_id=record.key_id,
            request_body=self._redact(record.request_body),
            response_text=record.response_text,
            retention_until=retention_until,
        )
        self._session.add(row)
        await self._session.flush()
 __all__ = ["PromptLogWriter", "PromptRecord", "RedactionHook"]
--- a/src/neuronetz_gateway/audit/writer.py
+++ b/src/neuronetz_gateway/audit/writer.py
@@ -0,0 +1,152 @@
 """Buffered async audit-log writer (SPEC §4.4).
 Audit rows are enqueued *after* stream close so the hot path is never delayed
 (non-negotiable #6). A background drain task persists them to Postgres. On a
 Postgres write failure rows are buffered in a bounded in-memory ring (max
 ``AUDIT_BUFFER_SIZE``) and retried; if the ring fills, the writer flips to
 **deny mode** (SPEC §4.4) — :pyattr:`deny_mode` goes True and the request path
 must refuse new work until the backlog drains.
 The writer holds the session factory directly (it runs outside request scope).
 ``enqueue`` never blocks on the DB; it only touches the in-memory queue/ring.
 """
 from __future__ import annotations
 import asyncio
 import collections
 import datetime
 from dataclasses import asdict, dataclass
 from uuid import UUID
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
 from neuronetz_gateway.db.repositories import AuditRepository
 from neuronetz_gateway.observability.logging import get_logger
 _log = get_logger("audit")
@dataclass(frozen=True, slots=True)
 class AuditRecord:
    """A single audit-log row queued for persistence."""
    request_id: UUID
    method: str
    path: str
    status: int
    ts: datetime.datetime
    tenant_id: UUID | None = None
    key_id: UUID | None = None
    key_prefix: str | None = None
    model: str | None = None
    tokens_in: int | None = None
    tokens_out: int | None = None
    latency_ms: int | None = None
    client_ip: str | None = None
    user_agent: str | None = None
    error_code: str | None = None
    def as_columns(self) -> dict[str, object]:
        """Map to ``gateway.audit_log`` column kwargs."""
        return asdict(self)
 class AuditWriter:
    """Buffered, fail-safe writer for ``gateway.audit_log``."""
    def __init__(
        self,
        buffer_size: int,
        sessionmaker: async_sessionmaker[AsyncSession] | None = None,
    ) -> None:
        self._buffer_size = buffer_size
        self._sessionmaker = sessionmaker
        self._ring: collections.deque[AuditRecord] = collections.deque(maxlen=buffer_size)
        self._queue: asyncio.Queue[AuditRecord] = asyncio.Queue()
        self._deny_mode = False
        self._task: asyncio.Task[None] | None = None
    @property
    def deny_mode(self) -> bool:
        """True when the buffer has overflowed and new work must be denied."""
        return self._deny_mode
    def bind(self, sessionmaker: async_sessionmaker[AsyncSession]) -> None:
        """Attach the DB session factory (called from lifespan startup)."""
        self._sessionmaker = sessionmaker
    async def enqueue(self, record: AuditRecord) -> None:
        """Queue an audit record for asynchronous persistence (non-blocking)."""
        await self._queue.put(record)
    def start(self) -> None:
        """Launch the background drain task (idempotent)."""
        if self._task is None or self._task.done():
            self._task = asyncio.create_task(self._drain_loop())
    async def stop(self) -> None:
        """Cancel the drain task and flush remaining rows best-effort."""
        if self._task is not None:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        await self.flush()
    async def _drain_loop(self) -> None:
        """Continuously move queued records into Postgres, retrying the ring."""
        while True:
            record = await self._queue.get()
            await self._persist_with_retry(record)
    async def _persist_with_retry(self, record: AuditRecord) -> None:
        """Persist one record; on failure push to the ring (or enter deny mode)."""
        # Drain any backlog first so ordering is roughly preserved.
        if self._ring:
            await self._try_flush_ring()
        if not await self._write_one(record):
            self._buffer(record)
    def _buffer(self, record: AuditRecord) -> None:
        """Buffer a failed write; flip to deny mode if the ring is full."""
        if len(self._ring) >= self._buffer_size:
            self._deny_mode = True
            _log.error("audit_buffer_overflow_deny_mode", buffered=len(self._ring))
            return
        self._ring.append(record)
    async def _try_flush_ring(self) -> None:
        """Attempt to persist buffered rows; clear deny mode once drained."""
        while self._ring:
            record = self._ring[0]
            if not await self._write_one(record):
                return
            self._ring.popleft()
        self._deny_mode = False
    async def _write_one(self, record: AuditRecord) -> bool:
        """Persist a single record; return False on any failure."""
        if self._sessionmaker is None:
            return False
        try:
            async with self._sessionmaker() as session:
                await AuditRepository(session).insert_audit(**record.as_columns())
                await session.commit()
            return True
        except Exception as exc:  # noqa: BLE001 - any DB error ⇒ buffer + retry
            _log.warning("audit_write_failed", error=str(exc))
            return False
    async def flush(self) -> None:
        """Drain queued + buffered records to Postgres (best-effort)."""
        while not self._queue.empty():
            record = self._queue.get_nowait()
            if not await self._write_one(record):
                self._buffer(record)
        await self._try_flush_ring()
 __all__ = ["AuditRecord", "AuditWriter"]
--- a/src/neuronetz_gateway/budget/init.py
+++ b/src/neuronetz_gateway/budget/init.py
@@ -0,0 +1,3 @@
 """Token budgets: Redis period counters and Postgres ledger reconciliation."""
 from __future__ import annotations
--- a/src/neuronetz_gateway/budget/counter.py
+++ b/src/neuronetz_gateway/budget/counter.py
@@ -0,0 +1,105 @@
 """Redis period counters for token budgets (day / month / total).
 The fast-path budget check reads a Redis counter of tokens already consumed in
 the active period and compares it to the configured limit; Postgres
 (``ledger.py``) is the durable source of truth reconciled on rollover. Counters
 are keyed by ``key_id`` + period + period-start so a new period naturally starts
 at zero, and day/month counters carry a TTL so expired periods self-clean.
 Fail-closed (SPEC §4.4): Redis errors raise
 :class:`DependencyUnavailableError`; the caller must deny (503).
 """
 from __future__ import annotations
 import datetime
 from collections.abc import Awaitable
 from dataclasses import dataclass
 from typing import cast
 import redis.asyncio as redis
 from redis.exceptions import RedisError
 from neuronetz_gateway.db.models import BudgetPeriod
 from neuronetz_gateway.errors import DependencyUnavailableError
 _CONSUMED_PREFIX = "gateway:budget:"
 def period_start(period: BudgetPeriod, now: datetime.datetime | None = None) -> datetime.datetime:
    """Return the UTC start of the current period."""
    moment = now or datetime.datetime.now(datetime.UTC)
    moment = moment.astimezone(datetime.UTC)
    if period is BudgetPeriod.day:
        return moment.replace(hour=0, minute=0, second=0, microsecond=0)
    if period is BudgetPeriod.month:
        return moment.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
    # 'total' has no rollover; anchor at the epoch.
    return datetime.datetime(1970, 1, 1, tzinfo=datetime.UTC)
 def _ttl_for(period: BudgetPeriod) -> int | None:
    """TTL (seconds) for a period counter; None for 'total' (no expiry)."""
    if period is BudgetPeriod.day:
        return 2 * 24 * 3600
    if period is BudgetPeriod.month:
        return 40 * 24 * 3600
    return None
 def _counter_key(key_id: str, period: BudgetPeriod, start: datetime.datetime) -> str:
    return f"{_CONSUMED_PREFIX}{key_id}:{period.value}:{int(start.timestamp())}"
@dataclass(frozen=True, slots=True)
 class BudgetState:
    """Snapshot of remaining budget for a period."""
    period: BudgetPeriod
    limit: int | None
    remaining: int | None
    @property
    def exhausted(self) -> bool:
        """True if a finite limit is configured and nothing remains."""
        return self.limit is not None and (self.remaining or 0) <= 0
 class BudgetCounter:
    """Redis-backed per-period token counter."""
    def __init__(self, client: redis.Redis) -> None:
        self._client = client
    async def check(self, key_id: str, period: BudgetPeriod, limit: int | None) -> BudgetState:
        """Return remaining budget; ``None`` limit means unlimited for the period."""
        if limit is None:
            return BudgetState(period=period, limit=None, remaining=None)
        start = period_start(period)
        try:
            raw = await self._client.get(_counter_key(key_id, period, start))
        except RedisError as exc:
            raise DependencyUnavailableError(
                internal_detail=f"budget redis error: {exc!r}"
            ) from exc
        consumed = int(raw) if raw else 0
        return BudgetState(period=period, limit=limit, remaining=max(limit - consumed, 0))
    async def consume(self, key_id: str, period: BudgetPeriod, tokens: int) -> None:
        """Increment the consumed counter after a request completes."""
        if tokens <= 0:
            return
        start = period_start(period)
        key = _counter_key(key_id, period, start)
        try:
            await cast("Awaitable[int]", self._client.incrby(key, tokens))
            ttl = _ttl_for(period)
            if ttl is not None:
                await self._client.expire(key, ttl)
        except RedisError as exc:
            raise DependencyUnavailableError(
                internal_detail=f"budget consume redis error: {exc!r}"
            ) from exc
 __all__ = ["BudgetCounter", "BudgetState", "period_start"]
--- a/src/neuronetz_gateway/budget/ledger.py
+++ b/src/neuronetz_gateway/budget/ledger.py
@@ -0,0 +1,58 @@
 """Postgres budget ledger reconciliation.
 Persists token usage to ``gateway.budget_usage`` (the durable source of truth)
 via an idempotent upsert keyed by (key_id, period, period_start). The Redis
 counter (``counter.py``) is the fast path; this ledger is what survives a Redis
 flush and what ``show-usage`` reports against.
 """
 from __future__ import annotations
 import uuid
 from sqlalchemy.dialects.postgresql import insert as pg_insert
 from sqlalchemy.ext.asyncio import AsyncSession
 from neuronetz_gateway.budget.counter import period_start
 from neuronetz_gateway.db.models import BudgetPeriod, BudgetUsage
 class BudgetLedger:
    """Source-of-truth budget accounting in Postgres."""
    def __init__(self, session: AsyncSession) -> None:
        self._session = session
    async def record_usage(
        self, key_id: str, period: BudgetPeriod, tokens_in: int, tokens_out: int
    ) -> None:
        """Upsert usage into ``gateway.budget_usage`` for the active period.
        Uses an ``ON CONFLICT`` upsert so concurrent writers accumulate rather
        than clobber. ``requests`` increments by one per recorded request.
        """
        start = period_start(period)
        stmt = pg_insert(BudgetUsage).values(
            key_id=uuid.UUID(key_id) if isinstance(key_id, str) else key_id,
            period=period,
            period_start=start,
            tokens_in=tokens_in,
            tokens_out=tokens_out,
            requests=1,
        )
        stmt = stmt.on_conflict_do_update(
            index_elements=[
                BudgetUsage.key_id,
                BudgetUsage.period,
                BudgetUsage.period_start,
            ],
            set_={
                "tokens_in": BudgetUsage.tokens_in + stmt.excluded.tokens_in,
                "tokens_out": BudgetUsage.tokens_out + stmt.excluded.tokens_out,
                "requests": BudgetUsage.requests + stmt.excluded.requests,
            },
        )
        await self._session.execute(stmt)
 __all__ = ["BudgetLedger"]
--- a/src/neuronetz_gateway/observability/metrics.py
+++ b/src/neuronetz_gateway/observability/metrics.py
@@ -0,0 +1,67 @@
 """Prometheus metrics.
 Phase 1 declares the metric objects and the exposition helper. Instrumentation
 (incrementing counters / observing histograms on the request path) is wired in
 later phases. Per SPEC §13.3 we label by ``tenant`` only, never by ``key_id``.
 """
 from __future__ import annotations
 from prometheus_client import CollectorRegistry, Counter, Histogram, generate_latest
 REGISTRY = CollectorRegistry()
 REQUESTS_TOTAL = Counter(
    "gateway_requests_total",
    "Total proxied requests.",
    labelnames=("tenant", "model", "status"),
    registry=REGISTRY,
 )
 TOKENS_TOTAL = Counter(
    "gateway_tokens_total",
    "Total tokens accounted, by direction (in|out).",
    labelnames=("tenant", "model", "direction"),
    registry=REGISTRY,
 )
 REQUEST_DURATION_SECONDS = Histogram(
    "gateway_request_duration_seconds",
    "Gateway-side request duration in seconds.",
    labelnames=("tenant", "model"),
    registry=REGISTRY,
 )
 def record_request(tenant: str, model: str, status: int, duration_s: float) -> None:
    """Increment the request counter and observe its duration (tenant-labeled)."""
    REQUESTS_TOTAL.labels(tenant=tenant, model=model, status=str(status)).inc()
    REQUEST_DURATION_SECONDS.labels(tenant=tenant, model=model).observe(duration_s)
 def record_tokens(tenant: str, model: str, tokens_in: int, tokens_out: int) -> None:
    """Add input/output token counts to the tokens counter."""
    if tokens_in:
        TOKENS_TOTAL.labels(tenant=tenant, model=model, direction="in").inc(tokens_in)
    if tokens_out:
        TOKENS_TOTAL.labels(tenant=tenant, model=model, direction="out").inc(tokens_out)
 def render_latest() -> bytes:
    """Return the current metrics in Prometheus text exposition format."""
    payload: bytes = generate_latest(REGISTRY)
    return payload
 CONTENT_TYPE_LATEST = "text/plain; version=0.0.4; charset=utf-8"
 __all__ = [
    "CONTENT_TYPE_LATEST",
    "REGISTRY",
    "REQUESTS_TOTAL",
    "REQUEST_DURATION_SECONDS",
    "TOKENS_TOTAL",
    "record_request",
    "record_tokens",
    "render_latest",
 ]
--- a/src/neuronetz_gateway/proxy/init.py
+++ b/src/neuronetz_gateway/proxy/init.py
@@ -0,0 +1,3 @@
 """Proxy layer: Ollama client, schema translation, token counting, allowlists."""
 from __future__ import annotations
--- a/src/neuronetz_gateway/proxy/allowlist.py
+++ b/src/neuronetz_gateway/proxy/allowlist.py
@@ -0,0 +1,76 @@
 """Endpoint and model allowlists (SPEC §6.1, §6.2).
 Mutating Ollama endpoints are hard-blocked (not configurable, not flagged):
 ``/api/pull``, ``/api/push``, ``/api/create``, ``/api/copy``, ``/api/delete``,
 and any ``/api/blobs/*``. ``/api/ps`` is also blocked (leaks loaded models).
 Model allowlist is per-tenant, default-deny. Enforcement logic lands in Phase 2.
 """
 from __future__ import annotations
 # Hard-blocked upstream endpoints — always 403, not configurable (SPEC §6.2).
 HARD_BLOCKED_PATHS: frozenset[str] = frozenset(
    {
        "/api/pull",
        "/api/push",
        "/api/create",
        "/api/copy",
        "/api/delete",
        "/api/ps",
    }
 )
 # Path prefixes that are hard-blocked (e.g. blob upload/download).
 HARD_BLOCKED_PREFIXES: tuple[str, ...] = ("/api/blobs",)
 def is_hard_blocked(path: str) -> bool:
    """Return True if ``path`` is an unconditionally blocked upstream endpoint."""
    if path in HARD_BLOCKED_PATHS:
        return True
    return any(path.startswith(prefix) for prefix in HARD_BLOCKED_PREFIXES)
 def resolve_effective_models(
    *,
    allow_all: bool,
    allowed_models: tuple[str, ...],
    discovered: frozenset[str],
 ) -> frozenset[str]:
    """Resolve the effective model set per SPEC §4.3 step 7 / §4.6.
    ``allow_all`` ⇒ the effective set is the entire live ``discovered`` set;
    otherwise it is the configured ``allowed_models`` intersected with
    ``discovered`` (so stale or typo'd allowlist entries never resolve, and a
    model that is unpermitted vs. not-installed are indistinguishable).
    Fail-closed: if ``discovered`` is empty (discovery unavailable/expired) the
    result is empty regardless of ``allow_all`` — discovery never opens access.
    """
    if not discovered:
        return frozenset()
    if allow_all:
        return discovered
    return frozenset(allowed_models) & discovered
 def is_model_allowed(
    model: str,
    *,
    allow_all: bool,
    allowed_models: tuple[str, ...],
    discovered: frozenset[str],
 ) -> bool:
    """Return True iff ``model`` is in the resolved effective set (default-deny)."""
    return model in resolve_effective_models(
        allow_all=allow_all, allowed_models=allowed_models, discovered=discovered
    )
 __all__ = [
    "HARD_BLOCKED_PATHS",
    "HARD_BLOCKED_PREFIXES",
    "is_hard_blocked",
    "is_model_allowed",
    "resolve_effective_models",
 ]
--- a/src/neuronetz_gateway/proxy/discovery.py
+++ b/src/neuronetz_gateway/proxy/discovery.py
@@ -0,0 +1,207 @@
 """Live model discovery from the Ollama backend (SPEC §4.6).
 A background task polls Ollama ``GET /api/tags`` every
 ``MODEL_DISCOVERY_REFRESH_S`` seconds. The parsed model set (names + sanitized
 metadata) is cached in Redis under ``gateway:models:discovered`` with TTL
 ``MODEL_DISCOVERY_CACHE_TTL_S`` and held in-process for hot reads on the request
 path.
 Fail-closed (SPEC §4.6, §13.5): if Ollama is unreachable, or the cache is empty
 or expired and cannot be refreshed, the discovered set is empty — and an empty
 discovered set means no model resolves, so requests are denied. Discovery never
 opens access on failure. It is read-only and only ever touches the allowlisted
 ``/api/tags`` endpoint; it never triggers a pull.
 """
 from __future__ import annotations
 import asyncio
 import json
 from dataclasses import asdict, dataclass
 import httpx
 import redis.asyncio as redis
 from neuronetz_gateway.config import Settings
 from neuronetz_gateway.observability.logging import get_logger
 _log = get_logger("discovery")
 REDIS_DISCOVERED_KEY = "gateway:models:discovered"
@dataclass(frozen=True, slots=True)
 class DiscoveredModel:
    """Sanitized metadata for a single installed model."""
    name: str
    family: str | None = None
    parameter_size: str | None = None
    quantization: str | None = None
    size_bytes: int | None = None
    modified_at: str | None = None
 def _parse_tags(payload: dict[str, object]) -> list[DiscoveredModel]:
    """Parse an Ollama ``/api/tags`` body into sanitized model records."""
    models: list[DiscoveredModel] = []
    raw_models = payload.get("models")
    if not isinstance(raw_models, list):
        return models
    for entry in raw_models:
        if not isinstance(entry, dict):
            continue
        name = entry.get("name") or entry.get("model")
        if not isinstance(name, str) or not name:
            continue
        raw_details = entry.get("details")
        details: dict[str, object] = raw_details if isinstance(raw_details, dict) else {}
        size = entry.get("size")
        models.append(
            DiscoveredModel(
                name=name,
                family=_opt_str(details.get("family")),
                parameter_size=_opt_str(details.get("parameter_size")),
                quantization=_opt_str(details.get("quantization_level")),
                size_bytes=size if isinstance(size, int) else None,
                modified_at=_opt_str(entry.get("modified_at")),
            )
        )
    return models
 def _opt_str(value: object) -> str | None:
    """Coerce a value to ``str`` only when it already is one."""
    return value if isinstance(value, str) else None
 def names_of(models: list[DiscoveredModel]) -> frozenset[str]:
    """Return just the set of model names from parsed records."""
    return frozenset(m.name for m in models)
 class DiscoveryCache:
    """In-process holder for the latest discovered model set.
    Holds both the structured records (for ``/api/tags`` / ``list-models``) and a
    fast name set (for allowlist resolution on the hot path). Reads never block
    on Redis or Ollama; the poller refreshes this in the background.
    """
    def __init__(self) -> None:
        self._models: list[DiscoveredModel] = []
        self._names: frozenset[str] = frozenset()
        self._lock = asyncio.Lock()
    async def set(self, models: list[DiscoveredModel]) -> None:
        """Replace the in-process snapshot atomically."""
        async with self._lock:
            self._models = list(models)
            self._names = names_of(models)
    @property
    def names(self) -> frozenset[str]:
        """Current discovered model names (possibly empty ⇒ fail-closed)."""
        return self._names
    @property
    def models(self) -> list[DiscoveredModel]:
        """Current discovered model records (copy)."""
        return list(self._models)
 async def write_discovered_to_redis(
    client: redis.Redis, models: list[DiscoveredModel], ttl_s: int
 ) -> None:
    """Cache the discovered set in Redis with a TTL (so staleness expires)."""
    payload = json.dumps([asdict(m) for m in models], separators=(",", ":"))
    await client.set(REDIS_DISCOVERED_KEY, payload, ex=ttl_s)
 async def read_discovered_from_redis(client: redis.Redis) -> frozenset[str]:
    """Read the cached discovered names from Redis; empty set on miss/expiry."""
    raw = await client.get(REDIS_DISCOVERED_KEY)
    if not raw:
        return frozenset()
    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return frozenset()
    if not isinstance(data, list):
        return frozenset()
    return frozenset(
        str(item["name"]) for item in data if isinstance(item, dict) and item.get("name")
    )
 async def fetch_tags(client: httpx.AsyncClient) -> list[DiscoveredModel]:
    """Fetch and parse Ollama ``/api/tags``; raise on transport/HTTP error."""
    resp = await client.get("/api/tags")
    resp.raise_for_status()
    body = resp.json()
    if not isinstance(body, dict):
        return []
    return _parse_tags(body)
 async def refresh_once(
    http_client: httpx.AsyncClient,
    redis_client: redis.Redis | None,
    cache: DiscoveryCache,
    settings: Settings,
 ) -> bool:
    """Run a single discovery refresh. Returns True on success.
    On any failure the in-process and Redis caches are left untouched; they
    expire on their own TTL, which is the fail-closed behavior (stale-expired ⇒
    empty ⇒ deny). We never *clear* eagerly on a transient error, but we also
    never extend the TTL on failure.
    """
    try:
        models = await fetch_tags(http_client)
    except (httpx.HTTPError, ValueError) as exc:
        _log.warning("discovery_refresh_failed", error=str(exc))
        return False
    await cache.set(models)
    if redis_client is not None:
        try:
            await write_discovered_to_redis(
                redis_client, models, settings.model_discovery_cache_ttl_s
            )
        except Exception as exc:  # noqa: BLE001 - Redis write is best-effort cache fill
            _log.warning("discovery_cache_write_failed", error=str(exc))
    _log.info("discovery_refreshed", count=len(models))
    return True
 async def discovery_loop(
    http_client: httpx.AsyncClient,
    redis_client: redis.Redis | None,
    cache: DiscoveryCache,
    settings: Settings,
 ) -> None:
    """Background poller: refresh now, then every ``MODEL_DISCOVERY_REFRESH_S``.
    Designed to be launched via ``asyncio.create_task`` in the lifespan and
    cancelled on shutdown.
    """
    await refresh_once(http_client, redis_client, cache, settings)
    while True:
        try:
            await asyncio.sleep(settings.model_discovery_refresh_s)
        except asyncio.CancelledError:
            raise
        await refresh_once(http_client, redis_client, cache, settings)
 __all__ = [
    "REDIS_DISCOVERED_KEY",
    "DiscoveredModel",
    "DiscoveryCache",
    "discovery_loop",
    "fetch_tags",
    "names_of",
    "read_discovered_from_redis",
    "refresh_once",
    "write_discovered_to_redis",
 ]
--- a/src/neuronetz_gateway/proxy/ollama.py
+++ b/src/neuronetz_gateway/proxy/ollama.py
@@ -0,0 +1,79 @@
 """httpx-based streaming proxy client for Ollama.
 Opens async streams to the upstream and relays bytes without buffering, so the
 gateway does not degrade time-to-first-byte (SPEC §9, non-negotiable #6).
 Transport-level upstream failures are sanitized at this boundary into
 :class:`UpstreamUnavailableError` (never reflected verbatim, non-negotiable #4).
 The client is constructed from the shared ``httpx.AsyncClient`` on
 ``app.state`` and is exposed to routes via the ``get_ollama_client`` dependency
 in ``deps.py`` so tests can override it (the QA override contract).
 """
 from __future__ import annotations
 from collections.abc import AsyncIterator
 import httpx
 from neuronetz_gateway.errors import UpstreamUnavailableError
 class OllamaClient:
    """Thin async wrapper around the shared httpx client for Ollama calls."""
    def __init__(self, client: httpx.AsyncClient) -> None:
        self._client = client
    async def stream(
        self, method: str, path: str, json_body: dict[str, object]
    ) -> AsyncIterator[bytes]:
        """Open a streaming request to Ollama and yield raw response chunks.
        Yields bytes exactly as received so the hot path performs no buffering.
        Transport errors are sanitized; if the upstream returns a non-2xx the
        body is drained (so internals are not surfaced) and a generic 502 is
        raised before any client bytes are emitted.
        """
        request = self._client.build_request(method, path, json=json_body)
        try:
            response = await self._client.send(request, stream=True)
        except httpx.HTTPError as exc:
            raise UpstreamUnavailableError(internal_detail=f"ollama send failed: {exc!r}") from exc
        try:
            if response.status_code >= 400:
                await response.aread()
                raise UpstreamUnavailableError(
                    internal_detail=f"ollama returned {response.status_code} for {path}"
                )
            async for chunk in response.aiter_raw():
                yield chunk
        except httpx.HTTPError as exc:
            raise UpstreamUnavailableError(
                internal_detail=f"ollama stream failed: {exc!r}"
            ) from exc
        finally:
            await response.aclose()
    async def request(
        self, method: str, path: str, json_body: dict[str, object]
    ) -> httpx.Response:
        """Perform a non-streaming request to Ollama.
        Returns the upstream response on success; raises a sanitized 502 on
        transport failure or a non-2xx status (internals never reflected).
        """
        try:
            response = await self._client.request(method, path, json=json_body)
        except httpx.HTTPError as exc:
            raise UpstreamUnavailableError(
                internal_detail=f"ollama request failed: {exc!r}"
            ) from exc
        if response.status_code >= 400:
            raise UpstreamUnavailableError(
                internal_detail=f"ollama returned {response.status_code} for {path}"
            )
        return response
 __all__ = ["OllamaClient"]
--- a/src/neuronetz_gateway/proxy/pipeline.py
+++ b/src/neuronetz_gateway/proxy/pipeline.py
@@ -0,0 +1,467 @@
 """Shared request pipeline: enforcement, streaming, and post-close bookkeeping.
 Both the native (``/api/*``) and OpenAI-compat (``/v1/*``) routes funnel through
 here so the security checks and the streaming-integrity contract are written
 once. The order mirrors SPEC §4.3:
  rate limit (per-key + per-tenant RPM) → budget → concurrency → model allowlist
  → endpoint allowlist → body validation → proxy + stream → post-close audit /
  token-count / budget-consume / metrics / semaphore release.
 Streaming integrity (non-negotiable #6): the bytes flow to the client untouched
 and token counting + audit + budget-consume happen **after** the stream closes,
 never on the hot path.
 Fail-closed (non-negotiable #1): every limiter/budget call raises
 :class:`DependencyUnavailableError` when Redis is down, which the error handler
 renders as 503. The model allowlist is default-deny against the live discovered
 set; a missing/expired discovery set denies everything.
 """
 from __future__ import annotations
 import datetime
 import json
 import time
 import uuid
 from collections.abc import AsyncIterator, Callable
 from dataclasses import dataclass
 from typing import Any
 from fastapi import Request
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
 from starlette.responses import JSONResponse, StreamingResponse
 from neuronetz_gateway.audit.writer import AuditRecord, AuditWriter
 from neuronetz_gateway.auth.principal import Principal
 from neuronetz_gateway.budget.counter import BudgetCounter
 from neuronetz_gateway.budget.ledger import BudgetLedger
 from neuronetz_gateway.config import Settings
 from neuronetz_gateway.db.models import BudgetPeriod
 from neuronetz_gateway.errors import (
    AuthorizationError,
    BudgetExceededError,
    RateLimitError,
    RequestTooLargeError,
 )
 from neuronetz_gateway.observability import metrics
 from neuronetz_gateway.observability.logging import get_logger
 from neuronetz_gateway.proxy.allowlist import is_hard_blocked, is_model_allowed
 from neuronetz_gateway.proxy.discovery import DiscoveryCache
 from neuronetz_gateway.proxy.ollama import OllamaClient
 from neuronetz_gateway.proxy.token_counter import TokenUsage, extract_usage
 from neuronetz_gateway.ratelimit.concurrency import ConcurrencyLimiter
 from neuronetz_gateway.ratelimit.sliding_window import SlidingWindowLimiter
 _log = get_logger("pipeline")
 NDJSON_MEDIA_TYPE = "application/x-ndjson"
 SSE_MEDIA_TYPE = "text/event-stream"
 _CONCURRENCY_PREFIX = "gateway:concurrency:"
@dataclass(slots=True)
 class RateHeaders:
    """The §6.5 rate/budget header values gathered during pre-flight."""
    limit_requests: int
    remaining_requests: int
    limit_tokens: int
    remaining_tokens: int
    budget_period: str
    budget_remaining: str
    def as_dict(self, request_id: str) -> dict[str, str]:
        """Render to the SPEC §6.5 header set."""
        return {
            "X-Request-ID": request_id,
            "X-RateLimit-Limit-Requests": str(self.limit_requests),
            "X-RateLimit-Remaining-Requests": str(self.remaining_requests),
            "X-RateLimit-Limit-Tokens": str(self.limit_tokens),
            "X-RateLimit-Remaining-Tokens": str(self.remaining_tokens),
            "X-Budget-Period": self.budget_period,
            "X-Budget-Tokens-Remaining": self.budget_remaining,
        }
 class Pipeline:
    """Per-request enforcement + proxy orchestrator."""
    def __init__(
        self,
        *,
        request: Request,
        principal: Principal,
        settings: Settings,
        ollama: OllamaClient,
        discovery: DiscoveryCache,
        rate_limiter: SlidingWindowLimiter,
        concurrency: ConcurrencyLimiter,
        budget: BudgetCounter,
        audit: AuditWriter,
        sessionmaker: async_sessionmaker[AsyncSession] | None = None,
    ) -> None:
        self._request = request
        self._p = principal
        self._settings = settings
        self._ollama = ollama
        self._discovery = discovery
        self._rate = rate_limiter
        self._conc = concurrency
        self._budget = budget
        self._audit = audit
        self._sessionmaker = sessionmaker
        self._request_id = str(getattr(request.state, "request_id", uuid.uuid4()))
        self._concurrency_key = f"{_CONCURRENCY_PREFIX}{principal.tenant_id}"
        self._headers = RateHeaders(
            limit_requests=principal.limits.rpm,
            remaining_requests=principal.limits.rpm,
            limit_tokens=principal.limits.tpm,
            remaining_tokens=principal.limits.tpm,
            budget_period="day",
            budget_remaining="unlimited",
        )
    @property
    def settings(self) -> Settings:
        """The active settings (for body-size / num_predict caps in routes)."""
        return self._settings
    # ----- enforcement -------------------------------------------------------
    def check_scope(self, scope: str) -> None:
        """Authorize a coarse scope (e.g. 'chat', 'embeddings')."""
        if scope not in self._p.scopes:
            raise AuthorizationError(internal_detail=f"scope {scope!r} not granted")
    def check_endpoint(self, path: str) -> None:
        """Reject hard-blocked upstream endpoints with a generic 403."""
        if is_hard_blocked(path):
            raise AuthorizationError(internal_detail=f"hard-blocked endpoint {path}")
    def check_model(self, model: str) -> None:
        """Default-deny model check against the live effective set (§4.3 step 7)."""
        if not model or not is_model_allowed(
            model,
            allow_all=self._p.allow_all_models,
            allowed_models=self._p.allowed_models,
            discovered=self._discovery.names,
        ):
            # No existence disclosure (SPEC §13.6): unpermitted and not-installed
            # both yield the same generic 403.
            raise AuthorizationError(internal_detail=f"model {model!r} not in effective set")
    def validate_body(self, body: dict[str, object]) -> None:
        """Enforce the ``num_predict`` cap (body-size cap is enforced earlier)."""
        options = body.get("options")
        if isinstance(options, dict):
            num_predict = options.get("num_predict")
            if isinstance(num_predict, int) and num_predict > self._settings.max_num_predict:
                options["num_predict"] = self._settings.max_num_predict
    async def enforce_limits(self, *, token_estimate: int = 0) -> None:
        """Run RPM (per-key + per-tenant), TPM, budget, and concurrency checks.
        Budget is checked before concurrency so an over-budget request never even
        acquires a permit. Order otherwise follows SPEC §4.3 steps 4-6.
        """
        await self._check_rpm()
        await self._check_tpm(token_estimate)
        await self.check_budgets()
        await self._acquire_concurrency()
    async def _check_rpm(self) -> None:
        key_result = await self._rate.check(
            f"gateway:rpm:key:{self._p.key_id}", self._p.limits.rpm, 60, cost=1
        )
        self._headers.limit_requests = key_result.limit
        self._headers.remaining_requests = key_result.remaining
        if not key_result.allowed:
            raise RateLimitError(retry_after=key_result.retry_after_s)
        tenant_result = await self._rate.check(
            f"gateway:rpm:tenant:{self._p.tenant_id}", self._p.limits.rpm, 60, cost=1
        )
        if not tenant_result.allowed:
            raise RateLimitError(retry_after=tenant_result.retry_after_s)
    async def _check_tpm(self, token_estimate: int) -> None:
        # TPM is charged with a minimum of 1 so a request always counts; the
        # precise token cost is reconciled post-stream via the budget counter.
        cost = max(token_estimate, 1)
        result = await self._rate.check(
            f"gateway:tpm:key:{self._p.key_id}", self._p.limits.tpm, 60, cost=cost
        )
        self._headers.limit_tokens = result.limit
        self._headers.remaining_tokens = result.remaining
        if not result.allowed:
            raise RateLimitError(retry_after=result.retry_after_s)
    def _budget_periods(self) -> list[tuple[BudgetPeriod, int | None]]:
        return [
            (BudgetPeriod.day, self._p.limits.tokens_daily),
            (BudgetPeriod.month, self._p.limits.tokens_monthly),
            (BudgetPeriod.total, self._p.limits.tokens_total),
        ]
    async def check_budgets(self) -> None:
        """Verify no configured budget period is already exhausted."""
        tightest_period = "day"
        tightest_remaining = "unlimited"
        for period, limit in self._budget_periods():
            state = await self._budget.check(str(self._p.key_id), period, limit)
            if state.exhausted:
                raise BudgetExceededError(internal_detail=f"budget {period.value} exhausted")
            if state.remaining is not None:
                tightest_period = period.value
                tightest_remaining = str(state.remaining)
        self._headers.budget_period = tightest_period
        self._headers.budget_remaining = tightest_remaining
    async def _acquire_concurrency(self) -> None:
        ok = await self._conc.acquire(
            self._concurrency_key,
            self._p.limits.concurrent,
            self._settings.ollama_read_timeout_s + 30,
        )
        if not ok:
            raise RateLimitError(
                retry_after=1, internal_detail="concurrency cap reached"
            )
    # ----- proxy + bookkeeping ----------------------------------------------
    def headers(self) -> dict[str, str]:
        """Render the §6.5 response headers."""
        return self._headers.as_dict(self._request_id)
    async def stream_native(
        self, method: str, path: str, body: dict[str, object], model: str
    ) -> StreamingResponse:
        """Proxy a streaming NDJSON request, accounting tokens after close."""
        started = time.monotonic()
        media_type = NDJSON_MEDIA_TYPE
        async def gen() -> AsyncIterator[bytes]:
            last_obj: dict[str, object] = {}
            try:
                async for chunk in self._ollama.stream(method, path, body):
                    last_obj = _merge_last_ndjson(chunk, last_obj)
                    yield chunk
            finally:
                await self._finish(model, path, method, last_obj, started)
        return StreamingResponse(gen(), media_type=media_type, headers=self.headers())
    async def stream_openai(
        self,
        method: str,
        path: str,
        body: dict[str, object],
        model: str,
        chunk_translator: Callable[[dict[str, object]], dict[str, object]],
    ) -> StreamingResponse:
        """Proxy + translate native NDJSON into OpenAI SSE; account after close."""
        started = time.monotonic()
        async def gen() -> AsyncIterator[bytes]:
            last_obj: dict[str, object] = {}
            buffer = b""
            try:
                async for chunk in self._ollama.stream(method, path, body):
                    buffer += chunk
                    lines = buffer.split(b"\n")
                    buffer = lines.pop()
                    for line in lines:
                        if not line.strip():
                            continue
                        obj = json.loads(line)
                        last_obj = obj if isinstance(obj, dict) else last_obj
                        translated = chunk_translator(obj)
                        yield f"data: {json.dumps(translated)}\n\n".encode()
                if buffer.strip():
                    obj = json.loads(buffer)
                    last_obj = obj if isinstance(obj, dict) else last_obj
                    yield f"data: {json.dumps(chunk_translator(obj))}\n\n".encode()
                yield b"data: [DONE]\n\n"
            finally:
                await self._finish(model, path, method, last_obj, started)
        return StreamingResponse(gen(), media_type=SSE_MEDIA_TYPE, headers=self.headers())
    async def request_native(
        self, method: str, path: str, body: dict[str, object], model: str
    ) -> JSONResponse:
        """Proxy a non-streaming request and account tokens before responding."""
        started = time.monotonic()
        resp = await self._ollama.request(method, path, body)
        payload = resp.json()
        obj = payload if isinstance(payload, dict) else {}
        await self._finish(model, path, method, obj, started)
        return JSONResponse(obj, headers=self.headers())
    async def request_translated(
        self,
        method: str,
        path: str,
        body: dict[str, object],
        model: str,
        translator: Callable[[dict[str, object]], dict[str, object]],
    ) -> JSONResponse:
        """Proxy a non-streaming request, translate the body, then account."""
        started = time.monotonic()
        resp = await self._ollama.request(method, path, body)
        payload = resp.json()
        obj = payload if isinstance(payload, dict) else {}
        await self._finish(model, path, method, obj, started)
        return JSONResponse(translator(obj), headers=self.headers())
    async def _finish(
        self,
        model: str,
        path: str,
        method: str,
        final_obj: dict[str, object],
        started: float,
    ) -> None:
        """Post-close bookkeeping: tokens, budget, metrics, audit, semaphore.
        Runs once per request after the response is fully produced. Each step is
        best-effort and guarded so a bookkeeping failure never corrupts the
        already-delivered response. The concurrency permit is always released.
        """
        usage = extract_usage(final_obj) if final_obj else TokenUsage(0, 0)
        latency_ms = int((time.monotonic() - started) * 1000)
        try:
            await self._account_budget(usage)
        except Exception as exc:  # noqa: BLE001 - never break a delivered response
            _log.warning("budget_account_failed", error=str(exc))
        try:
            metrics.record_request(self._p.tenant_name, model or "unknown", 200, latency_ms / 1000)
            metrics.record_tokens(
                self._p.tenant_name, model or "unknown", usage.tokens_in, usage.tokens_out
            )
        except Exception as exc:  # noqa: BLE001 - metrics must not break responses
            _log.warning("metrics_record_failed", error=str(exc))
        await self._write_audit(model, path, method, usage, latency_ms, 200)
        await self._release_concurrency()
    async def _account_budget(self, usage: TokenUsage) -> None:
        """Decrement Redis budget counters and persist to the Postgres ledger."""
        for period, limit in self._budget_periods():
            if limit is not None:
                await self._budget.consume(str(self._p.key_id), period, usage.total)
        if self._sessionmaker is not None and usage.total >= 0:
            try:
                async with self._sessionmaker() as session:
                    ledger = BudgetLedger(session)
                    for period, _limit in self._budget_periods():
                        await ledger.record_usage(
                            str(self._p.key_id), period, usage.tokens_in, usage.tokens_out
                        )
                    await session.commit()
            except Exception as exc:  # noqa: BLE001 - ledger is durable backstop; never break response
                _log.warning("ledger_record_failed", error=str(exc))
    async def _write_audit(
        self,
        model: str,
        path: str,
        method: str,
        usage: TokenUsage,
        latency_ms: int,
        status: int,
    ) -> None:
        record = AuditRecord(
            request_id=uuid.UUID(self._request_id)
            if _is_uuid(self._request_id)
            else uuid.uuid4(),
            method=method,
            path=path,
            status=status,
            ts=datetime.datetime.now(datetime.UTC),
            tenant_id=self._p.tenant_id,
            key_id=self._p.key_id,
            key_prefix=self._p.key_prefix,
            model=model or None,
            tokens_in=usage.tokens_in,
            tokens_out=usage.tokens_out,
            latency_ms=latency_ms,
            client_ip=self._client_ip(),
            user_agent=self._request.headers.get("user-agent"),
        )
        try:
            await self._audit.enqueue(record)
        except Exception as exc:  # noqa: BLE001 - audit enqueue must not break response
            _log.warning("audit_enqueue_failed", error=str(exc))
    async def _release_concurrency(self) -> None:
        try:
            await self._conc.release(self._concurrency_key)
        except Exception as exc:  # noqa: BLE001 - release is best-effort
            _log.warning("concurrency_release_failed", error=str(exc))
    def _client_ip(self) -> str | None:
        xff = self._request.headers.get("x-forwarded-for")
        if xff:
            return xff.split(",")[0].strip()
        return self._request.client.host if self._request.client else None
 def _merge_last_ndjson(chunk: bytes, prev: dict[str, object]) -> dict[str, object]:
    """Track the last complete NDJSON object seen in a raw byte chunk.
    Token counts live on the final ``done`` frame. We parse only complete lines
    and keep the last successfully-parsed object; partial trailing data is
    ignored here and will be completed by a subsequent chunk.
    """
    text = chunk.decode("utf-8", errors="ignore")
    last = prev
    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            continue
        if isinstance(obj, dict):
            last = obj
    return last
 def _is_uuid(value: str) -> bool:
    try:
        uuid.UUID(value)
    except ValueError:
        return False
    return True
 def model_of(body: dict[str, object]) -> str:
    """Extract the requested model name from a request body (empty if absent)."""
    model = body.get("model")
    return model if isinstance(model, str) else ""
 async def read_json_body(request: Request, settings: Settings) -> dict[str, object]:
    """Read + size-limit the request body, returning the parsed JSON object."""
    raw = await request.body()
    if len(raw) > settings.max_request_body_bytes:
        raise RequestTooLargeError(internal_detail=f"body {len(raw)} bytes")
    if not raw:
        return {}
    try:
        parsed: Any = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise AuthorizationError(internal_detail="invalid JSON body") from exc
    return parsed if isinstance(parsed, dict) else {}
 __all__ = [
    "NDJSON_MEDIA_TYPE",
    "SSE_MEDIA_TYPE",
    "Pipeline",
    "RateHeaders",
    "model_of",
    "read_json_body",
 ]
--- a/src/neuronetz_gateway/proxy/token_counter.py
+++ b/src/neuronetz_gateway/proxy/token_counter.py
@@ -0,0 +1,50 @@
 """Precise token accounting parsed from Ollama responses (SPEC §2, §13.1).
 Tokens are read from Ollama's reported ``prompt_eval_count`` (input) and
 ``eval_count`` (output) on the final stream frame — never heuristically
 estimated. Embeddings charge ``prompt_eval_count`` only (SPEC §13.1); they have
 no ``eval_count`` so output tokens are reported as zero.
 """
 from __future__ import annotations
 from dataclasses import dataclass
@dataclass(frozen=True, slots=True)
 class TokenUsage:
    """Token counts extracted from an Ollama response."""
    tokens_in: int
    tokens_out: int
    @property
    def total(self) -> int:
        """Combined input + output tokens."""
        return self.tokens_in + self.tokens_out
 def _as_int(value: object) -> int:
    """Coerce an Ollama-reported count to a non-negative int (0 if absent/bad)."""
    if isinstance(value, bool):
        return 0
    if isinstance(value, int):
        return max(value, 0)
    if isinstance(value, float):
        return max(int(value), 0)
    return 0
 def extract_usage(final_frame: dict[str, object]) -> TokenUsage:
    """Extract ``prompt_eval_count``/``eval_count`` from the final Ollama frame.
    Works for chat/generate final frames and for embeddings responses (which
    carry ``prompt_eval_count`` but no ``eval_count``).
    """
    return TokenUsage(
        tokens_in=_as_int(final_frame.get("prompt_eval_count")),
        tokens_out=_as_int(final_frame.get("eval_count")),
    )
 __all__ = ["TokenUsage", "extract_usage"]
--- a/src/neuronetz_gateway/proxy/translate.py
+++ b/src/neuronetz_gateway/proxy/translate.py
@@ -0,0 +1,245 @@
 """OpenAI <-> Ollama schema translation (SPEC §6.3).
 Native ``/api/*`` speaks NDJSON; OpenAI-compat ``/v1/*`` speaks SSE
 (``data: {...}\\n\\n`` … ``data: [DONE]\\n\\n``). These helpers translate request
 bodies in both directions and convert native Ollama stream frames into OpenAI
 chunk objects, preserving streaming. Unknown OpenAI sampling params are mapped
 into Ollama's ``options`` block; unrecognized keys are dropped rather than
 forwarded blindly.
 """
 from __future__ import annotations
 import time
 import uuid
 from typing import Any
 # OpenAI sampling params that map into Ollama's ``options`` object.
 _OPTION_KEYS: tuple[str, ...] = (
    "temperature",
    "top_p",
    "top_k",
    "seed",
    "stop",
    "presence_penalty",
    "frequency_penalty",
 )
 def _as_int(value: object) -> int:
    """Coerce a JSON-derived value to a non-negative int (0 if absent/invalid)."""
    if isinstance(value, bool):
        return 0
    if isinstance(value, int):
        return max(value, 0)
    if isinstance(value, float):
        return max(int(value), 0)
    return 0
 def _build_options(payload: dict[str, Any]) -> dict[str, Any]:
    """Collect OpenAI sampling params into an Ollama ``options`` mapping."""
    options: dict[str, Any] = {}
    for key in _OPTION_KEYS:
        if key in payload and payload[key] is not None:
            options[key] = payload[key]
    if "max_tokens" in payload and payload["max_tokens"] is not None:
        options["num_predict"] = payload["max_tokens"]
    return options
 def openai_chat_to_ollama(payload: dict[str, object]) -> dict[str, object]:
    """Translate an OpenAI chat-completion request to an Ollama ``/api/chat`` body."""
    body: dict[str, object] = {
        "model": payload.get("model"),
        "messages": payload.get("messages", []),
        "stream": bool(payload.get("stream", False)),
    }
    options = _build_options(dict(payload))
    if options:
        body["options"] = options
    return body
 def openai_completion_to_ollama(payload: dict[str, object]) -> dict[str, object]:
    """Translate an OpenAI completion request to an Ollama ``/api/generate`` body."""
    prompt = payload.get("prompt", "")
    if isinstance(prompt, list):
        prompt = "".join(str(p) for p in prompt)
    body: dict[str, object] = {
        "model": payload.get("model"),
        "prompt": prompt,
        "stream": bool(payload.get("stream", False)),
    }
    options = _build_options(dict(payload))
    if options:
        body["options"] = options
    return body
 def openai_embeddings_to_ollama(payload: dict[str, object]) -> dict[str, object]:
    """Translate an OpenAI embeddings request to an Ollama ``/api/embed`` body."""
    return {
        "model": payload.get("model"),
        "input": payload.get("input", ""),
    }
 def _completion_id() -> str:
    """Generate an OpenAI-style completion id."""
    return f"chatcmpl-{uuid.uuid4().hex}"
 def ollama_chat_chunk_to_openai(
    chunk: dict[str, object], *, completion_id: str, model: str, created: int
 ) -> dict[str, object]:
    """Translate one Ollama ``/api/chat`` NDJSON frame to an OpenAI SSE chunk."""
    done = bool(chunk.get("done"))
    if done:
        return {
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model,
            "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}],
            "usage": {
                "prompt_tokens": _as_int(chunk.get("prompt_eval_count")),
                "completion_tokens": _as_int(chunk.get("eval_count")),
                "total_tokens": _as_int(chunk.get("prompt_eval_count"))
                + _as_int(chunk.get("eval_count")),
            },
        }
    message = chunk.get("message")
    content = ""
    if isinstance(message, dict):
        content = str(message.get("content", ""))
    return {
        "id": completion_id,
        "object": "chat.completion.chunk",
        "created": created,
        "model": model,
        "choices": [{"index": 0, "delta": {"content": content}, "finish_reason": None}],
    }
 def ollama_generate_chunk_to_openai(
    chunk: dict[str, object], *, completion_id: str, model: str, created: int
 ) -> dict[str, object]:
    """Translate one Ollama ``/api/generate`` NDJSON frame to an OpenAI text chunk."""
    done = bool(chunk.get("done"))
    if done:
        return {
            "id": completion_id,
            "object": "text_completion",
            "created": created,
            "model": model,
            "choices": [{"index": 0, "text": "", "finish_reason": "stop"}],
            "usage": {
                "prompt_tokens": _as_int(chunk.get("prompt_eval_count")),
                "completion_tokens": _as_int(chunk.get("eval_count")),
                "total_tokens": _as_int(chunk.get("prompt_eval_count"))
                + _as_int(chunk.get("eval_count")),
            },
        }
    return {
        "id": completion_id,
        "object": "text_completion",
        "created": created,
        "model": model,
        "choices": [{"index": 0, "text": str(chunk.get("response", "")), "finish_reason": None}],
    }
 def ollama_chat_to_openai(payload: dict[str, object]) -> dict[str, object]:
    """Translate a *non-streaming* Ollama chat response to an OpenAI completion."""
    message = payload.get("message")
    content = ""
    if isinstance(message, dict):
        content = str(message.get("content", ""))
    prompt_tokens = _as_int(payload.get("prompt_eval_count"))
    completion_tokens = _as_int(payload.get("eval_count"))
    return {
        "id": _completion_id(),
        "object": "chat.completion",
        "created": int(time.time()),
        "model": str(payload.get("model", "")),
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": content},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
 def ollama_generate_to_openai(payload: dict[str, object]) -> dict[str, object]:
    """Translate a *non-streaming* Ollama generate response to OpenAI completion."""
    prompt_tokens = _as_int(payload.get("prompt_eval_count"))
    completion_tokens = _as_int(payload.get("eval_count"))
    return {
        "id": _completion_id(),
        "object": "text_completion",
        "created": int(time.time()),
        "model": str(payload.get("model", "")),
        "choices": [
            {"index": 0, "text": str(payload.get("response", "")), "finish_reason": "stop"}
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
 def ollama_embed_to_openai(payload: dict[str, object], model: str) -> dict[str, object]:
    """Translate an Ollama ``/api/embed`` response to the OpenAI embeddings shape."""
    raw = payload.get("embeddings")
    vectors: list[list[float]] = raw if isinstance(raw, list) else []
    prompt_tokens = _as_int(payload.get("prompt_eval_count"))
    return {
        "object": "list",
        "data": [
            {"object": "embedding", "index": i, "embedding": vec}
            for i, vec in enumerate(vectors)
        ],
        "model": model,
        "usage": {"prompt_tokens": prompt_tokens, "total_tokens": prompt_tokens},
    }
 def models_to_openai_list(names: list[str]) -> dict[str, object]:
    """Render a list of model names in the OpenAI ``/v1/models`` list format."""
    created = int(time.time())
    return {
        "object": "list",
        "data": [
            {"id": name, "object": "model", "created": created, "owned_by": "neuronetz"}
            for name in names
        ],
    }
 def new_completion_id() -> str:
    """Public helper to mint a completion id for a streaming response."""
    return _completion_id()
 __all__ = [
    "models_to_openai_list",
    "new_completion_id",
    "ollama_chat_chunk_to_openai",
    "ollama_chat_to_openai",
    "ollama_embed_to_openai",
    "ollama_generate_chunk_to_openai",
    "ollama_generate_to_openai",
    "openai_chat_to_ollama",
    "openai_completion_to_ollama",
    "openai_embeddings_to_ollama",
 ]
--- a/src/neuronetz_gateway/ratelimit/init.py
+++ b/src/neuronetz_gateway/ratelimit/init.py
@@ -0,0 +1,3 @@
 """Rate limiting: sliding-window RPM/TPM and concurrency semaphore (Redis)."""
 from __future__ import annotations
--- a/src/neuronetz_gateway/ratelimit/concurrency.py
+++ b/src/neuronetz_gateway/ratelimit/concurrency.py
@@ -0,0 +1,66 @@
 """Concurrent-connection semaphore backed by Redis ``INCR`` with a TTL guard.
 Acquired before proxying and released on stream close. A TTL on the counter
 prevents a crashed worker from leaking permits forever (self-healing). Fails
 closed (SPEC §4.4): if Redis is unreachable, acquisition raises
 :class:`DependencyUnavailableError` so the caller denies (503).
 """
 from __future__ import annotations
 from collections.abc import Awaitable
 from typing import cast
 import redis.asyncio as redis
 from redis.exceptions import RedisError
 from neuronetz_gateway.errors import DependencyUnavailableError
 # Default guard TTL: a permit auto-expires if not explicitly released, so a
 # crashed worker cannot pin the semaphore. Comfortably longer than any single
 # stream is expected to take is set per-call via ``ttl_s``.
 class ConcurrencyLimiter:
    """Redis-backed concurrency cap with a self-healing TTL guard."""
    def __init__(self, client: redis.Redis) -> None:
        self._client = client
    async def acquire(self, scope_key: str, limit: int, ttl_s: int) -> bool:
        """Try to acquire a permit; return False (deny) if at capacity.
        Increments the counter, refreshes the TTL guard, and rolls back the
        increment if the new value exceeds ``limit``. Raises on Redis failure so
        the caller fails closed.
        """
        try:
            count = await cast("Awaitable[int]", self._client.incr(scope_key))
            await self._client.expire(scope_key, ttl_s)
        except RedisError as exc:
            raise DependencyUnavailableError(
                internal_detail=f"concurrency redis error: {exc!r}"
            ) from exc
        if count > limit:
            # Over capacity: undo our increment and deny.
            try:
                await self._client.decr(scope_key)
            except RedisError:
                # Permit self-heals via the TTL guard; denial still stands.
                return False
            return False
        return True
    async def release(self, scope_key: str) -> None:
        """Release a previously acquired permit (best-effort; never raises)."""
        try:
            count = await cast("Awaitable[int]", self._client.decr(scope_key))
            if count < 0:
                await self._client.set(scope_key, 0)
        except RedisError:
            # The TTL guard will reclaim the permit; releasing must not break the
            # request that already completed.
            return
 __all__ = ["ConcurrencyLimiter"]
--- a/src/neuronetz_gateway/ratelimit/sliding_window.py
+++ b/src/neuronetz_gateway/ratelimit/sliding_window.py
@@ -0,0 +1,109 @@
 """Sliding-window rate limiter backed by an atomic Redis Lua script.
 Enforces per-key and per-tenant RPM and per-key TPM. The window is a sorted set
 of timestamped hits (a true sliding window, not a fixed bucket): each check
 trims entries older than ``window_s``, sums the cost of what remains, and admits
 the new cost only if it keeps the total within ``limit``. The trim + sum +
 conditional add run inside one Lua script so the decision is atomic across
 concurrent workers (SPEC §4.3 step 4).
 Fail-closed (SPEC §4.4): if Redis is unavailable the limiter raises
 :class:`DependencyUnavailableError`; the caller must deny (503), never allow.
 """
 from __future__ import annotations
 import time
 from dataclasses import dataclass
 import redis.asyncio as redis
 from redis.exceptions import RedisError
 from neuronetz_gateway.errors import DependencyUnavailableError
 # KEYS[1] = zset key
 # ARGV[1] = now (ms)  ARGV[2] = window (ms)  ARGV[3] = limit
 # ARGV[4] = cost      ARGV[5] = unique member suffix
 # Returns: {allowed (1/0), used_after, retry_after_ms}
 _LUA = """
 local key = KEYS[1]
 local now = tonumber(ARGV[1])
 local window = tonumber(ARGV[2])
 local limit = tonumber(ARGV[3])
 local cost = tonumber(ARGV[4])
 local member = ARGV[5]
 redis.call('ZREMRANGEBYSCORE', key, 0, now - window)
 -- Each surviving member encodes its cost as a ':<cost>' suffix; sum them so the
 -- window total is cost-weighted (RPM uses cost 1, TPM uses token count).
 local total = 0
 local data = redis.call('ZRANGEBYSCORE', key, now - window, now)
 for i = 1, #data do
    local sep = string.find(data[i], ':[^:]*$')
    if sep then
        total = total + tonumber(string.sub(data[i], sep + 1))
    else
        total = total + 1
    end
 end
 if total + cost > limit then
    local oldest = redis.call('ZRANGE', key, 0, 0, 'WITHSCORES')
    local retry = window
    if oldest[2] then
        retry = (tonumber(oldest[2]) + window) - now
        if retry < 0 then retry = 0 end
    end
    return {0, total, retry}
 end
 redis.call('ZADD', key, now, member .. ':' .. cost)
 redis.call('PEXPIRE', key, window)
 return {1, total + cost, 0}
 """
@dataclass(frozen=True, slots=True)
 class RateLimitResult:
    """Outcome of a rate-limit check."""
    allowed: bool
    limit: int
    remaining: int
    retry_after_s: int | None
 class SlidingWindowLimiter:
    """Redis-backed sliding-window limiter (atomic via Lua)."""
    def __init__(self, client: redis.Redis) -> None:
        self._client = client
        self._script = client.register_script(_LUA)
    async def check(self, key: str, limit: int, window_s: int, cost: int = 1) -> RateLimitResult:
        """Atomically record a hit of ``cost`` and report admission.
        Raises :class:`DependencyUnavailableError` if Redis cannot be reached, so
        the caller fails closed.
        """
        now_ms = int(time.time() * 1000)
        window_ms = window_s * 1000
        member = f"{now_ms}-{id(object())}"
        try:
            raw = await self._script(
                keys=[key], args=[now_ms, window_ms, limit, cost, member]
            )
        except RedisError as exc:
            raise DependencyUnavailableError(
                internal_detail=f"ratelimit redis error: {exc!r}"
            ) from exc
        allowed_i, used_after, retry_ms = (int(raw[0]), int(raw[1]), int(raw[2]))
        allowed = allowed_i == 1
        remaining = max(limit - used_after, 0)
        retry_after_s = None if allowed else max(1, (retry_ms + 999) // 1000)
        return RateLimitResult(
            allowed=allowed, limit=limit, remaining=remaining, retry_after_s=retry_after_s
        )
 __all__ = ["RateLimitResult", "SlidingWindowLimiter"]
--- a/src/neuronetz_gateway/revocation.py
+++ b/src/neuronetz_gateway/revocation.py
@@ -0,0 +1,97 @@
 """Key-revocation NOTIFY listener (SPEC §4.5).
 Console (or the gateway's own CLI) revokes a key by inserting into
 ``gateway.revocations``; an ``AFTER INSERT`` trigger fires
 ``pg_notify('key_revoked', key_id)``. This background task LISTENs on that
 channel and, on each notification, evicts the Redis auth-cache entry for the
 revoked key's prefix so the next request misses the cache, re-reads the DB,
 finds the key non-active, and is rejected — making revocation effective within
 one Redis RTT without any cross-service HTTP.
 The listener resolves ``key_id -> prefix`` via a short DB lookup (the NOTIFY
 payload is the key id, but the cache is keyed by prefix). It is resilient: a
 dropped connection is retried with backoff.
 """
 from __future__ import annotations
 import asyncio
 import contextlib
 import uuid
 import asyncpg
 import redis.asyncio as redis
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
 from neuronetz_gateway.config import Settings
 from neuronetz_gateway.db.models import ApiKey
 from neuronetz_gateway.observability.logging import get_logger
 _log = get_logger("revocation")
 _CHANNEL = "key_revoked"
 _CACHE_PREFIX = "gateway:key:"
 def _asyncpg_dsn(database_url: str) -> str:
    """Strip the SQLAlchemy ``+asyncpg`` driver tag for a raw asyncpg connect."""
    return database_url.replace("postgresql+asyncpg://", "postgresql://")
 async def _evict(
    key_id_text: str,
    sessionmaker: async_sessionmaker[AsyncSession],
    redis_client: redis.Redis,
 ) -> None:
    """Resolve the key id to its prefix and delete the cached principal."""
    try:
        key_id = uuid.UUID(key_id_text)
    except ValueError:
        _log.warning("revocation_bad_payload", payload=key_id_text)
        return
    try:
        async with sessionmaker() as session:
            key = await session.get(ApiKey, key_id)
            prefix = key.prefix if key is not None else None
        if prefix is not None:
            await redis_client.delete(_CACHE_PREFIX + prefix)
            _log.info("revocation_cache_evicted", key_prefix=prefix)
    except Exception as exc:  # noqa: BLE001 - listener must survive transient errors
        _log.warning("revocation_evict_failed", error=str(exc))
 async def revocation_listener(
    settings: Settings,
    redis_client: redis.Redis,
    sessionmaker: async_sessionmaker[AsyncSession],
 ) -> None:
    """LISTEN on ``key_revoked`` and evict the Redis cache on each notification."""
    dsn = _asyncpg_dsn(settings.database_url)
    while True:
        conn = None
        try:
            conn = await asyncpg.connect(dsn)
            def _on_notify(
                _c: object, _pid: int, _channel: str, payload: str
            ) -> None:
                # Schedule the async eviction without blocking the callback.
                asyncio.create_task(_evict(payload, sessionmaker, redis_client))  # noqa: RUF006
            await conn.add_listener(_CHANNEL, _on_notify)
            _log.info("revocation_listener_started")
            # Wait forever; notifications arrive via the callback. asyncio.Event
            # is a cancel-friendly substitute for an unbounded sleep loop.
            await asyncio.Event().wait()
        except asyncio.CancelledError:
            raise
        except Exception as exc:  # noqa: BLE001 - reconnect on any failure
            _log.warning("revocation_listener_reconnect", error=str(exc))
            await asyncio.sleep(2)
        finally:
            if conn is not None:
                with contextlib.suppress(Exception):
                    await conn.close()
 __all__ = ["revocation_listener"]
--- a/src/neuronetz_gateway/routes/ollama_native.py
+++ b/src/neuronetz_gateway/routes/ollama_native.py
@@ -0,0 +1,168 @@
 """Native Ollama passthrough routes (SPEC §6.1).
 All proxied endpoints run through the shared :class:`Pipeline` (auth has already
 attached the principal in middleware): scope + model + endpoint allowlist, rate
 limit, budget, concurrency, body validation, then stream/relay with post-close
 token-count + audit + budget accounting.
 Mutating endpoints (``/api/pull|push|create|copy|delete``, ``/api/blobs/*``) and
 ``/api/ps`` are hard-blocked (SPEC §6.2) and intentionally NOT routed; a catch-all
 returns a generic 403 so their existence is never confirmed.
 """
 from __future__ import annotations
 from fastapi import APIRouter, Request
 from starlette.responses import JSONResponse, Response
 from neuronetz_gateway import __version__
 from neuronetz_gateway.deps import (
    ConfigDep,
    DiscoveryCacheDep,
    OllamaClientDep,
    PipelineDep,
    PrincipalDep,
 )
 from neuronetz_gateway.errors import AuthorizationError
 from neuronetz_gateway.proxy.allowlist import is_hard_blocked, resolve_effective_models
 from neuronetz_gateway.proxy.pipeline import model_of, read_json_body
 router = APIRouter(prefix="/api", tags=["ollama-native"])
@router.post("/chat")
 async def chat(request: Request, pipeline: PipelineDep) -> Response:
    """Proxy ``POST /api/chat`` (streamed NDJSON / non-streamed)."""
    return await _chat_or_generate(request, pipeline, path="/api/chat", scope="chat")
@router.post("/generate")
 async def generate(request: Request, pipeline: PipelineDep) -> Response:
    """Proxy ``POST /api/generate`` (streamed NDJSON / non-streamed)."""
    return await _chat_or_generate(request, pipeline, path="/api/generate", scope="chat")
 async def _chat_or_generate(
    request: Request, pipeline: PipelineDep, *, path: str, scope: str
 ) -> Response:
    body = await read_json_body(request, pipeline.settings)
    model = model_of(body)
    pipeline.check_scope(scope)
    pipeline.check_endpoint(path)
    pipeline.check_model(model)
    pipeline.validate_body(body)
    await pipeline.enforce_limits()
    if bool(body.get("stream", True)):
        return await pipeline.stream_native("POST", path, body, model)
    return await pipeline.request_native("POST", path, body, model)
@router.post("/embeddings")
 async def embeddings(request: Request, pipeline: PipelineDep) -> Response:
    """Proxy ``POST /api/embeddings`` (legacy, non-streamed)."""
    return await _embeddings(request, pipeline, "/api/embeddings")
@router.post("/embed")
 async def embed(request: Request, pipeline: PipelineDep) -> Response:
    """Proxy ``POST /api/embed`` (non-streamed)."""
    return await _embeddings(request, pipeline, "/api/embed")
 async def _embeddings(request: Request, pipeline: PipelineDep, path: str) -> Response:
    body = await read_json_body(request, pipeline.settings)
    model = model_of(body)
    pipeline.check_scope("embeddings")
    pipeline.check_endpoint(path)
    pipeline.check_model(model)
    await pipeline.enforce_limits()
    return await pipeline.request_native("POST", path, body, model)
@router.get("/tags")
 async def tags(principal: PrincipalDep, discovery: DiscoveryCacheDep) -> JSONResponse:
    """Return the tenant's effective model set (live-discovered ∩ allowed)."""
    effective = resolve_effective_models(
        allow_all=principal.allow_all_models,
        allowed_models=principal.allowed_models,
        discovered=discovery.names,
    )
    models = [
        {
            "name": m.name,
            "model": m.name,
            "modified_at": m.modified_at,
            "size": m.size_bytes,
            "details": {
                "family": m.family,
                "parameter_size": m.parameter_size,
                "quantization_level": m.quantization,
            },
        }
        for m in discovery.models
        if m.name in effective
    ]
    return JSONResponse({"models": models})
@router.post("/show")
 async def show(
    request: Request,
    principal: PrincipalDep,
    discovery: DiscoveryCacheDep,
    ollama: OllamaClientDep,
    settings: ConfigDep,
 ) -> JSONResponse:
    """Proxy ``POST /api/show`` for an effective-set model; sanitize the result."""
    body = await read_json_body(request, settings)
    name = body.get("model") or body.get("name")
    model = name if isinstance(name, str) else ""
    if not model or model not in resolve_effective_models(
        allow_all=principal.allow_all_models,
        allowed_models=principal.allowed_models,
        discovered=discovery.names,
    ):
        raise AuthorizationError(internal_detail="show: model not in effective set")
    resp = await ollama.request("POST", "/api/show", {"model": model})
    payload = resp.json()
    raw: dict[str, object] = payload if isinstance(payload, dict) else {}
    # Strip system prompt + template (SPEC §6.1: no system prompts, no template).
    raw_details = raw.get("details")
    details: dict[str, object] = raw_details if isinstance(raw_details, dict) else {}
    return JSONResponse(
        {
            "model": model,
            "details": {
                "family": details.get("family"),
                "parameter_size": details.get("parameter_size"),
                "quantization_level": details.get("quantization_level"),
            },
        }
    )
@router.get("/version")
 async def version() -> JSONResponse:
    """Return the gateway version (never Ollama's; SPEC §6.1)."""
    return JSONResponse({"version": __version__})
@router.api_route(
    "/{rest:path}",
    methods=["GET", "POST", "PUT", "DELETE", "HEAD", "PATCH"],
    include_in_schema=False,
 )
 async def catch_all(rest: str) -> Response:
    """Generic 403 for any other ``/api/*`` path (hard-blocked or unknown).
    Mutating endpoints and ``/api/ps`` resolve here and return the same generic
    forbidden response, so the gateway never confirms which upstream endpoints
    exist (SPEC §6.2, §13.6).
    """
    full = f"/api/{rest}"
    if is_hard_blocked(full):
        raise AuthorizationError(internal_detail=f"hard-blocked {full}")
    raise AuthorizationError(internal_detail=f"unrouted upstream path {full}")
 __all__ = ["router"]
--- a/src/neuronetz_gateway/routes/openai_compat.py
+++ b/src/neuronetz_gateway/routes/openai_compat.py
@@ -0,0 +1,118 @@
 """OpenAI-compatible routes (SPEC §6.3).
 Each route translates the OpenAI request into the native Ollama body, runs the
 same :class:`Pipeline` enforcement as the native routes, and translates the
 response back. Streaming uses SSE (``data: {...}\\n\\n`` … ``data: [DONE]\\n\\n``);
 non-streaming returns a single OpenAI-shaped JSON object. ``/v1/models`` returns
 the tenant's effective discovered set in OpenAI list format.
 """
 from __future__ import annotations
 import time
 from fastapi import APIRouter, Request
 from starlette.responses import JSONResponse, Response
 from neuronetz_gateway.deps import (
    DiscoveryCacheDep,
    PipelineDep,
    PrincipalDep,
 )
 from neuronetz_gateway.proxy.allowlist import resolve_effective_models
 from neuronetz_gateway.proxy.pipeline import model_of, read_json_body
 from neuronetz_gateway.proxy.translate import (
    models_to_openai_list,
    new_completion_id,
    ollama_chat_chunk_to_openai,
    ollama_chat_to_openai,
    ollama_embed_to_openai,
    ollama_generate_chunk_to_openai,
    ollama_generate_to_openai,
    openai_chat_to_ollama,
    openai_completion_to_ollama,
    openai_embeddings_to_ollama,
 )
 router = APIRouter(prefix="/v1", tags=["openai-compat"])
@router.post("/chat/completions")
 async def chat_completions(request: Request, pipeline: PipelineDep) -> Response:
    """OpenAI ``/v1/chat/completions`` -> Ollama ``/api/chat``."""
    payload = await read_json_body(request, pipeline.settings)
    body = openai_chat_to_ollama(payload)
    model = model_of(body)
    pipeline.check_scope("chat")
    pipeline.check_endpoint("/api/chat")
    pipeline.check_model(model)
    pipeline.validate_body(body)
    await pipeline.enforce_limits()
    if bool(payload.get("stream", False)):
        completion_id = new_completion_id()
        created = int(time.time())
        def translate(chunk: dict[str, object]) -> dict[str, object]:
            return ollama_chat_chunk_to_openai(
                chunk, completion_id=completion_id, model=model, created=created
            )
        return await pipeline.stream_openai("POST", "/api/chat", body, model, translate)
    return await pipeline.request_translated(
        "POST", "/api/chat", body, model, ollama_chat_to_openai
    )
@router.post("/completions")
 async def completions(request: Request, pipeline: PipelineDep) -> Response:
    """OpenAI ``/v1/completions`` -> Ollama ``/api/generate``."""
    payload = await read_json_body(request, pipeline.settings)
    body = openai_completion_to_ollama(payload)
    model = model_of(body)
    pipeline.check_scope("chat")
    pipeline.check_endpoint("/api/generate")
    pipeline.check_model(model)
    pipeline.validate_body(body)
    await pipeline.enforce_limits()
    if bool(payload.get("stream", False)):
        completion_id = new_completion_id()
        created = int(time.time())
        def translate(chunk: dict[str, object]) -> dict[str, object]:
            return ollama_generate_chunk_to_openai(
                chunk, completion_id=completion_id, model=model, created=created
            )
        return await pipeline.stream_openai("POST", "/api/generate", body, model, translate)
    return await pipeline.request_translated(
        "POST", "/api/generate", body, model, ollama_generate_to_openai
    )
@router.post("/embeddings")
 async def embeddings(request: Request, pipeline: PipelineDep) -> Response:
    """OpenAI ``/v1/embeddings`` -> Ollama ``/api/embed``."""
    payload = await read_json_body(request, pipeline.settings)
    body = openai_embeddings_to_ollama(payload)
    model = model_of(body)
    pipeline.check_scope("embeddings")
    pipeline.check_endpoint("/api/embed")
    pipeline.check_model(model)
    await pipeline.enforce_limits()
    return await pipeline.request_translated(
        "POST", "/api/embed", body, model, lambda obj: ollama_embed_to_openai(obj, model)
    )
@router.get("/models")
 async def models(principal: PrincipalDep, discovery: DiscoveryCacheDep) -> JSONResponse:
    """OpenAI ``/v1/models`` -> the tenant's effective discovered set."""
    effective = resolve_effective_models(
        allow_all=principal.allow_all_models,
        allowed_models=principal.allowed_models,
        discovered=discovery.names,
    )
    return JSONResponse(models_to_openai_list(sorted(effective)))
 __all__ = ["router"]
		`@@ -0,0 +1,3 @@`
							`"""Audit logging: buffered async audit writer and opt-in prompt log."""`

							`from __future__ import annotations`
		`@@ -0,0 +1,3 @@`
							`"""Token budgets: Redis period counters and Postgres ledger reconciliation."""`

							`from __future__ import annotations`
		`@@ -0,0 +1,3 @@`
							`"""Proxy layer: Ollama client, schema translation, token counting, allowlists."""`

							`from __future__ import annotations`
		`@@ -0,0 +1,3 @@`
							`"""Rate limiting: sliding-window RPM/TPM and concurrency semaphore (Redis)."""`

							`from __future__ import annotations`