scaffold: project skeleton, schema, healthz/readyz, CI

Initial project structure for neuronetz-gateway per scope-docs/SPEC.md: - Python 3.12 / FastAPI / SQLAlchemy 2.0 (async) / Redis / Postgres stack managed by uv. Multi-stage non-root Dockerfile, prod + dev compose files (ollama service is NEVER published in either), Caddyfile + systemd unit, justfile, GitHub Actions CI (ruff, mypy --strict, pytest, bandit, pip-audit). - Pydantic-Settings config covering every env var from SPEC §7, including the MODEL_DISCOVERY_* keys for the dynamic-discovery feature (§4.6). - Alembic 0001_initial creates the full gateway schema (8 tables, 3 enums, notify_key_revoked() trigger), incl. allow_all_models on tenant_limits and key_limits for the per-tenant auto-grant toggle. - Working /healthz, /readyz (fail-closed when deps unreachable), and a Prometheus /metrics stub. Sanitizing error handlers that attach X-Request-ID to every response and never leak upstream internals. - SPEC + AGENT_PROMPT included under scope-docs/ (source of truth).
2026-05-26 20:50:35 +02:00
commit d79f17b3bb
32 changed files with 3610 additions and 0 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -0,0 +1,44 @@
 # Keep the build context lean and never ship secrets into an image layer.
 # Secrets / local env
 .env
 .env.*
 !.env.example
 # VCS & CI
 .git
 .gitignore
 .github
 # Python caches & build artefacts
 __pycache__/
 *.py[cod]
 *.egg-info/
 .eggs/
 build/
 dist/
 .venv/
 venv/
 .mypy_cache/
 .ruff_cache/
 .pytest_cache/
 .coverage
 htmlcov/
 coverage.xml
 # Tests & docs are not needed in the runtime image
 tests/
 docs/
 scope-docs/
 # Editor / OS cruft
 .idea/
 .vscode/
 *.swp
 .DS_Store
 # Compose / ops files don't belong in the image
 docker-compose*.yml
 ops/
 # NOTE: README.md and LICENSE are intentionally NOT ignored — the build backend
 # (hatchling) reads `readme`/`license` from pyproject.toml at build time.
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,63 @@
 # neuronetz-gateway — environment configuration (SPEC §7).
 #
 # Copy to `.env` and adjust. `.env` is gitignored and MUST NOT be committed.
 # All values here are SAFE EXAMPLES — change every secret before any real deploy.
 # ──────────────────────────── Service ────────────────────────────
 GATEWAY_BIND_HOST=0.0.0.0
 GATEWAY_BIND_PORT=8080
 GATEWAY_LOG_LEVEL=INFO
 GATEWAY_LOG_FORMAT=json                  # json|console
 GATEWAY_REQUEST_ID_HEADER=X-Request-ID
 GATEWAY_TRUSTED_PROXIES=127.0.0.1,caddy  # for X-Forwarded-For
 # ──────────────────────────── Upstream ───────────────────────────
 OLLAMA_BASE_URL=http://ollama:11434
 OLLAMA_CONNECT_TIMEOUT_S=5
 OLLAMA_READ_TIMEOUT_S=600
 OLLAMA_MAX_CONNECTIONS=64
 # ──────────────────────── Model discovery (§4.6) ─────────────────
 MODEL_DISCOVERY_REFRESH_S=60
 MODEL_DISCOVERY_CACHE_TTL_S=120
 # ──────────────────────────── Database ───────────────────────────
 # Compose builds DATABASE_URL from the POSTGRES_* parts below, but the gateway
 # also accepts a full DATABASE_URL directly.
 DATABASE_URL=postgresql+asyncpg://gateway:changeme@postgres:5432/neuronetz
 DATABASE_POOL_SIZE=10
 DATABASE_POOL_OVERFLOW=20
 # Postgres container credentials (consumed by docker-compose).
 POSTGRES_USER=gateway
 POSTGRES_PASSWORD=changeme
 POSTGRES_DB=neuronetz
 # ──────────────────────────── Redis ──────────────────────────────
 REDIS_URL=redis://redis:6379/0
 REDIS_KEY_CACHE_TTL_S=60
 # ────────────────── Limits (defaults; DB overrides) ──────────────
 DEFAULT_RPM=60
 DEFAULT_TPM=100000
 DEFAULT_CONCURRENT=8
 MAX_REQUEST_BODY_BYTES=262144
 MAX_NUM_PREDICT=4096
 # ──────────────────────────── Security ───────────────────────────
 ARGON2_TIME_COST=3
 ARGON2_MEMORY_COST_KIB=65536
 ARGON2_PARALLELISM=4
 AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN=20
 # ──────────────────────────── Audit ──────────────────────────────
 AUDIT_BUFFER_SIZE=1000
 PROMPT_LOG_DEFAULT_RETENTION_DAYS=30
 AUDIT_LOG_DEFAULT_RETENTION_DAYS=365
 # ──────────────── Playground / API docs (prod-safe: OFF) ─────────
 # Serve the playground HTML (owned by the docs agent) at /playground.
 PLAYGROUND_ENABLED=false
 PLAYGROUND_FILE=/app/playground/index.html
 # Enable FastAPI's /docs + /openapi.json (default off in production).
 DOCS_ENABLED=false
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,108 @@
 name: CI
 on:
  push:
    branches: ["**"]
  pull_request:
  workflow_dispatch:
 # Cancel superseded runs on the same ref.
 concurrency:
  group: ci-${{ github.ref }}
  cancel-in-progress: true
 env:
  PYTHON_VERSION: "3.12"
 jobs:
  lint:
    name: ruff
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true
      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: uv sync --extra dev
      - name: ruff check
        run: uv run ruff check .
  typecheck:
    name: mypy --strict
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true
      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: uv sync --extra dev
      - name: mypy
        run: uv run mypy --strict src
  test:
    name: pytest
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true
      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: uv sync --extra dev
      # Phase 1: an empty/placeholder suite must pass. pytest exits 5 when it
      # collects no tests; we treat that as success this phase. Coverage is
      # reported but not gated yet (no --cov-fail-under until later phases).
      - name: pytest
        shell: bash
        run: |
          set +e
          uv run pytest --cov=neuronetz_gateway --cov-report=term-missing
          code=$?
          if [ "$code" -eq 5 ]; then
            echo "::notice::No tests collected (Phase 1) — treating as success."
            exit 0
          fi
          exit "$code"
  bandit:
    name: bandit
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true
      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: uv sync --extra dev
      - name: bandit
        run: uv run bandit -q -r src
  pip-audit:
    name: pip-audit
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true
      - name: Set up Python
        run: uv python install ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: uv sync --extra dev
      - name: pip-audit
        run: uv run pip-audit
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,40 @@
 # Secrets — NEVER commit. Only .env.example is tracked.
 .env
 .env.*
 !.env.example
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.egg-info/
 .eggs/
 build/
 dist/
 *.so
 # Virtualenvs / uv
 .venv/
 venv/
 .python-version
 # Type / lint / test caches
 .mypy_cache/
 .ruff_cache/
 .pytest_cache/
 .coverage
 .coverage.*
 htmlcov/
 coverage.xml
 .tox/
 # Docker
 *.pid
 # Editor / OS
 .idea/
 .vscode/
 *.swp
 *~
 .DS_Store
 Thumbs.db
--- a/Dockerfile
+++ b/Dockerfile
@@ -0,0 +1,97 @@
 # syntax=docker/dockerfile:1.7
 #
 # neuronetz-gateway — multi-stage image.
 #
 #   builder stage : installs dependencies into a self-contained virtualenv using uv.
 #   runtime stage : copies the venv + source, drops to a NON-ROOT user, contains
 #                   no build tools, and runs `python -m neuronetz_gateway`.
 #
 # uv is copied from its official distroless image, so the build never has to
 # `pip install uv`. Dependencies come from pyproject.toml (+ uv.lock if present).
 # ----------------------------------------------------------------------------
 # Stage 1 — builder
 # ----------------------------------------------------------------------------
 FROM python:3.12-slim AS builder
 # Bring in the `uv` binary from its official image.
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
 # uv's project commands (`uv sync`) do not read VIRTUAL_ENV; they default to
 # ./.venv. UV_PROJECT_ENVIRONMENT points them at the same /opt/venv that the
 # `uv pip` steps install into and that the runtime stage copies, so locked
 # dependencies are not installed into a venv that never leaves the builder.
 ENV UV_LINK_MODE=copy \
     UV_COMPILE_BYTECODE=1 \
     UV_PYTHON_DOWNLOADS=never \
     # Create the project venv at a stable, copyable location.
     VIRTUAL_ENV=/opt/venv \
     UV_PROJECT_ENVIRONMENT=/opt/venv \
     PATH=/opt/venv/bin:$PATH
 WORKDIR /app
 # Create the target virtualenv up front so uv installs into it.
 RUN uv venv /opt/venv
 # Dependency layer: copy only the manifest(s) first for better caching.
 # uv.lock is optional in Phase 1 — the wildcard makes COPY succeed either way.
 COPY pyproject.toml ./
 COPY uv.loc[k] ./
 # Install dependencies. If a lockfile is present `uv sync` honours it; otherwise
 # we fall back to resolving straight from pyproject.toml. Either way the build
 # does NOT fail when the lock is absent.
 RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-dev ; \
    else \
        uv pip install --python /opt/venv/bin/python -r pyproject.toml ; \
    fi
 # Now copy the application source and install the project itself into the venv.
 # README.md + LICENSE are required by the build backend (pyproject `readme`/license).
 COPY README.md LICENSE ./
 COPY src ./src
 COPY alembi[c] ./alembic
 COPY alembic.in[i] ./
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --python /opt/venv/bin/python --no-deps .
 # ----------------------------------------------------------------------------
 # Stage 2 — runtime
 # ----------------------------------------------------------------------------
 FROM python:3.12-slim AS runtime
 # Runtime-only OS packages: curl is used by the compose healthcheck.
 RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*
 # Non-root user.
 RUN groupadd --system --gid 10001 gateway \
    && useradd --system --uid 10001 --gid gateway --home-dir /app --shell /usr/sbin/nologin gateway
 ENV VIRTUAL_ENV=/opt/venv \
    PATH=/opt/venv/bin:$PATH \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    GATEWAY_BIND_HOST=0.0.0.0 \
    GATEWAY_BIND_PORT=8080
 WORKDIR /app
 # Copy the fully-populated virtualenv and the application from the builder.
 COPY --from=builder /opt/venv /opt/venv
 COPY --from=builder /app/src ./src
 # alembic assets are optional during early scaffolding; copy if present.
 COPY --from=builder /app/alembi[c] ./alembic
 COPY --from=builder /app/alembic.in[i] ./
 # Drop privileges. No build tools are present in this stage.
 USER gateway
 EXPOSE 8080
 # Liveness probe target lives at /healthz (see SPEC §6.4).
 HEALTHCHECK --interval=15s --timeout=3s --start-period=20s --retries=5 \
    CMD curl -fsS "http://127.0.0.1:${GATEWAY_BIND_PORT}/healthz" || exit 1
 # Default command: run the server. Compose overrides this in dev to run
 # `alembic upgrade head` first (see docker-compose.dev.yml).
 CMD ["python", "-m", "neuronetz_gateway"]
--- a/LICENSE
+++ b/LICENSE
@@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright [yyyy] [name of copyright owner]
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,92 @@
 # neuronetz-gateway
 A secure, multi-tenant API gateway in front of an [Ollama](https://github.com/ollama/ollama)
 instance. It is the hot path of the Neuronetz API: every request to the models flows
 through here, authenticated, rate-limited, budgeted, and audited.
 **The Ollama backend is never reachable from the public internet.** It is bound to an
 internal Docker network with no published ports. All access is via this gateway, behind
 TLS terminated by Caddy.
 > Status: **v0.1.0 — in development.** See [`scope-docs/SPEC.md`](scope-docs/SPEC.md) for
 > the full specification and [`scope-docs/AGENT_PROMPT.md`](scope-docs/AGENT_PROMPT.md) for
 > the phased build plan. `SPEC.md` is the source of truth.
 ## What it does
 - **Auth** — API keys as Bearer tokens, stored as Argon2id hashes, verified in constant time.
 - **Multi-tenant** — tenants own keys; limits and budgets inherit tenant → key.
 - **Rate limiting** — per-key and per-tenant RPM / TPM / concurrent connections.
 - **Budgets** — daily / monthly / total token budgets, enforced fail-closed.
 - **Dual API surface** — native Ollama (`/api/*`) and OpenAI-compatible (`/v1/*`), both streaming.
 - **Hard-blocked mutations** — `/api/pull`, `/api/push`, `/api/create`, `/api/copy`,
  `/api/delete`, `/api/blobs/*` always return 403. Not configurable.
 - **Audit log** — always-on request metadata; opt-in, TTL'd prompt logging per key.
 Administration (dashboards, tenant self-service) lives in a separate service,
 `neuronetz-console`; it is **not** part of this repository.
 ## Architecture
 ```
 Internet ──TLS──> Caddy ──HTTP──> gateway ──┬──> Postgres   (keys, budgets, audit)
                                            ├──> Redis      (key cache, rate limits)
                                            └──> Ollama     (internal network only)
 ```
 ## Quickstart (dev)
 Requires Docker + Docker Compose. The dev stack runs Postgres, Redis, and the gateway —
 **no Caddy and no Ollama** (so `/readyz` reports 503 until a real Ollama backend is wired
 in; that is expected).
 ```bash
 git clone <repo> neuronetz-gateway && cd neuronetz-gateway
 cp .env.example .env          # adjust if you like; defaults work for local dev
 docker compose -f docker-compose.dev.yml up --build
 ```
 The gateway runs `alembic upgrade head` on startup, then serves on `http://localhost:8080`.
 ```bash
 curl -i http://localhost:8080/healthz   # -> 200  {"status":"ok"}
 curl -i http://localhost:8080/readyz    # -> 503  (no Ollama backend in the dev stack)
 ```
 ## Production
 `docker-compose.yml` brings up the full stack — Caddy (TLS via Let's Encrypt for
 `api.neuronetz.ai`), the gateway, Postgres, Redis, and Ollama. The `ollama` service has
 **no `ports:` mapping** and is reachable only on the internal Docker network. See
 [`docs/DEPLOYMENT.md`](docs/DEPLOYMENT.md) (added in a later phase) and
 [`ops/caddy/Caddyfile.example`](ops/caddy/Caddyfile.example).
 ## Managing tenants and keys
 Use the bootstrap CLI (Typer). Keys have the form `nz_<prefix><secret>`; the full key is
 printed exactly once at creation and only its Argon2id hash is stored.
 ```bash
 neuronetz-gateway create-tenant --name acme
 neuronetz-gateway create-key   --tenant acme --name prod-server-1
 neuronetz-gateway list-keys    --tenant acme
 neuronetz-gateway revoke-key   --prefix nz_abc12345
 ```
 ## Development
 ```bash
 just dev          # run the dev stack
 just test         # pytest + coverage
 just lint         # ruff
 just typecheck    # mypy --strict
 just migrate      # alembic upgrade head
 ```
 Tooling: Python 3.12, `uv`, FastAPI + uvicorn, SQLAlchemy 2.0 (async) + asyncpg, Redis,
 httpx, structlog, Pydantic. Lint/type/security gates: ruff, mypy `--strict`, bandit,
 pip-audit.
 ## License
 Apache 2.0 — see [`LICENSE`](LICENSE). Owner: Stephan Berbig / Neuronetz.
--- a/alembic.ini
+++ b/alembic.ini
@@ -0,0 +1,49 @@
 # Alembic configuration for neuronetz-gateway.
 # The database URL is read from the DATABASE_URL environment variable in
 # alembic/env.py (do not hardcode credentials here).
 [alembic]
 script_location = alembic
 prepend_sys_path = src
 version_path_separator = os
 # version_locations defaults to alembic/versions
 # DATABASE_URL is injected at runtime; this placeholder is never used directly.
 sqlalchemy.url = driver://user:pass@localhost/dbname
 [post_write_hooks]
 # (none)
 [loggers]
 keys = root,sqlalchemy,alembic
 [handlers]
 keys = console
 [formatters]
 keys = generic
 [logger_root]
 level = WARNING
 handlers = console
 qualname =
 [logger_sqlalchemy]
 level = WARNING
 handlers =
 qualname = sqlalchemy.engine
 [logger_alembic]
 level = INFO
 handlers =
 qualname = alembic
 [handler_console]
 class = StreamHandler
 args = (sys.stderr,)
 level = NOTSET
 formatter = generic
 [formatter_generic]
 format = %(levelname)-5.5s [%(name)s] %(message)s
 datefmt = %H:%M:%S
--- a/alembic/env.py
+++ b/alembic/env.py
@@ -0,0 +1,97 @@
 """Alembic environment for neuronetz-gateway (async engine).
 Reads ``DATABASE_URL`` from the environment (the same value the app uses,
 ``postgresql+asyncpg://...``). Ensures schema ``gateway`` exists and pins the
 Alembic version table into that schema so migration bookkeeping never collides
 with the ``console`` schema in the shared database.
 """
 from __future__ import annotations
 import asyncio
 import os
 from logging.config import fileConfig
 from alembic import context
 from sqlalchemy import pool, text
 from sqlalchemy.engine import Connection
 from sqlalchemy.ext.asyncio import async_engine_from_config
 from neuronetz_gateway.config import get_settings
 from neuronetz_gateway.db.models import GATEWAY_SCHEMA, Base
 config = context.config
 if config.config_file_name is not None:
    fileConfig(config.config_file_name)
 target_metadata = Base.metadata
 def _database_url() -> str:
     """Return the async database URL used for migrations.

     A non-empty ``DATABASE_URL`` environment variable takes precedence; when it
     is unset or empty the value comes from ``get_settings().database_url``.
     """
     env_url = os.environ.get("DATABASE_URL")
     if env_url:
         return env_url
     return get_settings().database_url
 def _configure_context(connection: Connection) -> None:
     """Bind Alembic's migration context to *connection*.

     The ``alembic_version`` bookkeeping table is placed in ``GATEWAY_SCHEMA``,
     and ``include_schemas`` / ``compare_type`` are both enabled.
     """
     version_table_name = "alembic_version"
     context.configure(
         connection=connection,
         target_metadata=target_metadata,
         # Where Alembic records the applied revision.
         version_table=version_table_name,
         version_table_schema=GATEWAY_SCHEMA,
         # Comparison switches.
         include_schemas=True,
         compare_type=True,
     )
 def run_migrations_offline() -> None:
     """Emit migration SQL in 'offline' mode, without a DBAPI connection.

     The context is configured from the resolved database URL only (no live
     connection is passed), with literal binds and the ``named`` paramstyle,
     and with the version table pinned to ``GATEWAY_SCHEMA``.
     """
     offline_url = _database_url()
     context.configure(
         url=offline_url,
         target_metadata=target_metadata,
         # Version bookkeeping lives in the gateway schema.
         version_table="alembic_version",
         version_table_schema=GATEWAY_SCHEMA,
         include_schemas=True,
         # Rendering options for the emitted SQL.
         literal_binds=True,
         dialect_opts={"paramstyle": "named"},
     )
     with context.begin_transaction():
         context.run_migrations()
 def _do_run_migrations(connection: Connection) -> None:
     """Ensure the schema exists, then run migrations within a transaction.

     The ``CREATE SCHEMA`` is committed in its own transaction before configuring
     Alembic. Under SQLAlchemy 2.0, ``execute()`` auto-begins a transaction; if it
     were left open, Alembic's ``begin_transaction()`` would treat the connection as
     caller-managed and become a no-op that never commits, so the whole migration
     (and the schema) would be rolled back on connection close. Committing here
     leaves the connection clean so Alembic owns — and commits — its own transaction.

     Args:
         connection: Synchronous connection on which the schema DDL and the
             migrations are executed.
     """
     # GATEWAY_SCHEMA is a constant imported from the project's models module,
     # not request input, so it is interpolated directly into the DDL.
     connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{GATEWAY_SCHEMA}"'))
     # Close the auto-begun transaction before Alembic opens its own (see above).
     connection.commit()
     _configure_context(connection)
     with context.begin_transaction():
         context.run_migrations()
 async def run_migrations_online() -> None:
     """Run migrations in 'online' mode using an async engine.

     The engine is built from the ini section named by
     ``config.config_ini_section`` with ``sqlalchemy.url`` overridden by
     ``_database_url()`` and ``poolclass=pool.NullPool``. The synchronous
     migration routine is then run over a single connection via ``run_sync``.

     The engine is disposed in a ``finally`` block so it is released even when a
     migration raises; the original exception still propagates to the caller.
     """
     configuration = config.get_section(config.config_ini_section) or {}
     configuration["sqlalchemy.url"] = _database_url()
     connectable = async_engine_from_config(
         configuration,
         prefix="sqlalchemy.",
         poolclass=pool.NullPool,
     )
     try:
         async with connectable.connect() as connection:
             await connection.run_sync(_do_run_migrations)
     finally:
         await connectable.dispose()
 # Module-level entry point: choose the offline or online path based on
 # context.is_offline_mode(). The online path is a coroutine, so it is driven to
 # completion with asyncio.run().
 if context.is_offline_mode():
     run_migrations_offline()
 else:
     asyncio.run(run_migrations_online())
--- a/alembic/versions/0001_initial.py
+++ b/alembic/versions/0001_initial.py
@@ -0,0 +1,342 @@
 """initial gateway schema
 Creates schema ``gateway``, the three enum types, all tables and indexes, and
 the ``notify_key_revoked()`` function plus ``trg_notify_key_revoked`` trigger,
 matching SPEC §5 verbatim in structure.
 Revision ID: 0001_initial
 Revises:
 Create Date: 2026-05-22
 """
 from __future__ import annotations
 from collections.abc import Sequence
 import sqlalchemy as sa
 from alembic import op
 from sqlalchemy.dialects import postgresql
 # revision identifiers, used by Alembic.
 # ``down_revision`` is None: this is the first revision in the chain.
 revision: str = "0001_initial"
 down_revision: str | None = None
 branch_labels: str | Sequence[str] | None = None
 depends_on: str | Sequence[str] | None = None
 # Schema passed as ``schema=`` to every table, index and enum object below.
 # NOTE: the raw SQL statements in this module spell the schema out literally as
 # ``gateway``, so changing this constant alone does not rename the schema.
 SCHEMA = "gateway"
 # Enum types are created explicitly via raw SQL below; the table columns
 # reference them with create_type=False so they are not created twice.
 # Status values for rows in ``api_keys``.
 _key_status = postgresql.ENUM(
    "active", "disabled", "revoked", name="key_status", schema=SCHEMA, create_type=False
 )
 # Status values for rows in ``tenants``.
 _tenant_status = postgresql.ENUM(
    "active", "suspended", "closed", name="tenant_status", schema=SCHEMA, create_type=False
 )
 # Accounting window kinds used as part of the ``budget_usage`` primary key.
 _budget_period = postgresql.ENUM(
    "day", "month", "total", name="budget_period", schema=SCHEMA, create_type=False
 )
 def upgrade() -> None:
    """Create the full ``gateway`` schema.

    Runs, in order: schema creation, the three enum types, the tables
    (``tenants``, ``tenant_limits``, ``api_keys``, ``key_limits``,
    ``budget_usage``, ``audit_log``, ``prompt_log``, ``revocations``) with
    their indexes, and finally the ``notify_key_revoked()`` function and the
    ``trg_notify_key_revoked`` trigger. Referenced tables are created before
    the tables whose foreign keys point at them.
    """
    # Idempotent: harmless when the schema already exists.
    op.execute(f'CREATE SCHEMA IF NOT EXISTS "{SCHEMA}"')
    # --- Enum types (SPEC §5) ---
    # The schema name is written out literally here rather than taken from SCHEMA.
    op.execute("CREATE TYPE gateway.key_status AS ENUM ('active', 'disabled', 'revoked')")
    op.execute("CREATE TYPE gateway.tenant_status AS ENUM ('active', 'suspended', 'closed')")
    op.execute("CREATE TYPE gateway.budget_period AS ENUM ('day', 'month', 'total')")
    # --- tenants ---
    op.create_table(
        "tenants",
        sa.Column(
            "id",
            postgresql.UUID(as_uuid=True),
            primary_key=True,
            server_default=sa.text("gen_random_uuid()"),
        ),
        sa.Column("name", sa.Text(), nullable=False, unique=True),
        sa.Column(
            "status", _tenant_status, nullable=False, server_default=sa.text("'active'")
        ),
        sa.Column(
            "created_at",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column(
            "metadata",
            postgresql.JSONB(),
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
        schema=SCHEMA,
    )
    # --- tenant_limits ---
    # One row per tenant: tenant_id is both the primary key and a cascading FK.
    op.create_table(
        "tenant_limits",
        sa.Column(
            "tenant_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{SCHEMA}.tenants.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column("rpm", sa.Integer(), nullable=False, server_default=sa.text("60")),
        sa.Column("tpm", sa.Integer(), nullable=False, server_default=sa.text("100000")),
        sa.Column("concurrent", sa.Integer(), nullable=False, server_default=sa.text("8")),
        sa.Column("tokens_daily", sa.BigInteger(), nullable=True),
        sa.Column("tokens_monthly", sa.BigInteger(), nullable=True),
        sa.Column("tokens_total", sa.BigInteger(), nullable=True),
        sa.Column(
            "allowed_models",
            postgresql.ARRAY(sa.Text()),
            nullable=False,
            server_default=sa.text("'{}'"),
        ),
        sa.Column(
            "allow_all_models",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("false"),
        ),
        sa.Column(
            "log_prompts_default",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("false"),
        ),
        sa.Column(
            "prompt_retention_days", sa.Integer(), nullable=False, server_default=sa.text("30")
        ),
        sa.Column(
            "audit_retention_days", sa.Integer(), nullable=False, server_default=sa.text("365")
        ),
        schema=SCHEMA,
    )
    # --- api_keys ---
    op.create_table(
        "api_keys",
        sa.Column(
            "id",
            postgresql.UUID(as_uuid=True),
            primary_key=True,
            server_default=sa.text("gen_random_uuid()"),
        ),
        sa.Column(
            "tenant_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{SCHEMA}.tenants.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("prefix", sa.Text(), nullable=False, unique=True),
        sa.Column("key_hash", sa.Text(), nullable=False),
        sa.Column("name", sa.Text(), nullable=False),
        sa.Column("status", _key_status, nullable=False, server_default=sa.text("'active'")),
        sa.Column(
            "scopes",
            postgresql.ARRAY(sa.Text()),
            nullable=False,
            server_default=sa.text("'{chat,embeddings}'"),
        ),
        sa.Column(
            "created_at",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("last_used_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
        sa.Column("expires_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
        sa.Column("log_prompts", sa.Boolean(), nullable=True),
        sa.Column(
            "metadata",
            postgresql.JSONB(),
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
        schema=SCHEMA,
    )
    # Partial index: only rows whose status is 'active' are indexed.
    op.create_index(
        "idx_api_keys_prefix",
        "api_keys",
        ["prefix"],
        schema=SCHEMA,
        postgresql_where=sa.text("status = 'active'"),
    )
    op.create_index("idx_api_keys_tenant", "api_keys", ["tenant_id"], schema=SCHEMA)
    # --- key_limits ---
    # One row per key (key_id is the primary key and a cascading FK). Every limit
    # column is nullable here, unlike tenant_limits — presumably NULL means
    # "fall back to the tenant value"; confirm against SPEC.
    op.create_table(
        "key_limits",
        sa.Column(
            "key_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{SCHEMA}.api_keys.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column("rpm", sa.Integer(), nullable=True),
        sa.Column("tpm", sa.Integer(), nullable=True),
        sa.Column("concurrent", sa.Integer(), nullable=True),
        sa.Column("tokens_daily", sa.BigInteger(), nullable=True),
        sa.Column("tokens_monthly", sa.BigInteger(), nullable=True),
        sa.Column("tokens_total", sa.BigInteger(), nullable=True),
        sa.Column("allowed_models", postgresql.ARRAY(sa.Text()), nullable=True),
        sa.Column("allow_all_models", sa.Boolean(), nullable=True),
        schema=SCHEMA,
    )
    # --- budget_usage ---
    # Composite primary key: (key_id, period, period_start).
    op.create_table(
        "budget_usage",
        sa.Column(
            "key_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{SCHEMA}.api_keys.id", ondelete="CASCADE"),
            primary_key=True,
            nullable=False,
        ),
        sa.Column("period", _budget_period, primary_key=True, nullable=False),
        sa.Column(
            "period_start",
            postgresql.TIMESTAMP(timezone=True),
            primary_key=True,
            nullable=False,
        ),
        sa.Column("tokens_in", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
        sa.Column("tokens_out", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
        sa.Column("requests", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
        schema=SCHEMA,
    )
    op.create_index(
        "idx_budget_usage_period",
        "budget_usage",
        ["period", "period_start"],
        schema=SCHEMA,
    )
    # --- audit_log ---
    # tenant_id / key_id are plain UUID columns here; no foreign keys are declared.
    op.create_table(
        "audit_log",
        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
        sa.Column(
            "ts",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("request_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("tenant_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("key_prefix", sa.Text(), nullable=True),
        sa.Column("method", sa.Text(), nullable=False),
        sa.Column("path", sa.Text(), nullable=False),
        sa.Column("model", sa.Text(), nullable=True),
        sa.Column("tokens_in", sa.Integer(), nullable=True),
        sa.Column("tokens_out", sa.Integer(), nullable=True),
        sa.Column("latency_ms", sa.Integer(), nullable=True),
        sa.Column("status", sa.Integer(), nullable=False),
        sa.Column("client_ip", postgresql.INET(), nullable=True),
        sa.Column("user_agent", sa.Text(), nullable=True),
        sa.Column("error_code", sa.Text(), nullable=True),
        schema=SCHEMA,
    )
    op.create_index("idx_audit_ts", "audit_log", ["ts"], schema=SCHEMA)
    op.create_index("idx_audit_tenant_ts", "audit_log", ["tenant_id", "ts"], schema=SCHEMA)
    op.create_index("idx_audit_key_ts", "audit_log", ["key_id", "ts"], schema=SCHEMA)
    # --- prompt_log ---
    # Rows are deleted together with their audit_log row (ON DELETE CASCADE);
    # key_id itself carries no foreign key.
    op.create_table(
        "prompt_log",
        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
        sa.Column(
            "audit_id",
            sa.BigInteger(),
            sa.ForeignKey(f"{SCHEMA}.audit_log.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "ts",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("request_body", postgresql.JSONB(), nullable=False),
        sa.Column("response_text", sa.Text(), nullable=True),
        sa.Column("retention_until", postgresql.TIMESTAMP(timezone=True), nullable=False),
        schema=SCHEMA,
    )
    op.create_index(
        "idx_prompt_log_retention", "prompt_log", ["retention_until"], schema=SCHEMA
    )
    # --- revocations ---
    # key_id carries no foreign key in this table.
    op.create_table(
        "revocations",
        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column(
            "ts",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("reason", sa.Text(), nullable=True),
        sa.Column("processed_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
        schema=SCHEMA,
    )
    # --- NOTIFY trigger on revocation insert (SPEC §5) ---
    # The function publishes the new row's key_id (as text) on the 'key_revoked'
    # channel; the trigger below calls it once per inserted row.
    op.execute(
        """
        CREATE OR REPLACE FUNCTION gateway.notify_key_revoked() RETURNS trigger AS $$
        BEGIN
            PERFORM pg_notify('key_revoked', NEW.key_id::text);
            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql;
        """
    )
    op.execute(
        """
        CREATE TRIGGER trg_notify_key_revoked
            AFTER INSERT ON gateway.revocations
            FOR EACH ROW EXECUTE FUNCTION gateway.notify_key_revoked();
        """
    )
 def downgrade() -> None:
    """Drop every object created by ``upgrade()``, in reverse dependency order.

    Order: the revocation trigger and its function, then the tables (each
    preceded by its indexes, dependents before the tables they reference), and
    finally the three enum types once no column uses them.

    The ``gateway`` schema itself is deliberately left in place. The project's
    Alembic environment keeps its version table in that schema, so a plain
    ``DROP SCHEMA`` can never succeed while Alembic is tracking revisions
    there, and a cascading drop would remove the table Alembic updates right
    after this function returns.
    """
    op.execute("DROP TRIGGER IF EXISTS trg_notify_key_revoked ON gateway.revocations")
    op.execute("DROP FUNCTION IF EXISTS gateway.notify_key_revoked()")
    # The table the trigger was attached to; it has no foreign keys either way.
    op.drop_table("revocations", schema=SCHEMA)
    op.drop_index("idx_prompt_log_retention", table_name="prompt_log", schema=SCHEMA)
    op.drop_table("prompt_log", schema=SCHEMA)
    op.drop_index("idx_audit_key_ts", table_name="audit_log", schema=SCHEMA)
    op.drop_index("idx_audit_tenant_ts", table_name="audit_log", schema=SCHEMA)
    op.drop_index("idx_audit_ts", table_name="audit_log", schema=SCHEMA)
    op.drop_table("audit_log", schema=SCHEMA)
    op.drop_index("idx_budget_usage_period", table_name="budget_usage", schema=SCHEMA)
    op.drop_table("budget_usage", schema=SCHEMA)
    op.drop_table("key_limits", schema=SCHEMA)
    op.drop_index("idx_api_keys_tenant", table_name="api_keys", schema=SCHEMA)
    op.drop_index("idx_api_keys_prefix", table_name="api_keys", schema=SCHEMA)
    op.drop_table("api_keys", schema=SCHEMA)
    op.drop_table("tenant_limits", schema=SCHEMA)
    op.drop_table("tenants", schema=SCHEMA)
    # Enum types go last: the tables above were the only users of them.
    op.execute("DROP TYPE IF EXISTS gateway.budget_period")
    op.execute("DROP TYPE IF EXISTS gateway.tenant_status")
    op.execute("DROP TYPE IF EXISTS gateway.key_status")
--- a/docker-compose.dev.yml
+++ b/docker-compose.dev.yml
@@ -0,0 +1,101 @@
 # neuronetz-gateway — DEV stack (postgres + redis + gateway only).
 #
 # Deliberately differs from the production stack:
 #   * NO caddy   — the gateway is published directly on localhost:8080.
 #   * NO ollama  — Phase 1 expects /readyz to return 503 *because* there is no
 #                  Ollama backend yet. This is the intended exit-criterion state.
 #
 # Bring it up with:
 #   docker compose -f docker-compose.dev.yml up --build
 #
 # Then:
 #   curl -i http://localhost:8080/healthz   # -> 200
 #   curl -i http://localhost:8080/readyz    # -> 503 (no Ollama)
 #
 # The gateway container runs `alembic upgrade head` and then starts the server.
 services:
  gateway:
    build:
      context: .
      dockerfile: Dockerfile
    restart: unless-stopped
    ports:
      - "127.0.0.1:8080:8080"
    environment:
      GATEWAY_BIND_HOST: 0.0.0.0
      GATEWAY_BIND_PORT: "8080"
      GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
      GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console}
      GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
      GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1}
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz}
      DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
      DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
      REDIS_URL: redis://redis:6379/0
      REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
      # No Ollama in the dev stack — point at the (absent) service name so the
      # readiness check fails closed with 503, exactly as Phase 1 expects.
      OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://ollama:11434}
      OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
      OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
      OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
      DEFAULT_RPM: ${DEFAULT_RPM:-60}
      DEFAULT_TPM: ${DEFAULT_TPM:-100000}
      DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
      MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
      MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
      ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
      ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
      ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
      AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
      AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
      PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
      AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    # Run migrations, then start the server.
    command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
      interval: 10s
      timeout: 3s
      retries: 5
      start_period: 30s
  postgres:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-gateway}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway}
      POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
    ports:
      # Exposed on localhost for dev convenience (psql, migrations from host).
      - "127.0.0.1:5432:5432"
    volumes:
      - postgres_dev_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
      interval: 5s
      timeout: 3s
      retries: 10
  redis:
    image: redis:7-alpine
    restart: unless-stopped
    command: ["redis-server", "--save", "", "--appendonly", "no"]
    ports:
      # Exposed on localhost for dev convenience (redis-cli from host).
      - "127.0.0.1:6379:6379"
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 10
 volumes:
  postgres_dev_data:
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -0,0 +1,152 @@
 # neuronetz-gateway — FULL production stack (SPEC §4.1).
 #
 #   Internet ──TLS──▶ caddy ──HTTP/1.1 internal──▶ gateway ──▶ postgres / redis / ollama
 #
 # Only Caddy publishes ports to the host. The gateway is reachable solely through
 # Caddy on the internal network. Postgres, Redis and (critically) Ollama are NOT
 # published to the host at all.
 #
 #  ┌─────────────────────────────────────────────────────────────────────────┐
 #  │ SECURITY NON-NEGOTIABLE:                                                  │
 #  │   The `ollama` service has NO `ports:` mapping and MUST NEVER get one.    │
 #  │   Ollama is reachable only on the internal Docker network via the         │
 #  │   service name `ollama:11434`. Publishing it would re-open the exact      │
 #  │   unauthenticated exposure this whole project exists to close.            │
 #  └─────────────────────────────────────────────────────────────────────────┘
 #
 # Copy `.env.example` to `.env` and adjust before running:
 #   docker compose up -d --build
 services:
  caddy:
    image: caddy:2-alpine
    restart: unless-stopped
    depends_on:
      gateway:
        condition: service_healthy
    ports:
      - "80:80"
      - "443:443"
      - "443:443/udp"   # HTTP/3
    volumes:
      - ./ops/caddy/Caddyfile.example:/etc/caddy/Caddyfile:ro
      - caddy_data:/data
      - caddy_config:/config
    networks:
      - edge
      - internal
  gateway:
    build:
      context: .
      dockerfile: Dockerfile
    restart: unless-stopped
    # NOTE: deliberately NO `ports:` — the gateway is internal-only and is
    # reached exclusively through Caddy.
    expose:
      - "8080"
    environment:
      GATEWAY_BIND_HOST: 0.0.0.0
      GATEWAY_BIND_PORT: "8080"
      GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
      GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-json}
      GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
      GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1,caddy}
      # Service-name addressing on the internal network.
      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-changeme}@postgres:5432/${POSTGRES_DB:-neuronetz}
      DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
      DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
      REDIS_URL: redis://redis:6379/0
      REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
      OLLAMA_BASE_URL: http://ollama:11434
      OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
      OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
      OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
      DEFAULT_RPM: ${DEFAULT_RPM:-60}
      DEFAULT_TPM: ${DEFAULT_TPM:-100000}
      DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
      MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
      MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
      ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
      ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
      ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
      AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
      AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
      PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
      AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      ollama:
        condition: service_started
    # Apply migrations, then start the server.
    command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
    healthcheck:
      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
      interval: 15s
      timeout: 3s
      retries: 5
      start_period: 30s
    networks:
      - internal
  postgres:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-gateway}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-changeme}
      POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
    volumes:
      - postgres_data:/var/lib/postgresql/data
    # No `ports:` — Postgres is internal-only.
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
      interval: 5s
      timeout: 3s
      retries: 10
    networks:
      - internal
  redis:
    image: redis:7-alpine
    restart: unless-stopped
    command: ["redis-server", "--save", "", "--appendonly", "no"]
    # No `ports:` — Redis is internal-only.
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 10
    networks:
      - internal
  # ───────────────────────────────────────────────────────────────────────────
  # Ollama — INTERNAL NETWORK ONLY. DO NOT ADD A `ports:` MAPPING.
  # Reachable only as `http://ollama:11434` from the gateway container.
  # ───────────────────────────────────────────────────────────────────────────
  ollama:
    image: ollama/ollama:latest
    restart: unless-stopped
    # !!! NO `ports:` — never publish Ollama to the host or the internet. !!!
    volumes:
      - ollama_data:/root/.ollama
    networks:
      - internal
 networks:
  # Public-facing network: Caddy is the only service attached to it (in addition to `internal`).
  edge:
    driver: bridge
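  # Private network for inter-service traffic. No service on it publishes host
  # ports; with `internal: false` the containers keep outbound connectivity
  # (set it to `true` to isolate the network externally as well).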
  internal:
    driver: bridge
    internal: false
 volumes:
  postgres_data:
  ollama_data:
  caddy_data:
  caddy_config:
--- a/justfile
+++ b/justfile
@@ -0,0 +1,60 @@
 # neuronetz-gateway — task runner.
 #
 # Requires `just` (https://github.com/casey/just) and `uv`
 # (https://github.com/astral-sh/uv) on the host.
 #
 #   just            # list available targets
 #   just dev        # run postgres + redis + gateway locally (dev stack)
 #   just test       # run the test suite with coverage
 #   just lint       # ruff check
 #   just typecheck  # mypy --strict
 #   just migrate    # apply alembic migrations against DATABASE_URL
 set dotenv-load := true
 # uv runs commands inside the project's managed environment.
 uv := "uv"
 # Show the list of targets (default).
 default:
    @just --list
 # Sync dependencies into the local uv-managed virtualenv (incl. dev extras).
 install:
    {{uv}} sync --extra dev
 # Run the dev stack: postgres + redis + gateway (no caddy, no ollama).
 dev:
    docker compose -f docker-compose.dev.yml up --build
 # Run the test suite with coverage.
 test:
    {{uv}} run pytest
 # Lint with ruff.
 lint:
    {{uv}} run ruff check .
 # Static type checking (strict).
 typecheck:
    {{uv}} run mypy --strict src
 # Apply database migrations to head.
 migrate:
    {{uv}} run alembic upgrade head
 # Security lint.
 bandit:
    {{uv}} run bandit -q -r src
 # Dependency vulnerability audit.
 audit:
    {{uv}} run pip-audit
 # Bring the FULL production stack up (caddy + gateway + postgres + redis + ollama).
 compose-up:
    docker compose up -d --build
 # Tear the production stack down.
 compose-down:
    docker compose down
--- a/ops/caddy/Caddyfile.example
+++ b/ops/caddy/Caddyfile.example
@@ -0,0 +1,59 @@
 # neuronetz-gateway — Caddy reverse proxy (SPEC §4.1, §6.5).
 #
 # Caddy is the only public-facing component. It terminates TLS (HTTP/2 + HTTP/3),
 # obtains a Let's Encrypt certificate for api.neuronetz.ai automatically, applies
 # security headers, and reverse-proxies to the internal-only gateway:8080.
 #
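 # Edit the site address / admin email before deploying. The production
 # docker-compose.yml mounts this file (ops/caddy/Caddyfile.example) directly at
 # /etc/caddy/Caddyfile; if you work from a copy named `Caddyfile`, point the
 # compose volume at that copy instead.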
 {
 	# Email for Let's Encrypt account + expiry notices. Replace before deploy.
 	email ops@neuronetz.ai
 }
 api.neuronetz.ai {
 	# --- Reverse proxy to the internal gateway ---
 	# `gateway` is the Docker service name on the internal network; it is never
 	# published to the host. Caddy forwards plain HTTP/1.1 to it.
 	reverse_proxy gateway:8080
 	# --- Security headers ---
 	header {
 		# HSTS: force HTTPS for two years, include subdomains, allow preload.
 		Strict-Transport-Security "max-age=63072000; includeSubDomains; preload"
 		# Disable MIME sniffing.
 		X-Content-Type-Options "nosniff"
 		# Clickjacking defense (API has no UI, deny framing outright).
 		X-Frame-Options "DENY"
 		# Conservative referrer policy.
 		Referrer-Policy "no-referrer"
 		# Strip server-identifying headers so we don't advertise the stack.
 		-Server
 		-X-Powered-By
 	}
 	# Structured access logs to stdout (collected by the container runtime).
 	log {
 		output stdout
 		format json
 	}
 }
 # ─────────────────────────────────────────────────────────────────────────────
 # DEV / LOCAL note:
 #
 # For local testing without a public domain or real certificate, replace the
 # site block above with a localhost block that uses Caddy's internal self-signed
 # CA (no Let's Encrypt round-trip):
 #
 #   localhost {
 #       tls internal
 #       reverse_proxy gateway:8080
 #   }
 #
 # Caddy will install its local root CA; trust it or pass `-k` to curl. Note the
 # Phase 1 *dev* compose stack (docker-compose.dev.yml) ships WITHOUT Caddy and
 # exposes the gateway directly on localhost:8080 — this file is for the full
 # production stack only.
 # ─────────────────────────────────────────────────────────────────────────────
--- a/ops/systemd/neuronetz-gateway.service
+++ b/ops/systemd/neuronetz-gateway.service
@@ -0,0 +1,58 @@
 # neuronetz-gateway — systemd unit for non-Compose deployments.
 #
 # Assumes the project is installed into a virtualenv at /opt/neuronetz-gateway/venv
 # (e.g. `uv venv /opt/neuronetz-gateway/venv && uv pip install ...`) and that
 # configuration lives in /etc/neuronetz-gateway/gateway.env (same keys as
 # .env.example). Postgres, Redis and Ollama are reached over the network/loopback
 # per that env file — Ollama must remain bound to localhost / a private network
 # and never be published publicly.
 #
 # Install:
 #   sudo cp neuronetz-gateway.service /etc/systemd/system/
 #   sudo systemctl daemon-reload
 #   sudo systemctl enable --now neuronetz-gateway
 [Unit]
 Description=neuronetz-gateway — secure API gateway in front of Ollama
 Documentation=https://github.com/neuronetz/neuronetz-gateway
 After=network-online.target
 Wants=network-online.target
 [Service]
 Type=simple
 # Dedicated unprivileged service account (create with: useradd --system gateway).
 User=gateway
 Group=gateway
 WorkingDirectory=/opt/neuronetz-gateway
 EnvironmentFile=/etc/neuronetz-gateway/gateway.env
 # Apply migrations before starting (idempotent; no-op when already at head).
 ExecStartPre=/opt/neuronetz-gateway/venv/bin/alembic upgrade head
 ExecStart=/opt/neuronetz-gateway/venv/bin/python -m neuronetz_gateway
 Restart=on-failure
 RestartSec=5
 TimeoutStopSec=30
 # --- Hardening ---
 NoNewPrivileges=true
 ProtectSystem=strict
 ProtectHome=true
 PrivateTmp=true
 PrivateDevices=true
 ProtectKernelTunables=true
 ProtectKernelModules=true
 ProtectControlGroups=true
 RestrictNamespaces=true
 RestrictRealtime=true
 RestrictSUIDSGID=true
 LockPersonality=true
 MemoryDenyWriteExecute=true
 RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX
 # Allow writing only where the app legitimately needs to (none by default).
 ReadWritePaths=
 [Install]
 WantedBy=multi-user.target
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,94 @@
 [project]
 name = "neuronetz-gateway"
 version = "0.1.0"
 description = "Secure multi-tenant API gateway in front of Ollama for the Neuronetz platform."
 readme = "README.md"
 license = { text = "Apache-2.0" }
 requires-python = ">=3.12"
 authors = [{ name = "Neuronetz", email = "ops@neuronetz.ai" }]
 dependencies = [
    "fastapi>=0.115",
    "uvicorn[standard]>=0.30",
    "httpx>=0.27",
    "sqlalchemy[asyncio]>=2.0",
    "asyncpg>=0.29",
    "redis[hiredis]>=5.0",
    "structlog>=24.1",
    "pydantic>=2.9",
    "pydantic-settings>=2.4",
    "argon2-cffi>=23.1",
    "typer>=0.12",
    "prometheus-client>=0.20",
    "alembic>=1.13",
 ]
 [project.scripts]
 neuronetz-gateway = "neuronetz_gateway.cli.manage:app"
 [project.optional-dependencies]
 dev = [
    "ruff>=0.6",
    "mypy>=1.11",
    "bandit>=1.7",
    "pip-audit>=2.7",
    "pytest>=8.3",
    "pytest-asyncio>=0.24",
    "pytest-cov>=5.0",
    "testcontainers>=4.8",
    "respx>=0.21",
    "locust>=2.31",
 ]
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 [tool.hatch.build.targets.wheel]
 packages = ["src/neuronetz_gateway"]
 [tool.ruff]
 target-version = "py312"
 line-length = 100
 src = ["src", "tests"]
 [tool.ruff.lint]
 select = ["E", "F", "I", "B", "UP", "S", "ASYNC"]
 [tool.ruff.lint.per-file-ignores]
 # Tests may use assert and bind to all interfaces in fixtures.
 "tests/**" = ["S101", "S104"]
 [tool.mypy]
 python_version = "3.12"
 strict = true
 mypy_path = "src"
 plugins = ["pydantic.mypy"]
 namespace_packages = true
 explicit_package_bases = true
 [[tool.mypy.overrides]]
 # argon2 ships types but some transitive deps may not; keep strictness elsewhere.
 # asyncpg ships no stubs/py.typed marker; it is used in revocation.py only.
 module = ["testcontainers.*", "locust.*", "asyncpg", "asyncpg.*"]
 ignore_missing_imports = true
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
 testpaths = ["tests"]
 pythonpath = ["src"]
 addopts = "--cov=neuronetz_gateway --cov-report=term-missing"
 [tool.coverage.run]
 source = ["src/neuronetz_gateway"]
 branch = true
 omit = [
    "src/neuronetz_gateway/__main__.py",
    "src/neuronetz_gateway/cli/*",
 ]
 [tool.coverage.report]
 # Phase 1: coverage is reported but non-blocking. Later phases set fail_under.
 show_missing = true
 [tool.bandit]
 exclude_dirs = ["tests"]
--- a/scope-docs/AGENT_PROMPT.md
+++ b/scope-docs/AGENT_PROMPT.md
@@ -0,0 +1,121 @@
 # Build Order: neuronetz-gateway v0.1.0
 ## Context
 The Ollama instance at `https://api.neuronetz.ai` is currently exposed without authentication. This is a security incident waiting to happen. Your job is to build the gateway that closes that gap and forms the commercial API surface of the Neuronetz AI platform.
 The full specification is in **`SPEC.md`** in this repository. Read it before writing any code. It is the source of truth; if anything below conflicts with it, SPEC.md wins.
 ## Mission
 Implement `neuronetz-gateway` per SPEC.md to a state that satisfies **§12 Acceptance Criteria**. Nothing less ships.
 ## Non-Negotiables
 These are hard constraints. Violating any of them is a build failure regardless of feature completeness.
 1. **Fail closed, always.** If a security or budgeting check cannot be performed (Redis down, DB unreachable, ambiguous state), deny the request. Never default to allow.
 2. **Ollama never reachable from outside the Docker internal network.** No `ports:` mapping for the ollama service in any compose file shipped with the project. Document this prominently.
 3. **No secrets in code, no secrets in logs, no secrets in errors.** Argon2id for key storage. Constant-time comparison only. Keys printed exactly once at creation.
 4. **No reflected upstream errors.** Ollama errors are sanitized at the gateway boundary. Map to generic 4xx/5xx with a request ID.
 5. **Mutating Ollama endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) are hard-blocked.** Not configurable. Not behind a feature flag. Blocked.
 6. **Streaming integrity.** Token counting and audit writes happen **after** stream close, never on the hot path. Time-to-first-byte must not be degraded by gateway bookkeeping.
 7. **`mypy --strict` and `ruff check` clean before any PR is opened.** No `# type: ignore` without an inline justification comment.
 8. **Test coverage targets (§9) are a gate, not a goal.** 100% on `auth/`, `ratelimit/`, `budget/`. CI fails below threshold.
 9. **Apache 2.0 license file present from commit one.** No GPL dependencies.
 10. **The bootstrap CLI must work before the first manual `curl`.** No "I'll create a key by hand in the DB just to test it" — if the CLI can't create a key, fix the CLI first.
 ## Phasing
 Five phases. Each phase has an explicit exit criterion. **Do not start phase N+1 until phase N's exit criterion is verifiably met.** PM/Control: enforce this.
 ### Phase 1 — Scaffold
 - Repo layout per SPEC §8
 - `pyproject.toml`, `uv.lock`, Dockerfile, docker-compose.yml, docker-compose.dev.yml, .env.example, README, LICENSE
 - Alembic configured; migration `0001_initial.py` creates schema `gateway` and all tables per SPEC §5
 - `make` or `just` targets: `dev`, `test`, `lint`, `typecheck`, `migrate`, `compose-up`, `compose-down`
 - CI workflow runs: ruff, mypy, pytest, bandit, pip-audit
 - **Exit criterion:** `docker compose -f docker-compose.dev.yml up` brings up postgres + redis + a stub gateway that responds 200 on `/healthz` and 503 on `/readyz` (because no Ollama yet). Migrations apply cleanly. CI is green on an empty test suite.
 ### Phase 2 — Core proxy + auth
 - Bootstrap CLI (`create-tenant`, `create-key`, `list-keys`, `revoke-key`) working end-to-end
 - Argon2id hashing module with unit tests covering: hash, verify, constant-time behavior, rehash-on-parameter-change
 - Auth middleware: Bearer extraction, prefix lookup, hash verify, Redis cache with TTL
 - Ollama proxy for `/api/chat` and `/api/generate` — both streamed (NDJSON) and non-streamed
 - Endpoint allowlist enforced
 - **Model discovery (SPEC §4.6):** background poll of Ollama `/api/tags`, cached in Redis + in-process, fail-closed when unavailable
 - Model allowlist enforced per-tenant via the **effective set** (allow_all → all discovered; else `allowed_models ∩ discovered`); key-level `allow_all_models` overrides tenant
 - Error handler: sanitized responses, request ID in every error
 - Audit log writer (buffered, async)
 - Mock Ollama in `tests/integration/mock_ollama.py` (no real model required for CI)
 - **Exit criterion:** A key created via CLI can call `/api/chat` and `/api/generate` through Caddy → gateway → mock Ollama, streaming works, audit rows land in Postgres with correct token counts, `/api/pull` returns 403, no-auth returns 401, wrong-key returns 401. Model discovery populates from the (mock) Ollama `/api/tags`; `/api/tags` returns the tenant's effective set; an `allow_all_models` tenant sees all discovered models, a default-deny tenant sees only `allowed ∩ discovered`, and a non-effective model returns 403; discovery-unavailable fails closed. Integration tests cover all of the above.
 ### Phase 3 — Rate limit + budget + OpenAI-compat
 - Sliding window rate limit (Redis Lua script) — per-key RPM, per-tenant RPM, per-key TPM
 - Concurrency semaphore (Redis-backed) with TTL guard
 - Token budget counters in Redis with Postgres ledger reconciliation on period rollover
 - OpenAI-compatibility layer: `/v1/chat/completions`, `/v1/completions`, `/v1/embeddings`, `/v1/models` with full SSE streaming and `data: [DONE]` terminator
 - Schema translation tests with golden fixtures (request in OpenAI → expected Ollama request; response from Ollama → expected OpenAI response)
 - Rate-limit and budget response headers per SPEC §6.5
 - **Exit criterion:** Locust test (100 concurrent users, 5 min) shows correct 429 behavior at the limit, correct token accounting, p99 gateway overhead < 25 ms. OpenAI Python SDK pointed at `/v1` successfully completes streaming chat. Killing Redis mid-test produces 503 (fail closed), not 200.
 ### Phase 4 — Audit, prompt log, revocation
 - Prompt log (opt-in per key, TTL) with daily sweeper task
 - Audit log retention sweeper (TTL per tenant config)
 - Buffered audit writer with ring-buffer overflow → deny-mode behavior
 - Revocation flow: console (simulated via direct INSERT in tests) writes `gateway.revocations` → NOTIFY → gateway evicts Redis cache → next request with revoked key returns 401 within 1 second
 - Prometheus `/metrics` (loopback only) with: `gateway_requests_total{tenant,model,status}`, `gateway_tokens_total{tenant,model,direction}`, `gateway_request_duration_seconds{tenant,model}` (histogram)
 - `/readyz` checks DB + Redis + Ollama all reachable
 - Circuit breaker on Ollama failures
 - **Exit criterion:** Revocation E2E test green. Prompt log retention TTL works (use freeze-time to simulate). Metrics scrape returns valid Prometheus exposition. `/readyz` flips to 503 when any dependency is down.
 ### Phase 5 — Harden, document, release
 - `docs/ARCHITECTURE.md`, `docs/DEPLOYMENT.md`, `docs/API.md`, `docs/THREAT_MODEL.md`, `docs/OPERATIONS.md` complete
 - Caddyfile example with Let's Encrypt for `api.neuronetz.ai` and security headers (HSTS, X-Content-Type-Options, no Server header, no X-Powered-By)
 - Systemd unit file for non-Compose deployments
 - Multi-stage Dockerfile with non-root user, distroless or `python:3.12-slim` final stage, no build tools in final image
 - `pip-audit` and `bandit` clean in CI
 - Image scan (Trivy or Grype) clean of HIGH/CRITICAL
 - Tag `v0.1.0`, build and push image, GitHub release with changelog
 - **Exit criterion:** Every box in SPEC §12 checked, signed off by Control. Image runnable from a fresh host with only docker + a `.env`. README quickstart works for someone who has never seen the repo.
 ## Agent Role Assignments
 For the multi-agent orchestrator (Fritz/UI-UX/DevOps/QA/Control/Timo/PM):
 | Agent | Owns |
 |---|---|
 | **Backend / Fritz** | All Python code under `src/neuronetz_gateway/`, Alembic migrations, CLI. Primary author. |
 | **DevOps** | Dockerfile, docker-compose.yml(s), Caddyfile, systemd unit, CI workflows, image scanning, release tagging. |
 | **QA** | All tests under `tests/`. Owns coverage gate. Writes the locust scenarios. Verifies acceptance criteria at each phase exit. |
 | **UI-UX** | Not active on this project (no UI surface here). The console project will pick this up. |
 | **Control / Timo** | Enforces phase gates. Refuses to advance a phase whose exit criterion isn't met. Runs the acceptance checklist at end of Phase 5. |
 | **PM** | Tracks the phase progression, opens YouTrack tickets per phase, runs daily standups against this prompt, surfaces blockers. |
 ## Working Agreements
 - **Branch per phase.** `phase-1-scaffold`, `phase-2-proxy-auth`, etc. Merge to `main` only after phase exit criterion is verified.
 - **PRs are reviewed against SPEC.md.** "Does this match the spec? If not, is SPEC.md wrong or is the PR wrong?" — that's the review question.
 - **SPEC changes are explicit.** If a phase reveals a spec mistake, amend SPEC.md in a separate PR before changing the implementation. Never drift silently.
 - **Commit messages reference the section.** e.g. `auth: implement argon2id verify per SPEC §5, §9`.
 - **No TODOs in main.** If something is deferred, it becomes a tracked issue, not a code comment.
 - **Open questions (SPEC §13) are resolved in writing.** Decision goes in SPEC.md, not in a Slack message that gets lost.
 ## What "Done" Looks Like
 A fresh clone, a fresh host, a domain pointing at it, and a `.env` file. `docker compose up`. Five minutes later, `curl -H "Authorization: Bearer nz_..." https://api.neuronetz.ai/v1/chat/completions -d '...'` streams a response. The Ollama port is not open. The audit log has a row. The budget counter decremented. The metrics endpoint shows the request. The locust suite passes. The threat model document explains every defense.
 When all of that is true and SPEC §12 is fully ticked, ship v0.1.0.
 ## When You Get Stuck
 - **Ambiguity in the spec → ask, don't guess.** Open a question in the PM channel; if resolved, amend SPEC.md.
 - **Conflict between speed and correctness → correctness wins.** This is security infrastructure. We do not ship "good enough."
 - **Conflict between scope creep and v0.1.0 → defer.** New ideas go in a follow-up issue. v0.1.0 ships per spec.
 Start with Phase 1. Read SPEC.md first.
--- a/scope-docs/SPEC.md
+++ b/scope-docs/SPEC.md
@@ -0,0 +1,593 @@
 # neuronetz-gateway — SPEC.md
 **Project:** `neuronetz-gateway`
 **Version:** 0.1.0 (target)
 **Status:** Specification — not yet implemented
 **License:** Apache 2.0
 **Owner:** Stephan Berbig / Neuronetz
 ---
 ## 1. Purpose
 A secure, multi-tenant API gateway in front of an Ollama instance currently exposed at `https://api.neuronetz.ai`. The Ollama endpoint must never be reachable directly from the public internet again. All access flows through this gateway.
 The gateway is the **hot path** of the Neuronetz API. A separate service (`neuronetz-console`, built on the Nibiru PHP framework) handles administration, dashboards, and tenant self-service. This SPEC covers only the gateway.
 ## 2. Scope
 ### In scope (v0.1.0)
 - Authentication via API keys (Bearer tokens)
 - Multi-tenant data model (tenants → keys, with inheritance)
 - Per-key and per-tenant rate limiting (RPM, TPM, concurrent)
 - Per-key and per-tenant token budgets (daily, monthly, total)
 - Streaming and non-streaming proxy to Ollama
 - Dual API surface: native Ollama (`/api/*`) and OpenAI-compatible (`/v1/*`)
 - Endpoint allowlist (block all model-mutating Ollama endpoints)
 - **Dynamic model discovery** from the Ollama backend — the live set of installed models is queried, cached, and auto-refreshed; nothing about the model list is hand-maintained
 - Model allowlist (per-tenant override), **default-deny, resolved against the live discovered set** (stale/typo'd entries never resolve)
 - **Per-tenant `allow_all_models` toggle** — opt-in: a flagged tenant may use any currently-installed model, so models newly pulled into Ollama are auto-granted on the next discovery refresh
 - Request size limits, response size limits, timeouts
 - Token counting from Ollama responses (precise, not heuristic)
 - Audit log (always-on metadata)
 - Prompt log (opt-in per key, TTL'd retention)
 - Bootstrap CLI: create tenants, keys, set budgets
 - Health and readiness endpoints
 - Docker Compose deployment (gateway + caddy + postgres + redis + ollama)
 - Caddy as TLS terminator (Let's Encrypt for `api.neuronetz.ai`)
 ### Out of scope (v0.1.0, document as future)
 - Web admin UI (lives in `neuronetz-console`, separate repo)
 - Billing / Stripe integration (budgets only, no money yet)
 - Multi-region / HA / k8s
 - Content moderation / prompt-injection filtering
 - Response caching
 - Multi-backend routing (one Ollama; pluggable backend interface stays for later)
 - Webhook notifications
 - SSO / OAuth2 for admin
 ## 3. Threat Model (abbreviated)
 | Threat | Mitigation |
 |---|---|
 | Internet scanners hitting Ollama directly | Ollama bound to internal Docker network; never published |
 | Unauthenticated API abuse | Mandatory Bearer token; fail-closed on auth errors |
 | API key brute force | Argon2id hashing; constant-time compare; rate limit on auth failures per source IP |
 | GPU/token exhaustion (cost attack) | Per-key TPM + token budget; per-tenant ceiling; concurrent connection cap |
 | Resource exhaustion via large payloads | Request body size limit (default 256 KiB); `num_predict` cap (default 4096) |
 | Model enumeration / training-data exfil via uncommon models | Model allowlist; default-deny. `allow_all_models` is **opt-in per tenant and audited**. Discovery only ever exposes models actually installed on the backend; `/api/tags` and `/v1/models` never reveal models outside the tenant's effective set; "not allowed" and "doesn't exist" return the same generic response |
 | Discovery backend unreachable | Fail-closed: an empty/stale-expired discovered set means no model resolves, so requests are denied — never "allow because we couldn't list models" |
 | Ollama mutation (model pull/delete) by attacker | Endpoint allowlist; mutating endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) hard-blocked at the gateway |
 | Information disclosure via error messages | Sanitize upstream errors; never proxy Ollama internals to client |
 | Audit log tampering | Append-only at app layer; DB role separation; optional WAL archiving |
 | Prompt data leakage | Prompt logging off by default; opt-in per key; TTL'd; redaction hook |
 | Redis outage causing "fail open" | Fail-closed: if rate-limit/budget backend is unavailable, deny |
 | Compromised admin token | Admin token lives in `neuronetz-console`, not in gateway; gateway has no admin endpoints |
 ## 4. Architecture
 ### 4.1 Component diagram
 ```
                          Internet
                              │ TLS
                              ▼
                  ┌──────────────────────┐
                  │ Caddy (sidecar)      │  Let's Encrypt for api.neuronetz.ai
                  │ - TLS termination    │  HSTS, security headers
                  │ - HTTP/2, HTTP/3     │
                  └──────────┬───────────┘
                             │ HTTP/1.1 internal
                  ┌──────────▼───────────┐
                  │ neuronetz-gateway    │  FastAPI + uvicorn
                  │  - authn             │
                  │  - rate limit        │
                  │  - budget check      │
                  │  - proxy + stream    │
                  │  - token count       │
                  │  - audit write       │
                  └──┬────────┬──────┬───┘
                     │        │      │
              ┌──────▼──┐  ┌──▼───┐  │
              │Postgres │  │Redis │  │
              │ schema: │  │ keys │  │
              │ gateway │  │bucket│  │
              └─────────┘  └──────┘  │
                                     │ internal network only
                              ┌──────▼──────┐
                              │   Ollama    │
                              │ 127.0.0.1   │
                              └─────────────┘
 Same Compose stack also hosts (separate from this SPEC):
  - neuronetz-console (PHP/Nibiru) → reads schema `console`, reads schema `gateway` (SELECT)
 ```
 ### 4.2 Database schemas
 **Single Postgres instance, two schemas:**
 - `gateway` — owned by the gateway service; gateway role has full DDL
 - `console` — owned by `neuronetz-console` (out of scope here); console role has full DDL
 - Both services connect with their own role. Cross-schema access is explicit GRANT.
 **Console role gets `SELECT` on all `gateway.*` tables.** Console writes go only to `console.*` tables. If the console needs to mutate gateway state (e.g. revoke a key), it does so by writing to a `gateway.revocations` outbox table that the gateway tails (see §4.5).
 ### 4.3 Request lifecycle
 1. Caddy terminates TLS, forwards to gateway on internal port.
 2. Gateway middleware extracts `Authorization: Bearer <key>`.
 3. The key prefix (first 12 chars) is used as the Redis cache key. On a miss, look up `gateway.api_keys` by prefix, verify the full key with argon2id `verify`, and cache the resolved key metadata in Redis (TTL `REDIS_KEY_CACHE_TTL_S`, default 60s).
 4. Rate limit check (sliding window in Redis, Lua-atomic) — per-key RPM + per-tenant RPM.
 5. Budget check (Redis counter for current period; Postgres ledger is source of truth on reset).
 6. Concurrent-connection semaphore (Redis `INCR` with TTL).
 7. Model allowlist check. Resolve the **effective model set** for the key:
   `allow_all := key.allow_all_models ?? tenant.allow_all_models`;
   `effective := discovered` if `allow_all` else `(key.allowed_models ?? tenant.allowed_models) ∩ discovered`,
   where `discovered` is the cached live model set from discovery (§4.6). The request's
   `model` must be in `effective`, else a generic 403 with no disclosure of whether the
   model exists but is unpermitted vs. is not installed.
 8. Endpoint allowlist check.
 9. Request body validation (size, schema, `num_predict` cap).
 10. If OpenAI-compat path, translate request to Ollama schema.
 11. Open httpx async stream to Ollama.
 12. Stream response back to client, accumulating final `prompt_eval_count` + `eval_count`.
 13. On stream close: write `gateway.audit_log` row; decrement budget; release semaphore; if prompt logging enabled, write `gateway.prompt_log` row.
 14. On any failure: sanitized error to client, audit row with status code, semaphore released.
 ### 4.4 Failure modes (fail-closed)
 | Subsystem | If down | Behavior |
 |---|---|---|
 | Postgres (read) | Key lookup fails | 503 with retry-after; no requests proxied |
 | Postgres (write) | Audit write fails | Request still succeeds; the audit row is buffered in an in-memory ring buffer (max `AUDIT_BUFFER_SIZE`, default 1000) and drained on recovery; if the buffer fills, switch to deny mode |
 | Redis | Rate limit / budget unavailable | 503 — fail closed. Never "allow because we can't check." |
 | Ollama | Upstream unreachable | 502 with retry-after; circuit breaker opens after 5 consecutive failures, half-open after 30s |
 | Caddy | Not a gateway concern | — |
 ### 4.5 Cache invalidation (key revocation)
 Console can revoke a key by inserting into `gateway.revocations(key_id, ts, reason)`. Gateway has a background task (`asyncio.create_task` in lifespan) that:
 - LISTENs on Postgres channel `key_revoked` (gateway emits NOTIFY on its own write path; console emits via INSERT trigger)
 - On notification, evicts the Redis cache entry for that key's prefix
 - This makes revocation effectively immediate (≤ Redis RTT) without cross-service HTTP
 ### 4.6 Model discovery
 The set of usable models is **never hand-maintained**; it is extracted live from the
 Ollama backend.
 - A background task (started in lifespan, like the revocation listener) polls Ollama
  `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds.
 - The parsed model set (names + sanitized metadata: family, parameter size, quantization,
  size bytes, modified-at) is cached in Redis under `gateway:models:discovered` with TTL
  `MODEL_DISCOVERY_CACHE_TTL_S`, and held in-process for hot reads on the request path.
 - On startup an initial fetch runs; if Ollama is unreachable the discovered set is empty.
 - **Fail-closed:** if the discovered set is empty or its cache has expired and cannot be
  refreshed, no model resolves and requests are denied (consistent with default-deny).
  Discovery never opens access on failure.
 - "Auto-grant": because the effective set (§4.3 step 7) intersects with `discovered` (or
  *is* `discovered` when `allow_all_models`), a model pulled into Ollama out-of-band
  becomes usable to `allow_all` tenants on the next refresh — no per-tenant config change.
 - Discovery is **read-only** against Ollama and uses only the allowlisted `/api/tags`
  endpoint; it never triggers a model pull.
 ## 5. Data Model (schema `gateway`)
 ```sql
 CREATE SCHEMA gateway;
 CREATE TYPE gateway.key_status AS ENUM ('active', 'disabled', 'revoked');
 CREATE TYPE gateway.tenant_status AS ENUM ('active', 'suspended', 'closed');
 CREATE TYPE gateway.budget_period AS ENUM ('day', 'month', 'total');
 CREATE TABLE gateway.tenants (
    id              uuid PRIMARY KEY DEFAULT gen_random_uuid(),
    name            text NOT NULL UNIQUE,
    status          gateway.tenant_status NOT NULL DEFAULT 'active',
    created_at      timestamptz NOT NULL DEFAULT now(),
    metadata        jsonb NOT NULL DEFAULT '{}'::jsonb
 );
 CREATE TABLE gateway.tenant_limits (
    tenant_id           uuid PRIMARY KEY REFERENCES gateway.tenants(id) ON DELETE CASCADE,
    rpm                 integer NOT NULL DEFAULT 60,
    tpm                 integer NOT NULL DEFAULT 100000,
    concurrent          integer NOT NULL DEFAULT 8,
    tokens_daily        bigint,
    tokens_monthly      bigint,
    tokens_total        bigint,
    allowed_models      text[] NOT NULL DEFAULT '{}',
    allow_all_models    boolean NOT NULL DEFAULT false,  -- opt-in: allow any installed model
    log_prompts_default boolean NOT NULL DEFAULT false,
    prompt_retention_days integer NOT NULL DEFAULT 30,
    audit_retention_days  integer NOT NULL DEFAULT 365
 );
 CREATE TABLE gateway.api_keys (
    id              uuid PRIMARY KEY DEFAULT gen_random_uuid(),
    tenant_id       uuid NOT NULL REFERENCES gateway.tenants(id) ON DELETE CASCADE,
    prefix          text NOT NULL UNIQUE,          -- first 12 chars, indexed
    key_hash        text NOT NULL,                  -- argon2id
    name            text NOT NULL,
    status          gateway.key_status NOT NULL DEFAULT 'active',
    scopes          text[] NOT NULL DEFAULT '{chat,embeddings}',
    created_at      timestamptz NOT NULL DEFAULT now(),
    last_used_at    timestamptz,
    expires_at      timestamptz,
    log_prompts     boolean,                        -- NULL = inherit from tenant
    metadata        jsonb NOT NULL DEFAULT '{}'::jsonb
 );
 CREATE INDEX idx_api_keys_prefix ON gateway.api_keys(prefix) WHERE status = 'active';
 CREATE INDEX idx_api_keys_tenant ON gateway.api_keys(tenant_id);
 CREATE TABLE gateway.key_limits (
    key_id              uuid PRIMARY KEY REFERENCES gateway.api_keys(id) ON DELETE CASCADE,
    rpm                 integer,            -- NULL = inherit tenant
    tpm                 integer,
    concurrent          integer,
    tokens_daily        bigint,
    tokens_monthly      bigint,
    tokens_total        bigint,
    allowed_models      text[],             -- NULL = inherit tenant
    allow_all_models    boolean             -- NULL = inherit tenant
 );
 CREATE TABLE gateway.budget_usage (
    key_id          uuid NOT NULL REFERENCES gateway.api_keys(id) ON DELETE CASCADE,
    period          gateway.budget_period NOT NULL,
    period_start    timestamptz NOT NULL,
    tokens_in       bigint NOT NULL DEFAULT 0,
    tokens_out      bigint NOT NULL DEFAULT 0,
    requests        bigint NOT NULL DEFAULT 0,
    PRIMARY KEY (key_id, period, period_start)
 );
 CREATE INDEX idx_budget_usage_period ON gateway.budget_usage(period, period_start);
 CREATE TABLE gateway.audit_log (
    id              bigserial PRIMARY KEY,
    ts              timestamptz NOT NULL DEFAULT now(),
    request_id      uuid NOT NULL,
    tenant_id       uuid,                          -- nullable for auth-failed rows
    key_id          uuid,
    key_prefix      text,                          -- denormalized for forensic queries
    method          text NOT NULL,
    path            text NOT NULL,
    model           text,
    tokens_in       integer,
    tokens_out      integer,
    latency_ms      integer,
    status          integer NOT NULL,
    client_ip       inet,
    user_agent      text,
    error_code      text
 );
 CREATE INDEX idx_audit_ts ON gateway.audit_log(ts);
 CREATE INDEX idx_audit_tenant_ts ON gateway.audit_log(tenant_id, ts);
 CREATE INDEX idx_audit_key_ts ON gateway.audit_log(key_id, ts);
 CREATE TABLE gateway.prompt_log (
    id              bigserial PRIMARY KEY,
    audit_id        bigint NOT NULL REFERENCES gateway.audit_log(id) ON DELETE CASCADE,
    ts              timestamptz NOT NULL DEFAULT now(),
    key_id          uuid NOT NULL,
    request_body    jsonb NOT NULL,
    response_text   text,
    retention_until timestamptz NOT NULL
 );
 CREATE INDEX idx_prompt_log_retention ON gateway.prompt_log(retention_until);
 CREATE TABLE gateway.revocations (
    id              bigserial PRIMARY KEY,
    key_id          uuid NOT NULL,
    ts              timestamptz NOT NULL DEFAULT now(),
    reason          text,
    processed_at    timestamptz
 );
 -- Trigger to NOTIFY on revocation insert
 CREATE OR REPLACE FUNCTION gateway.notify_key_revoked() RETURNS trigger AS $$
 BEGIN
    PERFORM pg_notify('key_revoked', NEW.key_id::text);
    RETURN NEW;
 END;
 $$ LANGUAGE plpgsql;
 CREATE TRIGGER trg_notify_key_revoked
    AFTER INSERT ON gateway.revocations
    FOR EACH ROW EXECUTE FUNCTION gateway.notify_key_revoked();
 -- Grants for console role (created in console SPEC, referenced here)
 -- GRANT USAGE ON SCHEMA gateway TO console_role;
 -- GRANT SELECT ON ALL TABLES IN SCHEMA gateway TO console_role;
 -- GRANT INSERT ON gateway.revocations TO console_role;
 ```
 ## 6. API Surface
 ### 6.1 Native Ollama passthrough (allowlisted)
 | Path | Method | Notes |
 |---|---|---|
 | `/api/chat` | POST | Streamed (NDJSON) and non-streamed |
 | `/api/generate` | POST | Streamed (NDJSON) and non-streamed |
 | `/api/embeddings` | POST | Non-streamed |
 | `/api/embed` | POST | Newer Ollama embeddings endpoint |
 | `/api/tags` | GET | Returns the tenant's **effective** model set (live-discovered ∩ allowed, or *all* discovered when `allow_all_models`). Sourced from discovery (§4.6), never a static list |
 | `/api/show` | POST | Allowed only for models in the tenant's effective set; returns sanitized model info (no system prompts, no template) |
 | `/api/ps` | GET | **Blocked** — leaks loaded models |
 | `/api/version` | GET | Returns gateway version, not Ollama version |
 ### 6.2 Hard-blocked Ollama endpoints (always 403)
 `/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`
 ### 6.3 OpenAI-compatible
 | Path | Method | Maps to |
 |---|---|---|
 | `/v1/chat/completions` | POST | `/api/chat` |
 | `/v1/completions` | POST | `/api/generate` |
 | `/v1/embeddings` | POST | `/api/embed` |
 | `/v1/models` | GET | `/api/tags` (the tenant's effective discovered set), in OpenAI model-list format |
 Translation must preserve streaming. SSE (`data: {...}\n\n`) for OpenAI-compat; NDJSON for native.
 ### 6.4 Gateway endpoints
 | Path | Method | Auth | Purpose |
 |---|---|---|---|
 | `/healthz` | GET | none | Liveness — process responsive |
 | `/readyz` | GET | none | Readiness — DB + Redis + Ollama all reachable |
 | `/metrics` | GET | none (loopback only) | Prometheus exposition (counters, histograms) |
 No admin endpoints. Admin lives in `neuronetz-console`.
 ### 6.5 Response headers
 Every proxied response carries:
 - `X-Request-ID: <uuid>`
 - `X-RateLimit-Limit-Requests: <n>`
 - `X-RateLimit-Remaining-Requests: <n>`
 - `X-RateLimit-Limit-Tokens: <n>`
 - `X-RateLimit-Remaining-Tokens: <n>`
 - `X-Budget-Period: day|month|total`
 - `X-Budget-Tokens-Remaining: <n>`
 429 responses additionally carry `Retry-After: <seconds>`.
 ## 7. Configuration
 All via environment variables, validated by Pydantic Settings on boot. Boot fails loudly on invalid config.
 ```
 # Service
 GATEWAY_BIND_HOST=0.0.0.0
 GATEWAY_BIND_PORT=8080
 GATEWAY_LOG_LEVEL=INFO
 GATEWAY_LOG_FORMAT=json                  # json|console
 GATEWAY_REQUEST_ID_HEADER=X-Request-ID
 GATEWAY_TRUSTED_PROXIES=127.0.0.1,caddy  # for X-Forwarded-For
 # Upstream
 OLLAMA_BASE_URL=http://ollama:11434
 OLLAMA_CONNECT_TIMEOUT_S=5
 OLLAMA_READ_TIMEOUT_S=600
 OLLAMA_MAX_CONNECTIONS=64
 # Model discovery (§4.6)
 MODEL_DISCOVERY_REFRESH_S=60             # how often to re-query Ollama /api/tags
 MODEL_DISCOVERY_CACHE_TTL_S=120          # Redis cache TTL for the discovered model set
 # Database
 DATABASE_URL=postgresql+asyncpg://gateway:...@postgres:5432/neuronetz
 DATABASE_POOL_SIZE=10
 DATABASE_POOL_OVERFLOW=20
 # Redis
 REDIS_URL=redis://redis:6379/0
 REDIS_KEY_CACHE_TTL_S=60
 # Limits (defaults; per-tenant/key overrides in DB)
 DEFAULT_RPM=60
 DEFAULT_TPM=100000
 DEFAULT_CONCURRENT=8
 MAX_REQUEST_BODY_BYTES=262144
 MAX_NUM_PREDICT=4096
 # Security
 ARGON2_TIME_COST=3
 ARGON2_MEMORY_COST_KIB=65536
 ARGON2_PARALLELISM=4
 AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN=20
 # Audit
 AUDIT_BUFFER_SIZE=1000
 PROMPT_LOG_DEFAULT_RETENTION_DAYS=30
 AUDIT_LOG_DEFAULT_RETENTION_DAYS=365
 ```
 ## 8. Repository Layout
 ```
 neuronetz-gateway/
 ├── pyproject.toml                # uv-managed, ruff, mypy --strict, pytest
 ├── README.md
 ├── LICENSE                       # Apache 2.0
 ├── docker-compose.yml            # full stack incl. console placeholder
 ├── docker-compose.dev.yml        # without caddy, gateway exposed on localhost
 ├── Dockerfile                    # multi-stage, python:3.12-slim base
 ├── .env.example
 ├── .dockerignore
 ├── .gitignore
 ├── alembic.ini
 ├── alembic/
 │   ├── env.py
 │   └── versions/
 │       └── 0001_initial.py       # creates schema `gateway` and all tables
 ├── ops/
 │   ├── caddy/
 │   │   └── Caddyfile.example
 │   └── systemd/
 │       └── neuronetz-gateway.service
 ├── src/neuronetz_gateway/
 │   ├── __init__.py
 │   ├── __main__.py               # uvicorn entry
 │   ├── app.py                    # FastAPI factory
 │   ├── config.py                 # Pydantic Settings
 │   ├── deps.py                   # DI providers
 │   ├── lifespan.py               # startup/shutdown, NOTIFY listener
 │   ├── errors.py                 # exception types, handlers, sanitization
 │   ├── auth/
 │   │   ├── __init__.py
 │   │   ├── hashing.py            # argon2id wrapper
 │   │   ├── keys.py               # key generation, prefix, verify
 │   │   └── middleware.py
 │   ├── ratelimit/
 │   │   ├── __init__.py
 │   │   ├── sliding_window.py     # Redis Lua script
 │   │   └── concurrency.py        # semaphore via Redis
 │   ├── budget/
 │   │   ├── __init__.py
 │   │   ├── counter.py            # Redis period counters
 │   │   └── ledger.py             # Postgres reconciliation
 │   ├── proxy/
 │   │   ├── __init__.py
 │   │   ├── ollama.py             # httpx streaming client
 │   │   ├── translate.py          # OpenAI <-> Ollama schemas
 │   │   ├── token_counter.py      # parse usage from stream
 │   │   ├── discovery.py          # live model discovery from Ollama /api/tags (§4.6)
 │   │   └── allowlist.py          # effective-set resolution (allow_all / allowed ∩ discovered)
 │   ├── routes/
 │   │   ├── __init__.py
 │   │   ├── ollama_native.py
 │   │   ├── openai_compat.py
 │   │   └── health.py
 │   ├── db/
 │   │   ├── __init__.py
 │   │   ├── session.py
 │   │   ├── models.py             # SQLAlchemy 2.0
 │   │   └── repositories.py
 │   ├── audit/
 │   │   ├── __init__.py
 │   │   ├── writer.py             # buffered async writer
 │   │   └── prompt_log.py
 │   ├── observability/
 │   │   ├── __init__.py
 │   │   ├── logging.py            # structlog config
 │   │   └── metrics.py            # prometheus
 │   └── cli/
 │       ├── __init__.py
 │       └── manage.py             # typer: create-tenant, create-key, ...
 ├── tests/
 │   ├── conftest.py               # testcontainers fixtures
 │   ├── unit/
 │   │   ├── test_hashing.py
 │   │   ├── test_translate.py
 │   │   ├── test_token_counter.py
 │   │   ├── test_discovery.py
 │   │   ├── test_allowlist.py
 │   │   └── test_sliding_window.py
 │   ├── integration/
 │   │   ├── test_auth_flow.py
 │   │   ├── test_rate_limit.py
 │   │   ├── test_budget.py
 │   │   ├── test_proxy_stream.py
 │   │   ├── test_openai_compat.py
 │   │   ├── test_revocation.py
 │   │   └── mock_ollama.py        # FastAPI mock with NDJSON/SSE
 │   └── load/
 │       └── locustfile.py
 └── docs/
    ├── ARCHITECTURE.md
    ├── DEPLOYMENT.md
    ├── API.md
    ├── THREAT_MODEL.md
    └── OPERATIONS.md              # runbook: revoke key, rotate, check usage
 ```
 ## 9. Non-Functional Requirements
 - **Performance:** p50 overhead < 5 ms over direct Ollama call (auth + ratelimit + audit); p99 < 25 ms (excluding upstream latency)
 - **Streaming:** Time-to-first-byte must not be degraded by gateway logic — audit write happens **after** stream close
 - **Memory:** Steady-state RSS < 200 MiB per gateway worker under 100 concurrent streams
 - **Concurrency:** Handle 200 concurrent connections per worker; 4 workers per instance default
 - **Test coverage:** ≥ 85% line coverage on `src/neuronetz_gateway/` excluding `__main__` and CLI; 100% on `auth/`, `ratelimit/`, `budget/`
 - **Security:** No `eval`, no `exec`, no shell-out, no `pickle`. Bandit clean. `pip-audit` clean on every CI run.
 - **Type safety:** `mypy --strict` clean
 - **Lint:** `ruff check` clean with project ruleset (E, F, I, B, UP, S, ASYNC)
 ## 10. Tooling
 - Python 3.12
 - `uv` for dependency management (pyproject.toml + uv.lock)
 - FastAPI ≥ 0.115, uvicorn[standard], httpx ≥ 0.27, SQLAlchemy 2.0 (async), asyncpg, redis ≥ 5.0 (with hiredis), structlog, pydantic ≥ 2.9, pydantic-settings, argon2-cffi, typer, prometheus-client
 - Test: pytest, pytest-asyncio, pytest-cov, testcontainers, httpx (test client), respx (mock), locust
 - Lint/format: ruff, mypy --strict, bandit, pip-audit
 - CI: GitHub Actions workflow (lint, type, test with coverage, build image, push on tag)
 ## 11. Bootstrap CLI (Typer)
 ```
 neuronetz-gateway create-tenant --name "acme" [--rpm 60] [--tpm 100000]
 neuronetz-gateway create-key --tenant acme --name "prod-server-1" [--scopes chat,embeddings]
 neuronetz-gateway revoke-key --prefix nz_abc12345
 neuronetz-gateway list-keys --tenant acme
 neuronetz-gateway show-usage --tenant acme [--period day|month|total]
 neuronetz-gateway set-budget --key nz_abc12345 --daily 1000000 --monthly 30000000
 neuronetz-gateway set-models --tenant acme --models llama3.1:8b,mistral:7b
 neuronetz-gateway set-models --tenant acme --allow-all          # opt into allow_all_models
 neuronetz-gateway set-models --tenant acme --no-allow-all       # back to explicit allowlist
 neuronetz-gateway list-models [--tenant acme]                   # show live-discovered models
                                                                # (and the tenant's effective set)
 ```
 `create-tenant` accepts `--allow-all-models / --no-allow-all-models` (default off).
 `list-models` reads the discovery cache (§4.6); with `--tenant` it also shows that tenant's
 resolved effective set.
 Key format: `nz_<12-char-prefix><32-char-random>`. Prefix is stored; full key is hashed (argon2id). On creation, the full key is printed exactly once.
 ## 12. Acceptance Criteria
 The build is "done" when every box below is checked. The orchestrator must verify each before declaring v0.1.0.
 - [ ] `docker compose up` from a clean checkout produces a running stack with TLS via Caddy (self-signed in dev, Let's Encrypt-ready in prod).
 - [ ] CLI creates tenant and key; printed key successfully authenticates an `/api/chat` call.
 - [ ] Unauthenticated request returns 401 with no Ollama details leaked.
 - [ ] Request to `/api/pull` returns 403 with generic error message.
 - [ ] Streaming `/api/chat` works end-to-end; the first byte arrives no later than Ollama's own TTFB plus 10 ms of gateway overhead.
 - [ ] Streaming `/v1/chat/completions` returns valid SSE with `data: [DONE]` terminator.
 - [ ] Token counts in the audit log match Ollama's reported values exactly: `tokens_in` equals `prompt_eval_count` and `tokens_out` equals `eval_count`.
 - [ ] `/api/tags` and `/v1/models` reflect the **live** Ollama model set (discovery, §4.6): an `allow_all_models` tenant sees every installed model and a newly-pulled model appears within one refresh interval; a default-deny tenant sees only `allowed_models ∩ discovered`; a request for a model outside the effective set returns a generic 403; with discovery unavailable, requests fail closed (deny), not open.
 - [ ] Rate limit triggers at configured RPM with `Retry-After` header.
 - [ ] Token budget is enforced: requests are blocked at zero remaining, with a descriptive error.
 - [ ] Redis outage causes 503 (fail-closed), not 200.
 - [ ] Revocation via `INSERT INTO gateway.revocations` evicts the key from the Redis cache within 1 second.
 - [ ] `mypy --strict`, `ruff check`, `bandit`, `pip-audit` all clean in CI.
 - [ ] Test coverage ≥ 85% overall, 100% in `auth/`, `ratelimit/`, `budget/`.
 - [ ] `docs/THREAT_MODEL.md`, `docs/DEPLOYMENT.md`, `docs/OPERATIONS.md` present and accurate.
 - [ ] Load test (locust): 100 concurrent users sustained 5 minutes, p99 gateway overhead < 25 ms, zero 5xx outside induced failures.
 ## 13. Open Questions (decide during build)
 1. Embedding cost accounting — Ollama doesn't return `eval_count` for embeddings. Decision: charge based on `prompt_eval_count` only; document as such.
 2. SSE vs NDJSON heuristic for OpenAI-compat — always SSE per OpenAI spec. NDJSON only on native `/api/*`.
 3. Prometheus cardinality — do not label by `key_id` (too many series); label by `tenant_id` only; per-key data lives in Postgres.
 4. **Model discovery source** — the live model list is `GET /api/tags` on the Ollama backend; there is no separate registry. Cached in Redis + in-process, refreshed every `MODEL_DISCOVERY_REFRESH_S`.
 5. **Discovery failure is fail-closed** — empty/expired discovered set ⇒ no model resolves ⇒ deny. Discovery never opens access on error.
 6. **No existence disclosure** — a model that is installed-but-unpermitted and a model that is not installed both return the same generic response, to prevent enumeration.
 7. **`allow_all_models` precedence** — key-level `allow_all_models` (when non-NULL) overrides the tenant flag; otherwise the tenant flag applies. Same NULL-inherits-tenant rule as the other key limits.
 ## 14. References
 - Ollama API: https://github.com/ollama/ollama/blob/main/docs/api.md
 - OpenAI Chat Completions: https://platform.openai.com/docs/api-reference/chat
 - Nibiru (sibling console project): https://nibiru-framework.com
 - Argon2 RFC 9106
--- a/src/neuronetz_gateway/__init__.py
+++ b/src/neuronetz_gateway/__init__.py
@@ -0,0 +1,7 @@
 """neuronetz-gateway: secure multi-tenant API gateway in front of Ollama."""
 from __future__ import annotations
 # Package version; app.py imports it and reports it as the FastAPI app version.
 __version__ = "0.1.0"
 # Only the version string is part of this package's public surface.
 __all__ = ["__version__"]
--- a/src/neuronetz_gateway/__main__.py
+++ b/src/neuronetz_gateway/__main__.py
@@ -0,0 +1,28 @@
 """Uvicorn entry point: ``python -m neuronetz_gateway``.
 Binds the app to ``GATEWAY_BIND_HOST``:``GATEWAY_BIND_PORT`` (default
 0.0.0.0:8080). The factory string is passed to uvicorn so the app is built in
 the worker process.
 """
 from __future__ import annotations
 import uvicorn
 from neuronetz_gateway.config import get_settings
 def main() -> None:
    """Serve the gateway with uvicorn on the configured host and port.
    The application is referenced by its import string with ``factory=True``,
    so uvicorn calls ``create_app`` itself instead of receiving a built app.
    """
    cfg = get_settings()
    # uvicorn's log-level names are lowercase; the setting defaults to "INFO".
    uvicorn_level = cfg.gateway_log_level.lower()
    uvicorn.run(
        "neuronetz_gateway.app:create_app",
        host=cfg.gateway_bind_host,
        port=cfg.gateway_bind_port,
        log_level=uvicorn_level,
        factory=True,
    )
 if __name__ == "__main__":
    main()
--- a/src/neuronetz_gateway/app.py
+++ b/src/neuronetz_gateway/app.py
@@ -0,0 +1,111 @@
 """FastAPI application factory.
 ``create_app()`` is the shared contract entry point: other agents (DevOps, QA)
 import and serve this. It configures logging, installs the request-id and auth
 middleware, registers the sanitizing exception handlers, mounts routers, and
 binds the lifespan that manages backend handles + background tasks.
 Production safety: FastAPI's ``/docs`` + ``/openapi.json`` are disabled by
 default (enabled only via ``DOCS_ENABLED``). The ``/playground`` route is served
 only when ``PLAYGROUND_ENABLED`` is true and ``PLAYGROUND_FILE`` exists.
 """
 from __future__ import annotations
 import uuid
 from pathlib import Path
 from fastapi import FastAPI, Request
 from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
 from starlette.responses import HTMLResponse, Response
 from starlette.types import ASGIApp
 from neuronetz_gateway import __version__
 from neuronetz_gateway.auth.middleware import AuthMiddleware
 from neuronetz_gateway.config import Settings, get_settings
 from neuronetz_gateway.errors import register_exception_handlers
 from neuronetz_gateway.lifespan import lifespan
 from neuronetz_gateway.observability.logging import configure_logging
 from neuronetz_gateway.routes import health, ollama_native, openai_compat
 class RequestIDMiddleware(BaseHTTPMiddleware):
    """Assign/propagate a request id and expose it on ``request.state``.
    An inbound value in the configured header is honoured only when it parses
    as a UUID; it is then normalised to the canonical hyphenated form. Any
    other value (missing, empty, malformed) is replaced by a freshly minted
    UUID4. This keeps client-controlled text out of the id and keeps the id
    compatible with UUID-typed storage such as ``audit_log.request_id``.
    The id is echoed on the response and stored on ``request.state.request_id``.
    NOTE(review): the header is accepted from any peer; this middleware does
    not check whether the sender is one of the configured trusted proxies.
    """
    def __init__(self, app: ASGIApp, header_name: str) -> None:
        """Wrap ``app`` and remember which header carries the request id."""
        super().__init__(app)
        self._header = header_name
    @staticmethod
    def _parse_inbound(raw: str | None) -> str | None:
        """Return ``raw`` as a canonical UUID string, or ``None`` if unusable."""
        if not raw:
            return None
        try:
            return str(uuid.UUID(raw.strip()))
        except ValueError:
            # Not a UUID: treat it as absent rather than trusting arbitrary text.
            return None
    async def dispatch(
        self, request: Request, call_next: RequestResponseEndpoint
    ) -> Response:
        """Attach a request id to the request state and to the response header."""
        inbound = self._parse_inbound(request.headers.get(self._header))
        request_id = inbound or str(uuid.uuid4())
        request.state.request_id = request_id
        response = await call_next(request)
        response.headers[self._header] = request_id
        return response
 def _register_playground(app: FastAPI, cfg: Settings) -> None:
    """Add the flag-gated ``/playground`` route (HTML asset, owned by docs agent).
    The file is read off the event loop via ``asyncio.to_thread`` so a slow disk
    cannot stall request handling. A path that is missing, is not a regular
    file, or cannot be read (removed between the check and the read, permission
    denied, ...) yields a plain 404 rather than an unhandled error.
    Args:
        app: Application the route is registered on.
        cfg: Settings providing ``playground_file``, re-read on every request.
    """
    import asyncio as _asyncio
    def _load(path_str: str) -> str | None:
        """Return the file's text, or ``None`` when it cannot be served."""
        p = Path(path_str)
        try:
            # is_file() keeps directories and special files out; the read itself
            # can still fail, so both sit under the same handler.
            if not p.is_file():
                return None
            return p.read_text(encoding="utf-8")
        except OSError:
            return None
    @app.get("/playground", include_in_schema=False)
    async def playground() -> Response:
        content = await _asyncio.to_thread(_load, cfg.playground_file)
        if content is None:
            return Response(status_code=404, content="Not found")
        return HTMLResponse(content)
 def create_app(settings: Settings | None = None) -> FastAPI:
    """Assemble the FastAPI application from the given (or cached) settings.
    Steps, in order: configure logging, create the app with docs endpoints
    gated by ``docs_enabled``, publish the settings on ``app.state``, install
    the auth and request-id middleware, register the exception handlers, mount
    the routers, and optionally add the ``/playground`` route.
    """
    conf = settings if settings else get_settings()
    configure_logging(level=conf.gateway_log_level, fmt=conf.gateway_log_format)
    # Interactive docs and the OpenAPI schema are served only when enabled.
    expose_docs = conf.docs_enabled
    app = FastAPI(
        title="neuronetz-gateway",
        version=__version__,
        lifespan=lifespan,
        docs_url="/docs" if expose_docs else None,
        redoc_url="/redoc" if expose_docs else None,
        openapi_url="/openapi.json" if expose_docs else None,
    )
    # Some test setups hit the auth middleware before lifespan has run, so the
    # settings are published here as well; lifespan assigns the same attribute.
    app.state.settings = conf
    # The middleware added last ends up outermost. Adding Auth first and
    # RequestID second therefore puts Auth inside RequestID, so the sanitized
    # 401 emitted by the auth middleware always has a request id available.
    app.add_middleware(AuthMiddleware)
    app.add_middleware(RequestIDMiddleware, header_name=conf.gateway_request_id_header)
    register_exception_handlers(app)
    for router in (health.router, openai_compat.router, ollama_native.router):
        app.include_router(router)
    if conf.playground_enabled:
        _register_playground(app, conf)
    return app
 # Public names of this module; ``_register_playground`` stays private.
 __all__ = ["RequestIDMiddleware", "create_app"]
--- a/src/neuronetz_gateway/config.py
+++ b/src/neuronetz_gateway/config.py
@@ -0,0 +1,86 @@
 """Application configuration via Pydantic Settings v2.
 Reads every environment variable documented in SPEC §7 with the documented
 defaults. Boot fails loudly (ValidationError) on invalid config.
 """
 from __future__ import annotations
 from functools import lru_cache
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
 class Settings(BaseSettings):
    """Gateway runtime configuration. All fields map to SPEC §7 env vars.
    ``model_config`` points at a ``.env`` file (UTF-8), matches variable names
    case-insensitively and ignores variables that do not correspond to a
    field. Every field has a default, so ``Settings()`` can be built with no
    environment at all.
    """
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
        case_sensitive=False,
    )
    # --- Service ---
    gateway_bind_host: str = Field(default="0.0.0.0")  # noqa: S104 - bind-all is intended in container
    gateway_bind_port: int = Field(default=8080)
    gateway_log_level: str = Field(default="INFO")
    gateway_log_format: str = Field(default="json")  # json|console
    gateway_request_id_header: str = Field(default="X-Request-ID")
    # Comma-separated list; use ``trusted_proxies_list`` for the parsed form.
    gateway_trusted_proxies: str = Field(default="127.0.0.1,caddy")
    # --- Upstream (Ollama) ---
    ollama_base_url: str = Field(default="http://ollama:11434")
    ollama_connect_timeout_s: int = Field(default=5)
    ollama_read_timeout_s: int = Field(default=600)
    ollama_max_connections: int = Field(default=64)
    # --- Model discovery (SPEC §4.6) ---
    model_discovery_refresh_s: int = Field(default=60)
    model_discovery_cache_ttl_s: int = Field(default=120)
    # --- Database ---
    database_url: str = Field(
        default="postgresql+asyncpg://gateway:gateway@postgres:5432/neuronetz",
    )
    database_pool_size: int = Field(default=10)
    database_pool_overflow: int = Field(default=20)
    # --- Redis ---
    redis_url: str = Field(default="redis://redis:6379/0")
    redis_key_cache_ttl_s: int = Field(default=60)
    # --- Limits ---
    default_rpm: int = Field(default=60)
    default_tpm: int = Field(default=100_000)
    default_concurrent: int = Field(default=8)
    max_request_body_bytes: int = Field(default=262_144)
    max_num_predict: int = Field(default=4096)
    # --- Security ---
    argon2_time_cost: int = Field(default=3)
    argon2_memory_cost_kib: int = Field(default=65_536)
    argon2_parallelism: int = Field(default=4)
    auth_failure_rate_limit_per_ip_per_min: int = Field(default=20)
    # --- Audit ---
    audit_buffer_size: int = Field(default=1000)
    prompt_log_default_retention_days: int = Field(default=30)
    audit_log_default_retention_days: int = Field(default=365)
    # --- Playground / docs (prod-safe defaults: both OFF) ---
    playground_enabled: bool = Field(default=False)
    playground_file: str = Field(default="/app/playground/index.html")
    docs_enabled: bool = Field(default=False)
    @property
    def trusted_proxies_list(self) -> list[str]:
        """Parse the comma-separated trusted-proxy list into individual hosts.
        Each entry is stripped of surrounding whitespace and empty entries
        (e.g. from a trailing comma) are dropped.
        """
        return [p.strip() for p in self.gateway_trusted_proxies.split(",") if p.strip()]
@lru_cache(maxsize=1)
 def get_settings() -> Settings:
    """Return a cached Settings instance, constructed from the environment.
    ``Settings()`` is built on the first call only; later calls return that
    same object. Environment changes made afterwards are therefore not picked
    up unless ``get_settings.cache_clear()`` is called first.
    """
    return Settings()
--- a/src/neuronetz_gateway/db/__init__.py
+++ b/src/neuronetz_gateway/db/__init__.py
@@ -0,0 +1,3 @@
 """Database access layer: SQLAlchemy models, session factory, repositories."""
 from __future__ import annotations
--- a/src/neuronetz_gateway/db/models.py
+++ b/src/neuronetz_gateway/db/models.py
@@ -0,0 +1,292 @@
 """SQLAlchemy 2.0 (async) ORM models for schema ``gateway`` per SPEC §5.
 These mirror the migration in ``alembic/versions/0001_initial.py`` exactly.
 The migration is the authoritative DDL; these models are for application use.
 """
 from __future__ import annotations
 import datetime
 import enum
 import uuid
 from sqlalchemy import (
    BigInteger,
    Boolean,
    ForeignKey,
    Integer,
    MetaData,
    String,
    Text,
    text,
 )
 from sqlalchemy.dialects.postgresql import ARRAY, ENUM, INET, JSONB, TIMESTAMP, UUID
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
 # Postgres schema used as the MetaData default and for the enum types below.
 GATEWAY_SCHEMA = "gateway"
 # Stable naming convention so Alembic autogenerate and ad-hoc DDL agree.
 # Keys: ix = index, uq = unique, ck = check, fk = foreign key, pk = primary key.
 _NAMING_CONVENTION = {
    "ix": "ix_%(column_0_label)s",
    "uq": "uq_%(table_name)s_%(column_0_name)s",
    "ck": "ck_%(table_name)s_%(constraint_name)s",
    "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
    "pk": "pk_%(table_name)s",
 }
 class Base(DeclarativeBase):
    """Declarative base; all tables live in the ``gateway`` schema.
    Every model inheriting from this class shares one ``MetaData`` whose
    default schema is ``GATEWAY_SCHEMA`` and whose constraint and index names
    follow ``_NAMING_CONVENTION``.
    """
    metadata = MetaData(schema=GATEWAY_SCHEMA, naming_convention=_NAMING_CONVENTION)
 class KeyStatus(enum.StrEnum):
    """Lifecycle states for an API key (SPEC §5 ``gateway.key_status``).
    Member names and values are the same strings; ``_key_status_enum`` maps
    this class onto the Postgres type ``key_status``.
    """
    active = "active"
    disabled = "disabled"
    revoked = "revoked"
 class TenantStatus(enum.StrEnum):
    """Lifecycle states for a tenant (SPEC §5 ``gateway.tenant_status``).
    Member names and values are the same strings; ``_tenant_status_enum`` maps
    this class onto the Postgres type ``tenant_status``.
    """
    active = "active"
    suspended = "suspended"
    closed = "closed"
 class BudgetPeriod(enum.StrEnum):
    """Budget accounting periods (SPEC §5 ``gateway.budget_period``).
    Member names and values are the same strings; ``_budget_period_enum`` maps
    this class onto the Postgres type ``budget_period``.
    """
    day = "day"
    month = "month"
    total = "total"
 # Reuse existing Postgres enum types (the migration creates them); do not let
 # SQLAlchemy try to CREATE TYPE again at runtime.
 # Each object binds one of the Python enums above to the Postgres type of the
 # given name inside ``GATEWAY_SCHEMA``; the models below use them as column types.
 _key_status_enum = ENUM(KeyStatus, name="key_status", schema=GATEWAY_SCHEMA, create_type=False)
 _tenant_status_enum = ENUM(
    TenantStatus, name="tenant_status", schema=GATEWAY_SCHEMA, create_type=False
 )
 _budget_period_enum = ENUM(
    BudgetPeriod, name="budget_period", schema=GATEWAY_SCHEMA, create_type=False
 )
 class Tenant(Base):
    """A tenant: the top-level isolation and ownership boundary.
    Maps table ``tenants``. ``id``, ``status``, ``created_at`` and the metadata
    column all have server-side defaults, so only ``name`` (unique) must be
    supplied on insert.
    """
    __tablename__ = "tenants"
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()")
    )
    name: Mapped[str] = mapped_column(Text, nullable=False, unique=True)
    status: Mapped[TenantStatus] = mapped_column(
        _tenant_status_enum, nullable=False, server_default=text("'active'")
    )
    created_at: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
    # The column is called "metadata", but that attribute name is reserved on
    # declarative classes, hence the differently named Python attribute.
    tenant_metadata: Mapped[dict[str, object]] = mapped_column(
        "metadata", JSONB, nullable=False, server_default=text("'{}'::jsonb")
    )
 class TenantLimit(Base):
    """Per-tenant default limits and retention policy.
    Maps table ``tenant_limits``. The primary key is also the foreign key to
    ``tenants.id`` (deleted with the tenant), so there is at most one row per
    tenant.
    """
    __tablename__ = "tenant_limits"
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("tenants.id", ondelete="CASCADE"),
        primary_key=True,
    )
    # Rate/concurrency limits; server defaults 60 / 100000 / 8.
    rpm: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("60"))
    tpm: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("100000"))
    concurrent: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("8"))
    # Token budgets per period; nullable with no default.
    # NOTE(review): NULL presumably means "no budget for that period" — confirm.
    tokens_daily: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    tokens_monthly: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    tokens_total: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    allowed_models: Mapped[list[str]] = mapped_column(
        ARRAY(Text), nullable=False, server_default=text("'{}'")
    )
    # When true, the tenant may use ANY model currently installed on the Ollama
    # backend (resolved live via model discovery). When false (default), access is
    # default-deny and restricted to ``allowed_models`` intersected with the live set.
    allow_all_models: Mapped[bool] = mapped_column(
        Boolean, nullable=False, server_default=text("false")
    )
    log_prompts_default: Mapped[bool] = mapped_column(
        Boolean, nullable=False, server_default=text("false")
    )
    prompt_retention_days: Mapped[int] = mapped_column(
        Integer, nullable=False, server_default=text("30")
    )
    audit_retention_days: Mapped[int] = mapped_column(
        Integer, nullable=False, server_default=text("365")
    )
 class ApiKey(Base):
    """An API key belonging to a tenant. The full key is never stored.
    Maps table ``api_keys``. Only the unique ``prefix`` and a ``key_hash`` are
    persisted. Rows are deleted together with their tenant (``ON DELETE
    CASCADE`` on ``tenant_id``).
    """
    __tablename__ = "api_keys"
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()")
    )
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("tenants.id", ondelete="CASCADE"),
        nullable=False,
    )
    prefix: Mapped[str] = mapped_column(Text, nullable=False, unique=True)
    key_hash: Mapped[str] = mapped_column(Text, nullable=False)
    name: Mapped[str] = mapped_column(Text, nullable=False)
    status: Mapped[KeyStatus] = mapped_column(
        _key_status_enum, nullable=False, server_default=text("'active'")
    )
    # Server default grants both "chat" and "embeddings".
    scopes: Mapped[list[str]] = mapped_column(
        ARRAY(Text), nullable=False, server_default=text("'{chat,embeddings}'")
    )
    created_at: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
    last_used_at: Mapped[datetime.datetime | None] = mapped_column(
        TIMESTAMP(timezone=True), nullable=True
    )
    expires_at: Mapped[datetime.datetime | None] = mapped_column(
        TIMESTAMP(timezone=True), nullable=True
    )
    # NOTE(review): NULL presumably inherits the tenant's log_prompts_default — confirm.
    log_prompts: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
    # Column is named "metadata"; that attribute name is reserved on
    # declarative classes, hence ``key_metadata`` on the Python side.
    key_metadata: Mapped[dict[str, object]] = mapped_column(
        "metadata", JSONB, nullable=False, server_default=text("'{}'::jsonb")
    )
 class KeyLimit(Base):
    """Per-key overrides; NULL columns inherit the tenant value.
    Maps table ``key_limits``. The primary key is also the foreign key to
    ``api_keys.id`` (deleted with the key), so there is at most one row per
    key. Every override column is nullable.
    """
    __tablename__ = "key_limits"
    key_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("api_keys.id", ondelete="CASCADE"),
        primary_key=True,
    )
    rpm: Mapped[int | None] = mapped_column(Integer, nullable=True)
    tpm: Mapped[int | None] = mapped_column(Integer, nullable=True)
    concurrent: Mapped[int | None] = mapped_column(Integer, nullable=True)
    tokens_daily: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    tokens_monthly: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    tokens_total: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    allowed_models: Mapped[list[str] | None] = mapped_column(ARRAY(Text), nullable=True)
    # NULL = inherit tenant's allow_all_models; otherwise overrides it for this key.
    allow_all_models: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
 class BudgetUsage(Base):
    """Token/request accounting per key, period, and period start.
    Maps table ``budget_usage``. The composite primary key is
    ``(key_id, period, period_start)``, so there is one row per key and period
    window. Rows are deleted with their key. All counters default to 0.
    """
    __tablename__ = "budget_usage"
    key_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("api_keys.id", ondelete="CASCADE"),
        primary_key=True,
    )
    period: Mapped[BudgetPeriod] = mapped_column(_budget_period_enum, primary_key=True)
    period_start: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True), primary_key=True
    )
    tokens_in: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0"))
    tokens_out: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0"))
    requests: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0"))
 class AuditLog(Base):
    """Always-on append-only request metadata log.
    Maps table ``audit_log``. ``request_id``, ``method``, ``path`` and
    ``status`` are required; every other request attribute is nullable.
    """
    __tablename__ = "audit_log"
    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    ts: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
    # UUID-typed: whatever produces the request id must supply a valid UUID.
    request_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)
    # tenant_id / key_id are plain UUID columns here: no ForeignKey is declared.
    tenant_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    key_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    key_prefix: Mapped[str | None] = mapped_column(Text, nullable=True)
    method: Mapped[str] = mapped_column(Text, nullable=False)
    path: Mapped[str] = mapped_column(Text, nullable=False)
    model: Mapped[str | None] = mapped_column(Text, nullable=True)
    tokens_in: Mapped[int | None] = mapped_column(Integer, nullable=True)
    tokens_out: Mapped[int | None] = mapped_column(Integer, nullable=True)
    latency_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
    # NOTE(review): presumably the HTTP response status code — confirm against the writer.
    status: Mapped[int] = mapped_column(Integer, nullable=False)
    client_ip: Mapped[str | None] = mapped_column(INET, nullable=True)
    user_agent: Mapped[str | None] = mapped_column(Text, nullable=True)
    error_code: Mapped[str | None] = mapped_column(Text, nullable=True)
 class PromptLog(Base):
    """Opt-in, TTL'd capture of request/response bodies.
    Maps table ``prompt_log``. Each row references one ``audit_log`` row and is
    deleted with it. ``retention_until`` is required and has no server default,
    so the writer must compute it.
    """
    __tablename__ = "prompt_log"
    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    audit_id: Mapped[int] = mapped_column(
        BigInteger,
        ForeignKey("audit_log.id", ondelete="CASCADE"),
        nullable=False,
    )
    ts: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
    # Plain UUID column: no ForeignKey to api_keys is declared here.
    key_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)
    request_body: Mapped[dict[str, object]] = mapped_column(JSONB, nullable=False)
    response_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    retention_until: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False
    )
 class Revocation(Base):
    """Outbox table written by console (or gateway) to revoke a key.
    An ``AFTER INSERT`` trigger fires ``pg_notify('key_revoked', key_id)``.
    The trigger is not defined in this module. ``processed_at`` starts out
    NULL (nullable, no default).
    """
    __tablename__ = "revocations"
    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    # Plain UUID column: no ForeignKey to api_keys is declared here.
    key_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)
    ts: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
    )
    # NOTE(review): uses String where the other free-text columns in this module
    # use Text — confirm this matches the column type in the migration.
    reason: Mapped[str | None] = mapped_column(String, nullable=True)
    processed_at: Mapped[datetime.datetime | None] = mapped_column(
        TIMESTAMP(timezone=True), nullable=True
    )
 # Public names: the schema constant, the declarative base, the three Python
 # enums and the eight table models. The ``_``-prefixed helpers stay private.
 __all__ = [
    "GATEWAY_SCHEMA",
    "ApiKey",
    "AuditLog",
    "Base",
    "BudgetPeriod",
    "BudgetUsage",
    "KeyLimit",
    "KeyStatus",
    "PromptLog",
    "Revocation",
    "Tenant",
    "TenantLimit",
    "TenantStatus",
 ]
--- a/src/neuronetz_gateway/db/session.py
+++ b/src/neuronetz_gateway/db/session.py
@@ -0,0 +1,53 @@
 """Async SQLAlchemy engine and session factory construction.
 Phase 1 provides the wiring only; the lifespan owns the engine instance and
 stores it on ``app.state``. Business-logic callers should depend on the
 session factory via ``deps.py``.
 """
 from __future__ import annotations
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
 from sqlalchemy.ext.asyncio import (
    AsyncEngine,
    AsyncSession,
    async_sessionmaker,
    create_async_engine,
 )
 from neuronetz_gateway.config import Settings
 def create_engine(settings: Settings) -> AsyncEngine:
    """Create the pooled async engine described by ``settings``.
    Uses ``database_url`` as the connection URL and takes the pool size and
    overflow from ``database_pool_size`` / ``database_pool_overflow``.
    ``pool_pre_ping`` is enabled so pooled connections are checked before use.
    """
    engine = create_async_engine(
        settings.database_url,
        future=True,
        pool_pre_ping=True,
        max_overflow=settings.database_pool_overflow,
        pool_size=settings.database_pool_size,
    )
    return engine
 def create_session_factory(engine: AsyncEngine) -> async_sessionmaker[AsyncSession]:
    """Return an ``AsyncSession`` factory bound to ``engine``.
    ``expire_on_commit`` is disabled, so loaded objects keep their attribute
    values after a commit.
    """
    factory: async_sessionmaker[AsyncSession] = async_sessionmaker(
        bind=engine,
        class_=AsyncSession,
        expire_on_commit=False,
    )
    return factory
@asynccontextmanager
 async def session_scope(
    factory: async_sessionmaker[AsyncSession],
 ) -> AsyncIterator[AsyncSession]:
    """Provide a transactional session scope, committing on success.
    Opens a session from ``factory``, yields it to the caller, and commits when
    the caller's block finishes normally. If the block (or the commit itself)
    raises an ``Exception``, the session is rolled back and the exception is
    re-raised. The session is closed by the ``async with`` in every case.
    """
    async with factory() as session:
        try:
            yield session
            # Inside the try on purpose: a failing commit is rolled back too.
            await session.commit()
        except Exception:
            await session.rollback()
            raise
 # Public names of this module: the engine and session-factory builders and the scope helper.
 __all__ = ["create_engine", "create_session_factory", "session_scope"]
--- a/src/neuronetz_gateway/deps.py
+++ b/src/neuronetz_gateway/deps.py
@@ -0,0 +1,180 @@
 """FastAPI dependency-injection providers.
 Exposes typed accessors for the handles placed on ``app.state`` by the lifespan
 (Redis, the upstream httpx client, the DB session factory, the discovery cache)
 plus the request principal and the proxy client.
 QA override contract
 --------------------
 Routes obtain the upstream proxy via :func:`get_ollama_client`. Tests override
 the *Ollama backend* by overriding this provider::
    from neuronetz_gateway.deps import get_ollama_client
    from neuronetz_gateway.proxy.ollama import OllamaClient
    import httpx
    from tests.integration.mock_ollama import create_mock_ollama
    transport = httpx.ASGITransport(app=create_mock_ollama())
    mock_http = httpx.AsyncClient(transport=transport, base_url="http://ollama")
    app.dependency_overrides[get_ollama_client] = lambda: OllamaClient(mock_http)
 Because ``get_ollama_client`` returns a fully-built :class:`OllamaClient`, an
 override needs no access to ``app.state`` and can point at the in-process mock.
 """
 from __future__ import annotations
 from collections.abc import AsyncIterator
 from typing import Annotated
 import httpx
 import redis.asyncio as redis
 from fastapi import Depends, Request
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
 from neuronetz_gateway.audit.writer import AuditWriter
 from neuronetz_gateway.auth.principal import Principal
 from neuronetz_gateway.budget.counter import BudgetCounter
 from neuronetz_gateway.config import Settings, get_settings
 from neuronetz_gateway.errors import AuthenticationError, DependencyUnavailableError
 from neuronetz_gateway.proxy.discovery import DiscoveryCache
 from neuronetz_gateway.proxy.ollama import OllamaClient
 from neuronetz_gateway.proxy.pipeline import Pipeline
 from neuronetz_gateway.ratelimit.concurrency import ConcurrencyLimiter
 from neuronetz_gateway.ratelimit.sliding_window import SlidingWindowLimiter
 def get_config() -> Settings:
    """Provide the cached application settings.
    Thin dependency wrapper that delegates to ``get_settings()``.
    """
    return get_settings()
 def get_redis(request: Request) -> redis.Redis:
    """Return the Redis client stored on ``app.state``, failing closed if absent.
    Raises:
        DependencyUnavailableError: when ``app.state.redis`` is unset or ``None``.
    """
    shared: redis.Redis | None = getattr(request.app.state, "redis", None)
    if shared is not None:
        return shared
    raise DependencyUnavailableError(internal_detail="redis client not initialised")
 def get_http_client(request: Request) -> httpx.AsyncClient:
    """Return the upstream httpx client stored on ``app.state``.
    Raises:
        DependencyUnavailableError: when ``app.state.http_client`` is unset or
            ``None``.
    """
    shared: httpx.AsyncClient | None = getattr(request.app.state, "http_client", None)
    if shared is not None:
        return shared
    raise DependencyUnavailableError(internal_detail="http client not initialised")
 def get_ollama_client(request: Request) -> OllamaClient:
    """Provide the upstream Ollama proxy client (override target for tests).
    Wraps the shared httpx client from ``get_http_client`` in a new
    ``OllamaClient`` on every call; propagates ``DependencyUnavailableError``
    when the httpx client is missing.
    """
    return OllamaClient(get_http_client(request))
 def get_discovery_cache(request: Request) -> DiscoveryCache:
    """Return the discovery cache stored on ``app.state``, failing closed if absent.
    Raises:
        DependencyUnavailableError: when ``app.state.discovery_cache`` is unset
            or ``None``.
    """
    shared: DiscoveryCache | None = getattr(request.app.state, "discovery_cache", None)
    if shared is not None:
        return shared
    raise DependencyUnavailableError(internal_detail="discovery cache not initialised")
 def get_principal(request: Request) -> Principal:
    """Return the principal found on ``request.state``.
    The principal is expected to have been attached before routing; when it is
    missing on a route that depends on it, the request is rejected rather than
    served anonymously.
    Raises:
        AuthenticationError: when ``request.state.principal`` is unset or ``None``.
    """
    attached: Principal | None = getattr(request.state, "principal", None)
    if attached is not None:
        return attached
    raise AuthenticationError(internal_detail="principal missing on authenticated route")
 def get_audit_writer(request: Request) -> AuditWriter:
    """Return the audit writer stored on ``app.state``, failing closed if absent.
    Raises:
        DependencyUnavailableError: when ``app.state.audit_writer`` is unset or
            ``None``.
    """
    shared: AuditWriter | None = getattr(request.app.state, "audit_writer", None)
    if shared is not None:
        return shared
    raise DependencyUnavailableError(internal_detail="audit writer not initialised")
 def get_pipeline(
    request: Request,
    principal: Annotated[Principal, Depends(get_principal)],
    settings: Annotated[Settings, Depends(get_config)],
    ollama: Annotated[OllamaClient, Depends(get_ollama_client)],
    discovery: Annotated[DiscoveryCache, Depends(get_discovery_cache)],
    redis_client: Annotated[redis.Redis, Depends(get_redis)],
    audit: Annotated[AuditWriter, Depends(get_audit_writer)],
 ) -> Pipeline:
    """Build the enforcement + proxy ``Pipeline`` for the current request.
    Every collaborator except the DB session factory arrives through FastAPI
    dependency injection. The rate limiter, concurrency limiter and budget
    counter are created fresh per request on top of the shared Redis client.
    """
    # Read directly from app.state and passed through as-is: a missing factory
    # reaches the pipeline as ``None`` instead of failing closed here.
    session_factory: async_sessionmaker[AsyncSession] | None = getattr(
        request.app.state, "db_sessionmaker", None
    )
    window_limiter = SlidingWindowLimiter(redis_client)
    slot_limiter = ConcurrencyLimiter(redis_client)
    token_budget = BudgetCounter(redis_client)
    return Pipeline(
        request=request,
        principal=principal,
        settings=settings,
        ollama=ollama,
        discovery=discovery,
        rate_limiter=window_limiter,
        concurrency=slot_limiter,
        budget=token_budget,
        audit=audit,
        sessionmaker=session_factory,
    )
 def _get_sessionmaker(request: Request) -> async_sessionmaker[AsyncSession]:
    """Return the DB session factory stored on ``app.state``.
    Raises:
        DependencyUnavailableError: when ``app.state.db_sessionmaker`` is unset
            or ``None``.
    """
    session_factory: async_sessionmaker[AsyncSession] | None = getattr(
        request.app.state, "db_sessionmaker", None
    )
    if session_factory is not None:
        return session_factory
    raise DependencyUnavailableError(internal_detail="db session factory not initialised")
 async def get_db_session(request: Request) -> AsyncIterator[AsyncSession]:
    """Provide a request-scoped async DB session.
    Yields a session opened from the factory on ``app.state``; the session's
    context manager is exited when the dependency is torn down. No commit is
    issued here.
    Raises:
        DependencyUnavailableError: via ``_get_sessionmaker`` when no session
            factory is present on ``app.state``.
    """
    factory = _get_sessionmaker(request)
    async with factory() as session:
        yield session
 # Reusable ``Annotated`` aliases: a route parameter typed with one of these
 # (e.g. ``redis_client: RedisDep``) is resolved through the matching provider
 # above without repeating ``Depends(...)`` in every signature.
 ConfigDep = Annotated[Settings, Depends(get_config)]
 RedisDep = Annotated[redis.Redis, Depends(get_redis)]
 HttpClientDep = Annotated[httpx.AsyncClient, Depends(get_http_client)]
 OllamaClientDep = Annotated[OllamaClient, Depends(get_ollama_client)]
 DiscoveryCacheDep = Annotated[DiscoveryCache, Depends(get_discovery_cache)]
 PrincipalDep = Annotated[Principal, Depends(get_principal)]
 AuditWriterDep = Annotated[AuditWriter, Depends(get_audit_writer)]
 PipelineDep = Annotated[Pipeline, Depends(get_pipeline)]
 DbSessionDep = Annotated[AsyncSession, Depends(get_db_session)]
 __all__ = [
    "AuditWriterDep",
    "ConfigDep",
    "DbSessionDep",
    "DiscoveryCacheDep",
    "HttpClientDep",
    "OllamaClientDep",
    "PipelineDep",
    "PrincipalDep",
    "RedisDep",
    "get_audit_writer",
    "get_config",
    "get_db_session",
    "get_discovery_cache",
    "get_http_client",
    "get_ollama_client",
    "get_pipeline",
    "get_principal",
    "get_redis",
 ]
--- a/src/neuronetz_gateway/errors.py
+++ b/src/neuronetz_gateway/errors.py
@@ -0,0 +1,179 @@
 """Exception types and FastAPI exception handlers.
 Hard rule (SPEC §3, AGENT_PROMPT non-negotiable #4): never leak upstream or
 internal error details to the client. Every error response is a generic,
 sanitized JSON body carrying only a stable ``error.code``, a safe message, and
 the request id. Detailed context is logged server-side, never returned.
 """
 from __future__ import annotations
 from fastapi import FastAPI, Request, status
 from fastapi.responses import JSONResponse
 from neuronetz_gateway.observability.logging import get_logger
 _log = get_logger("errors")
 class GatewayError(Exception):
    """Base class for gateway errors that map to a sanitized HTTP response.
    ``message`` MUST be safe to return to clients. Anything sensitive belongs
    in ``internal_detail`` which is logged but never serialized to the client.
    Subclasses override the class-level ``status_code``, ``code`` and
    ``message`` defaults.
    """
    # HTTP status used for the response.
    status_code: int = status.HTTP_500_INTERNAL_SERVER_ERROR
    # Stable machine-readable identifier placed in ``error.code``.
    code: str = "internal_error"
    # Default client-facing message; replaced per instance when one is supplied.
    message: str = "An internal error occurred."
    def __init__(self, message: str | None = None, *, internal_detail: str | None = None) -> None:
        """Initialise the error.
        Args:
            message: Optional client-safe message. ``None`` or an empty string
                keeps the class default.
            internal_detail: Optional server-side-only context for logging.
        """
        # An empty message is treated like an omitted one, so ``str(exc)`` and
        # ``exc.message`` always agree and a blank message is never sent out.
        if message:
            self.message = message
        super().__init__(self.message)
        self.internal_detail = internal_detail
 class AuthenticationError(GatewayError):
    """Missing/invalid credentials. Fail closed, no detail.
    Rendered as HTTP 401 with ``error.code == "unauthorized"``.
    """
    status_code = status.HTTP_401_UNAUTHORIZED
    code = "unauthorized"
    message = "Authentication required."
 class AuthorizationError(GatewayError):
    """Authenticated but not permitted (scope/model/endpoint denied).
    Rendered as HTTP 403 with ``error.code == "forbidden"``.
    """
    status_code = status.HTTP_403_FORBIDDEN
    code = "forbidden"
    message = "This request is not permitted."
 class RateLimitError(GatewayError):
    """Rate limit exceeded. Handler attaches ``Retry-After`` when known.
    Rendered as HTTP 429 with ``error.code == "rate_limited"``.
    """
    status_code = status.HTTP_429_TOO_MANY_REQUESTS
    code = "rate_limited"
    message = "Rate limit exceeded."
    def __init__(
        self,
        message: str | None = None,
        *,
        retry_after: int | None = None,
        internal_detail: str | None = None,
    ) -> None:
        """Initialise the error.
        Args:
            message: Optional client-safe message overriding the class default.
            retry_after: Value for the ``Retry-After`` response header, or
                ``None`` to omit the header.
            internal_detail: Optional server-side-only context for logging.
        """
        super().__init__(message, internal_detail=internal_detail)
        self.retry_after = retry_after
 class BudgetExceededError(GatewayError):
    """Token budget exhausted for the active period.
    Shares HTTP 429 with ``RateLimitError`` but is distinguished by
    ``error.code == "budget_exceeded"``.
    """
    status_code = status.HTTP_429_TOO_MANY_REQUESTS
    code = "budget_exceeded"
    message = "Token budget exhausted for the current period."
 class RequestTooLargeError(GatewayError):
    """Request body exceeds the configured limit.
    Rendered as HTTP 413 with ``error.code == "request_too_large"``.
    """
    status_code = status.HTTP_413_REQUEST_ENTITY_TOO_LARGE
    code = "request_too_large"
    message = "Request body is too large."
 class UpstreamUnavailableError(GatewayError):
    """Ollama (or another dependency) is unreachable. Fail closed.
    Rendered as HTTP 502 with ``error.code == "upstream_unavailable"``.
    """
    status_code = status.HTTP_502_BAD_GATEWAY
    code = "upstream_unavailable"
    message = "The upstream service is temporarily unavailable."
 class DependencyUnavailableError(GatewayError):
    """A required backend (DB/Redis) is unavailable; serve 503, fail closed.
    Rendered as HTTP 503 with ``error.code == "service_unavailable"``.
    """
    status_code = status.HTTP_503_SERVICE_UNAVAILABLE
    code = "service_unavailable"
    message = "The service is temporarily unavailable."
 def _request_id(request: Request) -> str:
    """Return the request id found on ``request.state`` as a string.
    A missing or falsy ``request_id`` attribute yields the empty string.
    """
    raw_id = getattr(request.state, "request_id", None)
    if not raw_id:
        return ""
    return str(raw_id)
 def _error_response(
    request: Request,
    *,
    status_code: int,
    code: str,
    message: str,
    extra_headers: dict[str, str] | None = None,
 ) -> JSONResponse:
    """Assemble the sanitized JSON error envelope for a request.
    The body is ``{"error": {"code", "message", "request_id"}}``. The
    ``X-Request-ID`` header is set only when a request id is known, and any
    ``extra_headers`` are merged on top of it.
    """
    request_id = _request_id(request)
    headers: dict[str, str] = {}
    if request_id:
        headers["X-Request-ID"] = request_id
    if extra_headers:
        headers.update(extra_headers)
    payload = {"error": {"code": code, "message": message, "request_id": request_id}}
    return JSONResponse(status_code=status_code, content=payload, headers=headers)
 async def _gateway_error_handler(request: Request, exc: GatewayError) -> JSONResponse:
    """Turn a ``GatewayError`` into its sanitized JSON response.
    ``internal_detail`` is logged server-side when present and is never placed
    in the response. A ``RateLimitError`` carrying ``retry_after`` additionally
    gets a ``Retry-After`` header.
    """
    if exc.internal_detail:
        _log.warning(
            "gateway_error",
            code=exc.code,
            status_code=exc.status_code,
            internal_detail=exc.internal_detail,
        )
    retry_headers: dict[str, str] | None = None
    if isinstance(exc, RateLimitError):
        retry_after = exc.retry_after
        if retry_after is not None:
            retry_headers = {"Retry-After": str(retry_after)}
    return _error_response(
        request,
        status_code=exc.status_code,
        code=exc.code,
        message=exc.message,
        extra_headers=retry_headers,
    )
 async def _unhandled_error_handler(request: Request, exc: Exception) -> JSONResponse:
    """Catch-all: log the real exception, return a generic 500. No leakage.
    The response body uses the fixed ``internal_error`` code and message; nothing
    from ``exc`` is copied into it.
    """
    # The exception is handed to the logger only; it never reaches the client.
    _log.error("unhandled_exception", exc_info=exc)
    return _error_response(
        request,
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        code="internal_error",
        message="An internal error occurred.",
    )
 def register_exception_handlers(app: FastAPI) -> None:
    """Attach the gateway's sanitizing exception handlers to the app.
    Registers one handler for ``GatewayError`` (and therefore its subclasses)
    and a catch-all handler for any other ``Exception``.
    """
    # The GatewayError handler annotates its ``exc`` parameter with the narrower
    # GatewayError type, which mypy reports as an arg-type mismatch against the
    # handler signature expected here; the targeted ignore below silences that.
    app.add_exception_handler(GatewayError, _gateway_error_handler)  # type: ignore[arg-type]  # handler typed for GatewayError subclass
    app.add_exception_handler(Exception, _unhandled_error_handler)
 __all__ = [
    "AuthenticationError",
    "AuthorizationError",
    "BudgetExceededError",
    "DependencyUnavailableError",
    "GatewayError",
    "RateLimitError",
    "RequestTooLargeError",
    "UpstreamUnavailableError",
    "register_exception_handlers",
 ]
--- a/src/neuronetz_gateway/lifespan.py
+++ b/src/neuronetz_gateway/lifespan.py
@@ -0,0 +1,131 @@
 """Application lifespan: connect/dispose backends and run background tasks.
 Startup connects Postgres + Redis + the upstream httpx client, builds the
 argon2 hasher and the buffered audit writer, and launches the background tasks:
 the model-discovery poller (SPEC §4.6) and the Postgres revocation NOTIFY
 listener (SPEC §4.5). Connection failures are tolerated so ``/healthz`` always
 serves; ``/readyz`` reports true readiness. All handles live on ``app.state``.
 """
 from __future__ import annotations
 import asyncio
 import contextlib
 from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
 from typing import TYPE_CHECKING
 import httpx
 import redis.asyncio as redis
 from neuronetz_gateway.audit.writer import AuditWriter
 from neuronetz_gateway.auth.hashing import build_hasher
 from neuronetz_gateway.config import Settings, get_settings
 from neuronetz_gateway.db.session import create_engine, create_session_factory
 from neuronetz_gateway.observability.logging import get_logger
 from neuronetz_gateway.proxy.discovery import DiscoveryCache, discovery_loop
 from neuronetz_gateway.revocation import revocation_listener
 if TYPE_CHECKING:
    from fastapi import FastAPI
 _log = get_logger("lifespan")
 def _build_http_client(settings: Settings) -> httpx.AsyncClient:
    """Create the shared httpx client that talks to the Ollama upstream.
    The connect timeout also bounds pool acquisition, and the read timeout is
    reused as the write timeout. The connection cap comes from
    ``settings.ollama_max_connections``.
    """
    connect_s = settings.ollama_connect_timeout_s
    read_s = settings.ollama_read_timeout_s
    return httpx.AsyncClient(
        base_url=settings.ollama_base_url,
        timeout=httpx.Timeout(connect=connect_s, read=read_s, write=read_s, pool=connect_s),
        limits=httpx.Limits(max_connections=settings.ollama_max_connections),
    )
@asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Manage startup/shutdown of all backends and background tasks.
    On entry this populates ``app.state`` with ``settings``, ``hasher``,
    ``discovery_cache``, ``db_engine``/``db_sessionmaker``, ``redis``,
    ``http_client``, ``audit_writer`` and ``background_tasks``. On exit it
    delegates teardown to ``_shutdown``.
    """
    settings: Settings = get_settings()
    app.state.settings = settings
    app.state.hasher = build_hasher(settings)
    app.state.discovery_cache = DiscoveryCache()
    tasks: list[asyncio.Task[None]] = []
    # A failure while building the engine is logged and recorded as ``None``
    # handles rather than aborting startup.
    try:
        engine = create_engine(settings)
        app.state.db_engine = engine
        app.state.db_sessionmaker = create_session_factory(engine)
    except Exception as exc:  # noqa: BLE001 - tolerate so /healthz still serves
        _log.error("db_engine_init_failed", error=str(exc))
        app.state.db_engine = None
        app.state.db_sessionmaker = None
    # Same tolerance for the Redis client.
    try:
        app.state.redis = redis.from_url(settings.redis_url, decode_responses=True)
    except Exception as exc:  # noqa: BLE001 - tolerate so /healthz still serves
        _log.error("redis_init_failed", error=str(exc))
        app.state.redis = None
    app.state.http_client = _build_http_client(settings)
    # NOTE(review): db_sessionmaker may be None at this point — confirm that
    # AuditWriter tolerates a missing session factory.
    audit_writer = AuditWriter(settings.audit_buffer_size, app.state.db_sessionmaker)
    audit_writer.start()
    app.state.audit_writer = audit_writer
    # Background tasks (cancelled on shutdown).
    # NOTE(review): the discovery loop is started even when app.state.redis is
    # None — confirm that discovery_loop handles a missing Redis client.
    tasks.append(
        asyncio.create_task(
            discovery_loop(
                app.state.http_client, app.state.redis, app.state.discovery_cache, settings
            )
        )
    )
    # The revocation listener is started only when both Redis and the DB session
    # factory were created.
    if app.state.redis is not None and app.state.db_sessionmaker is not None:
        tasks.append(
            asyncio.create_task(
                revocation_listener(settings, app.state.redis, app.state.db_sessionmaker)
            )
        )
    app.state.background_tasks = tasks
    _log.info("gateway_startup_complete")
    try:
        yield
    finally:
        # Runs whether the application body exits normally or with an error.
        await _shutdown(app, tasks, audit_writer)
 async def _shutdown(
    app: FastAPI, tasks: list[asyncio.Task[None]], audit_writer: AuditWriter
 ) -> None:
    """Cancel background tasks and dispose of all backend handles.
    Teardown is best-effort: a failure in one step is suppressed (or logged) so
    that the remaining handles are still released.
    Args:
        app: Application whose ``state`` holds ``http_client``, ``redis`` and
            ``db_engine``; absent or ``None`` handles are skipped.
        tasks: Background tasks to cancel and await.
        audit_writer: Audit writer to stop once the tasks have finished.
    """
    for task in tasks:
        task.cancel()
    for task in tasks:
        try:
            await task
        except asyncio.CancelledError:
            # Expected outcome of the cancel() issued above.
            pass
        except Exception as exc:  # noqa: BLE001 - a crashed task must not abort teardown
            # Awaiting a task that already failed re-raises its exception. Log
            # it and carry on so the handles below are still closed.
            _log.warning("background_task_failed", error=str(exc))
    with contextlib.suppress(Exception):
        await audit_writer.stop()
    http_client: httpx.AsyncClient | None = getattr(app.state, "http_client", None)
    if http_client is not None:
        with contextlib.suppress(Exception):
            await http_client.aclose()
    redis_client = getattr(app.state, "redis", None)
    if redis_client is not None:
        with contextlib.suppress(Exception):
            await redis_client.aclose()
    engine = getattr(app.state, "db_engine", None)
    if engine is not None:
        with contextlib.suppress(Exception):
            await engine.dispose()
    _log.info("gateway_shutdown_complete")
 __all__ = ["lifespan"]
--- a/src/neuronetz_gateway/observability/__init__.py
+++ b/src/neuronetz_gateway/observability/__init__.py
@@ -0,0 +1,3 @@
 """Observability: structured logging and Prometheus metrics."""
 from __future__ import annotations
--- a/src/neuronetz_gateway/observability/logging.py
+++ b/src/neuronetz_gateway/observability/logging.py
@@ -0,0 +1,48 @@
 """structlog configuration.
 Renders JSON in production (``GATEWAY_LOG_FORMAT=json``) and a human-friendly
 console format in development. No secrets are ever logged; processors here
 must not introduce any.
 """
 from __future__ import annotations
 import logging
 from typing import Any
 import structlog
 def configure_logging(level: str = "INFO", fmt: str = "json") -> None:
    """Set up stdlib logging and structlog.
    Args:
        level: Name of a ``logging`` level, matched case-insensitively; names
            that are not attributes of the ``logging`` module fall back to
            ``INFO``.
        fmt: ``"console"`` selects the console renderer; any other value
            selects JSON output.
    """
    log_level = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(format="%(message)s", level=log_level)
    renderer: structlog.types.Processor = (
        structlog.dev.ConsoleRenderer()
        if fmt == "console"
        else structlog.processors.JSONRenderer()
    )
    # The renderer must stay last: it turns the event dict into the final output.
    processor_chain: list[structlog.types.Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        renderer,
    ]
    structlog.configure(
        processors=processor_chain,
        wrapper_class=structlog.make_filtering_bound_logger(log_level),
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
 def get_logger(name: str | None = None) -> Any:  # noqa: ANN401 - structlog returns a dynamic proxy
    """Return a bound structlog logger.
    Args:
        name: Optional logger name forwarded to ``structlog.get_logger``.
    """
    return structlog.get_logger(name)
 __all__ = ["configure_logging", "get_logger"]
--- a/src/neuronetz_gateway/routes/__init__.py
+++ b/src/neuronetz_gateway/routes/__init__.py
@@ -0,0 +1,3 @@
 """HTTP route modules: health, native Ollama passthrough, OpenAI-compat."""
 from __future__ import annotations
--- a/src/neuronetz_gateway/routes/health.py
+++ b/src/neuronetz_gateway/routes/health.py
@@ -0,0 +1,114 @@
 """Health, readiness, and metrics endpoints (SPEC §6.4).
 - ``GET /healthz``  : liveness — always 200 if the process can respond.
 - ``GET /readyz``   : readiness — 200 only if Postgres + Redis + Ollama are all
                      reachable; otherwise 503 with which dependencies are down.
                      In Phase 1 dev there is no Ollama, so 503 is expected.
 - ``GET /metrics``  : Prometheus exposition. (Loopback-only IP check deferred.)
 None of these endpoints require auth and none leak secrets or internal detail.
 """
 from __future__ import annotations
 from collections.abc import Awaitable
 from typing import Literal, cast
 import httpx
 import redis.asyncio as redis
 from fastapi import APIRouter, Request, Response, status
 from pydantic import BaseModel
 from sqlalchemy import text
 from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
 from neuronetz_gateway.observability.logging import get_logger
 from neuronetz_gateway.observability.metrics import CONTENT_TYPE_LATEST, render_latest
 router = APIRouter(tags=["health"])
 _log = get_logger("health")
 class HealthResponse(BaseModel):
    """Liveness response body: a single constant ``status`` field."""
    # Only the literal "ok" is representable, and it is also the default.
    status: Literal["ok"] = "ok"
 class ReadyResponse(BaseModel):
    """Readiness response body. ``checks`` maps dependency -> reachable bool."""
    # Overall verdict: "ready" or "not_ready".
    status: Literal["ready", "not_ready"]
    # Per-dependency result keyed by dependency name; True means reachable.
    checks: dict[str, bool]
@router.get("/healthz", response_model=HealthResponse, status_code=status.HTTP_200_OK)
 async def healthz() -> HealthResponse:
    """Liveness probe — always returns 200 while the process is responsive.
    Performs no dependency checks; the body is the default ``HealthResponse``.
    """
    return HealthResponse()
 async def _check_postgres(app_state: object) -> bool:
    """Report whether ``SELECT 1`` succeeds against Postgres.
    Returns ``False`` when no session factory is present on ``app_state`` or
    when opening the session or running the query raises; the failure is logged.
    """
    session_factory: async_sessionmaker[AsyncSession] | None = getattr(
        app_state, "db_sessionmaker", None
    )
    if session_factory is None:
        return False
    try:
        async with session_factory() as session:
            await session.execute(text("SELECT 1"))
    except Exception as exc:  # noqa: BLE001 - any failure means not ready
        _log.warning("readyz_postgres_unreachable", error=str(exc))
        return False
    return True
 async def _check_redis(app_state: object) -> bool:
    """Report whether Redis answers ``PING`` with a truthy reply.
    Returns ``False`` when no Redis client is present on ``app_state`` or when
    the ping raises; the failure is logged.
    """
    redis_client: redis.Redis | None = getattr(app_state, "redis", None)
    if redis_client is None:
        return False
    try:
        # The shared sync/async stubs type ping() as Awaitable[bool] | bool; the
        # cast tells the type checker that this call is awaited.
        pong = await cast("Awaitable[bool]", redis_client.ping())
        return bool(pong)
    except Exception as exc:  # noqa: BLE001 - any failure means not ready
        _log.warning("readyz_redis_unreachable", error=str(exc))
        return False
 async def _check_ollama(app_state: object) -> bool:
    """Report whether a ``GET /`` on the upstream client gets a non-5xx reply.
    Returns ``False`` when no httpx client is present on ``app_state`` or when
    the request raises; the failure is logged.
    """
    upstream: httpx.AsyncClient | None = getattr(app_state, "http_client", None)
    if upstream is None:
        return False
    try:
        reply = await upstream.get("/")
        reachable = reply.status_code < 500
        return reachable
    except Exception as exc:  # noqa: BLE001 - any failure means not ready
        _log.warning("readyz_ollama_unreachable", error=str(exc))
        return False
@router.get("/readyz", response_model=ReadyResponse)
 async def readyz(request: Request, response: Response) -> ReadyResponse:
    """Readiness probe — 200 only if every dependency is reachable, else 503."""
    app_state = request.app.state
    checks = {
        "postgres": await _check_postgres(app_state),
        "redis": await _check_redis(app_state),
        "ollama": await _check_ollama(app_state),
    }
    all_ready = all(checks.values())
    if not all_ready:
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
    return ReadyResponse(status="ready" if all_ready else "not_ready", checks=checks)
@router.get("/metrics")
 async def metrics() -> Response:
    """Prometheus exposition. Loopback-only enforcement is deferred to Phase 4.
    Returns the output of ``render_latest()`` with ``CONTENT_TYPE_LATEST`` as
    the media type; no caller check is performed here.
    """
    return Response(content=render_latest(), media_type=CONTENT_TYPE_LATEST)
 __all__ = ["router"]
		`@@ -0,0 +1,3 @@`
							`"""Database access layer: SQLAlchemy models, session factory, repositories."""`

							`from __future__ import annotations`
		`@@ -0,0 +1,3 @@`
							`"""Observability: structured logging and Prometheus metrics."""`

							`from __future__ import annotations`
		`@@ -0,0 +1,3 @@`
							`"""HTTP route modules: health, native Ollama passthrough, OpenAI-compat."""`

							`from __future__ import annotations`