From d79f17b3bb3075cf9f880cf0831580b566c987fe Mon Sep 17 00:00:00 2001
From: Stephan Berbig <skasdorf@gmail.com>
Date: Tue, 26 May 2026 20:50:35 +0200
Subject: [PATCH] scaffold: project skeleton, schema, healthz/readyz, CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Initial project structure for neuronetz-gateway per scope-docs/SPEC.md:

- Python 3.12 / FastAPI / SQLAlchemy 2.0 (async) / Redis / Postgres stack
  managed by uv. Multi-stage non-root Dockerfile, prod + dev compose files
  (ollama service is NEVER published in either), Caddyfile + systemd unit,
  justfile, GitHub Actions CI (ruff, mypy --strict, pytest, bandit, pip-audit).
- Pydantic-Settings config covering every env var from SPEC §7, including the
  MODEL_DISCOVERY_* keys for the dynamic-discovery feature (§4.6).
- Alembic 0001_initial creates the full gateway schema (8 tables, 3 enums,
  notify_key_revoked() trigger), incl. allow_all_models on tenant_limits and
  key_limits for the per-tenant auto-grant toggle.
- Working /healthz, /readyz (fail-closed when deps unreachable), and a
  Prometheus /metrics stub. Sanitizing error handlers that attach X-Request-ID
  to every response and never leak upstream internals.
- SPEC + AGENT_PROMPT included under scope-docs/ (source of truth).
---
 .dockerignore                                 |  44 ++
 .env.example                                  |  63 ++
 .github/workflows/ci.yml                      | 108 ++++
 .gitignore                                    |  40 ++
 Dockerfile                                    |  97 +++
 LICENSE                                       | 202 ++++++
 README.md                                     |  92 +++
 alembic.ini                                   |  49 ++
 alembic/env.py                                |  97 +++
 alembic/versions/0001_initial.py              | 342 ++++++++++
 docker-compose.dev.yml                        | 101 +++
 docker-compose.yml                            | 152 +++++
 justfile                                      |  60 ++
 ops/caddy/Caddyfile.example                   |  59 ++
 ops/systemd/neuronetz-gateway.service         |  58 ++
 pyproject.toml                                |  94 +++
 scope-docs/AGENT_PROMPT.md                    | 121 ++++
 scope-docs/SPEC.md                            | 593 ++++++++++++++++++
 src/neuronetz_gateway/__init__.py             |   7 +
 src/neuronetz_gateway/__main__.py             |  28 +
 src/neuronetz_gateway/app.py                  | 111 ++++
 src/neuronetz_gateway/config.py               |  86 +++
 src/neuronetz_gateway/db/__init__.py          |   3 +
 src/neuronetz_gateway/db/models.py            | 292 +++++++++
 src/neuronetz_gateway/db/session.py           |  53 ++
 src/neuronetz_gateway/deps.py                 | 180 ++++++
 src/neuronetz_gateway/errors.py               | 179 ++++++
 src/neuronetz_gateway/lifespan.py             | 131 ++++
 .../observability/__init__.py                 |   3 +
 .../observability/logging.py                  |  48 ++
 src/neuronetz_gateway/routes/__init__.py      |   3 +
 src/neuronetz_gateway/routes/health.py        | 114 ++++
 32 files changed, 3610 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .env.example
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 alembic.ini
 create mode 100644 alembic/env.py
 create mode 100644 alembic/versions/0001_initial.py
 create mode 100644 docker-compose.dev.yml
 create mode 100644 docker-compose.yml
 create mode 100644 justfile
 create mode 100644 ops/caddy/Caddyfile.example
 create mode 100644 ops/systemd/neuronetz-gateway.service
 create mode 100644 pyproject.toml
 create mode 100644 scope-docs/AGENT_PROMPT.md
 create mode 100644 scope-docs/SPEC.md
 create mode 100644 src/neuronetz_gateway/__init__.py
 create mode 100644 src/neuronetz_gateway/__main__.py
 create mode 100644 src/neuronetz_gateway/app.py
 create mode 100644 src/neuronetz_gateway/config.py
 create mode 100644 src/neuronetz_gateway/db/__init__.py
 create mode 100644 src/neuronetz_gateway/db/models.py
 create mode 100644 src/neuronetz_gateway/db/session.py
 create mode 100644 src/neuronetz_gateway/deps.py
 create mode 100644 src/neuronetz_gateway/errors.py
 create mode 100644 src/neuronetz_gateway/lifespan.py
 create mode 100644 src/neuronetz_gateway/observability/__init__.py
 create mode 100644 src/neuronetz_gateway/observability/logging.py
 create mode 100644 src/neuronetz_gateway/routes/__init__.py
 create mode 100644 src/neuronetz_gateway/routes/health.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..ce0bba4
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,44 @@
+# Keep the build context lean and never ship secrets into an image layer.
+
+# Secrets / local env
+.env
+.env.*
+!.env.example
+
+# VCS & CI
+.git
+.gitignore
+.github
+
+# Python caches & build artefacts
+__pycache__/
+*.py[cod]
+*.egg-info/
+.eggs/
+build/
+dist/
+.venv/
+venv/
+.mypy_cache/
+.ruff_cache/
+.pytest_cache/
+.coverage
+htmlcov/
+coverage.xml
+
+# Tests & docs are not needed in the runtime image
+tests/
+docs/
+scope-docs/
+
+# Editor / OS cruft
+.idea/
+.vscode/
+*.swp
+.DS_Store
+
+# Compose / ops files don't belong in the image
+docker-compose*.yml
+ops/
+# NOTE: README.md and LICENSE are intentionally NOT ignored — the build backend
+# (hatchling) reads `readme`/`license` from pyproject.toml at build time.
diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..825036d
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,63 @@
+# neuronetz-gateway — environment configuration (SPEC §7).
+#
+# Copy to `.env` and adjust. `.env` is gitignored and MUST NOT be committed.
+# All values here are SAFE EXAMPLES — change every secret before any real deploy.
+
+# ──────────────────────────── Service ────────────────────────────
+GATEWAY_BIND_HOST=0.0.0.0
+GATEWAY_BIND_PORT=8080
+GATEWAY_LOG_LEVEL=INFO
+GATEWAY_LOG_FORMAT=json                  # json|console
+GATEWAY_REQUEST_ID_HEADER=X-Request-ID
+GATEWAY_TRUSTED_PROXIES=127.0.0.1,caddy  # for X-Forwarded-For
+
+# ──────────────────────────── Upstream ───────────────────────────
+OLLAMA_BASE_URL=http://ollama:11434
+OLLAMA_CONNECT_TIMEOUT_S=5
+OLLAMA_READ_TIMEOUT_S=600
+OLLAMA_MAX_CONNECTIONS=64
+
+# ──────────────────────── Model discovery (§4.6) ─────────────────
+MODEL_DISCOVERY_REFRESH_S=60
+MODEL_DISCOVERY_CACHE_TTL_S=120
+
+# ──────────────────────────── Database ───────────────────────────
+# Compose builds DATABASE_URL from the POSTGRES_* parts below, but the gateway
+# also accepts a full DATABASE_URL directly.
+DATABASE_URL=postgresql+asyncpg://gateway:changeme@postgres:5432/neuronetz
+DATABASE_POOL_SIZE=10
+DATABASE_POOL_OVERFLOW=20
+
+# Postgres container credentials (consumed by docker-compose).
+POSTGRES_USER=gateway
+POSTGRES_PASSWORD=changeme
+POSTGRES_DB=neuronetz
+
+# ──────────────────────────── Redis ──────────────────────────────
+REDIS_URL=redis://redis:6379/0
+REDIS_KEY_CACHE_TTL_S=60
+
+# ────────────────── Limits (defaults; DB overrides) ──────────────
+DEFAULT_RPM=60
+DEFAULT_TPM=100000
+DEFAULT_CONCURRENT=8
+MAX_REQUEST_BODY_BYTES=262144
+MAX_NUM_PREDICT=4096
+
+# ──────────────────────────── Security ───────────────────────────
+ARGON2_TIME_COST=3
+ARGON2_MEMORY_COST_KIB=65536
+ARGON2_PARALLELISM=4
+AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN=20
+
+# ──────────────────────────── Audit ──────────────────────────────
+AUDIT_BUFFER_SIZE=1000
+PROMPT_LOG_DEFAULT_RETENTION_DAYS=30
+AUDIT_LOG_DEFAULT_RETENTION_DAYS=365
+
+# ──────────────── Playground / API docs (prod-safe: OFF) ─────────
+# Serve the playground HTML (owned by the docs agent) at /playground.
+PLAYGROUND_ENABLED=false
+PLAYGROUND_FILE=/app/playground/index.html
+# Enable FastAPI's /docs + /openapi.json (default off in production).
+DOCS_ENABLED=false
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..553aa6e
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,108 @@
+name: CI
+
+on:
+  push:
+    branches: ["**"]
+  pull_request:
+  workflow_dispatch:
+
+# Cancel superseded runs on the same ref.
+concurrency:
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  PYTHON_VERSION: "3.12"
+
+jobs:
+  lint:
+    name: ruff
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+      - name: Set up Python
+        run: uv python install ${{ env.PYTHON_VERSION }}
+      - name: Install dependencies
+        run: uv sync --extra dev
+      - name: ruff check
+        run: uv run ruff check .
+
+  typecheck:
+    name: mypy --strict
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+      - name: Set up Python
+        run: uv python install ${{ env.PYTHON_VERSION }}
+      - name: Install dependencies
+        run: uv sync --extra dev
+      - name: mypy
+        run: uv run mypy --strict src
+
+  test:
+    name: pytest
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+      - name: Set up Python
+        run: uv python install ${{ env.PYTHON_VERSION }}
+      - name: Install dependencies
+        run: uv sync --extra dev
+      # Phase 1: an empty/placeholder suite must pass. pytest exits 5 when it
+      # collects no tests; we treat that as success this phase. Coverage is
+      # reported but not gated yet (no --cov-fail-under until later phases).
+      - name: pytest
+        shell: bash
+        run: |
+          set +e
+          uv run pytest --cov=neuronetz_gateway --cov-report=term-missing
+          code=$?
+          if [ "$code" -eq 5 ]; then
+            echo "::notice::No tests collected (Phase 1) — treating as success."
+            exit 0
+          fi
+          exit "$code"
+
+  bandit:
+    name: bandit
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+      - name: Set up Python
+        run: uv python install ${{ env.PYTHON_VERSION }}
+      - name: Install dependencies
+        run: uv sync --extra dev
+      - name: bandit
+        run: uv run bandit -q -r src
+
+  pip-audit:
+    name: pip-audit
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          enable-cache: true
+      - name: Set up Python
+        run: uv python install ${{ env.PYTHON_VERSION }}
+      - name: Install dependencies
+        run: uv sync --extra dev
+      - name: pip-audit
+        run: uv run pip-audit
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4024058
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,40 @@
+# Secrets — NEVER commit. Only .env.example is tracked.
+.env
+.env.*
+!.env.example
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.egg-info/
+.eggs/
+build/
+dist/
+*.so
+
+# Virtualenvs / uv
+.venv/
+venv/
+.python-version
+
+# Type / lint / test caches
+.mypy_cache/
+.ruff_cache/
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+coverage.xml
+.tox/
+
+# Docker
+*.pid
+
+# Editor / OS
+.idea/
+.vscode/
+*.swp
+*~
+.DS_Store
+Thumbs.db
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..fd5a71e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,97 @@
+# syntax=docker/dockerfile:1.7
+#
+# neuronetz-gateway — multi-stage image.
+#
+#   builder stage : installs dependencies into a self-contained virtualenv using uv.
+#   runtime stage : copies the venv + source, drops to a NON-ROOT user, contains
+#                   no build tools, and runs `python -m neuronetz_gateway`.
+#
+# uv is pulled from the official distroless image so we don't need network access
+# to `pip install uv`. Dependencies come from pyproject.toml (+ uv.lock if present).
+
+# ----------------------------------------------------------------------------
+# Stage 1 — builder
+# ----------------------------------------------------------------------------
+FROM python:3.12-slim AS builder
+
+# Bring in the `uv` binary from its official image.
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
+
+ENV UV_LINK_MODE=copy \
+    UV_COMPILE_BYTECODE=1 \
+    UV_PYTHON_DOWNLOADS=never \
+    # Create the project venv at a stable, copyable location.
+    VIRTUAL_ENV=/opt/venv \
+    PATH=/opt/venv/bin:$PATH
+
+WORKDIR /app
+
+# Create the target virtualenv up front so uv installs into it.
+RUN uv venv /opt/venv
+
+# Dependency layer: copy only the manifest(s) first for better caching.
+# uv.lock is optional in Phase 1 — the wildcard makes COPY succeed either way.
+COPY pyproject.toml ./
+COPY uv.loc[k] ./
+
+# Install dependencies. If a lockfile is present `uv sync` honours it; otherwise
+# we fall back to resolving straight from pyproject.toml. Either way the build
+# does NOT fail when the lock is absent.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    if [ -f uv.lock ]; then \
+        uv sync --frozen --no-install-project --no-dev ; \
+    else \
+        uv pip install --python /opt/venv/bin/python -r pyproject.toml ; \
+    fi
+
+# Now copy the application source and install the project itself into the venv.
+# README.md + LICENSE are required by the build backend (pyproject `readme`/license).
+COPY README.md LICENSE ./
+COPY src ./src
+COPY alembi[c] ./alembic
+COPY alembic.in[i] ./
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --python /opt/venv/bin/python --no-deps .
+
+# ----------------------------------------------------------------------------
+# Stage 2 — runtime
+# ----------------------------------------------------------------------------
+FROM python:3.12-slim AS runtime
+
+# Runtime-only OS packages: curl is used by the compose healthcheck.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Non-root user.
+RUN groupadd --system --gid 10001 gateway \
+    && useradd --system --uid 10001 --gid gateway --home-dir /app --shell /usr/sbin/nologin gateway
+
+ENV VIRTUAL_ENV=/opt/venv \
+    PATH=/opt/venv/bin:$PATH \
+    PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    GATEWAY_BIND_HOST=0.0.0.0 \
+    GATEWAY_BIND_PORT=8080
+
+WORKDIR /app
+
+# Copy the fully-populated virtualenv and the application from the builder.
+COPY --from=builder /opt/venv /opt/venv
+COPY --from=builder /app/src ./src
+# alembic assets are optional during early scaffolding; copy if present.
+COPY --from=builder /app/alembi[c] ./alembic
+COPY --from=builder /app/alembic.in[i] ./
+
+# Drop privileges. No build tools are present in this stage.
+USER gateway
+
+EXPOSE 8080
+
+# Liveness probe target lives at /healthz (see SPEC §6.4).
+HEALTHCHECK --interval=15s --timeout=3s --start-period=20s --retries=5 \
+    CMD curl -fsS "http://127.0.0.1:${GATEWAY_BIND_PORT}/healthz" || exit 1
+
+# Default command: run the server. Compose overrides this in dev to run
+# `alembic upgrade head` first (see docker-compose.dev.yml).
+CMD ["python", "-m", "neuronetz_gateway"]
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..44f5372
--- /dev/null
+++ b/README.md
@@ -0,0 +1,92 @@
+# neuronetz-gateway
+
+A secure, multi-tenant API gateway in front of an [Ollama](https://github.com/ollama/ollama)
+instance. It is the hot path of the Neuronetz API: every request to the models flows
+through here, authenticated, rate-limited, budgeted, and audited.
+
+**The Ollama backend is never reachable from the public internet.** It is bound to an
+internal Docker network with no published ports. All access is via this gateway, behind
+TLS terminated by Caddy.
+
+> Status: **v0.1.0 — in development.** See [`scope-docs/SPEC.md`](scope-docs/SPEC.md) for
+> the full specification and [`scope-docs/AGENT_PROMPT.md`](scope-docs/AGENT_PROMPT.md) for
+> the phased build plan. `SPEC.md` is the source of truth.
+
+## What it does
+
+- **Auth** — API keys as Bearer tokens, stored as Argon2id hashes, verified in constant time.
+- **Multi-tenant** — tenants own keys; limits and budgets inherit tenant → key.
+- **Rate limiting** — per-key and per-tenant RPM / TPM / concurrent connections.
+- **Budgets** — daily / monthly / total token budgets, enforced fail-closed.
+- **Dual API surface** — native Ollama (`/api/*`) and OpenAI-compatible (`/v1/*`), both streaming.
+- **Hard-blocked mutations** — `/api/pull`, `/api/push`, `/api/create`, `/api/copy`,
+  `/api/delete`, `/api/blobs/*` always return 403. Not configurable.
+- **Audit log** — always-on request metadata; opt-in, TTL'd prompt logging per key.
+
+Administration (dashboards, tenant self-service) lives in a separate service,
+`neuronetz-console`; it is **not** part of this repository.
+
+## Architecture
+
+```
+Internet ──TLS──> Caddy ──HTTP──> gateway ──┬──> Postgres   (keys, budgets, audit)
+                                            ├──> Redis      (key cache, rate limits)
+                                            └──> Ollama     (internal network only)
+```
+
+## Quickstart (dev)
+
+Requires Docker + Docker Compose. The dev stack runs Postgres, Redis, and the gateway —
+**no Caddy and no Ollama** (so `/readyz` reports 503 until a real Ollama backend is wired
+in; that is expected).
+
+```bash
+git clone <repo> neuronetz-gateway && cd neuronetz-gateway
+cp .env.example .env          # adjust if you like; defaults work for local dev
+docker compose -f docker-compose.dev.yml up --build
+```
+
+The gateway runs `alembic upgrade head` on startup, then serves on `http://localhost:8080`.
+
+```bash
+curl -i http://localhost:8080/healthz   # -> 200  {"status":"ok"}
+curl -i http://localhost:8080/readyz    # -> 503  (no Ollama backend in the dev stack)
+```
+
+## Production
+
+`docker-compose.yml` brings up the full stack — Caddy (TLS via Let's Encrypt for
+`api.neuronetz.ai`), the gateway, Postgres, Redis, and Ollama. The `ollama` service has
+**no `ports:` mapping** and is reachable only on the internal Docker network. See
+[`docs/DEPLOYMENT.md`](docs/DEPLOYMENT.md) (added in a later phase) and
+[`ops/caddy/Caddyfile.example`](ops/caddy/Caddyfile.example).
+
+## Managing tenants and keys
+
+Use the bootstrap CLI (Typer). Keys have the form `nz_<prefix><secret>`; the full key is
+printed exactly once at creation and only its Argon2id hash is stored.
+
+```bash
+neuronetz-gateway create-tenant --name acme
+neuronetz-gateway create-key   --tenant acme --name prod-server-1
+neuronetz-gateway list-keys    --tenant acme
+neuronetz-gateway revoke-key   --prefix nz_abc12345
+```
+
+## Development
+
+```bash
+just dev          # run the dev stack
+just test         # pytest + coverage
+just lint         # ruff
+just typecheck    # mypy --strict
+just migrate      # alembic upgrade head
+```
+
+Tooling: Python 3.12, `uv`, FastAPI + uvicorn, SQLAlchemy 2.0 (async) + asyncpg, Redis,
+httpx, structlog, Pydantic. Lint/type/security gates: ruff, mypy `--strict`, bandit,
+pip-audit.
+
+## License
+
+Apache 2.0 — see [`LICENSE`](LICENSE). Owner: Stephan Berbig / Neuronetz.
diff --git a/alembic.ini b/alembic.ini
new file mode 100644
index 0000000..44ee442
--- /dev/null
+++ b/alembic.ini
@@ -0,0 +1,49 @@
+# Alembic configuration for neuronetz-gateway.
+# The database URL is read from the DATABASE_URL environment variable in
+# alembic/env.py (do not hardcode credentials here).
+
+[alembic]
+script_location = alembic
+prepend_sys_path = src
+version_path_separator = os
+# version_locations defaults to alembic/versions
+
+# DATABASE_URL is injected at runtime; this placeholder is never used directly.
+sqlalchemy.url = driver://user:pass@localhost/dbname
+
+[post_write_hooks]
+# (none)
+
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARNING
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARNING
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/alembic/env.py b/alembic/env.py
new file mode 100644
index 0000000..8ab4c24
--- /dev/null
+++ b/alembic/env.py
@@ -0,0 +1,97 @@
+"""Alembic environment for neuronetz-gateway (async engine).
+
+Reads ``DATABASE_URL`` from the environment (the same value the app uses,
+``postgresql+asyncpg://...``). Ensures schema ``gateway`` exists and pins the
+Alembic version table into that schema so migration bookkeeping never collides
+with the ``console`` schema in the shared database.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+from logging.config import fileConfig
+
+from alembic import context
+from sqlalchemy import pool, text
+from sqlalchemy.engine import Connection
+from sqlalchemy.ext.asyncio import async_engine_from_config
+
+from neuronetz_gateway.config import get_settings
+from neuronetz_gateway.db.models import GATEWAY_SCHEMA, Base
+
+config = context.config
+
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+target_metadata = Base.metadata
+
+
+def _database_url() -> str:
+    """Resolve the async database URL from env, falling back to settings."""
+    return os.environ.get("DATABASE_URL") or get_settings().database_url
+
+
+def _configure_context(connection: Connection) -> None:
+    """Configure migration context with the gateway schema + version table."""
+    context.configure(
+        connection=connection,
+        target_metadata=target_metadata,
+        version_table="alembic_version",
+        version_table_schema=GATEWAY_SCHEMA,
+        include_schemas=True,
+        compare_type=True,
+    )
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode (emit SQL without a DBAPI connection)."""
+    context.configure(
+        url=_database_url(),
+        target_metadata=target_metadata,
+        literal_binds=True,
+        dialect_opts={"paramstyle": "named"},
+        version_table="alembic_version",
+        version_table_schema=GATEWAY_SCHEMA,
+        include_schemas=True,
+    )
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def _do_run_migrations(connection: Connection) -> None:
+    """Ensure the schema exists, then run migrations within a transaction.
+
+    The ``CREATE SCHEMA`` is committed in its own transaction before configuring
+    Alembic. Under SQLAlchemy 2.0, ``execute()`` auto-begins a transaction; if it
+    were left open, Alembic's ``begin_transaction()`` would treat the connection as
+    caller-managed and become a no-op that never commits, so the whole migration
+    (and the schema) would be rolled back on connection close. Committing here
+    leaves the connection clean so Alembic owns — and commits — its own transaction.
+    """
+    connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{GATEWAY_SCHEMA}"'))
+    connection.commit()
+    _configure_context(connection)
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+async def run_migrations_online() -> None:
+    """Run migrations in 'online' mode using an async engine."""
+    configuration = config.get_section(config.config_ini_section) or {}
+    configuration["sqlalchemy.url"] = _database_url()
+    connectable = async_engine_from_config(
+        configuration,
+        prefix="sqlalchemy.",
+        poolclass=pool.NullPool,
+    )
+    async with connectable.connect() as connection:
+        await connection.run_sync(_do_run_migrations)
+    await connectable.dispose()
+
+
+if context.is_offline_mode():
+    run_migrations_offline()
+else:
+    asyncio.run(run_migrations_online())
diff --git a/alembic/versions/0001_initial.py b/alembic/versions/0001_initial.py
new file mode 100644
index 0000000..0faf866
--- /dev/null
+++ b/alembic/versions/0001_initial.py
@@ -0,0 +1,342 @@
+"""initial gateway schema
+
+Creates schema ``gateway``, the three enum types, all tables and indexes, and
+the ``notify_key_revoked()`` function plus ``trg_notify_key_revoked`` trigger,
+matching SPEC §5 verbatim in structure.
+
+Revision ID: 0001_initial
+Revises:
+Create Date: 2026-05-22
+
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision: str = "0001_initial"
+down_revision: str | None = None
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+SCHEMA = "gateway"
+
+# Enum types are created explicitly via raw SQL below; the table columns
+# reference them with create_type=False so they are not created twice.
+_key_status = postgresql.ENUM(
+    "active", "disabled", "revoked", name="key_status", schema=SCHEMA, create_type=False
+)
+_tenant_status = postgresql.ENUM(
+    "active", "suspended", "closed", name="tenant_status", schema=SCHEMA, create_type=False
+)
+_budget_period = postgresql.ENUM(
+    "day", "month", "total", name="budget_period", schema=SCHEMA, create_type=False
+)
+
+
+def upgrade() -> None:
+    """Create the full ``gateway`` schema."""
+    op.execute(f'CREATE SCHEMA IF NOT EXISTS "{SCHEMA}"')
+
+    # --- Enum types (SPEC §5) ---
+    op.execute("CREATE TYPE gateway.key_status AS ENUM ('active', 'disabled', 'revoked')")
+    op.execute("CREATE TYPE gateway.tenant_status AS ENUM ('active', 'suspended', 'closed')")
+    op.execute("CREATE TYPE gateway.budget_period AS ENUM ('day', 'month', 'total')")
+
+    # --- tenants ---
+    op.create_table(
+        "tenants",
+        sa.Column(
+            "id",
+            postgresql.UUID(as_uuid=True),
+            primary_key=True,
+            server_default=sa.text("gen_random_uuid()"),
+        ),
+        sa.Column("name", sa.Text(), nullable=False, unique=True),
+        sa.Column(
+            "status", _tenant_status, nullable=False, server_default=sa.text("'active'")
+        ),
+        sa.Column(
+            "created_at",
+            postgresql.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("now()"),
+        ),
+        sa.Column(
+            "metadata",
+            postgresql.JSONB(),
+            nullable=False,
+            server_default=sa.text("'{}'::jsonb"),
+        ),
+        schema=SCHEMA,
+    )
+
+    # --- tenant_limits ---
+    op.create_table(
+        "tenant_limits",
+        sa.Column(
+            "tenant_id",
+            postgresql.UUID(as_uuid=True),
+            sa.ForeignKey(f"{SCHEMA}.tenants.id", ondelete="CASCADE"),
+            primary_key=True,
+        ),
+        sa.Column("rpm", sa.Integer(), nullable=False, server_default=sa.text("60")),
+        sa.Column("tpm", sa.Integer(), nullable=False, server_default=sa.text("100000")),
+        sa.Column("concurrent", sa.Integer(), nullable=False, server_default=sa.text("8")),
+        sa.Column("tokens_daily", sa.BigInteger(), nullable=True),
+        sa.Column("tokens_monthly", sa.BigInteger(), nullable=True),
+        sa.Column("tokens_total", sa.BigInteger(), nullable=True),
+        sa.Column(
+            "allowed_models",
+            postgresql.ARRAY(sa.Text()),
+            nullable=False,
+            server_default=sa.text("'{}'"),
+        ),
+        sa.Column(
+            "allow_all_models",
+            sa.Boolean(),
+            nullable=False,
+            server_default=sa.text("false"),
+        ),
+        sa.Column(
+            "log_prompts_default",
+            sa.Boolean(),
+            nullable=False,
+            server_default=sa.text("false"),
+        ),
+        sa.Column(
+            "prompt_retention_days", sa.Integer(), nullable=False, server_default=sa.text("30")
+        ),
+        sa.Column(
+            "audit_retention_days", sa.Integer(), nullable=False, server_default=sa.text("365")
+        ),
+        schema=SCHEMA,
+    )
+
+    # --- api_keys ---
+    op.create_table(
+        "api_keys",
+        sa.Column(
+            "id",
+            postgresql.UUID(as_uuid=True),
+            primary_key=True,
+            server_default=sa.text("gen_random_uuid()"),
+        ),
+        sa.Column(
+            "tenant_id",
+            postgresql.UUID(as_uuid=True),
+            sa.ForeignKey(f"{SCHEMA}.tenants.id", ondelete="CASCADE"),
+            nullable=False,
+        ),
+        sa.Column("prefix", sa.Text(), nullable=False, unique=True),
+        sa.Column("key_hash", sa.Text(), nullable=False),
+        sa.Column("name", sa.Text(), nullable=False),
+        sa.Column("status", _key_status, nullable=False, server_default=sa.text("'active'")),
+        sa.Column(
+            "scopes",
+            postgresql.ARRAY(sa.Text()),
+            nullable=False,
+            server_default=sa.text("'{chat,embeddings}'"),
+        ),
+        sa.Column(
+            "created_at",
+            postgresql.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("now()"),
+        ),
+        sa.Column("last_used_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
+        sa.Column("expires_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
+        sa.Column("log_prompts", sa.Boolean(), nullable=True),
+        sa.Column(
+            "metadata",
+            postgresql.JSONB(),
+            nullable=False,
+            server_default=sa.text("'{}'::jsonb"),
+        ),
+        schema=SCHEMA,
+    )
+    op.create_index(
+        "idx_api_keys_prefix",
+        "api_keys",
+        ["prefix"],
+        schema=SCHEMA,
+        postgresql_where=sa.text("status = 'active'"),
+    )
+    op.create_index("idx_api_keys_tenant", "api_keys", ["tenant_id"], schema=SCHEMA)
+
+    # --- key_limits ---
+    op.create_table(
+        "key_limits",
+        sa.Column(
+            "key_id",
+            postgresql.UUID(as_uuid=True),
+            sa.ForeignKey(f"{SCHEMA}.api_keys.id", ondelete="CASCADE"),
+            primary_key=True,
+        ),
+        sa.Column("rpm", sa.Integer(), nullable=True),
+        sa.Column("tpm", sa.Integer(), nullable=True),
+        sa.Column("concurrent", sa.Integer(), nullable=True),
+        sa.Column("tokens_daily", sa.BigInteger(), nullable=True),
+        sa.Column("tokens_monthly", sa.BigInteger(), nullable=True),
+        sa.Column("tokens_total", sa.BigInteger(), nullable=True),
+        sa.Column("allowed_models", postgresql.ARRAY(sa.Text()), nullable=True),
+        sa.Column("allow_all_models", sa.Boolean(), nullable=True),
+        schema=SCHEMA,
+    )
+
+    # --- budget_usage ---
+    op.create_table(
+        "budget_usage",
+        sa.Column(
+            "key_id",
+            postgresql.UUID(as_uuid=True),
+            sa.ForeignKey(f"{SCHEMA}.api_keys.id", ondelete="CASCADE"),
+            primary_key=True,
+            nullable=False,
+        ),
+        sa.Column("period", _budget_period, primary_key=True, nullable=False),
+        sa.Column(
+            "period_start",
+            postgresql.TIMESTAMP(timezone=True),
+            primary_key=True,
+            nullable=False,
+        ),
+        sa.Column("tokens_in", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
+        sa.Column("tokens_out", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
+        sa.Column("requests", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
+        schema=SCHEMA,
+    )
+    op.create_index(
+        "idx_budget_usage_period",
+        "budget_usage",
+        ["period", "period_start"],
+        schema=SCHEMA,
+    )
+
+    # --- audit_log ---
+    op.create_table(
+        "audit_log",
+        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
+        sa.Column(
+            "ts",
+            postgresql.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("now()"),
+        ),
+        sa.Column("request_id", postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column("tenant_id", postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=True),
+        sa.Column("key_prefix", sa.Text(), nullable=True),
+        sa.Column("method", sa.Text(), nullable=False),
+        sa.Column("path", sa.Text(), nullable=False),
+        sa.Column("model", sa.Text(), nullable=True),
+        sa.Column("tokens_in", sa.Integer(), nullable=True),
+        sa.Column("tokens_out", sa.Integer(), nullable=True),
+        sa.Column("latency_ms", sa.Integer(), nullable=True),
+        sa.Column("status", sa.Integer(), nullable=False),
+        sa.Column("client_ip", postgresql.INET(), nullable=True),
+        sa.Column("user_agent", sa.Text(), nullable=True),
+        sa.Column("error_code", sa.Text(), nullable=True),
+        schema=SCHEMA,
+    )
+    op.create_index("idx_audit_ts", "audit_log", ["ts"], schema=SCHEMA)
+    op.create_index("idx_audit_tenant_ts", "audit_log", ["tenant_id", "ts"], schema=SCHEMA)
+    op.create_index("idx_audit_key_ts", "audit_log", ["key_id", "ts"], schema=SCHEMA)
+
+    # --- prompt_log ---
+    op.create_table(
+        "prompt_log",
+        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
+        sa.Column(
+            "audit_id",
+            sa.BigInteger(),
+            sa.ForeignKey(f"{SCHEMA}.audit_log.id", ondelete="CASCADE"),
+            nullable=False,
+        ),
+        sa.Column(
+            "ts",
+            postgresql.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("now()"),
+        ),
+        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column("request_body", postgresql.JSONB(), nullable=False),
+        sa.Column("response_text", sa.Text(), nullable=True),
+        sa.Column("retention_until", postgresql.TIMESTAMP(timezone=True), nullable=False),
+        schema=SCHEMA,
+    )
+    op.create_index(
+        "idx_prompt_log_retention", "prompt_log", ["retention_until"], schema=SCHEMA
+    )
+
+    # --- revocations ---
+    op.create_table(
+        "revocations",
+        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
+        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column(
+            "ts",
+            postgresql.TIMESTAMP(timezone=True),
+            nullable=False,
+            server_default=sa.text("now()"),
+        ),
+        sa.Column("reason", sa.Text(), nullable=True),
+        sa.Column("processed_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
+        schema=SCHEMA,
+    )
+
+    # --- NOTIFY trigger on revocation insert (SPEC §5) ---
+    op.execute(
+        """
+        CREATE OR REPLACE FUNCTION gateway.notify_key_revoked() RETURNS trigger AS $$
+        BEGIN
+            PERFORM pg_notify('key_revoked', NEW.key_id::text);
+            RETURN NEW;
+        END;
+        $$ LANGUAGE plpgsql;
+        """
+    )
+    op.execute(
+        """
+        CREATE TRIGGER trg_notify_key_revoked
+            AFTER INSERT ON gateway.revocations
+            FOR EACH ROW EXECUTE FUNCTION gateway.notify_key_revoked();
+        """
+    )
+
+
+def downgrade() -> None:
+    """Drop the entire ``gateway`` schema and its objects."""
+    op.execute("DROP TRIGGER IF EXISTS trg_notify_key_revoked ON gateway.revocations")
+    op.execute("DROP FUNCTION IF EXISTS gateway.notify_key_revoked()")
+
+    op.drop_index("idx_prompt_log_retention", table_name="prompt_log", schema=SCHEMA)
+    op.drop_table("prompt_log", schema=SCHEMA)
+
+    op.drop_index("idx_audit_key_ts", table_name="audit_log", schema=SCHEMA)
+    op.drop_index("idx_audit_tenant_ts", table_name="audit_log", schema=SCHEMA)
+    op.drop_index("idx_audit_ts", table_name="audit_log", schema=SCHEMA)
+    op.drop_table("audit_log", schema=SCHEMA)
+
+    op.drop_index("idx_budget_usage_period", table_name="budget_usage", schema=SCHEMA)
+    op.drop_table("budget_usage", schema=SCHEMA)
+
+    op.drop_table("key_limits", schema=SCHEMA)
+
+    op.drop_index("idx_api_keys_tenant", table_name="api_keys", schema=SCHEMA)
+    op.drop_index("idx_api_keys_prefix", table_name="api_keys", schema=SCHEMA)
+    op.drop_table("api_keys", schema=SCHEMA)
+
+    op.drop_table("tenant_limits", schema=SCHEMA)
+    op.drop_table("tenants", schema=SCHEMA)
+
+    op.execute("DROP TYPE IF EXISTS gateway.budget_period")
+    op.execute("DROP TYPE IF EXISTS gateway.tenant_status")
+    op.execute("DROP TYPE IF EXISTS gateway.key_status")
+
+    op.execute(f'DROP SCHEMA IF EXISTS "{SCHEMA}"')
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
new file mode 100644
index 0000000..a73ce4d
--- /dev/null
+++ b/docker-compose.dev.yml
@@ -0,0 +1,101 @@
+# neuronetz-gateway — DEV stack (postgres + redis + gateway only).
+#
+# Deliberately differs from the production stack:
+#   * NO caddy   — the gateway is published directly on localhost:8080.
+#   * NO ollama  — Phase 1 expects /readyz to return 503 *because* there is no
+#                  Ollama backend yet. This is the intended exit-criterion state.
+#
+# Bring it up with:
+#   docker compose -f docker-compose.dev.yml up --build
+#
+# Then:
+#   curl -i http://localhost:8080/healthz   # -> 200
+#   curl -i http://localhost:8080/readyz    # -> 503 (no Ollama)
+#
+# The gateway container runs `alembic upgrade head` and then starts the server.
+
+services:
+  gateway:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    restart: unless-stopped
+    ports:
+      - "127.0.0.1:8080:8080"
+    environment:
+      GATEWAY_BIND_HOST: 0.0.0.0
+      GATEWAY_BIND_PORT: "8080"
+      GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
+      GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console}
+      GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
+      GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1}
+      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz}
+      DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
+      DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
+      REDIS_URL: redis://redis:6379/0
+      REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
+      # No Ollama in the dev stack — point at the (absent) service name so the
+      # readiness check fails closed with 503, exactly as Phase 1 expects.
+      OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://ollama:11434}
+      OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
+      OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
+      OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
+      DEFAULT_RPM: ${DEFAULT_RPM:-60}
+      DEFAULT_TPM: ${DEFAULT_TPM:-100000}
+      DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
+      MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
+      MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
+      ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
+      ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
+      ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
+      AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
+      AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
+      PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
+      AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+    # Run migrations, then start the server.
+    command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
+      interval: 10s
+      timeout: 3s
+      retries: 5
+      start_period: 30s
+
+  postgres:
+    image: postgres:16-alpine
+    restart: unless-stopped
+    environment:
+      POSTGRES_USER: ${POSTGRES_USER:-gateway}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway}
+      POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
+    ports:
+      # Exposed on localhost for dev convenience (psql, migrations from host).
+      - "127.0.0.1:5432:5432"
+    volumes:
+      - postgres_dev_data:/var/lib/postgresql/data
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
+      interval: 5s
+      timeout: 3s
+      retries: 10
+
+  redis:
+    image: redis:7-alpine
+    restart: unless-stopped
+    command: ["redis-server", "--save", "", "--appendonly", "no"]
+    ports:
+      # Exposed on localhost for dev convenience (redis-cli from host).
+      - "127.0.0.1:6379:6379"
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 10
+
+volumes:
+  postgres_dev_data:
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..6a2d0ae
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,152 @@
+# neuronetz-gateway — FULL production stack (SPEC §4.1).
+#
+#   Internet ──TLS──▶ caddy ──HTTP/1.1 internal──▶ gateway ──▶ postgres / redis / ollama
+#
+# Only Caddy publishes ports to the host. The gateway is reachable solely through
+# Caddy on the internal network. Postgres, Redis and (critically) Ollama are NOT
+# published to the host at all.
+#
+#  ┌─────────────────────────────────────────────────────────────────────────┐
+#  │ SECURITY NON-NEGOTIABLE:                                                  │
+#  │   The `ollama` service has NO `ports:` mapping and MUST NEVER get one.    │
+#  │   Ollama is reachable only on the internal Docker network via the         │
+#  │   service name `ollama:11434`. Publishing it would re-open the exact      │
+#  │   unauthenticated exposure this whole project exists to close.            │
+#  └─────────────────────────────────────────────────────────────────────────┘
+#
+# Copy `.env.example` to `.env` and adjust before running:
+#   docker compose up -d --build
+
+services:
+  caddy:
+    image: caddy:2-alpine
+    restart: unless-stopped
+    depends_on:
+      gateway:
+        condition: service_healthy
+    ports:
+      - "80:80"
+      - "443:443"
+      - "443:443/udp"   # HTTP/3
+    volumes:
+      - ./ops/caddy/Caddyfile.example:/etc/caddy/Caddyfile:ro
+      - caddy_data:/data
+      - caddy_config:/config
+    networks:
+      - edge
+      - internal
+
+  gateway:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    restart: unless-stopped
+    # NOTE: deliberately NO `ports:` — the gateway is internal-only and is
+    # reached exclusively through Caddy.
+    expose:
+      - "8080"
+    environment:
+      GATEWAY_BIND_HOST: 0.0.0.0
+      GATEWAY_BIND_PORT: "8080"
+      GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
+      GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-json}
+      GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
+      GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1,caddy}
+      # Service-name addressing on the internal network.
+      DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-changeme}@postgres:5432/${POSTGRES_DB:-neuronetz}
+      DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
+      DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
+      REDIS_URL: redis://redis:6379/0
+      REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
+      OLLAMA_BASE_URL: http://ollama:11434
+      OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
+      OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
+      OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
+      DEFAULT_RPM: ${DEFAULT_RPM:-60}
+      DEFAULT_TPM: ${DEFAULT_TPM:-100000}
+      DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
+      MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
+      MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
+      ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
+      ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
+      ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
+      AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
+      AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
+      PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
+      AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
+    depends_on:
+      postgres:
+        condition: service_healthy
+      redis:
+        condition: service_healthy
+      ollama:
+        condition: service_started
+    # Apply migrations, then start the server.
+    command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
+    healthcheck:
+      test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
+      interval: 15s
+      timeout: 3s
+      retries: 5
+      start_period: 30s
+    networks:
+      - internal
+
+  postgres:
+    image: postgres:16-alpine
+    restart: unless-stopped
+    environment:
+      POSTGRES_USER: ${POSTGRES_USER:-gateway}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-changeme}
+      POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+    # No `ports:` — Postgres is internal-only.
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
+      interval: 5s
+      timeout: 3s
+      retries: 10
+    networks:
+      - internal
+
+  redis:
+    image: redis:7-alpine
+    restart: unless-stopped
+    command: ["redis-server", "--save", "", "--appendonly", "no"]
+    # No `ports:` — Redis is internal-only.
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 5s
+      timeout: 3s
+      retries: 10
+    networks:
+      - internal
+
+  # ───────────────────────────────────────────────────────────────────────────
+  # Ollama — INTERNAL NETWORK ONLY. DO NOT ADD A `ports:` MAPPING.
+  # Reachable only as `http://ollama:11434` from the gateway container.
+  # ───────────────────────────────────────────────────────────────────────────
+  ollama:
+    image: ollama/ollama:latest
+    restart: unless-stopped
+    # !!! NO `ports:` — never publish Ollama to the host or the internet. !!!
+    volumes:
+      - ollama_data:/root/.ollama
+    networks:
+      - internal
+
+networks:
+  # Public-facing network: only Caddy is attached alongside `internal`.
+  edge:
+    driver: bridge
+  # Private network for inter-service traffic; not reachable from the host.
+  internal:
+    driver: bridge
+    internal: false
+
+volumes:
+  postgres_data:
+  ollama_data:
+  caddy_data:
+  caddy_config:
diff --git a/justfile b/justfile
new file mode 100644
index 0000000..8194f81
--- /dev/null
+++ b/justfile
@@ -0,0 +1,60 @@
+# neuronetz-gateway — task runner.
+#
+# Requires `just` (https://github.com/casey/just) and `uv`
+# (https://github.com/astral-sh/uv) on the host.
+#
+#   just            # list available targets
+#   just dev        # run postgres + redis + gateway locally (dev stack)
+#   just test       # run the test suite with coverage
+#   just lint       # ruff check
+#   just typecheck  # mypy --strict
+#   just migrate    # apply alembic migrations against DATABASE_URL
+
+set dotenv-load := true
+
+# uv runs commands inside the project's managed environment.
+uv := "uv"
+
+# Show the list of targets (default).
+default:
+    @just --list
+
+# Sync dependencies into the local uv-managed virtualenv (incl. dev extras).
+install:
+    {{uv}} sync --extra dev
+
+# Run the dev stack: postgres + redis + gateway (no caddy, no ollama).
+dev:
+    docker compose -f docker-compose.dev.yml up --build
+
+# Run the test suite with coverage.
+test:
+    {{uv}} run pytest
+
+# Lint with ruff.
+lint:
+    {{uv}} run ruff check .
+
+# Static type checking (strict).
+typecheck:
+    {{uv}} run mypy --strict src
+
+# Apply database migrations to head.
+migrate:
+    {{uv}} run alembic upgrade head
+
+# Security lint.
+bandit:
+    {{uv}} run bandit -q -r src
+
+# Dependency vulnerability audit.
+audit:
+    {{uv}} run pip-audit
+
+# Bring the FULL production stack up (caddy + gateway + postgres + redis + ollama).
+compose-up:
+    docker compose up -d --build
+
+# Tear the production stack down.
+compose-down:
+    docker compose down
diff --git a/ops/caddy/Caddyfile.example b/ops/caddy/Caddyfile.example
new file mode 100644
index 0000000..75fead8
--- /dev/null
+++ b/ops/caddy/Caddyfile.example
@@ -0,0 +1,59 @@
+# neuronetz-gateway — Caddy reverse proxy (SPEC §4.1, §6.5).
+#
+# Caddy is the only public-facing component. It terminates TLS (HTTP/2 + HTTP/3),
+# obtains a Let's Encrypt certificate for api.neuronetz.ai automatically, applies
+# security headers, and reverse-proxies to the internal-only gateway:8080.
+#
+# Copy this file to `Caddyfile` and edit the site address / admin email.
+# The production docker-compose.yml mounts it at /etc/caddy/Caddyfile.
+
+{
+	# Email for Let's Encrypt account + expiry notices. Replace before deploy.
+	email ops@neuronetz.ai
+}
+
+api.neuronetz.ai {
+	# --- Reverse proxy to the internal gateway ---
+	# `gateway` is the Docker service name on the internal network; it is never
+	# published to the host. Caddy forwards plain HTTP/1.1 to it.
+	reverse_proxy gateway:8080
+
+	# --- Security headers ---
+	header {
+		# HSTS: force HTTPS for two years, include subdomains, allow preload.
+		Strict-Transport-Security "max-age=63072000; includeSubDomains; preload"
+		# Disable MIME sniffing.
+		X-Content-Type-Options "nosniff"
+		# Clickjacking defense (API has no UI, deny framing outright).
+		X-Frame-Options "DENY"
+		# Conservative referrer policy.
+		Referrer-Policy "no-referrer"
+		# Strip server-identifying headers so we don't advertise the stack.
+		-Server
+		-X-Powered-By
+	}
+
+	# Structured access logs to stdout (collected by the container runtime).
+	log {
+		output stdout
+		format json
+	}
+}
+
+# ─────────────────────────────────────────────────────────────────────────────
+# DEV / LOCAL note:
+#
+# For local testing without a public domain or real certificate, replace the
+# site block above with a localhost block that uses Caddy's internal self-signed
+# CA (no Let's Encrypt round-trip):
+#
+#   localhost {
+#       tls internal
+#       reverse_proxy gateway:8080
+#   }
+#
+# Caddy will install its local root CA; trust it or pass `-k` to curl. Note the
+# Phase 1 *dev* compose stack (docker-compose.dev.yml) ships WITHOUT Caddy and
+# exposes the gateway directly on localhost:8080 — this file is for the full
+# production stack only.
+# ─────────────────────────────────────────────────────────────────────────────
diff --git a/ops/systemd/neuronetz-gateway.service b/ops/systemd/neuronetz-gateway.service
new file mode 100644
index 0000000..7041375
--- /dev/null
+++ b/ops/systemd/neuronetz-gateway.service
@@ -0,0 +1,58 @@
+# neuronetz-gateway — systemd unit for non-Compose deployments.
+#
+# Assumes the project is installed into a virtualenv at /opt/neuronetz-gateway/venv
+# (e.g. `uv venv /opt/neuronetz-gateway/venv && uv pip install ...`) and that
+# configuration lives in /etc/neuronetz-gateway/gateway.env (same keys as
+# .env.example). Postgres, Redis and Ollama are reached over the network/loopback
+# per that env file — Ollama must remain bound to localhost / a private network
+# and never be published publicly.
+#
+# Install:
+#   sudo cp neuronetz-gateway.service /etc/systemd/system/
+#   sudo systemctl daemon-reload
+#   sudo systemctl enable --now neuronetz-gateway
+
+[Unit]
+Description=neuronetz-gateway — secure API gateway in front of Ollama
+Documentation=https://github.com/neuronetz/neuronetz-gateway
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+
+# Dedicated unprivileged service account (create with: useradd --system gateway).
+User=gateway
+Group=gateway
+
+WorkingDirectory=/opt/neuronetz-gateway
+EnvironmentFile=/etc/neuronetz-gateway/gateway.env
+
+# Apply migrations before starting (idempotent; no-op when already at head).
+ExecStartPre=/opt/neuronetz-gateway/venv/bin/alembic upgrade head
+ExecStart=/opt/neuronetz-gateway/venv/bin/python -m neuronetz_gateway
+
+Restart=on-failure
+RestartSec=5
+TimeoutStopSec=30
+
+# --- Hardening ---
+NoNewPrivileges=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+PrivateDevices=true
+ProtectKernelTunables=true
+ProtectKernelModules=true
+ProtectControlGroups=true
+RestrictNamespaces=true
+RestrictRealtime=true
+RestrictSUIDSGID=true
+LockPersonality=true
+MemoryDenyWriteExecute=true
+RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX
+# Allow writing only where the app legitimately needs to (none by default).
+ReadWritePaths=
+
+[Install]
+WantedBy=multi-user.target
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..166bf8f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,94 @@
+[project]
+name = "neuronetz-gateway"
+version = "0.1.0"
+description = "Secure multi-tenant API gateway in front of Ollama for the Neuronetz platform."
+readme = "README.md"
+license = { text = "Apache-2.0" }
+requires-python = ">=3.12"
+authors = [{ name = "Neuronetz", email = "ops@neuronetz.ai" }]
+dependencies = [
+    "fastapi>=0.115",
+    "uvicorn[standard]>=0.30",
+    "httpx>=0.27",
+    "sqlalchemy[asyncio]>=2.0",
+    "asyncpg>=0.29",
+    "redis[hiredis]>=5.0",
+    "structlog>=24.1",
+    "pydantic>=2.9",
+    "pydantic-settings>=2.4",
+    "argon2-cffi>=23.1",
+    "typer>=0.12",
+    "prometheus-client>=0.20",
+    "alembic>=1.13",
+]
+
+[project.scripts]
+neuronetz-gateway = "neuronetz_gateway.cli.manage:app"
+
+[project.optional-dependencies]
+dev = [
+    "ruff>=0.6",
+    "mypy>=1.11",
+    "bandit>=1.7",
+    "pip-audit>=2.7",
+    "pytest>=8.3",
+    "pytest-asyncio>=0.24",
+    "pytest-cov>=5.0",
+    "testcontainers>=4.8",
+    "respx>=0.21",
+    "locust>=2.31",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/neuronetz_gateway"]
+
+[tool.ruff]
+target-version = "py312"
+line-length = 100
+src = ["src", "tests"]
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP", "S", "ASYNC"]
+
+[tool.ruff.lint.per-file-ignores]
+# Tests may use assert and bind to all interfaces in fixtures.
+"tests/**" = ["S101", "S104"]
+
+[tool.mypy]
+python_version = "3.12"
+strict = true
+mypy_path = "src"
+plugins = ["pydantic.mypy"]
+namespace_packages = true
+explicit_package_bases = true
+
+[[tool.mypy.overrides]]
+# argon2 ships types but some transitive deps may not; keep strictness elsewhere.
+# asyncpg ships no stubs/py.typed marker; it is used in revocation.py only.
+module = ["testcontainers.*", "locust.*", "asyncpg", "asyncpg.*"]
+ignore_missing_imports = true
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]
+pythonpath = ["src"]
+addopts = "--cov=neuronetz_gateway --cov-report=term-missing"
+
+[tool.coverage.run]
+source = ["src/neuronetz_gateway"]
+branch = true
+omit = [
+    "src/neuronetz_gateway/__main__.py",
+    "src/neuronetz_gateway/cli/*",
+]
+
+[tool.coverage.report]
+# Phase 1: coverage is reported but non-blocking. Later phases set fail_under.
+show_missing = true
+
+[tool.bandit]
+exclude_dirs = ["tests"]
diff --git a/scope-docs/AGENT_PROMPT.md b/scope-docs/AGENT_PROMPT.md
new file mode 100644
index 0000000..5361a96
--- /dev/null
+++ b/scope-docs/AGENT_PROMPT.md
@@ -0,0 +1,121 @@
+# Build Order: neuronetz-gateway v0.1.0
+
+## Context
+
+The Ollama instance at `https://api.neuronetz.ai` is currently exposed without authentication. This is a security incident in waiting. Your job is to build the gateway that closes that gap and forms the commercial API surface of the Neuronetz AI platform.
+
+The full specification is in **`SPEC.md`** in this repository. Read it before writing any code. It is the source of truth; if anything below conflicts with it, SPEC.md wins.
+
+## Mission
+
+Implement `neuronetz-gateway` per SPEC.md to a state that satisfies **§12 Acceptance Criteria**. Nothing less ships.
+
+## Non-Negotiables
+
+These are hard constraints. Violating any of them is a build failure regardless of feature completeness.
+
+1. **Fail closed, always.** If a security or budgeting check cannot be performed (Redis down, DB unreachable, ambiguous state), deny the request. Never default to allow.
+2. **Ollama never reachable from outside the Docker internal network.** No `ports:` mapping for the ollama service in any compose file shipped with the project. Document this prominently.
+3. **No secrets in code, no secrets in logs, no secrets in errors.** Argon2id for key storage. Constant-time comparison only. Keys printed exactly once at creation.
+4. **No reflected upstream errors.** Ollama errors are sanitized at the gateway boundary. Map to generic 4xx/5xx with a request ID.
+5. **Mutating Ollama endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) are hard-blocked.** Not configurable. Not behind a feature flag. Blocked.
+6. **Streaming integrity.** Token counting and audit writes happen **after** stream close, never on the hot path. Time-to-first-byte must not be degraded by gateway bookkeeping.
+7. **`mypy --strict` and `ruff check` clean before any PR is opened.** No `# type: ignore` without an inline justification comment.
+8. **Test coverage targets (§9) are a gate, not a goal.** 100% on `auth/`, `ratelimit/`, `budget/`. CI fails below threshold.
+9. **Apache 2.0 license file present from commit one.** No GPL dependencies.
+10. **The bootstrap CLI must work before the first manual `curl`.** No "I'll create a key by hand in the DB just to test it" — if the CLI can't create a key, fix the CLI first.
+
+## Phasing
+
+Five phases. Each phase has an explicit exit criterion. **Do not start phase N+1 until phase N's exit criterion is verifiably met.** PM/Control: enforce this.
+
+### Phase 1 — Scaffold
+
+- Repo layout per SPEC §8
+- `pyproject.toml`, `uv.lock`, Dockerfile, docker-compose.yml, docker-compose.dev.yml, .env.example, README, LICENSE
+- Alembic configured; migration `0001_initial.py` creates schema `gateway` and all tables per SPEC §5
+- `make` or `just` targets: `dev`, `test`, `lint`, `typecheck`, `migrate`, `compose-up`, `compose-down`
+- CI workflow runs: ruff, mypy, pytest, bandit, pip-audit
+- **Exit criterion:** `docker compose -f docker-compose.dev.yml up` brings up postgres + redis + a stub gateway that responds 200 on `/healthz` and 503 on `/readyz` (because no Ollama yet). Migrations apply cleanly. CI is green on an empty test suite.
+
+### Phase 2 — Core proxy + auth
+
+- Bootstrap CLI (`create-tenant`, `create-key`, `list-keys`, `revoke-key`) working end-to-end
+- Argon2id hashing module with unit tests covering: hash, verify, constant-time behavior, rehash-on-parameter-change
+- Auth middleware: Bearer extraction, prefix lookup, hash verify, Redis cache with TTL
+- Ollama proxy for `/api/chat` and `/api/generate` — both streamed (NDJSON) and non-streamed
+- Endpoint allowlist enforced
+- **Model discovery (SPEC §4.6):** background poll of Ollama `/api/tags`, cached in Redis + in-process, fail-closed when unavailable
+- Model allowlist enforced per-tenant via the **effective set** (allow_all → all discovered; else `allowed_models ∩ discovered`); key-level `allow_all_models` overrides tenant
+- Error handler: sanitized responses, request ID in every error
+- Audit log writer (buffered, async)
+- Mock Ollama in `tests/integration/mock_ollama.py` (no real model required for CI)
+- **Exit criterion:** A key created via CLI can call `/api/chat` and `/api/generate` through Caddy → gateway → mock Ollama, streaming works, audit rows land in Postgres with correct token counts, `/api/pull` returns 403, no-auth returns 401, wrong-key returns 401. Model discovery populates from the (mock) Ollama `/api/tags`; `/api/tags` returns the tenant's effective set; an `allow_all_models` tenant sees all discovered models, a default-deny tenant sees only `allowed ∩ discovered`, and a non-effective model returns 403; discovery-unavailable fails closed. Integration tests cover all of the above.
+
+### Phase 3 — Rate limit + budget + OpenAI-compat
+
+- Sliding window rate limit (Redis Lua script) — per-key RPM, per-tenant RPM, per-key TPM
+- Concurrency semaphore (Redis-backed) with TTL guard
+- Token budget counters in Redis with Postgres ledger reconciliation on period rollover
+- OpenAI-compatibility layer: `/v1/chat/completions`, `/v1/completions`, `/v1/embeddings`, `/v1/models` with full SSE streaming and `data: [DONE]` terminator
+- Schema translation tests with golden fixtures (request in OpenAI → expected Ollama request; response from Ollama → expected OpenAI response)
+- Rate-limit and budget response headers per SPEC §6.5
+- **Exit criterion:** Locust test (100 concurrent users, 5 min) shows correct 429 behavior at the limit, correct token accounting, p99 gateway overhead < 25 ms. OpenAI Python SDK pointed at `/v1` successfully completes streaming chat. Killing Redis mid-test produces 503 (fail closed), not 200.
+
+### Phase 4 — Audit, prompt log, revocation
+
+- Prompt log (opt-in per key, TTL) with daily sweeper task
+- Audit log retention sweeper (TTL per tenant config)
+- Buffered audit writer with ring-buffer overflow → deny-mode behavior
+- Revocation flow: console (simulated via direct INSERT in tests) writes `gateway.revocations` → NOTIFY → gateway evicts Redis cache → next request with revoked key returns 401 within 1 second
+- Prometheus `/metrics` (loopback only) with: `gateway_requests_total{tenant,model,status}`, `gateway_tokens_total{tenant,model,direction}`, `gateway_request_duration_seconds{tenant,model}` (histogram)
+- `/readyz` checks DB + Redis + Ollama all reachable
+- Circuit breaker on Ollama failures
+- **Exit criterion:** Revocation E2E test green. Prompt log retention TTL works (use freeze-time to simulate). Metrics scrape returns valid Prometheus exposition. `/readyz` flips to 503 when any dependency is down.
+
+### Phase 5 — Harden, document, release
+
+- `docs/ARCHITECTURE.md`, `docs/DEPLOYMENT.md`, `docs/API.md`, `docs/THREAT_MODEL.md`, `docs/OPERATIONS.md` complete
+- Caddyfile example with Let's Encrypt for `api.neuronetz.ai` and security headers (HSTS, X-Content-Type-Options, no Server header, no X-Powered-By)
+- Systemd unit file for non-Compose deployments
+- Multi-stage Dockerfile with non-root user, distroless or `python:3.12-slim` final stage, no build tools in final image
+- `pip-audit` and `bandit` clean in CI
+- Image scan (Trivy or Grype) clean of HIGH/CRITICAL
+- Tag `v0.1.0`, build and push image, GitHub release with changelog
+- **Exit criterion:** Every box in SPEC §12 checked, signed off by Control. Image runnable from a fresh host with only docker + a `.env`. README quickstart works for someone who has never seen the repo.
+
+## Agent Role Assignments
+
+For the multi-agent orchestrator (Fritz/UI-UX/DevOps/QA/Control/Timo/PM):
+
+| Agent | Owns |
+|---|---|
+| **Backend / Fritz** | All Python code under `src/neuronetz_gateway/`, Alembic migrations, CLI. Primary author. |
+| **DevOps** | Dockerfile, docker-compose.yml(s), Caddyfile, systemd unit, CI workflows, image scanning, release tagging. |
+| **QA** | All tests under `tests/`. Owns coverage gate. Writes the locust scenarios. Verifies acceptance criteria at each phase exit. |
+| **UI-UX** | Not active this project (no UI surface here). Console project will pick this up. |
+| **Control / Timo** | Enforces phase gates. Refuses to advance a phase whose exit criterion isn't met. Runs the acceptance checklist at end of Phase 5. |
+| **PM** | Tracks the phase progression, opens YouTrack tickets per phase, runs daily standups against this prompt, surfaces blockers. |
+
+## Working Agreements
+
+- **Branch per phase.** `phase-1-scaffold`, `phase-2-proxy-auth`, etc. Merge to `main` only after phase exit criterion is verified.
+- **PRs are reviewed against SPEC.md.** "Does this match the spec? If not, is SPEC.md wrong or is the PR wrong?" — that's the review question.
+- **SPEC changes are explicit.** If a phase reveals a spec mistake, amend SPEC.md in a separate PR before changing the implementation. Never drift silently.
+- **Commit messages reference the section.** e.g. `auth: implement argon2id verify per SPEC §5, §9`.
+- **No TODOs in main.** If something is deferred, it becomes a tracked issue, not a code comment.
+- **Open questions (SPEC §13) are resolved in writing.** Decision goes in SPEC.md, not in a Slack message that gets lost.
+
+## What "Done" Looks Like
+
+A fresh clone, a fresh host, a domain pointing at it, and a `.env` file. `docker compose up`. Five minutes later, `curl -H "Authorization: Bearer nz_..." https://api.neuronetz.ai/v1/chat/completions -d '...'` streams a response. The Ollama port is not open. The audit log has a row. The budget counter decremented. The metrics endpoint shows the request. The locust suite passes. The threat model document explains every defense.
+
+When all of that is true and SPEC §12 is fully ticked, ship v0.1.0.
+
+## When You Get Stuck
+
+- **Ambiguity in the spec → ask, don't guess.** Open a question in the PM channel; if resolved, amend SPEC.md.
+- **Conflict between speed and correctness → correctness wins.** This is security infrastructure. We do not ship "good enough."
+- **Conflict between scope creep and v0.1.0 → defer.** New ideas go in a follow-up issue. v0.1.0 ships per spec.
+
+Start with Phase 1. Read SPEC.md first.
diff --git a/scope-docs/SPEC.md b/scope-docs/SPEC.md
new file mode 100644
index 0000000..ddc226a
--- /dev/null
+++ b/scope-docs/SPEC.md
@@ -0,0 +1,593 @@
+# neuronetz-gateway — SPEC.md
+
+**Project:** `neuronetz-gateway`
+**Version:** 0.1.0 (target)
+**Status:** Specification — not yet implemented
+**License:** Apache 2.0
+**Owner:** Stephan Berbig / Neuronetz
+
+---
+
+## 1. Purpose
+
+A secure, multi-tenant API gateway in front of an Ollama instance currently exposed at `https://api.neuronetz.ai`. The Ollama endpoint must never be reachable directly from the public internet again. All access flows through this gateway.
+
+The gateway is the **hot path** of the Neuronetz API. A separate service (`neuronetz-console`, built on the Nibiru PHP framework) handles administration, dashboards, and tenant self-service. This SPEC covers only the gateway.
+
+## 2. Scope
+
+### In scope (v0.1.0)
+
+- Authentication via API keys (Bearer tokens)
+- Multi-tenant data model (tenants → keys, with inheritance)
+- Per-key and per-tenant rate limiting (RPM, TPM, concurrent)
+- Per-key and per-tenant token budgets (daily, monthly, total)
+- Streaming and non-streaming proxy to Ollama
+- Dual API surface: native Ollama (`/api/*`) and OpenAI-compatible (`/v1/*`)
+- Endpoint allowlist (block all model-mutating Ollama endpoints)
+- **Dynamic model discovery** from the Ollama backend — the live set of installed models is queried, cached, and auto-refreshed; nothing about the model list is hand-maintained
+- Model allowlist (per-tenant override), **default-deny, resolved against the live discovered set** (stale/typo'd entries never resolve)
+- **Per-tenant `allow_all_models` toggle** — opt-in: a flagged tenant may use any currently-installed model, so models newly pulled into Ollama are auto-granted on the next discovery refresh
+- Request size limits, response size limits, timeouts
+- Token counting from Ollama responses (precise, not heuristic)
+- Audit log (always-on metadata)
+- Prompt log (opt-in per key, TTL'd retention)
+- Bootstrap CLI: create tenants, keys, set budgets
+- Health and readiness endpoints
+- Docker Compose deployment (gateway + caddy + postgres + redis + ollama)
+- Caddy as TLS terminator (Let's Encrypt for `api.neuronetz.ai`)
+
+### Out of scope (v0.1.0, document as future)
+
+- Web admin UI (lives in `neuronetz-console`, separate repo)
+- Billing / Stripe integration (budgets only, no money yet)
+- Multi-region / HA / k8s
+- Content moderation / prompt-injection filtering
+- Response caching
+- Multi-backend routing (one Ollama; pluggable backend interface stays for later)
+- Webhook notifications
+- SSO / OAuth2 for admin
+
+## 3. Threat Model (abbreviated)
+
+| Threat | Mitigation |
+|---|---|
+| Internet scanners hitting Ollama directly | Ollama bound to internal Docker network; never published |
+| Unauthenticated API abuse | Mandatory Bearer token; fail-closed on auth errors |
+| API key brute force | Argon2id hashing; constant-time compare; rate limit on auth failures per source IP |
+| GPU/token exhaustion (cost attack) | Per-key TPM + token budget; per-tenant ceiling; concurrent connection cap |
+| Resource exhaustion via large payloads | Request body size limit (default 256 KiB); `num_predict` cap (default 4096) |
+| Model enumeration / training-data exfil via uncommon models | Model allowlist; default-deny. `allow_all_models` is **opt-in per tenant and audited**. Discovery only ever exposes models actually installed on the backend; `/api/tags` and `/v1/models` never reveal models outside the tenant's effective set; "not allowed" and "doesn't exist" return the same generic response |
+| Discovery backend unreachable | Fail-closed: an empty/stale-expired discovered set means no model resolves, so requests are denied — never "allow because we couldn't list models" |
+| Ollama mutation (model pull/delete) by attacker | Endpoint allowlist; mutating endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`) hard-blocked at the gateway |
+| Information disclosure via error messages | Sanitize upstream errors; never proxy Ollama internals to client |
+| Audit log tampering | Append-only at app layer; DB role separation; optional WAL archiving |
+| Prompt data leakage | Prompt logging off by default; opt-in per key; TTL'd; redaction hook |
+| Redis outage causing "fail open" | Fail-closed: if rate-limit/budget backend is unavailable, deny |
+| Compromised admin token | Admin token lives in `neuronetz-console`, not in gateway; gateway has no admin endpoints |
+
+## 4. Architecture
+
+### 4.1 Component diagram
+
+```
+                          Internet
+                              │ TLS
+                              ▼
+                  ┌──────────────────────┐
+                  │ Caddy (sidecar)      │  Let's Encrypt for api.neuronetz.ai
+                  │ - TLS termination    │  HSTS, security headers
+                  │ - HTTP/2, HTTP/3     │
+                  └──────────┬───────────┘
+                             │ HTTP/1.1 internal
+                  ┌──────────▼───────────┐
+                  │ neuronetz-gateway    │  FastAPI + uvicorn
+                  │  - authn             │
+                  │  - rate limit        │
+                  │  - budget check      │
+                  │  - proxy + stream    │
+                  │  - token count       │
+                  │  - audit write       │
+                  └──┬────────┬──────┬───┘
+                     │        │      │
+              ┌──────▼──┐  ┌──▼───┐  │
+              │Postgres │  │Redis │  │
+              │ schema: │  │ keys │  │
+              │ gateway │  │bucket│  │
+              └─────────┘  └──────┘  │
+                                     │ internal network only
+                              ┌──────▼──────┐
+                              │   Ollama    │
+                              │ 127.0.0.1   │
+                              └─────────────┘
+
+Same Compose stack also hosts (separate from this SPEC):
+  - neuronetz-console (PHP/Nibiru) → reads schema `console`, reads schema `gateway` (SELECT)
+```
+
+### 4.2 Database schemas
+
+**Single Postgres instance, two schemas:**
+
+- `gateway` — owned by the gateway service; gateway role has full DDL
+- `console` — owned by `neuronetz-console` (out of scope here); console role has full DDL
+- Both services connect with their own role. Cross-schema access is explicit GRANT.
+
+**Console role gets `SELECT` on all `gateway.*` tables.** Console writes go only to `console.*` tables. If the console needs to mutate gateway state (e.g. revoke a key), it does so by writing to a `gateway.revocations` outbox table that the gateway tails (see §4.5).
+
+### 4.3 Request lifecycle
+
+1. Caddy terminates TLS, forwards to gateway on internal port.
+2. Gateway middleware extracts `Authorization: Bearer <key>`.
+3. Key prefix (first 12 chars) used as Redis cache key. On miss, lookup `gateway.api_keys` by prefix; verify full key with argon2id `verify`; cache resolved key metadata in Redis (TTL 60s).
+4. Rate limit check (sliding window in Redis, Lua-atomic) — per-key RPM + per-tenant RPM.
+5. Budget check (Redis counter for current period; Postgres ledger is source of truth on reset).
+6. Concurrent-connection semaphore (Redis `INCR` with TTL).
+7. Model allowlist check. Resolve the **effective model set** for the key:
+   `allow_all := key.allow_all_models ?? tenant.allow_all_models`;
+   `effective := discovered` if `allow_all` else `(key.allowed_models ?? tenant.allowed_models) ∩ discovered`,
+   where `discovered` is the cached live model set from discovery (§4.6). The request's
+   `model` must be in `effective`, else a generic 403 with no disclosure of whether the
+   model exists but is unpermitted vs. is not installed.
+8. Endpoint allowlist check.
+9. Request body validation (size, schema, `num_predict` cap).
+10. If OpenAI-compat path, translate request to Ollama schema.
+11. Open httpx async stream to Ollama.
+12. Stream response back to client, accumulating final `prompt_eval_count` + `eval_count`.
+13. On stream close: write `gateway.audit_log` row; decrement budget; release semaphore; if prompt logging enabled, write `gateway.prompt_log` row.
+14. On any failure: sanitized error to client, audit row with status code, semaphore released.
+
+### 4.4 Failure modes (fail-closed)
+
+| Subsystem | If down | Behavior |
+|---|---|---|
+| Postgres (read) | Key lookup fails | 503 with retry-after; no requests proxied |
+| Postgres (write) | Audit write fails | Request still succeeds, audit row buffered in-memory ring (max 1000), drained on recovery; if buffer fills, switch to deny mode |
+| Redis | Rate limit / budget unavailable | 503 — fail closed. Never "allow because we can't check." |
+| Ollama | Upstream unreachable | 502 with retry-after; circuit breaker opens after 5 consecutive failures, half-open after 30s |
+| Caddy | Not a gateway concern | — |
+
+### 4.5 Cache invalidation (key revocation)
+
+Console can revoke a key by inserting into `gateway.revocations(key_id, ts, reason)`. Gateway has a background task (`asyncio.create_task` in lifespan) that:
+- LISTENs on Postgres channel `key_revoked` (gateway emits NOTIFY on its own write path; console emits via INSERT trigger)
+- On notification, evicts the Redis cache entry for that key's prefix
+- This makes revocation effectively immediate (≤ Redis RTT) without cross-service HTTP
+
+### 4.6 Model discovery
+
+The set of usable models is **never hand-maintained**; it is extracted live from the
+Ollama backend.
+
+- A background task (started in lifespan, like the revocation listener) polls Ollama
+  `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds.
+- The parsed model set (names + sanitized metadata: family, parameter size, quantization,
+  size bytes, modified-at) is cached in Redis under `gateway:models:discovered` with TTL
+  `MODEL_DISCOVERY_CACHE_TTL_S`, and held in-process for hot reads on the request path.
+- On startup an initial fetch runs; if Ollama is unreachable the discovered set is empty.
+- **Fail-closed:** if the discovered set is empty or its cache has expired and cannot be
+  refreshed, no model resolves and requests are denied (consistent with default-deny).
+  Discovery never opens access on failure.
+- "Auto-grant": because the effective set (§4.3 step 7) intersects with `discovered` (or
+  *is* `discovered` when `allow_all_models`), a model pulled into Ollama out-of-band
+  becomes usable to `allow_all` tenants on the next refresh — no per-tenant config change.
+- Discovery is **read-only** against Ollama and uses only the allowlisted `/api/tags`
+  endpoint; it never triggers a model pull.
+
+## 5. Data Model (schema `gateway`)
+
+```sql
+CREATE SCHEMA gateway;
+
+CREATE TYPE gateway.key_status AS ENUM ('active', 'disabled', 'revoked');
+CREATE TYPE gateway.tenant_status AS ENUM ('active', 'suspended', 'closed');
+CREATE TYPE gateway.budget_period AS ENUM ('day', 'month', 'total');
+
+CREATE TABLE gateway.tenants (
+    id              uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    name            text NOT NULL UNIQUE,
+    status          gateway.tenant_status NOT NULL DEFAULT 'active',
+    created_at      timestamptz NOT NULL DEFAULT now(),
+    metadata        jsonb NOT NULL DEFAULT '{}'::jsonb
+);
+
+CREATE TABLE gateway.tenant_limits (
+    tenant_id           uuid PRIMARY KEY REFERENCES gateway.tenants(id) ON DELETE CASCADE,
+    rpm                 integer NOT NULL DEFAULT 60,
+    tpm                 integer NOT NULL DEFAULT 100000,
+    concurrent          integer NOT NULL DEFAULT 8,
+    tokens_daily        bigint,
+    tokens_monthly      bigint,
+    tokens_total        bigint,
+    allowed_models      text[] NOT NULL DEFAULT '{}',
+    allow_all_models    boolean NOT NULL DEFAULT false,  -- opt-in: allow any installed model
+    log_prompts_default boolean NOT NULL DEFAULT false,
+    prompt_retention_days integer NOT NULL DEFAULT 30,
+    audit_retention_days  integer NOT NULL DEFAULT 365
+);
+
+CREATE TABLE gateway.api_keys (
+    id              uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    tenant_id       uuid NOT NULL REFERENCES gateway.tenants(id) ON DELETE CASCADE,
+    prefix          text NOT NULL UNIQUE,          -- first 12 chars, indexed
+    key_hash        text NOT NULL,                  -- argon2id
+    name            text NOT NULL,
+    status          gateway.key_status NOT NULL DEFAULT 'active',
+    scopes          text[] NOT NULL DEFAULT '{chat,embeddings}',
+    created_at      timestamptz NOT NULL DEFAULT now(),
+    last_used_at    timestamptz,
+    expires_at      timestamptz,
+    log_prompts     boolean,                        -- NULL = inherit from tenant
+    metadata        jsonb NOT NULL DEFAULT '{}'::jsonb
+);
+
+CREATE INDEX idx_api_keys_prefix ON gateway.api_keys(prefix) WHERE status = 'active';
+CREATE INDEX idx_api_keys_tenant ON gateway.api_keys(tenant_id);
+
+CREATE TABLE gateway.key_limits (
+    key_id              uuid PRIMARY KEY REFERENCES gateway.api_keys(id) ON DELETE CASCADE,
+    rpm                 integer,            -- NULL = inherit tenant
+    tpm                 integer,
+    concurrent          integer,
+    tokens_daily        bigint,
+    tokens_monthly      bigint,
+    tokens_total        bigint,
+    allowed_models      text[],             -- NULL = inherit tenant
+    allow_all_models    boolean             -- NULL = inherit tenant
+);
+
+CREATE TABLE gateway.budget_usage (
+    key_id          uuid NOT NULL REFERENCES gateway.api_keys(id) ON DELETE CASCADE,
+    period          gateway.budget_period NOT NULL,
+    period_start    timestamptz NOT NULL,
+    tokens_in       bigint NOT NULL DEFAULT 0,
+    tokens_out      bigint NOT NULL DEFAULT 0,
+    requests        bigint NOT NULL DEFAULT 0,
+    PRIMARY KEY (key_id, period, period_start)
+);
+
+CREATE INDEX idx_budget_usage_period ON gateway.budget_usage(period, period_start);
+
+CREATE TABLE gateway.audit_log (
+    id              bigserial PRIMARY KEY,
+    ts              timestamptz NOT NULL DEFAULT now(),
+    request_id      uuid NOT NULL,
+    tenant_id       uuid,                          -- nullable for auth-failed rows
+    key_id          uuid,
+    key_prefix      text,                          -- denormalized for forensic queries
+    method          text NOT NULL,
+    path            text NOT NULL,
+    model           text,
+    tokens_in       integer,
+    tokens_out      integer,
+    latency_ms      integer,
+    status          integer NOT NULL,
+    client_ip       inet,
+    user_agent      text,
+    error_code      text
+);
+
+CREATE INDEX idx_audit_ts ON gateway.audit_log(ts);
+CREATE INDEX idx_audit_tenant_ts ON gateway.audit_log(tenant_id, ts);
+CREATE INDEX idx_audit_key_ts ON gateway.audit_log(key_id, ts);
+
+CREATE TABLE gateway.prompt_log (
+    id              bigserial PRIMARY KEY,
+    audit_id        bigint NOT NULL REFERENCES gateway.audit_log(id) ON DELETE CASCADE,
+    ts              timestamptz NOT NULL DEFAULT now(),
+    key_id          uuid NOT NULL,
+    request_body    jsonb NOT NULL,
+    response_text   text,
+    retention_until timestamptz NOT NULL
+);
+
+CREATE INDEX idx_prompt_log_retention ON gateway.prompt_log(retention_until);
+
+CREATE TABLE gateway.revocations (
+    id              bigserial PRIMARY KEY,
+    key_id          uuid NOT NULL,
+    ts              timestamptz NOT NULL DEFAULT now(),
+    reason          text,
+    processed_at    timestamptz
+);
+
+-- Trigger to NOTIFY on revocation insert
+CREATE OR REPLACE FUNCTION gateway.notify_key_revoked() RETURNS trigger AS $$
+BEGIN
+    PERFORM pg_notify('key_revoked', NEW.key_id::text);
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TRIGGER trg_notify_key_revoked
+    AFTER INSERT ON gateway.revocations
+    FOR EACH ROW EXECUTE FUNCTION gateway.notify_key_revoked();
+
+-- Grants for console role (created in console SPEC, referenced here)
+-- GRANT USAGE ON SCHEMA gateway TO console_role;
+-- GRANT SELECT ON ALL TABLES IN SCHEMA gateway TO console_role;
+-- GRANT INSERT ON gateway.revocations TO console_role;
+```
+
+## 6. API Surface
+
+### 6.1 Native Ollama passthrough (allowlisted)
+
+| Path | Method | Notes |
+|---|---|---|
+| `/api/chat` | POST | Streamed (NDJSON) and non-streamed |
+| `/api/generate` | POST | Streamed (NDJSON) and non-streamed |
+| `/api/embeddings` | POST | Non-streamed |
+| `/api/embed` | POST | Newer Ollama embeddings endpoint |
+| `/api/tags` | GET | Returns the tenant's **effective** model set (live-discovered ∩ allowed, or *all* discovered when `allow_all_models`). Sourced from discovery (§4.6), never a static list |
+| `/api/show` | POST | Allowed only for models in the tenant's effective set; returns sanitized model info (no system prompts, no template) |
+| `/api/ps` | GET | **Blocked** — leaks loaded models |
+| `/api/version` | GET | Returns gateway version, not Ollama version |
+
+### 6.2 Hard-blocked Ollama endpoints (always 403)
+
+`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`
+
+### 6.3 OpenAI-compatible
+
+| Path | Method | Maps to |
+|---|---|---|
+| `/v1/chat/completions` | POST | `/api/chat` |
+| `/v1/completions` | POST | `/api/generate` |
+| `/v1/embeddings` | POST | `/api/embed` |
+| `/v1/models` | GET | `/api/tags` (the tenant's effective discovered set), in OpenAI model-list format |
+
+Translation must preserve streaming. SSE (`data: {...}\n\n`) for OpenAI-compat; NDJSON for native.
+
+### 6.4 Gateway endpoints
+
+| Path | Method | Auth | Purpose |
+|---|---|---|---|
+| `/healthz` | GET | none | Liveness — process responsive |
+| `/readyz` | GET | none | Readiness — DB + Redis + Ollama all reachable |
+| `/metrics` | GET | none (loopback only) | Prometheus exposition (counters, histograms) |
+
+No admin endpoints. Admin lives in `neuronetz-console`.
+
+### 6.5 Response headers
+
+Every proxied response carries:
+- `X-Request-ID: <uuid>`
+- `X-RateLimit-Limit-Requests: <n>`
+- `X-RateLimit-Remaining-Requests: <n>`
+- `X-RateLimit-Limit-Tokens: <n>`
+- `X-RateLimit-Remaining-Tokens: <n>`
+- `X-Budget-Period: day|month|total`
+- `X-Budget-Tokens-Remaining: <n>`
+
+429 responses additionally carry `Retry-After: <seconds>`.
+
+## 7. Configuration
+
+All via environment variables, validated by Pydantic Settings on boot. Boot fails loudly on invalid config.
+
+```
+# Service
+GATEWAY_BIND_HOST=0.0.0.0
+GATEWAY_BIND_PORT=8080
+GATEWAY_LOG_LEVEL=INFO
+GATEWAY_LOG_FORMAT=json                  # json|console
+GATEWAY_REQUEST_ID_HEADER=X-Request-ID
+GATEWAY_TRUSTED_PROXIES=127.0.0.1,caddy  # for X-Forwarded-For
+
+# Upstream
+OLLAMA_BASE_URL=http://ollama:11434
+OLLAMA_CONNECT_TIMEOUT_S=5
+OLLAMA_READ_TIMEOUT_S=600
+OLLAMA_MAX_CONNECTIONS=64
+
+# Model discovery (§4.6)
+MODEL_DISCOVERY_REFRESH_S=60             # how often to re-query Ollama /api/tags
+MODEL_DISCOVERY_CACHE_TTL_S=120          # Redis cache TTL for the discovered model set
+
+# Database
+DATABASE_URL=postgresql+asyncpg://gateway:...@postgres:5432/neuronetz
+DATABASE_POOL_SIZE=10
+DATABASE_POOL_OVERFLOW=20
+
+# Redis
+REDIS_URL=redis://redis:6379/0
+REDIS_KEY_CACHE_TTL_S=60
+
+# Limits (defaults; per-tenant/key overrides in DB)
+DEFAULT_RPM=60
+DEFAULT_TPM=100000
+DEFAULT_CONCURRENT=8
+MAX_REQUEST_BODY_BYTES=262144
+MAX_NUM_PREDICT=4096
+
+# Security
+ARGON2_TIME_COST=3
+ARGON2_MEMORY_COST_KIB=65536
+ARGON2_PARALLELISM=4
+AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN=20
+
+# Audit
+AUDIT_BUFFER_SIZE=1000
+PROMPT_LOG_DEFAULT_RETENTION_DAYS=30
+AUDIT_LOG_DEFAULT_RETENTION_DAYS=365
+```
+
+## 8. Repository Layout
+
+```
+neuronetz-gateway/
+├── pyproject.toml                # uv-managed, ruff, mypy --strict, pytest
+├── README.md
+├── LICENSE                       # Apache 2.0
+├── docker-compose.yml            # full stack incl. console placeholder
+├── docker-compose.dev.yml        # without caddy, gateway exposed on localhost
+├── Dockerfile                    # multi-stage, python:3.12-slim base
+├── .env.example
+├── .dockerignore
+├── .gitignore
+├── alembic.ini
+├── alembic/
+│   ├── env.py
+│   └── versions/
+│       └── 0001_initial.py       # creates schema `gateway` and all tables
+├── ops/
+│   ├── caddy/
+│   │   └── Caddyfile.example
+│   └── systemd/
+│       └── neuronetz-gateway.service
+├── src/neuronetz_gateway/
+│   ├── __init__.py
+│   ├── __main__.py               # uvicorn entry
+│   ├── app.py                    # FastAPI factory
+│   ├── config.py                 # Pydantic Settings
+│   ├── deps.py                   # DI providers
+│   ├── lifespan.py               # startup/shutdown, NOTIFY listener
+│   ├── errors.py                 # exception types, handlers, sanitization
+│   ├── auth/
+│   │   ├── __init__.py
+│   │   ├── hashing.py            # argon2id wrapper
+│   │   ├── keys.py               # key generation, prefix, verify
+│   │   └── middleware.py
+│   ├── ratelimit/
+│   │   ├── __init__.py
+│   │   ├── sliding_window.py     # Redis Lua script
+│   │   └── concurrency.py        # semaphore via Redis
+│   ├── budget/
+│   │   ├── __init__.py
+│   │   ├── counter.py            # Redis period counters
+│   │   └── ledger.py             # Postgres reconciliation
+│   ├── proxy/
+│   │   ├── __init__.py
+│   │   ├── ollama.py             # httpx streaming client
+│   │   ├── translate.py          # OpenAI <-> Ollama schemas
+│   │   ├── token_counter.py      # parse usage from stream
+│   │   ├── discovery.py          # live model discovery from Ollama /api/tags (§4.6)
+│   │   └── allowlist.py          # effective-set resolution (allow_all / allowed ∩ discovered)
+│   ├── routes/
+│   │   ├── __init__.py
+│   │   ├── ollama_native.py
+│   │   ├── openai_compat.py
+│   │   └── health.py
+│   ├── db/
+│   │   ├── __init__.py
+│   │   ├── session.py
+│   │   ├── models.py             # SQLAlchemy 2.0
+│   │   └── repositories.py
+│   ├── audit/
+│   │   ├── __init__.py
+│   │   ├── writer.py             # buffered async writer
+│   │   └── prompt_log.py
+│   ├── observability/
+│   │   ├── __init__.py
+│   │   ├── logging.py            # structlog config
+│   │   └── metrics.py            # prometheus
+│   └── cli/
+│       ├── __init__.py
+│       └── manage.py             # typer: create-tenant, create-key, ...
+├── tests/
+│   ├── conftest.py               # testcontainers fixtures
+│   ├── unit/
+│   │   ├── test_hashing.py
+│   │   ├── test_translate.py
+│   │   ├── test_token_counter.py
+│   │   ├── test_discovery.py
+│   │   ├── test_allowlist.py
+│   │   └── test_sliding_window.py
+│   ├── integration/
+│   │   ├── test_auth_flow.py
+│   │   ├── test_rate_limit.py
+│   │   ├── test_budget.py
+│   │   ├── test_proxy_stream.py
+│   │   ├── test_openai_compat.py
+│   │   ├── test_revocation.py
+│   │   └── mock_ollama.py        # FastAPI mock with NDJSON/SSE
+│   └── load/
+│       └── locustfile.py
+└── docs/
+    ├── ARCHITECTURE.md
+    ├── DEPLOYMENT.md
+    ├── API.md
+    ├── THREAT_MODEL.md
+    └── OPERATIONS.md              # runbook: revoke key, rotate, check usage
+```
+
+## 9. Non-Functional Requirements
+
+- **Performance:** p50 overhead < 5 ms over direct Ollama call (auth + ratelimit + audit); p99 < 25 ms (excluding upstream latency)
+- **Streaming:** Time-to-first-byte must not be degraded by gateway logic — audit write happens **after** stream close
+- **Memory:** Steady-state RSS < 200 MiB per gateway worker under 100 concurrent streams
+- **Concurrency:** Handle 200 concurrent connections per worker; 4 workers per instance default
+- **Test coverage:** ≥ 85% line coverage on `src/neuronetz_gateway/` excluding `__main__` and CLI; 100% on `auth/`, `ratelimit/`, `budget/`
+- **Security:** No `eval`, no `exec`, no shell-out, no `pickle`. Bandit clean. `pip-audit` clean on every CI run.
+- **Type safety:** `mypy --strict` clean
+- **Lint:** `ruff check` clean with project ruleset (E, F, I, B, UP, S, ASYNC)
+
+## 10. Tooling
+
+- Python 3.12
+- `uv` for dependency management (pyproject.toml + uv.lock)
+- FastAPI ≥ 0.115, uvicorn[standard], httpx ≥ 0.27, SQLAlchemy 2.0 (async), asyncpg, redis ≥ 5.0 (with hiredis), structlog, pydantic ≥ 2.9, pydantic-settings, argon2-cffi, typer, prometheus-client
+- Test: pytest, pytest-asyncio, pytest-cov, testcontainers, httpx (test client), respx (mock), locust
+- Lint/format: ruff, mypy --strict, bandit, pip-audit
+- CI: GitHub Actions workflow (lint, type, test with coverage, build image, push on tag)
+
+## 11. Bootstrap CLI (Typer)
+
+```
+neuronetz-gateway create-tenant --name "acme" [--rpm 60] [--tpm 100000]
+neuronetz-gateway create-key --tenant acme --name "prod-server-1" [--scopes chat,embeddings]
+neuronetz-gateway revoke-key --prefix nz_abc12345
+neuronetz-gateway list-keys --tenant acme
+neuronetz-gateway show-usage --tenant acme [--period day|month|total]
+neuronetz-gateway set-budget --key nz_abc12345 --daily 1000000 --monthly 30000000
+neuronetz-gateway set-models --tenant acme --models llama3.1:8b,mistral:7b
+neuronetz-gateway set-models --tenant acme --allow-all          # opt into allow_all_models
+neuronetz-gateway set-models --tenant acme --no-allow-all       # back to explicit allowlist
+neuronetz-gateway list-models [--tenant acme]                   # show live-discovered models
+                                                                # (and the tenant's effective set)
+```
+
+`create-tenant` accepts `--allow-all-models / --no-allow-all-models` (default off).
+`list-models` reads the discovery cache (§4.6); with `--tenant` it also shows that tenant's
+resolved effective set.
+
+Key format: `nz_<12-char-prefix><32-char-random>`. Prefix is stored; full key is hashed (argon2id). On creation, the full key is printed exactly once.
+
+## 12. Acceptance Criteria
+
+The build is "done" when every box below is checked. The orchestrator must verify each before declaring v0.1.0.
+
+- [ ] `docker compose up` from a clean checkout produces a running stack with TLS via Caddy (self-signed in dev, Let's Encrypt-ready in prod).
+- [ ] CLI creates tenant and key; printed key successfully authenticates an `/api/chat` call.
+- [ ] Unauthenticated request returns 401 with no Ollama details leaked.
+- [ ] Request to `/api/pull` returns 403 with generic error message.
+- [ ] Streaming `/api/chat` works end-to-end; first byte arrives within Ollama's own TTFB + < 10 ms gateway overhead.
+- [ ] Streaming `/v1/chat/completions` returns valid SSE with `data: [DONE]` terminator.
+- [ ] Token counts in audit log match Ollama's reported `prompt_eval_count` + `eval_count` exactly.
+- [ ] `/api/tags` and `/v1/models` reflect the **live** Ollama model set (discovery, §4.6): an `allow_all_models` tenant sees every installed model and a newly-pulled model appears within one refresh interval; a default-deny tenant sees only `allowed_models ∩ discovered`; a request for a model outside the effective set returns a generic 403; with discovery unavailable, requests fail closed (deny), not open.
+- [ ] Rate limit triggers at configured RPM with `Retry-After` header.
+- [ ] Token budget enforces and blocks at zero remaining with descriptive error.
+- [ ] Redis outage causes 503 (fail-closed), not 200.
+- [ ] Revocation via `INSERT INTO gateway.revocations` evicts Redis cache within 1 second.
+- [ ] `mypy --strict`, `ruff check`, `bandit`, `pip-audit` all clean in CI.
+- [ ] Test coverage ≥ 85% overall, 100% in `auth/`, `ratelimit/`, `budget/`.
+- [ ] `docs/THREAT_MODEL.md`, `docs/DEPLOYMENT.md`, `docs/OPERATIONS.md` present and accurate.
+- [ ] Load test (locust): 100 concurrent users sustained 5 minutes, p99 gateway overhead < 25 ms, zero 5xx outside induced failures.
+
+## 13. Open Questions (decide during build)
+
+1. Embedding cost accounting — Ollama doesn't return `eval_count` for embeddings. Decision: charge based on `prompt_eval_count` only; document as such.
+2. SSE vs NDJSON heuristic for OpenAI-compat — always SSE per OpenAI spec. NDJSON only on native `/api/*`.
+3. Prometheus cardinality — do not label by `key_id` (too many series); label by `tenant_id` only; per-key data lives in Postgres.
+4. **Model discovery source** — the live model list is `GET /api/tags` on the Ollama backend; there is no separate registry. Cached in Redis + in-process, refreshed every `MODEL_DISCOVERY_REFRESH_S`.
+5. **Discovery failure is fail-closed** — empty/expired discovered set ⇒ no model resolves ⇒ deny. Discovery never opens access on error.
+6. **No existence disclosure** — a model that is installed-but-unpermitted and a model that is not installed both return the same generic response, to prevent enumeration.
+7. **`allow_all_models` precedence** — key-level `allow_all_models` (when non-NULL) overrides the tenant flag; otherwise the tenant flag applies. Same NULL-inherits-tenant rule as the other key limits.
+
+## 14. References
+
+- Ollama API: https://github.com/ollama/ollama/blob/main/docs/api.md
+- OpenAI Chat Completions: https://platform.openai.com/docs/api-reference/chat
+- Nibiru (sibling console project): https://nibiru-framework.com
+- Argon2 RFC 9106
diff --git a/src/neuronetz_gateway/__init__.py b/src/neuronetz_gateway/__init__.py
new file mode 100644
index 0000000..2bdde0a
--- /dev/null
+++ b/src/neuronetz_gateway/__init__.py
@@ -0,0 +1,7 @@
+"""neuronetz-gateway: secure multi-tenant API gateway in front of Ollama."""
+
+from __future__ import annotations
+
+__version__ = "0.1.0"
+
+__all__ = ["__version__"]
diff --git a/src/neuronetz_gateway/__main__.py b/src/neuronetz_gateway/__main__.py
new file mode 100644
index 0000000..825af6a
--- /dev/null
+++ b/src/neuronetz_gateway/__main__.py
@@ -0,0 +1,28 @@
+"""Uvicorn entry point: ``python -m neuronetz_gateway``.
+
+Binds the app to ``GATEWAY_BIND_HOST``:``GATEWAY_BIND_PORT`` (default
+0.0.0.0:8080). The factory string is passed to uvicorn so the app is built in
+the worker process.
+"""
+
+from __future__ import annotations
+
+import uvicorn
+
+from neuronetz_gateway.config import get_settings
+
+
+def main() -> None:
+    """Run the gateway under uvicorn using the configured bind address."""
+    settings = get_settings()
+    uvicorn.run(
+        "neuronetz_gateway.app:create_app",
+        factory=True,
+        host=settings.gateway_bind_host,
+        port=settings.gateway_bind_port,
+        log_level=settings.gateway_log_level.lower(),
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/neuronetz_gateway/app.py b/src/neuronetz_gateway/app.py
new file mode 100644
index 0000000..a1042d5
--- /dev/null
+++ b/src/neuronetz_gateway/app.py
@@ -0,0 +1,111 @@
+"""FastAPI application factory.
+
+``create_app()`` is the shared contract entry point: other agents (DevOps, QA)
+import and serve this. It configures logging, installs the request-id and auth
+middleware, registers the sanitizing exception handlers, mounts routers, and
+binds the lifespan that manages backend handles + background tasks.
+
+Production safety: FastAPI's ``/docs`` + ``/openapi.json`` are disabled by
+default (enabled only via ``DOCS_ENABLED``). The ``/playground`` route is served
+only when ``PLAYGROUND_ENABLED`` is true and ``PLAYGROUND_FILE`` exists.
+"""
+
+from __future__ import annotations
+
+import uuid
+from pathlib import Path
+
+from fastapi import FastAPI, Request
+from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
+from starlette.responses import HTMLResponse, Response
+from starlette.types import ASGIApp
+
+from neuronetz_gateway import __version__
+from neuronetz_gateway.auth.middleware import AuthMiddleware
+from neuronetz_gateway.config import Settings, get_settings
+from neuronetz_gateway.errors import register_exception_handlers
+from neuronetz_gateway.lifespan import lifespan
+from neuronetz_gateway.observability.logging import configure_logging
+from neuronetz_gateway.routes import health, ollama_native, openai_compat
+
+
+class RequestIDMiddleware(BaseHTTPMiddleware):
+    """Assign/propagate a request id and expose it on ``request.state``.
+
+    Honours an inbound ``X-Request-ID`` from a trusted proxy; otherwise mints a
+    fresh UUID. The id is echoed on the response and used by error handlers.
+    """
+
+    def __init__(self, app: ASGIApp, header_name: str) -> None:
+        super().__init__(app)
+        self._header = header_name
+
+    async def dispatch(
+        self, request: Request, call_next: RequestResponseEndpoint
+    ) -> Response:
+        incoming = request.headers.get(self._header)
+        request_id = incoming or str(uuid.uuid4())
+        request.state.request_id = request_id
+        response = await call_next(request)
+        response.headers[self._header] = request_id
+        return response
+
+
+def _register_playground(app: FastAPI, cfg: Settings) -> None:
+    """Add the flag-gated ``/playground`` route (HTML asset, owned by docs agent).
+
+    The file is read off the event loop via ``asyncio.to_thread`` so a slow disk
+    cannot stall request handling. Missing-file is a simple 404, never an error.
+    """
+    import asyncio as _asyncio
+
+    def _load(path_str: str) -> str | None:
+        p = Path(path_str)
+        if not p.is_file():
+            return None
+        return p.read_text(encoding="utf-8")
+
+    @app.get("/playground", include_in_schema=False)
+    async def playground() -> Response:
+        content = await _asyncio.to_thread(_load, cfg.playground_file)
+        if content is None:
+            return Response(status_code=404, content="Not found")
+        return HTMLResponse(content)
+
+
+def create_app(settings: Settings | None = None) -> FastAPI:
+    """Build and return the configured FastAPI application."""
+    cfg = settings or get_settings()
+    configure_logging(level=cfg.gateway_log_level, fmt=cfg.gateway_log_format)
+
+    app = FastAPI(
+        title="neuronetz-gateway",
+        version=__version__,
+        lifespan=lifespan,
+        docs_url="/docs" if cfg.docs_enabled else None,
+        redoc_url="/redoc" if cfg.docs_enabled else None,
+        openapi_url="/openapi.json" if cfg.docs_enabled else None,
+    )
+    # Settings are needed by the auth middleware before lifespan runs in some
+    # test setups; lifespan also sets this. Setting here is idempotent.
+    app.state.settings = cfg
+
+    # Auth runs inside RequestID so a request id is always available for the
+    # sanitized 401 the auth middleware emits. add_middleware wraps outermost
+    # last, so add Auth first then RequestID.
+    app.add_middleware(AuthMiddleware)
+    app.add_middleware(RequestIDMiddleware, header_name=cfg.gateway_request_id_header)
+
+    register_exception_handlers(app)
+
+    app.include_router(health.router)
+    app.include_router(openai_compat.router)
+    app.include_router(ollama_native.router)
+
+    if cfg.playground_enabled:
+        _register_playground(app, cfg)
+
+    return app
+
+
+__all__ = ["RequestIDMiddleware", "create_app"]
diff --git a/src/neuronetz_gateway/config.py b/src/neuronetz_gateway/config.py
new file mode 100644
index 0000000..327652a
--- /dev/null
+++ b/src/neuronetz_gateway/config.py
@@ -0,0 +1,86 @@
+"""Application configuration via Pydantic Settings v2.
+
+Reads every environment variable documented in SPEC §7 with the documented
+defaults. Boot fails loudly (ValidationError) on invalid config.
+"""
+
+from __future__ import annotations
+
+from functools import lru_cache
+
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    """Gateway runtime configuration. All fields map to SPEC §7 env vars."""
+
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",
+        case_sensitive=False,
+    )
+
+    # --- Service ---
+    gateway_bind_host: str = Field(default="0.0.0.0")  # noqa: S104 - bind-all is intended in container
+    gateway_bind_port: int = Field(default=8080)
+    gateway_log_level: str = Field(default="INFO")
+    gateway_log_format: str = Field(default="json")  # json|console
+    gateway_request_id_header: str = Field(default="X-Request-ID")
+    gateway_trusted_proxies: str = Field(default="127.0.0.1,caddy")
+
+    # --- Upstream (Ollama) ---
+    ollama_base_url: str = Field(default="http://ollama:11434")
+    ollama_connect_timeout_s: int = Field(default=5)
+    ollama_read_timeout_s: int = Field(default=600)
+    ollama_max_connections: int = Field(default=64)
+
+    # --- Model discovery (SPEC §4.6) ---
+    model_discovery_refresh_s: int = Field(default=60)
+    model_discovery_cache_ttl_s: int = Field(default=120)
+
+    # --- Database ---
+    database_url: str = Field(
+        default="postgresql+asyncpg://gateway:gateway@postgres:5432/neuronetz",
+    )
+    database_pool_size: int = Field(default=10)
+    database_pool_overflow: int = Field(default=20)
+
+    # --- Redis ---
+    redis_url: str = Field(default="redis://redis:6379/0")
+    redis_key_cache_ttl_s: int = Field(default=60)
+
+    # --- Limits ---
+    default_rpm: int = Field(default=60)
+    default_tpm: int = Field(default=100_000)
+    default_concurrent: int = Field(default=8)
+    max_request_body_bytes: int = Field(default=262_144)
+    max_num_predict: int = Field(default=4096)
+
+    # --- Security ---
+    argon2_time_cost: int = Field(default=3)
+    argon2_memory_cost_kib: int = Field(default=65_536)
+    argon2_parallelism: int = Field(default=4)
+    auth_failure_rate_limit_per_ip_per_min: int = Field(default=20)
+
+    # --- Audit ---
+    audit_buffer_size: int = Field(default=1000)
+    prompt_log_default_retention_days: int = Field(default=30)
+    audit_log_default_retention_days: int = Field(default=365)
+
+    # --- Playground / docs (prod-safe defaults: both OFF) ---
+    playground_enabled: bool = Field(default=False)
+    playground_file: str = Field(default="/app/playground/index.html")
+    docs_enabled: bool = Field(default=False)
+
+    @property
+    def trusted_proxies_list(self) -> list[str]:
+        """Parse the comma-separated trusted-proxy list into individual hosts."""
+        return [p.strip() for p in self.gateway_trusted_proxies.split(",") if p.strip()]
+
+
+@lru_cache(maxsize=1)
+def get_settings() -> Settings:
+    """Return a cached Settings instance, constructed from the environment."""
+    return Settings()
diff --git a/src/neuronetz_gateway/db/__init__.py b/src/neuronetz_gateway/db/__init__.py
new file mode 100644
index 0000000..bcfa2ea
--- /dev/null
+++ b/src/neuronetz_gateway/db/__init__.py
@@ -0,0 +1,3 @@
+"""Database access layer: SQLAlchemy models, session factory, repositories."""
+
+from __future__ import annotations
diff --git a/src/neuronetz_gateway/db/models.py b/src/neuronetz_gateway/db/models.py
new file mode 100644
index 0000000..eba7f0a
--- /dev/null
+++ b/src/neuronetz_gateway/db/models.py
@@ -0,0 +1,292 @@
+"""SQLAlchemy 2.0 (async) ORM models for schema ``gateway`` per SPEC §5.
+
+These mirror the migration in ``alembic/versions/0001_initial.py`` exactly.
+The migration is the authoritative DDL; these models are for application use.
+"""
+
+from __future__ import annotations
+
+import datetime
+import enum
+import uuid
+
+from sqlalchemy import (
+    BigInteger,
+    Boolean,
+    ForeignKey,
+    Integer,
+    MetaData,
+    String,
+    Text,
+    text,
+)
+from sqlalchemy.dialects.postgresql import ARRAY, ENUM, INET, JSONB, TIMESTAMP, UUID
+from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
+
+GATEWAY_SCHEMA = "gateway"
+
+# Stable naming convention so Alembic autogenerate and ad-hoc DDL agree.
+_NAMING_CONVENTION = {
+    "ix": "ix_%(column_0_label)s",
+    "uq": "uq_%(table_name)s_%(column_0_name)s",
+    "ck": "ck_%(table_name)s_%(constraint_name)s",
+    "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
+    "pk": "pk_%(table_name)s",
+}
+
+
+class Base(DeclarativeBase):
+    """Declarative base; all tables live in the ``gateway`` schema."""
+
+    metadata = MetaData(schema=GATEWAY_SCHEMA, naming_convention=_NAMING_CONVENTION)
+
+
+class KeyStatus(enum.StrEnum):
+    """Lifecycle states for an API key (SPEC §5 ``gateway.key_status``)."""
+
+    active = "active"
+    disabled = "disabled"
+    revoked = "revoked"
+
+
+class TenantStatus(enum.StrEnum):
+    """Lifecycle states for a tenant (SPEC §5 ``gateway.tenant_status``)."""
+
+    active = "active"
+    suspended = "suspended"
+    closed = "closed"
+
+
+class BudgetPeriod(enum.StrEnum):
+    """Budget accounting periods (SPEC §5 ``gateway.budget_period``)."""
+
+    day = "day"
+    month = "month"
+    total = "total"
+
+
+# Reuse existing Postgres enum types (the migration creates them); do not let
+# SQLAlchemy try to CREATE TYPE again at runtime.
+_key_status_enum = ENUM(KeyStatus, name="key_status", schema=GATEWAY_SCHEMA, create_type=False)
+_tenant_status_enum = ENUM(
+    TenantStatus, name="tenant_status", schema=GATEWAY_SCHEMA, create_type=False
+)
+_budget_period_enum = ENUM(
+    BudgetPeriod, name="budget_period", schema=GATEWAY_SCHEMA, create_type=False
+)
+
+
+class Tenant(Base):
+    """A tenant: the top-level isolation and ownership boundary."""
+
+    __tablename__ = "tenants"
+
+    id: Mapped[uuid.UUID] = mapped_column(
+        UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()")
+    )
+    name: Mapped[str] = mapped_column(Text, nullable=False, unique=True)
+    status: Mapped[TenantStatus] = mapped_column(
+        _tenant_status_enum, nullable=False, server_default=text("'active'")
+    )
+    created_at: Mapped[datetime.datetime] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
+    )
+    tenant_metadata: Mapped[dict[str, object]] = mapped_column(
+        "metadata", JSONB, nullable=False, server_default=text("'{}'::jsonb")
+    )
+
+
+class TenantLimit(Base):
+    """Per-tenant default limits and retention policy."""
+
+    __tablename__ = "tenant_limits"
+
+    tenant_id: Mapped[uuid.UUID] = mapped_column(
+        UUID(as_uuid=True),
+        ForeignKey("tenants.id", ondelete="CASCADE"),
+        primary_key=True,
+    )
+    rpm: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("60"))
+    tpm: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("100000"))
+    concurrent: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("8"))
+    tokens_daily: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
+    tokens_monthly: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
+    tokens_total: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
+    allowed_models: Mapped[list[str]] = mapped_column(
+        ARRAY(Text), nullable=False, server_default=text("'{}'")
+    )
+    # When true, the tenant may use ANY model currently installed on the Ollama
+    # backend (resolved live via model discovery). When false (default), access is
+    # default-deny and restricted to ``allowed_models`` intersected with the live set.
+    allow_all_models: Mapped[bool] = mapped_column(
+        Boolean, nullable=False, server_default=text("false")
+    )
+    log_prompts_default: Mapped[bool] = mapped_column(
+        Boolean, nullable=False, server_default=text("false")
+    )
+    prompt_retention_days: Mapped[int] = mapped_column(
+        Integer, nullable=False, server_default=text("30")
+    )
+    audit_retention_days: Mapped[int] = mapped_column(
+        Integer, nullable=False, server_default=text("365")
+    )
+
+
+class ApiKey(Base):
+    """An API key belonging to a tenant. The full key is never stored."""
+
+    __tablename__ = "api_keys"
+
+    id: Mapped[uuid.UUID] = mapped_column(
+        UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()")
+    )
+    tenant_id: Mapped[uuid.UUID] = mapped_column(
+        UUID(as_uuid=True),
+        ForeignKey("tenants.id", ondelete="CASCADE"),
+        nullable=False,
+    )
+    prefix: Mapped[str] = mapped_column(Text, nullable=False, unique=True)
+    key_hash: Mapped[str] = mapped_column(Text, nullable=False)
+    name: Mapped[str] = mapped_column(Text, nullable=False)
+    status: Mapped[KeyStatus] = mapped_column(
+        _key_status_enum, nullable=False, server_default=text("'active'")
+    )
+    scopes: Mapped[list[str]] = mapped_column(
+        ARRAY(Text), nullable=False, server_default=text("'{chat,embeddings}'")
+    )
+    created_at: Mapped[datetime.datetime] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
+    )
+    last_used_at: Mapped[datetime.datetime | None] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=True
+    )
+    expires_at: Mapped[datetime.datetime | None] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=True
+    )
+    log_prompts: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
+    key_metadata: Mapped[dict[str, object]] = mapped_column(
+        "metadata", JSONB, nullable=False, server_default=text("'{}'::jsonb")
+    )
+
+
+class KeyLimit(Base):
+    """Per-key overrides; NULL columns inherit the tenant value."""
+
+    __tablename__ = "key_limits"
+
+    key_id: Mapped[uuid.UUID] = mapped_column(
+        UUID(as_uuid=True),
+        ForeignKey("api_keys.id", ondelete="CASCADE"),
+        primary_key=True,
+    )
+    rpm: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    tpm: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    concurrent: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    tokens_daily: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
+    tokens_monthly: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
+    tokens_total: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
+    allowed_models: Mapped[list[str] | None] = mapped_column(ARRAY(Text), nullable=True)
+    # NULL = inherit tenant's allow_all_models; otherwise overrides it for this key.
+    allow_all_models: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
+
+
+class BudgetUsage(Base):
+    """Token/request accounting per key, period, and period start."""
+
+    __tablename__ = "budget_usage"
+
+    key_id: Mapped[uuid.UUID] = mapped_column(
+        UUID(as_uuid=True),
+        ForeignKey("api_keys.id", ondelete="CASCADE"),
+        primary_key=True,
+    )
+    period: Mapped[BudgetPeriod] = mapped_column(_budget_period_enum, primary_key=True)
+    period_start: Mapped[datetime.datetime] = mapped_column(
+        TIMESTAMP(timezone=True), primary_key=True
+    )
+    tokens_in: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0"))
+    tokens_out: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0"))
+    requests: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0"))
+
+
+class AuditLog(Base):
+    """Always-on append-only request metadata log."""
+
+    __tablename__ = "audit_log"
+
+    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
+    ts: Mapped[datetime.datetime] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
+    )
+    request_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)
+    tenant_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
+    key_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
+    key_prefix: Mapped[str | None] = mapped_column(Text, nullable=True)
+    method: Mapped[str] = mapped_column(Text, nullable=False)
+    path: Mapped[str] = mapped_column(Text, nullable=False)
+    model: Mapped[str | None] = mapped_column(Text, nullable=True)
+    tokens_in: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    tokens_out: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    latency_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
+    status: Mapped[int] = mapped_column(Integer, nullable=False)
+    client_ip: Mapped[str | None] = mapped_column(INET, nullable=True)
+    user_agent: Mapped[str | None] = mapped_column(Text, nullable=True)
+    error_code: Mapped[str | None] = mapped_column(Text, nullable=True)
+
+
+class PromptLog(Base):
+    """Opt-in, TTL'd capture of request/response bodies."""
+
+    __tablename__ = "prompt_log"
+
+    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
+    audit_id: Mapped[int] = mapped_column(
+        BigInteger,
+        ForeignKey("audit_log.id", ondelete="CASCADE"),
+        nullable=False,
+    )
+    ts: Mapped[datetime.datetime] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
+    )
+    key_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)
+    request_body: Mapped[dict[str, object]] = mapped_column(JSONB, nullable=False)
+    response_text: Mapped[str | None] = mapped_column(Text, nullable=True)
+    retention_until: Mapped[datetime.datetime] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=False
+    )
+
+
+class Revocation(Base):
+    """Outbox table written by console (or gateway) to revoke a key.
+
+    An ``AFTER INSERT`` trigger fires ``pg_notify('key_revoked', key_id)``.
+    """
+
+    __tablename__ = "revocations"
+
+    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
+    key_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)
+    ts: Mapped[datetime.datetime] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=False, server_default=text("now()")
+    )
+    reason: Mapped[str | None] = mapped_column(String, nullable=True)
+    processed_at: Mapped[datetime.datetime | None] = mapped_column(
+        TIMESTAMP(timezone=True), nullable=True
+    )
+
+
+__all__ = [
+    "GATEWAY_SCHEMA",
+    "ApiKey",
+    "AuditLog",
+    "Base",
+    "BudgetPeriod",
+    "BudgetUsage",
+    "KeyLimit",
+    "KeyStatus",
+    "PromptLog",
+    "Revocation",
+    "Tenant",
+    "TenantLimit",
+    "TenantStatus",
+]
diff --git a/src/neuronetz_gateway/db/session.py b/src/neuronetz_gateway/db/session.py
new file mode 100644
index 0000000..0b074c4
--- /dev/null
+++ b/src/neuronetz_gateway/db/session.py
@@ -0,0 +1,53 @@
+"""Async SQLAlchemy engine and session factory construction.
+
+Phase 1 provides the wiring only; the lifespan owns the engine instance and
+stores it on ``app.state``. Business-logic callers should depend on the
+session factory via ``deps.py``.
+"""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from contextlib import asynccontextmanager
+
+from sqlalchemy.ext.asyncio import (
+    AsyncEngine,
+    AsyncSession,
+    async_sessionmaker,
+    create_async_engine,
+)
+
+from neuronetz_gateway.config import Settings
+
+
+def create_engine(settings: Settings) -> AsyncEngine:
+    """Build the async engine from settings (asyncpg driver, pooled)."""
+    return create_async_engine(
+        settings.database_url,
+        pool_size=settings.database_pool_size,
+        max_overflow=settings.database_pool_overflow,
+        pool_pre_ping=True,
+        future=True,
+    )
+
+
+def create_session_factory(engine: AsyncEngine) -> async_sessionmaker[AsyncSession]:
+    """Build a session factory bound to the given engine."""
+    return async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession)
+
+
+@asynccontextmanager
+async def session_scope(
+    factory: async_sessionmaker[AsyncSession],
+) -> AsyncIterator[AsyncSession]:
+    """Provide a transactional session scope, committing on success."""
+    async with factory() as session:
+        try:
+            yield session
+            await session.commit()
+        except Exception:
+            await session.rollback()
+            raise
+
+
+__all__ = ["create_engine", "create_session_factory", "session_scope"]
diff --git a/src/neuronetz_gateway/deps.py b/src/neuronetz_gateway/deps.py
new file mode 100644
index 0000000..a93e5d6
--- /dev/null
+++ b/src/neuronetz_gateway/deps.py
@@ -0,0 +1,180 @@
+"""FastAPI dependency-injection providers.
+
+Exposes typed accessors for the handles placed on ``app.state`` by the lifespan
+(Redis, the upstream httpx client, the DB session factory, the discovery cache)
+plus the request principal and the proxy client.
+
+QA override contract
+--------------------
+Routes obtain the upstream proxy via :func:`get_ollama_client`. Tests override
+the *Ollama backend* by overriding this provider::
+
+    from neuronetz_gateway.deps import get_ollama_client
+    from neuronetz_gateway.proxy.ollama import OllamaClient
+    import httpx
+    from tests.integration.mock_ollama import create_mock_ollama
+
+    transport = httpx.ASGITransport(app=create_mock_ollama())
+    mock_http = httpx.AsyncClient(transport=transport, base_url="http://ollama")
+    app.dependency_overrides[get_ollama_client] = lambda: OllamaClient(mock_http)
+
+Because ``get_ollama_client`` returns a fully-built :class:`OllamaClient`, an
+override needs no access to ``app.state`` and can point at the in-process mock.
+"""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterator
+from typing import Annotated
+
+import httpx
+import redis.asyncio as redis
+from fastapi import Depends, Request
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
+
+from neuronetz_gateway.audit.writer import AuditWriter
+from neuronetz_gateway.auth.principal import Principal
+from neuronetz_gateway.budget.counter import BudgetCounter
+from neuronetz_gateway.config import Settings, get_settings
+from neuronetz_gateway.errors import AuthenticationError, DependencyUnavailableError
+from neuronetz_gateway.proxy.discovery import DiscoveryCache
+from neuronetz_gateway.proxy.ollama import OllamaClient
+from neuronetz_gateway.proxy.pipeline import Pipeline
+from neuronetz_gateway.ratelimit.concurrency import ConcurrencyLimiter
+from neuronetz_gateway.ratelimit.sliding_window import SlidingWindowLimiter
+
+
+def get_config() -> Settings:
+    """Provide the cached application settings."""
+    return get_settings()
+
+
+def get_redis(request: Request) -> redis.Redis:
+    """Provide the shared Redis client, failing closed if unavailable."""
+    client: redis.Redis | None = getattr(request.app.state, "redis", None)
+    if client is None:
+        raise DependencyUnavailableError(internal_detail="redis client not initialised")
+    return client
+
+
+def get_http_client(request: Request) -> httpx.AsyncClient:
+    """Provide the shared upstream httpx client."""
+    client: httpx.AsyncClient | None = getattr(request.app.state, "http_client", None)
+    if client is None:
+        raise DependencyUnavailableError(internal_detail="http client not initialised")
+    return client
+
+
+def get_ollama_client(request: Request) -> OllamaClient:
+    """Provide the upstream Ollama proxy client (override target for tests)."""
+    return OllamaClient(get_http_client(request))
+
+
+def get_discovery_cache(request: Request) -> DiscoveryCache:
+    """Provide the in-process discovery cache; fail closed if absent."""
+    cache: DiscoveryCache | None = getattr(request.app.state, "discovery_cache", None)
+    if cache is None:
+        raise DependencyUnavailableError(internal_detail="discovery cache not initialised")
+    return cache
+
+
+def get_principal(request: Request) -> Principal:
+    """Return the authenticated principal placed on ``request.state``.
+
+    The auth middleware attaches it before routing; its absence on a non-exempt
+    route is a programming error, so we fail closed with a 401.
+    """
+    principal: Principal | None = getattr(request.state, "principal", None)
+    if principal is None:
+        raise AuthenticationError(internal_detail="principal missing on authenticated route")
+    return principal
+
+
+def get_audit_writer(request: Request) -> AuditWriter:
+    """Provide the shared buffered audit writer; fail closed if absent."""
+    writer: AuditWriter | None = getattr(request.app.state, "audit_writer", None)
+    if writer is None:
+        raise DependencyUnavailableError(internal_detail="audit writer not initialised")
+    return writer
+
+
+def get_pipeline(
+    request: Request,
+    principal: Annotated[Principal, Depends(get_principal)],
+    settings: Annotated[Settings, Depends(get_config)],
+    ollama: Annotated[OllamaClient, Depends(get_ollama_client)],
+    discovery: Annotated[DiscoveryCache, Depends(get_discovery_cache)],
+    redis_client: Annotated[redis.Redis, Depends(get_redis)],
+    audit: Annotated[AuditWriter, Depends(get_audit_writer)],
+) -> Pipeline:
+    """Assemble a per-request enforcement + proxy pipeline.
+
+    The pipeline owns all hot-path checks (rate limit, budget, concurrency,
+    model/endpoint allowlist) and the streaming-with-bookkeeping contract.
+    Audit deny-mode flips this to fail closed at the route layer.
+    """
+    sessionmaker: async_sessionmaker[AsyncSession] | None = getattr(
+        request.app.state, "db_sessionmaker", None
+    )
+    return Pipeline(
+        request=request,
+        principal=principal,
+        settings=settings,
+        ollama=ollama,
+        discovery=discovery,
+        rate_limiter=SlidingWindowLimiter(redis_client),
+        concurrency=ConcurrencyLimiter(redis_client),
+        budget=BudgetCounter(redis_client),
+        audit=audit,
+        sessionmaker=sessionmaker,
+    )
+
+
+def _get_sessionmaker(request: Request) -> async_sessionmaker[AsyncSession]:
+    """Return the session factory or fail closed if the engine is absent."""
+    factory: async_sessionmaker[AsyncSession] | None = getattr(
+        request.app.state, "db_sessionmaker", None
+    )
+    if factory is None:
+        raise DependencyUnavailableError(internal_detail="db session factory not initialised")
+    return factory
+
+
+async def get_db_session(request: Request) -> AsyncIterator[AsyncSession]:
+    """Provide a request-scoped async DB session."""
+    factory = _get_sessionmaker(request)
+    async with factory() as session:
+        yield session
+
+
+ConfigDep = Annotated[Settings, Depends(get_config)]
+RedisDep = Annotated[redis.Redis, Depends(get_redis)]
+HttpClientDep = Annotated[httpx.AsyncClient, Depends(get_http_client)]
+OllamaClientDep = Annotated[OllamaClient, Depends(get_ollama_client)]
+DiscoveryCacheDep = Annotated[DiscoveryCache, Depends(get_discovery_cache)]
+PrincipalDep = Annotated[Principal, Depends(get_principal)]
+AuditWriterDep = Annotated[AuditWriter, Depends(get_audit_writer)]
+PipelineDep = Annotated[Pipeline, Depends(get_pipeline)]
+DbSessionDep = Annotated[AsyncSession, Depends(get_db_session)]
+
+
+__all__ = [
+    "AuditWriterDep",
+    "ConfigDep",
+    "DbSessionDep",
+    "DiscoveryCacheDep",
+    "HttpClientDep",
+    "OllamaClientDep",
+    "PipelineDep",
+    "PrincipalDep",
+    "RedisDep",
+    "get_audit_writer",
+    "get_config",
+    "get_db_session",
+    "get_discovery_cache",
+    "get_http_client",
+    "get_ollama_client",
+    "get_pipeline",
+    "get_principal",
+    "get_redis",
+]
diff --git a/src/neuronetz_gateway/errors.py b/src/neuronetz_gateway/errors.py
new file mode 100644
index 0000000..45d0ccd
--- /dev/null
+++ b/src/neuronetz_gateway/errors.py
@@ -0,0 +1,179 @@
+"""Exception types and FastAPI exception handlers.
+
+Hard rule (SPEC §3, AGENT_PROMPT non-negotiable #4): never leak upstream or
+internal error details to the client. Every error response is a generic,
+sanitized JSON body carrying only a stable ``error.code``, a safe message, and
+the request id. Detailed context is logged server-side, never returned.
+"""
+
+from __future__ import annotations
+
+from fastapi import FastAPI, Request, status
+from fastapi.responses import JSONResponse
+
+from neuronetz_gateway.observability.logging import get_logger
+
+_log = get_logger("errors")
+
+
+class GatewayError(Exception):
+    """Base class for gateway errors that map to a sanitized HTTP response.
+
+    ``message`` MUST be safe to return to clients. Anything sensitive belongs
+    in ``internal_detail`` which is logged but never serialized to the client.
+    """
+
+    status_code: int = status.HTTP_500_INTERNAL_SERVER_ERROR
+    code: str = "internal_error"
+    message: str = "An internal error occurred."
+
+    def __init__(self, message: str | None = None, *, internal_detail: str | None = None) -> None:
+        super().__init__(message or self.message)
+        if message is not None:
+            self.message = message
+        self.internal_detail = internal_detail
+
+
+class AuthenticationError(GatewayError):
+    """Missing/invalid credentials. Fail closed, no detail."""
+
+    status_code = status.HTTP_401_UNAUTHORIZED
+    code = "unauthorized"
+    message = "Authentication required."
+
+
+class AuthorizationError(GatewayError):
+    """Authenticated but not permitted (scope/model/endpoint denied)."""
+
+    status_code = status.HTTP_403_FORBIDDEN
+    code = "forbidden"
+    message = "This request is not permitted."
+
+
+class RateLimitError(GatewayError):
+    """Rate limit exceeded. Handler attaches ``Retry-After`` when known."""
+
+    status_code = status.HTTP_429_TOO_MANY_REQUESTS
+    code = "rate_limited"
+    message = "Rate limit exceeded."
+
+    def __init__(
+        self,
+        message: str | None = None,
+        *,
+        retry_after: int | None = None,
+        internal_detail: str | None = None,
+    ) -> None:
+        super().__init__(message, internal_detail=internal_detail)
+        self.retry_after = retry_after
+
+
+class BudgetExceededError(GatewayError):
+    """Token budget exhausted for the active period."""
+
+    status_code = status.HTTP_429_TOO_MANY_REQUESTS
+    code = "budget_exceeded"
+    message = "Token budget exhausted for the current period."
+
+
+class RequestTooLargeError(GatewayError):
+    """Request body exceeds the configured limit."""
+
+    status_code = status.HTTP_413_REQUEST_ENTITY_TOO_LARGE
+    code = "request_too_large"
+    message = "Request body is too large."
+
+
+class UpstreamUnavailableError(GatewayError):
+    """Ollama (or another dependency) is unreachable. Fail closed."""
+
+    status_code = status.HTTP_502_BAD_GATEWAY
+    code = "upstream_unavailable"
+    message = "The upstream service is temporarily unavailable."
+
+
+class DependencyUnavailableError(GatewayError):
+    """A required backend (DB/Redis) is unavailable; serve 503, fail closed."""
+
+    status_code = status.HTTP_503_SERVICE_UNAVAILABLE
+    code = "service_unavailable"
+    message = "The service is temporarily unavailable."
+
+
+def _request_id(request: Request) -> str:
+    """Extract the request id placed on ``request.state`` by middleware."""
+    rid = getattr(request.state, "request_id", None)
+    return str(rid) if rid else ""
+
+
+def _error_response(
+    request: Request,
+    *,
+    status_code: int,
+    code: str,
+    message: str,
+    extra_headers: dict[str, str] | None = None,
+) -> JSONResponse:
+    """Build a sanitized JSON error response with the request id header."""
+    request_id = _request_id(request)
+    headers = {"X-Request-ID": request_id} if request_id else {}
+    if extra_headers:
+        headers.update(extra_headers)
+    return JSONResponse(
+        status_code=status_code,
+        content={"error": {"code": code, "message": message, "request_id": request_id}},
+        headers=headers,
+    )
+
+
+async def _gateway_error_handler(request: Request, exc: GatewayError) -> JSONResponse:
+    """Render a ``GatewayError`` as a sanitized response."""
+    if exc.internal_detail:
+        _log.warning(
+            "gateway_error",
+            code=exc.code,
+            status_code=exc.status_code,
+            internal_detail=exc.internal_detail,
+        )
+    extra: dict[str, str] | None = None
+    if isinstance(exc, RateLimitError) and exc.retry_after is not None:
+        extra = {"Retry-After": str(exc.retry_after)}
+    return _error_response(
+        request,
+        status_code=exc.status_code,
+        code=exc.code,
+        message=exc.message,
+        extra_headers=extra,
+    )
+
+
+async def _unhandled_error_handler(request: Request, exc: Exception) -> JSONResponse:
+    """Catch-all: log the real exception, return a generic 500. No leakage."""
+    _log.error("unhandled_exception", exc_info=exc)
+    return _error_response(
+        request,
+        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+        code="internal_error",
+        message="An internal error occurred.",
+    )
+
+
+def register_exception_handlers(app: FastAPI) -> None:
+    """Attach the gateway's sanitizing exception handlers to the app."""
+    # mypy: FastAPI's add_exception_handler accepts these handler signatures;
+    # the stubs are intentionally broad, so casts are unnecessary here.
+    app.add_exception_handler(GatewayError, _gateway_error_handler)  # type: ignore[arg-type]  # handler typed for GatewayError subclass
+    app.add_exception_handler(Exception, _unhandled_error_handler)
+
+
+__all__ = [
+    "AuthenticationError",
+    "AuthorizationError",
+    "BudgetExceededError",
+    "DependencyUnavailableError",
+    "GatewayError",
+    "RateLimitError",
+    "RequestTooLargeError",
+    "UpstreamUnavailableError",
+    "register_exception_handlers",
+]
diff --git a/src/neuronetz_gateway/lifespan.py b/src/neuronetz_gateway/lifespan.py
new file mode 100644
index 0000000..9912cd5
--- /dev/null
+++ b/src/neuronetz_gateway/lifespan.py
@@ -0,0 +1,131 @@
+"""Application lifespan: connect/dispose backends and run background tasks.
+
+Startup connects Postgres + Redis + the upstream httpx client, builds the
+argon2 hasher and the buffered audit writer, and launches the background tasks:
+the model-discovery poller (SPEC §4.6) and the Postgres revocation NOTIFY
+listener (SPEC §4.5). Connection failures are tolerated so ``/healthz`` always
+serves; ``/readyz`` reports true readiness. All handles live on ``app.state``.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import contextlib
+from collections.abc import AsyncIterator
+from contextlib import asynccontextmanager
+from typing import TYPE_CHECKING
+
+import httpx
+import redis.asyncio as redis
+
+from neuronetz_gateway.audit.writer import AuditWriter
+from neuronetz_gateway.auth.hashing import build_hasher
+from neuronetz_gateway.config import Settings, get_settings
+from neuronetz_gateway.db.session import create_engine, create_session_factory
+from neuronetz_gateway.observability.logging import get_logger
+from neuronetz_gateway.proxy.discovery import DiscoveryCache, discovery_loop
+from neuronetz_gateway.revocation import revocation_listener
+
+if TYPE_CHECKING:
+    from fastapi import FastAPI
+
+_log = get_logger("lifespan")
+
+
+def _build_http_client(settings: Settings) -> httpx.AsyncClient:
+    """Construct the shared httpx client used to reach Ollama."""
+    timeout = httpx.Timeout(
+        connect=settings.ollama_connect_timeout_s,
+        read=settings.ollama_read_timeout_s,
+        write=settings.ollama_read_timeout_s,
+        pool=settings.ollama_connect_timeout_s,
+    )
+    limits = httpx.Limits(max_connections=settings.ollama_max_connections)
+    return httpx.AsyncClient(base_url=settings.ollama_base_url, timeout=timeout, limits=limits)
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI) -> AsyncIterator[None]:
+    """Manage startup/shutdown of all backends and background tasks."""
+    settings: Settings = get_settings()
+    app.state.settings = settings
+    app.state.hasher = build_hasher(settings)
+    app.state.discovery_cache = DiscoveryCache()
+    tasks: list[asyncio.Task[None]] = []
+
+    try:
+        engine = create_engine(settings)
+        app.state.db_engine = engine
+        app.state.db_sessionmaker = create_session_factory(engine)
+    except Exception as exc:  # noqa: BLE001 - tolerate so /healthz still serves
+        _log.error("db_engine_init_failed", error=str(exc))
+        app.state.db_engine = None
+        app.state.db_sessionmaker = None
+
+    try:
+        app.state.redis = redis.from_url(settings.redis_url, decode_responses=True)
+    except Exception as exc:  # noqa: BLE001 - tolerate so /healthz still serves
+        _log.error("redis_init_failed", error=str(exc))
+        app.state.redis = None
+
+    app.state.http_client = _build_http_client(settings)
+
+    audit_writer = AuditWriter(settings.audit_buffer_size, app.state.db_sessionmaker)
+    audit_writer.start()
+    app.state.audit_writer = audit_writer
+
+    # Background tasks (cancelled on shutdown).
+    tasks.append(
+        asyncio.create_task(
+            discovery_loop(
+                app.state.http_client, app.state.redis, app.state.discovery_cache, settings
+            )
+        )
+    )
+    if app.state.redis is not None and app.state.db_sessionmaker is not None:
+        tasks.append(
+            asyncio.create_task(
+                revocation_listener(settings, app.state.redis, app.state.db_sessionmaker)
+            )
+        )
+    app.state.background_tasks = tasks
+
+    _log.info("gateway_startup_complete")
+    try:
+        yield
+    finally:
+        await _shutdown(app, tasks, audit_writer)
+
+
+async def _shutdown(
+    app: FastAPI, tasks: list[asyncio.Task[None]], audit_writer: AuditWriter
+) -> None:
+    """Cancel background tasks and dispose of all backend handles."""
+    for task in tasks:
+        task.cancel()
+    for task in tasks:
+        with contextlib.suppress(asyncio.CancelledError):
+            await task
+
+    with contextlib.suppress(Exception):
+        await audit_writer.stop()
+
+    http_client: httpx.AsyncClient | None = getattr(app.state, "http_client", None)
+    if http_client is not None:
+        with contextlib.suppress(Exception):
+            await http_client.aclose()
+
+    redis_client = getattr(app.state, "redis", None)
+    if redis_client is not None:
+        with contextlib.suppress(Exception):
+            await redis_client.aclose()
+
+    engine = getattr(app.state, "db_engine", None)
+    if engine is not None:
+        with contextlib.suppress(Exception):
+            await engine.dispose()
+
+    _log.info("gateway_shutdown_complete")
+
+
+__all__ = ["lifespan"]
diff --git a/src/neuronetz_gateway/observability/__init__.py b/src/neuronetz_gateway/observability/__init__.py
new file mode 100644
index 0000000..b10f28d
--- /dev/null
+++ b/src/neuronetz_gateway/observability/__init__.py
@@ -0,0 +1,3 @@
+"""Observability: structured logging and Prometheus metrics."""
+
+from __future__ import annotations
diff --git a/src/neuronetz_gateway/observability/logging.py b/src/neuronetz_gateway/observability/logging.py
new file mode 100644
index 0000000..e9b25f8
--- /dev/null
+++ b/src/neuronetz_gateway/observability/logging.py
@@ -0,0 +1,48 @@
+"""structlog configuration.
+
+Renders JSON in production (``GATEWAY_LOG_FORMAT=json``) and a human-friendly
+console format in development. No secrets are ever logged; processors here
+must not introduce any.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+import structlog
+
+
+def configure_logging(level: str = "INFO", fmt: str = "json") -> None:
+    """Configure stdlib logging and structlog according to settings."""
+    log_level = getattr(logging, level.upper(), logging.INFO)
+    logging.basicConfig(format="%(message)s", level=log_level)
+
+    shared_processors: list[structlog.types.Processor] = [
+        structlog.contextvars.merge_contextvars,
+        structlog.processors.add_log_level,
+        structlog.processors.TimeStamper(fmt="iso", utc=True),
+        structlog.processors.StackInfoRenderer(),
+        structlog.processors.format_exc_info,
+    ]
+
+    renderer: structlog.types.Processor
+    if fmt == "console":
+        renderer = structlog.dev.ConsoleRenderer()
+    else:
+        renderer = structlog.processors.JSONRenderer()
+
+    structlog.configure(
+        processors=[*shared_processors, renderer],
+        wrapper_class=structlog.make_filtering_bound_logger(log_level),
+        logger_factory=structlog.PrintLoggerFactory(),
+        cache_logger_on_first_use=True,
+    )
+
+
+def get_logger(name: str | None = None) -> Any:  # noqa: ANN401 - structlog returns a dynamic proxy
+    """Return a bound structlog logger."""
+    return structlog.get_logger(name)
+
+
+__all__ = ["configure_logging", "get_logger"]
diff --git a/src/neuronetz_gateway/routes/__init__.py b/src/neuronetz_gateway/routes/__init__.py
new file mode 100644
index 0000000..7c091ce
--- /dev/null
+++ b/src/neuronetz_gateway/routes/__init__.py
@@ -0,0 +1,3 @@
+"""HTTP route modules: health, native Ollama passthrough, OpenAI-compat."""
+
+from __future__ import annotations
diff --git a/src/neuronetz_gateway/routes/health.py b/src/neuronetz_gateway/routes/health.py
new file mode 100644
index 0000000..771c92a
--- /dev/null
+++ b/src/neuronetz_gateway/routes/health.py
@@ -0,0 +1,114 @@
+"""Health, readiness, and metrics endpoints (SPEC §6.4).
+
+- ``GET /healthz``  : liveness — always 200 if the process can respond.
+- ``GET /readyz``   : readiness — 200 only if Postgres + Redis + Ollama are all
+                      reachable; otherwise 503 with which dependencies are down.
+                      In Phase 1 dev there is no Ollama, so 503 is expected.
+- ``GET /metrics``  : Prometheus exposition. (Loopback-only IP check deferred.)
+
+None of these endpoints require auth and none leak secrets or internal detail.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Awaitable
+from typing import Literal, cast
+
+import httpx
+import redis.asyncio as redis
+from fastapi import APIRouter, Request, Response, status
+from pydantic import BaseModel
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
+
+from neuronetz_gateway.observability.logging import get_logger
+from neuronetz_gateway.observability.metrics import CONTENT_TYPE_LATEST, render_latest
+
+router = APIRouter(tags=["health"])
+_log = get_logger("health")
+
+
+class HealthResponse(BaseModel):
+    """Liveness response body."""
+
+    status: Literal["ok"] = "ok"
+
+
+class ReadyResponse(BaseModel):
+    """Readiness response body. ``checks`` maps dependency -> reachable bool."""
+
+    status: Literal["ready", "not_ready"]
+    checks: dict[str, bool]
+
+
+@router.get("/healthz", response_model=HealthResponse, status_code=status.HTTP_200_OK)
+async def healthz() -> HealthResponse:
+    """Liveness probe — always returns 200 while the process is responsive."""
+    return HealthResponse()
+
+
+async def _check_postgres(app_state: object) -> bool:
+    """Return True if a trivial query succeeds against Postgres."""
+    factory: async_sessionmaker[AsyncSession] | None = getattr(
+        app_state, "db_sessionmaker", None
+    )
+    if factory is None:
+        return False
+    try:
+        async with factory() as session:
+            await session.execute(text("SELECT 1"))
+        return True
+    except Exception as exc:  # noqa: BLE001 - any failure means not ready
+        _log.warning("readyz_postgres_unreachable", error=str(exc))
+        return False
+
+
+async def _check_redis(app_state: object) -> bool:
+    """Return True if Redis answers PING."""
+    client: redis.Redis | None = getattr(app_state, "redis", None)
+    if client is None:
+        return False
+    try:
+        # redis-py types ping() as Awaitable[bool] | bool (sync+async share stubs);
+        # the asyncio client always returns an awaitable at runtime.
+        return bool(await cast("Awaitable[bool]", client.ping()))
+    except Exception as exc:  # noqa: BLE001 - any failure means not ready
+        _log.warning("readyz_redis_unreachable", error=str(exc))
+        return False
+
+
+async def _check_ollama(app_state: object) -> bool:
+    """Return True if Ollama's root endpoint is reachable."""
+    client: httpx.AsyncClient | None = getattr(app_state, "http_client", None)
+    if client is None:
+        return False
+    try:
+        resp = await client.get("/")
+        return resp.status_code < 500
+    except Exception as exc:  # noqa: BLE001 - any failure means not ready
+        _log.warning("readyz_ollama_unreachable", error=str(exc))
+        return False
+
+
+@router.get("/readyz", response_model=ReadyResponse)
+async def readyz(request: Request, response: Response) -> ReadyResponse:
+    """Readiness probe — 200 only if every dependency is reachable, else 503."""
+    app_state = request.app.state
+    checks = {
+        "postgres": await _check_postgres(app_state),
+        "redis": await _check_redis(app_state),
+        "ollama": await _check_ollama(app_state),
+    }
+    all_ready = all(checks.values())
+    if not all_ready:
+        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
+    return ReadyResponse(status="ready" if all_ready else "not_ready", checks=checks)
+
+
+@router.get("/metrics")
+async def metrics() -> Response:
+    """Prometheus exposition. Loopback-only enforcement is deferred to Phase 4."""
+    return Response(content=render_latest(), media_type=CONTENT_TYPE_LATEST)
+
+
+__all__ = ["router"]