From d79f17b3bb3075cf9f880cf0831580b566c987fe Mon Sep 17 00:00:00 2001 From: Stephan Berbig Date: Tue, 26 May 2026 20:50:35 +0200 Subject: [PATCH] scaffold: project skeleton, schema, healthz/readyz, CI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initial project structure for neuronetz-gateway per scope-docs/SPEC.md: - Python 3.12 / FastAPI / SQLAlchemy 2.0 (async) / Redis / Postgres stack managed by uv. Multi-stage non-root Dockerfile, prod + dev compose files (ollama service is NEVER published in either), Caddyfile + systemd unit, justfile, GitHub Actions CI (ruff, mypy --strict, pytest, bandit, pip-audit). - Pydantic-Settings config covering every env var from SPEC §7, including the MODEL_DISCOVERY_* keys for the dynamic-discovery feature (§4.6). - Alembic 0001_initial creates the full gateway schema (8 tables, 3 enums, notify_key_revoked() trigger), incl. allow_all_models on tenant_limits and key_limits for the per-tenant auto-grant toggle. - Working /healthz, /readyz (fail-closed when deps unreachable), and a Prometheus /metrics stub. Sanitizing error handlers that attach X-Request-ID to every response and never leak upstream internals. - SPEC + AGENT_PROMPT included under scope-docs/ (source of truth). --- .dockerignore | 44 ++ .env.example | 63 ++ .github/workflows/ci.yml | 108 ++++ .gitignore | 40 ++ Dockerfile | 97 +++ LICENSE | 202 ++++++ README.md | 92 +++ alembic.ini | 49 ++ alembic/env.py | 97 +++ alembic/versions/0001_initial.py | 342 ++++++++++ docker-compose.dev.yml | 101 +++ docker-compose.yml | 152 +++++ justfile | 60 ++ ops/caddy/Caddyfile.example | 59 ++ ops/systemd/neuronetz-gateway.service | 58 ++ pyproject.toml | 94 +++ scope-docs/AGENT_PROMPT.md | 121 ++++ scope-docs/SPEC.md | 593 ++++++++++++++++++ src/neuronetz_gateway/__init__.py | 7 + src/neuronetz_gateway/__main__.py | 28 + src/neuronetz_gateway/app.py | 111 ++++ src/neuronetz_gateway/config.py | 86 +++ src/neuronetz_gateway/db/__init__.py | 3 + src/neuronetz_gateway/db/models.py | 292 +++++++++ src/neuronetz_gateway/db/session.py | 53 ++ src/neuronetz_gateway/deps.py | 180 ++++++ src/neuronetz_gateway/errors.py | 179 ++++++ src/neuronetz_gateway/lifespan.py | 131 ++++ .../observability/__init__.py | 3 + .../observability/logging.py | 48 ++ src/neuronetz_gateway/routes/__init__.py | 3 + src/neuronetz_gateway/routes/health.py | 114 ++++ 32 files changed, 3610 insertions(+) create mode 100644 .dockerignore create mode 100644 .env.example create mode 100644 .github/workflows/ci.yml create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 README.md create mode 100644 alembic.ini create mode 100644 alembic/env.py create mode 100644 alembic/versions/0001_initial.py create mode 100644 docker-compose.dev.yml create mode 100644 docker-compose.yml create mode 100644 justfile create mode 100644 ops/caddy/Caddyfile.example create mode 100644 ops/systemd/neuronetz-gateway.service create mode 100644 pyproject.toml create mode 100644 scope-docs/AGENT_PROMPT.md create mode 100644 scope-docs/SPEC.md create mode 100644 src/neuronetz_gateway/__init__.py create mode 100644 src/neuronetz_gateway/__main__.py create mode 100644 src/neuronetz_gateway/app.py create mode 100644 src/neuronetz_gateway/config.py create mode 100644 src/neuronetz_gateway/db/__init__.py create mode 100644 src/neuronetz_gateway/db/models.py create mode 100644 src/neuronetz_gateway/db/session.py create mode 100644 src/neuronetz_gateway/deps.py create mode 100644 src/neuronetz_gateway/errors.py create mode 100644 src/neuronetz_gateway/lifespan.py create mode 100644 src/neuronetz_gateway/observability/__init__.py create mode 100644 src/neuronetz_gateway/observability/logging.py create mode 100644 src/neuronetz_gateway/routes/__init__.py create mode 100644 src/neuronetz_gateway/routes/health.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..ce0bba4 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,44 @@ +# Keep the build context lean and never ship secrets into an image layer. + +# Secrets / local env +.env +.env.* +!.env.example + +# VCS & CI +.git +.gitignore +.github + +# Python caches & build artefacts +__pycache__/ +*.py[cod] +*.egg-info/ +.eggs/ +build/ +dist/ +.venv/ +venv/ +.mypy_cache/ +.ruff_cache/ +.pytest_cache/ +.coverage +htmlcov/ +coverage.xml + +# Tests & docs are not needed in the runtime image +tests/ +docs/ +scope-docs/ + +# Editor / OS cruft +.idea/ +.vscode/ +*.swp +.DS_Store + +# Compose / ops files don't belong in the image +docker-compose*.yml +ops/ +# NOTE: README.md and LICENSE are intentionally NOT ignored — the build backend +# (hatchling) reads `readme`/`license` from pyproject.toml at build time. diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..825036d --- /dev/null +++ b/.env.example @@ -0,0 +1,63 @@ +# neuronetz-gateway — environment configuration (SPEC §7). +# +# Copy to `.env` and adjust. `.env` is gitignored and MUST NOT be committed. +# All values here are SAFE EXAMPLES — change every secret before any real deploy. + +# ──────────────────────────── Service ──────────────────────────── +GATEWAY_BIND_HOST=0.0.0.0 +GATEWAY_BIND_PORT=8080 +GATEWAY_LOG_LEVEL=INFO +GATEWAY_LOG_FORMAT=json # json|console +GATEWAY_REQUEST_ID_HEADER=X-Request-ID +GATEWAY_TRUSTED_PROXIES=127.0.0.1,caddy # for X-Forwarded-For + +# ──────────────────────────── Upstream ─────────────────────────── +OLLAMA_BASE_URL=http://ollama:11434 +OLLAMA_CONNECT_TIMEOUT_S=5 +OLLAMA_READ_TIMEOUT_S=600 +OLLAMA_MAX_CONNECTIONS=64 + +# ──────────────────────── Model discovery (§4.6) ───────────────── +MODEL_DISCOVERY_REFRESH_S=60 +MODEL_DISCOVERY_CACHE_TTL_S=120 + +# ──────────────────────────── Database ─────────────────────────── +# Compose builds DATABASE_URL from the POSTGRES_* parts below, but the gateway +# also accepts a full DATABASE_URL directly. +DATABASE_URL=postgresql+asyncpg://gateway:changeme@postgres:5432/neuronetz +DATABASE_POOL_SIZE=10 +DATABASE_POOL_OVERFLOW=20 + +# Postgres container credentials (consumed by docker-compose). +POSTGRES_USER=gateway +POSTGRES_PASSWORD=changeme +POSTGRES_DB=neuronetz + +# ──────────────────────────── Redis ────────────────────────────── +REDIS_URL=redis://redis:6379/0 +REDIS_KEY_CACHE_TTL_S=60 + +# ────────────────── Limits (defaults; DB overrides) ────────────── +DEFAULT_RPM=60 +DEFAULT_TPM=100000 +DEFAULT_CONCURRENT=8 +MAX_REQUEST_BODY_BYTES=262144 +MAX_NUM_PREDICT=4096 + +# ──────────────────────────── Security ─────────────────────────── +ARGON2_TIME_COST=3 +ARGON2_MEMORY_COST_KIB=65536 +ARGON2_PARALLELISM=4 +AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN=20 + +# ──────────────────────────── Audit ────────────────────────────── +AUDIT_BUFFER_SIZE=1000 +PROMPT_LOG_DEFAULT_RETENTION_DAYS=30 +AUDIT_LOG_DEFAULT_RETENTION_DAYS=365 + +# ──────────────── Playground / API docs (prod-safe: OFF) ───────── +# Serve the playground HTML (owned by the docs agent) at /playground. +PLAYGROUND_ENABLED=false +PLAYGROUND_FILE=/app/playground/index.html +# Enable FastAPI's /docs + /openapi.json (default off in production). +DOCS_ENABLED=false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..553aa6e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,108 @@ +name: CI + +on: + push: + branches: ["**"] + pull_request: + workflow_dispatch: + +# Cancel superseded runs on the same ref. +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + +env: + PYTHON_VERSION: "3.12" + +jobs: + lint: + name: ruff + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - name: Set up Python + run: uv python install ${{ env.PYTHON_VERSION }} + - name: Install dependencies + run: uv sync --extra dev + - name: ruff check + run: uv run ruff check . + + typecheck: + name: mypy --strict + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - name: Set up Python + run: uv python install ${{ env.PYTHON_VERSION }} + - name: Install dependencies + run: uv sync --extra dev + - name: mypy + run: uv run mypy --strict src + + test: + name: pytest + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - name: Set up Python + run: uv python install ${{ env.PYTHON_VERSION }} + - name: Install dependencies + run: uv sync --extra dev + # Phase 1: an empty/placeholder suite must pass. pytest exits 5 when it + # collects no tests; we treat that as success this phase. Coverage is + # reported but not gated yet (no --cov-fail-under until later phases). + - name: pytest + shell: bash + run: | + set +e + uv run pytest --cov=neuronetz_gateway --cov-report=term-missing + code=$? + if [ "$code" -eq 5 ]; then + echo "::notice::No tests collected (Phase 1) — treating as success." + exit 0 + fi + exit "$code" + + bandit: + name: bandit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - name: Set up Python + run: uv python install ${{ env.PYTHON_VERSION }} + - name: Install dependencies + run: uv sync --extra dev + - name: bandit + run: uv run bandit -q -r src + + pip-audit: + name: pip-audit + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - name: Set up Python + run: uv python install ${{ env.PYTHON_VERSION }} + - name: Install dependencies + run: uv sync --extra dev + - name: pip-audit + run: uv run pip-audit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4024058 --- /dev/null +++ b/.gitignore @@ -0,0 +1,40 @@ +# Secrets — NEVER commit. Only .env.example is tracked. +.env +.env.* +!.env.example + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.egg-info/ +.eggs/ +build/ +dist/ +*.so + +# Virtualenvs / uv +.venv/ +venv/ +.python-version + +# Type / lint / test caches +.mypy_cache/ +.ruff_cache/ +.pytest_cache/ +.coverage +.coverage.* +htmlcov/ +coverage.xml +.tox/ + +# Docker +*.pid + +# Editor / OS +.idea/ +.vscode/ +*.swp +*~ +.DS_Store +Thumbs.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..fd5a71e --- /dev/null +++ b/Dockerfile @@ -0,0 +1,97 @@ +# syntax=docker/dockerfile:1.7 +# +# neuronetz-gateway — multi-stage image. +# +# builder stage : installs dependencies into a self-contained virtualenv using uv. +# runtime stage : copies the venv + source, drops to a NON-ROOT user, contains +# no build tools, and runs `python -m neuronetz_gateway`. +# +# uv is pulled from the official distroless image so we don't need network access +# to `pip install uv`. Dependencies come from pyproject.toml (+ uv.lock if present). + +# ---------------------------------------------------------------------------- +# Stage 1 — builder +# ---------------------------------------------------------------------------- +FROM python:3.12-slim AS builder + +# Bring in the `uv` binary from its official image. +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +ENV UV_LINK_MODE=copy \ + UV_COMPILE_BYTECODE=1 \ + UV_PYTHON_DOWNLOADS=never \ + # Create the project venv at a stable, copyable location. + VIRTUAL_ENV=/opt/venv \ + PATH=/opt/venv/bin:$PATH + +WORKDIR /app + +# Create the target virtualenv up front so uv installs into it. +RUN uv venv /opt/venv + +# Dependency layer: copy only the manifest(s) first for better caching. +# uv.lock is optional in Phase 1 — the wildcard makes COPY succeed either way. +COPY pyproject.toml ./ +COPY uv.loc[k] ./ + +# Install dependencies. If a lockfile is present `uv sync` honours it; otherwise +# we fall back to resolving straight from pyproject.toml. Either way the build +# does NOT fail when the lock is absent. +RUN --mount=type=cache,target=/root/.cache/uv \ + if [ -f uv.lock ]; then \ + uv sync --frozen --no-install-project --no-dev ; \ + else \ + uv pip install --python /opt/venv/bin/python -r pyproject.toml ; \ + fi + +# Now copy the application source and install the project itself into the venv. +# README.md + LICENSE are required by the build backend (pyproject `readme`/license). +COPY README.md LICENSE ./ +COPY src ./src +COPY alembi[c] ./alembic +COPY alembic.in[i] ./ +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --python /opt/venv/bin/python --no-deps . + +# ---------------------------------------------------------------------------- +# Stage 2 — runtime +# ---------------------------------------------------------------------------- +FROM python:3.12-slim AS runtime + +# Runtime-only OS packages: curl is used by the compose healthcheck. +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl \ + && rm -rf /var/lib/apt/lists/* + +# Non-root user. +RUN groupadd --system --gid 10001 gateway \ + && useradd --system --uid 10001 --gid gateway --home-dir /app --shell /usr/sbin/nologin gateway + +ENV VIRTUAL_ENV=/opt/venv \ + PATH=/opt/venv/bin:$PATH \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + GATEWAY_BIND_HOST=0.0.0.0 \ + GATEWAY_BIND_PORT=8080 + +WORKDIR /app + +# Copy the fully-populated virtualenv and the application from the builder. +COPY --from=builder /opt/venv /opt/venv +COPY --from=builder /app/src ./src +# alembic assets are optional during early scaffolding; copy if present. +COPY --from=builder /app/alembi[c] ./alembic +COPY --from=builder /app/alembic.in[i] ./ + +# Drop privileges. No build tools are present in this stage. +USER gateway + +EXPOSE 8080 + +# Liveness probe target lives at /healthz (see SPEC §6.4). +HEALTHCHECK --interval=15s --timeout=3s --start-period=20s --retries=5 \ + CMD curl -fsS "http://127.0.0.1:${GATEWAY_BIND_PORT}/healthz" || exit 1 + +# Default command: run the server. Compose overrides this in dev to run +# `alembic upgrade head` first (see docker-compose.dev.yml). +CMD ["python", "-m", "neuronetz_gateway"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..44f5372 --- /dev/null +++ b/README.md @@ -0,0 +1,92 @@ +# neuronetz-gateway + +A secure, multi-tenant API gateway in front of an [Ollama](https://github.com/ollama/ollama) +instance. It is the hot path of the Neuronetz API: every request to the models flows +through here, authenticated, rate-limited, budgeted, and audited. + +**The Ollama backend is never reachable from the public internet.** It is bound to an +internal Docker network with no published ports. All access is via this gateway, behind +TLS terminated by Caddy. + +> Status: **v0.1.0 — in development.** See [`scope-docs/SPEC.md`](scope-docs/SPEC.md) for +> the full specification and [`scope-docs/AGENT_PROMPT.md`](scope-docs/AGENT_PROMPT.md) for +> the phased build plan. `SPEC.md` is the source of truth. + +## What it does + +- **Auth** — API keys as Bearer tokens, stored as Argon2id hashes, verified in constant time. +- **Multi-tenant** — tenants own keys; limits and budgets inherit tenant → key. +- **Rate limiting** — per-key and per-tenant RPM / TPM / concurrent connections. +- **Budgets** — daily / monthly / total token budgets, enforced fail-closed. +- **Dual API surface** — native Ollama (`/api/*`) and OpenAI-compatible (`/v1/*`), both streaming. +- **Hard-blocked mutations** — `/api/pull`, `/api/push`, `/api/create`, `/api/copy`, + `/api/delete`, `/api/blobs/*` always return 403. Not configurable. +- **Audit log** — always-on request metadata; opt-in, TTL'd prompt logging per key. + +Administration (dashboards, tenant self-service) lives in a separate service, +`neuronetz-console`; it is **not** part of this repository. + +## Architecture + +``` +Internet ──TLS──> Caddy ──HTTP──> gateway ──┬──> Postgres (keys, budgets, audit) + ├──> Redis (key cache, rate limits) + └──> Ollama (internal network only) +``` + +## Quickstart (dev) + +Requires Docker + Docker Compose. The dev stack runs Postgres, Redis, and the gateway — +**no Caddy and no Ollama** (so `/readyz` reports 503 until a real Ollama backend is wired +in; that is expected). + +```bash +git clone neuronetz-gateway && cd neuronetz-gateway +cp .env.example .env # adjust if you like; defaults work for local dev +docker compose -f docker-compose.dev.yml up --build +``` + +The gateway runs `alembic upgrade head` on startup, then serves on `http://localhost:8080`. + +```bash +curl -i http://localhost:8080/healthz # -> 200 {"status":"ok"} +curl -i http://localhost:8080/readyz # -> 503 (no Ollama backend in the dev stack) +``` + +## Production + +`docker-compose.yml` brings up the full stack — Caddy (TLS via Let's Encrypt for +`api.neuronetz.ai`), the gateway, Postgres, Redis, and Ollama. The `ollama` service has +**no `ports:` mapping** and is reachable only on the internal Docker network. See +[`docs/DEPLOYMENT.md`](docs/DEPLOYMENT.md) (added in a later phase) and +[`ops/caddy/Caddyfile.example`](ops/caddy/Caddyfile.example). + +## Managing tenants and keys + +Use the bootstrap CLI (Typer). Keys have the form `nz_`; the full key is +printed exactly once at creation and only its Argon2id hash is stored. + +```bash +neuronetz-gateway create-tenant --name acme +neuronetz-gateway create-key --tenant acme --name prod-server-1 +neuronetz-gateway list-keys --tenant acme +neuronetz-gateway revoke-key --prefix nz_abc12345 +``` + +## Development + +```bash +just dev # run the dev stack +just test # pytest + coverage +just lint # ruff +just typecheck # mypy --strict +just migrate # alembic upgrade head +``` + +Tooling: Python 3.12, `uv`, FastAPI + uvicorn, SQLAlchemy 2.0 (async) + asyncpg, Redis, +httpx, structlog, Pydantic. Lint/type/security gates: ruff, mypy `--strict`, bandit, +pip-audit. + +## License + +Apache 2.0 — see [`LICENSE`](LICENSE). Owner: Stephan Berbig / Neuronetz. diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..44ee442 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,49 @@ +# Alembic configuration for neuronetz-gateway. +# The database URL is read from the DATABASE_URL environment variable in +# alembic/env.py (do not hardcode credentials here). + +[alembic] +script_location = alembic +prepend_sys_path = src +version_path_separator = os +# version_locations defaults to alembic/versions + +# DATABASE_URL is injected at runtime; this placeholder is never used directly. +sqlalchemy.url = driver://user:pass@localhost/dbname + +[post_write_hooks] +# (none) + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/alembic/env.py b/alembic/env.py new file mode 100644 index 0000000..8ab4c24 --- /dev/null +++ b/alembic/env.py @@ -0,0 +1,97 @@ +"""Alembic environment for neuronetz-gateway (async engine). + +Reads ``DATABASE_URL`` from the environment (the same value the app uses, +``postgresql+asyncpg://...``). Ensures schema ``gateway`` exists and pins the +Alembic version table into that schema so migration bookkeeping never collides +with the ``console`` schema in the shared database. +""" + +from __future__ import annotations + +import asyncio +import os +from logging.config import fileConfig + +from alembic import context +from sqlalchemy import pool, text +from sqlalchemy.engine import Connection +from sqlalchemy.ext.asyncio import async_engine_from_config + +from neuronetz_gateway.config import get_settings +from neuronetz_gateway.db.models import GATEWAY_SCHEMA, Base + +config = context.config + +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +target_metadata = Base.metadata + + +def _database_url() -> str: + """Resolve the async database URL from env, falling back to settings.""" + return os.environ.get("DATABASE_URL") or get_settings().database_url + + +def _configure_context(connection: Connection) -> None: + """Configure migration context with the gateway schema + version table.""" + context.configure( + connection=connection, + target_metadata=target_metadata, + version_table="alembic_version", + version_table_schema=GATEWAY_SCHEMA, + include_schemas=True, + compare_type=True, + ) + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode (emit SQL without a DBAPI connection).""" + context.configure( + url=_database_url(), + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + version_table="alembic_version", + version_table_schema=GATEWAY_SCHEMA, + include_schemas=True, + ) + with context.begin_transaction(): + context.run_migrations() + + +def _do_run_migrations(connection: Connection) -> None: + """Ensure the schema exists, then run migrations within a transaction. + + The ``CREATE SCHEMA`` is committed in its own transaction before configuring + Alembic. Under SQLAlchemy 2.0, ``execute()`` auto-begins a transaction; if it + were left open, Alembic's ``begin_transaction()`` would treat the connection as + caller-managed and become a no-op that never commits, so the whole migration + (and the schema) would be rolled back on connection close. Committing here + leaves the connection clean so Alembic owns — and commits — its own transaction. + """ + connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{GATEWAY_SCHEMA}"')) + connection.commit() + _configure_context(connection) + with context.begin_transaction(): + context.run_migrations() + + +async def run_migrations_online() -> None: + """Run migrations in 'online' mode using an async engine.""" + configuration = config.get_section(config.config_ini_section) or {} + configuration["sqlalchemy.url"] = _database_url() + connectable = async_engine_from_config( + configuration, + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + async with connectable.connect() as connection: + await connection.run_sync(_do_run_migrations) + await connectable.dispose() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + asyncio.run(run_migrations_online()) diff --git a/alembic/versions/0001_initial.py b/alembic/versions/0001_initial.py new file mode 100644 index 0000000..0faf866 --- /dev/null +++ b/alembic/versions/0001_initial.py @@ -0,0 +1,342 @@ +"""initial gateway schema + +Creates schema ``gateway``, the three enum types, all tables and indexes, and +the ``notify_key_revoked()`` function plus ``trg_notify_key_revoked`` trigger, +matching SPEC §5 verbatim in structure. + +Revision ID: 0001_initial +Revises: +Create Date: 2026-05-22 + +""" + +from __future__ import annotations + +from collections.abc import Sequence + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision: str = "0001_initial" +down_revision: str | None = None +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +SCHEMA = "gateway" + +# Enum types are created explicitly via raw SQL below; the table columns +# reference them with create_type=False so they are not created twice. +_key_status = postgresql.ENUM( + "active", "disabled", "revoked", name="key_status", schema=SCHEMA, create_type=False +) +_tenant_status = postgresql.ENUM( + "active", "suspended", "closed", name="tenant_status", schema=SCHEMA, create_type=False +) +_budget_period = postgresql.ENUM( + "day", "month", "total", name="budget_period", schema=SCHEMA, create_type=False +) + + +def upgrade() -> None: + """Create the full ``gateway`` schema.""" + op.execute(f'CREATE SCHEMA IF NOT EXISTS "{SCHEMA}"') + + # --- Enum types (SPEC §5) --- + op.execute("CREATE TYPE gateway.key_status AS ENUM ('active', 'disabled', 'revoked')") + op.execute("CREATE TYPE gateway.tenant_status AS ENUM ('active', 'suspended', 'closed')") + op.execute("CREATE TYPE gateway.budget_period AS ENUM ('day', 'month', 'total')") + + # --- tenants --- + op.create_table( + "tenants", + sa.Column( + "id", + postgresql.UUID(as_uuid=True), + primary_key=True, + server_default=sa.text("gen_random_uuid()"), + ), + sa.Column("name", sa.Text(), nullable=False, unique=True), + sa.Column( + "status", _tenant_status, nullable=False, server_default=sa.text("'active'") + ), + sa.Column( + "created_at", + postgresql.TIMESTAMP(timezone=True), + nullable=False, + server_default=sa.text("now()"), + ), + sa.Column( + "metadata", + postgresql.JSONB(), + nullable=False, + server_default=sa.text("'{}'::jsonb"), + ), + schema=SCHEMA, + ) + + # --- tenant_limits --- + op.create_table( + "tenant_limits", + sa.Column( + "tenant_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey(f"{SCHEMA}.tenants.id", ondelete="CASCADE"), + primary_key=True, + ), + sa.Column("rpm", sa.Integer(), nullable=False, server_default=sa.text("60")), + sa.Column("tpm", sa.Integer(), nullable=False, server_default=sa.text("100000")), + sa.Column("concurrent", sa.Integer(), nullable=False, server_default=sa.text("8")), + sa.Column("tokens_daily", sa.BigInteger(), nullable=True), + sa.Column("tokens_monthly", sa.BigInteger(), nullable=True), + sa.Column("tokens_total", sa.BigInteger(), nullable=True), + sa.Column( + "allowed_models", + postgresql.ARRAY(sa.Text()), + nullable=False, + server_default=sa.text("'{}'"), + ), + sa.Column( + "allow_all_models", + sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ), + sa.Column( + "log_prompts_default", + sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ), + sa.Column( + "prompt_retention_days", sa.Integer(), nullable=False, server_default=sa.text("30") + ), + sa.Column( + "audit_retention_days", sa.Integer(), nullable=False, server_default=sa.text("365") + ), + schema=SCHEMA, + ) + + # --- api_keys --- + op.create_table( + "api_keys", + sa.Column( + "id", + postgresql.UUID(as_uuid=True), + primary_key=True, + server_default=sa.text("gen_random_uuid()"), + ), + sa.Column( + "tenant_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey(f"{SCHEMA}.tenants.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column("prefix", sa.Text(), nullable=False, unique=True), + sa.Column("key_hash", sa.Text(), nullable=False), + sa.Column("name", sa.Text(), nullable=False), + sa.Column("status", _key_status, nullable=False, server_default=sa.text("'active'")), + sa.Column( + "scopes", + postgresql.ARRAY(sa.Text()), + nullable=False, + server_default=sa.text("'{chat,embeddings}'"), + ), + sa.Column( + "created_at", + postgresql.TIMESTAMP(timezone=True), + nullable=False, + server_default=sa.text("now()"), + ), + sa.Column("last_used_at", postgresql.TIMESTAMP(timezone=True), nullable=True), + sa.Column("expires_at", postgresql.TIMESTAMP(timezone=True), nullable=True), + sa.Column("log_prompts", sa.Boolean(), nullable=True), + sa.Column( + "metadata", + postgresql.JSONB(), + nullable=False, + server_default=sa.text("'{}'::jsonb"), + ), + schema=SCHEMA, + ) + op.create_index( + "idx_api_keys_prefix", + "api_keys", + ["prefix"], + schema=SCHEMA, + postgresql_where=sa.text("status = 'active'"), + ) + op.create_index("idx_api_keys_tenant", "api_keys", ["tenant_id"], schema=SCHEMA) + + # --- key_limits --- + op.create_table( + "key_limits", + sa.Column( + "key_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey(f"{SCHEMA}.api_keys.id", ondelete="CASCADE"), + primary_key=True, + ), + sa.Column("rpm", sa.Integer(), nullable=True), + sa.Column("tpm", sa.Integer(), nullable=True), + sa.Column("concurrent", sa.Integer(), nullable=True), + sa.Column("tokens_daily", sa.BigInteger(), nullable=True), + sa.Column("tokens_monthly", sa.BigInteger(), nullable=True), + sa.Column("tokens_total", sa.BigInteger(), nullable=True), + sa.Column("allowed_models", postgresql.ARRAY(sa.Text()), nullable=True), + sa.Column("allow_all_models", sa.Boolean(), nullable=True), + schema=SCHEMA, + ) + + # --- budget_usage --- + op.create_table( + "budget_usage", + sa.Column( + "key_id", + postgresql.UUID(as_uuid=True), + sa.ForeignKey(f"{SCHEMA}.api_keys.id", ondelete="CASCADE"), + primary_key=True, + nullable=False, + ), + sa.Column("period", _budget_period, primary_key=True, nullable=False), + sa.Column( + "period_start", + postgresql.TIMESTAMP(timezone=True), + primary_key=True, + nullable=False, + ), + sa.Column("tokens_in", sa.BigInteger(), nullable=False, server_default=sa.text("0")), + sa.Column("tokens_out", sa.BigInteger(), nullable=False, server_default=sa.text("0")), + sa.Column("requests", sa.BigInteger(), nullable=False, server_default=sa.text("0")), + schema=SCHEMA, + ) + op.create_index( + "idx_budget_usage_period", + "budget_usage", + ["period", "period_start"], + schema=SCHEMA, + ) + + # --- audit_log --- + op.create_table( + "audit_log", + sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True), + sa.Column( + "ts", + postgresql.TIMESTAMP(timezone=True), + nullable=False, + server_default=sa.text("now()"), + ), + sa.Column("request_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("tenant_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=True), + sa.Column("key_prefix", sa.Text(), nullable=True), + sa.Column("method", sa.Text(), nullable=False), + sa.Column("path", sa.Text(), nullable=False), + sa.Column("model", sa.Text(), nullable=True), + sa.Column("tokens_in", sa.Integer(), nullable=True), + sa.Column("tokens_out", sa.Integer(), nullable=True), + sa.Column("latency_ms", sa.Integer(), nullable=True), + sa.Column("status", sa.Integer(), nullable=False), + sa.Column("client_ip", postgresql.INET(), nullable=True), + sa.Column("user_agent", sa.Text(), nullable=True), + sa.Column("error_code", sa.Text(), nullable=True), + schema=SCHEMA, + ) + op.create_index("idx_audit_ts", "audit_log", ["ts"], schema=SCHEMA) + op.create_index("idx_audit_tenant_ts", "audit_log", ["tenant_id", "ts"], schema=SCHEMA) + op.create_index("idx_audit_key_ts", "audit_log", ["key_id", "ts"], schema=SCHEMA) + + # --- prompt_log --- + op.create_table( + "prompt_log", + sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True), + sa.Column( + "audit_id", + sa.BigInteger(), + sa.ForeignKey(f"{SCHEMA}.audit_log.id", ondelete="CASCADE"), + nullable=False, + ), + sa.Column( + "ts", + postgresql.TIMESTAMP(timezone=True), + nullable=False, + server_default=sa.text("now()"), + ), + sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("request_body", postgresql.JSONB(), nullable=False), + sa.Column("response_text", sa.Text(), nullable=True), + sa.Column("retention_until", postgresql.TIMESTAMP(timezone=True), nullable=False), + schema=SCHEMA, + ) + op.create_index( + "idx_prompt_log_retention", "prompt_log", ["retention_until"], schema=SCHEMA + ) + + # --- revocations --- + op.create_table( + "revocations", + sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True), + sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column( + "ts", + postgresql.TIMESTAMP(timezone=True), + nullable=False, + server_default=sa.text("now()"), + ), + sa.Column("reason", sa.Text(), nullable=True), + sa.Column("processed_at", postgresql.TIMESTAMP(timezone=True), nullable=True), + schema=SCHEMA, + ) + + # --- NOTIFY trigger on revocation insert (SPEC §5) --- + op.execute( + """ + CREATE OR REPLACE FUNCTION gateway.notify_key_revoked() RETURNS trigger AS $$ + BEGIN + PERFORM pg_notify('key_revoked', NEW.key_id::text); + RETURN NEW; + END; + $$ LANGUAGE plpgsql; + """ + ) + op.execute( + """ + CREATE TRIGGER trg_notify_key_revoked + AFTER INSERT ON gateway.revocations + FOR EACH ROW EXECUTE FUNCTION gateway.notify_key_revoked(); + """ + ) + + +def downgrade() -> None: + """Drop the entire ``gateway`` schema and its objects.""" + op.execute("DROP TRIGGER IF EXISTS trg_notify_key_revoked ON gateway.revocations") + op.execute("DROP FUNCTION IF EXISTS gateway.notify_key_revoked()") + + op.drop_index("idx_prompt_log_retention", table_name="prompt_log", schema=SCHEMA) + op.drop_table("prompt_log", schema=SCHEMA) + + op.drop_index("idx_audit_key_ts", table_name="audit_log", schema=SCHEMA) + op.drop_index("idx_audit_tenant_ts", table_name="audit_log", schema=SCHEMA) + op.drop_index("idx_audit_ts", table_name="audit_log", schema=SCHEMA) + op.drop_table("audit_log", schema=SCHEMA) + + op.drop_index("idx_budget_usage_period", table_name="budget_usage", schema=SCHEMA) + op.drop_table("budget_usage", schema=SCHEMA) + + op.drop_table("key_limits", schema=SCHEMA) + + op.drop_index("idx_api_keys_tenant", table_name="api_keys", schema=SCHEMA) + op.drop_index("idx_api_keys_prefix", table_name="api_keys", schema=SCHEMA) + op.drop_table("api_keys", schema=SCHEMA) + + op.drop_table("tenant_limits", schema=SCHEMA) + op.drop_table("tenants", schema=SCHEMA) + + op.execute("DROP TYPE IF EXISTS gateway.budget_period") + op.execute("DROP TYPE IF EXISTS gateway.tenant_status") + op.execute("DROP TYPE IF EXISTS gateway.key_status") + + op.execute(f'DROP SCHEMA IF EXISTS "{SCHEMA}"') diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml new file mode 100644 index 0000000..a73ce4d --- /dev/null +++ b/docker-compose.dev.yml @@ -0,0 +1,101 @@ +# neuronetz-gateway — DEV stack (postgres + redis + gateway only). +# +# Deliberately differs from the production stack: +# * NO caddy — the gateway is published directly on localhost:8080. +# * NO ollama — Phase 1 expects /readyz to return 503 *because* there is no +# Ollama backend yet. This is the intended exit-criterion state. +# +# Bring it up with: +# docker compose -f docker-compose.dev.yml up --build +# +# Then: +# curl -i http://localhost:8080/healthz # -> 200 +# curl -i http://localhost:8080/readyz # -> 503 (no Ollama) +# +# The gateway container runs `alembic upgrade head` and then starts the server. + +services: + gateway: + build: + context: . + dockerfile: Dockerfile + restart: unless-stopped + ports: + - "127.0.0.1:8080:8080" + environment: + GATEWAY_BIND_HOST: 0.0.0.0 + GATEWAY_BIND_PORT: "8080" + GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO} + GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console} + GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID} + GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1} + DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz} + DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10} + DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20} + REDIS_URL: redis://redis:6379/0 + REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60} + # No Ollama in the dev stack — point at the (absent) service name so the + # readiness check fails closed with 503, exactly as Phase 1 expects. + OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://ollama:11434} + OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5} + OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600} + OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64} + DEFAULT_RPM: ${DEFAULT_RPM:-60} + DEFAULT_TPM: ${DEFAULT_TPM:-100000} + DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8} + MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144} + MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096} + ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3} + ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536} + ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4} + AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20} + AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000} + PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30} + AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365} + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + # Run migrations, then start the server. + command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"] + healthcheck: + test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"] + interval: 10s + timeout: 3s + retries: 5 + start_period: 30s + + postgres: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_USER: ${POSTGRES_USER:-gateway} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway} + POSTGRES_DB: ${POSTGRES_DB:-neuronetz} + ports: + # Exposed on localhost for dev convenience (psql, migrations from host). + - "127.0.0.1:5432:5432" + volumes: + - postgres_dev_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"] + interval: 5s + timeout: 3s + retries: 10 + + redis: + image: redis:7-alpine + restart: unless-stopped + command: ["redis-server", "--save", "", "--appendonly", "no"] + ports: + # Exposed on localhost for dev convenience (redis-cli from host). + - "127.0.0.1:6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 10 + +volumes: + postgres_dev_data: diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..6a2d0ae --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,152 @@ +# neuronetz-gateway — FULL production stack (SPEC §4.1). +# +# Internet ──TLS──▶ caddy ──HTTP/1.1 internal──▶ gateway ──▶ postgres / redis / ollama +# +# Only Caddy publishes ports to the host. The gateway is reachable solely through +# Caddy on the internal network. Postgres, Redis and (critically) Ollama are NOT +# published to the host at all. +# +# ┌─────────────────────────────────────────────────────────────────────────┐ +# │ SECURITY NON-NEGOTIABLE: │ +# │ The `ollama` service has NO `ports:` mapping and MUST NEVER get one. │ +# │ Ollama is reachable only on the internal Docker network via the │ +# │ service name `ollama:11434`. Publishing it would re-open the exact │ +# │ unauthenticated exposure this whole project exists to close. │ +# └─────────────────────────────────────────────────────────────────────────┘ +# +# Copy `.env.example` to `.env` and adjust before running: +# docker compose up -d --build + +services: + caddy: + image: caddy:2-alpine + restart: unless-stopped + depends_on: + gateway: + condition: service_healthy + ports: + - "80:80" + - "443:443" + - "443:443/udp" # HTTP/3 + volumes: + - ./ops/caddy/Caddyfile.example:/etc/caddy/Caddyfile:ro + - caddy_data:/data + - caddy_config:/config + networks: + - edge + - internal + + gateway: + build: + context: . + dockerfile: Dockerfile + restart: unless-stopped + # NOTE: deliberately NO `ports:` — the gateway is internal-only and is + # reached exclusively through Caddy. + expose: + - "8080" + environment: + GATEWAY_BIND_HOST: 0.0.0.0 + GATEWAY_BIND_PORT: "8080" + GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO} + GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-json} + GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID} + GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1,caddy} + # Service-name addressing on the internal network. + DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-changeme}@postgres:5432/${POSTGRES_DB:-neuronetz} + DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10} + DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20} + REDIS_URL: redis://redis:6379/0 + REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60} + OLLAMA_BASE_URL: http://ollama:11434 + OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5} + OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600} + OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64} + DEFAULT_RPM: ${DEFAULT_RPM:-60} + DEFAULT_TPM: ${DEFAULT_TPM:-100000} + DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8} + MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144} + MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096} + ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3} + ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536} + ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4} + AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20} + AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000} + PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30} + AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365} + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + ollama: + condition: service_started + # Apply migrations, then start the server. + command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"] + healthcheck: + test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"] + interval: 15s + timeout: 3s + retries: 5 + start_period: 30s + networks: + - internal + + postgres: + image: postgres:16-alpine + restart: unless-stopped + environment: + POSTGRES_USER: ${POSTGRES_USER:-gateway} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-changeme} + POSTGRES_DB: ${POSTGRES_DB:-neuronetz} + volumes: + - postgres_data:/var/lib/postgresql/data + # No `ports:` — Postgres is internal-only. + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"] + interval: 5s + timeout: 3s + retries: 10 + networks: + - internal + + redis: + image: redis:7-alpine + restart: unless-stopped + command: ["redis-server", "--save", "", "--appendonly", "no"] + # No `ports:` — Redis is internal-only. + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 10 + networks: + - internal + + # ─────────────────────────────────────────────────────────────────────────── + # Ollama — INTERNAL NETWORK ONLY. DO NOT ADD A `ports:` MAPPING. + # Reachable only as `http://ollama:11434` from the gateway container. + # ─────────────────────────────────────────────────────────────────────────── + ollama: + image: ollama/ollama:latest + restart: unless-stopped + # !!! NO `ports:` — never publish Ollama to the host or the internet. !!! + volumes: + - ollama_data:/root/.ollama + networks: + - internal + +networks: + # Public-facing network: only Caddy is attached alongside `internal`. + edge: + driver: bridge + # Private network for inter-service traffic; not reachable from the host. + internal: + driver: bridge + internal: false + +volumes: + postgres_data: + ollama_data: + caddy_data: + caddy_config: diff --git a/justfile b/justfile new file mode 100644 index 0000000..8194f81 --- /dev/null +++ b/justfile @@ -0,0 +1,60 @@ +# neuronetz-gateway — task runner. +# +# Requires `just` (https://github.com/casey/just) and `uv` +# (https://github.com/astral-sh/uv) on the host. +# +# just # list available targets +# just dev # run postgres + redis + gateway locally (dev stack) +# just test # run the test suite with coverage +# just lint # ruff check +# just typecheck # mypy --strict +# just migrate # apply alembic migrations against DATABASE_URL + +set dotenv-load := true + +# uv runs commands inside the project's managed environment. +uv := "uv" + +# Show the list of targets (default). +default: + @just --list + +# Sync dependencies into the local uv-managed virtualenv (incl. dev extras). +install: + {{uv}} sync --extra dev + +# Run the dev stack: postgres + redis + gateway (no caddy, no ollama). +dev: + docker compose -f docker-compose.dev.yml up --build + +# Run the test suite with coverage. +test: + {{uv}} run pytest + +# Lint with ruff. +lint: + {{uv}} run ruff check . + +# Static type checking (strict). +typecheck: + {{uv}} run mypy --strict src + +# Apply database migrations to head. +migrate: + {{uv}} run alembic upgrade head + +# Security lint. +bandit: + {{uv}} run bandit -q -r src + +# Dependency vulnerability audit. +audit: + {{uv}} run pip-audit + +# Bring the FULL production stack up (caddy + gateway + postgres + redis + ollama). +compose-up: + docker compose up -d --build + +# Tear the production stack down. +compose-down: + docker compose down diff --git a/ops/caddy/Caddyfile.example b/ops/caddy/Caddyfile.example new file mode 100644 index 0000000..75fead8 --- /dev/null +++ b/ops/caddy/Caddyfile.example @@ -0,0 +1,59 @@ +# neuronetz-gateway — Caddy reverse proxy (SPEC §4.1, §6.5). +# +# Caddy is the only public-facing component. It terminates TLS (HTTP/2 + HTTP/3), +# obtains a Let's Encrypt certificate for api.neuronetz.ai automatically, applies +# security headers, and reverse-proxies to the internal-only gateway:8080. +# +# Copy this file to `Caddyfile` and edit the site address / admin email. +# The production docker-compose.yml mounts it at /etc/caddy/Caddyfile. + +{ + # Email for Let's Encrypt account + expiry notices. Replace before deploy. + email ops@neuronetz.ai +} + +api.neuronetz.ai { + # --- Reverse proxy to the internal gateway --- + # `gateway` is the Docker service name on the internal network; it is never + # published to the host. Caddy forwards plain HTTP/1.1 to it. + reverse_proxy gateway:8080 + + # --- Security headers --- + header { + # HSTS: force HTTPS for two years, include subdomains, allow preload. + Strict-Transport-Security "max-age=63072000; includeSubDomains; preload" + # Disable MIME sniffing. + X-Content-Type-Options "nosniff" + # Clickjacking defense (API has no UI, deny framing outright). + X-Frame-Options "DENY" + # Conservative referrer policy. + Referrer-Policy "no-referrer" + # Strip server-identifying headers so we don't advertise the stack. + -Server + -X-Powered-By + } + + # Structured access logs to stdout (collected by the container runtime). + log { + output stdout + format json + } +} + +# ───────────────────────────────────────────────────────────────────────────── +# DEV / LOCAL note: +# +# For local testing without a public domain or real certificate, replace the +# site block above with a localhost block that uses Caddy's internal self-signed +# CA (no Let's Encrypt round-trip): +# +# localhost { +# tls internal +# reverse_proxy gateway:8080 +# } +# +# Caddy will install its local root CA; trust it or pass `-k` to curl. Note the +# Phase 1 *dev* compose stack (docker-compose.dev.yml) ships WITHOUT Caddy and +# exposes the gateway directly on localhost:8080 — this file is for the full +# production stack only. +# ───────────────────────────────────────────────────────────────────────────── diff --git a/ops/systemd/neuronetz-gateway.service b/ops/systemd/neuronetz-gateway.service new file mode 100644 index 0000000..7041375 --- /dev/null +++ b/ops/systemd/neuronetz-gateway.service @@ -0,0 +1,58 @@ +# neuronetz-gateway — systemd unit for non-Compose deployments. +# +# Assumes the project is installed into a virtualenv at /opt/neuronetz-gateway/venv +# (e.g. `uv venv /opt/neuronetz-gateway/venv && uv pip install ...`) and that +# configuration lives in /etc/neuronetz-gateway/gateway.env (same keys as +# .env.example). Postgres, Redis and Ollama are reached over the network/loopback +# per that env file — Ollama must remain bound to localhost / a private network +# and never be published publicly. +# +# Install: +# sudo cp neuronetz-gateway.service /etc/systemd/system/ +# sudo systemctl daemon-reload +# sudo systemctl enable --now neuronetz-gateway + +[Unit] +Description=neuronetz-gateway — secure API gateway in front of Ollama +Documentation=https://github.com/neuronetz/neuronetz-gateway +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple + +# Dedicated unprivileged service account (create with: useradd --system gateway). +User=gateway +Group=gateway + +WorkingDirectory=/opt/neuronetz-gateway +EnvironmentFile=/etc/neuronetz-gateway/gateway.env + +# Apply migrations before starting (idempotent; no-op when already at head). +ExecStartPre=/opt/neuronetz-gateway/venv/bin/alembic upgrade head +ExecStart=/opt/neuronetz-gateway/venv/bin/python -m neuronetz_gateway + +Restart=on-failure +RestartSec=5 +TimeoutStopSec=30 + +# --- Hardening --- +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=true +PrivateTmp=true +PrivateDevices=true +ProtectKernelTunables=true +ProtectKernelModules=true +ProtectControlGroups=true +RestrictNamespaces=true +RestrictRealtime=true +RestrictSUIDSGID=true +LockPersonality=true +MemoryDenyWriteExecute=true +RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX +# Allow writing only where the app legitimately needs to (none by default). +ReadWritePaths= + +[Install] +WantedBy=multi-user.target diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..166bf8f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,94 @@ +[project] +name = "neuronetz-gateway" +version = "0.1.0" +description = "Secure multi-tenant API gateway in front of Ollama for the Neuronetz platform." +readme = "README.md" +license = { text = "Apache-2.0" } +requires-python = ">=3.12" +authors = [{ name = "Neuronetz", email = "ops@neuronetz.ai" }] +dependencies = [ + "fastapi>=0.115", + "uvicorn[standard]>=0.30", + "httpx>=0.27", + "sqlalchemy[asyncio]>=2.0", + "asyncpg>=0.29", + "redis[hiredis]>=5.0", + "structlog>=24.1", + "pydantic>=2.9", + "pydantic-settings>=2.4", + "argon2-cffi>=23.1", + "typer>=0.12", + "prometheus-client>=0.20", + "alembic>=1.13", +] + +[project.scripts] +neuronetz-gateway = "neuronetz_gateway.cli.manage:app" + +[project.optional-dependencies] +dev = [ + "ruff>=0.6", + "mypy>=1.11", + "bandit>=1.7", + "pip-audit>=2.7", + "pytest>=8.3", + "pytest-asyncio>=0.24", + "pytest-cov>=5.0", + "testcontainers>=4.8", + "respx>=0.21", + "locust>=2.31", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/neuronetz_gateway"] + +[tool.ruff] +target-version = "py312" +line-length = 100 +src = ["src", "tests"] + +[tool.ruff.lint] +select = ["E", "F", "I", "B", "UP", "S", "ASYNC"] + +[tool.ruff.lint.per-file-ignores] +# Tests may use assert and bind to all interfaces in fixtures. +"tests/**" = ["S101", "S104"] + +[tool.mypy] +python_version = "3.12" +strict = true +mypy_path = "src" +plugins = ["pydantic.mypy"] +namespace_packages = true +explicit_package_bases = true + +[[tool.mypy.overrides]] +# argon2 ships types but some transitive deps may not; keep strictness elsewhere. +# asyncpg ships no stubs/py.typed marker; it is used in revocation.py only. +module = ["testcontainers.*", "locust.*", "asyncpg", "asyncpg.*"] +ignore_missing_imports = true + +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] +pythonpath = ["src"] +addopts = "--cov=neuronetz_gateway --cov-report=term-missing" + +[tool.coverage.run] +source = ["src/neuronetz_gateway"] +branch = true +omit = [ + "src/neuronetz_gateway/__main__.py", + "src/neuronetz_gateway/cli/*", +] + +[tool.coverage.report] +# Phase 1: coverage is reported but non-blocking. Later phases set fail_under. +show_missing = true + +[tool.bandit] +exclude_dirs = ["tests"] diff --git a/scope-docs/AGENT_PROMPT.md b/scope-docs/AGENT_PROMPT.md new file mode 100644 index 0000000..5361a96 --- /dev/null +++ b/scope-docs/AGENT_PROMPT.md @@ -0,0 +1,121 @@ +# Build Order: neuronetz-gateway v0.1.0 + +## Context + +The Ollama instance at `https://api.neuronetz.ai` is currently exposed without authentication. This is a security incident in waiting. Your job is to build the gateway that closes that gap and forms the commercial API surface of the Neuronetz AI platform. + +The full specification is in **`SPEC.md`** in this repository. Read it before writing any code. It is the source of truth; if anything below conflicts with it, SPEC.md wins. + +## Mission + +Implement `neuronetz-gateway` per SPEC.md to a state that satisfies **§12 Acceptance Criteria**. Nothing less ships. + +## Non-Negotiables + +These are hard constraints. Violating any of them is a build failure regardless of feature completeness. + +1. **Fail closed, always.** If a security or budgeting check cannot be performed (Redis down, DB unreachable, ambiguous state), deny the request. Never default to allow. +2. **Ollama never reachable from outside the Docker internal network.** No `ports:` mapping for the ollama service in any compose file shipped with the project. Document this prominently. +3. **No secrets in code, no secrets in logs, no secrets in errors.** Argon2id for key storage. Constant-time comparison only. Keys printed exactly once at creation. +4. **No reflected upstream errors.** Ollama errors are sanitized at the gateway boundary. Map to generic 4xx/5xx with a request ID. +5. **Mutating Ollama endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) are hard-blocked.** Not configurable. Not behind a feature flag. Blocked. +6. **Streaming integrity.** Token counting and audit writes happen **after** stream close, never on the hot path. Time-to-first-byte must not be degraded by gateway bookkeeping. +7. **`mypy --strict` and `ruff check` clean before any PR is opened.** No `# type: ignore` without an inline justification comment. +8. **Test coverage targets (§9) are a gate, not a goal.** 100% on `auth/`, `ratelimit/`, `budget/`. CI fails below threshold. +9. **Apache 2.0 license file present from commit one.** No GPL dependencies. +10. **The bootstrap CLI must work before the first manual `curl`.** No "I'll create a key by hand in the DB just to test it" — if the CLI can't create a key, fix the CLI first. + +## Phasing + +Five phases. Each phase has an explicit exit criterion. **Do not start phase N+1 until phase N's exit criterion is verifiably met.** PM/Control: enforce this. + +### Phase 1 — Scaffold + +- Repo layout per SPEC §8 +- `pyproject.toml`, `uv.lock`, Dockerfile, docker-compose.yml, docker-compose.dev.yml, .env.example, README, LICENSE +- Alembic configured; migration `0001_initial.py` creates schema `gateway` and all tables per SPEC §5 +- `make` or `just` targets: `dev`, `test`, `lint`, `typecheck`, `migrate`, `compose-up`, `compose-down` +- CI workflow runs: ruff, mypy, pytest, bandit, pip-audit +- **Exit criterion:** `docker compose -f docker-compose.dev.yml up` brings up postgres + redis + a stub gateway that responds 200 on `/healthz` and 503 on `/readyz` (because no Ollama yet). Migrations apply cleanly. CI is green on an empty test suite. + +### Phase 2 — Core proxy + auth + +- Bootstrap CLI (`create-tenant`, `create-key`, `list-keys`, `revoke-key`) working end-to-end +- Argon2id hashing module with unit tests covering: hash, verify, constant-time behavior, rehash-on-parameter-change +- Auth middleware: Bearer extraction, prefix lookup, hash verify, Redis cache with TTL +- Ollama proxy for `/api/chat` and `/api/generate` — both streamed (NDJSON) and non-streamed +- Endpoint allowlist enforced +- **Model discovery (SPEC §4.6):** background poll of Ollama `/api/tags`, cached in Redis + in-process, fail-closed when unavailable +- Model allowlist enforced per-tenant via the **effective set** (allow_all → all discovered; else `allowed_models ∩ discovered`); key-level `allow_all_models` overrides tenant +- Error handler: sanitized responses, request ID in every error +- Audit log writer (buffered, async) +- Mock Ollama in `tests/integration/mock_ollama.py` (no real model required for CI) +- **Exit criterion:** A key created via CLI can call `/api/chat` and `/api/generate` through Caddy → gateway → mock Ollama, streaming works, audit rows land in Postgres with correct token counts, `/api/pull` returns 403, no-auth returns 401, wrong-key returns 401. Model discovery populates from the (mock) Ollama `/api/tags`; `/api/tags` returns the tenant's effective set; an `allow_all_models` tenant sees all discovered models, a default-deny tenant sees only `allowed ∩ discovered`, and a non-effective model returns 403; discovery-unavailable fails closed. Integration tests cover all of the above. + +### Phase 3 — Rate limit + budget + OpenAI-compat + +- Sliding window rate limit (Redis Lua script) — per-key RPM, per-tenant RPM, per-key TPM +- Concurrency semaphore (Redis-backed) with TTL guard +- Token budget counters in Redis with Postgres ledger reconciliation on period rollover +- OpenAI-compatibility layer: `/v1/chat/completions`, `/v1/completions`, `/v1/embeddings`, `/v1/models` with full SSE streaming and `data: [DONE]` terminator +- Schema translation tests with golden fixtures (request in OpenAI → expected Ollama request; response from Ollama → expected OpenAI response) +- Rate-limit and budget response headers per SPEC §6.5 +- **Exit criterion:** Locust test (100 concurrent users, 5 min) shows correct 429 behavior at the limit, correct token accounting, p99 gateway overhead < 25 ms. OpenAI Python SDK pointed at `/v1` successfully completes streaming chat. Killing Redis mid-test produces 503 (fail closed), not 200. + +### Phase 4 — Audit, prompt log, revocation + +- Prompt log (opt-in per key, TTL) with daily sweeper task +- Audit log retention sweeper (TTL per tenant config) +- Buffered audit writer with ring-buffer overflow → deny-mode behavior +- Revocation flow: console (simulated via direct INSERT in tests) writes `gateway.revocations` → NOTIFY → gateway evicts Redis cache → next request with revoked key returns 401 within 1 second +- Prometheus `/metrics` (loopback only) with: `gateway_requests_total{tenant,model,status}`, `gateway_tokens_total{tenant,model,direction}`, `gateway_request_duration_seconds{tenant,model}` (histogram) +- `/readyz` checks DB + Redis + Ollama all reachable +- Circuit breaker on Ollama failures +- **Exit criterion:** Revocation E2E test green. Prompt log retention TTL works (use freeze-time to simulate). Metrics scrape returns valid Prometheus exposition. `/readyz` flips to 503 when any dependency is down. + +### Phase 5 — Harden, document, release + +- `docs/ARCHITECTURE.md`, `docs/DEPLOYMENT.md`, `docs/API.md`, `docs/THREAT_MODEL.md`, `docs/OPERATIONS.md` complete +- Caddyfile example with Let's Encrypt for `api.neuronetz.ai` and security headers (HSTS, X-Content-Type-Options, no Server header, no X-Powered-By) +- Systemd unit file for non-Compose deployments +- Multi-stage Dockerfile with non-root user, distroless or `python:3.12-slim` final stage, no build tools in final image +- `pip-audit` and `bandit` clean in CI +- Image scan (Trivy or Grype) clean of HIGH/CRITICAL +- Tag `v0.1.0`, build and push image, GitHub release with changelog +- **Exit criterion:** Every box in SPEC §12 checked, signed off by Control. Image runnable from a fresh host with only docker + a `.env`. README quickstart works for someone who has never seen the repo. + +## Agent Role Assignments + +For the multi-agent orchestrator (Fritz/UI-UX/DevOps/QA/Control/Timo/PM): + +| Agent | Owns | +|---|---| +| **Backend / Fritz** | All Python code under `src/neuronetz_gateway/`, Alembic migrations, CLI. Primary author. | +| **DevOps** | Dockerfile, docker-compose.yml(s), Caddyfile, systemd unit, CI workflows, image scanning, release tagging. | +| **QA** | All tests under `tests/`. Owns coverage gate. Writes the locust scenarios. Verifies acceptance criteria at each phase exit. | +| **UI-UX** | Not active this project (no UI surface here). Console project will pick this up. | +| **Control / Timo** | Enforces phase gates. Refuses to advance a phase whose exit criterion isn't met. Runs the acceptance checklist at end of Phase 5. | +| **PM** | Tracks the phase progression, opens YouTrack tickets per phase, runs daily standups against this prompt, surfaces blockers. | + +## Working Agreements + +- **Branch per phase.** `phase-1-scaffold`, `phase-2-proxy-auth`, etc. Merge to `main` only after phase exit criterion is verified. +- **PRs are reviewed against SPEC.md.** "Does this match the spec? If not, is SPEC.md wrong or is the PR wrong?" — that's the review question. +- **SPEC changes are explicit.** If a phase reveals a spec mistake, amend SPEC.md in a separate PR before changing the implementation. Never drift silently. +- **Commit messages reference the section.** e.g. `auth: implement argon2id verify per SPEC §5, §9`. +- **No TODOs in main.** If something is deferred, it becomes a tracked issue, not a code comment. +- **Open questions (SPEC §13) are resolved in writing.** Decision goes in SPEC.md, not in a Slack message that gets lost. + +## What "Done" Looks Like + +A fresh clone, a fresh host, a domain pointing at it, and a `.env` file. `docker compose up`. Five minutes later, `curl -H "Authorization: Bearer nz_..." https://api.neuronetz.ai/v1/chat/completions -d '...'` streams a response. The Ollama port is not open. The audit log has a row. The budget counter decremented. The metrics endpoint shows the request. The locust suite passes. The threat model document explains every defense. + +When all of that is true and SPEC §12 is fully ticked, ship v0.1.0. + +## When You Get Stuck + +- **Ambiguity in the spec → ask, don't guess.** Open a question in the PM channel; if resolved, amend SPEC.md. +- **Conflict between speed and correctness → correctness wins.** This is security infrastructure. We do not ship "good enough." +- **Conflict between scope creep and v0.1.0 → defer.** New ideas go in a follow-up issue. v0.1.0 ships per spec. + +Start with Phase 1. Read SPEC.md first. diff --git a/scope-docs/SPEC.md b/scope-docs/SPEC.md new file mode 100644 index 0000000..ddc226a --- /dev/null +++ b/scope-docs/SPEC.md @@ -0,0 +1,593 @@ +# neuronetz-gateway — SPEC.md + +**Project:** `neuronetz-gateway` +**Version:** 0.1.0 (target) +**Status:** Specification — not yet implemented +**License:** Apache 2.0 +**Owner:** Stephan Berbig / Neuronetz + +--- + +## 1. Purpose + +A secure, multi-tenant API gateway in front of an Ollama instance currently exposed at `https://api.neuronetz.ai`. The Ollama endpoint must never be reachable directly from the public internet again. All access flows through this gateway. + +The gateway is the **hot path** of the Neuronetz API. A separate service (`neuronetz-console`, built on the Nibiru PHP framework) handles administration, dashboards, and tenant self-service. This SPEC covers only the gateway. + +## 2. Scope + +### In scope (v0.1.0) + +- Authentication via API keys (Bearer tokens) +- Multi-tenant data model (tenants → keys, with inheritance) +- Per-key and per-tenant rate limiting (RPM, TPM, concurrent) +- Per-key and per-tenant token budgets (daily, monthly, total) +- Streaming and non-streaming proxy to Ollama +- Dual API surface: native Ollama (`/api/*`) and OpenAI-compatible (`/v1/*`) +- Endpoint allowlist (block all model-mutating Ollama endpoints) +- **Dynamic model discovery** from the Ollama backend — the live set of installed models is queried, cached, and auto-refreshed; nothing about the model list is hand-maintained +- Model allowlist (per-tenant override), **default-deny, resolved against the live discovered set** (stale/typo'd entries never resolve) +- **Per-tenant `allow_all_models` toggle** — opt-in: a flagged tenant may use any currently-installed model, so models newly pulled into Ollama are auto-granted on the next discovery refresh +- Request size limits, response size limits, timeouts +- Token counting from Ollama responses (precise, not heuristic) +- Audit log (always-on metadata) +- Prompt log (opt-in per key, TTL'd retention) +- Bootstrap CLI: create tenants, keys, set budgets +- Health and readiness endpoints +- Docker Compose deployment (gateway + caddy + postgres + redis + ollama) +- Caddy as TLS terminator (Let's Encrypt for `api.neuronetz.ai`) + +### Out of scope (v0.1.0, document as future) + +- Web admin UI (lives in `neuronetz-console`, separate repo) +- Billing / Stripe integration (budgets only, no money yet) +- Multi-region / HA / k8s +- Content moderation / prompt-injection filtering +- Response caching +- Multi-backend routing (one Ollama; pluggable backend interface stays for later) +- Webhook notifications +- SSO / OAuth2 for admin + +## 3. Threat Model (abbreviated) + +| Threat | Mitigation | +|---|---| +| Internet scanners hitting Ollama directly | Ollama bound to internal Docker network; never published | +| Unauthenticated API abuse | Mandatory Bearer token; fail-closed on auth errors | +| API key brute force | Argon2id hashing; constant-time compare; rate limit on auth failures per source IP | +| GPU/token exhaustion (cost attack) | Per-key TPM + token budget; per-tenant ceiling; concurrent connection cap | +| Resource exhaustion via large payloads | Request body size limit (default 256 KiB); `num_predict` cap (default 4096) | +| Model enumeration / training-data exfil via uncommon models | Model allowlist; default-deny. `allow_all_models` is **opt-in per tenant and audited**. Discovery only ever exposes models actually installed on the backend; `/api/tags` and `/v1/models` never reveal models outside the tenant's effective set; "not allowed" and "doesn't exist" return the same generic response | +| Discovery backend unreachable | Fail-closed: an empty/stale-expired discovered set means no model resolves, so requests are denied — never "allow because we couldn't list models" | +| Ollama mutation (model pull/delete) by attacker | Endpoint allowlist; mutating endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`) hard-blocked at the gateway | +| Information disclosure via error messages | Sanitize upstream errors; never proxy Ollama internals to client | +| Audit log tampering | Append-only at app layer; DB role separation; optional WAL archiving | +| Prompt data leakage | Prompt logging off by default; opt-in per key; TTL'd; redaction hook | +| Redis outage causing "fail open" | Fail-closed: if rate-limit/budget backend is unavailable, deny | +| Compromised admin token | Admin token lives in `neuronetz-console`, not in gateway; gateway has no admin endpoints | + +## 4. Architecture + +### 4.1 Component diagram + +``` + Internet + │ TLS + ▼ + ┌──────────────────────┐ + │ Caddy (sidecar) │ Let's Encrypt for api.neuronetz.ai + │ - TLS termination │ HSTS, security headers + │ - HTTP/2, HTTP/3 │ + └──────────┬───────────┘ + │ HTTP/1.1 internal + ┌──────────▼───────────┐ + │ neuronetz-gateway │ FastAPI + uvicorn + │ - authn │ + │ - rate limit │ + │ - budget check │ + │ - proxy + stream │ + │ - token count │ + │ - audit write │ + └──┬────────┬──────┬───┘ + │ │ │ + ┌──────▼──┐ ┌──▼───┐ │ + │Postgres │ │Redis │ │ + │ schema: │ │ keys │ │ + │ gateway │ │bucket│ │ + └─────────┘ └──────┘ │ + │ internal network only + ┌──────▼──────┐ + │ Ollama │ + │ 127.0.0.1 │ + └─────────────┘ + +Same Compose stack also hosts (separate from this SPEC): + - neuronetz-console (PHP/Nibiru) → reads schema `console`, reads schema `gateway` (SELECT) +``` + +### 4.2 Database schemas + +**Single Postgres instance, two schemas:** + +- `gateway` — owned by the gateway service; gateway role has full DDL +- `console` — owned by `neuronetz-console` (out of scope here); console role has full DDL +- Both services connect with their own role. Cross-schema access is explicit GRANT. + +**Console role gets `SELECT` on all `gateway.*` tables.** Console writes go only to `console.*` tables. If the console needs to mutate gateway state (e.g. revoke a key), it does so by writing to a `gateway.revocations` outbox table that the gateway tails (see §4.5). + +### 4.3 Request lifecycle + +1. Caddy terminates TLS, forwards to gateway on internal port. +2. Gateway middleware extracts `Authorization: Bearer `. +3. Key prefix (first 12 chars) used as Redis cache key. On miss, lookup `gateway.api_keys` by prefix; verify full key with argon2id `verify`; cache resolved key metadata in Redis (TTL 60s). +4. Rate limit check (sliding window in Redis, Lua-atomic) — per-key RPM + per-tenant RPM. +5. Budget check (Redis counter for current period; Postgres ledger is source of truth on reset). +6. Concurrent-connection semaphore (Redis `INCR` with TTL). +7. Model allowlist check. Resolve the **effective model set** for the key: + `allow_all := key.allow_all_models ?? tenant.allow_all_models`; + `effective := discovered` if `allow_all` else `(key.allowed_models ?? tenant.allowed_models) ∩ discovered`, + where `discovered` is the cached live model set from discovery (§4.6). The request's + `model` must be in `effective`, else a generic 403 with no disclosure of whether the + model exists but is unpermitted vs. is not installed. +8. Endpoint allowlist check. +9. Request body validation (size, schema, `num_predict` cap). +10. If OpenAI-compat path, translate request to Ollama schema. +11. Open httpx async stream to Ollama. +12. Stream response back to client, accumulating final `prompt_eval_count` + `eval_count`. +13. On stream close: write `gateway.audit_log` row; decrement budget; release semaphore; if prompt logging enabled, write `gateway.prompt_log` row. +14. On any failure: sanitized error to client, audit row with status code, semaphore released. + +### 4.4 Failure modes (fail-closed) + +| Subsystem | If down | Behavior | +|---|---|---| +| Postgres (read) | Key lookup fails | 503 with retry-after; no requests proxied | +| Postgres (write) | Audit write fails | Request still succeeds, audit row buffered in-memory ring (max 1000), drained on recovery; if buffer fills, switch to deny mode | +| Redis | Rate limit / budget unavailable | 503 — fail closed. Never "allow because we can't check." | +| Ollama | Upstream unreachable | 502 with retry-after; circuit breaker opens after 5 consecutive failures, half-open after 30s | +| Caddy | Not a gateway concern | — | + +### 4.5 Cache invalidation (key revocation) + +Console can revoke a key by inserting into `gateway.revocations(key_id, ts, reason)`. Gateway has a background task (`asyncio.create_task` in lifespan) that: +- LISTENs on Postgres channel `key_revoked` (gateway emits NOTIFY on its own write path; console emits via INSERT trigger) +- On notification, evicts the Redis cache entry for that key's prefix +- This makes revocation effectively immediate (≤ Redis RTT) without cross-service HTTP + +### 4.6 Model discovery + +The set of usable models is **never hand-maintained**; it is extracted live from the +Ollama backend. + +- A background task (started in lifespan, like the revocation listener) polls Ollama + `GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds. +- The parsed model set (names + sanitized metadata: family, parameter size, quantization, + size bytes, modified-at) is cached in Redis under `gateway:models:discovered` with TTL + `MODEL_DISCOVERY_CACHE_TTL_S`, and held in-process for hot reads on the request path. +- On startup an initial fetch runs; if Ollama is unreachable the discovered set is empty. +- **Fail-closed:** if the discovered set is empty or its cache has expired and cannot be + refreshed, no model resolves and requests are denied (consistent with default-deny). + Discovery never opens access on failure. +- "Auto-grant": because the effective set (§4.3 step 7) intersects with `discovered` (or + *is* `discovered` when `allow_all_models`), a model pulled into Ollama out-of-band + becomes usable to `allow_all` tenants on the next refresh — no per-tenant config change. +- Discovery is **read-only** against Ollama and uses only the allowlisted `/api/tags` + endpoint; it never triggers a model pull. + +## 5. Data Model (schema `gateway`) + +```sql +CREATE SCHEMA gateway; + +CREATE TYPE gateway.key_status AS ENUM ('active', 'disabled', 'revoked'); +CREATE TYPE gateway.tenant_status AS ENUM ('active', 'suspended', 'closed'); +CREATE TYPE gateway.budget_period AS ENUM ('day', 'month', 'total'); + +CREATE TABLE gateway.tenants ( + id uuid PRIMARY KEY DEFAULT gen_random_uuid(), + name text NOT NULL UNIQUE, + status gateway.tenant_status NOT NULL DEFAULT 'active', + created_at timestamptz NOT NULL DEFAULT now(), + metadata jsonb NOT NULL DEFAULT '{}'::jsonb +); + +CREATE TABLE gateway.tenant_limits ( + tenant_id uuid PRIMARY KEY REFERENCES gateway.tenants(id) ON DELETE CASCADE, + rpm integer NOT NULL DEFAULT 60, + tpm integer NOT NULL DEFAULT 100000, + concurrent integer NOT NULL DEFAULT 8, + tokens_daily bigint, + tokens_monthly bigint, + tokens_total bigint, + allowed_models text[] NOT NULL DEFAULT '{}', + allow_all_models boolean NOT NULL DEFAULT false, -- opt-in: allow any installed model + log_prompts_default boolean NOT NULL DEFAULT false, + prompt_retention_days integer NOT NULL DEFAULT 30, + audit_retention_days integer NOT NULL DEFAULT 365 +); + +CREATE TABLE gateway.api_keys ( + id uuid PRIMARY KEY DEFAULT gen_random_uuid(), + tenant_id uuid NOT NULL REFERENCES gateway.tenants(id) ON DELETE CASCADE, + prefix text NOT NULL UNIQUE, -- first 12 chars, indexed + key_hash text NOT NULL, -- argon2id + name text NOT NULL, + status gateway.key_status NOT NULL DEFAULT 'active', + scopes text[] NOT NULL DEFAULT '{chat,embeddings}', + created_at timestamptz NOT NULL DEFAULT now(), + last_used_at timestamptz, + expires_at timestamptz, + log_prompts boolean, -- NULL = inherit from tenant + metadata jsonb NOT NULL DEFAULT '{}'::jsonb +); + +CREATE INDEX idx_api_keys_prefix ON gateway.api_keys(prefix) WHERE status = 'active'; +CREATE INDEX idx_api_keys_tenant ON gateway.api_keys(tenant_id); + +CREATE TABLE gateway.key_limits ( + key_id uuid PRIMARY KEY REFERENCES gateway.api_keys(id) ON DELETE CASCADE, + rpm integer, -- NULL = inherit tenant + tpm integer, + concurrent integer, + tokens_daily bigint, + tokens_monthly bigint, + tokens_total bigint, + allowed_models text[], -- NULL = inherit tenant + allow_all_models boolean -- NULL = inherit tenant +); + +CREATE TABLE gateway.budget_usage ( + key_id uuid NOT NULL REFERENCES gateway.api_keys(id) ON DELETE CASCADE, + period gateway.budget_period NOT NULL, + period_start timestamptz NOT NULL, + tokens_in bigint NOT NULL DEFAULT 0, + tokens_out bigint NOT NULL DEFAULT 0, + requests bigint NOT NULL DEFAULT 0, + PRIMARY KEY (key_id, period, period_start) +); + +CREATE INDEX idx_budget_usage_period ON gateway.budget_usage(period, period_start); + +CREATE TABLE gateway.audit_log ( + id bigserial PRIMARY KEY, + ts timestamptz NOT NULL DEFAULT now(), + request_id uuid NOT NULL, + tenant_id uuid, -- nullable for auth-failed rows + key_id uuid, + key_prefix text, -- denormalized for forensic queries + method text NOT NULL, + path text NOT NULL, + model text, + tokens_in integer, + tokens_out integer, + latency_ms integer, + status integer NOT NULL, + client_ip inet, + user_agent text, + error_code text +); + +CREATE INDEX idx_audit_ts ON gateway.audit_log(ts); +CREATE INDEX idx_audit_tenant_ts ON gateway.audit_log(tenant_id, ts); +CREATE INDEX idx_audit_key_ts ON gateway.audit_log(key_id, ts); + +CREATE TABLE gateway.prompt_log ( + id bigserial PRIMARY KEY, + audit_id bigint NOT NULL REFERENCES gateway.audit_log(id) ON DELETE CASCADE, + ts timestamptz NOT NULL DEFAULT now(), + key_id uuid NOT NULL, + request_body jsonb NOT NULL, + response_text text, + retention_until timestamptz NOT NULL +); + +CREATE INDEX idx_prompt_log_retention ON gateway.prompt_log(retention_until); + +CREATE TABLE gateway.revocations ( + id bigserial PRIMARY KEY, + key_id uuid NOT NULL, + ts timestamptz NOT NULL DEFAULT now(), + reason text, + processed_at timestamptz +); + +-- Trigger to NOTIFY on revocation insert +CREATE OR REPLACE FUNCTION gateway.notify_key_revoked() RETURNS trigger AS $$ +BEGIN + PERFORM pg_notify('key_revoked', NEW.key_id::text); + RETURN NEW; +END; +$$ LANGUAGE plpgsql; + +CREATE TRIGGER trg_notify_key_revoked + AFTER INSERT ON gateway.revocations + FOR EACH ROW EXECUTE FUNCTION gateway.notify_key_revoked(); + +-- Grants for console role (created in console SPEC, referenced here) +-- GRANT USAGE ON SCHEMA gateway TO console_role; +-- GRANT SELECT ON ALL TABLES IN SCHEMA gateway TO console_role; +-- GRANT INSERT ON gateway.revocations TO console_role; +``` + +## 6. API Surface + +### 6.1 Native Ollama passthrough (allowlisted) + +| Path | Method | Notes | +|---|---|---| +| `/api/chat` | POST | Streamed (NDJSON) and non-streamed | +| `/api/generate` | POST | Streamed (NDJSON) and non-streamed | +| `/api/embeddings` | POST | Non-streamed | +| `/api/embed` | POST | Newer Ollama embeddings endpoint | +| `/api/tags` | GET | Returns the tenant's **effective** model set (live-discovered ∩ allowed, or *all* discovered when `allow_all_models`). Sourced from discovery (§4.6), never a static list | +| `/api/show` | POST | Allowed only for models in the tenant's effective set; returns sanitized model info (no system prompts, no template) | +| `/api/ps` | GET | **Blocked** — leaks loaded models | +| `/api/version` | GET | Returns gateway version, not Ollama version | + +### 6.2 Hard-blocked Ollama endpoints (always 403) + +`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*` + +### 6.3 OpenAI-compatible + +| Path | Method | Maps to | +|---|---|---| +| `/v1/chat/completions` | POST | `/api/chat` | +| `/v1/completions` | POST | `/api/generate` | +| `/v1/embeddings` | POST | `/api/embed` | +| `/v1/models` | GET | `/api/tags` (the tenant's effective discovered set), in OpenAI model-list format | + +Translation must preserve streaming. SSE (`data: {...}\n\n`) for OpenAI-compat; NDJSON for native. + +### 6.4 Gateway endpoints + +| Path | Method | Auth | Purpose | +|---|---|---|---| +| `/healthz` | GET | none | Liveness — process responsive | +| `/readyz` | GET | none | Readiness — DB + Redis + Ollama all reachable | +| `/metrics` | GET | none (loopback only) | Prometheus exposition (counters, histograms) | + +No admin endpoints. Admin lives in `neuronetz-console`. + +### 6.5 Response headers + +Every proxied response carries: +- `X-Request-ID: ` +- `X-RateLimit-Limit-Requests: ` +- `X-RateLimit-Remaining-Requests: ` +- `X-RateLimit-Limit-Tokens: ` +- `X-RateLimit-Remaining-Tokens: ` +- `X-Budget-Period: day|month|total` +- `X-Budget-Tokens-Remaining: ` + +429 responses additionally carry `Retry-After: `. + +## 7. Configuration + +All via environment variables, validated by Pydantic Settings on boot. Boot fails loudly on invalid config. + +``` +# Service +GATEWAY_BIND_HOST=0.0.0.0 +GATEWAY_BIND_PORT=8080 +GATEWAY_LOG_LEVEL=INFO +GATEWAY_LOG_FORMAT=json # json|console +GATEWAY_REQUEST_ID_HEADER=X-Request-ID +GATEWAY_TRUSTED_PROXIES=127.0.0.1,caddy # for X-Forwarded-For + +# Upstream +OLLAMA_BASE_URL=http://ollama:11434 +OLLAMA_CONNECT_TIMEOUT_S=5 +OLLAMA_READ_TIMEOUT_S=600 +OLLAMA_MAX_CONNECTIONS=64 + +# Model discovery (§4.6) +MODEL_DISCOVERY_REFRESH_S=60 # how often to re-query Ollama /api/tags +MODEL_DISCOVERY_CACHE_TTL_S=120 # Redis cache TTL for the discovered model set + +# Database +DATABASE_URL=postgresql+asyncpg://gateway:...@postgres:5432/neuronetz +DATABASE_POOL_SIZE=10 +DATABASE_POOL_OVERFLOW=20 + +# Redis +REDIS_URL=redis://redis:6379/0 +REDIS_KEY_CACHE_TTL_S=60 + +# Limits (defaults; per-tenant/key overrides in DB) +DEFAULT_RPM=60 +DEFAULT_TPM=100000 +DEFAULT_CONCURRENT=8 +MAX_REQUEST_BODY_BYTES=262144 +MAX_NUM_PREDICT=4096 + +# Security +ARGON2_TIME_COST=3 +ARGON2_MEMORY_COST_KIB=65536 +ARGON2_PARALLELISM=4 +AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN=20 + +# Audit +AUDIT_BUFFER_SIZE=1000 +PROMPT_LOG_DEFAULT_RETENTION_DAYS=30 +AUDIT_LOG_DEFAULT_RETENTION_DAYS=365 +``` + +## 8. Repository Layout + +``` +neuronetz-gateway/ +├── pyproject.toml # uv-managed, ruff, mypy --strict, pytest +├── README.md +├── LICENSE # Apache 2.0 +├── docker-compose.yml # full stack incl. console placeholder +├── docker-compose.dev.yml # without caddy, gateway exposed on localhost +├── Dockerfile # multi-stage, python:3.12-slim base +├── .env.example +├── .dockerignore +├── .gitignore +├── alembic.ini +├── alembic/ +│ ├── env.py +│ └── versions/ +│ └── 0001_initial.py # creates schema `gateway` and all tables +├── ops/ +│ ├── caddy/ +│ │ └── Caddyfile.example +│ └── systemd/ +│ └── neuronetz-gateway.service +├── src/neuronetz_gateway/ +│ ├── __init__.py +│ ├── __main__.py # uvicorn entry +│ ├── app.py # FastAPI factory +│ ├── config.py # Pydantic Settings +│ ├── deps.py # DI providers +│ ├── lifespan.py # startup/shutdown, NOTIFY listener +│ ├── errors.py # exception types, handlers, sanitization +│ ├── auth/ +│ │ ├── __init__.py +│ │ ├── hashing.py # argon2id wrapper +│ │ ├── keys.py # key generation, prefix, verify +│ │ └── middleware.py +│ ├── ratelimit/ +│ │ ├── __init__.py +│ │ ├── sliding_window.py # Redis Lua script +│ │ └── concurrency.py # semaphore via Redis +│ ├── budget/ +│ │ ├── __init__.py +│ │ ├── counter.py # Redis period counters +│ │ └── ledger.py # Postgres reconciliation +│ ├── proxy/ +│ │ ├── __init__.py +│ │ ├── ollama.py # httpx streaming client +│ │ ├── translate.py # OpenAI <-> Ollama schemas +│ │ ├── token_counter.py # parse usage from stream +│ │ ├── discovery.py # live model discovery from Ollama /api/tags (§4.6) +│ │ └── allowlist.py # effective-set resolution (allow_all / allowed ∩ discovered) +│ ├── routes/ +│ │ ├── __init__.py +│ │ ├── ollama_native.py +│ │ ├── openai_compat.py +│ │ └── health.py +│ ├── db/ +│ │ ├── __init__.py +│ │ ├── session.py +│ │ ├── models.py # SQLAlchemy 2.0 +│ │ └── repositories.py +│ ├── audit/ +│ │ ├── __init__.py +│ │ ├── writer.py # buffered async writer +│ │ └── prompt_log.py +│ ├── observability/ +│ │ ├── __init__.py +│ │ ├── logging.py # structlog config +│ │ └── metrics.py # prometheus +│ └── cli/ +│ ├── __init__.py +│ └── manage.py # typer: create-tenant, create-key, ... +├── tests/ +│ ├── conftest.py # testcontainers fixtures +│ ├── unit/ +│ │ ├── test_hashing.py +│ │ ├── test_translate.py +│ │ ├── test_token_counter.py +│ │ ├── test_discovery.py +│ │ ├── test_allowlist.py +│ │ └── test_sliding_window.py +│ ├── integration/ +│ │ ├── test_auth_flow.py +│ │ ├── test_rate_limit.py +│ │ ├── test_budget.py +│ │ ├── test_proxy_stream.py +│ │ ├── test_openai_compat.py +│ │ ├── test_revocation.py +│ │ └── mock_ollama.py # FastAPI mock with NDJSON/SSE +│ └── load/ +│ └── locustfile.py +└── docs/ + ├── ARCHITECTURE.md + ├── DEPLOYMENT.md + ├── API.md + ├── THREAT_MODEL.md + └── OPERATIONS.md # runbook: revoke key, rotate, check usage +``` + +## 9. Non-Functional Requirements + +- **Performance:** p50 overhead < 5 ms over direct Ollama call (auth + ratelimit + audit); p99 < 25 ms (excluding upstream latency) +- **Streaming:** Time-to-first-byte must not be degraded by gateway logic — audit write happens **after** stream close +- **Memory:** Steady-state RSS < 200 MiB per gateway worker under 100 concurrent streams +- **Concurrency:** Handle 200 concurrent connections per worker; 4 workers per instance default +- **Test coverage:** ≥ 85% line coverage on `src/neuronetz_gateway/` excluding `__main__` and CLI; 100% on `auth/`, `ratelimit/`, `budget/` +- **Security:** No `eval`, no `exec`, no shell-out, no `pickle`. Bandit clean. `pip-audit` clean on every CI run. +- **Type safety:** `mypy --strict` clean +- **Lint:** `ruff check` clean with project ruleset (E, F, I, B, UP, S, ASYNC) + +## 10. Tooling + +- Python 3.12 +- `uv` for dependency management (pyproject.toml + uv.lock) +- FastAPI ≥ 0.115, uvicorn[standard], httpx ≥ 0.27, SQLAlchemy 2.0 (async), asyncpg, redis ≥ 5.0 (with hiredis), structlog, pydantic ≥ 2.9, pydantic-settings, argon2-cffi, typer, prometheus-client +- Test: pytest, pytest-asyncio, pytest-cov, testcontainers, httpx (test client), respx (mock), locust +- Lint/format: ruff, mypy --strict, bandit, pip-audit +- CI: GitHub Actions workflow (lint, type, test with coverage, build image, push on tag) + +## 11. Bootstrap CLI (Typer) + +``` +neuronetz-gateway create-tenant --name "acme" [--rpm 60] [--tpm 100000] +neuronetz-gateway create-key --tenant acme --name "prod-server-1" [--scopes chat,embeddings] +neuronetz-gateway revoke-key --prefix nz_abc12345 +neuronetz-gateway list-keys --tenant acme +neuronetz-gateway show-usage --tenant acme [--period day|month|total] +neuronetz-gateway set-budget --key nz_abc12345 --daily 1000000 --monthly 30000000 +neuronetz-gateway set-models --tenant acme --models llama3.1:8b,mistral:7b +neuronetz-gateway set-models --tenant acme --allow-all # opt into allow_all_models +neuronetz-gateway set-models --tenant acme --no-allow-all # back to explicit allowlist +neuronetz-gateway list-models [--tenant acme] # show live-discovered models + # (and the tenant's effective set) +``` + +`create-tenant` accepts `--allow-all-models / --no-allow-all-models` (default off). +`list-models` reads the discovery cache (§4.6); with `--tenant` it also shows that tenant's +resolved effective set. + +Key format: `nz_<12-char-prefix><32-char-random>`. Prefix is stored; full key is hashed (argon2id). On creation, the full key is printed exactly once. + +## 12. Acceptance Criteria + +The build is "done" when every box below is checked. The orchestrator must verify each before declaring v0.1.0. + +- [ ] `docker compose up` from a clean checkout produces a running stack with TLS via Caddy (self-signed in dev, Let's Encrypt-ready in prod). +- [ ] CLI creates tenant and key; printed key successfully authenticates an `/api/chat` call. +- [ ] Unauthenticated request returns 401 with no Ollama details leaked. +- [ ] Request to `/api/pull` returns 403 with generic error message. +- [ ] Streaming `/api/chat` works end-to-end; first byte arrives within Ollama's own TTFB + < 10 ms gateway overhead. +- [ ] Streaming `/v1/chat/completions` returns valid SSE with `data: [DONE]` terminator. +- [ ] Token counts in audit log match Ollama's reported `prompt_eval_count` + `eval_count` exactly. +- [ ] `/api/tags` and `/v1/models` reflect the **live** Ollama model set (discovery, §4.6): an `allow_all_models` tenant sees every installed model and a newly-pulled model appears within one refresh interval; a default-deny tenant sees only `allowed_models ∩ discovered`; a request for a model outside the effective set returns a generic 403; with discovery unavailable, requests fail closed (deny), not open. +- [ ] Rate limit triggers at configured RPM with `Retry-After` header. +- [ ] Token budget enforces and blocks at zero remaining with descriptive error. +- [ ] Redis outage causes 503 (fail-closed), not 200. +- [ ] Revocation via `INSERT INTO gateway.revocations` evicts Redis cache within 1 second. +- [ ] `mypy --strict`, `ruff check`, `bandit`, `pip-audit` all clean in CI. +- [ ] Test coverage ≥ 85% overall, 100% in `auth/`, `ratelimit/`, `budget/`. +- [ ] `docs/THREAT_MODEL.md`, `docs/DEPLOYMENT.md`, `docs/OPERATIONS.md` present and accurate. +- [ ] Load test (locust): 100 concurrent users sustained 5 minutes, p99 gateway overhead < 25 ms, zero 5xx outside induced failures. + +## 13. Open Questions (decide during build) + +1. Embedding cost accounting — Ollama doesn't return `eval_count` for embeddings. Decision: charge based on `prompt_eval_count` only; document as such. +2. SSE vs NDJSON heuristic for OpenAI-compat — always SSE per OpenAI spec. NDJSON only on native `/api/*`. +3. Prometheus cardinality — do not label by `key_id` (too many series); label by `tenant_id` only; per-key data lives in Postgres. +4. **Model discovery source** — the live model list is `GET /api/tags` on the Ollama backend; there is no separate registry. Cached in Redis + in-process, refreshed every `MODEL_DISCOVERY_REFRESH_S`. +5. **Discovery failure is fail-closed** — empty/expired discovered set ⇒ no model resolves ⇒ deny. Discovery never opens access on error. +6. **No existence disclosure** — a model that is installed-but-unpermitted and a model that is not installed both return the same generic response, to prevent enumeration. +7. **`allow_all_models` precedence** — key-level `allow_all_models` (when non-NULL) overrides the tenant flag; otherwise the tenant flag applies. Same NULL-inherits-tenant rule as the other key limits. + +## 14. References + +- Ollama API: https://github.com/ollama/ollama/blob/main/docs/api.md +- OpenAI Chat Completions: https://platform.openai.com/docs/api-reference/chat +- Nibiru (sibling console project): https://nibiru-framework.com +- Argon2 RFC 9106 diff --git a/src/neuronetz_gateway/__init__.py b/src/neuronetz_gateway/__init__.py new file mode 100644 index 0000000..2bdde0a --- /dev/null +++ b/src/neuronetz_gateway/__init__.py @@ -0,0 +1,7 @@ +"""neuronetz-gateway: secure multi-tenant API gateway in front of Ollama.""" + +from __future__ import annotations + +__version__ = "0.1.0" + +__all__ = ["__version__"] diff --git a/src/neuronetz_gateway/__main__.py b/src/neuronetz_gateway/__main__.py new file mode 100644 index 0000000..825af6a --- /dev/null +++ b/src/neuronetz_gateway/__main__.py @@ -0,0 +1,28 @@ +"""Uvicorn entry point: ``python -m neuronetz_gateway``. + +Binds the app to ``GATEWAY_BIND_HOST``:``GATEWAY_BIND_PORT`` (default +0.0.0.0:8080). The factory string is passed to uvicorn so the app is built in +the worker process. +""" + +from __future__ import annotations + +import uvicorn + +from neuronetz_gateway.config import get_settings + + +def main() -> None: + """Run the gateway under uvicorn using the configured bind address.""" + settings = get_settings() + uvicorn.run( + "neuronetz_gateway.app:create_app", + factory=True, + host=settings.gateway_bind_host, + port=settings.gateway_bind_port, + log_level=settings.gateway_log_level.lower(), + ) + + +if __name__ == "__main__": + main() diff --git a/src/neuronetz_gateway/app.py b/src/neuronetz_gateway/app.py new file mode 100644 index 0000000..a1042d5 --- /dev/null +++ b/src/neuronetz_gateway/app.py @@ -0,0 +1,111 @@ +"""FastAPI application factory. + +``create_app()`` is the shared contract entry point: other agents (DevOps, QA) +import and serve this. It configures logging, installs the request-id and auth +middleware, registers the sanitizing exception handlers, mounts routers, and +binds the lifespan that manages backend handles + background tasks. + +Production safety: FastAPI's ``/docs`` + ``/openapi.json`` are disabled by +default (enabled only via ``DOCS_ENABLED``). The ``/playground`` route is served +only when ``PLAYGROUND_ENABLED`` is true and ``PLAYGROUND_FILE`` exists. +""" + +from __future__ import annotations + +import uuid +from pathlib import Path + +from fastapi import FastAPI, Request +from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint +from starlette.responses import HTMLResponse, Response +from starlette.types import ASGIApp + +from neuronetz_gateway import __version__ +from neuronetz_gateway.auth.middleware import AuthMiddleware +from neuronetz_gateway.config import Settings, get_settings +from neuronetz_gateway.errors import register_exception_handlers +from neuronetz_gateway.lifespan import lifespan +from neuronetz_gateway.observability.logging import configure_logging +from neuronetz_gateway.routes import health, ollama_native, openai_compat + + +class RequestIDMiddleware(BaseHTTPMiddleware): + """Assign/propagate a request id and expose it on ``request.state``. + + Honours an inbound ``X-Request-ID`` from a trusted proxy; otherwise mints a + fresh UUID. The id is echoed on the response and used by error handlers. + """ + + def __init__(self, app: ASGIApp, header_name: str) -> None: + super().__init__(app) + self._header = header_name + + async def dispatch( + self, request: Request, call_next: RequestResponseEndpoint + ) -> Response: + incoming = request.headers.get(self._header) + request_id = incoming or str(uuid.uuid4()) + request.state.request_id = request_id + response = await call_next(request) + response.headers[self._header] = request_id + return response + + +def _register_playground(app: FastAPI, cfg: Settings) -> None: + """Add the flag-gated ``/playground`` route (HTML asset, owned by docs agent). + + The file is read off the event loop via ``asyncio.to_thread`` so a slow disk + cannot stall request handling. Missing-file is a simple 404, never an error. + """ + import asyncio as _asyncio + + def _load(path_str: str) -> str | None: + p = Path(path_str) + if not p.is_file(): + return None + return p.read_text(encoding="utf-8") + + @app.get("/playground", include_in_schema=False) + async def playground() -> Response: + content = await _asyncio.to_thread(_load, cfg.playground_file) + if content is None: + return Response(status_code=404, content="Not found") + return HTMLResponse(content) + + +def create_app(settings: Settings | None = None) -> FastAPI: + """Build and return the configured FastAPI application.""" + cfg = settings or get_settings() + configure_logging(level=cfg.gateway_log_level, fmt=cfg.gateway_log_format) + + app = FastAPI( + title="neuronetz-gateway", + version=__version__, + lifespan=lifespan, + docs_url="/docs" if cfg.docs_enabled else None, + redoc_url="/redoc" if cfg.docs_enabled else None, + openapi_url="/openapi.json" if cfg.docs_enabled else None, + ) + # Settings are needed by the auth middleware before lifespan runs in some + # test setups; lifespan also sets this. Setting here is idempotent. + app.state.settings = cfg + + # Auth runs inside RequestID so a request id is always available for the + # sanitized 401 the auth middleware emits. add_middleware wraps outermost + # last, so add Auth first then RequestID. + app.add_middleware(AuthMiddleware) + app.add_middleware(RequestIDMiddleware, header_name=cfg.gateway_request_id_header) + + register_exception_handlers(app) + + app.include_router(health.router) + app.include_router(openai_compat.router) + app.include_router(ollama_native.router) + + if cfg.playground_enabled: + _register_playground(app, cfg) + + return app + + +__all__ = ["RequestIDMiddleware", "create_app"] diff --git a/src/neuronetz_gateway/config.py b/src/neuronetz_gateway/config.py new file mode 100644 index 0000000..327652a --- /dev/null +++ b/src/neuronetz_gateway/config.py @@ -0,0 +1,86 @@ +"""Application configuration via Pydantic Settings v2. + +Reads every environment variable documented in SPEC §7 with the documented +defaults. Boot fails loudly (ValidationError) on invalid config. +""" + +from __future__ import annotations + +from functools import lru_cache + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """Gateway runtime configuration. All fields map to SPEC §7 env vars.""" + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", + case_sensitive=False, + ) + + # --- Service --- + gateway_bind_host: str = Field(default="0.0.0.0") # noqa: S104 - bind-all is intended in container + gateway_bind_port: int = Field(default=8080) + gateway_log_level: str = Field(default="INFO") + gateway_log_format: str = Field(default="json") # json|console + gateway_request_id_header: str = Field(default="X-Request-ID") + gateway_trusted_proxies: str = Field(default="127.0.0.1,caddy") + + # --- Upstream (Ollama) --- + ollama_base_url: str = Field(default="http://ollama:11434") + ollama_connect_timeout_s: int = Field(default=5) + ollama_read_timeout_s: int = Field(default=600) + ollama_max_connections: int = Field(default=64) + + # --- Model discovery (SPEC §4.6) --- + model_discovery_refresh_s: int = Field(default=60) + model_discovery_cache_ttl_s: int = Field(default=120) + + # --- Database --- + database_url: str = Field( + default="postgresql+asyncpg://gateway:gateway@postgres:5432/neuronetz", + ) + database_pool_size: int = Field(default=10) + database_pool_overflow: int = Field(default=20) + + # --- Redis --- + redis_url: str = Field(default="redis://redis:6379/0") + redis_key_cache_ttl_s: int = Field(default=60) + + # --- Limits --- + default_rpm: int = Field(default=60) + default_tpm: int = Field(default=100_000) + default_concurrent: int = Field(default=8) + max_request_body_bytes: int = Field(default=262_144) + max_num_predict: int = Field(default=4096) + + # --- Security --- + argon2_time_cost: int = Field(default=3) + argon2_memory_cost_kib: int = Field(default=65_536) + argon2_parallelism: int = Field(default=4) + auth_failure_rate_limit_per_ip_per_min: int = Field(default=20) + + # --- Audit --- + audit_buffer_size: int = Field(default=1000) + prompt_log_default_retention_days: int = Field(default=30) + audit_log_default_retention_days: int = Field(default=365) + + # --- Playground / docs (prod-safe defaults: both OFF) --- + playground_enabled: bool = Field(default=False) + playground_file: str = Field(default="/app/playground/index.html") + docs_enabled: bool = Field(default=False) + + @property + def trusted_proxies_list(self) -> list[str]: + """Parse the comma-separated trusted-proxy list into individual hosts.""" + return [p.strip() for p in self.gateway_trusted_proxies.split(",") if p.strip()] + + +@lru_cache(maxsize=1) +def get_settings() -> Settings: + """Return a cached Settings instance, constructed from the environment.""" + return Settings() diff --git a/src/neuronetz_gateway/db/__init__.py b/src/neuronetz_gateway/db/__init__.py new file mode 100644 index 0000000..bcfa2ea --- /dev/null +++ b/src/neuronetz_gateway/db/__init__.py @@ -0,0 +1,3 @@ +"""Database access layer: SQLAlchemy models, session factory, repositories.""" + +from __future__ import annotations diff --git a/src/neuronetz_gateway/db/models.py b/src/neuronetz_gateway/db/models.py new file mode 100644 index 0000000..eba7f0a --- /dev/null +++ b/src/neuronetz_gateway/db/models.py @@ -0,0 +1,292 @@ +"""SQLAlchemy 2.0 (async) ORM models for schema ``gateway`` per SPEC §5. + +These mirror the migration in ``alembic/versions/0001_initial.py`` exactly. +The migration is the authoritative DDL; these models are for application use. +""" + +from __future__ import annotations + +import datetime +import enum +import uuid + +from sqlalchemy import ( + BigInteger, + Boolean, + ForeignKey, + Integer, + MetaData, + String, + Text, + text, +) +from sqlalchemy.dialects.postgresql import ARRAY, ENUM, INET, JSONB, TIMESTAMP, UUID +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column + +GATEWAY_SCHEMA = "gateway" + +# Stable naming convention so Alembic autogenerate and ad-hoc DDL agree. +_NAMING_CONVENTION = { + "ix": "ix_%(column_0_label)s", + "uq": "uq_%(table_name)s_%(column_0_name)s", + "ck": "ck_%(table_name)s_%(constraint_name)s", + "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s", + "pk": "pk_%(table_name)s", +} + + +class Base(DeclarativeBase): + """Declarative base; all tables live in the ``gateway`` schema.""" + + metadata = MetaData(schema=GATEWAY_SCHEMA, naming_convention=_NAMING_CONVENTION) + + +class KeyStatus(enum.StrEnum): + """Lifecycle states for an API key (SPEC §5 ``gateway.key_status``).""" + + active = "active" + disabled = "disabled" + revoked = "revoked" + + +class TenantStatus(enum.StrEnum): + """Lifecycle states for a tenant (SPEC §5 ``gateway.tenant_status``).""" + + active = "active" + suspended = "suspended" + closed = "closed" + + +class BudgetPeriod(enum.StrEnum): + """Budget accounting periods (SPEC §5 ``gateway.budget_period``).""" + + day = "day" + month = "month" + total = "total" + + +# Reuse existing Postgres enum types (the migration creates them); do not let +# SQLAlchemy try to CREATE TYPE again at runtime. +_key_status_enum = ENUM(KeyStatus, name="key_status", schema=GATEWAY_SCHEMA, create_type=False) +_tenant_status_enum = ENUM( + TenantStatus, name="tenant_status", schema=GATEWAY_SCHEMA, create_type=False +) +_budget_period_enum = ENUM( + BudgetPeriod, name="budget_period", schema=GATEWAY_SCHEMA, create_type=False +) + + +class Tenant(Base): + """A tenant: the top-level isolation and ownership boundary.""" + + __tablename__ = "tenants" + + id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()") + ) + name: Mapped[str] = mapped_column(Text, nullable=False, unique=True) + status: Mapped[TenantStatus] = mapped_column( + _tenant_status_enum, nullable=False, server_default=text("'active'") + ) + created_at: Mapped[datetime.datetime] = mapped_column( + TIMESTAMP(timezone=True), nullable=False, server_default=text("now()") + ) + tenant_metadata: Mapped[dict[str, object]] = mapped_column( + "metadata", JSONB, nullable=False, server_default=text("'{}'::jsonb") + ) + + +class TenantLimit(Base): + """Per-tenant default limits and retention policy.""" + + __tablename__ = "tenant_limits" + + tenant_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("tenants.id", ondelete="CASCADE"), + primary_key=True, + ) + rpm: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("60")) + tpm: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("100000")) + concurrent: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("8")) + tokens_daily: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + tokens_monthly: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + tokens_total: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + allowed_models: Mapped[list[str]] = mapped_column( + ARRAY(Text), nullable=False, server_default=text("'{}'") + ) + # When true, the tenant may use ANY model currently installed on the Ollama + # backend (resolved live via model discovery). When false (default), access is + # default-deny and restricted to ``allowed_models`` intersected with the live set. + allow_all_models: Mapped[bool] = mapped_column( + Boolean, nullable=False, server_default=text("false") + ) + log_prompts_default: Mapped[bool] = mapped_column( + Boolean, nullable=False, server_default=text("false") + ) + prompt_retention_days: Mapped[int] = mapped_column( + Integer, nullable=False, server_default=text("30") + ) + audit_retention_days: Mapped[int] = mapped_column( + Integer, nullable=False, server_default=text("365") + ) + + +class ApiKey(Base): + """An API key belonging to a tenant. The full key is never stored.""" + + __tablename__ = "api_keys" + + id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), primary_key=True, server_default=text("gen_random_uuid()") + ) + tenant_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("tenants.id", ondelete="CASCADE"), + nullable=False, + ) + prefix: Mapped[str] = mapped_column(Text, nullable=False, unique=True) + key_hash: Mapped[str] = mapped_column(Text, nullable=False) + name: Mapped[str] = mapped_column(Text, nullable=False) + status: Mapped[KeyStatus] = mapped_column( + _key_status_enum, nullable=False, server_default=text("'active'") + ) + scopes: Mapped[list[str]] = mapped_column( + ARRAY(Text), nullable=False, server_default=text("'{chat,embeddings}'") + ) + created_at: Mapped[datetime.datetime] = mapped_column( + TIMESTAMP(timezone=True), nullable=False, server_default=text("now()") + ) + last_used_at: Mapped[datetime.datetime | None] = mapped_column( + TIMESTAMP(timezone=True), nullable=True + ) + expires_at: Mapped[datetime.datetime | None] = mapped_column( + TIMESTAMP(timezone=True), nullable=True + ) + log_prompts: Mapped[bool | None] = mapped_column(Boolean, nullable=True) + key_metadata: Mapped[dict[str, object]] = mapped_column( + "metadata", JSONB, nullable=False, server_default=text("'{}'::jsonb") + ) + + +class KeyLimit(Base): + """Per-key overrides; NULL columns inherit the tenant value.""" + + __tablename__ = "key_limits" + + key_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("api_keys.id", ondelete="CASCADE"), + primary_key=True, + ) + rpm: Mapped[int | None] = mapped_column(Integer, nullable=True) + tpm: Mapped[int | None] = mapped_column(Integer, nullable=True) + concurrent: Mapped[int | None] = mapped_column(Integer, nullable=True) + tokens_daily: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + tokens_monthly: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + tokens_total: Mapped[int | None] = mapped_column(BigInteger, nullable=True) + allowed_models: Mapped[list[str] | None] = mapped_column(ARRAY(Text), nullable=True) + # NULL = inherit tenant's allow_all_models; otherwise overrides it for this key. + allow_all_models: Mapped[bool | None] = mapped_column(Boolean, nullable=True) + + +class BudgetUsage(Base): + """Token/request accounting per key, period, and period start.""" + + __tablename__ = "budget_usage" + + key_id: Mapped[uuid.UUID] = mapped_column( + UUID(as_uuid=True), + ForeignKey("api_keys.id", ondelete="CASCADE"), + primary_key=True, + ) + period: Mapped[BudgetPeriod] = mapped_column(_budget_period_enum, primary_key=True) + period_start: Mapped[datetime.datetime] = mapped_column( + TIMESTAMP(timezone=True), primary_key=True + ) + tokens_in: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0")) + tokens_out: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0")) + requests: Mapped[int] = mapped_column(BigInteger, nullable=False, server_default=text("0")) + + +class AuditLog(Base): + """Always-on append-only request metadata log.""" + + __tablename__ = "audit_log" + + id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True) + ts: Mapped[datetime.datetime] = mapped_column( + TIMESTAMP(timezone=True), nullable=False, server_default=text("now()") + ) + request_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False) + tenant_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True) + key_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True) + key_prefix: Mapped[str | None] = mapped_column(Text, nullable=True) + method: Mapped[str] = mapped_column(Text, nullable=False) + path: Mapped[str] = mapped_column(Text, nullable=False) + model: Mapped[str | None] = mapped_column(Text, nullable=True) + tokens_in: Mapped[int | None] = mapped_column(Integer, nullable=True) + tokens_out: Mapped[int | None] = mapped_column(Integer, nullable=True) + latency_ms: Mapped[int | None] = mapped_column(Integer, nullable=True) + status: Mapped[int] = mapped_column(Integer, nullable=False) + client_ip: Mapped[str | None] = mapped_column(INET, nullable=True) + user_agent: Mapped[str | None] = mapped_column(Text, nullable=True) + error_code: Mapped[str | None] = mapped_column(Text, nullable=True) + + +class PromptLog(Base): + """Opt-in, TTL'd capture of request/response bodies.""" + + __tablename__ = "prompt_log" + + id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True) + audit_id: Mapped[int] = mapped_column( + BigInteger, + ForeignKey("audit_log.id", ondelete="CASCADE"), + nullable=False, + ) + ts: Mapped[datetime.datetime] = mapped_column( + TIMESTAMP(timezone=True), nullable=False, server_default=text("now()") + ) + key_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False) + request_body: Mapped[dict[str, object]] = mapped_column(JSONB, nullable=False) + response_text: Mapped[str | None] = mapped_column(Text, nullable=True) + retention_until: Mapped[datetime.datetime] = mapped_column( + TIMESTAMP(timezone=True), nullable=False + ) + + +class Revocation(Base): + """Outbox table written by console (or gateway) to revoke a key. + + An ``AFTER INSERT`` trigger fires ``pg_notify('key_revoked', key_id)``. + """ + + __tablename__ = "revocations" + + id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True) + key_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False) + ts: Mapped[datetime.datetime] = mapped_column( + TIMESTAMP(timezone=True), nullable=False, server_default=text("now()") + ) + reason: Mapped[str | None] = mapped_column(String, nullable=True) + processed_at: Mapped[datetime.datetime | None] = mapped_column( + TIMESTAMP(timezone=True), nullable=True + ) + + +__all__ = [ + "GATEWAY_SCHEMA", + "ApiKey", + "AuditLog", + "Base", + "BudgetPeriod", + "BudgetUsage", + "KeyLimit", + "KeyStatus", + "PromptLog", + "Revocation", + "Tenant", + "TenantLimit", + "TenantStatus", +] diff --git a/src/neuronetz_gateway/db/session.py b/src/neuronetz_gateway/db/session.py new file mode 100644 index 0000000..0b074c4 --- /dev/null +++ b/src/neuronetz_gateway/db/session.py @@ -0,0 +1,53 @@ +"""Async SQLAlchemy engine and session factory construction. + +Phase 1 provides the wiring only; the lifespan owns the engine instance and +stores it on ``app.state``. Business-logic callers should depend on the +session factory via ``deps.py``. +""" + +from __future__ import annotations + +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager + +from sqlalchemy.ext.asyncio import ( + AsyncEngine, + AsyncSession, + async_sessionmaker, + create_async_engine, +) + +from neuronetz_gateway.config import Settings + + +def create_engine(settings: Settings) -> AsyncEngine: + """Build the async engine from settings (asyncpg driver, pooled).""" + return create_async_engine( + settings.database_url, + pool_size=settings.database_pool_size, + max_overflow=settings.database_pool_overflow, + pool_pre_ping=True, + future=True, + ) + + +def create_session_factory(engine: AsyncEngine) -> async_sessionmaker[AsyncSession]: + """Build a session factory bound to the given engine.""" + return async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession) + + +@asynccontextmanager +async def session_scope( + factory: async_sessionmaker[AsyncSession], +) -> AsyncIterator[AsyncSession]: + """Provide a transactional session scope, committing on success.""" + async with factory() as session: + try: + yield session + await session.commit() + except Exception: + await session.rollback() + raise + + +__all__ = ["create_engine", "create_session_factory", "session_scope"] diff --git a/src/neuronetz_gateway/deps.py b/src/neuronetz_gateway/deps.py new file mode 100644 index 0000000..a93e5d6 --- /dev/null +++ b/src/neuronetz_gateway/deps.py @@ -0,0 +1,180 @@ +"""FastAPI dependency-injection providers. + +Exposes typed accessors for the handles placed on ``app.state`` by the lifespan +(Redis, the upstream httpx client, the DB session factory, the discovery cache) +plus the request principal and the proxy client. + +QA override contract +-------------------- +Routes obtain the upstream proxy via :func:`get_ollama_client`. Tests override +the *Ollama backend* by overriding this provider:: + + from neuronetz_gateway.deps import get_ollama_client + from neuronetz_gateway.proxy.ollama import OllamaClient + import httpx + from tests.integration.mock_ollama import create_mock_ollama + + transport = httpx.ASGITransport(app=create_mock_ollama()) + mock_http = httpx.AsyncClient(transport=transport, base_url="http://ollama") + app.dependency_overrides[get_ollama_client] = lambda: OllamaClient(mock_http) + +Because ``get_ollama_client`` returns a fully-built :class:`OllamaClient`, an +override needs no access to ``app.state`` and can point at the in-process mock. +""" + +from __future__ import annotations + +from collections.abc import AsyncIterator +from typing import Annotated + +import httpx +import redis.asyncio as redis +from fastapi import Depends, Request +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker + +from neuronetz_gateway.audit.writer import AuditWriter +from neuronetz_gateway.auth.principal import Principal +from neuronetz_gateway.budget.counter import BudgetCounter +from neuronetz_gateway.config import Settings, get_settings +from neuronetz_gateway.errors import AuthenticationError, DependencyUnavailableError +from neuronetz_gateway.proxy.discovery import DiscoveryCache +from neuronetz_gateway.proxy.ollama import OllamaClient +from neuronetz_gateway.proxy.pipeline import Pipeline +from neuronetz_gateway.ratelimit.concurrency import ConcurrencyLimiter +from neuronetz_gateway.ratelimit.sliding_window import SlidingWindowLimiter + + +def get_config() -> Settings: + """Provide the cached application settings.""" + return get_settings() + + +def get_redis(request: Request) -> redis.Redis: + """Provide the shared Redis client, failing closed if unavailable.""" + client: redis.Redis | None = getattr(request.app.state, "redis", None) + if client is None: + raise DependencyUnavailableError(internal_detail="redis client not initialised") + return client + + +def get_http_client(request: Request) -> httpx.AsyncClient: + """Provide the shared upstream httpx client.""" + client: httpx.AsyncClient | None = getattr(request.app.state, "http_client", None) + if client is None: + raise DependencyUnavailableError(internal_detail="http client not initialised") + return client + + +def get_ollama_client(request: Request) -> OllamaClient: + """Provide the upstream Ollama proxy client (override target for tests).""" + return OllamaClient(get_http_client(request)) + + +def get_discovery_cache(request: Request) -> DiscoveryCache: + """Provide the in-process discovery cache; fail closed if absent.""" + cache: DiscoveryCache | None = getattr(request.app.state, "discovery_cache", None) + if cache is None: + raise DependencyUnavailableError(internal_detail="discovery cache not initialised") + return cache + + +def get_principal(request: Request) -> Principal: + """Return the authenticated principal placed on ``request.state``. + + The auth middleware attaches it before routing; its absence on a non-exempt + route is a programming error, so we fail closed with a 401. + """ + principal: Principal | None = getattr(request.state, "principal", None) + if principal is None: + raise AuthenticationError(internal_detail="principal missing on authenticated route") + return principal + + +def get_audit_writer(request: Request) -> AuditWriter: + """Provide the shared buffered audit writer; fail closed if absent.""" + writer: AuditWriter | None = getattr(request.app.state, "audit_writer", None) + if writer is None: + raise DependencyUnavailableError(internal_detail="audit writer not initialised") + return writer + + +def get_pipeline( + request: Request, + principal: Annotated[Principal, Depends(get_principal)], + settings: Annotated[Settings, Depends(get_config)], + ollama: Annotated[OllamaClient, Depends(get_ollama_client)], + discovery: Annotated[DiscoveryCache, Depends(get_discovery_cache)], + redis_client: Annotated[redis.Redis, Depends(get_redis)], + audit: Annotated[AuditWriter, Depends(get_audit_writer)], +) -> Pipeline: + """Assemble a per-request enforcement + proxy pipeline. + + The pipeline owns all hot-path checks (rate limit, budget, concurrency, + model/endpoint allowlist) and the streaming-with-bookkeeping contract. + Audit deny-mode flips this to fail closed at the route layer. + """ + sessionmaker: async_sessionmaker[AsyncSession] | None = getattr( + request.app.state, "db_sessionmaker", None + ) + return Pipeline( + request=request, + principal=principal, + settings=settings, + ollama=ollama, + discovery=discovery, + rate_limiter=SlidingWindowLimiter(redis_client), + concurrency=ConcurrencyLimiter(redis_client), + budget=BudgetCounter(redis_client), + audit=audit, + sessionmaker=sessionmaker, + ) + + +def _get_sessionmaker(request: Request) -> async_sessionmaker[AsyncSession]: + """Return the session factory or fail closed if the engine is absent.""" + factory: async_sessionmaker[AsyncSession] | None = getattr( + request.app.state, "db_sessionmaker", None + ) + if factory is None: + raise DependencyUnavailableError(internal_detail="db session factory not initialised") + return factory + + +async def get_db_session(request: Request) -> AsyncIterator[AsyncSession]: + """Provide a request-scoped async DB session.""" + factory = _get_sessionmaker(request) + async with factory() as session: + yield session + + +ConfigDep = Annotated[Settings, Depends(get_config)] +RedisDep = Annotated[redis.Redis, Depends(get_redis)] +HttpClientDep = Annotated[httpx.AsyncClient, Depends(get_http_client)] +OllamaClientDep = Annotated[OllamaClient, Depends(get_ollama_client)] +DiscoveryCacheDep = Annotated[DiscoveryCache, Depends(get_discovery_cache)] +PrincipalDep = Annotated[Principal, Depends(get_principal)] +AuditWriterDep = Annotated[AuditWriter, Depends(get_audit_writer)] +PipelineDep = Annotated[Pipeline, Depends(get_pipeline)] +DbSessionDep = Annotated[AsyncSession, Depends(get_db_session)] + + +__all__ = [ + "AuditWriterDep", + "ConfigDep", + "DbSessionDep", + "DiscoveryCacheDep", + "HttpClientDep", + "OllamaClientDep", + "PipelineDep", + "PrincipalDep", + "RedisDep", + "get_audit_writer", + "get_config", + "get_db_session", + "get_discovery_cache", + "get_http_client", + "get_ollama_client", + "get_pipeline", + "get_principal", + "get_redis", +] diff --git a/src/neuronetz_gateway/errors.py b/src/neuronetz_gateway/errors.py new file mode 100644 index 0000000..45d0ccd --- /dev/null +++ b/src/neuronetz_gateway/errors.py @@ -0,0 +1,179 @@ +"""Exception types and FastAPI exception handlers. + +Hard rule (SPEC §3, AGENT_PROMPT non-negotiable #4): never leak upstream or +internal error details to the client. Every error response is a generic, +sanitized JSON body carrying only a stable ``error.code``, a safe message, and +the request id. Detailed context is logged server-side, never returned. +""" + +from __future__ import annotations + +from fastapi import FastAPI, Request, status +from fastapi.responses import JSONResponse + +from neuronetz_gateway.observability.logging import get_logger + +_log = get_logger("errors") + + +class GatewayError(Exception): + """Base class for gateway errors that map to a sanitized HTTP response. + + ``message`` MUST be safe to return to clients. Anything sensitive belongs + in ``internal_detail`` which is logged but never serialized to the client. + """ + + status_code: int = status.HTTP_500_INTERNAL_SERVER_ERROR + code: str = "internal_error" + message: str = "An internal error occurred." + + def __init__(self, message: str | None = None, *, internal_detail: str | None = None) -> None: + super().__init__(message or self.message) + if message is not None: + self.message = message + self.internal_detail = internal_detail + + +class AuthenticationError(GatewayError): + """Missing/invalid credentials. Fail closed, no detail.""" + + status_code = status.HTTP_401_UNAUTHORIZED + code = "unauthorized" + message = "Authentication required." + + +class AuthorizationError(GatewayError): + """Authenticated but not permitted (scope/model/endpoint denied).""" + + status_code = status.HTTP_403_FORBIDDEN + code = "forbidden" + message = "This request is not permitted." + + +class RateLimitError(GatewayError): + """Rate limit exceeded. Handler attaches ``Retry-After`` when known.""" + + status_code = status.HTTP_429_TOO_MANY_REQUESTS + code = "rate_limited" + message = "Rate limit exceeded." + + def __init__( + self, + message: str | None = None, + *, + retry_after: int | None = None, + internal_detail: str | None = None, + ) -> None: + super().__init__(message, internal_detail=internal_detail) + self.retry_after = retry_after + + +class BudgetExceededError(GatewayError): + """Token budget exhausted for the active period.""" + + status_code = status.HTTP_429_TOO_MANY_REQUESTS + code = "budget_exceeded" + message = "Token budget exhausted for the current period." + + +class RequestTooLargeError(GatewayError): + """Request body exceeds the configured limit.""" + + status_code = status.HTTP_413_REQUEST_ENTITY_TOO_LARGE + code = "request_too_large" + message = "Request body is too large." + + +class UpstreamUnavailableError(GatewayError): + """Ollama (or another dependency) is unreachable. Fail closed.""" + + status_code = status.HTTP_502_BAD_GATEWAY + code = "upstream_unavailable" + message = "The upstream service is temporarily unavailable." + + +class DependencyUnavailableError(GatewayError): + """A required backend (DB/Redis) is unavailable; serve 503, fail closed.""" + + status_code = status.HTTP_503_SERVICE_UNAVAILABLE + code = "service_unavailable" + message = "The service is temporarily unavailable." + + +def _request_id(request: Request) -> str: + """Extract the request id placed on ``request.state`` by middleware.""" + rid = getattr(request.state, "request_id", None) + return str(rid) if rid else "" + + +def _error_response( + request: Request, + *, + status_code: int, + code: str, + message: str, + extra_headers: dict[str, str] | None = None, +) -> JSONResponse: + """Build a sanitized JSON error response with the request id header.""" + request_id = _request_id(request) + headers = {"X-Request-ID": request_id} if request_id else {} + if extra_headers: + headers.update(extra_headers) + return JSONResponse( + status_code=status_code, + content={"error": {"code": code, "message": message, "request_id": request_id}}, + headers=headers, + ) + + +async def _gateway_error_handler(request: Request, exc: GatewayError) -> JSONResponse: + """Render a ``GatewayError`` as a sanitized response.""" + if exc.internal_detail: + _log.warning( + "gateway_error", + code=exc.code, + status_code=exc.status_code, + internal_detail=exc.internal_detail, + ) + extra: dict[str, str] | None = None + if isinstance(exc, RateLimitError) and exc.retry_after is not None: + extra = {"Retry-After": str(exc.retry_after)} + return _error_response( + request, + status_code=exc.status_code, + code=exc.code, + message=exc.message, + extra_headers=extra, + ) + + +async def _unhandled_error_handler(request: Request, exc: Exception) -> JSONResponse: + """Catch-all: log the real exception, return a generic 500. No leakage.""" + _log.error("unhandled_exception", exc_info=exc) + return _error_response( + request, + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + code="internal_error", + message="An internal error occurred.", + ) + + +def register_exception_handlers(app: FastAPI) -> None: + """Attach the gateway's sanitizing exception handlers to the app.""" + # mypy: FastAPI's add_exception_handler accepts these handler signatures; + # the stubs are intentionally broad, so casts are unnecessary here. + app.add_exception_handler(GatewayError, _gateway_error_handler) # type: ignore[arg-type] # handler typed for GatewayError subclass + app.add_exception_handler(Exception, _unhandled_error_handler) + + +__all__ = [ + "AuthenticationError", + "AuthorizationError", + "BudgetExceededError", + "DependencyUnavailableError", + "GatewayError", + "RateLimitError", + "RequestTooLargeError", + "UpstreamUnavailableError", + "register_exception_handlers", +] diff --git a/src/neuronetz_gateway/lifespan.py b/src/neuronetz_gateway/lifespan.py new file mode 100644 index 0000000..9912cd5 --- /dev/null +++ b/src/neuronetz_gateway/lifespan.py @@ -0,0 +1,131 @@ +"""Application lifespan: connect/dispose backends and run background tasks. + +Startup connects Postgres + Redis + the upstream httpx client, builds the +argon2 hasher and the buffered audit writer, and launches the background tasks: +the model-discovery poller (SPEC §4.6) and the Postgres revocation NOTIFY +listener (SPEC §4.5). Connection failures are tolerated so ``/healthz`` always +serves; ``/readyz`` reports true readiness. All handles live on ``app.state``. +""" + +from __future__ import annotations + +import asyncio +import contextlib +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager +from typing import TYPE_CHECKING + +import httpx +import redis.asyncio as redis + +from neuronetz_gateway.audit.writer import AuditWriter +from neuronetz_gateway.auth.hashing import build_hasher +from neuronetz_gateway.config import Settings, get_settings +from neuronetz_gateway.db.session import create_engine, create_session_factory +from neuronetz_gateway.observability.logging import get_logger +from neuronetz_gateway.proxy.discovery import DiscoveryCache, discovery_loop +from neuronetz_gateway.revocation import revocation_listener + +if TYPE_CHECKING: + from fastapi import FastAPI + +_log = get_logger("lifespan") + + +def _build_http_client(settings: Settings) -> httpx.AsyncClient: + """Construct the shared httpx client used to reach Ollama.""" + timeout = httpx.Timeout( + connect=settings.ollama_connect_timeout_s, + read=settings.ollama_read_timeout_s, + write=settings.ollama_read_timeout_s, + pool=settings.ollama_connect_timeout_s, + ) + limits = httpx.Limits(max_connections=settings.ollama_max_connections) + return httpx.AsyncClient(base_url=settings.ollama_base_url, timeout=timeout, limits=limits) + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncIterator[None]: + """Manage startup/shutdown of all backends and background tasks.""" + settings: Settings = get_settings() + app.state.settings = settings + app.state.hasher = build_hasher(settings) + app.state.discovery_cache = DiscoveryCache() + tasks: list[asyncio.Task[None]] = [] + + try: + engine = create_engine(settings) + app.state.db_engine = engine + app.state.db_sessionmaker = create_session_factory(engine) + except Exception as exc: # noqa: BLE001 - tolerate so /healthz still serves + _log.error("db_engine_init_failed", error=str(exc)) + app.state.db_engine = None + app.state.db_sessionmaker = None + + try: + app.state.redis = redis.from_url(settings.redis_url, decode_responses=True) + except Exception as exc: # noqa: BLE001 - tolerate so /healthz still serves + _log.error("redis_init_failed", error=str(exc)) + app.state.redis = None + + app.state.http_client = _build_http_client(settings) + + audit_writer = AuditWriter(settings.audit_buffer_size, app.state.db_sessionmaker) + audit_writer.start() + app.state.audit_writer = audit_writer + + # Background tasks (cancelled on shutdown). + tasks.append( + asyncio.create_task( + discovery_loop( + app.state.http_client, app.state.redis, app.state.discovery_cache, settings + ) + ) + ) + if app.state.redis is not None and app.state.db_sessionmaker is not None: + tasks.append( + asyncio.create_task( + revocation_listener(settings, app.state.redis, app.state.db_sessionmaker) + ) + ) + app.state.background_tasks = tasks + + _log.info("gateway_startup_complete") + try: + yield + finally: + await _shutdown(app, tasks, audit_writer) + + +async def _shutdown( + app: FastAPI, tasks: list[asyncio.Task[None]], audit_writer: AuditWriter +) -> None: + """Cancel background tasks and dispose of all backend handles.""" + for task in tasks: + task.cancel() + for task in tasks: + with contextlib.suppress(asyncio.CancelledError): + await task + + with contextlib.suppress(Exception): + await audit_writer.stop() + + http_client: httpx.AsyncClient | None = getattr(app.state, "http_client", None) + if http_client is not None: + with contextlib.suppress(Exception): + await http_client.aclose() + + redis_client = getattr(app.state, "redis", None) + if redis_client is not None: + with contextlib.suppress(Exception): + await redis_client.aclose() + + engine = getattr(app.state, "db_engine", None) + if engine is not None: + with contextlib.suppress(Exception): + await engine.dispose() + + _log.info("gateway_shutdown_complete") + + +__all__ = ["lifespan"] diff --git a/src/neuronetz_gateway/observability/__init__.py b/src/neuronetz_gateway/observability/__init__.py new file mode 100644 index 0000000..b10f28d --- /dev/null +++ b/src/neuronetz_gateway/observability/__init__.py @@ -0,0 +1,3 @@ +"""Observability: structured logging and Prometheus metrics.""" + +from __future__ import annotations diff --git a/src/neuronetz_gateway/observability/logging.py b/src/neuronetz_gateway/observability/logging.py new file mode 100644 index 0000000..e9b25f8 --- /dev/null +++ b/src/neuronetz_gateway/observability/logging.py @@ -0,0 +1,48 @@ +"""structlog configuration. + +Renders JSON in production (``GATEWAY_LOG_FORMAT=json``) and a human-friendly +console format in development. No secrets are ever logged; processors here +must not introduce any. +""" + +from __future__ import annotations + +import logging +from typing import Any + +import structlog + + +def configure_logging(level: str = "INFO", fmt: str = "json") -> None: + """Configure stdlib logging and structlog according to settings.""" + log_level = getattr(logging, level.upper(), logging.INFO) + logging.basicConfig(format="%(message)s", level=log_level) + + shared_processors: list[structlog.types.Processor] = [ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso", utc=True), + structlog.processors.StackInfoRenderer(), + structlog.processors.format_exc_info, + ] + + renderer: structlog.types.Processor + if fmt == "console": + renderer = structlog.dev.ConsoleRenderer() + else: + renderer = structlog.processors.JSONRenderer() + + structlog.configure( + processors=[*shared_processors, renderer], + wrapper_class=structlog.make_filtering_bound_logger(log_level), + logger_factory=structlog.PrintLoggerFactory(), + cache_logger_on_first_use=True, + ) + + +def get_logger(name: str | None = None) -> Any: # noqa: ANN401 - structlog returns a dynamic proxy + """Return a bound structlog logger.""" + return structlog.get_logger(name) + + +__all__ = ["configure_logging", "get_logger"] diff --git a/src/neuronetz_gateway/routes/__init__.py b/src/neuronetz_gateway/routes/__init__.py new file mode 100644 index 0000000..7c091ce --- /dev/null +++ b/src/neuronetz_gateway/routes/__init__.py @@ -0,0 +1,3 @@ +"""HTTP route modules: health, native Ollama passthrough, OpenAI-compat.""" + +from __future__ import annotations diff --git a/src/neuronetz_gateway/routes/health.py b/src/neuronetz_gateway/routes/health.py new file mode 100644 index 0000000..771c92a --- /dev/null +++ b/src/neuronetz_gateway/routes/health.py @@ -0,0 +1,114 @@ +"""Health, readiness, and metrics endpoints (SPEC §6.4). + +- ``GET /healthz`` : liveness — always 200 if the process can respond. +- ``GET /readyz`` : readiness — 200 only if Postgres + Redis + Ollama are all + reachable; otherwise 503 with which dependencies are down. + In Phase 1 dev there is no Ollama, so 503 is expected. +- ``GET /metrics`` : Prometheus exposition. (Loopback-only IP check deferred.) + +None of these endpoints require auth and none leak secrets or internal detail. +""" + +from __future__ import annotations + +from collections.abc import Awaitable +from typing import Literal, cast + +import httpx +import redis.asyncio as redis +from fastapi import APIRouter, Request, Response, status +from pydantic import BaseModel +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker + +from neuronetz_gateway.observability.logging import get_logger +from neuronetz_gateway.observability.metrics import CONTENT_TYPE_LATEST, render_latest + +router = APIRouter(tags=["health"]) +_log = get_logger("health") + + +class HealthResponse(BaseModel): + """Liveness response body.""" + + status: Literal["ok"] = "ok" + + +class ReadyResponse(BaseModel): + """Readiness response body. ``checks`` maps dependency -> reachable bool.""" + + status: Literal["ready", "not_ready"] + checks: dict[str, bool] + + +@router.get("/healthz", response_model=HealthResponse, status_code=status.HTTP_200_OK) +async def healthz() -> HealthResponse: + """Liveness probe — always returns 200 while the process is responsive.""" + return HealthResponse() + + +async def _check_postgres(app_state: object) -> bool: + """Return True if a trivial query succeeds against Postgres.""" + factory: async_sessionmaker[AsyncSession] | None = getattr( + app_state, "db_sessionmaker", None + ) + if factory is None: + return False + try: + async with factory() as session: + await session.execute(text("SELECT 1")) + return True + except Exception as exc: # noqa: BLE001 - any failure means not ready + _log.warning("readyz_postgres_unreachable", error=str(exc)) + return False + + +async def _check_redis(app_state: object) -> bool: + """Return True if Redis answers PING.""" + client: redis.Redis | None = getattr(app_state, "redis", None) + if client is None: + return False + try: + # redis-py types ping() as Awaitable[bool] | bool (sync+async share stubs); + # the asyncio client always returns an awaitable at runtime. + return bool(await cast("Awaitable[bool]", client.ping())) + except Exception as exc: # noqa: BLE001 - any failure means not ready + _log.warning("readyz_redis_unreachable", error=str(exc)) + return False + + +async def _check_ollama(app_state: object) -> bool: + """Return True if Ollama's root endpoint is reachable.""" + client: httpx.AsyncClient | None = getattr(app_state, "http_client", None) + if client is None: + return False + try: + resp = await client.get("/") + return resp.status_code < 500 + except Exception as exc: # noqa: BLE001 - any failure means not ready + _log.warning("readyz_ollama_unreachable", error=str(exc)) + return False + + +@router.get("/readyz", response_model=ReadyResponse) +async def readyz(request: Request, response: Response) -> ReadyResponse: + """Readiness probe — 200 only if every dependency is reachable, else 503.""" + app_state = request.app.state + checks = { + "postgres": await _check_postgres(app_state), + "redis": await _check_redis(app_state), + "ollama": await _check_ollama(app_state), + } + all_ready = all(checks.values()) + if not all_ready: + response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE + return ReadyResponse(status="ready" if all_ready else "not_ready", checks=checks) + + +@router.get("/metrics") +async def metrics() -> Response: + """Prometheus exposition. Loopback-only enforcement is deferred to Phase 4.""" + return Response(content=render_latest(), media_type=CONTENT_TYPE_LATEST) + + +__all__ = ["router"]