scaffold: project skeleton, schema, healthz/readyz, CI

Initial project structure for neuronetz-gateway per scope-docs/SPEC.md:

- Python 3.12 / FastAPI / SQLAlchemy 2.0 (async) / Redis / Postgres stack
  managed by uv. Multi-stage non-root Dockerfile, prod + dev compose files
  (ollama service is NEVER published in either), Caddyfile + systemd unit,
  justfile, GitHub Actions CI (ruff, mypy --strict, pytest, bandit, pip-audit).
- Pydantic-Settings config covering every env var from SPEC §7, including the
  MODEL_DISCOVERY_* keys for the dynamic-discovery feature (§4.6).
- Alembic 0001_initial creates the full gateway schema (8 tables, 3 enums,
  notify_key_revoked() trigger), incl. allow_all_models on tenant_limits and
  key_limits for the per-tenant auto-grant toggle.
- Working /healthz, /readyz (fail-closed when deps unreachable), and a
  Prometheus /metrics stub. Sanitizing error handlers that attach X-Request-ID
  to every response and never leak upstream internals.
- SPEC + AGENT_PROMPT included under scope-docs/ (source of truth).
This commit is contained in:
Stephan Berbig
2026-05-26 20:50:35 +02:00
commit d79f17b3bb
32 changed files with 3610 additions and 0 deletions

44
.dockerignore Normal file
View File

@@ -0,0 +1,44 @@
# Keep the build context lean and never ship secrets into an image layer.
# Secrets / local env
.env
.env.*
!.env.example
# VCS & CI
.git
.gitignore
.github
# Python caches & build artefacts
__pycache__/
*.py[cod]
*.egg-info/
.eggs/
build/
dist/
.venv/
venv/
.mypy_cache/
.ruff_cache/
.pytest_cache/
.coverage
htmlcov/
coverage.xml
# Tests & docs are not needed in the runtime image
tests/
docs/
scope-docs/
# Editor / OS cruft
.idea/
.vscode/
*.swp
.DS_Store
# Compose / ops files don't belong in the image
docker-compose*.yml
ops/
# NOTE: README.md and LICENSE are intentionally NOT ignored — the build backend
# (hatchling) reads `readme`/`license` from pyproject.toml at build time.

63
.env.example Normal file
View File

@@ -0,0 +1,63 @@
# neuronetz-gateway — environment configuration (SPEC §7).
#
# Copy to `.env` and adjust. `.env` is gitignored and MUST NOT be committed.
# All values here are SAFE EXAMPLES — change every secret before any real deploy.
# ──────────────────────────── Service ────────────────────────────
GATEWAY_BIND_HOST=0.0.0.0
GATEWAY_BIND_PORT=8080
GATEWAY_LOG_LEVEL=INFO
# Log output format: json|console
GATEWAY_LOG_FORMAT=json
GATEWAY_REQUEST_ID_HEADER=X-Request-ID
# Trusted proxies (for X-Forwarded-For).
GATEWAY_TRUSTED_PROXIES=127.0.0.1,caddy
# ──────────────────────────── Upstream ───────────────────────────
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_CONNECT_TIMEOUT_S=5
OLLAMA_READ_TIMEOUT_S=600
OLLAMA_MAX_CONNECTIONS=64
# ──────────────────────── Model discovery (§4.6) ─────────────────
MODEL_DISCOVERY_REFRESH_S=60
MODEL_DISCOVERY_CACHE_TTL_S=120
# ──────────────────────────── Database ───────────────────────────
# Compose builds DATABASE_URL from the POSTGRES_* parts below, but the gateway
# also accepts a full DATABASE_URL directly.
DATABASE_URL=postgresql+asyncpg://gateway:changeme@postgres:5432/neuronetz
DATABASE_POOL_SIZE=10
DATABASE_POOL_OVERFLOW=20
# Postgres container credentials (consumed by docker-compose).
POSTGRES_USER=gateway
POSTGRES_PASSWORD=changeme
POSTGRES_DB=neuronetz
# ──────────────────────────── Redis ──────────────────────────────
REDIS_URL=redis://redis:6379/0
REDIS_KEY_CACHE_TTL_S=60
# ────────────────── Limits (defaults; DB overrides) ──────────────
DEFAULT_RPM=60
DEFAULT_TPM=100000
DEFAULT_CONCURRENT=8
MAX_REQUEST_BODY_BYTES=262144
MAX_NUM_PREDICT=4096
# ──────────────────────────── Security ───────────────────────────
ARGON2_TIME_COST=3
ARGON2_MEMORY_COST_KIB=65536
ARGON2_PARALLELISM=4
AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN=20
# ──────────────────────────── Audit ──────────────────────────────
AUDIT_BUFFER_SIZE=1000
PROMPT_LOG_DEFAULT_RETENTION_DAYS=30
AUDIT_LOG_DEFAULT_RETENTION_DAYS=365
# ──────────────── Playground / API docs (prod-safe: OFF) ─────────
# Serve the playground HTML (owned by the docs agent) at /playground.
PLAYGROUND_ENABLED=false
PLAYGROUND_FILE=/app/playground/index.html
# Enable FastAPI's /docs + /openapi.json (default off in production).
DOCS_ENABLED=false

108
.github/workflows/ci.yml vendored Normal file
View File

@@ -0,0 +1,108 @@
name: CI
on:
push:
branches: ["**"]
pull_request:
workflow_dispatch:
# Cancel superseded runs on the same ref.
concurrency:
group: ci-${{ github.ref }}
cancel-in-progress: true
env:
PYTHON_VERSION: "3.12"
jobs:
lint:
name: ruff
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
enable-cache: true
- name: Set up Python
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: uv sync --extra dev
- name: ruff check
run: uv run ruff check .
typecheck:
name: mypy --strict
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
enable-cache: true
- name: Set up Python
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: uv sync --extra dev
- name: mypy
run: uv run mypy --strict src
test:
name: pytest
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
enable-cache: true
- name: Set up Python
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: uv sync --extra dev
# Phase 1: an empty/placeholder suite must pass. pytest exits 5 when it
# collects no tests; we treat that as success this phase. Coverage is
# reported but not gated yet (no --cov-fail-under until later phases).
- name: pytest
shell: bash
run: |
set +e
uv run pytest --cov=neuronetz_gateway --cov-report=term-missing
code=$?
if [ "$code" -eq 5 ]; then
echo "::notice::No tests collected (Phase 1) — treating as success."
exit 0
fi
exit "$code"
bandit:
name: bandit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
enable-cache: true
- name: Set up Python
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: uv sync --extra dev
- name: bandit
run: uv run bandit -q -r src
pip-audit:
name: pip-audit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
enable-cache: true
- name: Set up Python
run: uv python install ${{ env.PYTHON_VERSION }}
- name: Install dependencies
run: uv sync --extra dev
- name: pip-audit
run: uv run pip-audit

40
.gitignore vendored Normal file
View File

@@ -0,0 +1,40 @@
# Secrets — NEVER commit. Only .env.example is tracked.
.env
.env.*
!.env.example
# Python
__pycache__/
*.py[cod]
*$py.class
*.egg-info/
.eggs/
build/
dist/
*.so
# Virtualenvs / uv
.venv/
venv/
.python-version
# Type / lint / test caches
.mypy_cache/
.ruff_cache/
.pytest_cache/
.coverage
.coverage.*
htmlcov/
coverage.xml
.tox/
# Docker
*.pid
# Editor / OS
.idea/
.vscode/
*.swp
*~
.DS_Store
Thumbs.db

97
Dockerfile Normal file
View File

@@ -0,0 +1,97 @@
# syntax=docker/dockerfile:1.7
#
# neuronetz-gateway — multi-stage image.
#
# builder stage : installs dependencies into a self-contained virtualenv using uv.
# runtime stage : copies the venv + source, drops to a NON-ROOT user, contains
# no build tools, and runs `python -m neuronetz_gateway`.
#
# uv is pulled from the official distroless image so we don't need network access
# to `pip install uv`. Dependencies come from pyproject.toml (+ uv.lock if present).
# ----------------------------------------------------------------------------
# Stage 1 — builder
# ----------------------------------------------------------------------------
FROM python:3.12-slim AS builder
# Bring in the `uv` binary from its official image.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
ENV UV_LINK_MODE=copy \
    UV_COMPILE_BYTECODE=1 \
    UV_PYTHON_DOWNLOADS=never \
    # Create the project venv at a stable, copyable location.
    VIRTUAL_ENV=/opt/venv \
    # `uv sync` does not honour VIRTUAL_ENV; without this it would install the
    # locked dependencies into /app/.venv, which is never copied to the runtime
    # stage. Point the project environment at the same venv.
    UV_PROJECT_ENVIRONMENT=/opt/venv \
    PATH=/opt/venv/bin:$PATH
WORKDIR /app
# Create the target virtualenv up front so uv installs into it.
RUN uv venv /opt/venv
# Dependency layer: copy only the manifest(s) first for better caching.
# uv.lock is optional in Phase 1 — the wildcard makes COPY succeed either way.
COPY pyproject.toml ./
COPY uv.loc[k] ./
# Install dependencies. If a lockfile is present `uv sync` honours it; otherwise
# we fall back to resolving straight from pyproject.toml. Either way the build
# does NOT fail when the lock is absent.
RUN --mount=type=cache,target=/root/.cache/uv \
if [ -f uv.lock ]; then \
uv sync --frozen --no-install-project --no-dev ; \
else \
uv pip install --python /opt/venv/bin/python -r pyproject.toml ; \
fi
# Now copy the application source and install the project itself into the venv.
# README.md + LICENSE are required by the build backend (pyproject `readme`/license).
COPY README.md LICENSE ./
COPY src ./src
COPY alembi[c] ./alembic
COPY alembic.in[i] ./
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --python /opt/venv/bin/python --no-deps .
# ----------------------------------------------------------------------------
# Stage 2 — runtime
# ----------------------------------------------------------------------------
FROM python:3.12-slim AS runtime
# Runtime-only OS packages: curl is used by the compose healthcheck.
RUN apt-get update \
&& apt-get install -y --no-install-recommends curl \
&& rm -rf /var/lib/apt/lists/*
# Non-root user.
RUN groupadd --system --gid 10001 gateway \
&& useradd --system --uid 10001 --gid gateway --home-dir /app --shell /usr/sbin/nologin gateway
ENV VIRTUAL_ENV=/opt/venv \
PATH=/opt/venv/bin:$PATH \
PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
GATEWAY_BIND_HOST=0.0.0.0 \
GATEWAY_BIND_PORT=8080
WORKDIR /app
# Copy the fully-populated virtualenv and the application from the builder.
COPY --from=builder /opt/venv /opt/venv
COPY --from=builder /app/src ./src
# alembic assets are optional during early scaffolding; copy if present.
COPY --from=builder /app/alembi[c] ./alembic
COPY --from=builder /app/alembic.in[i] ./
# Drop privileges. No build tools are present in this stage.
USER gateway
EXPOSE 8080
# Liveness probe target lives at /healthz (see SPEC §6.4).
HEALTHCHECK --interval=15s --timeout=3s --start-period=20s --retries=5 \
CMD curl -fsS "http://127.0.0.1:${GATEWAY_BIND_PORT}/healthz" || exit 1
# Default command: run the server. Compose overrides this in dev to run
# `alembic upgrade head` first (see docker-compose.dev.yml).
CMD ["python", "-m", "neuronetz_gateway"]

202
LICENSE Normal file
View File

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

92
README.md Normal file
View File

@@ -0,0 +1,92 @@
# neuronetz-gateway
A secure, multi-tenant API gateway in front of an [Ollama](https://github.com/ollama/ollama)
instance. It is the hot path of the Neuronetz API: every request to the models flows
through here, authenticated, rate-limited, budgeted, and audited.
**The Ollama backend is never reachable from the public internet.** It is bound to an
internal Docker network with no published ports. All access is via this gateway, behind
TLS terminated by Caddy.
> Status: **v0.1.0 — in development.** See [`scope-docs/SPEC.md`](scope-docs/SPEC.md) for
> the full specification and [`scope-docs/AGENT_PROMPT.md`](scope-docs/AGENT_PROMPT.md) for
> the phased build plan. `SPEC.md` is the source of truth.
## What it does
- **Auth** — API keys as Bearer tokens, stored as Argon2id hashes, verified in constant time.
- **Multi-tenant** — tenants own keys; limits and budgets inherit tenant → key.
- **Rate limiting** — per-key and per-tenant RPM / TPM / concurrent connections.
- **Budgets** — daily / monthly / total token budgets, enforced fail-closed.
- **Dual API surface** — native Ollama (`/api/*`) and OpenAI-compatible (`/v1/*`), both streaming.
- **Hard-blocked mutations** — `/api/pull`, `/api/push`, `/api/create`, `/api/copy`,
`/api/delete`, `/api/blobs/*` always return 403. Not configurable.
- **Audit log** — always-on request metadata; opt-in, TTL'd prompt logging per key.
Administration (dashboards, tenant self-service) lives in a separate service,
`neuronetz-console`; it is **not** part of this repository.
## Architecture
```
Internet ──TLS──> Caddy ──HTTP──> gateway ──┬──> Postgres (keys, budgets, audit)
├──> Redis (key cache, rate limits)
└──> Ollama (internal network only)
```
## Quickstart (dev)
Requires Docker + Docker Compose. The dev stack runs Postgres, Redis, and the gateway —
**no Caddy and no Ollama** (so `/readyz` reports 503 until a real Ollama backend is wired
in; that is expected).
```bash
git clone <repo> neuronetz-gateway && cd neuronetz-gateway
cp .env.example .env # adjust if you like; defaults work for local dev
docker compose -f docker-compose.dev.yml up --build
```
The gateway runs `alembic upgrade head` on startup, then serves on `http://localhost:8080`.
```bash
curl -i http://localhost:8080/healthz # -> 200 {"status":"ok"}
curl -i http://localhost:8080/readyz # -> 503 (no Ollama backend in the dev stack)
```
## Production
`docker-compose.yml` brings up the full stack — Caddy (TLS via Let's Encrypt for
`api.neuronetz.ai`), the gateway, Postgres, Redis, and Ollama. The `ollama` service has
**no `ports:` mapping** and is reachable only on the internal Docker network. See
[`docs/DEPLOYMENT.md`](docs/DEPLOYMENT.md) (added in a later phase) and
[`ops/caddy/Caddyfile.example`](ops/caddy/Caddyfile.example).
## Managing tenants and keys
Use the bootstrap CLI (Typer). Keys have the form `nz_<prefix><secret>`; the full key is
printed exactly once at creation and only its Argon2id hash is stored.
```bash
neuronetz-gateway create-tenant --name acme
neuronetz-gateway create-key --tenant acme --name prod-server-1
neuronetz-gateway list-keys --tenant acme
neuronetz-gateway revoke-key --prefix nz_abc12345
```
## Development
```bash
just dev # run the dev stack
just test # pytest + coverage
just lint # ruff
just typecheck # mypy --strict
just migrate # alembic upgrade head
```
Tooling: Python 3.12, `uv`, FastAPI + uvicorn, SQLAlchemy 2.0 (async) + asyncpg, Redis,
httpx, structlog, Pydantic. Lint/type/security gates: ruff, mypy `--strict`, bandit,
pip-audit.
## License
Apache 2.0 — see [`LICENSE`](LICENSE). Owner: Stephan Berbig / Neuronetz.

49
alembic.ini Normal file
View File

@@ -0,0 +1,49 @@
# Alembic configuration for neuronetz-gateway.
# The database URL is read from the DATABASE_URL environment variable in
# alembic/env.py (do not hardcode credentials here).
[alembic]
script_location = alembic
prepend_sys_path = src
version_path_separator = os
# version_locations defaults to alembic/versions
# DATABASE_URL is injected at runtime; this placeholder is never used directly.
sqlalchemy.url = driver://user:pass@localhost/dbname
[post_write_hooks]
# (none)
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARNING
handlers = console
qualname =
[logger_sqlalchemy]
level = WARNING
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

97
alembic/env.py Normal file
View File

@@ -0,0 +1,97 @@
"""Alembic environment for neuronetz-gateway (async engine).
Reads ``DATABASE_URL`` from the environment (the same value the app uses,
``postgresql+asyncpg://...``). Ensures schema ``gateway`` exists and pins the
Alembic version table into that schema so migration bookkeeping never collides
with the ``console`` schema in the shared database.
"""
from __future__ import annotations
import asyncio
import os
from logging.config import fileConfig
from alembic import context
from sqlalchemy import pool, text
from sqlalchemy.engine import Connection
from sqlalchemy.ext.asyncio import async_engine_from_config
from neuronetz_gateway.config import get_settings
from neuronetz_gateway.db.models import GATEWAY_SCHEMA, Base
# Alembic Config object for the current run (backed by the .ini file in use).
config = context.config
# Configure Python logging from the .ini file, when one was supplied.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)
# Metadata of the gateway models; passed to Alembic as ``target_metadata``
# in the configure() calls below.
target_metadata = Base.metadata
def _database_url() -> str:
"""Resolve the async database URL from env, falling back to settings."""
return os.environ.get("DATABASE_URL") or get_settings().database_url
def _configure_context(connection: Connection) -> None:
    """Set up the Alembic migration context on *connection* for an online run.

    The version table is named ``alembic_version`` and placed in the gateway
    schema; schema-qualified objects and column type changes are included in
    comparisons.
    """
    context.configure(
        connection=connection,
        target_metadata=target_metadata,
        include_schemas=True,
        compare_type=True,
        version_table_schema=GATEWAY_SCHEMA,
        version_table="alembic_version",
    )
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode (emit SQL without a DBAPI connection).

    The context is configured from the database URL only, with literal binds,
    so the migration is rendered as a SQL script instead of being executed.
    The version table is pinned to the gateway schema, as in online mode.
    """
    context.configure(
        url=_database_url(),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
        version_table="alembic_version",
        version_table_schema=GATEWAY_SCHEMA,
        include_schemas=True,
    )
    with context.begin_transaction():
        # The version table lives in the gateway schema, and Alembic emits its
        # CREATE TABLE before the first migration's own statements. Emit the
        # schema DDL first so the generated script also works on a fresh
        # database (mirrors what the online path does before migrating).
        context.execute(f'CREATE SCHEMA IF NOT EXISTS "{GATEWAY_SCHEMA}"')
        context.run_migrations()
def _do_run_migrations(connection: Connection) -> None:
    """Create the gateway schema if needed, then run the migrations.

    The schema DDL is committed on its own before Alembic is configured.
    Under SQLAlchemy 2.0 the ``execute()`` call auto-begins a transaction; if
    that transaction stayed open, Alembic's ``begin_transaction()`` would
    consider the connection caller-managed and turn into a no-op that never
    commits, so both the schema and the migration would be rolled back when
    the connection closes. Committing first hands Alembic a clean connection
    whose transaction it owns and commits itself.
    """
    create_schema = text(f'CREATE SCHEMA IF NOT EXISTS "{GATEWAY_SCHEMA}"')
    connection.execute(create_schema)
    connection.commit()

    _configure_context(connection)
    with context.begin_transaction():
        context.run_migrations()
async def run_migrations_online() -> None:
    """Run migrations in 'online' mode using an async engine.

    Builds an async engine from the Alembic ini section, with the URL replaced
    by the resolved database URL and ``NullPool`` as pool class, then runs the
    synchronous migration routine on a connection via ``run_sync``.

    Raises:
        Whatever connecting or migrating raises; the engine is disposed first.
    """
    configuration = config.get_section(config.config_ini_section) or {}
    configuration["sqlalchemy.url"] = _database_url()
    connectable = async_engine_from_config(
        configuration,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )
    try:
        async with connectable.connect() as connection:
            await connection.run_sync(_do_run_migrations)
    finally:
        # Dispose even when connecting or migrating fails, so the engine is
        # always released before the error propagates.
        await connectable.dispose()
# Module entry point: Alembic imports this file and the matching runner is
# executed immediately. Online mode drives the async runner to completion.
if not context.is_offline_mode():
    asyncio.run(run_migrations_online())
else:
    run_migrations_offline()

View File

@@ -0,0 +1,342 @@
"""initial gateway schema
Creates schema ``gateway``, the three enum types, all tables and indexes, and
the ``notify_key_revoked()`` function plus ``trg_notify_key_revoked`` trigger,
matching SPEC §5 verbatim in structure.
Revision ID: 0001_initial
Revises:
Create Date: 2026-05-22
"""
from __future__ import annotations
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# Revision identifiers, used by Alembic. ``down_revision`` is None because
# this revision revises nothing (see "Revises:" in the module docstring).
revision: str = "0001_initial"
down_revision: str | None = None
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Name of the database schema all objects in this migration are created in.
SCHEMA = "gateway"
# Enum types are created explicitly via raw SQL below; the table columns
# reference them with create_type=False so they are not created twice.
_key_status = postgresql.ENUM(
    "active", "disabled", "revoked", name="key_status", schema=SCHEMA, create_type=False
)
_tenant_status = postgresql.ENUM(
    "active", "suspended", "closed", name="tenant_status", schema=SCHEMA, create_type=False
)
_budget_period = postgresql.ENUM(
    "day", "month", "total", name="budget_period", schema=SCHEMA, create_type=False
)
def _create_enum_types() -> None:
    """Create the three PostgreSQL enum types referenced by the tables (SPEC §5)."""
    op.execute(f"CREATE TYPE {SCHEMA}.key_status AS ENUM ('active', 'disabled', 'revoked')")
    op.execute(f"CREATE TYPE {SCHEMA}.tenant_status AS ENUM ('active', 'suspended', 'closed')")
    op.execute(f"CREATE TYPE {SCHEMA}.budget_period AS ENUM ('day', 'month', 'total')")


def _create_tenants() -> None:
    """Create ``tenants``: UUID primary key, unique name, status and JSONB metadata."""
    op.create_table(
        "tenants",
        sa.Column(
            "id",
            postgresql.UUID(as_uuid=True),
            primary_key=True,
            server_default=sa.text("gen_random_uuid()"),
        ),
        sa.Column("name", sa.Text(), nullable=False, unique=True),
        sa.Column(
            "status", _tenant_status, nullable=False, server_default=sa.text("'active'")
        ),
        sa.Column(
            "created_at",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column(
            "metadata",
            postgresql.JSONB(),
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
        schema=SCHEMA,
    )


def _create_tenant_limits() -> None:
    """Create ``tenant_limits``: at most one row per tenant (``tenant_id`` is PK and FK)."""
    op.create_table(
        "tenant_limits",
        sa.Column(
            "tenant_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{SCHEMA}.tenants.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column("rpm", sa.Integer(), nullable=False, server_default=sa.text("60")),
        sa.Column("tpm", sa.Integer(), nullable=False, server_default=sa.text("100000")),
        sa.Column("concurrent", sa.Integer(), nullable=False, server_default=sa.text("8")),
        sa.Column("tokens_daily", sa.BigInteger(), nullable=True),
        sa.Column("tokens_monthly", sa.BigInteger(), nullable=True),
        sa.Column("tokens_total", sa.BigInteger(), nullable=True),
        sa.Column(
            "allowed_models",
            postgresql.ARRAY(sa.Text()),
            nullable=False,
            server_default=sa.text("'{}'"),
        ),
        sa.Column(
            "allow_all_models",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("false"),
        ),
        sa.Column(
            "log_prompts_default",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("false"),
        ),
        sa.Column(
            "prompt_retention_days", sa.Integer(), nullable=False, server_default=sa.text("30")
        ),
        sa.Column(
            "audit_retention_days", sa.Integer(), nullable=False, server_default=sa.text("365")
        ),
        schema=SCHEMA,
    )


def _create_api_keys() -> None:
    """Create ``api_keys`` plus its partial prefix index and its tenant index."""
    op.create_table(
        "api_keys",
        sa.Column(
            "id",
            postgresql.UUID(as_uuid=True),
            primary_key=True,
            server_default=sa.text("gen_random_uuid()"),
        ),
        sa.Column(
            "tenant_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{SCHEMA}.tenants.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("prefix", sa.Text(), nullable=False, unique=True),
        sa.Column("key_hash", sa.Text(), nullable=False),
        sa.Column("name", sa.Text(), nullable=False),
        sa.Column("status", _key_status, nullable=False, server_default=sa.text("'active'")),
        sa.Column(
            "scopes",
            postgresql.ARRAY(sa.Text()),
            nullable=False,
            server_default=sa.text("'{chat,embeddings}'"),
        ),
        sa.Column(
            "created_at",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("last_used_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
        sa.Column("expires_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
        sa.Column("log_prompts", sa.Boolean(), nullable=True),
        sa.Column(
            "metadata",
            postgresql.JSONB(),
            nullable=False,
            server_default=sa.text("'{}'::jsonb"),
        ),
        schema=SCHEMA,
    )
    # Partial index: only rows whose status is 'active' are indexed by prefix.
    op.create_index(
        "idx_api_keys_prefix",
        "api_keys",
        ["prefix"],
        schema=SCHEMA,
        postgresql_where=sa.text("status = 'active'"),
    )
    op.create_index("idx_api_keys_tenant", "api_keys", ["tenant_id"], schema=SCHEMA)


def _create_key_limits() -> None:
    """Create ``key_limits``: per-key limit columns, every one of them nullable."""
    # NOTE(review): NULL presumably means "inherit the tenant value" — confirm in SPEC §5.
    op.create_table(
        "key_limits",
        sa.Column(
            "key_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{SCHEMA}.api_keys.id", ondelete="CASCADE"),
            primary_key=True,
        ),
        sa.Column("rpm", sa.Integer(), nullable=True),
        sa.Column("tpm", sa.Integer(), nullable=True),
        sa.Column("concurrent", sa.Integer(), nullable=True),
        sa.Column("tokens_daily", sa.BigInteger(), nullable=True),
        sa.Column("tokens_monthly", sa.BigInteger(), nullable=True),
        sa.Column("tokens_total", sa.BigInteger(), nullable=True),
        sa.Column("allowed_models", postgresql.ARRAY(sa.Text()), nullable=True),
        sa.Column("allow_all_models", sa.Boolean(), nullable=True),
        schema=SCHEMA,
    )


def _create_budget_usage() -> None:
    """Create ``budget_usage`` (PK: key_id, period, period_start) and its period index."""
    op.create_table(
        "budget_usage",
        sa.Column(
            "key_id",
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(f"{SCHEMA}.api_keys.id", ondelete="CASCADE"),
            primary_key=True,
            nullable=False,
        ),
        sa.Column("period", _budget_period, primary_key=True, nullable=False),
        sa.Column(
            "period_start",
            postgresql.TIMESTAMP(timezone=True),
            primary_key=True,
            nullable=False,
        ),
        sa.Column("tokens_in", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
        sa.Column("tokens_out", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
        sa.Column("requests", sa.BigInteger(), nullable=False, server_default=sa.text("0")),
        schema=SCHEMA,
    )
    op.create_index(
        "idx_budget_usage_period",
        "budget_usage",
        ["period", "period_start"],
        schema=SCHEMA,
    )


def _create_audit_log() -> None:
    """Create ``audit_log`` and its three timestamp-based indexes."""
    op.create_table(
        "audit_log",
        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
        sa.Column(
            "ts",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("request_id", postgresql.UUID(as_uuid=True), nullable=False),
        # tenant_id / key_id carry no foreign key and are nullable.
        sa.Column("tenant_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("key_prefix", sa.Text(), nullable=True),
        sa.Column("method", sa.Text(), nullable=False),
        sa.Column("path", sa.Text(), nullable=False),
        sa.Column("model", sa.Text(), nullable=True),
        sa.Column("tokens_in", sa.Integer(), nullable=True),
        sa.Column("tokens_out", sa.Integer(), nullable=True),
        sa.Column("latency_ms", sa.Integer(), nullable=True),
        sa.Column("status", sa.Integer(), nullable=False),
        sa.Column("client_ip", postgresql.INET(), nullable=True),
        sa.Column("user_agent", sa.Text(), nullable=True),
        sa.Column("error_code", sa.Text(), nullable=True),
        schema=SCHEMA,
    )
    op.create_index("idx_audit_ts", "audit_log", ["ts"], schema=SCHEMA)
    op.create_index("idx_audit_tenant_ts", "audit_log", ["tenant_id", "ts"], schema=SCHEMA)
    op.create_index("idx_audit_key_ts", "audit_log", ["key_id", "ts"], schema=SCHEMA)


def _create_prompt_log() -> None:
    """Create ``prompt_log`` (rows cascade-delete with their audit row) and its index."""
    op.create_table(
        "prompt_log",
        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
        sa.Column(
            "audit_id",
            sa.BigInteger(),
            sa.ForeignKey(f"{SCHEMA}.audit_log.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "ts",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("request_body", postgresql.JSONB(), nullable=False),
        sa.Column("response_text", sa.Text(), nullable=True),
        sa.Column("retention_until", postgresql.TIMESTAMP(timezone=True), nullable=False),
        schema=SCHEMA,
    )
    op.create_index(
        "idx_prompt_log_retention", "prompt_log", ["retention_until"], schema=SCHEMA
    )


def _create_revocations() -> None:
    """Create ``revocations``, the table whose inserts fire the NOTIFY trigger."""
    op.create_table(
        "revocations",
        sa.Column("id", sa.BigInteger(), sa.Identity(always=False), primary_key=True),
        sa.Column("key_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column(
            "ts",
            postgresql.TIMESTAMP(timezone=True),
            nullable=False,
            server_default=sa.text("now()"),
        ),
        sa.Column("reason", sa.Text(), nullable=True),
        sa.Column("processed_at", postgresql.TIMESTAMP(timezone=True), nullable=True),
        schema=SCHEMA,
    )


def _create_revocation_notify_trigger() -> None:
    """Create ``notify_key_revoked()`` and the AFTER INSERT trigger that calls it.

    Each inserted ``revocations`` row publishes its ``key_id`` (as text) on the
    ``key_revoked`` NOTIFY channel (SPEC §5).
    """
    op.execute(
        f"""
        CREATE OR REPLACE FUNCTION {SCHEMA}.notify_key_revoked() RETURNS trigger AS $$
        BEGIN
            PERFORM pg_notify('key_revoked', NEW.key_id::text);
            RETURN NEW;
        END;
        $$ LANGUAGE plpgsql;
        """
    )
    op.execute(
        f"""
        CREATE TRIGGER trg_notify_key_revoked
        AFTER INSERT ON {SCHEMA}.revocations
        FOR EACH ROW EXECUTE FUNCTION {SCHEMA}.notify_key_revoked();
        """
    )


def upgrade() -> None:
    """Create the full ``gateway`` schema.

    Objects are created in dependency order: schema, enum types, parent tables
    before the tables that reference them, and the NOTIFY trigger last, once
    ``revocations`` exists. Each step lives in its own private helper.
    """
    op.execute(f'CREATE SCHEMA IF NOT EXISTS "{SCHEMA}"')
    _create_enum_types()
    _create_tenants()
    _create_tenant_limits()
    _create_api_keys()
    _create_key_limits()
    _create_budget_usage()
    _create_audit_log()
    _create_prompt_log()
    _create_revocations()
    _create_revocation_notify_trigger()
def downgrade() -> None:
    """Drop the entire ``gateway`` schema and its objects.

    Objects are removed in reverse dependency order: trigger and function,
    then tables (children before the tables they reference), then the enum
    types, and finally the schema itself.
    """
    op.execute(f"DROP TRIGGER IF EXISTS trg_notify_key_revoked ON {SCHEMA}.revocations")
    op.execute(f"DROP FUNCTION IF EXISTS {SCHEMA}.notify_key_revoked()")
    # upgrade() creates ``revocations``; it must be dropped here too, otherwise
    # the table is left behind and the final DROP SCHEMA fails on a non-empty schema.
    op.drop_table("revocations", schema=SCHEMA)

    op.drop_index("idx_prompt_log_retention", table_name="prompt_log", schema=SCHEMA)
    op.drop_table("prompt_log", schema=SCHEMA)

    op.drop_index("idx_audit_key_ts", table_name="audit_log", schema=SCHEMA)
    op.drop_index("idx_audit_tenant_ts", table_name="audit_log", schema=SCHEMA)
    op.drop_index("idx_audit_ts", table_name="audit_log", schema=SCHEMA)
    op.drop_table("audit_log", schema=SCHEMA)

    op.drop_index("idx_budget_usage_period", table_name="budget_usage", schema=SCHEMA)
    op.drop_table("budget_usage", schema=SCHEMA)

    op.drop_table("key_limits", schema=SCHEMA)

    op.drop_index("idx_api_keys_tenant", table_name="api_keys", schema=SCHEMA)
    op.drop_index("idx_api_keys_prefix", table_name="api_keys", schema=SCHEMA)
    op.drop_table("api_keys", schema=SCHEMA)

    op.drop_table("tenant_limits", schema=SCHEMA)
    op.drop_table("tenants", schema=SCHEMA)

    # Enum types can only go once no column references them any more.
    op.execute(f"DROP TYPE IF EXISTS {SCHEMA}.budget_period")
    op.execute(f"DROP TYPE IF EXISTS {SCHEMA}.tenant_status")
    op.execute(f"DROP TYPE IF EXISTS {SCHEMA}.key_status")

    # NOTE(review): a plain DROP SCHEMA (no CASCADE) fails if anything else still
    # lives in the schema — e.g. Alembic's version table, if env.py stores it
    # here. Confirm against the env.py configuration.
    op.execute(f'DROP SCHEMA IF EXISTS "{SCHEMA}"')

101
docker-compose.dev.yml Normal file
View File

@@ -0,0 +1,101 @@
# neuronetz-gateway — DEV stack (postgres + redis + gateway only).
#
# Deliberately differs from the production stack:
# * NO caddy — the gateway is published directly on localhost:8080.
# * NO ollama — Phase 1 expects /readyz to return 503 *because* there is no
# Ollama backend yet. This is the intended exit-criterion state.
#
# Bring it up with:
# docker compose -f docker-compose.dev.yml up --build
#
# Then:
# curl -i http://localhost:8080/healthz # -> 200
# curl -i http://localhost:8080/readyz # -> 503 (no Ollama)
#
# The gateway container runs `alembic upgrade head` and then starts the server.
services:
gateway:
build:
context: .
dockerfile: Dockerfile
restart: unless-stopped
ports:
- "127.0.0.1:8080:8080"
environment:
GATEWAY_BIND_HOST: 0.0.0.0
GATEWAY_BIND_PORT: "8080"
GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-console}
GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1}
DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-gateway}@postgres:5432/${POSTGRES_DB:-neuronetz}
DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
REDIS_URL: redis://redis:6379/0
REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
# No Ollama in the dev stack — point at the (absent) service name so the
# readiness check fails closed with 503, exactly as Phase 1 expects.
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://ollama:11434}
OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
DEFAULT_RPM: ${DEFAULT_RPM:-60}
DEFAULT_TPM: ${DEFAULT_TPM:-100000}
DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
# Run migrations, then start the server.
command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
healthcheck:
test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
interval: 10s
timeout: 3s
retries: 5
start_period: 30s
postgres:
image: postgres:16-alpine
restart: unless-stopped
environment:
POSTGRES_USER: ${POSTGRES_USER:-gateway}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-gateway}
POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
ports:
# Exposed on localhost for dev convenience (psql, migrations from host).
- "127.0.0.1:5432:5432"
volumes:
- postgres_dev_data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
interval: 5s
timeout: 3s
retries: 10
redis:
image: redis:7-alpine
restart: unless-stopped
command: ["redis-server", "--save", "", "--appendonly", "no"]
ports:
# Exposed on localhost for dev convenience (redis-cli from host).
- "127.0.0.1:6379:6379"
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 5s
timeout: 3s
retries: 10
volumes:
postgres_dev_data:

152
docker-compose.yml Normal file
View File

@@ -0,0 +1,152 @@
# neuronetz-gateway — FULL production stack (SPEC §4.1).
#
# Internet ──TLS──▶ caddy ──HTTP/1.1 internal──▶ gateway ──▶ postgres / redis / ollama
#
# Only Caddy publishes ports to the host. The gateway is reachable solely through
# Caddy on the internal network. Postgres, Redis and (critically) Ollama are NOT
# published to the host at all.
#
# ┌─────────────────────────────────────────────────────────────────────────┐
# │ SECURITY NON-NEGOTIABLE: │
# │ The `ollama` service has NO `ports:` mapping and MUST NEVER get one. │
# │ Ollama is reachable only on the internal Docker network via the │
# │ service name `ollama:11434`. Publishing it would re-open the exact │
# │ unauthenticated exposure this whole project exists to close. │
# └─────────────────────────────────────────────────────────────────────────┘
#
# Copy `.env.example` to `.env` and adjust before running:
# docker compose up -d --build
services:
caddy:
image: caddy:2-alpine
restart: unless-stopped
depends_on:
gateway:
condition: service_healthy
ports:
- "80:80"
- "443:443"
- "443:443/udp" # HTTP/3
volumes:
- ./ops/caddy/Caddyfile.example:/etc/caddy/Caddyfile:ro
- caddy_data:/data
- caddy_config:/config
networks:
- edge
- internal
gateway:
build:
context: .
dockerfile: Dockerfile
restart: unless-stopped
# NOTE: deliberately NO `ports:` — the gateway is internal-only and is
# reached exclusively through Caddy.
expose:
- "8080"
environment:
GATEWAY_BIND_HOST: 0.0.0.0
GATEWAY_BIND_PORT: "8080"
GATEWAY_LOG_LEVEL: ${GATEWAY_LOG_LEVEL:-INFO}
GATEWAY_LOG_FORMAT: ${GATEWAY_LOG_FORMAT:-json}
GATEWAY_REQUEST_ID_HEADER: ${GATEWAY_REQUEST_ID_HEADER:-X-Request-ID}
GATEWAY_TRUSTED_PROXIES: ${GATEWAY_TRUSTED_PROXIES:-127.0.0.1,caddy}
# Service-name addressing on the internal network.
DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-gateway}:${POSTGRES_PASSWORD:-changeme}@postgres:5432/${POSTGRES_DB:-neuronetz}
DATABASE_POOL_SIZE: ${DATABASE_POOL_SIZE:-10}
DATABASE_POOL_OVERFLOW: ${DATABASE_POOL_OVERFLOW:-20}
REDIS_URL: redis://redis:6379/0
REDIS_KEY_CACHE_TTL_S: ${REDIS_KEY_CACHE_TTL_S:-60}
OLLAMA_BASE_URL: http://ollama:11434
OLLAMA_CONNECT_TIMEOUT_S: ${OLLAMA_CONNECT_TIMEOUT_S:-5}
OLLAMA_READ_TIMEOUT_S: ${OLLAMA_READ_TIMEOUT_S:-600}
OLLAMA_MAX_CONNECTIONS: ${OLLAMA_MAX_CONNECTIONS:-64}
DEFAULT_RPM: ${DEFAULT_RPM:-60}
DEFAULT_TPM: ${DEFAULT_TPM:-100000}
DEFAULT_CONCURRENT: ${DEFAULT_CONCURRENT:-8}
MAX_REQUEST_BODY_BYTES: ${MAX_REQUEST_BODY_BYTES:-262144}
MAX_NUM_PREDICT: ${MAX_NUM_PREDICT:-4096}
ARGON2_TIME_COST: ${ARGON2_TIME_COST:-3}
ARGON2_MEMORY_COST_KIB: ${ARGON2_MEMORY_COST_KIB:-65536}
ARGON2_PARALLELISM: ${ARGON2_PARALLELISM:-4}
AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN: ${AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN:-20}
AUDIT_BUFFER_SIZE: ${AUDIT_BUFFER_SIZE:-1000}
PROMPT_LOG_DEFAULT_RETENTION_DAYS: ${PROMPT_LOG_DEFAULT_RETENTION_DAYS:-30}
AUDIT_LOG_DEFAULT_RETENTION_DAYS: ${AUDIT_LOG_DEFAULT_RETENTION_DAYS:-365}
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
ollama:
condition: service_started
# Apply migrations, then start the server.
command: ["sh", "-c", "alembic upgrade head && exec python -m neuronetz_gateway"]
healthcheck:
test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8080/healthz"]
interval: 15s
timeout: 3s
retries: 5
start_period: 30s
networks:
- internal
postgres:
image: postgres:16-alpine
restart: unless-stopped
environment:
POSTGRES_USER: ${POSTGRES_USER:-gateway}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-changeme}
POSTGRES_DB: ${POSTGRES_DB:-neuronetz}
volumes:
- postgres_data:/var/lib/postgresql/data
# No `ports:` — Postgres is internal-only.
healthcheck:
test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-gateway} -d ${POSTGRES_DB:-neuronetz}"]
interval: 5s
timeout: 3s
retries: 10
networks:
- internal
redis:
image: redis:7-alpine
restart: unless-stopped
command: ["redis-server", "--save", "", "--appendonly", "no"]
# No `ports:` — Redis is internal-only.
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 5s
timeout: 3s
retries: 10
networks:
- internal
# ───────────────────────────────────────────────────────────────────────────
# Ollama — INTERNAL NETWORK ONLY. DO NOT ADD A `ports:` MAPPING.
# Reachable only as `http://ollama:11434` from the gateway container.
# ───────────────────────────────────────────────────────────────────────────
ollama:
image: ollama/ollama:latest
restart: unless-stopped
# !!! NO `ports:` — never publish Ollama to the host or the internet. !!!
volumes:
- ollama_data:/root/.ollama
networks:
- internal
networks:
# Public-facing network: Caddy is the only service attached to it (in addition to `internal`).
edge:
driver: bridge
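# Private network for inter-service traffic; no service on it publishes host ports.
# NOTE(review): with `internal: false` containers on this network still have
# outbound connectivity. Set `internal: true` if full isolation is intended.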
internal:
driver: bridge
internal: false
volumes:
postgres_data:
ollama_data:
caddy_data:
caddy_config:

60
justfile Normal file
View File

@@ -0,0 +1,60 @@
# neuronetz-gateway — task runner.
#
# Requires `just` (https://github.com/casey/just) and `uv`
# (https://github.com/astral-sh/uv) on the host.
#
# just # list available targets
# just dev # run postgres + redis + gateway locally (dev stack)
# just test # run the test suite with coverage
# just lint # ruff check
# just typecheck # mypy --strict
# just migrate # apply alembic migrations against DATABASE_URL
set dotenv-load := true
# uv runs commands inside the project's managed environment.
uv := "uv"
# Show the list of targets (default).
default:
@just --list
# Sync dependencies into the local uv-managed virtualenv (incl. dev extras).
install:
{{uv}} sync --extra dev
# Run the dev stack: postgres + redis + gateway (no caddy, no ollama).
dev:
docker compose -f docker-compose.dev.yml up --build
# Run the test suite with coverage.
test:
{{uv}} run pytest
# Lint with ruff.
lint:
{{uv}} run ruff check .
# Static type checking (strict).
typecheck:
{{uv}} run mypy --strict src
# Apply database migrations to head.
migrate:
{{uv}} run alembic upgrade head
# Security lint.
bandit:
{{uv}} run bandit -q -r src
# Dependency vulnerability audit.
audit:
{{uv}} run pip-audit
# Bring the FULL production stack up (caddy + gateway + postgres + redis + ollama).
compose-up:
docker compose up -d --build
# Tear the production stack down.
compose-down:
docker compose down

View File

@@ -0,0 +1,59 @@
# neuronetz-gateway — Caddy reverse proxy (SPEC §4.1, §6.5).
#
# Caddy is the only public-facing component. It terminates TLS (HTTP/2 + HTTP/3),
# obtains a Let's Encrypt certificate for api.neuronetz.ai automatically, applies
# security headers, and reverse-proxies to the internal-only gateway:8080.
#
# Edit the site address / admin email before deploying.
# The production docker-compose.yml mounts ./ops/caddy/Caddyfile.example at
# /etc/caddy/Caddyfile; if you copy this file to `Caddyfile` instead, point the
# compose volume at that copy.
{
# Email for Let's Encrypt account + expiry notices. Replace before deploy.
email ops@neuronetz.ai
}
api.neuronetz.ai {
# --- Reverse proxy to the internal gateway ---
# `gateway` is the Docker service name on the internal network; it is never
# published to the host. Caddy forwards plain HTTP/1.1 to it.
reverse_proxy gateway:8080
# --- Security headers ---
header {
# HSTS: force HTTPS for two years, include subdomains, allow preload.
Strict-Transport-Security "max-age=63072000; includeSubDomains; preload"
# Disable MIME sniffing.
X-Content-Type-Options "nosniff"
# Clickjacking defense (API has no UI, deny framing outright).
X-Frame-Options "DENY"
# Conservative referrer policy.
Referrer-Policy "no-referrer"
# Strip server-identifying headers so we don't advertise the stack.
-Server
-X-Powered-By
}
# Structured access logs to stdout (collected by the container runtime).
log {
output stdout
format json
}
}
# ─────────────────────────────────────────────────────────────────────────────
# DEV / LOCAL note:
#
# For local testing without a public domain or real certificate, replace the
# site block above with a localhost block that uses Caddy's internal self-signed
# CA (no Let's Encrypt round-trip):
#
# localhost {
# tls internal
# reverse_proxy gateway:8080
# }
#
# Caddy will install its local root CA; trust it or pass `-k` to curl. Note the
# Phase 1 *dev* compose stack (docker-compose.dev.yml) ships WITHOUT Caddy and
# exposes the gateway directly on localhost:8080 — this file is for the full
# production stack only.
# ─────────────────────────────────────────────────────────────────────────────

View File

@@ -0,0 +1,58 @@
# neuronetz-gateway — systemd unit for non-Compose deployments.
#
# Assumes the project is installed into a virtualenv at /opt/neuronetz-gateway/venv
# (e.g. `uv venv /opt/neuronetz-gateway/venv && uv pip install ...`) and that
# configuration lives in /etc/neuronetz-gateway/gateway.env (same keys as
# .env.example). Postgres, Redis and Ollama are reached over the network/loopback
# per that env file — Ollama must remain bound to localhost / a private network
# and never be published publicly.
#
# Install:
# sudo cp neuronetz-gateway.service /etc/systemd/system/
# sudo systemctl daemon-reload
# sudo systemctl enable --now neuronetz-gateway
[Unit]
Description=neuronetz-gateway — secure API gateway in front of Ollama
Documentation=https://github.com/neuronetz/neuronetz-gateway
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
# Dedicated unprivileged service account (create with: useradd --system gateway).
User=gateway
Group=gateway
WorkingDirectory=/opt/neuronetz-gateway
EnvironmentFile=/etc/neuronetz-gateway/gateway.env
# Apply migrations before starting (idempotent; no-op when already at head).
ExecStartPre=/opt/neuronetz-gateway/venv/bin/alembic upgrade head
ExecStart=/opt/neuronetz-gateway/venv/bin/python -m neuronetz_gateway
Restart=on-failure
RestartSec=5
TimeoutStopSec=30
# --- Hardening ---
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=true
PrivateTmp=true
PrivateDevices=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
RestrictNamespaces=true
RestrictRealtime=true
RestrictSUIDSGID=true
LockPersonality=true
MemoryDenyWriteExecute=true
RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX
# Allow writing only where the app legitimately needs to (none by default).
ReadWritePaths=
[Install]
WantedBy=multi-user.target

94
pyproject.toml Normal file
View File

@@ -0,0 +1,94 @@
[project]
name = "neuronetz-gateway"
version = "0.1.0"
description = "Secure multi-tenant API gateway in front of Ollama for the Neuronetz platform."
readme = "README.md"
license = { text = "Apache-2.0" }
requires-python = ">=3.12"
authors = [{ name = "Neuronetz", email = "ops@neuronetz.ai" }]
dependencies = [
"fastapi>=0.115",
"uvicorn[standard]>=0.30",
"httpx>=0.27",
"sqlalchemy[asyncio]>=2.0",
"asyncpg>=0.29",
"redis[hiredis]>=5.0",
"structlog>=24.1",
"pydantic>=2.9",
"pydantic-settings>=2.4",
"argon2-cffi>=23.1",
"typer>=0.12",
"prometheus-client>=0.20",
"alembic>=1.13",
]
[project.scripts]
neuronetz-gateway = "neuronetz_gateway.cli.manage:app"
[project.optional-dependencies]
dev = [
"ruff>=0.6",
"mypy>=1.11",
"bandit>=1.7",
"pip-audit>=2.7",
"pytest>=8.3",
"pytest-asyncio>=0.24",
"pytest-cov>=5.0",
"testcontainers>=4.8",
"respx>=0.21",
"locust>=2.31",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["src/neuronetz_gateway"]
[tool.ruff]
target-version = "py312"
line-length = 100
src = ["src", "tests"]
[tool.ruff.lint]
select = ["E", "F", "I", "B", "UP", "S", "ASYNC"]
[tool.ruff.lint.per-file-ignores]
# Tests may use assert and bind to all interfaces in fixtures.
"tests/**" = ["S101", "S104"]
[tool.mypy]
python_version = "3.12"
strict = true
mypy_path = "src"
plugins = ["pydantic.mypy"]
namespace_packages = true
explicit_package_bases = true
[[tool.mypy.overrides]]
# argon2 ships types but some transitive deps may not; keep strictness elsewhere.
# asyncpg ships no stubs/py.typed marker; it is used in revocation.py only.
module = ["testcontainers.*", "locust.*", "asyncpg", "asyncpg.*"]
ignore_missing_imports = true
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
pythonpath = ["src"]
addopts = "--cov=neuronetz_gateway --cov-report=term-missing"
[tool.coverage.run]
source = ["src/neuronetz_gateway"]
branch = true
omit = [
"src/neuronetz_gateway/__main__.py",
"src/neuronetz_gateway/cli/*",
]
[tool.coverage.report]
# Phase 1: coverage is reported but non-blocking. Later phases set fail_under.
show_missing = true
[tool.bandit]
exclude_dirs = ["tests"]

121
scope-docs/AGENT_PROMPT.md Normal file
View File

@@ -0,0 +1,121 @@
# Build Order: neuronetz-gateway v0.1.0
## Context
The Ollama instance at `https://api.neuronetz.ai` is currently exposed without authentication. This is a security incident waiting to happen. Your job is to build the gateway that closes that gap and forms the commercial API surface of the Neuronetz AI platform.
The full specification is in **`SPEC.md`** in this repository. Read it before writing any code. It is the source of truth; if anything below conflicts with it, SPEC.md wins.
## Mission
Implement `neuronetz-gateway` per SPEC.md to a state that satisfies **§12 Acceptance Criteria**. Nothing less ships.
## Non-Negotiables
These are hard constraints. Violating any of them is a build failure regardless of feature completeness.
1. **Fail closed, always.** If a security or budgeting check cannot be performed (Redis down, DB unreachable, ambiguous state), deny the request. Never default to allow.
2. **Ollama never reachable from outside the Docker internal network.** No `ports:` mapping for the ollama service in any compose file shipped with the project. Document this prominently.
3. **No secrets in code, no secrets in logs, no secrets in errors.** Argon2id for key storage. Constant-time comparison only. Keys printed exactly once at creation.
4. **No reflected upstream errors.** Ollama errors are sanitized at the gateway boundary. Map to generic 4xx/5xx with a request ID.
5. **Mutating Ollama endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`) are hard-blocked.** Not configurable. Not behind a feature flag. Blocked.
6. **Streaming integrity.** Token counting and audit writes happen **after** stream close, never on the hot path. Time-to-first-byte must not be degraded by gateway bookkeeping.
7. **`mypy --strict` and `ruff check` clean before any PR is opened.** No `# type: ignore` without an inline justification comment.
8. **Test coverage targets (§9) are a gate, not a goal.** 100% on `auth/`, `ratelimit/`, `budget/`. CI fails below threshold.
9. **Apache 2.0 license file present from commit one.** No GPL dependencies.
10. **The bootstrap CLI must work before the first manual `curl`.** No "I'll create a key by hand in the DB just to test it" — if the CLI can't create a key, fix the CLI first.
## Phasing
Five phases. Each phase has an explicit exit criterion. **Do not start phase N+1 until phase N's exit criterion is verifiably met.** PM/Control: enforce this.
### Phase 1 — Scaffold
- Repo layout per SPEC §8
- `pyproject.toml`, `uv.lock`, Dockerfile, docker-compose.yml, docker-compose.dev.yml, .env.example, README, LICENSE
- Alembic configured; migration `0001_initial.py` creates schema `gateway` and all tables per SPEC §5
- `make` or `just` targets: `dev`, `test`, `lint`, `typecheck`, `migrate`, `compose-up`, `compose-down`
- CI workflow runs: ruff, mypy, pytest, bandit, pip-audit
- **Exit criterion:** `docker compose -f docker-compose.dev.yml up` brings up postgres + redis + a stub gateway that responds 200 on `/healthz` and 503 on `/readyz` (because no Ollama yet). Migrations apply cleanly. CI is green on an empty test suite.
### Phase 2 — Core proxy + auth
- Bootstrap CLI (`create-tenant`, `create-key`, `list-keys`, `revoke-key`) working end-to-end
- Argon2id hashing module with unit tests covering: hash, verify, constant-time behavior, rehash-on-parameter-change
- Auth middleware: Bearer extraction, prefix lookup, hash verify, Redis cache with TTL
- Ollama proxy for `/api/chat` and `/api/generate` — both streamed (NDJSON) and non-streamed
- Endpoint allowlist enforced
- **Model discovery (SPEC §4.6):** background poll of Ollama `/api/tags`, cached in Redis + in-process, fail-closed when unavailable
- Model allowlist enforced per-tenant via the **effective set** (allow_all → all discovered; else `allowed_models ∩ discovered`); key-level `allow_all_models` overrides tenant
- Error handler: sanitized responses, request ID in every error
- Audit log writer (buffered, async)
- Mock Ollama in `tests/integration/mock_ollama.py` (no real model required for CI)
- **Exit criterion:** A key created via CLI can call `/api/chat` and `/api/generate` through Caddy → gateway → mock Ollama, streaming works, audit rows land in Postgres with correct token counts, `/api/pull` returns 403, no-auth returns 401, wrong-key returns 401. Model discovery populates from the (mock) Ollama `/api/tags`; `/api/tags` returns the tenant's effective set; an `allow_all_models` tenant sees all discovered models, a default-deny tenant sees only `allowed ∩ discovered`, and a non-effective model returns 403; discovery-unavailable fails closed. Integration tests cover all of the above.
### Phase 3 — Rate limit + budget + OpenAI-compat
- Sliding window rate limit (Redis Lua script) — per-key RPM, per-tenant RPM, per-key TPM
- Concurrency semaphore (Redis-backed) with TTL guard
- Token budget counters in Redis with Postgres ledger reconciliation on period rollover
- OpenAI-compatibility layer: `/v1/chat/completions`, `/v1/completions`, `/v1/embeddings`, `/v1/models` with full SSE streaming and `data: [DONE]` terminator
- Schema translation tests with golden fixtures (request in OpenAI → expected Ollama request; response from Ollama → expected OpenAI response)
- Rate-limit and budget response headers per SPEC §6.5
- **Exit criterion:** Locust test (100 concurrent users, 5 min) shows correct 429 behavior at the limit, correct token accounting, p99 gateway overhead < 25 ms. OpenAI Python SDK pointed at `/v1` successfully completes streaming chat. Killing Redis mid-test produces 503 (fail closed), not 200.
### Phase 4 — Audit, prompt log, revocation
- Prompt log (opt-in per key, TTL) with daily sweeper task
- Audit log retention sweeper (TTL per tenant config)
- Buffered audit writer with ring-buffer overflow → deny-mode behavior
- Revocation flow: console (simulated via direct INSERT in tests) writes `gateway.revocations` → NOTIFY → gateway evicts Redis cache → next request with revoked key returns 401 within 1 second
- Prometheus `/metrics` (loopback only) with: `gateway_requests_total{tenant,model,status}`, `gateway_tokens_total{tenant,model,direction}`, `gateway_request_duration_seconds{tenant,model}` (histogram)
- `/readyz` checks DB + Redis + Ollama all reachable
- Circuit breaker on Ollama failures
- **Exit criterion:** Revocation E2E test green. Prompt log retention TTL works (use freeze-time to simulate). Metrics scrape returns valid Prometheus exposition. `/readyz` flips to 503 when any dependency is down.
### Phase 5 — Harden, document, release
- `docs/ARCHITECTURE.md`, `docs/DEPLOYMENT.md`, `docs/API.md`, `docs/THREAT_MODEL.md`, `docs/OPERATIONS.md` complete
- Caddyfile example with Let's Encrypt for `api.neuronetz.ai` and security headers (HSTS, X-Content-Type-Options, no Server header, no X-Powered-By)
- Systemd unit file for non-Compose deployments
- Multi-stage Dockerfile with non-root user, distroless or `python:3.12-slim` final stage, no build tools in final image
- `pip-audit` and `bandit` clean in CI
- Image scan (Trivy or Grype) clean of HIGH/CRITICAL
- Tag `v0.1.0`, build and push image, GitHub release with changelog
- **Exit criterion:** Every box in SPEC §12 checked, signed off by Control. Image runnable from a fresh host with only docker + a `.env`. README quickstart works for someone who has never seen the repo.
## Agent Role Assignments
For the multi-agent orchestrator (Fritz/UI-UX/DevOps/QA/Control/Timo/PM):
| Agent | Owns |
|---|---|
| **Backend / Fritz** | All Python code under `src/neuronetz_gateway/`, Alembic migrations, CLI. Primary author. |
| **DevOps** | Dockerfile, docker-compose.yml(s), Caddyfile, systemd unit, CI workflows, image scanning, release tagging. |
| **QA** | All tests under `tests/`. Owns coverage gate. Writes the locust scenarios. Verifies acceptance criteria at each phase exit. |
| **UI-UX** | Not active this project (no UI surface here). Console project will pick this up. |
| **Control / Timo** | Enforces phase gates. Refuses to advance a phase whose exit criterion isn't met. Runs the acceptance checklist at end of Phase 5. |
| **PM** | Tracks the phase progression, opens YouTrack tickets per phase, runs daily standups against this prompt, surfaces blockers. |
## Working Agreements
- **Branch per phase.** `phase-1-scaffold`, `phase-2-proxy-auth`, etc. Merge to `main` only after phase exit criterion is verified.
- **PRs are reviewed against SPEC.md.** "Does this match the spec? If not, is SPEC.md wrong or is the PR wrong?" — that's the review question.
- **SPEC changes are explicit.** If a phase reveals a spec mistake, amend SPEC.md in a separate PR before changing the implementation. Never drift silently.
- **Commit messages reference the section.** e.g. `auth: implement argon2id verify per SPEC §5, §9`.
- **No TODOs in main.** If something is deferred, it becomes a tracked issue, not a code comment.
- **Open questions (SPEC §13) are resolved in writing.** Decision goes in SPEC.md, not in a Slack message that gets lost.
## What "Done" Looks Like
A fresh clone, a fresh host, a domain pointing at it, and a `.env` file. `docker compose up`. Five minutes later, `curl -H "Authorization: Bearer nz_..." https://api.neuronetz.ai/v1/chat/completions -d '...'` streams a response. The Ollama port is not open. The audit log has a row. The budget counter decremented. The metrics endpoint shows the request. The locust suite passes. The threat model document explains every defense.
When all of that is true and SPEC §12 is fully ticked, ship v0.1.0.
## When You Get Stuck
- **Ambiguity in the spec → ask, don't guess.** Open a question in the PM channel; if resolved, amend SPEC.md.
- **Conflict between speed and correctness → correctness wins.** This is security infrastructure. We do not ship "good enough."
- **Conflict between scope creep and v0.1.0 → defer.** New ideas go in a follow-up issue. v0.1.0 ships per spec.
Start with Phase 1. Read SPEC.md first.

593
scope-docs/SPEC.md Normal file
View File

@@ -0,0 +1,593 @@
# neuronetz-gateway — SPEC.md
**Project:** `neuronetz-gateway`
**Version:** 0.1.0 (target)
**Status:** Specification — not yet implemented
**License:** Apache 2.0
**Owner:** Stephan Berbig / Neuronetz
---
## 1. Purpose
A secure, multi-tenant API gateway in front of an Ollama instance currently exposed at `https://api.neuronetz.ai`. The Ollama endpoint must never be reachable directly from the public internet again. All access flows through this gateway.
The gateway is the **hot path** of the Neuronetz API. A separate service (`neuronetz-console`, built on the Nibiru PHP framework) handles administration, dashboards, and tenant self-service. This SPEC covers only the gateway.
## 2. Scope
### In scope (v0.1.0)
- Authentication via API keys (Bearer tokens)
- Multi-tenant data model (tenants → keys, with inheritance)
- Per-key and per-tenant rate limiting (RPM, TPM, concurrent)
- Per-key and per-tenant token budgets (daily, monthly, total)
- Streaming and non-streaming proxy to Ollama
- Dual API surface: native Ollama (`/api/*`) and OpenAI-compatible (`/v1/*`)
- Endpoint allowlist (block all model-mutating Ollama endpoints)
- **Dynamic model discovery** from the Ollama backend — the live set of installed models is queried, cached, and auto-refreshed; nothing about the model list is hand-maintained
- Model allowlist (per-tenant override), **default-deny, resolved against the live discovered set** (stale/typo'd entries never resolve)
- **Per-tenant `allow_all_models` toggle** — opt-in: a flagged tenant may use any currently-installed model, so models newly pulled into Ollama are auto-granted on the next discovery refresh
- Request size limits, response size limits, timeouts
- Token counting from Ollama responses (precise, not heuristic)
- Audit log (always-on metadata)
- Prompt log (opt-in per key, TTL'd retention)
- Bootstrap CLI: create tenants, keys, set budgets
- Health and readiness endpoints
- Docker Compose deployment (gateway + caddy + postgres + redis + ollama)
- Caddy as TLS terminator (Let's Encrypt for `api.neuronetz.ai`)
### Out of scope (v0.1.0, document as future)
- Web admin UI (lives in `neuronetz-console`, separate repo)
- Billing / Stripe integration (budgets only, no money yet)
- Multi-region / HA / k8s
- Content moderation / prompt-injection filtering
- Response caching
- Multi-backend routing (one Ollama; pluggable backend interface stays for later)
- Webhook notifications
- SSO / OAuth2 for admin
## 3. Threat Model (abbreviated)
| Threat | Mitigation |
|---|---|
| Internet scanners hitting Ollama directly | Ollama bound to internal Docker network; never published |
| Unauthenticated API abuse | Mandatory Bearer token; fail-closed on auth errors |
| API key brute force | Argon2id hashing; constant-time compare; rate limit on auth failures per source IP |
| GPU/token exhaustion (cost attack) | Per-key TPM + token budget; per-tenant ceiling; concurrent connection cap |
| Resource exhaustion via large payloads | Request body size limit (default 256 KiB); `num_predict` cap (default 4096) |
| Model enumeration / training-data exfil via uncommon models | Model allowlist; default-deny. `allow_all_models` is **opt-in per tenant and audited**. Discovery only ever exposes models actually installed on the backend; `/api/tags` and `/v1/models` never reveal models outside the tenant's effective set; "not allowed" and "doesn't exist" return the same generic response |
| Discovery backend unreachable | Fail-closed: an empty/stale-expired discovered set means no model resolves, so requests are denied — never "allow because we couldn't list models" |
| Ollama mutation (model pull/delete) by attacker | Endpoint allowlist; mutating endpoints (`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`) hard-blocked at the gateway |
| Information disclosure via error messages | Sanitize upstream errors; never proxy Ollama internals to client |
| Audit log tampering | Append-only at app layer; DB role separation; optional WAL archiving |
| Prompt data leakage | Prompt logging off by default; opt-in per key; TTL'd; redaction hook |
| Redis outage causing "fail open" | Fail-closed: if rate-limit/budget backend is unavailable, deny |
| Compromised admin token | Admin token lives in `neuronetz-console`, not in gateway; gateway has no admin endpoints |
## 4. Architecture
### 4.1 Component diagram
```
Internet
│ TLS
┌──────────────────────┐
│ Caddy (sidecar) │ Let's Encrypt for api.neuronetz.ai
│ - TLS termination │ HSTS, security headers
│ - HTTP/2, HTTP/3 │
└──────────┬───────────┘
│ HTTP/1.1 internal
┌──────────▼───────────┐
│ neuronetz-gateway │ FastAPI + uvicorn
│ - authn │
│ - rate limit │
│ - budget check │
│ - proxy + stream │
│ - token count │
│ - audit write │
└──┬────────┬──────┬───┘
│ │ │
┌──────▼──┐ ┌──▼───┐ │
│Postgres │ │Redis │ │
│ schema: │ │ keys │ │
│ gateway │ │bucket│ │
└─────────┘ └──────┘ │
│ internal network only
┌──────▼──────┐
│ Ollama │
│ 127.0.0.1 │
└─────────────┘
Same Compose stack also hosts (separate from this SPEC):
- neuronetz-console (PHP/Nibiru) → owns schema `console`; SELECT-only on schema `gateway`, plus INSERT on `gateway.revocations`
```
### 4.2 Database schemas
**Single Postgres instance, two schemas:**
- `gateway` — owned by the gateway service; gateway role has full DDL
- `console` — owned by `neuronetz-console` (out of scope here); console role has full DDL
- Both services connect with their own role. Cross-schema access is explicit GRANT.
**Console role gets `SELECT` on all `gateway.*` tables.** Console writes go only to `console.*` tables, with one exception: if the console needs to mutate gateway state (e.g. revoke a key), it does so by inserting into the `gateway.revocations` outbox table — the only `gateway.*` table it is granted `INSERT` on — which the gateway consumes (see §4.5).
### 4.3 Request lifecycle
1. Caddy terminates TLS, forwards to gateway on internal port.
2. Gateway middleware extracts `Authorization: Bearer <key>`.
3. Key prefix (first 12 chars) used as Redis cache key. On miss, lookup `gateway.api_keys` by prefix; verify full key with argon2id `verify`; cache resolved key metadata in Redis (TTL 60s).
4. Rate limit check (sliding window in Redis, Lua-atomic) — per-key RPM + per-tenant RPM.
5. Budget check (Redis counter for current period; Postgres ledger is source of truth on reset).
6. Concurrent-connection semaphore (Redis `INCR` with TTL).
7. Model allowlist check. Resolve the **effective model set** for the key:
`allow_all := key.allow_all_models ?? tenant.allow_all_models`;
`effective := discovered` if `allow_all` else `(key.allowed_models ?? tenant.allowed_models) ∩ discovered`,
where `discovered` is the cached live model set from discovery (§4.6). The request's
`model` must be in `effective`, else a generic 403 with no disclosure of whether the
model exists but is unpermitted vs. is not installed.
8. Endpoint allowlist check.
9. Request body validation (size, schema, `num_predict` cap).
10. If OpenAI-compat path, translate request to Ollama schema.
11. Open httpx async stream to Ollama.
12. Stream response back to client, accumulating final `prompt_eval_count` + `eval_count`.
13. On stream close: write `gateway.audit_log` row; decrement budget; release semaphore; if prompt logging enabled, write `gateway.prompt_log` row.
14. On any failure: sanitized error to client, audit row with status code, semaphore released.
### 4.4 Failure modes (fail-closed)
| Subsystem | If down | Behavior |
|---|---|---|
| Postgres (read) | Key lookup fails | 503 with retry-after; no requests proxied |
| Postgres (write) | Audit write fails | Request still succeeds, audit row buffered in-memory ring (max 1000), drained on recovery; if buffer fills, switch to deny mode |
| Redis | Rate limit / budget unavailable | 503 — fail closed. Never "allow because we can't check." |
| Ollama | Upstream unreachable | 502 with retry-after; circuit breaker opens after 5 consecutive failures, half-open after 30s |
| Caddy | Not a gateway concern | — |
### 4.5 Cache invalidation (key revocation)
Console can revoke a key by inserting into `gateway.revocations(key_id, ts, reason)`. Gateway has a background task (`asyncio.create_task` in lifespan) that:
- LISTENs on Postgres channel `key_revoked` (gateway emits NOTIFY on its own write path; console emits via INSERT trigger)
- On notification, evicts the Redis cache entry for that key's prefix
- This makes revocation effectively immediate (bounded by Postgres NOTIFY delivery plus one Redis round trip) without cross-service HTTP
### 4.6 Model discovery
The set of usable models is **never hand-maintained**; it is extracted live from the
Ollama backend.
- A background task (started in lifespan, like the revocation listener) polls Ollama
`GET /api/tags` every `MODEL_DISCOVERY_REFRESH_S` seconds.
- The parsed model set (names + sanitized metadata: family, parameter size, quantization,
size bytes, modified-at) is cached in Redis under `gateway:models:discovered` with TTL
`MODEL_DISCOVERY_CACHE_TTL_S`, and held in-process for hot reads on the request path.
- On startup an initial fetch runs; if Ollama is unreachable the discovered set is empty.
- **Fail-closed:** if the discovered set is empty or its cache has expired and cannot be
refreshed, no model resolves and requests are denied (consistent with default-deny).
Discovery never opens access on failure.
- "Auto-grant": because the effective set (§4.3 step 7) intersects with `discovered` (or
*is* `discovered` when `allow_all_models`), a model pulled into Ollama out-of-band
becomes usable to `allow_all` tenants on the next refresh — no per-tenant config change.
- Discovery is **read-only** against Ollama and uses only the allowlisted `/api/tags`
endpoint; it never triggers a model pull.
## 5. Data Model (schema `gateway`)
```sql
CREATE SCHEMA gateway;
CREATE TYPE gateway.key_status AS ENUM ('active', 'disabled', 'revoked');
CREATE TYPE gateway.tenant_status AS ENUM ('active', 'suspended', 'closed');
CREATE TYPE gateway.budget_period AS ENUM ('day', 'month', 'total');
CREATE TABLE gateway.tenants (
id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
name text NOT NULL UNIQUE,
status gateway.tenant_status NOT NULL DEFAULT 'active',
created_at timestamptz NOT NULL DEFAULT now(),
metadata jsonb NOT NULL DEFAULT '{}'::jsonb
);
CREATE TABLE gateway.tenant_limits (
tenant_id uuid PRIMARY KEY REFERENCES gateway.tenants(id) ON DELETE CASCADE,
rpm integer NOT NULL DEFAULT 60,
tpm integer NOT NULL DEFAULT 100000,
concurrent integer NOT NULL DEFAULT 8,
tokens_daily bigint,
tokens_monthly bigint,
tokens_total bigint,
allowed_models text[] NOT NULL DEFAULT '{}',
allow_all_models boolean NOT NULL DEFAULT false, -- opt-in: allow any installed model
log_prompts_default boolean NOT NULL DEFAULT false,
prompt_retention_days integer NOT NULL DEFAULT 30,
audit_retention_days integer NOT NULL DEFAULT 365
);
CREATE TABLE gateway.api_keys (
id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
tenant_id uuid NOT NULL REFERENCES gateway.tenants(id) ON DELETE CASCADE,
prefix text NOT NULL UNIQUE, -- first 12 chars, indexed
key_hash text NOT NULL, -- argon2id
name text NOT NULL,
status gateway.key_status NOT NULL DEFAULT 'active',
scopes text[] NOT NULL DEFAULT '{chat,embeddings}',
created_at timestamptz NOT NULL DEFAULT now(),
last_used_at timestamptz,
expires_at timestamptz,
log_prompts boolean, -- NULL = inherit from tenant
metadata jsonb NOT NULL DEFAULT '{}'::jsonb
);
CREATE INDEX idx_api_keys_prefix ON gateway.api_keys(prefix) WHERE status = 'active';
CREATE INDEX idx_api_keys_tenant ON gateway.api_keys(tenant_id);
CREATE TABLE gateway.key_limits (
key_id uuid PRIMARY KEY REFERENCES gateway.api_keys(id) ON DELETE CASCADE,
rpm integer, -- NULL = inherit tenant
tpm integer,
concurrent integer,
tokens_daily bigint,
tokens_monthly bigint,
tokens_total bigint,
allowed_models text[], -- NULL = inherit tenant
allow_all_models boolean -- NULL = inherit tenant
);
CREATE TABLE gateway.budget_usage (
key_id uuid NOT NULL REFERENCES gateway.api_keys(id) ON DELETE CASCADE,
period gateway.budget_period NOT NULL,
period_start timestamptz NOT NULL,
tokens_in bigint NOT NULL DEFAULT 0,
tokens_out bigint NOT NULL DEFAULT 0,
requests bigint NOT NULL DEFAULT 0,
PRIMARY KEY (key_id, period, period_start)
);
CREATE INDEX idx_budget_usage_period ON gateway.budget_usage(period, period_start);
CREATE TABLE gateway.audit_log (
id bigserial PRIMARY KEY,
ts timestamptz NOT NULL DEFAULT now(),
request_id uuid NOT NULL,
tenant_id uuid, -- nullable for auth-failed rows
key_id uuid,
key_prefix text, -- denormalized for forensic queries
method text NOT NULL,
path text NOT NULL,
model text,
tokens_in integer,
tokens_out integer,
latency_ms integer,
status integer NOT NULL,
client_ip inet,
user_agent text,
error_code text
);
CREATE INDEX idx_audit_ts ON gateway.audit_log(ts);
CREATE INDEX idx_audit_tenant_ts ON gateway.audit_log(tenant_id, ts);
CREATE INDEX idx_audit_key_ts ON gateway.audit_log(key_id, ts);
CREATE TABLE gateway.prompt_log (
id bigserial PRIMARY KEY,
audit_id bigint NOT NULL REFERENCES gateway.audit_log(id) ON DELETE CASCADE,
ts timestamptz NOT NULL DEFAULT now(),
key_id uuid NOT NULL,
request_body jsonb NOT NULL,
response_text text,
retention_until timestamptz NOT NULL
);
CREATE INDEX idx_prompt_log_retention ON gateway.prompt_log(retention_until);
CREATE TABLE gateway.revocations (
id bigserial PRIMARY KEY,
key_id uuid NOT NULL,
ts timestamptz NOT NULL DEFAULT now(),
reason text,
processed_at timestamptz
);
-- Trigger to NOTIFY on revocation insert
CREATE OR REPLACE FUNCTION gateway.notify_key_revoked() RETURNS trigger AS $$
BEGIN
PERFORM pg_notify('key_revoked', NEW.key_id::text);
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER trg_notify_key_revoked
AFTER INSERT ON gateway.revocations
FOR EACH ROW EXECUTE FUNCTION gateway.notify_key_revoked();
-- Grants for console role (created in console SPEC, referenced here)
-- GRANT USAGE ON SCHEMA gateway TO console_role;
-- GRANT SELECT ON ALL TABLES IN SCHEMA gateway TO console_role;
-- GRANT INSERT ON gateway.revocations TO console_role;
```
## 6. API Surface
### 6.1 Native Ollama passthrough (allowlisted)
| Path | Method | Notes |
|---|---|---|
| `/api/chat` | POST | Streamed (NDJSON) and non-streamed |
| `/api/generate` | POST | Streamed (NDJSON) and non-streamed |
| `/api/embeddings` | POST | Non-streamed |
| `/api/embed` | POST | Newer Ollama embeddings endpoint |
| `/api/tags` | GET | Returns the tenant's **effective** model set (live-discovered ∩ allowed, or *all* discovered when `allow_all_models`). Sourced from discovery (§4.6), never a static list |
| `/api/show` | POST | Allowed only for models in the tenant's effective set; returns sanitized model info (no system prompts, no template) |
| `/api/ps` | GET | **Blocked** — leaks loaded models |
| `/api/version` | GET | Returns gateway version, not Ollama version |
### 6.2 Hard-blocked Ollama endpoints (always 403)
`/api/pull`, `/api/push`, `/api/create`, `/api/copy`, `/api/delete`, `/api/blobs/*`
### 6.3 OpenAI-compatible
| Path | Method | Maps to |
|---|---|---|
| `/v1/chat/completions` | POST | `/api/chat` |
| `/v1/completions` | POST | `/api/generate` |
| `/v1/embeddings` | POST | `/api/embed` |
| `/v1/models` | GET | `/api/tags` (the tenant's effective discovered set), in OpenAI model-list format |
Translation must preserve streaming. SSE (`data: {...}\n\n`) for OpenAI-compat; NDJSON for native.
### 6.4 Gateway endpoints
| Path | Method | Auth | Purpose |
|---|---|---|---|
| `/healthz` | GET | none | Liveness — process responsive |
| `/readyz` | GET | none | Readiness — DB + Redis + Ollama all reachable |
| `/metrics` | GET | none (loopback only) | Prometheus exposition (counters, histograms) |
No admin endpoints. Admin lives in `neuronetz-console`.
### 6.5 Response headers
Every proxied response carries:
- `X-Request-ID: <uuid>`
- `X-RateLimit-Limit-Requests: <n>`
- `X-RateLimit-Remaining-Requests: <n>`
- `X-RateLimit-Limit-Tokens: <n>`
- `X-RateLimit-Remaining-Tokens: <n>`
- `X-Budget-Period: day|month|total`
- `X-Budget-Tokens-Remaining: <n>`
429 responses additionally carry `Retry-After: <seconds>`.
## 7. Configuration
All via environment variables, validated by Pydantic Settings on boot. Boot fails loudly on invalid config.
```
# Service
GATEWAY_BIND_HOST=0.0.0.0
GATEWAY_BIND_PORT=8080
GATEWAY_LOG_LEVEL=INFO
GATEWAY_LOG_FORMAT=json # json|console
GATEWAY_REQUEST_ID_HEADER=X-Request-ID
GATEWAY_TRUSTED_PROXIES=127.0.0.1,caddy # for X-Forwarded-For
# Upstream
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_CONNECT_TIMEOUT_S=5
OLLAMA_READ_TIMEOUT_S=600
OLLAMA_MAX_CONNECTIONS=64
# Model discovery (§4.6)
MODEL_DISCOVERY_REFRESH_S=60 # how often to re-query Ollama /api/tags
MODEL_DISCOVERY_CACHE_TTL_S=120 # Redis cache TTL for the discovered model set
# Database
DATABASE_URL=postgresql+asyncpg://gateway:...@postgres:5432/neuronetz
DATABASE_POOL_SIZE=10
DATABASE_POOL_OVERFLOW=20
# Redis
REDIS_URL=redis://redis:6379/0
REDIS_KEY_CACHE_TTL_S=60
# Limits (defaults; per-tenant/key overrides in DB)
DEFAULT_RPM=60
DEFAULT_TPM=100000
DEFAULT_CONCURRENT=8
MAX_REQUEST_BODY_BYTES=262144
MAX_NUM_PREDICT=4096
# Security
ARGON2_TIME_COST=3
ARGON2_MEMORY_COST_KIB=65536
ARGON2_PARALLELISM=4
AUTH_FAILURE_RATE_LIMIT_PER_IP_PER_MIN=20
# Audit
AUDIT_BUFFER_SIZE=1000
PROMPT_LOG_DEFAULT_RETENTION_DAYS=30
AUDIT_LOG_DEFAULT_RETENTION_DAYS=365
```
## 8. Repository Layout
```
neuronetz-gateway/
├── pyproject.toml # uv-managed, ruff, mypy --strict, pytest
├── README.md
├── LICENSE # Apache 2.0
├── docker-compose.yml # full stack incl. console placeholder
├── docker-compose.dev.yml # without caddy, gateway exposed on localhost
├── Dockerfile # multi-stage, python:3.12-slim base
├── .env.example
├── .dockerignore
├── .gitignore
├── alembic.ini
├── alembic/
│ ├── env.py
│ └── versions/
│ └── 0001_initial.py # creates schema `gateway` and all tables
├── ops/
│ ├── caddy/
│ │ └── Caddyfile.example
│ └── systemd/
│ └── neuronetz-gateway.service
├── src/neuronetz_gateway/
│ ├── __init__.py
│ ├── __main__.py # uvicorn entry
│ ├── app.py # FastAPI factory
│ ├── config.py # Pydantic Settings
│ ├── deps.py # DI providers
│ ├── lifespan.py # startup/shutdown, NOTIFY listener
│ ├── errors.py # exception types, handlers, sanitization
│ ├── auth/
│ │ ├── __init__.py
│ │ ├── hashing.py # argon2id wrapper
│ │ ├── keys.py # key generation, prefix, verify
│ │ └── middleware.py
│ ├── ratelimit/
│ │ ├── __init__.py
│ │ ├── sliding_window.py # Redis Lua script
│ │ └── concurrency.py # semaphore via Redis
│ ├── budget/
│ │ ├── __init__.py
│ │ ├── counter.py # Redis period counters
│ │ └── ledger.py # Postgres reconciliation
│ ├── proxy/
│ │ ├── __init__.py
│ │ ├── ollama.py # httpx streaming client
│ │ ├── translate.py # OpenAI <-> Ollama schemas
│ │ ├── token_counter.py # parse usage from stream
│ │ ├── discovery.py # live model discovery from Ollama /api/tags (§4.6)
│ │ └── allowlist.py # effective-set resolution (allow_all / allowed ∩ discovered)
│ ├── routes/
│ │ ├── __init__.py
│ │ ├── ollama_native.py
│ │ ├── openai_compat.py
│ │ └── health.py
│ ├── db/
│ │ ├── __init__.py
│ │ ├── session.py
│ │ ├── models.py # SQLAlchemy 2.0
│ │ └── repositories.py
│ ├── audit/
│ │ ├── __init__.py
│ │ ├── writer.py # buffered async writer
│ │ └── prompt_log.py
│ ├── observability/
│ │ ├── __init__.py
│ │ ├── logging.py # structlog config
│ │ └── metrics.py # prometheus
│ └── cli/
│ ├── __init__.py
│ └── manage.py # typer: create-tenant, create-key, ...
├── tests/
│ ├── conftest.py # testcontainers fixtures
│ ├── unit/
│ │ ├── test_hashing.py
│ │ ├── test_translate.py
│ │ ├── test_token_counter.py
│ │ ├── test_discovery.py
│ │ ├── test_allowlist.py
│ │ └── test_sliding_window.py
│ ├── integration/
│ │ ├── test_auth_flow.py
│ │ ├── test_rate_limit.py
│ │ ├── test_budget.py
│ │ ├── test_proxy_stream.py
│ │ ├── test_openai_compat.py
│ │ ├── test_revocation.py
│ │ └── mock_ollama.py # FastAPI mock with NDJSON/SSE
│ └── load/
│ └── locustfile.py
└── docs/
├── ARCHITECTURE.md
├── DEPLOYMENT.md
├── API.md
├── THREAT_MODEL.md
└── OPERATIONS.md # runbook: revoke key, rotate, check usage
```
## 9. Non-Functional Requirements
- **Performance:** p50 overhead < 5 ms over direct Ollama call (auth + ratelimit + audit); p99 < 25 ms (excluding upstream latency)
- **Streaming:** Time-to-first-byte must not be degraded by gateway logic — audit write happens **after** stream close
- **Memory:** Steady-state RSS < 200 MiB per gateway worker under 100 concurrent streams
- **Concurrency:** Handle 200 concurrent connections per worker; 4 workers per instance default
- **Test coverage:** ≥ 85% line coverage on `src/neuronetz_gateway/` excluding `__main__` and CLI; 100% on `auth/`, `ratelimit/`, `budget/`
- **Security:** No `eval`, no `exec`, no shell-out, no `pickle`. Bandit clean. `pip-audit` clean on every CI run.
- **Type safety:** `mypy --strict` clean
- **Lint:** `ruff check` clean with project ruleset (E, F, I, B, UP, S, ASYNC)
## 10. Tooling
- Python 3.12
- `uv` for dependency management (pyproject.toml + uv.lock)
- FastAPI ≥ 0.115, uvicorn[standard], httpx ≥ 0.27, SQLAlchemy 2.0 (async), asyncpg, redis ≥ 5.0 (with hiredis), structlog, pydantic ≥ 2.9, pydantic-settings, argon2-cffi, typer, prometheus-client
- Test: pytest, pytest-asyncio, pytest-cov, testcontainers, httpx (test client), respx (mock), locust
- Lint/format: ruff, mypy --strict, bandit, pip-audit
- CI: GitHub Actions workflow (lint, type, test with coverage, build image, push on tag)
## 11. Bootstrap CLI (Typer)
```
neuronetz-gateway create-tenant --name "acme" [--rpm 60] [--tpm 100000]
neuronetz-gateway create-key --tenant acme --name "prod-server-1" [--scopes chat,embeddings]
neuronetz-gateway revoke-key --prefix nz_abc12345
neuronetz-gateway list-keys --tenant acme
neuronetz-gateway show-usage --tenant acme [--period day|month|total]
neuronetz-gateway set-budget --key nz_abc12345 --daily 1000000 --monthly 30000000
neuronetz-gateway set-models --tenant acme --models llama3.1:8b,mistral:7b
neuronetz-gateway set-models --tenant acme --allow-all # opt into allow_all_models
neuronetz-gateway set-models --tenant acme --no-allow-all # back to explicit allowlist
neuronetz-gateway list-models [--tenant acme] # show live-discovered models
# (and the tenant's effective set)
```
`create-tenant` accepts `--allow-all-models / --no-allow-all-models` (default off).
`list-models` reads the discovery cache (§4.6); with `--tenant` it also shows that tenant's
resolved effective set.
Key format: `nz_<12-char-prefix><32-char-random>`. Prefix is stored; full key is hashed (argon2id). On creation, the full key is printed exactly once.
## 12. Acceptance Criteria
The build is "done" when every box below is checked. The orchestrator must verify each before declaring v0.1.0.
- [ ] `docker compose up` from a clean checkout produces a running stack with TLS via Caddy (self-signed in dev, Let's Encrypt-ready in prod).
- [ ] CLI creates tenant and key; printed key successfully authenticates an `/api/chat` call.
- [ ] Unauthenticated request returns 401 with no Ollama details leaked.
- [ ] Request to `/api/pull` returns 403 with generic error message.
- [ ] Streaming `/api/chat` works end-to-end; first byte arrives within Ollama's own TTFB + < 10 ms gateway overhead.
- [ ] Streaming `/v1/chat/completions` returns valid SSE with `data: [DONE]` terminator.
- [ ] Token counts in audit log match Ollama's reported `prompt_eval_count` + `eval_count` exactly.
- [ ] `/api/tags` and `/v1/models` reflect the **live** Ollama model set (discovery, §4.6): an `allow_all_models` tenant sees every installed model and a newly-pulled model appears within one refresh interval; a default-deny tenant sees only `allowed_models ∩ discovered`; a request for a model outside the effective set returns a generic 403; with discovery unavailable, requests fail closed (deny), not open.
- [ ] Rate limit triggers at configured RPM with `Retry-After` header.
- [ ] Token budget enforces and blocks at zero remaining with descriptive error.
- [ ] Redis outage causes 503 (fail-closed), not 200.
- [ ] Revocation via `INSERT INTO gateway.revocations` evicts Redis cache within 1 second.
- [ ] `mypy --strict`, `ruff check`, `bandit`, `pip-audit` all clean in CI.
- [ ] Test coverage ≥ 85% overall, 100% in `auth/`, `ratelimit/`, `budget/`.
- [ ] `docs/THREAT_MODEL.md`, `docs/DEPLOYMENT.md`, `docs/OPERATIONS.md` present and accurate.
- [ ] Load test (locust): 100 concurrent users sustained 5 minutes, p99 gateway overhead < 25 ms, zero 5xx outside induced failures.
## 13. Open Questions (decide during build)
1. Embedding cost accounting — Ollama doesn't return `eval_count` for embeddings. Decision: charge based on `prompt_eval_count` only; document as such.
2. SSE vs NDJSON heuristic for OpenAI-compat — always SSE per OpenAI spec. NDJSON only on native `/api/*`.
3. Prometheus cardinality — do not label by `key_id` (too many series); label by `tenant_id` only; per-key data lives in Postgres.
4. **Model discovery source** — the live model list is `GET /api/tags` on the Ollama backend; there is no separate registry. Cached in Redis + in-process, refreshed every `MODEL_DISCOVERY_REFRESH_S`.
5. **Discovery failure is fail-closed** — empty/expired discovered set ⇒ no model resolves ⇒ deny. Discovery never opens access on error.
6. **No existence disclosure** — a model that is installed-but-unpermitted and a model that is not installed both return the same generic response, to prevent enumeration.
7. **`allow_all_models` precedence** — key-level `allow_all_models` (when non-NULL) overrides the tenant flag; otherwise the tenant flag applies. Same NULL-inherits-tenant rule as the other key limits.
## 14. References
- Ollama API: https://github.com/ollama/ollama/blob/main/docs/api.md
- OpenAI Chat Completions: https://platform.openai.com/docs/api-reference/chat
- Nibiru (sibling console project): https://nibiru-framework.com
- Argon2 RFC 9106

View File

@@ -0,0 +1,7 @@
"""neuronetz-gateway: secure multi-tenant API gateway in front of Ollama."""
from __future__ import annotations
# Package version string; it is the only name this module exports.
__version__ = "0.1.0"
__all__ = ["__version__"]

View File

@@ -0,0 +1,28 @@
"""Uvicorn entry point: ``python -m neuronetz_gateway``.
Binds the app to ``GATEWAY_BIND_HOST``:``GATEWAY_BIND_PORT`` (default
0.0.0.0:8080). The factory string is passed to uvicorn so the app is built in
the worker process.
"""
from __future__ import annotations
import uvicorn
from neuronetz_gateway.config import get_settings
def main() -> None:
    """Start uvicorn on the host, port and log level read from the settings.

    The application is handed to uvicorn as an import string together with
    ``factory=True`` rather than as an already-built app object.
    """
    cfg = get_settings()
    bind_host = cfg.gateway_bind_host
    bind_port = cfg.gateway_bind_port
    # The configured level is lower-cased before being passed to uvicorn.
    uvicorn_log_level = cfg.gateway_log_level.lower()
    uvicorn.run(
        "neuronetz_gateway.app:create_app",
        factory=True,
        host=bind_host,
        port=bind_port,
        log_level=uvicorn_log_level,
    )
# Script entry point: start the server only when this module is executed
# directly (``python -m neuronetz_gateway``), never as a side effect of import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,111 @@
"""FastAPI application factory.
``create_app()`` is the shared contract entry point: other agents (DevOps, QA)
import and serve this. It configures logging, installs the request-id and auth
middleware, registers the sanitizing exception handlers, mounts routers, and
binds the lifespan that manages backend handles + background tasks.
Production safety: FastAPI's ``/docs`` + ``/openapi.json`` are disabled by
default (enabled only via ``DOCS_ENABLED``). The ``/playground`` route is served
only when ``PLAYGROUND_ENABLED`` is true and ``PLAYGROUND_FILE`` exists.
"""
from __future__ import annotations
import uuid
from pathlib import Path
from fastapi import FastAPI, Request
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.responses import HTMLResponse, Response
from starlette.types import ASGIApp
from neuronetz_gateway import __version__
from neuronetz_gateway.auth.middleware import AuthMiddleware
from neuronetz_gateway.config import Settings, get_settings
from neuronetz_gateway.errors import register_exception_handlers
from neuronetz_gateway.lifespan import lifespan
from neuronetz_gateway.observability.logging import configure_logging
from neuronetz_gateway.routes import health, ollama_native, openai_compat
class RequestIDMiddleware(BaseHTTPMiddleware):
    """Assign/propagate a request id and expose it on ``request.state``.

    An inbound request-id header is honoured only when its value parses as a
    UUID; it is then normalised to the canonical hyphenated form. Anything
    else (missing, empty, or not a UUID) is replaced by a freshly minted
    UUID4. The SPEC models the request id as a UUID (``audit_log.request_id``
    is a ``uuid`` column and the response header is documented as
    ``X-Request-ID: <uuid>``), so arbitrary client-supplied strings must not
    be propagated.

    The resulting id is stored on ``request.state.request_id`` and echoed on
    the response under the same header name.

    NOTE(review): this middleware does not check whether the sender is one of
    the configured trusted proxies; it only validates the value's shape.
    """

    def __init__(self, app: ASGIApp, header_name: str) -> None:
        """Wrap ``app`` and remember which header carries the request id.

        Args:
            app: The downstream ASGI application.
            header_name: Name of the request/response header to read and set.
        """
        super().__init__(app)
        self._header = header_name

    @staticmethod
    def _coerce_request_id(raw: str | None) -> str:
        """Return ``raw`` as a canonical UUID string, or a new UUID4.

        Args:
            raw: The inbound header value, or ``None`` when absent.

        Returns:
            A canonical (lower-case, hyphenated) UUID string.
        """
        if raw:
            try:
                return str(uuid.UUID(raw.strip()))
            except ValueError:
                # Not a UUID: deliberately discard the client value and fall
                # through to minting a fresh id instead of failing the request.
                pass
        return str(uuid.uuid4())

    async def dispatch(
        self, request: Request, call_next: RequestResponseEndpoint
    ) -> Response:
        """Attach a request id to the request state and to the response.

        Args:
            request: The incoming request.
            call_next: Callable that runs the rest of the middleware/route chain.

        Returns:
            The downstream response with the request-id header set.
        """
        request_id = self._coerce_request_id(request.headers.get(self._header))
        request.state.request_id = request_id
        response = await call_next(request)
        response.headers[self._header] = request_id
        return response
def _register_playground(app: FastAPI, cfg: Settings) -> None:
    """Add the flag-gated ``/playground`` route (HTML asset, owned by docs agent).

    The file is read off the event loop via ``asyncio.to_thread`` so a slow disk
    cannot stall request handling. A missing file is a simple 404, never an
    error — including when the file disappears between the existence check and
    the read.

    Args:
        app: The FastAPI application to register the route on.
        cfg: Settings object; ``cfg.playground_file`` is the path that is served.
    """
    import asyncio as _asyncio

    def _load(path_str: str) -> str | None:
        """Return the file's UTF-8 text, or ``None`` if it is not a regular file."""
        p = Path(path_str)
        # Guard first so directories and other non-regular paths are a plain
        # "not found" without attempting a read.
        if not p.is_file():
            return None
        try:
            return p.read_text(encoding="utf-8")
        except FileNotFoundError:
            # The file was removed after the is_file() check; treat it the
            # same as "never existed" instead of surfacing a 500.
            return None

    @app.get("/playground", include_in_schema=False)
    async def playground() -> Response:
        """Serve the playground HTML, or a 404 when the asset is unavailable."""
        content = await _asyncio.to_thread(_load, cfg.playground_file)
        if content is None:
            return Response(status_code=404, content="Not found")
        return HTMLResponse(content)
def create_app(settings: Settings | None = None) -> FastAPI:
    """Assemble the gateway's FastAPI application.

    Args:
        settings: Configuration to build the app from. When omitted, the
            value returned by ``get_settings()`` is used.

    Returns:
        The FastAPI app with logging configured, middleware installed,
        exception handlers registered and routers mounted.
    """
    config = settings or get_settings()
    configure_logging(
        level=config.gateway_log_level,
        fmt=config.gateway_log_format,
    )

    # Interactive docs and the OpenAPI schema are only exposed when
    # docs_enabled is set; otherwise the URLs are disabled entirely.
    docs_on = config.docs_enabled
    application = FastAPI(
        title="neuronetz-gateway",
        version=__version__,
        lifespan=lifespan,
        docs_url="/docs" if docs_on else None,
        redoc_url="/redoc" if docs_on else None,
        openapi_url="/openapi.json" if docs_on else None,
    )

    # The auth middleware may need settings before lifespan has run (some
    # test setups); lifespan assigns the same attribute, so this is idempotent.
    application.state.settings = config

    # Ordering matters: the middleware added last becomes the outermost layer.
    # Auth is added first so that RequestID wraps it and a request id already
    # exists when the auth middleware emits its sanitized 401.
    application.add_middleware(AuthMiddleware)
    application.add_middleware(
        RequestIDMiddleware,
        header_name=config.gateway_request_id_header,
    )

    register_exception_handlers(application)

    for router in (health.router, openai_compat.router, ollama_native.router):
        application.include_router(router)

    if config.playground_enabled:
        _register_playground(application, config)

    return application
__all__ = ["RequestIDMiddleware", "create_app"]

View File

@@ -0,0 +1,86 @@
"""Application configuration via Pydantic Settings v2.
Reads every environment variable documented in SPEC §7 with the documented
defaults. Boot fails loudly (ValidationError) on invalid config.
"""
from __future__ import annotations
from functools import lru_cache
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Typed runtime configuration for the gateway (SPEC §7 env vars).

    Fields are grouped by concern and each carries its default inline. The
    model reads an optional ``.env`` file, matches names case-insensitively and
    ignores variables it does not know.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
        case_sensitive=False,
    )

    # --- Service ---
    gateway_bind_host: str = Field(default="0.0.0.0")  # noqa: S104 - bind-all is intended in container
    gateway_bind_port: int = Field(default=8080)
    gateway_log_level: str = Field(default="INFO")
    gateway_log_format: str = Field(default="json")  # json|console
    gateway_request_id_header: str = Field(default="X-Request-ID")
    # Comma-separated; see ``trusted_proxies_list`` for the parsed form.
    gateway_trusted_proxies: str = Field(default="127.0.0.1,caddy")

    # --- Upstream (Ollama) ---
    ollama_base_url: str = Field(default="http://ollama:11434")
    ollama_connect_timeout_s: int = Field(default=5)
    ollama_read_timeout_s: int = Field(default=600)
    ollama_max_connections: int = Field(default=64)

    # --- Model discovery (SPEC §4.6) ---
    model_discovery_refresh_s: int = Field(default=60)
    model_discovery_cache_ttl_s: int = Field(default=120)

    # --- Database ---
    database_url: str = Field(
        default="postgresql+asyncpg://gateway:gateway@postgres:5432/neuronetz",
    )
    database_pool_size: int = Field(default=10)
    database_pool_overflow: int = Field(default=20)

    # --- Redis ---
    redis_url: str = Field(default="redis://redis:6379/0")
    redis_key_cache_ttl_s: int = Field(default=60)

    # --- Limits ---
    default_rpm: int = Field(default=60)
    default_tpm: int = Field(default=100_000)
    default_concurrent: int = Field(default=8)
    max_request_body_bytes: int = Field(default=262_144)
    max_num_predict: int = Field(default=4096)

    # --- Security ---
    argon2_time_cost: int = Field(default=3)
    argon2_memory_cost_kib: int = Field(default=65_536)
    argon2_parallelism: int = Field(default=4)
    auth_failure_rate_limit_per_ip_per_min: int = Field(default=20)

    # --- Audit ---
    audit_buffer_size: int = Field(default=1000)
    prompt_log_default_retention_days: int = Field(default=30)
    audit_log_default_retention_days: int = Field(default=365)

    # --- Playground / docs (prod-safe defaults: both OFF) ---
    playground_enabled: bool = Field(default=False)
    playground_file: str = Field(default="/app/playground/index.html")
    docs_enabled: bool = Field(default=False)

    @property
    def trusted_proxies_list(self) -> list[str]:
        """Return ``gateway_trusted_proxies`` split on commas.

        Each entry is whitespace-trimmed and empty entries are dropped.
        """
        trimmed = (entry.strip() for entry in self.gateway_trusted_proxies.split(","))
        return [host for host in trimmed if host]
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build ``Settings`` on the first call and reuse that instance afterwards.

    The single-slot ``lru_cache`` is what makes later calls return the same
    object.
    """
    instance = Settings()
    return instance

View File

@@ -0,0 +1,3 @@
"""Database access layer: SQLAlchemy models, session factory, repositories."""
from __future__ import annotations

View File

@@ -0,0 +1,292 @@
"""SQLAlchemy 2.0 (async) ORM models for schema ``gateway`` per SPEC §5.
These mirror the migration in ``alembic/versions/0001_initial.py`` exactly.
The migration is the authoritative DDL; these models are for application use.
"""
from __future__ import annotations
import datetime
import enum
import uuid
from sqlalchemy import (
BigInteger,
Boolean,
ForeignKey,
Integer,
MetaData,
String,
Text,
text,
)
from sqlalchemy.dialects.postgresql import ARRAY, ENUM, INET, JSONB, TIMESTAMP, UUID
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column
# Postgres schema name; passed to ``MetaData(schema=...)`` on the declarative
# base and to each enum type defined in this module.
GATEWAY_SCHEMA = "gateway"
# Stable naming convention so Alembic autogenerate and ad-hoc DDL agree.
# Keys are SQLAlchemy's constraint-kind tokens: ix = index, uq = unique,
# ck = check, fk = foreign key, pk = primary key.
_NAMING_CONVENTION = {
    "ix": "ix_%(column_0_label)s",
    "uq": "uq_%(table_name)s_%(column_0_name)s",
    "ck": "ck_%(table_name)s_%(constraint_name)s",
    "fk": "fk_%(table_name)s_%(column_0_name)s_%(referred_table_name)s",
    "pk": "pk_%(table_name)s",
}
class Base(DeclarativeBase):
    """Shared declarative base for every ORM model in this module.

    Its metadata pins the ``gateway`` schema and the module's constraint
    naming convention.
    """

    metadata = MetaData(
        schema=GATEWAY_SCHEMA,
        naming_convention=_NAMING_CONVENTION,
    )
class KeyStatus(enum.StrEnum):
"""Lifecycle states for an API key (SPEC §5 ``gateway.key_status``)."""
active = "active"
disabled = "disabled"
revoked = "revoked"
class TenantStatus(enum.StrEnum):
"""Lifecycle states for a tenant (SPEC §5 ``gateway.tenant_status``)."""
active = "active"
suspended = "suspended"
closed = "closed"
class BudgetPeriod(enum.StrEnum):
"""Budget accounting periods (SPEC §5 ``gateway.budget_period``)."""
day = "day"
month = "month"
total = "total"
# Reuse existing Postgres enum types (the migration creates them); do not let
# SQLAlchemy try to CREATE TYPE again at runtime.
# ``create_type=False`` is the switch that suppresses the CREATE TYPE; each
# object below binds one Python enum to its named type in the gateway schema.
_key_status_enum = ENUM(KeyStatus, name="key_status", schema=GATEWAY_SCHEMA, create_type=False)
_tenant_status_enum = ENUM(
    TenantStatus, name="tenant_status", schema=GATEWAY_SCHEMA, create_type=False
)
_budget_period_enum = ENUM(
    BudgetPeriod, name="budget_period", schema=GATEWAY_SCHEMA, create_type=False
)
class Tenant(Base):
    """Top-level isolation and ownership boundary (table ``tenants``)."""

    __tablename__ = "tenants"

    # Primary key; generated by the database.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=text("gen_random_uuid()"),
    )
    # Unique tenant name.
    name: Mapped[str] = mapped_column(Text, nullable=False, unique=True)
    status: Mapped[TenantStatus] = mapped_column(
        _tenant_status_enum,
        nullable=False,
        server_default=text("'active'"),
    )
    created_at: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=False,
        server_default=text("now()"),
    )
    # The attribute is ``tenant_metadata`` while the column is ``metadata``:
    # ``metadata`` is already the MetaData attribute on the declarative base.
    tenant_metadata: Mapped[dict[str, object]] = mapped_column(
        "metadata",
        JSONB,
        nullable=False,
        server_default=text("'{}'::jsonb"),
    )
class TenantLimit(Base):
    """Default limits and retention policy for one tenant (``tenant_limits``).

    The primary key is the owning tenant's id, and the row is removed when the
    tenant is deleted.
    """

    __tablename__ = "tenant_limits"

    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("tenants.id", ondelete="CASCADE"),
        primary_key=True,
    )

    # Rate and concurrency limits.
    rpm: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("60"))
    tpm: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("100000"))
    concurrent: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("8"))

    # Token budgets; these columns are nullable and have no server default.
    tokens_daily: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    tokens_monthly: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    tokens_total: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    allowed_models: Mapped[list[str]] = mapped_column(
        ARRAY(Text),
        nullable=False,
        server_default=text("'{}'"),
    )
    # True: the tenant may use ANY model currently installed on the Ollama
    # backend (resolved live via model discovery). False (the default): access
    # is default-deny, restricted to ``allowed_models`` intersected with the
    # live set.
    allow_all_models: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        server_default=text("false"),
    )

    # Prompt logging default and retention windows, in days.
    log_prompts_default: Mapped[bool] = mapped_column(
        Boolean,
        nullable=False,
        server_default=text("false"),
    )
    prompt_retention_days: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        server_default=text("30"),
    )
    audit_retention_days: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        server_default=text("365"),
    )
class ApiKey(Base):
    """API key owned by a tenant (table ``api_keys``).

    The full key is never stored; the row holds ``prefix`` and ``key_hash``.
    """

    __tablename__ = "api_keys"

    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        server_default=text("gen_random_uuid()"),
    )
    # Owning tenant; keys are removed when the tenant is deleted.
    tenant_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("tenants.id", ondelete="CASCADE"),
        nullable=False,
    )
    prefix: Mapped[str] = mapped_column(Text, nullable=False, unique=True)
    key_hash: Mapped[str] = mapped_column(Text, nullable=False)
    name: Mapped[str] = mapped_column(Text, nullable=False)
    status: Mapped[KeyStatus] = mapped_column(
        _key_status_enum,
        nullable=False,
        server_default=text("'active'"),
    )
    scopes: Mapped[list[str]] = mapped_column(
        ARRAY(Text),
        nullable=False,
        server_default=text("'{chat,embeddings}'"),
    )

    # Timestamps; ``last_used_at`` and ``expires_at`` are nullable.
    created_at: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=False,
        server_default=text("now()"),
    )
    last_used_at: Mapped[datetime.datetime | None] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=True,
    )
    expires_at: Mapped[datetime.datetime | None] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=True,
    )

    # Nullable per-key prompt-logging flag.
    log_prompts: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
    # The attribute is ``key_metadata`` while the column is ``metadata``:
    # ``metadata`` is already the MetaData attribute on the declarative base.
    key_metadata: Mapped[dict[str, object]] = mapped_column(
        "metadata",
        JSONB,
        nullable=False,
        server_default=text("'{}'::jsonb"),
    )
class KeyLimit(Base):
    """Per-key limit overrides (table ``key_limits``).

    A NULL column means the key inherits the tenant's value. The primary key
    is the key's id, and the row is removed when the key is deleted.
    """

    __tablename__ = "key_limits"

    key_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("api_keys.id", ondelete="CASCADE"),
        primary_key=True,
    )

    # Rate and concurrency overrides.
    rpm: Mapped[int | None] = mapped_column(Integer, nullable=True)
    tpm: Mapped[int | None] = mapped_column(Integer, nullable=True)
    concurrent: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # Token budget overrides.
    tokens_daily: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    tokens_monthly: Mapped[int | None] = mapped_column(BigInteger, nullable=True)
    tokens_total: Mapped[int | None] = mapped_column(BigInteger, nullable=True)

    allowed_models: Mapped[list[str] | None] = mapped_column(ARRAY(Text), nullable=True)
    # NULL = inherit tenant's allow_all_models; otherwise overrides it for this key.
    allow_all_models: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
class BudgetUsage(Base):
    """Token and request counters (table ``budget_usage``).

    The composite primary key is (``key_id``, ``period``, ``period_start``).
    Rows are removed when the key is deleted.
    """

    __tablename__ = "budget_usage"

    key_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        ForeignKey("api_keys.id", ondelete="CASCADE"),
        primary_key=True,
    )
    period: Mapped[BudgetPeriod] = mapped_column(_budget_period_enum, primary_key=True)
    period_start: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        primary_key=True,
    )

    # Counters; each starts at zero via its server default.
    tokens_in: Mapped[int] = mapped_column(
        BigInteger, nullable=False, server_default=text("0")
    )
    tokens_out: Mapped[int] = mapped_column(
        BigInteger, nullable=False, server_default=text("0")
    )
    requests: Mapped[int] = mapped_column(
        BigInteger, nullable=False, server_default=text("0")
    )
class AuditLog(Base):
    """Always-on, append-only request metadata log (table ``audit_log``).

    ``tenant_id`` and ``key_id`` are plain nullable UUID columns here; this
    model declares no foreign keys on them.
    """

    __tablename__ = "audit_log"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    ts: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=False,
        server_default=text("now()"),
    )
    request_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)

    # Caller identity; every column in this group is nullable.
    tenant_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    key_id: Mapped[uuid.UUID | None] = mapped_column(UUID(as_uuid=True), nullable=True)
    key_prefix: Mapped[str | None] = mapped_column(Text, nullable=True)

    # What was requested.
    method: Mapped[str] = mapped_column(Text, nullable=False)
    path: Mapped[str] = mapped_column(Text, nullable=False)
    model: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Outcome and measurements.
    tokens_in: Mapped[int | None] = mapped_column(Integer, nullable=True)
    tokens_out: Mapped[int | None] = mapped_column(Integer, nullable=True)
    latency_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
    status: Mapped[int] = mapped_column(Integer, nullable=False)

    # Client details and error classification.
    client_ip: Mapped[str | None] = mapped_column(INET, nullable=True)
    user_agent: Mapped[str | None] = mapped_column(Text, nullable=True)
    error_code: Mapped[str | None] = mapped_column(Text, nullable=True)
class PromptLog(Base):
    """Opt-in, TTL'd capture of request/response bodies (table ``prompt_log``).

    Each row points at one ``audit_log`` row and is removed with it.
    """

    __tablename__ = "prompt_log"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    audit_id: Mapped[int] = mapped_column(
        BigInteger,
        ForeignKey("audit_log.id", ondelete="CASCADE"),
        nullable=False,
    )
    ts: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=False,
        server_default=text("now()"),
    )
    key_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)

    # Captured payloads; the response text is nullable.
    request_body: Mapped[dict[str, object]] = mapped_column(JSONB, nullable=False)
    response_text: Mapped[str | None] = mapped_column(Text, nullable=True)

    # Required timestamp bounding how long the row is retained.
    retention_until: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=False,
    )
class Revocation(Base):
    """Outbox row written by the console (or gateway) to revoke a key.

    An ``AFTER INSERT`` trigger fires ``pg_notify('key_revoked', key_id)``.
    """

    __tablename__ = "revocations"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    key_id: Mapped[uuid.UUID] = mapped_column(UUID(as_uuid=True), nullable=False)
    ts: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=False,
        server_default=text("now()"),
    )
    # Optional free-text reason.
    reason: Mapped[str | None] = mapped_column(String, nullable=True)
    # Nullable timestamp.
    processed_at: Mapped[datetime.datetime | None] = mapped_column(
        TIMESTAMP(timezone=True),
        nullable=True,
    )
__all__ = [
"GATEWAY_SCHEMA",
"ApiKey",
"AuditLog",
"Base",
"BudgetPeriod",
"BudgetUsage",
"KeyLimit",
"KeyStatus",
"PromptLog",
"Revocation",
"Tenant",
"TenantLimit",
"TenantStatus",
]

View File

@@ -0,0 +1,53 @@
"""Async SQLAlchemy engine and session factory construction.
Phase 1 provides the wiring only; the lifespan owns the engine instance and
stores it on ``app.state``. Business-logic callers should depend on the
session factory via ``deps.py``.
"""
from __future__ import annotations
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from sqlalchemy.ext.asyncio import (
AsyncEngine,
AsyncSession,
async_sessionmaker,
create_async_engine,
)
from neuronetz_gateway.config import Settings
def create_engine(settings: Settings) -> AsyncEngine:
    """Create the pooled async engine described by ``settings``.

    Args:
        settings: Supplies ``database_url``, ``database_pool_size`` and
            ``database_pool_overflow``.

    Returns:
        An ``AsyncEngine`` with pre-ping enabled on pooled connections.
    """
    engine_options = {
        "pool_size": settings.database_pool_size,
        "max_overflow": settings.database_pool_overflow,
        "pool_pre_ping": True,
        "future": True,
    }
    return create_async_engine(settings.database_url, **engine_options)
def create_session_factory(engine: AsyncEngine) -> async_sessionmaker[AsyncSession]:
    """Return an ``AsyncSession`` factory bound to ``engine``.

    Sessions are created with ``expire_on_commit=False``.
    """
    factory = async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession)
    return factory
@asynccontextmanager
async def session_scope(
    factory: async_sessionmaker[AsyncSession],
) -> AsyncIterator[AsyncSession]:
    """Yield a session that commits on success and rolls back on error.

    Args:
        factory: Session factory; called once to open the session.

    Yields:
        The open session for the duration of the ``async with`` body.

    Raises:
        Exception: Any exception from the body or from the commit is re-raised
            after the rollback.
    """
    async with factory() as db:
        try:
            yield db
            # The commit stays inside the ``try`` so that a failing commit
            # also takes the rollback path below.
            await db.commit()
        except Exception:
            await db.rollback()
            raise
__all__ = ["create_engine", "create_session_factory", "session_scope"]

View File

@@ -0,0 +1,180 @@
"""FastAPI dependency-injection providers.
Exposes typed accessors for the handles placed on ``app.state`` by the lifespan
(Redis, the upstream httpx client, the DB session factory, the discovery cache)
plus the request principal and the proxy client.
QA override contract
--------------------
Routes obtain the upstream proxy via :func:`get_ollama_client`. Tests override
the *Ollama backend* by overriding this provider::
from neuronetz_gateway.deps import get_ollama_client
from neuronetz_gateway.proxy.ollama import OllamaClient
import httpx
from tests.integration.mock_ollama import create_mock_ollama
transport = httpx.ASGITransport(app=create_mock_ollama())
mock_http = httpx.AsyncClient(transport=transport, base_url="http://ollama")
app.dependency_overrides[get_ollama_client] = lambda: OllamaClient(mock_http)
Because ``get_ollama_client`` returns a fully-built :class:`OllamaClient`, an
override needs no access to ``app.state`` and can point at the in-process mock.
"""
from __future__ import annotations
from collections.abc import AsyncIterator
from typing import Annotated
import httpx
import redis.asyncio as redis
from fastapi import Depends, Request
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from neuronetz_gateway.audit.writer import AuditWriter
from neuronetz_gateway.auth.principal import Principal
from neuronetz_gateway.budget.counter import BudgetCounter
from neuronetz_gateway.config import Settings, get_settings
from neuronetz_gateway.errors import AuthenticationError, DependencyUnavailableError
from neuronetz_gateway.proxy.discovery import DiscoveryCache
from neuronetz_gateway.proxy.ollama import OllamaClient
from neuronetz_gateway.proxy.pipeline import Pipeline
from neuronetz_gateway.ratelimit.concurrency import ConcurrencyLimiter
from neuronetz_gateway.ratelimit.sliding_window import SlidingWindowLimiter
def get_config() -> Settings:
    """Dependency provider returning the cached ``Settings`` instance."""
    cached = get_settings()
    return cached
def get_redis(request: Request) -> redis.Redis:
    """Return the Redis client stored on ``app.state.redis``.

    Raises:
        DependencyUnavailableError: If the attribute is missing or ``None``.
    """
    handle: redis.Redis | None = getattr(request.app.state, "redis", None)
    if handle is not None:
        return handle
    raise DependencyUnavailableError(internal_detail="redis client not initialised")
def get_http_client(request: Request) -> httpx.AsyncClient:
    """Return the upstream httpx client stored on ``app.state.http_client``.

    Raises:
        DependencyUnavailableError: If the attribute is missing or ``None``.
    """
    handle: httpx.AsyncClient | None = getattr(request.app.state, "http_client", None)
    if handle is not None:
        return handle
    raise DependencyUnavailableError(internal_detail="http client not initialised")
def get_ollama_client(request: Request) -> OllamaClient:
    """Wrap the shared httpx client in an ``OllamaClient``.

    This provider is the override target tests use to swap the Ollama backend.
    """
    upstream_http = get_http_client(request)
    return OllamaClient(upstream_http)
def get_discovery_cache(request: Request) -> DiscoveryCache:
    """Return the discovery cache stored on ``app.state.discovery_cache``.

    Raises:
        DependencyUnavailableError: If the attribute is missing or ``None``.
    """
    found: DiscoveryCache | None = getattr(request.app.state, "discovery_cache", None)
    if found is not None:
        return found
    raise DependencyUnavailableError(internal_detail="discovery cache not initialised")
def get_principal(request: Request) -> Principal:
    """Return the principal found on ``request.state.principal``.

    The auth middleware attaches it before routing, so a missing principal on
    a non-exempt route is a programming error; the provider fails closed.

    Raises:
        AuthenticationError: If no principal is attached (rendered as a 401).
    """
    caller: Principal | None = getattr(request.state, "principal", None)
    if caller is not None:
        return caller
    raise AuthenticationError(internal_detail="principal missing on authenticated route")
def get_audit_writer(request: Request) -> AuditWriter:
    """Return the audit writer stored on ``app.state.audit_writer``.

    Raises:
        DependencyUnavailableError: If the attribute is missing or ``None``.
    """
    found: AuditWriter | None = getattr(request.app.state, "audit_writer", None)
    if found is not None:
        return found
    raise DependencyUnavailableError(internal_detail="audit writer not initialised")
def get_pipeline(
    request: Request,
    principal: Annotated[Principal, Depends(get_principal)],
    settings: Annotated[Settings, Depends(get_config)],
    ollama: Annotated[OllamaClient, Depends(get_ollama_client)],
    discovery: Annotated[DiscoveryCache, Depends(get_discovery_cache)],
    redis_client: Annotated[redis.Redis, Depends(get_redis)],
    audit: Annotated[AuditWriter, Depends(get_audit_writer)],
) -> Pipeline:
    """Build the per-request enforcement and proxy pipeline.

    The pipeline owns all hot-path checks (rate limit, budget, concurrency,
    model/endpoint allowlist) and the streaming-with-bookkeeping contract.
    Audit deny-mode flips this to fail closed at the route layer.

    The Redis-backed limiters and the budget counter are created fresh on each
    call around the shared Redis client. The DB session factory is read from
    ``app.state`` and passed through as-is, so it may be ``None``.
    """
    db_factory: async_sessionmaker[AsyncSession] | None = getattr(
        request.app.state, "db_sessionmaker", None
    )
    window_limiter = SlidingWindowLimiter(redis_client)
    slot_limiter = ConcurrencyLimiter(redis_client)
    budget_counter = BudgetCounter(redis_client)
    return Pipeline(
        request=request,
        principal=principal,
        settings=settings,
        ollama=ollama,
        discovery=discovery,
        rate_limiter=window_limiter,
        concurrency=slot_limiter,
        budget=budget_counter,
        audit=audit,
        sessionmaker=db_factory,
    )
def _get_sessionmaker(request: Request) -> async_sessionmaker[AsyncSession]:
"""Return the session factory or fail closed if the engine is absent."""
factory: async_sessionmaker[AsyncSession] | None = getattr(
request.app.state, "db_sessionmaker", None
)
if factory is None:
raise DependencyUnavailableError(internal_detail="db session factory not initialised")
return factory
async def get_db_session(request: Request) -> AsyncIterator[AsyncSession]:
    """Yield a request-scoped async DB session.

    The session is opened from the app's session factory and closed when the
    generator is finalised. This provider issues no commit of its own.

    Raises:
        DependencyUnavailableError: If no session factory is available.
    """
    open_session = _get_sessionmaker(request)
    async with open_session() as db:
        yield db
# ``Annotated`` shorthands: each alias pairs a provider defined above with the
# type it returns, so a dependency can be declared with a single annotation.
ConfigDep = Annotated[Settings, Depends(get_config)]
RedisDep = Annotated[redis.Redis, Depends(get_redis)]
HttpClientDep = Annotated[httpx.AsyncClient, Depends(get_http_client)]
OllamaClientDep = Annotated[OllamaClient, Depends(get_ollama_client)]
DiscoveryCacheDep = Annotated[DiscoveryCache, Depends(get_discovery_cache)]
PrincipalDep = Annotated[Principal, Depends(get_principal)]
AuditWriterDep = Annotated[AuditWriter, Depends(get_audit_writer)]
PipelineDep = Annotated[Pipeline, Depends(get_pipeline)]
DbSessionDep = Annotated[AsyncSession, Depends(get_db_session)]
__all__ = [
"AuditWriterDep",
"ConfigDep",
"DbSessionDep",
"DiscoveryCacheDep",
"HttpClientDep",
"OllamaClientDep",
"PipelineDep",
"PrincipalDep",
"RedisDep",
"get_audit_writer",
"get_config",
"get_db_session",
"get_discovery_cache",
"get_http_client",
"get_ollama_client",
"get_pipeline",
"get_principal",
"get_redis",
]

View File

@@ -0,0 +1,179 @@
"""Exception types and FastAPI exception handlers.
Hard rule (SPEC §3, AGENT_PROMPT non-negotiable #4): never leak upstream or
internal error details to the client. Every error response is a generic,
sanitized JSON body carrying only a stable ``error.code``, a safe message, and
the request id. Detailed context is logged server-side, never returned.
"""
from __future__ import annotations
from fastapi import FastAPI, Request, status
from fastapi.responses import JSONResponse
from neuronetz_gateway.observability.logging import get_logger
_log = get_logger("errors")
class GatewayError(Exception):
    """Root of the gateway's error hierarchy; maps to a sanitized HTTP response.

    ``message`` MUST be safe to return to clients. Sensitive context goes in
    ``internal_detail``, which is logged but never serialized to the client.
    Subclasses override the three class attributes below.
    """

    status_code: int = status.HTTP_500_INTERNAL_SERVER_ERROR
    code: str = "internal_error"
    message: str = "An internal error occurred."

    def __init__(self, message: str | None = None, *, internal_detail: str | None = None) -> None:
        """Create the error.

        Args:
            message: Client-safe text. ``None`` keeps the class default.
            internal_detail: Server-side-only context for the log.
        """
        # A falsy message falls back to the class default for the base
        # ``Exception`` argument.
        exception_text = message if message else self.message
        super().__init__(exception_text)
        # The instance attribute is overridden for any non-None message.
        if message is not None:
            self.message = message
        self.internal_detail = internal_detail
class AuthenticationError(GatewayError):
    """Credentials are missing or invalid: 401, fail closed, no detail."""

    status_code = status.HTTP_401_UNAUTHORIZED
    code = "unauthorized"
    message = "Authentication required."
class AuthorizationError(GatewayError):
    """Caller is authenticated but scope/model/endpoint is denied: 403."""

    status_code = status.HTTP_403_FORBIDDEN
    code = "forbidden"
    message = "This request is not permitted."
class RateLimitError(GatewayError):
    """Rate limit exceeded: 429.

    The handler attaches ``Retry-After`` when ``retry_after`` is known.
    """

    status_code = status.HTTP_429_TOO_MANY_REQUESTS
    code = "rate_limited"
    message = "Rate limit exceeded."

    def __init__(
        self,
        message: str | None = None,
        *,
        retry_after: int | None = None,
        internal_detail: str | None = None,
    ) -> None:
        """Create the error.

        Args:
            message: Client-safe text. ``None`` keeps the class default.
            retry_after: Value for the ``Retry-After`` header, or ``None``
                when unknown.
            internal_detail: Server-side-only context for the log.
        """
        self.retry_after = retry_after
        super().__init__(message, internal_detail=internal_detail)
class BudgetExceededError(GatewayError):
    """The token budget for the active period is used up: 429."""

    status_code = status.HTTP_429_TOO_MANY_REQUESTS
    code = "budget_exceeded"
    message = "Token budget exhausted for the current period."
class RequestTooLargeError(GatewayError):
    """The request body is larger than the configured limit: 413."""

    status_code = status.HTTP_413_REQUEST_ENTITY_TOO_LARGE
    code = "request_too_large"
    message = "Request body is too large."
class UpstreamUnavailableError(GatewayError):
    """Ollama (or another dependency) cannot be reached: 502, fail closed."""

    status_code = status.HTTP_502_BAD_GATEWAY
    code = "upstream_unavailable"
    message = "The upstream service is temporarily unavailable."
class DependencyUnavailableError(GatewayError):
    """A required backend (DB/Redis) is unavailable: 503, fail closed."""

    status_code = status.HTTP_503_SERVICE_UNAVAILABLE
    code = "service_unavailable"
    message = "The service is temporarily unavailable."
def _request_id(request: Request) -> str:
"""Extract the request id placed on ``request.state`` by middleware."""
rid = getattr(request.state, "request_id", None)
return str(rid) if rid else ""
def _error_response(
    request: Request,
    *,
    status_code: int,
    code: str,
    message: str,
    extra_headers: dict[str, str] | None = None,
) -> JSONResponse:
    """Build the sanitized JSON error response.

    Args:
        request: Request being answered; the source of the request id.
        status_code: HTTP status for the response.
        code: Stable error code placed in the body.
        message: Client-safe message placed in the body.
        extra_headers: Additional headers merged over the request-id header.

    Returns:
        A response whose body is ``{"error": {code, message, request_id}}``.
        The ``X-Request-ID`` header is set only when a request id is known.
    """
    rid = _request_id(request)

    response_headers: dict[str, str] = {}
    if rid:
        response_headers["X-Request-ID"] = rid
    if extra_headers:
        response_headers.update(extra_headers)

    payload = {"error": {"code": code, "message": message, "request_id": rid}}
    return JSONResponse(status_code=status_code, content=payload, headers=response_headers)
async def _gateway_error_handler(request: Request, exc: GatewayError) -> JSONResponse:
    """Convert a ``GatewayError`` into its sanitized JSON response.

    ``internal_detail`` is logged when present and is not passed on to the
    response builder. A ``RateLimitError`` with a known ``retry_after`` adds a
    ``Retry-After`` header.
    """
    detail = exc.internal_detail
    if detail:
        _log.warning(
            "gateway_error",
            code=exc.code,
            status_code=exc.status_code,
            internal_detail=detail,
        )

    retry_headers: dict[str, str] | None = None
    if isinstance(exc, RateLimitError):
        wait_s = exc.retry_after
        if wait_s is not None:
            retry_headers = {"Retry-After": str(wait_s)}

    return _error_response(
        request,
        status_code=exc.status_code,
        code=exc.code,
        message=exc.message,
        extra_headers=retry_headers,
    )
async def _unhandled_error_handler(request: Request, exc: Exception) -> JSONResponse:
    """Catch-all handler for any exception without a more specific handler.

    The real exception is logged; the client gets only a fixed, generic 500
    body.
    """
    _log.error("unhandled_exception", exc_info=exc)
    generic = {
        "status_code": status.HTTP_500_INTERNAL_SERVER_ERROR,
        "code": "internal_error",
        "message": "An internal error occurred.",
    }
    return _error_response(request, **generic)
def register_exception_handlers(app: FastAPI) -> None:
    """Install the sanitizing handlers on ``app``.

    ``GatewayError`` (and subclasses) go to the gateway handler; every other
    ``Exception`` goes to the catch-all 500 handler.
    """
    # The gateway handler is typed for GatewayError rather than the bare
    # Exception the stub expects, hence the targeted ignore on this line.
    app.add_exception_handler(GatewayError, _gateway_error_handler)  # type: ignore[arg-type]  # handler typed for GatewayError subclass
    app.add_exception_handler(Exception, _unhandled_error_handler)
__all__ = [
"AuthenticationError",
"AuthorizationError",
"BudgetExceededError",
"DependencyUnavailableError",
"GatewayError",
"RateLimitError",
"RequestTooLargeError",
"UpstreamUnavailableError",
"register_exception_handlers",
]

View File

@@ -0,0 +1,131 @@
"""Application lifespan: connect/dispose backends and run background tasks.
Startup connects Postgres + Redis + the upstream httpx client, builds the
argon2 hasher and the buffered audit writer, and launches the background tasks:
the model-discovery poller (SPEC §4.6) and the Postgres revocation NOTIFY
listener (SPEC §4.5). Connection failures are tolerated so ``/healthz`` always
serves; ``/readyz`` reports true readiness. All handles live on ``app.state``.
"""
from __future__ import annotations
import asyncio
import contextlib
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import TYPE_CHECKING
import httpx
import redis.asyncio as redis
from neuronetz_gateway.audit.writer import AuditWriter
from neuronetz_gateway.auth.hashing import build_hasher
from neuronetz_gateway.config import Settings, get_settings
from neuronetz_gateway.db.session import create_engine, create_session_factory
from neuronetz_gateway.observability.logging import get_logger
from neuronetz_gateway.proxy.discovery import DiscoveryCache, discovery_loop
from neuronetz_gateway.revocation import revocation_listener
if TYPE_CHECKING:
from fastapi import FastAPI
_log = get_logger("lifespan")
def _build_http_client(settings: Settings) -> httpx.AsyncClient:
    """Create the shared httpx client used to reach Ollama.

    The connect timeout also serves as the pool-acquire timeout, and the read
    timeout also serves as the write timeout. The connection limit comes from
    ``ollama_max_connections``.
    """
    connect_s = settings.ollama_connect_timeout_s
    read_s = settings.ollama_read_timeout_s
    return httpx.AsyncClient(
        base_url=settings.ollama_base_url,
        timeout=httpx.Timeout(connect=connect_s, read=read_s, write=read_s, pool=connect_s),
        limits=httpx.Limits(max_connections=settings.ollama_max_connections),
    )
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Manage startup/shutdown of all backends and background tasks.

    Startup places the settings, hasher, discovery cache, DB engine and session
    factory, Redis client, upstream httpx client and audit writer on
    ``app.state``. It then launches the discovery loop and, when both Redis and
    the DB session factory are available, the revocation listener. Failures
    while constructing the DB engine or the Redis client are logged and
    tolerated, leaving the corresponding handles as ``None``.

    Settings already on ``app.state.settings`` take precedence over the
    process-wide ``get_settings()``. Otherwise a ``Settings`` instance injected
    when the app was built would be silently replaced at startup.

    Args:
        app: The application whose ``state`` receives the handles.

    Yields:
        ``None`` while the application is serving; shutdown runs afterwards
        even if serving ends with an exception.
    """
    # Honour injected settings; fall back to the cached global configuration.
    injected: Settings | None = getattr(app.state, "settings", None)
    settings: Settings = injected if injected is not None else get_settings()
    app.state.settings = settings
    app.state.hasher = build_hasher(settings)
    app.state.discovery_cache = DiscoveryCache()
    tasks: list[asyncio.Task[None]] = []

    try:
        engine = create_engine(settings)
        app.state.db_engine = engine
        app.state.db_sessionmaker = create_session_factory(engine)
    except Exception as exc:  # noqa: BLE001 - tolerate so /healthz still serves
        _log.error("db_engine_init_failed", error=str(exc))
        app.state.db_engine = None
        app.state.db_sessionmaker = None

    try:
        app.state.redis = redis.from_url(settings.redis_url, decode_responses=True)
    except Exception as exc:  # noqa: BLE001 - tolerate so /healthz still serves
        _log.error("redis_init_failed", error=str(exc))
        app.state.redis = None

    app.state.http_client = _build_http_client(settings)

    # The writer receives whatever session factory is available (may be None).
    audit_writer = AuditWriter(settings.audit_buffer_size, app.state.db_sessionmaker)
    audit_writer.start()
    app.state.audit_writer = audit_writer

    # Background tasks (cancelled on shutdown).
    tasks.append(
        asyncio.create_task(
            discovery_loop(
                app.state.http_client, app.state.redis, app.state.discovery_cache, settings
            )
        )
    )
    # The revocation listener needs both Redis and the DB session factory.
    if app.state.redis is not None and app.state.db_sessionmaker is not None:
        tasks.append(
            asyncio.create_task(
                revocation_listener(settings, app.state.redis, app.state.db_sessionmaker)
            )
        )
    app.state.background_tasks = tasks

    _log.info("gateway_startup_complete")
    try:
        yield
    finally:
        await _shutdown(app, tasks, audit_writer)
async def _shutdown(
    app: FastAPI, tasks: list[asyncio.Task[None]], audit_writer: AuditWriter
) -> None:
    """Cancel background tasks and dispose of all backend handles.

    Every step is best-effort so that one failure cannot stop the remaining
    resources from being released. In particular, a background task that had
    already failed with its own exception no longer aborts the shutdown: its
    error is logged and cleanup continues.

    Args:
        app: Application whose ``state`` holds the http client, Redis client
            and DB engine to close.
        tasks: Background tasks to cancel and wait for.
        audit_writer: Writer to stop after the tasks have finished.
    """
    for task in tasks:
        task.cancel()
    # Collect outcomes instead of awaiting each task directly: awaiting a task
    # that already failed would re-raise its exception here and skip the
    # cleanup below.
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)
    for outcome in outcomes:
        # CancelledError derives from BaseException, so expected cancellations
        # are not reported by this check.
        if isinstance(outcome, Exception):
            _log.warning("background_task_failed", error=str(outcome))

    with contextlib.suppress(Exception):
        await audit_writer.stop()

    http_client: httpx.AsyncClient | None = getattr(app.state, "http_client", None)
    if http_client is not None:
        with contextlib.suppress(Exception):
            await http_client.aclose()

    redis_client = getattr(app.state, "redis", None)
    if redis_client is not None:
        with contextlib.suppress(Exception):
            await redis_client.aclose()

    engine = getattr(app.state, "db_engine", None)
    if engine is not None:
        with contextlib.suppress(Exception):
            await engine.dispose()

    _log.info("gateway_shutdown_complete")
__all__ = ["lifespan"]

View File

@@ -0,0 +1,3 @@
"""Observability: structured logging and Prometheus metrics."""
from __future__ import annotations

View File

@@ -0,0 +1,48 @@
"""structlog configuration.
Renders JSON in production (``GATEWAY_LOG_FORMAT=json``) and a human-friendly
console format in development. No secrets are ever logged; processors here
must not introduce any.
"""
from __future__ import annotations
import logging
from typing import Any
import structlog
def configure_logging(level: str = "INFO", fmt: str = "json") -> None:
    """Set up stdlib logging and structlog.

    Args:
        level: Name of a ``logging`` level, case-insensitive. A name the
            ``logging`` module does not define falls back to ``INFO``.
        fmt: ``"console"`` selects the console renderer; any other value
            selects JSON.
    """
    numeric_level = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(format="%(message)s", level=numeric_level)

    final_renderer: structlog.types.Processor = (
        structlog.dev.ConsoleRenderer()
        if fmt == "console"
        else structlog.processors.JSONRenderer()
    )
    # The renderer has to be the last processor in the chain.
    chain: list[structlog.types.Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        final_renderer,
    ]

    structlog.configure(
        processors=chain,
        wrapper_class=structlog.make_filtering_bound_logger(numeric_level),
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
def get_logger(name: str | None = None) -> Any:  # noqa: ANN401 - structlog returns a dynamic proxy
    """Return the structlog logger for ``name``."""
    logger = structlog.get_logger(name)
    return logger
__all__ = ["configure_logging", "get_logger"]

View File

@@ -0,0 +1,3 @@
"""HTTP route modules: health, native Ollama passthrough, OpenAI-compat."""
from __future__ import annotations

View File

@@ -0,0 +1,114 @@
"""Health, readiness, and metrics endpoints (SPEC §6.4).
- ``GET /healthz`` : liveness — always 200 if the process can respond.
- ``GET /readyz`` : readiness — 200 only if Postgres + Redis + Ollama are all
reachable; otherwise 503 with which dependencies are down.
In Phase 1 dev there is no Ollama, so 503 is expected.
- ``GET /metrics`` : Prometheus exposition. (Loopback-only IP check deferred.)
None of these endpoints require auth and none leak secrets or internal detail.
"""
from __future__ import annotations

import asyncio
from collections.abc import Awaitable
from typing import Literal, cast

import httpx
import redis.asyncio as redis
from fastapi import APIRouter, Request, Response, status
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker

from neuronetz_gateway.observability.logging import get_logger
from neuronetz_gateway.observability.metrics import CONTENT_TYPE_LATEST, render_latest
# Router collecting the health/readiness/metrics routes, tagged "health".
router = APIRouter(tags=["health"])
# Module-level logger named "health", used by the readiness checks below.
_log = get_logger("health")
class HealthResponse(BaseModel):
    """Payload returned by the liveness endpoint."""

    # Restricted to the single literal "ok", which is also the default value.
    status: Literal["ok"] = "ok"
class ReadyResponse(BaseModel):
    """Payload returned by the readiness endpoint."""

    # Overall verdict: either "ready" or "not_ready".
    status: Literal["ready", "not_ready"]
    # Per-dependency result: dependency name -> whether it was reachable.
    checks: dict[str, bool]
@router.get("/healthz", response_model=HealthResponse, status_code=status.HTTP_200_OK)
async def healthz() -> HealthResponse:
    """Liveness probe: answer 200 with the default body; no dependency is consulted."""
    alive = HealthResponse()
    return alive
async def _check_postgres(app_state: object) -> bool:
    """Probe Postgres by running ``SELECT 1`` through the app's session factory.

    Reads ``db_sessionmaker`` from ``app_state``. Returns ``False`` when that
    attribute is missing/``None`` or when opening the session or executing the
    query raises; returns ``True`` otherwise.
    """
    session_factory: async_sessionmaker[AsyncSession] | None = getattr(
        app_state, "db_sessionmaker", None
    )
    if session_factory is None:
        # No session factory on the app state: report not ready.
        return False

    try:
        async with session_factory() as db:
            await db.execute(text("SELECT 1"))
    except Exception as err:  # noqa: BLE001 - any failure means not ready
        _log.warning("readyz_postgres_unreachable", error=str(err))
        return False
    else:
        return True
async def _check_redis(app_state: object) -> bool:
    """Probe Redis with PING using the client stored on ``app_state.redis``.

    Returns ``False`` when no client is present or when the PING raises;
    otherwise returns the truthiness of the PING reply.
    """
    redis_client: redis.Redis | None = getattr(app_state, "redis", None)
    if redis_client is None:
        return False

    try:
        # redis-py annotates ping() as Awaitable[bool] | bool because the sync
        # and async clients share stubs; the cast narrows it to the awaitable
        # form so the result can be awaited under strict type checking.
        pending = cast("Awaitable[bool]", redis_client.ping())
        healthy = bool(await pending)
    except Exception as err:  # noqa: BLE001 - any failure means not ready
        _log.warning("readyz_redis_unreachable", error=str(err))
        return False
    return healthy
async def _check_ollama(app_state: object) -> bool:
    """Probe Ollama by issuing ``GET /`` on the client at ``app_state.http_client``.

    Returns ``False`` when no client is present or when the request raises.
    Any response with a status code below 500 counts as reachable.
    """
    upstream: httpx.AsyncClient | None = getattr(app_state, "http_client", None)
    if upstream is None:
        return False

    try:
        reply = await upstream.get("/")
        reachable = reply.status_code < 500
    except Exception as err:  # noqa: BLE001 - any failure means not ready
        _log.warning("readyz_ollama_unreachable", error=str(err))
        return False
    return reachable
@router.get("/readyz", response_model=ReadyResponse)
async def readyz(request: Request, response: Response) -> ReadyResponse:
    """Readiness probe — 200 only if every dependency is reachable, else 503.

    Args:
        request: Incoming request; its ``app.state`` is handed to each check.
        response: Response object whose status code is set to 503 when at
            least one dependency check fails.

    Returns:
        A ``ReadyResponse`` with the overall status and the per-dependency
        results keyed ``"postgres"``, ``"redis"`` and ``"ollama"``.
    """
    app_state = request.app.state

    # The checks are independent and each one converts its own failures into
    # ``False``, so run them concurrently: the probe then takes as long as the
    # slowest dependency instead of the sum of all three.
    postgres_ok, redis_ok, ollama_ok = await asyncio.gather(
        _check_postgres(app_state),
        _check_redis(app_state),
        _check_ollama(app_state),
    )
    checks = {"postgres": postgres_ok, "redis": redis_ok, "ollama": ollama_ok}

    all_ready = all(checks.values())
    if not all_ready:
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
    return ReadyResponse(status="ready" if all_ready else "not_ready", checks=checks)
@router.get("/metrics")
async def metrics() -> Response:
    """Serve the Prometheus exposition; loopback-only enforcement is deferred to Phase 4."""
    exposition = render_latest()
    return Response(content=exposition, media_type=CONTENT_TYPE_LATEST)
# Only the router is exported; the check helpers and response models stay module-internal.
__all__ = ["router"]