Real test bodies (not stubs), driven against an in-process httpx.ASGITransport override of the gateway's get_ollama_client dependency pointing at tests/integration/mock_ollama.py.

Unit (target 100% on auth/, ratelimit/, budget/):
- argon2id roundtrip, wrong-key, garbage encoding, needs_rehash on param change
- key format/uniqueness/prefix extraction
- token counter (prompt_eval_count + eval_count, embeddings, missing-counts)
- translate (OpenAI <-> Ollama for chat/completion/embeddings, streaming chunks, /v1/models list shape)
- allowlist (hard-blocks, effective-set semantics across allow_all/inheritance/empty-discovered)
- discovery (parse, cache roundtrip with TTL, fail-closed, tolerates redis=None)
- sliding window (allow/block/reset/per-key vs per-tenant/cost-weighted)

Integration (testcontainers postgres + redis + in-process mock Ollama):
- auth flow (no/malformed/wrong key all return identical sanitized 401)
- proxy stream (NDJSON roundtrip, audit row's token counts match, hard-blocked endpoints uniformly 403)
- openai_compat (SSE chunks, data: [DONE], non-stream shape, /v1/models)
- model_discovery (allow_all sees all, default-deny sees allowed ∩ discovered, /v1/models filtered, unpermitted-but-installed = nonexistent = 403, empty cache denies even allow_all)
- rate_limit (429 + Retry-After + headers; Redis down ⇒ 503, never 200)
- budget (decrement + headers; pre-burned counter blocks next request)
- revocation (INSERT into gateway.revocations → NOTIFY → cache evicted → 401 ≤ 1s)

Includes a known-issue xfail flagging a bug in ratelimit/sliding_window.py: the per-hit ZSET member uses id(object()), which can return the same id on consecutive calls (the temporary object is freed immediately, so its address may be reused), causing same-millisecond hits to overwrite instead of stacking. To be fixed in a follow-up commit.
50 lines
1.7 KiB
Python
50 lines
1.7 KiB
Python
"""Unit tests for ``neuronetz_gateway.proxy.token_counter``.

Tokens are read precisely from Ollama's final frame: ``prompt_eval_count``
(input) and ``eval_count`` (output) — never estimated (SPEC §2, §4.3 step 12,
§13.1). Embeddings carry only ``prompt_eval_count`` (SPEC §13.1).
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from neuronetz_gateway.proxy.token_counter import TokenUsage, extract_usage
|
|
from tests._skip import call_or_skip
|
|
|
|
|
|
def test_extract_from_final_chat_frame() -> None:
    """Check that a terminal chat frame yields a ``TokenUsage`` with both counts.

    The payload mirrors the terminal NDJSON object emitted by mock_ollama
    (``_final_metrics``).
    """
    terminal_frame = dict(
        model="llama3.1:8b",
        done=True,
        done_reason="stop",
        total_duration=1_234_567_890,
        prompt_eval_count=11,
        eval_count=7,
    )

    result = call_or_skip(extract_usage, terminal_frame)

    assert isinstance(result, TokenUsage)
    assert result.tokens_in == 11
    assert result.tokens_out == 7
|
|
|
|
|
|
def test_extract_from_generate_frame() -> None:
    """Check that a final frame carrying ``context`` reports 5 in / 42 out."""
    generate_frame = {
        "done": True,
        "context": [1, 2, 3],
        "prompt_eval_count": 5,
        "eval_count": 42,
    }

    result = call_or_skip(extract_usage, generate_frame)

    observed = (result.tokens_in, result.tokens_out)
    assert observed == (5, 42)
|
|
|
|
|
|
def test_embeddings_frame_only_prompt_eval_count() -> None:
    """Check that an embeddings frame reports zero output tokens.

    Embeddings: Ollama returns no ``eval_count`` (SPEC §13.1), so the expected
    ``tokens_out`` is 0 while ``tokens_in`` comes from ``prompt_eval_count``.
    """
    embedding_frame = {
        "embedding": [0.0, 0.1],
        "prompt_eval_count": 9,
    }

    result = call_or_skip(extract_usage, embedding_frame)

    assert result.tokens_in == 9
    assert result.tokens_out == 0
|
|
|
|
|
|
def test_missing_counts_default_to_zero() -> None:
    """Check that a frame with neither counter field yields zero for both counts.

    A frame lacking the counter fields must not raise: charging nothing is
    preferred over crashing the audit/budget path.
    """
    bare_frame = {"done": True}

    result = call_or_skip(extract_usage, bare_frame)

    # Same order as the two explicit assertions: input count first, then output.
    for counter_name in ("tokens_in", "tokens_out"):
        assert getattr(result, counter_name) == 0
|