stage-19-fix2: OTX — narrow by modified_since, longer timeout

The /pulses/subscribed endpoint enumerates every curated feed a fresh
account is auto-subscribed to. That alone is enough to make OTX's
backend return a 504, regardless of the client timeout. Narrowing by
modified_since=now-7d brings the response back to a single-second fetch.

Also: _http now accepts query params and a per-call timeout override
(OTX uses 120s). The OTX request is sent with limit=20; the CLI --limit
still slices post-fetch.

Verified live: 10 OTX pulse-cases ingested, each carrying real
paragraph-form descriptions (Mirai, macOS Stealer, FlowerStorm PhaaS,
Vidar v1.5, manufacturing intrusion) — exactly the real-prose source
the IOC extractor's been missing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
m17hr1l
2026-05-20 22:39:24 +02:00
parent f6fa52839f
commit 376c5b6f4a

View File

@@ -11,7 +11,7 @@ from __future__ import annotations
import csv
import io
import os
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from typing import Any, Callable, Dict, Iterable, List, Optional
from urllib.parse import urlparse
@@ -40,18 +40,20 @@ def _http(
headers: Optional[Dict[str, str]] = None,
json_body: Optional[Dict[str, Any]] = None,
form_body: Optional[Dict[str, Any]] = None,
params: Optional[Dict[str, Any]] = None,
timeout: float = HTTP_TIMEOUT,
) -> httpx.Response:
h = {"User-Agent": USER_AGENT}
if headers:
h.update(headers)
with httpx.Client(timeout=HTTP_TIMEOUT, headers=h, follow_redirects=True) as client:
with httpx.Client(timeout=timeout, headers=h, follow_redirects=True) as client:
if method.upper() == "POST":
if form_body is not None:
resp = client.post(url, data=form_body)
resp = client.post(url, data=form_body, params=params)
else:
resp = client.post(url, json=json_body)
resp = client.post(url, json=json_body, params=params)
else:
resp = client.get(url)
resp = client.get(url, params=params)
resp.raise_for_status()
return resp
@@ -352,7 +354,16 @@ def _fetch_otx() -> List[Case]:
key = os.environ.get("OTX_API_KEY", "").strip()
if not key:
raise RuntimeError("OTX_API_KEY not set — free key at https://otx.alienvault.com → settings → API")
data = _http("GET", OTX_PULSES_API, headers={"X-OTX-API-KEY": key}).json()
# OTX auto-subscribes a new account to many curated feeds, so an
# unfiltered /pulses/subscribed request can make OTX's backend return a
# 504. modified_since narrows the query to pulses changed in the last
# 7 days; limit=20 caps the page size.
since = (datetime.now(timezone.utc) - timedelta(days=7)).strftime("%Y-%m-%dT%H:%M:%S")
data = _http(
"GET", OTX_PULSES_API,
headers={"X-OTX-API-KEY": key},
params={"limit": 20, "modified_since": since},
timeout=120.0,
).json()
pulses = data.get("results") or []
out: List[Case] = []
for p in pulses: