From 376c5b6f4a9ba5d55e66e660177b2b22535d3588 Mon Sep 17 00:00:00 2001 From: m17hr1l Date: Wed, 20 May 2026 22:39:24 +0200 Subject: [PATCH] =?UTF-8?q?stage-19-fix2:=20OTX=20=E2=80=94=20narrow=20by?= =?UTF-8?q?=20modified=5Fsince,=20longer=20timeout?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /pulses/subscribed endpoint enumerates every curated feed a fresh account is auto-subscribed to. On its own that's enough to 504 from OTX's backend regardless of client timeout. Narrowing by modified_since=now-7d brings the response back to a single-second fetch. Also: _http now accepts params + per-call timeout overrides (OTX uses 120s). The CLI --limit still slices post-fetch. Verified live: 10 OTX pulse-cases ingested, each carrying real paragraph-form descriptions (Mirai, macOS Stealer, FlowerStorm PhaaS, Vidar v1.5, manufacturing intrusion) — exactly the real-prose source the IOC extractor's been missing. Co-Authored-By: Claude Opus 4.7 --- src/psyc/lines/scout.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/psyc/lines/scout.py b/src/psyc/lines/scout.py index 917670f..e822a02 100644 --- a/src/psyc/lines/scout.py +++ b/src/psyc/lines/scout.py @@ -11,7 +11,7 @@ from __future__ import annotations import csv import io import os -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from typing import Any, Callable, Dict, Iterable, List, Optional from urllib.parse import urlparse @@ -40,18 +40,20 @@ def _http( headers: Optional[Dict[str, str]] = None, json_body: Optional[Dict[str, Any]] = None, form_body: Optional[Dict[str, Any]] = None, + params: Optional[Dict[str, Any]] = None, + timeout: float = HTTP_TIMEOUT, ) -> httpx.Response: h = {"User-Agent": USER_AGENT} if headers: h.update(headers) - with httpx.Client(timeout=HTTP_TIMEOUT, headers=h, follow_redirects=True) as client: + with httpx.Client(timeout=timeout, headers=h, follow_redirects=True) as client: if method.upper() == "POST": if form_body is not None: - resp = client.post(url, data=form_body) + resp = client.post(url, data=form_body, params=params) else: - resp = client.post(url, json=json_body) + resp = client.post(url, json=json_body, params=params) else: - resp = client.get(url) + resp = client.get(url, params=params) resp.raise_for_status() return resp @@ -352,7 +354,16 @@ def _fetch_otx() -> List[Case]: key = os.environ.get("OTX_API_KEY", "").strip() if not key: raise RuntimeError("OTX_API_KEY not set — free key at https://otx.alienvault.com → settings → API") - data = _http("GET", OTX_PULSES_API, headers={"X-OTX-API-KEY": key}).json() + # OTX subscribes a new account to many curated feeds, so the unfiltered + # /pulses/subscribed page can 504 on its own backend. modified_since + # narrows to recent pulses; page size 20 caps the response. + since = (datetime.now(timezone.utc) - timedelta(days=7)).strftime("%Y-%m-%dT%H:%M:%S") + data = _http( + "GET", OTX_PULSES_API, + headers={"X-OTX-API-KEY": key}, + params={"limit": 20, "modified_since": since}, + timeout=120.0, + ).json() pulses = data.get("results") or [] out: List[Case] = [] for p in pulses: