stage-19: ThreatFox + MalwareBazaar + OTX Scoutline sources

Three new feeds — biggest near-term data-diversity win. ThreatFox brings
multi-malware IOCs with threat_type signal (botnet_cc → BOTNET,
payload_delivery → MALWARE, phishing → PHISHING). MalwareBazaar brings
file-hash samples with signatures. OTX brings curated multi-source pulses
with paragraph-form descriptions — by far the richest real-prose source.

Auth: THREATFOX_AUTH_KEY (one abuse.ch key covers ThreatFox + MalwareBazaar)
and OTX_API_KEY. When a key is missing, fetch-all skips that feed cleanly and
prints where-to-get-it guidance instead of a traceback. Proofline reliability
table extended: abuse.ch sources rated B/2, OTX rated C/3 (community-driven).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
m17hr1l
2026-05-20 22:14:18 +02:00
parent 994a5c642f
commit d87bd710bb
6 changed files with 385 additions and 12 deletions

View File

@@ -88,10 +88,34 @@ def fetch_feodo(limit: int = typer.Option(50, help="max C2 records to ingest"))
_ingest("feodo", limit)
@app.command("fetch-threatfox")
def fetch_threatfox(limit: int = typer.Option(200, help="max IOCs to ingest")) -> None:
    """ThreatFox (abuse.ch) — needs THREATFOX_AUTH_KEY in .env."""
    # Hand off to the shared ingest helper under the "threatfox" source name;
    # `limit` is passed through unchanged.
    _ingest("threatfox", limit)
@app.command("fetch-malware-bazaar")
def fetch_malware_bazaar(limit: int = typer.Option(100, help="max samples to ingest")) -> None:
    """MalwareBazaar (abuse.ch) — also uses THREATFOX_AUTH_KEY."""
    # Hand off to the shared ingest helper under the "malware-bazaar" source
    # name; `limit` is passed through unchanged.
    _ingest("malware-bazaar", limit)
@app.command("fetch-otx")
def fetch_otx(limit: int = typer.Option(100, help="max pulse-cases to ingest")) -> None:
    """AlienVault OTX — needs OTX_API_KEY in .env."""
    # Hand off to the shared ingest helper under the "otx" source name;
    # `limit` is passed through unchanged.
    _ingest("otx", limit)
@app.command("fetch-all")
def fetch_all() -> None:
    """Fetch every configured source. Keyed feeds skip cleanly when the key is missing."""
    # (source name, max records to ingest). Each source is listed exactly once
    # so no feed is ingested twice in a single run.
    plan = (
        ("urlhaus", 50),
        ("cisa-kev", 100),
        ("feodo", 50),
        ("threatfox", 200),
        ("malware-bazaar", 100),
        ("otx", 100),
    )
    for source, limit in plan:
        try:
            _ingest(source, limit)
        except RuntimeError as exc:
            # Report the skip on stderr and carry on with the remaining
            # sources instead of aborting the whole run.
            typer.echo(f" skip {source}: {exc}", err=True)
@app.command("classify-case")

View File

@@ -12,6 +12,16 @@ _FEED_INCIDENT = {
"urlhaus": IncidentType.MALWARE,
"feodo": IncidentType.BOTNET,
"cisa-kev": IncidentType.EXPLOIT,
"malware-bazaar": IncidentType.MALWARE,
"otx": IncidentType.MALWARE, # default; OTX pulses span many types
}
# ThreatFox rows carry their own threat_type, so it is translated here rather
# than falling back to a single per-feed default. Grouped by target type; the
# resulting key order is botnet_cc, payload_delivery, payload, phishing.
_THREATFOX_THREAT_TYPE = {
    threat_type: incident
    for incident, threat_types in (
        (IncidentType.BOTNET, ("botnet_cc",)),
        (IncidentType.MALWARE, ("payload_delivery", "payload")),
        (IncidentType.PHISHING, ("phishing",)),
    )
    for threat_type in threat_types
}
@@ -33,7 +43,11 @@ def classify(case: Case) -> Case:
def _classify_incident_type_and_tlp(case: Case) -> None:
if case.classification.incident_type is not None:
return
incident = _FEED_INCIDENT.get(case.source_metadata.get("feed", ""))
feed = case.source_metadata.get("feed", "")
if feed == "threatfox":
incident = _THREATFOX_THREAT_TYPE.get(case.source_metadata.get("threat_type", ""), IncidentType.MALWARE)
else:
incident = _FEED_INCIDENT.get(feed)
if incident is None and case.observables.urls:
incident = IncidentType.MALWARE # fallback for un-tagged feeds
if incident is None:

View File

@@ -23,9 +23,12 @@ _SHA_RE = re.compile(r"^[a-fA-F0-9]{32,64}$")
# feed -> (Admiralty source reliability A-F, information credibility 1-6)
# Every feed is listed exactly once: a repeated key in a dict literal is
# silently overridden by the later entry, which can hide a stale rating.
_FEED_RELIABILITY = {
    "cisa-kev": ("A", "1"),        # government catalog, confirmed exploited
    "urlhaus": ("B", "2"),         # established CTI source, confirmed malware
    "feodo": ("B", "2"),           # established CTI source, confirmed C2
    "threatfox": ("B", "2"),       # abuse.ch CTI source
    "malware-bazaar": ("B", "2"),  # abuse.ch CTI source, confirmed sample
    "otx": ("C", "3"),             # community-driven, varying quality
}

View File

@@ -10,14 +10,15 @@ from __future__ import annotations
import csv
import io
import os
from datetime import datetime, timezone
from typing import Callable, Dict, Iterable, List, Optional
from typing import Any, Callable, Dict, Iterable, List, Optional
from urllib.parse import urlparse
import httpx
from psyc import log
from psyc.models import Case, Observables
from psyc.models import Case, IncidentType, Observables
USER_AGENT = "psyc/0.1 (defensive CTI; hackathon prototype)"
@@ -26,17 +27,30 @@ HTTP_TIMEOUT = 30.0
# Bulk feed downloads (CSV / JSON documents).
URLHAUS_RECENT_CSV = "https://urlhaus.abuse.ch/downloads/csv_recent/"
CISA_KEV_JSON = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
FEODO_BLOCKLIST_JSON = "https://feodotracker.abuse.ch/downloads/ipblocklist.json"
# Keyed API endpoints: the ThreatFox / MalwareBazaar / OTX fetchers in this
# module send an auth header with every request to these URLs.
THREATFOX_API = "https://threatfox-api.abuse.ch/api/v1/"
MALWARE_BAZAAR_API = "https://mb-api.abuse.ch/api/v1/"
OTX_PULSES_API = "https://otx.alienvault.com/api/v1/pulses/subscribed"
_log = log.get(__name__)
def _http(
    method: str,
    url: str,
    headers: Optional[Dict[str, str]] = None,
    json_body: Optional[Dict[str, Any]] = None,
) -> httpx.Response:
    """Send a single HTTP request and return the response.

    Args:
        method: Matched case-insensitively. ``"POST"`` sends ``json_body`` as a
            JSON payload; any other value is issued as a plain GET and
            ``json_body`` is ignored.
        url: Absolute URL to request.
        headers: Extra headers, merged over the default ``User-Agent``.
        json_body: JSON-serialisable payload for POST requests.

    Raises:
        httpx.HTTPStatusError: on a 4xx/5xx response (via ``raise_for_status``).
    """
    merged_headers = {"User-Agent": USER_AGENT}
    if headers:
        merged_headers.update(headers)
    with httpx.Client(timeout=HTTP_TIMEOUT, headers=merged_headers, follow_redirects=True) as client:
        if method.upper() == "POST":
            resp = client.post(url, json=json_body)
        else:
            resp = client.get(url)
        resp.raise_for_status()
        return resp


def _http_get(url: str) -> httpx.Response:
    """GET ``url`` with the default headers (thin wrapper over ``_http``)."""
    return _http("GET", url)
def _parse_dt(value: str, fmt: str) -> datetime:
try:
return datetime.strptime(value, fmt).replace(tzinfo=timezone.utc)
@@ -142,12 +156,207 @@ def _fetch_feodo() -> List[Case]:
return [_feodo_record_to_case(r) for r in data]
# --- ThreatFox — multi-malware IOC feed (abuse.ch) -----------------------
# Translation table from ThreatFox threat_type strings to psyc IncidentType,
# grouped by target type; the resulting key order is botnet_cc,
# payload_delivery, payload, phishing.
THREATFOX_THREAT_TYPE: Dict[str, IncidentType] = {
    threat_type: incident
    for incident, threat_types in (
        (IncidentType.BOTNET, ("botnet_cc",)),
        (IncidentType.MALWARE, ("payload_delivery", "payload")),
        (IncidentType.PHISHING, ("phishing",)),
    )
    for threat_type in threat_types
}
def _threatfox_ip(ioc_value: str) -> str:
    """Return the address part of a ThreatFox IP indicator, without any port."""
    if ioc_value.startswith("["):
        # Bracketed IPv6 with a port, e.g. "[2001:db8::1]:443".
        return ioc_value[1:].partition("]")[0]
    if ioc_value.count(":") == 1:
        # Exactly one colon can only be IPv4 "address:port".
        return ioc_value.partition(":")[0]
    # Bare IPv4, or bare IPv6 whose colons belong to the address itself.
    return ioc_value


def _threatfox_row_to_case(r: Dict[str, Any]) -> Optional[Case]:
    """Convert one ThreatFox IOC record into a Case.

    Returns None when the record has no IOC value or type, or when the IOC
    type is not one of: ip:port / ipv4 / ipv6, domain, url, or an
    md5 / sha1 / sha256 hash.

    The IOC value is read from ``ioc_value`` and the timestamp from
    ``first_seen_utc``. The alternative spellings ``ioc`` and ``first_seen``
    (optionally suffixed with " UTC") are accepted as well.
    NOTE(review): the ThreatFox query API appears to use the latter names while
    the JSON export uses the former — confirm against the API documentation.
    """
    ioc_value = str(r.get("ioc_value") or r.get("ioc") or "").strip()
    ioc_type = str(r.get("ioc_type") or "").lower()
    if not ioc_value or not ioc_type:
        return None

    malware = str(r.get("malware_printable") or r.get("malware") or "unknown")
    threat_type = str(r.get("threat_type") or "")
    tags_raw = r.get("tags") or []
    tags = tags_raw if isinstance(tags_raw, list) else []

    obs = Observables()
    if ioc_type in ("ip:port", "ipv4", "ipv6"):
        # A naive split(":")[0] would cut an IPv6 address down to its first group.
        obs.ips = [_threatfox_ip(ioc_value)]
    elif ioc_type == "domain":
        obs.domains = [ioc_value]
    elif ioc_type == "url":
        obs.urls = [ioc_value]
        host = urlparse(ioc_value).hostname or ""
        if host:
            obs.domains = [host]
    elif ioc_type in ("sha256_hash", "md5_hash", "sha1_hash"):
        obs.hashes = [ioc_value]
    else:
        return None

    first_seen = str(r.get("first_seen_utc") or r.get("first_seen") or "").strip()
    if first_seen.endswith(" UTC"):
        # Drop the zone suffix so the value matches the strptime format below.
        first_seen = first_seen[: -len(" UTC")]

    threat_label = threat_type.replace("_", " ") or "malware"
    # Separator before the IOC keeps the label and the value from running together.
    summary = f"ThreatFox: {malware} {threat_label} — {ioc_value}"
    return Case(
        case_id=f"PSYC-THREATFOX-{r.get('id', '')}",
        summary=summary,
        source_type="abuse_feed",
        source_ref=str(r.get("reference") or f"https://threatfox.abuse.ch/ioc/{r.get('id', '')}/"),
        source_metadata=dict(
            feed="threatfox",
            malware=malware,
            threat_type=threat_type,
            ioc_type=ioc_type,
            confidence_level=str(r.get("confidence_level", "")),
            # str() guards against non-string tag entries breaking the join.
            tags=",".join(str(t) for t in tags if t),
            reporter=str(r.get("reporter", "")),
        ),
        observed_at=_parse_dt(first_seen, "%Y-%m-%d %H:%M:%S"),
        observables=obs,
    )
def _fetch_threatfox() -> List[Case]:
    """Fetch the last day of ThreatFox IOCs and convert them to Cases.

    Reads the auth key from the THREATFOX_AUTH_KEY environment variable.

    Returns:
        One Case per record that ``_threatfox_row_to_case`` accepts; an empty
        list when the response carries no list of records.

    Raises:
        RuntimeError: when THREATFOX_AUTH_KEY is unset or blank.
    """
    key = os.environ.get("THREATFOX_AUTH_KEY", "").strip()
    if not key:
        raise RuntimeError("THREATFOX_AUTH_KEY not set — free abuse.ch auth-key from https://auth.abuse.ch/")
    data = _http("POST", THREATFOX_API, headers={"Auth-Key": key}, json_body={"query": "get_iocs", "days": 1}).json()
    rows = data.get("data") if isinstance(data, dict) else None
    if not isinstance(rows, list):
        # A non-list payload (e.g. a status/message string) has no records;
        # iterating it would feed single characters to the row mapper.
        return []
    converted = (_threatfox_row_to_case(row) for row in rows if isinstance(row, dict))
    return [case for case in converted if case is not None]
# --- MalwareBazaar — recent malware samples (abuse.ch) -------------------
def _mb_row_to_case(r: Dict[str, Any]) -> Optional[Case]:
    """Convert one MalwareBazaar sample record into a Case.

    Returns None when the record has no SHA-256 hash; otherwise the Case
    carries every hash present (sha256, sha1, md5 — in that order).
    """

    def text(field: str, default: str = "") -> str:
        # Missing, None and empty values all collapse to `default`.
        return str(r.get(field) or default)

    sha256 = text("sha256_hash")
    if not sha256:
        return None

    signature = text("signature")
    file_type = text("file_type")
    file_name = text("file_name", "unknown")

    raw_tags = r.get("tags") or []
    tag_list = raw_tags if isinstance(raw_tags, list) else []

    digests = [digest for digest in (sha256, text("sha1_hash"), text("md5_hash")) if digest]
    family = signature if signature else "unsigned"

    metadata = {
        "feed": "malware-bazaar",
        "signature": signature,
        "file_type": file_type,
        "file_name": file_name,
        "tags": ",".join(t for t in tag_list if t),
        "reporter": str(r.get("reporter", "")),
    }
    return Case(
        case_id=f"PSYC-MBAZAAR-{sha256[:16]}",
        summary=f"MalwareBazaar: {family} {file_type} sample — {file_name}",
        source_type="abuse_feed",
        source_ref=f"https://bazaar.abuse.ch/sample/{sha256}/",
        source_metadata=metadata,
        observed_at=_parse_dt(text("first_seen"), "%Y-%m-%d %H:%M:%S"),
        observables=Observables(hashes=digests),
    )
def _fetch_malware_bazaar() -> List[Case]:
    """Fetch recent MalwareBazaar samples and convert them to Cases.

    Uses the same THREATFOX_AUTH_KEY environment variable as the ThreatFox
    fetcher and raises RuntimeError when it is unset or blank. Records that
    ``_mb_row_to_case`` rejects are dropped.
    """
    auth_key = os.environ.get("THREATFOX_AUTH_KEY", "").strip()
    if not auth_key:
        raise RuntimeError("THREATFOX_AUTH_KEY not set — abuse.ch auth-key from https://auth.abuse.ch/ also covers MalwareBazaar")
    # NOTE(review): the query is sent as a JSON body; the MalwareBazaar API may
    # expect form-encoded POST fields instead — confirm against its docs.
    payload = _http(
        "POST",
        MALWARE_BAZAAR_API,
        headers={"Auth-Key": auth_key},
        json_body={"query": "get_recent", "selector": "100"},
    ).json()
    converted = (_mb_row_to_case(row) for row in payload.get("data") or [])
    return [case for case in converted if case is not None]
# --- AlienVault OTX — curated multi-source pulses ------------------------
# Only the first N indicators of a pulse are turned into observables.
_OTX_IOC_LIMIT_PER_PULSE = 50


def _otx_add_unique(bucket: List[str], value: str) -> None:
    """Append ``value`` to ``bucket`` unless it is already present."""
    if value not in bucket:
        bucket.append(value)


def _otx_pulse_to_case(p: Dict[str, Any]) -> Optional[Case]:
    """Convert one OTX pulse into a Case.

    Returns None when the pulse has no id, or when none of its first
    ``_OTX_IOC_LIMIT_PER_PULSE`` indicators is an IP, domain/hostname, URL,
    file hash (sha256 / sha1 / md5) or CVE. Repeated indicators are stored
    once, and a URL also contributes its hostname as a domain.
    """
    pulse_id = str(p.get("id") or "")
    if not pulse_id:
        return None

    pulse_name = str(p.get("name") or "OTX pulse")
    description = str(p.get("description") or "")
    tags_raw = p.get("tags") or []
    tags = tags_raw if isinstance(tags_raw, list) else []
    tlp_pulse = str(p.get("tlp") or "white").upper()
    indicators_raw = p.get("indicators") or []
    indicators = indicators_raw if isinstance(indicators_raw, list) else []

    obs = Observables()
    for ind in indicators[:_OTX_IOC_LIMIT_PER_PULSE]:
        if not isinstance(ind, dict):
            # Malformed entry: skip it rather than crash on .get().
            continue
        value = str(ind.get("indicator") or "").strip()
        itype = str(ind.get("type") or "").lower()
        if not value:
            continue
        if itype in ("ipv4", "ipv6"):
            _otx_add_unique(obs.ips, value)
        elif itype in ("domain", "hostname"):
            _otx_add_unique(obs.domains, value)
        elif itype == "url":
            _otx_add_unique(obs.urls, value)
            host = urlparse(value).hostname or ""
            if host:
                _otx_add_unique(obs.domains, host)
        elif itype in ("filehash-sha256", "filehash-sha1", "filehash-md5"):
            _otx_add_unique(obs.hashes, value)
        elif itype == "cve":
            _otx_add_unique(obs.cves, value)

    if not (obs.urls or obs.domains or obs.ips or obs.hashes or obs.cves):
        return None

    return Case(
        case_id=f"PSYC-OTX-{pulse_id}",
        summary=f"OTX: {pulse_name}",
        source_type="threat_intel",
        source_ref=f"https://otx.alienvault.com/pulse/{pulse_id}",
        source_metadata=dict(
            feed="otx",
            pulse_name=pulse_name,
            # Cap the stored description at 2000 characters.
            description=description[:2000],
            # str() guards against non-string tag entries breaking the join.
            tags=",".join(str(t) for t in tags if t),
            tlp_pulse=tlp_pulse,
        ),
        # Drop any fractional seconds before parsing the creation timestamp.
        observed_at=_parse_dt(str(p.get("created") or "").split(".")[0], "%Y-%m-%dT%H:%M:%S"),
        observables=obs,
    )
def _fetch_otx() -> List[Case]:
    """Fetch subscribed OTX pulses and convert them to Cases.

    Reads the API key from the OTX_API_KEY environment variable and raises
    RuntimeError when it is unset or blank. Pulses that
    ``_otx_pulse_to_case`` rejects are dropped.
    """
    api_key = os.environ.get("OTX_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError("OTX_API_KEY not set — free key at https://otx.alienvault.com → settings → API")
    # NOTE(review): only this single response is read; if the endpoint
    # paginates, later pages are not fetched — confirm.
    payload = _http("GET", OTX_PULSES_API, headers={"X-OTX-API-KEY": api_key}).json()
    converted = (_otx_pulse_to_case(pulse) for pulse in payload.get("results") or [])
    return [case for case in converted if case is not None]
# --- registry + dispatch -------------------------------------------------
# Feed name -> zero-argument fetcher returning the Cases parsed from that feed.
# The keyed fetchers (threatfox, malware-bazaar, otx) raise RuntimeError when
# their API key environment variable is unset.
SOURCES: Dict[str, Callable[[], List[Case]]] = {
    "urlhaus": _fetch_urlhaus,
    "cisa-kev": _fetch_cisa_kev,
    "feodo": _fetch_feodo,
    "threatfox": _fetch_threatfox,
    "malware-bazaar": _fetch_malware_bazaar,
    "otx": _fetch_otx,
}