stage-20: defanging pipeline for IOC-extraction augmentation

Real CTI prose defangs IOCs (1[.]2[.]3[.]4, hxxp://, evil[dot]com) so they
don't auto-link in email/chat. A model trained only on canonical inputs
will fail to extract them.

New module lines/defang.py: defang_ip, defang_domain, defang_url, defang_text —
four dot-styles ([.], (.), [dot], {.}) plus protocol defanging
(http→hxxp, https→hxxps). Each occurrence picks its style independently
since real advisories don't keep one style across paragraphs.

train.BuildOptions adds defang_frac (default 0.0) and seed; build()
threads options + a seeded Random through the example builders so
the augmentation is reproducible. Only _ex_ioc_extraction reads it
today — output stays canonical so the model learns messy→canonical.

CLI: train-build and train-build-all gain --defang-frac and --seed.
8 new tests including a frac=1.0 / output-canonical integration check.
The pipeline runs but is dormant at defang_frac=0.0 — psyc-v5 dataset
build will set 0.5 once OTX cases land.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
m17hr1l
2026-05-20 22:33:52 +02:00
parent 85830be9fa
commit f6fa52839f
4 changed files with 197 additions and 11 deletions

View File

@@ -2,7 +2,7 @@
from __future__ import annotations from __future__ import annotations
from typing import List from typing import List, Optional
import typer import typer
import uvicorn import uvicorn
@@ -366,12 +366,15 @@ def mock_cert_serve(host: str = "127.0.0.1", port: int = 8770) -> None:
def train_build( def train_build(
task: str = typer.Option(..., "--task", "-t", help=f"one of: {', '.join(train.TASKS)}"), task: str = typer.Option(..., "--task", "-t", help=f"one of: {', '.join(train.TASKS)}"),
limit: int = typer.Option(10_000, help="max cases to process"), limit: int = typer.Option(10_000, help="max cases to process"),
defang_frac: float = typer.Option(0.0, "--defang-frac", help="fraction of ioc_extraction inputs to defang ([0.0, 1.0])"),
seed: Optional[int] = typer.Option(None, "--seed", help="rng seed for reproducible defanging"),
) -> None: ) -> None:
if task not in train.TASKS: if task not in train.TASKS:
typer.echo(f"unknown task: {task}; choices: {', '.join(train.TASKS)}", err=True) typer.echo(f"unknown task: {task}; choices: {', '.join(train.TASKS)}", err=True)
raise typer.Exit(1) raise typer.Exit(1)
cases = db.list_cases(limit=limit) cases = db.list_cases(limit=limit)
report = train.build(task, cases) options = train.BuildOptions(defang_frac=defang_frac, seed=seed)
report = train.build(task, cases, options=options)
typer.echo(f"task: {report.task}") typer.echo(f"task: {report.task}")
typer.echo(f"path: {report.path}") typer.echo(f"path: {report.path}")
typer.echo(f" written: {report.written}") typer.echo(f" written: {report.written}")
@@ -382,10 +385,15 @@ def train_build(
@app.command("train-build-all") @app.command("train-build-all")
def train_build_all(limit: int = typer.Option(10_000, help="max cases per task")) -> None: def train_build_all(
limit: int = typer.Option(10_000, help="max cases per task"),
defang_frac: float = typer.Option(0.0, "--defang-frac", help="fraction of ioc_extraction inputs to defang ([0.0, 1.0])"),
seed: Optional[int] = typer.Option(None, "--seed", help="rng seed for reproducible defanging"),
) -> None:
cases = db.list_cases(limit=limit) cases = db.list_cases(limit=limit)
options = train.BuildOptions(defang_frac=defang_frac, seed=seed)
for task in train.TASKS: for task in train.TASKS:
report = train.build(task, cases) report = train.build(task, cases, options=options)
typer.echo(f" {task}: wrote {report.written}{report.path.name}") typer.echo(f" {task}: wrote {report.written}{report.path.name}")

73
src/psyc/lines/defang.py Normal file
View File

@@ -0,0 +1,73 @@
"""Defanging — IOC obfuscation styles common in real CTI prose.
Real advisories don't write `1.2.3.4` and `http://evil.com` verbatim; they
defang IOCs into bracket/paren/word forms (`1[.]2[.]3[.]4`, `hxxp://evil[.]com`)
so indicators don't auto-link in email/chat clients. Training the IOC extractor
purely on canonical inputs leaves it brittle. This module corrupts canonical
IOCs into common defanged forms for use as training-time data augmentation.
"""
from __future__ import annotations
import random
from typing import List, Optional
# Dot replacement styles seen in the wild, in rough frequency order.
_DOT_FORMS = ("[.]", "(.)", "[dot]", "{.}")

# Scheme prefix -> defanged spelling. Matching is case-sensitive and anchored
# at the start of the URL; any other scheme is left untouched.
_PROTOCOL_FORMS = {
    "http://": "hxxp://",
    "https://": "hxxps://",
}


def _rng(r: Optional[random.Random]) -> random.Random:
    """Return *r*, or a fresh unseeded ``random.Random`` when *r* is None."""
    return r if r is not None else random.Random()


def _defang_dots(value: str, rng: Optional[random.Random]) -> str:
    """Replace every ``.`` in *value* with one randomly chosen dot style.

    Exactly one style is drawn per call (even when *value* has no dots), so
    a single IOC never mixes styles.
    """
    return value.replace(".", _rng(rng).choice(_DOT_FORMS))


def defang_ip(ip: str, rng: Optional[random.Random] = None) -> str:
    """`1.2.3.4` → `1[.]2[.]3[.]4` (one randomly chosen dot style)."""
    return _defang_dots(ip, rng)


def defang_domain(domain: str, rng: Optional[random.Random] = None) -> str:
    """`evil.com` → `evil[.]com` (one randomly chosen dot style)."""
    return _defang_dots(domain, rng)


def defang_url(url: str, rng: Optional[random.Random] = None) -> str:
    """`http://evil.com/x` → `hxxp://evil[.]com/x` — protocol + dot defanging.

    A leading ``http://`` / ``https://`` is rewritten to ``hxxp://`` /
    ``hxxps://``; other schemes are kept. Every dot in the URL (host, path
    and query alike) is then replaced with one randomly chosen dot style.
    """
    out = url
    for proto, replacement in _PROTOCOL_FORMS.items():
        if out.startswith(proto):
            out = replacement + out[len(proto):]
            break
    return _defang_dots(out, rng)


def _defang_each(text: str, ioc: str, defang, rng: random.Random) -> str:
    """Replace each occurrence of *ioc* in *text* with a fresh ``defang(ioc, rng)``.

    ``defang`` is one of the ``defang_*`` callables. Occurrences are found
    left to right without overlap (the same matching ``str.replace`` uses),
    but the replacement is re-sampled for every hit so repeated IOCs do not
    all share one style. An empty *ioc* is ignored.
    """
    if not ioc:
        return text
    head, *rest = text.split(ioc)
    pieces = [head]
    for segment in rest:
        pieces.append(defang(ioc, rng))
        pieces.append(segment)
    return "".join(pieces)


def _longest_first(iocs: List[str]) -> List[str]:
    """Deduplicate *iocs* and order them longest first, ties broken by value.

    Longest-first keeps a short IOC from clobbering a longer one that
    contains it. The secondary sort on the string itself matters for
    reproducibility: set iteration order depends on string hashing, which
    may differ between interpreter runs, and the order decides which IOC
    consumes which RNG draw.
    """
    return sorted(set(iocs), key=lambda s: (-len(s), s))


def defang_text(
    text: str,
    ips: List[str],
    domains: List[str],
    urls: List[str],
    rng: Optional[random.Random] = None,
) -> str:
    """Defang every occurrence of the given IOCs inside a free-text body.

    URLs are replaced before domains (URLs contain domain substrings, so
    domain-first would corrupt the URL match). Likewise IPs last. Each
    occurrence picks its own dot-style independently — real advisories don't
    keep one style consistent across paragraphs.

    Args:
        text: Free-text body containing canonical IOCs.
        ips: Canonical IP strings to defang wherever they appear.
        domains: Canonical domain strings to defang wherever they appear.
        urls: Canonical URL strings to defang wherever they appear.
        rng: Source of randomness; a fresh unseeded ``random.Random`` is
            used when omitted. With a seeded instance the result is a pure
            function of the arguments.

    Returns:
        *text* with every listed IOC occurrence replaced by a defanged form.
        Text that matches none of the IOCs is returned unchanged.
    """
    r = _rng(rng)
    out = text
    passes = (
        (urls, defang_url),
        (domains, defang_domain),
        (ips, defang_ip),
    )
    for iocs, defang in passes:
        for ioc in _longest_first(iocs):
            out = _defang_each(out, ioc, defang, r)
    return out

View File

@@ -15,6 +15,7 @@ restricted source types, never empty input/output.
from __future__ import annotations from __future__ import annotations
import json import json
import random
import re import re
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
@@ -24,10 +25,18 @@ from pydantic import BaseModel, Field
from psyc import DATA_DIR, log from psyc import DATA_DIR, log
from psyc.lines import classify as classify_line from psyc.lines import classify as classify_line
from psyc.lines import defang as defang_line
from psyc.lines import route as route_line from psyc.lines import route as route_line
from psyc.models import Case, TLP from psyc.models import Case, TLP
class BuildOptions(BaseModel):
    """Per-build configuration. Currently only ioc_extraction reads any field."""

    # Fraction of ioc_extraction inputs to defang. Constrained to [0.0, 1.0] so
    # an out-of-range value is rejected at construction instead of silently
    # behaving like 1.0 (anything above 1) or 0.0 (anything negative).
    defang_frac: float = Field(0.0, ge=0.0, le=1.0)
    # Seed handed to random.Random for the build; None leaves the RNG unseeded.
    seed: Optional[int] = None
_log = log.get(__name__) _log = log.get(__name__)
DATASETS_DIR = DATA_DIR / "datasets" DATASETS_DIR = DATA_DIR / "datasets"
@@ -60,7 +69,11 @@ class DatasetReport(BaseModel):
# ---------- ExampleBuilder per task --------------------------------------- # ---------- ExampleBuilder per task ---------------------------------------
def _ex_ioc_extraction(case: Case) -> Optional[Example]: def _ex_ioc_extraction(
case: Case,
options: Optional["BuildOptions"] = None,
rng: Optional[random.Random] = None,
) -> Optional[Example]:
obs = case.observables obs = case.observables
if not (obs.urls or obs.domains or obs.ips or obs.hashes or obs.cves): if not (obs.urls or obs.domains or obs.ips or obs.hashes or obs.cves):
return None return None
@@ -81,6 +94,13 @@ def _ex_ioc_extraction(case: Case) -> Optional[Example]:
body.append("Related CVEs: " + ", ".join(obs.cves) + ".") body.append("Related CVEs: " + ", ".join(obs.cves) + ".")
if tags: if tags:
body.append(f"Tags: {tags}.") body.append(f"Tags: {tags}.")
body_text = " ".join(body)
# Defanging augmentation: with probability options.defang_frac, replace IOCs
# in the input with common real-world defanged forms (1[.]2[.]3[.]4,
# hxxp://, etc.). Output stays canonical so the model learns the mapping.
if options is not None and rng is not None and options.defang_frac > 0.0:
if rng.random() < options.defang_frac:
body_text = defang_line.defang_text(body_text, obs.ips, obs.domains, obs.urls, rng)
output_obj = { output_obj = {
"urls": obs.urls, "urls": obs.urls,
"domains": obs.domains, "domains": obs.domains,
@@ -90,7 +110,7 @@ def _ex_ioc_extraction(case: Case) -> Optional[Example]:
} }
return Example( return Example(
instruction="Extract all indicators of compromise from the advisory and return JSON with keys: urls, domains, ips, hashes, cves.", instruction="Extract all indicators of compromise from the advisory and return JSON with keys: urls, domains, ips, hashes, cves.",
input=" ".join(body), input=body_text,
output=json.dumps(output_obj, ensure_ascii=False), output=json.dumps(output_obj, ensure_ascii=False),
task="ioc_extraction", task="ioc_extraction",
case_id=case.case_id, case_id=case.case_id,
@@ -119,7 +139,11 @@ def severity_features(case: Case) -> Dict[str, object]:
} }
def _ex_severity_classification(case: Case) -> Optional[Example]: def _ex_severity_classification(
case: Case,
options: Optional["BuildOptions"] = None,
rng: Optional[random.Random] = None,
) -> Optional[Example]:
if case.classification.severity is None: if case.classification.severity is None:
return None return None
return Example( return Example(
@@ -132,7 +156,11 @@ def _ex_severity_classification(case: Case) -> Optional[Example]:
) )
def _ex_routing_decision(case: Case) -> Optional[Example]: def _ex_routing_decision(
case: Case,
options: Optional["BuildOptions"] = None,
rng: Optional[random.Random] = None,
) -> Optional[Example]:
if case.classification.incident_type is None: if case.classification.incident_type is None:
return None return None
routes, blocked = route_line.plan(case) routes, blocked = route_line.plan(case)
@@ -158,7 +186,11 @@ def _ex_routing_decision(case: Case) -> Optional[Example]:
) )
def _ex_tlp_assignment(case: Case) -> Optional[Example]: def _ex_tlp_assignment(
case: Case,
options: Optional["BuildOptions"] = None,
rng: Optional[random.Random] = None,
) -> Optional[Example]:
input_obj = { input_obj = {
"source_type": case.source_type, "source_type": case.source_type,
"incident_type": case.classification.incident_type.value if case.classification.incident_type else None, "incident_type": case.classification.incident_type.value if case.classification.incident_type else None,
@@ -217,10 +249,12 @@ def _next_version(task: str) -> int:
return (max(used) + 1) if used else 1 return (max(used) + 1) if used else 1
def build(task: str, cases: Iterable[Case]) -> DatasetReport: def build(task: str, cases: Iterable[Case], options: Optional[BuildOptions] = None) -> DatasetReport:
if task not in _BUILDERS: if task not in _BUILDERS:
raise ValueError(f"unknown task: {task}; choices: {sorted(_BUILDERS)}") raise ValueError(f"unknown task: {task}; choices: {sorted(_BUILDERS)}")
builder = _BUILDERS[task] builder = _BUILDERS[task]
options = options or BuildOptions()
rng = random.Random(options.seed)
version = _next_version(task) version = _next_version(task)
path = DATASETS_DIR / f"{task}-v{version}.jsonl" path = DATASETS_DIR / f"{task}-v{version}.jsonl"
written = 0 written = 0
@@ -230,7 +264,7 @@ def build(task: str, cases: Iterable[Case]) -> DatasetReport:
skipped_empty = 0 skipped_empty = 0
with path.open("w", encoding="utf-8") as fh: with path.open("w", encoding="utf-8") as fh:
for case in cases: for case in cases:
example = builder(case) example = builder(case, options, rng)
if example is None: if example is None:
skipped_empty += 1 skipped_empty += 1
continue continue

71
tests/test_defang.py Normal file
View File

@@ -0,0 +1,71 @@
"""Defanging — IOC obfuscation styles for training-data augmentation."""
from __future__ import annotations
import json
import random
from psyc.lines.defang import defang_domain, defang_ip, defang_text, defang_url
from psyc.lines.train import BuildOptions, _ex_ioc_extraction
from conftest import make_case
def test_defang_ip_breaks_canonical_form():
    """defang_ip rewrites every dot of the IP using one single dot style."""
    styles = ("[.]", "(.)", "[dot]", "{.}")
    out = defang_ip("1.2.3.4", random.Random(0))
    assert "1.2.3.4" not in out  # canonical IP substring no longer appears
    # Pin the exact shape: all four octets kept in order, every dot replaced,
    # and the same style used throughout. Substring checks such as
    # `"1" in out` would also pass for truncated or mixed-style output.
    assert out in {"1.2.3.4".replace(".", style) for style in styles}
def test_defang_domain_preserves_label_text():
    """Each label of the domain survives defanging; the dotted form does not."""
    canonical = "evil.example.com"
    defanged = defang_domain(canonical, random.Random(1))
    for label in ("evil", "example", "com"):
        assert label in defanged
    assert canonical not in defanged  # canonical domain broken
def test_defang_url_defangs_protocol_and_breaks_canonical_form():
    """The scheme becomes hxxp:// and the canonical host no longer appears."""
    defanged = defang_url("http://evil.example.com/payload.bin", random.Random(2))
    assert defanged.startswith("hxxp://")  # protocol defanged
    # Neither the live scheme nor the dotted host may survive.
    for canonical_piece in ("http://", "evil.example.com"):
        assert canonical_piece not in defanged
def test_defang_url_handles_https():
    """An https:// URL comes back with the hxxps:// scheme."""
    defanged = defang_url("https://evil.com/x", random.Random(0))
    assert defanged.startswith("hxxps://")
def test_defang_text_substitutes_every_listed_ioc():
    """defang_text removes every canonical IOC but leaves the prose intact."""
    body = "See URL http://1.2.3.4/x and IP 1.2.3.4 and domain evil.com please."
    defanged = defang_text(
        body,
        ips=["1.2.3.4"],
        domains=["evil.com"],
        urls=["http://1.2.3.4/x"],
        rng=random.Random(3),
    )
    # No canonical IOC string should remain anywhere in the corrupted body.
    for canonical in ("http://", "1.2.3.4", "evil.com"):
        assert canonical not in defanged
    # Surrounding prose is preserved.
    for prose in ("See URL", "please"):
        assert prose in defanged
def test_ioc_extraction_with_defang_frac_1_corrupts_input_only():
    """With defang_frac=1.0 the input is defanged while the output stays canonical."""
    options = BuildOptions(defang_frac=1.0, seed=42)
    case = make_case(
        feed="urlhaus",
        urls=["http://1.2.3.4/x"],
        domains=["1.2.3.4"],
        ips=["1.2.3.4"],
    )
    ex = _ex_ioc_extraction(case, options, random.Random(options.seed))
    assert ex is not None
    # Input has been defanged.
    for canonical in ("1.2.3.4", "http://"):
        assert canonical not in ex.input
    # Output stays canonical so the model learns the inverse mapping.
    decoded = json.loads(ex.output)
    assert "1.2.3.4" in decoded["ips"]
    assert "http://1.2.3.4/x" in decoded["urls"]
def test_ioc_extraction_with_defang_frac_0_is_canonical():
    """With defang_frac=0.0 the example input keeps its canonical IOCs."""
    options = BuildOptions(defang_frac=0.0, seed=0)
    case = make_case(feed="urlhaus", urls=["http://1.2.3.4/x"], ips=["1.2.3.4"])
    ex = _ex_ioc_extraction(case, options, random.Random(0))
    assert ex is not None
    # No defanging → input keeps the canonical IOCs.
    for canonical in ("http://1.2.3.4/x", "1.2.3.4"):
        assert canonical in ex.input