Files
psyc/tests/test_defang.py
m17hr1l f6fa52839f stage-20: defanging pipeline for IOC-extraction augmentation
Real CTI prose defangs IOCs (1[.]2[.]3[.]4, hxxp://, evil[dot]com) so they
don't auto-link in email/chat. A model trained only on canonical inputs
will fail to extract them.

New module lines/defang.py: defang_ip, defang_domain, defang_url, defang_text —
four dot-styles ([.], (.), [dot], {.}) plus protocol defanging
(http→hxxp, https→hxxps). Each occurrence picks its style independently
since real advisories don't keep one style across paragraphs.

train.BuildOptions adds defang_frac (default 0.0) and seed; build()
threads options + a seeded Random through the example builders so
the augmentation is reproducible. Only _ex_ioc_extraction reads it
today — output stays canonical so the model learns messy→canonical.

CLI: train-build and train-build-all gain --defang-frac and --seed.
8 new tests including a frac=1.0 / output-canonical integration check.
The pipeline runs but is dormant at defang_frac=0.0 — psyc-v5 dataset
build will set 0.5 once OTX cases land.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 22:33:52 +02:00

72 lines
2.8 KiB
Python

"""Defanging — IOC obfuscation styles for training-data augmentation."""
from __future__ import annotations
import json
import random
from psyc.lines.defang import defang_domain, defang_ip, defang_text, defang_url
from psyc.lines.train import BuildOptions, _ex_ioc_extraction
from conftest import make_case
def test_defang_ip_breaks_canonical_form():
    """defang_ip output must not contain the canonical dotted-quad."""
    canonical = "1.2.3.4"
    defanged = defang_ip(canonical, random.Random(0))

    # The canonical IP substring no longer appears verbatim.
    assert canonical not in defanged
    # The digits "1" and "4" are still present in the output.
    for digit in ("1", "4"):
        assert digit in defanged
    # At least one of the four supported dot styles shows up.
    dot_styles = ("[.]", "(.)", "[dot]", "{.}")
    assert any(style in defanged for style in dot_styles)
def test_defang_domain_preserves_label_text():
    """Each domain label survives defanging; the dotted form does not."""
    canonical = "evil.example.com"
    defanged = defang_domain(canonical, random.Random(1))

    # Every label is still readable in the defanged output.
    assert all(label in defanged for label in ("evil", "example", "com"))
    # The canonical domain string is broken up.
    assert canonical not in defanged
def test_defang_url_defangs_protocol_and_breaks_canonical_form():
    """An http URL gets an hxxp:// scheme and a defanged host part."""
    rng = random.Random(2)
    defanged = defang_url("http://evil.example.com/payload.bin", rng)

    # Protocol is defanged: the result begins with the hxxp scheme.
    assert defanged.startswith("hxxp://")
    # Neither the live scheme nor the canonical host may remain.
    for forbidden in ("http://", "evil.example.com"):
        assert forbidden not in defanged
def test_defang_url_handles_https():
    """An https URL is rewritten so that it starts with hxxps://."""
    defanged = defang_url("https://evil.com/x", random.Random(0))
    assert defanged.startswith("hxxps://")
def test_defang_text_substitutes_every_listed_ioc():
    """defang_text removes every listed canonical IOC but keeps the prose."""
    body = "See URL http://1.2.3.4/x and IP 1.2.3.4 and domain evil.com please."
    defanged = defang_text(
        body,
        ips=["1.2.3.4"],
        domains=["evil.com"],
        urls=["http://1.2.3.4/x"],
        rng=random.Random(3),
    )

    # No canonical IOC string should remain anywhere in the corrupted body.
    for canonical in ("http://", "1.2.3.4", "evil.com"):
        assert canonical not in defanged

    # Surrounding prose is preserved.
    for fragment in ("See URL", "please"):
        assert fragment in defanged
def test_ioc_extraction_with_defang_frac_1_corrupts_input_only():
    """With defang_frac=1.0 the input is defanged and the output stays canonical."""
    ip = "1.2.3.4"
    url = "http://1.2.3.4/x"

    case = make_case(feed="urlhaus", urls=[url], domains=[ip], ips=[ip])
    options = BuildOptions(defang_frac=1.0, seed=42)
    example = _ex_ioc_extraction(case, options, random.Random(options.seed))
    assert example is not None

    # Input has been defanged: neither the IP nor a live scheme remains.
    for canonical in (ip, "http://"):
        assert canonical not in example.input

    # Output stays canonical so the model learns the inverse mapping.
    parsed = json.loads(example.output)
    assert ip in parsed["ips"]
    assert url in parsed["urls"]
def test_ioc_extraction_with_defang_frac_0_is_canonical():
    """With defang_frac=0.0 the example input keeps the canonical IOCs."""
    url = "http://1.2.3.4/x"
    ip = "1.2.3.4"

    case = make_case(feed="urlhaus", urls=[url], ips=[ip])
    options = BuildOptions(defang_frac=0.0, seed=0)
    example = _ex_ioc_extraction(case, options, random.Random(0))
    assert example is not None

    # No defanging → both canonical strings appear verbatim in the input.
    for canonical in (url, ip):
        assert canonical in example.input