"""Defanging — IOC obfuscation styles for training-data augmentation."""

from __future__ import annotations

import json
import random

from psyc.lines.defang import defang_domain, defang_ip, defang_text, defang_url
from psyc.lines.train import BuildOptions, _ex_ioc_extraction

from conftest import make_case


def test_defang_ip_breaks_canonical_form():
    """A defanged IP keeps its digits but not its dotted-quad spelling."""
    bracket_forms = ("[.]", "(.)", "[dot]", "{.}")
    defanged = defang_ip("1.2.3.4", random.Random(0))

    # The canonical IP substring must no longer appear.
    assert "1.2.3.4" not in defanged
    # Digits are preserved.
    assert "1" in defanged
    assert "4" in defanged
    # At least one of the known separator replacements is present.
    assert any(form in defanged for form in bracket_forms)


def test_defang_domain_preserves_label_text():
    """A defanged domain still contains every label, just not contiguously."""
    defanged = defang_domain("evil.example.com", random.Random(1))

    for label in ("evil", "example", "com"):
        assert label in defanged
    # The canonical domain is broken.
    assert "evil.example.com" not in defanged


def test_defang_url_defangs_protocol_and_breaks_canonical_form():
    """A defanged URL gets an ``hxxp`` scheme and a defanged host."""
    defanged = defang_url("http://evil.example.com/payload.bin", random.Random(2))

    # Protocol is defanged.
    assert defanged.startswith("hxxp://")
    assert "http://" not in defanged
    # Host part is defanged.
    assert "evil.example.com" not in defanged


def test_defang_url_handles_https():
    """The ``https`` scheme is rewritten to ``hxxps``."""
    defanged = defang_url("https://evil.com/x", random.Random(0))
    assert defanged.startswith("hxxps://")


def test_defang_text_substitutes_every_listed_ioc():
    """Every IOC passed to ``defang_text`` is replaced; other prose is kept."""
    text = "See URL http://1.2.3.4/x and IP 1.2.3.4 and domain evil.com please."
    corrupted = defang_text(
        text,
        ips=["1.2.3.4"],
        domains=["evil.com"],
        urls=["http://1.2.3.4/x"],
        rng=random.Random(3),
    )

    # No canonical IOC string should remain anywhere in the corrupted body.
    assert "http://" not in corrupted
    assert "1.2.3.4" not in corrupted
    assert "evil.com" not in corrupted
    # Surrounding prose is preserved.
    assert "See URL" in corrupted
    assert "please" in corrupted


def test_ioc_extraction_with_defang_frac_1_corrupts_input_only():
    """With ``defang_frac=1.0`` the input is defanged and the output is not."""
    case = make_case(
        feed="urlhaus",
        urls=["http://1.2.3.4/x"],
        domains=["1.2.3.4"],
        ips=["1.2.3.4"],
    )
    options = BuildOptions(defang_frac=1.0, seed=42)
    rng = random.Random(options.seed)

    example = _ex_ioc_extraction(case, options, rng)
    assert example is not None

    # Input has been defanged.
    assert "1.2.3.4" not in example.input
    assert "http://" not in example.input

    # Output stays canonical so the model learns the inverse mapping.
    parsed = json.loads(example.output)
    assert "1.2.3.4" in parsed["ips"]
    assert "http://1.2.3.4/x" in parsed["urls"]


def test_ioc_extraction_with_defang_frac_0_is_canonical():
    """With ``defang_frac=0.0`` the input keeps its canonical IOCs."""
    case = make_case(feed="urlhaus", urls=["http://1.2.3.4/x"], ips=["1.2.3.4"])
    options = BuildOptions(defang_frac=0.0, seed=0)
    rng = random.Random(0)

    example = _ex_ioc_extraction(case, options, rng)
    assert example is not None

    # No defanging → input keeps the canonical IOCs.
    assert "http://1.2.3.4/x" in example.input
    assert "1.2.3.4" in example.input