# psyc/tests/test_defang.py

"""Defanging — IOC obfuscation styles for training-data augmentation."""

from __future__ import annotations

import json
import random

from psyc.lines.defang import defang_domain, defang_ip, defang_text, defang_url
from psyc.lines.train import BuildOptions, _ex_ioc_extraction
from conftest import make_case


def test_defang_ip_breaks_canonical_form():
    """Check that ``defang_ip`` breaks the dotted form but keeps every octet.

    A seeded ``random.Random`` is passed in so the result is reproducible
    from run to run.
    """
    canonical = "1.2.3.4"
    out = defang_ip(canonical, random.Random(0))
    assert canonical not in out  # canonical IP substring no longer appears
    # Every octet must survive defanging, not only the first and last one.
    missing = [octet for octet in canonical.split(".") if octet not in out]
    assert not missing, f"octets {missing} lost in {out!r}"
    # At least one of the known separator obfuscations must be present.
    assert any(form in out for form in ("[.]", "(.)", "[dot]", "{.}"))


def test_defang_domain_preserves_label_text():
    """Each label of the domain survives, while the dotted spelling does not."""
    domain = "evil.example.com"
    defanged = defang_domain(domain, random.Random(1))
    for label in ("evil", "example", "com"):
        assert label in defanged
    # The canonical dot-joined form must no longer appear.
    assert domain not in defanged


def test_defang_url_defangs_protocol_and_breaks_canonical_form():
    """An ``http`` URL comes back with an ``hxxp`` scheme and a broken host."""
    rng = random.Random(2)
    defanged = defang_url("http://evil.example.com/payload.bin", rng)
    # Scheme is rewritten to its defanged spelling.
    assert defanged.startswith("hxxp://")
    # Neither the live scheme nor the canonical host may remain.
    for canonical in ("http://", "evil.example.com"):
        assert canonical not in defanged


def test_defang_url_handles_https():
    """An ``https`` URL is expected to start with ``hxxps://`` after defanging."""
    rng = random.Random(0)
    defanged = defang_url("https://evil.com/x", rng)
    assert defanged.startswith("hxxps://")


def test_defang_text_substitutes_every_listed_ioc():
    """All listed IOCs vanish from the text; the surrounding prose stays put."""
    ip, domain, url = "1.2.3.4", "evil.com", "http://1.2.3.4/x"
    body = "See URL http://1.2.3.4/x and IP 1.2.3.4 and domain evil.com please."
    corrupted = defang_text(
        body,
        ips=[ip],
        domains=[domain],
        urls=[url],
        rng=random.Random(3),
    )
    # None of the canonical IOC spellings may be left in the corrupted body.
    for canonical in ("http://", ip, domain):
        assert canonical not in corrupted
    # Words around the IOCs are untouched.
    for fragment in ("See URL", "please"):
        assert fragment in corrupted


def test_ioc_extraction_with_defang_frac_1_corrupts_input_only():
    """With ``defang_frac=1.0`` the input is defanged; the output is not."""
    ip = "1.2.3.4"
    url = "http://1.2.3.4/x"
    case = make_case(feed="urlhaus", urls=[url], domains=[ip], ips=[ip])
    opts = BuildOptions(defang_frac=1.0, seed=42)
    example = _ex_ioc_extraction(case, opts, random.Random(opts.seed))
    assert example is not None
    # The prompt side must not carry any canonical IOC spelling.
    for canonical in (ip, "http://"):
        assert canonical not in example.input
    # The target side keeps canonical values so the model learns the inverse mapping.
    decoded = json.loads(example.output)
    assert ip in decoded["ips"]
    assert url in decoded["urls"]


def test_ioc_extraction_with_defang_frac_0_is_canonical():
    """With ``defang_frac=0.0`` the input still contains the canonical IOCs."""
    ip = "1.2.3.4"
    url = "http://1.2.3.4/x"
    case = make_case(feed="urlhaus", urls=[url], ips=[ip])
    opts = BuildOptions(defang_frac=0.0, seed=0)
    example = _ex_ioc_extraction(case, opts, random.Random(0))
    assert example is not None
    # Defanging is disabled, so both canonical spellings must appear verbatim.
    for canonical in (url, ip):
        assert canonical in example.input