From b4c66c2e87d0e4a24614897815cb3c36eef7ef5f Mon Sep 17 00:00:00 2001 From: m17hr1l Date: Sun, 17 May 2026 18:09:37 +0200 Subject: [PATCH] stage-3e: well-posed ioc_extraction dataset + clearer /train page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ioc_extraction ExampleBuilder now embeds every IOC into the advisory text so the extraction task is answerable from the input (v1 asked the model to "extract" a URL that was never given). /train page distinguishes trained / training… / not-started, and renders a per-step loss bar chart. Dockerfile no longer bakes the training script — scripts/ is mounted at run time so edits take effect without a 21 GB rebuild (this is why psyc-v2's loss capture was silently skipped on its first run). Co-Authored-By: Claude Opus 4.7 --- Dockerfile.train | 25 ++++++++++---------- README.md | 25 +++++++++++--------- src/psyc/cockpit/static/cockpit.css | 8 +++++++ src/psyc/cockpit/templates/train.html | 26 +++++++++++++-------- src/psyc/lines/train.py | 33 +++++++++++++++++++++++---- 5 files changed, 80 insertions(+), 37 deletions(-) diff --git a/Dockerfile.train b/Dockerfile.train index 115a51b..03d94ad 100644 --- a/Dockerfile.train +++ b/Dockerfile.train @@ -3,19 +3,20 @@ # Build: # docker build -t psyc-trainer -f Dockerfile.train . # -# Run (24 GB GPU, mounts host data/ for datasets + adapter output): -# docker run --gpus all --rm \ -# -v $(pwd)/data:/data \ -# psyc-trainer \ -# --dataset /data/datasets/ioc_extraction-v1.jsonl \ -# --dataset /data/datasets/severity_classification-v1.jsonl \ -# --dataset /data/datasets/routing_decision-v1.jsonl \ -# --dataset /data/datasets/tlp_assignment-v1.jsonl \ -# --output /data/adapters/psyc-v1 +# Run (24 GB GPU; mounts host data/ + scripts/ so script edits need no rebuild): +# docker run --gpus all --rm --entrypoint python \ +# -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \ +# psyc-trainer /scripts/train_qlora.py \ +# --dataset /data/datasets/ioc_extraction-v2.jsonl \ +# --dataset /data/datasets/severity_classification-v2.jsonl \ +# --dataset /data/datasets/routing_decision-v2.jsonl \ +# --dataset /data/datasets/tlp_assignment-v2.jsonl \ +# --output /data/adapters/psyc-v2 # # Base image already ships Python 3.11 + torch 2.6 + CUDA 12.4 + cuDNN9, so # there is no apt step and no torch download. Qwen3.5 needs transformers v5 — -# unsloth pulls it automatically. +# unsloth pulls it automatically. The training/eval scripts are MOUNTED at run +# time (not baked in) so editing scripts/*.py never needs an image rebuild. FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel @@ -27,6 +28,6 @@ RUN pip install --upgrade pip && \ pip install unsloth unsloth_zoo trl datasets WORKDIR /workspace -COPY scripts/train_qlora.py /workspace/train_qlora.py -ENTRYPOINT ["python", "/workspace/train_qlora.py"] +# Scripts are mounted at run time (-v $(pwd)/scripts:/scripts), never baked in. +ENTRYPOINT ["python"] diff --git a/README.md b/README.md index 28b7ac1..b6033da 100644 --- a/README.md +++ b/README.md @@ -124,15 +124,15 @@ To fine-tune Qwen3.5-4B with QLoRA in an NVIDIA Docker container: # 2. build the training image (pytorch 2.6/CUDA 12.4 base + unsloth + Qwen3.5) docker build -t psyc-trainer -f Dockerfile.train . -# 3. fine-tune (mount host data/ so adapters land there) -docker run --gpus all --rm \ - -v $(pwd)/data:/data \ - psyc-trainer \ - --dataset /data/datasets/ioc_extraction-v1.jsonl \ - --dataset /data/datasets/severity_classification-v1.jsonl \ - --dataset /data/datasets/routing_decision-v1.jsonl \ - --dataset /data/datasets/tlp_assignment-v1.jsonl \ - --output /data/adapters/psyc-v1 +# 3. fine-tune — scripts/ + data/ are mounted, so script edits need no rebuild +docker run --gpus all --rm --entrypoint python \ + -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \ + psyc-trainer /scripts/train_qlora.py \ + --dataset /data/datasets/ioc_extraction-v2.jsonl \ + --dataset /data/datasets/severity_classification-v2.jsonl \ + --dataset /data/datasets/routing_decision-v2.jsonl \ + --dataset /data/datasets/tlp_assignment-v2.jsonl \ + --output /data/adapters/psyc-v2 ``` Defaults target a 24 GB consumer GPU (3090/4090): `unsloth/Qwen3.5-4B` at 4-bit, @@ -150,10 +150,13 @@ docker run --gpus all --rm \ --entrypoint python \ -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \ psyc-trainer /scripts/eval_adapter.py \ - --adapter /data/adapters/psyc-v1/final \ - --dataset /data/datasets/ioc_extraction-v1.jsonl --n 5 + --adapter /data/adapters/psyc-v2/final \ + --dataset /data/datasets/ioc_extraction-v2.jsonl --n 5 ``` +The cockpit `/train` page lists every built dataset and trained adapter with +its base model, hyperparameters, dataset provenance, and a per-step loss chart. + ## Status Day 2 of a 48h build. Shipped: Scoutline (URLhaus) → Classifyline → Mapline diff --git a/src/psyc/cockpit/static/cockpit.css b/src/psyc/cockpit/static/cockpit.css index 5a43309..7b24933 100644 --- a/src/psyc/cockpit/static/cockpit.css +++ b/src/psyc/cockpit/static/cockpit.css @@ -94,3 +94,11 @@ tr.sev-low .sev-badge { color: var(--muted); } .card dt { color: var(--muted); } .card dd { margin: 0; } .card ul { margin: 0; padding-left: 18px; font-size: 13px; } + +/* training loss chart (Trainline /train page) */ +.loss-chart { display: flex; flex-direction: column; gap: 6px; margin-top: 8px; } +.loss-row { display: grid; grid-template-columns: 130px 1fr 72px; align-items: center; gap: 10px; font-size: 12px; } +.loss-step { color: var(--muted); } +.loss-bar-track { background: var(--panel-2); border: 1px solid var(--border); border-radius: 3px; height: 16px; overflow: hidden; } +.loss-bar { display: block; height: 100%; background: linear-gradient(90deg, var(--accent), var(--green)); } +.loss-val { text-align: right; color: var(--text); } diff --git a/src/psyc/cockpit/templates/train.html b/src/psyc/cockpit/templates/train.html index b90d3b3..8082194 100644 --- a/src/psyc/cockpit/templates/train.html +++ b/src/psyc/cockpit/templates/train.html @@ -38,26 +38,34 @@
{% for a in adapters %}
-

{{ a.name }}{% if a.has_adapter %} trained{% else %} incomplete{% endif %}

+

{{ a.name }} + {% if a.status == 'trained' %}trained + {% elif a.status == 'in_progress' %} + {% else %}not started{% endif %} +

Base model
{{ a.base_model }}
Examples
{{ a.examples }}
Epochs
{{ a.epochs }}
LoRA r
{{ a.lora_r }}
Learning rate
{{ a.lr }}
-
Final train loss
{% if a.train_loss is not none %}{{ '%.4f'|format(a.train_loss) }}{% else %}— (trained before loss capture){% endif %}
+
Final train loss
{% if a.train_loss is not none %}{{ '%.4f'|format(a.train_loss) }}{% else %}— (trained before loss capture){% endif %}
Datasets
{% for ds in a.datasets %}{{ ds }} {% endfor %}{% if not a.datasets %}—{% endif %}
{% if a.loss_history %} -

Loss by step

- - - +

Training loss by step

+ {% set max_loss = a.loss_history | map(attribute='loss') | max %} +
{% for h in a.loss_history %} -
+
+ step {{ h.step }} · ep {{ h.epoch | round(0, 'floor') | int }} + + {{ '%.4f'|format(h.loss) }} +
{% endfor %} - -
StepEpochLoss
{{ h.step }}{{ h.epoch }}{{ '%.4f'|format(h.loss) }}
+
+ {% else %} +

No per-step loss recorded for this run.

{% endif %}
{% endfor %} diff --git a/src/psyc/lines/train.py b/src/psyc/lines/train.py index 25d509b..1a534e0 100644 --- a/src/psyc/lines/train.py +++ b/src/psyc/lines/train.py @@ -64,8 +64,23 @@ def _ex_ioc_extraction(case: Case) -> Optional[Example]: obs = case.observables if not (obs.urls or obs.domains or obs.ips or obs.hashes): return None - summary_or_threat = case.summary or case.source_metadata.get("threat", "") - input_text = f"Advisory: {summary_or_threat}\nSource: {case.source_ref or case.source_type}" + threat = case.source_metadata.get("threat", "malware") + tags = case.source_metadata.get("tags", "") + # The extraction task is only well-posed if every IOC in the output also + # appears in the input — so build the advisory body from the observables. + body = [f"Threat advisory — {threat}."] + if obs.urls: + body.append("Malicious URLs: " + ", ".join(obs.urls) + ".") + if obs.domains: + body.append("Domains: " + ", ".join(obs.domains) + ".") + if obs.ips: + body.append("Hosting IPs: " + ", ".join(obs.ips) + ".") + if obs.hashes: + body.append("Sample hashes: " + ", ".join(obs.hashes) + ".") + if obs.cves: + body.append("Related CVEs: " + ", ".join(obs.cves) + ".") + if tags: + body.append(f"Tags: {tags}.") output_obj = { "urls": obs.urls, "domains": obs.domains, @@ -74,8 +89,8 @@ def _ex_ioc_extraction(case: Case) -> Optional[Example]: "cves": obs.cves, } return Example( - instruction="Extract all indicators of compromise (URLs, domains, IPs, hashes, CVEs) from the advisory. Return JSON with keys: urls, domains, ips, hashes, cves.", - input=input_text, + instruction="Extract all indicators of compromise from the advisory and return JSON with keys: urls, domains, ips, hashes, cves.", + input=" ".join(body), output=json.dumps(output_obj, ensure_ascii=False), task="ioc_extraction", case_id=case.case_id, @@ -248,6 +263,14 @@ def list_datasets() -> List[Dict[str, str]]: return out +def _adapter_status(d: Path) -> str: + if (d / "final" / "adapter_model.safetensors").exists(): + return "trained" + if (d / "checkpoints").exists(): + return "in_progress" + return "not_started" + + def list_adapters() -> List[Dict[str, object]]: if not ADAPTERS_DIR.exists(): return [] @@ -261,7 +284,7 @@ def list_adapters() -> List[Dict[str, object]]: meta = json.loads(meta_path.read_text(encoding="utf-8")) out.append({ "name": d.name, - "has_adapter": (d / "final" / "adapter_model.safetensors").exists(), + "status": _adapter_status(d), "base_model": meta.get("base_model", "—"), "examples": meta.get("examples", 0), "epochs": meta.get("epochs", 0),