From b4c66c2e87d0e4a24614897815cb3c36eef7ef5f Mon Sep 17 00:00:00 2001
From: m17hr1l <m17hr1l@wehackforyou.com>
Date: Sun, 17 May 2026 18:09:37 +0200
Subject: [PATCH] stage-3e: well-posed ioc_extraction dataset + clearer /train
 page
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ioc_extraction ExampleBuilder now embeds every IOC into the advisory text so
the extraction task is answerable from the input (v1 asked the model to
"extract" a URL that was never given). /train page distinguishes trained /
training… / not started, and renders a per-step loss bar chart. Dockerfile no
longer bakes the training script — scripts/ is mounted at run time so edits
take effect without a 21 GB rebuild (a stale baked-in copy of the script is
why psyc-v2's loss capture was silently skipped on its first run).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 Dockerfile.train                      | 25 ++++++++++----------
 README.md                             | 25 +++++++++++---------
 src/psyc/cockpit/static/cockpit.css   |  8 +++++++
 src/psyc/cockpit/templates/train.html | 26 +++++++++++++--------
 src/psyc/lines/train.py               | 33 +++++++++++++++++++++++----
 5 files changed, 80 insertions(+), 37 deletions(-)

diff --git a/Dockerfile.train b/Dockerfile.train
index 115a51b..03d94ad 100644
--- a/Dockerfile.train
+++ b/Dockerfile.train
@@ -3,19 +3,20 @@
 # Build:
 #   docker build -t psyc-trainer -f Dockerfile.train .
 #
-# Run (24 GB GPU, mounts host data/ for datasets + adapter output):
-#   docker run --gpus all --rm \
-#       -v $(pwd)/data:/data \
-#       psyc-trainer \
-#       --dataset /data/datasets/ioc_extraction-v1.jsonl \
-#       --dataset /data/datasets/severity_classification-v1.jsonl \
-#       --dataset /data/datasets/routing_decision-v1.jsonl \
-#       --dataset /data/datasets/tlp_assignment-v1.jsonl \
-#       --output /data/adapters/psyc-v1
+# Run (24 GB GPU; mounts host data/ + scripts/ so script edits need no rebuild):
+#   docker run --gpus all --rm --entrypoint python \
+#       -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \
+#       psyc-trainer /scripts/train_qlora.py \
+#       --dataset /data/datasets/ioc_extraction-v2.jsonl \
+#       --dataset /data/datasets/severity_classification-v2.jsonl \
+#       --dataset /data/datasets/routing_decision-v2.jsonl \
+#       --dataset /data/datasets/tlp_assignment-v2.jsonl \
+#       --output /data/adapters/psyc-v2
 #
 # Base image already ships Python 3.11 + torch 2.6 + CUDA 12.4 + cuDNN9, so
 # there is no apt step and no torch download. Qwen3.5 needs transformers v5 —
-# unsloth pulls it automatically.
+# unsloth pulls it automatically. The training/eval scripts are MOUNTED at run
+# time (not baked in) so editing scripts/*.py never needs an image rebuild.
 
 FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
 
@@ -27,6 +28,6 @@ RUN pip install --upgrade pip && \
     pip install unsloth unsloth_zoo trl datasets
 
 WORKDIR /workspace
-COPY scripts/train_qlora.py /workspace/train_qlora.py
 
-ENTRYPOINT ["python", "/workspace/train_qlora.py"]
+# Scripts are mounted at run time (-v $(pwd)/scripts:/scripts), never baked in.
+ENTRYPOINT ["python"]
diff --git a/README.md b/README.md
index 28b7ac1..b6033da 100644
--- a/README.md
+++ b/README.md
@@ -124,15 +124,15 @@ To fine-tune Qwen3.5-4B with QLoRA in an NVIDIA Docker container:
 # 2. build the training image (pytorch 2.6/CUDA 12.4 base + unsloth + Qwen3.5)
 docker build -t psyc-trainer -f Dockerfile.train .
 
-# 3. fine-tune (mount host data/ so adapters land there)
-docker run --gpus all --rm \
-    -v $(pwd)/data:/data \
-    psyc-trainer \
-    --dataset /data/datasets/ioc_extraction-v1.jsonl \
-    --dataset /data/datasets/severity_classification-v1.jsonl \
-    --dataset /data/datasets/routing_decision-v1.jsonl \
-    --dataset /data/datasets/tlp_assignment-v1.jsonl \
-    --output /data/adapters/psyc-v1
+# 3. fine-tune — scripts/ + data/ are mounted, so script edits need no rebuild
+docker run --gpus all --rm --entrypoint python \
+    -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \
+    psyc-trainer /scripts/train_qlora.py \
+    --dataset /data/datasets/ioc_extraction-v2.jsonl \
+    --dataset /data/datasets/severity_classification-v2.jsonl \
+    --dataset /data/datasets/routing_decision-v2.jsonl \
+    --dataset /data/datasets/tlp_assignment-v2.jsonl \
+    --output /data/adapters/psyc-v2
 ```
 
 Defaults target a 24 GB consumer GPU (3090/4090): `unsloth/Qwen3.5-4B` at 4-bit,
@@ -150,10 +150,13 @@ docker run --gpus all --rm \
     --entrypoint python \
     -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \
     psyc-trainer /scripts/eval_adapter.py \
-    --adapter /data/adapters/psyc-v1/final \
-    --dataset /data/datasets/ioc_extraction-v1.jsonl --n 5
+    --adapter /data/adapters/psyc-v2/final \
+    --dataset /data/datasets/ioc_extraction-v2.jsonl --n 5
 ```
 
+The cockpit `/train` page lists every built dataset, plus every adapter (trained,
+training, or not started) with its base model, hyperparameters, dataset provenance, and — when recorded — a per-step loss chart.
+
 ## Status
 
 Day 2 of a 48h build. Shipped: Scoutline (URLhaus) → Classifyline → Mapline
diff --git a/src/psyc/cockpit/static/cockpit.css b/src/psyc/cockpit/static/cockpit.css
index 5a43309..7b24933 100644
--- a/src/psyc/cockpit/static/cockpit.css
+++ b/src/psyc/cockpit/static/cockpit.css
@@ -94,3 +94,11 @@ tr.sev-low .sev-badge { color: var(--muted); }
 .card dt { color: var(--muted); }
 .card dd { margin: 0; }
 .card ul { margin: 0; padding-left: 18px; font-size: 13px; }
+
+/* training loss chart (Trainline /train page) */
+.loss-chart { display: flex; flex-direction: column; gap: 6px; margin-top: 8px; }
+.loss-row { display: grid; grid-template-columns: 130px 1fr 72px; align-items: center; gap: 10px; font-size: 12px; }
+.loss-step { color: var(--muted); }
+.loss-bar-track { background: var(--panel-2); border: 1px solid var(--border); border-radius: 3px; height: 16px; overflow: hidden; }
+.loss-bar { display: block; height: 100%; background: linear-gradient(90deg, var(--accent), var(--green)); }
+.loss-val { text-align: right; color: var(--text); }
diff --git a/src/psyc/cockpit/templates/train.html b/src/psyc/cockpit/templates/train.html
index b90d3b3..8082194 100644
--- a/src/psyc/cockpit/templates/train.html
+++ b/src/psyc/cockpit/templates/train.html
@@ -38,26 +38,34 @@
   <div class="grid">
     {% for a in adapters %}
     <div class="card wide">
-      <h2>{{ a.name }}{% if a.has_adapter %} <span class="outcome-badge outcome-actioned">trained</span>{% else %} <span class="outcome-badge outcome-rejected">incomplete</span>{% endif %}</h2>
+      <h2>{{ a.name }}
+        {% if a.status == 'trained' %}<span class="outcome-badge outcome-actioned">trained</span>
+        {% elif a.status == 'in_progress' %}<span class="outcome-badge outcome-submitted">training…</span>
+        {% else %}<span class="outcome-badge outcome-rejected">not started</span>{% endif %}
+      </h2>
       <dl>
         <dt>Base model</dt><dd><code>{{ a.base_model }}</code></dd>
         <dt>Examples</dt><dd>{{ a.examples }}</dd>
         <dt>Epochs</dt><dd>{{ a.epochs }}</dd>
         <dt>LoRA r</dt><dd>{{ a.lora_r }}</dd>
         <dt>Learning rate</dt><dd>{{ a.lr }}</dd>
-        <dt>Final train loss</dt><dd>{% if a.train_loss is not none %}{{ '%.4f'|format(a.train_loss) }}{% else %}<span class="muted">— (trained before loss capture)</span>{% endif %}</dd>
+        <dt>Final train loss</dt><dd>{% if a.train_loss is not none %}<strong>{{ '%.4f'|format(a.train_loss) }}</strong>{% else %}<span class="muted">— (trained before loss capture)</span>{% endif %}</dd>
         <dt>Datasets</dt><dd>{% for ds in a.datasets %}<code>{{ ds }}</code> {% endfor %}{% if not a.datasets %}—{% endif %}</dd>
       </dl>
       {% if a.loss_history %}
-      <h3>Loss by step</h3>
-      <table class="cases">
-        <thead><tr><th>Step</th><th>Epoch</th><th>Loss</th></tr></thead>
-        <tbody>
+      <h3>Training loss by step</h3>
+      {% set max_loss = a.loss_history | map(attribute='loss') | max %}
+      <div class="loss-chart">
         {% for h in a.loss_history %}
-          <tr><td>{{ h.step }}</td><td class="muted">{{ h.epoch }}</td><td>{{ '%.4f'|format(h.loss) }}</td></tr>
+        <div class="loss-row">
+          <span class="loss-step">step {{ h.step }} · ep {{ h.epoch | round(0, 'floor') | int }}</span>
+          <span class="loss-bar-track"><span class="loss-bar" style="width: {{ (h.loss / max_loss * 100) | round(1) }}%"></span></span>
+          <span class="loss-val">{{ '%.4f'|format(h.loss) }}</span>
+        </div>
         {% endfor %}
-        </tbody>
-      </table>
+      </div>
+      {% else %}
+      <p class="muted">No per-step loss recorded for this run.</p>
       {% endif %}
     </div>
     {% endfor %}
diff --git a/src/psyc/lines/train.py b/src/psyc/lines/train.py
index 25d509b..1a534e0 100644
--- a/src/psyc/lines/train.py
+++ b/src/psyc/lines/train.py
@@ -64,8 +64,23 @@ def _ex_ioc_extraction(case: Case) -> Optional[Example]:
     obs = case.observables
     if not (obs.urls or obs.domains or obs.ips or obs.hashes):
         return None
-    summary_or_threat = case.summary or case.source_metadata.get("threat", "")
-    input_text = f"Advisory: {summary_or_threat}\nSource: {case.source_ref or case.source_type}"
+    threat = case.source_metadata.get("threat", "malware")
+    tags = case.source_metadata.get("tags", "")
+    # The extraction task is only well-posed if every IOC in the output also
+    # appears in the input — so build the advisory body from the observables.
+    body = [f"Threat advisory — {threat}."]
+    if obs.urls:
+        body.append("Malicious URLs: " + ", ".join(obs.urls) + ".")
+    if obs.domains:
+        body.append("Domains: " + ", ".join(obs.domains) + ".")
+    if obs.ips:
+        body.append("Hosting IPs: " + ", ".join(obs.ips) + ".")
+    if obs.hashes:
+        body.append("Sample hashes: " + ", ".join(obs.hashes) + ".")
+    if obs.cves:
+        body.append("Related CVEs: " + ", ".join(obs.cves) + ".")
+    if tags:
+        body.append(f"Tags: {tags}.")
     output_obj = {
         "urls": obs.urls,
         "domains": obs.domains,
@@ -74,8 +89,8 @@ def _ex_ioc_extraction(case: Case) -> Optional[Example]:
         "cves": obs.cves,
     }
     return Example(
-        instruction="Extract all indicators of compromise (URLs, domains, IPs, hashes, CVEs) from the advisory. Return JSON with keys: urls, domains, ips, hashes, cves.",
-        input=input_text,
+        instruction="Extract all indicators of compromise from the advisory and return JSON with keys: urls, domains, ips, hashes, cves.",
+        input=" ".join(body),
         output=json.dumps(output_obj, ensure_ascii=False),
         task="ioc_extraction",
         case_id=case.case_id,
@@ -248,6 +263,14 @@ def list_datasets() -> List[Dict[str, str]]:
     return out
 
 
+def _adapter_status(d: Path) -> str:
+    if (d / "final" / "adapter_model.safetensors").exists():
+        return "trained"
+    if (d / "checkpoints").exists():
+        return "in_progress"
+    return "not_started"
+
+
 def list_adapters() -> List[Dict[str, object]]:
     if not ADAPTERS_DIR.exists():
         return []
@@ -261,7 +284,7 @@ def list_adapters() -> List[Dict[str, object]]:
             meta = json.loads(meta_path.read_text(encoding="utf-8"))
         out.append({
             "name": d.name,
-            "has_adapter": (d / "final" / "adapter_model.safetensors").exists(),
+            "status": _adapter_status(d),
             "base_model": meta.get("base_model", "—"),
             "examples": meta.get("examples", 0),
             "epochs": meta.get("epochs", 0),