stage-3e: well-posed ioc_extraction dataset + clearer /train page
ioc_extraction ExampleBuilder now embeds every IOC into the advisory text so the extraction task is answerable from the input (v1 asked the model to "extract" a URL that was never given). The /train page now distinguishes trained / training… / not-started adapters and renders a per-step loss bar chart. The Dockerfile no longer bakes in the training script — scripts/ is mounted at run time so edits take effect without a 21 GB rebuild (the stale baked-in script is why psyc-v2's loss capture was silently skipped on its first run). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -94,3 +94,11 @@ tr.sev-low .sev-badge { color: var(--muted); }
|
||||
.card dt { color: var(--muted); }
|
||||
.card dd { margin: 0; }
|
||||
.card ul { margin: 0; padding-left: 18px; font-size: 13px; }
|
||||
|
||||
/* training loss chart (Trainline /train page) */
|
||||
.loss-chart { display: flex; flex-direction: column; gap: 6px; margin-top: 8px; }
|
||||
.loss-row { display: grid; grid-template-columns: 130px 1fr 72px; align-items: center; gap: 10px; font-size: 12px; }
|
||||
.loss-step { color: var(--muted); }
|
||||
.loss-bar-track { background: var(--panel-2); border: 1px solid var(--border); border-radius: 3px; height: 16px; overflow: hidden; }
|
||||
.loss-bar { display: block; height: 100%; background: linear-gradient(90deg, var(--accent), var(--green)); }
|
||||
.loss-val { text-align: right; color: var(--text); }
|
||||
|
||||
@@ -38,26 +38,34 @@
|
||||
<div class="grid">
|
||||
{% for a in adapters %}
|
||||
<div class="card wide">
|
||||
<h2>{{ a.name }}{% if a.has_adapter %} <span class="outcome-badge outcome-actioned">trained</span>{% else %} <span class="outcome-badge outcome-rejected">incomplete</span>{% endif %}</h2>
|
||||
<h2>{{ a.name }}
|
||||
{% if a.status == 'trained' %}<span class="outcome-badge outcome-actioned">trained</span>
|
||||
{% elif a.status == 'in_progress' %}<span class="outcome-badge outcome-submitted">training…</span>
|
||||
{% else %}<span class="outcome-badge outcome-rejected">not started</span>{% endif %}
|
||||
</h2>
|
||||
<dl>
|
||||
<dt>Base model</dt><dd><code>{{ a.base_model }}</code></dd>
|
||||
<dt>Examples</dt><dd>{{ a.examples }}</dd>
|
||||
<dt>Epochs</dt><dd>{{ a.epochs }}</dd>
|
||||
<dt>LoRA r</dt><dd>{{ a.lora_r }}</dd>
|
||||
<dt>Learning rate</dt><dd>{{ a.lr }}</dd>
|
||||
<dt>Final train loss</dt><dd>{% if a.train_loss is not none %}{{ '%.4f'|format(a.train_loss) }}{% else %}<span class="muted">— (trained before loss capture)</span>{% endif %}</dd>
|
||||
<dt>Final train loss</dt><dd>{% if a.train_loss is not none %}<strong>{{ '%.4f'|format(a.train_loss) }}</strong>{% else %}<span class="muted">— (trained before loss capture)</span>{% endif %}</dd>
|
||||
<dt>Datasets</dt><dd>{% for ds in a.datasets %}<code>{{ ds }}</code> {% endfor %}{% if not a.datasets %}—{% endif %}</dd>
|
||||
</dl>
|
||||
{% if a.loss_history %}
|
||||
<h3>Loss by step</h3>
|
||||
<table class="cases">
|
||||
<thead><tr><th>Step</th><th>Epoch</th><th>Loss</th></tr></thead>
|
||||
<tbody>
|
||||
<h3>Training loss by step</h3>
|
||||
{% set max_loss = a.loss_history | map(attribute='loss') | max %}
|
||||
<div class="loss-chart">
|
||||
{% for h in a.loss_history %}
|
||||
<tr><td>{{ h.step }}</td><td class="muted">{{ h.epoch }}</td><td>{{ '%.4f'|format(h.loss) }}</td></tr>
|
||||
<div class="loss-row">
|
||||
<span class="loss-step">step {{ h.step }} · ep {{ h.epoch | round(0, 'floor') | int }}</span>
|
||||
<span class="loss-bar-track"><span class="loss-bar" style="width: {{ (h.loss / max_loss * 100) | round(1) }}%"></span></span>
|
||||
<span class="loss-val">{{ '%.4f'|format(h.loss) }}</span>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
{% else %}
|
||||
<p class="muted">No per-step loss recorded for this run.</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
|
||||
@@ -64,8 +64,23 @@ def _ex_ioc_extraction(case: Case) -> Optional[Example]:
|
||||
obs = case.observables
|
||||
if not (obs.urls or obs.domains or obs.ips or obs.hashes):
|
||||
return None
|
||||
summary_or_threat = case.summary or case.source_metadata.get("threat", "")
|
||||
input_text = f"Advisory: {summary_or_threat}\nSource: {case.source_ref or case.source_type}"
|
||||
threat = case.source_metadata.get("threat", "malware")
|
||||
tags = case.source_metadata.get("tags", "")
|
||||
# The extraction task is only well-posed if every IOC in the output also
|
||||
# appears in the input — so build the advisory body from the observables.
|
||||
body = [f"Threat advisory — {threat}."]
|
||||
if obs.urls:
|
||||
body.append("Malicious URLs: " + ", ".join(obs.urls) + ".")
|
||||
if obs.domains:
|
||||
body.append("Domains: " + ", ".join(obs.domains) + ".")
|
||||
if obs.ips:
|
||||
body.append("Hosting IPs: " + ", ".join(obs.ips) + ".")
|
||||
if obs.hashes:
|
||||
body.append("Sample hashes: " + ", ".join(obs.hashes) + ".")
|
||||
if obs.cves:
|
||||
body.append("Related CVEs: " + ", ".join(obs.cves) + ".")
|
||||
if tags:
|
||||
body.append(f"Tags: {tags}.")
|
||||
output_obj = {
|
||||
"urls": obs.urls,
|
||||
"domains": obs.domains,
|
||||
@@ -74,8 +89,8 @@ def _ex_ioc_extraction(case: Case) -> Optional[Example]:
|
||||
"cves": obs.cves,
|
||||
}
|
||||
return Example(
|
||||
instruction="Extract all indicators of compromise (URLs, domains, IPs, hashes, CVEs) from the advisory. Return JSON with keys: urls, domains, ips, hashes, cves.",
|
||||
input=input_text,
|
||||
instruction="Extract all indicators of compromise from the advisory and return JSON with keys: urls, domains, ips, hashes, cves.",
|
||||
input=" ".join(body),
|
||||
output=json.dumps(output_obj, ensure_ascii=False),
|
||||
task="ioc_extraction",
|
||||
case_id=case.case_id,
|
||||
@@ -248,6 +263,14 @@ def list_datasets() -> List[Dict[str, str]]:
|
||||
return out
|
||||
|
||||
|
||||
def _adapter_status(d: Path) -> str:
|
||||
if (d / "final" / "adapter_model.safetensors").exists():
|
||||
return "trained"
|
||||
if (d / "checkpoints").exists():
|
||||
return "in_progress"
|
||||
return "not_started"
|
||||
|
||||
|
||||
def list_adapters() -> List[Dict[str, object]]:
|
||||
if not ADAPTERS_DIR.exists():
|
||||
return []
|
||||
@@ -261,7 +284,7 @@ def list_adapters() -> List[Dict[str, object]]:
|
||||
meta = json.loads(meta_path.read_text(encoding="utf-8"))
|
||||
out.append({
|
||||
"name": d.name,
|
||||
"has_adapter": (d / "final" / "adapter_model.safetensors").exists(),
|
||||
"status": _adapter_status(d),
|
||||
"base_model": meta.get("base_model", "—"),
|
||||
"examples": meta.get("examples", 0),
|
||||
"epochs": meta.get("epochs", 0),
|
||||
|
||||
Reference in New Issue
Block a user