diff --git a/Dockerfile.train b/Dockerfile.train index 115a51b..03d94ad 100644 --- a/Dockerfile.train +++ b/Dockerfile.train @@ -3,19 +3,20 @@ # Build: # docker build -t psyc-trainer -f Dockerfile.train . # -# Run (24 GB GPU, mounts host data/ for datasets + adapter output): -# docker run --gpus all --rm \ -# -v $(pwd)/data:/data \ -# psyc-trainer \ -# --dataset /data/datasets/ioc_extraction-v1.jsonl \ -# --dataset /data/datasets/severity_classification-v1.jsonl \ -# --dataset /data/datasets/routing_decision-v1.jsonl \ -# --dataset /data/datasets/tlp_assignment-v1.jsonl \ -# --output /data/adapters/psyc-v1 +# Run (24 GB GPU; mounts host data/ + scripts/ so script edits need no rebuild): +# docker run --gpus all --rm --entrypoint python \ +# -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \ +# psyc-trainer /scripts/train_qlora.py \ +# --dataset /data/datasets/ioc_extraction-v2.jsonl \ +# --dataset /data/datasets/severity_classification-v2.jsonl \ +# --dataset /data/datasets/routing_decision-v2.jsonl \ +# --dataset /data/datasets/tlp_assignment-v2.jsonl \ +# --output /data/adapters/psyc-v2 # # Base image already ships Python 3.11 + torch 2.6 + CUDA 12.4 + cuDNN9, so # there is no apt step and no torch download. Qwen3.5 needs transformers v5 — -# unsloth pulls it automatically. +# unsloth pulls it automatically. The training/eval scripts are MOUNTED at run +# time (not baked in) so editing scripts/*.py never needs an image rebuild. FROM pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel @@ -27,6 +28,6 @@ RUN pip install --upgrade pip && \ pip install unsloth unsloth_zoo trl datasets WORKDIR /workspace -COPY scripts/train_qlora.py /workspace/train_qlora.py -ENTRYPOINT ["python", "/workspace/train_qlora.py"] +# Scripts are mounted at run time (-v $(pwd)/scripts:/scripts), never baked in. +ENTRYPOINT ["python"] diff --git a/README.md b/README.md index 28b7ac1..b6033da 100644 --- a/README.md +++ b/README.md @@ -124,15 +124,15 @@ To fine-tune Qwen3.5-4B with QLoRA in an NVIDIA Docker container: # 2. build the training image (pytorch 2.6/CUDA 12.4 base + unsloth + Qwen3.5) docker build -t psyc-trainer -f Dockerfile.train . -# 3. fine-tune (mount host data/ so adapters land there) -docker run --gpus all --rm \ - -v $(pwd)/data:/data \ - psyc-trainer \ - --dataset /data/datasets/ioc_extraction-v1.jsonl \ - --dataset /data/datasets/severity_classification-v1.jsonl \ - --dataset /data/datasets/routing_decision-v1.jsonl \ - --dataset /data/datasets/tlp_assignment-v1.jsonl \ - --output /data/adapters/psyc-v1 +# 3. fine-tune — scripts/ + data/ are mounted, so script edits need no rebuild +docker run --gpus all --rm --entrypoint python \ + -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \ + psyc-trainer /scripts/train_qlora.py \ + --dataset /data/datasets/ioc_extraction-v2.jsonl \ + --dataset /data/datasets/severity_classification-v2.jsonl \ + --dataset /data/datasets/routing_decision-v2.jsonl \ + --dataset /data/datasets/tlp_assignment-v2.jsonl \ + --output /data/adapters/psyc-v2 ``` Defaults target a 24 GB consumer GPU (3090/4090): `unsloth/Qwen3.5-4B` at 4-bit, @@ -150,10 +150,13 @@ docker run --gpus all --rm \ --entrypoint python \ -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \ psyc-trainer /scripts/eval_adapter.py \ - --adapter /data/adapters/psyc-v1/final \ - --dataset /data/datasets/ioc_extraction-v1.jsonl --n 5 + --adapter /data/adapters/psyc-v2/final \ + --dataset /data/datasets/ioc_extraction-v2.jsonl --n 5 ``` +The cockpit `/train` page lists every built dataset and trained adapter with +its base model, hyperparameters, dataset provenance, and a per-step loss chart. + ## Status Day 2 of a 48h build. Shipped: Scoutline (URLhaus) → Classifyline → Mapline diff --git a/src/psyc/cockpit/static/cockpit.css b/src/psyc/cockpit/static/cockpit.css index 5a43309..7b24933 100644 --- a/src/psyc/cockpit/static/cockpit.css +++ b/src/psyc/cockpit/static/cockpit.css @@ -94,3 +94,11 @@ tr.sev-low .sev-badge { color: var(--muted); } .card dt { color: var(--muted); } .card dd { margin: 0; } .card ul { margin: 0; padding-left: 18px; font-size: 13px; } + +/* training loss chart (Trainline /train page) */ +.loss-chart { display: flex; flex-direction: column; gap: 6px; margin-top: 8px; } +.loss-row { display: grid; grid-template-columns: 130px 1fr 72px; align-items: center; gap: 10px; font-size: 12px; } +.loss-step { color: var(--muted); } +.loss-bar-track { background: var(--panel-2); border: 1px solid var(--border); border-radius: 3px; height: 16px; overflow: hidden; } +.loss-bar { display: block; height: 100%; background: linear-gradient(90deg, var(--accent), var(--green)); } +.loss-val { text-align: right; color: var(--text); } diff --git a/src/psyc/cockpit/templates/train.html b/src/psyc/cockpit/templates/train.html index b90d3b3..8082194 100644 --- a/src/psyc/cockpit/templates/train.html +++ b/src/psyc/cockpit/templates/train.html @@ -38,26 +38,34 @@
{{ a.base_model }}{{ ds }} {% endfor %}{% if not a.datasets %}—{% endif %}| Step | Epoch | Loss |
|---|---|---|
| {{ h.step }} | {{ h.epoch }} | {{ '%.4f'|format(h.loss) }} |
No per-step loss recorded for this run.
{% endif %}