r"""Evaluate a psyc QLoRA adapter — run held-out dataset rows through the model.

Run inside the psyc training container (override the entrypoint):

    docker run --gpus all --rm --entrypoint python \
        -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \
        psyc-trainer /scripts/eval_adapter.py \
        --adapter /data/adapters/psyc-v1/final \
        --dataset /data/datasets/ioc_extraction-v1.jsonl --n 5

Sanity check, not a benchmark: for `--n` rows it prints the prompt, the model's
generation, and the dataset's reference output side by side. With a tiny dataset
the model has seen these rows, so this verifies the adapter learned the output
FORMAT and task shape — not generalization.
"""
from __future__ import annotations

import argparse
import json
import re
from itertools import islice
from pathlib import Path
from typing import Dict, List

# One thinking-mode block plus any whitespace that trails it. DOTALL lets the
# lazy `.*?` span newlines inside the block; compiled once at import time.
_THINK_BLOCK = re.compile(r"<think>.*?</think>\s*", flags=re.DOTALL)


def strip_think(text: str) -> str:
    """Drop Qwen3.5 thinking-mode blocks so exact-match compares the answer only.

    Every ``<think>...</think>`` span (and the whitespace right after it) is
    removed, then the result is stripped of leading/trailing whitespace. Text
    outside such spans — including its internal whitespace — is left untouched.

    Args:
        text: Decoded model generation.

    Returns:
        The generation without thinking blocks, stripped at both ends.
    """
    return _THINK_BLOCK.sub("", text).strip()


def load_examples(path: Path, n: int) -> List[Dict[str, str]]:
    """Read up to ``n`` JSON objects from the non-blank lines of a JSONL file.

    Reading stops as soon as ``n`` rows have been parsed, so later lines are
    never decoded (or validated).

    Args:
        path: JSONL file, opened as UTF-8.
        n: Maximum number of rows to return; zero or negative yields ``[]``.

    Returns:
        The parsed rows, in file order.

    Raises:
        json.JSONDecodeError: If one of the consumed lines is not valid JSON.
        OSError: If the file cannot be opened.
    """
    with path.open("r", encoding="utf-8") as fh:
        non_blank = (stripped for stripped in map(str.strip, fh) if stripped)
        # islice rejects negative stops, hence the clamp.
        return [json.loads(row) for row in islice(non_blank, max(n, 0))]


def main() -> None:
    """CLI entry point: generate for the first ``--n`` rows and print a comparison.

    For each row the prompt, the reference output and the model generation are
    printed (each truncated to 600 characters), followed by an overall
    exact-match count.

    Raises:
        SystemExit: If no examples could be loaded from ``--dataset``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the docker example in the module docstring laid out as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--adapter", required=True, help="path to adapter final/ dir")
    # NOTE(review): --base-model is parsed but never read below; kept so
    # existing command lines keep working.
    parser.add_argument("--base-model", default="unsloth/Qwen3.5-4B")
    parser.add_argument("--dataset", required=True, help="JSONL to sample test rows from")
    parser.add_argument("--n", type=int, default=5)
    parser.add_argument("--max-seq-length", type=int, default=4096)
    parser.add_argument("--max-new-tokens", type=int, default=256)
    args = parser.parse_args()

    examples = load_examples(Path(args.dataset), args.n)
    if not examples:
        raise SystemExit("no examples loaded")

    # unsloth must be imported BEFORE transformers. Nothing above imports
    # transformers, so deferring the import to this point keeps that ordering
    # while letting --help, bad arguments and an empty dataset fail fast
    # without loading the GPU stack.
    from unsloth import FastLanguageModel  # noqa: I001

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.adapter,
        max_seq_length=args.max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)

    total = len(examples)
    correct = 0
    for i, ex in enumerate(examples, 1):
        prompt = f"{ex['instruction']}\n\n{ex['input']}"
        messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            enable_thinking=False,
        ).to(model.device)
        out = model.generate(
            input_ids=inputs,
            max_new_tokens=args.max_new_tokens,
            do_sample=False,
        )
        # Decode only what follows the prompt: skip the first inputs.shape[1]
        # positions of the returned sequence.
        new_tokens = out[0][inputs.shape[1]:]
        generated = strip_think(tokenizer.decode(new_tokens, skip_special_tokens=True))
        expected = ex["output"].strip()
        match = generated == expected
        correct += int(match)

        verdict = "MATCH" if match else "DIFF"
        print(f"\n===== example {i}/{total} [{ex.get('task', '?')}] {verdict} =====")
        print(f"-- prompt --\n{prompt[:600]}")
        print(f"-- expected --\n{expected[:600]}")
        print(f"-- generated --\n{generated[:600]}")

    print(f"\n[psyc-eval] exact-match {correct}/{total}")


if __name__ == "__main__":
    main()