diff --git a/Dockerfile.train b/Dockerfile.train index 03d94ad..21db2a2 100644 --- a/Dockerfile.train +++ b/Dockerfile.train @@ -27,6 +27,9 @@ ENV PYTHONUNBUFFERED=1 \ RUN pip install --upgrade pip && \ pip install unsloth unsloth_zoo trl datasets +# fastapi + uvicorn power scripts/serve_model.py (the inference server). +RUN pip install fastapi uvicorn + WORKDIR /workspace # Scripts are mounted at run time (-v $(pwd)/scripts:/scripts), never baked in. diff --git a/scripts/serve_model.py b/scripts/serve_model.py new file mode 100644 index 0000000..ec98145 --- /dev/null +++ b/scripts/serve_model.py @@ -0,0 +1,88 @@ +"""psyc model inference server — loads a psyc adapter once, serves /infer over HTTP. + +Run inside the CUDA container (keeps the model resident, serves many requests): + docker run --gpus all --rm -p 8771:8771 --entrypoint python \ + -v $(pwd)/data:/data -v $(pwd)/scripts:/scripts \ + psyc-trainer /scripts/serve_model.py --adapter /data/adapters/psyc-v4/final + +The cockpit (which has no torch) calls this over HTTP to put a real fine-tuned +model behind a Worker Mesh bot. +""" + +from __future__ import annotations + +# unsloth must be imported BEFORE transformers. 
from unsloth import FastLanguageModel  # noqa: I001

import argparse
import re

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel, Field

# Name of the tag that delimits a thinking-mode block. The pattern below is
# assembled from this constant (instead of spelling the tags out literally) so
# the delimiters survive tooling that strips tag-like text from source files.
_THINK_TAG = "think"

# One thinking block: opening tag, any text (newlines included via DOTALL),
# closing tag, plus the whitespace that follows it. The quantifier is
# non-greedy so two separate blocks are removed individually and the text
# between them is kept.
_THINK_BLOCK_RE = re.compile(
    rf"<{_THINK_TAG}>.*?</{_THINK_TAG}>\s*",
    flags=re.DOTALL,
)


def strip_think(text: str) -> str:
    """Drop Qwen3.5 thinking-mode blocks so the caller gets just the answer.

    Only text enclosed in the think tags (and whitespace trailing the closing
    tag) is removed; everything else, including the spaces between words, is
    preserved. A pattern without the tag delimiters would match every
    whitespace run and glue the words of the answer together.

    Args:
        text: Decoded model output, possibly containing thinking blocks.

    Returns:
        ``text`` without thinking blocks, stripped of leading and trailing
        whitespace.
    """
    return _THINK_BLOCK_RE.sub("", text).strip()


class InferRequest(BaseModel):
    """Request body accepted by ``POST /infer``."""

    # First part of the prompt.
    instruction: str
    # Second part of the prompt, appended after a blank line.
    input: str
    # Passed to ``model.generate`` as ``max_new_tokens``. A non-positive
    # budget cannot yield any output, so it is rejected during request
    # validation rather than being forwarded to the model.
    max_new_tokens: int = Field(default=256, gt=0)


class InferResponse(BaseModel):
    """Response body returned by ``POST /infer``."""

    # Decoded completion after ``strip_think``.
    output: str
    # The adapter identifier the app was built with.
    adapter: str


def build_app(model: object, tokenizer: object, adapter: str) -> FastAPI:
    """Create the FastAPI app exposing ``/healthz`` and ``/infer``.

    Args:
        model: Loaded model; must provide ``.device`` and ``.generate``.
        tokenizer: Matching tokenizer; must provide ``apply_chat_template``
            and ``decode``.
        adapter: Adapter identifier echoed back in every response.

    Returns:
        The configured application. ``model`` and ``tokenizer`` are captured
        by the route closures, so a single loaded model serves all requests.
    """
    app = FastAPI(title="psyc inference server", version="0.1.0")

    @app.get("/healthz")
    def healthz() -> dict:
        return {"status": "ok", "adapter": adapter}

    # NOTE(review): this is a sync handler, which FastAPI runs in a worker
    # thread pool, so overlapping requests may call ``generate`` concurrently
    # on the one shared model — confirm that is acceptable or serialize access.
    @app.post("/infer", response_model=InferResponse)
    def infer(req: InferRequest) -> InferResponse:
        prompt = f"{req.instruction}\n\n{req.input}"
        messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            # Thinking is switched off in the template; strip_think() below
            # still cleans up any block that shows up in the output.
            enable_thinking=False,
        ).to(model.device)
        out = model.generate(
            input_ids=inputs,
            max_new_tokens=req.max_new_tokens,
            do_sample=False,  # sampling disabled
        )
        # Skip the first ``inputs.shape[1]`` positions (the prompt) so only
        # the newly generated tokens are decoded.
        prompt_len = inputs.shape[1]
        completion_ids = out[0][prompt_len:]
        generated = strip_think(tokenizer.decode(completion_ids, skip_special_tokens=True))
        return InferResponse(output=generated, adapter=adapter)

    return app


def main() -> None:
    """Parse the CLI flags, load the adapter once, and serve it with uvicorn."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--adapter", required=True, help="path to adapter final/ dir")
    # NOTE(review): --base-model is parsed but never read below; only the
    # adapter path is handed to from_pretrained. Presumably the base model is
    # resolved from the adapter directory — confirm, then wire up or retire
    # this flag. It is kept so existing command lines keep working.
    parser.add_argument("--base-model", default="unsloth/Qwen3.5-4B")
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8771)
    parser.add_argument("--max-seq-length", type=int, default=4096)
    args = parser.parse_args()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.adapter,
        max_seq_length=args.max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )
    FastLanguageModel.for_inference(model)
    app = build_app(model, tokenizer, args.adapter)
    print(f"[psyc-serve] model ready — adapter {args.adapter}, listening on {args.host}:{args.port}")
    uvicorn.run(app, host=args.host, port=args.port, log_level="warning")


if __name__ == "__main__":
    main()