#!/usr/bin/env node /** * Build the LoRA training corpus. * * node scripts/build-corpus.mjs * * Sources, in order of priority: * 1. scripts/extraction/framework-reference-v2.md (deep, file:line cited) * 2. src/content/docs/{en,de,ja,es,fr}/ (the public docs) * * Outputs under dist/corpus/: * - chunks.jsonl — raw chunks (one record per source chunk, no Q/A) * - instructions.jsonl — instruction/input/output triples (Alpaca-style) * - chat.jsonl — sharegpt/messages format (system+user+assistant) * - completion.jsonl — prompt/completion pairs (legacy fine-tunes) * - manifest.json — size, sha256, record count, sample preview per file * * Augmentation: per chunk, we emit 3-4 question variants (definition, * procedural, code-focused, file-pointer). Code-block recall samples are * emitted as additional records for the framework-reference source so the * model learns exact framework idioms. */ import fs from 'node:fs'; import path from 'node:path'; import crypto from 'node:crypto'; import { fileURLToPath } from 'node:url'; import { chunkFile, chunkMarkdown, walkDocs } from './lib/chunk.mjs'; const __dirname = path.dirname(fileURLToPath(import.meta.url)); const DOCS_DIR = path.resolve(__dirname, '../src/content/docs'); const REFERENCE_FILE = path.resolve(__dirname, 'extraction/framework-reference-v2.md'); // Optional research-agent augmentation. JSONL — one alpaca-style record // per line. When present, records are merged into instructions/chat/ // completion outputs alongside the templated ones. const AUGMENTATION_FILE = path.resolve(__dirname, 'extraction/lora-augmentation.jsonl'); // Write straight into public/corpus/ so Astro serves the files at // /corpus/.jsonl without a separate copy step. Gitignored. 
const OUT_DIR = path.resolve(__dirname, '../public/corpus');

// =============================================================================
// System prompts
// =============================================================================

// One system prompt per docs language, keyed by language code.
const SYSTEM_PROMPT = {
  en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
  de: 'Du bist Experte für das Nibiru-PHP-Framework. Antworte auf Basis der Dokumentation, mit konkreten Code-Beispielen und Dateipfaden, wo es hilft.',
  ja: 'あなたは Nibiru PHP フレームワークの専門家です。ドキュメントに基づいて、有用な箇所では具体的なコード例とファイルパスを示して回答してください。',
  es: 'Eres un experto en el framework PHP Nibiru. Responde basándote en la documentación, con ejemplos de código concretos y rutas de archivos donde sea útil.',
  fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
};

// The framework-reference source gets a stricter prompt: that material
// carries exact namespaces, file:line citations and the small idioms the
// model should internalise. The sentences are joined with single spaces.
const SYSTEM_PROMPT_REFERENCE = [
  'You are a senior PHP architect and Nibiru framework expert.',
  'Answers must include exact namespaces, file paths with line numbers when available,',
  'and concrete code excerpts. Never say "presumably", "likely", or "appears to"',
  '— if you do not know, say so plainly.',
].join(' ');

// =============================================================================
// Question-variant generation (deterministic, no LLM)
// =============================================================================

// Per-language phrasing pools, grouped by question kind. `{topic}` is
// replaced with the chunk title and `{topic_lc}` with its lower-cased form.
const QUESTION_TEMPLATES = {
  en: {
    definitional: ['What is {topic}?', 'Explain {topic}.', 'Tell me about {topic}.'],
    procedural: ['How do I {topic_lc}?', 'Show me how to {topic_lc}.', 'Walk me through {topic_lc}.'],
    topic: ['{topic}', '{topic} — overview', '{topic} in Nibiru'],
    filePointer: ['Where is {topic} defined?', 'Which file contains {topic}?'],
    codeFocused: ['Show me the code for {topic}.', 'Quote the {topic} implementation.'],
  },
  de: {
    definitional: ['Was ist {topic}?', 'Erkläre {topic}.', 'Was bedeutet {topic}?'],
    procedural: ['Wie {topic_lc}?', 'Zeig mir, wie {topic_lc}.', 'Wie geht {topic_lc}?'],
    topic: ['{topic}', '{topic} — Übersicht', '{topic} in Nibiru'],
    filePointer: ['Wo ist {topic} definiert?', 'Welche Datei enthält {topic}?'],
    codeFocused: ['Zeig mir den Code für {topic}.', 'Zitiere die {topic}-Implementierung.'],
  },
  ja: {
    definitional: ['{topic} とは何ですか?', '{topic} について説明してください。'],
    procedural: ['{topic} のやり方を教えてください。', '{topic} の手順を教えてください。'],
    topic: ['{topic}', '{topic} — 概要'],
    filePointer: ['{topic} はどこで定義されていますか?'],
    codeFocused: ['{topic} のコードを見せてください。'],
  },
  es: {
    definitional: ['¿Qué es {topic}?', 'Explica {topic}.'],
    procedural: ['¿Cómo {topic_lc}?', 'Muéstrame cómo {topic_lc}.'],
    topic: ['{topic}', '{topic} — visión general'],
    filePointer: ['¿Dónde se define {topic}?'],
    codeFocused: ['Muéstrame el código de {topic}.'],
  },
  fr: {
    definitional: ["Qu'est-ce que {topic} ?", 'Explique {topic}.'],
    procedural: ['Comment {topic_lc} ?', 'Montre-moi comment {topic_lc}.'],
    topic: ['{topic}', "{topic} — vue d'ensemble"],
    filePointer: ['Où est défini {topic} ?'],
    codeFocused: ['Montre-moi le code de {topic}.'],
  },
};

// Hash-pick a template
// deterministically from a kind, so two builds give
// the same corpus (necessary for reproducible LoRA training runs).

/**
 * Pick one element of `arr`, selected by the first byte of md5(seed).
 * The same (arr, seed) pair always yields the same element.
 *
 * @param {Array} arr - Candidates to choose from.
 * @param {string} seed - Text that determines the choice.
 * @returns {*} The selected element.
 */
function hashPick(arr, seed) {
  const digest = crypto.createHash('md5').update(seed).digest();
  return arr[digest[0] % arr.length];
}

/**
 * Build the question phrasings for one chunk.
 *
 * Three variants are always produced (definitional, procedural, topic).
 * A file-pointer variant is added when the content mentions a source-file
 * path, and a code-focused variant when it contains a code fence, so a
 * chunk yields between 3 and 5 questions.
 *
 * @param {{id: string, language: string, sectionTitle: string, pageTitle: string, content: string}} chunk
 * @returns {string[]} Question strings with the chunk's topic substituted in.
 */
function questionVariants(chunk) {
  // Own-property check: `in` would also accept inherited keys such as
  // "constructor" or "toString" and hand back something that is not a
  // template table. Unknown languages fall back to English.
  const lang = Object.hasOwn(QUESTION_TEMPLATES, chunk.language) ? chunk.language : 'en';
  const tpl = QUESTION_TEMPLATES[lang];

  const topic = chunk.sectionTitle || chunk.pageTitle;
  const topicLc = topic.toLowerCase();
  const seed = `${chunk.id}|${topic}`;
  const fill = (s) => s.replaceAll('{topic}', topic).replaceAll('{topic_lc}', topicLc);

  // One of each core kind, so every chunk gets at least 3 phrasings.
  const variants = [
    fill(hashPick(tpl.definitional, `${seed}|d`)),
    fill(hashPick(tpl.procedural, `${seed}|p`)),
    fill(hashPick(tpl.topic, `${seed}|t`)),
  ];

  // Add file-pointer / code-focused variants when the chunk actually
  // references a file path or contains a code block.
  if (/[a-z0-9_/.-]+\.(php|mjs|ts|tsx|astro|css|ini|sql)(:\d+)?/i.test(chunk.content)) {
    variants.push(fill(hashPick(tpl.filePointer, `${seed}|f`)));
  }
  if (/```/.test(chunk.content)) {
    variants.push(fill(hashPick(tpl.codeFocused, `${seed}|c`)));
  }
  return variants;
}

// =============================================================================
// Code-block extraction
// =============================================================================

// Pull out fenced code blocks paired with a leading sentence as the prompt.
/**
 * Extract fenced code blocks from a chunk's markdown content.
 *
 * Each block is paired with the paragraph that directly precedes its
 * opening fence (no blank line in between), truncated to 240 characters.
 * Blocks with fewer than two lines are skipped, and a fence that is never
 * closed produces no sample.
 *
 * @param {{content: string}} chunk
 * @returns {Array<{language: string, leadIn: string, code: string}>}
 */
function extractCodeBlockSamples(chunk) {
  const samples = [];
  // Accept both LF and CRLF line endings. With a plain '\n' split, a CRLF
  // file leaves "\r" at the end of every line; `.` does not match "\r", so
  // the fence pattern below would never match and all blocks would be lost.
  const lines = chunk.content.split(/\r?\n/);

  let inFence = false;
  let fenceLang = '';
  let codeLines = [];
  let leadIn = '';
  let paragraph = [];

  for (const line of lines) {
    const fence = line.match(/^```(.*)$/);

    if (!fence) {
      if (inFence) {
        codeLines.push(line);
      } else if (line.trim() === '') {
        // A blank line ends the current paragraph.
        paragraph = [];
      } else {
        paragraph.push(line);
      }
      continue;
    }

    if (!inFence) {
      // Opening fence: remember its info string and the lead-in paragraph.
      inFence = true;
      fenceLang = fence[1].trim();
      codeLines = [];
      leadIn = paragraph.join(' ').trim().slice(0, 240);
      paragraph = [];
      continue;
    }

    // Closing fence.
    inFence = false;
    if (codeLines.length >= 2) {
      samples.push({
        language: fenceLang || 'text',
        leadIn,
        code: codeLines.join('\n'),
      });
    }
  }
  return samples;
}

/**
 * Turn one extracted code block into a question/answer pair.
 *
 * @param {{id: string, sectionTitle: string, pageTitle: string}} chunk - Chunk the block came from.
 * @param {{language: string, leadIn: string, code: string}} block - Block from extractCodeBlockSamples.
 * @param {string} lang - Language code for the question template; unknown codes use English.
 * @returns {{question: string, answer: string}}
 */
function codeBlockQA(chunk, block, lang) {
  const tpl = QUESTION_TEMPLATES[lang] || QUESTION_TEMPLATES.en;
  const topic = chunk.sectionTitle || chunk.pageTitle;
  const seed = `${chunk.id}|code|${block.code.slice(0, 32)}`;
  const question = hashPick(tpl.codeFocused, seed)
    .replaceAll('{topic}', topic)
    .replaceAll('{topic_lc}', topic.toLowerCase());

  // Answer = optional lead-in + the code block. Wrap code in fences so the
  // model learns to emit syntactically valid code blocks too.
  const ticks = '```';
  const fenced = `${ticks}${block.language || ''}\n${block.code}\n${ticks}`;
  const answer = block.leadIn ? `${block.leadIn}\n\n${fenced}` : fenced;
  return { question, answer };
}

// =============================================================================
// Source ingestion
// =============================================================================

/**
 * Chunk every file found under DOCS_DIR and tag each chunk with
 * `source: 'docs'`.
 *
 * @returns {object[]} Chunks as produced by chunkFile, plus the source tag.
 */
function ingestPublicDocs() {
  const files = walkDocs(DOCS_DIR);
  const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
  return chunks.map((c) => ({ ...c, source: 'docs' }));
}

/**
 * Chunk the framework reference markdown and tag each chunk with
 * `source: 'framework-reference-v2'`. The reference is optional: when the
 * file is missing a warning is logged and no chunks are returned.
 *
 * @returns {object[]} Chunks as produced by chunkMarkdown, plus the source tag.
 */
function ingestFrameworkReference() {
  if (!fs.existsSync(REFERENCE_FILE)) {
    console.warn(`[corpus] no framework-reference-v2 at ${REFERENCE_FILE} — skipping`);
    return [];
  }
  const raw = fs.readFileSync(REFERENCE_FILE, 'utf8');
  return chunkMarkdown(raw, {
    language: 'en',
    file: 'framework-reference-v2.md',
    pageTitle: 'Nibiru Framework Reference v2',
    pageDescription: 'Deep technical reference — every public factory, namespace, idiom and gotcha with file:line citations.',
    baseUrl: '/reference/',
  }).map((c) => ({ ...c, source: 'framework-reference-v2' }));
}

// =============================================================================
// Record assembly
// =============================================================================

/**
 * Choose the system prompt for a chunk: the strict reference prompt for
 * framework-reference chunks, otherwise the prompt for the chunk's language
 * (English when the language has no prompt).
 *
 * @param {{source: string, language: string}} chunk
 * @returns {string}
 */
function systemFor(chunk) {
  if (chunk.source === 'framework-reference-v2') return SYSTEM_PROMPT_REFERENCE;
  return SYSTEM_PROMPT[chunk.language] || SYSTEM_PROMPT.en;
}

// Read the optional research-agent augmentation. Each line is alpaca-format
// `{instruction, input, output, metadata}`. Returns [] if the file is absent.
/**
 * Load the optional augmentation records from AUGMENTATION_FILE.
 *
 * Blank lines are ignored. A line that is not valid JSON is skipped with a
 * warning that names its real line number in the file. Records without both
 * `instruction` and `output` are dropped.
 *
 * @returns {object[]} Parsed records; empty when the file does not exist.
 */
function loadAugmentation() {
  if (!fs.existsSync(AUGMENTATION_FILE)) {
    console.log(`[corpus] no augmentation file at ${AUGMENTATION_FILE} — skipping`);
    return [];
  }
  const records = [];
  // Iterate the unfiltered lines so the index is the true file line number.
  const lines = fs.readFileSync(AUGMENTATION_FILE, 'utf8').split('\n');
  for (const [index, rawLine] of lines.entries()) {
    const line = rawLine.trim();
    if (line === '') continue;
    try {
      const rec = JSON.parse(line);
      if (rec?.instruction && rec?.output) records.push(rec);
    } catch (e) {
      console.warn(`[corpus] skipping malformed augmentation line ${index + 1}: ${e.message}`);
    }
  }
  console.log(`[corpus] loaded ${records.length} augmentation records`);
  return records;
}

/**
 * Assemble every output record from the ingested chunks plus the optional
 * augmentation file.
 *
 * @param {object[]} chunks - Chunks tagged with `source` by the ingest functions.
 * @returns {{chunksOut: object[], instructionsOut: object[], chatOut: object[], completionOut: object[]}}
 *   `instructionsOut`, `chatOut` and `completionOut` always have the same
 *   length and ordering: index i in each describes the same Q/A pair.
 */
function buildRecords(chunks) {
  const chunksOut = [];
  const instructionsOut = [];
  const chatOut = [];
  const completionOut = [];

  // Fan one Q/A pair out into the three training formats. Pushing to all
  // three in one place keeps the arrays index-aligned.
  const emit = ({ system, question, input = '', answer, metadata }) => {
    instructionsOut.push({ instruction: question, input, output: answer, metadata });
    chatOut.push({
      messages: [
        { role: 'system', content: system },
        { role: 'user', content: question },
        { role: 'assistant', content: answer },
      ],
      metadata,
    });
    completionOut.push({
      prompt: `${system}\n\nQuestion: ${question}\n\nAnswer:`,
      completion: ` ${answer}`,
    });
  };

  for (const c of chunks) {
    // 1. Raw chunk record
    chunksOut.push({
      id: c.id,
      source: c.source,
      url: c.url,
      pageTitle: c.pageTitle,
      sectionTitle: c.sectionTitle,
      language: c.language,
      tokens: c.tokens,
      content: c.content,
    });

    // 2. Question-variant records: every phrasing maps to the full chunk text.
    const system = systemFor(c);
    for (const question of questionVariants(c)) {
      emit({
        system,
        question,
        answer: c.content,
        metadata: { language: c.language, source: c.url, page: c.pageTitle, origin: c.source },
      });
    }

    // 3. Code-block recall samples — only for the framework reference,
    //    where the code is gold (file:line cited, framework-canonical).
    if (c.source === 'framework-reference-v2') {
      for (const block of extractCodeBlockSamples(c)) {
        const { question, answer } = codeBlockQA(c, block, c.language);
        emit({
          system,
          question,
          answer,
          metadata: {
            language: c.language,
            source: c.url,
            page: c.pageTitle,
            origin: c.source,
            codeLanguage: block.language,
            kind: 'code-recall',
          },
        });
      }
    }
  }

  // Merge research-agent augmentation. Each input record is alpaca-style;
  // we fan it out into instructions / chat / completion to match the rest.
  const augmentation = loadAugmentation();
  for (const a of augmentation) {
    emit({
      system: SYSTEM_PROMPT_REFERENCE, // augmentation is always English, framework-grade
      question: a.instruction,
      input: a.input || '',
      answer: a.output,
      metadata: { ...(a.metadata || {}), origin: 'lora-augmentation' },
    });
  }
  if (augmentation.length) {
    console.log(`[corpus] merged ${augmentation.length} augmentation records into instructions/chat/completion`);
  }

  return { chunksOut, instructionsOut, chatOut, completionOut };
}

// =============================================================================
// IO + manifest
// =============================================================================

/** Create directory `d`, including missing parents; a no-op if it exists. */
function ensureDir(d) {
  fs.mkdirSync(d, { recursive: true });
}

/**
 * Write `items` to `filePath` as JSON Lines (one JSON document per line),
 * creating the parent directory first.
 *
 * @param {string} filePath
 * @param {Iterable<*>} items
 * @returns {Promise<void>} Resolves once the stream has closed; rejects if
 *   the stream reports an error (for example the path cannot be opened).
 */
function writeJsonl(filePath, items) {
  ensureDir(path.dirname(filePath));
  return new Promise((resolve, reject) => {
    const stream = fs.createWriteStream(filePath, { encoding: 'utf8' });
    // Without an 'error' listener a failed open/write would surface as an
    // unhandled 'error' event instead of a rejection the caller can handle.
    stream.once('error', reject);
    stream.once('close', resolve);
    for (const item of items) stream.write(`${JSON.stringify(item)}\n`);
    stream.end();
  });
}

/**
 * @param {string} filePath
 * @returns {{bytes: number, sha256: string}} File size in bytes and hex SHA-256 of its contents.
 */
function fileStats(filePath) {
  const buf = fs.readFileSync(filePath);
  return {
    bytes: buf.length,
    sha256: crypto.createHash('sha256').update(buf).digest('hex'),
  };
}

/**
 * Return the first line of the file that is not blank, cut to 800
 * characters with a trailing ellipsis when longer. Empty string if the
 * file has no such line.
 *
 * @param {string} filePath
 * @returns {string}
 */
function firstNonEmptyLine(filePath) {
  const text = fs.readFileSync(filePath, 'utf8');
  const line = text.split('\n').find((l) => l.trim().length > 0) || '';
  return line.length > 800 ? `${line.slice(0, 800)}…` : line;
}

// =============================================================================
// Main
// =============================================================================

// Language metadata — used both to bucket files and label them in the UI.
// Order matters: English first (the framework-reference is English-only and
// rolls into the en bucket), then localised docs.
const LANGUAGES = [
  { code: 'en', label: 'English' },
  { code: 'de', label: 'Deutsch' },
  { code: 'ja', label: '日本語' },
  { code: 'es', label: 'Español' },
  { code: 'fr', label: 'Français' },
];

/**
 * Group records by language code.
 *
 * The result starts with one (possibly empty) bucket per entry in
 * LANGUAGES, in that order. Records whose language is missing go to 'en';
 * a language not listed in LANGUAGES gets its own bucket appended.
 *
 * @param {object[]} records
 * @param {(record: object) => (string|undefined)} getLang - Extracts the language code from a record.
 * @returns {Map<string, object[]>} Language code → records.
 */
function bucketByLanguage(records, getLang) {
  const buckets = new Map(LANGUAGES.map(({ code }) => [code, []]));
  for (const record of records) {
    const lang = getLang(record) || 'en';
    if (!buckets.has(lang)) buckets.set(lang, []);
    buckets.get(lang).push(record);
  }
  return buckets;
}

/**
 * Run the whole build: ingest sources, assemble records, clear old output,
 * write the per-language and combined JSONL files, then write manifest.json.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(`[corpus] DOCS_DIR=${DOCS_DIR}`);
  console.log(`[corpus] REFERENCE=${REFERENCE_FILE}`);
  console.log(`[corpus] OUT_DIR=${OUT_DIR}`);

  const docsChunks = ingestPublicDocs();
  const refChunks = ingestFrameworkReference();
  const chunks = [...refChunks, ...docsChunks]; // reference first → priority
  console.log(`[corpus] ingested ${refChunks.length} reference chunks + ${docsChunks.length} docs chunks`);

  const { chunksOut, instructionsOut, chatOut, completionOut } = buildRecords(chunks);
  console.log(`[corpus] records: chunks=${chunksOut.length} instructions=${instructionsOut.length} chat=${chatOut.length} completion=${completionOut.length}`);

  ensureDir(OUT_DIR);

  // Wipe any leftover files from a previous run so stale per-language
  // buckets don't linger.
  for (const f of fs.readdirSync(OUT_DIR)) {
    if (/\.(jsonl|json)$/.test(f)) fs.unlinkSync(path.join(OUT_DIR, f));
  }

  // `completion` records carry no metadata (prompt/completion only), so
  // their language is taken from the instruction record at the same index.
  // Fail loudly if that pairing ever stops holding.
  if (completionOut.length !== instructionsOut.length) {
    throw new Error(
      `completion/instruction count mismatch: ${completionOut.length} vs ${instructionsOut.length}`,
    );
  }
  const completionLanguage = new Map(
    completionOut.map((rec, i) => [rec, instructionsOut[i].metadata?.language]),
  );

  // Per-language buckets. Each format gets one file per language plus a
  // combined `*-all.jsonl` for callers who want everything.
  const buckets = {
    chunks: bucketByLanguage(chunksOut, (r) => r.language),
    instructions: bucketByLanguage(instructionsOut, (r) => r.metadata?.language),
    chat: bucketByLanguage(chatOut, (r) => r.metadata?.language),
    completion: bucketByLanguage(completionOut, (r) => completionLanguage.get(r)),
  };

  // Only languages listed in LANGUAGES get their own file; say so when
  // records in another language exist, since they land in `*-all` only.
  const knownCodes = new Set(LANGUAGES.map((l) => l.code));
  const unlisted = new Set(
    Object.values(buckets).flatMap((m) => [...m.keys()].filter((code) => !knownCodes.has(code))),
  );
  for (const code of unlisted) {
    console.warn(`[corpus] language "${code}" is not in LANGUAGES — its records are only in the *-all files`);
  }

  const writeBucketed = async (formatName, bucketMap, allRecords) => {
    const out = [];
    // Per-language files (empty buckets produce no file)
    for (const lang of LANGUAGES) {
      const records = bucketMap.get(lang.code) || [];
      if (records.length === 0) continue;
      const filename = `${formatName}-${lang.code}.jsonl`;
      await writeJsonl(path.join(OUT_DIR, filename), records);
      out.push({
        format: formatName,
        language: lang.code,
        languageLabel: lang.label,
        filename,
        records: records.length,
      });
    }
    // Combined all-language file
    const allFilename = `${formatName}-all.jsonl`;
    await writeJsonl(path.join(OUT_DIR, allFilename), allRecords);
    out.push({
      format: formatName,
      language: 'all',
      languageLabel: 'All languages',
      filename: allFilename,
      records: allRecords.length,
    });
    return out;
  };

  const allFileMeta = [
    ...await writeBucketed('chunks', buckets.chunks, chunksOut),
    ...await writeBucketed('instructions', buckets.instructions, instructionsOut),
    ...await writeBucketed('chat', buckets.chat, chatOut),
    ...await writeBucketed('completion', buckets.completion, completionOut),
  ];

  // Per-language breakdown of the chunks (handy for inspection).
  const byLanguage = chunks.reduce((acc, c) => {
    acc[c.language] = (acc[c.language] || 0) + 1;
    return acc;
  }, {});
  const bySource = chunks.reduce((acc, c) => {
    acc[c.source] = (acc[c.source] || 0) + 1;
    return acc;
  }, {});

  // Hash + size + preview for every file written.
  const filesEnriched = allFileMeta.map((f) => {
    const fp = path.join(OUT_DIR, f.filename);
    const st = fileStats(fp);
    return {
      ...f,
      bytes: st.bytes,
      sha256: st.sha256,
      samplePreview: firstNonEmptyLine(fp),
    };
  });

  const manifest = {
    generatedAt: new Date().toISOString(),
    generator: {
      script: 'scripts/build-corpus.mjs',
      node: process.version,
    },
    encoding: 'utf-8',
    sources: {
      'framework-reference-v2.md': refChunks.length,
      'src/content/docs/': docsChunks.length,
    },
    chunkCount: chunks.length,
    byLanguage,
    bySource,
    languages: LANGUAGES,
    formats: ['chunks', 'instructions', 'chat', 'completion'],
    files: filesEnriched,
  };
  fs.writeFileSync(path.join(OUT_DIR, 'manifest.json'), JSON.stringify(manifest, null, 2));

  console.log('[corpus] done — wrote', filesEnriched.length, 'files');
  console.log('[corpus] per-format/per-language summary:');
  for (const f of filesEnriched) {
    console.log(` ${f.filename.padEnd(28)} ${String(f.records).padStart(5)} records ${(f.bytes / 1024).toFixed(1).padStart(7)} KB`);
  }
}

main().catch((e) => {
  console.error(e);
  process.exit(1);
});