#!/usr/bin/env node
/**
 * Export the docs as a LoRA-training-ready corpus.
 *
 *   node scripts/build-corpus.mjs
 *
 * Outputs four JSONL files plus a summary under dist/corpus/:
 *   - chunks.jsonl        — raw chunks (one section per line)
 *   - instructions.jsonl  — instruction/input/output triples
 *   - chat.jsonl          — sharegpt/chat-format messages
 *   - completion.jsonl    — prompt/completion pairs (legacy fine-tunes)
 *   - stats.json          — file/chunk counts and a per-language breakdown
 *
 * The instruction text for each chunk is derived from the section heading
 * with a per-language template ("How do I X?", "Wie X?",
 * "X について教えてください。").
 */

import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { chunkFile, walkDocs } from './lib/chunk.mjs';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
const OUT_DIR = path.resolve(__dirname, '../dist/corpus');

/** System prompt per language code; unknown languages fall back to `en`. */
const SYSTEM_PROMPT = {
  en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
  de: 'Du bist Experte für das Nibiru-PHP-Framework. Antworte auf Basis der Dokumentation, mit konkreten Code-Beispielen und Dateipfaden, wo es hilft.',
  ja: 'あなたは Nibiru PHP フレームワークの専門家です。ドキュメントに基づいて、有用な箇所では具体的なコード例とファイルパスを示して回答してください。',
  es: 'Eres un experto en el framework PHP Nibiru. Responde basándote en la documentation, con ejemplos de código concretos y rutas de archivos donde sea útil.'.replace('documentation', 'documentación'),
  fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
};

/**
 * Question openers per language code; unknown languages fall back to `en`.
 * The `ja` row is not consulted by questionFor(), which uses a fixed
 * Japanese template instead.
 */
const QUESTION_PREFIX = {
  en: ['How do I', 'What is', 'Explain', 'Show me'],
  de: ['Wie', 'Was ist', 'Erkläre', 'Zeig mir'],
  ja: ['', '', 'について教えてください:', ''],
  es: ['¿Cómo', '¿Qué es', 'Explica', 'Muéstrame'],
  fr: ['Comment', "Qu'est-ce que", 'Explique', 'Montre-moi'],
};

/**
 * Build the synthetic user question for a chunk from its heading.
 *
 * The opener is picked deterministically (heading length modulo the number
 * of openers), so repeated runs over the same docs yield the same corpus.
 *
 * @param {{language?: string, sectionTitle?: string, pageTitle?: string, id?: unknown, url?: unknown}} chunk
 * @returns {string} The question text in the chunk's language.
 * @throws {TypeError} If the chunk has neither a sectionTitle nor a pageTitle.
 */
function questionFor(chunk) {
  const lang = chunk.language || 'en';
  const heading = chunk.sectionTitle || chunk.pageTitle;

  // Fail with a message that identifies the chunk instead of an opaque
  // "cannot read properties of undefined" from heading.length below.
  if (typeof heading !== 'string') {
    const label = chunk.id ?? chunk.url ?? '(unknown)';
    throw new TypeError(`Chunk ${label} has neither a sectionTitle nor a pageTitle`);
  }

  if (lang === 'ja') {
    return `${heading} について教えてください。`;
  }

  const prefixes = QUESTION_PREFIX[lang] || QUESTION_PREFIX.en;
  const prefix = prefixes[heading.length % prefixes.length];
  const subject = heading.toLowerCase();

  if (lang === 'fr') {
    // French typography puts a space before the question mark.
    // NOTE(review): a `.replace(' ', ' ')` used to follow this template; as
    // written it swapped a space for an identical space (a no-op) and was
    // dropped. If it was meant to insert a no-break space before "?" or to
    // collapse a double space, reinstate it with the intended characters.
    return `${prefix} ${subject} ?`;
  }

  // Every other language, Spanish included, attaches "?" directly to the
  // last word.
  return `${prefix} ${subject}?`;
}

/**
 * Create a directory and any missing parents; a no-op if it already exists.
 *
 * @param {string} d - Directory path.
 */
function ensureDir(d) {
  fs.mkdirSync(d, { recursive: true });
}

/**
 * Write items to a file as JSON Lines (one JSON document per line).
 *
 * @param {string} filePath - Destination file; its directory is created if missing.
 * @param {Iterable<unknown>} items - Values to serialize with JSON.stringify.
 * @returns {Promise<void>} Resolves once the stream has closed; rejects if
 *   the stream errors or an item cannot be serialized.
 */
function writeJsonl(filePath, items) {
  ensureDir(path.dirname(filePath));

  return new Promise((resolve, reject) => {
    const stream = fs.createWriteStream(filePath, { encoding: 'utf8' });

    // Without an 'error' listener a failed write would surface as an
    // uncaught exception rather than a rejection the caller can handle.
    stream.on('error', reject);
    stream.on('close', resolve);

    try {
      for (const item of items) {
        stream.write(`${JSON.stringify(item)}\n`);
      }
      stream.end();
    } catch (err) {
      // Serialization failed (e.g. a cyclic value): release the file handle
      // and let the 'error' listener reject with the original error.
      stream.destroy(err);
    }
  });
}

/**
 * Chunk every doc under DOCS_DIR and write the corpus files plus stats.json
 * into OUT_DIR.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(`Walking ${DOCS_DIR}…`);
  const files = walkDocs(DOCS_DIR);
  const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
  console.log(`Produced ${chunks.length} chunks across ${files.length} files.`);

  // Derive the question and system prompt once per chunk; three of the four
  // output formats reuse them.
  const prepared = chunks.map((c) => ({
    chunk: c,
    question: questionFor(c),
    system: SYSTEM_PROMPT[c.language] || SYSTEM_PROMPT.en,
    metadata: { language: c.language, source: c.url, page: c.pageTitle },
  }));

  const chunksOut = chunks.map((c) => ({
    id: c.id,
    url: c.url,
    pageTitle: c.pageTitle,
    sectionTitle: c.sectionTitle,
    language: c.language,
    tokens: c.tokens,
    content: c.content,
  }));

  const instructionsOut = prepared.map(({ chunk, question, metadata }) => ({
    instruction: question,
    input: '',
    output: chunk.content,
    metadata: { ...metadata },
  }));

  const chatOut = prepared.map(({ chunk, question, system, metadata }) => ({
    messages: [
      { role: 'system', content: system },
      { role: 'user', content: question },
      { role: 'assistant', content: chunk.content },
    ],
    metadata: { ...metadata },
  }));

  const completionOut = prepared.map(({ chunk, question, system }) => ({
    prompt: `${system}\n\nQuestion: ${question}\n\nAnswer:`,
    // Leading space so the completion starts on a fresh token after "Answer:".
    completion: ' ' + chunk.content,
  }));

  ensureDir(OUT_DIR);

  // The four files are independent, so write them concurrently.
  await Promise.all([
    writeJsonl(path.join(OUT_DIR, 'chunks.jsonl'), chunksOut),
    writeJsonl(path.join(OUT_DIR, 'instructions.jsonl'), instructionsOut),
    writeJsonl(path.join(OUT_DIR, 'chat.jsonl'), chatOut),
    writeJsonl(path.join(OUT_DIR, 'completion.jsonl'), completionOut),
  ]);

  const stats = {
    generatedAt: new Date().toISOString(),
    fileCount: files.length,
    chunkCount: chunks.length,
    byLanguage: chunks.reduce((acc, c) => {
      acc[c.language] = (acc[c.language] || 0) + 1;
      return acc;
    }, {}),
  };
  fs.writeFileSync(path.join(OUT_DIR, 'stats.json'), JSON.stringify(stats, null, 2));

  console.log(`Wrote 4 JSONL files + stats.json to ${OUT_DIR}`);
  console.log(JSON.stringify(stats, null, 2));
}

main().catch((e) => {
  console.error(e);
  process.exit(1);
});