nibiru-framework.com/docs/scripts/build-corpus.mjs

#!/usr/bin/env node
/**
 * Build the LoRA training corpus.
 *
 *   node scripts/build-corpus.mjs
 *
 * Sources, in order of priority:
 *   1. scripts/extraction/framework-reference-v2.md   (deep, file:line cited)
 *   2. src/content/docs/{en,de,ja,es,fr}/             (the public docs)
 *
 * Outputs under public/corpus/ (each format is written once per language
 * plus a combined file; <lang> is en, de, ja, es, fr or all):
 *   - chunks-<lang>.jsonl       — raw chunks (one record per source chunk, no Q/A)
 *   - instructions-<lang>.jsonl — instruction/input/output triples (Alpaca-style)
 *   - chat-<lang>.jsonl         — sharegpt/messages format (system+user+assistant)
 *   - completion-<lang>.jsonl   — prompt/completion pairs (legacy fine-tunes)
 *   - manifest.json             — size, sha256, record count, sample preview per file
 *
 * Augmentation: per chunk, we emit 3-5 question variants (definition,
 * procedural and topic always; file-pointer and code-focused only when the
 * chunk cites a file path or contains a code fence). Code-block recall
 * samples are emitted as additional records for the framework-reference
 * source so the model learns exact framework idioms.
 */

import fs from 'node:fs';
import path from 'node:path';
import crypto from 'node:crypto';
import { fileURLToPath } from 'node:url';
import { chunkFile, chunkMarkdown, walkDocs } from './lib/chunk.mjs';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
const REFERENCE_FILE = path.resolve(__dirname, 'extraction/framework-reference-v2.md');
// Optional research-agent augmentation. JSONL — one alpaca-style record
// per line. When present, records are merged into instructions/chat/
// completion outputs alongside the templated ones.
const AUGMENTATION_FILE = path.resolve(__dirname, 'extraction/lora-augmentation.jsonl');
// Write straight into public/corpus/ so Astro serves the files at
// /corpus/<name>.jsonl without a separate copy step. Gitignored.
const OUT_DIR = path.resolve(__dirname, '../public/corpus');

// =============================================================================
//  System prompts
// =============================================================================

// System prompts for chunks that come from the public docs, keyed by
// language code. Each entry is the same instruction written in that
// language; `systemFor` falls back to the `en` entry for any language that
// has no key here.
const SYSTEM_PROMPT = {
	en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
	de: 'Du bist Experte für das Nibiru-PHP-Framework. Antworte auf Basis der Dokumentation, mit konkreten Code-Beispielen und Dateipfaden, wo es hilft.',
	ja: 'あなたは Nibiru PHP フレームワークの専門家です。ドキュメントに基づいて、有用な箇所では具体的なコード例とファイルパスを示して回答してください。',
	es: 'Eres un experto en el framework PHP Nibiru. Responde basándote en la documentación, con ejemplos de código concretos y rutas de archivos donde sea útil.',
	fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
};

// System prompt for records built from the framework-reference source. It is
// stricter than the per-language docs prompts because that source is the gold
// material: exact namespaces, file:line citations, and the small idioms the
// model should internalise. The fragments are joined with single spaces.
const SYSTEM_PROMPT_REFERENCE = [
	'You are a senior PHP architect and Nibiru framework expert.',
	'Answers must include exact namespaces, file paths with line numbers when available,',
	'and concrete code excerpts. Never say "presumably", "likely", or "appears to"',
	'— if you do not know, say so plainly.',
].join(' ');

// =============================================================================
//  Question-variant generation (deterministic, no LLM)
// =============================================================================

// Question phrasings, keyed by language code and then by question kind
// (definitional, procedural, topic, filePointer, codeFocused). Each kind maps
// to a list of alternatives from which one is picked per chunk.
//
// Placeholders substituted when a question is generated:
//   {topic}    — the chunk's section title (or page title when there is none)
//   {topic_lc} — the same text lower-cased
//
// NOTE(review): the procedural templates read naturally only when the topic
// is a verb phrase; with noun-phrase headings they yield questions such as
// "How do I <heading>?" — confirm this is acceptable for the training data.
const QUESTION_TEMPLATES = {
	en: {
		definitional: ['What is {topic}?', 'Explain {topic}.', 'Tell me about {topic}.'],
		procedural:   ['How do I {topic_lc}?', 'Show me how to {topic_lc}.', 'Walk me through {topic_lc}.'],
		topic:        ['{topic}', '{topic} — overview', '{topic} in Nibiru'],
		filePointer:  ['Where is {topic} defined?', 'Which file contains {topic}?'],
		codeFocused:  ['Show me the code for {topic}.', 'Quote the {topic} implementation.'],
	},
	de: {
		definitional: ['Was ist {topic}?', 'Erkläre {topic}.', 'Was bedeutet {topic}?'],
		procedural:   ['Wie {topic_lc}?', 'Zeig mir, wie {topic_lc}.', 'Wie geht {topic_lc}?'],
		topic:        ['{topic}', '{topic} — Übersicht', '{topic} in Nibiru'],
		filePointer:  ['Wo ist {topic} definiert?', 'Welche Datei enthält {topic}?'],
		codeFocused:  ['Zeig mir den Code für {topic}.', 'Zitiere die {topic}-Implementierung.'],
	},
	ja: {
		definitional: ['{topic} とは何ですか?', '{topic} について説明してください。'],
		procedural:   ['{topic} のやり方を教えてください。', '{topic} の手順を教えてください。'],
		topic:        ['{topic}', '{topic} — 概要'],
		filePointer:  ['{topic} はどこで定義されていますか?'],
		codeFocused:  ['{topic} のコードを見せてください。'],
	},
	es: {
		definitional: ['¿Qué es {topic}?', 'Explica {topic}.'],
		procedural:   ['¿Cómo {topic_lc}?', 'Muéstrame cómo {topic_lc}.'],
		topic:        ['{topic}', '{topic} — visión general'],
		filePointer:  ['¿Dónde se define {topic}?'],
		codeFocused:  ['Muéstrame el código de {topic}.'],
	},
	fr: {
		definitional: ['Qu\'est-ce que {topic} ?', 'Explique {topic}.'],
		procedural:   ['Comment {topic_lc} ?', 'Montre-moi comment {topic_lc}.'],
		topic:        ['{topic}', '{topic} — vue d\'ensemble'],
		filePointer:  ['Où est défini {topic} ?'],
		codeFocused:  ['Montre-moi le code de {topic}.'],
	},
};

/**
 * Deterministically choose one entry of `arr` from a string seed.
 *
 * The first byte of the seed's MD5 digest, taken modulo the array length, is
 * used as the index. MD5 serves only as a stable hash here, so two builds
 * produce the same corpus (needed for reproducible LoRA training runs).
 *
 * @param {Array} arr - Candidates to choose from.
 * @param {string} seed - Text that determines the choice.
 * @returns {*} The selected entry.
 */
function hashPick(arr, seed) {
	const hasher = crypto.createHash('md5');
	hasher.update(seed);
	const [firstByte] = hasher.digest();
	const index = firstByte % arr.length;
	return arr[index];
}

/**
 * Build the deterministic list of question phrasings for one chunk.
 *
 * Every chunk gets three questions (definitional, procedural, topic). A
 * file-pointer question is added when the content mentions a source-file
 * path, and a code-focused question when it contains a code fence, so the
 * result holds between 3 and 5 entries.
 *
 * @param {object} chunk - Chunk with `id`, `language`, `content` and a
 *   `sectionTitle` and/or `pageTitle`.
 * @returns {string[]} Questions in the order definitional, procedural,
 *   topic, then (optionally) file-pointer and code-focused.
 */
function questionVariants(chunk) {
	const lang = chunk.language in QUESTION_TEMPLATES ? chunk.language : 'en';
	const tpl = QUESTION_TEMPLATES[lang];
	const topic = chunk.sectionTitle || chunk.pageTitle;
	const topicLc = topic.toLowerCase();
	const seed = chunk.id + '|' + topic;

	// Substitute through a replacer function so the topic is inserted
	// verbatim. With a string replacement, `$$`, `$&`, "$`" and `$'` inside
	// the topic are interpreted as replacement patterns, which mangles
	// PHP-flavoured titles such as "$$name". Doing both placeholders in one
	// pass also keeps text inserted for one from being rescanned for the other.
	const substitutions = { topic, topic_lc: topicLc };
	const fill = (s) => s.replace(/\{(topic_lc|topic)\}/g, (_match, key) => substitutions[key]);

	// One of each basic kind, so every chunk gets at least 3 phrasings.
	const variants = [
		fill(hashPick(tpl.definitional, seed + '|d')),
		fill(hashPick(tpl.procedural,   seed + '|p')),
		fill(hashPick(tpl.topic,        seed + '|t')),
	];
	// Add file-pointer / code-focused variants when the chunk actually
	// references a file path or contains a code block.
	if (/[a-z0-9_/.-]+\.(php|mjs|ts|tsx|astro|css|ini|sql)(:\d+)?/i.test(chunk.content)) {
		variants.push(fill(hashPick(tpl.filePointer, seed + '|f')));
	}
	if (/```/.test(chunk.content)) {
		variants.push(fill(hashPick(tpl.codeFocused, seed + '|c')));
	}
	return variants;
}

// =============================================================================
//  Code-block extraction
// =============================================================================

/**
 * Pull fenced code blocks out of a chunk, each paired with its lead-in text.
 *
 * The lead-in is the run of non-blank lines directly above the opening fence
 * (back to the previous blank line or fence), joined with spaces, trimmed and
 * capped at 240 characters. Blocks with fewer than two lines of code are
 * skipped, and a fence that is never closed yields nothing.
 *
 * @param {{content: string}} chunk - Chunk whose markdown content is scanned.
 * @returns {Array<{language: string, leadIn: string, code: string}>} One
 *   entry per extracted block; `language` is the text after the opening
 *   fence, or 'text' when there is none.
 */
function extractCodeBlockSamples(chunk) {
	const out = [];
	// Split on LF or CRLF. With a bare '\n' split, CRLF content leaves a
	// trailing '\r' on each line; `.` does not match '\r', so the fence regex
	// below would never match and every code block would be lost.
	const lines = chunk.content.split(/\r?\n/);
	let inFence = false;
	let fenceLang = '';
	let buf = [];
	let leadIn = '';
	let prevPara = [];

	for (const line of lines) {
		const fence = line.match(/^```(.*)$/);
		if (fence) {
			if (!inFence) {
				// Opening fence: start collecting and freeze the lead-in.
				inFence = true;
				fenceLang = fence[1].trim();
				buf = [];
				leadIn = prevPara.join(' ').trim().slice(0, 240);
				prevPara = [];
			} else {
				// Closing fence: keep the block only if it has 2+ lines.
				inFence = false;
				if (buf.length >= 2) {
					out.push({
						language: fenceLang || 'text',
						leadIn,
						code: buf.join('\n'),
					});
				}
			}
			continue;
		}
		if (inFence) {
			buf.push(line);
		} else if (line.trim() === '') {
			// A blank line ends the paragraph that could serve as lead-in.
			prevPara = [];
		} else {
			prevPara.push(line);
		}
	}
	return out;
}

/**
 * Turn one extracted code block into a code-recall question/answer pair.
 *
 * @param {object} chunk - Source chunk; `id` seeds the template choice and
 *   `sectionTitle` (or `pageTitle`) is used as the topic.
 * @param {{language: string, leadIn: string, code: string}} block - Code
 *   block to quote in the answer.
 * @param {string} lang - Language code used to select the question
 *   templates; English is used when there is no entry for it.
 * @returns {{question: string, answer: string}} The question and an answer
 *   made of the optional lead-in followed by the fenced code.
 */
function codeBlockQA(chunk, block, lang) {
	const tpl = QUESTION_TEMPLATES[lang] || QUESTION_TEMPLATES.en;
	const topic = chunk.sectionTitle || chunk.pageTitle;
	const seed = chunk.id + '|code|' + block.code.slice(0, 32);
	// Substitute through a replacer function so the topic is inserted
	// verbatim. With a string replacement, `$$`, `$&`, "$`" and `$'` inside
	// the topic are interpreted as replacement patterns, which mangles
	// PHP-flavoured titles such as "$$name".
	const substitutions = { topic, topic_lc: topic.toLowerCase() };
	const q = hashPick(tpl.codeFocused, seed)
		.replace(/\{(topic_lc|topic)\}/g, (_match, key) => substitutions[key]);

	// Answer = optional lead-in + the code block. Wrap code in fences so the
	// model learns to emit syntactically valid code blocks too.
	const fence = '```' + (block.language || '') + '\n' + block.code + '\n```';
	const answer = block.leadIn ? `${block.leadIn}\n\n${fence}` : fence;
	return { question: q, answer };
}

// =============================================================================
//  Source ingestion
// =============================================================================

/**
 * Chunk every public docs file under DOCS_DIR.
 *
 * @returns {object[]} All chunks from all files, each tagged with
 *   `source: 'docs'`.
 */
function ingestPublicDocs() {
	const tagAsDocs = (chunk) => ({ ...chunk, source: 'docs' });
	return walkDocs(DOCS_DIR)
		.flatMap((docFile) => chunkFile(docFile, DOCS_DIR))
		.map((chunk) => tagAsDocs(chunk));
}

/**
 * Chunk the framework reference markdown, if the file exists.
 *
 * @returns {object[]} Chunks tagged with `source: 'framework-reference-v2'`,
 *   or an empty array (after a warning) when REFERENCE_FILE is missing.
 */
function ingestFrameworkReference() {
	const referencePresent = fs.existsSync(REFERENCE_FILE);
	if (referencePresent) {
		const markdown = fs.readFileSync(REFERENCE_FILE, 'utf8');
		// Page-level metadata passed to the chunker for every chunk of this file.
		const pageInfo = {
			language: 'en',
			file: 'framework-reference-v2.md',
			pageTitle: 'Nibiru Framework Reference v2',
			pageDescription: 'Deep technical reference — every public factory, namespace, idiom and gotcha with file:line citations.',
			baseUrl: '/reference/',
		};
		const referenceChunks = chunkMarkdown(markdown, pageInfo);
		return referenceChunks.map((chunk) => ({ ...chunk, source: 'framework-reference-v2' }));
	}
	console.warn(`[corpus] no framework-reference-v2 at ${REFERENCE_FILE} — skipping`);
	return [];
}

// =============================================================================
//  Record assembly
// =============================================================================

/**
 * Choose the system prompt for a chunk.
 *
 * @param {{source: string, language: string}} chunk
 * @returns {string} The strict reference prompt for framework-reference
 *   chunks; otherwise the prompt for the chunk's language, or the English
 *   one when that language has no prompt.
 */
function systemFor(chunk) {
	const fromReference = chunk.source === 'framework-reference-v2';
	return fromReference
		? SYSTEM_PROMPT_REFERENCE
		: (SYSTEM_PROMPT[chunk.language] || SYSTEM_PROMPT.en);
}

/**
 * Read the optional research-agent augmentation file.
 *
 * Each non-blank line is expected to be an alpaca-style JSON object
 * `{instruction, input, output, metadata}`. Lines that are not valid JSON,
 * or whose record lacks a truthy `instruction` or `output`, are skipped
 * with a warning that names the line's real position in the file.
 *
 * @returns {object[]} The accepted records in file order; an empty array
 *   when AUGMENTATION_FILE does not exist.
 */
function loadAugmentation() {
	if (!fs.existsSync(AUGMENTATION_FILE)) {
		console.log(`[corpus] no augmentation file at ${AUGMENTATION_FILE} — skipping`);
		return [];
	}
	// Keep blank lines in the array (and skip them in the loop) so that the
	// index still maps to the actual line number reported in warnings.
	const lines = fs.readFileSync(AUGMENTATION_FILE, 'utf8').split(/\r?\n/);
	const records = [];
	for (const [i, line] of lines.entries()) {
		if (line.trim() === '') continue;
		const lineNo = i + 1;
		let rec;
		try {
			rec = JSON.parse(line);
		} catch (e) {
			console.warn(`[corpus] skipping malformed augmentation line ${lineNo}: ${e.message}`);
			continue;
		}
		// `rec` may be null or a primitive when the line is valid JSON but
		// not an object; optional chaining treats that as a missing field.
		if (rec?.instruction && rec?.output) {
			records.push(rec);
		} else {
			console.warn(`[corpus] skipping augmentation line ${lineNo}: missing instruction or output`);
		}
	}
	console.log(`[corpus] loaded ${records.length} augmentation records`);
	return records;
}

/**
 * Assemble every output record from the ingested chunks.
 *
 * For each chunk this emits the raw chunk record, one Q/A record per
 * question variant, and (framework reference only) one code-recall record
 * per extracted code block. Augmentation records are appended last. Each Q/A
 * pair is written to the instruction, chat and completion lists at the same
 * index.
 *
 * @param {object[]} chunks - Chunks tagged with `source`.
 * @returns {{chunksOut: object[], instructionsOut: object[], chatOut: object[], completionOut: object[]}}
 */
function buildRecords(chunks) {
	const chunksOut = [];
	const instructionsOut = [];
	const chatOut = [];
	const completionOut = [];

	// Fan a single Q/A pair out into the three training formats. Pushing all
	// three together keeps the lists index-aligned.
	const emit = (system, question, answer, metadata, input = '') => {
		instructionsOut.push({ instruction: question, input, output: answer, metadata });
		chatOut.push({
			messages: [
				{ role: 'system', content: system },
				{ role: 'user', content: question },
				{ role: 'assistant', content: answer },
			],
			metadata,
		});
		completionOut.push({
			prompt: `${system}\n\nQuestion: ${question}\n\nAnswer:`,
			completion: ' ' + answer,
		});
	};

	for (const chunk of chunks) {
		// Metadata shared by every record derived from this chunk.
		const provenance = () => ({
			language: chunk.language,
			source: chunk.url,
			page: chunk.pageTitle,
			origin: chunk.source,
		});

		// 1. Raw chunk record (only the published fields are kept).
		chunksOut.push({
			id: chunk.id,
			source: chunk.source,
			url: chunk.url,
			pageTitle: chunk.pageTitle,
			sectionTitle: chunk.sectionTitle,
			language: chunk.language,
			tokens: chunk.tokens,
			content: chunk.content,
		});

		// 2. One record per question phrasing; the chunk text is the answer.
		const system = systemFor(chunk);
		for (const question of questionVariants(chunk)) {
			emit(system, question, chunk.content, provenance());
		}

		// 3. Code-recall samples, limited to the framework reference where
		//    the code is gold (file:line cited, framework-canonical).
		if (chunk.source !== 'framework-reference-v2') continue;
		for (const block of extractCodeBlockSamples(chunk)) {
			const { question, answer } = codeBlockQA(chunk, block, chunk.language);
			emit(system, question, answer, {
				...provenance(),
				codeLanguage: block.language,
				kind: 'code-recall',
			});
		}
	}

	// Research-agent augmentation: alpaca-style records, fanned out like the
	// rest. These are always English and framework-grade, hence the
	// reference system prompt.
	const augmentation = loadAugmentation();
	for (const extra of augmentation) {
		const metadata = { ...(extra.metadata || {}), origin: 'lora-augmentation' };
		emit(SYSTEM_PROMPT_REFERENCE, extra.instruction, extra.output, metadata, extra.input || '');
	}
	if (augmentation.length) {
		console.log(`[corpus] merged ${augmentation.length} augmentation records into instructions/chat/completion`);
	}

	return { chunksOut, instructionsOut, chatOut, completionOut };
}

// =============================================================================
//  IO + manifest
// =============================================================================

/**
 * Create directory `d`, including any missing parents. Does nothing when the
 * directory already exists.
 *
 * @param {string} d - Directory path.
 */
function ensureDir(d) {
	const options = { recursive: true };
	fs.mkdirSync(d, options);
}

/**
 * Write `items` to `filePath` as JSON Lines (one JSON document per line,
 * UTF-8), creating the parent directory first.
 *
 * @param {string} filePath - Destination file; overwritten if it exists.
 * @param {Iterable<*>} items - Values to serialise, one per line.
 * @returns {Promise<void>} Resolves once the file has been closed. Rejects
 *   if the stream reports an error or an item cannot be serialised.
 */
function writeJsonl(filePath, items) {
	ensureDir(path.dirname(filePath));
	return new Promise((resolve, reject) => {
		const stream = fs.createWriteStream(filePath, { encoding: 'utf8' });
		// Without an 'error' listener a failed open or write is thrown as an
		// uncaught exception; routing it to `reject` lets the caller's
		// `await` catch it. 'close' also fires after an error, but resolving
		// an already-rejected promise is a no-op.
		stream.on('error', reject);
		stream.on('close', resolve);
		try {
			for (const item of items) stream.write(JSON.stringify(item) + '\n');
			stream.end();
		} catch (err) {
			// Serialisation failed part-way: release the file handle and
			// surface the original error through the 'error' listener.
			stream.destroy(err);
		}
	});
}

/**
 * Measure a file on disk.
 *
 * @param {string} filePath - File to read (loaded fully into memory).
 * @returns {{bytes: number, sha256: string}} Size in bytes and the
 *   hex-encoded SHA-256 digest of the contents.
 */
function fileStats(filePath) {
	const contents = fs.readFileSync(filePath);
	const hasher = crypto.createHash('sha256');
	hasher.update(contents);
	const sha256 = hasher.digest('hex');
	return { bytes: contents.length, sha256 };
}

/**
 * Return the first line of a file that is not blank, for use as a preview.
 *
 * @param {string} filePath - UTF-8 text file to read.
 * @returns {string} The line as-is when it has at most 800 characters,
 *   otherwise its first 800 characters followed by '…'. Empty string when
 *   every line is blank.
 */
function firstNonEmptyLine(filePath) {
	const limit = 800;
	let preview = '';
	for (const candidate of fs.readFileSync(filePath, 'utf8').split('\n')) {
		if (candidate.trim().length > 0) {
			preview = candidate;
			break;
		}
	}
	if (preview.length <= limit) return preview;
	return preview.slice(0, limit) + '…';
}

// =============================================================================
//  Main
// =============================================================================

// Languages the corpus is split into: `code` names the per-language bucket
// and file, `label` is the display name shown in the UI. The order is
// significant: English comes first (the English-only framework reference
// lands in the en bucket), followed by the localised docs.
const LANGUAGES = [
	['en', 'English'],
	['de', 'Deutsch'],
	['ja', '日本語'],
	['es', 'Español'],
	['fr', 'Français'],
].map(([code, label]) => ({ code, label }));

/**
 * Group records by language.
 *
 * The result starts with one empty bucket per entry of LANGUAGES, in that
 * order. A record whose language is not listed gets a new bucket appended on
 * first sight; a record with no language goes into 'en'.
 *
 * @param {object[]} records - Records to group.
 * @param {(record: object) => (string|undefined)} getLang - Reads the
 *   language of a record, e.g. `language` for raw chunks or
 *   `metadata.language` for alpaca/chat/completion records.
 * @returns {Map<string, object[]>} Language code -> records in input order.
 */
function bucketByLanguage(records, getLang) {
	const buckets = new Map(LANGUAGES.map(({ code }) => [code, []]));
	for (const record of records) {
		const key = getLang(record) || 'en';
		if (!buckets.has(key)) {
			buckets.set(key, []);
		}
		buckets.get(key).push(record);
	}
	return buckets;
}

/**
 * Build the whole corpus: ingest both sources, assemble records, replace the
 * contents of OUT_DIR with one JSONL file per format and language (plus a
 * combined `*-all.jsonl` per format), then write manifest.json and print a
 * summary.
 *
 * @returns {Promise<void>} Resolves once every file has been written.
 * @throws {Error} If the completion and instruction lists are not the same
 *   length, since completions are assigned a language by index.
 */
async function main() {
	console.log(`[corpus] DOCS_DIR=${DOCS_DIR}`);
	console.log(`[corpus] REFERENCE=${REFERENCE_FILE}`);
	console.log(`[corpus] OUT_DIR=${OUT_DIR}`);

	const docsChunks = ingestPublicDocs();
	const refChunks = ingestFrameworkReference();
	const chunks = [...refChunks, ...docsChunks]; // reference first → priority

	console.log(`[corpus] ingested ${refChunks.length} reference chunks + ${docsChunks.length} docs chunks`);

	const { chunksOut, instructionsOut, chatOut, completionOut } = buildRecords(chunks);
	console.log(`[corpus] records: chunks=${chunksOut.length} instructions=${instructionsOut.length} chat=${chatOut.length} completion=${completionOut.length}`);

	// Completion records carry no metadata, so each one takes its language
	// from the instruction record at the same index. Check that pairing
	// before touching the output directory: a mismatch would either lose
	// records or write literal "undefined" lines.
	if (completionOut.length !== instructionsOut.length) {
		throw new Error(
			`[corpus] completion (${completionOut.length}) and instruction (${instructionsOut.length}) ` +
			'record counts differ; cannot assign languages to completion records',
		);
	}

	ensureDir(OUT_DIR);
	// Wipe any leftover files from a previous run so stale per-language
	// buckets don't linger.
	for (const f of fs.readdirSync(OUT_DIR)) {
		if (/\.(jsonl|json)$/.test(f)) fs.unlinkSync(path.join(OUT_DIR, f));
	}

	// Per-language buckets. Each format gets one file per language plus a
	// combined `*-all.jsonl` for callers who want everything.
	const completionLanguage = new Map(
		completionOut.map((rec, i) => [rec, instructionsOut[i].metadata?.language]),
	);
	const buckets = {
		chunks:       bucketByLanguage(chunksOut,       (r) => r.language),
		instructions: bucketByLanguage(instructionsOut, (r) => r.metadata?.language),
		chat:         bucketByLanguage(chatOut,         (r) => r.metadata?.language),
		completion:   bucketByLanguage(completionOut,   (r) => completionLanguage.get(r)),
	};

	// Only languages listed in LANGUAGES get their own file. Report anything
	// else so it is clear those records are reachable only via `*-all.jsonl`.
	const listedCodes = new Set(LANGUAGES.map((lang) => lang.code));
	for (const [formatName, bucketMap] of Object.entries(buckets)) {
		for (const [code, records] of bucketMap) {
			if (!listedCodes.has(code) && records.length > 0) {
				console.warn(`[corpus] ${formatName}: ${records.length} records with unlisted language "${code}" are only written to ${formatName}-all.jsonl`);
			}
		}
	}

	const writeBucketed = async (formatName, bucketMap, allRecords) => {
		const out = [];
		// Per-language files
		for (const lang of LANGUAGES) {
			const records = bucketMap.get(lang.code) || [];
			if (records.length === 0) continue;
			const filename = `${formatName}-${lang.code}.jsonl`;
			await writeJsonl(path.join(OUT_DIR, filename), records);
			out.push({
				format: formatName,
				language: lang.code,
				languageLabel: lang.label,
				filename,
				records: records.length,
			});
		}
		// Combined all-language file
		const allFilename = `${formatName}-all.jsonl`;
		await writeJsonl(path.join(OUT_DIR, allFilename), allRecords);
		out.push({
			format: formatName,
			language: 'all',
			languageLabel: 'All languages',
			filename: allFilename,
			records: allRecords.length,
		});
		return out;
	};

	const allFileMeta = [
		...await writeBucketed('chunks',       buckets.chunks,       chunksOut),
		...await writeBucketed('instructions', buckets.instructions, instructionsOut),
		...await writeBucketed('chat',         buckets.chat,         chatOut),
		...await writeBucketed('completion',   buckets.completion,   completionOut),
	];

	// Per-language breakdown of the chunks (handy for inspection).
	const byLanguage = chunks.reduce((acc, c) => {
		acc[c.language] = (acc[c.language] || 0) + 1;
		return acc;
	}, {});
	const bySource = chunks.reduce((acc, c) => {
		acc[c.source] = (acc[c.source] || 0) + 1;
		return acc;
	}, {});

	// Hash + size + preview for every file written.
	const filesEnriched = allFileMeta.map((f) => {
		const fp = path.join(OUT_DIR, f.filename);
		const st = fileStats(fp);
		return {
			...f,
			bytes: st.bytes,
			sha256: st.sha256,
			samplePreview: firstNonEmptyLine(fp),
		};
	});

	const manifest = {
		generatedAt: new Date().toISOString(),
		generator: {
			script: 'scripts/build-corpus.mjs',
			node: process.version,
		},
		encoding: 'utf-8',
		sources: {
			'framework-reference-v2.md': refChunks.length,
			'src/content/docs/':         docsChunks.length,
		},
		chunkCount: chunks.length,
		byLanguage,
		bySource,
		languages: LANGUAGES,
		formats: ['chunks', 'instructions', 'chat', 'completion'],
		files: filesEnriched,
	};
	fs.writeFileSync(path.join(OUT_DIR, 'manifest.json'), JSON.stringify(manifest, null, 2));

	console.log('[corpus] done — wrote', filesEnriched.length, 'files');
	console.log('[corpus] per-format/per-language summary:');
	for (const f of filesEnriched) {
		console.log(`  ${f.filename.padEnd(28)}  ${String(f.records).padStart(5)} records  ${(f.bytes / 1024).toFixed(1).padStart(7)} KB`);
	}
}

// Entry point: run the build; on failure print the error and exit with a
// non-zero status.
main().catch((err) => {
	console.error(err);
	process.exit(1);
});