Strip api.neuronetz.ai from documentation; chat config stays in env

The Ollama URL was leaking via:
- prose in /en/, /de/, /ja/, /es/, /fr/ docs (oracle, deployment, local-testing, ai/module/{overview,embed,training})
- code blocks teaching users to curl the host directly
- .env.example, Dockerfile, docker-compose.yml defaults
- providers.mjs, translate-docs.mjs, build-oracle-index.mjs defaults
- LandingScripts.astro comment
- lora-runbook.md prose + SSH host
- the GET handler at /api/oracle which echoed `ollamaUrl` back to public callers
- the "Oracle is silent" fallback message at /api/oracle POST

Replacements:
- prose: "neuronetz.ai" → "your Ollama instance"
- example URLs in code blocks: https://api.neuronetz.ai → https://your-ollama-host.example
- code-level defaults: → http://localhost:11434 (Ollama's standard local port)
- GET /api/oracle: dropped the `ollamaUrl` field; provider + model still exposed
- runbook SSH host: neuronetz@cloud.neuronetz.ai → <gpu-user>@<gpu-host>

Production chat is unaffected: docs/.env (gitignored) on the production host still pins OLLAMA_BASE_URL=https://api.neuronetz.ai. The only change in the running container is that the GET handler no longer echoes the URL.

analytics.neuronetz.ai (Umami tracking) is intentionally left intact — it's a public, brand-owned subdomain meant to be visible.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 17:14:17 +02:00
parent 9b7fd15ca1
commit f4ccc45a3b
44 changed files with 1386 additions and 292 deletions
--- a/docs/scripts/build-corpus.mjs
+++ b/docs/scripts/build-corpus.mjs
@@ -1,27 +1,46 @@
 #!/usr/bin/env node
 /**
- * Export the docs as a LoRA-training-ready corpus.
+ * Build the LoRA training corpus.
 *
 *   node scripts/build-corpus.mjs
 *
- * Outputs four files under dist/corpus/:
- *   - chunks.jsonl       — raw chunks (one section per line)
- *   - instructions.jsonl — instruction/input/output triples
- *   - chat.jsonl         — sharegpt/chat-format messages
- *   - completion.jsonl   — prompt/completion pairs (legacy fine-tunes)
+ * Sources, in order of priority:
+ *   1. scripts/extraction/framework-reference-v2.md   (deep, file:line cited)
+ *   2. src/content/docs/{en,de,ja,es,fr}/             (the public docs)
 *
- * The instruction text for each chunk is derived from the section heading
- * with a per-language template ("How do I X?", "Wie X?", "X するには？").
+ * Outputs under public/corpus/ (per format: <format>-<lang>.jsonl and <format>-all.jsonl):
+ *   - chunks-*.jsonl       — raw chunks (one record per source chunk, no Q/A)
+ *   - instructions-*.jsonl — instruction/input/output triples (Alpaca-style)
+ *   - chat-*.jsonl         — sharegpt/messages format (system+user+assistant)
+ *   - completion-*.jsonl   — prompt/completion pairs (legacy fine-tunes)
+ *   - manifest.json        — size, sha256, record count, sample preview per file
+ *
+ * Augmentation: per chunk, we emit 3-5 question variants (definitional,
+ * procedural, topic; plus file-pointer / code-focused when applicable).
+ * Code-block recall samples are emitted as additional records for the
+ * framework-reference source so the model learns exact framework idioms.
 */

 import fs from 'node:fs';
 import path from 'node:path';
+import crypto from 'node:crypto';
 import { fileURLToPath } from 'node:url';
-import { chunkFile, walkDocs } from './lib/chunk.mjs';
+import { chunkFile, chunkMarkdown, walkDocs } from './lib/chunk.mjs';

 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
-const OUT_DIR = path.resolve(__dirname, '../dist/corpus');
+const REFERENCE_FILE = path.resolve(__dirname, 'extraction/framework-reference-v2.md');
+// Optional research-agent augmentation. JSONL — one alpaca-style record
+// per line. When present, records are merged into instructions/chat/
+// completion outputs alongside the templated ones.
+const AUGMENTATION_FILE = path.resolve(__dirname, 'extraction/lora-augmentation.jsonl');
+// Write straight into public/corpus/ so Astro serves the files at
+// /corpus/<name>.jsonl without a separate copy step. Gitignored.
+const OUT_DIR = path.resolve(__dirname, '../public/corpus');
+
+// =============================================================================
+//  System prompts
+// =============================================================================

 const SYSTEM_PROMPT = {
 	en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
@@ -31,28 +50,316 @@ const SYSTEM_PROMPT = {
 	fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
 };

-const QUESTION_PREFIX = {
-	en: ['How do I', 'What is', 'Explain', 'Show me'],
-	de: ['Wie', 'Was ist', 'Erkläre', 'Zeig mir'],
-	ja: ['', '', 'について教えてください：', ''],
-	es: ['¿Cómo', '¿Qué es', 'Explica', 'Muéstrame'],
-	fr: ['Comment', "Qu'est-ce que", 'Explique', 'Montre-moi'],
+// Stricter system prompt for the framework-reference source — it's the
+// gold material with exact namespaces, file:line citations, and the small
+// idioms we want the model to internalise.
+const SYSTEM_PROMPT_REFERENCE =
+	'You are a senior PHP architect and Nibiru framework expert. ' +
+	'Answers must include exact namespaces, file paths with line numbers when available, ' +
+	'and concrete code excerpts. Never say "presumably", "likely", or "appears to" ' +
+	'— if you do not know, say so plainly.';
+
+// =============================================================================
+//  Question-variant generation (deterministic, no LLM)
+// =============================================================================
+
+const QUESTION_TEMPLATES = {
+	en: {
+		definitional: ['What is {topic}?', 'Explain {topic}.', 'Tell me about {topic}.'],
+		procedural:   ['How do I {topic_lc}?', 'Show me how to {topic_lc}.', 'Walk me through {topic_lc}.'],
+		topic:        ['{topic}', '{topic} — overview', '{topic} in Nibiru'],
+		filePointer:  ['Where is {topic} defined?', 'Which file contains {topic}?'],
+		codeFocused:  ['Show me the code for {topic}.', 'Quote the {topic} implementation.'],
+	},
+	de: {
+		definitional: ['Was ist {topic}?', 'Erkläre {topic}.', 'Was bedeutet {topic}?'],
+		procedural:   ['Wie {topic_lc}?', 'Zeig mir, wie {topic_lc}.', 'Wie geht {topic_lc}?'],
+		topic:        ['{topic}', '{topic} — Übersicht', '{topic} in Nibiru'],
+		filePointer:  ['Wo ist {topic} definiert?', 'Welche Datei enthält {topic}?'],
+		codeFocused:  ['Zeig mir den Code für {topic}.', 'Zitiere die {topic}-Implementierung.'],
+	},
+	ja: {
+		definitional: ['{topic} とは何ですか?', '{topic} について説明してください。'],
+		procedural:   ['{topic} のやり方を教えてください。', '{topic} の手順を教えてください。'],
+		topic:        ['{topic}', '{topic} — 概要'],
+		filePointer:  ['{topic} はどこで定義されていますか?'],
+		codeFocused:  ['{topic} のコードを見せてください。'],
+	},
+	es: {
+		definitional: ['¿Qué es {topic}?', 'Explica {topic}.'],
+		procedural:   ['¿Cómo {topic_lc}?', 'Muéstrame cómo {topic_lc}.'],
+		topic:        ['{topic}', '{topic} — visión general'],
+		filePointer:  ['¿Dónde se define {topic}?'],
+		codeFocused:  ['Muéstrame el código de {topic}.'],
+	},
+	fr: {
+		definitional: ['Qu\'est-ce que {topic} ?', 'Explique {topic}.'],
+		procedural:   ['Comment {topic_lc} ?', 'Montre-moi comment {topic_lc}.'],
+		topic:        ['{topic}', '{topic} — vue d\'ensemble'],
+		filePointer:  ['Où est défini {topic} ?'],
+		codeFocused:  ['Montre-moi le code de {topic}.'],
+	},
 };

-function questionFor(chunk) {
-	const lang = chunk.language || 'en';
-	const heading = chunk.sectionTitle || chunk.pageTitle;
-	if (lang === 'ja') {
-		return `${heading} について教えてください。`;
-	}
-	const prefixes = QUESTION_PREFIX[lang] || QUESTION_PREFIX.en;
-	const prefix = prefixes[heading.length % prefixes.length];
-	if (lang === 'es' || lang === 'fr') {
-		return `${prefix} ${heading.toLowerCase()} ?`.replace('  ', ' ');
-	}
-	return `${prefix} ${heading.toLowerCase()}?`;
+// Hash-pick a template deterministically from a seed string, so two builds
+// give the same corpus (necessary for reproducible LoRA training runs).
+function hashPick(arr, seed) {
+	const h = crypto.createHash('md5').update(seed).digest();
+	return arr[h[0] % arr.length];
 }

+function questionVariants(chunk) {
+	const lang = chunk.language in QUESTION_TEMPLATES ? chunk.language : 'en';
+	const tpl = QUESTION_TEMPLATES[lang];
+	const topic = chunk.sectionTitle || chunk.pageTitle;
+	const topicLc = topic.toLowerCase();
+	const seed = chunk.id + '|' + topic;
+
+	const fill = (s) => s.replaceAll('{topic}', topic).replaceAll('{topic_lc}', topicLc);
+
+	// Always include one of each base kind so a chunk gets 3 phrasings minimum.
+	const variants = [
+		fill(hashPick(tpl.definitional, seed + '|d')),
+		fill(hashPick(tpl.procedural,   seed + '|p')),
+		fill(hashPick(tpl.topic,        seed + '|t')),
+	];
+	// Add file-pointer / code-focused variants when the chunk actually
+	// references a file path or contains a code block.
+	if (/[a-z0-9_/.-]+\.(php|mjs|ts|tsx|astro|css|ini|sql)(:\d+)?/i.test(chunk.content)) {
+		variants.push(fill(hashPick(tpl.filePointer, seed + '|f')));
+	}
+	if (/```/.test(chunk.content)) {
+		variants.push(fill(hashPick(tpl.codeFocused, seed + '|c')));
+	}
+	return variants;
+}
+
+// =============================================================================
+//  Code-block extraction
+// =============================================================================
+
+// Pull out fenced code blocks (2+ lines), each paired with the paragraph preceding it as a lead-in.
+function extractCodeBlockSamples(chunk) {
+	const out = [];
+	const lines = chunk.content.split('\n');
+	let inFence = false;
+	let fenceLang = '';
+	let buf = [];
+	let leadIn = '';
+	let prevPara = [];
+
+	for (const line of lines) {
+		const fence = line.match(/^```(.*)$/);
+		if (fence) {
+			if (!inFence) {
+				inFence = true;
+				fenceLang = fence[1].trim();
+				buf = [];
+				leadIn = prevPara.join(' ').trim().slice(0, 240);
+				prevPara = [];
+			} else {
+				inFence = false;
+				if (buf.length >= 2) {
+					out.push({
+						language: fenceLang || 'text',
+						leadIn,
+						code: buf.join('\n'),
+					});
+				}
+			}
+			continue;
+		}
+		if (inFence) {
+			buf.push(line);
+		} else if (line.trim() === '') {
+			prevPara = [];
+		} else {
+			prevPara.push(line);
+		}
+	}
+	return out;
+}
+
+function codeBlockQA(chunk, block, lang) {
+	const tpl = QUESTION_TEMPLATES[lang] || QUESTION_TEMPLATES.en;
+	const topic = chunk.sectionTitle || chunk.pageTitle;
+	const seed = chunk.id + '|code|' + block.code.slice(0, 32);
+	const q = hashPick(tpl.codeFocused, seed)
+		.replaceAll('{topic}', topic)
+		.replaceAll('{topic_lc}', topic.toLowerCase());
+
+	// Answer = optional lead-in + the code block. Wrap code in fences so the
+	// model learns to emit syntactically valid code blocks too.
+	const fence = '```' + (block.language || '') + '\n' + block.code + '\n```';
+	const answer = block.leadIn ? `${block.leadIn}\n\n${fence}` : fence;
+	return { question: q, answer };
+}
+
+// =============================================================================
+//  Source ingestion
+// =============================================================================
+
+function ingestPublicDocs() {
+	const files = walkDocs(DOCS_DIR);
+	const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
+	return chunks.map((c) => ({ ...c, source: 'docs' }));
+}
+
+function ingestFrameworkReference() {
+	if (!fs.existsSync(REFERENCE_FILE)) {
+		console.warn(`[corpus] no framework-reference-v2 at ${REFERENCE_FILE} — skipping`);
+		return [];
+	}
+	const raw = fs.readFileSync(REFERENCE_FILE, 'utf8');
+	return chunkMarkdown(raw, {
+		language: 'en',
+		file: 'framework-reference-v2.md',
+		pageTitle: 'Nibiru Framework Reference v2',
+		pageDescription: 'Deep technical reference — every public factory, namespace, idiom and gotcha with file:line citations.',
+		baseUrl: '/reference/',
+	}).map((c) => ({ ...c, source: 'framework-reference-v2' }));
+}
+
+// =============================================================================
+//  Record assembly
+// =============================================================================
+
+function systemFor(chunk) {
+	if (chunk.source === 'framework-reference-v2') return SYSTEM_PROMPT_REFERENCE;
+	return SYSTEM_PROMPT[chunk.language] || SYSTEM_PROMPT.en;
+}
+
+// Read the optional research-agent augmentation. Each line is alpaca-format
+// `{instruction, input, output, metadata}`. Returns [] if the file is absent.
+function loadAugmentation() {
+	if (!fs.existsSync(AUGMENTATION_FILE)) {
+		console.log(`[corpus] no augmentation file at ${AUGMENTATION_FILE} — skipping`);
+		return [];
+	}
+	const lines = fs.readFileSync(AUGMENTATION_FILE, 'utf8').split('\n').filter(Boolean);
+	const records = [];
+	for (const [i, line] of lines.entries()) {
+		try {
+			const rec = JSON.parse(line);
+			if (rec.instruction && rec.output) records.push(rec);
+		} catch (e) {
+			console.warn(`[corpus] skipping malformed augmentation line ${i + 1}: ${e.message}`);
+		}
+	}
+	console.log(`[corpus] loaded ${records.length} augmentation records`);
+	return records;
+}
+
+function buildRecords(chunks) {
+	const chunksOut = [];
+	const instructionsOut = [];
+	const chatOut = [];
+	const completionOut = [];
+
+	for (const c of chunks) {
+		// 1. Raw chunk record
+		chunksOut.push({
+			id: c.id,
+			source: c.source,
+			url: c.url,
+			pageTitle: c.pageTitle,
+			sectionTitle: c.sectionTitle,
+			language: c.language,
+			tokens: c.tokens,
+			content: c.content,
+		});
+
+		// 2. Question-variant records
+		const sys = systemFor(c);
+		for (const q of questionVariants(c)) {
+			const meta = {
+				language: c.language,
+				source: c.url,
+				page: c.pageTitle,
+				origin: c.source,
+			};
+			instructionsOut.push({ instruction: q, input: '', output: c.content, metadata: meta });
+			chatOut.push({
+				messages: [
+					{ role: 'system', content: sys },
+					{ role: 'user', content: q },
+					{ role: 'assistant', content: c.content },
+				],
+				metadata: meta,
+			});
+			completionOut.push({
+				prompt: `${sys}\n\nQuestion: ${q}\n\nAnswer:`,
+				completion: ' ' + c.content,
+			});
+		}
+
+		// 3. Code-block recall samples — only for the framework reference,
+		//    where the code is gold (file:line cited, framework-canonical).
+		if (c.source === 'framework-reference-v2') {
+			const blocks = extractCodeBlockSamples(c);
+			for (const b of blocks) {
+				const { question, answer } = codeBlockQA(c, b, c.language);
+				const meta = {
+					language: c.language,
+					source: c.url,
+					page: c.pageTitle,
+					origin: c.source,
+					codeLanguage: b.language,
+					kind: 'code-recall',
+				};
+				instructionsOut.push({ instruction: question, input: '', output: answer, metadata: meta });
+				chatOut.push({
+					messages: [
+						{ role: 'system', content: sys },
+						{ role: 'user', content: question },
+						{ role: 'assistant', content: answer },
+					],
+					metadata: meta,
+				});
+				completionOut.push({
+					prompt: `${sys}\n\nQuestion: ${question}\n\nAnswer:`,
+					completion: ' ' + answer,
+				});
+			}
+		}
+	}
+
+	// Merge research-agent augmentation. Each input record is alpaca-style;
+	// we fan it out into instructions / chat / completion to match the rest.
+	const augmentation = loadAugmentation();
+	for (const a of augmentation) {
+		const sys = SYSTEM_PROMPT_REFERENCE; // augmentation is always English, framework-grade
+		const meta = { ...(a.metadata || {}), origin: 'lora-augmentation' };
+		instructionsOut.push({
+			instruction: a.instruction,
+			input: a.input || '',
+			output: a.output,
+			metadata: meta,
+		});
+		chatOut.push({
+			messages: [
+				{ role: 'system', content: sys },
+				{ role: 'user', content: a.instruction },
+				{ role: 'assistant', content: a.output },
+			],
+			metadata: meta,
+		});
+		completionOut.push({
+			prompt: `${sys}\n\nQuestion: ${a.instruction}\n\nAnswer:`,
+			completion: ' ' + a.output,
+		});
+	}
+	if (augmentation.length) {
+		console.log(`[corpus] merged ${augmentation.length} augmentation records into instructions/chat/completion`);
+	}
+
+	return { chunksOut, instructionsOut, chatOut, completionOut };
+}
+
+// =============================================================================
+//  IO + manifest
+// =============================================================================
+
 function ensureDir(d) {
 	fs.mkdirSync(d, { recursive: true });
 }
@@ -65,61 +372,173 @@ function writeJsonl(filePath, items) {
 	return new Promise((res) => stream.on('close', res));
 }

-async function main() {
-	console.log(`Walking ${DOCS_DIR}…`);
-	const files = walkDocs(DOCS_DIR);
-	const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
-	console.log(`Produced ${chunks.length} chunks across ${files.length} files.`);
-
-	const chunksOut = chunks.map((c) => ({
-		id: c.id,
-		url: c.url,
-		pageTitle: c.pageTitle,
-		sectionTitle: c.sectionTitle,
-		language: c.language,
-		tokens: c.tokens,
-		content: c.content,
-	}));
-
-	const instructionsOut = chunks.map((c) => ({
-		instruction: questionFor(c),
-		input: '',
-		output: c.content,
-		metadata: { language: c.language, source: c.url, page: c.pageTitle },
-	}));
-
-	const chatOut = chunks.map((c) => ({
-		messages: [
-			{ role: 'system', content: SYSTEM_PROMPT[c.language] || SYSTEM_PROMPT.en },
-			{ role: 'user', content: questionFor(c) },
-			{ role: 'assistant', content: c.content },
-		],
-		metadata: { language: c.language, source: c.url, page: c.pageTitle },
-	}));
-
-	const completionOut = chunks.map((c) => ({
-		prompt: `${SYSTEM_PROMPT[c.language] || SYSTEM_PROMPT.en}\n\nQuestion: ${questionFor(c)}\n\nAnswer:`,
-		completion: ' ' + c.content,
-	}));
-
-	await writeJsonl(path.join(OUT_DIR, 'chunks.jsonl'), chunksOut);
-	await writeJsonl(path.join(OUT_DIR, 'instructions.jsonl'), instructionsOut);
-	await writeJsonl(path.join(OUT_DIR, 'chat.jsonl'), chatOut);
-	await writeJsonl(path.join(OUT_DIR, 'completion.jsonl'), completionOut);
-
-	const stats = {
-		generatedAt: new Date().toISOString(),
-		fileCount: files.length,
-		chunkCount: chunks.length,
-		byLanguage: chunks.reduce((acc, c) => {
-			acc[c.language] = (acc[c.language] || 0) + 1;
-			return acc;
-		}, {}),
+function fileStats(filePath) {
+	const buf = fs.readFileSync(filePath);
+	return {
+		bytes: buf.length,
+		sha256: crypto.createHash('sha256').update(buf).digest('hex'),
 	};
-	fs.writeFileSync(path.join(OUT_DIR, 'stats.json'), JSON.stringify(stats, null, 2));
+}

-	console.log(`Wrote 4 JSONL files + stats.json to ${OUT_DIR}`);
-	console.log(JSON.stringify(stats, null, 2));
+function firstNonEmptyLine(filePath) {
+	const text = fs.readFileSync(filePath, 'utf8');
+	const line = text.split('\n').find((l) => l.trim().length > 0) || '';
+	return line.length > 800 ? line.slice(0, 800) + '…' : line;
+}
+
+// =============================================================================
+//  Main
+// =============================================================================
+
+// Language metadata — used both to bucket files and label them in the UI.
+// Order matters: English first (the framework-reference is English-only and
+// rolls into the en bucket), then localised docs.
+const LANGUAGES = [
+	{ code: 'en', label: 'English'  },
+	{ code: 'de', label: 'Deutsch'  },
+	{ code: 'ja', label: '日本語'    },
+	{ code: 'es', label: 'Español'  },
+	{ code: 'fr', label: 'Français' },
+];
+
+// Bucket records by their `language` (raw chunks) or `metadata.language`
+// (alpaca/chat/completion records). Returns Map<lang, items[]>.
+function bucketByLanguage(records, getLang) {
+	const map = new Map();
+	for (const lang of LANGUAGES) map.set(lang.code, []);
+	for (const r of records) {
+		const lang = getLang(r) || 'en';
+		const bucket = map.get(lang) ?? (map.set(lang, []), map.get(lang));
+		bucket.push(r);
+	}
+	return map;
+}
+
+async function main() {
+	console.log(`[corpus] DOCS_DIR=${DOCS_DIR}`);
+	console.log(`[corpus] REFERENCE=${REFERENCE_FILE}`);
+	console.log(`[corpus] OUT_DIR=${OUT_DIR}`);
+
+	const docsChunks = ingestPublicDocs();
+	const refChunks = ingestFrameworkReference();
+	const chunks = [...refChunks, ...docsChunks]; // reference first → priority
+
+	console.log(`[corpus] ingested ${refChunks.length} reference chunks + ${docsChunks.length} docs chunks`);
+
+	const { chunksOut, instructionsOut, chatOut, completionOut } = buildRecords(chunks);
+	console.log(`[corpus] records: chunks=${chunksOut.length} instructions=${instructionsOut.length} chat=${chatOut.length} completion=${completionOut.length}`);
+
+	ensureDir(OUT_DIR);
+	// Wipe any leftover files from a previous run so stale per-language
+	// buckets don't linger.
+	for (const f of fs.readdirSync(OUT_DIR)) {
+		if (/\.(jsonl|json)$/.test(f)) fs.unlinkSync(path.join(OUT_DIR, f));
+	}
+
+	// Per-language buckets. Each format gets one file per language plus a
+	// combined `*-all.jsonl` for callers who want everything.
+	const buckets = {
+		chunks:       bucketByLanguage(chunksOut,       (r) => r.language),
+		instructions: bucketByLanguage(instructionsOut, (r) => r.metadata?.language),
+		chat:         bucketByLanguage(chatOut,         (r) => r.metadata?.language),
+		completion:   bucketByLanguage(completionOut,   (r) => r.metadata?.language ?? 'en'),
+	};
+	// `completion` records don't carry metadata (prompt/completion-only),
+	// so the bucketing above puts them all in en. To keep splits accurate we
+	// rebuild from instructionsOut, which is index-aligned with completionOut:
+	{
+		const completionMap = new Map();
+		for (const lang of LANGUAGES) completionMap.set(lang.code, []);
+		for (let i = 0; i < instructionsOut.length; i++) {
+			const lang = instructionsOut[i].metadata?.language || 'en';
+			completionMap.get(lang)?.push(completionOut[i]);
+		}
+		buckets.completion = completionMap;
+	}
+
+	const writeBucketed = async (formatName, bucketMap, allRecords) => {
+		const out = [];
+		// Per-language files
+		for (const lang of LANGUAGES) {
+			const records = bucketMap.get(lang.code) || [];
+			if (records.length === 0) continue;
+			const filename = `${formatName}-${lang.code}.jsonl`;
+			await writeJsonl(path.join(OUT_DIR, filename), records);
+			out.push({
+				format: formatName,
+				language: lang.code,
+				languageLabel: lang.label,
+				filename,
+				records: records.length,
+			});
+		}
+		// Combined all-language file
+		const allFilename = `${formatName}-all.jsonl`;
+		await writeJsonl(path.join(OUT_DIR, allFilename), allRecords);
+		out.push({
+			format: formatName,
+			language: 'all',
+			languageLabel: 'All languages',
+			filename: allFilename,
+			records: allRecords.length,
+		});
+		return out;
+	};
+
+	const allFileMeta = [
+		...await writeBucketed('chunks',       buckets.chunks,       chunksOut),
+		...await writeBucketed('instructions', buckets.instructions, instructionsOut),
+		...await writeBucketed('chat',         buckets.chat,         chatOut),
+		...await writeBucketed('completion',   buckets.completion,   completionOut),
+	];
+
+	// Per-language breakdown of the chunks (handy for inspection).
+	const byLanguage = chunks.reduce((acc, c) => {
+		acc[c.language] = (acc[c.language] || 0) + 1;
+		return acc;
+	}, {});
+	const bySource = chunks.reduce((acc, c) => {
+		acc[c.source] = (acc[c.source] || 0) + 1;
+		return acc;
+	}, {});
+
+	// Hash + size + preview for every file written.
+	const filesEnriched = allFileMeta.map((f) => {
+		const fp = path.join(OUT_DIR, f.filename);
+		const st = fileStats(fp);
+		return {
+			...f,
+			bytes: st.bytes,
+			sha256: st.sha256,
+			samplePreview: firstNonEmptyLine(fp),
+		};
+	});
+
+	const manifest = {
+		generatedAt: new Date().toISOString(),
+		generator: {
+			script: 'scripts/build-corpus.mjs',
+			node: process.version,
+		},
+		encoding: 'utf-8',
+		sources: {
+			'framework-reference-v2.md': refChunks.length,
+			'src/content/docs/':         docsChunks.length,
+		},
+		chunkCount: chunks.length,
+		byLanguage,
+		bySource,
+		languages: LANGUAGES,
+		formats: ['chunks', 'instructions', 'chat', 'completion'],
+		files: filesEnriched,
+	};
+	fs.writeFileSync(path.join(OUT_DIR, 'manifest.json'), JSON.stringify(manifest, null, 2));
+
+	console.log('[corpus] done — wrote', filesEnriched.length, 'files');
+	console.log('[corpus] per-format/per-language summary:');
+	for (const f of filesEnriched) {
+		console.log(`  ${f.filename.padEnd(28)}  ${String(f.records).padStart(5)} records  ${(f.bytes / 1024).toFixed(1).padStart(7)} KB`);
+	}
 }

 main().catch((e) => {