|
|
|
|
@@ -1,27 +1,46 @@
|
|
|
|
|
#!/usr/bin/env node
|
|
|
|
|
/**
|
|
|
|
|
* Export the docs as a LoRA-training-ready corpus.
|
|
|
|
|
* Build the LoRA training corpus.
|
|
|
|
|
*
|
|
|
|
|
* node scripts/build-corpus.mjs
|
|
|
|
|
*
|
|
|
|
|
* Outputs four files under dist/corpus/:
|
|
|
|
|
* - chunks.jsonl — raw chunks (one section per line)
|
|
|
|
|
* - instructions.jsonl — instruction/input/output triples
|
|
|
|
|
* - chat.jsonl — sharegpt/chat-format messages
|
|
|
|
|
* - completion.jsonl — prompt/completion pairs (legacy fine-tunes)
|
|
|
|
|
* Sources, in order of priority:
|
|
|
|
|
* 1. scripts/extraction/framework-reference-v2.md (deep, file:line cited)
|
|
|
|
|
* 2. src/content/docs/{en,de,ja,es,fr}/ (the public docs)
|
|
|
|
|
*
|
|
|
|
|
* The instruction text for each chunk is derived from the section heading
|
|
|
|
|
* with a per-language template ("How do I X?", "Wie X?", "X するには?").
|
|
|
|
|
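* Outputs under public/corpus/ (each format below is written as one `<format>-<lang>.jsonl` per language plus `<format>-all.jsonl`):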
|
|
|
|
|
* - chunks.jsonl — raw chunks (one record per source chunk, no Q/A)
|
|
|
|
|
* - instructions.jsonl — instruction/input/output triples (Alpaca-style)
|
|
|
|
|
* - chat.jsonl — sharegpt/messages format (system+user+assistant)
|
|
|
|
|
* - completion.jsonl — prompt/completion pairs (legacy fine-tunes)
|
|
|
|
|
* - manifest.json — size, sha256, record count, sample preview per file
|
|
|
|
|
*
|
|
|
|
|
* Augmentation: per chunk, we emit 3-5 question variants (definition,
|
|
|
|
|
* procedural, topic, plus code-focused and file-pointer when applicable). Code-block recall samples are
|
|
|
|
|
* emitted as additional records for the framework-reference source so the
|
|
|
|
|
* model learns exact framework idioms.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
import fs from 'node:fs';
|
|
|
|
|
import path from 'node:path';
|
|
|
|
|
import crypto from 'node:crypto';
|
|
|
|
|
import { fileURLToPath } from 'node:url';
|
|
|
|
|
import { chunkFile, walkDocs } from './lib/chunk.mjs';
|
|
|
|
|
import { chunkFile, chunkMarkdown, walkDocs } from './lib/chunk.mjs';
|
|
|
|
|
|
|
|
|
|
// Directory containing this script; every path below is resolved against it.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Root of the public documentation tree that gets chunked.
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
// Deep framework reference (single markdown file), ingested alongside the docs.
const REFERENCE_FILE = path.resolve(__dirname, 'extraction/framework-reference-v2.md');
// Optional research-agent augmentation. JSONL — one alpaca-style record
// per line. When present, records are merged into instructions/chat/
// completion outputs alongside the templated ones.
const AUGMENTATION_FILE = path.resolve(__dirname, 'extraction/lora-augmentation.jsonl');
// Write straight into public/corpus/ so Astro serves the files at
// /corpus/<name>.jsonl without a separate copy step. Gitignored.
// NOTE: OUT_DIR must be declared exactly once — a second `const OUT_DIR`
// (the former dist/corpus location) is a redeclaration SyntaxError.
const OUT_DIR = path.resolve(__dirname, '../public/corpus');
|
|
|
|
|
|
|
|
|
|
// =============================================================================
|
|
|
|
|
// System prompts
|
|
|
|
|
// =============================================================================
|
|
|
|
|
|
|
|
|
|
const SYSTEM_PROMPT = {
|
|
|
|
|
en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
|
|
|
|
|
@@ -31,28 +50,316 @@ const SYSTEM_PROMPT = {
|
|
|
|
|
fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
const QUESTION_PREFIX = {
|
|
|
|
|
en: ['How do I', 'What is', 'Explain', 'Show me'],
|
|
|
|
|
de: ['Wie', 'Was ist', 'Erkläre', 'Zeig mir'],
|
|
|
|
|
ja: ['', '', 'について教えてください:', ''],
|
|
|
|
|
es: ['¿Cómo', '¿Qué es', 'Explica', 'Muéstrame'],
|
|
|
|
|
fr: ['Comment', "Qu'est-ce que", 'Explique', 'Montre-moi'],
|
|
|
|
|
// System prompt for records built from the framework-reference source.
// It is stricter than the per-language prompts: that source carries exact
// namespaces, file:line citations and the small idioms the model should
// internalise, so the prompt demands the same precision in answers.
// The sentence fragments are joined with single spaces.
const SYSTEM_PROMPT_REFERENCE = [
  'You are a senior PHP architect and Nibiru framework expert.',
  'Answers must include exact namespaces, file paths with line numbers when available,',
  'and concrete code excerpts. Never say "presumably", "likely", or "appears to"',
  '— if you do not know, say so plainly.',
].join(' ');
|
|
|
|
|
|
|
|
|
|
// =============================================================================
|
|
|
|
|
// Question-variant generation (deterministic, no LLM)
|
|
|
|
|
// =============================================================================
|
|
|
|
|
|
|
|
|
|
// Question templates keyed by language code, then by question kind.
// Placeholders: `{topic}` is the heading as written, `{topic_lc}` is the
// lower-cased heading. One template per kind is chosen deterministically
// for each chunk by the variant generator.
const QUESTION_TEMPLATES = {
  en: {
    definitional: [
      'What is {topic}?',
      'Explain {topic}.',
      'Tell me about {topic}.',
    ],
    procedural: [
      'How do I {topic_lc}?',
      'Show me how to {topic_lc}.',
      'Walk me through {topic_lc}.',
    ],
    topic: [
      '{topic}',
      '{topic} — overview',
      '{topic} in Nibiru',
    ],
    filePointer: [
      'Where is {topic} defined?',
      'Which file contains {topic}?',
    ],
    codeFocused: [
      'Show me the code for {topic}.',
      'Quote the {topic} implementation.',
    ],
  },
  de: {
    definitional: [
      'Was ist {topic}?',
      'Erkläre {topic}.',
      'Was bedeutet {topic}?',
    ],
    procedural: [
      'Wie {topic_lc}?',
      'Zeig mir, wie {topic_lc}.',
      'Wie geht {topic_lc}?',
    ],
    topic: [
      '{topic}',
      '{topic} — Übersicht',
      '{topic} in Nibiru',
    ],
    filePointer: [
      'Wo ist {topic} definiert?',
      'Welche Datei enthält {topic}?',
    ],
    codeFocused: [
      'Zeig mir den Code für {topic}.',
      'Zitiere die {topic}-Implementierung.',
    ],
  },
  ja: {
    definitional: [
      '{topic} とは何ですか?',
      '{topic} について説明してください。',
    ],
    procedural: [
      '{topic} のやり方を教えてください。',
      '{topic} の手順を教えてください。',
    ],
    topic: [
      '{topic}',
      '{topic} — 概要',
    ],
    filePointer: [
      '{topic} はどこで定義されていますか?',
    ],
    codeFocused: [
      '{topic} のコードを見せてください。',
    ],
  },
  es: {
    definitional: [
      '¿Qué es {topic}?',
      'Explica {topic}.',
    ],
    procedural: [
      '¿Cómo {topic_lc}?',
      'Muéstrame cómo {topic_lc}.',
    ],
    topic: [
      '{topic}',
      '{topic} — visión general',
    ],
    filePointer: [
      '¿Dónde se define {topic}?',
    ],
    codeFocused: [
      'Muéstrame el código de {topic}.',
    ],
  },
  fr: {
    definitional: [
      "Qu'est-ce que {topic} ?",
      'Explique {topic}.',
    ],
    procedural: [
      'Comment {topic_lc} ?',
      'Montre-moi comment {topic_lc}.',
    ],
    topic: [
      '{topic}',
      "{topic} — vue d'ensemble",
    ],
    filePointer: [
      'Où est défini {topic} ?',
    ],
    codeFocused: [
      'Montre-moi le code de {topic}.',
    ],
  },
};
|
|
|
|
|
|
|
|
|
|
function questionFor(chunk) {
|
|
|
|
|
const lang = chunk.language || 'en';
|
|
|
|
|
const heading = chunk.sectionTitle || chunk.pageTitle;
|
|
|
|
|
if (lang === 'ja') {
|
|
|
|
|
return `${heading} について教えてください。`;
|
|
|
|
|
}
|
|
|
|
|
const prefixes = QUESTION_PREFIX[lang] || QUESTION_PREFIX.en;
|
|
|
|
|
const prefix = prefixes[heading.length % prefixes.length];
|
|
|
|
|
if (lang === 'es' || lang === 'fr') {
|
|
|
|
|
return `${prefix} ${heading.toLowerCase()} ?`.replace(' ', ' ');
|
|
|
|
|
}
|
|
|
|
|
return `${prefix} ${heading.toLowerCase()}?`;
|
|
|
|
|
/**
 * Deterministically choose one element of `arr` based on `seed`.
 *
 * The first byte of the MD5 digest of `seed` is reduced modulo the array
 * length, so the same seed always selects the same element. That keeps two
 * builds of the corpus identical (needed for reproducible LoRA training runs).
 *
 * @param {Array} arr - Candidates to choose from.
 * @param {string} seed - Value hashed to derive the index.
 * @returns {*} The selected element (`undefined` when `arr` is empty).
 */
function hashPick(arr, seed) {
  const digest = crypto.createHash('md5').update(seed).digest();
  const firstByte = digest.readUInt8(0);
  const index = firstByte % arr.length;
  return arr[index];
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build the deterministic list of question phrasings for one chunk.
 *
 * Three variants are always produced (definitional, procedural, topic). A
 * file-pointer variant is added when the chunk content mentions a source
 * file path, and a code-focused variant when it contains a fenced code
 * block, so a chunk yields between 3 and 5 questions.
 *
 * @param {{id: string, language?: string, sectionTitle?: string,
 *          pageTitle?: string, content: string}} chunk
 * @returns {string[]} Question strings in the chunk's language (English
 *   templates are used when the language has no template set).
 */
function questionVariants(chunk) {
  // Own-property check: the `in` operator would also accept inherited keys
  // such as "constructor" and then pick a non-template object.
  const hasTemplates = Object.prototype.hasOwnProperty.call(QUESTION_TEMPLATES, chunk.language);
  const lang = hasTemplates ? chunk.language : 'en';
  const tpl = QUESTION_TEMPLATES[lang];
  const topic = chunk.sectionTitle || chunk.pageTitle;
  const seed = chunk.id + '|' + topic;

  // Substitute placeholders in a single pass with a function replacer.
  // A string replacement would interpret `$$`, `$&`, `` $` `` and `$'`
  // inside the topic (common in PHP headings) and corrupt the question;
  // a single pass also prevents a placeholder that appears literally in the
  // topic from being expanded a second time.
  const values = { topic, topic_lc: topic.toLowerCase() };
  const fill = (template) =>
    template.replace(/\{(topic|topic_lc)\}/g, (_match, key) => values[key]);

  // One of each base kind, so every chunk gets at least three phrasings.
  const variants = [
    fill(hashPick(tpl.definitional, seed + '|d')),
    fill(hashPick(tpl.procedural, seed + '|p')),
    fill(hashPick(tpl.topic, seed + '|t')),
  ];

  // Add file-pointer / code-focused variants only when the chunk actually
  // references a file path or contains a code block.
  if (/[a-z0-9_/.-]+\.(php|mjs|ts|tsx|astro|css|ini|sql)(:\d+)?/i.test(chunk.content)) {
    variants.push(fill(hashPick(tpl.filePointer, seed + '|f')));
  }
  if (/```/.test(chunk.content)) {
    variants.push(fill(hashPick(tpl.codeFocused, seed + '|c')));
  }
  return variants;
}
|
|
|
|
|
|
|
|
|
|
// =============================================================================
|
|
|
|
|
// Code-block extraction
|
|
|
|
|
// =============================================================================
|
|
|
|
|
|
|
|
|
|
/**
 * Pull out fenced code blocks, each paired with the paragraph that
 * immediately precedes it (used later as the answer's lead-in).
 *
 * A fence is any line starting with three backticks; the text after the
 * opening backticks is taken as the code language. Blocks with fewer than
 * two lines are ignored, and a fence that is never closed yields nothing.
 *
 * @param {{content: string}} chunk
 * @returns {Array<{language: string, leadIn: string, code: string}>}
 *   `language` falls back to 'text'; `leadIn` is capped at 240 characters.
 */
function extractCodeBlockSamples(chunk) {
  const samples = [];
  // Non-null while inside a fence: { language, leadIn, lines }.
  let open = null;
  // Lines of the current prose paragraph (reset on blank lines).
  let paragraph = [];

  // Split on LF or CRLF. With a plain '\n' split, CRLF content keeps a
  // trailing '\r' on every line, the fence pattern below never matches
  // ('.' does not match '\r') and all code blocks are silently lost.
  for (const line of chunk.content.split(/\r?\n/)) {
    const fence = line.match(/^```(.*)$/);
    if (fence) {
      if (open === null) {
        open = {
          language: fence[1].trim(),
          leadIn: paragraph.join(' ').trim().slice(0, 240),
          lines: [],
        };
        paragraph = [];
      } else {
        // Closing fence: keep only blocks with at least two lines.
        if (open.lines.length >= 2) {
          samples.push({
            language: open.language || 'text',
            leadIn: open.leadIn,
            code: open.lines.join('\n'),
          });
        }
        open = null;
      }
      continue;
    }
    if (open !== null) {
      open.lines.push(line);
    } else if (line.trim() === '') {
      paragraph = [];
    } else {
      paragraph.push(line);
    }
  }
  return samples;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build one code-recall question/answer pair for an extracted code block.
 *
 * @param {{id: string, sectionTitle?: string, pageTitle?: string}} chunk
 * @param {{language?: string, leadIn?: string, code: string}} block
 * @param {string} lang - Template language; English is used when unknown.
 * @returns {{question: string, answer: string}}
 */
function codeBlockQA(chunk, block, lang) {
  const tpl = QUESTION_TEMPLATES[lang] || QUESTION_TEMPLATES.en;
  const topic = chunk.sectionTitle || chunk.pageTitle;
  // The seed includes the first 32 characters of the code so different
  // blocks of the same chunk can select different templates.
  const seed = chunk.id + '|code|' + block.code.slice(0, 32);

  // Function replacer: a string replacement would interpret `$$`, `$&`,
  // `` $` `` and `$'` inside the topic (frequent in PHP headings).
  const values = { topic, topic_lc: topic.toLowerCase() };
  const question = hashPick(tpl.codeFocused, seed).replace(
    /\{(topic|topic_lc)\}/g,
    (_match, key) => values[key],
  );

  // Answer = optional lead-in + the code block. Wrap code in fences so the
  // model learns to emit syntactically valid code blocks too.
  const fence = '```' + (block.language || '') + '\n' + block.code + '\n```';
  const answer = block.leadIn ? `${block.leadIn}\n\n${fence}` : fence;
  return { question, answer };
}
|
|
|
|
|
|
|
|
|
|
// =============================================================================
|
|
|
|
|
// Source ingestion
|
|
|
|
|
// =============================================================================
|
|
|
|
|
|
|
|
|
|
/**
 * Chunk every file found under DOCS_DIR and tag each chunk as coming from
 * the public docs.
 *
 * @returns {Array<object>} All chunks, each with `source: 'docs'` added.
 */
function ingestPublicDocs() {
  const tagAsDocs = (chunk) => ({ ...chunk, source: 'docs' });
  return walkDocs(DOCS_DIR)
    .flatMap((file) => chunkFile(file, DOCS_DIR))
    .map(tagAsDocs);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Chunk the framework reference markdown file, if it exists.
 *
 * When REFERENCE_FILE is missing a warning is logged and an empty list is
 * returned, so the build still works with the public docs alone.
 *
 * @returns {Array<object>} Chunks tagged `source: 'framework-reference-v2'`.
 */
function ingestFrameworkReference() {
  const referenceExists = fs.existsSync(REFERENCE_FILE);
  if (!referenceExists) {
    console.warn(`[corpus] no framework-reference-v2 at ${REFERENCE_FILE} — skipping`);
    return [];
  }

  // Fixed page-level metadata handed to the chunker for this single file.
  const pageInfo = {
    language: 'en',
    file: 'framework-reference-v2.md',
    pageTitle: 'Nibiru Framework Reference v2',
    pageDescription: 'Deep technical reference — every public factory, namespace, idiom and gotcha with file:line citations.',
    baseUrl: '/reference/',
  };
  const markdown = fs.readFileSync(REFERENCE_FILE, 'utf8');
  const chunks = chunkMarkdown(markdown, pageInfo);
  return chunks.map((chunk) => ({ ...chunk, source: 'framework-reference-v2' }));
}
|
|
|
|
|
|
|
|
|
|
// =============================================================================
|
|
|
|
|
// Record assembly
|
|
|
|
|
// =============================================================================
|
|
|
|
|
|
|
|
|
|
/**
 * Choose the system prompt for a chunk.
 *
 * Framework-reference chunks always get the stricter reference prompt; all
 * other chunks get the prompt for their language, falling back to English.
 *
 * @param {{source?: string, language?: string}} chunk
 * @returns {string} The system prompt text.
 */
function systemFor(chunk) {
  const isReference = chunk.source === 'framework-reference-v2';
  if (isReference) {
    return SYSTEM_PROMPT_REFERENCE;
  }
  const localized = SYSTEM_PROMPT[chunk.language];
  return localized || SYSTEM_PROMPT.en;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read the optional research-agent augmentation file.
 *
 * Each non-blank line of AUGMENTATION_FILE is parsed as one alpaca-format
 * record `{instruction, input, output, metadata}`. Lines that are not valid
 * JSON, or that lack a truthy `instruction` or `output`, are skipped with a
 * warning that names the real 1-based line number in the file.
 *
 * @returns {Array<object>} Parsed records; `[]` when the file is absent.
 */
function loadAugmentation() {
  if (!fs.existsSync(AUGMENTATION_FILE)) {
    console.log(`[corpus] no augmentation file at ${AUGMENTATION_FILE} — skipping`);
    return [];
  }

  // Keep blank lines in the array so `index + 1` is the true file line
  // number; filtering them out first would shift every reported number.
  // Splitting on CRLF as well avoids treating a lone '\r' as malformed JSON.
  const lines = fs.readFileSync(AUGMENTATION_FILE, 'utf8').split(/\r?\n/);
  const records = [];
  for (const [index, line] of lines.entries()) {
    if (line.trim() === '') continue;
    const lineNumber = index + 1;

    let rec;
    try {
      rec = JSON.parse(line);
    } catch (e) {
      console.warn(`[corpus] skipping malformed augmentation line ${lineNumber}: ${e.message}`);
      continue;
    }

    if (rec?.instruction && rec?.output) {
      records.push(rec);
    } else {
      // Previously dropped without any trace; say so explicitly.
      console.warn(`[corpus] skipping augmentation line ${lineNumber}: missing instruction or output`);
    }
  }
  console.log(`[corpus] loaded ${records.length} augmentation records`);
  return records;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Append one question/answer pair to the three training formats
 * (alpaca-style instruction, chat messages, prompt/completion).
 *
 * @param {{instructionsOut: object[], chatOut: object[], completionOut: object[]}} sinks
 * @param {{system: string, question: string, answer: string,
 *          metadata: object, input?: string}} qa
 */
function appendQaRecord(sinks, { system, question, answer, metadata, input = '' }) {
  sinks.instructionsOut.push({ instruction: question, input, output: answer, metadata });
  sinks.chatOut.push({
    messages: [
      { role: 'system', content: system },
      { role: 'user', content: question },
      { role: 'assistant', content: answer },
    ],
    metadata,
  });
  // Completion records intentionally carry no metadata (prompt/completion only).
  sinks.completionOut.push({
    prompt: `${system}\n\nQuestion: ${question}\n\nAnswer:`,
    completion: ` ${answer}`,
  });
}

/**
 * Turn chunks into the four output record lists.
 *
 * Per chunk: one raw chunk record, one Q/A record per question variant, and
 * (framework-reference chunks only) one code-recall record per extracted
 * code block. Augmentation records, if any, are appended last. The three
 * Q/A lists always have the same length and the same ordering.
 *
 * @param {Array<object>} chunks - Chunks tagged with `source`.
 * @returns {{chunksOut: object[], instructionsOut: object[],
 *            chatOut: object[], completionOut: object[]}}
 */
function buildRecords(chunks) {
  const chunksOut = [];
  const sinks = { instructionsOut: [], chatOut: [], completionOut: [] };

  for (const c of chunks) {
    // 1. Raw chunk record
    chunksOut.push({
      id: c.id,
      source: c.source,
      url: c.url,
      pageTitle: c.pageTitle,
      sectionTitle: c.sectionTitle,
      language: c.language,
      tokens: c.tokens,
      content: c.content,
    });

    // 2. Question-variant records — the whole chunk is the answer.
    const sys = systemFor(c);
    const baseMeta = {
      language: c.language,
      source: c.url,
      page: c.pageTitle,
      origin: c.source,
    };
    for (const q of questionVariants(c)) {
      appendQaRecord(sinks, {
        system: sys,
        question: q,
        answer: c.content,
        metadata: { ...baseMeta },
      });
    }

    // 3. Code-block recall samples — only for the framework reference,
    //    where the code is gold (file:line cited, framework-canonical).
    if (c.source === 'framework-reference-v2') {
      for (const b of extractCodeBlockSamples(c)) {
        const { question, answer } = codeBlockQA(c, b, c.language);
        appendQaRecord(sinks, {
          system: sys,
          question,
          answer,
          metadata: { ...baseMeta, codeLanguage: b.language, kind: 'code-recall' },
        });
      }
    }
  }

  // Merge research-agent augmentation. Each input record is alpaca-style;
  // we fan it out into instructions / chat / completion to match the rest.
  const augmentation = loadAugmentation();
  for (const a of augmentation) {
    appendQaRecord(sinks, {
      system: SYSTEM_PROMPT_REFERENCE, // augmentation is always English, framework-grade
      question: a.instruction,
      answer: a.output,
      metadata: { ...(a.metadata || {}), origin: 'lora-augmentation' },
      input: a.input || '',
    });
  }
  if (augmentation.length) {
    console.log(`[corpus] merged ${augmentation.length} augmentation records into instructions/chat/completion`);
  }

  return { chunksOut, ...sinks };
}
|
|
|
|
|
|
|
|
|
|
// =============================================================================
|
|
|
|
|
// IO + manifest
|
|
|
|
|
// =============================================================================
|
|
|
|
|
|
|
|
|
|
/**
 * Create directory `d`, including any missing parent directories.
 * Does nothing when the directory already exists.
 *
 * @param {string} d - Directory path to create.
 */
function ensureDir(d) {
  const options = { recursive: true };
  fs.mkdirSync(d, options);
}
|
|
|
|
|
@@ -65,61 +372,173 @@ function writeJsonl(filePath, items) {
|
|
|
|
|
return new Promise((res) => stream.on('close', res));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function main() {
|
|
|
|
|
console.log(`Walking ${DOCS_DIR}…`);
|
|
|
|
|
const files = walkDocs(DOCS_DIR);
|
|
|
|
|
const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
|
|
|
|
|
console.log(`Produced ${chunks.length} chunks across ${files.length} files.`);
|
|
|
|
|
|
|
|
|
|
const chunksOut = chunks.map((c) => ({
|
|
|
|
|
id: c.id,
|
|
|
|
|
url: c.url,
|
|
|
|
|
pageTitle: c.pageTitle,
|
|
|
|
|
sectionTitle: c.sectionTitle,
|
|
|
|
|
language: c.language,
|
|
|
|
|
tokens: c.tokens,
|
|
|
|
|
content: c.content,
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
const instructionsOut = chunks.map((c) => ({
|
|
|
|
|
instruction: questionFor(c),
|
|
|
|
|
input: '',
|
|
|
|
|
output: c.content,
|
|
|
|
|
metadata: { language: c.language, source: c.url, page: c.pageTitle },
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
const chatOut = chunks.map((c) => ({
|
|
|
|
|
messages: [
|
|
|
|
|
{ role: 'system', content: SYSTEM_PROMPT[c.language] || SYSTEM_PROMPT.en },
|
|
|
|
|
{ role: 'user', content: questionFor(c) },
|
|
|
|
|
{ role: 'assistant', content: c.content },
|
|
|
|
|
],
|
|
|
|
|
metadata: { language: c.language, source: c.url, page: c.pageTitle },
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
const completionOut = chunks.map((c) => ({
|
|
|
|
|
prompt: `${SYSTEM_PROMPT[c.language] || SYSTEM_PROMPT.en}\n\nQuestion: ${questionFor(c)}\n\nAnswer:`,
|
|
|
|
|
completion: ' ' + c.content,
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
await writeJsonl(path.join(OUT_DIR, 'chunks.jsonl'), chunksOut);
|
|
|
|
|
await writeJsonl(path.join(OUT_DIR, 'instructions.jsonl'), instructionsOut);
|
|
|
|
|
await writeJsonl(path.join(OUT_DIR, 'chat.jsonl'), chatOut);
|
|
|
|
|
await writeJsonl(path.join(OUT_DIR, 'completion.jsonl'), completionOut);
|
|
|
|
|
|
|
|
|
|
const stats = {
|
|
|
|
|
generatedAt: new Date().toISOString(),
|
|
|
|
|
fileCount: files.length,
|
|
|
|
|
chunkCount: chunks.length,
|
|
|
|
|
byLanguage: chunks.reduce((acc, c) => {
|
|
|
|
|
acc[c.language] = (acc[c.language] || 0) + 1;
|
|
|
|
|
return acc;
|
|
|
|
|
}, {}),
|
|
|
|
|
/**
 * Compute the size and SHA-256 digest of a file for the manifest.
 *
 * The file is read fully into memory once and both values are derived from
 * that buffer. (A stale `stats.json` write that sat after the `return`
 * was unreachable and referenced an undefined `stats`; it has been removed.)
 *
 * @param {string} filePath - File to inspect.
 * @returns {{bytes: number, sha256: string}} Byte length and hex digest.
 */
function fileStats(filePath) {
  const buf = fs.readFileSync(filePath);
  return {
    bytes: buf.length,
    sha256: crypto.createHash('sha256').update(buf).digest('hex'),
  };
}
|
|
|
|
|
|
|
|
|
|
console.log(`Wrote 4 JSONL files + stats.json to ${OUT_DIR}`);
|
|
|
|
|
console.log(JSON.stringify(stats, null, 2));
|
|
|
|
|
/**
 * Return the first line of a file that is not blank, for use as a sample
 * preview. Lines longer than 800 characters are cut to 800 and suffixed
 * with an ellipsis; a file with no non-blank line yields ''.
 *
 * @param {string} filePath - Text file (read as UTF-8).
 * @returns {string} The preview line.
 */
function firstNonEmptyLine(filePath) {
  const previewLimit = 800;
  let preview = '';
  for (const candidate of fs.readFileSync(filePath, 'utf8').split('\n')) {
    if (candidate.trim().length > 0) {
      preview = candidate;
      break;
    }
  }
  if (preview.length <= previewLimit) {
    return preview;
  }
  return preview.slice(0, previewLimit) + '…';
}
|
|
|
|
|
|
|
|
|
|
// =============================================================================
|
|
|
|
|
// Main
|
|
|
|
|
// =============================================================================
|
|
|
|
|
|
|
|
|
|
// Supported languages as { code, label } records. The list is used both to
// bucket records into per-language files and to label those files.
// Order matters: English comes first (the framework reference is
// English-only and lands in the `en` bucket), followed by the localised docs.
const LANGUAGES = [
  ['en', 'English'],
  ['de', 'Deutsch'],
  ['ja', '日本語'],
  ['es', 'Español'],
  ['fr', 'Français'],
].map(([code, label]) => ({ code, label }));
|
|
|
|
|
|
|
|
|
|
/**
 * Group records by language.
 *
 * The map is pre-seeded with an empty bucket for every entry of LANGUAGES
 * (in that order); a record whose language is not listed gets a new bucket
 * appended on first sight. Records for which `getLang` returns a falsy
 * value are placed in the 'en' bucket.
 *
 * @param {Array<object>} records - Records to group.
 * @param {(record: object) => (string|undefined)} getLang - Reads the
 *   language from a record (`language` for raw chunks, `metadata.language`
 *   for alpaca/chat records).
 * @returns {Map<string, object[]>} Language code → records, input order kept.
 */
function bucketByLanguage(records, getLang) {
  const map = new Map(LANGUAGES.map((lang) => [lang.code, []]));
  for (const r of records) {
    const lang = getLang(r) || 'en';
    // Explicit get-or-create instead of a comma-operator side effect.
    let bucket = map.get(lang);
    if (bucket === undefined) {
      bucket = [];
      map.set(lang, bucket);
    }
    bucket.push(r);
  }
  return map;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build the whole corpus: ingest both sources, assemble the records, write
 * one JSONL file per format and language (plus a combined `-all` file per
 * format) into OUT_DIR, then write `manifest.json` describing every file.
 *
 * Existing `.json` / `.jsonl` files in OUT_DIR are deleted first so stale
 * per-language files from an earlier run cannot linger.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(`[corpus] DOCS_DIR=${DOCS_DIR}`);
  console.log(`[corpus] REFERENCE=${REFERENCE_FILE}`);
  console.log(`[corpus] OUT_DIR=${OUT_DIR}`);

  const docsChunks = ingestPublicDocs();
  const refChunks = ingestFrameworkReference();
  const chunks = [...refChunks, ...docsChunks]; // reference first → priority

  console.log(`[corpus] ingested ${refChunks.length} reference chunks + ${docsChunks.length} docs chunks`);

  const { chunksOut, instructionsOut, chatOut, completionOut } = buildRecords(chunks);
  console.log(`[corpus] records: chunks=${chunksOut.length} instructions=${instructionsOut.length} chat=${chatOut.length} completion=${completionOut.length}`);

  ensureDir(OUT_DIR);
  // Wipe any leftover files from a previous run so stale per-language
  // buckets don't linger.
  for (const f of fs.readdirSync(OUT_DIR)) {
    if (/\.(jsonl|json)$/.test(f)) fs.unlinkSync(path.join(OUT_DIR, f));
  }

  // `completion` records don't carry metadata (prompt/completion-only), so
  // they cannot be bucketed on their own. instructionsOut has the same
  // length and ordering, so the language is taken from the instruction
  // record at the same index. This is computed once; bucketing the
  // completion records directly would put everything into `en` and the
  // result would only be thrown away.
  const completionBuckets = new Map(LANGUAGES.map((lang) => [lang.code, []]));
  for (const [i, instruction] of instructionsOut.entries()) {
    const lang = instruction.metadata?.language || 'en';
    // Languages outside LANGUAGES have no per-language file, so their
    // records appear only in the combined `-all` file.
    completionBuckets.get(lang)?.push(completionOut[i]);
  }

  // Per-language buckets. Each format gets one file per language plus a
  // combined `*-all.jsonl` for callers who want everything.
  const buckets = {
    chunks: bucketByLanguage(chunksOut, (r) => r.language),
    instructions: bucketByLanguage(instructionsOut, (r) => r.metadata?.language),
    chat: bucketByLanguage(chatOut, (r) => r.metadata?.language),
    completion: completionBuckets,
  };

  // Writes every non-empty language bucket of one format plus its combined
  // file, and returns the manifest entries for the files written.
  const writeBucketed = async (formatName, bucketMap, allRecords) => {
    const out = [];
    // Per-language files
    for (const lang of LANGUAGES) {
      const records = bucketMap.get(lang.code) || [];
      if (records.length === 0) continue;
      const filename = `${formatName}-${lang.code}.jsonl`;
      await writeJsonl(path.join(OUT_DIR, filename), records);
      out.push({
        format: formatName,
        language: lang.code,
        languageLabel: lang.label,
        filename,
        records: records.length,
      });
    }
    // Combined all-language file
    const allFilename = `${formatName}-all.jsonl`;
    await writeJsonl(path.join(OUT_DIR, allFilename), allRecords);
    out.push({
      format: formatName,
      language: 'all',
      languageLabel: 'All languages',
      filename: allFilename,
      records: allRecords.length,
    });
    return out;
  };

  // Formats are written one after another, in manifest order.
  const allFileMeta = [
    ...await writeBucketed('chunks', buckets.chunks, chunksOut),
    ...await writeBucketed('instructions', buckets.instructions, instructionsOut),
    ...await writeBucketed('chat', buckets.chat, chatOut),
    ...await writeBucketed('completion', buckets.completion, completionOut),
  ];

  // Per-language and per-source breakdown of the chunks (handy for inspection).
  const byLanguage = chunks.reduce((acc, c) => {
    acc[c.language] = (acc[c.language] || 0) + 1;
    return acc;
  }, {});
  const bySource = chunks.reduce((acc, c) => {
    acc[c.source] = (acc[c.source] || 0) + 1;
    return acc;
  }, {});

  // Hash + size + preview for every file written.
  const filesEnriched = allFileMeta.map((f) => {
    const fp = path.join(OUT_DIR, f.filename);
    const st = fileStats(fp);
    return {
      ...f,
      bytes: st.bytes,
      sha256: st.sha256,
      samplePreview: firstNonEmptyLine(fp),
    };
  });

  const manifest = {
    generatedAt: new Date().toISOString(),
    generator: {
      script: 'scripts/build-corpus.mjs',
      node: process.version,
    },
    encoding: 'utf-8',
    sources: {
      'framework-reference-v2.md': refChunks.length,
      'src/content/docs/': docsChunks.length,
    },
    chunkCount: chunks.length,
    byLanguage,
    bySource,
    languages: LANGUAGES,
    formats: ['chunks', 'instructions', 'chat', 'completion'],
    files: filesEnriched,
  };
  fs.writeFileSync(path.join(OUT_DIR, 'manifest.json'), JSON.stringify(manifest, null, 2));

  console.log('[corpus] done — wrote', filesEnriched.length, 'files');
  console.log('[corpus] per-format/per-language summary:');
  for (const f of filesEnriched) {
    console.log(` ${f.filename.padEnd(28)} ${String(f.records).padStart(5)} records ${(f.bytes / 1024).toFixed(1).padStart(7)} KB`);
  }
}
|
|
|
|
|
|
|
|
|
|
main().catch((e) => {
|
|
|
|
|
|