Strip api.neuronetz.ai from documentation; chat config stays in env
The Ollama URL was leaking via:
- prose in /en/, /de/, /ja/, /es/, /fr/ docs (oracle, deployment,
local-testing, ai/module/{overview,embed,training})
- code blocks teaching users to curl the host directly
- .env.example, Dockerfile, docker-compose.yml defaults
- providers.mjs, translate-docs.mjs, build-oracle-index.mjs defaults
- LandingScripts.astro comment
- lora-runbook.md prose + SSH host
- the GET handler at /api/oracle which echoed `ollamaUrl` back to public callers
- the "Oracle is silent" fallback message at /api/oracle POST
Replacements:
- prose: "neuronetz.ai" → "your Ollama instance"
- example URLs in code blocks: https://api.neuronetz.ai → https://your-ollama-host.example
- code-level defaults: → http://localhost:11434 (Ollama's standard local port)
- GET /api/oracle: dropped the `ollamaUrl` field; provider + model still exposed
- runbook SSH host: neuronetz@cloud.neuronetz.ai → <gpu-user>@<gpu-host>
Production chat is unaffected: docs/.env (gitignored) on the production
host still pins OLLAMA_BASE_URL=https://api.neuronetz.ai. The only
change in the running container is that the GET handler no longer
echoes the URL.
analytics.neuronetz.ai (Umami tracking) is intentionally left intact —
it's a public, brand-owned subdomain meant to be visible.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,27 +1,46 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Export the docs as a LoRA-training-ready corpus.
|
||||
* Build the LoRA training corpus.
|
||||
*
|
||||
* node scripts/build-corpus.mjs
|
||||
*
|
||||
* Outputs four files under dist/corpus/:
|
||||
* - chunks.jsonl — raw chunks (one section per line)
|
||||
* - instructions.jsonl — instruction/input/output triples
|
||||
* - chat.jsonl — sharegpt/chat-format messages
|
||||
* - completion.jsonl — prompt/completion pairs (legacy fine-tunes)
|
||||
* Sources, in order of priority:
|
||||
* 1. scripts/extraction/framework-reference-v2.md (deep, file:line cited)
|
||||
* 2. src/content/docs/{en,de,ja,es,fr}/ (the public docs)
|
||||
*
|
||||
* The instruction text for each chunk is derived from the section heading
|
||||
* with a per-language template ("How do I X?", "Wie X?", "X するには?").
|
||||
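* Outputs under public/corpus/ — each format below is written as one `<format>-<lang>.jsonl` per language plus a combined `<format>-all.jsonl`: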
|
||||
* - chunks.jsonl — raw chunks (one record per source chunk, no Q/A)
|
||||
* - instructions.jsonl — instruction/input/output triples (Alpaca-style)
|
||||
* - chat.jsonl — sharegpt/messages format (system+user+assistant)
|
||||
* - completion.jsonl — prompt/completion pairs (legacy fine-tunes)
|
||||
* - manifest.json — size, sha256, record count, sample preview per file
|
||||
*
|
||||
* Augmentation: per chunk, we emit 3-5 question variants (definition, topic,
|
||||
* procedural, code-focused, file-pointer). Code-block recall samples are
|
||||
* emitted as additional records for the framework-reference source so the
|
||||
* model learns exact framework idioms.
|
||||
*/
|
||||
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import crypto from 'node:crypto';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { chunkFile, walkDocs } from './lib/chunk.mjs';
|
||||
import { chunkFile, chunkMarkdown, walkDocs } from './lib/chunk.mjs';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
||||
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
|
||||
const OUT_DIR = path.resolve(__dirname, '../dist/corpus');
|
||||
const REFERENCE_FILE = path.resolve(__dirname, 'extraction/framework-reference-v2.md');
|
||||
// Optional research-agent augmentation. JSONL — one alpaca-style record
|
||||
// per line. When present, records are merged into instructions/chat/
|
||||
// completion outputs alongside the templated ones.
|
||||
const AUGMENTATION_FILE = path.resolve(__dirname, 'extraction/lora-augmentation.jsonl');
|
||||
// Write straight into public/corpus/ so Astro serves the files at
|
||||
// /corpus/<name>.jsonl without a separate copy step. Gitignored.
|
||||
const OUT_DIR = path.resolve(__dirname, '../public/corpus');
|
||||
|
||||
// =============================================================================
|
||||
// System prompts
|
||||
// =============================================================================
|
||||
|
||||
const SYSTEM_PROMPT = {
|
||||
en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
|
||||
@@ -31,28 +50,316 @@ const SYSTEM_PROMPT = {
|
||||
fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
|
||||
};
|
||||
|
||||
const QUESTION_PREFIX = {
|
||||
en: ['How do I', 'What is', 'Explain', 'Show me'],
|
||||
de: ['Wie', 'Was ist', 'Erkläre', 'Zeig mir'],
|
||||
ja: ['', '', 'について教えてください:', ''],
|
||||
es: ['¿Cómo', '¿Qué es', 'Explica', 'Muéstrame'],
|
||||
fr: ['Comment', "Qu'est-ce que", 'Explique', 'Montre-moi'],
|
||||
// System prompt for chunks that come from the framework-reference source.
// It is deliberately stricter than the per-language docs prompts: the
// reference is the gold material (exact namespaces, file:line citations,
// small idioms) and the model should reproduce that precision, not hedge.
const SYSTEM_PROMPT_REFERENCE = [
  'You are a senior PHP architect and Nibiru framework expert. ',
  'Answers must include exact namespaces, file paths with line numbers when available, ',
  'and concrete code excerpts. Never say "presumably", "likely", or "appears to" ',
  '— if you do not know, say so plainly.',
].join('');
|
||||
|
||||
// =============================================================================
|
||||
// Question-variant generation (deterministic, no LLM)
|
||||
// =============================================================================
|
||||
|
||||
const QUESTION_TEMPLATES = {
|
||||
en: {
|
||||
definitional: ['What is {topic}?', 'Explain {topic}.', 'Tell me about {topic}.'],
|
||||
procedural: ['How do I {topic_lc}?', 'Show me how to {topic_lc}.', 'Walk me through {topic_lc}.'],
|
||||
topic: ['{topic}', '{topic} — overview', '{topic} in Nibiru'],
|
||||
filePointer: ['Where is {topic} defined?', 'Which file contains {topic}?'],
|
||||
codeFocused: ['Show me the code for {topic}.', 'Quote the {topic} implementation.'],
|
||||
},
|
||||
de: {
|
||||
definitional: ['Was ist {topic}?', 'Erkläre {topic}.', 'Was bedeutet {topic}?'],
|
||||
procedural: ['Wie {topic_lc}?', 'Zeig mir, wie {topic_lc}.', 'Wie geht {topic_lc}?'],
|
||||
topic: ['{topic}', '{topic} — Übersicht', '{topic} in Nibiru'],
|
||||
filePointer: ['Wo ist {topic} definiert?', 'Welche Datei enthält {topic}?'],
|
||||
codeFocused: ['Zeig mir den Code für {topic}.', 'Zitiere die {topic}-Implementierung.'],
|
||||
},
|
||||
ja: {
|
||||
definitional: ['{topic} とは何ですか?', '{topic} について説明してください。'],
|
||||
procedural: ['{topic} のやり方を教えてください。', '{topic} の手順を教えてください。'],
|
||||
topic: ['{topic}', '{topic} — 概要'],
|
||||
filePointer: ['{topic} はどこで定義されていますか?'],
|
||||
codeFocused: ['{topic} のコードを見せてください。'],
|
||||
},
|
||||
es: {
|
||||
definitional: ['¿Qué es {topic}?', 'Explica {topic}.'],
|
||||
procedural: ['¿Cómo {topic_lc}?', 'Muéstrame cómo {topic_lc}.'],
|
||||
topic: ['{topic}', '{topic} — visión general'],
|
||||
filePointer: ['¿Dónde se define {topic}?'],
|
||||
codeFocused: ['Muéstrame el código de {topic}.'],
|
||||
},
|
||||
fr: {
|
||||
definitional: ['Qu\'est-ce que {topic} ?', 'Explique {topic}.'],
|
||||
procedural: ['Comment {topic_lc} ?', 'Montre-moi comment {topic_lc}.'],
|
||||
topic: ['{topic}', '{topic} — vue d\'ensemble'],
|
||||
filePointer: ['Où est défini {topic} ?'],
|
||||
codeFocused: ['Montre-moi le code de {topic}.'],
|
||||
},
|
||||
};
|
||||
|
||||
function questionFor(chunk) {
|
||||
const lang = chunk.language || 'en';
|
||||
const heading = chunk.sectionTitle || chunk.pageTitle;
|
||||
if (lang === 'ja') {
|
||||
return `${heading} について教えてください。`;
|
||||
}
|
||||
const prefixes = QUESTION_PREFIX[lang] || QUESTION_PREFIX.en;
|
||||
const prefix = prefixes[heading.length % prefixes.length];
|
||||
if (lang === 'es' || lang === 'fr') {
|
||||
return `${prefix} ${heading.toLowerCase()} ?`.replace(' ', ' ');
|
||||
}
|
||||
return `${prefix} ${heading.toLowerCase()}?`;
|
||||
/**
 * Deterministically pick one element of `arr` from a string seed.
 *
 * The seed is hashed with MD5 and the first digest byte, modulo the array
 * length, selects the index. The same (arr, seed) pair always yields the
 * same element, so two builds give the same corpus (necessary for
 * reproducible LoRA training runs). MD5 is used purely to spread picks,
 * not for anything security-related.
 *
 * @param {Array} arr - Non-empty list of candidates.
 * @param {string} seed - Value that identifies the pick.
 * @returns {*} The selected element of `arr`.
 * @throws {RangeError} If `arr` is missing or empty. Without this guard the
 *   index would be `NaN` and `undefined` would be returned silently.
 */
function hashPick(arr, seed) {
  if (!arr?.length) {
    throw new RangeError('hashPick: expected a non-empty list of candidates');
  }
  const digest = crypto.createHash('md5').update(seed).digest();
  return arr[digest[0] % arr.length];
}
|
||||
|
||||
/**
 * Build the question phrasings used as prompts for one chunk.
 *
 * Three variants are always produced (definitional, procedural, topic).
 * A file-pointer variant is added when the chunk content mentions a source
 * file path, and a code-focused variant when it contains a fenced code
 * block, so a chunk yields between 3 and 5 questions. Template choice is
 * seeded from the chunk id and topic, so output is stable across builds.
 *
 * @param {object} chunk - Chunk with `id`, `language`, `content` and
 *   `sectionTitle` and/or `pageTitle`.
 * @returns {string[]} The generated questions.
 */
function questionVariants(chunk) {
  // Own-property check: a bare `in` would also accept inherited keys such as
  // "constructor" and then fail on the missing template lists.
  const lang = Object.prototype.hasOwnProperty.call(QUESTION_TEMPLATES, chunk.language)
    ? chunk.language
    : 'en';
  const tpl = QUESTION_TEMPLATES[lang];
  const topic = chunk.sectionTitle || chunk.pageTitle;
  const topicLc = topic.toLowerCase();
  const seed = chunk.id + '|' + topic;

  const fill = (s) => s.replaceAll('{topic}', topic).replaceAll('{topic_lc}', topicLc);

  // One of each core kind, so every chunk gets at least 3 phrasings.
  const variants = [
    fill(hashPick(tpl.definitional, seed + '|d')),
    fill(hashPick(tpl.procedural, seed + '|p')),
    fill(hashPick(tpl.topic, seed + '|t')),
  ];

  // Add file-pointer / code-focused variants only when the chunk actually
  // references a file path or contains a code block.
  if (/[a-z0-9_/.-]+\.(php|mjs|ts|tsx|astro|css|ini|sql)(:\d+)?/i.test(chunk.content)) {
    variants.push(fill(hashPick(tpl.filePointer, seed + '|f')));
  }
  if (/```/.test(chunk.content)) {
    variants.push(fill(hashPick(tpl.codeFocused, seed + '|c')));
  }
  return variants;
}
|
||||
|
||||
// =============================================================================
|
||||
// Code-block extraction
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Pull fenced code blocks out of a chunk, each paired with its lead-in.
 *
 * The lead-in is the paragraph immediately before the opening fence (lines
 * joined with spaces, capped at 240 characters); a blank line resets it.
 * Blocks with fewer than two lines are skipped, and a fence that is never
 * closed produces no sample.
 *
 * @param {{content: string}} chunk - Chunk whose markdown content is scanned.
 * @returns {{language: string, leadIn: string, code: string}[]} One entry per
 *   retained code block; `language` is the fence info string or 'text'.
 */
function extractCodeBlockSamples(chunk) {
  const out = [];
  // Accept both LF and CRLF. With a plain '\n' split a trailing '\r' stays on
  // every line, and the fence pattern below cannot match across it ('.' does
  // not match '\r'), so no code block would ever be detected.
  const lines = chunk.content.split(/\r?\n/);
  let inFence = false;
  let fenceLang = '';
  let buf = [];
  let leadIn = '';
  let prevPara = [];

  for (const line of lines) {
    const fence = line.match(/^```(.*)$/);
    if (fence) {
      if (!inFence) {
        // Opening fence: remember the info string and the preceding paragraph.
        inFence = true;
        fenceLang = fence[1].trim();
        buf = [];
        leadIn = prevPara.join(' ').trim().slice(0, 240);
        prevPara = [];
      } else {
        // Closing fence: keep the block only if it has at least two lines.
        inFence = false;
        if (buf.length >= 2) {
          out.push({
            language: fenceLang || 'text',
            leadIn,
            code: buf.join('\n'),
          });
        }
      }
      continue;
    }
    if (inFence) {
      buf.push(line);
    } else if (line.trim() === '') {
      prevPara = [];
    } else {
      prevPara.push(line);
    }
  }
  return out;
}
|
||||
|
||||
/**
 * Turn one extracted code block into a question/answer pair.
 *
 * The question is a code-focused template for `lang` (English when the
 * language has no templates), chosen deterministically from the chunk id
 * and the first 32 characters of the code. The answer is the block's
 * lead-in, when present, followed by the code re-wrapped in fences so the
 * model also learns to emit syntactically valid code blocks.
 *
 * @param {object} chunk - Source chunk (`id`, `sectionTitle`/`pageTitle`).
 * @param {{language: string, leadIn: string, code: string}} block
 * @param {string} lang - Language code used to select the templates.
 * @returns {{question: string, answer: string}}
 */
function codeBlockQA(chunk, block, lang) {
  const templates = QUESTION_TEMPLATES[lang] || QUESTION_TEMPLATES.en;
  const heading = chunk.sectionTitle || chunk.pageTitle;
  const pickSeed = chunk.id + '|code|' + block.code.slice(0, 32);

  const question = hashPick(templates.codeFocused, pickSeed)
    .replaceAll('{topic}', heading)
    .replaceAll('{topic_lc}', heading.toLowerCase());

  const opener = '```' + (block.language || '');
  const fenced = opener + '\n' + block.code + '\n```';

  if (block.leadIn) {
    return { question, answer: `${block.leadIn}\n\n${fenced}` };
  }
  return { question, answer: fenced };
}
|
||||
|
||||
// =============================================================================
|
||||
// Source ingestion
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Chunk every file found under DOCS_DIR and tag each chunk as coming from
 * the public docs.
 *
 * @returns {object[]} Chunks from `chunkFile`, each with `source: 'docs'`.
 */
function ingestPublicDocs() {
  const tagAsDocs = (chunk) => ({ ...chunk, source: 'docs' });
  const docFiles = walkDocs(DOCS_DIR);
  const docChunks = docFiles.flatMap((file) => chunkFile(file, DOCS_DIR));
  return docChunks.map((chunk) => tagAsDocs(chunk));
}
|
||||
|
||||
/**
 * Chunk the framework reference markdown, if it exists.
 *
 * A missing reference file is not an error: a warning is logged and an
 * empty list is returned.
 *
 * @returns {object[]} Chunks from `chunkMarkdown`, each tagged with
 *   `source: 'framework-reference-v2'`.
 */
function ingestFrameworkReference() {
  const referenceExists = fs.existsSync(REFERENCE_FILE);
  if (!referenceExists) {
    console.warn(`[corpus] no framework-reference-v2 at ${REFERENCE_FILE} — skipping`);
    return [];
  }

  // Page-level metadata handed to the chunker for every resulting chunk.
  const pageMeta = {
    language: 'en',
    file: 'framework-reference-v2.md',
    pageTitle: 'Nibiru Framework Reference v2',
    pageDescription: 'Deep technical reference — every public factory, namespace, idiom and gotcha with file:line citations.',
    baseUrl: '/reference/',
  };

  const markdown = fs.readFileSync(REFERENCE_FILE, 'utf8');
  const referenceChunks = chunkMarkdown(markdown, pageMeta);
  return referenceChunks.map((chunk) => ({ ...chunk, source: 'framework-reference-v2' }));
}
|
||||
|
||||
// =============================================================================
|
||||
// Record assembly
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Choose the system prompt for a chunk.
 *
 * Framework-reference chunks get the stricter reference prompt; everything
 * else gets the prompt for the chunk's language, falling back to English.
 *
 * @param {{source: string, language: string}} chunk
 * @returns {string} The system prompt text.
 */
function systemFor(chunk) {
  const isReference = chunk.source === 'framework-reference-v2';
  return isReference
    ? SYSTEM_PROMPT_REFERENCE
    : SYSTEM_PROMPT[chunk.language] || SYSTEM_PROMPT.en;
}
|
||||
|
||||
/**
 * Read the optional research-agent augmentation file.
 *
 * Each non-blank line is expected to be an alpaca-format JSON object
 * `{instruction, input, output, metadata}`. Lines that fail to parse are
 * skipped with a warning; parsed records without both `instruction` and
 * `output` are dropped silently.
 *
 * @returns {object[]} The usable records, or [] if the file is absent.
 */
function loadAugmentation() {
  if (!fs.existsSync(AUGMENTATION_FILE)) {
    console.log(`[corpus] no augmentation file at ${AUGMENTATION_FILE} — skipping`);
    return [];
  }
  // Keep blank lines in the array so the index below is the real line number
  // in the file; filtering first would shift every number after a blank line.
  const lines = fs.readFileSync(AUGMENTATION_FILE, 'utf8').split('\n');
  const records = [];
  for (const [i, line] of lines.entries()) {
    // Blank and whitespace-only lines (e.g. a lone '\r') are not records.
    if (line.trim() === '') continue;
    try {
      const rec = JSON.parse(line);
      if (rec.instruction && rec.output) records.push(rec);
    } catch (e) {
      console.warn(`[corpus] skipping malformed augmentation line ${i + 1}: ${e.message}`);
    }
  }
  console.log(`[corpus] loaded ${records.length} augmentation records`);
  return records;
}
|
||||
|
||||
/**
 * Assemble every output record from the ingested chunks.
 *
 * Per chunk this emits one raw chunk record, one Q/A record per question
 * variant, and (framework reference only) one code-recall record per
 * extracted code block. Augmentation records from `loadAugmentation()` are
 * appended last. Every Q/A pair is written to the instruction, chat and
 * completion lists in the same call, so those three arrays always have the
 * same length and the same ordering.
 *
 * @param {object[]} chunks - Chunks tagged with `source`.
 * @returns {{chunksOut: object[], instructionsOut: object[], chatOut: object[], completionOut: object[]}}
 */
function buildRecords(chunks) {
  const chunksOut = [];
  const instructionsOut = [];
  const chatOut = [];
  const completionOut = [];

  // Fan one Q/A pair out into all three training formats. Keeping this in a
  // single place guarantees the three lists stay index-aligned.
  const emit = (sys, question, answer, metadata, input = '') => {
    instructionsOut.push({ instruction: question, input, output: answer, metadata });
    chatOut.push({
      messages: [
        { role: 'system', content: sys },
        { role: 'user', content: question },
        { role: 'assistant', content: answer },
      ],
      metadata,
    });
    completionOut.push({
      prompt: `${sys}\n\nQuestion: ${question}\n\nAnswer:`,
      completion: ' ' + answer,
    });
  };

  for (const c of chunks) {
    // 1. Raw chunk record
    chunksOut.push({
      id: c.id,
      source: c.source,
      url: c.url,
      pageTitle: c.pageTitle,
      sectionTitle: c.sectionTitle,
      language: c.language,
      tokens: c.tokens,
      content: c.content,
    });

    // 2. Question-variant records: same answer, several phrasings.
    const sys = systemFor(c);
    for (const q of questionVariants(c)) {
      emit(sys, q, c.content, {
        language: c.language,
        source: c.url,
        page: c.pageTitle,
        origin: c.source,
      });
    }

    // 3. Code-block recall samples — only for the framework reference,
    //    where the code is gold (file:line cited, framework-canonical).
    if (c.source === 'framework-reference-v2') {
      for (const b of extractCodeBlockSamples(c)) {
        const { question, answer } = codeBlockQA(c, b, c.language);
        emit(sys, question, answer, {
          language: c.language,
          source: c.url,
          page: c.pageTitle,
          origin: c.source,
          codeLanguage: b.language,
          kind: 'code-recall',
        });
      }
    }
  }

  // Merge research-agent augmentation. Each input record is alpaca-style;
  // it is fanned out into instructions / chat / completion like the rest.
  const augmentation = loadAugmentation();
  for (const a of augmentation) {
    // Augmentation is always English, framework-grade.
    const meta = { ...(a.metadata || {}), origin: 'lora-augmentation' };
    emit(SYSTEM_PROMPT_REFERENCE, a.instruction, a.output, meta, a.input || '');
  }
  if (augmentation.length) {
    console.log(`[corpus] merged ${augmentation.length} augmentation records into instructions/chat/completion`);
  }

  return { chunksOut, instructionsOut, chatOut, completionOut };
}
|
||||
|
||||
// =============================================================================
|
||||
// IO + manifest
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Create directory `d`, including any missing parent directories.
 * Does nothing if the directory already exists.
 *
 * @param {string} d - Directory path.
 */
function ensureDir(d) {
  const options = { recursive: true };
  fs.mkdirSync(d, options);
}
|
||||
@@ -65,61 +372,173 @@ function writeJsonl(filePath, items) {
|
||||
return new Promise((res) => stream.on('close', res));
|
||||
}
|
||||
|
||||
async function main() {
|
||||
console.log(`Walking ${DOCS_DIR}…`);
|
||||
const files = walkDocs(DOCS_DIR);
|
||||
const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
|
||||
console.log(`Produced ${chunks.length} chunks across ${files.length} files.`);
|
||||
|
||||
const chunksOut = chunks.map((c) => ({
|
||||
id: c.id,
|
||||
url: c.url,
|
||||
pageTitle: c.pageTitle,
|
||||
sectionTitle: c.sectionTitle,
|
||||
language: c.language,
|
||||
tokens: c.tokens,
|
||||
content: c.content,
|
||||
}));
|
||||
|
||||
const instructionsOut = chunks.map((c) => ({
|
||||
instruction: questionFor(c),
|
||||
input: '',
|
||||
output: c.content,
|
||||
metadata: { language: c.language, source: c.url, page: c.pageTitle },
|
||||
}));
|
||||
|
||||
const chatOut = chunks.map((c) => ({
|
||||
messages: [
|
||||
{ role: 'system', content: SYSTEM_PROMPT[c.language] || SYSTEM_PROMPT.en },
|
||||
{ role: 'user', content: questionFor(c) },
|
||||
{ role: 'assistant', content: c.content },
|
||||
],
|
||||
metadata: { language: c.language, source: c.url, page: c.pageTitle },
|
||||
}));
|
||||
|
||||
const completionOut = chunks.map((c) => ({
|
||||
prompt: `${SYSTEM_PROMPT[c.language] || SYSTEM_PROMPT.en}\n\nQuestion: ${questionFor(c)}\n\nAnswer:`,
|
||||
completion: ' ' + c.content,
|
||||
}));
|
||||
|
||||
await writeJsonl(path.join(OUT_DIR, 'chunks.jsonl'), chunksOut);
|
||||
await writeJsonl(path.join(OUT_DIR, 'instructions.jsonl'), instructionsOut);
|
||||
await writeJsonl(path.join(OUT_DIR, 'chat.jsonl'), chatOut);
|
||||
await writeJsonl(path.join(OUT_DIR, 'completion.jsonl'), completionOut);
|
||||
|
||||
const stats = {
|
||||
generatedAt: new Date().toISOString(),
|
||||
fileCount: files.length,
|
||||
chunkCount: chunks.length,
|
||||
byLanguage: chunks.reduce((acc, c) => {
|
||||
acc[c.language] = (acc[c.language] || 0) + 1;
|
||||
return acc;
|
||||
}, {}),
|
||||
/**
 * Compute the size and SHA-256 digest of a file.
 *
 * @param {string} filePath - File to read (loaded fully into memory).
 * @returns {{bytes: number, sha256: string}} Byte length and hex digest.
 */
function fileStats(filePath) {
  const contents = fs.readFileSync(filePath);
  const sha256 = crypto.createHash('sha256').update(contents).digest('hex');
  return { bytes: contents.length, sha256 };
}
|
||||
|
||||
console.log(`Wrote 4 JSONL files + stats.json to ${OUT_DIR}`);
|
||||
console.log(JSON.stringify(stats, null, 2));
|
||||
/**
 * Return the first line of a file that is not blank, for use as a preview.
 *
 * Lines longer than 800 characters are cut to 800 and suffixed with '…'.
 * A file with no non-blank line yields the empty string.
 *
 * @param {string} filePath - UTF-8 text file to read.
 * @returns {string} The (possibly truncated) first non-blank line.
 */
function firstNonEmptyLine(filePath) {
  const previewLimit = 800;
  const contents = fs.readFileSync(filePath, 'utf8');

  let preview = '';
  for (const candidate of contents.split('\n')) {
    if (candidate.trim().length > 0) {
      preview = candidate;
      break;
    }
  }

  if (preview.length <= previewLimit) {
    return preview;
  }
  return preview.slice(0, previewLimit) + '…';
}
|
||||
|
||||
// =============================================================================
|
||||
// Main
|
||||
// =============================================================================
|
||||
|
||||
// Languages the corpus is split into: `code` selects the bucket and appears
// in output file names, `label` is the human-readable name shown in the UI.
// Order matters: English comes first (the framework reference is
// English-only and rolls into the en bucket), then the localised docs.
const LANGUAGES = [
  ['en', 'English'],
  ['de', 'Deutsch'],
  ['ja', '日本語'],
  ['es', 'Español'],
  ['fr', 'Français'],
].map(([code, label]) => ({ code, label }));
|
||||
|
||||
/**
 * Group records by language.
 *
 * The result starts with an empty bucket for every entry in LANGUAGES (in
 * that order). Records whose language is missing go to 'en'; records with a
 * language outside LANGUAGES get a new bucket appended on first sight.
 *
 * @param {object[]} records - Records to group.
 * @param {(record: object) => (string|undefined)} getLang - Reads the
 *   language from a record, e.g. `r.language` for raw chunks or
 *   `r.metadata.language` for alpaca/chat records.
 * @returns {Map<string, object[]>} Language code → records, input order kept.
 */
function bucketByLanguage(records, getLang) {
  const buckets = new Map(LANGUAGES.map(({ code }) => [code, []]));
  for (const record of records) {
    const code = getLang(record) || 'en';
    if (!buckets.has(code)) {
      buckets.set(code, []);
    }
    buckets.get(code).push(record);
  }
  return buckets;
}
|
||||
|
||||
/**
 * Build the corpus end to end: ingest both sources, assemble the records,
 * clear previous output, write one JSONL file per format and language plus
 * a combined `*-all.jsonl` per format, then write `manifest.json`.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the instruction and completion lists differ in length,
 *   since completion records are assigned a language by position.
 */
async function main() {
  console.log(`[corpus] DOCS_DIR=${DOCS_DIR}`);
  console.log(`[corpus] REFERENCE=${REFERENCE_FILE}`);
  console.log(`[corpus] OUT_DIR=${OUT_DIR}`);

  const docsChunks = ingestPublicDocs();
  const refChunks = ingestFrameworkReference();
  const chunks = [...refChunks, ...docsChunks]; // reference first → priority

  console.log(`[corpus] ingested ${refChunks.length} reference chunks + ${docsChunks.length} docs chunks`);

  const { chunksOut, instructionsOut, chatOut, completionOut } = buildRecords(chunks);
  console.log(`[corpus] records: chunks=${chunksOut.length} instructions=${instructionsOut.length} chat=${chatOut.length} completion=${completionOut.length}`);

  // `completion` records don't carry metadata (prompt/completion-only), so
  // their language is taken from the instruction record at the same index.
  // Fail loudly if the two lists ever stop lining up, before touching disk.
  if (completionOut.length !== instructionsOut.length) {
    throw new Error(
      `[corpus] completion/instruction record counts differ (${completionOut.length} vs ${instructionsOut.length})`,
    );
  }
  const completionLanguage = new Map(
    completionOut.map((record, i) => [record, instructionsOut[i].metadata?.language]),
  );

  ensureDir(OUT_DIR);
  // Wipe any leftover files from a previous run so stale per-language
  // buckets don't linger.
  for (const f of fs.readdirSync(OUT_DIR)) {
    if (/\.(jsonl|json)$/.test(f)) fs.unlinkSync(path.join(OUT_DIR, f));
  }

  // Per-language buckets. Each format gets one file per language plus a
  // combined `*-all.jsonl` for callers who want everything.
  const buckets = {
    chunks: bucketByLanguage(chunksOut, (r) => r.language),
    instructions: bucketByLanguage(instructionsOut, (r) => r.metadata?.language),
    chat: bucketByLanguage(chatOut, (r) => r.metadata?.language),
    completion: bucketByLanguage(completionOut, (r) => completionLanguage.get(r)),
  };

  // Write the per-language files (skipping empty buckets) and the combined
  // file for one format; returns the manifest entries in write order.
  const writeBucketed = async (formatName, bucketMap, allRecords) => {
    const out = [];
    // Per-language files
    for (const lang of LANGUAGES) {
      const records = bucketMap.get(lang.code) || [];
      if (records.length === 0) continue;
      const filename = `${formatName}-${lang.code}.jsonl`;
      await writeJsonl(path.join(OUT_DIR, filename), records);
      out.push({
        format: formatName,
        language: lang.code,
        languageLabel: lang.label,
        filename,
        records: records.length,
      });
    }
    // Combined all-language file
    const allFilename = `${formatName}-all.jsonl`;
    await writeJsonl(path.join(OUT_DIR, allFilename), allRecords);
    out.push({
      format: formatName,
      language: 'all',
      languageLabel: 'All languages',
      filename: allFilename,
      records: allRecords.length,
    });
    return out;
  };

  // Formats are written one after another so manifest order is stable.
  const allFileMeta = [
    ...await writeBucketed('chunks', buckets.chunks, chunksOut),
    ...await writeBucketed('instructions', buckets.instructions, instructionsOut),
    ...await writeBucketed('chat', buckets.chat, chatOut),
    ...await writeBucketed('completion', buckets.completion, completionOut),
  ];

  // Per-language / per-source breakdown of the chunks (handy for inspection).
  const countBy = (key) => chunks.reduce((acc, c) => {
    acc[c[key]] = (acc[c[key]] || 0) + 1;
    return acc;
  }, {});
  const byLanguage = countBy('language');
  const bySource = countBy('source');

  // Hash + size + preview for every file written.
  const filesEnriched = allFileMeta.map((f) => {
    const fp = path.join(OUT_DIR, f.filename);
    const st = fileStats(fp);
    return {
      ...f,
      bytes: st.bytes,
      sha256: st.sha256,
      samplePreview: firstNonEmptyLine(fp),
    };
  });

  const manifest = {
    generatedAt: new Date().toISOString(),
    generator: {
      script: 'scripts/build-corpus.mjs',
      node: process.version,
    },
    encoding: 'utf-8',
    sources: {
      'framework-reference-v2.md': refChunks.length,
      'src/content/docs/': docsChunks.length,
    },
    chunkCount: chunks.length,
    byLanguage,
    bySource,
    languages: LANGUAGES,
    formats: ['chunks', 'instructions', 'chat', 'completion'],
    files: filesEnriched,
  };
  fs.writeFileSync(path.join(OUT_DIR, 'manifest.json'), JSON.stringify(manifest, null, 2));

  console.log('[corpus] done — wrote', filesEnriched.length, 'files');
  console.log('[corpus] per-format/per-language summary:');
  for (const f of filesEnriched) {
    console.log(` ${f.filename.padEnd(28)} ${String(f.records).padStart(5)} records ${(f.bytes / 1024).toFixed(1).padStart(7)} KB`);
  }
}
|
||||
|
||||
main().catch((e) => {
|
||||
|
||||
Reference in New Issue
Block a user