Files
nibiru-framework.com/docs/scripts/build-corpus.mjs
stephan f4ccc45a3b Strip api.neuronetz.ai from documentation; chat config stays in env
The Ollama URL was leaking via:
  - prose in /en/, /de/, /ja/, /es/, /fr/ docs (oracle, deployment,
    local-testing, ai/module/{overview,embed,training})
  - code blocks teaching users to curl the host directly
  - .env.example, Dockerfile, docker-compose.yml defaults
  - providers.mjs, translate-docs.mjs, build-oracle-index.mjs defaults
  - LandingScripts.astro comment
  - lora-runbook.md prose + SSH host
  - the GET handler at /api/oracle which echoed `ollamaUrl` back to public callers
  - the "Oracle is silent" fallback message at /api/oracle POST

Replacements:
  - prose: "neuronetz.ai" → "your Ollama instance"
  - example URLs in code blocks: https://api.neuronetz.ai → https://your-ollama-host.example
  - code-level defaults: → http://localhost:11434 (Ollama's standard local port)
  - GET /api/oracle: dropped the `ollamaUrl` field; provider + model still exposed
  - runbook SSH host: neuronetz@cloud.neuronetz.ai → <gpu-user>@<gpu-host>

Production chat is unaffected: docs/.env (gitignored) on the production
host still pins OLLAMA_BASE_URL=https://api.neuronetz.ai. The only
change in the running container is that the GET handler no longer
echoes the URL.

analytics.neuronetz.ai (Umami tracking) is intentionally left intact —
it's a public, brand-owned subdomain meant to be visible.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 17:14:17 +02:00

548 lines
20 KiB
JavaScript

#!/usr/bin/env node
/**
* Build the LoRA training corpus.
*
* node scripts/build-corpus.mjs
*
* Sources, in order of priority:
* 1. scripts/extraction/framework-reference-v2.md (deep, file:line cited)
* 2. src/content/docs/{en,de,ja,es,fr}/ (the public docs)
*
* Outputs under public/corpus/ (every format is written once per language as
* <format>-<lang>.jsonl, plus a combined <format>-all.jsonl):
* - chunks-*.jsonl — raw chunks (one record per source chunk, no Q/A)
* - instructions-*.jsonl — instruction/input/output triples (Alpaca-style)
* - chat-*.jsonl — sharegpt/messages format (system+user+assistant)
* - completion-*.jsonl — prompt/completion pairs (legacy fine-tunes)
* - manifest.json — size, sha256, record count, sample preview per file
*
* Augmentation: per chunk, we emit 3-5 question variants (definitional,
* procedural and topic always; file-pointer and code-focused only when the
* chunk cites a file or contains a code fence). Code-block recall samples are
* emitted as additional records for the framework-reference source so the
* model learns exact framework idioms.
*/
import fs from 'node:fs';
import path from 'node:path';
import crypto from 'node:crypto';
import { fileURLToPath } from 'node:url';
import { chunkFile, chunkMarkdown, walkDocs } from './lib/chunk.mjs';
// Directory containing this script; every path below is anchored to it.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Root of the public documentation tree that gets chunked.
const DOCS_DIR = path.join(__dirname, '../src/content/docs');

// Deep framework reference (markdown) ingested ahead of the public docs.
const REFERENCE_FILE = path.join(__dirname, 'extraction/framework-reference-v2.md');

// Optional research-agent augmentation: a JSONL file with one alpaca-style
// record per line. If it exists, its records are merged into the
// instructions/chat/completion outputs next to the templated ones.
const AUGMENTATION_FILE = path.join(__dirname, 'extraction/lora-augmentation.jsonl');

// Output goes directly into public/corpus/ so Astro can serve the files at
// /corpus/<name>.jsonl with no extra copy step. The directory is gitignored.
const OUT_DIR = path.join(__dirname, '../public/corpus');
// =============================================================================
// System prompts
// =============================================================================
// Default system prompt per chunk language, keyed by language code.
// systemFor() looks a chunk's `language` up here and falls back to `en`
// when there is no entry for it.
const SYSTEM_PROMPT = {
en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
de: 'Du bist Experte für das Nibiru-PHP-Framework. Antworte auf Basis der Dokumentation, mit konkreten Code-Beispielen und Dateipfaden, wo es hilft.',
ja: 'あなたは Nibiru PHP フレームワークの専門家です。ドキュメントに基づいて、有用な箇所では具体的なコード例とファイルパスを示して回答してください。',
es: 'Eres un experto en el framework PHP Nibiru. Responde basándote en la documentación, con ejemplos de código concretos y rutas de archivos donde sea útil.',
fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
};
// System prompt for records derived from the framework reference (and for
// augmentation records). It is stricter than the per-language prompts: that
// source carries exact namespaces and file:line citations, and those are the
// idioms the model is meant to internalise.
const SYSTEM_PROMPT_REFERENCE = [
  'You are a senior PHP architect and Nibiru framework expert. ',
  'Answers must include exact namespaces, file paths with line numbers when available, ',
  'and concrete code excerpts. Never say "presumably", "likely", or "appears to" ',
  '— if you do not know, say so plainly.',
].join('');
// =============================================================================
// Question-variant generation (deterministic, no LLM)
// =============================================================================
// Question templates per language code, grouped by question kind.
// Placeholders, as substituted by questionVariants() / codeBlockQA():
//   {topic}    — the chunk's section title, or its page title as fallback
//   {topic_lc} — the same text lower-cased
// Kinds:
//   definitional / procedural / topic — one of each is emitted for every chunk
//   filePointer — added only when the chunk content cites a source file
//   codeFocused — added only when the chunk content contains a code fence;
//                 also used for the code-recall samples
const QUESTION_TEMPLATES = {
en: {
definitional: ['What is {topic}?', 'Explain {topic}.', 'Tell me about {topic}.'],
procedural: ['How do I {topic_lc}?', 'Show me how to {topic_lc}.', 'Walk me through {topic_lc}.'],
topic: ['{topic}', '{topic} — overview', '{topic} in Nibiru'],
filePointer: ['Where is {topic} defined?', 'Which file contains {topic}?'],
codeFocused: ['Show me the code for {topic}.', 'Quote the {topic} implementation.'],
},
de: {
definitional: ['Was ist {topic}?', 'Erkläre {topic}.', 'Was bedeutet {topic}?'],
procedural: ['Wie {topic_lc}?', 'Zeig mir, wie {topic_lc}.', 'Wie geht {topic_lc}?'],
topic: ['{topic}', '{topic} — Übersicht', '{topic} in Nibiru'],
filePointer: ['Wo ist {topic} definiert?', 'Welche Datei enthält {topic}?'],
codeFocused: ['Zeig mir den Code für {topic}.', 'Zitiere die {topic}-Implementierung.'],
},
ja: {
definitional: ['{topic} とは何ですか?', '{topic} について説明してください。'],
procedural: ['{topic} のやり方を教えてください。', '{topic} の手順を教えてください。'],
topic: ['{topic}', '{topic} — 概要'],
filePointer: ['{topic} はどこで定義されていますか?'],
codeFocused: ['{topic} のコードを見せてください。'],
},
es: {
definitional: ['¿Qué es {topic}?', 'Explica {topic}.'],
procedural: ['¿Cómo {topic_lc}?', 'Muéstrame cómo {topic_lc}.'],
topic: ['{topic}', '{topic} — visión general'],
filePointer: ['¿Dónde se define {topic}?'],
codeFocused: ['Muéstrame el código de {topic}.'],
},
fr: {
definitional: ['Qu\'est-ce que {topic} ?', 'Explique {topic}.'],
procedural: ['Comment {topic_lc} ?', 'Montre-moi comment {topic_lc}.'],
topic: ['{topic}', '{topic} — vue d\'ensemble'],
filePointer: ['Où est défini {topic} ?'],
codeFocused: ['Montre-moi le code de {topic}.'],
},
};
/**
 * Pick one element of `arr` deterministically from a string seed.
 *
 * The first byte of the seed's MD5 digest, modulo the array length, selects
 * the index. The same seed therefore always yields the same element, which
 * keeps the corpus identical between builds (needed for reproducible LoRA
 * training runs). MD5 is used purely as a stable hash here, not for security.
 *
 * @param {Array} arr - Candidates to choose from.
 * @param {string} seed - Value that determines the choice.
 * @returns {*} The selected element (`undefined` for an empty array).
 */
function hashPick(arr, seed) {
  const digest = crypto.createHash('md5').update(seed).digest();
  const index = digest.readUInt8(0) % arr.length;
  return arr[index];
}
/**
 * Build the deterministic list of question phrasings for one chunk.
 *
 * Every chunk gets three questions (definitional, procedural, topic). A
 * file-pointer question is added when the content cites a source file, and a
 * code-focused question when it contains a code fence, so the result has
 * between 3 and 5 entries.
 *
 * @param {{id: string, language?: string, sectionTitle?: string,
 *          pageTitle?: string, content: string}} chunk
 * @returns {string[]} Question strings in the order described above.
 */
function questionVariants(chunk) {
  const lang = chunk.language in QUESTION_TEMPLATES ? chunk.language : 'en';
  const tpl = QUESTION_TEMPLATES[lang];
  const topic = chunk.sectionTitle || chunk.pageTitle;
  const topicLc = topic.toLowerCase();
  const seed = chunk.id + '|' + topic;

  // Substitute both placeholders in a single pass with a replacer function.
  // A plain string replacement would interpret `$$`, `$&`, `` $` `` and `$'`
  // inside the topic (easy to hit with PHP-flavoured titles), and a second
  // pass would re-expand a topic that happens to contain `{topic_lc}`.
  const fill = (template) =>
    template.replace(/\{(topic|topic_lc)\}/g, (_match, key) => (key === 'topic' ? topic : topicLc));

  // One of each basic kind, so every chunk gets at least 3 phrasings.
  const variants = [
    fill(hashPick(tpl.definitional, seed + '|d')),
    fill(hashPick(tpl.procedural, seed + '|p')),
    fill(hashPick(tpl.topic, seed + '|t')),
  ];

  // Add file-pointer / code-focused variants only when the chunk actually
  // references a file path or contains a code block.
  if (/[a-z0-9_/.-]+\.(php|mjs|ts|tsx|astro|css|ini|sql)(:\d+)?/i.test(chunk.content)) {
    variants.push(fill(hashPick(tpl.filePointer, seed + '|f')));
  }
  if (/```/.test(chunk.content)) {
    variants.push(fill(hashPick(tpl.codeFocused, seed + '|c')));
  }
  return variants;
}
// =============================================================================
// Code-block extraction
// =============================================================================
/**
 * Extract fenced code blocks from a chunk, each paired with the paragraph
 * that immediately precedes it.
 *
 * Only fences that start at column 0 with three backticks are recognised.
 * Blocks with fewer than two lines are skipped, and a fence that is never
 * closed produces no sample.
 *
 * @param {{content: string}} chunk
 * @returns {Array<{language: string, leadIn: string, code: string}>}
 *   `language` is the fence info string (or 'text' when empty), `leadIn` the
 *   preceding paragraph joined with spaces and cut to 240 characters, `code`
 *   the block body joined with '\n'.
 */
function extractCodeBlockSamples(chunk) {
  const out = [];
  // Split on LF or CRLF. With a bare '\n' split, CRLF input keeps a trailing
  // '\r' on every line, the fence regex below never matches (`.` does not
  // match '\r'), and no code block is ever found.
  const lines = chunk.content.split(/\r?\n/);
  let inFence = false;
  let fenceLang = '';
  let buf = [];
  let leadIn = '';
  let prevPara = [];

  for (const line of lines) {
    const fence = line.match(/^```(.*)$/);
    if (fence) {
      if (!inFence) {
        // Opening fence: remember the info string and freeze the paragraph
        // collected so far as this block's lead-in.
        inFence = true;
        fenceLang = fence[1].trim();
        buf = [];
        leadIn = prevPara.join(' ').trim().slice(0, 240);
        prevPara = [];
      } else {
        // Closing fence: keep the block only if it has at least two lines.
        inFence = false;
        if (buf.length >= 2) {
          out.push({
            language: fenceLang || 'text',
            leadIn,
            code: buf.join('\n'),
          });
        }
      }
      continue;
    }
    if (inFence) {
      buf.push(line);
    } else if (line.trim() === '') {
      // A blank line ends the paragraph; only the paragraph directly above a
      // fence is used as lead-in.
      prevPara = [];
    } else {
      prevPara.push(line);
    }
  }
  return out;
}
/**
 * Turn one extracted code block into a question/answer pair.
 *
 * The question is a code-focused template for `lang` (English when there is
 * no entry for it), chosen deterministically from the chunk id and the first
 * 32 characters of the code. The answer is the block's lead-in, when present,
 * followed by the code wrapped in a fence.
 *
 * @param {{id: string, sectionTitle?: string, pageTitle?: string}} chunk
 * @param {{language?: string, leadIn?: string, code: string}} block
 * @param {string} lang - Language code used to select the templates.
 * @returns {{question: string, answer: string}}
 */
function codeBlockQA(chunk, block, lang) {
  const tpl = QUESTION_TEMPLATES[lang] || QUESTION_TEMPLATES.en;
  const topic = chunk.sectionTitle || chunk.pageTitle;
  const topicLc = topic.toLowerCase();
  const seed = chunk.id + '|code|' + block.code.slice(0, 32);

  // Single-pass substitution with a replacer function. A plain string
  // replacement would interpret `$$`, `$&`, `` $` `` and `$'` inside the
  // topic and corrupt titles that contain them.
  const question = hashPick(tpl.codeFocused, seed).replace(
    /\{(topic|topic_lc)\}/g,
    (_match, key) => (key === 'topic' ? topic : topicLc),
  );

  // Answer = optional lead-in + the code block. Wrap code in fences so the
  // model learns to emit syntactically valid code blocks too.
  const fence = '```' + (block.language || '') + '\n' + block.code + '\n```';
  const answer = block.leadIn ? `${block.leadIn}\n\n${fence}` : fence;
  return { question, answer };
}
// =============================================================================
// Source ingestion
// =============================================================================
/**
 * Chunk every file that walkDocs() reports under DOCS_DIR and tag each
 * resulting chunk with `source: 'docs'`.
 *
 * @returns {object[]} All chunks from all files, flattened in walk order.
 */
function ingestPublicDocs() {
  const chunksPerFile = walkDocs(DOCS_DIR).map((file) => chunkFile(file, DOCS_DIR));
  const tagged = chunksPerFile.flat().map((chunk) => {
    return { ...chunk, source: 'docs' };
  });
  return tagged;
}
/**
 * Chunk the framework reference markdown, if the file exists.
 *
 * When REFERENCE_FILE is missing a warning is logged and an empty list is
 * returned, so the build proceeds with the public docs alone. Every chunk is
 * tagged with `source: 'framework-reference-v2'`.
 *
 * @returns {object[]} Tagged reference chunks, or [] when the file is absent.
 */
function ingestFrameworkReference() {
  const referenceExists = fs.existsSync(REFERENCE_FILE);
  if (referenceExists === false) {
    console.warn(`[corpus] no framework-reference-v2 at ${REFERENCE_FILE} — skipping`);
    return [];
  }

  const markdown = fs.readFileSync(REFERENCE_FILE, 'utf8');
  // Page-level metadata handed to the chunker; the reference is English-only.
  const pageInfo = {
    language: 'en',
    file: 'framework-reference-v2.md',
    pageTitle: 'Nibiru Framework Reference v2',
    pageDescription: 'Deep technical reference — every public factory, namespace, idiom and gotcha with file:line citations.',
    baseUrl: '/reference/',
  };
  const referenceChunks = chunkMarkdown(markdown, pageInfo);
  return referenceChunks.map((chunk) => ({ ...chunk, source: 'framework-reference-v2' }));
}
// =============================================================================
// Record assembly
// =============================================================================
/**
 * Select the system prompt for a chunk.
 *
 * Framework-reference chunks always get the strict reference prompt. All
 * other chunks get the prompt for their language, or the English one when
 * there is no entry for that language.
 *
 * @param {{source?: string, language?: string}} chunk
 * @returns {string}
 */
function systemFor(chunk) {
  const isReference = chunk.source === 'framework-reference-v2';
  return isReference
    ? SYSTEM_PROMPT_REFERENCE
    : SYSTEM_PROMPT[chunk.language] || SYSTEM_PROMPT.en;
}
/**
 * Read the optional research-agent augmentation file.
 *
 * Each non-blank line of AUGMENTATION_FILE is expected to be an alpaca-style
 * JSON object `{instruction, input, output, metadata}`. Lines that are not
 * valid JSON, or that lack a truthy `instruction` or `output`, are skipped
 * with a warning that names the real line number in the file.
 *
 * @returns {object[]} Parsed records in file order; [] when the file is absent.
 */
function loadAugmentation() {
  if (!fs.existsSync(AUGMENTATION_FILE)) {
    console.log(`[corpus] no augmentation file at ${AUGMENTATION_FILE} — skipping`);
    return [];
  }

  const text = fs
    .readFileSync(AUGMENTATION_FILE, 'utf8')
    // A UTF-8 BOM is not JSON whitespace and would make the first record fail.
    .replace(/^\uFEFF/, '');

  const records = [];
  // Keep blank lines in the array so `index + 1` is the true line number,
  // and accept CRLF as well as LF line endings.
  text.split(/\r?\n/).forEach((line, index) => {
    if (line.trim() === '') return;
    const lineNo = index + 1;

    let rec;
    try {
      rec = JSON.parse(line);
    } catch (e) {
      console.warn(`[corpus] skipping malformed augmentation line ${lineNo}: ${e.message}`);
      return;
    }

    if (rec && rec.instruction && rec.output) {
      records.push(rec);
    } else {
      // Previously dropped silently; report it so bad input is noticed.
      console.warn(`[corpus] skipping augmentation line ${lineNo}: missing instruction or output`);
    }
  });

  console.log(`[corpus] loaded ${records.length} augmentation records`);
  return records;
}
/**
 * Assemble the four output record lists from the ingested chunks.
 *
 * For every chunk this emits one raw chunk record, then one
 * instruction/chat/completion triple per question variant. Chunks from the
 * framework reference additionally get one triple per extracted code block
 * ("code-recall"). Finally the optional augmentation records are appended.
 *
 * The three training lists are always appended to together, so index i of
 * `instructionsOut`, `chatOut` and `completionOut` describes the same sample.
 *
 * @param {object[]} chunks - Chunks tagged with `source` (reference first).
 * @returns {{chunksOut: object[], instructionsOut: object[],
 *            chatOut: object[], completionOut: object[]}}
 */
function buildRecords(chunks) {
  const chunksOut = [];
  const instructionsOut = [];
  const chatOut = [];
  const completionOut = [];

  // Append one sample to all three training formats at once. Having a single
  // place that pushes keeps the lists index-aligned by construction.
  const emit = (sys, question, input, answer, meta) => {
    instructionsOut.push({ instruction: question, input, output: answer, metadata: meta });
    chatOut.push({
      messages: [
        { role: 'system', content: sys },
        { role: 'user', content: question },
        { role: 'assistant', content: answer },
      ],
      metadata: meta,
    });
    completionOut.push({
      prompt: `${sys}\n\nQuestion: ${question}\n\nAnswer:`,
      completion: ' ' + answer,
    });
  };

  for (const c of chunks) {
    // 1. Raw chunk record
    chunksOut.push({
      id: c.id,
      source: c.source,
      url: c.url,
      pageTitle: c.pageTitle,
      sectionTitle: c.sectionTitle,
      language: c.language,
      tokens: c.tokens,
      content: c.content,
    });

    // 2. Question-variant records: the whole chunk content is the answer.
    const sys = systemFor(c);
    for (const q of questionVariants(c)) {
      const meta = {
        language: c.language,
        source: c.url,
        page: c.pageTitle,
        origin: c.source,
      };
      emit(sys, q, '', c.content, meta);
    }

    // 3. Code-block recall samples — only for the framework reference,
    //    where the code is gold (file:line cited, framework-canonical).
    if (c.source === 'framework-reference-v2') {
      for (const b of extractCodeBlockSamples(c)) {
        const { question, answer } = codeBlockQA(c, b, c.language);
        const meta = {
          language: c.language,
          source: c.url,
          page: c.pageTitle,
          origin: c.source,
          codeLanguage: b.language,
          kind: 'code-recall',
        };
        emit(sys, question, '', answer, meta);
      }
    }
  }

  // Merge research-agent augmentation. Each input record is alpaca-style;
  // we fan it out into instructions / chat / completion to match the rest.
  const augmentation = loadAugmentation();
  for (const a of augmentation) {
    // augmentation is always English, framework-grade
    const meta = { ...(a.metadata || {}), origin: 'lora-augmentation' };
    emit(SYSTEM_PROMPT_REFERENCE, a.instruction, a.input || '', a.output, meta);
  }
  if (augmentation.length) {
    console.log(`[corpus] merged ${augmentation.length} augmentation records into instructions/chat/completion`);
  }

  return { chunksOut, instructionsOut, chatOut, completionOut };
}
// =============================================================================
// IO + manifest
// =============================================================================
/**
 * Create directory `d`, including any missing parents. Does nothing when the
 * directory already exists.
 *
 * @param {string} d - Directory path.
 */
function ensureDir(d) {
  const options = { recursive: true };
  fs.mkdirSync(d, options);
}
/**
 * Write `items` to `filePath` as JSON Lines (one JSON document per line, each
 * terminated by '\n'), creating the parent directory first.
 *
 * @param {string} filePath - Destination file; overwritten if it exists.
 * @param {Iterable<*>} items - Values to serialise with JSON.stringify.
 * @returns {Promise<void>} Resolves once the stream has closed, so the file
 *   can be read back immediately. Rejects if the file cannot be opened or
 *   written, or if an item cannot be serialised.
 */
function writeJsonl(filePath, items) {
  ensureDir(path.dirname(filePath));
  return new Promise((resolve, reject) => {
    const stream = fs.createWriteStream(filePath, { encoding: 'utf8' });
    // Without an 'error' listener a failed open/write surfaces as an uncaught
    // exception instead of a rejection the caller can handle.
    stream.on('error', reject);
    stream.on('close', () => resolve());
    try {
      // Writes are queued without waiting for 'drain'; the whole list is
      // already in memory, so this only buffers it a second time.
      for (const item of items) stream.write(JSON.stringify(item) + '\n');
      stream.end();
    } catch (err) {
      // Serialisation failed part-way: release the file handle and reject.
      stream.destroy(err);
    }
  });
}
/**
 * Read a file and report its size and content hash.
 *
 * @param {string} filePath
 * @returns {{bytes: number, sha256: string}} Size in bytes and the
 *   hex-encoded SHA-256 digest of the file contents.
 */
function fileStats(filePath) {
  const contents = fs.readFileSync(filePath);
  const hasher = crypto.createHash('sha256');
  hasher.update(contents);
  const sha256 = hasher.digest('hex');
  return { bytes: contents.length, sha256 };
}
/**
 * Return the first line of a UTF-8 file that is not blank, for use as a
 * preview.
 *
 * Lines longer than 800 UTF-16 code units are cut and suffixed with '…'. The
 * cut never lands inside a surrogate pair, so the preview never ends in a
 * lone surrogate.
 *
 * @param {string} filePath
 * @returns {string} The (possibly truncated) line, or '' if every line is blank.
 */
function firstNonEmptyLine(filePath) {
  const text = fs.readFileSync(filePath, 'utf8');
  const line = text.split('\n').find((l) => l.trim().length > 0) || '';
  if (line.length <= 800) return line;

  // If unit 799 is a high surrogate, its partner sits at index 800 and a
  // plain slice(0, 800) would split the pair; back off by one unit.
  const lastUnit = line.charCodeAt(799);
  const cut = lastUnit >= 0xd800 && lastUnit <= 0xdbff ? 799 : 800;
  return line.slice(0, cut) + '…';
}
// =============================================================================
// Main
// =============================================================================
// Supported corpus languages: `code` names the per-language bucket and file
// suffix, `label` is the display name stored in the manifest. English comes
// first on purpose — the framework reference is English-only and lands in the
// `en` bucket — followed by the localised docs.
const LANGUAGES = [
  ['en', 'English'],
  ['de', 'Deutsch'],
  ['ja', '日本語'],
  ['es', 'Español'],
  ['fr', 'Français'],
].map(([code, label]) => ({ code, label }));
/**
 * Group records by language.
 *
 * The returned map starts with one empty bucket per entry of LANGUAGES, in
 * that order. A language that is not listed there gets its own bucket,
 * appended on first sight. Records whose language is falsy (missing or
 * empty) are placed in the 'en' bucket.
 *
 * @param {Iterable<object>} records
 * @param {(record: object) => (string|undefined|null)} getLang - Extracts the
 *   language code, e.g. `r.language` for raw chunks or `r.metadata?.language`
 *   for alpaca/chat records.
 * @returns {Map<string, object[]>}
 */
function bucketByLanguage(records, getLang) {
  const buckets = new Map();
  for (const { code } of LANGUAGES) buckets.set(code, []);

  for (const record of records) {
    const lang = getLang(record) || 'en';
    // Explicit get-or-create instead of a comma expression inside `??`.
    let bucket = buckets.get(lang);
    if (bucket === undefined) {
      bucket = [];
      buckets.set(lang, bucket);
    }
    bucket.push(record);
  }
  return buckets;
}
/**
 * Build the whole corpus: ingest sources, assemble records, write one JSONL
 * file per format and language plus a combined file per format, then write
 * manifest.json describing every file.
 *
 * Existing *.json / *.jsonl files in OUT_DIR are deleted first so stale
 * per-language files from a previous run do not linger.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the completion and instruction lists differ in length,
 *   which would make the per-language split of completion records wrong.
 */
async function main() {
  console.log(`[corpus] DOCS_DIR=${DOCS_DIR}`);
  console.log(`[corpus] REFERENCE=${REFERENCE_FILE}`);
  console.log(`[corpus] OUT_DIR=${OUT_DIR}`);

  const docsChunks = ingestPublicDocs();
  const refChunks = ingestFrameworkReference();
  const chunks = [...refChunks, ...docsChunks]; // reference first → priority
  console.log(`[corpus] ingested ${refChunks.length} reference chunks + ${docsChunks.length} docs chunks`);

  const { chunksOut, instructionsOut, chatOut, completionOut } = buildRecords(chunks);
  console.log(`[corpus] records: chunks=${chunksOut.length} instructions=${instructionsOut.length} chat=${chatOut.length} completion=${completionOut.length}`);

  // `completion` records carry no metadata (prompt/completion only), so their
  // language is taken from the instruction record at the same index. Verify
  // that alignment before touching the output directory: a length mismatch
  // would otherwise misfile records or write literal `undefined` lines.
  if (completionOut.length !== instructionsOut.length) {
    throw new Error(
      `[corpus] completion/instruction record counts differ (${completionOut.length} vs ${instructionsOut.length})`,
    );
  }
  const completionLanguage = new Map(
    completionOut.map((record, i) => [record, instructionsOut[i].metadata?.language]),
  );

  ensureDir(OUT_DIR);
  // Wipe any leftover files from a previous run so stale per-language
  // buckets don't linger.
  for (const f of fs.readdirSync(OUT_DIR)) {
    if (/\.(jsonl|json)$/.test(f)) fs.unlinkSync(path.join(OUT_DIR, f));
  }

  // Per-language buckets. Each format gets one file per language plus a
  // combined `*-all.jsonl` for callers who want everything. All four formats
  // go through the same bucketing helper, so they treat missing or unknown
  // languages identically.
  const buckets = {
    chunks: bucketByLanguage(chunksOut, (r) => r.language),
    instructions: bucketByLanguage(instructionsOut, (r) => r.metadata?.language),
    chat: bucketByLanguage(chatOut, (r) => r.metadata?.language),
    completion: bucketByLanguage(completionOut, (r) => completionLanguage.get(r)),
  };

  // Write the per-language files and the combined file for one format and
  // return a manifest entry for each file written.
  const writeBucketed = async (formatName, bucketMap, allRecords) => {
    const out = [];
    // Per-language files (languages with no records get no file)
    for (const lang of LANGUAGES) {
      const records = bucketMap.get(lang.code) || [];
      if (records.length === 0) continue;
      const filename = `${formatName}-${lang.code}.jsonl`;
      await writeJsonl(path.join(OUT_DIR, filename), records);
      out.push({
        format: formatName,
        language: lang.code,
        languageLabel: lang.label,
        filename,
        records: records.length,
      });
    }
    // Combined all-language file
    const allFilename = `${formatName}-all.jsonl`;
    await writeJsonl(path.join(OUT_DIR, allFilename), allRecords);
    out.push({
      format: formatName,
      language: 'all',
      languageLabel: 'All languages',
      filename: allFilename,
      records: allRecords.length,
    });
    return out;
  };

  const allFileMeta = [
    ...await writeBucketed('chunks', buckets.chunks, chunksOut),
    ...await writeBucketed('instructions', buckets.instructions, instructionsOut),
    ...await writeBucketed('chat', buckets.chat, chatOut),
    ...await writeBucketed('completion', buckets.completion, completionOut),
  ];

  // Per-language and per-source breakdown of the chunks (handy for inspection).
  const byLanguage = chunks.reduce((acc, c) => {
    acc[c.language] = (acc[c.language] || 0) + 1;
    return acc;
  }, {});
  const bySource = chunks.reduce((acc, c) => {
    acc[c.source] = (acc[c.source] || 0) + 1;
    return acc;
  }, {});

  // Hash + size + preview for every file written.
  const filesEnriched = allFileMeta.map((f) => {
    const fp = path.join(OUT_DIR, f.filename);
    const st = fileStats(fp);
    return {
      ...f,
      bytes: st.bytes,
      sha256: st.sha256,
      samplePreview: firstNonEmptyLine(fp),
    };
  });

  const manifest = {
    generatedAt: new Date().toISOString(),
    generator: {
      script: 'scripts/build-corpus.mjs',
      node: process.version,
    },
    encoding: 'utf-8',
    sources: {
      'framework-reference-v2.md': refChunks.length,
      'src/content/docs/': docsChunks.length,
    },
    chunkCount: chunks.length,
    byLanguage,
    bySource,
    languages: LANGUAGES,
    formats: ['chunks', 'instructions', 'chat', 'completion'],
    files: filesEnriched,
  };
  fs.writeFileSync(path.join(OUT_DIR, 'manifest.json'), JSON.stringify(manifest, null, 2));

  console.log('[corpus] done — wrote', filesEnriched.length, 'files');
  console.log('[corpus] per-format/per-language summary:');
  for (const f of filesEnriched) {
    console.log(` ${f.filename.padEnd(28)} ${String(f.records).padStart(5)} records ${(f.bytes / 1024).toFixed(1).padStart(7)} KB`);
  }
}
// Script entry point: run the build; on any failure print the error and exit
// with status 1.
main().catch(function onFatal(err) {
  console.error(err);
  process.exit(1);
});