Initial public push: docs cosmos v4 + AI module + framework groundwork

This is the snapshot the production landing site (nibiru-framework.com) is
deployed from. Brings together the recent splash + docs migration to the v4
"Cosmos" design system, the new in-framework AI module, and the framework
groundwork that backs the framework-reference extraction.

What lands:
- docs/: Astro + Starlight site with the v4 dark cosmic palette, GalaxyHero
  canvas constellation, Mission Control chat (wired to /api/oracle →
  api.neuronetz.ai via providers.mjs Ollama), 5-panel MMVC stage
  (Model · AI · Module · Controller · View), translated EN/DE/JA/ES/FR
  content, PWA + sitemap + llms.txt + Umami analytics.
- docs/design-system/: canonical mockup bundle (source/index-v2.html for
  splash, source/docs-system.html + preview/ for docs, SPEC.md, tokens).
- docs/scripts/extraction/framework-reference-v2.md: deep framework
  reference (~1.6k lines, file:line citations, every public factory and
  idiom) — basis for the LoRA training corpus.
- application/module/ai/: AI module with chat / embed / RAG / agent
  plugins, plus pdoQuery / httpGet / fileRead tools and Modelfile +
  smoke-test in training/.
- application/module/users/: user / ACL / form-factory traits used as the
  reference plugin pattern for the framework docs.
- application/settings/config/database/: schema + seed migrations
  including the AI module tables (200–203).
- Form factory + autogenerator changes the framework-reference-v2 covers.

Production secrets stay out: docs/.env, settings.production.ini and
ai.production.ini are all gitignored (.example files are in tree).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
stephan
2026-05-08 15:22:18 +02:00
parent a60ce90643
commit 48c839d927
662 changed files with 172811 additions and 1 deletions

View File

@@ -0,0 +1,128 @@
#!/usr/bin/env node
/**
* Export the docs as a LoRA-training-ready corpus.
*
* node scripts/build-corpus.mjs
*
* Outputs four JSONL files, plus a stats.json summary, under dist/corpus/:
* - chunks.jsonl — raw chunks (one section per line)
* - instructions.jsonl — instruction/input/output triples
* - chat.jsonl — sharegpt/chat-format messages
* - completion.jsonl — prompt/completion pairs (legacy fine-tunes)
*
* The instruction text for each chunk is derived from the section heading
* with a per-language template ("How do I X?", "Wie X?", "X するには?").
*/
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { chunkFile, walkDocs } from './lib/chunk.mjs';
// Absolute directory of this script (ES modules provide no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Markdown source tree that gets chunked.
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
// Directory that receives the generated corpus files.
const OUT_DIR = path.resolve(__dirname, '../dist/corpus');
// System prompt per language, used for the chat- and completion-format
// records. Languages without an entry fall back to the English prompt.
const SYSTEM_PROMPT = {
en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
de: 'Du bist Experte für das Nibiru-PHP-Framework. Antworte auf Basis der Dokumentation, mit konkreten Code-Beispielen und Dateipfaden, wo es hilft.',
ja: 'あなたは Nibiru PHP フレームワークの専門家です。ドキュメントに基づいて、有用な箇所では具体的なコード例とファイルパスを示して回答してください。',
es: 'Eres un experto en el framework PHP Nibiru. Responde basándote en la documentación, con ejemplos de código concretos y rutas de archivos donde sea útil.',
fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
};
// Question openers per language; questionFor() picks one by heading length.
// NOTE(review): the `ja` row is never read — questionFor() returns its own
// fixed Japanese template before this table is consulted.
const QUESTION_PREFIX = {
en: ['How do I', 'What is', 'Explain', 'Show me'],
de: ['Wie', 'Was ist', 'Erkläre', 'Zeig mir'],
ja: ['', '', 'について教えてください:', ''],
es: ['¿Cómo', '¿Qué es', 'Explica', 'Muéstrame'],
fr: ['Comment', "Qu'est-ce que", 'Explique', 'Montre-moi'],
};
/**
 * Build a synthetic user question for a chunk from its heading.
 *
 * @param {object} chunk - reads `language`, `sectionTitle` and `pageTitle`.
 * @returns {string} Question text in the chunk's language.
 */
function questionFor(chunk) {
// A missing language is treated as English.
const lang = chunk.language || 'en';
// Prefer the section heading; fall back to the page title.
const heading = chunk.sectionTitle || chunk.pageTitle;
// Japanese uses one fixed template instead of the prefix table.
if (lang === 'ja') {
return `${heading} について教えてください。`;
}
// Languages without a prefix row use the English prefixes.
const prefixes = QUESTION_PREFIX[lang] || QUESTION_PREFIX.en;
// Deterministic variety: the heading length selects the prefix.
const prefix = prefixes[heading.length % prefixes.length];
if (lang === 'es' || lang === 'fr') {
// Spanish and French put a space before the question mark.
// NOTE(review): as displayed, this replace() swaps the first space for a
// space, i.e. does nothing — confirm which characters were intended.
return `${prefix} ${heading.toLowerCase()} ?`.replace(' ', ' ');
}
return `${prefix} ${heading.toLowerCase()}?`;
}
/** Create directory `d` together with any missing parent directories. */
function ensureDir(d) {
  const options = { recursive: true };
  fs.mkdirSync(d, options);
}
/**
 * Write `items` to `filePath` as JSON Lines (one JSON document per line),
 * creating the parent directory first.
 *
 * @param {string} filePath - destination file.
 * @param {Iterable<unknown>} items - values serialized with JSON.stringify.
 * @returns {Promise<void>} Resolves once the stream has closed; rejects when
 *   the stream reports an error or an item cannot be serialized.
 */
function writeJsonl(filePath, items) {
  ensureDir(path.dirname(filePath));
  return new Promise((resolve, reject) => {
    const stream = fs.createWriteStream(filePath, { encoding: 'utf8' });
    // Without an 'error' listener a failed open/write is an unhandled
    // 'error' event that the caller's catch() never sees.
    stream.on('error', reject);
    stream.on('close', resolve);
    try {
      for (const item of items) stream.write(`${JSON.stringify(item)}\n`);
      stream.end();
    } catch (err) {
      // Serialization failed part-way: release the file handle, then report.
      stream.destroy();
      reject(err);
    }
  });
}
/**
 * Chunk every doc page and write the four corpus formats plus stats.json
 * into OUT_DIR.
 */
async function main() {
  console.log(`Walking ${DOCS_DIR}`);
  const files = walkDocs(DOCS_DIR);
  const chunks = files.flatMap((file) => chunkFile(file, DOCS_DIR));
  console.log(`Produced ${chunks.length} chunks across ${files.length} files.`);

  const chunksOut = [];
  const instructionsOut = [];
  const chatOut = [];
  const completionOut = [];
  for (const chunk of chunks) {
    const { id, url, pageTitle, sectionTitle, language, tokens, content } = chunk;
    const question = questionFor(chunk);
    // Languages without their own system prompt use the English one.
    const system = SYSTEM_PROMPT[language] || SYSTEM_PROMPT.en;

    chunksOut.push({ id, url, pageTitle, sectionTitle, language, tokens, content });
    instructionsOut.push({
      instruction: question,
      input: '',
      output: content,
      metadata: { language, source: url, page: pageTitle },
    });
    chatOut.push({
      messages: [
        { role: 'system', content: system },
        { role: 'user', content: question },
        { role: 'assistant', content },
      ],
      metadata: { language, source: url, page: pageTitle },
    });
    completionOut.push({
      prompt: `${system}\n\nQuestion: ${question}\n\nAnswer:`,
      completion: ` ${content}`,
    });
  }

  // Files are written one after another, in this order.
  const outputs = [
    ['chunks.jsonl', chunksOut],
    ['instructions.jsonl', instructionsOut],
    ['chat.jsonl', chatOut],
    ['completion.jsonl', completionOut],
  ];
  for (const [fileName, rows] of outputs) {
    await writeJsonl(path.join(OUT_DIR, fileName), rows);
  }

  const byLanguage = {};
  for (const { language } of chunks) {
    byLanguage[language] = (byLanguage[language] || 0) + 1;
  }
  const stats = {
    generatedAt: new Date().toISOString(),
    fileCount: files.length,
    chunkCount: chunks.length,
    byLanguage,
  };
  const statsJson = JSON.stringify(stats, null, 2);
  fs.writeFileSync(path.join(OUT_DIR, 'stats.json'), statsJson);
  console.log(`Wrote 4 JSONL files + stats.json to ${OUT_DIR}`);
  console.log(statsJson);
}

// Any failure is printed and turned into a non-zero exit code.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});

View File

@@ -0,0 +1,117 @@
#!/usr/bin/env node
/**
* Build a vector index over the docs for the in-site Oracle (RAG).
*
* node scripts/build-oracle-index.mjs
*
* Defaults to Ollama at https://api.neuronetz.ai with model nomic-embed-text.
* Override via env:
* OLLAMA_BASE_URL=...
* OLLAMA_EMBED_MODEL=... (e.g. nomic-embed-text, mxbai-embed-large)
* EMBED_PROVIDER=openai (uses OpenAI embeddings via OPENAI_API_KEY)
*
* Output: public/oracle-index.json
*
* Soft-fail behaviour: if the embedding provider is unreachable or the model
* is missing, an empty index is written and the runtime endpoint will operate
* in chat-only (no-RAG) mode.
*/
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { chunkFile, walkDocs } from './lib/chunk.mjs';
import { embed, embedConfig } from './lib/providers.mjs';
// Absolute directory of this script (ES modules provide no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Markdown tree that is chunked and embedded.
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
// Index file written by this script (both the full and the empty variant).
const OUT_FILE = path.resolve(__dirname, '../public/oracle-index.json');
// Number of chunks handed to embed() per call.
const BATCH = 16;
/**
 * Encode an embedding vector as the base64 text of its Float32Array bytes.
 *
 * @param {ArrayLike<number>} f32 - vector components.
 * @returns {string} base64 string (4 bytes per component before encoding).
 */
function embedToBase64(f32) {
  const floats = new Float32Array(f32);
  const bytes = Buffer.from(floats.buffer);
  return bytes.toString('base64');
}
/**
 * Chunk the docs, embed every chunk and write the index to OUT_FILE.
 *
 * Soft-fail: when the OpenAI key is missing or embedding throws, an empty
 * index is written via writeEmpty() and the function returns normally.
 */
async function main() {
  const cfg = embedConfig();
  console.log(`Embedding provider: ${cfg.provider}`);
  if (cfg.provider === 'ollama') {
    console.log(` Ollama: ${cfg.ollamaUrl}`);
    console.log(` Model: ${cfg.ollamaEmbedModel}`);
  } else if (cfg.provider === 'openai') {
    console.log(` OpenAI model: ${cfg.openaiEmbedModel}`);
    if (!cfg.hasOpenAIKey) {
      console.warn(' ⚠ OPENAI_API_KEY missing — writing empty index (chat-only mode).');
      writeEmpty('openai-key-missing');
      return;
    }
  }
  console.log(`Walking ${DOCS_DIR}`);
  const files = walkDocs(DOCS_DIR);
  console.log(`Found ${files.length} markdown files.`);
  const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
  console.log(`Produced ${chunks.length} chunks.`);
  const embeddings = [];
  try {
    for (let i = 0; i < chunks.length; i += BATCH) {
      const batch = chunks.slice(i, i + BATCH);
      // Titles are prepended so they contribute to the embedding.
      const inputs = batch.map((c) => `${c.pageTitle}\n${c.sectionTitle}\n\n${c.content}`);
      const vecs = await embed(inputs);
      // `chunks` and `embeddings` are parallel arrays in the index; a short
      // or long reply would misalign every later entry without any error.
      if (!Array.isArray(vecs) || vecs.length !== batch.length) {
        const got = Array.isArray(vecs) ? vecs.length : typeof vecs;
        throw new Error(`expected ${batch.length} embeddings, got ${got}`);
      }
      embeddings.push(...vecs);
      process.stdout.write(`\r embedded ${embeddings.length}/${chunks.length}`);
    }
    process.stdout.write('\n');
  } catch (err) {
    console.error(`\n⚠ Embedding failed: ${err.message}`);
    console.error(` → writing empty index, Oracle will run in chat-only (no-RAG) mode.`);
    if (cfg.provider === 'ollama') {
      console.error(` → To fix: pull the embedding model on your Ollama server:`);
      console.error(` curl ${cfg.ollamaUrl}/api/pull -d '{"name":"${cfg.ollamaEmbedModel}"}'`);
    }
    writeEmpty(err.message);
    return;
  }
  // Zero chunks means there is no vector to read a dimension from.
  const dim = embeddings[0]?.length ?? 0;
  const out = {
    provider: cfg.provider,
    model: cfg.provider === 'ollama' ? cfg.ollamaEmbedModel : cfg.openaiEmbedModel,
    dim,
    builtAt: new Date().toISOString(),
    chunks: chunks.map((c) => ({
      id: c.id,
      url: c.url,
      pageTitle: c.pageTitle,
      sectionTitle: c.sectionTitle,
      language: c.language,
      content: c.content,
    })),
    embeddings: embeddings.map(embedToBase64),
  };
  fs.mkdirSync(path.dirname(OUT_FILE), { recursive: true });
  // Serialize once: the index can be large, and the size report should
  // describe exactly what was written, in bytes rather than UTF-16 units.
  const json = JSON.stringify(out);
  fs.writeFileSync(OUT_FILE, json);
  const kb = (Buffer.byteLength(json, 'utf8') / 1024).toFixed(1);
  console.log(`✔ Wrote ${OUT_FILE} (${kb} KB, dim=${dim})`);
}
/**
 * Write a placeholder index with no chunks and no embeddings to OUT_FILE.
 *
 * @param {string} reason - stored in the file's `reason` field.
 */
function writeEmpty(reason) {
  const placeholder = {
    provider: null,
    model: null,
    dim: 0,
    builtAt: null,
    reason,
    chunks: [],
    embeddings: [],
  };
  const outDir = path.dirname(OUT_FILE);
  fs.mkdirSync(outDir, { recursive: true });
  fs.writeFileSync(OUT_FILE, JSON.stringify(placeholder, null, 2));
}
// Last-resort handler: log the failure, leave an empty index behind and
// still exit 0 (soft-fail so Docker build doesn't break the site).
main().catch((err) => {
  console.error(err);
  writeEmpty(err.message);
  process.exit(0);
});

3
docs/scripts/extraction/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
*.md
*.json
*.jsonl

163
docs/scripts/lib/chunk.mjs Normal file
View File

@@ -0,0 +1,163 @@
// Markdown → chunks at H2/H3 boundaries.
// Used by both build-oracle-index.mjs (RAG) and build-corpus.mjs (LoRA training).
import fs from 'node:fs';
import path from 'node:path';
// Estimated size a split part is filled up to before the next part starts
// (used by splitOversized).
const TARGET_TOKENS = 600;
// Sections estimated below this are appended to the preceding section
// (used by mergeSmall).
const MIN_TOKENS = 120;
// Sections estimated above this are split at paragraph boundaries
// (used by splitOversized).
const MAX_TOKENS = 900;
// Rough token count without a tokenizer: characters in the CJK ranges below
// are weighted at 1.5 characters per token, everything else at 4 characters
// per token. The sum is rounded up.
export function estimateTokens(text) {
  const nonCjkLength = text.replace(/[぀-ヿ㐀-䶿一-鿿豈-﫿]/g, '').length;
  const cjkLength = text.length - nonCjkLength;
  return Math.ceil(cjkLength / 1.5 + nonCjkLength / 4);
}
/**
 * Separate a leading `---` frontmatter block from the Markdown body.
 *
 * Only flat `key: value` lines are parsed; one surrounding quote character
 * is removed from each end of a value. Both LF and CRLF line endings are
 * handled.
 *
 * @param {string} md - raw file content.
 * @returns {{frontmatter: Record<string, string>, body: string}} An empty
 *   `frontmatter` and the untouched input as `body` when the file has no
 *   (closed) frontmatter block.
 */
function stripFrontmatter(md) {
  if (!md.startsWith('---')) return { frontmatter: {}, body: md };
  const end = md.indexOf('\n---', 3);
  if (end === -1) return { frontmatter: {}, body: md };
  const frontmatter = {};
  // Split on \r?\n: with CRLF files a trailing \r would otherwise stay on
  // each line, and `.` never matches \r, so no key would be recognised.
  for (const line of md.slice(3, end).trim().split(/\r?\n/)) {
    const m = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
    if (m) frontmatter[m[1]] = m[2].replace(/^["']|["']$/g, '');
  }
  // Drop the single line break that follows the closing fence (LF or CRLF).
  const body = md.slice(end + 4).replace(/^\r?\n/, '');
  return { frontmatter, body };
}
/**
 * Turn heading text into an anchor slug: lower-case, accents removed, only
 * `a-z`, `0-9` and `-` kept, whitespace runs replaced by a single `-`.
 * Text without any of those characters produces an empty string.
 */
function slugify(s) {
  const lowered = String(s).toLowerCase();
  const unaccented = lowered.normalize('NFKD').replace(/[̀-ͯ]/g, '');
  const cleaned = unaccented.replace(/[^a-z0-9\s-]/g, '').trim();
  return cleaned.replace(/\s+/g, '-');
}
// Split body at H2/H3 boundaries; keep code fences intact.
//
// Returns one section per heading, each as { heading, level, anchor, lines },
// where `lines` starts with the heading line itself. Content before the
// first heading becomes a section with heading/anchor null and level 0.
function splitByHeadings(body) {
  const sections = [];
  // Marker (``` or ~~~) of the fence we are inside, or null outside a fence.
  // Only the same marker closes a fence, so a `~~~` line inside a ``` block
  // (or the reverse) no longer ends the block and exposes `##` lines in code.
  let fenceMarker = null;
  let current = { heading: null, level: 0, anchor: null, lines: [] };
  const flush = () => {
    if (current.lines.length || current.heading) sections.push(current);
  };
  for (const line of body.split('\n')) {
    const fence = line.match(/^(```|~~~)/);
    if (fence) {
      if (fenceMarker === null) fenceMarker = fence[1];
      else if (fence[1] === fenceMarker) fenceMarker = null;
    }
    if (fenceMarker === null) {
      const h = line.match(/^(#{2,3})\s+(.+?)\s*$/);
      if (h) {
        flush();
        const heading = h[2].trim();
        current = {
          heading,
          level: h[1].length,
          anchor: slugify(heading),
          lines: [line],
        };
        continue;
      }
    }
    current.lines.push(line);
  }
  flush();
  return sections;
}
// Track fence state across a piece of text: returns the open fence marker
// (``` or ~~~) after the piece, or null when no fence is open.
function advanceFence(marker, piece) {
  let state = marker;
  for (const line of piece.split('\n')) {
    const m = line.match(/^(```|~~~)/);
    if (!m) continue;
    if (state === null) state = m[1];
    else if (m[1] === state) state = null;
  }
  return state;
}

// Split text at blank lines, but keep a fenced code block — including the
// blank lines inside it, verbatim — together as one paragraph.
function splitParagraphsKeepingFences(text) {
  // The capture group keeps separators: content, sep, content, sep, …
  const pieces = text.split(/(\n\n+)/);
  const paras = [];
  let current = pieces[0];
  let marker = advanceFence(null, current);
  for (let i = 1; i < pieces.length; i += 2) {
    const next = pieces[i + 1];
    if (marker === null) {
      paras.push(current);
      current = next;
    } else {
      current += pieces[i] + next;
    }
    marker = advanceFence(marker, next);
  }
  paras.push(current);
  return paras;
}

// Further split a too-large section by paragraph boundaries, preserving fences.
//
// Sections at or below MAX_TOKENS are returned unchanged as [section].
// Otherwise paragraphs are packed into parts of about TARGET_TOKENS; every
// part copies the section's heading/level/anchor. A single paragraph (or
// fenced block) larger than the target still becomes one part of its own.
function splitOversized(section) {
  const text = section.lines.join('\n');
  if (estimateTokens(text) <= MAX_TOKENS) return [section];
  const parts = [];
  let buf = [];
  let bufTokens = 0;
  const flush = () => {
    if (buf.length) parts.push({ ...section, lines: buf.join('\n\n').split('\n') });
    buf = [];
    bufTokens = 0;
  };
  for (const p of splitParagraphsKeepingFences(text)) {
    const t = estimateTokens(p);
    if (bufTokens + t > TARGET_TOKENS) flush();
    buf.push(p);
    bufTokens += t;
  }
  flush();
  return parts;
}
// Fold sections estimated below MIN_TOKENS into the section before them
// (separated by one empty line). The first section is always kept, whatever
// its size. Input sections are copied, never modified.
function mergeSmall(sections) {
  return sections.reduce((merged, section) => {
    const size = estimateTokens(section.lines.join('\n'));
    if (merged.length > 0 && size < MIN_TOKENS) {
      const target = merged[merged.length - 1];
      target.lines = target.lines.concat('', section.lines);
    } else {
      merged.push({ ...section, lines: section.lines.slice() });
    }
    return merged;
  }, []);
}
/**
 * Read one Markdown file and turn it into chunk records.
 *
 * @param {string} filePath - absolute path of the .md/.mdx file.
 * @param {string} rootDir - docs root; the first path segment below it is
 *   taken as the language code.
 * @returns {Array<object>} One record per non-empty section with `id`,
 *   `language`, `file`, `url`, `pageTitle`, `pageDescription`,
 *   `sectionTitle`, `headingLevel`, `tokens` and `content`. Ids are unique
 *   within the file.
 */
export function chunkFile(filePath, rootDir) {
  const raw = fs.readFileSync(filePath, 'utf8');
  const { frontmatter, body } = stripFrontmatter(raw);
  // URL: docs/<lang>/<rest>.md(x) → /<lang>/<rest>/
  const rel = path.relative(rootDir, filePath).replace(/\\/g, '/');
  const parts = rel.split('/');
  const lang = parts[0];
  const slug = parts.slice(1).join('/').replace(/\.(md|mdx)$/, '').replace(/\/index$/, '');
  const baseUrl = '/' + (slug ? `${lang}/${slug}/` : `${lang}/`);
  let sections = splitByHeadings(body);
  sections = sections.flatMap(splitOversized);
  sections = mergeSmall(sections);
  const pageTitle = frontmatter.title || slug || 'Untitled';
  const pageDescription = frontmatter.description || '';
  const seenIds = new Set();
  return sections
    .filter((s) => s.lines.join('\n').trim().length > 0)
    .map((s, idx) => {
      const content = s.lines.join('\n').trim();
      const sectionTitle = s.heading || pageTitle;
      const url = s.anchor && s.heading ? `${baseUrl}#${s.anchor}` : baseUrl;
      // `||`, not `??`: slugify() yields '' for headings without a-z/0-9
      // (e.g. Japanese), and every such section would otherwise share the
      // id `<file>#`.
      let id = `${rel}#${s.anchor || `_${idx}`}`;
      // Repeated headings and the parts of a split section share an anchor;
      // later occurrences get the section index appended to stay unique.
      if (seenIds.has(id)) id = `${id}_${idx}`;
      seenIds.add(id);
      return {
        id,
        language: lang,
        file: rel,
        url,
        pageTitle,
        pageDescription,
        sectionTitle,
        headingLevel: s.level || 1,
        tokens: estimateTokens(content),
        content,
      };
    });
}
/**
 * Collect every `.md` / `.mdx` file below `docsDir`, recursively.
 *
 * @param {string} docsDir - directory to scan.
 * @returns {string[]} File paths in sorted order.
 */
export function walkDocs(docsDir) {
  const found = [];
  const visit = (dir) => {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        visit(full);
      } else if (/\.(md|mdx)$/.test(entry.name)) {
        found.push(full);
      }
    }
  };
  visit(docsDir);
  return found.sort();
}

View File

@@ -0,0 +1,138 @@
// Unified provider abstraction for chat and embeddings.
// Used by build-oracle-index.mjs (build time) and src/pages/api/oracle.ts (runtime).
// Fallbacks used when the matching environment variable is unset.
const DEFAULT_OLLAMA_URL = 'https://api.neuronetz.ai'; // OLLAMA_BASE_URL
const DEFAULT_OLLAMA_CHAT = 'qwen2.5-coder:14b'; // OLLAMA_CHAT_MODEL
const DEFAULT_OLLAMA_EMBED = 'nomic-embed-text'; // OLLAMA_EMBED_MODEL
const DEFAULT_ANTHROPIC = 'claude-haiku-4-5-20251001'; // ANTHROPIC_MODEL
const DEFAULT_OPENAI_EMBED = 'text-embedding-3-small'; // OPENAI_EMBED_MODEL
/**
 * Read the chat-provider settings from the environment.
 *
 * @returns {{provider: string, ollamaUrl: string, ollamaChatModel: string,
 *   anthropicModel: string, hasAnthropicKey: boolean}} Unset variables are
 *   replaced by the module defaults; `provider` defaults to 'ollama'.
 */
export function llmConfig() {
  const {
    LLM_PROVIDER,
    OLLAMA_BASE_URL,
    OLLAMA_CHAT_MODEL,
    ANTHROPIC_MODEL,
    ANTHROPIC_API_KEY,
  } = process.env;
  return {
    provider: LLM_PROVIDER ?? 'ollama',
    ollamaUrl: OLLAMA_BASE_URL ?? DEFAULT_OLLAMA_URL,
    ollamaChatModel: OLLAMA_CHAT_MODEL ?? DEFAULT_OLLAMA_CHAT,
    anthropicModel: ANTHROPIC_MODEL ?? DEFAULT_ANTHROPIC,
    hasAnthropicKey: Boolean(ANTHROPIC_API_KEY),
  };
}
/**
 * Read the embedding-provider settings from the environment.
 *
 * @returns {{provider: string, ollamaUrl: string, ollamaEmbedModel: string,
 *   openaiEmbedModel: string, hasOpenAIKey: boolean}} Unset variables are
 *   replaced by the module defaults; `provider` defaults to 'ollama'.
 */
export function embedConfig() {
  const {
    EMBED_PROVIDER,
    OLLAMA_BASE_URL,
    OLLAMA_EMBED_MODEL,
    OPENAI_EMBED_MODEL,
    OPENAI_API_KEY,
  } = process.env;
  return {
    provider: EMBED_PROVIDER ?? 'ollama',
    ollamaUrl: OLLAMA_BASE_URL ?? DEFAULT_OLLAMA_URL,
    ollamaEmbedModel: OLLAMA_EMBED_MODEL ?? DEFAULT_OLLAMA_EMBED,
    openaiEmbedModel: OPENAI_EMBED_MODEL ?? DEFAULT_OPENAI_EMBED,
    hasOpenAIKey: Boolean(OPENAI_API_KEY),
  };
}
// ---------------------------------------------------------------------------
// Embeddings
// ---------------------------------------------------------------------------
/**
 * Embed each input through Ollama's /api/embeddings, one request per text,
 * in order.
 *
 * @param {string} baseUrl - server URL; one trailing slash is ignored.
 * @param {string} model - embedding model name.
 * @param {string[]} inputs - texts to embed.
 * @returns {Promise<number[][]>} One vector per input, same order.
 * @throws {Error} On a non-OK response or a reply without an `embedding` array.
 */
async function ollamaEmbedBatch(baseUrl, model, inputs) {
  const vectors = [];
  // The endpoint takes a single prompt, so the batch is a sequential loop.
  for (const prompt of inputs) {
    const endpoint = `${baseUrl.replace(/\/$/, '')}/api/embeddings`;
    const response = await fetch(endpoint, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ model, prompt }),
    });
    if (!response.ok) {
      const detail = await response.text();
      throw new Error(`Ollama embeddings ${response.status}: ${detail}`);
    }
    const payload = await response.json();
    if (!Array.isArray(payload.embedding)) {
      throw new Error(`Ollama embeddings: unexpected response: ${JSON.stringify(payload).slice(0, 200)}`);
    }
    vectors.push(payload.embedding);
  }
  return vectors;
}
/**
 * Embed all inputs with one OpenAI embeddings request.
 * The `openai` package is imported only when this function runs.
 *
 * @param {string} model - embedding model name.
 * @param {string[]} inputs - texts to embed.
 * @returns {Promise<number[][]>} The `embedding` of each returned data item.
 */
async function openaiEmbedBatch(model, inputs) {
  const OpenAI = (await import('openai')).default;
  const client = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
  const { data } = await client.embeddings.create({ model, input: inputs });
  return data.map((row) => row.embedding);
}
/**
 * Embed one text or a list of texts with the configured provider.
 *
 * @param {string|string[]} inputs - a single text is wrapped in an array.
 * @param {{provider?: string}} [opts] - `provider` overrides EMBED_PROVIDER.
 * @returns {Promise<number[][]>} One vector per input.
 * @throws {Error} For an unknown provider, or 'openai' without an API key.
 */
export async function embed(inputs, opts = {}) {
  const cfg = embedConfig();
  const provider = opts.provider ?? cfg.provider;
  const list = Array.isArray(inputs) ? inputs : [inputs];
  switch (provider) {
    case 'ollama':
      return ollamaEmbedBatch(cfg.ollamaUrl, cfg.ollamaEmbedModel, list);
    case 'openai':
      if (!cfg.hasOpenAIKey) throw new Error('OPENAI_API_KEY not set.');
      return openaiEmbedBatch(cfg.openaiEmbedModel, list);
    default:
      throw new Error(`Unknown EMBED_PROVIDER: ${provider}`);
  }
}
// ---------------------------------------------------------------------------
// Chat
// ---------------------------------------------------------------------------
/**
 * Send a conversation to the configured chat provider.
 *
 * @param {object} args
 * @param {string} [args.system] - system prompt.
 * @param {Array<{role: string, content: string}>} args.messages - only
 *   'user' and 'assistant' entries are forwarded.
 * @param {number} [args.maxTokens=800] - generation limit.
 * @returns {Promise<{text: string, model: string, provider: string}>}
 * @throws {Error} On a non-OK Ollama response, a missing Anthropic key, or
 *   an unknown LLM_PROVIDER.
 */
export async function chat({ system, messages, maxTokens = 800 }) {
  const cfg = llmConfig();
  const isDialogueTurn = (m) => m.role === 'user' || m.role === 'assistant';
  const toTurn = (m) => ({ role: m.role, content: m.content });

  switch (cfg.provider) {
    case 'ollama': {
      const endpoint = `${cfg.ollamaUrl.replace(/\/$/, '')}/api/chat`;
      // Ollama takes the system prompt as the first message.
      const payloadMessages = system ? [{ role: 'system', content: system }] : [];
      for (const m of messages) {
        if (isDialogueTurn(m)) payloadMessages.push(toTurn(m));
      }
      const response = await fetch(endpoint, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
          model: cfg.ollamaChatModel,
          messages: payloadMessages,
          stream: false,
          options: { num_predict: maxTokens, temperature: 0.4 },
        }),
      });
      if (!response.ok) {
        const detail = await response.text();
        throw new Error(`Ollama chat ${response.status}: ${detail}`);
      }
      const payload = await response.json();
      return {
        text: payload.message?.content ?? '',
        model: cfg.ollamaChatModel,
        provider: 'ollama',
      };
    }
    case 'anthropic': {
      if (!cfg.hasAnthropicKey) throw new Error('ANTHROPIC_API_KEY not set.');
      // Imported only when the Anthropic provider is actually selected.
      const { default: Anthropic } = await import('@anthropic-ai/sdk');
      const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
      const turns = messages.filter(isDialogueTurn).map(toTurn);
      const completion = await client.messages.create({
        model: cfg.anthropicModel,
        max_tokens: maxTokens,
        system,
        // An empty user turn is sent when no dialogue turn is left.
        messages: turns.length ? turns : [{ role: 'user', content: '' }],
      });
      const textParts = [];
      for (const part of completion.content) {
        if (part.type === 'text') textParts.push(part.text);
      }
      return { text: textParts.join('\n'), model: cfg.anthropicModel, provider: 'anthropic' };
    }
    default:
      throw new Error(`Unknown LLM_PROVIDER: ${cfg.provider}`);
  }
}

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env node
/**
* Mirror /design-system/ → /public/design-system/ AND /src/styles/design-system/
*
* The canonical tokens live at the project root in design-system/. They need
* to land in two places before the build runs:
*
* • public/design-system/ — served as static assets so partner sites can
* `<link rel="stylesheet" href="https://nibiru-framework.com/design-system/tokens.css">`.
*
* • src/styles/design-system/ — Starlight's `customCss` only accepts paths
* under src/, so the site itself imports the tokens from here.
*
* Both are copies of the same source. Rather than hand-maintain three trees,
* this script syncs them on every install / build.
*/
import { mkdir, readdir, copyFile, stat, rm } from 'node:fs/promises';
import { dirname, join, resolve } from 'node:path';
import { fileURLToPath } from 'node:url';
// Absolute directory of this script (ES modules provide no built-in __dirname).
const __dirname = dirname(fileURLToPath(import.meta.url));
// Canonical design-system directory, one level above this script's directory.
const SRC = resolve(__dirname, '..', 'design-system');
// Mirror targets: the statically served copy and the copy importable from src/.
const DSTS = [
resolve(__dirname, '..', 'public', 'design-system'),
resolve(__dirname, '..', 'src', 'styles', 'design-system'),
];
// Files we publish. Anything else (zips, internal notes) stays out of public/.
// copyTree() and pruneStale() both match entry names against this set.
const PUBLISHED = new Set([
'README.md',
'tokens.css',
'tokens.scss',
'tokens.json',
'tailwind.preset.js',
'docs-page-mockup.html',
]);
/** Create `dir` together with any missing parent directories. */
async function ensure(dir) {
  const options = { recursive: true };
  await mkdir(dir, options);
}
/**
 * Copy the entries of `src` whose names are in PUBLISHED into `dst`,
 * creating `dst` first. Directories are copied recursively, and the same
 * name filter applies at every level.
 */
async function copyTree(src, dst) {
  await ensure(dst);
  const listing = await readdir(src, { withFileTypes: true });
  const published = listing.filter(({ name }) => PUBLISHED.has(name));
  for (const entry of published) {
    const from = join(src, entry.name);
    const to = join(dst, entry.name);
    const copy = entry.isDirectory() ? copyTree : copyFile;
    await copy(from, to);
  }
}
/**
 * Delete every top-level entry of `dst` whose name is not in PUBLISHED.
 * A `dst` that does not exist is ignored; other errors propagate.
 */
async function pruneStale(dst) {
  try {
    const listing = await readdir(dst, { withFileTypes: true });
    const stale = listing.filter(({ name }) => !PUBLISHED.has(name));
    for (const { name } of stale) {
      await rm(join(dst, name), { recursive: true, force: true });
    }
  } catch (err) {
    if (err.code === 'ENOENT') return;
    throw err;
  }
}
// Stop with a non-zero exit code when the source directory cannot be stat'ed.
try {
  await stat(SRC);
} catch {
  console.error(`[sync-design-system] no source directory at ${SRC}`);
  process.exit(1);
}
// Refresh every mirror: remove unpublished leftovers, then copy the
// published files over.
for (const dst of DSTS) {
  await pruneStale(dst);
  await copyTree(SRC, dst);
  // Keep source and destination apart in the log; printed back to back the
  // two absolute paths read as a single path.
  console.log(`[sync-design-system] mirrored ${SRC} → ${dst}`);
}

View File

@@ -0,0 +1,404 @@
#!/usr/bin/env node
/**
* Translate every English doc (src/content/docs/en/**) into one or more
* target locales using your own Ollama on neuronetz.ai.
*
* node scripts/translate-docs.mjs # all locales (de, ja, es, fr)
* node scripts/translate-docs.mjs --lang=de # only German
* node scripts/translate-docs.mjs --lang=de,ja # multi-select
* node scripts/translate-docs.mjs --force # overwrite existing
* node scripts/translate-docs.mjs --only=start/ # path prefix filter
*
* Env:
* OLLAMA_BASE_URL (default https://api.neuronetz.ai)
* OLLAMA_TRANSLATE_MODEL (default qwen2.5-coder:14b → falls back to
* mistral-small:latest when the server reports the model as not found)
*
* What it preserves verbatim:
* - YAML frontmatter STRUCTURE (only `title` + `description` get translated;
* keys, slugs, hero.actions[].link stay untouched)
* - Code fences (```…```) and inline code (`…`)
* - HTML/JSX tags including <CardGrid>, <Card title="…">
* - Markdown links + images
* - Internal anchor URLs (/en/… stays /en/… → not auto-rewritten;
* a separate pass rewrites the locale prefix in produced output.)
*/
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
// Absolute directory of this script (ES modules provide no built-in __dirname).
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Root of the localized docs tree.
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
// Locale the translations are made from.
const SOURCE_LANG = 'en';
// Locales translated when no --lang option is given.
const ALL_TARGETS = ['de', 'ja', 'es', 'fr'];
// Ollama server URL with one trailing slash removed.
const OLLAMA_URL = (process.env.OLLAMA_BASE_URL ?? 'https://api.neuronetz.ai').replace(/\/$/, '');
// Default to a fast model that fits inside the nginx 60-90s timeout.
// qwen2.5-coder:14b is verified live; mistral-small handles European
// languages well; qwen2 is solid for Japanese.
// qwen2.5-coder:14b is conservative (low hallucination), proven on real
// pages. Slower than qwen2:7.6b but qwen2 hallucinated frontmatter and
// mixed scripts (Chinese chars in German output) — never again.
// NOTE(review): the two remarks about qwen2 above contradict each other —
// confirm which one still holds.
const PRIMARY_MODEL = process.env.OLLAMA_TRANSLATE_MODEL ?? 'qwen2.5-coder:14b';
// Models tried after PRIMARY_MODEL, in order (see tryModels).
const FALLBACK_MODELS = ['mistral-small:latest'];
// Command-line options; the defaults below apply when a flag is absent.
const args = process.argv.slice(2);
const cli = {
langs: ALL_TARGETS,
force: false,
only: null,
};
// Unrecognised arguments are ignored. --lang values are not checked against
// ALL_TARGETS here.
for (const a of args) {
if (a.startsWith('--lang=')) cli.langs = a.slice(7).split(',').map((s) => s.trim()).filter(Boolean);
else if (a === '--force') cli.force = true;
else if (a.startsWith('--only=')) cli.only = a.slice(7);
}
// English language names interpolated into the translation system prompt.
const LANG_NAME = {
de: 'German', ja: 'Japanese', es: 'Spanish', fr: 'French',
};
// ---------------------------------------------------------------------------
// Markdown chunking — split into the smallest sensible units:
// - Each fenced code block is its own segment (kept verbatim).
// - Prose is split on blank-line paragraph boundaries.
// - Headings travel with their paragraph.
// Goal: every translatable chunk is ≤ ~300 tokens so the LLM completes
// in well under the nginx 60-90s window.
// ---------------------------------------------------------------------------
/**
 * Split a Markdown body into ordered segments for translation.
 *
 * @param {string} body - Markdown without frontmatter.
 * @returns {Array<{kind: 'prose'|'sep'|'code', text: string}>} `code` is a
 *   whole fenced block, `sep` is whitespace only, `prose` is everything else.
 */
function splitForTranslate(body) {
const segments = [];
const lines = body.split('\n');
// Lines collected for the segment currently being built (prose or code).
let buf = [];
let inFence = false;
// First three characters of the fence that opened the current code block.
let fenceMarker = '';
// Emit the buffered prose lines as alternating prose / separator segments.
const flushProse = () => {
if (!buf.length) return;
// Split prose buffer at blank-line boundaries.
const text = buf.join('\n');
const paragraphs = text.split(/(\n\s*\n)/); // keep separators
let acc = '';
for (const part of paragraphs) {
if (/^\n\s*\n$/.test(part)) {
// Separator reached: emit what was accumulated (as prose if it has any
// non-whitespace, otherwise as a separator), then the separator itself.
if (acc.trim()) segments.push({ kind: 'prose', text: acc });
else if (acc.length) segments.push({ kind: 'sep', text: acc });
segments.push({ kind: 'sep', text: part });
acc = '';
} else {
acc += part;
}
}
// Text after the last separator.
if (acc.length) {
segments.push({ kind: acc.trim() ? 'prose' : 'sep', text: acc });
}
buf = [];
};
// Emit the buffered fence lines as one code segment.
const flushCode = () => {
if (!buf.length) return;
segments.push({ kind: 'code', text: buf.join('\n') });
buf = [];
};
for (const line of lines) {
const fenceOpen = line.match(/^([`~]{3,})/);
if (fenceOpen) {
if (!inFence) {
// Opening fence: close the prose collected so far, start a code buffer.
flushProse();
inFence = true;
fenceMarker = fenceOpen[1].slice(0, 3); // normalise length match
buf.push(line);
continue;
}
// Inside a block, only a line starting with the opening marker closes it.
if (line.startsWith(fenceMarker)) {
buf.push(line);
flushCode();
inFence = false;
fenceMarker = '';
continue;
}
}
buf.push(line);
}
// An unterminated fence is still emitted as code.
if (inFence) flushCode();
else flushProse();
return segments;
}
// ---------------------------------------------------------------------------
// Frontmatter parse + selective rewrite. Keep YAML keys verbatim; only
// translate the values of `title` and `description`.
// ---------------------------------------------------------------------------
/**
 * Cut a leading frontmatter block off a raw document.
 *
 * @param {string} raw - file content.
 * @returns {{fm: string, body: string}} `fm` includes both `---` fences and
 *   is '' when the document has no closed frontmatter block; `body` has the
 *   line break after the closing fence removed.
 */
function splitFrontmatter(raw) {
  const opensWithFence = raw.startsWith('---\n') || raw.startsWith('---\r\n');
  const closeIdx = opensWithFence ? raw.indexOf('\n---', 3) : -1;
  if (closeIdx === -1) return { fm: '', body: raw };
  // '\n---' is four characters long, so the block ends at closeIdx + 4.
  const cut = closeIdx + 4;
  return {
    fm: raw.slice(0, cut),
    body: raw.slice(cut).replace(/^\r?\n/, ''),
  };
}
/**
 * Replace the values of `title:` and `description:` lines in a frontmatter
 * block; every other line is returned untouched.
 *
 * @param {string} fm - frontmatter text (returned as-is when empty).
 * @param {(value: string) => string} translateString - synchronous mapper
 *   from the unquoted source value to its replacement.
 * @returns {string} Frontmatter with the replaced values double-quoted.
 */
function rewriteFrontmatter(fm, translateString) {
  if (!fm) return fm;
  const fieldPattern = /^(\s*)(title|description):\s*(.*)$/;
  const rewritten = [];
  for (const line of fm.split('\n')) {
    const found = fieldPattern.exec(line);
    const value = found ? found[3].replace(/^["']|["']$/g, '').trim() : '';
    if (!value) {
      // Not a title/description line, or one with an empty value.
      rewritten.push(line);
      continue;
    }
    // Always quote (handles colons, quotes safely)
    const quoted = translateString(value).replace(/\\/g, '\\\\').replace(/"/g, '\\"');
    rewritten.push(`${found[1]}${found[2]}: "${quoted}"`);
  }
  return rewritten.join('\n');
}
// ---------------------------------------------------------------------------
// Ollama call with model fallback.
// ---------------------------------------------------------------------------
/**
 * POST a chat request to Ollama, moving on to the next configured model
 * when the current one is answered with HTTP 404.
 *
 * @param {Array<{role: string, content: string}>} messages - chat messages.
 * @param {number} [attempt=0] - index into [PRIMARY_MODEL, ...FALLBACK_MODELS]
 *   to start from.
 * @returns {Promise<{text: string, model: string}>} Reply text and the model
 *   that produced it.
 * @throws {Error} 'All models failed.' when `attempt` is past the list;
 *   `Ollama <status>: <body>` for any other non-OK response.
 */
async function tryModels(messages, attempt = 0) {
  const candidates = [PRIMARY_MODEL, ...FALLBACK_MODELS];
  for (let index = attempt; ; index += 1) {
    const model = candidates[index];
    if (!model) throw new Error('All models failed.');
    const response = await fetch(`${OLLAMA_URL}/api/chat`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        model,
        messages,
        stream: false,
        options: { temperature: 0.2, num_predict: 1024 },
      }),
    });
    if (response.ok) {
      const payload = await response.json();
      return { text: payload.message?.content ?? '', model };
    }
    const detail = await response.text();
    // Only "model not found" moves on, and only while a fallback is left.
    const canFallBack = response.status === 404 && index < candidates.length - 1;
    if (!canFallBack) throw new Error(`Ollama ${response.status}: ${detail}`);
    console.error(`${model} not found, trying ${candidates[index + 1]}`);
  }
}
// Build the system prompt for one target language. `lang` is looked up in
// LANG_NAME for the language's English name; the style rules for all four
// target languages are included in every prompt.
const SYSTEM_PROMPT = (lang) => `You are a professional technical translator. Translate the user's text from English into ${LANG_NAME[lang]}.
CRITICAL: You MUST NOT invent, add, expand, summarise, or omit any content. The output must contain exactly the same number of paragraphs, list items and sentences as the input. If the input has 2 list items, the output has 2. Same count, same order.
ABSOLUTE PRESERVATION RULES (do NOT translate any of these):
- Anything between backticks: \`like_this\`, \`./command -flag\`, \`SomeClass::method()\`
- File paths: src/foo/bar.php, /core/modules/, application/view/
- URLs and URL fragments: https://…, /en/start/, #anchor-name
- CLI flags including their hyphens: -new-cms-page, -delete-cms-page, -mi, -m, --force, -c, -p
- Environment variable names: OLLAMA_BASE_URL, ANTHROPIC_API_KEY, APPLICATION_ENV
- Class/function/method names: pageAction, navigationAction, View::assign, Form::create
- Technical proper nouns: Nibiru, Smarty, Composer, PHP, MySQL, PostgreSQL, MMVC, MVC, CLI, ODBC, PDO, RAG, LoRA, Astro, Starlight, Ollama, Anthropic, Claude, OpenAI, GitHub, Elasticsearch, Graylog, JSON, YAML, INI, HTML, CSS, JS, TS, AJAX, SQL, AES.
- Code blocks (lines starting with \`\`\`) — copy verbatim
- HTML/JSX tags — copy structure verbatim, only translate visible text content
PRESERVE THE EXACT STRUCTURE:
- Headings keep their # / ## / ### level
- List bullet style (- or 1.) and indentation
- Bold (**), italic (*), strikethrough — keep the syntax around translated text
- Empty lines (paragraph breaks) — preserve every one
- Trailing newline / leading newline — preserve
OUTPUT FORMAT:
- Output the translated Markdown only. No preamble. No surrounding code fence. No "Here is the translation:" header. Start with the first character of the translation.
LANGUAGE STYLE:
- ja: 自然な技術日本語(です/ます調), no kana for translatable English words.
- fr: European French, formal "vous", proper accents (à, é, è, ê, ç, …).
- de: Formal "Sie" address. Standard German spelling.
- es: Neutral Latin-American Spanish, "tú" voice.`;
/**
 * Translate one piece of text via tryModels().
 *
 * @param {string} text - source text; whitespace-only input is returned as-is
 *   without calling the model.
 * @param {string} lang - target language code passed to SYSTEM_PROMPT.
 * @returns {Promise<string>} The model reply with a wrapping code fence
 *   removed and the source's leading/trailing whitespace re-applied.
 */
async function translateText(text, lang) {
  if (!text.trim()) return text;
  const reply = await tryModels([
    { role: 'system', content: SYSTEM_PROMPT(lang) },
    { role: 'user', content: text },
  ]);
  // Models occasionally wrap the output in code fences — strip them. The
  // closing fence is only removed when an opening fence was found.
  const withoutOpeningFence = reply.text.replace(/^\s*```[a-zA-Z]*\n?/, '');
  const unwrapped =
    withoutOpeningFence === reply.text
      ? reply.text
      : withoutOpeningFence.replace(/\n?```\s*$/, '');
  // Use the source's own surrounding whitespace so blank lines around the
  // chunk (e.g. between a heading and a code fence) survive translation.
  const [leading] = text.match(/^\s*/) || [''];
  const [trailing] = text.match(/\s*$/) || [''];
  return leading + unwrapped.replace(/^\s+|\s+$/g, '') + trailing;
}
// rewriteFrontmatter() needs a synchronous mapper, so the title/description
// values are translated up front (one after another) and then looked up
// while the frontmatter is rewritten. A value whose translation is rejected
// by looksHallucinated() or fails with an error stays in English.
async function translateFrontmatter(fm, lang) {
  if (!fm) return fm;
  const translations = new Map();
  for (const match of fm.matchAll(/^(\s*)(title|description):\s*(.+)$/gm)) {
    const source = match[3].replace(/^["']|["']$/g, '').trim();
    if (!source || translations.has(source)) continue;
    // English is the fallback unless a translation is accepted below.
    let accepted = source;
    try {
      const candidate = await translateText(source, lang);
      // Single-line title sanity check — no 10x expansion, no CJK leak.
      const issue = looksHallucinated(source, candidate, lang);
      if (issue) {
        console.error(` ⚠ frontmatter "${source.slice(0, 40)}…" rejected (${issue}); keeping English.`);
      } else {
        accepted = candidate;
      }
    } catch (e) {
      console.error(` ⚠ frontmatter "${source.slice(0, 40)}…" failed: ${e.message}`);
    }
    translations.set(source, accepted);
  }
  return rewriteFrontmatter(fm, (s) => translations.get(s) ?? s);
}
/**
 * Sanity check for a translated chunk: a good translation is about as long
 * as its source and has the same Markdown structure. Callers keep the
 * original text when a problem is reported, to avoid hallucinated padding.
 *
 * @param {string} src - source text.
 * @param {string} out - model output.
 * @param {string} lang - target language code; CJK output is only accepted
 *   for 'ja'.
 * @returns {string|null} A short description of the first problem found, or
 *   null when the output looks plausible.
 */
function looksHallucinated(src, out, lang) {
  if (!out.trim()) return 'empty output';
  // Expansion limit: 2.2x the source length. Short technical strings
  // (e.g. "Auth" → "Authentifizierung") legitimately grow by a larger
  // factor, so small inputs get an absolute budget instead. This limit is
  // stricter than a separate 3x single-line check would be, so no such
  // check follows.
  const floorBudget = 80;
  if (out.length > Math.max(floorBudget, src.length * 2.2)) {
    return `expansion ${out.length}/${src.length}`;
  }
  // Same count of fenced code starts
  const fences = (s) => (s.match(/^```/gm) || []).length;
  if (fences(src) !== fences(out)) return 'fence count mismatch';
  // Same count of list items (a difference of one is tolerated)
  const bullets = (s) => (s.match(/^\s*([-*]|\d+\.) /gm) || []).length;
  if (Math.abs(bullets(src) - bullets(out)) > 1) return 'list-item count mismatch';
  // Same headings, AT THE SAME LEVELS
  const headings = (s) => (s.match(/^(#{1,6}) /gm) || []).map((h) => h.trim().length);
  const hSrc = headings(src);
  const hOut = headings(out);
  if (hSrc.length !== hOut.length) return 'heading count mismatch';
  for (let i = 0; i < hSrc.length; i++) {
    // Separate the two levels; printed back to back, "2" and "3" read as "23".
    if (hSrc[i] !== hOut[i]) return `heading level mismatch at #${i + 1}: ${hSrc[i]} → ${hOut[i]}`;
  }
  // Script mixing: don't allow CJK characters in non-CJK target output.
  if (lang !== 'ja' && /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff]/.test(out)) {
    return 'CJK chars leaked into non-CJK output';
  }
  return null;
}
/**
 * Translate a document body segment by segment.
 *
 * The body is split with `splitForTranslate`. Segments of kind 'code' or
 * 'sep' are copied through untouched; every other segment is translated
 * with `translateText` and vetted with `looksHallucinated`. A rejected or
 * failed segment keeps its original text, so one bad chunk never aborts the
 * whole document. Segments are processed one at a time, in order.
 *
 * @param {string} body Document body without frontmatter.
 * @param {string} lang Target language code.
 * @returns {Promise<string>} The re-joined body.
 */
async function translateBody(body, lang) {
  const pieces = [];
  let proseIndex = 0; // 1-based counter, used only in log messages

  for (const segment of splitForTranslate(body)) {
    const passThrough = segment.kind === 'code' || segment.kind === 'sep';
    if (passThrough) {
      pieces.push(segment.text);
      continue;
    }

    proseIndex += 1;
    // Fall back to the source text unless a vetted translation is available.
    let chosen = segment.text;
    try {
      const candidate = await translateText(segment.text, lang);
      const problem = looksHallucinated(segment.text, candidate, lang);
      if (problem) {
        console.error(` ⚠ prose chunk ${proseIndex} rejected (${problem}); keeping English.`);
      } else {
        chosen = candidate;
      }
    } catch (err) {
      // Error messages can be very long; keep the log line short.
      const detail = String(err.message).slice(0, 120);
      console.error(` ⚠ prose chunk ${proseIndex} failed: ${detail}; keeping English.`);
    }
    pieces.push(chosen);
  }

  return pieces.join('');
}
/**
 * Express a file path relative to DOCS_DIR, always with forward slashes.
 *
 * @param {string} file Path to a file.
 * @returns {string} Relative path with every backslash turned into '/'.
 */
function relPath(file) {
  const relative = path.relative(DOCS_DIR, file);
  return relative.split('\\').join('/');
}
/**
 * Map a source document path to its counterpart for another language.
 *
 * The leading `en/` of the DOCS_DIR-relative path is swapped for `<lang>/`;
 * a path that does not start with `en/` is left unchanged.
 *
 * @param {string} srcPath Path of the source document.
 * @param {string} lang    Target language code.
 * @returns {string} Path of the translated document under DOCS_DIR.
 */
function targetPath(srcPath, lang) {
  const sourcePrefix = /^en\//;
  const localized = relPath(srcPath).replace(sourcePrefix, `${lang}/`);
  return path.join(DOCS_DIR, localized);
}
/**
 * Collect every `.md` / `.mdx` file below `DOCS_DIR/SOURCE_LANG`.
 *
 * Sub-directories are descended into; all other entries are kept only when
 * their name ends in `.md` or `.mdx`.
 *
 * @returns {string[]} File paths, sorted with the default string ordering.
 */
function walkEnglish() {
  const found = [];

  const visit = (dir) => {
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        visit(full);
      } else if (/\.(md|mdx)$/.test(entry.name)) {
        found.push(full);
      }
    }
  };

  visit(path.join(DOCS_DIR, SOURCE_LANG));
  found.sort();
  return found;
}
/**
 * CLI entry point: translate the English docs into every requested language.
 *
 * Prints the run configuration, collects the source files (optionally
 * narrowed to those whose path after `en/` starts with `cli.only`), then for
 * each language and file translates frontmatter and body and writes the
 * result to the language-specific path. An existing destination file is
 * skipped unless `cli.force` is set. Files are processed one at a time.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(`Ollama: ${OLLAMA_URL}`);
  console.log(`Primary model: ${PRIMARY_MODEL}`);
  console.log(`Languages: ${cli.langs.join(', ')}`);
  if (cli.only) console.log(`Only: ${cli.only}`);
  if (cli.force) console.log('Force overwrite: yes');

  const files = walkEnglish().filter((f) => {
    if (!cli.only) return true;
    return relPath(f).replace(/^en\//, '').startsWith(cli.only);
  });
  console.log(`\nFound ${files.length} English source files.\n`);

  for (const lang of cli.langs) {
    console.log(`\n=== ${LANG_NAME[lang]} (${lang}) ===`);
    for (const src of files) {
      const dst = targetPath(src, lang);
      if (fs.existsSync(dst) && !cli.force) {
        // Separate source and destination so the line is readable.
        console.log(` · ${relPath(src)} -> ${path.relative(DOCS_DIR, dst)} (exists, skip)`);
        continue;
      }
      console.log(`${relPath(src)}`);
      const raw = fs.readFileSync(src, 'utf8');
      const { fm, body } = splitFrontmatter(raw);
      const newFm = await translateFrontmatter(fm, lang);
      const newBody = await translateBody(body, lang);
      fs.mkdirSync(path.dirname(dst), { recursive: true });
      // translateFrontmatter hands a falsy `fm` straight back; in that case
      // write the body alone instead of a stray blank line (or the literal
      // text "null"/"undefined") at the top of the file.
      const header = newFm ? `${newFm}\n` : '';
      fs.writeFileSync(dst, header + newBody);
      console.log(`${path.relative(DOCS_DIR, dst)}`);
    }
  }
  console.log('\nDone.');
}
// Start the run. Any error that escapes main() is printed and the process
// exits with status 1.
main().catch((failure) => {
  console.error(failure);
  process.exit(1);
});