Files
nibiru-framework.com/docs/scripts/translate-docs.mjs
stephan f4ccc45a3b Strip api.neuronetz.ai from documentation; chat config stays in env
The Ollama URL was leaking via:
  - prose in /en/, /de/, /ja/, /es/, /fr/ docs (oracle, deployment,
    local-testing, ai/module/{overview,embed,training})
  - code blocks teaching users to curl the host directly
  - .env.example, Dockerfile, docker-compose.yml defaults
  - providers.mjs, translate-docs.mjs, build-oracle-index.mjs defaults
  - LandingScripts.astro comment
  - lora-runbook.md prose + SSH host
  - the GET handler at /api/oracle which echoed `ollamaUrl` back to public callers
  - the "Oracle is silent" fallback message at /api/oracle POST

Replacements:
  - prose: "neuronetz.ai" → "your Ollama instance"
  - example URLs in code blocks: https://api.neuronetz.ai → https://your-ollama-host.example
  - code-level defaults: → http://localhost:11434 (Ollama's standard local port)
  - GET /api/oracle: dropped the `ollamaUrl` field; provider + model still exposed
  - runbook SSH host: neuronetz@cloud.neuronetz.ai → <gpu-user>@<gpu-host>

Production chat is unaffected: docs/.env (gitignored) on the production
host still pins OLLAMA_BASE_URL=https://api.neuronetz.ai. The only
change in the running container is that the GET handler no longer
echoes the URL.

analytics.neuronetz.ai (Umami tracking) is intentionally left intact —
it's a public, brand-owned subdomain meant to be visible.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 17:14:17 +02:00

405 lines
15 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* Translate every English doc (src/content/docs/en/**) into one or more
* target locales using your own Ollama instance.
*
* node scripts/translate-docs.mjs # all locales (de, ja, es, fr)
* node scripts/translate-docs.mjs --lang=de # only German
* node scripts/translate-docs.mjs --lang=de,ja # multi-select
* node scripts/translate-docs.mjs --force # overwrite existing
* node scripts/translate-docs.mjs --only=start/ # path prefix filter
*
* Env:
* OLLAMA_BASE_URL (default http://localhost:11434)
* OLLAMA_TRANSLATE_MODEL (default qwen2.5-coder:14b → falls back to
* mistral-small:latest when the server reports the model as missing)
*
* What it preserves verbatim:
* - YAML frontmatter STRUCTURE (only `title` + `description` get translated;
* keys, slugs, hero.actions[].link stay untouched)
* - Code fences (```…```) and inline code (`…`)
* - HTML/JSX tags including <CardGrid>, <Card title="…">
* - Markdown links + images
* - Internal anchor URLs (/en/… stays /en/… → not auto-rewritten;
* a separate pass rewrites the locale prefix in produced output.)
*/
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Root of the docs content tree, resolved relative to this script. The English
// sources are read from `<DOCS_DIR>/en`; translations go to `<DOCS_DIR>/<lang>`.
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
const SOURCE_LANG = 'en';
const ALL_TARGETS = ['de', 'ja', 'es', 'fr'];
// Language names interpolated into the translation prompt and the progress log.
const LANG_NAME = {
  de: 'German', ja: 'Japanese', es: 'Spanish', fr: 'French',
};
// Base URL of the Ollama server; a single trailing slash is removed so that
// request paths can be appended with a leading "/".
const OLLAMA_URL = (process.env.OLLAMA_BASE_URL ?? 'http://localhost:11434').replace(/\/$/, '');
// Model choice: qwen2.5-coder:14b is the default. It is conservative (low
// hallucination) and proven on real pages. It is slower than qwen2:7.6b, but
// qwen2 hallucinated frontmatter and mixed scripts (Chinese chars in German
// output), so it is no longer used. mistral-small handles European languages
// well and is the fallback. Requests must finish inside the nginx 60-90s
// timeout, which is why chunks are kept small.
const PRIMARY_MODEL = process.env.OLLAMA_TRANSLATE_MODEL ?? 'qwen2.5-coder:14b';
const FALLBACK_MODELS = ['mistral-small:latest'];

/**
 * Parse the script's command-line flags.
 *
 * Recognised flags: `--lang=a,b` (comma-separated target locales),
 * `--force` (overwrite existing translations) and `--only=<prefix>`
 * (path-prefix filter). Any other argument is ignored.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{langs: string[], force: boolean, only: (string|null)}}
 * @throws {Error} When `--lang` is empty or names a locale that is not in
 *   ALL_TARGETS. Such a locale has no LANG_NAME entry, so the prompt would
 *   ask for a translation into "undefined".
 */
function parseCliArgs(argv) {
  const parsed = { langs: [...ALL_TARGETS], force: false, only: null };
  for (const a of argv) {
    if (a.startsWith('--lang=')) {
      const requested = a.slice(7).split(',').map((s) => s.trim()).filter(Boolean);
      // De-duplicate while keeping the order the user asked for.
      parsed.langs = [...new Set(requested)];
    } else if (a === '--force') {
      parsed.force = true;
    } else if (a.startsWith('--only=')) {
      parsed.only = a.slice(7);
    }
  }
  if (parsed.langs.length === 0) {
    throw new Error(`No target language given. Supported: ${ALL_TARGETS.join(', ')}`);
  }
  const unsupported = parsed.langs.filter((l) => !ALL_TARGETS.includes(l));
  if (unsupported.length > 0) {
    throw new Error(
      `Unsupported --lang value(s): ${unsupported.join(', ')}. Supported: ${ALL_TARGETS.join(', ')}`,
    );
  }
  return parsed;
}

const args = process.argv.slice(2);
let cli;
try {
  cli = parseCliArgs(args);
} catch (err) {
  // A bad flag is a usage error: print the message without a stack trace.
  console.error(err.message);
  process.exit(1);
}
// ---------------------------------------------------------------------------
// Markdown chunking — split into the smallest sensible units:
//   - Each fenced code block is its own segment (kept verbatim).
//   - Prose is split on blank-line paragraph boundaries.
//   - A heading stays in the same chunk as the text that follows it when no
//     blank line separates them.
// Goal: every translatable chunk is ≤ ~300 tokens so the LLM completes
// in well under the nginx 60-90s window.
//
// The split is lossless: concatenating every segment's `text` in order
// reproduces `body` exactly, including the newlines around code fences.
// ---------------------------------------------------------------------------
/**
 * @param {string} body - Markdown body (without frontmatter).
 * @returns {{kind: ('prose'|'code'|'sep'), text: string}[]} Ordered segments.
 *   `code` is a whole fenced block, `sep` is whitespace only, `prose` is
 *   everything else.
 */
function splitForTranslate(body) {
  const segments = [];
  // Keep each line's terminating "\n" attached so no newline is lost when a
  // buffer is flushed in the middle of the document.
  const lines = body.match(/[^\n]*\n|[^\n]+/g) ?? [];
  let buf = [];
  // While inside a fenced block: the fence character and the opener's length.
  let fence = null;

  const pushTextual = (text) => {
    segments.push({ kind: text.trim() ? 'prose' : 'sep', text });
  };

  const flushProse = () => {
    if (!buf.length) return;
    const text = buf.join('');
    buf = [];
    let acc = '';
    // The capture group keeps the blank-line separators in the result.
    for (const part of text.split(/(\n\s*\n)/)) {
      if (/^\n\s*\n$/.test(part)) {
        if (acc.length) pushTextual(acc);
        segments.push({ kind: 'sep', text: part });
        acc = '';
      } else {
        acc += part;
      }
    }
    if (acc.length) pushTextual(acc);
  };

  const flushCode = () => {
    if (!buf.length) return;
    segments.push({ kind: 'code', text: buf.join('') });
    buf = [];
  };

  for (const line of lines) {
    // Fence detection looks at the line content without its line ending.
    const content = line.replace(/\r?\n$/, '');
    if (fence === null) {
      // An opener is a run of >= 3 identical fence characters, optionally
      // indented (e.g. inside a list item). A backtick opener may not have
      // further backticks on the same line; that would be inline code.
      const open = content.match(/^[ \t]*(`{3,}(?=[^`]*$)|~{3,})/);
      if (open) {
        flushProse();
        fence = { char: open[1][0], len: open[1].length };
        buf.push(line);
        continue;
      }
    } else {
      // A closer uses the same character, is at least as long as the opener,
      // and carries no info string. A shorter or annotated fence line inside
      // the block is just content.
      const close = content.match(/^[ \t]*(`{3,}|~{3,})[ \t]*$/);
      if (close && close[1][0] === fence.char && close[1].length >= fence.len) {
        buf.push(line);
        flushCode();
        fence = null;
        continue;
      }
    }
    buf.push(line);
  }
  // An unterminated fence runs to the end of the document and stays verbatim.
  if (fence !== null) flushCode();
  else flushProse();
  return segments;
}
// ---------------------------------------------------------------------------
// Frontmatter handling. The YAML block is separated from the Markdown body
// here; only the values of `title` and `description` are translated later,
// every key stays verbatim.
// ---------------------------------------------------------------------------
/**
 * Separate a leading `---` frontmatter block from the rest of the document.
 *
 * @param {string} raw - Full file contents.
 * @returns {{fm: string, body: string}} `fm` runs from the opening `---`
 *   through the closing `---` (no trailing newline). `body` is what follows,
 *   minus one leading line break. When the file does not start with `---` on
 *   its own line, or no closing `---` is found, `fm` is '' and `body` is `raw`.
 */
function splitFrontmatter(raw) {
  const untouched = { fm: '', body: raw };
  // The opener must be the very first line (LF or CRLF).
  if (!/^---\r?\n/.test(raw)) return untouched;

  const closer = '\n---';
  // Start searching after the opening dashes so they are not matched again.
  const closerAt = raw.indexOf(closer, 3);
  if (closerAt < 0) return untouched;

  const cut = closerAt + closer.length;
  return {
    fm: raw.slice(0, cut),
    body: raw.slice(cut).replace(/^\r?\n/, ''),
  };
}
/**
 * Replace the values of `title:` and `description:` lines in a frontmatter
 * block, leaving every other line untouched.
 *
 * @param {string} fm - Frontmatter text (LF or CRLF line endings).
 * @param {(source: string) => string} translateString - Called with the
 *   value after one leading/trailing quote character has been removed and
 *   the result trimmed. Its return value becomes the new value.
 * @returns {string} The frontmatter with rewritten values, always emitted as
 *   double-quoted YAML strings. Line endings are preserved.
 */
function rewriteFrontmatter(fm, translateString) {
  if (!fm) return fm;
  return fm
    .split('\n')
    .map((line) => {
      // `.` never matches "\r", so a CRLF line would fail the pattern below.
      // Peel the "\r" off and put it back on output.
      const eol = line.endsWith('\r') ? '\r' : '';
      const content = eol ? line.slice(0, -1) : line;
      const m = content.match(/^(\s*)(title|description):\s*(.*)$/);
      if (!m) return line;
      const [, indent, key, raw] = m;
      // Same normalisation translateFrontmatter uses to build its lookup keys.
      const stripped = raw.replace(/^["']|["']$/g, '').trim();
      if (!stripped) return line;
      // A block-scalar header (`|`, `>`, `>-`, `|2+`, …) is YAML syntax, not
      // text. Quoting it would orphan the indented lines that follow.
      if (/^[|>][-+0-9]*(\s+#.*)?$/.test(stripped)) return line;
      // The value is written on one line, so fold any line breaks into spaces.
      const tr = translateString(stripped).replace(/\s*[\r\n]+\s*/g, ' ');
      // Always quote (handles colons, quotes safely)
      const escaped = tr.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
      return `${indent}${key}: "${escaped}"${eol}`;
    })
    .join('\n');
}
// ---------------------------------------------------------------------------
// Ollama chat request. Models are tried in order: PRIMARY_MODEL first, then
// each entry of FALLBACK_MODELS. Only a 404 response moves on to the next one.
// ---------------------------------------------------------------------------
/**
 * POST `messages` to `${OLLAMA_URL}/api/chat` (non-streaming).
 *
 * @param {{role: string, content: string}[]} messages - Chat messages.
 * @param {number} [attempt=0] - Index into the model list to start from.
 * @returns {Promise<{text: string, model: string}>} The reply content ('' when
 *   the response carries no `message.content`) and the model that answered.
 * @throws {Error} 'All models failed.' when `attempt` selects no model, or
 *   `Ollama <status>: <body>` for a non-OK response that is not a 404 with a
 *   further model left to try.
 */
async function tryModels(messages, attempt = 0) {
  const candidates = [PRIMARY_MODEL, ...FALLBACK_MODELS];
  const lastIndex = candidates.length - 1;

  for (let i = attempt; ; i += 1) {
    const model = candidates[i];
    if (!model) throw new Error('All models failed.');

    const payload = {
      model,
      messages,
      stream: false,
      options: { temperature: 0.2, num_predict: 1024 },
    };
    const res = await fetch(`${OLLAMA_URL}/api/chat`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

    if (res.ok) {
      const data = await res.json();
      return { text: data.message?.content ?? '', model };
    }

    const text = await res.text();
    const canFallBack = res.status === 404 && i < lastIndex;
    if (!canFallBack) throw new Error(`Ollama ${res.status}: ${text}`);
    console.error(`${model} not found, trying ${candidates[i + 1]}`);
  }
}
/**
 * Build the system message sent with every translation request.
 *
 * `lang` is only used to look up the target language name in LANG_NAME. A
 * code without an entry there is interpolated as "undefined". The
 * "LANGUAGE STYLE" notes for all four locales are always included, whatever
 * `lang` is.
 *
 * @param {string} lang - Target locale code (a key of LANG_NAME).
 * @returns {string} The full prompt text.
 */
const SYSTEM_PROMPT = (lang) => `You are a professional technical translator. Translate the user's text from English into ${LANG_NAME[lang]}.
CRITICAL: You MUST NOT invent, add, expand, summarise, or omit any content. The output must contain exactly the same number of paragraphs, list items and sentences as the input. If the input has 2 list items, the output has 2. Same count, same order.
ABSOLUTE PRESERVATION RULES (do NOT translate any of these):
- Anything between backticks: \`like_this\`, \`./command -flag\`, \`SomeClass::method()\`
- File paths: src/foo/bar.php, /core/modules/, application/view/
- URLs and URL fragments: https://…, /en/start/, #anchor-name
- CLI flags including their hyphens: -new-cms-page, -delete-cms-page, -mi, -m, --force, -c, -p
- Environment variable names: OLLAMA_BASE_URL, ANTHROPIC_API_KEY, APPLICATION_ENV
- Class/function/method names: pageAction, navigationAction, View::assign, Form::create
- Technical proper nouns: Nibiru, Smarty, Composer, PHP, MySQL, PostgreSQL, MMVC, MVC, CLI, ODBC, PDO, RAG, LoRA, Astro, Starlight, Ollama, Anthropic, Claude, OpenAI, GitHub, Elasticsearch, Graylog, JSON, YAML, INI, HTML, CSS, JS, TS, AJAX, SQL, AES.
- Code blocks (lines starting with \`\`\`) — copy verbatim
- HTML/JSX tags — copy structure verbatim, only translate visible text content
PRESERVE THE EXACT STRUCTURE:
- Headings keep their # / ## / ### level
- List bullet style (- or 1.) and indentation
- Bold (**), italic (*), strikethrough — keep the syntax around translated text
- Empty lines (paragraph breaks) — preserve every one
- Trailing newline / leading newline — preserve
OUTPUT FORMAT:
- Output the translated Markdown only. No preamble. No surrounding code fence. No "Here is the translation:" header. Start with the first character of the translation.
LANGUAGE STYLE:
- ja: 自然な技術日本語(です/ます調), no kana for translatable English words.
- fr: European French, formal "vous", proper accents (à, é, è, ê, ç, …).
- de: Formal "Sie" address. Standard German spelling.
- es: Neutral Latin-American Spanish, "tú" voice.`;
/**
 * Translate one chunk of text through the model.
 *
 * @param {string} text - Source chunk. Whitespace-only input is returned as is
 *   without calling the model.
 * @param {string} lang - Target locale code passed to SYSTEM_PROMPT.
 * @returns {Promise<string>} The model's reply with a wrapping code fence
 *   removed (if present), trimmed, and then surrounded by exactly the
 *   leading and trailing whitespace of `text`.
 */
async function translateText(text, lang) {
  if (text.trim() === '') return text;

  const conversation = [
    { role: 'system', content: SYSTEM_PROMPT(lang) },
    { role: 'user', content: text },
  ];
  const reply = await tryModels(conversation);
  const answer = reply.text;

  // Models occasionally wrap the whole answer in a code fence. The closing
  // fence is only removed when an opening fence was actually found.
  const withoutOpener = answer.replace(/^\s*```[a-zA-Z]*\n?/, '');
  const unwrapped = withoutOpener === answer
    ? answer
    : withoutOpener.replace(/\n?```\s*$/, '');

  // Re-apply the source's own surrounding whitespace so blank lines around
  // the chunk (e.g. between a heading and a code fence) survive translation.
  const [leading] = /^\s*/.exec(text);
  const [trailing] = /\s*$/.exec(text);
  return `${leading}${unwrapped.trim()}${trailing}`;
}
// Frontmatter translation happens in two steps: first every distinct `title`
// / `description` value is translated (one request each, in sequence), then
// rewriteFrontmatter patches the results back in via a synchronous lookup.
/**
 * @param {string} fm - Frontmatter block; a falsy value is returned unchanged.
 * @param {string} lang - Target locale code.
 * @returns {Promise<string>} Frontmatter with translated values. A value
 *   whose translation fails or is rejected by looksHallucinated keeps its
 *   English text.
 */
async function translateFrontmatter(fm, lang) {
  if (!fm) return fm;

  const translations = new Map();
  const valuePattern = /^(\s*)(title|description):\s*(.+)$/gm;

  for (const found of fm.matchAll(valuePattern)) {
    const source = found[3].replace(/^["']|["']$/g, '').trim();
    if (source === '' || translations.has(source)) continue;

    // English is the fallback unless a translation passes the sanity check.
    let chosen = source;
    try {
      const candidate = await translateText(source, lang);
      // Single-line title sanity check — no 10x expansion, no CJK leak.
      const issue = looksHallucinated(source, candidate, lang);
      if (issue) {
        console.error(` ⚠ frontmatter "${source.slice(0, 40)}…" rejected (${issue}); keeping English.`);
      } else {
        chosen = candidate;
      }
    } catch (e) {
      console.error(` ⚠ frontmatter "${source.slice(0, 40)}…" failed: ${e.message}`);
    }
    translations.set(source, chosen);
  }

  const lookup = (s) => translations.get(s) ?? s;
  return rewriteFrontmatter(fm, lookup);
}
// Sanity check: a good translation has roughly the same length and the same
// Markdown structural markers as the source. If they diverge we keep the
// original to avoid hallucinated padding.
/**
 * @param {string} src - Source text that was sent for translation.
 * @param {string} out - Candidate translation.
 * @param {string} lang - Target locale code; only 'ja' may contain CJK text.
 * @returns {(string|null)} A short reason when `out` should be rejected,
 *   otherwise null.
 */
function looksHallucinated(src, out, lang) {
  if (!out.trim()) return 'empty output';
  // 2.2x expansion is the hallucination-detection threshold for prose chunks,
  // but short technical strings (e.g. "Auth" → "Authentifizierung") routinely
  // expand 4–5×. So we use a soft floor: small inputs get an absolute budget.
  // Single-line frontmatter titles are covered by this same check.
  const floorBudget = 80;
  if (out.length > Math.max(floorBudget, src.length * 2.2)) {
    return `expansion ${out.length}/${src.length}`;
  }
  // Same count of lines that start with a backtick fence.
  const fences = (s) => (s.match(/^```/gm) || []).length;
  if (fences(src) !== fences(out)) return 'fence count mismatch';
  // List-item count may differ by at most one.
  const bullets = (s) => (s.match(/^\s*([-*]|\d+\.) /gm) || []).length;
  if (Math.abs(bullets(src) - bullets(out)) > 1) return 'list-item count mismatch';
  // Same headings, AT THE SAME LEVELS. Each match is "#…# " so the trimmed
  // length is the heading level.
  const headings = (s) => (s.match(/^(#{1,6}) /gm) || []).map((h) => h.trim().length);
  const hSrc = headings(src);
  const hOut = headings(out);
  if (hSrc.length !== hOut.length) return 'heading count mismatch';
  for (let i = 0; i < hSrc.length; i++) {
    if (hSrc[i] !== hOut[i]) return `heading level mismatch at #${i + 1}: ${hSrc[i]} → ${hOut[i]}`;
  }
  // Script mixing: don't allow CJK characters in non-CJK target output.
  // Ranges: U+3040–30FF, U+3400–4DBF, U+4E00–9FFF, written as escapes so the
  // source contains no look-alike glyphs.
  if (lang !== 'ja' && /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff]/.test(out)) {
    return 'CJK chars leaked into non-CJK output';
  }
  return null;
}
/**
 * Translate a Markdown body chunk by chunk.
 *
 * Segments come from splitForTranslate. `code` and `sep` segments are copied
 * through untouched. Each remaining segment is translated and then checked by
 * looksHallucinated. When the request throws or the check reports a problem,
 * the English segment is kept and a warning is logged.
 *
 * @param {string} body - Markdown body without frontmatter.
 * @param {string} lang - Target locale code.
 * @returns {Promise<string>} All segment texts concatenated in order.
 */
async function translateBody(body, lang) {
  const pieces = [];
  // 1-based counter over translated segments, used only in log messages.
  let proseIndex = 0;

  for (const { kind, text } of splitForTranslate(body)) {
    const verbatim = kind === 'code' || kind === 'sep';
    let result = text;

    if (!verbatim) {
      proseIndex += 1;
      try {
        const translated = await translateText(text, lang);
        const issue = looksHallucinated(text, translated, lang);
        if (issue) {
          console.error(` ⚠ prose chunk ${proseIndex} rejected (${issue}); keeping English.`);
        } else {
          result = translated;
        }
      } catch (e) {
        // Server error bodies can be long; log only the first 120 characters.
        const msg = String(e.message).slice(0, 120);
        console.error(` ⚠ prose chunk ${proseIndex} failed: ${msg}; keeping English.`);
      }
    }

    pieces.push(result);
  }

  return pieces.join('');
}
/**
 * Express `file` relative to DOCS_DIR, always with forward slashes.
 *
 * @param {string} file - Path to a file.
 * @returns {string} Relative path with every backslash replaced by "/".
 */
function relPath(file) {
  const relative = path.relative(DOCS_DIR, file);
  return relative.split('\\').join('/');
}
/**
 * Map an English source file to its counterpart in another locale.
 *
 * @param {string} srcPath - Path of the English source file.
 * @param {string} lang - Target locale code.
 * @returns {string} Path under DOCS_DIR where the leading `en/` of the
 *   relative path is replaced by `<lang>/`.
 */
function targetPath(srcPath, lang) {
  const localePrefix = `${lang}/`;
  const swapped = relPath(srcPath).replace(/^en\//, localePrefix);
  return path.join(DOCS_DIR, swapped);
}
/**
 * Collect every `.md` / `.mdx` file below `<DOCS_DIR>/<SOURCE_LANG>`.
 *
 * @returns {string[]} File paths, sorted with the default string ordering.
 */
function walkEnglish() {
  const found = [];

  // Depth-first descent; the final sort makes the visiting order irrelevant.
  const visit = (dir) => {
    const entries = fs.readdirSync(dir, { withFileTypes: true });
    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) {
        visit(full);
        continue;
      }
      const isDoc = entry.name.endsWith('.md') || entry.name.endsWith('.mdx');
      if (isDoc) found.push(full);
    }
  };

  visit(path.join(DOCS_DIR, SOURCE_LANG));
  return found.sort();
}
/**
 * Entry point: translate every selected English doc into every selected
 * locale and write the result next to the other locales under DOCS_DIR.
 *
 * Existing target files are skipped unless `--force` was given. `--only`
 * restricts the run to sources whose path (relative to the `en/` directory)
 * starts with the given prefix.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log(`Ollama: ${OLLAMA_URL}`);
  console.log(`Primary model: ${PRIMARY_MODEL}`);
  console.log(`Languages: ${cli.langs.join(', ')}`);
  if (cli.only) console.log(`Only: ${cli.only}`);
  if (cli.force) console.log('Force overwrite: yes');
  const files = walkEnglish().filter((f) => {
    if (!cli.only) return true;
    return relPath(f).replace(/^en\//, '').startsWith(cli.only);
  });
  console.log(`\nFound ${files.length} English source files.\n`);
  for (const lang of cli.langs) {
    console.log(`\n=== ${LANG_NAME[lang]} (${lang}) ===`);
    for (const src of files) {
      const dst = targetPath(src, lang);
      if (fs.existsSync(dst) && !cli.force) {
        // Separate the two paths; glued together they read as one bogus path.
        console.log(` · ${relPath(src)} → ${relPath(dst)} (exists, skip)`);
        continue;
      }
      console.log(relPath(src));
      const raw = fs.readFileSync(src, 'utf8');
      const { fm, body } = splitFrontmatter(raw);
      const newFm = await translateFrontmatter(fm, lang);
      const newBody = await translateBody(body, lang);
      fs.mkdirSync(path.dirname(dst), { recursive: true });
      // splitFrontmatter removed the line break after the closing `---`, so
      // put it back. A file without frontmatter must not gain a blank first line.
      const output = fm ? `${newFm}\n${newBody}` : newBody;
      fs.writeFileSync(dst, output);
      console.log(relPath(dst));
    }
  }
  console.log('\nDone.');
}
main().catch((e) => {
  console.error(e);
  process.exit(1);
});