Files
nibiru-framework.com/docs/scripts/lib/chunk.mjs
stephan f4ccc45a3b Strip api.neuronetz.ai from documentation; chat config stays in env
The Ollama URL was leaking via:
  - prose in /en/, /de/, /ja/, /es/, /fr/ docs (oracle, deployment,
    local-testing, ai/module/{overview,embed,training})
  - code blocks teaching users to curl the host directly
  - .env.example, Dockerfile, docker-compose.yml defaults
  - providers.mjs, translate-docs.mjs, build-oracle-index.mjs defaults
  - LandingScripts.astro comment
  - lora-runbook.md prose + SSH host
  - the GET handler at /api/oracle which echoed `ollamaUrl` back to public callers
  - the "Oracle is silent" fallback message at /api/oracle POST

Replacements:
  - prose: "neuronetz.ai" → "your Ollama instance"
  - example URLs in code blocks: https://api.neuronetz.ai → https://your-ollama-host.example
  - code-level defaults: → http://localhost:11434 (Ollama's standard local port)
  - GET /api/oracle: dropped the `ollamaUrl` field; provider + model still exposed
  - runbook SSH host: neuronetz@cloud.neuronetz.ai → <gpu-user>@<gpu-host>

Production chat is unaffected: docs/.env (gitignored) on the production
host still pins OLLAMA_BASE_URL=https://api.neuronetz.ai. The only
change in the running container is that the GET handler no longer
echoes the URL.

analytics.neuronetz.ai (Umami tracking) is intentionally left intact —
it's a public, brand-owned subdomain meant to be visible.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 17:14:17 +02:00

182 lines
5.3 KiB
JavaScript

// Markdown → chunks at H2/H3 boundaries.
// Used by both build-oracle-index.mjs (RAG) and build-corpus.mjs (LoRA training).
import fs from 'node:fs';
import path from 'node:path';
// Flush threshold when an oversized section is re-split: the current part is
// closed once adding the next paragraph would push its estimate past this value.
const TARGET_TOKENS = 600;
// Sections whose estimate is below this are appended to the preceding section.
const MIN_TOKENS = 120;
// Sections whose estimate exceeds this are re-split at paragraph boundaries.
const MAX_TOKENS = 900;
/**
 * Cheap token estimate: ~4 chars per token for English / European languages,
 * closer to 1.5 for CJK. We use a conservative average to avoid undersizing chunks.
 *
 * Lengths are counted in UTF-16 code units (`String.prototype.length`).
 *
 * @param {string | null | undefined} text text to measure
 * @returns {number} estimated token count, rounded up; 0 for empty or nullish input
 */
export function estimateTokens(text) {
  // Nothing to count; also avoids a TypeError from calling `.match` on null/undefined.
  if (text == null || text === '') return 0;
  // Ranges are spelled as escapes because several endpoints are unassigned or
  // visually ambiguous code points:
  //   U+3040–30FF  Hiragana + Katakana
  //   U+3400–4DBF  CJK Unified Ideographs Extension A
  //   U+4E00–9FFF  CJK Unified Ideographs
  //   U+F900–FAFF  CJK Compatibility Ideographs
  const cjk = (text.match(/[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]/g) || []).length;
  const other = text.length - cjk;
  return Math.ceil(cjk / 1.5 + other / 4);
}
/**
 * Split a markdown document into its leading `---` frontmatter and the body.
 *
 * Only flat `key: value` lines are parsed (keys made of letters, digits, `_`
 * and `-`); values are trimmed and one surrounding quote character is removed
 * from each end. Anything else in the frontmatter block is ignored.
 *
 * Accepts LF or CRLF line endings and an optional leading BOM. The opening and
 * closing `---` must each sit on a line of their own; if either is missing the
 * whole input (minus any BOM) is returned as the body.
 *
 * @param {string} md raw markdown
 * @returns {{ frontmatter: Record<string, string>, body: string }}
 */
function stripFrontmatter(md) {
  // Drop a BOM so the opening delimiter is still found at offset 0.
  const text = md.charCodeAt(0) === 0xfeff ? md.slice(1) : md;
  // The content group is lazily optional so an empty block (`---\n---`) closes
  // at the very next line instead of scanning ahead to a later `---` rule.
  const match = text.match(/^---[ \t]*\r?\n(?:([\s\S]*?)\r?\n)??---[ \t]*(?:\r?\n|$)/);
  if (!match) return { frontmatter: {}, body: text };

  const frontmatter = {};
  // Splitting on \r?\n keeps a trailing \r from defeating the `(.*)$` match below.
  for (const line of (match[1] ?? '').split(/\r?\n/)) {
    const m = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
    if (m) frontmatter[m[1]] = m[2].trim().replace(/^["']|["']$/g, '');
  }
  return { frontmatter, body: text.slice(match[0].length) };
}
/**
 * Turn heading text into an ASCII anchor slug.
 *
 * Lower-cases the input, decomposes accented characters (NFKD) and drops the
 * combining marks, removes everything that is not `a-z`, `0-9`, whitespace or
 * `-`, then trims and replaces each whitespace run with a single `-`.
 * Text with no ASCII letters or digits therefore yields an empty string.
 *
 * @param {*} s value to slugify; coerced with String()
 * @returns {string} the slug, possibly empty
 */
function slugify(s) {
  return String(s)
    .toLowerCase()
    .normalize('NFKD')
    // Combining Diacritical Marks block, written as escapes because the
    // literal characters are invisible in most editors and diffs.
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[^a-z0-9\s-]/g, '')
    .trim()
    .replace(/\s+/g, '-');
}
/**
 * Split a markdown body into sections at H2/H3 headings, keeping fenced code
 * blocks intact.
 *
 * Each section is `{ heading, level, anchor, lines }`. Text before the first
 * heading becomes a section with `heading: null`, `level: 0`, `anchor: null`.
 * A heading line is kept as the first entry of its section's `lines`.
 *
 * Fences follow the CommonMark rules: an opener is a run of 3+ backticks or
 * tildes indented at most 3 spaces (a backtick opener may not have further
 * backticks on the line), and it is closed only by a run of the same character
 * that is at least as long and has nothing after it. An unclosed fence extends
 * to the end of the body.
 *
 * @param {string} body markdown without frontmatter
 * @returns {Array<{ heading: string | null, level: number, anchor: string | null, lines: string[] }>}
 */
function splitByHeadings(body) {
  const sections = [];
  let current = { heading: null, level: 0, anchor: null, lines: [] };
  // Non-null while inside a fenced block: the opener's marker char and run length.
  let fence = null;

  for (const line of body.split('\n')) {
    if (fence) {
      const close = line.match(/^ {0,3}(`{3,}|~{3,})\s*$/);
      if (close && close[1][0] === fence.char && close[1].length >= fence.length) {
        fence = null;
      }
      current.lines.push(line);
      continue;
    }

    // The lookahead rejects lines such as "```code``` text", which are inline
    // code spans rather than fence openers.
    const open = line.match(/^ {0,3}(`{3,}(?=[^`]*$)|~{3,})/);
    if (open) {
      fence = { char: open[1][0], length: open[1].length };
      current.lines.push(line);
      continue;
    }

    const h = line.match(/^(#{2,3})\s+(.+?)\s*$/);
    if (h) {
      if (current.lines.length || current.heading) sections.push(current);
      const heading = h[2].trim();
      current = {
        heading,
        level: h[1].length,
        anchor: slugify(heading),
        lines: [line],
      };
      continue;
    }

    current.lines.push(line);
  }

  if (current.lines.length || current.heading) sections.push(current);
  return sections;
}
/**
 * Further split a too-large section at paragraph boundaries, preserving fences.
 *
 * A section whose estimate is at most MAX_TOKENS is returned unchanged as a
 * one-element array (same object). Otherwise its lines are grouped into
 * blank-line-separated blocks and the blocks are packed greedily: a part is
 * closed as soon as adding the next block would exceed TARGET_TOKENS. Every
 * part copies the section's metadata (heading, level, anchor) and differs only
 * in `lines`. A single block larger than the target is kept whole.
 *
 * Blank lines inside a fenced code block do not end a block, so one code
 * sample is never spread over several parts.
 *
 * @param {{ lines: string[] }} section section as produced by splitByHeadings
 * @returns {Array<object>} one or more sections
 */
function splitOversized(section) {
  const text = section.lines.join('\n');
  if (estimateTokens(text) <= MAX_TOKENS) return [section];

  // Pass 1: group lines into blocks, tracking fence state so that blank lines
  // inside a code sample stay inside the same block.
  const blocks = [];
  let block = [];
  let fence = null; // { char, length } of the open fence, if any
  const endBlock = () => {
    if (block.length) blocks.push(block.join('\n'));
    block = [];
  };
  for (const line of section.lines) {
    if (fence) {
      const close = line.match(/^ {0,3}(`{3,}|~{3,})\s*$/);
      if (close && close[1][0] === fence.char && close[1].length >= fence.length) {
        fence = null;
      }
      block.push(line);
    } else if (line.trim() === '') {
      endBlock();
    } else {
      const open = line.match(/^ {0,3}(`{3,}(?=[^`]*$)|~{3,})/);
      if (open) fence = { char: open[1][0], length: open[1].length };
      block.push(line);
    }
  }
  endBlock();

  // Pass 2: pack blocks into parts of roughly TARGET_TOKENS each.
  const parts = [];
  let buf = [];
  let bufTokens = 0;
  const flush = () => {
    parts.push({ ...section, lines: buf.join('\n\n').split('\n') });
    buf = [];
    bufTokens = 0;
  };
  for (const b of blocks) {
    const t = estimateTokens(b);
    if (bufTokens + t > TARGET_TOKENS && buf.length) flush();
    buf.push(b);
    bufTokens += t;
  }
  if (buf.length) flush();
  return parts;
}
/**
 * Fold undersized sections into the section that precedes them.
 *
 * A section whose own estimate is below MIN_TOKENS is appended (after one
 * blank line) to the previously emitted section; its heading metadata is not
 * kept, only its lines. The first section is always emitted as-is, whatever
 * its size. Input sections are not mutated: emitted sections are shallow
 * copies with a fresh `lines` array.
 *
 * @param {Array<{ lines: string[] }>} sections
 * @returns {Array<object>} merged sections
 */
function mergeSmall(sections) {
  return sections.reduce((merged, section) => {
    const isTiny = estimateTokens(section.lines.join('\n')) < MIN_TOKENS;
    const hasPrevious = merged.length > 0;

    if (hasPrevious && isTiny) {
      const previous = merged[merged.length - 1];
      previous.lines = previous.lines.concat('', section.lines);
      return merged;
    }

    merged.push({ ...section, lines: section.lines.slice() });
    return merged;
  }, []);
}
/**
 * Lower-level: chunk a raw markdown string with caller-supplied metadata.
 * Used both by chunkFile() (which derives meta from a path) and by
 * external sources like the framework-reference-v2 doc.
 *
 * Pipeline: strip frontmatter → split at H2/H3 → re-split oversized sections
 * → merge undersized ones → drop empty sections → build one record per section.
 *
 * Chunk ids are `<file>#<anchor>`, or `<file>#_<index>` when the section has no
 * usable anchor (preamble, or a heading whose slug is empty). Ids are unique
 * within one call: when a section would reuse an id already issued (repeated
 * heading, or several parts of one re-split section), `-2`, `-3`, … is
 * appended. The `url` is not altered by this de-duplication.
 *
 * @param {string} raw raw markdown (frontmatter optional)
 * @param {object} meta {
 *   language, file, baseUrl, pageTitle, pageDescription
 * } — language defaults to frontmatter `lang`, then 'en'; file defaults to
 *   pageTitle, then 'untitled.md'; baseUrl defaults to '/'; pageTitle defaults
 *   to frontmatter `title`, then file; pageDescription defaults to frontmatter
 *   `description`, then ''
 * @returns {Array<{
 *   id: string, language: string, file: string, url: string,
 *   pageTitle: string, pageDescription: string, sectionTitle: string,
 *   headingLevel: number, tokens: number, content: string
 * }>}
 */
export function chunkMarkdown(raw, meta = {}) {
  const { frontmatter, body } = stripFrontmatter(raw);
  const language = meta.language || frontmatter.lang || 'en';
  const file = meta.file || meta.pageTitle || 'untitled.md';
  const baseUrl = meta.baseUrl || '/';
  const pageTitle = meta.pageTitle || frontmatter.title || file;
  const pageDescription = meta.pageDescription || frontmatter.description || '';

  let sections = splitByHeadings(body);
  sections = sections.flatMap(splitOversized);
  sections = mergeSmall(sections);
  sections = sections.filter((s) => s.lines.join('\n').trim().length > 0);

  const usedIds = new Set();
  const chunks = [];
  for (const [idx, s] of sections.entries()) {
    const content = s.lines.join('\n').trim();
    const sectionTitle = s.heading || pageTitle;
    const url = s.anchor && s.heading ? `${baseUrl}#${s.anchor}` : baseUrl;

    // `||` rather than `??`: an empty-string anchor must fall back to the index.
    const base = s.anchor || `_${idx}`;
    let id = `${file}#${base}`;
    for (let n = 2; usedIds.has(id); n += 1) id = `${file}#${base}-${n}`;
    usedIds.add(id);

    chunks.push({
      id,
      language,
      file,
      url,
      pageTitle,
      pageDescription,
      sectionTitle,
      headingLevel: s.level || 1,
      tokens: estimateTokens(content),
      content,
    });
  }
  return chunks;
}
/**
 * Read one markdown file and chunk it, deriving language, file id and page URL
 * from its path relative to `rootDir`.
 *
 * The first path segment is taken as the language. The remaining segments,
 * minus the `.md`/`.mdx` extension and a trailing `index`, form the slug:
 *   <lang>/<rest>.md(x)        → /<lang>/<rest>/
 *   <lang>/<dir>/index.md(x)   → /<lang>/<dir>/
 *   <lang>/index.md(x)         → /<lang>/
 *
 * @param {string} filePath path of the markdown file to read
 * @param {string} rootDir docs root that contains the per-language directories
 * @returns {Array<object>} chunks as returned by chunkMarkdown()
 */
export function chunkFile(filePath, rootDir) {
  const raw = fs.readFileSync(filePath, 'utf8');
  // Normalise Windows separators so the relative path doubles as a stable id.
  const rel = path.relative(rootDir, filePath).replace(/\\/g, '/');
  const [lang, ...rest] = rel.split('/');
  const slug = rest
    .join('/')
    .replace(/\.(md|mdx)$/, '')
    // `(^|\/)` also strips a top-level `index`, which has no leading slash.
    .replace(/(^|\/)index$/, '');
  const baseUrl = slug ? `/${lang}/${slug}/` : `/${lang}/`;
  return chunkMarkdown(raw, { language: lang, file: rel, baseUrl });
}
/**
 * Recursively list every `.md` / `.mdx` file below `docsDir`.
 *
 * @param {string} docsDir directory to scan
 * @returns {string[]} file paths (each prefixed with `docsDir`), sorted with
 *   the default string ordering
 */
export function walkDocs(docsDir) {
  const isMarkdown = (name) => /\.(md|mdx)$/.test(name);

  // Depth-first collection; the final sort makes traversal order irrelevant.
  const collect = (dir) =>
    fs.readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) return collect(full);
      return isMarkdown(entry.name) ? [full] : [];
    });

  return collect(docsDir).sort();
}