// nibiru-framework.com/docs/scripts/lib/chunk.mjs

// Markdown → chunks at H2/H3 boundaries.
// Used by both build-oracle-index.mjs (RAG) and build-corpus.mjs (LoRA training).

import fs from 'node:fs';
import path from 'node:path';

// Chunk sizing budget, measured in estimated tokens (see estimateTokens).
// TARGET_TOKENS: soft size at which splitOversized() closes the part it is building.
const TARGET_TOKENS = 600;
// MIN_TOKENS: sections estimated below this are folded into the preceding one by mergeSmall().
const MIN_TOKENS = 120;
// MAX_TOKENS: sections estimated above this are broken up by splitOversized().
const MAX_TOKENS = 900;

/**
 * Cheap, tokenizer-free estimate of how many tokens `text` occupies.
 *
 * Heuristic: ~4 UTF-16 code units per token for English / European text and
 * ~1.5 per token for kana and CJK ideographs. The result is rounded up.
 *
 * @param {string} text  text to measure
 * @returns {number} estimated token count (0 for an empty string)
 */
export function estimateTokens(text) {
	// Ranges counted as "CJK":
	//   U+3040–U+30FF  Hiragana + Katakana
	//   U+3400–U+4DBF  CJK Unified Ideographs Extension A
	//   U+4E00–U+9FFF  CJK Unified Ideographs
	//   U+F900–U+FAFF  CJK Compatibility Ideographs
	// Spelled as \u escapes on purpose: U+F900 canonically decomposes to U+8C48,
	// so a raw literal endpoint would silently change the range if this file were
	// ever Unicode-normalized by an editor or tool.
	// NOTE(review): Hangul is not in these ranges and is counted at the 4-chars rate —
	// confirm whether Korean docs need the CJK rate.
	const cjkPattern = /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff]/g;
	const cjkCount = (text.match(cjkPattern) || []).length;
	const otherCount = text.length - cjkCount;
	return Math.ceil(cjkCount / 1.5 + otherCount / 4);
}

/**
 * Separate a leading `---` frontmatter block from the markdown body.
 *
 * Only flat `key: value` lines are understood (no nesting, lists or multi-line
 * values); lines that do not look like `key: value` are ignored.
 * Both LF and CRLF line endings are accepted.
 *
 * @param {string} md  raw markdown, frontmatter optional
 * @returns {{ frontmatter: Object<string, string>, body: string }}
 *   parsed keys (empty object when there is no frontmatter) and the remaining body;
 *   when no opening/closing delimiter is found the input is returned untouched as body
 */
function stripFrontmatter(md) {
	if (!md.startsWith('---')) return { frontmatter: {}, body: md };
	const end = md.indexOf('\n---', 3);
	if (end === -1) return { frontmatter: {}, body: md };

	const fm = md.slice(3, end).trim();
	// Drop the single line break that follows the closing delimiter (LF or CRLF).
	const body = md.slice(end + 4).replace(/^\r?\n/, '');

	const frontmatter = {};
	// Split on \r?\n: a stray trailing \r would otherwise make `(.*)$` fail,
	// because `.` never matches a line terminator, and the key would be lost.
	for (const line of fm.split(/\r?\n/)) {
		const m = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
		if (!m) continue;
		const value = m[2].trim();
		// Unquote only when the value is wrapped in one matching pair of quotes,
		// so values such as `Use the "strict"` keep their own punctuation.
		const quoted = value.match(/^(["'])(.*)\1$/);
		frontmatter[m[1]] = quoted ? quoted[2] : value;
	}
	return { frontmatter, body };
}

/**
 * Turn heading text into a URL-fragment slug.
 *
 * Steps: lowercase, decompose (NFKD) and drop combining diacritics so that
 * accented Latin letters collapse to their base letter, remove everything
 * that is not `a-z`, `0-9`, whitespace or `-`, trim, then replace each
 * whitespace run with a single `-`.
 *
 * Text made only of characters outside `a-z0-9` (e.g. CJK headings) yields `''`.
 *
 * @param {*} s  value to slugify; coerced with String()
 * @returns {string} the slug, possibly empty
 */
function slugify(s) {
	// U+0300–U+036F is the Combining Diacritical Marks block. It is written with
	// \u escapes because the raw characters are invisible in most editors and
	// visually attach to the neighbouring `[` / `-`.
	const combiningMarks = /[\u0300-\u036f]/g;
	return String(s)
		.toLowerCase()
		.normalize('NFKD')
		.replace(combiningMarks, '')
		.replace(/[^a-z0-9\s-]/g, '')
		.trim()
		.replace(/\s+/g, '-');
}

/**
 * Split a markdown body into sections at H2/H3 (`##` / `###`) headings.
 *
 * Heading-like lines inside fenced code blocks are ignored. A fence is opened by
 * a line starting (after at most 3 spaces) with 3+ backticks or 3+ tildes, and is
 * closed only by a line using the same character, at least as many of them, and
 * nothing but whitespace after it. This keeps nested fences (a 4-backtick block
 * containing a 3-backtick block, or tildes wrapping backticks) intact instead of
 * flipping the fence state on every marker line.
 *
 * @param {string} body  markdown without frontmatter
 * @returns {Array<{heading: (string|null), level: number, anchor: (string|null), lines: string[]}>}
 *   sections in document order; text before the first heading is a section with
 *   `heading: null`, `level: 0`, `anchor: null`. A heading section's `lines`
 *   starts with the heading line itself.
 */
function splitByHeadings(body) {
	const sections = [];
	let current = { heading: null, level: 0, anchor: null, lines: [] };
	// Marker string (e.g. '```' or '~~~~') of the fence we are inside, or null.
	let openFence = null;

	for (const line of body.split('\n')) {
		const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})/);
		if (fenceMatch) {
			const marker = fenceMatch[1];
			const rest = line.slice(fenceMatch[0].length);
			if (openFence === null) {
				// A backtick fence's info string may not contain backticks; a line such
				// as ```code``` is inline code, not a fence opener.
				if (marker[0] === '~' || !rest.includes('`')) openFence = marker;
			} else if (
				marker[0] === openFence[0] &&
				marker.length >= openFence.length &&
				rest.trim() === ''
			) {
				openFence = null;
			}
			// A fence-marker line can never be a heading, so store it and move on.
			current.lines.push(line);
			continue;
		}

		if (openFence === null) {
			const h = line.match(/^(#{2,3})\s+(.+?)\s*$/);
			if (h) {
				if (current.lines.length || current.heading) sections.push(current);
				const heading = h[2].trim();
				current = {
					heading,
					level: h[1].length,
					anchor: slugify(heading),
					lines: [line],
				};
				continue;
			}
		}
		current.lines.push(line);
	}
	if (current.lines.length || current.heading) sections.push(current);
	return sections;
}

/**
 * Break a section whose estimated size exceeds MAX_TOKENS into several parts
 * of roughly TARGET_TOKENS each; smaller sections are returned unchanged.
 *
 * The section is cut only at blank lines that lie outside fenced code blocks, so a
 * fenced block (including any blank lines inside it) always stays in one part.
 * A single paragraph or fenced block that is itself larger than the budget is not
 * subdivided further. Every part inherits the section's heading/level/anchor; only
 * `lines` differs.
 *
 * @param {{heading: (string|null), level: number, anchor: (string|null), lines: string[]}} section
 * @returns {Array<object>} one or more sections, in order
 */
function splitOversized(section) {
	const text = section.lines.join('\n');
	if (estimateTokens(text) <= MAX_TOKENS) return [section];

	// Pass 1: group lines into blank-line-separated blocks, treating fences as atomic.
	const blocks = [];
	let pending = [];
	let openFence = null; // marker of the fence we are inside, or null
	const flushBlock = () => {
		if (pending.length) {
			blocks.push(pending.join('\n'));
			pending = [];
		}
	};
	for (const line of section.lines) {
		const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})/);
		if (fenceMatch) {
			const marker = fenceMatch[1];
			const rest = line.slice(fenceMatch[0].length);
			if (openFence === null) {
				// Backtick fences may not carry backticks in their info string.
				if (marker[0] === '~' || !rest.includes('`')) openFence = marker;
			} else if (
				marker[0] === openFence[0] &&
				marker.length >= openFence.length &&
				rest.trim() === ''
			) {
				openFence = null;
			}
		} else if (openFence === null && line.trim() === '') {
			// Blank line outside code: paragraph boundary.
			flushBlock();
			continue;
		}
		pending.push(line);
	}
	flushBlock();

	// Pass 2: greedily pack blocks into parts, closing a part once adding the next
	// block would push it past TARGET_TOKENS.
	const parts = [];
	let buf = [];
	let bufTokens = 0;
	const flushPart = () => {
		parts.push({ ...section, lines: buf.join('\n\n').split('\n') });
		buf = [];
		bufTokens = 0;
	};
	for (const block of blocks) {
		const t = estimateTokens(block);
		if (buf.length && bufTokens + t > TARGET_TOKENS) flushPart();
		buf.push(block);
		bufTokens += t;
	}
	if (buf.length) flushPart();

	// Whitespace-only input produces no blocks; hand the section back unchanged.
	return parts.length ? parts : [section];
}

/**
 * Fold undersized sections into the section that precedes them.
 *
 * A section whose estimated size is below MIN_TOKENS is appended (after one
 * empty separator line) to the previously emitted section. The very first
 * section is always emitted as-is, whatever its size. Input sections are not
 * mutated: every emitted section is a shallow copy with its own `lines` array.
 *
 * @param {Array<object>} sections  sections in document order
 * @returns {Array<object>} merged sections, same order
 */
function mergeSmall(sections) {
	return sections.reduce((merged, section) => {
		const isTiny = estimateTokens(section.lines.join('\n')) < MIN_TOKENS;
		const last = merged.length > 0 ? merged[merged.length - 1] : undefined;

		if (last !== undefined && isTiny) {
			last.lines = last.lines.concat('', section.lines);
		} else {
			merged.push({ ...section, lines: section.lines.slice() });
		}
		return merged;
	}, []);
}

/**
 * Lower-level: chunk a raw markdown string with caller-supplied metadata.
 * Used both by chunkFile() (which derives meta from a path) and by
 * external sources like the framework-reference-v2 doc.
 *
 * Pipeline: strip frontmatter → split at H2/H3 → split oversized sections →
 * merge undersized ones → drop empty chunks → attach metadata.
 *
 * Explicit `meta` values win over frontmatter (`lang`, `title`, `description`).
 *
 * Chunk ids are unique within one call: the first chunk of a heading gets
 * `<file>#<anchor>`; any further chunk that would repeat an id (parts of a split
 * section, duplicate headings) gets `<file>#<anchor>_<idx>`; chunks without a
 * usable anchor get `<file>#_<idx>`.
 *
 * @param {string} raw  raw markdown (frontmatter optional)
 * @param {object} [meta]
 * @param {string} [meta.language]         defaults to frontmatter `lang`, then 'en'
 * @param {string} [meta.file]             defaults to meta.pageTitle, then 'untitled.md'
 * @param {string} [meta.baseUrl]          defaults to '/'
 * @param {string} [meta.pageTitle]        defaults to frontmatter `title`, then file
 * @param {string} [meta.pageDescription]  defaults to frontmatter `description`, then ''
 * @returns {Array<{id: string, language: string, file: string, url: string,
 *   pageTitle: string, pageDescription: string, sectionTitle: string,
 *   headingLevel: number, tokens: number, content: string}>}
 */
export function chunkMarkdown(raw, meta = {}) {
	const { frontmatter, body } = stripFrontmatter(raw);

	const language = meta.language || frontmatter.lang || 'en';
	const file = meta.file || meta.pageTitle || 'untitled.md';
	const baseUrl = meta.baseUrl || '/';
	const pageTitle = meta.pageTitle || frontmatter.title || file;
	const pageDescription = meta.pageDescription || frontmatter.description || '';

	let sections = splitByHeadings(body);
	sections = sections.flatMap(splitOversized);
	sections = mergeSmall(sections);

	const usedIds = new Set();

	return sections
		.filter((s) => s.lines.join('\n').trim().length > 0)
		.map((s, idx) => {
			const content = s.lines.join('\n').trim();
			const sectionTitle = s.heading || pageTitle;
			const url = s.anchor && s.heading ? `${baseUrl}#${s.anchor}` : baseUrl;

			// An empty-string anchor (heading with no a-z0-9 characters) is as unusable
			// as a missing one, so test truthiness rather than nullishness.
			let id = s.anchor ? `${file}#${s.anchor}` : `${file}#_${idx}`;
			// Disambiguate repeats with the chunk index. Anchors built by slugify()
			// never contain '_', so the suffixed form cannot clash with a plain anchor id.
			if (usedIds.has(id)) id = `${file}#${s.anchor}_${idx}`;
			usedIds.add(id);

			return {
				id,
				language,
				file,
				url,
				pageTitle,
				pageDescription,
				sectionTitle,
				headingLevel: s.level || 1,
				tokens: estimateTokens(content),
				content,
			};
		});
}

/**
 * Read one markdown file and chunk it, deriving language, file id and base URL
 * from its location under `rootDir`.
 *
 * Path → URL mapping, with `<lang>` being the first path segment below rootDir:
 *   <lang>/<rest>.md(x)        →  /<lang>/<rest>/
 *   <lang>/<dir>/index.md(x)   →  /<lang>/<dir>/
 *   <lang>/index.md(x)         →  /<lang>/
 *
 * @param {string} filePath  path of the .md/.mdx file to read
 * @param {string} rootDir   docs root that contains the per-language directories
 * @returns {Array<object>} chunks as produced by chunkMarkdown()
 * @throws if the file cannot be read (error from fs.readFileSync)
 */
export function chunkFile(filePath, rootDir) {
	const raw = fs.readFileSync(filePath, 'utf8');

	// Normalise Windows separators so ids and URLs always use '/'.
	const rel = path.relative(rootDir, filePath).replace(/\\/g, '/');
	const [lang, ...rest] = rel.split('/');
	const slug = rest
		.join('/')
		.replace(/\.(md|mdx)$/, '')
		// Strip a trailing "index" segment. `(^|\/)` also covers a top-level index file,
		// whose slug is just "index" with no leading slash.
		.replace(/(^|\/)index$/, '');
	const baseUrl = slug ? `/${lang}/${slug}/` : `/${lang}/`;

	return chunkMarkdown(raw, { language: lang, file: rel, baseUrl });
}

/**
 * Recursively collect every `.md` / `.mdx` file below `docsDir`.
 *
 * @param {string} docsDir  directory to scan
 * @returns {string[]} file paths (each built by joining onto `docsDir`), sorted
 *   with the default Array.prototype.sort() ordering
 * @throws if a directory cannot be read (error from fs.readdirSync)
 */
export function walkDocs(docsDir) {
	const found = [];

	const visit = (dir) => {
		fs.readdirSync(dir, { withFileTypes: true }).forEach((entry) => {
			const full = path.join(dir, entry.name);
			if (entry.isDirectory()) {
				visit(full);
				return;
			}
			if (/\.(md|mdx)$/.test(entry.name)) found.push(full);
		});
	};

	visit(docsDir);
	found.sort();
	return found;
}