// Markdown → chunks at H2/H3 boundaries.
// Used by both build-oracle-index.mjs (RAG) and build-corpus.mjs (LoRA training).
import fs from 'node:fs';
import path from 'node:path';

// Chunk sizing, expressed in estimated tokens (see estimateTokens).
const TARGET_TOKENS = 600; // packing target when an oversized section is split
const MIN_TOKENS = 120; // sections smaller than this are merged into their predecessor
const MAX_TOKENS = 900; // sections larger than this are split; merging never exceeds it

// Kana, CJK ideographs (Extension A, unified, compatibility) and Hangul syllables.
const CJK_RE = /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uac00-\ud7af]/g;

// Frontmatter block: an opening `---` line, optional content, a closing `---` line.
// Tolerates CRLF line endings and trailing blanks on the delimiter lines.
const FRONTMATTER_RE = /^---[ \t]*\r?\n(?:([\s\S]*?)\r?\n)?---[ \t]*(?:\r?\n|$)/;

// Fence marker: up to three spaces of indent, then a run of 3+ backticks or tildes.
const FENCE_RE = /^ {0,3}(`{3,}|~{3,})/;

/**
 * Cheap token estimate: ~4 chars per token for English / European languages,
 * closer to 1.5 for CJK (Japanese kana, Han ideographs, Korean Hangul).
 *
 * @param {string} text text to measure; null/undefined count as empty
 * @returns {number} estimated token count, rounded up
 */
export function estimateTokens(text) {
  const str = String(text ?? '');
  const cjk = (str.match(CJK_RE) || []).length;
  const other = str.length - cjk;
  return Math.ceil(cjk / 1.5 + other / 4);
}

/**
 * Split a leading `---` frontmatter block off a markdown document.
 * Only flat `key: value` lines are parsed; one pair of surrounding quotes is
 * removed from each value. Anything else inside the block is ignored.
 *
 * @param {string} md raw markdown
 * @returns {{ frontmatter: object, body: string }} parsed keys and the text
 *   after the closing delimiter (the whole input when no block is found)
 */
function stripFrontmatter(md) {
  const match = FRONTMATTER_RE.exec(md);
  if (!match) return { frontmatter: {}, body: md };

  const frontmatter = {};
  for (const line of (match[1] ?? '').split(/\r?\n/)) {
    const kv = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
    // Trim before unquoting so `title: "x"  ` does not keep its closing quote.
    if (kv) frontmatter[kv[1]] = kv[2].trim().replace(/^["']|["']$/g, '');
  }
  return { frontmatter, body: md.slice(match[0].length) };
}

/**
 * Turn heading text into a URL anchor: lower-case, accents folded, everything
 * except [a-z0-9], whitespace and `-` removed, whitespace runs → `-`.
 * Headings without any ASCII letters or digits (e.g. CJK) yield ''.
 *
 * @param {*} s heading text (coerced to string)
 * @returns {string} anchor slug, possibly empty
 */
function slugify(s) {
  const folded = String(s)
    .toLowerCase()
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, ''); // drop combining marks left by NFKD
  return folded
    .replace(/[^a-z0-9\s-]/g, '')
    .trim()
    .replace(/\s+/g, '-');
}

/**
 * Advance the code-fence state by one line.
 *
 * @param {string|null} open marker that opened the current fence, or null
 * @param {string} line the line being scanned
 * @returns {string|null} the open marker after this line, or null when outside a fence
 */
function nextFenceState(open, line) {
  const m = FENCE_RE.exec(line);
  if (!m) return open;
  const marker = m[1];
  const info = line.slice(m[0].length).trim();
  if (open === null) {
    // A backtick "fence" whose info string has backticks is inline code, not a fence.
    if (marker[0] === '`' && info.includes('`')) return null;
    return marker;
  }
  // Only a bare marker of the same character, at least as long, closes the fence.
  const closes = marker[0] === open[0] && marker.length >= open.length && info === '';
  return closes ? null : open;
}

/** True when a section has a heading or at least one non-blank line. */
function hasContent(section) {
  return Boolean(section.heading) || section.lines.some((l) => l.trim() !== '');
}

/**
 * Split body at H2/H3 boundaries; keep code fences intact.
 * Text before the first heading becomes a section with `heading: null`,
 * unless it is only whitespace, in which case it is dropped.
 *
 * @param {string} body markdown without frontmatter
 * @returns {Array<{heading: string|null, level: number, anchor: string|null, lines: string[]}>}
 */
function splitByHeadings(body) {
  const sections = [];
  let openFence = null;
  let current = { heading: null, level: 0, anchor: null, lines: [] };

  for (const line of body.split(/\r?\n/)) {
    openFence = nextFenceState(openFence, line);
    if (openFence === null) {
      const h = line.match(/^(#{2,3})\s+(.+?)\s*$/);
      if (h) {
        if (hasContent(current)) sections.push(current);
        const heading = h[2].trim();
        current = { heading, level: h[1].length, anchor: slugify(heading), lines: [line] };
        continue;
      }
    }
    current.lines.push(line);
  }
  if (hasContent(current)) sections.push(current);
  return sections;
}

/**
 * Group lines into blocks separated by blank lines. Blank lines inside a code
 * fence do not separate, so a fenced block always stays in one piece.
 *
 * @param {string[]} lines
 * @returns {string[][]} non-empty blocks, in order
 */
function splitIntoBlocks(lines) {
  const blocks = [];
  let block = [];
  let openFence = null;
  for (const line of lines) {
    openFence = nextFenceState(openFence, line);
    if (openFence === null && line.trim() === '') {
      if (block.length) blocks.push(block);
      block = [];
    } else {
      block.push(line);
    }
  }
  if (block.length) blocks.push(block);
  return blocks;
}

/**
 * Further split a too-large section by paragraph boundaries, preserving fences.
 * Paragraphs are packed greedily up to TARGET_TOKENS. A single paragraph or
 * fenced block is never cut, so one part may still exceed MAX_TOKENS.
 * Every part keeps the heading, level and anchor of the original section.
 *
 * @param {object} section a section from splitByHeadings
 * @returns {object[]} the section itself when small enough, otherwise its parts
 */
function splitOversized(section) {
  if (estimateTokens(section.lines.join('\n')) <= MAX_TOKENS) return [section];

  const parts = [];
  let buf = [];
  let bufTokens = 0;
  const flush = () => {
    if (!buf.length) return;
    // Re-insert one blank line between packed blocks.
    parts.push({ ...section, lines: buf.flatMap((b, i) => (i ? ['', ...b] : b)) });
    buf = [];
    bufTokens = 0;
  };

  for (const block of splitIntoBlocks(section.lines)) {
    const t = estimateTokens(block.join('\n'));
    if (bufTokens + t > TARGET_TOKENS) flush();
    buf.push(block);
    bufTokens += t;
  }
  flush();
  return parts.length ? parts : [section];
}

/**
 * Merge tiny adjacent sections so chunks don't drop below MIN_TOKENS.
 * A small section is appended to the chunk before it (which keeps its own
 * heading/anchor) unless that would push the chunk past MAX_TOKENS. The first
 * section is never merged. Input sections are not mutated.
 *
 * @param {object[]} sections
 * @returns {object[]} merged sections
 */
function mergeSmall(sections) {
  const out = [];
  for (const s of sections) {
    const prev = out[out.length - 1];
    if (prev && estimateTokens(s.lines.join('\n')) < MIN_TOKENS) {
      const merged = [...prev.lines, '', ...s.lines];
      // Cap the merge so a run of small sections cannot grow without bound.
      if (estimateTokens(merged.join('\n')) <= MAX_TOKENS) {
        prev.lines = merged;
        continue;
      }
    }
    out.push({ ...s, lines: [...s.lines] });
  }
  return out;
}

/**
 * Lower-level: chunk a raw markdown string with caller-supplied metadata.
 * Used both by chunkFile() (which derives meta from a path) and by
 * external sources like the framework-reference-v2 doc.
 *
 * @param {string} raw   raw markdown (frontmatter optional)
 * @param {object} meta  { language, file, baseUrl, pageTitle, pageDescription }
 *   — language defaults to frontmatter `lang`, then 'en'; baseUrl defaults to
 *   '/'; file to pageTitle, then 'untitled.md'; pageTitle to frontmatter
 *   `title`, then file; pageDescription to frontmatter `description`, then ''.
 * @returns {Array<object>} chunks: { id, language, file, url, pageTitle,
 *   pageDescription, sectionTitle, headingLevel, tokens, content }.
 *   `id` is unique within the returned array.
 */
export function chunkMarkdown(raw, meta = {}) {
  const opts = meta ?? {};
  // Drop a UTF-8 BOM so frontmatter and a first-line heading are still recognised.
  const text = String(raw ?? '').replace(/^\uFEFF/, '');
  const { frontmatter, body } = stripFrontmatter(text);

  const language = opts.language || frontmatter.lang || 'en';
  const file = opts.file || opts.pageTitle || 'untitled.md';
  const baseUrl = opts.baseUrl || '/';
  const pageTitle = opts.pageTitle || frontmatter.title || file;
  const pageDescription = opts.pageDescription || frontmatter.description || '';

  const sections = mergeSmall(splitByHeadings(body).flatMap(splitOversized));
  const usedIds = new Set();

  return sections
    .filter((s) => s.lines.join('\n').trim().length > 0)
    .map((s, idx) => {
      const content = s.lines.join('\n').trim();
      const url = s.anchor && s.heading ? `${baseUrl}#${s.anchor}` : baseUrl;

      // Empty anchors (no heading, or a heading that slugifies to '') fall back
      // to the chunk index; repeated anchors get a numeric suffix.
      const baseId = `${file}#${s.anchor || `_${idx}`}`;
      let id = baseId;
      for (let n = 2; usedIds.has(id); n += 1) id = `${baseId}-${n}`;
      usedIds.add(id);

      return {
        id,
        language,
        file,
        url,
        pageTitle,
        pageDescription,
        sectionTitle: s.heading || pageTitle,
        headingLevel: s.level || 1,
        tokens: estimateTokens(content),
        content,
      };
    });
}

/**
 * Read one markdown file and chunk it, deriving metadata from its location.
 * URL mapping: docs/<lang>/<slug>.md(x) → /<lang>/<slug>/, where <lang> is the
 * first path segment below rootDir and a trailing `index` is dropped
 * (docs/<lang>/index.md → /<lang>/).
 *
 * @param {string} filePath path of the .md/.mdx file
 * @param {string} rootDir  docs root that filePath is made relative to
 * @returns {Array<object>} chunks as returned by chunkMarkdown
 */
export function chunkFile(filePath, rootDir) {
  const raw = fs.readFileSync(filePath, 'utf8');
  const rel = path.relative(rootDir, filePath).replace(/\\/g, '/');
  const [lang, ...rest] = rel.split('/');
  const slug = rest
    .join('/')
    .replace(/\.(md|mdx)$/, '')
    .replace(/(^|\/)index$/, ''); // also matches a bare top-level `index`
  const baseUrl = slug ? `/${lang}/${slug}/` : `/${lang}/`;
  return chunkMarkdown(raw, { language: lang, file: rel, baseUrl });
}

/**
 * Recursively collect every .md / .mdx file below a directory.
 *
 * @param {string} docsDir directory to walk
 * @returns {string[]} file paths (docsDir joined with the relative path), sorted
 */
export function walkDocs(docsDir) {
  const found = [];
  const pending = [docsDir];
  while (pending.length) {
    const dir = pending.pop();
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) pending.push(full);
      else if (/\.(md|mdx)$/.test(entry.name)) found.push(full);
    }
  }
  return found.sort();
}