Initial public push: docs cosmos v4 + AI module + framework groundwork

This is the snapshot the production landing site (nibiru-framework.com) is deployed from. Brings together the recent splash + docs migration to the v4 "Cosmos" design system, the new in-framework AI module, and the framework groundwork that backs the framework-reference extraction. What lands: - docs/: Astro + Starlight site with the v4 dark cosmic palette, GalaxyHero canvas constellation, Mission Control chat (wired to /api/oracle → api.neuronetz.ai via providers.mjs Ollama), 5-panel MMVC stage (Model · AI · Module · Controller · View), translated EN/DE/JA/ES/FR content, PWA + sitemap + llms.txt + Umami analytics. - docs/design-system/: canonical mockup bundle (source/index-v2.html for splash, source/docs-system.html + preview/ for docs, SPEC.md, tokens). - docs/scripts/extraction/framework-reference-v2.md: deep framework reference (~1.6k lines, file:line citations, every public factory and idiom) — basis for the LoRA training corpus. - application/module/ai/: AI module with chat / embed / RAG / agent plugins, plus pdoQuery / httpGet / fileRead tools and Modelfile + smoke-test in training/. - application/module/users/: user / ACL / form-factory traits used as the reference plugin pattern for the framework docs. - application/settings/config/database/: schema + seed migrations including the AI module tables (200–203). - Form factory + autogenerator changes that the framework-reference-v2 covers. Production secrets stay out: docs/.env, settings.production.ini and ai.production.ini are all gitignored (.example files are in tree). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 15:22:18 +02:00
parent a60ce90643
commit 48c839d927
662 changed files with 172811 additions and 1 deletions
--- a/docs/scripts/lib/chunk.mjs
+++ b/docs/scripts/lib/chunk.mjs
@@ -0,0 +1,163 @@
+// Markdown → chunks at H2/H3 boundaries.
+// Used by both build-oracle-index.mjs (RAG) and build-corpus.mjs (LoRA training).
+
+import fs from 'node:fs';
+import path from 'node:path';
+
+const TARGET_TOKENS = 600; // soft size at which splitOversized starts a new part
+const MIN_TOKENS = 120; // sections estimated below this are merged into the previous one
+const MAX_TOKENS = 900; // sections estimated above this are split by paragraph
+
+// Cheap token estimate: ~4 chars per token for English / European languages,
+// closer to 1.5 for CJK. We use a conservative average to avoid undersizing chunks.
+export function estimateTokens(text) {
+	const cjk = (text.match(/[぀-ヿ㐀-䶿一-鿿豈-﫿]/g) || []).length;
+	const other = text.length - cjk;
+	return Math.ceil(cjk / 1.5 + other / 4);
+}
+
+function stripFrontmatter(md) {
+	if (!md.startsWith('---')) return { frontmatter: {}, body: md };
+	const end = md.indexOf('\n---', 3);
+	if (end === -1) return { frontmatter: {}, body: md };
+	const fm = md.slice(3, end).trim();
+	const body = md.slice(end + 4).replace(/^\n/, '');
+	const frontmatter = {};
+	for (const line of fm.split('\n')) {
+		const m = line.match(/^([A-Za-z0-9_-]+):\s*(.*)$/);
+		if (m) frontmatter[m[1]] = m[2].replace(/^["']|["']$/g, '');
+	}
+	return { frontmatter, body };
+}
+
+function slugify(s) {
+	return String(s)
+		.toLowerCase()
+		.normalize('NFKD')
+		.replace(/[̀-ͯ]/g, '')
+		.replace(/[^a-z0-9\s-]/g, '')
+		.trim()
+		.replace(/\s+/g, '-');
+}
+
+// Split body at H2/H3 boundaries; keep code fences intact.
+function splitByHeadings(body) {
+	const sections = [];
+	const lines = body.split('\n');
+	let inFence = false;
+	let current = { heading: null, level: 0, anchor: null, lines: [] };
+
+	for (const line of lines) {
+		const fence = line.match(/^(```|~~~)/);
+		if (fence) inFence = !inFence;
+
+		if (!inFence) {
+			const h = line.match(/^(#{2,3})\s+(.+?)\s*$/);
+			if (h) {
+				if (current.lines.length || current.heading) sections.push(current);
+				current = {
+					heading: h[2].trim(),
+					level: h[1].length,
+					anchor: slugify(h[2].trim()),
+					lines: [line],
+				};
+				continue;
+			}
+		}
+		current.lines.push(line);
+	}
+	if (current.lines.length || current.heading) sections.push(current);
+	return sections;
+}
+
+// Further split a too-large section by paragraph boundaries, preserving fences.
+function splitOversized(section) {
+	const text = section.lines.join('\n');
+	if (estimateTokens(text) <= MAX_TOKENS) return [section];
+
+	const parts = [];
+	const paras = text.split(/\n\n+/);
+	let buf = [];
+	let bufTokens = 0;
+	for (const p of paras) {
+		const t = estimateTokens(p);
+		if (bufTokens + t > TARGET_TOKENS && buf.length) {
+			parts.push({ ...section, lines: buf.join('\n\n').split('\n') });
+			buf = [];
+			bufTokens = 0;
+		}
+		buf.push(p);
+		bufTokens += t;
+	}
+	if (buf.length) parts.push({ ...section, lines: buf.join('\n\n').split('\n') });
+	return parts;
+}
+
+// Merge tiny adjacent sections so chunks don't drop below MIN_TOKENS.
+function mergeSmall(sections) {
+	const out = [];
+	for (const s of sections) {
+		const text = s.lines.join('\n');
+		const tokens = estimateTokens(text);
+		if (out.length && tokens < MIN_TOKENS) {
+			const prev = out[out.length - 1];
+			prev.lines = [...prev.lines, '', ...s.lines];
+		} else {
+			out.push({ ...s, lines: [...s.lines] });
+		}
+	}
+	return out;
+}
+
+export function chunkFile(filePath, rootDir) {
+	const raw = fs.readFileSync(filePath, 'utf8');
+	const { frontmatter, body } = stripFrontmatter(raw);
+
+	// URL: docs/<lang>/<rest>.md(x)  →  /<lang>/<rest>/
+	const rel = path.relative(rootDir, filePath).replace(/\\/g, '/');
+	const parts = rel.split('/');
+	const lang = parts[0];
+	const slug = parts.slice(1).join('/').replace(/\.(md|mdx)$/, '').replace(/\/index$/, '');
+	const baseUrl = '/' + (slug ? `${lang}/${slug}/` : `${lang}/`);
+
+	let sections = splitByHeadings(body);
+	sections = sections.flatMap(splitOversized);
+	sections = mergeSmall(sections);
+
+	const pageTitle = frontmatter.title || slug || 'Untitled';
+	const pageDescription = frontmatter.description || '';
+
+	return sections
+		.filter((s) => s.lines.join('\n').trim().length > 0)
+		.map((s, idx) => {
+			const content = s.lines.join('\n').trim();
+			const sectionTitle = s.heading || pageTitle;
+			const url = s.anchor && s.heading ? `${baseUrl}#${s.anchor}` : baseUrl;
+			return {
+				id: `${rel}#${s.anchor ?? `_${idx}`}`,
+				language: lang,
+				file: rel,
+				url,
+				pageTitle,
+				pageDescription,
+				sectionTitle,
+				headingLevel: s.level || 1,
+				tokens: estimateTokens(content),
+				content,
+			};
+		});
+}
+
+export function walkDocs(docsDir) {
+	const out = [];
+	const stack = [docsDir];
+	while (stack.length) {
+		const d = stack.pop();
+		for (const entry of fs.readdirSync(d, { withFileTypes: true })) {
+			const p = path.join(d, entry.name);
+			if (entry.isDirectory()) stack.push(p);
+			else if (/\.(md|mdx)$/.test(entry.name)) out.push(p);
+		}
+	}
+	return out.sort();
+}