Initial public push: docs cosmos v4 + AI module + framework groundwork
This is the snapshot the production landing site (nibiru-framework.com) is deployed from. Brings together the recent splash + docs migration to the v4 "Cosmos" design system, the new in-framework AI module, and the framework groundwork that backs the framework-reference extraction. What lands: - docs/: Astro + Starlight site with the v4 dark cosmic palette, GalaxyHero canvas constellation, Mission Control chat (wired to /api/oracle → api.neuronetz.ai via providers.mjs Ollama), 5-panel MMVC stage (Model · AI · Module · Controller · View), translated EN/DE/JA/ES/FR content, PWA + sitemap + llms.txt + Umami analytics. - docs/design-system/: canonical mockup bundle (source/index-v2.html for splash, source/docs-system.html + preview/ for docs, SPEC.md, tokens). - docs/scripts/extraction/framework-reference-v2.md: deep framework reference (~1.6k lines, file:line citations, every public factory and idiom) — basis for the LoRA training corpus. - application/module/ai/: AI module with chat / embed / RAG / agent plugins, plus pdoQuery / httpGet / fileRead tools and Modelfile + smoke-test in training/. - application/module/users/: user / ACL / form-factory traits used as the reference plugin pattern for the framework docs. - application/settings/config/database/: schema + seed migrations including the AI module tables (200–203). - Form factory + autogenerator changes that framework-reference-v2 covers. Production secrets stay out: docs/.env, settings.production.ini and ai.production.ini are all gitignored (.example files are in tree). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
128
docs/scripts/build-corpus.mjs
Normal file
128
docs/scripts/build-corpus.mjs
Normal file
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Export the docs as a LoRA-training-ready corpus.
|
||||
*
|
||||
* node scripts/build-corpus.mjs
|
||||
*
|
||||
* Outputs four JSONL files under dist/corpus/ (plus a stats.json summary):
|
||||
* - chunks.jsonl — raw chunks (one section per line)
|
||||
* - instructions.jsonl — instruction/input/output triples
|
||||
* - chat.jsonl — sharegpt/chat-format messages
|
||||
* - completion.jsonl — prompt/completion pairs (legacy fine-tunes)
|
||||
*
|
||||
* The instruction text for each chunk is derived from the section heading
|
||||
* with a per-language template ("How do I X?", "Wie X?", "X について教えてください。").
|
||||
*/
|
||||
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { chunkFile, walkDocs } from './lib/chunk.mjs';
|
||||
|
||||
// ES modules have no built-in __dirname, so derive this script's directory from its URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Root of the documentation tree that gets walked and chunked.
const DOCS_DIR = path.resolve(__dirname, '..', 'src', 'content', 'docs');

// Directory that receives the generated corpus files.
const OUT_DIR = path.resolve(__dirname, '..', 'dist', 'corpus');
|
||||
|
||||
/**
 * System prompt per language code.
 *
 * Used as the `system` message of each chat record and as the preamble of
 * each completion prompt. Consumers fall back to the `en` entry when a
 * chunk's language has no entry here.
 *
 * Frozen so that a stray assignment cannot silently alter every record
 * generated afterwards.
 */
const SYSTEM_PROMPT = Object.freeze({
  en: 'You are an expert on the Nibiru PHP framework. Answer based on the documentation, with concrete code examples and file paths where helpful.',
  de: 'Du bist Experte für das Nibiru-PHP-Framework. Antworte auf Basis der Dokumentation, mit konkreten Code-Beispielen und Dateipfaden, wo es hilft.',
  ja: 'あなたは Nibiru PHP フレームワークの専門家です。ドキュメントに基づいて、有用な箇所では具体的なコード例とファイルパスを示して回答してください。',
  es: 'Eres un experto en el framework PHP Nibiru. Responde basándote en la documentación, con ejemplos de código concretos y rutas de archivos donde sea útil.',
  fr: "Tu es expert du framework PHP Nibiru. Réponds sur la base de la documentation, avec des exemples de code concrets et des chemins de fichiers lorsque c'est utile.",
});
|
||||
|
||||
/**
 * Question openers per language code, four per language.
 *
 * questionFor() picks one entry by heading length and falls back to the
 * `en` row for languages that have no row here. It returns a fixed template
 * for `ja` before consulting this table, so the `ja` row is currently unused.
 *
 * The table and each row are frozen so the shared data cannot be mutated
 * by accident.
 */
const QUESTION_PREFIX = Object.freeze({
  en: Object.freeze(['How do I', 'What is', 'Explain', 'Show me']),
  de: Object.freeze(['Wie', 'Was ist', 'Erkläre', 'Zeig mir']),
  ja: Object.freeze(['', '', 'について教えてください:', '']),
  es: Object.freeze(['¿Cómo', '¿Qué es', 'Explica', 'Muéstrame']),
  fr: Object.freeze(['Comment', "Qu'est-ce que", 'Explique', 'Montre-moi']),
});
|
||||
|
||||
/**
 * Build the synthetic user question that is paired with a documentation chunk.
 *
 * The heading is the chunk's section title, falling back to its page title.
 * Japanese uses the fixed template "<heading> について教えてください。". Every
 * other language takes one opener from QUESTION_PREFIX (the English row when
 * the language has none), selected by `heading.length` so the same heading
 * always yields the same question, followed by the lower-cased heading.
 *
 * @param {{language?: string, sectionTitle?: string, pageTitle?: string, id?: string, url?: string}} chunk
 *   Chunk to describe; `language` defaults to 'en'.
 * @returns {string} The question text.
 * @throws {TypeError} If the chunk has neither a section title nor a page title.
 */
function questionFor(chunk) {
  const lang = chunk.language || 'en';
  const heading = chunk.sectionTitle || chunk.pageTitle;

  // Fail loudly for every language: without this guard Japanese would emit
  // the literal text "undefined" into the training data.
  if (typeof heading !== 'string') {
    const label = chunk.id ?? chunk.url ?? '(unknown)';
    throw new TypeError(`Chunk ${label} has no section or page title to build a question from`);
  }

  if (lang === 'ja') {
    return `${heading} について教えてください。`;
  }

  const prefixes = QUESTION_PREFIX[lang] || QUESTION_PREFIX.en;
  const prefix = prefixes[heading.length % prefixes.length];
  const topic = heading.toLowerCase();

  // Spanish and French output keeps a space before the question mark.
  if (lang === 'es' || lang === 'fr') {
    return `${prefix} ${topic} ?`;
  }
  return `${prefix} ${topic}?`;
}
|
||||
|
||||
/**
 * Create directory `d` together with any missing parent directories.
 *
 * @param {string} d - Directory path to create.
 * @returns {void}
 */
function ensureDir(d) {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(d, mkdirOptions);
}
|
||||
|
||||
/**
 * Write `items` to `filePath` as JSON Lines: one JSON.stringify()-ed item per
 * line, UTF-8 encoded. The parent directory is created first if needed.
 *
 * @param {string} filePath - Destination file; an existing file is overwritten.
 * @param {Iterable<unknown>} items - Values to serialise, one per line.
 * @returns {Promise<void>} Resolves once the stream has closed. Rejects if the
 *   file cannot be opened or written, or if an item cannot be serialised.
 */
function writeJsonl(filePath, items) {
  ensureDir(path.dirname(filePath));

  return new Promise((resolve, reject) => {
    const stream = fs.createWriteStream(filePath, { encoding: 'utf8' });

    // Without an 'error' listener a failed open/write is an unhandled 'error'
    // event that crashes the process instead of rejecting this promise.
    stream.once('error', reject);
    // 'close' also fires after an error, but by then the promise is already
    // rejected, so this resolve is a no-op in that case.
    stream.once('close', () => resolve());

    try {
      for (const item of items) {
        stream.write(`${JSON.stringify(item)}\n`);
      }
      stream.end();
    } catch (err) {
      // Serialisation failed (e.g. a circular structure): release the file
      // handle and surface the original error.
      stream.destroy();
      reject(err);
    }
  });
}
|
||||
|
||||
/**
 * Build the corpus: walk the docs tree, chunk every file, derive the four
 * record shapes from the chunks, write them as JSONL files under OUT_DIR,
 * then write and print a small stats.json summary.
 *
 * @returns {Promise<void>} Resolves once all files have been written.
 */
async function main() {
  console.log(`Walking ${DOCS_DIR}…`);
  const files = walkDocs(DOCS_DIR);
  const chunks = files.flatMap((file) => chunkFile(file, DOCS_DIR));
  console.log(`Produced ${chunks.length} chunks across ${files.length} files.`);

  // Small per-chunk helpers shared by the record builders below.
  const systemPromptFor = (chunk) => SYSTEM_PROMPT[chunk.language] || SYSTEM_PROMPT.en;
  const metadataFor = (chunk) => ({
    language: chunk.language,
    source: chunk.url,
    page: chunk.pageTitle,
  });

  // Raw chunks: one section per line.
  const chunksOut = chunks.map(
    ({ id, url, pageTitle, sectionTitle, language, tokens, content }) => ({
      id,
      url,
      pageTitle,
      sectionTitle,
      language,
      tokens,
      content,
    }),
  );

  // Instruction / input / output triples.
  const instructionsOut = chunks.map((chunk) => ({
    instruction: questionFor(chunk),
    input: '',
    output: chunk.content,
    metadata: metadataFor(chunk),
  }));

  // Chat-format messages (system / user / assistant).
  const chatOut = chunks.map((chunk) => ({
    messages: [
      { role: 'system', content: systemPromptFor(chunk) },
      { role: 'user', content: questionFor(chunk) },
      { role: 'assistant', content: chunk.content },
    ],
    metadata: metadataFor(chunk),
  }));

  // Prompt / completion pairs; the completion starts with a single space.
  const completionOut = chunks.map((chunk) => ({
    prompt: `${systemPromptFor(chunk)}\n\nQuestion: ${questionFor(chunk)}\n\nAnswer:`,
    completion: ` ${chunk.content}`,
  }));

  // Files are written one after another, in this order.
  const outputs = [
    ['chunks.jsonl', chunksOut],
    ['instructions.jsonl', instructionsOut],
    ['chat.jsonl', chatOut],
    ['completion.jsonl', completionOut],
  ];
  for (const [fileName, rows] of outputs) {
    await writeJsonl(path.join(OUT_DIR, fileName), rows);
  }

  const generatedAt = new Date().toISOString();
  // Tally of chunks per language code.
  const byLanguage = {};
  for (const { language } of chunks) {
    byLanguage[language] = (byLanguage[language] || 0) + 1;
  }
  const stats = {
    generatedAt,
    fileCount: files.length,
    chunkCount: chunks.length,
    byLanguage,
  };
  const statsJson = JSON.stringify(stats, null, 2);
  fs.writeFileSync(path.join(OUT_DIR, 'stats.json'), statsJson);

  console.log(`Wrote 4 JSONL files + stats.json to ${OUT_DIR}`);
  console.log(statsJson);
}
|
||||
|
||||
// Script entry point: run the build; on any failure print the error and exit
// with status 1.
try {
  await main();
} catch (err) {
  console.error(err);
  process.exit(1);
}
|
||||
Reference in New Issue
Block a user