#!/usr/bin/env node
/**
 * Build a vector index over the docs for the in-site Oracle (RAG).
 *
 *   node scripts/build-oracle-index.mjs
 *
 * Defaults to Ollama at https://your-ollama-host.example with model nomic-embed-text.
 * Override via env:
 *   OLLAMA_BASE_URL=...
 *   OLLAMA_EMBED_MODEL=...   (e.g. nomic-embed-text, mxbai-embed-large)
 *   EMBED_PROVIDER=openai    (uses OpenAI embeddings via OPENAI_API_KEY)
 *
 * Output: public/oracle-index.json
 *
 * Soft-fail behaviour: if the embedding provider is unreachable or the model
 * is missing, an empty index is written and the runtime endpoint will operate
 * in chat-only (no-RAG) mode.
 */
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { chunkFile, walkDocs } from './lib/chunk.mjs';
import { embed, embedConfig } from './lib/providers.mjs';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const DOCS_DIR = path.resolve(__dirname, '../src/content/docs');
const OUT_FILE = path.resolve(__dirname, '../public/oracle-index.json');
// Number of chunks sent to the embedding provider per embed() call.
const BATCH = 16;

/**
 * Encode one embedding vector as the base64 of its 32-bit float bytes.
 *
 * The values are copied into a fresh Float32Array first, so the encoded
 * buffer contains exactly this vector (4 bytes per value, in the byte order
 * of the machine running the build).
 *
 * @param {ArrayLike<number>} f32 - Embedding values.
 * @returns {string} Base64-encoded Float32 bytes.
 */
function embedToBase64(f32) {
  return Buffer.from(new Float32Array(f32).buffer).toString('base64');
}

/**
 * Turn a thrown value into a human-readable reason string.
 *
 * Falls back to String(err) so that a non-Error rejection still leaves a
 * `reason` in the empty index (JSON.stringify drops `undefined` values).
 *
 * @param {unknown} err - The caught value.
 * @returns {string} Message describing the failure.
 */
function errorReason(err) {
  return err?.message ?? String(err);
}

/**
 * Embed every chunk, BATCH chunks per provider call, reporting progress on stdout.
 *
 * @param {Array<{pageTitle: string, sectionTitle: string, content: string}>} chunks
 *   Chunks to embed; each is sent as "pageTitle\nsectionTitle\n\ncontent".
 * @returns {Promise<Array>} One embedding per chunk, in chunk order.
 * @throws {Error} If the provider call fails, or returns a number of vectors
 *   different from the number of inputs in the batch.
 */
async function embedChunks(chunks) {
  const embeddings = [];
  for (let i = 0; i < chunks.length; i += BATCH) {
    const batch = chunks.slice(i, i + BATCH);
    const inputs = batch.map((c) => `${c.pageTitle}\n${c.sectionTitle}\n\n${c.content}`);
    const vecs = Array.from(await embed(inputs));
    // The index stores chunks and embeddings as parallel arrays, so a short or
    // long batch would silently pair every later chunk with the wrong vector.
    if (vecs.length !== inputs.length) {
      throw new Error(
        `embedding provider returned ${vecs.length} vectors for ${inputs.length} inputs`
      );
    }
    embeddings.push(...vecs);
    process.stdout.write(`\r embedded ${embeddings.length}/${chunks.length}`);
  }
  process.stdout.write('\n');
  return embeddings;
}

/**
 * Build the index: walk the docs, chunk them, embed the chunks and write
 * OUT_FILE. Any embedding failure (or a missing OpenAI key) results in an
 * empty index being written instead of an error being thrown.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const cfg = embedConfig();
  console.log(`Embedding provider: ${cfg.provider}`);
  if (cfg.provider === 'ollama') {
    console.log(` Ollama: ${cfg.ollamaUrl}`);
    console.log(` Model: ${cfg.ollamaEmbedModel}`);
  } else if (cfg.provider === 'openai') {
    console.log(` OpenAI model: ${cfg.openaiEmbedModel}`);
    if (!cfg.hasOpenAIKey) {
      console.warn(' ⚠ OPENAI_API_KEY missing — writing empty index (chat-only mode).');
      writeEmpty('openai-key-missing');
      return;
    }
  }

  console.log(`Walking ${DOCS_DIR}…`);
  const files = walkDocs(DOCS_DIR);
  console.log(`Found ${files.length} markdown files.`);

  const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
  console.log(`Produced ${chunks.length} chunks.`);

  let embeddings;
  try {
    embeddings = await embedChunks(chunks);
  } catch (err) {
    const reason = errorReason(err);
    console.error(`\n⚠ Embedding failed: ${reason}`);
    console.error(` → writing empty index, Oracle will run in chat-only (no-RAG) mode.`);
    if (cfg.provider === 'ollama') {
      console.error(` → To fix: pull the embedding model on your Ollama server:`);
      console.error(` curl ${cfg.ollamaUrl}/api/pull -d '{"name":"${cfg.ollamaEmbedModel}"}'`);
    }
    writeEmpty(reason);
    return;
  }

  // dim is taken from the first vector; 0 when there were no chunks at all.
  const dim = embeddings[0]?.length ?? 0;
  const out = {
    provider: cfg.provider,
    model: cfg.provider === 'ollama' ? cfg.ollamaEmbedModel : cfg.openaiEmbedModel,
    dim,
    builtAt: new Date().toISOString(),
    chunks: chunks.map((c) => ({
      id: c.id,
      url: c.url,
      pageTitle: c.pageTitle,
      sectionTitle: c.sectionTitle,
      language: c.language,
      content: c.content,
    })),
    embeddings: embeddings.map(embedToBase64),
  };

  // Serialize once: the index can be large, and the same string is both
  // written to disk and measured for the size report.
  const json = JSON.stringify(out);
  fs.mkdirSync(path.dirname(OUT_FILE), { recursive: true });
  fs.writeFileSync(OUT_FILE, json);
  // byteLength (UTF-8) matches the size on disk; String#length would count
  // UTF-16 code units and under-report non-ASCII docs.
  const kb = (Buffer.byteLength(json) / 1024).toFixed(1);
  console.log(`✔ Wrote ${OUT_FILE} (${kb} KB, dim=${dim})`);
}

/**
 * Write an empty index to OUT_FILE, recording why no embeddings were produced.
 *
 * @param {string} reason - Short explanation stored in the index's `reason` field.
 * @returns {void}
 */
function writeEmpty(reason) {
  fs.mkdirSync(path.dirname(OUT_FILE), { recursive: true });
  fs.writeFileSync(
    OUT_FILE,
    JSON.stringify(
      { provider: null, model: null, dim: 0, builtAt: null, reason, chunks: [], embeddings: [] },
      null,
      2
    )
  );
}

main().catch((e) => {
  console.error(e);
  try {
    writeEmpty(errorReason(e));
  } catch (writeErr) {
    // A throw here would surface as an unhandled rejection and a non-zero
    // exit, defeating the soft-fail below; log it and carry on.
    console.error(writeErr);
  }
  process.exit(0); // soft-fail so Docker build doesn't break the site
});