Files
nibiru-framework.com/docs/scripts/build-oracle-index.mjs
stephan f4ccc45a3b Strip api.neuronetz.ai from documentation; chat config stays in env
The Ollama URL was leaking via:
  - prose in /en/, /de/, /ja/, /es/, /fr/ docs (oracle, deployment,
    local-testing, ai/module/{overview,embed,training})
  - code blocks teaching users to curl the host directly
  - .env.example, Dockerfile, docker-compose.yml defaults
  - providers.mjs, translate-docs.mjs, build-oracle-index.mjs defaults
  - LandingScripts.astro comment
  - lora-runbook.md prose + SSH host
  - the GET handler at /api/oracle which echoed `ollamaUrl` back to public callers
  - the "Oracle is silent" fallback message at /api/oracle POST

Replacements:
  - prose: "neuronetz.ai" → "your Ollama instance"
  - example URLs in code blocks: https://api.neuronetz.ai → https://your-ollama-host.example
  - code-level defaults: → http://localhost:11434 (Ollama's standard local port)
  - GET /api/oracle: dropped the `ollamaUrl` field; provider + model still exposed
  - runbook SSH host: neuronetz@cloud.neuronetz.ai → <gpu-user>@<gpu-host>

Production chat is unaffected: docs/.env (gitignored) on the production
host still pins OLLAMA_BASE_URL=https://api.neuronetz.ai. The only
change in the running container is that the GET handler no longer
echoes the URL.

analytics.neuronetz.ai (Umami tracking) is intentionally left intact —
it's a public, brand-owned subdomain meant to be visible.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 17:14:17 +02:00

118 lines
3.8 KiB
JavaScript

#!/usr/bin/env node
/**
* Build a vector index over the docs for the in-site Oracle (RAG).
*
* node scripts/build-oracle-index.mjs
*
* Defaults to Ollama at https://your-ollama-host.example with model nomic-embed-text.
* Override via env:
* OLLAMA_BASE_URL=...
* OLLAMA_EMBED_MODEL=... (e.g. nomic-embed-text, mxbai-embed-large)
* EMBED_PROVIDER=openai (uses OpenAI embeddings via OPENAI_API_KEY)
*
* Output: public/oracle-index.json
*
* Soft-fail behaviour: if the embedding provider is unreachable or the model
* is missing, an empty index is written and the runtime endpoint will operate
* in chat-only (no-RAG) mode.
*/
import fs from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { chunkFile, walkDocs } from './lib/chunk.mjs';
import { embed, embedConfig } from './lib/providers.mjs';
// ES modules have no built-in __dirname; derive this script's directory from its URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Root of the docs content handed to walkDocs() and chunkFile().
const DOCS_DIR = path.join(__dirname, '..', 'src', 'content', 'docs');
// Destination of the generated index (also used for the empty fallback index).
const OUT_FILE = path.join(__dirname, '..', 'public', 'oracle-index.json');
// Number of chunks passed to each embed() call.
const BATCH = 16;
/**
 * Encode an embedding vector as base64 text.
 *
 * The values are copied into a Float32Array (32-bit floats, platform byte
 * order) and the raw bytes of that array are base64-encoded.
 *
 * @param {ArrayLike<number>} f32 - the vector's components.
 * @returns {string} base64 of the packed float32 bytes ('' for an empty vector).
 */
function embedToBase64(f32) {
  const packed = new Float32Array(f32);
  const bytes = Buffer.from(packed.buffer, packed.byteOffset, packed.byteLength);
  return bytes.toString('base64');
}
/**
 * Build the Oracle index: chunk every file found under DOCS_DIR, embed the
 * chunks BATCH at a time, and write chunks + base64 embeddings to OUT_FILE.
 *
 * Soft-fail: a missing OpenAI key, or any error raised while embedding
 * (including a provider that returns the wrong number of vectors), is logged
 * and an empty index is written through writeEmpty() instead of throwing.
 *
 * @returns {Promise<void>} resolves once an index (full or empty) was written.
 */
async function main() {
  const cfg = embedConfig();
  console.log(`Embedding provider: ${cfg.provider}`);
  if (cfg.provider === 'ollama') {
    console.log(` Ollama: ${cfg.ollamaUrl}`);
    console.log(` Model: ${cfg.ollamaEmbedModel}`);
  } else if (cfg.provider === 'openai') {
    console.log(` OpenAI model: ${cfg.openaiEmbedModel}`);
    if (!cfg.hasOpenAIKey) {
      console.warn(' ⚠ OPENAI_API_KEY missing — writing empty index (chat-only mode).');
      writeEmpty('openai-key-missing');
      return;
    }
  }

  console.log(`Walking ${DOCS_DIR}`);
  const files = walkDocs(DOCS_DIR);
  console.log(`Found ${files.length} markdown files.`);
  const chunks = files.flatMap((f) => chunkFile(f, DOCS_DIR));
  console.log(`Produced ${chunks.length} chunks.`);

  const embeddings = [];
  try {
    for (let i = 0; i < chunks.length; i += BATCH) {
      const batch = chunks.slice(i, i + BATCH);
      // Each chunk is embedded together with its page and section titles.
      const inputs = batch.map((c) => `${c.pageTitle}\n${c.sectionTitle}\n\n${c.content}`);
      const vecs = await embed(inputs);
      const before = embeddings.length;
      for (const v of vecs) embeddings.push(v);
      // embeddings[k] is paired with chunks[k] by position in the output, so a
      // short or long batch would silently shift every later vector onto the
      // wrong chunk. Treat it as an embedding failure instead.
      const received = embeddings.length - before;
      if (received !== batch.length) {
        throw new Error(
          `embedding provider returned ${received} vectors for a batch of ${batch.length} inputs`
        );
      }
      process.stdout.write(`\r embedded ${embeddings.length}/${chunks.length}`);
    }
    process.stdout.write('\n');
  } catch (err) {
    console.error(`\n⚠ Embedding failed: ${err.message}`);
    console.error(` → writing empty index, Oracle will run in chat-only (no-RAG) mode.`);
    if (cfg.provider === 'ollama') {
      console.error(` → To fix: pull the embedding model on your Ollama server:`);
      console.error(` curl ${cfg.ollamaUrl}/api/pull -d '{"name":"${cfg.ollamaEmbedModel}"}'`);
    }
    writeEmpty(err.message);
    return;
  }

  // Vector dimension is taken from the first embedding; 0 when there are no chunks.
  const dim = embeddings[0]?.length ?? 0;
  const out = {
    provider: cfg.provider,
    model: cfg.provider === 'ollama' ? cfg.ollamaEmbedModel : cfg.openaiEmbedModel,
    dim,
    builtAt: new Date().toISOString(),
    chunks: chunks.map((c) => ({
      id: c.id,
      url: c.url,
      pageTitle: c.pageTitle,
      sectionTitle: c.sectionTitle,
      language: c.language,
      content: c.content,
    })),
    embeddings: embeddings.map(embedToBase64),
  };

  // Serialize once and reuse the string for both the write and the size report.
  const json = JSON.stringify(out);
  fs.mkdirSync(path.dirname(OUT_FILE), { recursive: true });
  fs.writeFileSync(OUT_FILE, json);
  // Report bytes on disk (UTF-8), not UTF-16 code units of the string.
  const kb = (Buffer.byteLength(json, 'utf8') / 1024).toFixed(1);
  console.log(`✔ Wrote ${OUT_FILE} (${kb} KB, dim=${dim})`);
}
/**
 * Write a placeholder index to OUT_FILE: no provider, no model, dim 0 and
 * empty `chunks` / `embeddings` arrays, pretty-printed with 2-space indent.
 * The output directory is created first if it does not exist.
 *
 * @param {string} reason - stored under `reason` to record why the index is empty.
 */
function writeEmpty(reason) {
  const placeholder = {
    provider: null,
    model: null,
    dim: 0,
    builtAt: null,
    reason,
    chunks: [],
    embeddings: [],
  };
  const outDir = path.dirname(OUT_FILE);
  fs.mkdirSync(outDir, { recursive: true });
  fs.writeFileSync(OUT_FILE, JSON.stringify(placeholder, null, 2));
}
// Entry point. Any rejection that escapes main() is logged, replaced by an
// empty index where possible, and the process still exits 0.
main().catch((e) => {
  console.error(e);
  try {
    // Rejection values are not guaranteed to be Error objects.
    writeEmpty(e?.message ?? String(e));
  } catch (writeErr) {
    // The fallback write can fail for the same reason main() did (e.g. an
    // unwritable output directory). Log it rather than letting it surface as
    // an unhandled rejection, which would exit non-zero and defeat the soft-fail.
    console.error(writeErr);
  }
  process.exit(0); // soft-fail so Docker build doesn't break the site
});