Files
nibiru-framework.com/application/module/ai/plugins/rag.php
stephan 48c839d927 Initial public push: docs cosmos v4 + AI module + framework groundwork
This is the snapshot the production landing site (nibiru-framework.com) is
deployed from. Brings together the recent splash + docs migration to the v4
"Cosmos" design system, the new in-framework AI module, and the framework
groundwork that backs the framework-reference extraction.

What lands:
- docs/: Astro + Starlight site with the v4 dark cosmic palette, GalaxyHero
  canvas constellation, Mission Control chat (wired to /api/oracle →
  api.neuronetz.ai via providers.mjs Ollama), 5-panel MMVC stage
  (Model · AI · Module · Controller · View), translated EN/DE/JA/ES/FR
  content, PWA + sitemap + llms.txt + Umami analytics.
- docs/design-system/: canonical mockup bundle (source/index-v2.html for
  splash, source/docs-system.html + preview/ for docs, SPEC.md, tokens).
- docs/scripts/extraction/framework-reference-v2.md: deep framework
  reference (~1.6k lines, file:line citations, every public factory and
  idiom) — the basis for the LoRA training corpus.
- application/module/ai/: AI module with chat / embed / RAG / agent
  plugins, plus pdoQuery / httpGet / fileRead tools and Modelfile +
  smoke-test in training/.
- application/module/users/: user / ACL / form-factory traits used as the
  reference plugin pattern for the framework docs.
- application/settings/config/database/: schema + seed migrations
  including the AI module tables (200–203).
- Form factory + autogenerator changes the framework-reference-v2 covers.

Production secrets stay out: docs/.env, settings.production.ini and
ai.production.ini are all gitignored (.example files are in tree).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-08 15:22:18 +02:00

316 lines
11 KiB
PHP

<?php
namespace Nibiru\Module\Ai\Plugins;
/**
 * Retrieval-Augmented Generation. Ingest text or files, then ask
 * grounded questions backed by cosine retrieval over the chunks.
 *
 *   $rag = $ai->rag('docs');
 *   $rag->ingestText('The dispatcher runs every request.', ['source' => 'note-1']);
 *   $rag->ingestFile(__DIR__ . '/manual.md');
 *   $rag->ingestDir(__DIR__ . '/articles/');
 *
 *   $answer = $rag->ask('How does the dispatcher work?');
 *   $hits   = $rag->search('dispatcher', 5);
 *
 * Storage is selected by the `rag_storage` config value (see backend()):
 *   - "json" (default): a single JSON file per collection at
 *     <storage_path>/<collection>.json
 *   - "database": the ai_rag_collection / ai_rag_chunk tables via \Nibiru\Pdo
 *
 * Either way the whole collection is loaded into memory on first use and
 * scanned linearly by search().
 */
class Rag
{
    /** Crude bytes-per-token ratio used for every size estimate in this class. */
    private const BYTES_PER_TOKEN = 4;

    protected string $collection;
    protected \stdClass $cfg;
    protected Chat $chat;
    protected Embed $embed;
    /** @var array{chunks:array,embeddings:array} */
    protected array $index = ['chunks' => [], 'embeddings' => []];
    protected bool $loaded = false;
    /** Cached ai_rag_collection primary key (database backend only). */
    protected ?int $collectionId = null;
    /** While true, save() is a no-op so batch ingestion persists only once. */
    protected bool $deferSave = false;

    /**
     * @param string    $collection Collection name; every character outside
     *                              [A-Za-z0-9_-] is stripped. An empty result
     *                              falls back to "default".
     * @param \stdClass $cfg        Module configuration (rag_* / embed_* /
     *                              chat_system_prompt keys are read here).
     * @param Chat      $chat       Chat plugin used by ask().
     * @param Embed     $embed      Embedding plugin used for chunks and queries.
     */
    public function __construct(string $collection, \stdClass $cfg, Chat $chat, Embed $embed)
    {
        $clean = preg_replace('/[^a-z0-9_-]/i', '', $collection);
        // Compare against '' explicitly: a truthiness test would turn the
        // perfectly valid name "0" into "default".
        $this->collection = ($clean === null || $clean === '') ? 'default' : $clean;
        $this->cfg = $cfg;
        $this->chat = $chat;
        $this->embed = $embed;
    }

    // -----------------------------------------------------------------------
    // Ingestion
    // -----------------------------------------------------------------------

    /**
     * Add a single chunk of text to the collection and persist it.
     *
     * @param string $text     Chunk text; blank text is ignored.
     * @param array  $metadata Arbitrary metadata stored next to the chunk.
     */
    public function ingestText(string $text, array $metadata = []): void
    {
        $this->load();
        $this->addChunk($text, $metadata);
        $this->save();
    }

    /**
     * Read a file, chunk it, and ingest each chunk with the file path as its
     * `source` metadata.
     *
     * @return int Number of chunks produced from the file.
     * @throws \RuntimeException When the file is not readable or cannot be read.
     */
    public function ingestFile(string $path): int
    {
        if (!is_readable($path)) {
            throw new \RuntimeException("RAG ingest: $path is not readable.");
        }
        $body = file_get_contents($path);
        if ($body === false) {
            // e.g. $path is a directory: readable, but not a file we can slurp.
            throw new \RuntimeException("RAG ingest: $path could not be read.");
        }
        $this->load();
        $chunks = $this->chunk($body);
        foreach ($chunks as $c) {
            $this->addChunk($c, ['source' => $path]);
        }
        $this->save();
        return count($chunks);
    }

    /**
     * Recursively ingest every file under a directory whose name ends in one
     * of the given extensions (case-insensitive).
     *
     * @param string   $dir        Directory to walk.
     * @param string[] $extensions Extensions with or without a leading dot.
     * @return int Total number of chunks produced.
     * @throws \RuntimeException When $dir is not a directory, or a file fails to ingest.
     */
    public function ingestDir(string $dir, array $extensions = ['md', 'txt', 'php']): int
    {
        if (!is_dir($dir)) {
            throw new \RuntimeException("RAG ingest: $dir is not a directory.");
        }
        $quoted = [];
        foreach ($extensions as $ext) {
            $ext = ltrim((string) $ext, '.');
            if ($ext !== '') {
                // Pass the delimiter so a "/" inside an extension cannot break the pattern.
                $quoted[] = preg_quote($ext, '/');
            }
        }
        if ($quoted === []) {
            return 0;
        }
        $extPattern = '/\.(' . implode('|', $quoted) . ')$/i';
        $iter = new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(
            $dir,
            \RecursiveDirectoryIterator::SKIP_DOTS
        ));

        // Load up front so the deferred save below can never write an index
        // that has not been read from storage yet.
        $this->load();
        $count = 0;
        $attempted = false;
        $this->deferSave = true;
        try {
            foreach ($iter as $entry) {
                if (!$entry->isFile() || !preg_match($extPattern, $entry->getFilename())) {
                    continue;
                }
                $attempted = true;
                $count += $this->ingestFile($entry->getPathname());
            }
        } finally {
            // Persist once for the whole walk instead of rewriting the JSON
            // file after every single file; also keeps partial progress when
            // a file fails midway.
            $this->deferSave = false;
            if ($attempted) {
                $this->save();
            }
        }
        return $count;
    }

    /**
     * Forget every chunk in this collection. Deletes the collection's rows in
     * database mode, or its JSON storage file otherwise.
     */
    public function reset(): void
    {
        $this->index = ['chunks' => [], 'embeddings' => []];
        $this->loaded = true;
        $this->collectionId = null; // the row is about to disappear
        if ($this->backend() === 'database') {
            \Nibiru\Pdo::query(
                'DELETE c FROM ai_rag_chunk c '
                . 'INNER JOIN ai_rag_collection o ON o.ai_rag_collection_id = c.ai_rag_chunk_collection_id '
                . 'WHERE o.ai_rag_collection_name = :name',
                [':name' => $this->collection]
            );
            \Nibiru\Pdo::delete('ai_rag_collection', ['ai_rag_collection_name' => $this->collection]);
            return;
        }
        $path = $this->storagePath();
        // Best effort: a file that vanished in the meantime is not an error.
        if (is_file($path)) {
            @unlink($path);
        }
    }

    // -----------------------------------------------------------------------
    // Querying
    // -----------------------------------------------------------------------

    /**
     * Top-K cosine similarity over every stored chunk.
     *
     * @param string   $query Text to embed and compare against the collection.
     * @param int|null $k     Maximum number of hits; defaults to the
     *                        `rag_top_k` config value (6 when unset).
     * @return array List of ['score' => ..., 'text' => string, 'metadata' => array],
     *               best match first. Empty when the collection is empty or $k <= 0.
     */
    public function search(string $query, ?int $k = null): array
    {
        $this->load();
        if (empty($this->index['embeddings'])) {
            return [];
        }
        $k = $k ?? (int) ($this->cfg->rag_top_k ?? 6);
        if ($k <= 0) {
            // A negative length would make array_slice() return "all but the
            // last |k|" hits; treat non-positive K as "no results" and skip
            // the embedding call entirely.
            return [];
        }
        $qv = $this->embed->one($query);
        $scored = [];
        foreach ($this->index['embeddings'] as $i => $packed) {
            $vec = Embed::unpack($packed);
            $scored[] = [
                'score'    => Embed::cosine($qv, $vec),
                'text'     => $this->index['chunks'][$i]['text'] ?? '',
                'metadata' => $this->index['chunks'][$i]['metadata'] ?? [],
            ];
        }
        usort($scored, fn($a, $b) => $b['score'] <=> $a['score']);
        return array_slice($scored, 0, $k);
    }

    /**
     * Search the collection, then ask the LLM with the top-K chunks as
     * numbered context. Falls back to a plain chat question when nothing
     * is retrieved.
     *
     * @param string   $question User question.
     * @param int|null $k        Number of chunks to retrieve (see search()).
     * @return string The chat plugin's answer.
     */
    public function ask(string $question, ?int $k = null): string
    {
        $hits = $this->search($question, $k);
        if (empty($hits)) {
            return $this->chat->reset()->ask($question);
        }
        $context = '';
        foreach ($hits as $i => $h) {
            $context .= '[' . ($i + 1) . '] ' . trim($h['text']) . "\n\n---\n\n";
        }
        // The configured system prompt is only prepended when the chat has no
        // history yet. NOTE(review): the chat is reset right after, so confirm
        // against Chat whether skipping the prompt here is intended.
        $sys = ($this->chat->history() ? '' : (string) ($this->cfg->chat_system_prompt ?? ''));
        $sys .= "\n\nUse these excerpts to answer. Cite by number like [1].\n\n" . $context;
        return $this->chat->reset()->system(trim($sys))->ask($question);
    }

    // -----------------------------------------------------------------------
    // Internals
    // -----------------------------------------------------------------------

    /**
     * Embed one chunk and append it to the in-memory index (and, in database
     * mode, to ai_rag_chunk). Blank text is skipped.
     */
    protected function addChunk(string $text, array $metadata): void
    {
        $text = trim($text);
        if ($text === '') {
            return;
        }
        $vec = $this->embed->one($text);
        $packed = Embed::pack($vec);
        if ($this->backend() === 'database') {
            // Write to the database first: if the insert throws, the
            // in-memory index must not contain a chunk that was never stored.
            \Nibiru\Pdo::insert('ai_rag_chunk', [
                'ai_rag_chunk_collection_id' => $this->dbCollectionId(),
                'ai_rag_chunk_text'          => $text,
                // Substitute invalid UTF-8 instead of letting json_encode() return false.
                'ai_rag_chunk_metadata'      => json_encode($metadata, JSON_UNESCAPED_UNICODE | JSON_INVALID_UTF8_SUBSTITUTE),
                'ai_rag_chunk_embedding'     => $packed,
                'ai_rag_chunk_token_count'   => $this->estimateTokens($text),
                'ai_rag_chunk_source'        => isset($metadata['source']) ? (string) $metadata['source'] : null,
            ]);
        }
        $this->index['chunks'][] = ['text' => $text, 'metadata' => $metadata];
        $this->index['embeddings'][] = $packed;
    }

    /**
     * Split a document into chunks of roughly `rag_chunk_target` tokens.
     *
     * Paragraphs (separated by blank lines) are merged until the target is
     * reached; a buffer is only flushed once it holds at least
     * `rag_chunk_min` tokens. A paragraph above `rag_chunk_max` is split on
     * sentence boundaries (and on whitespace for over-long sentences) and
     * re-packed up to the target. Blank paragraphs are dropped, so every
     * returned chunk contains text.
     *
     * @return string[]
     */
    protected function chunk(string $body): array
    {
        $target = (int) ($this->cfg->rag_chunk_target ?? 600);
        $min    = (int) ($this->cfg->rag_chunk_min ?? 120);
        $max    = (int) ($this->cfg->rag_chunk_max ?? 900);

        $paragraphs = preg_split('/\n\s*\n/', $body) ?: [];
        $out = [];
        $buf = '';
        $bufTokens = 0;
        foreach ($paragraphs as $p) {
            if (trim($p) === '') {
                continue; // would only add whitespace (or an empty chunk)
            }
            $pTokens = $this->estimateTokens($p);
            if ($bufTokens + $pTokens > $target && $bufTokens >= $min) {
                $out[] = $buf;
                $buf = '';
                $bufTokens = 0;
            }
            if ($pTokens > $max) {
                if ($buf !== '') {
                    $out[] = $buf;
                    $buf = '';
                    $bufTokens = 0;
                }
                foreach ($this->splitOversized($p, $target, $max) as $piece) {
                    $out[] = $piece;
                }
                continue;
            }
            $buf .= ($buf === '' ? '' : "\n\n") . $p;
            $bufTokens += $pTokens;
        }
        if ($buf !== '') {
            $out[] = $buf;
        }
        return $out;
    }

    /**
     * Break one overlarge paragraph into pieces of at most ~$target tokens:
     * sentences first, whitespace-separated words for any sentence that is
     * itself above $max.
     *
     * @return string[]
     */
    private function splitOversized(string $paragraph, int $target, int $max): array
    {
        $sentences = preg_split('/(?<=[.!?])\s+/', $paragraph, -1, PREG_SPLIT_NO_EMPTY) ?: [$paragraph];
        $units = [];
        foreach ($sentences as $s) {
            if ($this->estimateTokens($s) <= $max) {
                $units[] = $s;
                continue;
            }
            $words = preg_split('/\s+/', $s, -1, PREG_SPLIT_NO_EMPTY) ?: [$s];
            foreach ($words as $w) {
                $units[] = $w;
            }
        }
        return $this->packUnits($units, $target);
    }

    /**
     * Greedily join units with single spaces, starting a new piece whenever
     * adding the next unit would push the piece above $target tokens. A unit
     * that is larger than $target on its own is emitted unsplit.
     *
     * @param string[] $units
     * @return string[]
     */
    private function packUnits(array $units, int $target): array
    {
        $out = [];
        $buf = '';
        foreach ($units as $u) {
            $candidate = $buf === '' ? $u : $buf . ' ' . $u;
            if ($buf !== '' && $this->estimateTokens($candidate) > $target) {
                $out[] = $buf;
                $candidate = $u;
            }
            $buf = $candidate;
        }
        if ($buf !== '') {
            $out[] = $buf;
        }
        return $out;
    }

    /** Rough token estimate: byte length divided by BYTES_PER_TOKEN, rounded up. */
    private function estimateTokens(string $text): int
    {
        return (int) ceil(strlen($text) / self::BYTES_PER_TOKEN);
    }

    /**
     * Storage backend, controlled by [AI] rag.storage in ai.ini:
     *   "json"     — single JSON file per collection (default; great for dev)
     *   "database" — uses ai_rag_collection / ai_rag_chunk tables via \Nibiru\Pdo
     *                (recommended for production; survives load-balancer fan-out)
     *
     * Any value other than "database" (case-insensitive) selects "json".
     */
    protected function backend(): string
    {
        $b = strtolower((string) ($this->cfg->rag_storage ?? 'json'));
        return $b === 'database' ? 'database' : 'json';
    }

    /**
     * Absolute path of this collection's JSON file. The configured
     * `rag_storage_path` is appended to this file's directory; the directory
     * is created on a best-effort basis.
     *
     * NOTE(review): the default base climbs two levels from this directory and
     * then descends through application/module/ai/ again — confirm that this
     * resolves to the intended cache directory for the deployed layout.
     */
    protected function storagePath(): string
    {
        $base = $this->cfg->rag_storage_path
            ?? '/../../application/module/ai/cache/rag/';
        $dir = realpath(__DIR__ . $base) ?: (__DIR__ . $base);
        if (!is_dir($dir)) {
            // Failure surfaces later: save() throws when the file cannot be written.
            @mkdir($dir, 0775, true);
        }
        return rtrim($dir, '/\\') . '/' . $this->collection . '.json';
    }

    /**
     * Populate the in-memory index from the active backend, once per instance.
     * A missing or unparsable JSON file leaves the index empty.
     */
    protected function load(): void
    {
        if ($this->loaded) {
            return;
        }
        if ($this->backend() === 'database') {
            $this->loadFromDatabase();
        } else {
            $path = $this->storagePath();
            if (is_file($path)) {
                $raw = json_decode((string) file_get_contents($path), true);
                if (
                    is_array($raw)
                    && is_array($raw['chunks'] ?? null)
                    && is_array($raw['embeddings'] ?? null)
                ) {
                    $this->index = $raw;
                }
            }
        }
        $this->loaded = true;
    }

    /**
     * Persist the in-memory index to the JSON file. No-op in database mode
     * (rows are written incrementally by addChunk()) and while a batch
     * ingestion has deferred saving.
     *
     * @throws \RuntimeException When the index cannot be encoded or written.
     */
    protected function save(): void
    {
        if ($this->deferSave || $this->backend() === 'database') {
            return;
        }
        $json = json_encode($this->index, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            // Never hand `false` to file_put_contents(): that would replace
            // the existing collection with an empty file.
            throw new \RuntimeException(
                'RAG save: cannot encode collection "' . $this->collection . '": ' . json_last_error_msg()
            );
        }
        $path = $this->storagePath();
        if (file_put_contents($path, $json) === false) {
            throw new \RuntimeException("RAG save: could not write $path.");
        }
    }

    /**
     * Load from ai_rag_collection + ai_rag_chunk tables. Uses Nibiru's
     * `\Nibiru\Pdo` adapter — the tables are populated by migrations
     * 200-ai_rag_collection.sql and 201-ai_rag_chunk.sql.
     *
     * Rows are appended in ai_rag_chunk_id order so chunk and embedding
     * positions stay aligned.
     */
    protected function loadFromDatabase(): void
    {
        $rows = \Nibiru\Pdo::fetchAll(
            'SELECT c.ai_rag_chunk_text AS text, c.ai_rag_chunk_metadata AS metadata, '
            . ' c.ai_rag_chunk_embedding AS embedding '
            . 'FROM ai_rag_chunk c '
            . 'INNER JOIN ai_rag_collection o ON o.ai_rag_collection_id = c.ai_rag_chunk_collection_id '
            . 'WHERE o.ai_rag_collection_name = :name '
            . 'ORDER BY c.ai_rag_chunk_id',
            [':name' => $this->collection]
        );
        if (!is_iterable($rows)) {
            return; // nothing usable came back; keep the index empty
        }
        foreach ($rows as $r) {
            $meta = $r['metadata'] ?? [];
            $this->index['chunks'][] = [
                'text'     => (string) $r['text'],
                'metadata' => is_string($meta) ? (json_decode($meta, true) ?: []) : (array) $meta,
            ];
            $this->index['embeddings'][] = (string) $r['embedding'];
        }
    }

    /**
     * Resolve (or create) the collection's row in ai_rag_collection.
     * Called lazily by addChunk() in database mode; the id is cached on the
     * instance so bulk ingestion does not re-query it for every chunk.
     */
    protected function dbCollectionId(): int
    {
        if ($this->collectionId !== null) {
            return $this->collectionId;
        }
        $row = \Nibiru\Pdo::fetchRow(
            'SELECT ai_rag_collection_id AS id FROM ai_rag_collection '
            . 'WHERE ai_rag_collection_name = :name',
            [':name' => $this->collection]
        );
        if ($row && isset($row['id'])) {
            $id = (int) $row['id'];
        } else {
            \Nibiru\Pdo::insert('ai_rag_collection', [
                'ai_rag_collection_name'        => $this->collection,
                'ai_rag_collection_embed_model' => (string) ($this->cfg->embed_model ?? ''),
                'ai_rag_collection_embed_dim'   => (int) ($this->cfg->embed_dim ?? 0),
            ]);
            $id = (int) \Nibiru\Pdo::lastInsertId();
        }
        if ($id > 0) {
            // Only cache a real key; a 0 means the lookup/insert gave us nothing.
            $this->collectionId = $id;
        }
        return $id;
    }

    /**
     * Number of chunks currently in the collection.
     */
    public function size(): int
    {
        $this->load();
        return count($this->index['chunks']);
    }
}