rag('docs');
 *   $rag->ingestText('The dispatcher runs every request.', ['source' => 'note-1']);
 *   $rag->ingestFile(__DIR__ . '/manual.md');
 *   $rag->ingestDir(__DIR__ . '/articles/');
 *
 *   $answer = $rag->ask('How does the dispatcher work?');
 *   $hits   = $rag->search('dispatcher', 5);
 *
 * Storage (selected by the rag_storage config value, see backend()):
 *   - "json" (default): a single JSON file per collection at
 *       {rag_storage_path}/{collection}.json
 *     No database required. Restartable. ~10k chunks fits in memory comfortably.
 *   - "database": the ai_rag_collection / ai_rag_chunk tables via \Nibiru\Pdo.
 */
class Rag
{
    /** Sanitised collection name; used as the JSON file name and the DB lookup key. */
    protected string $collection;

    /** Configuration object; the rag_*, chat_* and embed_* properties are read from it. */
    protected \stdClass $cfg;

    protected Chat $chat;

    protected Embed $embed;

    /**
     * In-memory index. `chunks[i]` and `embeddings[i]` describe the same chunk.
     *
     * @var array{chunks:array,embeddings:array}
     */
    protected array $index = ['chunks' => [], 'embeddings' => []];

    /** True once the index has been read from storage (or reset). */
    protected bool $loaded = false;

    /**
     * @param string    $collection Collection name. Every character outside
     *                              [A-Za-z0-9_-] is removed; an empty result
     *                              falls back to "default".
     * @param \stdClass $cfg        Configuration values.
     * @param Chat      $chat       Chat client used by ask().
     * @param Embed     $embed      Embedding client used for chunks and queries.
     */
    public function __construct(string $collection, \stdClass $cfg, Chat $chat, Embed $embed)
    {
        $safe = preg_replace('/[^a-z0-9_-]/i', '', $collection);

        // Compare explicitly: the short ternary `?:` would also throw away the
        // perfectly valid collection name "0".
        $this->collection = ($safe === null || $safe === '') ? 'default' : $safe;
        $this->cfg        = $cfg;
        $this->chat       = $chat;
        $this->embed      = $embed;
    }

    // -----------------------------------------------------------------------
    // Ingestion
    // -----------------------------------------------------------------------

    /**
     * Add a single chunk of text to the collection and persist it.
     *
     * Text that is empty after trimming is ignored (see addChunk()).
     *
     * @param string $text     Chunk text.
     * @param array  $metadata Arbitrary metadata stored next to the chunk.
     *
     * @throws \RuntimeException When the collection cannot be persisted.
     */
    public function ingestText(string $text, array $metadata = []): void
    {
        $this->load();
        $this->addChunk($text, $metadata);
        $this->save();
    }

    /**
     * Read a file, chunk it, and ingest each chunk with ['source' => $path]
     * as metadata.
     *
     * @param string $path File to ingest.
     *
     * @return int Number of chunks actually added to the collection.
     *
     * @throws \RuntimeException When the file cannot be read or the collection
     *                           cannot be persisted.
     */
    public function ingestFile(string $path): int
    {
        if (!is_readable($path)) {
            throw new \RuntimeException("RAG ingest: $path is not readable.");
        }

        $body = file_get_contents($path);
        if ($body === false) {
            throw new \RuntimeException("RAG ingest: $path could not be read.");
        }

        $this->load();

        // Count by index growth rather than by chunk() output so the return
        // value stays truthful even if addChunk() skips a blank chunk.
        $before = count($this->index['chunks']);
        foreach ($this->chunk($body) as $c) {
            $this->addChunk($c, ['source' => $path]);
        }
        $this->save();

        return count($this->index['chunks']) - $before;
    }

    /**
     * Recursively ingest every file under a directory whose name ends in one
     * of the given extensions (case-insensitive).
     *
     * @param string $dir        Directory to walk.
     * @param array  $extensions Extensions without the leading dot.
     *
     * @return int Total number of chunks added across all files.
     *
     * @throws \RuntimeException When $dir is not a directory, or when
     *                           ingestFile() fails for a matching file.
     */
    public function ingestDir(string $dir, array $extensions = ['md', 'txt', 'php']): int
    {
        if (!is_dir($dir)) {
            throw new \RuntimeException("RAG ingest: $dir is not a directory.");
        }

        // An empty alternation would produce /\.()$/i, which matches names
        // ending in a bare dot; nothing was asked for, so nothing is ingested.
        if ($extensions === []) {
            return 0;
        }

        // Quote with the pattern delimiter so an extension containing "/"
        // cannot terminate the regex early.
        $quoted     = array_map(static fn($e) => preg_quote((string) $e, '/'), $extensions);
        $extPattern = '/\.(' . implode('|', $quoted) . ')$/i';

        $iter = new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(
            $dir,
            \RecursiveDirectoryIterator::SKIP_DOTS
        ));

        $count = 0;
        foreach ($iter as $entry) {
            if (!$entry->isFile()) {
                continue;
            }
            if (!preg_match($extPattern, $entry->getFilename())) {
                continue;
            }
            $count += $this->ingestFile($entry->getPathname());
        }

        return $count;
    }

    /**
     * Forget every chunk in this collection.
     *
     * JSON backend: deletes the storage file if it exists.
     * Database backend: deletes the collection's chunk rows and its
     * ai_rag_collection row.
     */
    public function reset(): void
    {
        $this->index  = ['chunks' => [], 'embeddings' => []];
        // The empty in-memory index is now authoritative; do not re-read storage.
        $this->loaded = true;

        if ($this->backend() === 'database') {
            \Nibiru\Pdo::query(
                'DELETE c FROM ai_rag_chunk c '
                . 'INNER JOIN ai_rag_collection o ON o.ai_rag_collection_id = c.ai_rag_chunk_collection_id '
                . 'WHERE o.ai_rag_collection_name = :name',
                [':name' => $this->collection]
            );
            \Nibiru\Pdo::delete('ai_rag_collection', ['ai_rag_collection_name' => $this->collection]);
            return;
        }

        $path = $this->storagePath();
        if (is_file($path)) {
            @unlink($path);
        }
    }

    // -----------------------------------------------------------------------
    // Querying
    // -----------------------------------------------------------------------

    /**
     * Top-K cosine similarity search over the whole collection.
     *
     * @param string   $query Text to embed and compare against every chunk.
     * @param int|null $k     Maximum number of hits; null uses the rag_top_k
     *                        config value (default 6). Values <= 0 yield [].
     *
     * @return array List of ['score' => ..., 'text' => string, 'metadata' => array],
     *               highest score first.
     */
    public function search(string $query, ?int $k = null): array
    {
        $this->load();
        if (empty($this->index['embeddings'])) {
            return [];
        }

        $k = $k ?? (int) ($this->cfg->rag_top_k ?? 6);
        if ($k <= 0) {
            // A negative length would make array_slice() return "all but the
            // last |k|" hits; bail out before spending an embedding call.
            return [];
        }

        $qv     = $this->embed->one($query);
        $scored = [];
        foreach ($this->index['embeddings'] as $i => $packed) {
            $vec      = Embed::unpack($packed);
            $scored[] = [
                'score'    => Embed::cosine($qv, $vec),
                'text'     => $this->index['chunks'][$i]['text'] ?? '',
                'metadata' => $this->index['chunks'][$i]['metadata'] ?? [],
            ];
        }

        usort($scored, fn($a, $b) => $b['score'] <=> $a['score']);

        return array_slice($scored, 0, $k);
    }

    /**
     * Search the collection, then ask the LLM with the top-K chunks as context.
     *
     * With no hits the question is sent to a reset chat without extra context.
     * Otherwise the hits are numbered [1], [2], ... and placed in the system
     * message together with an instruction to cite them by number.
     *
     * @param string   $question Question to answer.
     * @param int|null $k        Forwarded to search().
     *
     * @return string The chat client's answer.
     */
    public function ask(string $question, ?int $k = null): string
    {
        $hits = $this->search($question, $k);
        if (empty($hits)) {
            return $this->chat->reset()->ask($question);
        }

        $context = '';
        foreach ($hits as $i => $h) {
            $context .= '[' . ($i + 1) . '] ' . trim($h['text']) . "\n\n---\n\n";
        }

        // The configured system prompt is only prepended when the chat has no
        // history yet; history() is read before the chat is reset below.
        $sys  = ($this->chat->history() ? '' : (string) ($this->cfg->chat_system_prompt ?? ''));
        $sys .= "\n\nUse these excerpts to answer. Cite by number like [1].\n\n" . $context;

        return $this->chat->reset()->system(trim($sys))->ask($question);
    }

    // -----------------------------------------------------------------------
    // Internals
    // -----------------------------------------------------------------------

    /**
     * Embed one chunk and append it to the index (and, in database mode,
     * insert it into ai_rag_chunk). Text that is empty after trimming is skipped.
     *
     * @throws \RuntimeException In database mode, when $metadata cannot be
     *                           encoded as JSON.
     */
    protected function addChunk(string $text, array $metadata): void
    {
        $text = trim($text);
        if ($text === '') {
            return;
        }

        $vec    = $this->embed->one($text);
        $packed = Embed::pack($vec);

        // Write to the database first: if the insert fails, the in-memory
        // index must not claim a chunk that was never stored.
        if ($this->backend() === 'database') {
            $metaJson = json_encode($metadata, JSON_UNESCAPED_UNICODE);
            if ($metaJson === false) {
                throw new \RuntimeException(
                    'RAG ingest: chunk metadata is not JSON-encodable: ' . json_last_error_msg()
                );
            }

            \Nibiru\Pdo::insert('ai_rag_chunk', [
                'ai_rag_chunk_collection_id' => $this->dbCollectionId(),
                'ai_rag_chunk_text'          => $text,
                'ai_rag_chunk_metadata'      => $metaJson,
                'ai_rag_chunk_embedding'     => $packed,
                'ai_rag_chunk_token_count'   => $this->estimateTokens($text),
                'ai_rag_chunk_source'        => isset($metadata['source']) ? (string) $metadata['source'] : null,
            ]);
        }

        $this->index['chunks'][]     = ['text' => $text, 'metadata' => $metadata];
        $this->index['embeddings'][] = $packed;
    }

    /**
     * Split a document into chunks of roughly rag_chunk_target tokens.
     *
     * Paragraphs (separated by blank lines) are merged until adding the next
     * one would exceed the target, provided the buffer already holds at least
     * rag_chunk_min tokens. A paragraph larger than rag_chunk_max is split on
     * sentence boundaries and its sentences are packed up to the target.
     * Whitespace-only paragraphs are dropped. Token counts are the crude
     * bytes/4 estimate from estimateTokens().
     *
     * @param string $body Full document text.
     *
     * @return array List of non-blank chunk strings, in document order.
     */
    protected function chunk(string $body): array
    {
        $target = (int) ($this->cfg->rag_chunk_target ?? 600);
        $min    = (int) ($this->cfg->rag_chunk_min ?? 120);
        $max    = (int) ($this->cfg->rag_chunk_max ?? 900);

        // Split on paragraph boundaries first, then merge to target size.
        $paragraphs = preg_split('/\n\s*\n/', $body) ?: [];

        $out       = [];
        $buf       = '';
        $bufTokens = 0;

        foreach ($paragraphs as $p) {
            if (trim($p) === '') {
                // Blank paragraphs would only produce blank chunks.
                continue;
            }

            $pTokens = $this->estimateTokens($p);

            // Flush when the next paragraph would overflow the target. The
            // emptiness guard matters when rag_chunk_min is configured <= 0.
            if ($buf !== '' && $bufTokens + $pTokens > $target && $bufTokens >= $min) {
                $out[]     = $buf;
                $buf       = '';
                $bufTokens = 0;
            }

            if ($pTokens > $max) {
                if ($buf !== '') {
                    $out[]     = $buf;
                    $buf       = '';
                    $bufTokens = 0;
                }
                foreach ($this->splitOversizedParagraph($p, $target) as $piece) {
                    $out[] = $piece;
                }
                continue;
            }

            $buf       .= ($buf === '' ? '' : "\n\n") . $p;
            $bufTokens += $pTokens;
        }

        if ($buf !== '') {
            $out[] = $buf;
        }

        return $out;
    }

    /**
     * Break one oversized paragraph into pieces of at most ~$target tokens.
     *
     * Sentences are found by splitting on whitespace that follows '.', '!' or
     * '?', then packed greedily. Pieces are cut out of the original paragraph
     * by byte offset, so whitespace between sentences of a piece is preserved
     * exactly. A single sentence larger than $target is emitted as-is.
     *
     * @return array List of non-empty pieces.
     */
    private function splitOversizedParagraph(string $paragraph, int $target): array
    {
        $parts = preg_split(
            '/(?<=[.!?])\s+/',
            $paragraph,
            -1,
            PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_NO_EMPTY
        ) ?: [];

        if ($parts === []) {
            return [$paragraph];
        }

        $pieces = [];
        $start  = null; // byte offset where the current piece begins
        $end    = 0;    // byte offset just past the current piece
        $tokens = 0;

        foreach ($parts as [$sentence, $offset]) {
            $sentenceTokens = $this->estimateTokens($sentence);

            if ($start !== null && $tokens + $sentenceTokens > $target) {
                $pieces[] = substr($paragraph, $start, $end - $start);
                $start    = null;
                $tokens   = 0;
            }

            $start ??= $offset;
            $end     = $offset + strlen($sentence);
            $tokens += $sentenceTokens;
        }

        if ($start !== null) {
            $pieces[] = substr($paragraph, $start, $end - $start);
        }

        return $pieces;
    }

    /**
     * Crude token estimate: one token per four bytes, rounded up.
     */
    private function estimateTokens(string $text): int
    {
        return (int) ceil(strlen($text) / 4);
    }

    /**
     * Storage backend, controlled by [AI] rag.storage in ai.ini:
     *   "json"     — single JSON file per collection (default; great for dev)
     *   "database" — uses ai_rag_collection / ai_rag_chunk tables via \Nibiru\Pdo
     *                (recommended for production; survives load-balancer fan-out)
     *
     * Any value other than "database" (case-insensitive) selects "json".
     */
    protected function backend(): string
    {
        $b = strtolower((string) ($this->cfg->rag_storage ?? 'json'));

        return $b === 'database' ? 'database' : 'json';
    }

    /**
     * Absolute path of this collection's JSON file.
     *
     * The rag_storage_path config value is appended to __DIR__ (so it is
     * treated as relative to this file). The directory is created on a
     * best-effort basis; a failure here surfaces later when save() writes.
     */
    protected function storagePath(): string
    {
        $base = $this->cfg->rag_storage_path ?? '/../../application/module/ai/cache/rag/';
        // realpath() returns false while the directory does not exist yet.
        $dir  = realpath(__DIR__ . $base) ?: (__DIR__ . $base);
        if (!is_dir($dir)) {
            @mkdir($dir, 0775, true);
        }

        return rtrim($dir, '/') . '/' . $this->collection . '.json';
    }

    /**
     * Populate the in-memory index from the configured backend, once.
     *
     * A missing, undecodable or structurally invalid JSON file leaves the
     * index empty.
     */
    protected function load(): void
    {
        if ($this->loaded) {
            return;
        }

        if ($this->backend() === 'database') {
            $this->loadFromDatabase();
        } else {
            $path = $this->storagePath();
            if (is_file($path)) {
                $raw = json_decode((string) file_get_contents($path), true);
                if (
                    is_array($raw)
                    && isset($raw['chunks'], $raw['embeddings'])
                    && is_array($raw['chunks'])
                    && is_array($raw['embeddings'])
                ) {
                    $this->index = $raw;
                }
            }
        }

        $this->loaded = true;
    }

    /**
     * Persist the in-memory index (JSON backend only).
     *
     * @throws \RuntimeException When the index cannot be JSON-encoded or the
     *                           file cannot be written. On an encoding failure
     *                           the existing file is left untouched.
     */
    protected function save(): void
    {
        if ($this->backend() === 'database') {
            // database is written incrementally in addChunk(); no-op here.
            return;
        }

        // Encode before touching the file: json_encode() returns false for
        // e.g. invalid UTF-8, and writing that result would truncate the
        // stored collection to an empty file.
        $json = json_encode($this->index, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            throw new \RuntimeException(
                'RAG save: could not encode collection "' . $this->collection . '": ' . json_last_error_msg()
            );
        }

        $path = $this->storagePath();
        if (file_put_contents($path, $json) === false) {
            throw new \RuntimeException("RAG save: could not write $path.");
        }
    }

    /**
     * Load from ai_rag_collection + ai_rag_chunk tables. Uses Nibiru's
     * `\Nibiru\Pdo` adapter — the tables are populated by migrations
     * 200-ai_rag_collection.sql and 201-ai_rag_chunk.sql.
     *
     * Rows are appended to the index in ai_rag_chunk_id order. Metadata
     * stored as a string is JSON-decoded; anything undecodable becomes [].
     */
    protected function loadFromDatabase(): void
    {
        $rows = \Nibiru\Pdo::fetchAll(
            'SELECT c.ai_rag_chunk_text AS text, c.ai_rag_chunk_metadata AS metadata, '
            . '       c.ai_rag_chunk_embedding AS embedding '
            . 'FROM ai_rag_chunk c '
            . 'INNER JOIN ai_rag_collection o ON o.ai_rag_collection_id = c.ai_rag_chunk_collection_id '
            . 'WHERE o.ai_rag_collection_name = :name '
            . 'ORDER BY c.ai_rag_chunk_id',
            [':name' => $this->collection]
        );

        foreach ($rows as $r) {
            $this->index['chunks'][] = [
                'text'     => (string) $r['text'],
                'metadata' => is_string($r['metadata'])
                    ? (json_decode($r['metadata'], true) ?: [])
                    : (array) ($r['metadata'] ?? []),
            ];
            $this->index['embeddings'][] = (string) $r['embedding'];
        }
    }

    /**
     * Resolve (or create) the collection's row in ai_rag_collection.
     * Called lazily by addChunk() in database mode.
     *
     * A new row records the embed_model and embed_dim config values.
     *
     * @return int The collection's ai_rag_collection_id.
     */
    protected function dbCollectionId(): int
    {
        $row = \Nibiru\Pdo::fetchRow(
            'SELECT ai_rag_collection_id AS id FROM ai_rag_collection '
            . 'WHERE ai_rag_collection_name = :name',
            [':name' => $this->collection]
        );
        if ($row && isset($row['id'])) {
            return (int) $row['id'];
        }

        \Nibiru\Pdo::insert('ai_rag_collection', [
            'ai_rag_collection_name'        => $this->collection,
            'ai_rag_collection_embed_model' => (string) ($this->cfg->embed_model ?? ''),
            'ai_rag_collection_embed_dim'   => (int) ($this->cfg->embed_dim ?? 0),
        ]);

        return (int) \Nibiru\Pdo::lastInsertId();
    }

    /**
     * Number of chunks currently in the collection (loads the index if needed).
     */
    public function size(): int
    {
        $this->load();

        return count($this->index['chunks']);
    }
}