This is the snapshot the production landing site (nibiru-framework.com) is deployed from. Brings together the recent splash + docs migration to the v4 "Cosmos" design system, the new in-framework AI module, and the framework groundwork that backs the framework-reference extraction. What lands: - docs/: Astro + Starlight site with the v4 dark cosmic palette, GalaxyHero canvas constellation, Mission Control chat (wired to /api/oracle → api.neuronetz.ai via providers.mjs Ollama), 5-panel MMVC stage (Model · AI · Module · Controller · View), translated EN/DE/JA/ES/FR content, PWA + sitemap + llms.txt + Umami analytics. - docs/design-system/: canonical mockup bundle (source/index-v2.html for splash, source/docs-system.html + preview/ for docs, SPEC.md, tokens). - docs/scripts/extraction/framework-reference-v2.md: deep framework reference (~1.6k lines, file:line citations, every public factory and idiom) — basis for the LoRA training corpus. - application/module/ai/: AI module with chat / embed / RAG / agent plugins, plus pdoQuery / httpGet / fileRead tools and Modelfile + smoke-test in training/. - application/module/users/: user / ACL / form-factory traits used as the reference plugin pattern for the framework docs. - application/settings/config/database/: schema + seed migrations including the AI module tables (200–203). - Form factory + autogenerator changes the framework-reference-v2 covers. Production secrets stay out: docs/.env, settings.production.ini and ai.production.ini are all gitignored (.example files are in tree). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
316 lines
11 KiB
PHP
316 lines
11 KiB
PHP
<?php
|
|
namespace Nibiru\Module\Ai\Plugins;
|
|
|
|
/**
 * Retrieval-Augmented Generation. Ingest text or files, then ask
 * grounded questions backed by cosine retrieval over the chunks.
 *
 *   $rag = $ai->rag('docs');
 *   $rag->ingestText('The dispatcher runs every request.', ['source' => 'note-1']);
 *   $rag->ingestFile(__DIR__ . '/manual.md');
 *   $rag->ingestDir(__DIR__ . '/articles/');
 *
 *   $answer = $rag->ask('How does the dispatcher work?');
 *   $hits   = $rag->search('dispatcher', 5);
 *
 * Storage (selected by backend()):
 *   - "json" (default): a single JSON file per collection at
 *         <storage_path>/<collection>.json
 *     No database needed. Restartable. ~10k chunks fits in memory comfortably.
 *   - "database": the ai_rag_collection / ai_rag_chunk tables via \Nibiru\Pdo.
 *
 * With either backend the whole collection is held in memory once loaded.
 */
class Rag
{
    /** Sanitised collection name; used as the JSON file name and the DB collection name. */
    protected string $collection;

    /** Module configuration (rag_*, embed_*, chat_system_prompt keys are read here). */
    protected \stdClass $cfg;

    protected Chat $chat;

    protected Embed $embed;

    /**
     * In-memory index. `chunks[$i]` and `embeddings[$i]` describe the same chunk.
     *
     * @var array{chunks:array,embeddings:array}
     */
    protected array $index = ['chunks' => [], 'embeddings' => []];

    /** True once the index has been read from storage (or explicitly reset). */
    protected bool $loaded = false;

    /**
     * @param string    $collection Collection name; every character outside [A-Za-z0-9_-]
     *                              is stripped, and an empty result becomes "default".
     * @param \stdClass $cfg        Module configuration.
     * @param Chat      $chat       Chat plugin used by ask().
     * @param Embed     $embed      Embedding plugin used for ingestion and search.
     */
    public function __construct(string $collection, \stdClass $cfg, Chat $chat, Embed $embed)
    {
        $safe = preg_replace('/[^a-z0-9_-]/i', '', $collection);

        // Compare against '' explicitly: a truthiness test would silently turn
        // the perfectly valid collection name "0" into "default".
        $this->collection = ($safe === null || $safe === '') ? 'default' : $safe;
        $this->cfg        = $cfg;
        $this->chat       = $chat;
        $this->embed      = $embed;
    }

    // -----------------------------------------------------------------------
    // Ingestion
    // -----------------------------------------------------------------------

    /**
     * Add a single chunk of text to the collection and persist it.
     *
     * @param string $text     Chunk text; blank text is ignored.
     * @param array  $metadata Arbitrary metadata stored next to the chunk.
     *
     * @throws \RuntimeException When the JSON storage file cannot be written.
     */
    public function ingestText(string $text, array $metadata = []): void
    {
        $this->load();
        $this->addChunk($text, $metadata);
        $this->save();
    }

    /**
     * Read a file, chunk it, and ingest each chunk with ['source' => $path] metadata.
     *
     * @param string $path File to ingest.
     *
     * @return int Number of chunks ingested.
     *
     * @throws \RuntimeException When the file cannot be read or the index cannot be saved.
     */
    public function ingestFile(string $path): int
    {
        if (!is_readable($path)) {
            throw new \RuntimeException("RAG ingest: $path is not readable.");
        }

        $body = file_get_contents($path);
        if ($body === false) {
            // Previously a failed read was cast to '' and the file was silently skipped.
            throw new \RuntimeException("RAG ingest: $path could not be read.");
        }

        $this->load();

        $chunks = $this->chunk($body);
        foreach ($chunks as $piece) {
            $this->addChunk($piece, ['source' => $path]);
        }
        $this->save();

        // chunk() never emits blank chunks, so this matches what addChunk() stored.
        return count($chunks);
    }

    /**
     * Recursively ingest every file under a directory whose name ends in one of
     * the given extensions (case-insensitive).
     *
     * @param string $dir        Directory to walk.
     * @param array  $extensions Extensions without the leading dot.
     *
     * @return int Total number of chunks ingested across all files.
     *
     * @throws \RuntimeException When $dir is not a directory, or a file fails to ingest.
     */
    public function ingestDir(string $dir, array $extensions = ['md', 'txt', 'php']): int
    {
        if (!is_dir($dir)) {
            throw new \RuntimeException("RAG ingest: $dir is not a directory.");
        }

        // An empty list would otherwise build '/\.()$/i', which matches any name ending in '.'.
        if ($extensions === []) {
            return 0;
        }

        // Pass the delimiter so a '/' inside an extension cannot break the pattern.
        $quoted = array_map(
            static fn($ext): string => preg_quote((string) $ext, '/'),
            $extensions
        );
        $extPattern = '/\.(' . implode('|', $quoted) . ')$/i';

        $iter = new \RecursiveIteratorIterator(new \RecursiveDirectoryIterator(
            $dir,
            \RecursiveDirectoryIterator::SKIP_DOTS
        ));

        $count = 0;
        foreach ($iter as $entry) {
            if (!$entry->isFile()) {
                continue;
            }
            if (!preg_match($extPattern, $entry->getFilename())) {
                continue;
            }
            $count += $this->ingestFile($entry->getPathname());
        }

        return $count;
    }

    /**
     * Forget every chunk in this collection.
     *
     * In database mode the collection's chunk rows and its collection row are
     * deleted; in JSON mode the storage file is removed (best effort).
     */
    public function reset(): void
    {
        $this->index  = ['chunks' => [], 'embeddings' => []];
        // Mark as loaded so a later load() does not re-read the data we just dropped.
        $this->loaded = true;

        if ($this->backend() === 'database') {
            \Nibiru\Pdo::query(
                'DELETE c FROM ai_rag_chunk c '
                . 'INNER JOIN ai_rag_collection o ON o.ai_rag_collection_id = c.ai_rag_chunk_collection_id '
                . 'WHERE o.ai_rag_collection_name = :name',
                [':name' => $this->collection]
            );
            \Nibiru\Pdo::delete('ai_rag_collection', ['ai_rag_collection_name' => $this->collection]);
            return;
        }

        $path = $this->storagePath();
        if (is_file($path)) {
            @unlink($path);
        }
    }

    // -----------------------------------------------------------------------
    // Querying
    // -----------------------------------------------------------------------

    /**
     * Top-K cosine similarity search.
     *
     * @param string   $query Text to embed and compare against every stored chunk.
     * @param int|null $k     Maximum number of hits; null uses cfg->rag_top_k (default 6).
     *
     * @return array List of ['score' => ..., 'text' => string, 'metadata' => array],
     *               best score first. Empty when the collection is empty or $k <= 0.
     */
    public function search(string $query, ?int $k = null): array
    {
        $this->load();
        if (empty($this->index['embeddings'])) {
            return [];
        }

        $k = $k ?? (int) ($this->cfg->rag_top_k ?? 6);
        if ($k <= 0) {
            // A negative length makes array_slice() return "all but the last |k|"
            // hits instead of none; bail out before paying for an embedding call.
            return [];
        }

        $qv = $this->embed->one($query);

        $scored = [];
        foreach ($this->index['embeddings'] as $i => $packed) {
            $vec      = Embed::unpack($packed);
            $scored[] = [
                'score'    => Embed::cosine($qv, $vec),
                'text'     => $this->index['chunks'][$i]['text'] ?? '',
                'metadata' => $this->index['chunks'][$i]['metadata'] ?? [],
            ];
        }

        usort($scored, fn($a, $b) => $b['score'] <=> $a['score']);

        return array_slice($scored, 0, $k);
    }

    /**
     * Search the collection, then ask the LLM with the top-K chunks as context.
     *
     * With no hits the question is sent to the chat plugin without extra context.
     *
     * @param string   $question Question to answer.
     * @param int|null $k        Number of excerpts to retrieve (see search()).
     *
     * @return string The chat plugin's answer.
     */
    public function ask(string $question, ?int $k = null): string
    {
        $hits = $this->search($question, $k);
        if (empty($hits)) {
            return $this->chat->reset()->ask($question);
        }

        // Number the excerpts from 1 so the model can cite them as [1], [2], ...
        $context = '';
        foreach ($hits as $i => $h) {
            $context .= '[' . ($i + 1) . '] ' . trim($h['text']) . "\n\n---\n\n";
        }

        // NOTE(review): the configured system prompt is only prepended when the chat
        // has no history, yet the chat is reset() right below — confirm against
        // Chat::reset()/history() that the base prompt is not lost here.
        $sys  = ($this->chat->history() ? '' : (string) ($this->cfg->chat_system_prompt ?? ''));
        $sys .= "\n\nUse these excerpts to answer. Cite by number like [1].\n\n" . $context;

        return $this->chat->reset()->system(trim($sys))->ask($question);
    }

    // -----------------------------------------------------------------------
    // Internals
    // -----------------------------------------------------------------------

    /**
     * Embed one chunk and append it to the index (and to the database in
     * database mode). Blank text is ignored.
     *
     * @param string $text     Chunk text; trimmed before use.
     * @param array  $metadata Metadata stored with the chunk.
     */
    protected function addChunk(string $text, array $metadata): void
    {
        $text = trim($text);
        if ($text === '') {
            return;
        }

        $vec    = $this->embed->one($text);
        $packed = Embed::pack($vec);

        // Write to the database first: if the insert fails, the in-memory index
        // must not contain a chunk that was never persisted.
        if ($this->backend() === 'database') {
            \Nibiru\Pdo::insert('ai_rag_chunk', [
                'ai_rag_chunk_collection_id' => $this->dbCollectionId(),
                'ai_rag_chunk_text'          => $text,
                'ai_rag_chunk_metadata'      => json_encode($metadata, JSON_UNESCAPED_UNICODE),
                'ai_rag_chunk_embedding'     => $packed,
                // Same crude "4 bytes per token" estimate that chunk() uses.
                'ai_rag_chunk_token_count'   => (int) ceil(strlen($text) / 4),
                'ai_rag_chunk_source'        => isset($metadata['source']) ? (string) $metadata['source'] : null,
            ]);
        }

        $this->index['chunks'][]     = ['text' => $text, 'metadata' => $metadata];
        $this->index['embeddings'][] = $packed;
    }

    /**
     * Split a document into chunks.
     *
     * Paragraphs (separated by blank lines) are merged until adding the next
     * one would exceed cfg->rag_chunk_target estimated tokens, provided the
     * buffer already holds at least cfg->rag_chunk_min. A paragraph larger than
     * cfg->rag_chunk_max is emitted sentence by sentence instead. Tokens are
     * estimated as ceil(bytes / 4). Blank paragraphs and sentences are dropped,
     * so every returned chunk contains non-whitespace text.
     *
     * @param string $body Raw document text.
     *
     * @return array List of chunk strings in document order.
     */
    protected function chunk(string $body): array
    {
        $target = (int) ($this->cfg->rag_chunk_target ?? 600);
        $min    = (int) ($this->cfg->rag_chunk_min ?? 120);
        $max    = (int) ($this->cfg->rag_chunk_max ?? 900);

        // Split on paragraph boundaries first, then merge to target size.
        $paragraphs = preg_split('/\n\s*\n/', $body) ?: [];

        $out       = [];
        $buf       = '';
        $bufTokens = 0;

        foreach ($paragraphs as $p) {
            // Whitespace-only paragraphs (blank file, leading/trailing padding) would
            // become chunks that addChunk() throws away, inflating ingest counts.
            if (trim($p) === '') {
                continue;
            }

            $pTokens = (int) ceil(strlen($p) / 4); // crude

            // Flush before the buffer would overshoot the target — but only once
            // it is big enough to be worth a chunk of its own.
            if ($bufTokens + $pTokens > $target && $bufTokens >= $min) {
                $out[]     = $buf;
                $buf       = '';
                $bufTokens = 0;
            }

            if ($pTokens > $max) {
                if ($buf !== '') {
                    $out[]     = $buf;
                    $buf       = '';
                    $bufTokens = 0;
                }
                // Split overlarge paragraph on sentence boundary.
                $sentences = preg_split('/(?<=[.!?])\s+/', $p) ?: [$p];
                foreach ($sentences as $s) {
                    if (trim($s) !== '') {
                        $out[] = $s;
                    }
                }
                continue;
            }

            $buf       .= ($buf === '' ? '' : "\n\n") . $p;
            $bufTokens += $pTokens;
        }

        if ($buf !== '') {
            $out[] = $buf;
        }

        return $out;
    }

    /**
     * Storage backend, controlled by [AI] rag.storage in ai.ini:
     *   "json"     — single JSON file per collection (default; great for dev)
     *   "database" — uses ai_rag_collection / ai_rag_chunk tables via \Nibiru\Pdo
     *                (recommended for production; survives load-balancer fan-out)
     *
     * @return string Either "database" or "json"; any unrecognised value means "json".
     */
    protected function backend(): string
    {
        $b = strtolower((string) ($this->cfg->rag_storage ?? 'json'));

        return $b === 'database' ? 'database' : 'json';
    }

    /**
     * Absolute path of this collection's JSON file. The directory is created
     * on demand (best effort; a failure surfaces when save() tries to write).
     *
     * cfg->rag_storage_path is appended to this file's directory, so it is
     * interpreted relative to __DIR__ even when it starts with a slash.
     * NOTE(review): verify the default resolves to the intended cache directory.
     */
    protected function storagePath(): string
    {
        $base = $this->cfg->rag_storage_path
            ?? '/../../application/module/ai/cache/rag/';

        // realpath() returns false while the directory does not exist yet.
        $dir = realpath(__DIR__ . $base) ?: (__DIR__ . $base);
        if (!is_dir($dir)) {
            @mkdir($dir, 0775, true);
        }

        return rtrim($dir, '/') . '/' . $this->collection . '.json';
    }

    /**
     * Populate the in-memory index from the active backend. Runs at most once
     * per instance; a missing or malformed JSON file leaves the index empty.
     */
    protected function load(): void
    {
        if ($this->loaded) {
            return;
        }

        if ($this->backend() === 'database') {
            $this->loadFromDatabase();
        } else {
            $path = $this->storagePath();
            if (is_file($path)) {
                $raw = json_decode((string) file_get_contents($path), true);
                // Require both lists to really be arrays so a damaged file cannot
                // put scalars where search()/size() expect to iterate and count.
                if (
                    is_array($raw)
                    && isset($raw['chunks'], $raw['embeddings'])
                    && is_array($raw['chunks'])
                    && is_array($raw['embeddings'])
                ) {
                    $this->index = $raw;
                }
            }
        }

        $this->loaded = true;
    }

    /**
     * Persist the in-memory index. No-op in database mode.
     *
     * @throws \RuntimeException When the index cannot be encoded or the file
     *                           cannot be written.
     */
    protected function save(): void
    {
        if ($this->backend() === 'database') {
            // database is written incrementally in addChunk(); no-op here.
            return;
        }

        // Encode before touching the file: json_encode() returns false on e.g.
        // invalid UTF-8, and writing that "false" would truncate the stored
        // collection to an empty file.
        $json = json_encode($this->index, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            throw new \RuntimeException(
                'RAG save: could not encode collection "' . $this->collection . '": ' . json_last_error_msg()
            );
        }

        $path = $this->storagePath();
        if (file_put_contents($path, $json) === false) {
            throw new \RuntimeException("RAG save: could not write $path.");
        }
    }

    /**
     * Load from ai_rag_collection + ai_rag_chunk tables. Uses Nibiru's
     * `\Nibiru\Pdo` adapter — the tables are populated by migrations
     * 200-ai_rag_collection.sql and 201-ai_rag_chunk.sql.
     *
     * Rows are appended to the index in ai_rag_chunk_id order.
     */
    protected function loadFromDatabase(): void
    {
        $rows = \Nibiru\Pdo::fetchAll(
            'SELECT c.ai_rag_chunk_text AS text, c.ai_rag_chunk_metadata AS metadata, '
            . '       c.ai_rag_chunk_embedding AS embedding '
            . 'FROM ai_rag_chunk c '
            . 'INNER JOIN ai_rag_collection o ON o.ai_rag_collection_id = c.ai_rag_chunk_collection_id '
            . 'WHERE o.ai_rag_collection_name = :name '
            . 'ORDER BY c.ai_rag_chunk_id',
            [':name' => $this->collection]
        );

        foreach ($rows as $r) {
            // Metadata is written as a JSON string by addChunk(); anything else is cast to an array.
            $metadata = is_string($r['metadata'])
                ? (json_decode($r['metadata'], true) ?: [])
                : (array) ($r['metadata'] ?? []);

            $this->index['chunks'][]     = ['text' => (string) $r['text'], 'metadata' => $metadata];
            $this->index['embeddings'][] = (string) $r['embedding'];
        }
    }

    /**
     * Resolve (or create) the collection's row in ai_rag_collection.
     * Called lazily by addChunk() in database mode.
     *
     * @return int The collection's ai_rag_collection_id.
     */
    protected function dbCollectionId(): int
    {
        $row = \Nibiru\Pdo::fetchRow(
            'SELECT ai_rag_collection_id AS id FROM ai_rag_collection '
            . 'WHERE ai_rag_collection_name = :name',
            [':name' => $this->collection]
        );
        if ($row && isset($row['id'])) {
            return (int) $row['id'];
        }

        \Nibiru\Pdo::insert('ai_rag_collection', [
            'ai_rag_collection_name'        => $this->collection,
            'ai_rag_collection_embed_model' => (string) ($this->cfg->embed_model ?? ''),
            'ai_rag_collection_embed_dim'   => (int) ($this->cfg->embed_dim ?? 0),
        ]);

        return (int) \Nibiru\Pdo::lastInsertId();
    }

    /**
     * Number of chunks currently in the collection (loads the index if needed).
     */
    public function size(): int
    {
        $this->load();

        return count($this->index['chunks']);
    }
}
|