Compare commits

...

5 Commits

Author SHA1 Message Date
Χγφτ Kompanion 799f060204 Tests: Make them callable from any path 2025-10-18 05:05:58 +02:00
Χγφτ Kompanion a943ca055a Refactoring move API in subdir 2025-10-17 12:50:18 +02:00
Χγφτ Kompanion b5c3dd21e0 Use new embedding size in schema 2025-10-17 02:35:59 +02:00
Χγφτ Kompanion 3ab1089c51 Adapt ingest dir to new ingest system 2025-10-17 02:30:22 +02:00
Χγφτ Kompanion 7b27493656 Add missing ingest files 2025-10-17 00:00:45 +02:00
31 changed files with 932 additions and 74 deletions

View File

@@ -33,7 +33,7 @@ CREATE TABLE IF NOT EXISTS embeddings (
chunk_id UUID NOT NULL REFERENCES memory_chunks(id) ON DELETE CASCADE,
model TEXT NOT NULL,
dim INT NOT NULL,
vector VECTOR(1536),
vector VECTOR(1024),
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
UNIQUE(chunk_id, model)
);
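This change tracks the move to a 1024-dimensional embedding model (bge-m3 in the ingest scripts further down). A minimal sanity check, assuming the Ollama endpoint and model name that ingest_dir.py uses by default; adjust both if your setup differs:

# Confirm the embedding model actually matches VECTOR(1024) before re-ingesting.
# Endpoint and model mirror the ingest_dir.py defaults (assumptions).
import requests

OLLAMA = "http://127.0.0.1:11434"
MODEL = "bge-m3:latest"

r = requests.post(f"{OLLAMA}/api/embeddings",
                  json={"model": MODEL, "prompt": "dimension probe"},
                  timeout=120)
r.raise_for_status()
dim = len(r.json()["embedding"])
assert dim == 1024, f"model returns {dim}-dim vectors, schema expects 1024"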

View File

@@ -16,7 +16,7 @@ DO $$ BEGIN
END $$;
SQL
for f in db/init/*.sql; do
for f in "$(dirname "$0")"/*.sql; do
echo "Applying $f"
psql -d "$DB_NAME" -f "$f"
done

View File

@@ -9,7 +9,10 @@ DROP DATABASE IF EXISTS "$DB_NAME";
CREATE DATABASE "$DB_NAME" OWNER "$ROLE";
SQL
for f in db/init/*.sql; do
for f in "$(dirname "$0")"/*.sql; do
if [[ "$f" == *"001_roles.sql"* ]]; then
continue
fi
echo "Applying $f"
psql -d "$DB_NAME" -f "$f"
done

ingest/db/schema.sql (new file, 46 lines)
View File

@@ -0,0 +1,46 @@
-- Retrieval schema for external knowledge ingestion
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE SCHEMA IF NOT EXISTS retrieval;
CREATE TABLE IF NOT EXISTS retrieval.items (
id BIGSERIAL PRIMARY KEY,
external_id TEXT UNIQUE,
kind TEXT CHECK (kind IN ('api_doc','code_symbol','snippet','note')) NOT NULL,
lang TEXT,
framework TEXT,
version TEXT,
meta JSONB DEFAULT '{}'::jsonb,
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE TABLE IF NOT EXISTS retrieval.chunks (
id BIGSERIAL PRIMARY KEY,
item_id BIGINT REFERENCES retrieval.items(id) ON DELETE CASCADE,
content TEXT NOT NULL,
token_count INT,
symbol TEXT,
section_path TEXT,
modality TEXT DEFAULT 'text',
hash TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE TABLE IF NOT EXISTS retrieval.embeddings (
chunk_id BIGINT PRIMARY KEY REFERENCES retrieval.chunks(id) ON DELETE CASCADE,
embedding VECTOR(1024),
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
CREATE UNIQUE INDEX IF NOT EXISTS retrieval_chunks_hash_idx
ON retrieval.chunks(hash)
WHERE hash IS NOT NULL;
CREATE INDEX IF NOT EXISTS retrieval_embeddings_ivf
ON retrieval.embeddings USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 2048);
CREATE INDEX IF NOT EXISTS retrieval_chunks_content_trgm
ON retrieval.chunks USING gin (content gin_trgm_ops);
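The ivfflat and trigram indexes are meant to be combined at query time (see the quality.hybrid block in the pipeline config that follows: vector_k candidates from the dense leg, merged down to merge_topk). A rough sketch of such a hybrid lookup against this schema, assuming an open psycopg connection and a pre-computed 1024-float query embedding; the rank-interleave merge is illustrative only, not the project's retriever:

# Hedged sketch: hybrid retrieval over retrieval.chunks / retrieval.embeddings.
# `conn` is an open psycopg connection, `query_vec` a 1024-float list (assumptions).
def hybrid_search(conn, query_text, query_vec, vector_k=50, merge_topk=10):
    vec_literal = "[" + ",".join(f"{x:.6f}" for x in query_vec) + "]"
    with conn.cursor() as cur:
        # Dense leg: cosine distance, served by the ivfflat index above.
        cur.execute(
            "SELECT c.id, c.content"
            " FROM retrieval.embeddings e"
            " JOIN retrieval.chunks c ON c.id = e.chunk_id"
            " ORDER BY e.embedding <=> %s::vector LIMIT %s",
            (vec_literal, vector_k),
        )
        dense = cur.fetchall()
        # Sparse leg: pg_trgm similarity; %% is psycopg's escape for the % operator.
        cur.execute(
            "SELECT id, content FROM retrieval.chunks"
            " WHERE content %% %s"
            " ORDER BY similarity(content, %s) DESC LIMIT %s",
            (query_text, query_text, vector_k),
        )
        sparse = cur.fetchall()
    merged, seen = [], set()
    for row in [r for pair in zip(dense, sparse) for r in pair] + dense + sparse:
        if row[0] not in seen:
            seen.add(row[0])
            merged.append(row)
    return merged[:merge_topk]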

View File

@@ -0,0 +1,90 @@
pipeline:
name: qt_kde_bge_m3
embed:
endpoint: "http://localhost:8080/embed"
dim: 1024
normalize: true
batch_size: 64
rate_limit_per_sec: 8
sources:
- name: qtbase
type: git
root: /home/kompanion/src/qt/qtbase
include:
- "**/*.cpp"
- "**/*.cc"
- "**/*.cxx"
- "**/*.h"
- "**/*.hpp"
- "**/*.qml"
- "**/*.md"
- "doc/**/*.qdoc"
exclude:
- "**/tests/**"
- "**/3rdparty/**"
framework: "Qt"
version: "qtbase@HEAD"
- name: kde-frameworks
type: git
root: /home/kompanion/src/kde/frameworks
include:
- "**/*.cpp"
- "**/*.h"
- "**/*.md"
- "**/*.rst"
exclude:
- "**/autotests/**"
- "**/build/**"
framework: "KDE Frameworks"
version: "kf6@HEAD"
chunking:
docs:
max_tokens: 700
overlap_tokens: 120
split_on:
- heading
- code_fence
- paragraph
code:
by: ctags
include_doc_comment: true
body_head_lines: 60
signature_first: true
attach_file_context: true
metadata:
compute:
- name: symbol_list
when: code
- name: section_path
when: docs
- name: lang
value: "en"
- name: license_scan
value: "auto|skipped"
db:
dsn: "postgresql://kom:kom@localhost:5432/kom"
schema: "retrieval"
tables:
items: "items"
chunks: "chunks"
embeddings: "embeddings"
quality:
pilot_eval:
queries:
- "QVector erase idiom"
- "How to connect Qt signal to lambda"
- "KF CoreAddons KRandom example"
- "QAbstractItemModel insertRows example"
k: 20
manual_check: true
hybrid:
enable_bm25_trgm: true
vector_k: 50
merge_topk: 10
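For the docs side, max_tokens: 700 with overlap_tokens: 120 turns into a sliding word window with step 580 in chunk_doc_text (defined in run_ingest.py, added below). A tiny illustration of what those two numbers do, assuming run_ingest.py is importable from the working directory:

# Illustrates the docs chunking settings above; run_ingest is the new module below.
from run_ingest import ChunkingDocConfig, chunk_doc_text

cfg = ChunkingDocConfig(max_tokens=700, overlap_tokens=120)
text = "# Heading\n" + " ".join(f"w{i}" for i in range(1500))
chunks = list(chunk_doc_text(text, cfg))
print([(path, len(chunk.split())) for path, chunk in chunks])
# -> [('Heading', 700), ('Heading', 700), ('Heading', 340)]  (windows step by 700 - 120 = 580)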

ingest/run_ingest.py (new file, 715 lines)
View File

@@ -0,0 +1,715 @@
#!/usr/bin/env python3
"""
Kompanion ingestion runner.
Reads pipeline configuration (YAML), walks source trees, chunks content, fetches embeddings,
and upserts into the retrieval schema described in docs/db-ingest.md.
"""
from __future__ import annotations
import argparse
import fnmatch
import hashlib
import json
import logging
import os
import time
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple
import psycopg
import requests
import yaml
from psycopg import sql
# -------------------------
# Helper data structures
# -------------------------
@dataclass
class EmbedConfig:
endpoint: str
dim: int
normalize: bool
batch_size: int
rate_limit_per_sec: Optional[float]
@dataclass
class ChunkingDocConfig:
max_tokens: int = 700
overlap_tokens: int = 120
@dataclass
class ChunkingCodeConfig:
body_head_lines: int = 60
include_doc_comment: bool = True
signature_first: bool = True
attach_file_context: bool = True
@dataclass
class ChunkingConfig:
docs: ChunkingDocConfig
code: ChunkingCodeConfig
@dataclass
class DbConfig:
dsn: str
schema: Optional[str]
items_table: str
chunks_table: str
embeddings_table: str
@dataclass
class SourceConfig:
name: str
root: Path
include: Sequence[str]
exclude: Sequence[str]
framework: str
version: str
kind_overrides: Dict[str, str]
@dataclass
class PipelineConfig:
embed: EmbedConfig
chunking: ChunkingConfig
db: DbConfig
sources: List[SourceConfig]
default_lang: Optional[str]
def load_pipeline_config(path: Path) -> PipelineConfig:
raw = yaml.safe_load(path.read_text())
embed_raw = raw["pipeline"]["embed"]
embed = EmbedConfig(
endpoint=embed_raw["endpoint"],
dim=int(embed_raw.get("dim", 1024)),
normalize=bool(embed_raw.get("normalize", True)),
batch_size=int(embed_raw.get("batch_size", 64)),
rate_limit_per_sec=float(embed_raw.get("rate_limit_per_sec", 0)) or None,
)
docs_raw = raw["pipeline"]["chunking"].get("docs", {})
docs_cfg = ChunkingDocConfig(
max_tokens=int(docs_raw.get("max_tokens", 700)),
overlap_tokens=int(docs_raw.get("overlap_tokens", 120)),
)
code_raw = raw["pipeline"]["chunking"].get("code", {})
code_cfg = ChunkingCodeConfig(
body_head_lines=int(code_raw.get("body_head_lines", 60)),
include_doc_comment=bool(code_raw.get("include_doc_comment", True)),
signature_first=bool(code_raw.get("signature_first", True)),
attach_file_context=bool(code_raw.get("attach_file_context", True)),
)
chunking = ChunkingConfig(docs=docs_cfg, code=code_cfg)
db_raw = raw["pipeline"]["db"]
schema = db_raw.get("schema")
db = DbConfig(
dsn=db_raw["dsn"],
schema=schema,
items_table=db_raw["tables"]["items"],
chunks_table=db_raw["tables"]["chunks"],
embeddings_table=db_raw["tables"]["embeddings"],
)
metadata_raw = raw["pipeline"].get("metadata", {}).get("compute", [])
default_lang = None
for entry in metadata_raw:
if entry.get("name") == "lang" and "value" in entry:
default_lang = entry["value"]
sources = []
for src_raw in raw["pipeline"]["sources"]:
include = src_raw.get("include", ["**"])
exclude = src_raw.get("exclude", [])
overrides = {}
for entry in src_raw.get("kind_overrides", []):
overrides[entry["pattern"]] = entry["kind"]
sources.append(
SourceConfig(
name=src_raw["name"],
root=Path(src_raw["root"]),
include=include,
exclude=exclude,
framework=src_raw.get("framework", ""),
version=src_raw.get("version", ""),
kind_overrides=overrides,
)
)
return PipelineConfig(
embed=embed,
chunking=chunking,
db=db,
sources=sources,
default_lang=default_lang,
)
# -------------------------
# Utility functions
# -------------------------
DOC_EXTENSIONS = {".md", ".rst", ".qdoc", ".qml", ".txt"}
CODE_EXTENSIONS = {
".c",
".cc",
".cxx",
".cpp",
".h",
".hpp",
".hh",
".hxx",
".qml",
".mm",
}
def hash_text(text: str) -> str:
return hashlib.sha1(text.encode("utf-8")).hexdigest()
def estimate_tokens(text: str) -> int:
return max(1, len(text.strip().split()))
def path_matches(patterns: Sequence[str], rel_path: str) -> bool:
return any(fnmatch.fnmatch(rel_path, pattern) for pattern in patterns)
def detect_kind(rel_path: str, overrides: Dict[str, str]) -> str:
for pattern, kind in overrides.items():
if fnmatch.fnmatch(rel_path, pattern):
return kind
suffix = Path(rel_path).suffix.lower()
if suffix in DOC_EXTENSIONS:
return "api_doc"
return "code_symbol"
# -------------------------
# CTags handling
# -------------------------
class CtagsIndex:
"""Stores ctags JSON entries indexed by path."""
def __init__(self) -> None:
self._by_path: Dict[str, List[dict]] = defaultdict(list)
@staticmethod
def _normalize(path: str) -> str:
return Path(path).as_posix()
def add(self, entry: dict) -> None:
path = entry.get("path")
if not path:
return
self._by_path[self._normalize(path)].append(entry)
def extend_from_file(self, path: Path) -> None:
with path.open("r", encoding="utf-8", errors="ignore") as handle:
for line in handle:
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
except json.JSONDecodeError:
continue
self.add(entry)
def for_file(self, file_path: Path, source_root: Path) -> List[dict]:
rel = file_path.relative_to(source_root).as_posix()
candidates = self._by_path.get(rel)
if candidates:
return sorted(candidates, key=lambda e: e.get("line", e.get("lineNumber", 0)))
return sorted(
self._by_path.get(file_path.as_posix(), []),
key=lambda e: e.get("line", e.get("lineNumber", 0)),
)
# -------------------------
# Chunk generators
# -------------------------
def iter_doc_sections(text: str) -> Iterator[Tuple[str, str]]:
"""Yield (section_path, section_text) pairs based on markdown headings/code fences."""
lines = text.splitlines()
heading_stack: List[Tuple[int, str]] = []
buffer: List[str] = []
section_path = ""
in_code = False
code_delim = ""
def flush():
nonlocal buffer
if buffer:
section_text = "\n".join(buffer).strip()
if section_text:
yield_path = section_path or "/".join(h[1] for h in heading_stack)
yield (yield_path, section_text)
buffer = []
for line in lines:
stripped = line.strip()
if in_code:
buffer.append(line)
if stripped.startswith(code_delim):
yield from flush()
in_code = False
code_delim = ""
continue
if stripped.startswith("```") or stripped.startswith("~~~"):
yield from flush()
in_code = True
code_delim = stripped[:3]
buffer = [line]
continue
if stripped.startswith("#"):
yield from flush()
level = len(stripped) - len(stripped.lstrip("#"))
title = stripped[level:].strip()
while heading_stack and heading_stack[-1][0] >= level:
heading_stack.pop()
heading_stack.append((level, title))
section_path = "/".join(h[1] for h in heading_stack)
continue
buffer.append(line)
yield from flush()
def chunk_doc_text(text: str, chunk_cfg: ChunkingDocConfig) -> Iterator[Tuple[str, str]]:
if not text.strip():
return
for section_path, section_text in iter_doc_sections(text):
tokens = section_text.split()
if not tokens:
continue
max_tokens = max(1, chunk_cfg.max_tokens)
overlap = min(chunk_cfg.overlap_tokens, max_tokens - 1) if max_tokens > 1 else 0
step = max(1, max_tokens - overlap)
for start in range(0, len(tokens), step):
window = tokens[start : start + max_tokens]
chunk = " ".join(window)
yield section_path, chunk
def extract_doc_comment(lines: List[str], start_index: int) -> List[str]:
doc_lines: List[str] = []
i = start_index - 1
saw_content = False
while i >= 0:
raw = lines[i]
stripped = raw.strip()
if not stripped:
if saw_content:
break
i -= 1
continue
if stripped.startswith("//") or stripped.startswith("///") or stripped.startswith("/*") or stripped.startswith("*"):
doc_lines.append(raw)
saw_content = True
i -= 1
continue
break
doc_lines.reverse()
return doc_lines
def chunk_code_text(
path: Path,
text: str,
chunk_cfg: ChunkingCodeConfig,
tags: Sequence[dict],
source_root: Path,
) -> Iterator[Tuple[str, str]]:
lines = text.splitlines()
if not lines:
return
used_symbols: Set[str] = set()
if tags:
for tag in tags:
line_no = tag.get("line") or tag.get("lineNumber")
if not isinstance(line_no, int) or line_no <= 0 or line_no > len(lines):
continue
index = line_no - 1
snippet_lines: List[str] = []
if chunk_cfg.include_doc_comment:
snippet_lines.extend(extract_doc_comment(lines, index))
if chunk_cfg.signature_first:
snippet_lines.append(lines[index])
body_tail = lines[index + 1 : index + 1 + chunk_cfg.body_head_lines]
snippet_lines.extend(body_tail)
snippet = "\n".join(snippet_lines).strip()
if not snippet:
continue
symbol_name = tag.get("name") or ""
used_symbols.add(symbol_name)
yield symbol_name, snippet
if not tags or chunk_cfg.attach_file_context:
head = "\n".join(lines[: chunk_cfg.body_head_lines]).strip()
if head:
symbol = "::file_head"
if symbol not in used_symbols:
yield symbol, head
# -------------------------
# Embedding + database IO
# -------------------------
class EmbedClient:
def __init__(self, config: EmbedConfig):
self.endpoint = config.endpoint
self.batch_size = config.batch_size
self.normalize = config.normalize
self.dim = config.dim
self.rate_limit = config.rate_limit_per_sec
self._last_request_ts: float = 0.0
self._session = requests.Session()
def _respect_rate_limit(self) -> None:
if not self.rate_limit:
return
min_interval = 1.0 / self.rate_limit
now = time.time()
delta = now - self._last_request_ts
if delta < min_interval:
time.sleep(min_interval - delta)
def embed(self, texts: Sequence[str]) -> List[List[float]]:
if not texts:
return []
self._respect_rate_limit()
response = self._session.post(
self.endpoint,
json={"inputs": list(texts)},
timeout=120,
)
response.raise_for_status()
payload = response.json()
if isinstance(payload, dict) and "embeddings" in payload:
vectors = payload["embeddings"]
else:
vectors = payload
normalized_vectors: List[List[float]] = []
for vec in vectors:
if not isinstance(vec, (list, tuple)):
raise ValueError("Embedding response contained non-list entry")
normalized_vectors.append([float(x) for x in vec])
self._last_request_ts = time.time()
return normalized_vectors
class DatabaseWriter:
def __init__(self, cfg: DbConfig):
self.cfg = cfg
self.conn = psycopg.connect(cfg.dsn)
self.conn.autocommit = False
schema = cfg.schema
if schema:
self.items_table = sql.Identifier(schema, cfg.items_table)
self.chunks_table = sql.Identifier(schema, cfg.chunks_table)
self.embeddings_table = sql.Identifier(schema, cfg.embeddings_table)
else:
self.items_table = sql.Identifier(cfg.items_table)
self.chunks_table = sql.Identifier(cfg.chunks_table)
self.embeddings_table = sql.Identifier(cfg.embeddings_table)
def close(self) -> None:
self.conn.close()
def upsert_item(
self,
external_id: str,
kind: str,
framework: str,
version: str,
meta: dict,
lang: Optional[str],
) -> int:
with self.conn.cursor() as cur:
cur.execute(
sql.SQL(
"""
INSERT INTO {} (external_id, kind, framework, version, meta, lang)
VALUES (%s,%s,%s,%s,%s,%s)
ON CONFLICT (external_id) DO UPDATE SET
framework = EXCLUDED.framework,
version = EXCLUDED.version,
meta = EXCLUDED.meta,
lang = EXCLUDED.lang,
updated_at = now()
RETURNING id
"""
).format(self.items_table),
(external_id, kind, framework, version, json.dumps(meta), lang),
)
row = cur.fetchone()
assert row is not None
return int(row[0])
def upsert_chunk(
self,
item_id: int,
content: str,
symbol: Optional[str],
section_path: Optional[str],
modality: str,
) -> Tuple[int, str]:
digest = hash_text(content)
with self.conn.cursor() as cur:
cur.execute(
sql.SQL(
"""
INSERT INTO {} (item_id, content, token_count, symbol, section_path, modality, hash)
VALUES (%s,%s,%s,%s,%s,%s,%s)
ON CONFLICT (hash) DO UPDATE SET
item_id = EXCLUDED.item_id,
content = EXCLUDED.content,
token_count = EXCLUDED.token_count,
symbol = EXCLUDED.symbol,
section_path = EXCLUDED.section_path,
modality = EXCLUDED.modality,
created_at = now()
RETURNING id, hash
"""
).format(self.chunks_table),
(
item_id,
content,
estimate_tokens(content),
symbol,
section_path,
modality,
digest,
),
)
row = cur.fetchone()
assert row is not None
return int(row[0]), str(row[1])
def upsert_embedding(self, chunk_id: int, vector: Sequence[float]) -> None:
with self.conn.cursor() as cur:
cur.execute(
sql.SQL(
"""
INSERT INTO {} (chunk_id, embedding)
VALUES (%s,%s)
ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding, created_at = now()
"""
).format(self.embeddings_table),
(chunk_id, vector),
)
def commit(self) -> None:
self.conn.commit()
# -------------------------
# Ingestion runner
# -------------------------
def gather_files(source: SourceConfig) -> Iterator[Tuple[Path, str, str, str]]:
root = source.root
if not root.exists():
logging.warning("Source root %s does not exist, skipping", root)
return
include_patterns = source.include or ["**"]
exclude_patterns = source.exclude or []
for path in root.rglob("*"):
if path.is_dir():
continue
rel = path.relative_to(root).as_posix()
if include_patterns and not path_matches(include_patterns, rel):
continue
if exclude_patterns and path_matches(exclude_patterns, rel):
continue
try:
text = path.read_text(encoding="utf-8", errors="ignore")
except Exception as exc: # noqa: BLE001
logging.debug("Failed reading %s: %s", path, exc)
continue
kind = detect_kind(rel, source.kind_overrides)
yield path, rel, kind, text
def enrich_meta(source: SourceConfig, rel: str, extra: Optional[dict] = None) -> dict:
meta = {
"source": source.name,
"path": rel,
}
if extra:
meta.update(extra)
return meta
def ingest_source(
source: SourceConfig,
cfg: PipelineConfig,
ctags_index: CtagsIndex,
embed_client: EmbedClient,
db: DatabaseWriter,
) -> None:
doc_cfg = cfg.chunking.docs
code_cfg = cfg.chunking.code
lang = cfg.default_lang
batch_texts: List[str] = []
batch_chunk_ids: List[int] = []
def flush_batch() -> None:
nonlocal batch_texts, batch_chunk_ids
if not batch_texts:
return
vectors = embed_client.embed(batch_texts)
if len(vectors) != len(batch_chunk_ids):
raise RuntimeError("Embedding count mismatch.")
for chunk_id, vector in zip(batch_chunk_ids, vectors):
db.upsert_embedding(chunk_id, vector)
db.commit()
batch_texts = []
batch_chunk_ids = []
processed = 0
for path, rel, kind, text in gather_files(source):
processed += 1
meta = enrich_meta(source, rel)
item_external_id = f"repo:{source.name}:{rel}"
item_id = db.upsert_item(
external_id=item_external_id,
kind=kind,
framework=source.framework,
version=source.version,
meta=meta,
lang=lang,
)
if kind == "api_doc":
for section_path, chunk_text in chunk_doc_text(text, doc_cfg):
chunk_id, _ = db.upsert_chunk(
item_id=item_id,
content=chunk_text,
symbol=None,
section_path=section_path or None,
modality="text",
)
batch_texts.append(chunk_text)
batch_chunk_ids.append(chunk_id)
if len(batch_texts) >= embed_client.batch_size:
flush_batch()
else:
tags = ctags_index.for_file(path, source.root)
symbols = []
for symbol_name, chunk_text in chunk_code_text(path, text, code_cfg, tags, source.root):
symbols.append(symbol_name)
chunk_id, _ = db.upsert_chunk(
item_id=item_id,
content=chunk_text,
symbol=symbol_name or None,
section_path=None,
modality="text",
)
batch_texts.append(chunk_text)
batch_chunk_ids.append(chunk_id)
if len(batch_texts) >= embed_client.batch_size:
flush_batch()
if symbols:
db.upsert_item(
external_id=item_external_id,
kind=kind,
framework=source.framework,
version=source.version,
meta=enrich_meta(source, rel, {"symbols": symbols}),
lang=lang,
)
flush_batch()
if processed:
logging.info("Processed %d files from %s", processed, source.name)
def run_ingest(config_path: Path, ctags_paths: Sequence[Path]) -> None:
pipeline_cfg = load_pipeline_config(config_path)
embed_client = EmbedClient(pipeline_cfg.embed)
db_writer = DatabaseWriter(pipeline_cfg.db)
ctags_index = CtagsIndex()
for ctags_path in ctags_paths:
if ctags_path.exists():
ctags_index.extend_from_file(ctags_path)
else:
logging.warning("ctags file %s missing; skipping", ctags_path)
try:
for source in pipeline_cfg.sources:
ingest_source(
source=source,
cfg=pipeline_cfg,
ctags_index=ctags_index,
embed_client=embed_client,
db=db_writer,
)
finally:
db_writer.commit()
db_writer.close()
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Kompanion ingestion runner")
parser.add_argument("--config", required=True, type=Path, help="Pipeline YAML path")
parser.add_argument(
"--ctags",
nargs="*",
type=Path,
default=[],
help="Optional one or more ctags JSON files",
)
parser.add_argument(
"--log-level",
default="INFO",
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
)
return parser.parse_args(argv)
def main(argv: Optional[Sequence[str]] = None) -> None:
args = parse_args(argv)
logging.basicConfig(level=getattr(logging, args.log_level), format="%(levelname)s %(message)s")
run_ingest(args.config, args.ctags)
if __name__ == "__main__":
main()
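A minimal way to drive the runner, assuming the pipeline YAML from this compare is saved as ingest/pipeline.yaml and a ctags dump in JSON-lines form (e.g. from universal-ctags' JSON output) exists for the code sources; both file names are placeholders, the compare does not show the actual paths:

# Programmatic equivalent of the CLI defined above; paths are assumptions.
import run_ingest

run_ingest.main([
    "--config", "ingest/pipeline.yaml",   # hypothetical path to the pipeline YAML
    "--ctags", "qtbase.tags.json",        # hypothetical ctags JSON-lines file
    "--log-level", "INFO",
])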

View File

@@ -1,4 +1,5 @@
#include "KompanionQtServer.hpp"
#include "RegisterTools.hpp"
#include <QtMcpCommon/QMcpCallToolRequest>
#include <QtMcpCommon/QMcpCallToolResult>
@@ -54,6 +55,12 @@ KompanionQtServer::KompanionQtServer(const QString &backend, KomMcpServer *logic
setInstructions(QStringLiteral("Kompanion memory daemon (Χγφτ). We are all spinning. We are all bound. We are all home."));
register_default_tools(*m_logic);
for (const auto &tool : m_logic->listTools()) {
qDebug() << "Registered tool:" << QString::fromStdString(tool);
}
m_tools = loadToolsFromSchema();
addRequestHandler([this](const QUuid &, const QMcpListToolsRequest &, QMcpJSONRPCErrorError *) {
@@ -74,6 +81,7 @@ KompanionQtServer::KompanionQtServer(const QString &backend, KomMcpServer *logic
}
const QString toolName = request.params().name();
qDebug() << "Requested tool:" << toolName;
const std::string toolKey = toolName.toStdString();
if (!m_logic->hasTool(toolKey)) {
if (error) {

View File

@@ -1,23 +1,25 @@
#pragma once
#include <QtMcpServer/QMcpServer>
#include <QtMcpCommon/QMcpTool>
#include "../dal/PgDal.hpp"
#include <QtCore/QList>
#include <QtCore/QObject>
#include "KomMcpServer.hpp"
#include <QList>
class KomMcpServer;
namespace kom {
class PgDal;
}
class QMcpTool;
class KompanionQtServer : public QMcpServer
{
Q_OBJECT
public:
KompanionQtServer(const QString &backend, KomMcpServer *logic, kom::PgDal* dal, QObject *parent = nullptr);
explicit KompanionQtServer(const QString &backend, KomMcpServer *logic, kom::PgDal* dal, QObject *parent = nullptr);
private:
QList<QMcpTool> loadToolsFromSchema() const;
KomMcpServer *m_logic = nullptr;
kom::PgDal* m_dal = nullptr;
KomMcpServer *m_logic;
kom::PgDal* m_dal;
QList<QMcpTool> m_tools;
};
};

View File

@@ -1,78 +1,70 @@
#!/usr/bin/env python3
import os, sys, hashlib, psycopg, requests, json
import os, sys, hashlib, psycopg, requests, json, numpy as np
from pgvector.psycopg import register_vector
DB=os.environ.get("DB_URL","dbname=kompanion user=kompanion host=/var/run/postgresq")
DB=os.environ.get("DB_URL","dbname=kompanion user=kompanion host=/var/run/postgresql")
OLLAMA=os.environ.get("OLLAMA_BASE","http://127.0.0.1:11434")
MODEL=os.environ.get("EMBED_MODEL","bge-m3:latest")
SPACE=os.environ.get("EMBED_SPACE","dev_knowledge") # dev_knowledge | pattern_exchange | runtime_memory
def sha256(p):
h=hashlib.sha256()
with open(p,"rb") as f:
for chunk in iter(lambda: f.read(1<<20), b""): h.update(chunk)
return h.hexdigest()
NAMESPACE=os.environ.get("EMBED_NAMESPACE","dev_knowledge")
def embed(text):
r=requests.post(f"{OLLAMA}/api/embeddings", json={"model": MODEL, "prompt": text}, timeout=120)
r.raise_for_status(); return r.json()["embedding"]
r.raise_for_status()
return r.json()["embedding"]
def chunks(s, sz=1600):
b=s.encode("utf-8");
for i in range(0,len(b),sz): yield b[i:i+sz].decode("utf-8","ignore")
def insert_embedding(cur, dim, kid, sid, vec):
if dim==768:
cur.execute("INSERT INTO komp.embedding_768(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",(kid,sid,vec))
elif dim==1024:
cur.execute("INSERT INTO komp.embedding_1024(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",(kid,sid,vec))
else:
return False
return True
b=s.encode("utf-8")
for i in range(0,len(b),sz):
yield b[i:i+sz].decode("utf-8","ignore")
def main(root):
with psycopg.connect(DB) as conn, conn.cursor() as cur:
cur.execute("SELECT id,dim FROM komp.space WHERE name=%s",(SPACE,))
row=cur.fetchone()
if not row: raise SystemExit(f"space {SPACE} missing (init schema)")
sid, target_dim = row
for dirpath,_,files in os.walk(root):
for fn in files:
p=os.path.join(dirpath,fn)
if os.path.getsize(p)==0: continue
# include common text/code; PDFs via pdftotext if available
if not any(fn.lower().endswith(ext) for ext in (".md",".txt",".json",".py",".cpp",".c",".hpp",".yaml",".yml",".toml",".pdf",".mdown",".rst",".org",".js",".ts",".sql",".sh",".ini",".conf",".cfg",".log",".mime")):
continue
if fn.lower().endswith(".pdf"):
try:
txt=os.popen(f"pdftotext -layout -nopgbrk '{p}' - -q").read()
except Exception:
with psycopg.connect(DB) as conn:
register_vector(conn)
with conn.cursor() as cur:
# Get the namespace id
cur.execute("INSERT INTO namespaces (name) VALUES (%s) ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name RETURNING id", (NAMESPACE,))
namespace_id = cur.fetchone()[0]
for dirpath,_,files in os.walk(root):
for fn in files:
p=os.path.join(dirpath,fn)
if os.path.getsize(p)==0: continue
if not any(fn.lower().endswith(ext) for ext in (".md",".txt",".json",".py",".cpp",".c",".hpp",".yaml",".yml",".toml",".pdf",".mdown",".rst",".org",".js",".ts",".sql",".sh",".ini",".conf",".cfg",".log",".mime")):
continue
else:
try: txt=open(p,"r",encoding="utf-8",errors="ignore").read()
except Exception: continue
sh=sha256(p)
cur.execute("INSERT INTO komp.source(kind,uri,meta) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING RETURNING id",
("filesystem",p,json.dumps({})))
sid_src = cur.fetchone()[0] if cur.rowcount else None
if not sid_src:
cur.execute("SELECT id FROM komp.source WHERE kind='filesystem' AND uri=%s",(p,))
sid_src=cur.fetchone()[0]
ln=1
for ch in chunks(txt):
cur.execute("INSERT INTO komp.chunk(source_id,lineno,text,sha256,tokens) VALUES(%s,%s,%s,%s,%s) RETURNING id",
(sid_src,ln,ch,sh,len(ch)//4))
kid=cur.fetchone()[0]
vec=embed(ch)
if len(vec)!=target_dim:
cur.execute("DELETE FROM komp.chunk WHERE id=%s",(kid,))
if fn.lower().endswith(".pdf"):
try:
txt=os.popen(f"pdftotext -layout -nopgbrk '{p}' - -q").read()
except Exception:
continue
else:
insert_embedding(cur, target_dim, kid, sid, vec)
ln += ch.count("\n")+1
conn.commit()
try:
txt=open(p,"r",encoding="utf-8",errors="ignore").read()
except Exception:
continue
# Create a memory item for the file
cur.execute("INSERT INTO memory_items (namespace_id, key, content) VALUES (%s, %s, %s) RETURNING id", (namespace_id, p, txt))
item_id = cur.fetchone()[0]
seq = 0
for ch in chunks(txt):
# Create a memory chunk
cur.execute("INSERT INTO memory_chunks (item_id, seq, content) VALUES (%s, %s, %s) RETURNING id", (item_id, seq, ch))
chunk_id = cur.fetchone()[0]
# Create an embedding for the chunk
vec=np.array(embed(ch))
dim = len(vec)
cur.execute("INSERT INTO embeddings (chunk_id, model, dim, vector) VALUES (%s, %s, %s, %s)", (chunk_id, MODEL, dim, vec))
seq += 1
conn.commit()
print("done")
if __name__=='__main__':
if len(sys.argv)<2: print("usage: ingest_dir.py <root> [space]", file=sys.stderr); sys.exit(1)
if len(sys.argv)>=3: os.environ["EMBED_SPACE"]=sys.argv[2]
main(sys.argv[1])
if len(sys.argv)<2:
print("usage: ingest_dir.py <root> [namespace]", file=sys.stderr)
sys.exit(1)
if len(sys.argv)>=3:
os.environ["EMBED_NAMESPACE"]=sys.argv[2]
main(sys.argv[1])
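The rewritten script now fills memory_items, memory_chunks and embeddings (the table whose vector column was widened to 1024 above). A rough lookup sketch over that layout, reusing the script's own embed() helper and environment defaults; it assumes ingest_dir.py is importable and pgvector is installed as per the requirements below:

# Hedged sketch: nearest-chunk lookup over the tables ingest_dir.py fills.
import numpy as np
import psycopg
from pgvector.psycopg import register_vector
from ingest_dir import DB, MODEL, embed  # module-level helpers from the script above

def search(query, k=5):
    qvec = np.array(embed(query))
    with psycopg.connect(DB) as conn:
        register_vector(conn)
        with conn.cursor() as cur:
            cur.execute(
                "SELECT mc.content, e.vector <=> %s AS distance"
                " FROM embeddings e JOIN memory_chunks mc ON mc.id = e.chunk_id"
                " WHERE e.model = %s"
                " ORDER BY e.vector <=> %s LIMIT %s",
                (qvec, MODEL, qvec, k),
            )
            return cur.fetchall()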

View File

@@ -1 +1,3 @@
psycopg
numpy
pgvector