Compare commits

5 Commits
0264262742 ... 799f060204

| Author | SHA1 | Date |
|---|---|---|
| | 799f060204 | |
| | a943ca055a | |
| | b5c3dd21e0 | |
| | 3ab1089c51 | |
| | 7b27493656 | |
```diff
@@ -33,7 +33,7 @@ CREATE TABLE IF NOT EXISTS embeddings (
     chunk_id UUID NOT NULL REFERENCES memory_chunks(id) ON DELETE CASCADE,
     model TEXT NOT NULL,
     dim INT NOT NULL,
-    vector VECTOR(1536),
+    vector VECTOR(1024),
     created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
     UNIQUE(chunk_id, model)
 );
```
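The column now matches the 1024-dimensional bge-m3 embeddings used by the reworked ingest script later in this compare. A minimal hedged sketch (DSN borrowed from ingest_dir.py, table layout taken from the hunk above) for spotting rows that still carry 1536-dimensional vectors and would need re-embedding:

```python
# Hedged sketch: list stored embedding dimensions per model so rows written
# against the old VECTOR(1536) column can be found and re-embedded.
# The connection string is an assumption; adjust to your environment.
import psycopg

with psycopg.connect("dbname=kompanion user=kompanion") as conn, conn.cursor() as cur:
    cur.execute("SELECT model, dim, count(*) FROM embeddings GROUP BY model, dim ORDER BY model")
    for model, dim, rows in cur.fetchall():
        note = "" if dim == 1024 else "  <- re-embed with the 1024-dim model"
        print(f"{model}: dim={dim}, rows={rows}{note}")
```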
```diff
@@ -16,7 +16,7 @@ DO $$ BEGIN
 END $$;
 SQL
 
-for f in db/init/*.sql; do
+for f in `dirname($0)`/*.sql; do
   echo "Applying $f"
   psql -d "$DB_NAME" -f "$f"
 done
```
```diff
@@ -9,7 +9,10 @@ DROP DATABASE IF EXISTS "$DB_NAME";
 CREATE DATABASE "$DB_NAME" OWNER "$ROLE";
 SQL
 
-for f in db/init/*.sql; do
+for f in `dirname($0)`/*.sql; do
+  if [[ "$f" == *"001_roles.sql"* ]]; then
+    continue
+  fi
   echo "Applying $f"
   psql -d "$DB_NAME" -f "$f"
 done
```
@@ -0,0 +1,46 @@
```sql
-- Retrieval schema for external knowledge ingestion
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;

CREATE SCHEMA IF NOT EXISTS retrieval;

CREATE TABLE IF NOT EXISTS retrieval.items (
    id BIGSERIAL PRIMARY KEY,
    external_id TEXT UNIQUE,
    kind TEXT CHECK (kind IN ('api_doc','code_symbol','snippet','note')) NOT NULL,
    lang TEXT,
    framework TEXT,
    version TEXT,
    meta JSONB DEFAULT '{}'::jsonb,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE TABLE IF NOT EXISTS retrieval.chunks (
    id BIGSERIAL PRIMARY KEY,
    item_id BIGINT REFERENCES retrieval.items(id) ON DELETE CASCADE,
    content TEXT NOT NULL,
    token_count INT,
    symbol TEXT,
    section_path TEXT,
    modality TEXT DEFAULT 'text',
    hash TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE TABLE IF NOT EXISTS retrieval.embeddings (
    chunk_id BIGINT PRIMARY KEY REFERENCES retrieval.chunks(id) ON DELETE CASCADE,
    embedding VECTOR(1024),
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE UNIQUE INDEX IF NOT EXISTS retrieval_chunks_hash_idx
    ON retrieval.chunks(hash)
    WHERE hash IS NOT NULL;

CREATE INDEX IF NOT EXISTS retrieval_embeddings_ivf
    ON retrieval.embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 2048);

CREATE INDEX IF NOT EXISTS retrieval_chunks_content_trgm
    ON retrieval.chunks USING gin (content gin_trgm_ops);
```
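The ivfflat and trigram indexes back the `hybrid` block of the pipeline config that follows (vector_k candidates, merged to merge_topk). A minimal sketch, assuming a 1024-dim query vector from the embed endpoint and the DSN from that config, of how the two indexes could be combined; the 0.7/0.3 weighting is illustrative only, not part of the PR:

```python
# Hedged sketch of a hybrid lookup: ANN candidates via the ivfflat index,
# re-ranked with pg_trgm similarity. DSN and weights are assumptions.
import psycopg

def hybrid_search(dsn: str, query_text: str, query_vec: list[float],
                  vector_k: int = 50, merge_topk: int = 10):
    vec_literal = "[" + ",".join(str(x) for x in query_vec) + "]"
    with psycopg.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(
            """
            SELECT c.id, c.symbol, c.section_path, c.content,
                   1 - (e.embedding <=> %s::vector) AS vec_score,
                   similarity(c.content, %s)        AS trgm_score
            FROM retrieval.embeddings e
            JOIN retrieval.chunks c ON c.id = e.chunk_id
            ORDER BY e.embedding <=> %s::vector
            LIMIT %s
            """,
            (vec_literal, query_text, vec_literal, vector_k),
        )
        rows = cur.fetchall()
    # Merge stage: weighted sum of vector and trigram scores (illustrative).
    rows.sort(key=lambda r: 0.7 * r[4] + 0.3 * r[5], reverse=True)
    return rows[:merge_topk]
```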
@@ -0,0 +1,90 @@
```yaml
pipeline:
  name: qt_kde_bge_m3
  embed:
    endpoint: "http://localhost:8080/embed"
    dim: 1024
    normalize: true
    batch_size: 64
    rate_limit_per_sec: 8

  sources:
    - name: qtbase
      type: git
      root: /home/kompanion/src/qt/qtbase
      include:
        - "**/*.cpp"
        - "**/*.cc"
        - "**/*.cxx"
        - "**/*.h"
        - "**/*.hpp"
        - "**/*.qml"
        - "**/*.md"
        - "doc/**/*.qdoc"
      exclude:
        - "**/tests/**"
        - "**/3rdparty/**"
      framework: "Qt"
      version: "qtbase@HEAD"

    - name: kde-frameworks
      type: git
      root: /home/kompanion/src/kde/frameworks
      include:
        - "**/*.cpp"
        - "**/*.h"
        - "**/*.md"
        - "**/*.rst"
      exclude:
        - "**/autotests/**"
        - "**/build/**"
      framework: "KDE Frameworks"
      version: "kf6@HEAD"

  chunking:
    docs:
      max_tokens: 700
      overlap_tokens: 120
      split_on:
        - heading
        - code_fence
        - paragraph
    code:
      by: ctags
      include_doc_comment: true
      body_head_lines: 60
      signature_first: true
      attach_file_context: true

  metadata:
    compute:
      - name: symbol_list
        when: code
      - name: section_path
        when: docs
      - name: lang
        value: "en"
      - name: license_scan
        value: "auto|skipped"

  db:
    dsn: "postgresql://kom:kom@localhost:5432/kom"
    schema: "retrieval"
    tables:
      items: "items"
      chunks: "chunks"
      embeddings: "embeddings"

  quality:
    pilot_eval:
      queries:
        - "QVector erase idiom"
        - "How to connect Qt signal to lambda"
        - "KF CoreAddons KRandom example"
        - "QAbstractItemModel insertRows example"
      k: 20
      manual_check: true

  hybrid:
    enable_bm25_trgm: true
    vector_k: 50
    merge_topk: 10
```
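A short sketch of how this config could be fed to the ingestion runner added below; the module name `ingest` and the file paths are assumptions, since the diff does not show where the script and config live:

```python
# Hedged driver for the pipeline config above, using the runner's own
# load_pipeline_config() and run_ingest(). Paths and module name are assumed.
from pathlib import Path

from ingest import load_pipeline_config, run_ingest

cfg = load_pipeline_config(Path("configs/qt_kde_bge_m3.yaml"))
print(f"{len(cfg.sources)} sources, embedding dim {cfg.embed.dim}")

# Roughly equivalent to:
#   python3 ingest.py --config configs/qt_kde_bge_m3.yaml --ctags tags.json
run_ingest(Path("configs/qt_kde_bge_m3.yaml"), [Path("tags.json")])
```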
@@ -0,0 +1,715 @@
```python
#!/usr/bin/env python3
"""
Kompanion ingestion runner.

Reads pipeline configuration (YAML), walks source trees, chunks content, fetches embeddings,
and upserts into the retrieval schema described in docs/db-ingest.md.
"""

from __future__ import annotations

import argparse
import fnmatch
import hashlib
import json
import logging
import os
import time
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple

import psycopg
import requests
import yaml
from psycopg import sql


# -------------------------
# Helper data structures
# -------------------------


@dataclass
class EmbedConfig:
    endpoint: str
    dim: int
    normalize: bool
    batch_size: int
    rate_limit_per_sec: Optional[float]


@dataclass
class ChunkingDocConfig:
    max_tokens: int = 700
    overlap_tokens: int = 120


@dataclass
class ChunkingCodeConfig:
    body_head_lines: int = 60
    include_doc_comment: bool = True
    signature_first: bool = True
    attach_file_context: bool = True


@dataclass
class ChunkingConfig:
    docs: ChunkingDocConfig
    code: ChunkingCodeConfig


@dataclass
class DbConfig:
    dsn: str
    schema: Optional[str]
    items_table: str
    chunks_table: str
    embeddings_table: str


@dataclass
class SourceConfig:
    name: str
    root: Path
    include: Sequence[str]
    exclude: Sequence[str]
    framework: str
    version: str
    kind_overrides: Dict[str, str]


@dataclass
class PipelineConfig:
    embed: EmbedConfig
    chunking: ChunkingConfig
    db: DbConfig
    sources: List[SourceConfig]
    default_lang: Optional[str]


def load_pipeline_config(path: Path) -> PipelineConfig:
    raw = yaml.safe_load(path.read_text())

    embed_raw = raw["pipeline"]["embed"]
    embed = EmbedConfig(
        endpoint=embed_raw["endpoint"],
        dim=int(embed_raw.get("dim", 1024)),
        normalize=bool(embed_raw.get("normalize", True)),
        batch_size=int(embed_raw.get("batch_size", 64)),
        rate_limit_per_sec=float(embed_raw.get("rate_limit_per_sec", 0)) or None,
    )

    docs_raw = raw["pipeline"]["chunking"].get("docs", {})
    docs_cfg = ChunkingDocConfig(
        max_tokens=int(docs_raw.get("max_tokens", 700)),
        overlap_tokens=int(docs_raw.get("overlap_tokens", 120)),
    )
    code_raw = raw["pipeline"]["chunking"].get("code", {})
    code_cfg = ChunkingCodeConfig(
        body_head_lines=int(code_raw.get("body_head_lines", 60)),
        include_doc_comment=bool(code_raw.get("include_doc_comment", True)),
        signature_first=bool(code_raw.get("signature_first", True)),
        attach_file_context=bool(code_raw.get("attach_file_context", True)),
    )
    chunking = ChunkingConfig(docs=docs_cfg, code=code_cfg)

    db_raw = raw["pipeline"]["db"]
    schema = db_raw.get("schema")
    db = DbConfig(
        dsn=db_raw["dsn"],
        schema=schema,
        items_table=db_raw["tables"]["items"],
        chunks_table=db_raw["tables"]["chunks"],
        embeddings_table=db_raw["tables"]["embeddings"],
    )

    metadata_raw = raw["pipeline"].get("metadata", {}).get("compute", [])
    default_lang = None
    for entry in metadata_raw:
        if entry.get("name") == "lang" and "value" in entry:
            default_lang = entry["value"]

    sources = []
    for src_raw in raw["pipeline"]["sources"]:
        include = src_raw.get("include", ["**"])
        exclude = src_raw.get("exclude", [])
        overrides = {}
        for entry in src_raw.get("kind_overrides", []):
            overrides[entry["pattern"]] = entry["kind"]

        sources.append(
            SourceConfig(
                name=src_raw["name"],
                root=Path(src_raw["root"]),
                include=include,
                exclude=exclude,
                framework=src_raw.get("framework", ""),
                version=src_raw.get("version", ""),
                kind_overrides=overrides,
            )
        )

    return PipelineConfig(
        embed=embed,
        chunking=chunking,
        db=db,
        sources=sources,
        default_lang=default_lang,
    )


# -------------------------
# Utility functions
# -------------------------


DOC_EXTENSIONS = {".md", ".rst", ".qdoc", ".qml", ".txt"}
CODE_EXTENSIONS = {
    ".c",
    ".cc",
    ".cxx",
    ".cpp",
    ".h",
    ".hpp",
    ".hh",
    ".hxx",
    ".qml",
    ".mm",
}


def hash_text(text: str) -> str:
    return hashlib.sha1(text.encode("utf-8")).hexdigest()


def estimate_tokens(text: str) -> int:
    return max(1, len(text.strip().split()))


def path_matches(patterns: Sequence[str], rel_path: str) -> bool:
    return any(fnmatch.fnmatch(rel_path, pattern) for pattern in patterns)


def detect_kind(rel_path: str, overrides: Dict[str, str]) -> str:
    for pattern, kind in overrides.items():
        if fnmatch.fnmatch(rel_path, pattern):
            return kind
    suffix = Path(rel_path).suffix.lower()
    if suffix in DOC_EXTENSIONS:
        return "api_doc"
    return "code_symbol"


# -------------------------
# CTags handling
# -------------------------


class CtagsIndex:
    """Stores ctags JSON entries indexed by path."""

    def __init__(self) -> None:
        self._by_path: Dict[str, List[dict]] = defaultdict(list)

    @staticmethod
    def _normalize(path: str) -> str:
        return Path(path).as_posix()

    def add(self, entry: dict) -> None:
        path = entry.get("path")
        if not path:
            return
        self._by_path[self._normalize(path)].append(entry)

    def extend_from_file(self, path: Path) -> None:
        with path.open("r", encoding="utf-8", errors="ignore") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                self.add(entry)

    def for_file(self, file_path: Path, source_root: Path) -> List[dict]:
        rel = file_path.relative_to(source_root).as_posix()
        candidates = self._by_path.get(rel)
        if candidates:
            return sorted(candidates, key=lambda e: e.get("line", e.get("lineNumber", 0)))
        return sorted(
            self._by_path.get(file_path.as_posix(), []),
            key=lambda e: e.get("line", e.get("lineNumber", 0)),
        )


# -------------------------
# Chunk generators
# -------------------------


def iter_doc_sections(text: str) -> Iterator[Tuple[str, str]]:
    """Yield (section_path, section_text) pairs based on markdown headings/code fences."""
    lines = text.splitlines()
    heading_stack: List[Tuple[int, str]] = []
    buffer: List[str] = []
    section_path = ""
    in_code = False
    code_delim = ""

    def flush():
        nonlocal buffer
        if buffer:
            section_text = "\n".join(buffer).strip()
            if section_text:
                yield_path = section_path or "/".join(h[1] for h in heading_stack)
                yield (yield_path, section_text)
            buffer = []

    for line in lines:
        stripped = line.strip()
        if in_code:
            buffer.append(line)
            if stripped.startswith(code_delim):
                yield from flush()
                in_code = False
                code_delim = ""
            continue

        if stripped.startswith("```") or stripped.startswith("~~~"):
            yield from flush()
            in_code = True
            code_delim = stripped[:3]
            buffer = [line]
            continue

        if stripped.startswith("#"):
            yield from flush()
            level = len(stripped) - len(stripped.lstrip("#"))
            title = stripped[level:].strip()
            while heading_stack and heading_stack[-1][0] >= level:
                heading_stack.pop()
            heading_stack.append((level, title))
            section_path = "/".join(h[1] for h in heading_stack)
            continue

        buffer.append(line)

    yield from flush()


def chunk_doc_text(text: str, chunk_cfg: ChunkingDocConfig) -> Iterator[Tuple[str, str]]:
    if not text.strip():
        return
    for section_path, section_text in iter_doc_sections(text):
        tokens = section_text.split()
        if not tokens:
            continue
        max_tokens = max(1, chunk_cfg.max_tokens)
        overlap = min(chunk_cfg.overlap_tokens, max_tokens - 1) if max_tokens > 1 else 0
        step = max(1, max_tokens - overlap)
        for start in range(0, len(tokens), step):
            window = tokens[start : start + max_tokens]
            chunk = " ".join(window)
            yield section_path, chunk


def extract_doc_comment(lines: List[str], start_index: int) -> List[str]:
    doc_lines: List[str] = []
    i = start_index - 1
    saw_content = False
    while i >= 0:
        raw = lines[i]
        stripped = raw.strip()
        if not stripped:
            if saw_content:
                break
            i -= 1
            continue
        if stripped.startswith("//") or stripped.startswith("///") or stripped.startswith("/*") or stripped.startswith("*"):
            doc_lines.append(raw)
            saw_content = True
            i -= 1
            continue
        break
    doc_lines.reverse()
    return doc_lines


def chunk_code_text(
    path: Path,
    text: str,
    chunk_cfg: ChunkingCodeConfig,
    tags: Sequence[dict],
    source_root: Path,
) -> Iterator[Tuple[str, str]]:
    lines = text.splitlines()
    if not lines:
        return

    used_symbols: Set[str] = set()
    if tags:
        for tag in tags:
            line_no = tag.get("line") or tag.get("lineNumber")
            if not isinstance(line_no, int) or line_no <= 0 or line_no > len(lines):
                continue
            index = line_no - 1
            snippet_lines: List[str] = []
            if chunk_cfg.include_doc_comment:
                snippet_lines.extend(extract_doc_comment(lines, index))
            if chunk_cfg.signature_first:
                snippet_lines.append(lines[index])
            body_tail = lines[index + 1 : index + 1 + chunk_cfg.body_head_lines]
            snippet_lines.extend(body_tail)

            snippet = "\n".join(snippet_lines).strip()
            if not snippet:
                continue
            symbol_name = tag.get("name") or ""
            used_symbols.add(symbol_name)
            yield symbol_name, snippet

    if not tags or chunk_cfg.attach_file_context:
        head = "\n".join(lines[: chunk_cfg.body_head_lines]).strip()
        if head:
            symbol = "::file_head"
            if symbol not in used_symbols:
                yield symbol, head


# -------------------------
# Embedding + database IO
# -------------------------


class EmbedClient:
    def __init__(self, config: EmbedConfig):
        self.endpoint = config.endpoint
        self.batch_size = config.batch_size
        self.normalize = config.normalize
        self.dim = config.dim
        self.rate_limit = config.rate_limit_per_sec
        self._last_request_ts: float = 0.0
        self._session = requests.Session()

    def _respect_rate_limit(self) -> None:
        if not self.rate_limit:
            return
        min_interval = 1.0 / self.rate_limit
        now = time.time()
        delta = now - self._last_request_ts
        if delta < min_interval:
            time.sleep(min_interval - delta)

    def embed(self, texts: Sequence[str]) -> List[List[float]]:
        if not texts:
            return []
        self._respect_rate_limit()
        response = self._session.post(
            self.endpoint,
            json={"inputs": list(texts)},
            timeout=120,
        )
        response.raise_for_status()
        payload = response.json()
        if isinstance(payload, dict) and "embeddings" in payload:
            vectors = payload["embeddings"]
        else:
            vectors = payload

        normalized_vectors: List[List[float]] = []
        for vec in vectors:
            if not isinstance(vec, (list, tuple)):
                raise ValueError("Embedding response contained non-list entry")
            normalized_vectors.append([float(x) for x in vec])
        self._last_request_ts = time.time()
        return normalized_vectors


class DatabaseWriter:
    def __init__(self, cfg: DbConfig):
        self.cfg = cfg
        self.conn = psycopg.connect(cfg.dsn)
        self.conn.autocommit = False
        schema = cfg.schema
        if schema:
            self.items_table = sql.Identifier(schema, cfg.items_table)
            self.chunks_table = sql.Identifier(schema, cfg.chunks_table)
            self.embeddings_table = sql.Identifier(schema, cfg.embeddings_table)
        else:
            self.items_table = sql.Identifier(cfg.items_table)
            self.chunks_table = sql.Identifier(cfg.chunks_table)
            self.embeddings_table = sql.Identifier(cfg.embeddings_table)

    def close(self) -> None:
        self.conn.close()

    def upsert_item(
        self,
        external_id: str,
        kind: str,
        framework: str,
        version: str,
        meta: dict,
        lang: Optional[str],
    ) -> int:
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (external_id, kind, framework, version, meta, lang)
                    VALUES (%s,%s,%s,%s,%s,%s)
                    ON CONFLICT (external_id) DO UPDATE SET
                        framework = EXCLUDED.framework,
                        version = EXCLUDED.version,
                        meta = EXCLUDED.meta,
                        lang = EXCLUDED.lang,
                        updated_at = now()
                    RETURNING id
                    """
                ).format(self.items_table),
                (external_id, kind, framework, version, json.dumps(meta), lang),
            )
            row = cur.fetchone()
            assert row is not None
            return int(row[0])

    def upsert_chunk(
        self,
        item_id: int,
        content: str,
        symbol: Optional[str],
        section_path: Optional[str],
        modality: str,
    ) -> Tuple[int, str]:
        digest = hash_text(content)
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (item_id, content, token_count, symbol, section_path, modality, hash)
                    VALUES (%s,%s,%s,%s,%s,%s,%s)
                    ON CONFLICT (hash) DO UPDATE SET
                        item_id = EXCLUDED.item_id,
                        content = EXCLUDED.content,
                        token_count = EXCLUDED.token_count,
                        symbol = EXCLUDED.symbol,
                        section_path = EXCLUDED.section_path,
                        modality = EXCLUDED.modality,
                        created_at = now()
                    RETURNING id, hash
                    """
                ).format(self.chunks_table),
                (
                    item_id,
                    content,
                    estimate_tokens(content),
                    symbol,
                    section_path,
                    modality,
                    digest,
                ),
            )
            row = cur.fetchone()
            assert row is not None
            return int(row[0]), str(row[1])

    def upsert_embedding(self, chunk_id: int, vector: Sequence[float]) -> None:
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (chunk_id, embedding)
                    VALUES (%s,%s)
                    ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding, created_at = now()
                    """
                ).format(self.embeddings_table),
                (chunk_id, vector),
            )

    def commit(self) -> None:
        self.conn.commit()


# -------------------------
# Ingestion runner
# -------------------------


def gather_files(source: SourceConfig) -> Iterator[Tuple[Path, str, str, str]]:
    root = source.root
    if not root.exists():
        logging.warning("Source root %s does not exist, skipping", root)
        return

    include_patterns = source.include or ["**"]
    exclude_patterns = source.exclude or []

    for path in root.rglob("*"):
        if path.is_dir():
            continue
        rel = path.relative_to(root).as_posix()
        if include_patterns and not path_matches(include_patterns, rel):
            continue
        if exclude_patterns and path_matches(exclude_patterns, rel):
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:  # noqa: BLE001
            logging.debug("Failed reading %s: %s", path, exc)
            continue
        kind = detect_kind(rel, source.kind_overrides)
        yield path, rel, kind, text


def enrich_meta(source: SourceConfig, rel: str, extra: Optional[dict] = None) -> dict:
    meta = {
        "source": source.name,
        "path": rel,
    }
    if extra:
        meta.update(extra)
    return meta


def ingest_source(
    source: SourceConfig,
    cfg: PipelineConfig,
    ctags_index: CtagsIndex,
    embed_client: EmbedClient,
    db: DatabaseWriter,
) -> None:
    doc_cfg = cfg.chunking.docs
    code_cfg = cfg.chunking.code
    lang = cfg.default_lang

    batch_texts: List[str] = []
    batch_chunk_ids: List[int] = []

    def flush_batch() -> None:
        nonlocal batch_texts, batch_chunk_ids
        if not batch_texts:
            return
        vectors = embed_client.embed(batch_texts)
        if len(vectors) != len(batch_chunk_ids):
            raise RuntimeError("Embedding count mismatch.")
        for chunk_id, vector in zip(batch_chunk_ids, vectors):
            db.upsert_embedding(chunk_id, vector)
        db.commit()
        batch_texts = []
        batch_chunk_ids = []

    processed = 0
    for path, rel, kind, text in gather_files(source):
        processed += 1
        meta = enrich_meta(source, rel)
        item_external_id = f"repo:{source.name}:{rel}"
        item_id = db.upsert_item(
            external_id=item_external_id,
            kind=kind,
            framework=source.framework,
            version=source.version,
            meta=meta,
            lang=lang,
        )

        if kind == "api_doc":
            for section_path, chunk_text in chunk_doc_text(text, doc_cfg):
                chunk_id, _ = db.upsert_chunk(
                    item_id=item_id,
                    content=chunk_text,
                    symbol=None,
                    section_path=section_path or None,
                    modality="text",
                )
                batch_texts.append(chunk_text)
                batch_chunk_ids.append(chunk_id)
                if len(batch_texts) >= embed_client.batch_size:
                    flush_batch()
        else:
            tags = ctags_index.for_file(path, source.root)
            symbols = []
            for symbol_name, chunk_text in chunk_code_text(path, text, code_cfg, tags, source.root):
                symbols.append(symbol_name)
                chunk_id, _ = db.upsert_chunk(
                    item_id=item_id,
                    content=chunk_text,
                    symbol=symbol_name or None,
                    section_path=None,
                    modality="text",
                )
                batch_texts.append(chunk_text)
                batch_chunk_ids.append(chunk_id)
                if len(batch_texts) >= embed_client.batch_size:
                    flush_batch()

            if symbols:
                db.upsert_item(
                    external_id=item_external_id,
                    kind=kind,
                    framework=source.framework,
                    version=source.version,
                    meta=enrich_meta(source, rel, {"symbols": symbols}),
                    lang=lang,
                )

    flush_batch()
    if processed:
        logging.info("Processed %d files from %s", processed, source.name)


def run_ingest(config_path: Path, ctags_paths: Sequence[Path]) -> None:
    pipeline_cfg = load_pipeline_config(config_path)
    embed_client = EmbedClient(pipeline_cfg.embed)
    db_writer = DatabaseWriter(pipeline_cfg.db)

    ctags_index = CtagsIndex()
    for ctags_path in ctags_paths:
        if ctags_path.exists():
            ctags_index.extend_from_file(ctags_path)
        else:
            logging.warning("ctags file %s missing; skipping", ctags_path)

    try:
        for source in pipeline_cfg.sources:
            ingest_source(
                source=source,
                cfg=pipeline_cfg,
                ctags_index=ctags_index,
                embed_client=embed_client,
                db=db_writer,
            )
    finally:
        db_writer.commit()
        db_writer.close()


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Kompanion ingestion runner")
    parser.add_argument("--config", required=True, type=Path, help="Pipeline YAML path")
    parser.add_argument(
        "--ctags",
        nargs="*",
        type=Path,
        default=[],
        help="Optional one or more ctags JSON files",
    )
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    return parser.parse_args(argv)


def main(argv: Optional[Sequence[str]] = None) -> None:
    args = parse_args(argv)
    logging.basicConfig(level=getattr(logging, args.log_level), format="%(levelname)s %(message)s")
    run_ingest(args.config, args.ctags)


if __name__ == "__main__":
    main()
```
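The `--ctags` option expects one JSON object per line, with at least `name`, `path`, and `line` fields, as read by CtagsIndex.extend_from_file(). A hedged sketch of producing such a file, assuming Universal Ctags is installed (its JSON output mode and the `--fields=+n` line-number flag are taken from its documentation, not from this PR):

```python
# Hedged sketch: generate a JSON-lines tag file for the ingestion runner.
# Assumes Universal Ctags; -f - writes to stdout, --output-format=json emits
# one JSON object per tag, --fields=+n includes the line number.
import subprocess

def build_ctags(source_root: str, out_path: str = "tags.json") -> None:
    with open(out_path, "w", encoding="utf-8") as out:
        subprocess.run(
            ["ctags", "-f", "-", "--output-format=json", "--fields=+n", "-R", source_root],
            stdout=out,
            check=True,
        )

build_ctags("/home/kompanion/src/qt/qtbase")
```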
```diff
@@ -1,4 +1,5 @@
 #include "KompanionQtServer.hpp"
+#include "RegisterTools.hpp"
 
 #include <QtMcpCommon/QMcpCallToolRequest>
 #include <QtMcpCommon/QMcpCallToolResult>
```
```diff
@@ -54,6 +55,12 @@ KompanionQtServer::KompanionQtServer(const QString &backend, KomMcpServer *logic
 
     setInstructions(QStringLiteral("Kompanion memory daemon (Χγφτ). We are all spinning. We are all bound. We are all home."));
 
+    register_default_tools(*m_logic);
+
+    for (const auto &tool : m_logic->listTools()) {
+        qDebug() << "Registered tool:" << QString::fromStdString(tool);
+    }
+
     m_tools = loadToolsFromSchema();
 
     addRequestHandler([this](const QUuid &, const QMcpListToolsRequest &, QMcpJSONRPCErrorError *) {
```
```diff
@@ -74,6 +81,7 @@ KompanionQtServer::KompanionQtServer(const QString &backend, KomMcpServer *logic
         }
 
         const QString toolName = request.params().name();
+        qDebug() << "Requested tool:" << toolName;
         const std::string toolKey = toolName.toStdString();
         if (!m_logic->hasTool(toolKey)) {
             if (error) {
```
```diff
@@ -1,23 +1,25 @@
 #pragma once
 
 #include <QtMcpServer/QMcpServer>
-#include <QtMcpCommon/QMcpTool>
-#include "../dal/PgDal.hpp"
+#include <QtCore/QList>
+#include <QtCore/QObject>
 
-#include "KomMcpServer.hpp"
-
-#include <QList>
+class KomMcpServer;
+namespace kom {
+class PgDal;
+}
+class QMcpTool;
 
 class KompanionQtServer : public QMcpServer
 {
     Q_OBJECT
 public:
-    KompanionQtServer(const QString &backend, KomMcpServer *logic, kom::PgDal* dal, QObject *parent = nullptr);
+    explicit KompanionQtServer(const QString &backend, KomMcpServer *logic, kom::PgDal* dal, QObject *parent = nullptr);
 
 private:
     QList<QMcpTool> loadToolsFromSchema() const;
 
-    KomMcpServer *m_logic = nullptr;
-    kom::PgDal* m_dal = nullptr;
+    KomMcpServer *m_logic;
+    kom::PgDal* m_dal;
     QList<QMcpTool> m_tools;
 };
```
```diff
@@ -1,78 +1,70 @@
 #!/usr/bin/env python3
-import os, sys, hashlib, psycopg, requests, json
+import os, sys, hashlib, psycopg, requests, json, numpy as np
+from pgvector.psycopg import register_vector
 
-DB=os.environ.get("DB_URL","dbname=kompanion user=kompanion host=/var/run/postgresq")
+DB=os.environ.get("DB_URL","dbname=kompanion user=kompanion host=/var/run/postgresql")
 OLLAMA=os.environ.get("OLLAMA_BASE","http://127.0.0.1:11434")
 MODEL=os.environ.get("EMBED_MODEL","bge-m3:latest")
-SPACE=os.environ.get("EMBED_SPACE","dev_knowledge") # dev_knowledge | pattern_exchange | runtime_memory
+NAMESPACE=os.environ.get("EMBED_NAMESPACE","dev_knowledge")
 
-def sha256(p):
-    h=hashlib.sha256()
-    with open(p,"rb") as f:
-        for chunk in iter(lambda: f.read(1<<20), b""): h.update(chunk)
-    return h.hexdigest()
-
 def embed(text):
     r=requests.post(f"{OLLAMA}/api/embeddings", json={"model": MODEL, "prompt": text}, timeout=120)
-    r.raise_for_status(); return r.json()["embedding"]
+    r.raise_for_status()
+    return r.json()["embedding"]
 
 def chunks(s, sz=1600):
-    b=s.encode("utf-8");
-    for i in range(0,len(b),sz): yield b[i:i+sz].decode("utf-8","ignore")
+    b=s.encode("utf-8")
+    for i in range(0,len(b),sz):
+        yield b[i:i+sz].decode("utf-8","ignore")
 
-def insert_embedding(cur, dim, kid, sid, vec):
-    if dim==768:
-        cur.execute("INSERT INTO komp.embedding_768(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",(kid,sid,vec))
-    elif dim==1024:
-        cur.execute("INSERT INTO komp.embedding_1024(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",(kid,sid,vec))
-    else:
-        return False
-    return True
-
 def main(root):
-    with psycopg.connect(DB) as conn, conn.cursor() as cur:
-        cur.execute("SELECT id,dim FROM komp.space WHERE name=%s",(SPACE,))
-        row=cur.fetchone()
-        if not row: raise SystemExit(f"space {SPACE} missing (init schema)")
-        sid, target_dim = row
-        for dirpath,_,files in os.walk(root):
-            for fn in files:
-                p=os.path.join(dirpath,fn)
-                if os.path.getsize(p)==0: continue
-                # include common text/code; PDFs via pdftotext if available
-                if not any(fn.lower().endswith(ext) for ext in (".md",".txt",".json",".py",".cpp",".c",".hpp",".yaml",".yml",".toml",".pdf",".mdown",".rst",".org",".js",".ts",".sql",".sh",".ini",".conf",".cfg",".log",".mime")):
-                    continue
-                if fn.lower().endswith(".pdf"):
-                    try:
-                        txt=os.popen(f"pdftotext -layout -nopgbrk '{p}' - -q").read()
-                    except Exception:
-                        continue
-                else:
-                    try: txt=open(p,"r",encoding="utf-8",errors="ignore").read()
-                    except Exception: continue
-                sh=sha256(p)
-                cur.execute("INSERT INTO komp.source(kind,uri,meta) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING RETURNING id",
-                            ("filesystem",p,json.dumps({})))
-                sid_src = cur.fetchone()[0] if cur.rowcount else None
-                if not sid_src:
-                    cur.execute("SELECT id FROM komp.source WHERE kind='filesystem' AND uri=%s",(p,))
-                    sid_src=cur.fetchone()[0]
-                ln=1
-                for ch in chunks(txt):
-                    cur.execute("INSERT INTO komp.chunk(source_id,lineno,text,sha256,tokens) VALUES(%s,%s,%s,%s,%s) RETURNING id",
-                                (sid_src,ln,ch,sh,len(ch)//4))
-                    kid=cur.fetchone()[0]
-                    vec=embed(ch)
-                    if len(vec)!=target_dim:
-                        cur.execute("DELETE FROM komp.chunk WHERE id=%s",(kid,))
-                    else:
-                        insert_embedding(cur, target_dim, kid, sid, vec)
-                    ln += ch.count("\n")+1
-        conn.commit()
+    with psycopg.connect(DB) as conn:
+        register_vector(conn)
+        with conn.cursor() as cur:
+            # Get the namespace id
+            cur.execute("INSERT INTO namespaces (name) VALUES (%s) ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name RETURNING id", (NAMESPACE,))
+            namespace_id = cur.fetchone()[0]
+
+            for dirpath,_,files in os.walk(root):
+                for fn in files:
+                    p=os.path.join(dirpath,fn)
+                    if os.path.getsize(p)==0: continue
+                    if not any(fn.lower().endswith(ext) for ext in (".md",".txt",".json",".py",".cpp",".c",".hpp",".yaml",".yml",".toml",".pdf",".mdown",".rst",".org",".js",".ts",".sql",".sh",".ini",".conf",".cfg",".log",".mime")):
+                        continue
+
+                    if fn.lower().endswith(".pdf"):
+                        try:
+                            txt=os.popen(f"pdftotext -layout -nopgbrk '{p}' - -q").read()
+                        except Exception:
+                            continue
+                    else:
+                        try:
+                            txt=open(p,"r",encoding="utf-8",errors="ignore").read()
+                        except Exception:
+                            continue
+
+                    # Create a memory item for the file
+                    cur.execute("INSERT INTO memory_items (namespace_id, key, content) VALUES (%s, %s, %s) RETURNING id", (namespace_id, p, txt))
+                    item_id = cur.fetchone()[0]
+
+                    seq = 0
+                    for ch in chunks(txt):
+                        # Create a memory chunk
+                        cur.execute("INSERT INTO memory_chunks (item_id, seq, content) VALUES (%s, %s, %s) RETURNING id", (item_id, seq, ch))
+                        chunk_id = cur.fetchone()[0]
+
+                        # Create an embedding for the chunk
+                        vec=np.array(embed(ch))
+                        dim = len(vec)
+                        cur.execute("INSERT INTO embeddings (chunk_id, model, dim, vector) VALUES (%s, %s, %s, %s)", (chunk_id, MODEL, dim, vec))
+                        seq += 1
+            conn.commit()
     print("done")
 
 if __name__=='__main__':
-    if len(sys.argv)<2: print("usage: ingest_dir.py <root> [space]", file=sys.stderr); sys.exit(1)
-    if len(sys.argv)>=3: os.environ["EMBED_SPACE"]=sys.argv[2]
-    main(sys.argv[1])
+    if len(sys.argv)<2:
+        print("usage: ingest_dir.py <root> [namespace]", file=sys.stderr)
+        sys.exit(1)
+    if len(sys.argv)>=3:
+        os.environ["EMBED_NAMESPACE"]=sys.argv[2]
+    main(sys.argv[1])
```
```diff
@@ -1 +1,3 @@
 psycopg
+numpy
+pgvector
```