Add missing ingest files
parent 0264262742
commit 7b27493656

@@ -0,0 +1,46 @@
-- Retrieval schema for external knowledge ingestion
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;

CREATE SCHEMA IF NOT EXISTS retrieval;

CREATE TABLE IF NOT EXISTS retrieval.items (
    id BIGSERIAL PRIMARY KEY,
    external_id TEXT UNIQUE,
    kind TEXT CHECK (kind IN ('api_doc','code_symbol','snippet','note')) NOT NULL,
    lang TEXT,
    framework TEXT,
    version TEXT,
    meta JSONB DEFAULT '{}'::jsonb,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE TABLE IF NOT EXISTS retrieval.chunks (
    id BIGSERIAL PRIMARY KEY,
    item_id BIGINT REFERENCES retrieval.items(id) ON DELETE CASCADE,
    content TEXT NOT NULL,
    token_count INT,
    symbol TEXT,
    section_path TEXT,
    modality TEXT DEFAULT 'text',
    hash TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE TABLE IF NOT EXISTS retrieval.embeddings (
    chunk_id BIGINT PRIMARY KEY REFERENCES retrieval.chunks(id) ON DELETE CASCADE,
    embedding VECTOR(1024),
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE UNIQUE INDEX IF NOT EXISTS retrieval_chunks_hash_idx
    ON retrieval.chunks(hash)
    WHERE hash IS NOT NULL;

CREATE INDEX IF NOT EXISTS retrieval_embeddings_ivf
    ON retrieval.embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 2048);

CREATE INDEX IF NOT EXISTS retrieval_chunks_content_trgm
    ON retrieval.chunks USING gin (content gin_trgm_ops);
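The schema pairs an ivfflat cosine index with a trigram GIN index, which is what the hybrid block in the pipeline config below leans on. As a minimal sketch (not part of this commit), a hybrid lookup against these tables could be issued with psycopg roughly like this, assuming the query embedding comes from the same embedding endpoint the ingester uses and treating the score merge as a naive sum:

from typing import List, Sequence, Tuple

import psycopg


def hybrid_search(conn: psycopg.Connection, query_text: str, query_vec: Sequence[float],
                  vector_k: int = 50, merge_topk: int = 10) -> List[Tuple[int, str, float]]:
    # pgvector accepts the text form '[x1,x2,...]'; cast it to vector in SQL.
    vec_literal = "[" + ",".join(f"{x:.6f}" for x in query_vec) + "]"
    query = """
        WITH vec AS (
            SELECT c.id, c.content, 1 - (e.embedding <=> %(v)s::vector) AS vscore
            FROM retrieval.embeddings e
            JOIN retrieval.chunks c ON c.id = e.chunk_id
            ORDER BY e.embedding <=> %(v)s::vector
            LIMIT %(vector_k)s
        ),
        trgm AS (
            SELECT id, similarity(content, %(q)s) AS tscore
            FROM retrieval.chunks
            WHERE content %% %(q)s
            ORDER BY tscore DESC
            LIMIT %(vector_k)s
        )
        SELECT vec.id, vec.content, vec.vscore + COALESCE(trgm.tscore, 0.0) AS score
        FROM vec
        LEFT JOIN trgm USING (id)
        ORDER BY score DESC
        LIMIT %(merge_topk)s
    """
    with conn.cursor() as cur:
        cur.execute("SET ivfflat.probes = 10")  # widen the IVF scan a bit; tune as needed
        cur.execute(query, {"v": vec_literal, "q": query_text,
                            "vector_k": vector_k, "merge_topk": merge_topk})
        return cur.fetchall()
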
@@ -0,0 +1,90 @@
pipeline:
  name: qt_kde_bge_m3
  embed:
    endpoint: "http://localhost:8080/embed"
    dim: 1024
    normalize: true
    batch_size: 64
    rate_limit_per_sec: 8

  sources:
    - name: qtbase
      type: git
      root: /home/kompanion/src/qt/qtbase
      include:
        - "**/*.cpp"
        - "**/*.cc"
        - "**/*.cxx"
        - "**/*.h"
        - "**/*.hpp"
        - "**/*.qml"
        - "**/*.md"
        - "doc/**/*.qdoc"
      exclude:
        - "**/tests/**"
        - "**/3rdparty/**"
      framework: "Qt"
      version: "qtbase@HEAD"

    - name: kde-frameworks
      type: git
      root: /home/kompanion/src/kde/frameworks
      include:
        - "**/*.cpp"
        - "**/*.h"
        - "**/*.md"
        - "**/*.rst"
      exclude:
        - "**/autotests/**"
        - "**/build/**"
      framework: "KDE Frameworks"
      version: "kf6@HEAD"

  chunking:
    docs:
      max_tokens: 700
      overlap_tokens: 120
      split_on:
        - heading
        - code_fence
        - paragraph
    code:
      by: ctags
      include_doc_comment: true
      body_head_lines: 60
      signature_first: true
      attach_file_context: true

  metadata:
    compute:
      - name: symbol_list
        when: code
      - name: section_path
        when: docs
      - name: lang
        value: "en"
      - name: license_scan
        value: "auto|skipped"

  db:
    dsn: "postgresql://kom:kom@localhost:5432/kom"
    schema: "retrieval"
    tables:
      items: "items"
      chunks: "chunks"
      embeddings: "embeddings"

  quality:
    pilot_eval:
      queries:
        - "QVector erase idiom"
        - "How to connect Qt signal to lambda"
        - "KF CoreAddons KRandom example"
        - "QAbstractItemModel insertRows example"
      k: 20
      manual_check: true

  hybrid:
    enable_bm25_trgm: true
    vector_k: 50
    merge_topk: 10
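For reference, the embed.endpoint above only needs to honour the request shape the ingestion runner below sends: a POST with {"inputs": [...]} answered either by a bare list of vectors or by an object with an "embeddings" key, each vector embed.dim floats long. A quick sanity check, assuming a BGE-M3-style embedding server is already listening on that URL (the server itself is not part of this commit):

import requests

resp = requests.post(
    "http://localhost:8080/embed",
    json={"inputs": ["How to connect Qt signal to lambda"]},
    timeout=120,
)
resp.raise_for_status()
payload = resp.json()
vectors = payload["embeddings"] if isinstance(payload, dict) else payload
assert len(vectors) == 1 and len(vectors[0]) == 1024  # must match embed.dim and VECTOR(1024)
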
@@ -0,0 +1,715 @@
#!/usr/bin/env python3
"""
Kompanion ingestion runner.

Reads pipeline configuration (YAML), walks source trees, chunks content, fetches embeddings,
and upserts into the retrieval schema described in docs/db-ingest.md.
"""

from __future__ import annotations

import argparse
import fnmatch
import hashlib
import json
import logging
import os
import time
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple

import psycopg
import requests
import yaml
from psycopg import sql


# -------------------------
# Helper data structures
# -------------------------


@dataclass
class EmbedConfig:
    endpoint: str
    dim: int
    normalize: bool
    batch_size: int
    rate_limit_per_sec: Optional[float]


@dataclass
class ChunkingDocConfig:
    max_tokens: int = 700
    overlap_tokens: int = 120


@dataclass
class ChunkingCodeConfig:
    body_head_lines: int = 60
    include_doc_comment: bool = True
    signature_first: bool = True
    attach_file_context: bool = True


@dataclass
class ChunkingConfig:
    docs: ChunkingDocConfig
    code: ChunkingCodeConfig


@dataclass
class DbConfig:
    dsn: str
    schema: Optional[str]
    items_table: str
    chunks_table: str
    embeddings_table: str


@dataclass
class SourceConfig:
    name: str
    root: Path
    include: Sequence[str]
    exclude: Sequence[str]
    framework: str
    version: str
    kind_overrides: Dict[str, str]


@dataclass
class PipelineConfig:
    embed: EmbedConfig
    chunking: ChunkingConfig
    db: DbConfig
    sources: List[SourceConfig]
    default_lang: Optional[str]


def load_pipeline_config(path: Path) -> PipelineConfig:
    raw = yaml.safe_load(path.read_text())

    embed_raw = raw["pipeline"]["embed"]
    embed = EmbedConfig(
        endpoint=embed_raw["endpoint"],
        dim=int(embed_raw.get("dim", 1024)),
        normalize=bool(embed_raw.get("normalize", True)),
        batch_size=int(embed_raw.get("batch_size", 64)),
        rate_limit_per_sec=float(embed_raw.get("rate_limit_per_sec", 0)) or None,
    )

    docs_raw = raw["pipeline"]["chunking"].get("docs", {})
    docs_cfg = ChunkingDocConfig(
        max_tokens=int(docs_raw.get("max_tokens", 700)),
        overlap_tokens=int(docs_raw.get("overlap_tokens", 120)),
    )
    code_raw = raw["pipeline"]["chunking"].get("code", {})
    code_cfg = ChunkingCodeConfig(
        body_head_lines=int(code_raw.get("body_head_lines", 60)),
        include_doc_comment=bool(code_raw.get("include_doc_comment", True)),
        signature_first=bool(code_raw.get("signature_first", True)),
        attach_file_context=bool(code_raw.get("attach_file_context", True)),
    )
    chunking = ChunkingConfig(docs=docs_cfg, code=code_cfg)

    db_raw = raw["pipeline"]["db"]
    schema = db_raw.get("schema")
    db = DbConfig(
        dsn=db_raw["dsn"],
        schema=schema,
        items_table=db_raw["tables"]["items"],
        chunks_table=db_raw["tables"]["chunks"],
        embeddings_table=db_raw["tables"]["embeddings"],
    )

    metadata_raw = raw["pipeline"].get("metadata", {}).get("compute", [])
    default_lang = None
    for entry in metadata_raw:
        if entry.get("name") == "lang" and "value" in entry:
            default_lang = entry["value"]

    sources = []
    for src_raw in raw["pipeline"]["sources"]:
        include = src_raw.get("include", ["**"])
        exclude = src_raw.get("exclude", [])
        overrides = {}
        for entry in src_raw.get("kind_overrides", []):
            overrides[entry["pattern"]] = entry["kind"]

        sources.append(
            SourceConfig(
                name=src_raw["name"],
                root=Path(src_raw["root"]),
                include=include,
                exclude=exclude,
                framework=src_raw.get("framework", ""),
                version=src_raw.get("version", ""),
                kind_overrides=overrides,
            )
        )

    return PipelineConfig(
        embed=embed,
        chunking=chunking,
        db=db,
        sources=sources,
        default_lang=default_lang,
    )


# -------------------------
# Utility functions
# -------------------------


DOC_EXTENSIONS = {".md", ".rst", ".qdoc", ".qml", ".txt"}
CODE_EXTENSIONS = {
    ".c",
    ".cc",
    ".cxx",
    ".cpp",
    ".h",
    ".hpp",
    ".hh",
    ".hxx",
    ".qml",
    ".mm",
}


def hash_text(text: str) -> str:
    return hashlib.sha1(text.encode("utf-8")).hexdigest()


def estimate_tokens(text: str) -> int:
    return max(1, len(text.strip().split()))


def path_matches(patterns: Sequence[str], rel_path: str) -> bool:
    return any(fnmatch.fnmatch(rel_path, pattern) for pattern in patterns)


def detect_kind(rel_path: str, overrides: Dict[str, str]) -> str:
    for pattern, kind in overrides.items():
        if fnmatch.fnmatch(rel_path, pattern):
            return kind
    suffix = Path(rel_path).suffix.lower()
    if suffix in DOC_EXTENSIONS:
        return "api_doc"
    return "code_symbol"


# -------------------------
# CTags handling
# -------------------------


class CtagsIndex:
    """Stores ctags JSON entries indexed by path."""

    def __init__(self) -> None:
        self._by_path: Dict[str, List[dict]] = defaultdict(list)

    @staticmethod
    def _normalize(path: str) -> str:
        return Path(path).as_posix()

    def add(self, entry: dict) -> None:
        path = entry.get("path")
        if not path:
            return
        self._by_path[self._normalize(path)].append(entry)

    def extend_from_file(self, path: Path) -> None:
        with path.open("r", encoding="utf-8", errors="ignore") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                self.add(entry)

    def for_file(self, file_path: Path, source_root: Path) -> List[dict]:
        rel = file_path.relative_to(source_root).as_posix()
        candidates = self._by_path.get(rel)
        if candidates:
            return sorted(candidates, key=lambda e: e.get("line", e.get("lineNumber", 0)))
        return sorted(
            self._by_path.get(file_path.as_posix(), []),
            key=lambda e: e.get("line", e.get("lineNumber", 0)),
        )
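
# Shape of the input CtagsIndex consumes: one JSON object per line, of which only
# "path", "name" and "line"/"lineNumber" are used here. Universal Ctags emits this
# with --output-format=json; the values below are purely illustrative:
#   {"_type": "tag", "name": "QStringView", "path": "src/corelib/text/qstringview.h",
#    "kind": "class", "line": 128}
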

# -------------------------
# Chunk generators
# -------------------------


def iter_doc_sections(text: str) -> Iterator[Tuple[str, str]]:
    """Yield (section_path, section_text) pairs based on markdown headings/code fences."""
    lines = text.splitlines()
    heading_stack: List[Tuple[int, str]] = []
    buffer: List[str] = []
    section_path = ""
    in_code = False
    code_delim = ""

    def flush():
        nonlocal buffer
        if buffer:
            section_text = "\n".join(buffer).strip()
            if section_text:
                yield_path = section_path or "/".join(h[1] for h in heading_stack)
                yield (yield_path, section_text)
        buffer = []

    for line in lines:
        stripped = line.strip()
        if in_code:
            buffer.append(line)
            if stripped.startswith(code_delim):
                yield from flush()
                in_code = False
                code_delim = ""
            continue

        if stripped.startswith("```") or stripped.startswith("~~~"):
            yield from flush()
            in_code = True
            code_delim = stripped[:3]
            buffer = [line]
            continue

        if stripped.startswith("#"):
            yield from flush()
            level = len(stripped) - len(stripped.lstrip("#"))
            title = stripped[level:].strip()
            while heading_stack and heading_stack[-1][0] >= level:
                heading_stack.pop()
            heading_stack.append((level, title))
            section_path = "/".join(h[1] for h in heading_stack)
            continue

        buffer.append(line)

    yield from flush()


def chunk_doc_text(text: str, chunk_cfg: ChunkingDocConfig) -> Iterator[Tuple[str, str]]:
    if not text.strip():
        return
    for section_path, section_text in iter_doc_sections(text):
        tokens = section_text.split()
        if not tokens:
            continue
        max_tokens = max(1, chunk_cfg.max_tokens)
        overlap = min(chunk_cfg.overlap_tokens, max_tokens - 1) if max_tokens > 1 else 0
        step = max(1, max_tokens - overlap)
        for start in range(0, len(tokens), step):
            window = tokens[start : start + max_tokens]
            chunk = " ".join(window)
            yield section_path, chunk
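
# Small illustration (comment only, not executed): the doc chunker turns nested
# headings into a section path and windows the section body by whitespace tokens.
#   >>> doc = "# Qt\n\n## Signals\nconnect lambda example\n"
#   >>> list(chunk_doc_text(doc, ChunkingDocConfig(max_tokens=4, overlap_tokens=1)))
#   [('Qt/Signals', 'connect lambda example')]
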

def extract_doc_comment(lines: List[str], start_index: int) -> List[str]:
    doc_lines: List[str] = []
    i = start_index - 1
    saw_content = False
    while i >= 0:
        raw = lines[i]
        stripped = raw.strip()
        if not stripped:
            if saw_content:
                break
            i -= 1
            continue
        if stripped.startswith("//") or stripped.startswith("///") or stripped.startswith("/*") or stripped.startswith("*"):
            doc_lines.append(raw)
            saw_content = True
            i -= 1
            continue
        break
    doc_lines.reverse()
    return doc_lines


def chunk_code_text(
    path: Path,
    text: str,
    chunk_cfg: ChunkingCodeConfig,
    tags: Sequence[dict],
    source_root: Path,
) -> Iterator[Tuple[str, str]]:
    lines = text.splitlines()
    if not lines:
        return

    used_symbols: Set[str] = set()
    if tags:
        for tag in tags:
            line_no = tag.get("line") or tag.get("lineNumber")
            if not isinstance(line_no, int) or line_no <= 0 or line_no > len(lines):
                continue
            index = line_no - 1
            snippet_lines: List[str] = []
            if chunk_cfg.include_doc_comment:
                snippet_lines.extend(extract_doc_comment(lines, index))
            if chunk_cfg.signature_first:
                snippet_lines.append(lines[index])
            body_tail = lines[index + 1 : index + 1 + chunk_cfg.body_head_lines]
            snippet_lines.extend(body_tail)

            snippet = "\n".join(snippet_lines).strip()
            if not snippet:
                continue
            symbol_name = tag.get("name") or ""
            used_symbols.add(symbol_name)
            yield symbol_name, snippet

    if not tags or chunk_cfg.attach_file_context:
        head = "\n".join(lines[: chunk_cfg.body_head_lines]).strip()
        if head:
            symbol = "::file_head"
            if symbol not in used_symbols:
                yield symbol, head
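
# Illustration: for a ctags entry pointing at a function signature, the emitted
# chunk is the contiguous //, /// or /* doc-comment block immediately above it,
# the signature line itself, and up to body_head_lines following lines; a
# "::file_head" chunk holding the first body_head_lines lines of the file is
# added when attach_file_context is enabled or no tags are known for the file.
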

# -------------------------
# Embedding + database IO
# -------------------------


class EmbedClient:
    def __init__(self, config: EmbedConfig):
        self.endpoint = config.endpoint
        self.batch_size = config.batch_size
        self.normalize = config.normalize
        self.dim = config.dim
        self.rate_limit = config.rate_limit_per_sec
        self._last_request_ts: float = 0.0
        self._session = requests.Session()

    def _respect_rate_limit(self) -> None:
        if not self.rate_limit:
            return
        min_interval = 1.0 / self.rate_limit
        now = time.time()
        delta = now - self._last_request_ts
        if delta < min_interval:
            time.sleep(min_interval - delta)

    def embed(self, texts: Sequence[str]) -> List[List[float]]:
        if not texts:
            return []
        self._respect_rate_limit()
        response = self._session.post(
            self.endpoint,
            json={"inputs": list(texts)},
            timeout=120,
        )
        response.raise_for_status()
        payload = response.json()
        if isinstance(payload, dict) and "embeddings" in payload:
            vectors = payload["embeddings"]
        else:
            vectors = payload

        normalized_vectors: List[List[float]] = []
        for vec in vectors:
            if not isinstance(vec, (list, tuple)):
                raise ValueError("Embedding response contained non-list entry")
            normalized_vectors.append([float(x) for x in vec])
        self._last_request_ts = time.time()
        return normalized_vectors


class DatabaseWriter:
    def __init__(self, cfg: DbConfig):
        self.cfg = cfg
        self.conn = psycopg.connect(cfg.dsn)
        self.conn.autocommit = False
        schema = cfg.schema
        if schema:
            self.items_table = sql.Identifier(schema, cfg.items_table)
            self.chunks_table = sql.Identifier(schema, cfg.chunks_table)
            self.embeddings_table = sql.Identifier(schema, cfg.embeddings_table)
        else:
            self.items_table = sql.Identifier(cfg.items_table)
            self.chunks_table = sql.Identifier(cfg.chunks_table)
            self.embeddings_table = sql.Identifier(cfg.embeddings_table)

    def close(self) -> None:
        self.conn.close()

    def upsert_item(
        self,
        external_id: str,
        kind: str,
        framework: str,
        version: str,
        meta: dict,
        lang: Optional[str],
    ) -> int:
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (external_id, kind, framework, version, meta, lang)
                    VALUES (%s,%s,%s,%s,%s,%s)
                    ON CONFLICT (external_id) DO UPDATE SET
                        framework = EXCLUDED.framework,
                        version = EXCLUDED.version,
                        meta = EXCLUDED.meta,
                        lang = EXCLUDED.lang,
                        updated_at = now()
                    RETURNING id
                    """
                ).format(self.items_table),
                (external_id, kind, framework, version, json.dumps(meta), lang),
            )
            row = cur.fetchone()
            assert row is not None
            return int(row[0])

    def upsert_chunk(
        self,
        item_id: int,
        content: str,
        symbol: Optional[str],
        section_path: Optional[str],
        modality: str,
    ) -> Tuple[int, str]:
        digest = hash_text(content)
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (item_id, content, token_count, symbol, section_path, modality, hash)
                    VALUES (%s,%s,%s,%s,%s,%s,%s)
                    -- the unique index on hash is partial, so its predicate must be repeated
                    -- here for ON CONFLICT to infer it as the arbiter
                    ON CONFLICT (hash) WHERE hash IS NOT NULL DO UPDATE SET
                        item_id = EXCLUDED.item_id,
                        content = EXCLUDED.content,
                        token_count = EXCLUDED.token_count,
                        symbol = EXCLUDED.symbol,
                        section_path = EXCLUDED.section_path,
                        modality = EXCLUDED.modality,
                        created_at = now()
                    RETURNING id, hash
                    """
                ).format(self.chunks_table),
                (
                    item_id,
                    content,
                    estimate_tokens(content),
                    symbol,
                    section_path,
                    modality,
                    digest,
                ),
            )
            row = cur.fetchone()
            assert row is not None
            return int(row[0]), str(row[1])

    def upsert_embedding(self, chunk_id: int, vector: Sequence[float]) -> None:
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (chunk_id, embedding)
                    VALUES (%s,%s)
                    ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding, created_at = now()
                    """
                ).format(self.embeddings_table),
                (chunk_id, vector),
            )

    def commit(self) -> None:
        self.conn.commit()


# -------------------------
# Ingestion runner
# -------------------------


def gather_files(source: SourceConfig) -> Iterator[Tuple[Path, str, str, str]]:
    root = source.root
    if not root.exists():
        logging.warning("Source root %s does not exist, skipping", root)
        return

    include_patterns = source.include or ["**"]
    exclude_patterns = source.exclude or []

    for path in root.rglob("*"):
        if path.is_dir():
            continue
        rel = path.relative_to(root).as_posix()
        if include_patterns and not path_matches(include_patterns, rel):
            continue
        if exclude_patterns and path_matches(exclude_patterns, rel):
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:  # noqa: BLE001
            logging.debug("Failed reading %s: %s", path, exc)
            continue
        kind = detect_kind(rel, source.kind_overrides)
        yield path, rel, kind, text


def enrich_meta(source: SourceConfig, rel: str, extra: Optional[dict] = None) -> dict:
    meta = {
        "source": source.name,
        "path": rel,
    }
    if extra:
        meta.update(extra)
    return meta


def ingest_source(
    source: SourceConfig,
    cfg: PipelineConfig,
    ctags_index: CtagsIndex,
    embed_client: EmbedClient,
    db: DatabaseWriter,
) -> None:
    doc_cfg = cfg.chunking.docs
    code_cfg = cfg.chunking.code
    lang = cfg.default_lang

    batch_texts: List[str] = []
    batch_chunk_ids: List[int] = []

    def flush_batch() -> None:
        nonlocal batch_texts, batch_chunk_ids
        if not batch_texts:
            return
        vectors = embed_client.embed(batch_texts)
        if len(vectors) != len(batch_chunk_ids):
            raise RuntimeError("Embedding count mismatch.")
        for chunk_id, vector in zip(batch_chunk_ids, vectors):
            db.upsert_embedding(chunk_id, vector)
        db.commit()
        batch_texts = []
        batch_chunk_ids = []

    processed = 0
    for path, rel, kind, text in gather_files(source):
        processed += 1
        meta = enrich_meta(source, rel)
        item_external_id = f"repo:{source.name}:{rel}"
        item_id = db.upsert_item(
            external_id=item_external_id,
            kind=kind,
            framework=source.framework,
            version=source.version,
            meta=meta,
            lang=lang,
        )

        if kind == "api_doc":
            for section_path, chunk_text in chunk_doc_text(text, doc_cfg):
                chunk_id, _ = db.upsert_chunk(
                    item_id=item_id,
                    content=chunk_text,
                    symbol=None,
                    section_path=section_path or None,
                    modality="text",
                )
                batch_texts.append(chunk_text)
                batch_chunk_ids.append(chunk_id)
                if len(batch_texts) >= embed_client.batch_size:
                    flush_batch()
        else:
            tags = ctags_index.for_file(path, source.root)
            symbols = []
            for symbol_name, chunk_text in chunk_code_text(path, text, code_cfg, tags, source.root):
                symbols.append(symbol_name)
                chunk_id, _ = db.upsert_chunk(
                    item_id=item_id,
                    content=chunk_text,
                    symbol=symbol_name or None,
                    section_path=None,
                    modality="text",
                )
                batch_texts.append(chunk_text)
                batch_chunk_ids.append(chunk_id)
                if len(batch_texts) >= embed_client.batch_size:
                    flush_batch()

            if symbols:
                db.upsert_item(
                    external_id=item_external_id,
                    kind=kind,
                    framework=source.framework,
                    version=source.version,
                    meta=enrich_meta(source, rel, {"symbols": symbols}),
                    lang=lang,
                )

    flush_batch()
    if processed:
        logging.info("Processed %d files from %s", processed, source.name)


def run_ingest(config_path: Path, ctags_paths: Sequence[Path]) -> None:
    pipeline_cfg = load_pipeline_config(config_path)
    embed_client = EmbedClient(pipeline_cfg.embed)
    db_writer = DatabaseWriter(pipeline_cfg.db)

    ctags_index = CtagsIndex()
    for ctags_path in ctags_paths:
        if ctags_path.exists():
            ctags_index.extend_from_file(ctags_path)
        else:
            logging.warning("ctags file %s missing; skipping", ctags_path)

    try:
        for source in pipeline_cfg.sources:
            ingest_source(
                source=source,
                cfg=pipeline_cfg,
                ctags_index=ctags_index,
                embed_client=embed_client,
                db=db_writer,
            )
    finally:
        db_writer.commit()
        db_writer.close()


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Kompanion ingestion runner")
    parser.add_argument("--config", required=True, type=Path, help="Pipeline YAML path")
    parser.add_argument(
        "--ctags",
        nargs="*",
        type=Path,
        default=[],
        help="Optional ctags JSON files (one or more)",
    )
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    return parser.parse_args(argv)


def main(argv: Optional[Sequence[str]] = None) -> None:
    args = parse_args(argv)
    logging.basicConfig(level=getattr(logging, args.log_level), format="%(levelname)s %(message)s")
    run_ingest(args.config, args.ctags)


if __name__ == "__main__":
    main()
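
The --ctags inputs are expected to be Universal Ctags JSON dumps, one tag object per line with at least path, name and line, which is what CtagsIndex reads. A sketch of producing a dump for the qtbase source root from the pipeline config and then invoking the runner; the script and file names here are assumptions, since the commit does not show the on-disk paths:

import subprocess

# Dump tags for the qtbase tree configured in the pipeline YAML.
with open("qtbase.ctags.json", "w") as out:            # hypothetical output name
    subprocess.run(
        ["ctags", "--output-format=json", "--fields=+n", "-R",
         "/home/kompanion/src/qt/qtbase"],
        stdout=out,
        check=True,
    )

# Run the ingester against the pipeline config and the tag dump.
subprocess.run(
    ["python3", "ingest.py",             # assumed script name (not shown in the diff)
     "--config", "pipeline.yaml",        # assumed config file name (not shown in the diff)
     "--ctags", "qtbase.ctags.json",
     "--log-level", "INFO"],
    check=True,
)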