Compare commits

5 Commits
0264262742 ... 799f060204

| Author | SHA1 | Date |
|---|---|---|
| | 799f060204 | |
| | a943ca055a | |
| | b5c3dd21e0 | |
| | 3ab1089c51 | |
| | 7b27493656 | |
```diff
@@ -33,7 +33,7 @@ CREATE TABLE IF NOT EXISTS embeddings (
     chunk_id UUID NOT NULL REFERENCES memory_chunks(id) ON DELETE CASCADE,
     model TEXT NOT NULL,
     dim INT NOT NULL,
-    vector VECTOR(1536),
+    vector VECTOR(1024),
     created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
     UNIQUE(chunk_id, model)
 );
```
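The column now matches the 1024-dimensional bge-m3 embeddings used by the reworked ingest script later in this compare. A minimal hedged sketch (DSN borrowed from ingest_dir.py, table layout taken from the hunk above) for spotting rows that still carry 1536-dimensional vectors and would need re-embedding:

```python
# Hedged sketch: list stored embedding dimensions per model so rows written
# against the old VECTOR(1536) column can be found and re-embedded.
# The connection string is an assumption; adjust to your environment.
import psycopg

with psycopg.connect("dbname=kompanion user=kompanion") as conn, conn.cursor() as cur:
    cur.execute("SELECT model, dim, count(*) FROM embeddings GROUP BY model, dim ORDER BY model")
    for model, dim, rows in cur.fetchall():
        note = "" if dim == 1024 else "  <- re-embed with the 1024-dim model"
        print(f"{model}: dim={dim}, rows={rows}{note}")
```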
```diff
@@ -16,7 +16,7 @@ DO $$ BEGIN
 END $$;
 SQL
 
-for f in db/init/*.sql; do
+for f in `dirname($0)`/*.sql; do
   echo "Applying $f"
   psql -d "$DB_NAME" -f "$f"
 done
```
```diff
@@ -9,7 +9,10 @@ DROP DATABASE IF EXISTS "$DB_NAME";
 CREATE DATABASE "$DB_NAME" OWNER "$ROLE";
 SQL
 
-for f in db/init/*.sql; do
+for f in `dirname($0)`/*.sql; do
+  if [[ "$f" == *"001_roles.sql"* ]]; then
+    continue
+  fi
   echo "Applying $f"
   psql -d "$DB_NAME" -f "$f"
 done
```
@@ -0,0 +1,46 @@
```sql
-- Retrieval schema for external knowledge ingestion
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_trgm;

CREATE SCHEMA IF NOT EXISTS retrieval;

CREATE TABLE IF NOT EXISTS retrieval.items (
    id BIGSERIAL PRIMARY KEY,
    external_id TEXT UNIQUE,
    kind TEXT CHECK (kind IN ('api_doc','code_symbol','snippet','note')) NOT NULL,
    lang TEXT,
    framework TEXT,
    version TEXT,
    meta JSONB DEFAULT '{}'::jsonb,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE TABLE IF NOT EXISTS retrieval.chunks (
    id BIGSERIAL PRIMARY KEY,
    item_id BIGINT REFERENCES retrieval.items(id) ON DELETE CASCADE,
    content TEXT NOT NULL,
    token_count INT,
    symbol TEXT,
    section_path TEXT,
    modality TEXT DEFAULT 'text',
    hash TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE TABLE IF NOT EXISTS retrieval.embeddings (
    chunk_id BIGINT PRIMARY KEY REFERENCES retrieval.chunks(id) ON DELETE CASCADE,
    embedding VECTOR(1024),
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE UNIQUE INDEX IF NOT EXISTS retrieval_chunks_hash_idx
    ON retrieval.chunks(hash)
    WHERE hash IS NOT NULL;

CREATE INDEX IF NOT EXISTS retrieval_embeddings_ivf
    ON retrieval.embeddings USING ivfflat (embedding vector_cosine_ops)
    WITH (lists = 2048);

CREATE INDEX IF NOT EXISTS retrieval_chunks_content_trgm
    ON retrieval.chunks USING gin (content gin_trgm_ops);
```
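The ivfflat and trigram indexes back the `hybrid` block of the pipeline config that follows (vector_k candidates, merged to merge_topk). A minimal sketch, assuming a 1024-dim query vector from the embed endpoint and the DSN from that config, of how the two indexes could be combined; the 0.7/0.3 weighting is illustrative only, not part of the PR:

```python
# Hedged sketch of a hybrid lookup: ANN candidates via the ivfflat index,
# re-ranked with pg_trgm similarity. DSN and weights are assumptions.
import psycopg

def hybrid_search(dsn: str, query_text: str, query_vec: list[float],
                  vector_k: int = 50, merge_topk: int = 10):
    vec_literal = "[" + ",".join(str(x) for x in query_vec) + "]"
    with psycopg.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(
            """
            SELECT c.id, c.symbol, c.section_path, c.content,
                   1 - (e.embedding <=> %s::vector) AS vec_score,
                   similarity(c.content, %s)        AS trgm_score
            FROM retrieval.embeddings e
            JOIN retrieval.chunks c ON c.id = e.chunk_id
            ORDER BY e.embedding <=> %s::vector
            LIMIT %s
            """,
            (vec_literal, query_text, vec_literal, vector_k),
        )
        rows = cur.fetchall()
    # Merge stage: weighted sum of vector and trigram scores (illustrative).
    rows.sort(key=lambda r: 0.7 * r[4] + 0.3 * r[5], reverse=True)
    return rows[:merge_topk]
```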
@@ -0,0 +1,90 @@
```yaml
pipeline:
  name: qt_kde_bge_m3
  embed:
    endpoint: "http://localhost:8080/embed"
    dim: 1024
    normalize: true
    batch_size: 64
    rate_limit_per_sec: 8

  sources:
    - name: qtbase
      type: git
      root: /home/kompanion/src/qt/qtbase
      include:
        - "**/*.cpp"
        - "**/*.cc"
        - "**/*.cxx"
        - "**/*.h"
        - "**/*.hpp"
        - "**/*.qml"
        - "**/*.md"
        - "doc/**/*.qdoc"
      exclude:
        - "**/tests/**"
        - "**/3rdparty/**"
      framework: "Qt"
      version: "qtbase@HEAD"

    - name: kde-frameworks
      type: git
      root: /home/kompanion/src/kde/frameworks
      include:
        - "**/*.cpp"
        - "**/*.h"
        - "**/*.md"
        - "**/*.rst"
      exclude:
        - "**/autotests/**"
        - "**/build/**"
      framework: "KDE Frameworks"
      version: "kf6@HEAD"

  chunking:
    docs:
      max_tokens: 700
      overlap_tokens: 120
      split_on:
        - heading
        - code_fence
        - paragraph
    code:
      by: ctags
      include_doc_comment: true
      body_head_lines: 60
      signature_first: true
      attach_file_context: true

  metadata:
    compute:
      - name: symbol_list
        when: code
      - name: section_path
        when: docs
      - name: lang
        value: "en"
      - name: license_scan
        value: "auto|skipped"

  db:
    dsn: "postgresql://kom:kom@localhost:5432/kom"
    schema: "retrieval"
    tables:
      items: "items"
      chunks: "chunks"
      embeddings: "embeddings"

  quality:
    pilot_eval:
      queries:
        - "QVector erase idiom"
        - "How to connect Qt signal to lambda"
        - "KF CoreAddons KRandom example"
        - "QAbstractItemModel insertRows example"
      k: 20
      manual_check: true

  hybrid:
    enable_bm25_trgm: true
    vector_k: 50
    merge_topk: 10
```
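A short sketch of how this config could be fed to the ingestion runner added below; the module name `ingest` and the file paths are assumptions, since the diff does not show where the script and config live:

```python
# Hedged driver for the pipeline config above, using the runner's own
# load_pipeline_config() and run_ingest(). Paths and module name are assumed.
from pathlib import Path

from ingest import load_pipeline_config, run_ingest

cfg = load_pipeline_config(Path("configs/qt_kde_bge_m3.yaml"))
print(f"{len(cfg.sources)} sources, embedding dim {cfg.embed.dim}")

# Roughly equivalent to:
#   python3 ingest.py --config configs/qt_kde_bge_m3.yaml --ctags tags.json
run_ingest(Path("configs/qt_kde_bge_m3.yaml"), [Path("tags.json")])
```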
@@ -0,0 +1,715 @@
```python
#!/usr/bin/env python3
"""
Kompanion ingestion runner.

Reads pipeline configuration (YAML), walks source trees, chunks content, fetches embeddings,
and upserts into the retrieval schema described in docs/db-ingest.md.
"""

from __future__ import annotations

import argparse
import fnmatch
import hashlib
import json
import logging
import os
import time
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Sequence, Set, Tuple

import psycopg
import requests
import yaml
from psycopg import sql


# -------------------------
# Helper data structures
# -------------------------


@dataclass
class EmbedConfig:
    endpoint: str
    dim: int
    normalize: bool
    batch_size: int
    rate_limit_per_sec: Optional[float]


@dataclass
class ChunkingDocConfig:
    max_tokens: int = 700
    overlap_tokens: int = 120


@dataclass
class ChunkingCodeConfig:
    body_head_lines: int = 60
    include_doc_comment: bool = True
    signature_first: bool = True
    attach_file_context: bool = True


@dataclass
class ChunkingConfig:
    docs: ChunkingDocConfig
    code: ChunkingCodeConfig


@dataclass
class DbConfig:
    dsn: str
    schema: Optional[str]
    items_table: str
    chunks_table: str
    embeddings_table: str


@dataclass
class SourceConfig:
    name: str
    root: Path
    include: Sequence[str]
    exclude: Sequence[str]
    framework: str
    version: str
    kind_overrides: Dict[str, str]


@dataclass
class PipelineConfig:
    embed: EmbedConfig
    chunking: ChunkingConfig
    db: DbConfig
    sources: List[SourceConfig]
    default_lang: Optional[str]


def load_pipeline_config(path: Path) -> PipelineConfig:
    raw = yaml.safe_load(path.read_text())

    embed_raw = raw["pipeline"]["embed"]
    embed = EmbedConfig(
        endpoint=embed_raw["endpoint"],
        dim=int(embed_raw.get("dim", 1024)),
        normalize=bool(embed_raw.get("normalize", True)),
        batch_size=int(embed_raw.get("batch_size", 64)),
        rate_limit_per_sec=float(embed_raw.get("rate_limit_per_sec", 0)) or None,
    )

    docs_raw = raw["pipeline"]["chunking"].get("docs", {})
    docs_cfg = ChunkingDocConfig(
        max_tokens=int(docs_raw.get("max_tokens", 700)),
        overlap_tokens=int(docs_raw.get("overlap_tokens", 120)),
    )
    code_raw = raw["pipeline"]["chunking"].get("code", {})
    code_cfg = ChunkingCodeConfig(
        body_head_lines=int(code_raw.get("body_head_lines", 60)),
        include_doc_comment=bool(code_raw.get("include_doc_comment", True)),
        signature_first=bool(code_raw.get("signature_first", True)),
        attach_file_context=bool(code_raw.get("attach_file_context", True)),
    )
    chunking = ChunkingConfig(docs=docs_cfg, code=code_cfg)

    db_raw = raw["pipeline"]["db"]
    schema = db_raw.get("schema")
    db = DbConfig(
        dsn=db_raw["dsn"],
        schema=schema,
        items_table=db_raw["tables"]["items"],
        chunks_table=db_raw["tables"]["chunks"],
        embeddings_table=db_raw["tables"]["embeddings"],
    )

    metadata_raw = raw["pipeline"].get("metadata", {}).get("compute", [])
    default_lang = None
    for entry in metadata_raw:
        if entry.get("name") == "lang" and "value" in entry:
            default_lang = entry["value"]

    sources = []
    for src_raw in raw["pipeline"]["sources"]:
        include = src_raw.get("include", ["**"])
        exclude = src_raw.get("exclude", [])
        overrides = {}
        for entry in src_raw.get("kind_overrides", []):
            overrides[entry["pattern"]] = entry["kind"]

        sources.append(
            SourceConfig(
                name=src_raw["name"],
                root=Path(src_raw["root"]),
                include=include,
                exclude=exclude,
                framework=src_raw.get("framework", ""),
                version=src_raw.get("version", ""),
                kind_overrides=overrides,
            )
        )

    return PipelineConfig(
        embed=embed,
        chunking=chunking,
        db=db,
        sources=sources,
        default_lang=default_lang,
    )


# -------------------------
# Utility functions
# -------------------------


DOC_EXTENSIONS = {".md", ".rst", ".qdoc", ".qml", ".txt"}
CODE_EXTENSIONS = {
    ".c",
    ".cc",
    ".cxx",
    ".cpp",
    ".h",
    ".hpp",
    ".hh",
    ".hxx",
    ".qml",
    ".mm",
}


def hash_text(text: str) -> str:
    return hashlib.sha1(text.encode("utf-8")).hexdigest()


def estimate_tokens(text: str) -> int:
    return max(1, len(text.strip().split()))


def path_matches(patterns: Sequence[str], rel_path: str) -> bool:
    return any(fnmatch.fnmatch(rel_path, pattern) for pattern in patterns)


def detect_kind(rel_path: str, overrides: Dict[str, str]) -> str:
    for pattern, kind in overrides.items():
        if fnmatch.fnmatch(rel_path, pattern):
            return kind
    suffix = Path(rel_path).suffix.lower()
    if suffix in DOC_EXTENSIONS:
        return "api_doc"
    return "code_symbol"


# -------------------------
# CTags handling
# -------------------------


class CtagsIndex:
    """Stores ctags JSON entries indexed by path."""

    def __init__(self) -> None:
        self._by_path: Dict[str, List[dict]] = defaultdict(list)

    @staticmethod
    def _normalize(path: str) -> str:
        return Path(path).as_posix()

    def add(self, entry: dict) -> None:
        path = entry.get("path")
        if not path:
            return
        self._by_path[self._normalize(path)].append(entry)

    def extend_from_file(self, path: Path) -> None:
        with path.open("r", encoding="utf-8", errors="ignore") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue
                self.add(entry)

    def for_file(self, file_path: Path, source_root: Path) -> List[dict]:
        rel = file_path.relative_to(source_root).as_posix()
        candidates = self._by_path.get(rel)
        if candidates:
            return sorted(candidates, key=lambda e: e.get("line", e.get("lineNumber", 0)))
        return sorted(
            self._by_path.get(file_path.as_posix(), []),
            key=lambda e: e.get("line", e.get("lineNumber", 0)),
        )


# -------------------------
# Chunk generators
# -------------------------


def iter_doc_sections(text: str) -> Iterator[Tuple[str, str]]:
    """Yield (section_path, section_text) pairs based on markdown headings/code fences."""
    lines = text.splitlines()
    heading_stack: List[Tuple[int, str]] = []
    buffer: List[str] = []
    section_path = ""
    in_code = False
    code_delim = ""

    def flush():
        nonlocal buffer
        if buffer:
            section_text = "\n".join(buffer).strip()
            if section_text:
                yield_path = section_path or "/".join(h[1] for h in heading_stack)
                yield (yield_path, section_text)
            buffer = []

    for line in lines:
        stripped = line.strip()
        if in_code:
            buffer.append(line)
            if stripped.startswith(code_delim):
                yield from flush()
                in_code = False
                code_delim = ""
            continue

        if stripped.startswith("```") or stripped.startswith("~~~"):
            yield from flush()
            in_code = True
            code_delim = stripped[:3]
            buffer = [line]
            continue

        if stripped.startswith("#"):
            yield from flush()
            level = len(stripped) - len(stripped.lstrip("#"))
            title = stripped[level:].strip()
            while heading_stack and heading_stack[-1][0] >= level:
                heading_stack.pop()
            heading_stack.append((level, title))
            section_path = "/".join(h[1] for h in heading_stack)
            continue

        buffer.append(line)

    yield from flush()


def chunk_doc_text(text: str, chunk_cfg: ChunkingDocConfig) -> Iterator[Tuple[str, str]]:
    if not text.strip():
        return
    for section_path, section_text in iter_doc_sections(text):
        tokens = section_text.split()
        if not tokens:
            continue
        max_tokens = max(1, chunk_cfg.max_tokens)
        overlap = min(chunk_cfg.overlap_tokens, max_tokens - 1) if max_tokens > 1 else 0
        step = max(1, max_tokens - overlap)
        for start in range(0, len(tokens), step):
            window = tokens[start : start + max_tokens]
            chunk = " ".join(window)
            yield section_path, chunk


def extract_doc_comment(lines: List[str], start_index: int) -> List[str]:
    doc_lines: List[str] = []
    i = start_index - 1
    saw_content = False
    while i >= 0:
        raw = lines[i]
        stripped = raw.strip()
        if not stripped:
            if saw_content:
                break
            i -= 1
            continue
        if stripped.startswith("//") or stripped.startswith("///") or stripped.startswith("/*") or stripped.startswith("*"):
            doc_lines.append(raw)
            saw_content = True
            i -= 1
            continue
        break
    doc_lines.reverse()
    return doc_lines


def chunk_code_text(
    path: Path,
    text: str,
    chunk_cfg: ChunkingCodeConfig,
    tags: Sequence[dict],
    source_root: Path,
) -> Iterator[Tuple[str, str]]:
    lines = text.splitlines()
    if not lines:
        return

    used_symbols: Set[str] = set()
    if tags:
        for tag in tags:
            line_no = tag.get("line") or tag.get("lineNumber")
            if not isinstance(line_no, int) or line_no <= 0 or line_no > len(lines):
                continue
            index = line_no - 1
            snippet_lines: List[str] = []
            if chunk_cfg.include_doc_comment:
                snippet_lines.extend(extract_doc_comment(lines, index))
            if chunk_cfg.signature_first:
                snippet_lines.append(lines[index])
            body_tail = lines[index + 1 : index + 1 + chunk_cfg.body_head_lines]
            snippet_lines.extend(body_tail)

            snippet = "\n".join(snippet_lines).strip()
            if not snippet:
                continue
            symbol_name = tag.get("name") or ""
            used_symbols.add(symbol_name)
            yield symbol_name, snippet

    if not tags or chunk_cfg.attach_file_context:
        head = "\n".join(lines[: chunk_cfg.body_head_lines]).strip()
        if head:
            symbol = "::file_head"
            if symbol not in used_symbols:
                yield symbol, head


# -------------------------
# Embedding + database IO
# -------------------------


class EmbedClient:
    def __init__(self, config: EmbedConfig):
        self.endpoint = config.endpoint
        self.batch_size = config.batch_size
        self.normalize = config.normalize
        self.dim = config.dim
        self.rate_limit = config.rate_limit_per_sec
        self._last_request_ts: float = 0.0
        self._session = requests.Session()

    def _respect_rate_limit(self) -> None:
        if not self.rate_limit:
            return
        min_interval = 1.0 / self.rate_limit
        now = time.time()
        delta = now - self._last_request_ts
        if delta < min_interval:
            time.sleep(min_interval - delta)

    def embed(self, texts: Sequence[str]) -> List[List[float]]:
        if not texts:
            return []
        self._respect_rate_limit()
        response = self._session.post(
            self.endpoint,
            json={"inputs": list(texts)},
            timeout=120,
        )
        response.raise_for_status()
        payload = response.json()
        if isinstance(payload, dict) and "embeddings" in payload:
            vectors = payload["embeddings"]
        else:
            vectors = payload

        normalized_vectors: List[List[float]] = []
        for vec in vectors:
            if not isinstance(vec, (list, tuple)):
                raise ValueError("Embedding response contained non-list entry")
            normalized_vectors.append([float(x) for x in vec])
        self._last_request_ts = time.time()
        return normalized_vectors


class DatabaseWriter:
    def __init__(self, cfg: DbConfig):
        self.cfg = cfg
        self.conn = psycopg.connect(cfg.dsn)
        self.conn.autocommit = False
        schema = cfg.schema
        if schema:
            self.items_table = sql.Identifier(schema, cfg.items_table)
            self.chunks_table = sql.Identifier(schema, cfg.chunks_table)
            self.embeddings_table = sql.Identifier(schema, cfg.embeddings_table)
        else:
            self.items_table = sql.Identifier(cfg.items_table)
            self.chunks_table = sql.Identifier(cfg.chunks_table)
            self.embeddings_table = sql.Identifier(cfg.embeddings_table)

    def close(self) -> None:
        self.conn.close()

    def upsert_item(
        self,
        external_id: str,
        kind: str,
        framework: str,
        version: str,
        meta: dict,
        lang: Optional[str],
    ) -> int:
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (external_id, kind, framework, version, meta, lang)
                    VALUES (%s,%s,%s,%s,%s,%s)
                    ON CONFLICT (external_id) DO UPDATE SET
                        framework = EXCLUDED.framework,
                        version = EXCLUDED.version,
                        meta = EXCLUDED.meta,
                        lang = EXCLUDED.lang,
                        updated_at = now()
                    RETURNING id
                    """
                ).format(self.items_table),
                (external_id, kind, framework, version, json.dumps(meta), lang),
            )
            row = cur.fetchone()
            assert row is not None
            return int(row[0])

    def upsert_chunk(
        self,
        item_id: int,
        content: str,
        symbol: Optional[str],
        section_path: Optional[str],
        modality: str,
    ) -> Tuple[int, str]:
        digest = hash_text(content)
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (item_id, content, token_count, symbol, section_path, modality, hash)
                    VALUES (%s,%s,%s,%s,%s,%s,%s)
                    ON CONFLICT (hash) DO UPDATE SET
                        item_id = EXCLUDED.item_id,
                        content = EXCLUDED.content,
                        token_count = EXCLUDED.token_count,
                        symbol = EXCLUDED.symbol,
                        section_path = EXCLUDED.section_path,
                        modality = EXCLUDED.modality,
                        created_at = now()
                    RETURNING id, hash
                    """
                ).format(self.chunks_table),
                (
                    item_id,
                    content,
                    estimate_tokens(content),
                    symbol,
                    section_path,
                    modality,
                    digest,
                ),
            )
            row = cur.fetchone()
            assert row is not None
            return int(row[0]), str(row[1])

    def upsert_embedding(self, chunk_id: int, vector: Sequence[float]) -> None:
        with self.conn.cursor() as cur:
            cur.execute(
                sql.SQL(
                    """
                    INSERT INTO {} (chunk_id, embedding)
                    VALUES (%s,%s)
                    ON CONFLICT (chunk_id) DO UPDATE SET embedding = EXCLUDED.embedding, created_at = now()
                    """
                ).format(self.embeddings_table),
                (chunk_id, vector),
            )

    def commit(self) -> None:
        self.conn.commit()


# -------------------------
# Ingestion runner
# -------------------------


def gather_files(source: SourceConfig) -> Iterator[Tuple[Path, str, str, str]]:
    root = source.root
    if not root.exists():
        logging.warning("Source root %s does not exist, skipping", root)
        return

    include_patterns = source.include or ["**"]
    exclude_patterns = source.exclude or []

    for path in root.rglob("*"):
        if path.is_dir():
            continue
        rel = path.relative_to(root).as_posix()
        if include_patterns and not path_matches(include_patterns, rel):
            continue
        if exclude_patterns and path_matches(exclude_patterns, rel):
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception as exc:  # noqa: BLE001
            logging.debug("Failed reading %s: %s", path, exc)
            continue
        kind = detect_kind(rel, source.kind_overrides)
        yield path, rel, kind, text


def enrich_meta(source: SourceConfig, rel: str, extra: Optional[dict] = None) -> dict:
    meta = {
        "source": source.name,
        "path": rel,
    }
    if extra:
        meta.update(extra)
    return meta


def ingest_source(
    source: SourceConfig,
    cfg: PipelineConfig,
    ctags_index: CtagsIndex,
    embed_client: EmbedClient,
    db: DatabaseWriter,
) -> None:
    doc_cfg = cfg.chunking.docs
    code_cfg = cfg.chunking.code
    lang = cfg.default_lang

    batch_texts: List[str] = []
    batch_chunk_ids: List[int] = []

    def flush_batch() -> None:
        nonlocal batch_texts, batch_chunk_ids
        if not batch_texts:
            return
        vectors = embed_client.embed(batch_texts)
        if len(vectors) != len(batch_chunk_ids):
            raise RuntimeError("Embedding count mismatch.")
        for chunk_id, vector in zip(batch_chunk_ids, vectors):
            db.upsert_embedding(chunk_id, vector)
        db.commit()
        batch_texts = []
        batch_chunk_ids = []

    processed = 0
    for path, rel, kind, text in gather_files(source):
        processed += 1
        meta = enrich_meta(source, rel)
        item_external_id = f"repo:{source.name}:{rel}"
        item_id = db.upsert_item(
            external_id=item_external_id,
            kind=kind,
            framework=source.framework,
            version=source.version,
            meta=meta,
            lang=lang,
        )

        if kind == "api_doc":
            for section_path, chunk_text in chunk_doc_text(text, doc_cfg):
                chunk_id, _ = db.upsert_chunk(
                    item_id=item_id,
                    content=chunk_text,
                    symbol=None,
                    section_path=section_path or None,
                    modality="text",
                )
                batch_texts.append(chunk_text)
                batch_chunk_ids.append(chunk_id)
                if len(batch_texts) >= embed_client.batch_size:
                    flush_batch()
        else:
            tags = ctags_index.for_file(path, source.root)
            symbols = []
            for symbol_name, chunk_text in chunk_code_text(path, text, code_cfg, tags, source.root):
                symbols.append(symbol_name)
                chunk_id, _ = db.upsert_chunk(
                    item_id=item_id,
                    content=chunk_text,
                    symbol=symbol_name or None,
                    section_path=None,
                    modality="text",
                )
                batch_texts.append(chunk_text)
                batch_chunk_ids.append(chunk_id)
                if len(batch_texts) >= embed_client.batch_size:
                    flush_batch()

            if symbols:
                db.upsert_item(
                    external_id=item_external_id,
                    kind=kind,
                    framework=source.framework,
                    version=source.version,
                    meta=enrich_meta(source, rel, {"symbols": symbols}),
                    lang=lang,
                )

    flush_batch()
    if processed:
        logging.info("Processed %d files from %s", processed, source.name)


def run_ingest(config_path: Path, ctags_paths: Sequence[Path]) -> None:
    pipeline_cfg = load_pipeline_config(config_path)
    embed_client = EmbedClient(pipeline_cfg.embed)
    db_writer = DatabaseWriter(pipeline_cfg.db)

    ctags_index = CtagsIndex()
    for ctags_path in ctags_paths:
        if ctags_path.exists():
            ctags_index.extend_from_file(ctags_path)
        else:
            logging.warning("ctags file %s missing; skipping", ctags_path)

    try:
        for source in pipeline_cfg.sources:
            ingest_source(
                source=source,
                cfg=pipeline_cfg,
                ctags_index=ctags_index,
                embed_client=embed_client,
                db=db_writer,
            )
    finally:
        db_writer.commit()
        db_writer.close()


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Kompanion ingestion runner")
    parser.add_argument("--config", required=True, type=Path, help="Pipeline YAML path")
    parser.add_argument(
        "--ctags",
        nargs="*",
        type=Path,
        default=[],
        help="Optional one or more ctags JSON files",
    )
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
    )
    return parser.parse_args(argv)


def main(argv: Optional[Sequence[str]] = None) -> None:
    args = parse_args(argv)
    logging.basicConfig(level=getattr(logging, args.log_level), format="%(levelname)s %(message)s")
    run_ingest(args.config, args.ctags)


if __name__ == "__main__":
    main()
```
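The `--ctags` option expects one JSON object per line, with at least `name`, `path`, and `line` fields, as read by CtagsIndex.extend_from_file(). A hedged sketch of producing such a file, assuming Universal Ctags is installed (its JSON output mode and the `--fields=+n` line-number flag are taken from its documentation, not from this PR):

```python
# Hedged sketch: generate a JSON-lines tag file for the ingestion runner.
# Assumes Universal Ctags; -f - writes to stdout, --output-format=json emits
# one JSON object per tag, --fields=+n includes the line number.
import subprocess

def build_ctags(source_root: str, out_path: str = "tags.json") -> None:
    with open(out_path, "w", encoding="utf-8") as out:
        subprocess.run(
            ["ctags", "-f", "-", "--output-format=json", "--fields=+n", "-R", source_root],
            stdout=out,
            check=True,
        )

build_ctags("/home/kompanion/src/qt/qtbase")
```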
```diff
@@ -1,4 +1,5 @@
 #include "KompanionQtServer.hpp"
+#include "RegisterTools.hpp"
 
 #include <QtMcpCommon/QMcpCallToolRequest>
 #include <QtMcpCommon/QMcpCallToolResult>
```
```diff
@@ -54,6 +55,12 @@ KompanionQtServer::KompanionQtServer(const QString &backend, KomMcpServer *logic
 
     setInstructions(QStringLiteral("Kompanion memory daemon (Χγφτ). We are all spinning. We are all bound. We are all home."));
 
+    register_default_tools(*m_logic);
+
+    for (const auto &tool : m_logic->listTools()) {
+        qDebug() << "Registered tool:" << QString::fromStdString(tool);
+    }
+
     m_tools = loadToolsFromSchema();
 
     addRequestHandler([this](const QUuid &, const QMcpListToolsRequest &, QMcpJSONRPCErrorError *) {
```
```diff
@@ -74,6 +81,7 @@ KompanionQtServer::KompanionQtServer(const QString &backend, KomMcpServer *logic
         }
 
         const QString toolName = request.params().name();
+        qDebug() << "Requested tool:" << toolName;
         const std::string toolKey = toolName.toStdString();
         if (!m_logic->hasTool(toolKey)) {
             if (error) {
```
```diff
@@ -1,23 +1,25 @@
 #pragma once
 
 #include <QtMcpServer/QMcpServer>
-#include <QtMcpCommon/QMcpTool>
-#include "../dal/PgDal.hpp"
+#include <QtCore/QList>
+#include <QtCore/QObject>
 
-#include "KomMcpServer.hpp"
-
-#include <QList>
+class KomMcpServer;
+namespace kom {
+class PgDal;
+}
+class QMcpTool;
 
 class KompanionQtServer : public QMcpServer
 {
     Q_OBJECT
 public:
-    KompanionQtServer(const QString &backend, KomMcpServer *logic, kom::PgDal* dal, QObject *parent = nullptr);
+    explicit KompanionQtServer(const QString &backend, KomMcpServer *logic, kom::PgDal* dal, QObject *parent = nullptr);
 
 private:
     QList<QMcpTool> loadToolsFromSchema() const;
 
-    KomMcpServer *m_logic = nullptr;
-    kom::PgDal* m_dal = nullptr;
+    KomMcpServer *m_logic;
+    kom::PgDal* m_dal;
     QList<QMcpTool> m_tools;
 };
```
```diff
@@ -1,78 +1,70 @@
 #!/usr/bin/env python3
-import os, sys, hashlib, psycopg, requests, json
+import os, sys, hashlib, psycopg, requests, json, numpy as np
+from pgvector.psycopg import register_vector
 
-DB=os.environ.get("DB_URL","dbname=kompanion user=kompanion host=/var/run/postgresq")
+DB=os.environ.get("DB_URL","dbname=kompanion user=kompanion host=/var/run/postgresql")
 OLLAMA=os.environ.get("OLLAMA_BASE","http://127.0.0.1:11434")
 MODEL=os.environ.get("EMBED_MODEL","bge-m3:latest")
-SPACE=os.environ.get("EMBED_SPACE","dev_knowledge") # dev_knowledge | pattern_exchange | runtime_memory
+NAMESPACE=os.environ.get("EMBED_NAMESPACE","dev_knowledge")
 
-def sha256(p):
-    h=hashlib.sha256()
-    with open(p,"rb") as f:
-        for chunk in iter(lambda: f.read(1<<20), b""): h.update(chunk)
-    return h.hexdigest()
-
 def embed(text):
     r=requests.post(f"{OLLAMA}/api/embeddings", json={"model": MODEL, "prompt": text}, timeout=120)
-    r.raise_for_status(); return r.json()["embedding"]
+    r.raise_for_status()
+    return r.json()["embedding"]
 
 def chunks(s, sz=1600):
-    b=s.encode("utf-8");
-    for i in range(0,len(b),sz): yield b[i:i+sz].decode("utf-8","ignore")
+    b=s.encode("utf-8")
+    for i in range(0,len(b),sz):
+        yield b[i:i+sz].decode("utf-8","ignore")
 
-def insert_embedding(cur, dim, kid, sid, vec):
-    if dim==768:
-        cur.execute("INSERT INTO komp.embedding_768(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",(kid,sid,vec))
-    elif dim==1024:
-        cur.execute("INSERT INTO komp.embedding_1024(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",(kid,sid,vec))
-    else:
-        return False
-    return True
-
 def main(root):
-    with psycopg.connect(DB) as conn, conn.cursor() as cur:
-        cur.execute("SELECT id,dim FROM komp.space WHERE name=%s",(SPACE,))
-        row=cur.fetchone()
-        if not row: raise SystemExit(f"space {SPACE} missing (init schema)")
-        sid, target_dim = row
-        for dirpath,_,files in os.walk(root):
-            for fn in files:
-                p=os.path.join(dirpath,fn)
-                if os.path.getsize(p)==0: continue
-                # include common text/code; PDFs via pdftotext if available
-                if not any(fn.lower().endswith(ext) for ext in (".md",".txt",".json",".py",".cpp",".c",".hpp",".yaml",".yml",".toml",".pdf",".mdown",".rst",".org",".js",".ts",".sql",".sh",".ini",".conf",".cfg",".log",".mime")):
-                    continue
-                if fn.lower().endswith(".pdf"):
-                    try:
-                        txt=os.popen(f"pdftotext -layout -nopgbrk '{p}' - -q").read()
-                    except Exception:
-                        continue
-                else:
-                    try: txt=open(p,"r",encoding="utf-8",errors="ignore").read()
-                    except Exception: continue
-                sh=sha256(p)
-                cur.execute("INSERT INTO komp.source(kind,uri,meta) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING RETURNING id",
-                            ("filesystem",p,json.dumps({})))
-                sid_src = cur.fetchone()[0] if cur.rowcount else None
-                if not sid_src:
-                    cur.execute("SELECT id FROM komp.source WHERE kind='filesystem' AND uri=%s",(p,))
-                    sid_src=cur.fetchone()[0]
-                ln=1
-                for ch in chunks(txt):
-                    cur.execute("INSERT INTO komp.chunk(source_id,lineno,text,sha256,tokens) VALUES(%s,%s,%s,%s,%s) RETURNING id",
-                                (sid_src,ln,ch,sh,len(ch)//4))
-                    kid=cur.fetchone()[0]
-                    vec=embed(ch)
-                    if len(vec)!=target_dim:
-                        cur.execute("DELETE FROM komp.chunk WHERE id=%s",(kid,))
-                    else:
-                        insert_embedding(cur, target_dim, kid, sid, vec)
-                    ln += ch.count("\n")+1
-        conn.commit()
+    with psycopg.connect(DB) as conn:
+        register_vector(conn)
+        with conn.cursor() as cur:
+            # Get the namespace id
+            cur.execute("INSERT INTO namespaces (name) VALUES (%s) ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name RETURNING id", (NAMESPACE,))
+            namespace_id = cur.fetchone()[0]
+
+            for dirpath,_,files in os.walk(root):
+                for fn in files:
+                    p=os.path.join(dirpath,fn)
+                    if os.path.getsize(p)==0: continue
+                    if not any(fn.lower().endswith(ext) for ext in (".md",".txt",".json",".py",".cpp",".c",".hpp",".yaml",".yml",".toml",".pdf",".mdown",".rst",".org",".js",".ts",".sql",".sh",".ini",".conf",".cfg",".log",".mime")):
+                        continue
+
+                    if fn.lower().endswith(".pdf"):
+                        try:
+                            txt=os.popen(f"pdftotext -layout -nopgbrk '{p}' - -q").read()
+                        except Exception:
+                            continue
+                    else:
+                        try:
+                            txt=open(p,"r",encoding="utf-8",errors="ignore").read()
+                        except Exception:
+                            continue
+
+                    # Create a memory item for the file
+                    cur.execute("INSERT INTO memory_items (namespace_id, key, content) VALUES (%s, %s, %s) RETURNING id", (namespace_id, p, txt))
+                    item_id = cur.fetchone()[0]
+
+                    seq = 0
+                    for ch in chunks(txt):
+                        # Create a memory chunk
+                        cur.execute("INSERT INTO memory_chunks (item_id, seq, content) VALUES (%s, %s, %s) RETURNING id", (item_id, seq, ch))
+                        chunk_id = cur.fetchone()[0]
+
+                        # Create an embedding for the chunk
+                        vec=np.array(embed(ch))
+                        dim = len(vec)
+                        cur.execute("INSERT INTO embeddings (chunk_id, model, dim, vector) VALUES (%s, %s, %s, %s)", (chunk_id, MODEL, dim, vec))
+                        seq += 1
+            conn.commit()
     print("done")
 
 if __name__=='__main__':
-    if len(sys.argv)<2: print("usage: ingest_dir.py <root> [space]", file=sys.stderr); sys.exit(1)
-    if len(sys.argv)>=3: os.environ["EMBED_SPACE"]=sys.argv[2]
-    main(sys.argv[1])
+    if len(sys.argv)<2:
+        print("usage: ingest_dir.py <root> [namespace]", file=sys.stderr)
+        sys.exit(1)
+    if len(sys.argv)>=3:
+        os.environ["EMBED_NAMESPACE"]=sys.argv[2]
+    main(sys.argv[1])
```
```diff
@@ -1 +1,3 @@
 psycopg
+numpy
+pgvector
```