#!/usr/bin/env python3
"""Ingest a directory tree into the memory tables together with embeddings.

Every supported file under the given root is stored as one row in
``memory_items``, split into chunks stored in ``memory_chunks``, and each
chunk is embedded through the HTTP endpoint at ``OLLAMA_BASE`` and stored in
``embeddings``.

Configuration is read from the environment at import time:
``DB_URL``, ``OLLAMA_BASE``, ``EMBED_MODEL`` and ``EMBED_NAMESPACE``.
"""
import hashlib
import json
import os
import subprocess
import sys

import numpy as np
import psycopg
import requests
from pgvector.psycopg import register_vector

DB = os.environ.get("DB_URL", "dbname=kompanion user=kompanion host=/var/run/postgresql")
OLLAMA = os.environ.get("OLLAMA_BASE", "http://127.0.0.1:11434")
MODEL = os.environ.get("EMBED_MODEL", "bge-m3:latest")
NAMESPACE = os.environ.get("EMBED_NAMESPACE", "dev_knowledge")

# File name suffixes (compared case-insensitively) that are ingested.
INGEST_EXTENSIONS = (
    ".md", ".txt", ".json", ".py", ".cpp", ".c", ".hpp", ".yaml", ".yml",
    ".toml", ".pdf", ".mdown", ".rst", ".org", ".js", ".ts", ".sql", ".sh",
    ".ini", ".conf", ".cfg", ".log", ".mime",
)


def embed(text):
    """Return the embedding of *text* as produced by the configured model.

    Posts to ``{OLLAMA}/api/embeddings`` and returns the ``"embedding"``
    field of the JSON response.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    r = requests.post(
        f"{OLLAMA}/api/embeddings",
        json={"model": MODEL, "prompt": text},
        timeout=120,
    )
    r.raise_for_status()
    return r.json()["embedding"]


def chunks(s, sz=1600):
    """Yield consecutive pieces of *s*, each at most *sz* UTF-8 bytes long.

    Cuts are moved back to the nearest character boundary so that no
    multi-byte character is split (and lost) between two chunks. If *sz* is
    smaller than a single character, that character is emitted whole.
    Joining the yielded pieces reproduces *s* exactly.

    Raises:
        ValueError: if *sz* is not positive.
    """
    if sz <= 0:
        raise ValueError("sz must be a positive number of bytes")
    data = s.encode("utf-8")
    total = len(data)
    start = 0
    while start < total:
        end = min(start + sz, total)
        # Bytes of the form 0b10xxxxxx continue a character; never cut there.
        while start < end < total and (data[end] & 0xC0) == 0x80:
            end -= 1
        if end == start:
            # Budget is smaller than this character: take the whole character.
            end = start + 1
            while end < total and (data[end] & 0xC0) == 0x80:
                end += 1
        yield data[start:end].decode("utf-8")
        start = end


def _read_text(path):
    """Return the text content of *path*, or ``None`` if it cannot be obtained.

    PDFs are converted with the external ``pdftotext`` tool; everything else
    is read as UTF-8 with undecodable bytes dropped. Failures are reported on
    stderr and never raised, so one bad file does not abort the whole run.
    """
    if path.lower().endswith(".pdf"):
        try:
            # Argument list (no shell) so odd characters in file names are
            # passed through verbatim instead of being interpreted.
            proc = subprocess.run(
                ["pdftotext", "-layout", "-nopgbrk", "-q", path, "-"],
                capture_output=True,
                check=False,
            )
        except OSError as exc:
            print(f"skip {path}: cannot run pdftotext: {exc}", file=sys.stderr)
            return None
        if proc.returncode != 0:
            print(
                f"skip {path}: pdftotext exited with {proc.returncode}",
                file=sys.stderr,
            )
            return None
        return proc.stdout.decode("utf-8", "ignore")

    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()
    except OSError as exc:
        print(f"skip {path}: {exc}", file=sys.stderr)
        return None


def _ingest_file(cur, namespace_id, path, text):
    """Insert one memory item for *path* plus its chunks and their embeddings."""
    cur.execute(
        "INSERT INTO memory_items (namespace_id, key, content) VALUES (%s, %s, %s) RETURNING id",
        (namespace_id, path, text),
    )
    item_id = cur.fetchone()[0]

    for seq, piece in enumerate(chunks(text)):
        cur.execute(
            "INSERT INTO memory_chunks (item_id, seq, content) VALUES (%s, %s, %s) RETURNING id",
            (item_id, seq, piece),
        )
        chunk_id = cur.fetchone()[0]

        vec = np.array(embed(piece))
        cur.execute(
            "INSERT INTO embeddings (chunk_id, model, dim, vector) VALUES (%s, %s, %s, %s)",
            (chunk_id, MODEL, len(vec), vec),
        )


def main(root, namespace=None):
    """Walk *root* and ingest every supported, non-empty file.

    Args:
        root: directory to walk recursively.
        namespace: name of the namespace row to attach items to; falls back
            to the module-level ``NAMESPACE`` when omitted.

    All inserts are committed together after the walk finishes; an exception
    (for example from the embedding endpoint) propagates to the caller.
    """
    ns_name = namespace or NAMESPACE
    with psycopg.connect(DB) as conn:
        register_vector(conn)
        with conn.cursor() as cur:
            # Upsert that always returns the id, whether or not the row existed.
            cur.execute(
                "INSERT INTO namespaces (name) VALUES (%s) ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name RETURNING id",
                (ns_name,),
            )
            namespace_id = cur.fetchone()[0]

            for dirpath, _, files in os.walk(root):
                for fn in files:
                    if not fn.lower().endswith(INGEST_EXTENSIONS):
                        continue
                    path = os.path.join(dirpath, fn)
                    try:
                        if os.path.getsize(path) == 0:
                            continue
                    except OSError as exc:
                        # e.g. broken symlink or file removed during the walk
                        print(f"skip {path}: {exc}", file=sys.stderr)
                        continue

                    text = _read_text(path)
                    if text is None:
                        continue
                    _ingest_file(cur, namespace_id, path, text)
        conn.commit()
    print("done")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("usage: ingest_dir.py <root> [namespace]", file=sys.stderr)
        sys.exit(1)
    # NAMESPACE was already read from the environment at import time, so the
    # command-line override has to be handed to main() explicitly.
    main(sys.argv[1], sys.argv[2] if len(sys.argv) >= 3 else None)