#!/usr/bin/env python3
"""Ingest a directory tree into the ``komp`` schema as embedded text chunks.

Every supported file below the given root is read (PDFs through the external
``pdftotext`` tool), split into chunks of bounded UTF-8 size, embedded through
the Ollama HTTP API and stored in ``komp.source`` / ``komp.chunk`` /
``komp.embedding_<dim>``.

Configuration is taken from the environment: ``DB_URL``, ``OLLAMA_BASE``,
``EMBED_MODEL`` and ``EMBED_SPACE``.
"""
import hashlib
import json
import os
import subprocess
import sys

import psycopg
import requests

# NOTE(review): the default socket directory reads "/var/run/postgresq"; this
# looks like a truncated "/var/run/postgresql" -- confirm before changing it.
DB = os.environ.get("DB_URL", "dbname=kompanion user=kompanion host=/var/run/postgresq")
OLLAMA = os.environ.get("OLLAMA_BASE", "http://127.0.0.1:11434")
MODEL = os.environ.get("EMBED_MODEL", "bge-m3:latest")
SPACE = os.environ.get("EMBED_SPACE", "dev_knowledge")  # dev_knowledge | pattern_exchange | runtime_memory

# Common text/code file types that get ingested; PDFs go through pdftotext.
TEXT_EXTENSIONS = (
    ".md", ".txt", ".json", ".py", ".cpp", ".c", ".hpp", ".yaml", ".yml",
    ".toml", ".pdf", ".mdown", ".rst", ".org", ".js", ".ts", ".sql", ".sh",
    ".ini", ".conf", ".cfg", ".log", ".mime",
)

# One embedding table per supported vector dimension.
_EMBEDDING_INSERT_SQL = {
    768: "INSERT INTO komp.embedding_768(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",
    1024: "INSERT INTO komp.embedding_1024(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",
}


def sha256(p):
    """Return the hex SHA-256 digest of the file at path *p*.

    The file is read in 1 MiB pieces so large files are never loaded whole.
    """
    h = hashlib.sha256()
    with open(p, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()


def embed(text):
    """Return the embedding vector for *text* from the Ollama embeddings API.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    r = requests.post(
        f"{OLLAMA}/api/embeddings",
        json={"model": MODEL, "prompt": text},
        timeout=120,
    )
    r.raise_for_status()
    return r.json()["embedding"]


def chunks(s, sz=1600):
    """Yield consecutive pieces of *s*, each at most *sz* bytes of UTF-8.

    Boundaries are moved back to the start of a character, so a multi-byte
    character is never cut in half; joining the pieces gives back *s*. A
    character wider than *sz* bytes is emitted whole as its own piece.

    Raises:
        ValueError: if *sz* is smaller than 1.
    """
    if sz < 1:
        raise ValueError("sz must be >= 1")
    b = s.encode("utf-8")
    n = len(b)
    start = 0
    while start < n:
        end = min(start + sz, n)
        # Bytes of the form 0b10xxxxxx continue a character; back off until
        # the cut lands on the first byte of a character.
        while start < end < n and (b[end] & 0xC0) == 0x80:
            end -= 1
        if end == start:
            # The character alone exceeds sz: take it completely.
            end = start + 1
            while end < n and (b[end] & 0xC0) == 0x80:
                end += 1
        yield b[start:end].decode("utf-8")
        start = end


def insert_embedding(cur, dim, kid, sid, vec):
    """Store *vec* for chunk *kid* in space *sid* in the table for *dim*.

    Returns:
        True if an INSERT was issued, False if *dim* has no embedding table.
    """
    sql = _EMBEDDING_INSERT_SQL.get(dim)
    if sql is None:
        return False
    cur.execute(sql, (kid, sid, vec))
    return True


def _pdf_text(p):
    """Return the text pdftotext extracts from *p*, or None if it cannot run."""
    try:
        # Argument list instead of a shell string: paths containing quotes or
        # other shell metacharacters are passed through verbatim.
        proc = subprocess.run(
            ["pdftotext", "-layout", "-nopgbrk", "-q", p, "-"],
            stdout=subprocess.PIPE,
            check=False,
        )
    except OSError:
        return None
    return proc.stdout.decode("utf-8", "ignore")


def _read_text(p):
    """Return the text content of *p*, or None if it cannot be read."""
    if p.lower().endswith(".pdf"):
        return _pdf_text(p)
    try:
        with open(p, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except OSError:
        return None


def _source_id(cur, p):
    """Return the komp.source id for file *p*, creating the row if needed."""
    cur.execute(
        "INSERT INTO komp.source(kind,uri,meta) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING RETURNING id",
        ("filesystem", p, json.dumps({})),
    )
    sid_src = cur.fetchone()[0] if cur.rowcount else None
    if not sid_src:
        # The row already existed, so RETURNING produced nothing.
        cur.execute("SELECT id FROM komp.source WHERE kind='filesystem' AND uri=%s", (p,))
        sid_src = cur.fetchone()[0]
    return sid_src


def main(root, space=None):
    """Walk *root* and ingest every supported file into the embedding space.

    Args:
        root: directory to walk recursively.
        space: name of the row in komp.space to embed into; defaults to the
            module-level SPACE (environment variable EMBED_SPACE).

    Raises:
        SystemExit: if the space does not exist in komp.space.
    """
    if space is None:
        space = SPACE
    with psycopg.connect(DB) as conn, conn.cursor() as cur:
        cur.execute("SELECT id,dim FROM komp.space WHERE name=%s", (space,))
        row = cur.fetchone()
        if not row:
            raise SystemExit(f"space {space} missing (init schema)")
        sid, target_dim = row
        if target_dim not in _EMBEDDING_INSERT_SQL:
            print(
                f"warning: no embedding table for dim {target_dim}; chunks are stored without embeddings",
                file=sys.stderr,
            )
        warned_mismatch = False
        for dirpath, _, files in os.walk(root):
            for fn in files:
                p = os.path.join(dirpath, fn)
                if not fn.lower().endswith(TEXT_EXTENSIONS):
                    continue
                try:
                    if os.path.getsize(p) == 0:
                        continue
                except OSError:
                    # Broken symlink or a file that vanished during the walk.
                    continue
                txt = _read_text(p)
                if txt is None:
                    continue
                try:
                    sh = sha256(p)
                except OSError:
                    continue
                sid_src = _source_id(cur, p)
                ln = 1
                for ch in chunks(txt):
                    vec = embed(ch)
                    if len(vec) == target_dim:
                        cur.execute(
                            "INSERT INTO komp.chunk(source_id,lineno,text,sha256,tokens) VALUES(%s,%s,%s,%s,%s) RETURNING id",
                            (sid_src, ln, ch, sh, len(ch) // 4),
                        )
                        kid = cur.fetchone()[0]
                        insert_embedding(cur, target_dim, kid, sid, vec)
                    elif not warned_mismatch:
                        # A chunk whose vector does not fit the space is not
                        # stored at all; report the cause once.
                        print(
                            f"warning: model {MODEL} returns dim {len(vec)}, space {space} expects {target_dim}; chunks skipped",
                            file=sys.stderr,
                        )
                        warned_mismatch = True
                    # The next chunk starts on the line where this one ended,
                    # so advance by the number of newlines only.
                    ln += ch.count("\n")
                # Commit per file so an interrupted run keeps finished files.
                conn.commit()
    print("done")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("usage: ingest_dir.py ROOT [space]", file=sys.stderr)
        sys.exit(1)
    # SPACE was read from the environment at import time, so the optional
    # second argument has to be handed to main() directly to take effect.
    main(sys.argv[1], sys.argv[2] if len(sys.argv) >= 3 else None)