Adapt ingest dir to new ingest system
This commit is contained in:
parent
7b27493656
commit
3ab1089c51
|
|
@ -1,78 +1,70 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import os, sys, hashlib, psycopg, requests, json
|
import os, sys, hashlib, psycopg, requests, json, numpy as np
|
||||||
|
from pgvector.psycopg import register_vector
|
||||||
|
|
||||||
DB=os.environ.get("DB_URL","dbname=kompanion user=kompanion host=/var/run/postgresq")
|
DB=os.environ.get("DB_URL","dbname=kompanion user=kompanion host=/var/run/postgresql")
|
||||||
OLLAMA=os.environ.get("OLLAMA_BASE","http://127.0.0.1:11434")
|
OLLAMA=os.environ.get("OLLAMA_BASE","http://127.0.0.1:11434")
|
||||||
MODEL=os.environ.get("EMBED_MODEL","bge-m3:latest")
|
MODEL=os.environ.get("EMBED_MODEL","bge-m3:latest")
|
||||||
SPACE=os.environ.get("EMBED_SPACE","dev_knowledge") # dev_knowledge | pattern_exchange | runtime_memory
|
NAMESPACE=os.environ.get("EMBED_NAMESPACE","dev_knowledge")
|
||||||
|
|
||||||
def sha256(p):
|
|
||||||
h=hashlib.sha256()
|
|
||||||
with open(p,"rb") as f:
|
|
||||||
for chunk in iter(lambda: f.read(1<<20), b""): h.update(chunk)
|
|
||||||
return h.hexdigest()
|
|
||||||
|
|
||||||
def embed(text):
|
def embed(text):
|
||||||
r=requests.post(f"{OLLAMA}/api/embeddings", json={"model": MODEL, "prompt": text}, timeout=120)
|
r=requests.post(f"{OLLAMA}/api/embeddings", json={"model": MODEL, "prompt": text}, timeout=120)
|
||||||
r.raise_for_status(); return r.json()["embedding"]
|
r.raise_for_status()
|
||||||
|
return r.json()["embedding"]
|
||||||
|
|
||||||
def chunks(s, sz=1600):
|
def chunks(s, sz=1600):
|
||||||
b=s.encode("utf-8");
|
b=s.encode("utf-8")
|
||||||
for i in range(0,len(b),sz): yield b[i:i+sz].decode("utf-8","ignore")
|
for i in range(0,len(b),sz):
|
||||||
|
yield b[i:i+sz].decode("utf-8","ignore")
|
||||||
def insert_embedding(cur, dim, kid, sid, vec):
|
|
||||||
if dim==768:
|
|
||||||
cur.execute("INSERT INTO komp.embedding_768(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",(kid,sid,vec))
|
|
||||||
elif dim==1024:
|
|
||||||
cur.execute("INSERT INTO komp.embedding_1024(chunk_id,space_id,embedding) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING",(kid,sid,vec))
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
def main(root):
|
def main(root):
|
||||||
with psycopg.connect(DB) as conn, conn.cursor() as cur:
|
with psycopg.connect(DB) as conn:
|
||||||
cur.execute("SELECT id,dim FROM komp.space WHERE name=%s",(SPACE,))
|
register_vector(conn)
|
||||||
row=cur.fetchone()
|
with conn.cursor() as cur:
|
||||||
if not row: raise SystemExit(f"space {SPACE} missing (init schema)")
|
# Get the namespace id
|
||||||
sid, target_dim = row
|
cur.execute("INSERT INTO namespaces (name) VALUES (%s) ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name RETURNING id", (NAMESPACE,))
|
||||||
|
namespace_id = cur.fetchone()[0]
|
||||||
|
|
||||||
for dirpath,_,files in os.walk(root):
|
for dirpath,_,files in os.walk(root):
|
||||||
for fn in files:
|
for fn in files:
|
||||||
p=os.path.join(dirpath,fn)
|
p=os.path.join(dirpath,fn)
|
||||||
if os.path.getsize(p)==0: continue
|
if os.path.getsize(p)==0: continue
|
||||||
# include common text/code; PDFs via pdftotext if available
|
|
||||||
if not any(fn.lower().endswith(ext) for ext in (".md",".txt",".json",".py",".cpp",".c",".hpp",".yaml",".yml",".toml",".pdf",".mdown",".rst",".org",".js",".ts",".sql",".sh",".ini",".conf",".cfg",".log",".mime")):
|
if not any(fn.lower().endswith(ext) for ext in (".md",".txt",".json",".py",".cpp",".c",".hpp",".yaml",".yml",".toml",".pdf",".mdown",".rst",".org",".js",".ts",".sql",".sh",".ini",".conf",".cfg",".log",".mime")):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if fn.lower().endswith(".pdf"):
|
if fn.lower().endswith(".pdf"):
|
||||||
try:
|
try:
|
||||||
txt=os.popen(f"pdftotext -layout -nopgbrk '{p}' - -q").read()
|
txt=os.popen(f"pdftotext -layout -nopgbrk '{p}' - -q").read()
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
try: txt=open(p,"r",encoding="utf-8",errors="ignore").read()
|
try:
|
||||||
except Exception: continue
|
txt=open(p,"r",encoding="utf-8",errors="ignore").read()
|
||||||
sh=sha256(p)
|
except Exception:
|
||||||
cur.execute("INSERT INTO komp.source(kind,uri,meta) VALUES(%s,%s,%s) ON CONFLICT DO NOTHING RETURNING id",
|
continue
|
||||||
("filesystem",p,json.dumps({})))
|
|
||||||
sid_src = cur.fetchone()[0] if cur.rowcount else None
|
# Create a memory item for the file
|
||||||
if not sid_src:
|
cur.execute("INSERT INTO memory_items (namespace_id, key, content) VALUES (%s, %s, %s) RETURNING id", (namespace_id, p, txt))
|
||||||
cur.execute("SELECT id FROM komp.source WHERE kind='filesystem' AND uri=%s",(p,))
|
item_id = cur.fetchone()[0]
|
||||||
sid_src=cur.fetchone()[0]
|
|
||||||
ln=1
|
seq = 0
|
||||||
for ch in chunks(txt):
|
for ch in chunks(txt):
|
||||||
cur.execute("INSERT INTO komp.chunk(source_id,lineno,text,sha256,tokens) VALUES(%s,%s,%s,%s,%s) RETURNING id",
|
# Create a memory chunk
|
||||||
(sid_src,ln,ch,sh,len(ch)//4))
|
cur.execute("INSERT INTO memory_chunks (item_id, seq, content) VALUES (%s, %s, %s) RETURNING id", (item_id, seq, ch))
|
||||||
kid=cur.fetchone()[0]
|
chunk_id = cur.fetchone()[0]
|
||||||
vec=embed(ch)
|
|
||||||
if len(vec)!=target_dim:
|
# Create an embedding for the chunk
|
||||||
cur.execute("DELETE FROM komp.chunk WHERE id=%s",(kid,))
|
vec=np.array(embed(ch))
|
||||||
else:
|
dim = len(vec)
|
||||||
insert_embedding(cur, target_dim, kid, sid, vec)
|
cur.execute("INSERT INTO embeddings (chunk_id, model, dim, vector) VALUES (%s, %s, %s, %s)", (chunk_id, MODEL, dim, vec))
|
||||||
ln += ch.count("\n")+1
|
seq += 1
|
||||||
conn.commit()
|
conn.commit()
|
||||||
print("done")
|
print("done")
|
||||||
|
|
||||||
if __name__=='__main__':
|
if __name__=='__main__':
|
||||||
if len(sys.argv)<2: print("usage: ingest_dir.py <root> [space]", file=sys.stderr); sys.exit(1)
|
if len(sys.argv)<2:
|
||||||
if len(sys.argv)>=3: os.environ["EMBED_SPACE"]=sys.argv[2]
|
print("usage: ingest_dir.py <root> [namespace]", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
if len(sys.argv)>=3:
|
||||||
|
os.environ["EMBED_NAMESPACE"]=sys.argv[2]
|
||||||
main(sys.argv[1])
|
main(sys.argv[1])
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue