#!/usr/bin/env python3
|
|
"""
|
|
Lightweight embedding helper moved from ingest/ for transparency.
|
|
|
|
Usage examples:
|
|
- Single embedding via Ollama:
|
|
OLLAMA_BASE=http://localhost:11434 \
|
|
./py_embedder.py --model bge-m3:latest --text "hello world"
|
|
|
|
- Batch from stdin (one line per text):
|
|
./py_embedder.py --model bge-m3:latest --stdin < texts.txt
|
|
|
|
Outputs JSON array of floats (for single text) or array-of-arrays for batches.
|
|
This script does not touch the database; it only produces vectors.
|
|
"""
|
|
import os, sys, json, argparse, requests
|
|
|
|
def embed_ollama(texts, model, base):
    """Embed each text via an Ollama server and return the vectors.

    Args:
        texts: Strings to embed; one HTTP request is sent per text.
        model: Ollama model name sent with every request
            (for example ``"bge-m3:latest"``).
        base: Base URL of the Ollama server. A trailing slash is tolerated.

    Returns:
        A list of embedding vectors in request order. A response that carries
        an ``"embeddings"`` list contributes all of its vectors.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If a response contains neither ``"embedding"`` nor
            ``"embeddings"``.
    """
    if not texts:
        # Nothing to embed: do not open a connection at all.
        return []

    # Strip a trailing slash so a base like "http://host:11434/" does not
    # produce "//api/embeddings".
    url = base.rstrip("/") + "/api/embeddings"

    vectors = []
    # One session for the whole batch so the HTTP connection is reused instead
    # of being re-established for every text.
    with requests.Session() as session:
        # Some Ollama models accept only a single prompt per request, so the
        # texts are sent one at a time for reliability.
        for text in texts:
            resp = session.post(
                url,
                json={"model": model, "prompt": text},
                timeout=120,
            )
            resp.raise_for_status()
            data = resp.json()
            if "embedding" in data:
                vectors.append(data["embedding"])  # single vector
            elif "embeddings" in data:
                vectors.extend(data["embeddings"])  # multiple vectors
            else:
                raise RuntimeError("Embedding response missing 'embedding(s)'")
    return vectors
|
|
|
|
def main():
    """Command-line entry point: embed text and print the result as JSON.

    Options:
        --model  Model name; defaults to $EMBED_MODEL or "bge-m3:latest".
        --text   A single text to embed. An empty value is treated the same
                 as omitting the option.
        --stdin  Read texts from standard input, one per line; blank lines
                 are skipped.
        --base   Server base URL; defaults to $OLLAMA_BASE or
                 "http://localhost:11434".

    --text takes precedence when both --text and --stdin are given. When
    exactly one text was embedded (from either source) and a vector came
    back, that vector is printed as a flat JSON array; otherwise the full
    list of vectors is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        default=os.environ.get("EMBED_MODEL", "bge-m3:latest"),
    )
    parser.add_argument(
        "--text",
        help="Text to embed; if omitted, use --stdin",
    )
    parser.add_argument(
        "--stdin",
        action="store_true",
        help="Read texts from stdin (one per line)",
    )
    parser.add_argument(
        "--base",
        default=os.environ.get("OLLAMA_BASE", "http://localhost:11434"),
    )
    opts = parser.parse_args()

    # Guard clause: parser.error() prints usage and exits the process.
    if not opts.text and not opts.stdin:
        parser.error("Provide --text or --stdin")

    if opts.text:
        inputs = [opts.text]
    else:
        inputs = []
        for raw_line in sys.stdin:
            # Skip whitespace-only lines; keep the rest minus the newline.
            if raw_line.strip():
                inputs.append(raw_line.rstrip("\n"))

    vectors = embed_ollama(inputs, opts.model, opts.base)

    # A lone input is unwrapped so the output is a flat array of floats.
    unwrap = len(inputs) == 1 and vectors
    payload = vectors[0] if unwrap else vectors
    print(json.dumps(payload))
|
|
|
|
# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|
|
|