Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-25 08:29:08

0001 #!/usr/bin/env python3
0002 """Ingest documentation into ChromaDB for RAG-based search.
0003 
0004 Reads a YAML config listing doc directories, chunks the files,
0005 embeds with sentence-transformers, and stores in a local ChromaDB.
0006 
0007 Usage:
0008     python scripts/ingest_docs.py                  # incremental (skip unchanged)
0009     python scripts/ingest_docs.py --rebuild         # wipe and re-ingest everything
0010     python scripts/ingest_docs.py --config alt.yaml # use alternate config
0011     python scripts/ingest_docs.py --stats           # show collection stats only
0012 """
0013 
0014 import argparse
0015 import glob
0016 import hashlib
0017 import json
0018 import os
0019 import sys
0020 import time
0021 
0022 # ChromaDB requires sqlite3 >= 3.35; RHEL8 ships 3.26.
0023 # pysqlite3-binary bundles a modern sqlite3 — swap it in before chromadb loads.
0024 try:
0025     __import__("pysqlite3")
0026     sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
0027 except ImportError:
0028     pass
0029 
0030 import yaml
0031 
0032 # Defaults
0033 DEFAULT_CONFIG = os.path.join(os.path.dirname(__file__), "ingest_docs.yaml")
0034 DEFAULT_CHUNK_SIZE = 3000
0035 DEFAULT_CHUNK_OVERLAP = 300
0036 DEFAULT_MODEL = "all-MiniLM-L6-v2"
0037 
0038 
def chunk_text(text, chunk_size, overlap):
    """Split *text* into overlapping chunks of at most *chunk_size* chars.

    Consecutive chunks share *overlap* characters so content that straddles
    a chunk boundary is still searchable within a single chunk.

    Args:
        text: String to split.
        chunk_size: Maximum length of each chunk.
        overlap: Characters shared between consecutive chunks; must be
            smaller than chunk_size.

    Returns:
        List of chunk strings (empty list for empty input).

    Raises:
        ValueError: If chunk_size - overlap is not positive — the original
            loop would spin forever in that case because the start offset
            never advanced.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"chunk_overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
0050 
0051 
def file_hash(path):
    """Return a SHA-256 hex digest of the file's content plus its mtime.

    The mtime is mixed into the digest so that a content-identical rewrite
    (e.g. `touch`) still registers as a change for incremental re-ingestion.

    Args:
        path: Path to an existing, readable file.

    Returns:
        Hex digest string for change detection.
    """
    mtime = str(os.path.getmtime(path))
    # Context manager: the original left the file handle open until GC.
    with open(path, "rb") as f:
        content = f.read()
    return hashlib.sha256(content + mtime.encode()).hexdigest()
0057 
0058 
def load_config(config_path):
    """Parse the YAML file at *config_path* and return its contents."""
    with open(config_path) as fh:
        cfg = yaml.safe_load(fh)
    return cfg
0062 
0063 
def ingest(config_path, rebuild=False, stats_only=False):
    """Ingest documentation sources listed in a YAML config into ChromaDB.

    Files are chunked with chunk_text(), embedded via sentence-transformers,
    and upserted into a persistent local collection. In incremental mode
    (the default) files whose content+mtime hash is unchanged are skipped.

    Args:
        config_path: Path to the YAML config. Recognized keys: chroma_path,
            collection, chunk_size, chunk_overlap, embedding_model, sources
            (list of dicts with path, and optional glob / label).
        rebuild: If True, delete the existing collection and re-ingest all.
        stats_only: If True, print collection statistics and return without
            ingesting anything.
    """
    cfg = load_config(config_path)

    chroma_path = os.path.expanduser(cfg.get("chroma_path", "./chroma_db"))
    collection_name = cfg.get("collection", "bamboo_docs")
    chunk_size = cfg.get("chunk_size", DEFAULT_CHUNK_SIZE)
    chunk_overlap = cfg.get("chunk_overlap", DEFAULT_CHUNK_OVERLAP)
    model_name = cfg.get("embedding_model", DEFAULT_MODEL)
    sources = cfg.get("sources", [])

    # Lazy imports — these are heavy (pull in the embedding model stack),
    # so keep them out of module import time.
    import chromadb
    from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

    ef = SentenceTransformerEmbeddingFunction(model_name)
    client = chromadb.PersistentClient(path=chroma_path)

    if stats_only:
        try:
            col = client.get_collection(collection_name, embedding_function=ef)
            print(f"Collection: {collection_name}")
            print(f"Path: {chroma_path}")
            print(f"Documents: {col.count()}")
            # Show per-source chunk counts.
            results = col.get(include=["metadatas"])
            labels = {}
            for m in results["metadatas"]:
                lbl = m.get("source", "unknown")
                labels[lbl] = labels.get(lbl, 0) + 1
            print("By source:")
            for lbl, count in sorted(labels.items()):
                print(f"  {lbl}: {count} chunks")
        except Exception as e:
            # Broad catch is deliberate: stats is best-effort diagnostics.
            print(f"No collection found: {e}")
        return

    if rebuild:
        try:
            client.delete_collection(collection_name)
            print(f"Deleted existing collection '{collection_name}'")
        except chromadb.errors.NotFoundError:
            pass

    collection = client.get_or_create_collection(
        collection_name, embedding_function=ef,
    )

    # Index existing chunks ONCE so incremental mode can both detect
    # unchanged files and delete a changed file's stale chunks without
    # rescanning every metadata entry per file (the previous version was
    # O(files x total_chunks)).
    existing = {}      # file_path -> stored file_hash
    ids_by_file = {}   # file_path -> [chunk doc ids]
    if not rebuild:
        results = collection.get(include=["metadatas"])
        for doc_id, meta in zip(results["ids"], results["metadatas"]):
            fpath = meta.get("file_path", "")
            if fpath:
                existing[fpath] = meta.get("file_hash", "")
                ids_by_file.setdefault(fpath, []).append(doc_id)

    total_chunks = 0
    total_files = 0
    skipped = 0

    for source in sources:
        src_path = os.path.expanduser(source["path"])
        pattern = source.get("glob", "**/*.md")
        label = source.get("label", os.path.basename(src_path))

        if not os.path.isdir(src_path):
            print(f"  SKIP {src_path} (not found)")
            continue

        files = sorted(glob.glob(os.path.join(src_path, pattern), recursive=True))
        print(f"[{label}] {src_path}: {len(files)} files")

        for fpath in files:
            fhash = file_hash(fpath)

            # Skip unchanged files in incremental mode.
            if not rebuild and existing.get(fpath) == fhash:
                skipped += 1
                continue

            # Delete old chunks for this file before re-adding; upsert alone
            # would leave stale trailing chunks if the file shrank.
            old_ids = ids_by_file.get(fpath)
            if old_ids:
                collection.delete(ids=old_ids)

            # errors="replace" so one badly-encoded file can't abort the run.
            with open(fpath, encoding="utf-8", errors="replace") as f:
                text = f.read()
            if not text.strip():
                continue

            chunks = chunk_text(text, chunk_size, chunk_overlap)
            rel_path = os.path.relpath(fpath, src_path)

            ids = []
            documents = []
            metadatas = []
            for i, chunk in enumerate(chunks):
                # Deterministic id: same source/file/index always maps to the
                # same document, so re-ingesting replaces rather than duplicates.
                doc_id = hashlib.md5(
                    f"{label}:{rel_path}:{i}".encode()
                ).hexdigest()
                ids.append(doc_id)
                documents.append(chunk)
                metadatas.append({
                    "source": label,
                    "file_path": fpath,
                    "rel_path": rel_path,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "file_hash": fhash,
                })

            collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
            total_chunks += len(chunks)
            total_files += 1
            print(f"  {rel_path}: {len(chunks)} chunks")

    print(f"\nDone: {total_files} files, {total_chunks} chunks indexed"
          f" ({skipped} unchanged files skipped)")
    print(f"Collection '{collection_name}': {collection.count()} total chunks")
0187 
0188 
def main():
    """CLI entry point: parse arguments, validate the config path, ingest."""
    cli = argparse.ArgumentParser(description="Ingest docs into ChromaDB")
    cli.add_argument("--config", default=DEFAULT_CONFIG,
                     help="YAML config file")
    cli.add_argument("--rebuild", action="store_true",
                     help="Wipe collection and re-ingest everything")
    cli.add_argument("--stats", action="store_true",
                     help="Show collection stats only")
    opts = cli.parse_args()

    # Fail fast with a readable message instead of a traceback from open().
    if not os.path.exists(opts.config):
        print(f"Config not found: {opts.config}")
        sys.exit(1)

    ingest(opts.config, rebuild=opts.rebuild, stats_only=opts.stats)
0204 
0205 
# Entry-point guard: lets the module be imported (e.g. for testing)
# without triggering ingestion.
if __name__ == "__main__":
    main()