# File indexing completed on 2026-04-25 08:29:08
0001
0002 """Ingest documentation into ChromaDB for RAG-based search.
0003
0004 Reads a YAML config listing doc directories, chunks the files,
0005 embeds with sentence-transformers, and stores in a local ChromaDB.
0006
0007 Usage:
0008 python scripts/ingest_docs.py # incremental (skip unchanged)
0009 python scripts/ingest_docs.py --rebuild # wipe and re-ingest everything
0010 python scripts/ingest_docs.py --config alt.yaml # use alternate config
0011 python scripts/ingest_docs.py --stats # show collection stats only
0012 """
0013
0014 import argparse
0015 import glob
0016 import hashlib
0017 import json
0018 import os
0019 import sys
0020 import time
0021
0022
0023
# Swap in pysqlite3 (a pip-installable build of a newer SQLite) as the stdlib
# "sqlite3" module before anything else imports it.  NOTE(review): presumably
# a workaround for ChromaDB's minimum SQLite version requirement on systems
# with an old system SQLite — confirm.  If pysqlite3 is not installed, fall
# back silently to the stdlib sqlite3.
try:
    __import__("pysqlite3")
    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
except ImportError:
    pass
0029
0030 import yaml
0031
0032
# Default config file lives next to this script.
DEFAULT_CONFIG = os.path.join(os.path.dirname(__file__), "ingest_docs.yaml")
# Chunking defaults, in characters: maximum chunk length and the overlap
# shared between consecutive chunks.  Both overridable via the YAML config.
DEFAULT_CHUNK_SIZE = 3000
DEFAULT_CHUNK_OVERLAP = 300
# Embedding model name passed to SentenceTransformerEmbeddingFunction unless
# the config supplies "embedding_model".
DEFAULT_MODEL = "all-MiniLM-L6-v2"
0037
0038
def chunk_text(text, chunk_size, overlap):
    """Split *text* into chunks of at most *chunk_size* characters.

    Consecutive chunks share *overlap* characters so content that straddles
    a boundary appears intact in at least one chunk.

    Args:
        text: String to split.
        chunk_size: Maximum length of each chunk; must exceed ``overlap``.
        overlap: Characters repeated between adjacent chunks.

    Returns:
        List of chunk strings; an empty list for empty input.

    Raises:
        ValueError: If ``chunk_size - overlap`` is not positive.  (The
            original loop advanced by that step and would spin forever.)
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"chunk_size ({chunk_size}) must be greater than overlap ({overlap})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
0050
0051
def file_hash(path):
    """Return a SHA-256 hex digest of the file's bytes plus its mtime.

    Folding the mtime in means a file re-saved with identical content still
    counts as changed; the cost is only an unnecessary re-ingest, never a
    missed update.

    Args:
        path: Path to an existing, readable file.

    Returns:
        64-character lowercase hex digest string.
    """
    mtime = str(os.path.getmtime(path))
    # Context manager guarantees the handle closes even if read() raises.
    with open(path, "rb") as f:
        content = f.read()
    return hashlib.sha256(content + mtime.encode()).hexdigest()
0057
0058
def load_config(config_path):
    """Read and parse the YAML ingestion config at *config_path*."""
    with open(config_path) as fh:
        parsed = yaml.safe_load(fh)
    return parsed
0062
0063
def ingest(config_path, rebuild=False, stats_only=False):
    """Ingest the doc sources listed in the YAML config into ChromaDB.

    Incremental by default: files whose content+mtime hash is unchanged are
    skipped; changed files have their old chunks deleted and are re-chunked.

    Args:
        config_path: Path to the YAML config (see module docstring).
        rebuild: Drop the collection and re-ingest everything from scratch.
        stats_only: Print collection statistics and return without ingesting.
    """
    cfg = load_config(config_path)

    chroma_path = os.path.expanduser(cfg.get("chroma_path", "./chroma_db"))
    collection_name = cfg.get("collection", "bamboo_docs")
    chunk_size = cfg.get("chunk_size", DEFAULT_CHUNK_SIZE)
    chunk_overlap = cfg.get("chunk_overlap", DEFAULT_CHUNK_OVERLAP)
    model_name = cfg.get("embedding_model", DEFAULT_MODEL)
    sources = cfg.get("sources", [])

    # Imported lazily so config errors and --help don't pay the heavy
    # chromadb / sentence-transformers import cost.
    import chromadb
    from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

    ef = SentenceTransformerEmbeddingFunction(model_name)
    client = chromadb.PersistentClient(path=chroma_path)

    if stats_only:
        try:
            col = client.get_collection(collection_name, embedding_function=ef)
            print(f"Collection: {collection_name}")
            print(f"Path: {chroma_path}")
            print(f"Documents: {col.count()}")

            # Aggregate chunk counts per source label.
            results = col.get(include=["metadatas"])
            labels = {}
            for m in results["metadatas"]:
                lbl = m.get("source", "unknown")
                labels[lbl] = labels.get(lbl, 0) + 1
            print("By source:")
            for lbl, count in sorted(labels.items()):
                print(f" {lbl}: {count} chunks")
        except Exception as e:
            # Best-effort stats: report and exit rather than crash.
            print(f"No collection found: {e}")
        return

    if rebuild:
        try:
            client.delete_collection(collection_name)
            print(f"Deleted existing collection '{collection_name}'")
        except chromadb.errors.NotFoundError:
            # Fresh database: nothing to delete.
            pass

    collection = client.get_or_create_collection(
        collection_name, embedding_function=ef,
    )

    # Snapshot of what is already indexed: file_path -> file_hash, used both
    # for change detection and for locating stale chunks of modified files.
    # `results` gets a safe empty default so it is always bound.
    existing = {}
    results = {"ids": [], "metadatas": []}
    if not rebuild:
        results = collection.get(include=["metadatas"])
        for meta in results["metadatas"]:
            fpath = meta.get("file_path", "")
            if fpath:
                existing[fpath] = meta.get("file_hash", "")

    total_chunks = 0
    total_files = 0
    skipped = 0

    for source in sources:
        src_path = os.path.expanduser(source["path"])
        pattern = source.get("glob", "**/*.md")
        label = source.get("label", os.path.basename(src_path))

        if not os.path.isdir(src_path):
            print(f" SKIP {src_path} (not found)")
            continue

        files = sorted(glob.glob(os.path.join(src_path, pattern), recursive=True))
        print(f"[{label}] {src_path}: {len(files)} files")

        for fpath in files:
            fhash = file_hash(fpath)

            # Unchanged since last ingest: skip (incremental mode only).
            if not rebuild and existing.get(fpath) == fhash:
                skipped += 1
                continue

            # File changed: delete its old chunks first so a file that
            # shrank doesn't leave stale trailing chunks behind.
            if not rebuild and fpath in existing:
                old_ids = [
                    results["ids"][i]
                    for i, m in enumerate(results["metadatas"])
                    if m.get("file_path") == fpath
                ]
                if old_ids:
                    collection.delete(ids=old_ids)

            # errors="replace" keeps ingestion going on odd encodings; the
            # context manager closes the handle even if read() raises.
            with open(fpath, encoding="utf-8", errors="replace") as f:
                text = f.read()
            if not text.strip():
                continue

            chunks = chunk_text(text, chunk_size, chunk_overlap)
            rel_path = os.path.relpath(fpath, src_path)

            ids = []
            documents = []
            metadatas = []
            for i, chunk in enumerate(chunks):
                # Deterministic id: re-ingesting the same chunk upserts in
                # place instead of creating duplicates.
                doc_id = hashlib.md5(
                    f"{label}:{rel_path}:{i}".encode()
                ).hexdigest()
                ids.append(doc_id)
                documents.append(chunk)
                metadatas.append({
                    "source": label,
                    "file_path": fpath,
                    "rel_path": rel_path,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "file_hash": fhash,
                })

            collection.upsert(ids=ids, documents=documents, metadatas=metadatas)
            total_chunks += len(chunks)
            total_files += 1
            print(f" {rel_path}: {len(chunks)} chunks")

    print(f"\nDone: {total_files} files, {total_chunks} chunks indexed"
          f" ({skipped} unchanged files skipped)")
    print(f"Collection '{collection_name}': {collection.count()} total chunks")
0187
0188
def main():
    """CLI entry point: parse arguments and dispatch to ingest()."""
    parser = argparse.ArgumentParser(description="Ingest docs into ChromaDB")
    parser.add_argument("--config", default=DEFAULT_CONFIG,
                        help="YAML config file")
    parser.add_argument("--rebuild", action="store_true",
                        help="Wipe collection and re-ingest everything")
    parser.add_argument("--stats", action="store_true",
                        help="Show collection stats only")
    args = parser.parse_args()

    if not os.path.exists(args.config):
        # Errors belong on stderr so stdout stays clean for piping.
        print(f"Config not found: {args.config}", file=sys.stderr)
        sys.exit(1)

    ingest(args.config, rebuild=args.rebuild, stats_only=args.stats)


if __name__ == "__main__":
    main()