From 18f26815e387ab5fd3f0659a9c44d5d5a73a337b Mon Sep 17 00:00:00 2001 From: Matteo Rosati Date: Tue, 21 Apr 2026 15:28:20 +0200 Subject: [PATCH] add documents --- chroma_functions.py | 21 +++++++++++++++++++++ chunk.py | 10 ---------- chunk_functions.py | 17 +++++++++++++++++ cli_parser.py | 8 ++++++++ main.py | 26 ++++++++++++++++++++++++-- 5 files changed, 70 insertions(+), 12 deletions(-) delete mode 100644 chunk.py create mode 100644 chunk_functions.py diff --git a/chroma_functions.py b/chroma_functions.py index 824c506..0807cf0 100644 --- a/chroma_functions.py +++ b/chroma_functions.py @@ -1,6 +1,9 @@ import chromadb from chromadb.errors import NotFoundError from typing import List +from uuid import uuid4 + +from embed import EmbeddingRecord def list_collections() -> List[str]: @@ -34,3 +37,21 @@ def count_collection(name: str) -> int: raise return collection.count() + + +def add_data(collection: str, data: List[EmbeddingRecord]) -> None: + if not data: + return + + client = chromadb.PersistentClient() + + try: + target_collection = client.get_collection(name=collection) + except NotFoundError: + raise + + target_collection.add( + ids=[str(uuid4()) for _ in data], + documents=[record["text"] for record in data], + embeddings=[record["embedding"] for record in data], + ) diff --git a/chunk.py b/chunk.py deleted file mode 100644 index 57c0177..0000000 --- a/chunk.py +++ /dev/null @@ -1,10 +0,0 @@ -from typing import List - -import semchunk - - -def chunk(text: str, chunk_size: int = 800) -> List[str]: - chunker = semchunk.chunkerify("gpt-4", chunk_size) - chunks = chunker(text) - - return chunks diff --git a/chunk_functions.py b/chunk_functions.py new file mode 100644 index 0000000..e84af19 --- /dev/null +++ b/chunk_functions.py @@ -0,0 +1,17 @@ +from pathlib import Path +from typing import List + +import semchunk + + +def chunk_text(text: str, chunk_size: int = 800) -> List[str]: + chunker = semchunk.chunkerify("gpt-4", chunk_size) + chunks = chunker(text) + + return chunks + + +def chunk_file(filename: str, chunk_size: int = 800) -> List[str]: + contents = Path(filename).read_text() + + return chunk_text(contents, chunk_size) diff --git a/cli_parser.py b/cli_parser.py index 102ad00..30d0177 100644 --- a/cli_parser.py +++ b/cli_parser.py @@ -32,4 +32,12 @@ def build_parser() -> argparse.ArgumentParser: ) count_parser.add_argument("name", help="Name of the collection to count.") + add_parser = subparsers.add_parser( + "add-data", + aliases=["ad"], + help="Chunk, embed, and add a file to a collection in the local Chroma database.", + ) + add_parser.add_argument("collection", help="Name of the target collection.") + add_parser.add_argument("file", help="Path to the file to chunk and add to the collection.") + return parser diff --git a/main.py b/main.py index 72864f4..5007ef3 100644 --- a/main.py +++ b/main.py @@ -1,14 +1,20 @@ from __future__ import annotations -from chromadb.errors import NotFoundError, InternalError +from chromadb.errors import InternalError, NotFoundError +from dotenv import load_dotenv + from chroma_functions import ( + add_data, count_collection, create_collection, delete_collection, list_collections, ) - +from chunk_functions import chunk_file from cli_parser import build_parser +from embed import embed + +load_dotenv() def main() -> int: @@ -58,6 +64,22 @@ def main() -> int: return 0 + if args.command in {"add-data", "ad"}: + try: + chunks = chunk_file(args.file) + embeddings = embed(chunks) + add_data(args.collection, embeddings) + except NotFoundError: + print(f"Collection '{args.collection}' does not exist.") + return 1 + except FileNotFoundError: + print(f"The file {args.file} was not found.") + return 1 + + print(f"Added {len(embeddings)} records to collection '{args.collection}'.") + + return 0 + print("Nothing to do. Use -h to see available commands.") return 0