From 18f26815e387ab5fd3f0659a9c44d5d5a73a337b Mon Sep 17 00:00:00 2001
From: Matteo Rosati <rosati.matteo@gmail.com>
Date: Tue, 21 Apr 2026 15:28:20 +0200
Subject: [PATCH] add documents

---
 chroma_functions.py | 21 +++++++++++++++++++++
 chunk.py            | 10 ----------
 chunk_functions.py  | 17 +++++++++++++++++
 cli_parser.py       |  8 ++++++++
 main.py             | 26 ++++++++++++++++++++++++--
 5 files changed, 70 insertions(+), 12 deletions(-)
 delete mode 100644 chunk.py
 create mode 100644 chunk_functions.py

diff --git a/chroma_functions.py b/chroma_functions.py
index 824c506..0807cf0 100644
--- a/chroma_functions.py
+++ b/chroma_functions.py
@@ -1,6 +1,9 @@
 import chromadb
 from chromadb.errors import NotFoundError
 from typing import List
+from uuid import uuid4
+
+from embed import EmbeddingRecord
 
 
 def list_collections() -> List[str]:
@@ -34,3 +37,21 @@ def count_collection(name: str) -> int:
         raise
 
     return collection.count()
+
+
+def add_data(collection: str, data: List[EmbeddingRecord]) -> None:
+    """Add embedded text records to an existing Chroma collection.
+
+    Does nothing when *data* is empty; otherwise each record's "text" and
+    "embedding" values are stored under a freshly generated UUID4 id.
+    A NotFoundError from ``get_collection`` propagates to the caller.
+    """
+    if not data:
+        return
+
+    target = chromadb.PersistentClient().get_collection(name=collection)
+    target.add(
+        ids=[str(uuid4()) for _ in data],
+        documents=[record["text"] for record in data],
+        embeddings=[record["embedding"] for record in data],
+    )
diff --git a/chunk.py b/chunk.py
deleted file mode 100644
index 57c0177..0000000
--- a/chunk.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from typing import List
-
-import semchunk
-
-
-def chunk(text: str, chunk_size: int = 800) -> List[str]:
-    chunker = semchunk.chunkerify("gpt-4", chunk_size)
-    chunks = chunker(text)
-
-    return chunks
diff --git a/chunk_functions.py b/chunk_functions.py
new file mode 100644
index 0000000..e84af19
--- /dev/null
+++ b/chunk_functions.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+from typing import List
+
+import semchunk
+
+
+def chunk_text(text: str, chunk_size: int = 800) -> List[str]:
+    """Split *text* with a semchunk chunker configured for "gpt-4"."""
+    # chunk_size is handed to semchunk.chunkerify unchanged.
+    split = semchunk.chunkerify("gpt-4", chunk_size)
+    return split(text)
+
+
+def chunk_file(filename: str, chunk_size: int = 800) -> List[str]:
+    """Read *filename* as UTF-8 text and return its chunks via chunk_text."""
+    # Explicit encoding: the locale default varies by platform (PEP 597).
+    return chunk_text(Path(filename).read_text(encoding="utf-8"), chunk_size)
diff --git a/cli_parser.py b/cli_parser.py
index 102ad00..30d0177 100644
--- a/cli_parser.py
+++ b/cli_parser.py
@@ -32,4 +32,12 @@ def build_parser() -> argparse.ArgumentParser:
     )
     count_parser.add_argument("name", help="Name of the collection to count.")
 
+    add_parser = subparsers.add_parser(
+        "add-data",
+        aliases=["ad"],
+        help="Chunk, embed, and add a file to a collection in the local Chroma database.",
+    )
+    add_parser.add_argument("collection", help="Name of the target collection.")
+    add_parser.add_argument("file", help="Path to the file to chunk and add to the collection.")
+
     return parser
diff --git a/main.py b/main.py
index 72864f4..5007ef3 100644
--- a/main.py
+++ b/main.py
@@ -1,14 +1,20 @@
 from __future__ import annotations
 
-from chromadb.errors import NotFoundError, InternalError
+from chromadb.errors import InternalError, NotFoundError
+from dotenv import load_dotenv
+
 from chroma_functions import (
+    add_data,
     count_collection,
     create_collection,
     delete_collection,
     list_collections,
 )
-
+from chunk_functions import chunk_file
 from cli_parser import build_parser
+from embed import embed
+
+load_dotenv()
 
 
 def main() -> int:
@@ -58,6 +64,22 @@ def main() -> int:
 
         return 0
 
+    if args.command in {"add-data", "ad"}:
+        try:
+            chunks = chunk_file(args.file)
+            embeddings = embed(chunks)
+            add_data(args.collection, embeddings)
+        except NotFoundError:
+            print(f"Collection '{args.collection}' does not exist.")
+            return 1
+        except FileNotFoundError:
+            print(f"The file {args.file} was not found.")
+            return 1
+
+        print(f"Added {len(embeddings)} records to collection '{args.collection}'.")
+
+        return 0
+
     print("Nothing to do. Use -h to see available commands.")
 
     return 0