From a7b91b9c4e1ce007f369c3f85bd965012c30e6f3 Mon Sep 17 00:00:00 2001 From: Matteo Rosati Date: Tue, 21 Apr 2026 17:42:37 +0200 Subject: [PATCH] complete refactor --- cli_app.py | 97 ++++++++++++++++++++++++ cli_parser.py | 139 +++++++++++++++++++++------------- handlers/add_data.py | 9 +++ handlers/count_collection.py | 8 ++ handlers/create_collection.py | 9 +++ handlers/delete_collection.py | 9 +++ handlers/list_collections.py | 14 ++++ handlers/query.py | 9 +++ main.py | 91 +--------------------- utilities.py | 46 +++++++++++ 10 files changed, 292 insertions(+), 139 deletions(-) create mode 100644 cli_app.py create mode 100644 handlers/add_data.py create mode 100644 handlers/count_collection.py create mode 100644 handlers/create_collection.py create mode 100644 handlers/delete_collection.py create mode 100644 handlers/list_collections.py create mode 100644 handlers/query.py create mode 100644 utilities.py diff --git a/cli_app.py b/cli_app.py new file mode 100644 index 0000000..cec76db --- /dev/null +++ b/cli_app.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from argparse import Namespace +from collections.abc import Callable +from dataclasses import dataclass + +from chromadb.errors import InternalError, NotFoundError + +from handlers.add_data import handle_add_data +from handlers.count_collection import handle_count_collection +from handlers.create_collection import handle_create_collection +from handlers.delete_collection import handle_delete_collection +from handlers.list_collections import handle_list_collections +from handlers.query import handle_query + + +CommandHandler = Callable[[Namespace], int] +ErrorMessageBuilder = Callable[[Namespace], str] + + +@dataclass(frozen=True, slots=True) +class CliErrorHandler: + exception_type: type[BaseException] + message: ErrorMessageBuilder + + +@dataclass(frozen=True, slots=True) +class CommandConfig: + handler: CommandHandler + error_handlers: tuple[CliErrorHandler, ...] = () + + +COMMANDS: dict[str, CommandConfig] = { + "list-collections": CommandConfig(handler=handle_list_collections), + "create-collection": CommandConfig( + handler=handle_create_collection, + error_handlers=( + CliErrorHandler( + exception_type=InternalError, + message=lambda args: f"Collection '{args.collection}' already exists.", + ), + ), + ), + "delete-collection": CommandConfig( + handler=handle_delete_collection, + error_handlers=( + CliErrorHandler( + exception_type=NotFoundError, + message=lambda args: f"Collection '{args.collection}' does not exist.", + ), + ), + ), + "count": CommandConfig( + handler=handle_count_collection, + error_handlers=( + CliErrorHandler( + exception_type=NotFoundError, + message=lambda args: f"Collection '{args.collection}' does not exist.", + ), + ), + ), + "add-data": CommandConfig( + handler=handle_add_data, + error_handlers=( + CliErrorHandler( + exception_type=NotFoundError, + message=lambda args: f"Collection '{args.collection}' does not exist.", + ), + CliErrorHandler( + exception_type=FileNotFoundError, + message=lambda args: f"The file {args.file} was not found.", + ), + ), + ), + "query": CommandConfig( + handler=handle_query, + error_handlers=( + CliErrorHandler( + exception_type=NotFoundError, + message=lambda args: f"Collection '{args.collection}' does not exist.", + ), + ), + ), +} + + +def execute_command(args: Namespace) -> int: + command = COMMANDS[args.command] + + try: + return command.handler(args) + except BaseException as exc: + for error_handler in command.error_handlers: + if isinstance(exc, error_handler.exception_type): + print(error_handler.message(args)) + return 1 + raise diff --git a/cli_parser.py b/cli_parser.py index f59b193..d2edcda 100644 --- a/cli_parser.py +++ b/cli_parser.py @@ -1,59 +1,96 @@ +from __future__ import annotations + import argparse +from dataclasses import dataclass + + +@dataclass(frozen=True, slots=True) +class ArgumentSpec: + name: str + help: str + + +@dataclass(frozen=True, slots=True) +class CommandSpec: + name: str + aliases: tuple[str, ...] + help: str + arguments: tuple[ArgumentSpec, ...] = () + + +COMMAND_SPECS: tuple[CommandSpec, ...] = ( + CommandSpec( + name="list-collections", + aliases=("lc",), + help="List all collections stored in the local Chroma database.", + ), + CommandSpec( + name="create-collection", + aliases=("cc",), + help="Create a collection in the local Chroma database.", + arguments=( + ArgumentSpec("collection", "Name of the collection to create."), + ), + ), + CommandSpec( + name="delete-collection", + aliases=("dc",), + help="Delete a collection from the local Chroma database.", + arguments=( + ArgumentSpec("collection", "Name of the collection to delete."), + ), + ), + CommandSpec( + name="count", + aliases=("co",), + help="Count records in a collection from the local Chroma database.", + arguments=( + ArgumentSpec("collection", "Name of the collection to count."), + ), + ), + CommandSpec( + name="add-data", + aliases=("ad",), + help="Chunk, embed, and add a file to a collection in the local Chroma database.", + arguments=( + ArgumentSpec("collection", "Name of the target collection."), + ArgumentSpec("file", "Path to the file to chunk and add to the collection."), + ), + ), + CommandSpec( + name="query", + aliases=("q",), + help="Query a collection with the provided text.", + arguments=( + ArgumentSpec("collection", "Name of the target collection."), + ArgumentSpec("query_text", "The text to query."), + ), + ), +) + + +def _add_command( + subparsers: argparse._SubParsersAction[argparse.ArgumentParser], + command: CommandSpec, +) -> None: + subparser = subparsers.add_parser( + command.name, + aliases=list(command.aliases), + help=command.help, + description=command.help, + ) + + for argument in command.arguments: + subparser.add_argument(argument.name, help=argument.help) + + subparser.set_defaults(command=command.name) def build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description="Inspect local Chroma collections.") - subparsers = parser.add_subparsers(dest="command") + subparsers = parser.add_subparsers(dest="command", required=True) - # List existing collections - subparsers.add_parser( - "list-collections", - aliases=["lc"], - help="List all collections stored in the local Chroma database.", - ) - - # Create a new collection - create_parser = subparsers.add_parser( - "create-collection", - aliases=["cc"], - help="Create a collection in the local Chroma database.", - ) - create_parser.add_argument("name", help="Name of the collection to create.") - - # Delete a collection - delete_parser = subparsers.add_parser( - "delete-collection", - aliases=["dc"], - help="Delete a collection from the local Chroma database.", - ) - delete_parser.add_argument("name", help="Name of the collection to delete.") - - # Count documents in a collection - count_parser = subparsers.add_parser( - "count", - aliases=["co"], - help="Count records in a collection from the local Chroma database.", - ) - count_parser.add_argument("name", help="Name of the collection to count.") - - # Add documents to a collection - add_parser = subparsers.add_parser( - "add-data", - aliases=["ad"], - help="Chunk, embed, and add a file to a collection in the local Chroma database.", - ) - add_parser.add_argument("collection", help="Name of the target collection.") - add_parser.add_argument( - "file", help="Path to the file to chunk and add to the collection." - ) - - # Query doc - query_parser = subparsers.add_parser( - "query", - aliases=["q"], - help="Query a collection with given text/s.", - ) - query_parser.add_argument("collection", help="Name of the target collection.") - query_parser.add_argument("texts", help="The text/s to query.") + for command in COMMAND_SPECS: + _add_command(subparsers, command) return parser diff --git a/handlers/add_data.py b/handlers/add_data.py new file mode 100644 index 0000000..8bc86f5 --- /dev/null +++ b/handlers/add_data.py @@ -0,0 +1,9 @@ +from argparse import Namespace + +from utilities import ingest_file + + +def handle_add_data(args: Namespace) -> int: + records_added = ingest_file(args.collection, args.file) + print(f"Added {records_added} records to collection '{args.collection}'.") + return 0 diff --git a/handlers/count_collection.py b/handlers/count_collection.py new file mode 100644 index 0000000..9ebc5e3 --- /dev/null +++ b/handlers/count_collection.py @@ -0,0 +1,8 @@ +from argparse import Namespace + +from chroma_functions import count_collection + + +def handle_count_collection(args: Namespace) -> int: + print(count_collection(args.collection)) + return 0 diff --git a/handlers/create_collection.py b/handlers/create_collection.py new file mode 100644 index 0000000..055c375 --- /dev/null +++ b/handlers/create_collection.py @@ -0,0 +1,9 @@ +from argparse import Namespace + +from chroma_functions import create_collection + + +def handle_create_collection(args: Namespace) -> int: + collection_name = create_collection(args.collection) + print(f"Created collection '{collection_name}'.") + return 0 diff --git a/handlers/delete_collection.py b/handlers/delete_collection.py new file mode 100644 index 0000000..058c779 --- /dev/null +++ b/handlers/delete_collection.py @@ -0,0 +1,9 @@ +from argparse import Namespace + +from chroma_functions import delete_collection + + +def handle_delete_collection(args: Namespace) -> int: + delete_collection(args.collection) + print(f"Deleted collection '{args.collection}'.") + return 0 diff --git a/handlers/list_collections.py b/handlers/list_collections.py new file mode 100644 index 0000000..8ba61b2 --- /dev/null +++ b/handlers/list_collections.py @@ -0,0 +1,14 @@ +from argparse import Namespace + +from chroma_functions import list_collections +from utilities import print_lines + + +def handle_list_collections(_: Namespace) -> int: + collections = list_collections() + if not collections: + print("No collections found.") + return 0 + + print_lines(collections) + return 0 diff --git a/handlers/query.py b/handlers/query.py new file mode 100644 index 0000000..8ccb0b0 --- /dev/null +++ b/handlers/query.py @@ -0,0 +1,9 @@ +from argparse import Namespace + +from utilities import format_query_result, print_lines, run_query + + +def handle_query(args: Namespace) -> int: + result = run_query(args.collection, args.query_text) + print_lines(format_query_result(result)) + return 0 diff --git a/main.py b/main.py index bcb345f..a4cf5eb 100644 --- a/main.py +++ b/main.py @@ -1,100 +1,15 @@ from __future__ import annotations -from chromadb.errors import InternalError, NotFoundError from dotenv import load_dotenv -from chroma_functions import ( - add_data, - count_collection, - create_collection, - delete_collection, - list_collections, - query_data, -) -from chunk_functions import chunk_file +from cli_app import execute_command from cli_parser import build_parser -from embed import embed - -load_dotenv() def main() -> int: + load_dotenv() args = build_parser().parse_args() - - if args.command in {"list-collections", "lc"}: - collections = list_collections() - if not collections: - print("No collections found.") - return 0 - - for name in collections: - print(name) - - return 0 - - if args.command in {"create-collection", "cc"}: - try: - collection = create_collection(args.name) - except InternalError: - print(f"Collection '{args.name}' already exists.") - return 1 - - print(f"Created collection '{collection}'.") - - return 0 - - if args.command in {"delete-collection", "dc"}: - try: - delete_collection(args.name) - except NotFoundError: - print(f"Collection '{args.name}' does not exist.") - return 1 - - print(f"Deleted collection '{args.name}'.") - - return 0 - - if args.command in {"count", "co"}: - try: - count = count_collection(args.name) - except NotFoundError: - print(f"Collection '{args.name}' does not exist.") - return 1 - - print(count) - - return 0 - - if args.command in {"add-data", "ad"}: - try: - chunks = chunk_file(args.file) - embeddings = embed(chunks) - add_data(args.collection, embeddings) - except NotFoundError: - print(f"Collection '{args.collection}' does not exist.") - return 1 - except FileNotFoundError: - print(f"The file {args.file} was not found.") - return 1 - - print(f"Added {len(embeddings)} records to collection '{args.collection}'.") - - return 0 - - if args.command in {"query", "q"}: - try: - result = query_data(args.collection, [args.texts]) - except NotFoundError: - print(f"Collection '{args.collection}' does not exist.") - return 1 - - print(result) - - return 0 - - print("Nothing to do. Use -h to see available commands.") - - return 0 + return execute_command(args) if __name__ == "__main__": diff --git a/utilities.py b/utilities.py new file mode 100644 index 0000000..6173698 --- /dev/null +++ b/utilities.py @@ -0,0 +1,46 @@ +from chromadb import QueryResult + +from chroma_functions import add_data, query_data +from chunk_functions import chunk_file +from embed import embed + + +def print_lines(lines: list[str]) -> None: + for line in lines: + print(line) + + +def ingest_file(collection_name: str, file_path: str) -> int: + chunks = chunk_file(file_path) + embeddings = embed(chunks) + add_data(collection_name, embeddings) + return len(embeddings) + + +def run_query(collection_name: str, query_text: str) -> QueryResult: + return query_data(collection_name, [query_text]) + + +def format_query_result(result: QueryResult) -> list[str]: + ids = result.get("ids", [[]]) + documents = result.get("documents", [[]]) + distances = result.get("distances", [[]]) + + first_ids = ids[0] if ids else [] + first_documents = documents[0] if documents else [] + first_distances = distances[0] if distances else [] + + if not first_ids: + return ["No results found."] + + lines = ["Query results:"] + for index, document_id in enumerate(first_ids, start=1): + lines.append(f"{index}. id: {document_id}") + + if index - 1 < len(first_distances): + lines.append(f" distance: {first_distances[index - 1]}") + + if index - 1 < len(first_documents): + lines.append(f" document: {first_documents[index - 1]}") + + return lines