diff --git a/README.md b/README.md index 723158b..f562055 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ list-collections create-collection delete-collection count -add-data +import <collection> <file> [<file> ...] query delete <collection> --where <key>=<value> ``` @@ -137,10 +137,12 @@ Create a collection: chromy create-collection notes ``` -Add a file: +Add one or more files: ```bash chromy import notes ./docs/example.txt +chromy import notes ./docs/intro.md ./docs/setup.md +chromy import notes *.md ``` Count stored records: @@ -175,7 +177,7 @@ chromy delete notes --where file_name=example.txt ## How ingestion works -When you run `import`, the file is: +When you run `import`, each file is: 1. read from disk 2. split into chunks @@ -188,4 +190,8 @@ Query results include the stored document chunk, its id, distance, and file name - collections are stored in a local persistent Chroma database in the current directory - `import` requires the target collection to already exist -- the CLI prints friendly messages for common errors such as missing collections or missing files +- `import` accepts one or more file paths +- unquoted glob patterns such as `*.md` are expanded by the shell before `chromy` starts +- quoted glob patterns such as `"*.md"` are treated as literal paths and are not expanded by `chromy` +- unmatched unquoted globs may behave differently by shell: `zsh` commonly fails before `chromy` starts, while `bash` may pass the literal pattern through depending on shell settings +- the CLI reports file-specific import failures and continues with the remaining files diff --git a/chromy/cli.py b/chromy/cli.py index 0dc6899..b8ec012 100644 --- a/chromy/cli.py +++ b/chromy/cli.py @@ -6,7 +6,6 @@ import typer from chromadb.errors import InternalError, NotFoundError from rich import print -from chromy.errors import UnsupportedTextFileError from chromy.handlers.count_collection import handle_count_collection from chromy.handlers.create_collection import handle_create_collection from 
chromy.handlers.delete_collection import ( @@ -106,26 +105,27 @@ def count( # ------------------------------------------------------------------------------ @app.command( "import", - help="Chunk, embed, and add a file to a collection in the local Chroma database.", + help=( + "Chunk, embed, and add one or more files to a collection in the " + "local Chroma database." + ), ) def import_data( collection: Annotated[ str, typer.Argument(help="Name of the target collection."), ], - file: Annotated[ - str, - typer.Argument(help="Path to the file to chunk and add to the collection."), + files: Annotated[ + list[str], + typer.Argument( + help="Path(s) to the file(s) to chunk and add to the collection." + ), ], ) -> None: try: - _run(lambda: handle_import(collection, file)) + _run(lambda: handle_import(collection, files)) except NotFoundError: _fail(f"Collection '{collection}' does not exist.") - except FileNotFoundError: - _fail(f"The file '{file}' was not found.") - except UnsupportedTextFileError: - _fail(f"The file '{file}' is not a text file.") # ------------------------------------------------------------------------------ diff --git a/chromy/handlers/import_data.py b/chromy/handlers/import_data.py index d59281f..15f8fec 100644 --- a/chromy/handlers/import_data.py +++ b/chromy/handlers/import_data.py @@ -2,6 +2,7 @@ from __future__ import annotations import os from pathlib import Path +from typing import Final from rich import print @@ -10,6 +11,9 @@ from chromy.utilities import ingest_file from ..utilities import is_probably_text_file +SUCCESS_EXIT_CODE: Final = 0 +FAILURE_EXIT_CODE: Final = 1 + def _get_absolute_path(file: str) -> str: """ @@ -29,12 +33,49 @@ def _get_absolute_path(file: str) -> str: return str(file_path.resolve()) -def handle_import(collection: str, file: str) -> int: +def _import_one(collection: str, file: str) -> int: absolute_path = _get_absolute_path(file) + if not Path(absolute_path).is_file(): + raise FileNotFoundError() + if not 
is_probably_text_file(absolute_path): raise UnsupportedTextFileError() records_added = ingest_file(collection, absolute_path) - print(f"[bold green]Added[/] {records_added} records to collection '{collection}'.") - return 0 + print( + "[bold green]Added[/] " + f"{records_added} records from '{file}' to collection '{collection}'." + ) + return SUCCESS_EXIT_CODE + + +def handle_import(collection: str, files: list[str]) -> int: + successful_imports = 0 + failed_imports = 0 + seen_paths: set[str] = set() + + for file in files: + try: + absolute_path = _get_absolute_path(file) + if absolute_path in seen_paths: + continue + + seen_paths.add(absolute_path) + _import_one(collection, file) + successful_imports += 1 + except FileNotFoundError: + failed_imports += 1 + print(f"[bold red]Error[/]: The file '{file}' was not found.") + except UnsupportedTextFileError: + failed_imports += 1 + print(f"[bold red]Error[/]: The file '{file}' is not a text file.") + + print( + f"Imported {successful_imports} file(s) successfully; {failed_imports} failed." 
+ ) + + if failed_imports: + return FAILURE_EXIT_CODE + + return SUCCESS_EXIT_CODE diff --git a/tests/test_cli.py b/tests/test_cli.py index efde08f..ec90ebd 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -106,7 +106,58 @@ class CliTests(unittest.TestCase): self._fixture_path("romeo_and_juliet.txt"), ) self.assertEqual(result.exit_code, 0) - self.assertEqual(result.stdout, "Added 3 records to collection 'notes'.\n") + self.assertEqual( + result.stdout, + "Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n" + "Imported 1 file(s) successfully; 0 failed.\n", + ) + + def test_import_data_accepts_multiple_files(self) -> None: + with patch( + "chromy.handlers.import_data.ingest_file", + side_effect=[3, 2], + ) as ingest_file: + result = _invoke( + ["import", "notes", "romeo_and_juliet.txt", "README.md"], + ) + + self.assertEqual(ingest_file.call_count, 2) + ingest_file.assert_any_call( + "notes", + self._fixture_path("romeo_and_juliet.txt"), + ) + ingest_file.assert_any_call( + "notes", + self._fixture_path("README.md"), + ) + self.assertEqual(result.exit_code, 0) + self.assertEqual( + result.stdout, + "Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n" + "Added 2 records from 'README.md' to collection 'notes'.\n" + "Imported 2 file(s) successfully; 0 failed.\n", + ) + + def test_import_data_continues_after_missing_file(self) -> None: + with patch( + "chromy.handlers.import_data.ingest_file", + return_value=3, + ) as ingest_file: + result = _invoke( + ["import", "notes", "missing.txt", "romeo_and_juliet.txt"], + ) + + ingest_file.assert_called_once_with( + "notes", + self._fixture_path("romeo_and_juliet.txt"), + ) + self.assertEqual(result.exit_code, 1) + self.assertEqual( + result.stdout, + "Error: The file 'missing.txt' was not found.\n" + "Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n" + "Imported 1 file(s) successfully; 1 failed.\n", + ) def test_import_data_rejects_non_text_files(self) -> None: 
with patch( @@ -118,7 +169,38 @@ class CliTests(unittest.TestCase): self.assertEqual(result.exit_code, 1) self.assertEqual( result.stdout, - "Error: The file 'romeo_and_juliet.txt' is not a text file.\n", + "Error: The file 'romeo_and_juliet.txt' is not a text file.\n" + "Imported 0 file(s) successfully; 1 failed.\n", + ) + + def test_import_data_treats_literal_glob_as_missing_file(self) -> None: + result = _invoke(["import", "notes", "*.md"]) + + self.assertEqual(result.exit_code, 1) + self.assertEqual( + result.stdout, + "Error: The file '*.md' was not found.\n" + "Imported 0 file(s) successfully; 1 failed.\n", + ) + + def test_import_data_deduplicates_paths_within_single_invocation(self) -> None: + with patch( + "chromy.handlers.import_data.ingest_file", + return_value=3, + ) as ingest_file: + result = _invoke( + ["import", "notes", "README.md", "./README.md"], + ) + + ingest_file.assert_called_once_with( + "notes", + self._fixture_path("README.md"), + ) + self.assertEqual(result.exit_code, 0) + self.assertEqual( + result.stdout, + "Added 3 records from 'README.md' to collection 'notes'.\n" + "Imported 1 file(s) successfully; 0 failed.\n", ) def test_query(self) -> None: diff --git a/tests/test_handlers.py b/tests/test_handlers.py index 7bb2d6c..53df14a 100644 --- a/tests/test_handlers.py +++ b/tests/test_handlers.py @@ -8,7 +8,6 @@ from pathlib import Path from typing import TypeVar from unittest.mock import patch -from chromy.errors import UnsupportedTextFileError from chromy.handlers.count_collection import handle_count_collection from chromy.handlers.create_collection import handle_create_collection from chromy.handlers.delete_collection import ( @@ -97,7 +96,7 @@ class HandlerTests(unittest.TestCase): exit_code, output = _capture_output( handle_import, "notes", - "romeo_and_juliet.txt", + ["romeo_and_juliet.txt"], ) ingest_file.assert_called_once_with( @@ -105,17 +104,74 @@ class HandlerTests(unittest.TestCase): self._fixture_path("romeo_and_juliet.txt"), ) 
self.assertEqual(exit_code, 0) - self.assertEqual(output, "Added 3 records to collection 'notes'.\n") + self.assertEqual( + output, + "Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n" + "Imported 1 file(s) successfully; 0 failed.\n", + ) + + def test_import_data_continues_after_missing_file(self) -> None: + with patch( + "chromy.handlers.import_data.ingest_file", + return_value=3, + ) as ingest_file: + exit_code, output = _capture_output( + handle_import, + "notes", + ["missing.txt", "romeo_and_juliet.txt"], + ) + + ingest_file.assert_called_once_with( + "notes", + self._fixture_path("romeo_and_juliet.txt"), + ) + self.assertEqual(exit_code, 1) + self.assertEqual( + output, + "Error: The file 'missing.txt' was not found.\n" + "Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n" + "Imported 1 file(s) successfully; 1 failed.\n", + ) def test_import_data_rejects_non_text_files(self) -> None: - with ( - patch( - "chromy.handlers.import_data.is_probably_text_file", - return_value=False, - ), - self.assertRaises(UnsupportedTextFileError), + with patch( + "chromy.handlers.import_data.is_probably_text_file", + return_value=False, ): - handle_import("notes", "romeo_and_juliet.txt") + exit_code, output = _capture_output( + handle_import, + "notes", + ["romeo_and_juliet.txt"], + ) + + self.assertEqual(exit_code, 1) + self.assertEqual( + output, + "Error: The file 'romeo_and_juliet.txt' is not a text file.\n" + "Imported 0 file(s) successfully; 1 failed.\n", + ) + + def test_import_data_deduplicates_files(self) -> None: + with patch( + "chromy.handlers.import_data.ingest_file", + return_value=3, + ) as ingest_file: + exit_code, output = _capture_output( + handle_import, + "notes", + ["README.md", "./README.md"], + ) + + ingest_file.assert_called_once_with( + "notes", + self._fixture_path("README.md"), + ) + self.assertEqual(exit_code, 0) + self.assertEqual( + output, + "Added 3 records from 'README.md' to collection 'notes'.\n" + 
"Imported 1 file(s) successfully; 0 failed.\n", + ) def test_query_uses_typed_input(self) -> None: query_result = {"ids": [["1"]], "documents": [["hello"]]}