add multi-file import support
This commit is contained in:
@@ -124,7 +124,7 @@ list-collections
|
||||
create-collection <collection>
|
||||
delete-collection <collection>
|
||||
count <collection>
|
||||
add-data <collection> <file>
|
||||
import <collection> <file> [<file> ...]
|
||||
query <collection> <query_text>
|
||||
delete <collection> --where <condition>=<value>
|
||||
```
|
||||
@@ -137,10 +137,12 @@ Create a collection:
|
||||
chromy create-collection notes
|
||||
```
|
||||
|
||||
Add a file:
|
||||
Add one or more files:
|
||||
|
||||
```bash
|
||||
chromy import notes ./docs/example.txt
|
||||
chromy import notes ./docs/intro.md ./docs/setup.md
|
||||
chromy import notes *.md
|
||||
```
|
||||
|
||||
Count stored records:
|
||||
@@ -175,7 +177,7 @@ chromy delete notes --where file_name=example.txt
|
||||
|
||||
## How ingestion works
|
||||
|
||||
When you run `import`, the file is:
|
||||
When you run `import`, each file is:
|
||||
|
||||
1. read from disk
|
||||
2. split into chunks
|
||||
@@ -188,4 +190,8 @@ Query results include the stored document chunk, its id, distance, and file name
|
||||
|
||||
- collections are stored in a local persistent Chroma database in the current directory
|
||||
- `import` requires the target collection to already exist
|
||||
- the CLI prints friendly messages for common errors such as missing collections or missing files
|
||||
- `import` accepts one or more file paths
|
||||
- unquoted glob patterns such as `*.md` are expanded by the shell before `chromy` starts
|
||||
- quoted glob patterns such as `"*.md"` are treated as literal paths and are not expanded by `chromy`
|
||||
- unmatched unquoted globs behave differently depending on the shell: `zsh` typically fails before `chromy` starts, while `bash` may pass the literal pattern through, depending on shell settings
|
||||
- the CLI reports file-specific import failures and continues with the remaining files
|
||||
|
||||
+10
-10
@@ -6,7 +6,6 @@ import typer
|
||||
from chromadb.errors import InternalError, NotFoundError
|
||||
from rich import print
|
||||
|
||||
from chromy.errors import UnsupportedTextFileError
|
||||
from chromy.handlers.count_collection import handle_count_collection
|
||||
from chromy.handlers.create_collection import handle_create_collection
|
||||
from chromy.handlers.delete_collection import (
|
||||
@@ -106,26 +105,27 @@ def count(
|
||||
# ------------------------------------------------------------------------------
|
||||
@app.command(
|
||||
"import",
|
||||
help="Chunk, embed, and add a file to a collection in the local Chroma database.",
|
||||
help=(
|
||||
"Chunk, embed, and add one or more files to a collection in the "
|
||||
"local Chroma database."
|
||||
),
|
||||
)
|
||||
def import_data(
|
||||
collection: Annotated[
|
||||
str,
|
||||
typer.Argument(help="Name of the target collection."),
|
||||
],
|
||||
file: Annotated[
|
||||
str,
|
||||
typer.Argument(help="Path to the file to chunk and add to the collection."),
|
||||
files: Annotated[
|
||||
list[str],
|
||||
typer.Argument(
|
||||
help="Path(s) to the file(s) to chunk and add to the collection."
|
||||
),
|
||||
],
|
||||
) -> None:
|
||||
try:
|
||||
_run(lambda: handle_import(collection, file))
|
||||
_run(lambda: handle_import(collection, files))
|
||||
except NotFoundError:
|
||||
_fail(f"Collection '{collection}' does not exist.")
|
||||
except FileNotFoundError:
|
||||
_fail(f"The file '{file}' was not found.")
|
||||
except UnsupportedTextFileError:
|
||||
_fail(f"The file '{file}' is not a text file.")
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Final
|
||||
|
||||
from rich import print
|
||||
|
||||
@@ -10,6 +11,9 @@ from chromy.utilities import ingest_file
|
||||
|
||||
from ..utilities import is_probably_text_file
|
||||
|
||||
SUCCESS_EXIT_CODE: Final = 0
|
||||
FAILURE_EXIT_CODE: Final = 1
|
||||
|
||||
|
||||
def _get_absolute_path(file: str) -> str:
|
||||
"""
|
||||
@@ -29,12 +33,49 @@ def _get_absolute_path(file: str) -> str:
|
||||
return str(file_path.resolve())
|
||||
|
||||
|
||||
def handle_import(collection: str, file: str) -> int:
|
||||
def _import_one(collection: str, file: str) -> int:
|
||||
absolute_path = _get_absolute_path(file)
|
||||
|
||||
if not Path(absolute_path).is_file():
|
||||
raise FileNotFoundError()
|
||||
|
||||
if not is_probably_text_file(absolute_path):
|
||||
raise UnsupportedTextFileError()
|
||||
|
||||
records_added = ingest_file(collection, absolute_path)
|
||||
print(f"[bold green]Added[/] {records_added} records to collection '{collection}'.")
|
||||
return 0
|
||||
print(
|
||||
"[bold green]Added[/] "
|
||||
f"{records_added} records from '{file}' to collection '{collection}'."
|
||||
)
|
||||
return SUCCESS_EXIT_CODE
|
||||
|
||||
|
||||
def handle_import(collection: str, files: list[str]) -> int:
    """Import each of ``files`` into ``collection`` and print a summary.

    Every path is resolved with ``_get_absolute_path``; a path whose resolved
    form was already seen in this call is skipped and counted neither as a
    success nor as a failure. ``FileNotFoundError`` and
    ``UnsupportedTextFileError`` are handled per file: an error line is
    printed and processing moves on to the next file. Any other exception
    propagates to the caller.

    Returns ``FAILURE_EXIT_CODE`` when at least one file failed, otherwise
    ``SUCCESS_EXIT_CODE``.
    """
    successful_imports = 0
    failed_imports = 0
    already_resolved: set[str] = set()

    for file in files:
        try:
            resolved = _get_absolute_path(file)
            if resolved not in already_resolved:
                # Record the path before importing so that a failing file is
                # also reported only once when it is listed repeatedly.
                already_resolved.add(resolved)
                _import_one(collection, file)
                successful_imports += 1
        except FileNotFoundError:
            failed_imports += 1
            print(f"[bold red]Error[/]: The file '{file}' was not found.")
        except UnsupportedTextFileError:
            failed_imports += 1
            print(f"[bold red]Error[/]: The file '{file}' is not a text file.")

    print(
        f"Imported {successful_imports} file(s) successfully; {failed_imports} failed."
    )

    return FAILURE_EXIT_CODE if failed_imports else SUCCESS_EXIT_CODE
|
||||
|
||||
+84
-2
@@ -106,7 +106,58 @@ class CliTests(unittest.TestCase):
|
||||
self._fixture_path("romeo_and_juliet.txt"),
|
||||
)
|
||||
self.assertEqual(result.exit_code, 0)
|
||||
self.assertEqual(result.stdout, "Added 3 records to collection 'notes'.\n")
|
||||
self.assertEqual(
|
||||
result.stdout,
|
||||
"Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n"
|
||||
"Imported 1 file(s) successfully; 0 failed.\n",
|
||||
)
|
||||
|
||||
def test_import_data_accepts_multiple_files(self) -> None:
    """Two file arguments lead to two ingest calls and a combined summary."""
    file_names = ["romeo_and_juliet.txt", "README.md"]

    with patch(
        "chromy.handlers.import_data.ingest_file",
        side_effect=[3, 2],
    ) as mocked_ingest:
        outcome = _invoke(["import", "notes", *file_names])

    self.assertEqual(mocked_ingest.call_count, 2)
    for name in file_names:
        mocked_ingest.assert_any_call("notes", self._fixture_path(name))

    self.assertEqual(outcome.exit_code, 0)
    expected_stdout = (
        "Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n"
        "Added 2 records from 'README.md' to collection 'notes'.\n"
        "Imported 2 file(s) successfully; 0 failed.\n"
    )
    self.assertEqual(outcome.stdout, expected_stdout)
|
||||
|
||||
def test_import_data_continues_after_missing_file(self) -> None:
    """A missing file is reported, the next file is still ingested, exit code is 1."""
    argv = ["import", "notes", "missing.txt", "romeo_and_juliet.txt"]

    with patch(
        "chromy.handlers.import_data.ingest_file",
        return_value=3,
    ) as mocked_ingest:
        outcome = _invoke(argv)

    existing_file = self._fixture_path("romeo_and_juliet.txt")
    mocked_ingest.assert_called_once_with("notes", existing_file)

    self.assertEqual(outcome.exit_code, 1)
    expected_stdout = (
        "Error: The file 'missing.txt' was not found.\n"
        "Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n"
        "Imported 1 file(s) successfully; 1 failed.\n"
    )
    self.assertEqual(outcome.stdout, expected_stdout)
|
||||
|
||||
def test_import_data_rejects_non_text_files(self) -> None:
|
||||
with patch(
|
||||
@@ -118,7 +169,38 @@ class CliTests(unittest.TestCase):
|
||||
self.assertEqual(result.exit_code, 1)
|
||||
self.assertEqual(
|
||||
result.stdout,
|
||||
"Error: The file 'romeo_and_juliet.txt' is not a text file.\n",
|
||||
"Error: The file 'romeo_and_juliet.txt' is not a text file.\n"
|
||||
"Imported 0 file(s) successfully; 1 failed.\n",
|
||||
)
|
||||
|
||||
def test_import_data_treats_literal_glob_as_missing_file(self) -> None:
    """A literal ``*.md`` argument is reported as a missing file with exit code 1."""
    outcome = _invoke(["import", "notes", "*.md"])

    expected_stdout = (
        "Error: The file '*.md' was not found.\n"
        "Imported 0 file(s) successfully; 1 failed.\n"
    )
    self.assertEqual(outcome.exit_code, 1)
    self.assertEqual(outcome.stdout, expected_stdout)
|
||||
|
||||
def test_import_data_deduplicates_paths_within_single_invocation(self) -> None:
    """``README.md`` and ``./README.md`` in one call trigger a single ingest."""
    argv = ["import", "notes", "README.md", "./README.md"]

    with patch(
        "chromy.handlers.import_data.ingest_file",
        return_value=3,
    ) as mocked_ingest:
        outcome = _invoke(argv)

    readme_path = self._fixture_path("README.md")
    mocked_ingest.assert_called_once_with("notes", readme_path)

    self.assertEqual(outcome.exit_code, 0)
    expected_stdout = (
        "Added 3 records from 'README.md' to collection 'notes'.\n"
        "Imported 1 file(s) successfully; 0 failed.\n"
    )
    self.assertEqual(outcome.stdout, expected_stdout)
|
||||
|
||||
def test_query(self) -> None:
|
||||
|
||||
+66
-10
@@ -8,7 +8,6 @@ from pathlib import Path
|
||||
from typing import TypeVar
|
||||
from unittest.mock import patch
|
||||
|
||||
from chromy.errors import UnsupportedTextFileError
|
||||
from chromy.handlers.count_collection import handle_count_collection
|
||||
from chromy.handlers.create_collection import handle_create_collection
|
||||
from chromy.handlers.delete_collection import (
|
||||
@@ -97,7 +96,7 @@ class HandlerTests(unittest.TestCase):
|
||||
exit_code, output = _capture_output(
|
||||
handle_import,
|
||||
"notes",
|
||||
"romeo_and_juliet.txt",
|
||||
["romeo_and_juliet.txt"],
|
||||
)
|
||||
|
||||
ingest_file.assert_called_once_with(
|
||||
@@ -105,17 +104,74 @@ class HandlerTests(unittest.TestCase):
|
||||
self._fixture_path("romeo_and_juliet.txt"),
|
||||
)
|
||||
self.assertEqual(exit_code, 0)
|
||||
self.assertEqual(output, "Added 3 records to collection 'notes'.\n")
|
||||
self.assertEqual(
|
||||
output,
|
||||
"Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n"
|
||||
"Imported 1 file(s) successfully; 0 failed.\n",
|
||||
)
|
||||
|
||||
def test_import_data_continues_after_missing_file(self) -> None:
    """``handle_import`` reports the missing file, ingests the other, and returns 1."""
    requested = ["missing.txt", "romeo_and_juliet.txt"]

    with patch(
        "chromy.handlers.import_data.ingest_file",
        return_value=3,
    ) as mocked_ingest:
        status, captured = _capture_output(handle_import, "notes", requested)

    existing_file = self._fixture_path("romeo_and_juliet.txt")
    mocked_ingest.assert_called_once_with("notes", existing_file)

    self.assertEqual(status, 1)
    expected_output = (
        "Error: The file 'missing.txt' was not found.\n"
        "Added 3 records from 'romeo_and_juliet.txt' to collection 'notes'.\n"
        "Imported 1 file(s) successfully; 1 failed.\n"
    )
    self.assertEqual(captured, expected_output)
|
||||
|
||||
def test_import_data_rejects_non_text_files(self) -> None:
|
||||
with (
|
||||
patch(
|
||||
"chromy.handlers.import_data.is_probably_text_file",
|
||||
return_value=False,
|
||||
),
|
||||
self.assertRaises(UnsupportedTextFileError),
|
||||
with patch(
|
||||
"chromy.handlers.import_data.is_probably_text_file",
|
||||
return_value=False,
|
||||
):
|
||||
handle_import("notes", "romeo_and_juliet.txt")
|
||||
exit_code, output = _capture_output(
|
||||
handle_import,
|
||||
"notes",
|
||||
["romeo_and_juliet.txt"],
|
||||
)
|
||||
|
||||
self.assertEqual(exit_code, 1)
|
||||
self.assertEqual(
|
||||
output,
|
||||
"Error: The file 'romeo_and_juliet.txt' is not a text file.\n"
|
||||
"Imported 0 file(s) successfully; 1 failed.\n",
|
||||
)
|
||||
|
||||
def test_import_data_deduplicates_files(self) -> None:
    """``handle_import`` ingests ``README.md`` once when it is listed under two spellings."""
    requested = ["README.md", "./README.md"]

    with patch(
        "chromy.handlers.import_data.ingest_file",
        return_value=3,
    ) as mocked_ingest:
        status, captured = _capture_output(handle_import, "notes", requested)

    readme_path = self._fixture_path("README.md")
    mocked_ingest.assert_called_once_with("notes", readme_path)

    self.assertEqual(status, 0)
    expected_output = (
        "Added 3 records from 'README.md' to collection 'notes'.\n"
        "Imported 1 file(s) successfully; 0 failed.\n"
    )
    self.assertEqual(captured, expected_output)
|
||||
|
||||
def test_query_uses_typed_input(self) -> None:
|
||||
query_result = {"ids": [["1"]], "documents": [["hello"]]}
|
||||
|
||||
Reference in New Issue
Block a user