move top-level modules into a real package

This commit is contained in:
Matteo Rosati
2026-04-22 15:47:46 +02:00
parent e33160282c
commit 8ebab832d5
35 changed files with 6192 additions and 31 deletions
+1 -1
View File
@@ -82,7 +82,7 @@ chromy --help
You can also run it from the source tree without installing the tool:
```bash
uv run python main.py --help
uv run python -m chromy.main --help
```
## Commands
+1
View File
@@ -0,0 +1 @@
"""Chromy package."""
@@ -6,7 +6,7 @@ from chromadb.api import ClientAPI
from chromadb.api.types import QueryResult
from chromadb.errors import NotFoundError
from embed import EmbeddingRecord
from chromy.embed import EmbeddingRecord
def _get_client_and_collection(
+6 -6
View File
@@ -6,15 +6,15 @@ from dataclasses import dataclass
from chromadb.errors import InternalError, NotFoundError
from handlers.add_data import handle_add_data
from handlers.count_collection import handle_count_collection
from handlers.create_collection import handle_create_collection
from handlers.delete_collection import (
from chromy.handlers.add_data import handle_add_data
from chromy.handlers.count_collection import handle_count_collection
from chromy.handlers.create_collection import handle_create_collection
from chromy.handlers.delete_collection import (
handle_delete_collection,
handle_delete_records,
)
from handlers.list_collections import handle_list_collections
from handlers.query import handle_query
from chromy.handlers.list_collections import handle_list_collections
from chromy.handlers.query import handle_query
CommandHandler = Callable[[Namespace], int]
View File
@@ -1,6 +1,6 @@
from argparse import Namespace
from utilities import ingest_file
from chromy.utilities import ingest_file
def handle_add_data(args: Namespace) -> int:
@@ -1,6 +1,6 @@
from argparse import Namespace
from chroma_functions import count_collection
from chromy.chroma_functions import count_collection
def handle_count_collection(args: Namespace) -> int:
@@ -1,6 +1,6 @@
from argparse import Namespace
from chroma_functions import create_collection
from chromy.chroma_functions import create_collection
def handle_create_collection(args: Namespace) -> int:
@@ -1,6 +1,6 @@
from argparse import Namespace
from chroma_functions import delete_collection, delete_data
from chromy.chroma_functions import delete_collection, delete_data
def _parse_where_clause(where_clause: str) -> dict[str, str]:
@@ -1,7 +1,7 @@
from argparse import Namespace
from chroma_functions import list_collections
from utilities import print_lines
from chromy.chroma_functions import list_collections
from chromy.utilities import print_lines
def handle_list_collections(_: Namespace) -> int:
@@ -1,6 +1,6 @@
from argparse import Namespace
from utilities import format_query_result, print_lines, run_query
from chromy.utilities import format_query_result, print_lines, run_query
def handle_query(args: Namespace) -> int:
+2 -2
View File
@@ -2,8 +2,8 @@ from __future__ import annotations
from dotenv import load_dotenv
from cli_app import execute_command
from cli_parser import build_parser
from chromy.cli_app import execute_command
from chromy.cli_parser import build_parser
def main() -> int:
+3 -3
View File
@@ -1,9 +1,9 @@
from chromadb import QueryResult
from collections.abc import Mapping
from chroma_functions import add_data, query_data
from chunk_functions import chunk_file
from embed import embed
from chromy.chroma_functions import add_data, query_data
from chromy.chunk_functions import chunk_file
from chromy.embed import embed
def print_lines(lines: list[str]) -> None:
+37
View File
@@ -0,0 +1,37 @@
# 1. Move Top-Level Modules Into a Real Package [DONE]
## Summary
Move the current flat module layout into a proper `chromy/` package so imports, packaging, and future subpackages are easier to manage.
## Implementation Steps
- Create a `chromy/` package with `__init__.py`.
- Move `main.py`, `cli_app.py`, `cli_parser.py`, `chroma_functions.py`, `chunk_functions.py`, `embed.py`, and `utilities.py` into `chromy/`.
- Move `handlers/` into `chromy/handlers/`.
- Update imports to absolute package imports such as `from chromy.cli_app import execute_command` and `from chromy.handlers.add_data import handle_add_data`.
- Update `[project.scripts]` in `pyproject.toml` from `main:main` to `chromy.main:main`.
- Update setuptools configuration to package `chromy` and `chromy.handlers` instead of using top-level `py-modules`.
- Update README development commands from `uv run python main.py --help` to `uv run python -m chromy.main --help`.
## Public Interface Changes
- The installed CLI command remains `chromy`.
- Programmatic imports move from top-level modules to `chromy.*`.
- Running from source should use `python -m chromy.main`.
## Test Plan
- Run `uv run python -m chromy.main --help`.
- Run `uv run python -m chromy.main list-collections`.
- Build the package with `uv build`.
- Install locally in editable mode and confirm `chromy --help` resolves to the packaged entrypoint.
- Test all commands to verify they still work:
- [creating, listing, deleting] collections
- [adding documents to, deleting documents from] a collection (use [romeo_and_juliet.txt](romeo_and_juliet.txt))
- querying
## Assumptions
- Backward-compatible top-level imports are not required.
- The package refactor should preserve behavior before deeper service or architecture changes are made.
+31
View File
@@ -0,0 +1,31 @@
# 2. Replace `argparse.Namespace` Plumbing With Typed Command Inputs
## Summary
Stop passing mutable `argparse.Namespace` objects into handlers. Convert parsed CLI arguments into typed command dataclasses before dispatch.
## Implementation Steps
- Add frozen dataclasses for command inputs, such as list collections, create collection, delete collection, count, add data, query, and delete records.
- Keep `argparse` isolated in the CLI adapter layer.
- Convert `Namespace` into the correct command dataclass immediately before dispatch.
- Change handler signatures from `Callable[[Namespace], int]` to typed command-specific callables.
- Remove mutations such as `args.error_message = ...` in `cli_app.py` and `handlers/delete_collection.py`.
- Return or raise explicit structured results/errors rather than writing temporary state back into the parsed args object.
## Public Interface Changes
- CLI command syntax stays the same.
- Internal handler APIs change to typed dataclass inputs.
- Error message builders should receive typed command inputs or exceptions instead of raw `Namespace`.
## Test Plan
- Add parser-to-command conversion tests for every command and alias.
- Add handler unit tests that construct command dataclasses directly.
- Verify invalid delete filters still produce the same user-facing error.
## Assumptions
- Command dataclasses should live near CLI application code until a broader package refactor introduces clearer subpackages.
- The first pass should preserve the existing command names, arguments, aliases, and output.
@@ -0,0 +1,31 @@
# 3. Split CLI, Service, Repository, and Formatting Responsibilities
## Summary
Separate command handling, business workflows, Chroma persistence, and output formatting so each layer has a single responsibility.
## Implementation Steps
- Keep handlers thin: receive typed command input, call a service, and print formatted output.
- Move ingestion orchestration from `utilities.ingest_file` into an `IngestionService`.
- Move query orchestration from `utilities.run_query` into a `QueryService`.
- Move Chroma collection operations from `chroma_functions.py` into a `ChromaRepository` or `ChromaStore`.
- Move output-only functions such as `format_query_result` and `print_lines` into a formatting module.
- Keep Chroma-specific result parsing out of CLI handlers.
## Public Interface Changes
- CLI behavior remains unchanged.
- Internal APIs become service and repository methods instead of free functions.
- Formatting functions should accept internal domain objects rather than raw Chroma response dictionaries where possible.
## Test Plan
- Unit test services with mocked repository, chunker, and embedder dependencies.
- Unit test formatter output without requiring Chroma.
- Run a small manual CLI smoke test for create, add, query, count, list, and delete.
## Assumptions
- This should be done after typed command inputs and package layout changes, or coordinated carefully with them.
- No JSON output mode or web API is added in this refactor; the goal is to make those future changes easier.
@@ -0,0 +1,30 @@
# 4. Centralize Chroma Client Configuration
## Summary
Create Chroma client configuration once per command and inject it into repository or service objects instead of repeatedly calling `chromadb.PersistentClient()`.
## Implementation Steps
- Add a small configuration object for Chroma settings, including persistence path.
- Load the persistence path from a CLI option or an environment variable, falling back to Chroma's default behavior when neither is set.
- Create a Chroma client factory that returns one client per command execution.
- Inject the client into the Chroma repository instead of constructing it inside each function.
- Remove repeated `chromadb.PersistentClient()` calls from collection operations.
- Ensure tests can pass an in-memory or temporary Chroma client.
## Public Interface Changes
- Add a documented way to configure the Chroma persistence path.
- Existing commands should continue to work with the current implicit default when no path is provided.
## Test Plan
- Unit test repository operations with an injected test client.
- Verify two commands using the same configured persistence directory can see the same collections.
- Verify default behavior still works when no path is configured.
## Assumptions
- The initial implementation should preserve Chroma's default persistence behavior unless a path is explicitly configured.
- Configuration should be introduced before larger repository and service tests depend on isolated storage.
+30
View File
@@ -0,0 +1,30 @@
# 6. Add Ruff and Mypy Configuration
## Summary
Add linting, formatting, and static type checking configuration to `pyproject.toml`. Do not add pytest configuration for this improvement.
## Implementation Steps
- Add `ruff` and `mypy` to the development dependency group.
- Configure Ruff formatting and lint rules in `pyproject.toml`.
- Configure mypy in strict or near-strict mode suitable for the current Chroma and third-party boundaries.
- Add README development commands for `uv run ruff check .`, `uv run ruff format .`, and `uv run mypy .`.
- Fix only minimal configuration blockers in this plan; broad type modernization belongs to the type-hints plan.
## Public Interface Changes
- No runtime CLI behavior changes.
- Development workflow gains standard lint, format, and type-check commands.
## Test Plan
- Run `uv run ruff check .`.
- Run `uv run ruff format --check .`.
- Run `uv run mypy .`.
- Confirm no pytest configuration is added as part of this plan.
## Assumptions
- Pytest is intentionally out of scope for this improvement.
- Strict mypy may need targeted ignores for third-party packages that do not expose complete typing.
+29
View File
@@ -0,0 +1,29 @@
# 7. Modernize Type Hints and Add Missing Future Imports
## Summary
Make type annotations consistent across the codebase by using modern Python 3.12 typing syntax and adding future annotations imports where useful.
## Implementation Steps
- Add `from __future__ import annotations` consistently to Python modules that do not already have it.
- Replace `typing.List` with built-in generic syntax such as `list[str]` and `list[EmbeddingRecord]`.
- Use `collections.abc` input interfaces such as `Sequence[str]` where mutation is not required.
- Introduce type aliases or dataclasses for internal Chroma result shapes only where they reduce ambiguity.
- Keep runtime behavior unchanged.
## Public Interface Changes
- No CLI behavior changes.
- Public Python annotations become more precise and consistent.
## Test Plan
- Run `uv run ruff check .`.
- Run `uv run mypy .` after mypy configuration exists.
- Manually smoke test the CLI commands whose code paths cross the type boundaries touched by these changes.
## Assumptions
- The project remains Python 3.12+, so built-in generic syntax is acceptable everywhere.
- Larger domain model changes should be handled in the service/repository and query-formatting plans.
+29
View File
@@ -0,0 +1,29 @@
# 8. Avoid Catching `BaseException` in CLI Dispatch
## Summary
Change CLI dispatch error handling so process-control exceptions such as `KeyboardInterrupt` and `SystemExit` are not swallowed.
## Implementation Steps
- Change `execute_command` to catch `Exception` instead of `BaseException`.
- Keep mapped, expected errors handled through the existing command error mapping or its replacement.
- Print handled user-facing errors to `stderr` instead of `stdout`.
- Allow unmapped exceptions, `KeyboardInterrupt`, and `SystemExit` to propagate normally.
- Consider adding debug logging for unexpected exceptions after the logging plan exists.
## Public Interface Changes
- Expected command errors still return a non-zero exit code.
- Handled error messages move from stdout to stderr.
- Interrupt and process-exit behavior becomes conventional.
## Test Plan
- Test that a mapped exception returns `1` and writes to stderr.
- Test that `KeyboardInterrupt` is not caught by `execute_command`.
- Test that unmapped exceptions still propagate.
## Assumptions
- Returning `1` for handled user errors remains acceptable until the exit-code conventions plan is implemented.
@@ -0,0 +1,29 @@
# 9. Use Domain-Specific Exceptions Instead of Chroma Exceptions in CLI Mapping
## Summary
Hide Chroma-specific exceptions behind application-level exceptions so the CLI does not depend on Chroma's exception model.
## Implementation Steps
- Define app-level exceptions such as `CollectionNotFoundError`, `CollectionAlreadyExistsError`, and `StorageOperationError`.
- Convert Chroma exceptions inside the repository layer.
- Update CLI command error mappings to handle app-level exceptions only.
- Preserve existing user-facing messages for missing and duplicate collections.
- Avoid importing `chromadb.errors` in CLI modules after the repository layer owns that boundary.
## Public Interface Changes
- CLI behavior and messages should remain the same.
- Internal error contracts change from Chroma exceptions to app-level exceptions.
## Test Plan
- Unit test repository exception translation.
- Unit test CLI mappings for app-level exceptions.
- Smoke test missing collection, duplicate collection, and successful operations.
## Assumptions
- Chroma remains the only storage backend for now.
- The exception layer is still useful because it prevents storage details from leaking upward.
+30
View File
@@ -0,0 +1,30 @@
# 10. Make Ingestion More Configurable
## Summary
Move chunking and embedding choices into configuration and expose chunk size as an `add-data` CLI option.
## Implementation Steps
- Add ingestion configuration for chunk size, tokenizer/model name, and embedding function provider.
- Change chunking code to receive chunk size and tokenizer/model name instead of hard-coding `800` and `"gpt-4"`.
- Reuse the embedding function through dependency injection instead of constructing it for every embed call.
- Add `--chunk-size` to `add-data`, defaulting to the current value of `800`.
- Keep the default tokenizer/model behavior equivalent to the current `"gpt-4"` setting.
## Public Interface Changes
- `add-data` gains `--chunk-size`.
- Default ingestion behavior remains unchanged when no option is provided.
## Test Plan
- Test chunking with default and custom chunk sizes.
- Test `add-data --chunk-size` parser behavior.
- Test ingestion service with an injected fake embedder.
- Smoke test adding a file with and without `--chunk-size`.
## Assumptions
- Only chunk size is exposed in the CLI initially.
- Tokenizer/model and embedding provider configuration can remain internal or environment-backed until there is a concrete user-facing need.
+30
View File
@@ -0,0 +1,30 @@
# 11. Improve File Handling
## Summary
Make file ingestion boundaries clearer by using `Path`, explicit UTF-8 decoding, and validation before reading.
## Implementation Steps
- Change internal file ingestion APIs to accept `Path` instead of raw `str`.
- Convert CLI string paths to `Path` in the command adapter or handler.
- Validate that the path exists and is a regular file before reading.
- Read text with `encoding="utf-8"`.
- Raise a clear app-level file error for missing paths, directories, and decoding failures.
- Leave PDF and future file loaders out of scope for now.
## Public Interface Changes
- CLI argument remains a file path string.
- Error messages for missing or invalid files become clearer.
## Test Plan
- Test successful text-file loading.
- Test missing file, directory path, and invalid UTF-8 handling.
- Smoke test `add-data` with a valid UTF-8 file.
## Assumptions
- Only plain text ingestion is supported in this plan.
- Existing metadata can continue storing the original path string as `file_name` unless a later plan changes metadata shape.
+31
View File
@@ -0,0 +1,31 @@
# 12. Review Dependencies and Remove Unused Ones
## Summary
Audit runtime dependencies and move unused or optional packages out of the base install where appropriate.
## Implementation Steps
- Compare imports in source code against dependencies declared in `pyproject.toml`.
- Keep packages that are directly imported by current runtime code.
- Move packages used only for optional or future features behind extras, such as `pdf` or `openai`.
- Remove dependencies that are neither imported nor needed transitively by active code.
- Update `uv.lock` after dependency changes.
- Update README installation notes if extras are introduced.
## Public Interface Changes
- Base installation may become smaller.
- Optional feature dependencies should be installed through extras if introduced.
## Test Plan
- Run `uv sync` after dependency edits.
- Run `uv run python -m chromy.main --help`.
- Smoke test commands that use Chroma, semchunk, dotenv loading, and embedding.
- Build the package with `uv build`.
## Assumptions
- Dependency removal should be conservative and based on actual import usage.
- No new optional features are implemented as part of this cleanup.
@@ -0,0 +1,30 @@
# 13. Make Query Result Formatting More Robust
## Summary
Convert raw Chroma query results into typed internal matches before formatting them for terminal output.
## Implementation Steps
- Add a `QueryMatch` dataclass with fields for id, document, distance, and metadata.
- Add a parser that converts Chroma `QueryResult` data into `list[QueryMatch]`.
- Handle empty results, missing documents, missing metadata, missing distances, and unexpected metadata shapes defensively.
- Change terminal formatting to accept `list[QueryMatch]`.
- Keep current text output as stable as practical.
## Public Interface Changes
- CLI output should remain effectively the same for normal query results.
- Internal formatter APIs change from raw Chroma result dictionaries to typed match objects.
## Test Plan
- Test empty query results.
- Test populated results with ids, documents, distances, and metadata.
- Test missing documents, missing metadata, multiple query result groups, and non-mapping metadata values.
- Smoke test `query` against a real collection.
## Assumptions
- The first implementation can format the first query group only, matching current behavior.
- Support for alternate output formats is not added in this plan.
@@ -0,0 +1,30 @@
# 14. Clarify Delete Filter Semantics
## Summary
Make it clear that record deletion supports a simple metadata equality filter in `key=value` form.
## Implementation Steps
- Rename internal parser variables from `condition` to `key` where equality-only semantics are intended.
- Update CLI help for `--where` to say `Metadata equality filter in the format <key>=<value>`.
- Update error messages to use `<key>=<value>`.
- Update README command documentation to include the `delete` command and its filter syntax.
- Reserve richer Chroma filters, such as JSON filters, for a later feature.
## Public Interface Changes
- CLI option remains `--where`.
- Help text and error messages become more precise.
- No richer filter syntax is added yet.
## Test Plan
- Test valid `key=value` parsing with whitespace trimming.
- Test missing separator, empty key, and empty value errors.
- Verify `chromy delete --help` documents equality semantics.
## Assumptions
- Keeping the option name `--where` is acceptable for backward compatibility.
- Renaming to `--key` is not worth the CLI break for the current feature set.
+29
View File
@@ -0,0 +1,29 @@
# 15. Improve Command Registration So Parser and Dispatcher Cannot Drift
## Summary
Unify command parser metadata, handlers, aliases, and error mappings into one registry or add a startup check that prevents parser and dispatcher drift.
## Implementation Steps
- Create one command registry that includes command name, aliases, help text, arguments, handler, and expected error mappings.
- Build argparse subcommands from the registry.
- Dispatch commands through the same registry.
- Remove duplicated command declarations from separate parser and app structures.
- Add a small validation check that command names are unique and aliases do not collide.
## Public Interface Changes
- CLI command names, aliases, arguments, and help text should remain the same.
- Internal command registration becomes centralized.
## Test Plan
- Test that every registry command appears in parser help.
- Test every alias dispatches to the canonical command.
- Test duplicate command or alias validation fails fast.
- Smoke test all existing commands through parser and dispatcher.
## Assumptions
- A unified registry is preferred over only adding a drift-detection test because the current repo already has structured command metadata.
+30
View File
@@ -0,0 +1,30 @@
# 16. Add Logging for Debuggability
## Summary
Add optional diagnostic logging for ingestion, query, and Chroma operations while keeping normal CLI output clean.
## Implementation Steps
- Configure Python's `logging` module in the CLI entrypoint.
- Add a global `--verbose` flag to enable debug-level logs.
- Log file loading, chunk counts, embedding calls, Chroma writes, collection operations, and query timing.
- Send logs to stderr so stdout remains reserved for command output.
- Avoid logging document contents or full embeddings.
## Public Interface Changes
- Add global CLI flag `--verbose`.
- Normal output remains unchanged when verbose mode is not enabled.
## Test Plan
- Test parser behavior for `--verbose`.
- Test that debug logs are suppressed by default.
- Test that verbose mode emits representative diagnostic logs to stderr.
- Smoke test `add-data` and `query` with verbose mode enabled.
## Assumptions
- A single global verbosity level is enough for now.
- If structured logging is desired, it means consistent logger names and message fields, not a JSON logging format.
+32
View File
@@ -0,0 +1,32 @@
# 17. Add Exit Code Conventions
## Summary
Document and implement consistent exit codes so scripts can distinguish success from expected user errors.
## Implementation Steps
- Define named constants or an enum for exit codes.
- Use `0` for success.
- Use `1` for expected user-facing errors initially.
- Optionally reserve distinct documented codes for validation errors, missing collections, and file errors if the CLI needs them.
- Update error handling to return constants instead of literal integers.
- Document exit code behavior in README.
## Public Interface Changes
- Existing successful commands still exit `0`.
- Existing handled errors may continue to exit `1` unless distinct codes are explicitly adopted.
- README documents the convention.
## Test Plan
- Test success returns `0`.
- Test expected user errors return the documented code.
- Test unexpected exceptions still propagate to the runtime.
- Manually verify shell exit status for representative commands.
## Assumptions
- Start with `0` and `1` unless there is a clear automation need for more granular codes.
- This plan should follow the exception cleanup plan so errors are categorized at the app level.
+30
View File
@@ -0,0 +1,30 @@
# 18. Add Repository-Level Documentation for Architecture
## Summary
Document the intended internal architecture so future changes follow the same parser, handler, service, repository, and formatter boundaries.
## Implementation Steps
- Add a "Development Architecture" section to README or create `docs/architecture.md`.
- Describe the request flow: CLI parser -> command input -> handler -> service -> Chroma repository -> formatter.
- Explain where to add a new command.
- Explain where to add a new document loader.
- Explain where to add a new embedding provider.
- Include the default Chroma persistence behavior and, once the client-configuration refactor has landed, how configuration is injected.
## Public Interface Changes
- No runtime behavior changes.
- Developer-facing documentation is added or expanded.
## Test Plan
- Verify documentation matches the actual package/module layout after refactors.
- Run README examples to ensure commands are still accurate.
- Check links and command snippets manually.
## Assumptions
- Documentation should be updated after or alongside the architecture refactor so it describes the intended final shape.
- A README section is enough unless the architecture content grows too large.
+2 -11
View File
@@ -19,19 +19,10 @@ dependencies = [
]
[project.scripts]
chromy = "main:main"
chromy = "chromy.main:main"
[tool.setuptools]
packages = ["handlers"]
py-modules = [
"main",
"cli_app",
"cli_parser",
"chroma_functions",
"chunk_functions",
"embed",
"utilities",
]
packages = ["chromy", "chromy.handlers"]
[dependency-groups]
dev = ["nuitka[onefile]>=4.0.8"]
+5651
View File
File diff suppressed because it is too large Load Diff