move top-level modules into a real package

2026-04-22 15:47:46 +02:00
parent e33160282c
commit 8ebab832d5
35 changed files with 6192 additions and 31 deletions
@@ -82,7 +82,7 @@ chromy --help
 You can also run it from the source tree without installing the tool:

 ```bash
-uv run python main.py --help
+uv run python -m chromy.main --help
 ```

 ## Commands
@@ -0,0 +1 @@
+"""Chromy package."""
@@ -6,7 +6,7 @@ from chromadb.api import ClientAPI
 from chromadb.api.types import QueryResult
 from chromadb.errors import NotFoundError

-from embed import EmbeddingRecord
+from chromy.embed import EmbeddingRecord


 def _get_client_and_collection(
@@ -6,15 +6,15 @@ from dataclasses import dataclass

 from chromadb.errors import InternalError, NotFoundError

-from handlers.add_data import handle_add_data
-from handlers.count_collection import handle_count_collection
-from handlers.create_collection import handle_create_collection
-from handlers.delete_collection import (
+from chromy.handlers.add_data import handle_add_data
+from chromy.handlers.count_collection import handle_count_collection
+from chromy.handlers.create_collection import handle_create_collection
+from chromy.handlers.delete_collection import (
    handle_delete_collection,
    handle_delete_records,
 )
-from handlers.list_collections import handle_list_collections
-from handlers.query import handle_query
+from chromy.handlers.list_collections import handle_list_collections
+from chromy.handlers.query import handle_query


 CommandHandler = Callable[[Namespace], int]
@@ -1,6 +1,6 @@
 from argparse import Namespace

-from utilities import ingest_file
+from chromy.utilities import ingest_file


 def handle_add_data(args: Namespace) -> int:
@@ -1,6 +1,6 @@
 from argparse import Namespace

-from chroma_functions import count_collection
+from chromy.chroma_functions import count_collection


 def handle_count_collection(args: Namespace) -> int:
@@ -1,6 +1,6 @@
 from argparse import Namespace

-from chroma_functions import create_collection
+from chromy.chroma_functions import create_collection


 def handle_create_collection(args: Namespace) -> int:
@@ -1,6 +1,6 @@
 from argparse import Namespace

-from chroma_functions import delete_collection, delete_data
+from chromy.chroma_functions import delete_collection, delete_data


 def _parse_where_clause(where_clause: str) -> dict[str, str]:
@@ -1,7 +1,7 @@
 from argparse import Namespace

-from chroma_functions import list_collections
-from utilities import print_lines
+from chromy.chroma_functions import list_collections
+from chromy.utilities import print_lines


 def handle_list_collections(_: Namespace) -> int:
@@ -1,6 +1,6 @@
 from argparse import Namespace

-from utilities import format_query_result, print_lines, run_query
+from chromy.utilities import format_query_result, print_lines, run_query


 def handle_query(args: Namespace) -> int:
@@ -2,8 +2,8 @@ from __future__ import annotations

 from dotenv import load_dotenv

-from cli_app import execute_command
-from cli_parser import build_parser
+from chromy.cli_app import execute_command
+from chromy.cli_parser import build_parser


 def main() -> int:
@@ -1,9 +1,9 @@
 from chromadb import QueryResult
 from collections.abc import Mapping

-from chroma_functions import add_data, query_data
-from chunk_functions import chunk_file
-from embed import embed
+from chromy.chroma_functions import add_data, query_data
+from chromy.chunk_functions import chunk_file
+from chromy.embed import embed


 def print_lines(lines: list[str]) -> None:
@@ -0,0 +1,37 @@
+# 1. Move Top-Level Modules Into a Real Package [DONE]
+
+## Summary
+
+Move the current flat module layout into a proper `chromy/` package so imports, packaging, and future subpackages are easier to manage.
+
+## Implementation Steps
+
+- Create a `chromy/` package with `__init__.py`.
+- Move `main.py`, `cli_app.py`, `cli_parser.py`, `chroma_functions.py`, `chunk_functions.py`, `embed.py`, and `utilities.py` into `chromy/`.
+- Move `handlers/` into `chromy/handlers/`.
+- Update imports to absolute package imports such as `from chromy.cli_app import execute_command` and `from chromy.handlers.add_data import handle_add_data`.
+- Update `[project.scripts]` in `pyproject.toml` from `main:main` to `chromy.main:main`.
+- Update setuptools configuration to package `chromy` and `chromy.handlers` instead of using top-level `py-modules`.
+- Update README development commands from `uv run python main.py --help` to `uv run python -m chromy.main --help`.
+
+## Public Interface Changes
+
+- The installed CLI command remains `chromy`.
+- Programmatic imports move from top-level modules to `chromy.*`.
+- Running from source should use `python -m chromy.main`.
+
+## Test Plan
+
+- Run `uv run python -m chromy.main --help`.
+- Run `uv run python -m chromy.main list-collections`.
+- Build the package with `uv build`.
+- Install locally in editable mode and confirm `chromy --help` resolves the packaged entrypoint.
+- Test all commands to verify they still work:
+  - creating, listing, and deleting collections
+  - adding documents to, and deleting documents from, a collection (use [romeo_and_juliet.txt](romeo_and_juliet.txt))
+  - querying
+
+## Assumptions
+
+- Backward-compatible top-level imports are not required.
+- The package refactor should preserve behavior before deeper service or architecture changes are made.
@@ -0,0 +1,31 @@
+# 2. Replace `argparse.Namespace` Plumbing With Typed Command Inputs
+
+## Summary
+
+Stop passing mutable `argparse.Namespace` objects into handlers. Convert parsed CLI arguments into typed command dataclasses before dispatch.
+
+## Implementation Steps
+
+- Add frozen dataclasses for command inputs, such as list collections, create collection, delete collection, count, add data, query, and delete records.
+- Keep `argparse` isolated in the CLI adapter layer.
+- Convert `Namespace` into the correct command dataclass immediately before dispatch.
+- Change handler signatures from `Callable[[Namespace], int]` to typed command-specific callables.
+- Remove mutations such as `args.error_message = ...` in `cli_app.py` and `handlers/delete_collection.py`.
+- Return or raise explicit structured results/errors rather than writing temporary state back into the parsed args object.
+
+## Public Interface Changes
+
+- CLI command syntax stays the same.
+- Internal handler APIs change to typed dataclass inputs.
+- Error message builders should receive typed command inputs or exceptions instead of raw `Namespace`.
+
+## Test Plan
+
+- Add parser-to-command conversion tests for every command and alias.
+- Add handler unit tests that construct command dataclasses directly.
+- Verify invalid delete filters still produce the same user-facing error.
+
+## Assumptions
+
+- Command dataclasses should live near CLI application code until a broader package refactor introduces clearer subpackages.
+- The first pass should preserve the existing command names, arguments, aliases, and output.
@@ -0,0 +1,31 @@
+# 3. Split CLI, Service, Repository, and Formatting Responsibilities
+
+## Summary
+
+Separate command handling, business workflows, Chroma persistence, and output formatting so each layer has a single responsibility.
+
+## Implementation Steps
+
+- Keep handlers thin: receive typed command input, call a service, and print formatted output.
+- Move ingestion orchestration from `utilities.ingest_file` into an `IngestionService`.
+- Move query orchestration from `utilities.run_query` into a `QueryService`.
+- Move Chroma collection operations from `chroma_functions.py` into a `ChromaRepository` or `ChromaStore`.
+- Move output-only functions such as `format_query_result` and `print_lines` into a formatting module.
+- Keep Chroma-specific result parsing out of CLI handlers.
+
+## Public Interface Changes
+
+- CLI behavior remains unchanged.
+- Internal APIs become service and repository methods instead of free functions.
+- Formatting functions should accept internal domain objects rather than raw Chroma response dictionaries where possible.
+
+## Test Plan
+
+- Unit test services with mocked repository, chunker, and embedder dependencies.
+- Unit test formatter output without requiring Chroma.
+- Run a small manual CLI smoke test for create, add, query, count, list, and delete.
+
+## Assumptions
+
+- This should be done after typed command inputs and package layout changes, or coordinated carefully with them.
+- No JSON output mode or web API is added in this refactor; the goal is to make those future changes easier.
@@ -0,0 +1,30 @@
+# 4. Centralize Chroma Client Configuration
+
+## Summary
+
+Create Chroma client configuration once per command and inject it into repository or service objects instead of repeatedly calling `chromadb.PersistentClient()`.
+
+## Implementation Steps
+
+- Add a small configuration object for Chroma settings, including persistence path.
+- Load the persistence path from a CLI option or an environment variable, falling back to Chroma's default behavior when neither is provided.
+- Create a Chroma client factory that returns one client per command execution.
+- Inject the client into the Chroma repository instead of constructing it inside each function.
+- Remove repeated `chromadb.PersistentClient()` calls from collection operations.
+- Ensure tests can pass an in-memory or temporary Chroma client.
+
+## Public Interface Changes
+
+- Add a documented way to configure the Chroma persistence path.
+- Existing commands should continue to work with the current implicit default when no path is provided.
+
+## Test Plan
+
+- Unit test repository operations with an injected test client.
+- Verify two commands using the same configured persistence directory can see the same collections.
+- Verify default behavior still works when no path is configured.
+
+## Assumptions
+
+- The initial implementation should preserve Chroma's default persistence behavior unless a path is explicitly configured.
+- Configuration should be introduced before larger repository and service tests depend on isolated storage.
@@ -0,0 +1,30 @@
+# 6. Add Ruff and Mypy Configuration
+
+## Summary
+
+Add linting, formatting, and static type checking configuration to `pyproject.toml`. Do not add pytest configuration for this improvement.
+
+## Implementation Steps
+
+- Add `ruff` and `mypy` to the development dependency group.
+- Configure Ruff formatting and lint rules in `pyproject.toml`.
+- Configure mypy in strict or near-strict mode suitable for the current Chroma and third-party boundaries.
+- Add README development commands for `uv run ruff check .`, `uv run ruff format .`, and `uv run mypy .`.
+- Fix only minimal configuration blockers in this plan; broad type modernization belongs to the type-hints plan.
+
+## Public Interface Changes
+
+- No runtime CLI behavior changes.
+- Development workflow gains standard lint, format, and type-check commands.
+
+## Test Plan
+
+- Run `uv run ruff check .`.
+- Run `uv run ruff format --check .`.
+- Run `uv run mypy .`.
+- Confirm no pytest configuration is added as part of this plan.
+
+## Assumptions
+
+- Pytest is intentionally out of scope for this improvement.
+- Strict mypy may need targeted ignores for third-party packages that do not expose complete typing.
@@ -0,0 +1,29 @@
+# 7. Modernize Type Hints and Add Missing Future Imports
+
+## Summary
+
+Make type annotations consistent across the codebase by using modern Python 3.12 typing syntax and adding future annotations imports where useful.
+
+## Implementation Steps
+
+- Add `from __future__ import annotations` consistently to Python modules that do not already have it.
+- Replace `typing.List` with built-in generic syntax such as `list[str]` and `list[EmbeddingRecord]`.
+- Use `collections.abc` input interfaces such as `Sequence[str]` where mutation is not required.
+- Introduce type aliases or dataclasses for internal Chroma result shapes only where they reduce ambiguity.
+- Keep runtime behavior unchanged.
+
+## Public Interface Changes
+
+- No CLI behavior changes.
+- Public Python annotations become more precise and consistent.
+
+## Test Plan
+
+- Run `uv run ruff check .`.
+- Run `uv run mypy .` after mypy configuration exists.
+- Manually smoke test the CLI commands whose code paths cross the type boundaries touched by these changes.
+
+## Assumptions
+
+- The project remains Python 3.12+, so built-in generic syntax is acceptable everywhere.
+- Larger domain model changes should be handled in the service/repository and query-formatting plans.
@@ -0,0 +1,29 @@
+# 8. Avoid Catching `BaseException` in CLI Dispatch
+
+## Summary
+
+Change CLI dispatch error handling so process-control exceptions such as `KeyboardInterrupt` and `SystemExit` are not swallowed.
+
+## Implementation Steps
+
+- Change `execute_command` to catch `Exception` instead of `BaseException`.
+- Keep mapped, expected errors handled through the existing command error mapping or its replacement.
+- Print handled user-facing errors to `stderr` instead of `stdout`.
+- Allow unmapped exceptions, `KeyboardInterrupt`, and `SystemExit` to propagate normally.
+- Consider adding debug logging for unexpected exceptions after the logging plan exists.
+
+## Public Interface Changes
+
+- Expected command errors still return a non-zero exit code.
+- Handled error messages move from stdout to stderr.
+- Interrupt and process-exit behavior becomes conventional.
+
+## Test Plan
+
+- Test that a mapped exception returns `1` and writes to stderr.
+- Test that `KeyboardInterrupt` is not caught by `execute_command`.
+- Test that unmapped exceptions still propagate.
+
+## Assumptions
+
+- Returning `1` for handled user errors remains acceptable until the exit-code conventions plan is implemented.
@@ -0,0 +1,29 @@
+# 9. Use Domain-Specific Exceptions Instead of Chroma Exceptions in CLI Mapping
+
+## Summary
+
+Hide Chroma-specific exceptions behind application-level exceptions so the CLI does not depend on Chroma's exception model.
+
+## Implementation Steps
+
+- Define app-level exceptions such as `CollectionNotFoundError`, `CollectionAlreadyExistsError`, and `StorageOperationError`.
+- Convert Chroma exceptions inside the repository layer.
+- Update CLI command error mappings to handle app-level exceptions only.
+- Preserve existing user-facing messages for missing and duplicate collections.
+- Avoid importing `chromadb.errors` in CLI modules after the repository layer owns that boundary.
+
+## Public Interface Changes
+
+- CLI behavior and messages should remain the same.
+- Internal error contracts change from Chroma exceptions to app-level exceptions.
+
+## Test Plan
+
+- Unit test repository exception translation.
+- Unit test CLI mappings for app-level exceptions.
+- Smoke test missing collection, duplicate collection, and successful operations.
+
+## Assumptions
+
+- Chroma remains the only storage backend for now.
+- The exception layer is still useful because it prevents storage details from leaking upward.
@@ -0,0 +1,30 @@
+# 10. Make Ingestion More Configurable
+
+## Summary
+
+Move chunking and embedding choices into configuration and expose chunk size as an `add-data` CLI option.
+
+## Implementation Steps
+
+- Add ingestion configuration for chunk size, tokenizer/model name, and embedding function provider.
+- Change chunking code to receive chunk size and tokenizer/model name instead of hard-coding `800` and `"gpt-4"`.
+- Reuse the embedding function through dependency injection instead of constructing it for every embed call.
+- Add `--chunk-size` to `add-data`, defaulting to the current value of `800`.
+- Keep the default tokenizer/model behavior equivalent to the current `"gpt-4"` setting.
+
+## Public Interface Changes
+
+- `add-data` gains `--chunk-size`.
+- Default ingestion behavior remains unchanged when no option is provided.
+
+## Test Plan
+
+- Test chunking with default and custom chunk sizes.
+- Test `add-data --chunk-size` parser behavior.
+- Test ingestion service with an injected fake embedder.
+- Smoke test adding a file with and without `--chunk-size`.
+
+## Assumptions
+
+- Only chunk size is exposed in the CLI initially.
+- Tokenizer/model and embedding provider configuration can remain internal or environment-backed until there is a concrete user-facing need.
@@ -0,0 +1,30 @@
+# 11. Improve File Handling
+
+## Summary
+
+Make file ingestion boundaries clearer by using `Path`, explicit UTF-8 decoding, and validation before reading.
+
+## Implementation Steps
+
+- Change internal file ingestion APIs to accept `Path` instead of raw `str`.
+- Convert CLI string paths to `Path` in the command adapter or handler.
+- Validate that the path exists and is a regular file before reading.
+- Read text with `encoding="utf-8"`.
+- Raise a clear app-level file error for missing paths, directories, and decoding failures.
+- Leave PDF and future file loaders out of scope for now.
+
+## Public Interface Changes
+
+- CLI argument remains a file path string.
+- Error messages for missing or invalid files become clearer.
+
+## Test Plan
+
+- Test successful text-file loading.
+- Test missing file, directory path, and invalid UTF-8 handling.
+- Smoke test `add-data` with a valid UTF-8 file.
+
+## Assumptions
+
+- Only plain text ingestion is supported in this plan.
+- Existing metadata can continue storing the original path string as `file_name` unless a later plan changes metadata shape.
@@ -0,0 +1,31 @@
+# 12. Review Dependencies and Remove Unused Ones
+
+## Summary
+
+Audit runtime dependencies and move unused or optional packages out of the base install where appropriate.
+
+## Implementation Steps
+
+- Compare imports in source code against dependencies declared in `pyproject.toml`.
+- Keep packages that are directly imported by current runtime code.
+- Move packages used only for optional or future features behind extras, such as `pdf` or `openai`.
+- Remove dependencies that are neither imported nor needed transitively by active code.
+- Update `uv.lock` after dependency changes.
+- Update README installation notes if extras are introduced.
+
+## Public Interface Changes
+
+- Base installation may become smaller.
+- Optional feature dependencies should be installed through extras if introduced.
+
+## Test Plan
+
+- Run `uv sync` after dependency edits.
+- Run `uv run python -m chromy.main --help`.
+- Smoke test commands that use Chroma, semchunk, dotenv loading, and embedding.
+- Build the package with `uv build`.
+
+## Assumptions
+
+- Dependency removal should be conservative and based on actual import usage.
+- No new optional features are implemented as part of this cleanup.
@@ -0,0 +1,30 @@
+# 13. Make Query Result Formatting More Robust
+
+## Summary
+
+Convert raw Chroma query results into typed internal matches before formatting them for terminal output.
+
+## Implementation Steps
+
+- Add a `QueryMatch` dataclass with fields for id, document, distance, and metadata.
+- Add a parser that converts Chroma `QueryResult` data into `list[QueryMatch]`.
+- Handle empty results, missing documents, missing metadata, missing distances, and unexpected metadata shapes defensively.
+- Change terminal formatting to accept `list[QueryMatch]`.
+- Keep current text output as stable as practical.
+
+## Public Interface Changes
+
+- CLI output should remain effectively the same for normal query results.
+- Internal formatter APIs change from raw Chroma result dictionaries to typed match objects.
+
+## Test Plan
+
+- Test empty query results.
+- Test populated results with ids, documents, distances, and metadata.
+- Test missing documents, missing metadata, multiple query result groups, and non-mapping metadata values.
+- Smoke test `query` against a real collection.
+
+## Assumptions
+
+- The first implementation can format the first query group only, matching current behavior.
+- Support for alternate output formats is not added in this plan.
@@ -0,0 +1,30 @@
+# 14. Clarify Delete Filter Semantics
+
+## Summary
+
+Make it clear that record deletion supports a simple metadata equality filter in `key=value` form.
+
+## Implementation Steps
+
+- Rename internal parser variables from `condition` to `key` where equality-only semantics are intended.
+- Update CLI help for `--where` to say `Metadata equality filter in the format <key>=<value>`.
+- Update error messages to use `<key>=<value>`.
+- Update README command documentation to include the `delete` command and its filter syntax.
+- Reserve richer Chroma filters, such as JSON filters, for a later feature.
+
+## Public Interface Changes
+
+- CLI option remains `--where`.
+- Help text and error messages become more precise.
+- No richer filter syntax is added yet.
+
+## Test Plan
+
+- Test valid `key=value` parsing with whitespace trimming.
+- Test missing separator, empty key, and empty value errors.
+- Verify `chromy delete --help` documents equality semantics.
+
+## Assumptions
+
+- Keeping the option name `--where` is acceptable for backward compatibility.
+- Renaming `--where` to `--key` is not worth the CLI break for the current feature set.
@@ -0,0 +1,29 @@
+# 15. Improve Command Registration So Parser and Dispatcher Cannot Drift
+
+## Summary
+
+Unify command parser metadata, handlers, aliases, and error mappings into one registry or add a startup check that prevents parser and dispatcher drift.
+
+## Implementation Steps
+
+- Create one command registry that includes command name, aliases, help text, arguments, handler, and expected error mappings.
+- Build argparse subcommands from the registry.
+- Dispatch commands through the same registry.
+- Remove duplicated command declarations from separate parser and app structures.
+- Add a small validation check that command names are unique and aliases do not collide.
+
+## Public Interface Changes
+
+- CLI command names, aliases, arguments, and help text should remain the same.
+- Internal command registration becomes centralized.
+
+## Test Plan
+
+- Test that every registry command appears in parser help.
+- Test every alias dispatches to the canonical command.
+- Test duplicate command or alias validation fails fast.
+- Smoke test all existing commands through parser and dispatcher.
+
+## Assumptions
+
+- A unified registry is preferred over only adding a drift-detection test because the current repo already has structured command metadata.
@@ -0,0 +1,30 @@
+# 16. Add Logging for Debuggability
+
+## Summary
+
+Add optional diagnostic logging for ingestion, query, and Chroma operations while keeping normal CLI output clean.
+
+## Implementation Steps
+
+- Configure Python's `logging` module in the CLI entrypoint.
+- Add a global `--verbose` flag to enable debug-level logs.
+- Log file loading, chunk counts, embedding calls, Chroma writes, collection operations, and query timing.
+- Send logs to stderr so stdout remains reserved for command output.
+- Avoid logging document contents or full embeddings.
+
+## Public Interface Changes
+
+- Add global CLI flag `--verbose`.
+- Normal output remains unchanged when verbose mode is not enabled.
+
+## Test Plan
+
+- Test parser behavior for `--verbose`.
+- Test that debug logs are suppressed by default.
+- Test that verbose mode emits representative diagnostic logs to stderr.
+- Smoke test `add-data` and `query` with verbose mode enabled.
+
+## Assumptions
+
+- A single global verbosity level is enough for now.
+- Logs only need to be structured in the sense of consistent logger names and message fields; a JSON logging format is not required.
@@ -0,0 +1,32 @@
+# 17. Add Exit Code Conventions
+
+## Summary
+
+Document and implement consistent exit codes so scripts can distinguish success from expected user errors.
+
+## Implementation Steps
+
+- Define named constants or an enum for exit codes.
+- Use `0` for success.
+- Use `1` for expected user-facing errors initially.
+- Optionally reserve distinct documented codes for validation errors, missing collections, and file errors if the CLI needs them.
+- Update error handling to return constants instead of literal integers.
+- Document exit code behavior in README.
+
+## Public Interface Changes
+
+- Existing successful commands still exit `0`.
+- Existing handled errors may continue to exit `1` unless distinct codes are explicitly adopted.
+- README documents the convention.
+
+## Test Plan
+
+- Test success returns `0`.
+- Test expected user errors return the documented code.
+- Test unexpected exceptions still propagate to the runtime.
+- Manually verify shell exit status for representative commands.
+
+## Assumptions
+
+- Start with `0` and `1` unless there is a clear automation need for more granular codes.
+- This plan should follow the exception cleanup plan so errors are categorized at the app level.
@@ -0,0 +1,30 @@
+# 18. Add Repository-Level Documentation for Architecture
+
+## Summary
+
+Document the intended internal architecture so future changes follow the same parser, handler, service, repository, and formatter boundaries.
+
+## Implementation Steps
+
+- Add a "Development Architecture" section to README or create `docs/architecture.md`.
+- Describe the request flow: CLI parser -> command input -> handler -> service -> Chroma repository -> formatter.
+- Explain where to add a new command.
+- Explain where to add a new document loader.
+- Explain where to add a new embedding provider.
+- Include the default Chroma persistence behavior and, once the Chroma client configuration refactor has landed, how that configuration is injected.
+
+## Public Interface Changes
+
+- No runtime behavior changes.
+- Developer-facing documentation is added or expanded.
+
+## Test Plan
+
+- Verify documentation matches the actual package/module layout after refactors.
+- Run README examples to ensure commands are still accurate.
+- Check links and command snippets manually.
+
+## Assumptions
+
+- Documentation should be updated after or alongside the architecture refactor so it describes the intended final shape.
+- A README section is enough unless the architecture content grows too large.
@@ -19,19 +19,10 @@ dependencies = [
 ]

 [project.scripts]
-chromy = "main:main"
+chromy = "chromy.main:main"

 [tool.setuptools]
-packages = ["handlers"]
-py-modules = [
-    "main",
-    "cli_app",
-    "cli_parser",
-    "chroma_functions",
-    "chunk_functions",
-    "embed",
-    "utilities",
-]
+packages = ["chromy", "chromy.handlers"]

 [dependency-groups]
 dev = ["nuitka[onefile]>=4.0.8"]