def handle_import(collection: str, file: str) -> int:
    """
    Import the contents of a text file into a collection.

    Args:
        collection (str): The name of the collection that receives the records.
        file (str): The path of the file to import.

    Returns:
        int: ``0`` once the records have been added.

    Raises:
        InvalidFileException: If the file does not appear to be a text file.
    """

    # Resolve the path once and reuse it, so the text check and the ingestion
    # are guaranteed to operate on the same resolved path.
    absolute_path = _get_absolute_path(file)

    if not is_probably_text_file(absolute_path):
        # NOTE(review): this exception type comes from ``plistlib``; it is kept
        # because the CLI layer catches exactly this type. A project-specific
        # exception would describe the failure better.
        raise InvalidFileException()

    records_added = ingest_file(collection, absolute_path)
    print(f"[bold green]Added[/] {records_added} records to collection '{collection}'.")
    return 0
b/chromy/utilities.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Mapping, Sequence +from pathlib import Path from chromadb import QueryResult from rich.console import Console @@ -76,3 +77,41 @@ def format_query_result(result: QueryResult) -> list[Rule | Text]: lines.append(Rule()) return lines + + +def is_probably_text_file(path: str | Path, sample_size: int = 8192) -> bool: + """ + Return whether a file appears to contain text. + + Args: + path (str | Path): The path to the file to inspect. + sample_size (int): The maximum number of bytes to read from the file. + + Returns: + bool: ``True`` if the sampled bytes decode as UTF-8, UTF-8 with BOM, + UTF-16, or UTF-32, or if the file is empty. Otherwise, ``False``. + """ + + path = Path(path) + + with path.open("rb") as f: + sample = f.read(sample_size) + + if not sample: + return True + + encodings = ( + "utf-8", + "utf-8-sig", + "utf-16", + "utf-32", + ) + + for encoding in encodings: + try: + sample.decode(encoding) + return True + except UnicodeDecodeError: + pass + + return False