2026-04-22 17:19:14 +02:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2026-04-24 18:40:51 +02:00
|
|
|
from pathlib import Path
|
2026-04-21 17:42:37 +02:00
|
|
|
|
2026-04-22 17:03:01 +02:00
|
|
|
from chromadb import QueryResult
|
|
|
|
|
|
2026-04-29 14:46:41 +02:00
|
|
|
from chromy.chroma_functions import add_data, delete_data, has_data_for_file, query_data
|
2026-05-01 11:01:30 +02:00
|
|
|
from chromy.chunking import chunk_file
|
|
|
|
|
from chromy.embedding import embed
|
2026-04-21 17:42:37 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def ingest_file(collection_name: str, file_path: str) -> int:
    """
    Chunk a file, embed the chunks, and store the result in a collection.

    If ``has_data_for_file`` reports that the collection already holds data
    for ``file_path``, that data is removed first via ``delete_data`` using
    the filter ``{"file_name": file_path}``, so the file is re-ingested
    rather than added a second time.

    Args:
        collection_name (str): The collection passed to the storage helpers.
        file_path (str): The file handed to ``chunk_file`` and used as the
            ``file_name`` filter value.

    Returns:
        int: The length of the value returned by ``embed``.
    """
    already_ingested = has_data_for_file(collection_name, file_path)
    if already_ingested:
        # Drop the previous entries for this file before adding fresh ones.
        delete_data(collection_name, {"file_name": file_path})

    vectors = embed(chunk_file(file_path))
    add_data(collection_name, vectors, file_path)

    return len(vectors)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_query(collection_name: str, query_text: str) -> QueryResult:
    """
    Run a single text query against a collection.

    Args:
        collection_name (str): The collection passed to ``query_data``.
        query_text (str): The query string; it is wrapped in a one-element
            list before being handed to ``query_data``.

    Returns:
        QueryResult: Whatever ``query_data`` returns for the query.
    """
    query_texts = [query_text]
    result = query_data(collection_name, query_texts)
    return result
|
|
|
|
|
|
|
|
|
|
|
2026-04-24 18:40:51 +02:00
|
|
|
def is_probably_text_file(path: str | Path, sample_size: int = 8192) -> bool:
    """
    Return whether a file appears to contain text.

    Only the first ``sample_size`` bytes are inspected. When the file is
    longer than the sample, the cut may fall in the middle of a multi-byte
    character; such an incomplete trailing sequence is tolerated instead of
    being treated as a decoding error.

    Args:
        path (str | Path): The path to the file to inspect.
        sample_size (int): The maximum number of bytes to read from the file.

    Returns:
        bool: ``True`` if the sampled bytes decode as UTF-8, UTF-8 with BOM,
        UTF-16, or UTF-32, or if no bytes were read (e.g. the file is
        empty). Otherwise, ``False``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import codecs  # function-scope import so the module header stays untouched

    with Path(path).open("rb") as f:
        sample = f.read(sample_size)
        # Peek one byte past the sample: only when more data follows can the
        # sample end in the middle of a character.
        truncated = bool(sample) and bool(f.read(1))

    if not sample:
        return True

    # Each candidate encoding is paired with its code-unit width in bytes.
    candidates = (
        ("utf-8", 1),
        ("utf-8-sig", 1),
        ("utf-16", 2),
        ("utf-32", 4),
    )

    for encoding, unit_width in candidates:
        try:
            if unit_width == 1:
                # With final=False the decoder buffers an incomplete trailing
                # sequence instead of raising; invalid bytes still raise.
                decoder = codecs.getincrementaldecoder(encoding)()
                decoder.decode(sample, final=not truncated)
            else:
                usable = len(sample)
                if truncated:
                    # Drop a partial trailing code unit caused by the cut.
                    # NOTE: a UTF-16 surrogate pair split exactly at the
                    # boundary is still rejected.
                    usable -= usable % unit_width
                if not usable:
                    # Nothing left to judge; do not accept on empty input.
                    continue
                sample[:usable].decode(encoding)
        except UnicodeDecodeError:
            continue
        return True

    return False
|