from __future__ import annotations from collections.abc import Mapping, Sequence from pathlib import Path from chromadb import QueryResult from rich.console import Console from rich.rule import Rule from rich.text import Text from chromy.chroma_functions import add_data, query_data from chromy.chunk_functions import chunk_file from chromy.embed import embed CONSOLE = Console() def print_lines(lines: Sequence[Rule | Text]) -> None: for line in lines: CONSOLE.print(line) def ingest_file(collection_name: str, file_path: str) -> int: chunks = chunk_file(file_path) embeddings = embed(chunks) add_data(collection_name, embeddings, file_path) return len(embeddings) def run_query(collection_name: str, query_text: str) -> QueryResult: return query_data(collection_name, [query_text]) def format_query_result(result: QueryResult) -> list[Rule | Text]: ids = result.get("ids", [[]]) documents = result.get("documents", [[]]) distances = result.get("distances", [[]]) metadatas = result.get("metadatas", [[]]) first_ids = ids[0] if ids else [] first_documents = documents[0] if documents else [] first_distances = distances[0] if distances else [] first_metadatas = metadatas[0] if metadatas else [] if not first_ids: return [Text.from_markup("[yellow]No results found.[/]")] lines: list[Rule | Text] = [Rule(title="Query results")] for index, document_id in enumerate(first_ids, start=1): lines.append( Text.from_markup(f"[bold]{index}[/].\t[green]id[/]\t\t{document_id}") ) i = index - 1 if i < len(first_distances): lines.append( Text.from_markup(f"\t[green]distance[/]\t{first_distances[i]}") ) if i < len(first_metadatas): metadata = first_metadatas[i] if isinstance(metadata, Mapping): file_name = metadata.get("file_name") if file_name: lines.append( Text.from_markup(f"\t[green]file_name[/]\t{file_name}") ) if i < len(first_documents): lines.append(Text.from_markup("\n[bold green]Retrieved contents[/]\n")) lines.append(Text(first_documents[i])) # Print a separator between documents lines.append(Rule()) return lines def is_probably_text_file(path: str | Path, sample_size: int = 8192) -> bool: """ Return whether a file appears to contain text. Args: path (str | Path): The path to the file to inspect. sample_size (int): The maximum number of bytes to read from the file. Returns: bool: ``True`` if the sampled bytes decode as UTF-8, UTF-8 with BOM, UTF-16, or UTF-32, or if the file is empty. Otherwise, ``False``. """ path = Path(path) with path.open("rb") as f: sample = f.read(sample_size) if not sample: return True encodings = ( "utf-8", "utf-8-sig", "utf-16", "utf-32", ) for encoding in encodings: try: sample.decode(encoding) return True except UnicodeDecodeError: pass return False