Files
mrosati fb62d1b539
build / build (push) Successful in 45s
pytest / pytest (push) Successful in 26s
refactor chunking and embedding into their own modules
2026-05-01 11:01:30 +02:00

30 lines
718 B
Python

from __future__ import annotations
from collections.abc import Sequence
from typing import TypedDict
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
class EmbeddingRecord(TypedDict):
    """A text chunk paired with its embedding vector."""

    # The chunk of text that was embedded.
    text: str
    # The embedding vector for `text`, stored as a plain list of floats.
    embedding: list[float]
def embed(chunks: Sequence[str]) -> list[EmbeddingRecord]:
    """Embed each text chunk and pair it with its vector.

    Args:
        chunks: Text chunks to embed, in the order they should be returned.

    Returns:
        One record per chunk, in input order, holding the chunk under
        ``"text"`` and its vector as a plain ``list`` under ``"embedding"``.
        An empty input yields an empty list without creating the embedding
        function.

    Raises:
        ValueError: If the embedding function returns a different number of
            vectors than there are chunks.
    """
    if not chunks:
        return []

    # Materialise once so the exact same items are embedded and paired below.
    texts = list(chunks)
    embedding_function = DefaultEmbeddingFunction()
    embeddings = embedding_function(texts)

    # strict=True: a count mismatch would otherwise silently drop records or
    # misalign text and vectors; fail loudly instead.
    return [
        {
            "text": text,
            # Vectors exposing tolist() (e.g. array types) are converted with
            # it; anything else is copied into a plain list.
            "embedding": (
                embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
            ),
        }
        for text, embedding in zip(texts, embeddings, strict=True)
    ]