2026-04-22 17:03:01 +02:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2026-04-22 17:19:14 +02:00
|
|
|
from collections.abc import Sequence
|
2026-04-22 17:03:01 +02:00
|
|
|
from typing import TypedDict
|
2026-04-21 15:06:04 +02:00
|
|
|
|
|
|
|
|
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EmbeddingRecord(TypedDict):
    """A text chunk paired with the embedding vector computed for it."""

    # The chunk text that was embedded.
    text: str
    # The embedding vector, as a plain list of floats.
    embedding: list[float]
|
2026-04-21 15:06:04 +02:00
|
|
|
|
|
|
|
|
|
2026-04-22 17:19:14 +02:00
|
|
|
def embed(chunks: Sequence[str]) -> list[EmbeddingRecord]:
    """Embed text chunks with chromadb's ``DefaultEmbeddingFunction``.

    Args:
        chunks: The text chunks to embed.

    Returns:
        One record per chunk, in input order, each holding the chunk's
        ``text`` and its ``embedding`` as a plain list. Empty input yields
        an empty list without constructing the embedding function.

    Raises:
        ValueError: If the embedding function returns a different number of
            embeddings than there are chunks.
    """
    if not chunks:
        return []

    # Materialize once so the exact same items are embedded and paired below.
    texts = list(chunks)
    vectors = DefaultEmbeddingFunction()(texts)

    # strict=True surfaces a chunk/embedding count mismatch instead of
    # silently dropping the unmatched tail.
    return [
        {
            "text": text,
            # Normalize to a plain list: use ``tolist()`` when the vector
            # offers it (array-like), otherwise copy it as a sequence.
            "embedding": (
                vector.tolist() if hasattr(vector, "tolist") else list(vector)
            ),
        }
        for text, vector in zip(texts, vectors, strict=True)
    ]
|