refactor chunking and embedding into their own modules
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from chromy.embedding.service import EmbeddingRecord, embed
|
||||
|
||||
# Public names of this module, re-exported from chromy.embedding.service.
__all__ = ["EmbeddingRecord", "embed"]
|
||||
@@ -0,0 +1,29 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Sequence
|
||||
from typing import TypedDict
|
||||
|
||||
from chromadb.utils.embedding_functions import DefaultEmbeddingFunction
|
||||
|
||||
|
||||
class EmbeddingRecord(TypedDict):
    """Dictionary shape pairing a piece of text with its embedding vector."""

    # The text the embedding was computed for.
    text: str
    # The embedding vector, stored as a plain list of floats.
    embedding: list[float]
|
||||
|
||||
|
||||
def embed(chunks: Sequence[str]) -> list[EmbeddingRecord]:
    """Embed each text chunk and pair it with its resulting vector.

    Args:
        chunks: Text chunks to embed, in order.

    Returns:
        One record per chunk, in input order, each holding the chunk under
        ``"text"`` and its vector under ``"embedding"``. An empty input
        returns an empty list without constructing the embedding function.

    Raises:
        ValueError: If the embedding function yields a different number of
            vectors than there were chunks.
    """
    if not chunks:
        return []

    # Materialise once so the embedding call and the pairing below see the
    # exact same items in the same order.
    texts = list(chunks)
    embedding_function = DefaultEmbeddingFunction()
    embeddings = embedding_function(texts)

    # strict=True: a count mismatch would otherwise silently drop chunks or
    # vectors instead of surfacing the problem.
    return [
        {
            "text": text,
            # Vectors exposing tolist() (presumably NumPy arrays - confirm)
            # are converted through it; anything else is copied into a list.
            "embedding": (
                embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
            ),
        }
        for text, embedding in zip(texts, embeddings, strict=True)
    ]
|
||||
Reference in New Issue
Block a user