refactor chunking and embedding into their own modules
build / build (push) Successful in 45s
pytest / pytest (push) Successful in 26s

This commit is contained in:
2026-05-01 11:01:30 +02:00
parent 26df98c08e
commit fb62d1b539
8 changed files with 18 additions and 8 deletions
+5
View File
@@ -0,0 +1,5 @@
from __future__ import annotations
from chromy.chunking.service import chunk_file, chunk_text
# Public names of this package, re-exported from chromy.chunking.service.
__all__ = ["chunk_file", "chunk_text"]
+19
View File
@@ -0,0 +1,19 @@
from __future__ import annotations
from pathlib import Path
from typing import cast
from semchunk import semchunk
def chunk_text(text: str, chunk_size: int = 800) -> list[str]:
    """Split *text* into chunks using a semchunk chunker.

    Args:
        text: The text to split.
        chunk_size: Size limit handed to ``semchunk.chunkerify``
            (presumably the maximum number of tokens per chunk --
            confirm against the semchunk documentation).

    Returns:
        The chunks produced by the chunker for *text*.
    """
    # A fresh chunker is built on every call, configured with the "gpt-4"
    # tokenizer name and the requested chunk size.
    split = semchunk.chunkerify("gpt-4", chunk_size)
    # cast() has no runtime effect; it only tells the type checker that the
    # chunker's result is a list of strings.
    return cast("list[str]", split(text))
def chunk_file(
    filename: str, chunk_size: int = 800, *, encoding: str = "utf-8"
) -> list[str]:
    """Read a text file and split its contents with ``chunk_text``.

    Args:
        filename: Path of the file to read.
        chunk_size: Forwarded unchanged to ``chunk_text``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the host's locale settings.

    Returns:
        The chunks ``chunk_text`` produces for the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file's bytes are not valid in *encoding*.
    """
    # Decode with an explicit encoding: without one, read_text() falls back
    # to the locale default, which differs between machines.
    contents = Path(filename).read_text(encoding=encoding)
    return chunk_text(contents, chunk_size)