2026-04-22 17:03:01 +02:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2026-04-21 15:28:20 +02:00
|
|
|
from pathlib import Path
|
2026-04-22 17:03:01 +02:00
|
|
|
from typing import cast
|
2026-04-21 15:28:20 +02:00
|
|
|
|
2026-05-01 11:01:30 +02:00
|
|
|
from semchunk import semchunk
|
2026-04-21 15:28:20 +02:00
|
|
|
|
|
|
|
|
|
2026-04-22 17:03:01 +02:00
|
|
|
def chunk_text(text: str, chunk_size: int = 800) -> list[str]:
    """Split *text* into chunks using a semchunk chunker.

    Args:
        text: The text to split.
        chunk_size: Size limit handed to ``semchunk.chunkerify``
            (presumably counted in tokens of the "gpt-4" tokenizer --
            confirm against the semchunk documentation).

    Returns:
        The chunks produced by the chunker, typed as a list of strings.
    """
    # A new chunker is built on every call, bound to the "gpt-4" name.
    splitter = semchunk.chunkerify("gpt-4", chunk_size)
    pieces = splitter(text)
    # cast() is a no-op at runtime; it only narrows the type for checkers.
    return cast("list[str]", pieces)
|
2026-04-21 15:28:20 +02:00
|
|
|
|
|
|
|
|
|
2026-04-22 17:03:01 +02:00
|
|
|
def chunk_file(
    filename: str,
    chunk_size: int = 800,
    *,
    encoding: str = "utf-8",
) -> list[str]:
    """Read a text file and split its contents into chunks.

    Args:
        filename: Path of the file to read.
        chunk_size: Size limit forwarded unchanged to ``chunk_text``.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding.

    Returns:
        The list of chunks returned by ``chunk_text`` for the file contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid in *encoding*.
    """
    # An explicit encoding avoids locale-dependent decoding (e.g. cp1252 on
    # Windows), which would otherwise make the chunks differ between machines.
    contents = Path(filename).read_text(encoding=encoding)
    return chunk_text(contents, chunk_size)
|