Files

20 lines
457 B
Python
Raw Permalink Normal View History

2026-04-22 17:03:01 +02:00
from __future__ import annotations
2026-04-21 15:28:20 +02:00
from pathlib import Path
2026-04-22 17:03:01 +02:00
from typing import cast
2026-04-21 15:28:20 +02:00
from semchunk import semchunk
2026-04-21 15:28:20 +02:00
2026-04-22 17:03:01 +02:00
def chunk_text(text: str, chunk_size: int = 800) -> list[str]:
    """Split ``text`` into a list of string chunks via semchunk.

    A chunker is built with ``semchunk.chunkerify`` for the ``"gpt-4"``
    tokenizer name and then applied to ``text``.

    Args:
        text: The text to split.
        chunk_size: Forwarded to ``semchunk.chunkerify`` as its second
            argument (the chunk size; presumably measured in tokens of the
            "gpt-4" tokenizer -- confirm against the semchunk docs).

    Returns:
        The chunks produced by the chunker, typed as ``list[str]``.
    """
    split_into_chunks = semchunk.chunkerify("gpt-4", chunk_size)
    # ``cast`` only informs the type checker; it does nothing at runtime.
    return cast("list[str]", split_into_chunks(text))
2026-04-21 15:28:20 +02:00
2026-04-22 17:03:01 +02:00
def chunk_file(
    filename: str,
    chunk_size: int = 800,
    *,
    encoding: str = "utf-8",
) -> list[str]:
    """Read a text file and split its contents with ``chunk_text``.

    Args:
        filename: Path of the file to read.
        chunk_size: Passed through unchanged to ``chunk_text``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            instead of the platform locale encoding, so the same file yields
            the same chunks on every machine; pass another codec name for
            files stored in a different encoding.

    Returns:
        The list of chunks returned by ``chunk_text`` for the file contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    # Explicit encoding: a bare read_text() decodes with the locale default,
    # which differs between platforms and configurations.
    contents = Path(filename).read_text(encoding=encoding)
    return chunk_text(contents, chunk_size)