refactor chunking and embedding into their own modules
This commit is contained in:
@@ -9,7 +9,7 @@ from chromadb.api import ClientAPI
|
|||||||
from chromadb.api.types import QueryResult, Where
|
from chromadb.api.types import QueryResult, Where
|
||||||
from chromadb.errors import NotFoundError
|
from chromadb.errors import NotFoundError
|
||||||
|
|
||||||
from chromy.embed import EmbeddingRecord
|
from chromy.embedding import EmbeddingRecord
|
||||||
|
|
||||||
|
|
||||||
def _get_client_and_collection(
|
def _get_client_and_collection(
|
||||||
|
|||||||
@@ -0,0 +1,5 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from chromy.chunking.service import chunk_file, chunk_text
|
||||||
|
|
||||||
|
__all__ = ["chunk_file", "chunk_text"]
|
||||||
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import cast
|
from typing import cast
|
||||||
|
|
||||||
import semchunk
|
from semchunk import semchunk
|
||||||
|
|
||||||
|
|
||||||
def chunk_text(text: str, chunk_size: int = 800) -> list[str]:
|
def chunk_text(text: str, chunk_size: int = 800) -> list[str]:
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from chromy.embedding.service import EmbeddingRecord, embed
|
||||||
|
|
||||||
|
__all__ = ["EmbeddingRecord", "embed"]
|
||||||
+2
-2
@@ -5,8 +5,8 @@ from pathlib import Path
|
|||||||
from chromadb import QueryResult
|
from chromadb import QueryResult
|
||||||
|
|
||||||
from chromy.chroma_functions import add_data, delete_data, has_data_for_file, query_data
|
from chromy.chroma_functions import add_data, delete_data, has_data_for_file, query_data
|
||||||
from chromy.chunk_functions import chunk_file
|
from chromy.chunking import chunk_file
|
||||||
from chromy.embed import embed
|
from chromy.embedding import embed
|
||||||
|
|
||||||
|
|
||||||
def ingest_file(collection_name: str, file_path: str) -> int:
|
def ingest_file(collection_name: str, file_path: str) -> int:
|
||||||
|
|||||||
+2
-2
@@ -24,7 +24,7 @@ dependencies = [
|
|||||||
chromy = "chromy.main:main"
|
chromy = "chromy.main:main"
|
||||||
|
|
||||||
[tool.setuptools]
|
[tool.setuptools]
|
||||||
packages = ["chromy", "chromy.handlers"]
|
packages = ["chromy", "chromy.chunking", "chromy.embedding", "chromy.handlers"]
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
dev = [
|
dev = [
|
||||||
@@ -72,7 +72,7 @@ module = [
|
|||||||
ignore_missing_imports = true
|
ignore_missing_imports = true
|
||||||
|
|
||||||
[[tool.mypy.overrides]]
|
[[tool.mypy.overrides]]
|
||||||
module = "chromy.chunk_functions"
|
module = "chromy.chunking.service"
|
||||||
disable_error_code = [
|
disable_error_code = [
|
||||||
"attr-defined",
|
"attr-defined",
|
||||||
]
|
]
|
||||||
|
|||||||
+2
-2
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|||||||
import unittest
|
import unittest
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from chromy.embed import embed
|
from chromy.embedding import embed
|
||||||
|
|
||||||
|
|
||||||
class EmbedTest(unittest.TestCase):
|
class EmbedTest(unittest.TestCase):
|
||||||
@@ -12,7 +12,7 @@ class EmbedTest(unittest.TestCase):
|
|||||||
|
|
||||||
def test_embed_pairs_text_with_list_embeddings(self) -> None:
|
def test_embed_pairs_text_with_list_embeddings(self) -> None:
|
||||||
with patch(
|
with patch(
|
||||||
"chromy.embed.DefaultEmbeddingFunction",
|
"chromy.embedding.service.DefaultEmbeddingFunction",
|
||||||
return_value=lambda chunks: ((1.0, 2.0), (3.0, 4.0)),
|
return_value=lambda chunks: ((1.0, 2.0), (3.0, 4.0)),
|
||||||
):
|
):
|
||||||
result = embed(["first", "second"])
|
result = embed(["first", "second"])
|
||||||
|
|||||||
Reference in New Issue
Block a user