Skip to content

Commit

Permalink
fix: passing metadata when semantic splitter used as well (#104)
Browse files: browse the repository at this point in the history
  • Loading branch information
elisalimli authored Apr 30, 2024
1 parent 8116e7b commit 6e97448
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 5 deletions.
2 changes: 1 addition & 1 deletion service/embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ async def generate_chunks(
min_split_tokens=config.splitter.min_tokens,
max_split_tokens=config.splitter.max_tokens,
)
chunks = await splitter_config(elements=elements)
chunks = await splitter_config(elements=elements, file=file)

if not chunks:
continue
Expand Down
20 changes: 16 additions & 4 deletions service/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from utils.logger import logger
from utils.table_parser import TableParser
from models.file import File


# TODO: Move to document processing utils, once we have
Expand Down Expand Up @@ -125,7 +126,11 @@ def _group_elements_by_title(self, elements: list[dict[str, Any]]) -> dict:
return grouped_elements

async def split_grouped_elements(
self, elements: list[dict[str, Any]], splitter: RollingWindowSplitter
self,
*,
elements: list[dict[str, Any]],
file: File,
splitter: RollingWindowSplitter,
) -> list[dict[str, Any]]:
grouped_elements = self._group_elements_by_title(elements)
chunks_with_title = []
Expand All @@ -138,7 +143,10 @@ def _append_chunks(
"title": title,
"content": content,
"chunk_index": chunk_index,
"metadata": metadata,
"metadata": {
**file.metadata,
**metadata,
},
}
)

Expand Down Expand Up @@ -207,5 +215,9 @@ def _append_chunks(
chunks_with_title.extend(chunks)
return chunks_with_title

async def __call__(self, elements: list[dict[str, Any]]) -> list[dict[str, Any]]:
return await self.split_grouped_elements(elements, self.splitter)
async def __call__(
self, elements: list[dict[str, Any]], file: File
) -> list[dict[str, Any]]:
return await self.split_grouped_elements(
elements=elements, file=file, splitter=self.splitter
)

0 comments on commit 6e97448

Please sign in to comment.