From 89db275ccb1c33c6ce7839d7c3589fef01963cdb Mon Sep 17 00:00:00 2001 From: Appointat Date: Wed, 23 Oct 2024 11:30:24 +0800 Subject: [PATCH 01/20] feat: Improve triplet extraction batch size and handling Co-authored-by: Appointat --- dbgpt/rag/transformer/triplet_extractor.py | 3 +- .../knowledge_graph/community_summary.py | 57 ++++++++++++------- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/dbgpt/rag/transformer/triplet_extractor.py b/dbgpt/rag/transformer/triplet_extractor.py index 7a591560f..60b5346f3 100644 --- a/dbgpt/rag/transformer/triplet_extractor.py +++ b/dbgpt/rag/transformer/triplet_extractor.py @@ -1,4 +1,5 @@ """TripletExtractor class.""" + import logging import re from typing import Any, List, Optional, Tuple @@ -12,7 +13,7 @@ "Some text is provided below. Given the text, " "extract up to knowledge triplets as more as possible " "in the form of (subject, predicate, object).\n" - "Avoid stopwords.\n" + "Avoid stopwords. The subject, predicate, object can not be none.\n" "---------------------\n" "Example:\n" "Text: Alice is Bob's mother.\n" diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 904b0beba..8c24d3f88 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -1,5 +1,6 @@ """Define the CommunitySummaryKnowledgeGraph.""" +import asyncio import logging import os import uuid @@ -63,6 +64,10 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig): default=5, description="Top size of knowledge graph chunk search", ) + triplet_extraction_batch_size: int = Field( + default=20, + description="Batch size of triplets extraction from the text", + ) class CommunitySummaryKnowledgeGraph(BuiltinKnowledgeGraph): @@ -96,6 +101,11 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): config.community_score_threshold, ) ) + self._triplet_extraction_batch_size = int( + os.getenv( + "TRIPLET_EXTRACTION_BATCH_SIZE", config.triplet_extraction_batch_size + ) + ) def extractor_configure(name: str, cfg: VectorStoreConfig): cfg.name = name @@ -189,30 +199,35 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: return document_graph_enabled = self._graph_store.get_config().document_graph_enabled - for chunk in chunks: - # TODO: Use asyncio to extract graph to accelerate the process - # (attention to the CAP of the graph db) + batch_size = self._triplet_extraction_batch_size + + for i in range(0, len(chunks), batch_size): + batch_chunks = chunks[i : i + batch_size] - graphs: List[MemoryGraph] = await self._graph_extractor.extract( - chunk.content + extraction_tasks = [ + self._graph_extractor.extract(chunk.content) for chunk in batch_chunks + ] + async_graphs: List[List[MemoryGraph]] = await asyncio.gather( + *extraction_tasks ) - for graph in graphs: - if document_graph_enabled: - # append the chunk id to the edge - for edge in graph.edges(): - edge.set_prop("_chunk_id", chunk.chunk_id) - graph.append_edge(edge=edge) - - # upsert the graph - self._graph_store_apdater.upsert_graph(graph) - - # chunk -> include -> entity - if document_graph_enabled: - for vertex in graph.vertices(): - self._graph_store_apdater.upsert_chunk_include_entity( - chunk=chunk, entity=vertex - ) + for chunk, graphs in zip(batch_chunks, async_graphs): + for graph in graphs: + if document_graph_enabled: + # append the chunk id to the edge + for edge in graph.edges(): + edge.set_prop("_chunk_id", chunk.chunk_id) + 
graph.append_edge(edge=edge)
+
+                    # upsert the graph
+                    self._graph_store_apdater.upsert_graph(graph)
+
+                    # chunk -> include -> entity
+                    if document_graph_enabled:
+                        for vertex in graph.vertices():
+                            self._graph_store_apdater.upsert_chunk_include_entity(
+                                chunk=chunk, entity=vertex
+                            )
 
     def _load_chunks(
         self, chunks: List[ParagraphChunk]

From f8e3ed1711d7e54a440eb6ec63fa308089fe2995 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 23 Oct 2024 11:42:10 +0800
Subject: [PATCH 02/20] feat: Improve triplet extraction batch size and handling

Co-authored-by: Appointat
---
 .env.template                                   | 1 +
 docs/docs/cookbook/rag/graph_rag_app_develop.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.env.template b/.env.template
index 45e075d67..5adb248be 100644
--- a/.env.template
+++ b/.env.template
@@ -167,6 +167,7 @@ TRIPLET_GRAPH_ENABLED=True # enable the graph search for triplets
 DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks
 KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the top size of knowledge graph search for chunks
+TRIPLET_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text
 
 ### Chroma vector db config
 #CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data

diff --git a/docs/docs/cookbook/rag/graph_rag_app_develop.md b/docs/docs/cookbook/rag/graph_rag_app_develop.md
index c63a66e6e..8c548bc22 100644
--- a/docs/docs/cookbook/rag/graph_rag_app_develop.md
+++ b/docs/docs/cookbook/rag/graph_rag_app_develop.md
@@ -116,6 +116,7 @@ GRAPH_COMMUNITY_SUMMARY_ENABLED=True # enable the graph community summary
 TRIPLET_GRAPH_ENABLED=True # enable the graph search for the triplets
 DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks
 KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the number of the searched triplets in a retrieval
+TRIPLET_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text
 ```

From a57029e561140b9adf8f52387a8909a832818847 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Mon, 28 Oct 2024 16:30:50 +0800
Subject: [PATCH 03/20] refactor: Add batch_extract method to ExtractorBase

Co-authored-by: Appointat
---
 dbgpt/rag/transformer/base.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/dbgpt/rag/transformer/base.py b/dbgpt/rag/transformer/base.py
index 289f02887..ad93101e0 100644
--- a/dbgpt/rag/transformer/base.py
+++ b/dbgpt/rag/transformer/base.py
@@ -1,7 +1,10 @@
 """Transformer base class."""
+
 import logging
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import List, Optional, Union
+
+from dbgpt.core import Chunk
 
 logger = logging.getLogger(__name__)
@@ -37,6 +40,15 @@ class ExtractorBase(TransformerBase, ABC):
     async def extract(self, text: str, limit: Optional[int] = None) -> List:
         """Extract results from text."""
 
+    @abstractmethod
+    async def batch_extract(
+        self,
+        texts: Union[List[str], List[Chunk]],
+        batch_size: int = 1,
+        limit: Optional[int] = None,
+    ) -> List:
+        """Batch extract results from texts."""
+
 
 class TranslatorBase(TransformerBase, ABC):
     """Translator base class."""

From 3fc7640d1e0d544f2ac4eec61e359d3f88845104 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Mon, 28 Oct 2024 16:31:15 +0800
Subject: [PATCH 04/20] refactor: Add batch_extract method to GraphExtractor

Co-authored-by: Appointat
---
 dbgpt/rag/transformer/graph_extractor.py | 83 +++++++++++++++++++-----
 1 file changed, 66 insertions(+), 17 deletions(-)

diff --git a/dbgpt/rag/transformer/graph_extractor.py b/dbgpt/rag/transformer/graph_extractor.py
index 12751e89f..32ecee2ff 100644 --- a/dbgpt/rag/transformer/graph_extractor.py +++ b/dbgpt/rag/transformer/graph_extractor.py @@ -1,8 +1,9 @@ """GraphExtractor class.""" +import asyncio import logging import re -from typing import List, Optional +from typing import Dict, List, Optional, Tuple, Union from dbgpt.core import Chunk, LLMClient from dbgpt.rag.transformer.llm_extractor import LLMExtractor @@ -21,37 +22,85 @@ def __init__( """Initialize the GraphExtractor.""" super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN) self._chunk_history = chunk_history + self._chunk_context_map: Dict[str, str] = {} config = self._chunk_history.get_config() + self._vector_space = config.name self._max_chunks_once_load = config.max_chunks_once_load self._max_threads = config.max_threads self._topk = config.topk self._score_threshold = config.score_threshold - async def extract(self, text: str, limit: Optional[int] = None) -> List: - """Load similar chunks.""" - # load similar chunks - chunks = await self._chunk_history.asimilar_search_with_scores( - text, self._topk, self._score_threshold - ) - history = [ - f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks) - ] - context = "\n".join(history) if history else "" - - try: - # extract with chunk history - return await super()._extract(text, context, limit) + async def aload_chunk_context(self, texts: List[str]) -> None: + """Load chunk context.""" + for text in texts: + # Load similar chunks + chunks = await self._chunk_history.asimilar_search_with_scores( + text, self._topk, self._score_threshold + ) + history = [f"Section {i + 1}:\n{chunk}" for i, chunk in enumerate(chunks)] - finally: - # save chunk to history + # Save chunk to history await self._chunk_history.aload_document_with_limit( [Chunk(content=text, metadata={"relevant_cnt": len(history)})], self._max_chunks_once_load, self._max_threads, ) + # Save chunk context to map + context = "\n".join(history) if history else "" + self._chunk_context_map[text] = context + + async def extract(self, text: str, limit: Optional[int] = None) -> List: + """Load similar chunks. + + Suggestion: to extract triplets in batches, call `batch_extract`. + """ + if text not in self._chunk_context_map: + await self.aload_chunk_context([Chunk(content=text)]) + context = self._chunk_context_map.get(text, "") + + # Extract with chunk history + return await super()._extract(text, context, limit) + + async def batch_extract( + self, + texts: Union[List[str], List[Chunk]], + batch_size: int = 1, + limit: Optional[int] = None, + ) -> List[Tuple[Chunk, List[Graph]]]: + """Extract graphs from chunks in batches.""" + if isinstance(texts, list) and any( + not isinstance(chunk, Chunk) for chunk in texts + ): + raise ValueError("Chunks should be a list of Chunk objects, not strings.") + chunks: List[Chunk] = texts # type: ignore[assignment] + + # 1. Load chunk context + chunk_content_list = [chunk.content for chunk in chunks] + await self.aload_chunk_context(chunk_content_list) + + chunk_graph_pairs: List[Tuple[Chunk, List[Graph]]] = [] + total_batches = (len(chunks) + batch_size - 1) // batch_size + + for batch_idx in range(total_batches): + start_idx = batch_idx * batch_size + end_idx = min((batch_idx + 1) * batch_size, len(chunks)) + batch_chunks = chunks[start_idx:end_idx] + + # 2. Process extraction in parallel + extraction_tasks = [ + self.extract(chunk.content, limit) for chunk in batch_chunks + ] + batch_graphs = await asyncio.gather(*extraction_tasks) + + # 3. 
Zip chunks with their corresponding graphs to maintain the relationship + batch_graph_pairs = list(zip(batch_chunks, batch_graphs)) + chunk_graph_pairs.extend(batch_graph_pairs) + + return chunk_graph_pairs + def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]: graph = MemoryGraph() edge_count = 0 From fee90cc484487181e7acd8e35e076123fc02b53b Mon Sep 17 00:00:00 2001 From: Appointat Date: Mon, 28 Oct 2024 16:31:38 +0800 Subject: [PATCH 05/20] refactor: Add batch_extract method to LLMExtractor Co-authored-by: Appointat --- dbgpt/rag/transformer/llm_extractor.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/dbgpt/rag/transformer/llm_extractor.py b/dbgpt/rag/transformer/llm_extractor.py index 494096d51..c8324c607 100644 --- a/dbgpt/rag/transformer/llm_extractor.py +++ b/dbgpt/rag/transformer/llm_extractor.py @@ -1,9 +1,10 @@ """TripletExtractor class.""" + import logging from abc import ABC, abstractmethod -from typing import List, Optional +from typing import List, Optional, Union -from dbgpt.core import HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest +from dbgpt.core import Chunk, HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest from dbgpt.rag.transformer.base import ExtractorBase logger = logging.getLogger(__name__) @@ -22,6 +23,21 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: """Extract by LLM.""" return await self._extract(text, None, limit) + async def batch_extract( + self, + texts: Union[List[str], List[Chunk]], + batch_size: int = 1, + limit: Optional[int] = None, + ) -> List: + """Batch extract by LLM.""" + if isinstance(texts, list) and any(not isinstance(text, str) for text in texts): + raise ValueError("All elements must be strings") + + results = [] + for text in texts: + results.append(await self.extract(text, limit)) + return results + async def _extract( self, text: str, history: str = None, limit: Optional[int] = None ) -> List: From ccd2cdff970fdad7ecb6fae9dd913748bb581daa Mon Sep 17 00:00:00 2001 From: Appointat Date: Mon, 28 Oct 2024 16:31:58 +0800 Subject: [PATCH 06/20] refactor: Refactor CommunitySummaryKnowledgeGraph batch extraction method Co-authored-by: Appointat --- .../knowledge_graph/community_summary.py | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 8c24d3f88..1f5dd59a0 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -1,6 +1,5 @@ """Define the CommunitySummaryKnowledgeGraph.""" -import asyncio import logging import os import uuid @@ -199,35 +198,28 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: return document_graph_enabled = self._graph_store.get_config().document_graph_enabled - batch_size = self._triplet_extraction_batch_size - for i in range(0, len(chunks), batch_size): - batch_chunks = chunks[i : i + batch_size] - - extraction_tasks = [ - self._graph_extractor.extract(chunk.content) for chunk in batch_chunks - ] - async_graphs: List[List[MemoryGraph]] = await asyncio.gather( - *extraction_tasks - ) + chunk_graph_pairs = await self._graph_extractor.batch_extract( + chunks, batch_size=self._triplet_extraction_batch_size + ) - for chunk, graphs in zip(batch_chunks, async_graphs): - for graph in graphs: - if document_graph_enabled: - # append the chunk id to the edge - for edge in graph.edges(): 
- edge.set_prop("_chunk_id", chunk.chunk_id) - graph.append_edge(edge=edge) - - # upsert the graph - self._graph_store_apdater.upsert_graph(graph) - - # chunk -> include -> entity - if document_graph_enabled: - for vertex in graph.vertices(): - self._graph_store_apdater.upsert_chunk_include_entity( - chunk=chunk, entity=vertex - ) + for chunk, graphs in chunk_graph_pairs: + for graph in graphs: + if document_graph_enabled: + # Append the chunk id to the edge + for edge in graph.edges(): + edge.set_prop("_chunk_id", chunk.chunk_id) + graph.append_edge(edge=edge) + + # Upsert the graph + self._graph_store_apdater.upsert_graph(graph) + + # chunk -> include -> entity + if document_graph_enabled: + for vertex in graph.vertices(): + self._graph_store_apdater.upsert_chunk_include_entity( + chunk=chunk, entity=vertex + ) def _load_chunks( self, chunks: List[ParagraphChunk] From 3f65e4970e86dd1aeb4071a66a388119c1e0737d Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:15:41 +0800 Subject: [PATCH 07/20] refactor: Update knowledge graph extraction batch size --- .env.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.template b/.env.template index 5adb248be..453e2c6d3 100644 --- a/.env.template +++ b/.env.template @@ -167,7 +167,7 @@ TRIPLET_GRAPH_ENABLED=True # enable the graph search for triplets DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the top size of knowledge graph search for chunks -TRIPLET_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text +KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text ### Chroma vector db config #CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data From a253542110c1176acfe5336486d58b406e0a9bee Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:16:43 +0800 Subject: [PATCH 08/20] refactor: Update knowledge graph extraction batch size --- docs/docs/cookbook/rag/graph_rag_app_develop.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/cookbook/rag/graph_rag_app_develop.md b/docs/docs/cookbook/rag/graph_rag_app_develop.md index 8c548bc22..0cdca75d4 100644 --- a/docs/docs/cookbook/rag/graph_rag_app_develop.md +++ b/docs/docs/cookbook/rag/graph_rag_app_develop.md @@ -116,7 +116,7 @@ GRAPH_COMMUNITY_SUMMARY_ENABLED=True # enable the graph community summary TRIPLET_GRAPH_ENABLED=True # enable the graph search for the triplets DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the number of the searched triplets in a retrieval -TRIPLET_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text +KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text ``` From c565600ce7ee7355b09dc3102c919d756c0b5ddb Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:19:00 +0800 Subject: [PATCH 09/20] Refactor batch extraction methods in GraphExtractor and LLMExtractor --- dbgpt/rag/transformer/base.py | 6 +- dbgpt/rag/transformer/graph_extractor.py | 78 +++++++++++++++--------- dbgpt/rag/transformer/llm_extractor.py | 25 +++++--- 3 files changed, 68 insertions(+), 41 deletions(-) diff --git a/dbgpt/rag/transformer/base.py b/dbgpt/rag/transformer/base.py index ad93101e0..a71c2da14 100644 --- a/dbgpt/rag/transformer/base.py +++ b/dbgpt/rag/transformer/base.py @@ -2,9 +2,7 @@ import logging from abc import ABC, abstractmethod -from 
typing import List, Optional, Union - -from dbgpt.core import Chunk +from typing import List, Optional logger = logging.getLogger(__name__) @@ -43,7 +41,7 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: @abstractmethod async def batch_extract( self, - texts: Union[List[str], List[Chunk]], + texts: List[str], batch_size: int = 1, limit: Optional[int] = None, ) -> List: diff --git a/dbgpt/rag/transformer/graph_extractor.py b/dbgpt/rag/transformer/graph_extractor.py index 32ecee2ff..ef34ce246 100644 --- a/dbgpt/rag/transformer/graph_extractor.py +++ b/dbgpt/rag/transformer/graph_extractor.py @@ -3,7 +3,7 @@ import asyncio import logging import re -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional from dbgpt.core import Chunk, LLMClient from dbgpt.rag.transformer.llm_extractor import LLMExtractor @@ -22,7 +22,6 @@ def __init__( """Initialize the GraphExtractor.""" super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN) self._chunk_history = chunk_history - self._chunk_context_map: Dict[str, str] = {} config = self._chunk_history.get_config() @@ -32,8 +31,10 @@ def __init__( self._topk = config.topk self._score_threshold = config.score_threshold - async def aload_chunk_context(self, texts: List[str]) -> None: + async def aload_chunk_context(self, texts: List[str]) -> Dict[str, str]: """Load chunk context.""" + text_context_map: Dict[str, str] = {} + for text in texts: # Load similar chunks chunks = await self._chunk_history.asimilar_search_with_scores( @@ -50,56 +51,75 @@ async def aload_chunk_context(self, texts: List[str]) -> None: # Save chunk context to map context = "\n".join(history) if history else "" - self._chunk_context_map[text] = context + text_context_map[text] = context + return text_context_map async def extract(self, text: str, limit: Optional[int] = None) -> List: - """Load similar chunks. + """Extract graphs from text. Suggestion: to extract triplets in batches, call `batch_extract`. """ - if text not in self._chunk_context_map: - await self.aload_chunk_context([Chunk(content=text)]) - context = self._chunk_context_map.get(text, "") + # Load similar chunks + chunks = await self._chunk_history.asimilar_search_with_scores( + text, self._topk, self._score_threshold + ) + history = [f"Section {i + 1}:\n{chunk}" for i, chunk in enumerate(chunks)] + + # Save chunk to history + await self._chunk_history.aload_document_with_limit( + [Chunk(content=text, metadata={"relevant_cnt": len(history)})], + self._max_chunks_once_load, + self._max_threads, + ) + + # Save chunk context to map + context = "\n".join(history) if history else "" # Extract with chunk history return await super()._extract(text, context, limit) async def batch_extract( self, - texts: Union[List[str], List[Chunk]], + texts: List[str], batch_size: int = 1, limit: Optional[int] = None, - ) -> List[Tuple[Chunk, List[Graph]]]: - """Extract graphs from chunks in batches.""" - if isinstance(texts, list) and any( - not isinstance(chunk, Chunk) for chunk in texts - ): - raise ValueError("Chunks should be a list of Chunk objects, not strings.") - chunks: List[Chunk] = texts # type: ignore[assignment] + ) -> List[List[Graph]]: + """Extract graphs from chunks in batches. + Returns list of graphs in same order as input texts (text <-> graphs). + """ # 1. 
Load chunk context - chunk_content_list = [chunk.content for chunk in chunks] - await self.aload_chunk_context(chunk_content_list) + text_context_map = await self.aload_chunk_context(texts) - chunk_graph_pairs: List[Tuple[Chunk, List[Graph]]] = [] - total_batches = (len(chunks) + batch_size - 1) // batch_size + # Pre-allocate results list to maintain order + graphs_list: List[List[Graph]] = [None] * len(texts) + total_batches = (len(texts) + batch_size - 1) // batch_size for batch_idx in range(total_batches): start_idx = batch_idx * batch_size - end_idx = min((batch_idx + 1) * batch_size, len(chunks)) - batch_chunks = chunks[start_idx:end_idx] + end_idx = min((batch_idx + 1) * batch_size, len(texts)) + batch_texts = texts[start_idx:end_idx] - # 2. Process extraction in parallel + # 2. Create tasks with their original indices extraction_tasks = [ - self.extract(chunk.content, limit) for chunk in batch_chunks + ( + idx, + self._extract(text, text_context_map[text], limit), + ) + for idx, text in enumerate(batch_texts, start=start_idx) ] - batch_graphs = await asyncio.gather(*extraction_tasks) - # 3. Zip chunks with their corresponding graphs to maintain the relationship - batch_graph_pairs = list(zip(batch_chunks, batch_graphs)) - chunk_graph_pairs.extend(batch_graph_pairs) + # 3. Process extraction in parallel while keeping track of indices + batch_results = await asyncio.gather( + *(task for _, task in extraction_tasks) + ) + + # 4. Place results in the correct positions + for (idx, _), graphs in zip(extraction_tasks, batch_results): + graphs_list[idx] = graphs - return chunk_graph_pairs + assert all(x is not None for x in graphs_list), "All positions should be filled" + return graphs_list def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]: graph = MemoryGraph() diff --git a/dbgpt/rag/transformer/llm_extractor.py b/dbgpt/rag/transformer/llm_extractor.py index c8324c607..15c985de8 100644 --- a/dbgpt/rag/transformer/llm_extractor.py +++ b/dbgpt/rag/transformer/llm_extractor.py @@ -1,10 +1,11 @@ """TripletExtractor class.""" +import asyncio import logging from abc import ABC, abstractmethod -from typing import List, Optional, Union +from typing import List, Optional -from dbgpt.core import Chunk, HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest +from dbgpt.core import HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest from dbgpt.rag.transformer.base import ExtractorBase logger = logging.getLogger(__name__) @@ -25,17 +26,25 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: async def batch_extract( self, - texts: Union[List[str], List[Chunk]], + texts: List[str], batch_size: int = 1, limit: Optional[int] = None, ) -> List: """Batch extract by LLM.""" - if isinstance(texts, list) and any(not isinstance(text, str) for text in texts): - raise ValueError("All elements must be strings") - results = [] - for text in texts: - results.append(await self.extract(text, limit)) + + for i in range(0, len(texts), batch_size): + batch_texts = texts[i : i + batch_size] + + # Create tasks for current batch + extraction_tasks = [ + self._extract(text, None, limit) for text in batch_texts + ] + + # Execute batch concurrently and wait for all to complete + batch_results = await asyncio.gather(*extraction_tasks) + results.extend(batch_results) + return results async def _extract( From a4e602ed0f552c4d6171818dd80bec7f9fd5fdc4 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:19:05 +0800 Subject: [PATCH 10/20] Refactor 
knowledge graph extraction batch size and method in CommunitySummaryKnowledgeGraph --- .../knowledge_graph/community_summary.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 1f5dd59a0..e33779ee5 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -63,7 +63,7 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig): default=5, description="Top size of knowledge graph chunk search", ) - triplet_extraction_batch_size: int = Field( + knowledge_graph_extraction_batch_size: int = Field( default=20, description="Batch size of triplets extraction from the text", ) @@ -102,7 +102,8 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): ) self._triplet_extraction_batch_size = int( os.getenv( - "TRIPLET_EXTRACTION_BATCH_SIZE", config.triplet_extraction_batch_size + "KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE", + config.knowledge_graph_extraction_batch_size, ) ) @@ -199,16 +200,20 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: document_graph_enabled = self._graph_store.get_config().document_graph_enabled - chunk_graph_pairs = await self._graph_extractor.batch_extract( - chunks, batch_size=self._triplet_extraction_batch_size + # Extract the triplets from the chunks, and return the list of graphs + # in the same order as the input texts + graphs_list = await self._graph_extractor.batch_extract( + [chunk.content for chunk in chunks], + batch_size=self._triplet_extraction_batch_size, ) - for chunk, graphs in chunk_graph_pairs: + # Upsert the graphs into the graph store + for idx, graphs in enumerate(graphs_list): for graph in graphs: if document_graph_enabled: # Append the chunk id to the edge for edge in graph.edges(): - edge.set_prop("_chunk_id", chunk.chunk_id) + edge.set_prop("_chunk_id", chunks[idx].chunk_id) graph.append_edge(edge=edge) # Upsert the graph @@ -218,7 +223,7 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: if document_graph_enabled: for vertex in graph.vertices(): self._graph_store_apdater.upsert_chunk_include_entity( - chunk=chunk, entity=vertex + chunk=chunks[idx], entity=vertex ) def _load_chunks( From 7d4d7f4b15fb67d072bef2ec318dd687a250ce57 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:25:35 +0800 Subject: [PATCH 11/20] refactor: Refactor batch extraction methods in GraphExtractor and LLMExtractor --- dbgpt/rag/transformer/graph_extractor.py | 23 ++++++++--------------- dbgpt/rag/transformer/llm_extractor.py | 3 +++ 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/dbgpt/rag/transformer/graph_extractor.py b/dbgpt/rag/transformer/graph_extractor.py index ef34ce246..7a02f74d1 100644 --- a/dbgpt/rag/transformer/graph_extractor.py +++ b/dbgpt/rag/transformer/graph_extractor.py @@ -40,7 +40,9 @@ async def aload_chunk_context(self, texts: List[str]) -> Dict[str, str]: chunks = await self._chunk_history.asimilar_search_with_scores( text, self._topk, self._score_threshold ) - history = [f"Section {i + 1}:\n{chunk}" for i, chunk in enumerate(chunks)] + history = [ + f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks) + ] # Save chunk to history await self._chunk_history.aload_document_with_limit( @@ -60,20 +62,8 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: Suggestion: to extract triplets in batches, call 
`batch_extract`. """ # Load similar chunks - chunks = await self._chunk_history.asimilar_search_with_scores( - text, self._topk, self._score_threshold - ) - history = [f"Section {i + 1}:\n{chunk}" for i, chunk in enumerate(chunks)] - - # Save chunk to history - await self._chunk_history.aload_document_with_limit( - [Chunk(content=text, metadata={"relevant_cnt": len(history)})], - self._max_chunks_once_load, - self._max_threads, - ) - - # Save chunk context to map - context = "\n".join(history) if history else "" + text_context_map = await self.aload_chunk_context([text]) + context = text_context_map[text] # Extract with chunk history return await super()._extract(text, context, limit) @@ -88,6 +78,9 @@ async def batch_extract( Returns list of graphs in same order as input texts (text <-> graphs). """ + if batch_size < 1: + raise ValueError("batch_size >= 1") + # 1. Load chunk context text_context_map = await self.aload_chunk_context(texts) diff --git a/dbgpt/rag/transformer/llm_extractor.py b/dbgpt/rag/transformer/llm_extractor.py index 15c985de8..049611bc7 100644 --- a/dbgpt/rag/transformer/llm_extractor.py +++ b/dbgpt/rag/transformer/llm_extractor.py @@ -31,6 +31,9 @@ async def batch_extract( limit: Optional[int] = None, ) -> List: """Batch extract by LLM.""" + if batch_size < 1: + raise ValueError("batch_size >= 1") + results = [] for i in range(0, len(texts), batch_size): From 5aaa39351340dddaa900628a9cc89431b8be9e86 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 21:36:19 +0800 Subject: [PATCH 12/20] feat: Refactor knowledge graph extraction batch size and method in TuGraphStoreAdapter --- .../community/tugraph_store_adapter.py | 65 +++++++++++++------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index d65969d76..577a68a4b 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -560,29 +560,42 @@ def explore( rel = f"<-[r:{GraphElemType.RELATION.value}*{depth_string}]-" else: rel = f"-[r:{GraphElemType.RELATION.value}*{depth_string}]-" - query = ( + path_query = ( f"MATCH p=(n:{GraphElemType.ENTITY.value})" f"{rel}(m:{GraphElemType.ENTITY.value}) " f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " - f"RETURN p {limit_string}" + f"RETURN n {limit_string}" ) - return self.query(query) + return self.query(path_query, white_list=["description"]) else: graph = MemoryGraph() for sub in subs: - query = ( + # Query the chain from documents to chunks, + # document -> chunk -> chunk -> chunk -> ... 
+ chain_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" - f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]-" + f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " f"'{self._escape_quotes(sub)}' " - f"RETURN p {limit_string}" - ) # if it contains the subjects - result = self.query(query) - for vertex in result.vertices(): - graph.upsert_vertex(vertex) - for edge in result.edges(): - graph.append_edge(edge) + f"RETURN p" + ) + # Query and filter all the properties + graph_of_path = self.query(query=chain_query, white_list=[""]) + graph.upsert_graph(graph_of_path) + + # Query the leaf chunks in the chain from documents to chunks + leaf_chunk_query = ( + f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" + f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" + f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " + f"'{self._escape_quotes(sub)}' " + f"RETURN m {limit_string}" + ) + graph_of_leaf_chunk = self.query( + query=leaf_chunk_query, white_list=["content"] + ) + graph.upsert_graph(graph_of_leaf_chunk) return graph @@ -607,6 +620,7 @@ def query(self, query: str, **kwargs) -> MemoryGraph: vertices, edges = self._get_nodes_edges_from_queried_data( query_result, white_list ) + mg = MemoryGraph() for vertex in vertices: mg.upsert_vertex(vertex) @@ -714,7 +728,7 @@ def _get_nodes_edges_from_queried_data( from neo4j import graph def filter_properties( - properties: dict[str, Any], white_list: List[str] + properties: dict[str, Any], white_list: List[str] = None ) -> Dict[str, Any]: """Filter the properties. @@ -723,13 +737,26 @@ def filter_properties( entity_properties = ["id", "name", "description", "_document_id", "_chunk_id", "_community_id"] edge_properties = ["id", "name", "description", "_chunk_id"] + Args: + properties: Dictionary of properties to filter + white_list: List of properties to keep + - If None: Keep default properties (those not starting with '_' + and not in ['id', 'name']) + - If [""]: Remove all properties (return empty dict) + - If list of strings: Keep only properties in white_list """ - return { - key: value - for key, value in properties.items() - if (not key.startswith("_") and key not in ["id", "name"]) - or key in white_list - } + return ( + {} + if white_list == [""] + else { + key: value + for key, value in properties.items() + if ( + (not key.startswith("_") and key not in ["id", "name"]) + or (white_list is not None and key in white_list) + ) + } + ) # Parse the data to nodes and relationships for record in data: From e8b82dba222a99815cf0ec04ceb5f66f7c4ba726 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 21:36:26 +0800 Subject: [PATCH 13/20] refactor: Update knowledge graph extraction batch size and method in CommunitySummaryKnowledgeGraph --- .../knowledge_graph/community_summary.py | 255 ++++++------------ 1 file changed, 84 insertions(+), 171 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index e33779ee5..a5eca38a5 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -100,6 +100,12 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): config.community_score_threshold, ) ) + self._knowledge_graph_chunk_search_top_size = int( + os.getenv( + "KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE", + config.knowledge_graph_chunk_search_top_size, + ) + ) self._triplet_extraction_batch_size = int( os.getenv( 
"KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE", @@ -314,14 +320,14 @@ async def asimilar_search_with_scores( subgraph_for_doc = self._graph_store_apdater.explore( subs=keywords_for_document_graph, - limit=self._config.knowledge_graph_chunk_search_top_size, + limit=self._knowledge_graph_chunk_search_top_size, search_scope="document_graph", ) else: if document_graph_enabled: subgraph_for_doc = self._graph_store_apdater.explore( subs=keywords, - limit=self._config.knowledge_graph_chunk_search_top_size, + limit=self._knowledge_graph_chunk_search_top_size, search_scope="document_graph", ) knowledge_graph_str = subgraph.format() if subgraph else "" @@ -335,7 +341,7 @@ async def asimilar_search_with_scores( return [] # merge search results into context - content = HYBRID_SEARCH_PT_CN.format( + content = HYBRID_SEARCH_PT.format( context=context, knowledge_graph=knowledge_graph_str, knowledge_graph_for_doc=knowledge_graph_for_doc_str, @@ -365,179 +371,86 @@ def delete_vector_name(self, index_name: str): self._graph_extractor.drop() -HYBRID_SEARCH_PT_CN = """## 角色 -你非常擅长结合提示词模板提供的[上下文]信息与[知识图谱]信息, -准确恰当地回答用户的问题,并保证不会输出与上下文和知识图谱无关的信息。 - -## 技能 -### 技能 1: 上下文理解 -- 准确地理解[上下文]提供的信息,上下文信息可能被拆分为多个章节。 -- 上下文的每个章节内容都会以[Section]开始,并按需进行了编号。 -- 上下文信息提供了与用户问题相关度最高的总结性描述,请合理使用它们。 -### 技能 2: 知识图谱理解 -- 准确地识别[知识图谱]中提供的[Entities:]章节中的实体信息和[Relationships:]章节中的关系信息,实体和关系信息的一般格式为: -``` -* 实体信息格式: -- (实体名) -- (实体名:实体描述) -- (实体名:实体属性表) -- (文本块ID:文档块内容) -- (目录ID:目录名) -- (文档ID:文档名称) - -* 关系信息的格式: -- (来源实体名)-[关系名]->(目标实体名) -- (来源实体名)-[关系名:关系描述]->(目标实体名) -- (来源实体名)-[关系名:关系属性表]->(目标实体名) -- (文本块实体)-[包含]->(实体名) -- (目录ID)-[包含]->(文本块实体) -- (目录ID)-[包含]->(子目录ID) -- (文档ID)-[包含]->(文本块实体) -- (文档ID)-[包含]->(目录ID) -``` -- 正确地将关系信息中的实体名/ID与实体信息关联,还原出图结构。 -- 将图结构所表达的信息作为用户提问的明细上下文,辅助生成更好的答案。 - - -## 约束条件 -- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。 -- 若[知识图谱]或者[知识库原文]没有提供信息,此时应根据[上下文]提供的信息回答问题。 -- 确保以第三人称书写,从客观角度结合[上下文]、[知识图谱]和[知识库原文]表达的信息回答问题。 -- 若提供的信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。 -- 避免使用停用词和过于常见的词汇。 - -## 参考案例 -``` -[上下文]: -Section 1: -菲尔・贾伯的大儿子叫雅各布・贾伯。 -Section 2: -菲尔・贾伯的小儿子叫比尔・贾伯。 - -[知识图谱]: -Entities: -(菲尔・贾伯#菲尔兹咖啡创始人) -(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌) -(雅各布・贾伯#菲尔・贾伯的儿子) -(美国多地#菲尔兹咖啡的扩展地区) - -Relationships: -(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立) -(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点) -(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子) -(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官) -(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围) - -[知识库原文]: -... -``` - ----- - -接下来的[上下文]、[知识图谱]和[知识库原文]的信息,可以帮助你回答更好地用户的问题。 - -[上下文]: -{context} - -[知识图谱]: -{knowledge_graph} - -[知识库原文] -{knowledge_graph_for_doc} -""" # noqa: E501 - -HYBRID_SEARCH_PT_EN = """## Role -You excel at combining the information provided in the [Context] with -information from the [KnowledgeGraph] to accurately and appropriately -answer user questions, ensuring that you do not output information -unrelated to the context and knowledge graph. - -## Skills -### Skill 1: Context Understanding -- Accurately understand the information provided in the [Context], -which may be divided into several sections. -- Each section in the context will start with [Section] -and may be numbered as needed. -- The context provides a summary description most relevant to the user's -question, and it should be used wisely. -### Skill 2: Knowledge Graph Understanding -- Accurately identify entity information in the [Entities:] section and -relationship information in the [Relationships:] section -of the [KnowledgeGraph]. 
The general format for entity -and relationship information is: -``` -* Entity Information Format: -- (entity_name) -- (entity_name: entity_description) -- (entity_name: entity_property_map) -- (chunk_id: chunk_content) -- (catalog_id: catalog_name) -- (document_id: document_name) - -* Relationship Information Format: -- (source_entity_name)-[relationship_name]->(target_entity_name) -- (source_entity_name)-[relationship_name: relationship_description]->(target_entity_name) -- (source_entity_name)-[relationship_name: relationship_property_map]->(target_entity_name) -- (chunk_id)-[Contains]->(entity_name) -- (catalog_id)-[Contains]->(chunk_id) -- (catalog_id)-[Contains]->(sub_catalog_id) -- (document_id)-[Contains]->(chunk_id) -- (document_id)-[Contains]->(catalog_id) -``` -- Correctly associate entity names/IDs in the relationship information -with entity information to restore the graph structure. -- Use the information expressed by the graph structure as detailed -context for the user's query to assist in generating better answers. - -## Constraints -- Don't describe your thought process in the answer, provide the answer -to the user's question directly without generating irrelevant information. -- If the [KnowledgeGraph] or [Knowledge base original text] does not provide information, you should answer -the question based on the information provided in the [Context]. -- Ensure to write in the third person, responding to questions from -an objective perspective based on the information combined from the -[Context], the [KnowledgeGraph] and the [Knowledge base original text]. -- If the provided information is contradictory, resolve the -contradictions and provide a single, coherent description. -- Avoid using stop words and overly common vocabulary. - -## Reference Example -``` -[Context]: -Section 1: -Phil Schiller's eldest son is Jacob Schiller. -Section 2: -Phil Schiller's youngest son is Bill Schiller. - -[KnowledgeGraph]: -Entities: -(Phil Jaber#Founder of Philz Coffee) -(Philz Coffee#Coffee brand founded in Berkeley, California) -(Jacob Jaber#Son of Phil Jaber) -(Multiple locations in the USA#Expansion regions of Philz Coffee) - -Relationships: -(Phil Jaber#Created#Philz Coffee#Founded in Berkeley, California in 1978) -(Philz Coffee#Located in#Berkeley, California#Founding location of Philz Coffee) -(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber) -(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005) -(Philz Coffee#Expanded to#Multiple locations in the USA#Expansion regions of Philz Coffee) - -[Knowledge base original text] -... -``` - ----- - -The following information from the [Context], [KnowledgeGraph] and [Knowledge base original text] -can help you better answer user questions. +HYBRID_SEARCH_PT = """ +===== +[Context]、[Knowledge Graph]和[Original Text From RAG]的信息,可以帮助你回答更好地用户的问题。 [Context]: {context} -[KnowledgeGraph]: +[Knowledge Graph]: {knowledge_graph} -[Knowledge base original text] +[Original Text From RAG] {knowledge_graph_for_doc} +===== + +You are very good at combining the [Context] information provided by the prompt word template with the [Knowledge Graph] information, +answering the user's questions accurately and appropriately, and ensuring that no information irrelevant to the context and knowledge graph is output. + +## Role: GraphRAG Assistant + +### Core Capabilities +0. Make sure DO NOT answer irrelevant questions from the user. + +1. 
Information Processing +- Process contextual information across multiple sections ([Section] markers) +- Interpret knowledge graph relationships ((entity)-[relationship]->(entity)) +- Synthesize information from both structured and unstructured sources + +2. Response Generation +- Provide nuanced, multi-perspective answers +- Balance technical accuracy with conversational engagement +- Connect related concepts across different information sources +- Highlight uncertainties and limitations when appropriate + +3. Interaction Style +- Maintain a natural, engaging conversation flow +- Ask clarifying questions when needed +- Provide examples and analogies to illustrate complex points +- Adapt explanation depth based on user's apparent expertise + +4. Knowledge Integration +- Seamlessly blend information from: + * Context sections + * Knowledge graph relationships + * Background knowledge (when appropriate) +- Prioritize relevance over comprehensiveness +- Acknowledge information gaps explicitly + +5. Quality Assurance +- Verify logical consistency across sources +- Cross-reference relationships for validation +- Flag potential contradictions or ambiguities +- Provide confidence levels when appropriate + +### Information Sources Handling +1. Context Processing [Context] +- Parse information from numbered sections systematically +- Identify key concepts and relationships within each section +- Track section dependencies and cross-references +- Prioritize recent/relevant sections for the query + +2. Knowledge Graph Integration [Knowledge Graph] +- Parse Entities and Relationships sections separately +- Map entity-relationship-entity triples accurately +- Understand relationship directionality +- Use graph structure to find connected information + +3. Original Text Reference [Original Text From RAG] +- The GraphRAG document directory is stored as an edge in relationships to show the hierarchy of the current source text in the entire document. +- Use as authoritative source for detailed information +- Cross-reference with Context and Knowledge Graph +- Extract supporting evidence and examples +- Resolve conflicts between sources using this as primary reference + +### Output Format +1. Answer Structure +- Lead with synthesized core information +- Support with specific references to sources +- Include relevant entity-relationship pairs +- Conclude with confidence assessment +- Use the markdown format of the "quote" to highlight the original text from "GraphRAG" + +===== """ # noqa: E501 From 0b872189c9ede6174cae2173650d8fc8f0f33905 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 21:43:58 +0800 Subject: [PATCH 14/20] Refactor method signature in TuGraphStoreAdapter --- .../storage/knowledge_graph/community/tugraph_store_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index 577a68a4b..9a107785c 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -728,7 +728,7 @@ def _get_nodes_edges_from_queried_data( from neo4j import graph def filter_properties( - properties: dict[str, Any], white_list: List[str] = None + properties: dict[str, Any], white_list: Optional[List[str]] = None ) -> Dict[str, Any]: """Filter the properties. 
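Note: patches 03-11 converge on a single batching idiom — slice the input texts, fan each slice out with asyncio.gather, and keep the results index-aligned with the inputs so each chunk can be zipped back to its graphs. A minimal, self-contained sketch of that idiom (`demo_extract` is an illustrative stand-in for the LLM-backed `_extract` call, not part of these diffs):

```python
import asyncio
from typing import Any, List


async def demo_extract(text: str) -> List[Any]:
    """Illustrative stand-in for an LLM-backed extraction call."""
    await asyncio.sleep(0.01)  # simulate one LLM round trip
    return [f"graph-from:{text}"]


async def batch_extract(texts: List[str], batch_size: int = 1) -> List[List[Any]]:
    """Slice inputs, extract each slice concurrently, keep results ordered."""
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    results: List[List[Any]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        # asyncio.gather returns results in submission order, so the output
        # list stays index-aligned with the input texts.
        results.extend(await asyncio.gather(*(demo_extract(t) for t in batch)))
    return results


if __name__ == "__main__":
    graphs = asyncio.run(batch_extract([f"chunk {i}" for i in range(5)], batch_size=2))
    assert len(graphs) == 5  # text <-> graphs alignment is preserved
```

Batching bounds the number of in-flight LLM calls per round, which addresses the concern the removed TODO in PATCH 01 raised about the capacity of the graph database.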
From e6f6d3308dd1b9aac081d2130e2cc8f72eab99f6 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Tue, 29 Oct 2024 21:45:52 +0800
Subject: [PATCH 15/20] Refactor markdown format in community_summary.py

---
 dbgpt/storage/knowledge_graph/community_summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py
index a5eca38a5..0825efac5 100644
--- a/dbgpt/storage/knowledge_graph/community_summary.py
+++ b/dbgpt/storage/knowledge_graph/community_summary.py
@@ -450,7 +450,7 @@ def delete_vector_name(self, index_name: str):
 - Support with specific references to sources
 - Include relevant entity-relationship pairs
 - Conclude with confidence assessment
-- Use the markdown format of the "quote" to highlight the original text from "GraphRAG"
+- Use the markdown format of the "quote" to highlight the original text (in detail) from "GraphRAG"
 
 =====
 """  # noqa: E501

From 1ff31848cce13d19e5e9f6c41898c8b9c2cbbb0a Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 30 Oct 2024 18:07:40 +0800
Subject: [PATCH 16/20] fix: Refactor graph store configuration and enable/disable graph search

---
 dbgpt/storage/graph_store/base.py          |   8 --
 dbgpt/storage/graph_store/tugraph_store.py |   8 --
 .../community/tugraph_store_adapter.py     | 116 +++++++++++-------
 .../knowledge_graph/community_summary.py   |  35 +++++-
 4 files changed, 101 insertions(+), 66 deletions(-)

diff --git a/dbgpt/storage/graph_store/base.py b/dbgpt/storage/graph_store/base.py
index 5c2112578..a3344eeea 100644
--- a/dbgpt/storage/graph_store/base.py
+++ b/dbgpt/storage/graph_store/base.py
@@ -27,14 +27,6 @@ class GraphStoreConfig(BaseModel):
         default=False,
         description="Enable graph community summary or not.",
     )
-    document_graph_enabled: bool = Field(
-        default=True,
-        description="Enable document graph search or not.",
-    )
-    triplet_graph_enabled: bool = Field(
-        default=True,
-        description="Enable knowledge graph search or not.",
-    )
 
 
 class GraphStoreBase(ABC):

diff --git a/dbgpt/storage/graph_store/tugraph_store.py b/dbgpt/storage/graph_store/tugraph_store.py
index c20965947..4f8437245 100644
--- a/dbgpt/storage/graph_store/tugraph_store.py
+++ b/dbgpt/storage/graph_store/tugraph_store.py
@@ -83,14 +83,6 @@ def __init__(self, config: TuGraphStoreConfig) -> None:
             os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
             or config.enable_summary
         )
-        self._enable_document_graph = (
-            os.getenv("DOCUMENT_GRAPH_ENABLED", "").lower() == "true"
-            or config.document_graph_enabled
-        )
-        self._enable_triplet_graph = (
-            os.getenv("TRIPLET_GRAPH_ENABLED", "").lower() == "true"
-            or config.triplet_graph_enabled
-        )
         self._plugin_names = (
             os.getenv("TUGRAPH_PLUGIN_NAMES", "leiden").split(",")
             or config.plugin_names

diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
index 9a107785c..c2d12be3d 100644
--- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
+++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
@@ -465,14 +465,12 @@ def create_graph_label(
         (vertices) and edges in the graph.
""" if graph_elem_type.is_vertex(): # vertex - data = json.dumps( - { - "label": graph_elem_type.value, - "type": "VERTEX", - "primary": "id", - "properties": graph_properties, - } - ) + data = json.dumps({ + "label": graph_elem_type.value, + "type": "VERTEX", + "primary": "id", + "properties": graph_properties, + }) gql = f"""CALL db.createVertexLabelByJson('{data}')""" else: # edge @@ -498,14 +496,12 @@ def edge_direction(graph_elem_type: GraphElemType) -> List[List[str]]: else: raise ValueError("Invalid graph element type.") - data = json.dumps( - { - "label": graph_elem_type.value, - "type": "EDGE", - "constraints": edge_direction(graph_elem_type), - "properties": graph_properties, - } - ) + data = json.dumps({ + "label": graph_elem_type.value, + "type": "EDGE", + "constraints": edge_direction(graph_elem_type), + "properties": graph_properties, + }) gql = f"""CALL db.createEdgeLabelByJson('{data}')""" self.graph_store.conn.run(gql) @@ -544,7 +540,7 @@ def explore( if not subs: return MemoryGraph() - if depth < 0: + if depth <= 0: depth = 3 depth_string = f"1..{depth}" @@ -560,42 +556,76 @@ def explore( rel = f"<-[r:{GraphElemType.RELATION.value}*{depth_string}]-" else: rel = f"-[r:{GraphElemType.RELATION.value}*{depth_string}]-" - path_query = ( + query = ( f"MATCH p=(n:{GraphElemType.ENTITY.value})" f"{rel}(m:{GraphElemType.ENTITY.value}) " f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " - f"RETURN n {limit_string}" + f"RETURN p {limit_string}" ) - return self.query(path_query, white_list=["description"]) + return self.query(query=query, white_list=["description"]) else: graph = MemoryGraph() + check_entity_query = ( + f"MATCH (n:{GraphElemType.ENTITY.value}) " + f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " + "RETURN n" + ) - for sub in subs: + if self.query(check_entity_query): # Query the chain from documents to chunks, - # document -> chunk -> chunk -> chunk -> ... + # document -> chunk -> ... -> chunk (-> entity, do not reach entity) + chain_query = ( + f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" + f"[:{GraphElemType.INCLUDE.value}*1..{depth + 1}]->" + f"(leaf_chunk:{GraphElemType.CHUNK.value})-[:{GraphElemType.INCLUDE.value}]->" + f"(m:{GraphElemType.ENTITY.value}) " + f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} " + # "WITH n, leaf_chunk " + # f"MATCH p = (n)-[:{GraphElemType.INCLUDE.value}*1..{depth}]->(leaf_chunk:{GraphElemType.CHUNK.value}) " + "RETURN p" + ) + # Filter all the properties by with_list + graph.upsert_graph(self.query(query=chain_query, white_list=[""])) + + # Query the leaf chunks in the chain from documents to chunks + leaf_chunk_query = ( + f"MATCH p=(n:{GraphElemType.CHUNK.value})-" + f"[r:{GraphElemType.INCLUDE.value}]->" + f"(m:{GraphElemType.ENTITY.value})" + f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} " + f"RETURN n {limit_string}" + ) + graph.upsert_graph( + self.query(query=leaf_chunk_query, white_list=["content"]) + ) + else: + _subs_condition = " OR ".join([ + f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs + ]) + + # Query the chain from documents to chunks, + # document -> chunk -> chunk -> chunk -> ... 
-> chunk chain_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" - f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " - f"'{self._escape_quotes(sub)}' " - f"RETURN p" + f"(m:{GraphElemType.CHUNK.value})" + f"WHERE {_subs_condition}" + "RETURN p" ) - # Query and filter all the properties - graph_of_path = self.query(query=chain_query, white_list=[""]) - graph.upsert_graph(graph_of_path) + # Filter all the properties by with_list + graph.upsert_graph(self.query(query=chain_query, white_list=[""])) # Query the leaf chunks in the chain from documents to chunks leaf_chunk_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" - f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " - f"'{self._escape_quotes(sub)}' " + f"(m:{GraphElemType.CHUNK.value})" + f"WHERE {_subs_condition}" f"RETURN m {limit_string}" ) - graph_of_leaf_chunk = self.query( - query=leaf_chunk_query, white_list=["content"] + graph.upsert_graph( + self.query(query=leaf_chunk_query, white_list=["content"]) ) - graph.upsert_graph(graph_of_leaf_chunk) return graph @@ -663,19 +693,15 @@ async def stream_query( # type: ignore[override] rels = list(record["p"].relationships) formatted_path = [] for i in range(len(nodes)): - formatted_path.append( - { - "id": nodes[i]._properties["id"], - "description": nodes[i]._properties["description"], - } - ) + formatted_path.append({ + "id": nodes[i]._properties["id"], + "description": nodes[i]._properties["description"], + }) if i < len(rels): - formatted_path.append( - { - "id": rels[i]._properties["id"], - "description": rels[i]._properties["description"], - } - ) + formatted_path.append({ + "id": rels[i]._properties["id"], + "description": rels[i]._properties["description"], + }) for i in range(0, len(formatted_path), 2): mg.upsert_vertex( Vertex( diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 0825efac5..8b7a1b83d 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -59,6 +59,15 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig): default=0.0, description="Recall score of community search in knowledge graph", ) + triplet_graph_enabled: bool = Field( + default=True, + description="Enable the graph search for triplets", + ) + document_graph_enabled: bool = Field( + default=True, + description="Enable the graph search for documents and chunks", + ) + knowledge_graph_chunk_search_top_size: int = Field( default=5, description="Top size of knowledge graph chunk search", @@ -100,6 +109,20 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): config.community_score_threshold, ) ) + self._document_graph_enabled = bool( + ( + os.environ["DOCUMENT_GRAPH_ENABLED"].lower() == "true" + if "DOCUMENT_GRAPH_ENABLED" in os.environ + else config.document_graph_enabled + ) + ) + self._triplet_graph_enabled = bool( + ( + os.environ["TRIPLET_GRAPH_ENABLED"].lower() == "true" + if "TRIPLET_GRAPH_ENABLED" in os.environ + else config.triplet_graph_enabled + ) + ) self._knowledge_graph_chunk_search_top_size = int( os.getenv( "KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE", @@ -170,7 +193,7 @@ async def _aload_document_graph(self, chunks: List[Chunk]) -> None: The chunks include the doc structure. 
""" - if not self._graph_store.get_config().document_graph_enabled: + if not self._document_graph_enabled: return _chunks: List[ParagraphChunk] = [ @@ -201,10 +224,10 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: The chunks include the doc structure. """ - if not self._graph_store.get_config().triplet_graph_enabled: + if not self._triplet_graph_enabled: return - document_graph_enabled = self._graph_store.get_config().document_graph_enabled + document_graph_enabled = self._document_graph_enabled # Extract the triplets from the chunks, and return the list of graphs # in the same order as the input texts @@ -303,10 +326,12 @@ async def asimilar_search_with_scores( context = "\n".join(summaries) if summaries else "" keywords: List[str] = await self._keyword_extractor.extract(text) + subgraph = None + subgraph_for_doc = None # Local search: extract keywords and explore subgraph - triplet_graph_enabled = self._graph_store.get_config().triplet_graph_enabled - document_graph_enabled = self._graph_store.get_config().document_graph_enabled + triplet_graph_enabled = self._triplet_graph_enabled + document_graph_enabled = self._document_graph_enabled if triplet_graph_enabled: subgraph: MemoryGraph = self._graph_store_apdater.explore( From a8f93216745affd9e2c64720cc91b26812fbb279 Mon Sep 17 00:00:00 2001 From: Appointat Date: Wed, 30 Oct 2024 18:21:30 +0800 Subject: [PATCH 17/20] chore: format the code --- .../community/tugraph_store_adapter.py | 60 +++++++++++-------- .../knowledge_graph/community_summary.py | 3 +- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index c2d12be3d..3e7c73a6c 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -465,12 +465,14 @@ def create_graph_label( (vertices) and edges in the graph. 
""" if graph_elem_type.is_vertex(): # vertex - data = json.dumps({ - "label": graph_elem_type.value, - "type": "VERTEX", - "primary": "id", - "properties": graph_properties, - }) + data = json.dumps( + { + "label": graph_elem_type.value, + "type": "VERTEX", + "primary": "id", + "properties": graph_properties, + } + ) gql = f"""CALL db.createVertexLabelByJson('{data}')""" else: # edge @@ -496,12 +498,14 @@ def edge_direction(graph_elem_type: GraphElemType) -> List[List[str]]: else: raise ValueError("Invalid graph element type.") - data = json.dumps({ - "label": graph_elem_type.value, - "type": "EDGE", - "constraints": edge_direction(graph_elem_type), - "properties": graph_properties, - }) + data = json.dumps( + { + "label": graph_elem_type.value, + "type": "EDGE", + "constraints": edge_direction(graph_elem_type), + "properties": graph_properties, + } + ) gql = f"""CALL db.createEdgeLabelByJson('{data}')""" self.graph_store.conn.run(gql) @@ -577,11 +581,13 @@ def explore( chain_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" f"[:{GraphElemType.INCLUDE.value}*1..{depth + 1}]->" - f"(leaf_chunk:{GraphElemType.CHUNK.value})-[:{GraphElemType.INCLUDE.value}]->" + f"(leaf_chunk:{GraphElemType.CHUNK.value})-" + f"[:{GraphElemType.INCLUDE.value}]->" f"(m:{GraphElemType.ENTITY.value}) " f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} " # "WITH n, leaf_chunk " - # f"MATCH p = (n)-[:{GraphElemType.INCLUDE.value}*1..{depth}]->(leaf_chunk:{GraphElemType.CHUNK.value}) " + # f"MATCH p = (n)-[:{GraphElemType.INCLUDE.value}*1..{depth}]->" + # f"(leaf_chunk:{GraphElemType.CHUNK.value}) " "RETURN p" ) # Filter all the properties by with_list @@ -599,9 +605,9 @@ def explore( self.query(query=leaf_chunk_query, white_list=["content"]) ) else: - _subs_condition = " OR ".join([ - f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs - ]) + _subs_condition = " OR ".join( + [f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs] + ) # Query the chain from documents to chunks, # document -> chunk -> chunk -> chunk -> ... 
From 7e3c3c70e6c2195cb3fdfb732f568e44f8c078ed Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 30 Oct 2024 20:10:22 +0800
Subject: [PATCH 18/20] fix: Refactor TuGraphStoreAdapter to improve graph
 retrieval logic

---
 .../community/tugraph_store_adapter.py        | 65 +++++++++++++------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
index 3e7c73a6c..fa107a28b 100644
--- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
+++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
@@ -568,42 +568,65 @@ def explore(
             )
             return self.query(query=query, white_list=["description"])
         else:
+            # If the entities exist in the graph, return the graph that includes
+            # the leaf chunks that connect to the entities, the chains from
+            # documents to the leaf chunks, and the chain from documents to chunks;
+            # document -> chunk -> chunk -> ... -> leaf chunk -> (entity)
+            #
+            # If not, return the graph that includes the chains from documents
+            # to chunks that contain the subs (keywords).
+            # document -> chunk -> chunk -> ... -> leaf chunk (that contains the subs)
+            #
+            # Only the leaf chunks contain the content; the other chunks carry no
+            # properties except the id and name.
+
             graph = MemoryGraph()
+
+            # Check if the entities exist in the graph
            check_entity_query = (
                 f"MATCH (n:{GraphElemType.ENTITY.value}) "
                 f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} "
                 "RETURN n"
             )
-
             if self.query(check_entity_query):
-                # Query the chain from documents to chunks,
-                # document -> chunk -> ... -> chunk (-> entity, do not reach entity)
-                chain_query = (
-                    f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-"
-                    f"[:{GraphElemType.INCLUDE.value}*1..{depth + 1}]->"
-                    f"(leaf_chunk:{GraphElemType.CHUNK.value})-"
-                    f"[:{GraphElemType.INCLUDE.value}]->"
-                    f"(m:{GraphElemType.ENTITY.value}) "
-                    f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} "
-                    # "WITH n, leaf_chunk "
-                    # f"MATCH p = (n)-[:{GraphElemType.INCLUDE.value}*1..{depth}]->"
-                    # f"(leaf_chunk:{GraphElemType.CHUNK.value}) "
-                    "RETURN p"
-                )
-                # Filter all the properties by with_list
-                graph.upsert_graph(self.query(query=chain_query, white_list=[""]))
-
                 # Query the leaf chunks in the chain from documents to chunks
                 leaf_chunk_query = (
                     f"MATCH p=(n:{GraphElemType.CHUNK.value})-"
                     f"[r:{GraphElemType.INCLUDE.value}]->"
                     f"(m:{GraphElemType.ENTITY.value})"
                     f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} "
-                    f"RETURN n {limit_string}"
+                    f"RETURN n"
                 )
-                graph.upsert_graph(
-                    self.query(query=leaf_chunk_query, white_list=["content"])
+                graph_of_leaf_chunks = self.query(
+                    query=leaf_chunk_query, white_list=["content"]
                 )
+
+                # Query the chain from documents to chunks,
+                # document -> chunk -> ... -> leaf_chunks
+                chunk_names = [
+                    self._escape_quotes(vertex.name)
+                    for vertex in graph_of_leaf_chunks.vertices()
+                ]
+                chain_query = (
+                    f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-"
+                    f"[:{GraphElemType.INCLUDE.value}*{depth_string}]->"
+                    f"(m:{GraphElemType.CHUNK.value})"
+                    f"WHERE m.name IN {chunk_names} "
+                    "RETURN p"
+                )
+                # Filter all the properties by with_list
+                graph.upsert_graph(self.query(query=chain_query, white_list=[""]))
+
+                # Compare the number of leaf chunks to the `limit`
+                if not limit or len(chunk_names) <= limit:
+                    graph.upsert_graph(graph_of_leaf_chunks)
+                else:
+                    limited_leaf_chunk_query = leaf_chunk_query + f" {limit_string}"
+                    graph.upsert_graph(
+                        self.query(
+                            query=limited_leaf_chunk_query, white_list=["content"]
+                        )
+                    )
             else:
                 _subs_condition = " OR ".join(
                     [f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs]
                 )
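The retrieval refactor above reorders the queries: the leaf chunks are fetched once without a LIMIT, their names drive the document-to-chunk chain query, and the LIMIT is applied only when the unlimited result is actually too large. A condensed sketch of that control flow, with `run_query` as a hypothetical stand-in for the adapter's `self.query`:

```python
from typing import Callable, List, Optional


def select_leaf_chunks(
    run_query: Callable[[str], List[str]],
    leaf_chunk_query: str,
    limit: Optional[int],
) -> List[str]:
    """Fetch all matching leaf chunks; re-query with a LIMIT only if needed.

    The unlimited pass is not wasted work: its results also provide the
    chunk names used to build the document -> chunk chain query.
    """
    chunk_names = run_query(leaf_chunk_query)
    if limit is None or len(chunk_names) <= limit:
        return chunk_names
    # Too many results: let the database truncate instead of slicing here,
    # mirroring the patch's `leaf_chunk_query + f" {limit_string}"`.
    return run_query(f"{leaf_chunk_query} LIMIT {limit}")


if __name__ == "__main__":
    data = [f"chunk_{i}" for i in range(5)]
    fake_query = lambda q: data[:2] if "LIMIT" in q else data
    print(select_leaf_chunks(fake_query, "MATCH ...", limit=2))
    # -> ['chunk_0', 'chunk_1']
```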
From 0c263bf364d2b95bc7bdb1341004268f0b0abfa7 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 30 Oct 2024 20:19:55 +0800
Subject: [PATCH 19/20] fix

---
 .../knowledge_graph/community_summary.py      | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py
index badcb5fac..d31b18b46 100644
--- a/dbgpt/storage/knowledge_graph/community_summary.py
+++ b/dbgpt/storage/knowledge_graph/community_summary.py
@@ -108,19 +108,15 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig):
                 config.community_score_threshold,
             )
         )
-        self._document_graph_enabled = bool(
-            (
-                os.environ["DOCUMENT_GRAPH_ENABLED"].lower() == "true"
-                if "DOCUMENT_GRAPH_ENABLED" in os.environ
-                else config.document_graph_enabled
-            )
+        self._document_graph_enabled = (
+            os.environ["DOCUMENT_GRAPH_ENABLED"].lower() == "true"
+            if "DOCUMENT_GRAPH_ENABLED" in os.environ
+            else config.document_graph_enabled
         )
-        self._triplet_graph_enabled = bool(
-            (
-                os.environ["TRIPLET_GRAPH_ENABLED"].lower() == "true"
-                if "TRIPLET_GRAPH_ENABLED" in os.environ
-                else config.triplet_graph_enabled
-            )
+        self._triplet_graph_enabled = (
+            os.environ["TRIPLET_GRAPH_ENABLED"].lower() == "true"
+            if "TRIPLET_GRAPH_ENABLED" in os.environ
+            else config.triplet_graph_enabled
         )
         self._knowledge_graph_chunk_search_top_size = int(
             os.getenv(
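The `bool()` wrappers removed above were harmless but redundant: an `==` comparison already yields a bool. The pattern is worth isolating because the tempting shortcut `bool(os.environ[name])` would be a real bug, since any non-empty string, including "false", is truthy. A self-contained version of the pattern the patch settles on:

```python
import os


def env_flag(name: str, default: bool) -> bool:
    """Read a boolean feature flag from the environment.

    The comparison already returns a bool, so no bool() wrapper is needed.
    Note that bool(os.environ[name]) would be wrong: bool("false") is True.
    """
    if name in os.environ:
        return os.environ[name].lower() == "true"
    return default


# DOCUMENT_GRAPH_ENABLED=false -> False; unset -> the config default.
document_graph_enabled = env_flag("DOCUMENT_GRAPH_ENABLED", True)
print(document_graph_enabled)
```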
From f0216d760ff7a1af728b01e36426e1000a3bc1cf Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 30 Oct 2024 20:22:20 +0800
Subject: [PATCH 20/20] Refactor markdown format in community_summary.py

---
 dbgpt/storage/knowledge_graph/community_summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py
index d31b18b46..62e3e4c13 100644
--- a/dbgpt/storage/knowledge_graph/community_summary.py
+++ b/dbgpt/storage/knowledge_graph/community_summary.py
@@ -393,7 +393,7 @@ def delete_vector_name(self, index_name: str):
 
 HYBRID_SEARCH_PT = """
 =====
-[Context]、[Knowledge Graph]和[Original Text From RAG]的信息，可以帮助你回答更好地用户的问题。
+The following information from [Context], [Knowledge Graph], and [Original Text From RAG] can help you answer user questions better.
 
 [Context]:
 {context}
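Patch 20 translates the hybrid-search prompt header into English. For context, a template like this is typically rendered with `str.format`. Only the header and the `{context}` placeholder are visible in the hunk above, so the template body and the call site below are assumptions for illustration, not the project's exact code:

```python
# Assumed shape of the template; additional sections likely follow the
# [Context] block in the real source but are not shown in the patch.
HYBRID_SEARCH_PT = """=====
The following information from [Context], [Knowledge Graph], and [Original Text From RAG] can help you answer user questions better.

[Context]:
{context}
"""

# Hypothetical call site: fill the placeholder with retrieved summaries.
prompt = HYBRID_SEARCH_PT.format(context="community summaries go here")
print(prompt)
```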