From 89db275ccb1c33c6ce7839d7c3589fef01963cdb Mon Sep 17 00:00:00 2001 From: Appointat Date: Wed, 23 Oct 2024 11:30:24 +0800 Subject: [PATCH 01/20] feat: Improve triplet extraction batch size and handling Co-authored-by: Appointat --- dbgpt/rag/transformer/triplet_extractor.py | 3 +- .../knowledge_graph/community_summary.py | 57 ++++++++++++------- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/dbgpt/rag/transformer/triplet_extractor.py b/dbgpt/rag/transformer/triplet_extractor.py index 7a591560f..60b5346f3 100644 --- a/dbgpt/rag/transformer/triplet_extractor.py +++ b/dbgpt/rag/transformer/triplet_extractor.py @@ -1,4 +1,5 @@ """TripletExtractor class.""" + import logging import re from typing import Any, List, Optional, Tuple @@ -12,7 +13,7 @@ "Some text is provided below. Given the text, " "extract up to knowledge triplets as more as possible " "in the form of (subject, predicate, object).\n" - "Avoid stopwords.\n" + "Avoid stopwords. The subject, predicate, object can not be none.\n" "---------------------\n" "Example:\n" "Text: Alice is Bob's mother.\n" diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 904b0beba..8c24d3f88 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -1,5 +1,6 @@ """Define the CommunitySummaryKnowledgeGraph.""" +import asyncio import logging import os import uuid @@ -63,6 +64,10 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig): default=5, description="Top size of knowledge graph chunk search", ) + triplet_extraction_batch_size: int = Field( + default=20, + description="Batch size of triplets extraction from the text", + ) class CommunitySummaryKnowledgeGraph(BuiltinKnowledgeGraph): @@ -96,6 +101,11 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): config.community_score_threshold, ) ) + self._triplet_extraction_batch_size = int( + os.getenv( + "TRIPLET_EXTRACTION_BATCH_SIZE", config.triplet_extraction_batch_size + ) + ) def extractor_configure(name: str, cfg: VectorStoreConfig): cfg.name = name @@ -189,30 +199,35 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: return document_graph_enabled = self._graph_store.get_config().document_graph_enabled - for chunk in chunks: - # TODO: Use asyncio to extract graph to accelerate the process - # (attention to the CAP of the graph db) + batch_size = self._triplet_extraction_batch_size + + for i in range(0, len(chunks), batch_size): + batch_chunks = chunks[i : i + batch_size] - graphs: List[MemoryGraph] = await self._graph_extractor.extract( - chunk.content + extraction_tasks = [ + self._graph_extractor.extract(chunk.content) for chunk in batch_chunks + ] + async_graphs: List[List[MemoryGraph]] = await asyncio.gather( + *extraction_tasks ) - for graph in graphs: - if document_graph_enabled: - # append the chunk id to the edge - for edge in graph.edges(): - edge.set_prop("_chunk_id", chunk.chunk_id) - graph.append_edge(edge=edge) - - # upsert the graph - self._graph_store_apdater.upsert_graph(graph) - - # chunk -> include -> entity - if document_graph_enabled: - for vertex in graph.vertices(): - self._graph_store_apdater.upsert_chunk_include_entity( - chunk=chunk, entity=vertex - ) + for chunk, graphs in zip(batch_chunks, async_graphs): + for graph in graphs: + if document_graph_enabled: + # append the chunk id to the edge + for edge in graph.edges(): + edge.set_prop("_chunk_id", chunk.chunk_id) + 
graph.append_edge(edge=edge)
+
+                    # upsert the graph
+                    self._graph_store_apdater.upsert_graph(graph)
+
+                    # chunk -> include -> entity
+                    if document_graph_enabled:
+                        for vertex in graph.vertices():
+                            self._graph_store_apdater.upsert_chunk_include_entity(
+                                chunk=chunk, entity=vertex
+                            )
 
     def _load_chunks(
         self, chunks: List[ParagraphChunk]

From f8e3ed1711d7e54a440eb6ec63fa308089fe2995 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 23 Oct 2024 11:42:10 +0800
Subject: [PATCH 02/20] feat: Improve triplet extraction batch size and handling

Co-authored-by: Appointat
---
 .env.template                                   | 1 +
 docs/docs/cookbook/rag/graph_rag_app_develop.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.env.template b/.env.template
index 45e075d67..5adb248be 100644
--- a/.env.template
+++ b/.env.template
@@ -167,6 +167,7 @@ TRIPLET_GRAPH_ENABLED=True # enable the graph search for triplets
 DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks
 KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the top size of knowledge graph search for chunks
+TRIPLET_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text
 
 ### Chroma vector db config
 #CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data

diff --git a/docs/docs/cookbook/rag/graph_rag_app_develop.md b/docs/docs/cookbook/rag/graph_rag_app_develop.md
index c63a66e6e..8c548bc22 100644
--- a/docs/docs/cookbook/rag/graph_rag_app_develop.md
+++ b/docs/docs/cookbook/rag/graph_rag_app_develop.md
@@ -116,6 +116,7 @@ GRAPH_COMMUNITY_SUMMARY_ENABLED=True # enable the graph community summary
 TRIPLET_GRAPH_ENABLED=True # enable the graph search for the triplets
 DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks
 KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the number of the searched triplets in a retrieval
+TRIPLET_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text
 ```

From a57029e561140b9adf8f52387a8909a832818847 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Mon, 28 Oct 2024 16:30:50 +0800
Subject: [PATCH 03/20] refactor: Add batch_extract method to ExtractorBase

Co-authored-by: Appointat
---
 dbgpt/rag/transformer/base.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/dbgpt/rag/transformer/base.py b/dbgpt/rag/transformer/base.py
index 289f02887..ad93101e0 100644
--- a/dbgpt/rag/transformer/base.py
+++ b/dbgpt/rag/transformer/base.py
@@ -1,7 +1,10 @@
 """Transformer base class."""
+
 import logging
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from typing import List, Optional, Union
+
+from dbgpt.core import Chunk
 
 logger = logging.getLogger(__name__)
@@ -37,6 +40,15 @@ class ExtractorBase(TransformerBase, ABC):
     async def extract(self, text: str, limit: Optional[int] = None) -> List:
         """Extract results from text."""
 
+    @abstractmethod
+    async def batch_extract(
+        self,
+        texts: Union[List[str], List[Chunk]],
+        batch_size: int = 1,
+        limit: Optional[int] = None,
+    ) -> List:
+        """Batch extract results from texts."""
+
 
 class TranslatorBase(TransformerBase, ABC):
     """Translator base class."""

From 3fc7640d1e0d544f2ac4eec61e359d3f88845104 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Mon, 28 Oct 2024 16:31:15 +0800
Subject: [PATCH 04/20] refactor: Add batch_extract method to GraphExtractor

Co-authored-by: Appointat
---
 dbgpt/rag/transformer/graph_extractor.py | 83 +++++++++++++++++++-----
 1 file changed, 66 insertions(+), 17 deletions(-)

diff --git a/dbgpt/rag/transformer/graph_extractor.py b/dbgpt/rag/transformer/graph_extractor.py
index 12751e89f..32ecee2ff 100644 --- a/dbgpt/rag/transformer/graph_extractor.py +++ b/dbgpt/rag/transformer/graph_extractor.py @@ -1,8 +1,9 @@ """GraphExtractor class.""" +import asyncio import logging import re -from typing import List, Optional +from typing import Dict, List, Optional, Tuple, Union from dbgpt.core import Chunk, LLMClient from dbgpt.rag.transformer.llm_extractor import LLMExtractor @@ -21,37 +22,85 @@ def __init__( """Initialize the GraphExtractor.""" super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN) self._chunk_history = chunk_history + self._chunk_context_map: Dict[str, str] = {} config = self._chunk_history.get_config() + self._vector_space = config.name self._max_chunks_once_load = config.max_chunks_once_load self._max_threads = config.max_threads self._topk = config.topk self._score_threshold = config.score_threshold - async def extract(self, text: str, limit: Optional[int] = None) -> List: - """Load similar chunks.""" - # load similar chunks - chunks = await self._chunk_history.asimilar_search_with_scores( - text, self._topk, self._score_threshold - ) - history = [ - f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks) - ] - context = "\n".join(history) if history else "" - - try: - # extract with chunk history - return await super()._extract(text, context, limit) + async def aload_chunk_context(self, texts: List[str]) -> None: + """Load chunk context.""" + for text in texts: + # Load similar chunks + chunks = await self._chunk_history.asimilar_search_with_scores( + text, self._topk, self._score_threshold + ) + history = [f"Section {i + 1}:\n{chunk}" for i, chunk in enumerate(chunks)] - finally: - # save chunk to history + # Save chunk to history await self._chunk_history.aload_document_with_limit( [Chunk(content=text, metadata={"relevant_cnt": len(history)})], self._max_chunks_once_load, self._max_threads, ) + # Save chunk context to map + context = "\n".join(history) if history else "" + self._chunk_context_map[text] = context + + async def extract(self, text: str, limit: Optional[int] = None) -> List: + """Load similar chunks. + + Suggestion: to extract triplets in batches, call `batch_extract`. + """ + if text not in self._chunk_context_map: + await self.aload_chunk_context([Chunk(content=text)]) + context = self._chunk_context_map.get(text, "") + + # Extract with chunk history + return await super()._extract(text, context, limit) + + async def batch_extract( + self, + texts: Union[List[str], List[Chunk]], + batch_size: int = 1, + limit: Optional[int] = None, + ) -> List[Tuple[Chunk, List[Graph]]]: + """Extract graphs from chunks in batches.""" + if isinstance(texts, list) and any( + not isinstance(chunk, Chunk) for chunk in texts + ): + raise ValueError("Chunks should be a list of Chunk objects, not strings.") + chunks: List[Chunk] = texts # type: ignore[assignment] + + # 1. Load chunk context + chunk_content_list = [chunk.content for chunk in chunks] + await self.aload_chunk_context(chunk_content_list) + + chunk_graph_pairs: List[Tuple[Chunk, List[Graph]]] = [] + total_batches = (len(chunks) + batch_size - 1) // batch_size + + for batch_idx in range(total_batches): + start_idx = batch_idx * batch_size + end_idx = min((batch_idx + 1) * batch_size, len(chunks)) + batch_chunks = chunks[start_idx:end_idx] + + # 2. Process extraction in parallel + extraction_tasks = [ + self.extract(chunk.content, limit) for chunk in batch_chunks + ] + batch_graphs = await asyncio.gather(*extraction_tasks) + + # 3. 
Zip chunks with their corresponding graphs to maintain the relationship + batch_graph_pairs = list(zip(batch_chunks, batch_graphs)) + chunk_graph_pairs.extend(batch_graph_pairs) + + return chunk_graph_pairs + def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]: graph = MemoryGraph() edge_count = 0 From fee90cc484487181e7acd8e35e076123fc02b53b Mon Sep 17 00:00:00 2001 From: Appointat Date: Mon, 28 Oct 2024 16:31:38 +0800 Subject: [PATCH 05/20] refactor: Add batch_extract method to LLMExtractor Co-authored-by: Appointat --- dbgpt/rag/transformer/llm_extractor.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/dbgpt/rag/transformer/llm_extractor.py b/dbgpt/rag/transformer/llm_extractor.py index 494096d51..c8324c607 100644 --- a/dbgpt/rag/transformer/llm_extractor.py +++ b/dbgpt/rag/transformer/llm_extractor.py @@ -1,9 +1,10 @@ """TripletExtractor class.""" + import logging from abc import ABC, abstractmethod -from typing import List, Optional +from typing import List, Optional, Union -from dbgpt.core import HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest +from dbgpt.core import Chunk, HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest from dbgpt.rag.transformer.base import ExtractorBase logger = logging.getLogger(__name__) @@ -22,6 +23,21 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: """Extract by LLM.""" return await self._extract(text, None, limit) + async def batch_extract( + self, + texts: Union[List[str], List[Chunk]], + batch_size: int = 1, + limit: Optional[int] = None, + ) -> List: + """Batch extract by LLM.""" + if isinstance(texts, list) and any(not isinstance(text, str) for text in texts): + raise ValueError("All elements must be strings") + + results = [] + for text in texts: + results.append(await self.extract(text, limit)) + return results + async def _extract( self, text: str, history: str = None, limit: Optional[int] = None ) -> List: From ccd2cdff970fdad7ecb6fae9dd913748bb581daa Mon Sep 17 00:00:00 2001 From: Appointat Date: Mon, 28 Oct 2024 16:31:58 +0800 Subject: [PATCH 06/20] refactor: Refactor CommunitySummaryKnowledgeGraph batch extraction method Co-authored-by: Appointat --- .../knowledge_graph/community_summary.py | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 8c24d3f88..1f5dd59a0 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -1,6 +1,5 @@ """Define the CommunitySummaryKnowledgeGraph.""" -import asyncio import logging import os import uuid @@ -199,35 +198,28 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: return document_graph_enabled = self._graph_store.get_config().document_graph_enabled - batch_size = self._triplet_extraction_batch_size - for i in range(0, len(chunks), batch_size): - batch_chunks = chunks[i : i + batch_size] - - extraction_tasks = [ - self._graph_extractor.extract(chunk.content) for chunk in batch_chunks - ] - async_graphs: List[List[MemoryGraph]] = await asyncio.gather( - *extraction_tasks - ) + chunk_graph_pairs = await self._graph_extractor.batch_extract( + chunks, batch_size=self._triplet_extraction_batch_size + ) - for chunk, graphs in zip(batch_chunks, async_graphs): - for graph in graphs: - if document_graph_enabled: - # append the chunk id to the edge - for edge in graph.edges(): 
- edge.set_prop("_chunk_id", chunk.chunk_id) - graph.append_edge(edge=edge) - - # upsert the graph - self._graph_store_apdater.upsert_graph(graph) - - # chunk -> include -> entity - if document_graph_enabled: - for vertex in graph.vertices(): - self._graph_store_apdater.upsert_chunk_include_entity( - chunk=chunk, entity=vertex - ) + for chunk, graphs in chunk_graph_pairs: + for graph in graphs: + if document_graph_enabled: + # Append the chunk id to the edge + for edge in graph.edges(): + edge.set_prop("_chunk_id", chunk.chunk_id) + graph.append_edge(edge=edge) + + # Upsert the graph + self._graph_store_apdater.upsert_graph(graph) + + # chunk -> include -> entity + if document_graph_enabled: + for vertex in graph.vertices(): + self._graph_store_apdater.upsert_chunk_include_entity( + chunk=chunk, entity=vertex + ) def _load_chunks( self, chunks: List[ParagraphChunk] From 3f65e4970e86dd1aeb4071a66a388119c1e0737d Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:15:41 +0800 Subject: [PATCH 07/20] refactor: Update knowledge graph extraction batch size --- .env.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.template b/.env.template index 5adb248be..453e2c6d3 100644 --- a/.env.template +++ b/.env.template @@ -167,7 +167,7 @@ TRIPLET_GRAPH_ENABLED=True # enable the graph search for triplets DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the top size of knowledge graph search for chunks -TRIPLET_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text +KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text ### Chroma vector db config #CHROMA_PERSIST_PATH=/root/DB-GPT/pilot/data From a253542110c1176acfe5336486d58b406e0a9bee Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:16:43 +0800 Subject: [PATCH 08/20] refactor: Update knowledge graph extraction batch size --- docs/docs/cookbook/rag/graph_rag_app_develop.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/cookbook/rag/graph_rag_app_develop.md b/docs/docs/cookbook/rag/graph_rag_app_develop.md index 8c548bc22..0cdca75d4 100644 --- a/docs/docs/cookbook/rag/graph_rag_app_develop.md +++ b/docs/docs/cookbook/rag/graph_rag_app_develop.md @@ -116,7 +116,7 @@ GRAPH_COMMUNITY_SUMMARY_ENABLED=True # enable the graph community summary TRIPLET_GRAPH_ENABLED=True # enable the graph search for the triplets DOCUMENT_GRAPH_ENABLED=True # enable the graph search for documents and chunks KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE=5 # the number of the searched triplets in a retrieval -TRIPLET_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text +KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE=20 # the batch size of triplet extraction from the text ``` From c565600ce7ee7355b09dc3102c919d756c0b5ddb Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:19:00 +0800 Subject: [PATCH 09/20] Refactor batch extraction methods in GraphExtractor and LLMExtractor --- dbgpt/rag/transformer/base.py | 6 +- dbgpt/rag/transformer/graph_extractor.py | 78 +++++++++++++++--------- dbgpt/rag/transformer/llm_extractor.py | 25 +++++--- 3 files changed, 68 insertions(+), 41 deletions(-) diff --git a/dbgpt/rag/transformer/base.py b/dbgpt/rag/transformer/base.py index ad93101e0..a71c2da14 100644 --- a/dbgpt/rag/transformer/base.py +++ b/dbgpt/rag/transformer/base.py @@ -2,9 +2,7 @@ import logging from abc import ABC, abstractmethod -from 
typing import List, Optional, Union - -from dbgpt.core import Chunk +from typing import List, Optional logger = logging.getLogger(__name__) @@ -43,7 +41,7 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: @abstractmethod async def batch_extract( self, - texts: Union[List[str], List[Chunk]], + texts: List[str], batch_size: int = 1, limit: Optional[int] = None, ) -> List: diff --git a/dbgpt/rag/transformer/graph_extractor.py b/dbgpt/rag/transformer/graph_extractor.py index 32ecee2ff..ef34ce246 100644 --- a/dbgpt/rag/transformer/graph_extractor.py +++ b/dbgpt/rag/transformer/graph_extractor.py @@ -3,7 +3,7 @@ import asyncio import logging import re -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional from dbgpt.core import Chunk, LLMClient from dbgpt.rag.transformer.llm_extractor import LLMExtractor @@ -22,7 +22,6 @@ def __init__( """Initialize the GraphExtractor.""" super().__init__(llm_client, model_name, GRAPH_EXTRACT_PT_CN) self._chunk_history = chunk_history - self._chunk_context_map: Dict[str, str] = {} config = self._chunk_history.get_config() @@ -32,8 +31,10 @@ def __init__( self._topk = config.topk self._score_threshold = config.score_threshold - async def aload_chunk_context(self, texts: List[str]) -> None: + async def aload_chunk_context(self, texts: List[str]) -> Dict[str, str]: """Load chunk context.""" + text_context_map: Dict[str, str] = {} + for text in texts: # Load similar chunks chunks = await self._chunk_history.asimilar_search_with_scores( @@ -50,56 +51,75 @@ async def aload_chunk_context(self, texts: List[str]) -> None: # Save chunk context to map context = "\n".join(history) if history else "" - self._chunk_context_map[text] = context + text_context_map[text] = context + return text_context_map async def extract(self, text: str, limit: Optional[int] = None) -> List: - """Load similar chunks. + """Extract graphs from text. Suggestion: to extract triplets in batches, call `batch_extract`. """ - if text not in self._chunk_context_map: - await self.aload_chunk_context([Chunk(content=text)]) - context = self._chunk_context_map.get(text, "") + # Load similar chunks + chunks = await self._chunk_history.asimilar_search_with_scores( + text, self._topk, self._score_threshold + ) + history = [f"Section {i + 1}:\n{chunk}" for i, chunk in enumerate(chunks)] + + # Save chunk to history + await self._chunk_history.aload_document_with_limit( + [Chunk(content=text, metadata={"relevant_cnt": len(history)})], + self._max_chunks_once_load, + self._max_threads, + ) + + # Save chunk context to map + context = "\n".join(history) if history else "" # Extract with chunk history return await super()._extract(text, context, limit) async def batch_extract( self, - texts: Union[List[str], List[Chunk]], + texts: List[str], batch_size: int = 1, limit: Optional[int] = None, - ) -> List[Tuple[Chunk, List[Graph]]]: - """Extract graphs from chunks in batches.""" - if isinstance(texts, list) and any( - not isinstance(chunk, Chunk) for chunk in texts - ): - raise ValueError("Chunks should be a list of Chunk objects, not strings.") - chunks: List[Chunk] = texts # type: ignore[assignment] + ) -> List[List[Graph]]: + """Extract graphs from chunks in batches. + Returns list of graphs in same order as input texts (text <-> graphs). + """ # 1. 
Load chunk context - chunk_content_list = [chunk.content for chunk in chunks] - await self.aload_chunk_context(chunk_content_list) + text_context_map = await self.aload_chunk_context(texts) - chunk_graph_pairs: List[Tuple[Chunk, List[Graph]]] = [] - total_batches = (len(chunks) + batch_size - 1) // batch_size + # Pre-allocate results list to maintain order + graphs_list: List[List[Graph]] = [None] * len(texts) + total_batches = (len(texts) + batch_size - 1) // batch_size for batch_idx in range(total_batches): start_idx = batch_idx * batch_size - end_idx = min((batch_idx + 1) * batch_size, len(chunks)) - batch_chunks = chunks[start_idx:end_idx] + end_idx = min((batch_idx + 1) * batch_size, len(texts)) + batch_texts = texts[start_idx:end_idx] - # 2. Process extraction in parallel + # 2. Create tasks with their original indices extraction_tasks = [ - self.extract(chunk.content, limit) for chunk in batch_chunks + ( + idx, + self._extract(text, text_context_map[text], limit), + ) + for idx, text in enumerate(batch_texts, start=start_idx) ] - batch_graphs = await asyncio.gather(*extraction_tasks) - # 3. Zip chunks with their corresponding graphs to maintain the relationship - batch_graph_pairs = list(zip(batch_chunks, batch_graphs)) - chunk_graph_pairs.extend(batch_graph_pairs) + # 3. Process extraction in parallel while keeping track of indices + batch_results = await asyncio.gather( + *(task for _, task in extraction_tasks) + ) + + # 4. Place results in the correct positions + for (idx, _), graphs in zip(extraction_tasks, batch_results): + graphs_list[idx] = graphs - return chunk_graph_pairs + assert all(x is not None for x in graphs_list), "All positions should be filled" + return graphs_list def _parse_response(self, text: str, limit: Optional[int] = None) -> List[Graph]: graph = MemoryGraph() diff --git a/dbgpt/rag/transformer/llm_extractor.py b/dbgpt/rag/transformer/llm_extractor.py index c8324c607..15c985de8 100644 --- a/dbgpt/rag/transformer/llm_extractor.py +++ b/dbgpt/rag/transformer/llm_extractor.py @@ -1,10 +1,11 @@ """TripletExtractor class.""" +import asyncio import logging from abc import ABC, abstractmethod -from typing import List, Optional, Union +from typing import List, Optional -from dbgpt.core import Chunk, HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest +from dbgpt.core import HumanPromptTemplate, LLMClient, ModelMessage, ModelRequest from dbgpt.rag.transformer.base import ExtractorBase logger = logging.getLogger(__name__) @@ -25,17 +26,25 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: async def batch_extract( self, - texts: Union[List[str], List[Chunk]], + texts: List[str], batch_size: int = 1, limit: Optional[int] = None, ) -> List: """Batch extract by LLM.""" - if isinstance(texts, list) and any(not isinstance(text, str) for text in texts): - raise ValueError("All elements must be strings") - results = [] - for text in texts: - results.append(await self.extract(text, limit)) + + for i in range(0, len(texts), batch_size): + batch_texts = texts[i : i + batch_size] + + # Create tasks for current batch + extraction_tasks = [ + self._extract(text, None, limit) for text in batch_texts + ] + + # Execute batch concurrently and wait for all to complete + batch_results = await asyncio.gather(*extraction_tasks) + results.extend(batch_results) + return results async def _extract( From a4e602ed0f552c4d6171818dd80bec7f9fd5fdc4 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:19:05 +0800 Subject: [PATCH 10/20] Refactor 
knowledge graph extraction batch size and method in CommunitySummaryKnowledgeGraph --- .../knowledge_graph/community_summary.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 1f5dd59a0..e33779ee5 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -63,7 +63,7 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig): default=5, description="Top size of knowledge graph chunk search", ) - triplet_extraction_batch_size: int = Field( + knowledge_graph_extraction_batch_size: int = Field( default=20, description="Batch size of triplets extraction from the text", ) @@ -102,7 +102,8 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): ) self._triplet_extraction_batch_size = int( os.getenv( - "TRIPLET_EXTRACTION_BATCH_SIZE", config.triplet_extraction_batch_size + "KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE", + config.knowledge_graph_extraction_batch_size, ) ) @@ -199,16 +200,20 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: document_graph_enabled = self._graph_store.get_config().document_graph_enabled - chunk_graph_pairs = await self._graph_extractor.batch_extract( - chunks, batch_size=self._triplet_extraction_batch_size + # Extract the triplets from the chunks, and return the list of graphs + # in the same order as the input texts + graphs_list = await self._graph_extractor.batch_extract( + [chunk.content for chunk in chunks], + batch_size=self._triplet_extraction_batch_size, ) - for chunk, graphs in chunk_graph_pairs: + # Upsert the graphs into the graph store + for idx, graphs in enumerate(graphs_list): for graph in graphs: if document_graph_enabled: # Append the chunk id to the edge for edge in graph.edges(): - edge.set_prop("_chunk_id", chunk.chunk_id) + edge.set_prop("_chunk_id", chunks[idx].chunk_id) graph.append_edge(edge=edge) # Upsert the graph @@ -218,7 +223,7 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: if document_graph_enabled: for vertex in graph.vertices(): self._graph_store_apdater.upsert_chunk_include_entity( - chunk=chunk, entity=vertex + chunk=chunks[idx], entity=vertex ) def _load_chunks( From 7d4d7f4b15fb67d072bef2ec318dd687a250ce57 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 12:25:35 +0800 Subject: [PATCH 11/20] refactor: Refactor batch extraction methods in GraphExtractor and LLMExtractor --- dbgpt/rag/transformer/graph_extractor.py | 23 ++++++++--------------- dbgpt/rag/transformer/llm_extractor.py | 3 +++ 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/dbgpt/rag/transformer/graph_extractor.py b/dbgpt/rag/transformer/graph_extractor.py index ef34ce246..7a02f74d1 100644 --- a/dbgpt/rag/transformer/graph_extractor.py +++ b/dbgpt/rag/transformer/graph_extractor.py @@ -40,7 +40,9 @@ async def aload_chunk_context(self, texts: List[str]) -> Dict[str, str]: chunks = await self._chunk_history.asimilar_search_with_scores( text, self._topk, self._score_threshold ) - history = [f"Section {i + 1}:\n{chunk}" for i, chunk in enumerate(chunks)] + history = [ + f"Section {i + 1}:\n{chunk.content}" for i, chunk in enumerate(chunks) + ] # Save chunk to history await self._chunk_history.aload_document_with_limit( @@ -60,20 +62,8 @@ async def extract(self, text: str, limit: Optional[int] = None) -> List: Suggestion: to extract triplets in batches, call 
`batch_extract`. """ # Load similar chunks - chunks = await self._chunk_history.asimilar_search_with_scores( - text, self._topk, self._score_threshold - ) - history = [f"Section {i + 1}:\n{chunk}" for i, chunk in enumerate(chunks)] - - # Save chunk to history - await self._chunk_history.aload_document_with_limit( - [Chunk(content=text, metadata={"relevant_cnt": len(history)})], - self._max_chunks_once_load, - self._max_threads, - ) - - # Save chunk context to map - context = "\n".join(history) if history else "" + text_context_map = await self.aload_chunk_context([text]) + context = text_context_map[text] # Extract with chunk history return await super()._extract(text, context, limit) @@ -88,6 +78,9 @@ async def batch_extract( Returns list of graphs in same order as input texts (text <-> graphs). """ + if batch_size < 1: + raise ValueError("batch_size >= 1") + # 1. Load chunk context text_context_map = await self.aload_chunk_context(texts) diff --git a/dbgpt/rag/transformer/llm_extractor.py b/dbgpt/rag/transformer/llm_extractor.py index 15c985de8..049611bc7 100644 --- a/dbgpt/rag/transformer/llm_extractor.py +++ b/dbgpt/rag/transformer/llm_extractor.py @@ -31,6 +31,9 @@ async def batch_extract( limit: Optional[int] = None, ) -> List: """Batch extract by LLM.""" + if batch_size < 1: + raise ValueError("batch_size >= 1") + results = [] for i in range(0, len(texts), batch_size): From 5aaa39351340dddaa900628a9cc89431b8be9e86 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 21:36:19 +0800 Subject: [PATCH 12/20] feat: Refactor knowledge graph extraction batch size and method in TuGraphStoreAdapter --- .../community/tugraph_store_adapter.py | 65 +++++++++++++------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index d65969d76..577a68a4b 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -560,29 +560,42 @@ def explore( rel = f"<-[r:{GraphElemType.RELATION.value}*{depth_string}]-" else: rel = f"-[r:{GraphElemType.RELATION.value}*{depth_string}]-" - query = ( + path_query = ( f"MATCH p=(n:{GraphElemType.ENTITY.value})" f"{rel}(m:{GraphElemType.ENTITY.value}) " f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " - f"RETURN p {limit_string}" + f"RETURN n {limit_string}" ) - return self.query(query) + return self.query(path_query, white_list=["description"]) else: graph = MemoryGraph() for sub in subs: - query = ( + # Query the chain from documents to chunks, + # document -> chunk -> chunk -> chunk -> ... 
+ chain_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" - f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]-" + f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " f"'{self._escape_quotes(sub)}' " - f"RETURN p {limit_string}" - ) # if it contains the subjects - result = self.query(query) - for vertex in result.vertices(): - graph.upsert_vertex(vertex) - for edge in result.edges(): - graph.append_edge(edge) + f"RETURN p" + ) + # Query and filter all the properties + graph_of_path = self.query(query=chain_query, white_list=[""]) + graph.upsert_graph(graph_of_path) + + # Query the leaf chunks in the chain from documents to chunks + leaf_chunk_query = ( + f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" + f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" + f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " + f"'{self._escape_quotes(sub)}' " + f"RETURN m {limit_string}" + ) + graph_of_leaf_chunk = self.query( + query=leaf_chunk_query, white_list=["content"] + ) + graph.upsert_graph(graph_of_leaf_chunk) return graph @@ -607,6 +620,7 @@ def query(self, query: str, **kwargs) -> MemoryGraph: vertices, edges = self._get_nodes_edges_from_queried_data( query_result, white_list ) + mg = MemoryGraph() for vertex in vertices: mg.upsert_vertex(vertex) @@ -714,7 +728,7 @@ def _get_nodes_edges_from_queried_data( from neo4j import graph def filter_properties( - properties: dict[str, Any], white_list: List[str] + properties: dict[str, Any], white_list: List[str] = None ) -> Dict[str, Any]: """Filter the properties. @@ -723,13 +737,26 @@ def filter_properties( entity_properties = ["id", "name", "description", "_document_id", "_chunk_id", "_community_id"] edge_properties = ["id", "name", "description", "_chunk_id"] + Args: + properties: Dictionary of properties to filter + white_list: List of properties to keep + - If None: Keep default properties (those not starting with '_' + and not in ['id', 'name']) + - If [""]: Remove all properties (return empty dict) + - If list of strings: Keep only properties in white_list """ - return { - key: value - for key, value in properties.items() - if (not key.startswith("_") and key not in ["id", "name"]) - or key in white_list - } + return ( + {} + if white_list == [""] + else { + key: value + for key, value in properties.items() + if ( + (not key.startswith("_") and key not in ["id", "name"]) + or (white_list is not None and key in white_list) + ) + } + ) # Parse the data to nodes and relationships for record in data: From e8b82dba222a99815cf0ec04ceb5f66f7c4ba726 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 21:36:26 +0800 Subject: [PATCH 13/20] refactor: Update knowledge graph extraction batch size and method in CommunitySummaryKnowledgeGraph --- .../knowledge_graph/community_summary.py | 255 ++++++------------ 1 file changed, 84 insertions(+), 171 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index e33779ee5..a5eca38a5 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -100,6 +100,12 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): config.community_score_threshold, ) ) + self._knowledge_graph_chunk_search_top_size = int( + os.getenv( + "KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE", + config.knowledge_graph_chunk_search_top_size, + ) + ) self._triplet_extraction_batch_size = int( os.getenv( 
"KNOWLEDGE_GRAPH_EXTRACTION_BATCH_SIZE", @@ -314,14 +320,14 @@ async def asimilar_search_with_scores( subgraph_for_doc = self._graph_store_apdater.explore( subs=keywords_for_document_graph, - limit=self._config.knowledge_graph_chunk_search_top_size, + limit=self._knowledge_graph_chunk_search_top_size, search_scope="document_graph", ) else: if document_graph_enabled: subgraph_for_doc = self._graph_store_apdater.explore( subs=keywords, - limit=self._config.knowledge_graph_chunk_search_top_size, + limit=self._knowledge_graph_chunk_search_top_size, search_scope="document_graph", ) knowledge_graph_str = subgraph.format() if subgraph else "" @@ -335,7 +341,7 @@ async def asimilar_search_with_scores( return [] # merge search results into context - content = HYBRID_SEARCH_PT_CN.format( + content = HYBRID_SEARCH_PT.format( context=context, knowledge_graph=knowledge_graph_str, knowledge_graph_for_doc=knowledge_graph_for_doc_str, @@ -365,179 +371,86 @@ def delete_vector_name(self, index_name: str): self._graph_extractor.drop() -HYBRID_SEARCH_PT_CN = """## 角色 -你非常擅长结合提示词模板提供的[上下文]信息与[知识图谱]信息, -准确恰当地回答用户的问题,并保证不会输出与上下文和知识图谱无关的信息。 - -## 技能 -### 技能 1: 上下文理解 -- 准确地理解[上下文]提供的信息,上下文信息可能被拆分为多个章节。 -- 上下文的每个章节内容都会以[Section]开始,并按需进行了编号。 -- 上下文信息提供了与用户问题相关度最高的总结性描述,请合理使用它们。 -### 技能 2: 知识图谱理解 -- 准确地识别[知识图谱]中提供的[Entities:]章节中的实体信息和[Relationships:]章节中的关系信息,实体和关系信息的一般格式为: -``` -* 实体信息格式: -- (实体名) -- (实体名:实体描述) -- (实体名:实体属性表) -- (文本块ID:文档块内容) -- (目录ID:目录名) -- (文档ID:文档名称) - -* 关系信息的格式: -- (来源实体名)-[关系名]->(目标实体名) -- (来源实体名)-[关系名:关系描述]->(目标实体名) -- (来源实体名)-[关系名:关系属性表]->(目标实体名) -- (文本块实体)-[包含]->(实体名) -- (目录ID)-[包含]->(文本块实体) -- (目录ID)-[包含]->(子目录ID) -- (文档ID)-[包含]->(文本块实体) -- (文档ID)-[包含]->(目录ID) -``` -- 正确地将关系信息中的实体名/ID与实体信息关联,还原出图结构。 -- 将图结构所表达的信息作为用户提问的明细上下文,辅助生成更好的答案。 - - -## 约束条件 -- 不要在答案中描述你的思考过程,直接给出用户问题的答案,不要生成无关信息。 -- 若[知识图谱]或者[知识库原文]没有提供信息,此时应根据[上下文]提供的信息回答问题。 -- 确保以第三人称书写,从客观角度结合[上下文]、[知识图谱]和[知识库原文]表达的信息回答问题。 -- 若提供的信息相互矛盾,请解决矛盾并提供一个单一、连贯的描述。 -- 避免使用停用词和过于常见的词汇。 - -## 参考案例 -``` -[上下文]: -Section 1: -菲尔・贾伯的大儿子叫雅各布・贾伯。 -Section 2: -菲尔・贾伯的小儿子叫比尔・贾伯。 - -[知识图谱]: -Entities: -(菲尔・贾伯#菲尔兹咖啡创始人) -(菲尔兹咖啡#加利福尼亚州伯克利创立的咖啡品牌) -(雅各布・贾伯#菲尔・贾伯的儿子) -(美国多地#菲尔兹咖啡的扩展地区) - -Relationships: -(菲尔・贾伯#创建#菲尔兹咖啡#1978年在加利福尼亚州伯克利创立) -(菲尔兹咖啡#位于#加利福尼亚州伯克利#菲尔兹咖啡的创立地点) -(菲尔・贾伯#拥有#雅各布・贾伯#菲尔・贾伯的儿子) -(雅各布・贾伯#担任#首席执行官#在2005年成为菲尔兹咖啡的首席执行官) -(菲尔兹咖啡#扩展至#美国多地#菲尔兹咖啡的扩展范围) - -[知识库原文]: -... -``` - ----- - -接下来的[上下文]、[知识图谱]和[知识库原文]的信息,可以帮助你回答更好地用户的问题。 - -[上下文]: -{context} - -[知识图谱]: -{knowledge_graph} - -[知识库原文] -{knowledge_graph_for_doc} -""" # noqa: E501 - -HYBRID_SEARCH_PT_EN = """## Role -You excel at combining the information provided in the [Context] with -information from the [KnowledgeGraph] to accurately and appropriately -answer user questions, ensuring that you do not output information -unrelated to the context and knowledge graph. - -## Skills -### Skill 1: Context Understanding -- Accurately understand the information provided in the [Context], -which may be divided into several sections. -- Each section in the context will start with [Section] -and may be numbered as needed. -- The context provides a summary description most relevant to the user's -question, and it should be used wisely. -### Skill 2: Knowledge Graph Understanding -- Accurately identify entity information in the [Entities:] section and -relationship information in the [Relationships:] section -of the [KnowledgeGraph]. 
The general format for entity -and relationship information is: -``` -* Entity Information Format: -- (entity_name) -- (entity_name: entity_description) -- (entity_name: entity_property_map) -- (chunk_id: chunk_content) -- (catalog_id: catalog_name) -- (document_id: document_name) - -* Relationship Information Format: -- (source_entity_name)-[relationship_name]->(target_entity_name) -- (source_entity_name)-[relationship_name: relationship_description]->(target_entity_name) -- (source_entity_name)-[relationship_name: relationship_property_map]->(target_entity_name) -- (chunk_id)-[Contains]->(entity_name) -- (catalog_id)-[Contains]->(chunk_id) -- (catalog_id)-[Contains]->(sub_catalog_id) -- (document_id)-[Contains]->(chunk_id) -- (document_id)-[Contains]->(catalog_id) -``` -- Correctly associate entity names/IDs in the relationship information -with entity information to restore the graph structure. -- Use the information expressed by the graph structure as detailed -context for the user's query to assist in generating better answers. - -## Constraints -- Don't describe your thought process in the answer, provide the answer -to the user's question directly without generating irrelevant information. -- If the [KnowledgeGraph] or [Knowledge base original text] does not provide information, you should answer -the question based on the information provided in the [Context]. -- Ensure to write in the third person, responding to questions from -an objective perspective based on the information combined from the -[Context], the [KnowledgeGraph] and the [Knowledge base original text]. -- If the provided information is contradictory, resolve the -contradictions and provide a single, coherent description. -- Avoid using stop words and overly common vocabulary. - -## Reference Example -``` -[Context]: -Section 1: -Phil Schiller's eldest son is Jacob Schiller. -Section 2: -Phil Schiller's youngest son is Bill Schiller. - -[KnowledgeGraph]: -Entities: -(Phil Jaber#Founder of Philz Coffee) -(Philz Coffee#Coffee brand founded in Berkeley, California) -(Jacob Jaber#Son of Phil Jaber) -(Multiple locations in the USA#Expansion regions of Philz Coffee) - -Relationships: -(Phil Jaber#Created#Philz Coffee#Founded in Berkeley, California in 1978) -(Philz Coffee#Located in#Berkeley, California#Founding location of Philz Coffee) -(Phil Jaber#Has#Jacob Jaber#Son of Phil Jaber) -(Jacob Jaber#Serves as#CEO#Became CEO of Philz Coffee in 2005) -(Philz Coffee#Expanded to#Multiple locations in the USA#Expansion regions of Philz Coffee) - -[Knowledge base original text] -... -``` - ----- - -The following information from the [Context], [KnowledgeGraph] and [Knowledge base original text] -can help you better answer user questions. +HYBRID_SEARCH_PT = """ +===== +[Context]、[Knowledge Graph]和[Original Text From RAG]的信息,可以帮助你回答更好地用户的问题。 [Context]: {context} -[KnowledgeGraph]: +[Knowledge Graph]: {knowledge_graph} -[Knowledge base original text] +[Original Text From RAG] {knowledge_graph_for_doc} +===== + +You are very good at combining the [Context] information provided by the prompt word template with the [Knowledge Graph] information, +answering the user's questions accurately and appropriately, and ensuring that no information irrelevant to the context and knowledge graph is output. + +## Role: GraphRAG Assistant + +### Core Capabilities +0. Make sure DO NOT answer irrelevant questions from the user. + +1. 
Information Processing +- Process contextual information across multiple sections ([Section] markers) +- Interpret knowledge graph relationships ((entity)-[relationship]->(entity)) +- Synthesize information from both structured and unstructured sources + +2. Response Generation +- Provide nuanced, multi-perspective answers +- Balance technical accuracy with conversational engagement +- Connect related concepts across different information sources +- Highlight uncertainties and limitations when appropriate + +3. Interaction Style +- Maintain a natural, engaging conversation flow +- Ask clarifying questions when needed +- Provide examples and analogies to illustrate complex points +- Adapt explanation depth based on user's apparent expertise + +4. Knowledge Integration +- Seamlessly blend information from: + * Context sections + * Knowledge graph relationships + * Background knowledge (when appropriate) +- Prioritize relevance over comprehensiveness +- Acknowledge information gaps explicitly + +5. Quality Assurance +- Verify logical consistency across sources +- Cross-reference relationships for validation +- Flag potential contradictions or ambiguities +- Provide confidence levels when appropriate + +### Information Sources Handling +1. Context Processing [Context] +- Parse information from numbered sections systematically +- Identify key concepts and relationships within each section +- Track section dependencies and cross-references +- Prioritize recent/relevant sections for the query + +2. Knowledge Graph Integration [Knowledge Graph] +- Parse Entities and Relationships sections separately +- Map entity-relationship-entity triples accurately +- Understand relationship directionality +- Use graph structure to find connected information + +3. Original Text Reference [Original Text From RAG] +- The GraphRAG document directory is stored as an edge in relationships to show the hierarchy of the current source text in the entire document. +- Use as authoritative source for detailed information +- Cross-reference with Context and Knowledge Graph +- Extract supporting evidence and examples +- Resolve conflicts between sources using this as primary reference + +### Output Format +1. Answer Structure +- Lead with synthesized core information +- Support with specific references to sources +- Include relevant entity-relationship pairs +- Conclude with confidence assessment +- Use the markdown format of the "quote" to highlight the original text from "GraphRAG" + +===== """ # noqa: E501 From 0b872189c9ede6174cae2173650d8fc8f0f33905 Mon Sep 17 00:00:00 2001 From: Appointat Date: Tue, 29 Oct 2024 21:43:58 +0800 Subject: [PATCH 14/20] Refactor method signature in TuGraphStoreAdapter --- .../storage/knowledge_graph/community/tugraph_store_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index 577a68a4b..9a107785c 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -728,7 +728,7 @@ def _get_nodes_edges_from_queried_data( from neo4j import graph def filter_properties( - properties: dict[str, Any], white_list: List[str] = None + properties: dict[str, Any], white_list: Optional[List[str]] = None ) -> Dict[str, Any]: """Filter the properties. 
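Note: patches 03-11 converge on a single batching idiom — slice the input texts, fan each slice out with asyncio.gather, and keep the results index-aligned with the inputs so each chunk can be zipped back to its graphs. A minimal, self-contained sketch of that idiom (`demo_extract` is an illustrative stand-in for the LLM-backed `_extract` call, not part of these diffs):

```python
import asyncio
from typing import Any, List


async def demo_extract(text: str) -> List[Any]:
    """Illustrative stand-in for an LLM-backed extraction call."""
    await asyncio.sleep(0.01)  # simulate one LLM round trip
    return [f"graph-from:{text}"]


async def batch_extract(texts: List[str], batch_size: int = 1) -> List[List[Any]]:
    """Slice inputs, extract each slice concurrently, keep results ordered."""
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    results: List[List[Any]] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        # asyncio.gather returns results in submission order, so the output
        # list stays index-aligned with the input texts.
        results.extend(await asyncio.gather(*(demo_extract(t) for t in batch)))
    return results


if __name__ == "__main__":
    graphs = asyncio.run(batch_extract([f"chunk {i}" for i in range(5)], batch_size=2))
    assert len(graphs) == 5  # text <-> graphs alignment is preserved
```

Batching bounds the number of in-flight LLM calls per round, which addresses the concern the removed TODO in PATCH 01 raised about the capacity of the graph database.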
From e6f6d3308dd1b9aac081d2130e2cc8f72eab99f6 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Tue, 29 Oct 2024 21:45:52 +0800
Subject: [PATCH 15/20] Refactor markdown format in community_summary.py

---
 dbgpt/storage/knowledge_graph/community_summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py
index a5eca38a5..0825efac5 100644
--- a/dbgpt/storage/knowledge_graph/community_summary.py
+++ b/dbgpt/storage/knowledge_graph/community_summary.py
@@ -450,7 +450,7 @@ def delete_vector_name(self, index_name: str):
 - Support with specific references to sources
 - Include relevant entity-relationship pairs
 - Conclude with confidence assessment
-- Use the markdown format of the "quote" to highlight the original text from "GraphRAG"
+- Use the markdown format of the "quote" to highlight the original text (in detail) from "GraphRAG"
 
 =====
 """  # noqa: E501

From 1ff31848cce13d19e5e9f6c41898c8b9c2cbbb0a Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 30 Oct 2024 18:07:40 +0800
Subject: [PATCH 16/20] fix: Refactor graph store configuration and enable/disable graph search

---
 dbgpt/storage/graph_store/base.py          |   8 --
 dbgpt/storage/graph_store/tugraph_store.py |   8 --
 .../community/tugraph_store_adapter.py     | 116 +++++++++++-------
 .../knowledge_graph/community_summary.py   |  35 +++++-
 4 files changed, 101 insertions(+), 66 deletions(-)

diff --git a/dbgpt/storage/graph_store/base.py b/dbgpt/storage/graph_store/base.py
index 5c2112578..a3344eeea 100644
--- a/dbgpt/storage/graph_store/base.py
+++ b/dbgpt/storage/graph_store/base.py
@@ -27,14 +27,6 @@ class GraphStoreConfig(BaseModel):
         default=False,
         description="Enable graph community summary or not.",
     )
-    document_graph_enabled: bool = Field(
-        default=True,
-        description="Enable document graph search or not.",
-    )
-    triplet_graph_enabled: bool = Field(
-        default=True,
-        description="Enable knowledge graph search or not.",
-    )
 
 
 class GraphStoreBase(ABC):

diff --git a/dbgpt/storage/graph_store/tugraph_store.py b/dbgpt/storage/graph_store/tugraph_store.py
index c20965947..4f8437245 100644
--- a/dbgpt/storage/graph_store/tugraph_store.py
+++ b/dbgpt/storage/graph_store/tugraph_store.py
@@ -83,14 +83,6 @@ def __init__(self, config: TuGraphStoreConfig) -> None:
             os.getenv("GRAPH_COMMUNITY_SUMMARY_ENABLED", "").lower() == "true"
             or config.enable_summary
         )
-        self._enable_document_graph = (
-            os.getenv("DOCUMENT_GRAPH_ENABLED", "").lower() == "true"
-            or config.document_graph_enabled
-        )
-        self._enable_triplet_graph = (
-            os.getenv("TRIPLET_GRAPH_ENABLED", "").lower() == "true"
-            or config.triplet_graph_enabled
-        )
         self._plugin_names = (
             os.getenv("TUGRAPH_PLUGIN_NAMES", "leiden").split(",")
             or config.plugin_names

diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
index 9a107785c..c2d12be3d 100644
--- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
+++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
@@ -465,14 +465,12 @@ def create_graph_label(
         (vertices) and edges in the graph.
""" if graph_elem_type.is_vertex(): # vertex - data = json.dumps( - { - "label": graph_elem_type.value, - "type": "VERTEX", - "primary": "id", - "properties": graph_properties, - } - ) + data = json.dumps({ + "label": graph_elem_type.value, + "type": "VERTEX", + "primary": "id", + "properties": graph_properties, + }) gql = f"""CALL db.createVertexLabelByJson('{data}')""" else: # edge @@ -498,14 +496,12 @@ def edge_direction(graph_elem_type: GraphElemType) -> List[List[str]]: else: raise ValueError("Invalid graph element type.") - data = json.dumps( - { - "label": graph_elem_type.value, - "type": "EDGE", - "constraints": edge_direction(graph_elem_type), - "properties": graph_properties, - } - ) + data = json.dumps({ + "label": graph_elem_type.value, + "type": "EDGE", + "constraints": edge_direction(graph_elem_type), + "properties": graph_properties, + }) gql = f"""CALL db.createEdgeLabelByJson('{data}')""" self.graph_store.conn.run(gql) @@ -544,7 +540,7 @@ def explore( if not subs: return MemoryGraph() - if depth < 0: + if depth <= 0: depth = 3 depth_string = f"1..{depth}" @@ -560,42 +556,76 @@ def explore( rel = f"<-[r:{GraphElemType.RELATION.value}*{depth_string}]-" else: rel = f"-[r:{GraphElemType.RELATION.value}*{depth_string}]-" - path_query = ( + query = ( f"MATCH p=(n:{GraphElemType.ENTITY.value})" f"{rel}(m:{GraphElemType.ENTITY.value}) " f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " - f"RETURN n {limit_string}" + f"RETURN p {limit_string}" ) - return self.query(path_query, white_list=["description"]) + return self.query(query=query, white_list=["description"]) else: graph = MemoryGraph() + check_entity_query = ( + f"MATCH (n:{GraphElemType.ENTITY.value}) " + f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} " + "RETURN n" + ) - for sub in subs: + if self.query(check_entity_query): # Query the chain from documents to chunks, - # document -> chunk -> chunk -> chunk -> ... + # document -> chunk -> ... -> chunk (-> entity, do not reach entity) + chain_query = ( + f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" + f"[:{GraphElemType.INCLUDE.value}*1..{depth + 1}]->" + f"(leaf_chunk:{GraphElemType.CHUNK.value})-[:{GraphElemType.INCLUDE.value}]->" + f"(m:{GraphElemType.ENTITY.value}) " + f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} " + # "WITH n, leaf_chunk " + # f"MATCH p = (n)-[:{GraphElemType.INCLUDE.value}*1..{depth}]->(leaf_chunk:{GraphElemType.CHUNK.value}) " + "RETURN p" + ) + # Filter all the properties by with_list + graph.upsert_graph(self.query(query=chain_query, white_list=[""])) + + # Query the leaf chunks in the chain from documents to chunks + leaf_chunk_query = ( + f"MATCH p=(n:{GraphElemType.CHUNK.value})-" + f"[r:{GraphElemType.INCLUDE.value}]->" + f"(m:{GraphElemType.ENTITY.value})" + f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} " + f"RETURN n {limit_string}" + ) + graph.upsert_graph( + self.query(query=leaf_chunk_query, white_list=["content"]) + ) + else: + _subs_condition = " OR ".join([ + f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs + ]) + + # Query the chain from documents to chunks, + # document -> chunk -> chunk -> chunk -> ... 
-> chunk chain_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" - f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " - f"'{self._escape_quotes(sub)}' " - f"RETURN p" + f"(m:{GraphElemType.CHUNK.value})" + f"WHERE {_subs_condition}" + "RETURN p" ) - # Query and filter all the properties - graph_of_path = self.query(query=chain_query, white_list=[""]) - graph.upsert_graph(graph_of_path) + # Filter all the properties by with_list + graph.upsert_graph(self.query(query=chain_query, white_list=[""])) # Query the leaf chunks in the chain from documents to chunks leaf_chunk_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" f"[r:{GraphElemType.INCLUDE.value}*{depth_string}]->" - f"(m:{GraphElemType.CHUNK.value})WHERE m.content CONTAINS " - f"'{self._escape_quotes(sub)}' " + f"(m:{GraphElemType.CHUNK.value})" + f"WHERE {_subs_condition}" f"RETURN m {limit_string}" ) - graph_of_leaf_chunk = self.query( - query=leaf_chunk_query, white_list=["content"] + graph.upsert_graph( + self.query(query=leaf_chunk_query, white_list=["content"]) ) - graph.upsert_graph(graph_of_leaf_chunk) return graph @@ -663,19 +693,15 @@ async def stream_query( # type: ignore[override] rels = list(record["p"].relationships) formatted_path = [] for i in range(len(nodes)): - formatted_path.append( - { - "id": nodes[i]._properties["id"], - "description": nodes[i]._properties["description"], - } - ) + formatted_path.append({ + "id": nodes[i]._properties["id"], + "description": nodes[i]._properties["description"], + }) if i < len(rels): - formatted_path.append( - { - "id": rels[i]._properties["id"], - "description": rels[i]._properties["description"], - } - ) + formatted_path.append({ + "id": rels[i]._properties["id"], + "description": rels[i]._properties["description"], + }) for i in range(0, len(formatted_path), 2): mg.upsert_vertex( Vertex( diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py index 0825efac5..8b7a1b83d 100644 --- a/dbgpt/storage/knowledge_graph/community_summary.py +++ b/dbgpt/storage/knowledge_graph/community_summary.py @@ -59,6 +59,15 @@ class CommunitySummaryKnowledgeGraphConfig(BuiltinKnowledgeGraphConfig): default=0.0, description="Recall score of community search in knowledge graph", ) + triplet_graph_enabled: bool = Field( + default=True, + description="Enable the graph search for triplets", + ) + document_graph_enabled: bool = Field( + default=True, + description="Enable the graph search for documents and chunks", + ) + knowledge_graph_chunk_search_top_size: int = Field( default=5, description="Top size of knowledge graph chunk search", @@ -100,6 +109,20 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig): config.community_score_threshold, ) ) + self._document_graph_enabled = bool( + ( + os.environ["DOCUMENT_GRAPH_ENABLED"].lower() == "true" + if "DOCUMENT_GRAPH_ENABLED" in os.environ + else config.document_graph_enabled + ) + ) + self._triplet_graph_enabled = bool( + ( + os.environ["TRIPLET_GRAPH_ENABLED"].lower() == "true" + if "TRIPLET_GRAPH_ENABLED" in os.environ + else config.triplet_graph_enabled + ) + ) self._knowledge_graph_chunk_search_top_size = int( os.getenv( "KNOWLEDGE_GRAPH_CHUNK_SEARCH_TOP_SIZE", @@ -170,7 +193,7 @@ async def _aload_document_graph(self, chunks: List[Chunk]) -> None: The chunks include the doc structure. 
""" - if not self._graph_store.get_config().document_graph_enabled: + if not self._document_graph_enabled: return _chunks: List[ParagraphChunk] = [ @@ -201,10 +224,10 @@ async def _aload_triplet_graph(self, chunks: List[Chunk]) -> None: The chunks include the doc structure. """ - if not self._graph_store.get_config().triplet_graph_enabled: + if not self._triplet_graph_enabled: return - document_graph_enabled = self._graph_store.get_config().document_graph_enabled + document_graph_enabled = self._document_graph_enabled # Extract the triplets from the chunks, and return the list of graphs # in the same order as the input texts @@ -303,10 +326,12 @@ async def asimilar_search_with_scores( context = "\n".join(summaries) if summaries else "" keywords: List[str] = await self._keyword_extractor.extract(text) + subgraph = None + subgraph_for_doc = None # Local search: extract keywords and explore subgraph - triplet_graph_enabled = self._graph_store.get_config().triplet_graph_enabled - document_graph_enabled = self._graph_store.get_config().document_graph_enabled + triplet_graph_enabled = self._triplet_graph_enabled + document_graph_enabled = self._document_graph_enabled if triplet_graph_enabled: subgraph: MemoryGraph = self._graph_store_apdater.explore( From a8f93216745affd9e2c64720cc91b26812fbb279 Mon Sep 17 00:00:00 2001 From: Appointat Date: Wed, 30 Oct 2024 18:21:30 +0800 Subject: [PATCH 17/20] chore: format the code --- .../community/tugraph_store_adapter.py | 60 +++++++++++-------- .../knowledge_graph/community_summary.py | 3 +- 2 files changed, 36 insertions(+), 27 deletions(-) diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py index c2d12be3d..3e7c73a6c 100644 --- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py +++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py @@ -465,12 +465,14 @@ def create_graph_label( (vertices) and edges in the graph. 
""" if graph_elem_type.is_vertex(): # vertex - data = json.dumps({ - "label": graph_elem_type.value, - "type": "VERTEX", - "primary": "id", - "properties": graph_properties, - }) + data = json.dumps( + { + "label": graph_elem_type.value, + "type": "VERTEX", + "primary": "id", + "properties": graph_properties, + } + ) gql = f"""CALL db.createVertexLabelByJson('{data}')""" else: # edge @@ -496,12 +498,14 @@ def edge_direction(graph_elem_type: GraphElemType) -> List[List[str]]: else: raise ValueError("Invalid graph element type.") - data = json.dumps({ - "label": graph_elem_type.value, - "type": "EDGE", - "constraints": edge_direction(graph_elem_type), - "properties": graph_properties, - }) + data = json.dumps( + { + "label": graph_elem_type.value, + "type": "EDGE", + "constraints": edge_direction(graph_elem_type), + "properties": graph_properties, + } + ) gql = f"""CALL db.createEdgeLabelByJson('{data}')""" self.graph_store.conn.run(gql) @@ -577,11 +581,13 @@ def explore( chain_query = ( f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-" f"[:{GraphElemType.INCLUDE.value}*1..{depth + 1}]->" - f"(leaf_chunk:{GraphElemType.CHUNK.value})-[:{GraphElemType.INCLUDE.value}]->" + f"(leaf_chunk:{GraphElemType.CHUNK.value})-" + f"[:{GraphElemType.INCLUDE.value}]->" f"(m:{GraphElemType.ENTITY.value}) " f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} " # "WITH n, leaf_chunk " - # f"MATCH p = (n)-[:{GraphElemType.INCLUDE.value}*1..{depth}]->(leaf_chunk:{GraphElemType.CHUNK.value}) " + # f"MATCH p = (n)-[:{GraphElemType.INCLUDE.value}*1..{depth}]->" + # f"(leaf_chunk:{GraphElemType.CHUNK.value}) " "RETURN p" ) # Filter all the properties by with_list @@ -599,9 +605,9 @@ def explore( self.query(query=leaf_chunk_query, white_list=["content"]) ) else: - _subs_condition = " OR ".join([ - f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs - ]) + _subs_condition = " OR ".join( + [f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs] + ) # Query the chain from documents to chunks, # document -> chunk -> chunk -> chunk -> ... 
From 7e3c3c70e6c2195cb3fdfb732f568e44f8c078ed Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 30 Oct 2024 20:10:22 +0800
Subject: [PATCH 18/20] fix: Refactor TuGraphStoreAdapter to improve graph
 retrieval logic

---
 .../community/tugraph_store_adapter.py        | 65 +++++++++++++------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
index 3e7c73a6c..fa107a28b 100644
--- a/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
+++ b/dbgpt/storage/knowledge_graph/community/tugraph_store_adapter.py
@@ -568,42 +568,65 @@ def explore(
             )
             return self.query(query=query, white_list=["description"])
         else:
+            # If the entities exist in the graph, return the graph that includes
+            # the leaf chunks that connect to the entities, the chains from
+            # documents to the leaf chunks, and the chain from documents to chunks;
+            # document -> chunk -> chunk -> ... -> leaf chunk -> (entity)
+            #
+            # If not, return the graph that includes the chains from documents
+            # to chunks that contain the subs (keywords).
+            # document -> chunk -> chunk -> ... -> leaf chunk (that contains the subs)
+            #
+            # Only the leaf chunks contain the content; the other chunks carry no
+            # properties except the id and name.
+
             graph = MemoryGraph()
+
+            # Check if the entities exist in the graph
            check_entity_query = (
                 f"MATCH (n:{GraphElemType.ENTITY.value}) "
                 f"WHERE n.id IN {[self._escape_quotes(sub) for sub in subs]} "
                 "RETURN n"
             )
-
             if self.query(check_entity_query):
-                # Query the chain from documents to chunks,
-                # document -> chunk -> ... -> chunk (-> entity, do not reach entity)
-                chain_query = (
-                    f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-"
-                    f"[:{GraphElemType.INCLUDE.value}*1..{depth + 1}]->"
-                    f"(leaf_chunk:{GraphElemType.CHUNK.value})-"
-                    f"[:{GraphElemType.INCLUDE.value}]->"
-                    f"(m:{GraphElemType.ENTITY.value}) "
-                    f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} "
-                    # "WITH n, leaf_chunk "
-                    # f"MATCH p = (n)-[:{GraphElemType.INCLUDE.value}*1..{depth}]->"
-                    # f"(leaf_chunk:{GraphElemType.CHUNK.value}) "
-                    "RETURN p"
-                )
-                # Filter all the properties by with_list
-                graph.upsert_graph(self.query(query=chain_query, white_list=[""]))
-
                 # Query the leaf chunks in the chain from documents to chunks
                 leaf_chunk_query = (
                     f"MATCH p=(n:{GraphElemType.CHUNK.value})-"
                     f"[r:{GraphElemType.INCLUDE.value}]->"
                     f"(m:{GraphElemType.ENTITY.value})"
                     f"WHERE m.name IN {[self._escape_quotes(sub) for sub in subs]} "
-                    f"RETURN n {limit_string}"
+                    f"RETURN n"
                 )
-                graph.upsert_graph(
-                    self.query(query=leaf_chunk_query, white_list=["content"])
+                graph_of_leaf_chunks = self.query(
+                    query=leaf_chunk_query, white_list=["content"]
                 )
+
+                # Query the chain from documents to chunks,
+                # document -> chunk -> ... -> leaf_chunks
+                chunk_names = [
+                    self._escape_quotes(vertex.name)
+                    for vertex in graph_of_leaf_chunks.vertices()
+                ]
+                chain_query = (
+                    f"MATCH p=(n:{GraphElemType.DOCUMENT.value})-"
+                    f"[:{GraphElemType.INCLUDE.value}*{depth_string}]->"
+                    f"(m:{GraphElemType.CHUNK.value})"
+                    f"WHERE m.name IN {chunk_names} "
+                    "RETURN p"
+                )
+                # Filter all the properties by with_list
+                graph.upsert_graph(self.query(query=chain_query, white_list=[""]))
+
+                # Compare the number of leaf chunks to the `limit`
+                if not limit or len(chunk_names) <= limit:
+                    graph.upsert_graph(graph_of_leaf_chunks)
+                else:
+                    limited_leaf_chunk_query = leaf_chunk_query + f" {limit_string}"
+                    graph.upsert_graph(
+                        self.query(
+                            query=limited_leaf_chunk_query, white_list=["content"]
+                        )
+                    )
             else:
                 _subs_condition = " OR ".join(
                     [f"m.content CONTAINS '{self._escape_quotes(sub)}'" for sub in subs]
                 )
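The retrieval refactor above reorders the queries: the leaf chunks are fetched once without a LIMIT, their names drive the document-to-chunk chain query, and the LIMIT is applied only when the unlimited result is actually too large. A condensed sketch of that control flow, with `run_query` as a hypothetical stand-in for the adapter's `self.query`:

```python
from typing import Callable, List, Optional


def select_leaf_chunks(
    run_query: Callable[[str], List[str]],
    leaf_chunk_query: str,
    limit: Optional[int],
) -> List[str]:
    """Fetch all matching leaf chunks; re-query with a LIMIT only if needed.

    The unlimited pass is not wasted work: its results also provide the
    chunk names used to build the document -> chunk chain query.
    """
    chunk_names = run_query(leaf_chunk_query)
    if limit is None or len(chunk_names) <= limit:
        return chunk_names
    # Too many results: let the database truncate instead of slicing here,
    # mirroring the patch's `leaf_chunk_query + f" {limit_string}"`.
    return run_query(f"{leaf_chunk_query} LIMIT {limit}")


if __name__ == "__main__":
    data = [f"chunk_{i}" for i in range(5)]
    fake_query = lambda q: data[:2] if "LIMIT" in q else data
    print(select_leaf_chunks(fake_query, "MATCH ...", limit=2))
    # -> ['chunk_0', 'chunk_1']
```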
From 0c263bf364d2b95bc7bdb1341004268f0b0abfa7 Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 30 Oct 2024 20:19:55 +0800
Subject: [PATCH 19/20] fix

---
 .../knowledge_graph/community_summary.py      | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py
index badcb5fac..d31b18b46 100644
--- a/dbgpt/storage/knowledge_graph/community_summary.py
+++ b/dbgpt/storage/knowledge_graph/community_summary.py
@@ -108,19 +108,15 @@ def __init__(self, config: CommunitySummaryKnowledgeGraphConfig):
                 config.community_score_threshold,
             )
         )
-        self._document_graph_enabled = bool(
-            (
-                os.environ["DOCUMENT_GRAPH_ENABLED"].lower() == "true"
-                if "DOCUMENT_GRAPH_ENABLED" in os.environ
-                else config.document_graph_enabled
-            )
+        self._document_graph_enabled = (
+            os.environ["DOCUMENT_GRAPH_ENABLED"].lower() == "true"
+            if "DOCUMENT_GRAPH_ENABLED" in os.environ
+            else config.document_graph_enabled
         )
-        self._triplet_graph_enabled = bool(
-            (
-                os.environ["TRIPLET_GRAPH_ENABLED"].lower() == "true"
-                if "TRIPLET_GRAPH_ENABLED" in os.environ
-                else config.triplet_graph_enabled
-            )
+        self._triplet_graph_enabled = (
+            os.environ["TRIPLET_GRAPH_ENABLED"].lower() == "true"
+            if "TRIPLET_GRAPH_ENABLED" in os.environ
+            else config.triplet_graph_enabled
         )
         self._knowledge_graph_chunk_search_top_size = int(
             os.getenv(
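The `bool()` wrappers removed above were harmless but redundant: an `==` comparison already yields a bool. The pattern is worth isolating because the tempting shortcut `bool(os.environ[name])` would be a real bug, since any non-empty string, including "false", is truthy. A self-contained version of the pattern the patch settles on:

```python
import os


def env_flag(name: str, default: bool) -> bool:
    """Read a boolean feature flag from the environment.

    The comparison already returns a bool, so no bool() wrapper is needed.
    Note that bool(os.environ[name]) would be wrong: bool("false") is True.
    """
    if name in os.environ:
        return os.environ[name].lower() == "true"
    return default


# DOCUMENT_GRAPH_ENABLED=false -> False; unset -> the config default.
document_graph_enabled = env_flag("DOCUMENT_GRAPH_ENABLED", True)
print(document_graph_enabled)
```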
From f0216d760ff7a1af728b01e36426e1000a3bc1cf Mon Sep 17 00:00:00 2001
From: Appointat
Date: Wed, 30 Oct 2024 20:22:20 +0800
Subject: [PATCH 20/20] Refactor markdown format in community_summary.py

---
 dbgpt/storage/knowledge_graph/community_summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbgpt/storage/knowledge_graph/community_summary.py b/dbgpt/storage/knowledge_graph/community_summary.py
index d31b18b46..62e3e4c13 100644
--- a/dbgpt/storage/knowledge_graph/community_summary.py
+++ b/dbgpt/storage/knowledge_graph/community_summary.py
@@ -393,7 +393,7 @@ def delete_vector_name(self, index_name: str):
 
 HYBRID_SEARCH_PT = """
 =====
-[Context]、[Knowledge Graph]和[Original Text From RAG]的信息，可以帮助你回答更好地用户的问题。
+The following information from [Context], [Knowledge Graph], and [Original Text From RAG] can help you answer user questions better.
 
 [Context]:
 {context}
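Patch 20 translates the hybrid-search prompt header into English. For context, a template like this is typically rendered with `str.format`. Only the header and the `{context}` placeholder are visible in the hunk above, so the template body and the call site below are assumptions for illustration, not the project's exact code:

```python
# Assumed shape of the template; additional sections likely follow the
# [Context] block in the real source but are not shown in the patch.
HYBRID_SEARCH_PT = """=====
The following information from [Context], [Knowledge Graph], and [Original Text From RAG] can help you answer user questions better.

[Context]:
{context}
"""

# Hypothetical call site: fill the placeholder with retrieved summaries.
prompt = HYBRID_SEARCH_PT.format(context="community summaries go here")
print(prompt)
```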