Add SearchApi integration for websearch
SebastjanPrachovskij committed Nov 23, 2023
1 parent 604b177 commit 4327fd7
Showing 9 changed files with 730 additions and 3 deletions.
2 changes: 1 addition & 1 deletion haystack/nodes/retriever/web.py
@@ -64,7 +64,7 @@ def __init__(
    ):
        """
        :param api_key: API key for the search engine provider.
        :param search_engine_provider: Name of the search engine provider class. The options are "SerperDev" (default), "SerpAPI", "BingAPI" or "GoogleAPI"
        :param search_engine_provider: Name of the search engine provider class. The options are "SerperDev" (default), "SearchApi", "SerpAPI", "BingAPI" or "GoogleAPI"
        :param search_engine_kwargs: Additional parameters to pass to the search engine provider.
        :param top_search_results: Number of top search results to be retrieved.
        :param top_k: Top k documents to be returned by the retriever.
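A minimal usage sketch of the option documented above; the API key and query are placeholders, and the call assumes the WebRetriever's standard retrieve() interface.

import os

from haystack.nodes.retriever.web import WebRetriever

# Sketch only: select SearchApi by name, as the updated docstring describes.
# SEARCHAPI_API_KEY and the query are placeholders.
retriever = WebRetriever(
    api_key=os.environ["SEARCHAPI_API_KEY"],
    search_engine_provider="SearchApi",
    top_search_results=10,
    top_k=5,
)

documents = retriever.retrieve(query="What is the Haystack framework used for?")
for doc in documents:
    print(doc.content)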
108 changes: 108 additions & 0 deletions haystack/nodes/search_engine/providers.py
@@ -124,6 +124,114 @@ def search(self, query: str, **kwargs) -> List[Document]:
        return self.score_results(result_docs, len(answer_box) > 0)


class SearchApi(SearchEngine):
    """
    SearchApi is a real-time search engine that provides an API to access search results from Google, Google Scholar, YouTube,
    YouTube transcripts and more. See the [SearchApi website](https://www.searchapi.io/) for more details.
    """

    def __init__(
        self,
        api_key: str,
        top_k: Optional[int] = 10,
        allowed_domains: Optional[List[str]] = None,
        engine: Optional[str] = "google",
        search_engine_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for SearchApi.
        :param top_k: Number of results to return.
        :param allowed_domains: List of domains to limit the search to.
        :param engine: Search engine to use, for example google, google_scholar, youtube, youtube_transcripts.
            See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported engines.
        :param search_engine_kwargs: Additional parameters passed to the SearchApi.
            See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported parameters.
        """
        super().__init__()
        self.params_dict: Dict[str, Union[str, int, float]] = {}
        self.api_key = api_key
        self.kwargs = search_engine_kwargs if search_engine_kwargs else {}
        self.engine = engine
        self.top_k = top_k
        self.allowed_domains = allowed_domains

    def search(self, query: str, **kwargs) -> List[Document]:
        """
        :param query: Query string.
        :param kwargs: Additional parameters passed to the SearchApi. For example, you can set 'location' to 'New York,United States'
            to localize search to the specific location.
        :return: List[Document]
        """
        kwargs = {**self.kwargs, **kwargs}
        top_k = kwargs.pop("top_k", self.top_k)
        url = "https://www.searchapi.io/api/v1/search"

        allowed_domains = kwargs.pop("allowed_domains", self.allowed_domains)
        query_prepend = "OR ".join(f"site:{domain} " for domain in allowed_domains) if allowed_domains else ""
        params = {"q": query_prepend + " " + query, **kwargs}
        headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "Haystack"}

        if self.engine:
            params["engine"] = self.engine
        response = requests.get(url, params=params, headers=headers, timeout=90)

        if response.status_code != 200:
            raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

        json_result = json.loads(response.text)

        # organic results are the main results from the search engine
        organic_results = []
        if "organic_results" in json_result:
            for result in json_result["organic_results"]:
                organic_results.append(
                    Document.from_dict({"title": result["title"], "content": result["snippet"], "link": result["link"]})
                )

        # answer box has a direct answer to the query
        answer_box = []
        if "answer_box" in json_result:
            answer_box = [
                Document.from_dict(
                    {
                        "title": json_result["answer_box"].get("title", ""),
                        "content": json_result["answer_box"].get("answer", ""),
                        "link": json_result["answer_box"].get("link", ""),
                    }
                )
            ]

        knowledge_graph = []
        if "knowledge_graph" in json_result:
            knowledge_graph = [
                Document.from_dict(
                    {
                        "title": json_result["knowledge_graph"].get("title", ""),
                        "content": json_result["knowledge_graph"].get("description", ""),
                    }
                )
            ]

        related_questions = []
        if "related_questions" in json_result:
            for result in json_result["related_questions"]:
                related_questions.append(
                    Document.from_dict(
                        {
                            "title": result["question"],
                            "content": result["answer"] if result.get("answer") else result.get("answer_highlight", ""),
                            "link": result.get("source", {}).get("link", ""),
                        }
                    )
                )

        documents = answer_box + knowledge_graph + organic_results + related_questions

        logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
        result_docs = documents[:top_k]
        return self.score_results(result_docs, len(answer_box) > 0)


class SerperDev(SearchEngine):
    """
    Search engine using SerperDev API. See the [Serper Dev website](https://serper.dev/) for more details.
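A short sketch of exercising the new provider class directly, based on the constructor and search() signature above; the API key is a placeholder, and the extra 'location' argument reuses the docstring's own example.

from haystack.nodes.search_engine.providers import SearchApi

# Sketch only: the key is a placeholder; "engine" defaults to "google" as in the constructor above.
searchapi = SearchApi(api_key="<SEARCHAPI_API_KEY>", top_k=5, engine="google")

# Extra keyword arguments are forwarded as request parameters, e.g. the docstring's location example.
documents = searchapi.search(
    "What is the hometown of the reigning men's U.S. Open champion?", location="New York,United States"
)
for doc in documents:
    # Title and link end up in the document metadata via Document.from_dict.
    print(doc.meta.get("title"), doc.meta.get("link"))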
1 change: 1 addition & 0 deletions haystack/nodes/search_engine/web.py
@@ -14,6 +14,7 @@ class WebSearch(BaseComponent):
    WebSearch currently supports the following search engine providers (bridges):
    - SerperDev (default)
    - SearchApi
    - SerpAPI
    - BingAPI
    - GoogleAPI
3 changes: 2 additions & 1 deletion haystack/preview/components/websearch/__init__.py
@@ -1,3 +1,4 @@
from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch
from haystack.preview.components.websearch.searchapi import SearchApiWebSearch

__all__ = ["SerperDevWebSearch"]
__all__ = ["SerperDevWebSearch", "SearchApiWebSearch"]
140 changes: 140 additions & 0 deletions haystack/preview/components/websearch/searchapi.py
@@ -0,0 +1,140 @@
import json
import os
import logging
from typing import Dict, List, Optional, Any

import requests

from haystack.preview import Document, component, default_to_dict, ComponentError

logger = logging.getLogger(__name__)


SEARCHAPI_BASE_URL = "https://www.searchapi.io/api/v1/search"


class SearchApiError(ComponentError):
    ...


@component
class SearchApiWebSearch:
    """
    Search engine using SearchApi API. Given a query, it returns a list of URLs that are the most relevant.
    See the [SearchApi website](https://www.searchapi.io/) for more details.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        top_k: Optional[int] = 10,
        allowed_domains: Optional[List[str]] = None,
        search_params: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for the SearchApi API. It can be
            explicitly provided or automatically read from the
            environment variable SEARCHAPI_API_KEY (recommended).
        :param top_k: Number of documents to return.
        :param allowed_domains: List of domains to limit the search to.
        :param search_params: Additional parameters passed to the SearchApi API.
            For example, you can set 'num' to 100 to increase the number of search results.
            See the [SearchApi website](https://www.searchapi.io/) for more details.
        """
        if api_key is None:
            try:
                api_key = os.environ["SEARCHAPI_API_KEY"]
            except KeyError as e:
                raise ValueError(
                    "SearchApiWebSearch expects an API key. "
                    "Set the SEARCHAPI_API_KEY environment variable (recommended) or pass it explicitly."
                ) from e
        self.api_key = api_key
        self.top_k = top_k
        self.allowed_domains = allowed_domains
        self.search_params = search_params or {}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(
            self, top_k=self.top_k, allowed_domains=self.allowed_domains, search_params=self.search_params
        )

    @component.output_types(documents=List[Document], links=List[str])
    def run(self, query: str):
        """
        Search the SearchApi API for the given query and return the results as a list of Documents and a list of links.
        :param query: Query string.
        """
        query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""

        payload = json.dumps({"q": query_prepend + " " + query, **self.search_params})
        headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "Haystack"}

        try:
            response = requests.get(SEARCHAPI_BASE_URL, headers=headers, params=payload, timeout=90)
            response.raise_for_status()  # Will raise an HTTPError for bad responses
        except requests.Timeout:
            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.")

        except requests.RequestException as e:
            raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

        # Request succeeded
        json_result = response.json()

        # organic results are the main results from the search engine
        organic_results = []
        if "organic_results" in json_result:
            for result in json_result["organic_results"]:
                organic_results.append(
                    Document.from_dict({"title": result["title"], "content": result["snippet"], "link": result["link"]})
                )

        # answer box has a direct answer to the query
        answer_box = []
        if "answer_box" in json_result:
            answer_box = [
                Document.from_dict(
                    {
                        "title": json_result["answer_box"].get("title", ""),
                        "content": json_result["answer_box"].get("answer", ""),
                        "link": json_result["answer_box"].get("link", ""),
                    }
                )
            ]

        knowledge_graph = []
        if "knowledge_graph" in json_result:
            knowledge_graph = [
                Document.from_dict(
                    {
                        "title": json_result["knowledge_graph"].get("title", ""),
                        "content": json_result["knowledge_graph"].get("description", ""),
                    }
                )
            ]

        related_questions = []
        if "related_questions" in json_result:
            for result in json_result["related_questions"]:
                related_questions.append(
                    Document.from_dict(
                        {
                            "title": result["question"],
                            "content": result["answer"] if result.get("answer") else result.get("answer_highlight", ""),
                            "link": result.get("source", {}).get("link", ""),
                        }
                    )
                )

        documents = answer_box + knowledge_graph + organic_results + related_questions

        links = [result["link"] for result in json_result["organic_results"]]

        logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
        return {"documents": documents[: self.top_k], "links": links[: self.top_k]}
2 changes: 1 addition & 1 deletion proposals/text/4084-agent-demo.md
@@ -83,7 +83,7 @@ The main Agent modules/tools are:
## SearchEngine

SearchEngine is a symbolic API module allowing programmatic interaction with Google and other search engines. We'll have
multiple providers of SearchEngine including https://serper.dev and https://serpapi.com as initial providers.
multiple providers of SearchEngine including https://serper.dev, https://www.searchapi.io/ and https://serpapi.com as initial providers.

SearchEngine will return a list of results (e.g. List[Document]), the content of each document being a "snippet" of the
single search result, while all other attributes of the search results (e.g. title, url link, etc.) will
@@ -0,0 +1,3 @@
---
preview:
- Integrate SearchApi as an additional websearch provider.
29 changes: 29 additions & 0 deletions test/nodes/test_web_search.py
@@ -43,6 +43,35 @@ def test_web_search_with_site_keyword():
    ), "Some documents are not from the specified sites lifewire.com or nasa.gov."


@pytest.mark.skipif(
    not os.environ.get("SEARCHAPI_API_KEY", None),
    reason="Please export an env var called SEARCHAPI_API_KEY containing the searchapi.io API key to run this test.",
)
@pytest.mark.integration
def test_web_search_with_searchapi():
    ws = WebSearch(api_key=os.environ.get("SEARCHAPI_API_KEY", None), search_engine_provider="SearchApi")
    result, _ = ws.run(query="What is the hometown of the reigning men's U.S. Open champion?")
    assert "documents" in result
    assert len(result["documents"]) > 0
    assert isinstance(result["documents"][0], Document)


@pytest.mark.skipif(
    not os.environ.get("SEARCHAPI_API_KEY", None),
    reason="Please export an env var called SEARCHAPI_API_KEY containing the searchapi.io API key to run this test.",
)
@pytest.mark.integration
def test_web_search_with_searchapi_with_site_keyword():
    ws = WebSearch(api_key=os.environ.get("SEARCHAPI_API_KEY", None), search_engine_provider="SearchApi")
    result, _ = ws.run(query='site:openai.com OR site:langchain.com "Agent types"')
    assert "documents" in result
    assert len(result["documents"]) > 0
    assert isinstance(result["documents"][0], Document)
    assert all(
        "langchain" in doc.meta["link"] or "openai" in doc.meta["link"] for doc in result["documents"]
    ), "Some documents are not from the specified sites openai.com or langchain.com."


@pytest.mark.unit
def test_web_search_with_google_api_provider():
    if not googleapi_installed:
