Add SearchApi integration for websearch
SebastjanPrachovskij committed Nov 23, 2023
1 parent 604b177 commit 4327fd7
Showing 9 changed files with 730 additions and 3 deletions.
2 changes: 1 addition & 1 deletion haystack/nodes/retriever/web.py
@@ -64,7 +64,7 @@ def __init__(
    ):
        """
        :param api_key: API key for the search engine provider.
        :param search_engine_provider: Name of the search engine provider class. The options are "SerperDev" (default), "SerpAPI", "BingAPI" or "GoogleAPI"
        :param search_engine_provider: Name of the search engine provider class. The options are "SerperDev" (default), "SearchApi", "SerpAPI", "BingAPI" or "GoogleAPI"
        :param search_engine_kwargs: Additional parameters to pass to the search engine provider.
        :param top_search_results: Number of top search results to be retrieved.
        :param top_k: Top k documents to be returned by the retriever.
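A minimal usage sketch of the option documented above; the API key and query are placeholders, and the call assumes the WebRetriever's standard retrieve() interface.

import os

from haystack.nodes.retriever.web import WebRetriever

# Sketch only: select SearchApi by name, as the updated docstring describes.
# SEARCHAPI_API_KEY and the query are placeholders.
retriever = WebRetriever(
    api_key=os.environ["SEARCHAPI_API_KEY"],
    search_engine_provider="SearchApi",
    top_search_results=10,
    top_k=5,
)

documents = retriever.retrieve(query="What is the Haystack framework used for?")
for doc in documents:
    print(doc.content)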
108 changes: 108 additions & 0 deletions haystack/nodes/search_engine/providers.py
@@ -124,6 +124,114 @@ def search(self, query: str, **kwargs) -> List[Document]:
        return self.score_results(result_docs, len(answer_box) > 0)


class SearchApi(SearchEngine):
    """
    SearchApi is a real-time search engine that provides an API to access search results from Google, Google Scholar, YouTube,
    YouTube transcripts and more. See the [SearchApi website](https://www.searchapi.io/) for more details.
    """

    def __init__(
        self,
        api_key: str,
        top_k: Optional[int] = 10,
        allowed_domains: Optional[List[str]] = None,
        engine: Optional[str] = "google",
        search_engine_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for SearchApi.
        :param top_k: Number of results to return.
        :param allowed_domains: List of domains to limit the search to.
        :param engine: Search engine to use, for example google, google_scholar, youtube, youtube_transcripts.
            See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported engines.
        :param search_engine_kwargs: Additional parameters passed to the SearchApi.
            See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported parameters.
        """
        super().__init__()
        self.params_dict: Dict[str, Union[str, int, float]] = {}
        self.api_key = api_key
        self.kwargs = search_engine_kwargs if search_engine_kwargs else {}
        self.engine = engine
        self.top_k = top_k
        self.allowed_domains = allowed_domains

    def search(self, query: str, **kwargs) -> List[Document]:
        """
        :param query: Query string.
        :param kwargs: Additional parameters passed to the SearchApi. For example, you can set 'location' to 'New York,United States'
            to localize search to the specific location.
        :return: List[Document]
        """
        kwargs = {**self.kwargs, **kwargs}
        top_k = kwargs.pop("top_k", self.top_k)
        url = "https://www.searchapi.io/api/v1/search"

        allowed_domains = kwargs.pop("allowed_domains", self.allowed_domains)
        query_prepend = "OR ".join(f"site:{domain} " for domain in allowed_domains) if allowed_domains else ""
        params = {"q": query_prepend + " " + query, **kwargs}
        headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "Haystack"}

        if self.engine:
            params["engine"] = self.engine
        response = requests.get(url, params=params, headers=headers, timeout=90)

        if response.status_code != 200:
            raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

        json_result = json.loads(response.text)

        # organic results are the main results from the search engine
        organic_results = []
        if "organic_results" in json_result:
            for result in json_result["organic_results"]:
                organic_results.append(
                    Document.from_dict({"title": result["title"], "content": result["snippet"], "link": result["link"]})
                )

        # answer box has a direct answer to the query
        answer_box = []
        if "answer_box" in json_result:
            answer_box = [
                Document.from_dict(
                    {
                        "title": json_result["answer_box"].get("title", ""),
                        "content": json_result["answer_box"].get("answer", ""),
                        "link": json_result["answer_box"].get("link", ""),
                    }
                )
            ]

        knowledge_graph = []
        if "knowledge_graph" in json_result:
            knowledge_graph = [
                Document.from_dict(
                    {
                        "title": json_result["knowledge_graph"].get("title", ""),
                        "content": json_result["knowledge_graph"].get("description", ""),
                    }
                )
            ]

        related_questions = []
        if "related_questions" in json_result:
            for result in json_result["related_questions"]:
                related_questions.append(
                    Document.from_dict(
                        {
                            "title": result["question"],
                            "content": result["answer"] if result.get("answer") else result.get("answer_highlight", ""),
                            "link": result.get("source", {}).get("link", ""),
                        }
                    )
                )

        documents = answer_box + knowledge_graph + organic_results + related_questions

        logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
        result_docs = documents[:top_k]
        return self.score_results(result_docs, len(answer_box) > 0)


class SerperDev(SearchEngine):
    """
    Search engine using SerperDev API. See the [Serper Dev website](https://serper.dev/) for more details.
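A short sketch of exercising the new provider class directly, based on the constructor and search() signature above; the API key is a placeholder, and the extra 'location' argument reuses the docstring's own example.

from haystack.nodes.search_engine.providers import SearchApi

# Sketch only: the key is a placeholder; "engine" defaults to "google" as in the constructor above.
searchapi = SearchApi(api_key="<SEARCHAPI_API_KEY>", top_k=5, engine="google")

# Extra keyword arguments are forwarded as request parameters, e.g. the docstring's location example.
documents = searchapi.search(
    "What is the hometown of the reigning men's U.S. Open champion?", location="New York,United States"
)
for doc in documents:
    # Title and link end up in the document metadata via Document.from_dict.
    print(doc.meta.get("title"), doc.meta.get("link"))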
1 change: 1 addition & 0 deletions haystack/nodes/search_engine/web.py
@@ -14,6 +14,7 @@ class WebSearch(BaseComponent):
    WebSearch currently supports the following search engine providers (bridges):
    - SerperDev (default)
    - SearchApi
    - SerpAPI
    - BingAPI
    - GoogleAPI
3 changes: 2 additions & 1 deletion haystack/preview/components/websearch/__init__.py
@@ -1,3 +1,4 @@
from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch
from haystack.preview.components.websearch.searchapi import SearchApiWebSearch

__all__ = ["SerperDevWebSearch"]
__all__ = ["SerperDevWebSearch", "SearchApiWebSearch"]
140 changes: 140 additions & 0 deletions haystack/preview/components/websearch/searchapi.py
@@ -0,0 +1,140 @@
import json
import os
import logging
from typing import Dict, List, Optional, Any

import requests

from haystack.preview import Document, component, default_to_dict, ComponentError

logger = logging.getLogger(__name__)


SEARCHAPI_BASE_URL = "https://www.searchapi.io/api/v1/search"


class SearchApiError(ComponentError):
    ...


@component
class SearchApiWebSearch:
    """
    Search engine using SearchApi API. Given a query, it returns a list of URLs that are the most relevant.
    See the [SearchApi website](https://www.searchapi.io/) for more details.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        top_k: Optional[int] = 10,
        allowed_domains: Optional[List[str]] = None,
        search_params: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for the SearchApi API. It can be
            explicitly provided or automatically read from the
            environment variable SEARCHAPI_API_KEY (recommended).
        :param top_k: Number of documents to return.
        :param allowed_domains: List of domains to limit the search to.
        :param search_params: Additional parameters passed to the SearchApi API.
            For example, you can set 'num' to 100 to increase the number of search results.
            See the [SearchApi website](https://www.searchapi.io/) for more details.
        """
        if api_key is None:
            try:
                api_key = os.environ["SEARCHAPI_API_KEY"]
            except KeyError as e:
                raise ValueError(
                    "SearchApiWebSearch expects an API key. "
                    "Set the SEARCHAPI_API_KEY environment variable (recommended) or pass it explicitly."
                ) from e
        self.api_key = api_key
        self.top_k = top_k
        self.allowed_domains = allowed_domains
        self.search_params = search_params or {}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.
        """
        return default_to_dict(
            self, top_k=self.top_k, allowed_domains=self.allowed_domains, search_params=self.search_params
        )

    @component.output_types(documents=List[Document], links=List[str])
    def run(self, query: str):
        """
        Search the SearchApi API for the given query and return the results as a list of Documents and a list of links.
        :param query: Query string.
        """
        query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""

        payload = json.dumps({"q": query_prepend + " " + query, **self.search_params})
        headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "Haystack"}

        try:
            response = requests.get(SEARCHAPI_BASE_URL, headers=headers, params=payload, timeout=90)
            response.raise_for_status()  # Will raise an HTTPError for bad responses
        except requests.Timeout:
            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.")

        except requests.RequestException as e:
            raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

        # Request succeeded
        json_result = response.json()

        # organic results are the main results from the search engine
        organic_results = []
        if "organic_results" in json_result:
            for result in json_result["organic_results"]:
                organic_results.append(
                    Document.from_dict({"title": result["title"], "content": result["snippet"], "link": result["link"]})
                )

        # answer box has a direct answer to the query
        answer_box = []
        if "answer_box" in json_result:
            answer_box = [
                Document.from_dict(
                    {
                        "title": json_result["answer_box"].get("title", ""),
                        "content": json_result["answer_box"].get("answer", ""),
                        "link": json_result["answer_box"].get("link", ""),
                    }
                )
            ]

        knowledge_graph = []
        if "knowledge_graph" in json_result:
            knowledge_graph = [
                Document.from_dict(
                    {
                        "title": json_result["knowledge_graph"].get("title", ""),
                        "content": json_result["knowledge_graph"].get("description", ""),
                    }
                )
            ]

        related_questions = []
        if "related_questions" in json_result:
            for result in json_result["related_questions"]:
                related_questions.append(
                    Document.from_dict(
                        {
                            "title": result["question"],
                            "content": result["answer"] if result.get("answer") else result.get("answer_highlight", ""),
                            "link": result.get("source", {}).get("link", ""),
                        }
                    )
                )

        documents = answer_box + knowledge_graph + organic_results + related_questions

        links = [result["link"] for result in json_result["organic_results"]]

        logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
        return {"documents": documents[: self.top_k], "links": links[: self.top_k]}
2 changes: 1 addition & 1 deletion proposals/text/4084-agent-demo.md
@@ -83,7 +83,7 @@ The main Agent modules/tools are:
## SearchEngine

SearchEngine is a symbolic API module allowing programmatic interaction with Google and other search engines. We'll have
multiple providers of SearchEngine including https://serper.dev and https://serpapi.com as initial providers.
multiple providers of SearchEngine including https://serper.dev, https://www.searchapi.io/ and https://serpapi.com as initial providers.

SearchEngine will return a list of results (e.g. List[Document]), the content of each document being a "snippet" of the
single search result, while all other attributes of the search results (e.g. title, url link, etc.) will
@@ -0,0 +1,3 @@
---
preview:
- Integrate SearchApi as an additional websearch provider.
29 changes: 29 additions & 0 deletions test/nodes/test_web_search.py
@@ -43,6 +43,35 @@ def test_web_search_with_site_keyword():
    ), "Some documents are not from the specified sites lifewire.com or nasa.gov."


@pytest.mark.skipif(
    not os.environ.get("SEARCHAPI_API_KEY", None),
    reason="Please export an env var called SEARCHAPI_API_KEY containing the searchapi.io API key to run this test.",
)
@pytest.mark.integration
def test_web_search_with_searchapi():
    ws = WebSearch(api_key=os.environ.get("SEARCHAPI_API_KEY", None), search_engine_provider="SearchApi")
    result, _ = ws.run(query="What is the hometown of the reigning men's U.S. Open champion?")
    assert "documents" in result
    assert len(result["documents"]) > 0
    assert isinstance(result["documents"][0], Document)


@pytest.mark.skipif(
    not os.environ.get("SEARCHAPI_API_KEY", None),
    reason="Please export an env var called SEARCHAPI_API_KEY containing the searchapi.io API key to run this test.",
)
@pytest.mark.integration
def test_web_search_with_searchapi_with_site_keyword():
    ws = WebSearch(api_key=os.environ.get("SEARCHAPI_API_KEY", None), search_engine_provider="SearchApi")
    result, _ = ws.run(query='site:openai.com OR site:langchain.com "Agent types"')
    assert "documents" in result
    assert len(result["documents"]) > 0
    assert isinstance(result["documents"][0], Document)
    assert all(
        "langchain" in doc.meta["link"] or "openai" in doc.meta["link"] for doc in result["documents"]
    ), "Some documents are not from the specified sites openai.com or langchain.com."


@pytest.mark.unit
def test_web_search_with_google_api_provider():
    if not googleapi_installed:
