Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add SearchApi integration #1

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion haystack/nodes/retriever/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(
):
"""
:param api_key: API key for the search engine provider.
:param search_engine_provider: Name of the search engine provider class. The options are "SerperDev" (default), "SerpAPI", "BingAPI" or "GoogleAPI"
:param search_engine_provider: Name of the search engine provider class. The options are "SerperDev" (default), "SearchApi", "SerpAPI", "BingAPI" or "GoogleAPI"
:param search_engine_kwargs: Additional parameters to pass to the search engine provider.
:param top_search_results: Number of top search results to be retrieved.
:param top_k: Top k documents to be returned by the retriever.
Expand Down
108 changes: 108 additions & 0 deletions haystack/nodes/search_engine/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,114 @@ def search(self, query: str, **kwargs) -> List[Document]:
return self.score_results(result_docs, len(answer_box) > 0)


class SearchApi(SearchEngine):
    """
    SearchApi is a real-time search engine that provides an API to access search results from Google, Google Scholar, YouTube,
    YouTube transcripts and more. See the [SearchApi website](https://www.searchapi.io/) for more details.
    """

    def __init__(
        self,
        api_key: str,
        top_k: Optional[int] = 10,
        allowed_domains: Optional[List[str]] = None,
        engine: Optional[str] = "google",
        search_engine_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for SearchApi.
        :param top_k: Number of results to return.
        :param allowed_domains: List of domains to limit the search to.
        :param engine: Search engine to use, for example google, google_scholar, youtube, youtube_transcripts.
            See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported engines.
        :param search_engine_kwargs: Additional parameters passed to the SearchApi.
            See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported parameters.
        """
        super().__init__()
        self.params_dict: Dict[str, Union[str, int, float]] = {}
        self.api_key = api_key
        self.kwargs = search_engine_kwargs or {}
        self.engine = engine
        self.top_k = top_k
        self.allowed_domains = allowed_domains

    def search(self, query: str, **kwargs) -> List[Document]:
        """
        Run the query against the SearchApi endpoint and convert the response into Documents.

        :param query: Query string.
        :param kwargs: Additional parameters passed to the SearchApi. For example, you can set 'location' to 'New York,United States'
            to localize search to the specific location.
        :return: List[Document]
        """
        merged_kwargs = {**self.kwargs, **kwargs}
        top_k = merged_kwargs.pop("top_k", self.top_k)
        domains = merged_kwargs.pop("allowed_domains", self.allowed_domains)

        # Restrict the search to the allowed domains (if any) by prepending "site:" filters to the query.
        site_filter = "OR ".join(f"site:{domain} " for domain in domains) if domains else ""
        request_params = {"q": site_filter + " " + query, **merged_kwargs}
        if self.engine:
            request_params["engine"] = self.engine
        request_headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "Haystack"}

        response = requests.get(
            "https://www.searchapi.io/api/v1/search", params=request_params, headers=request_headers, timeout=90
        )
        if response.status_code != 200:
            raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

        payload = json.loads(response.text)

        # Organic results are the main results from the search engine.
        organic_results = [
            Document.from_dict({"title": hit["title"], "content": hit["snippet"], "link": hit["link"]})
            for hit in payload.get("organic_results", [])
        ]

        # The answer box holds a direct answer to the query, when the engine produced one.
        answer_box = []
        if "answer_box" in payload:
            box = payload["answer_box"]
            answer_box = [
                Document.from_dict(
                    {"title": box.get("title", ""), "content": box.get("answer", ""), "link": box.get("link", "")}
                )
            ]

        # Knowledge-graph entries describe the main entity of the query.
        knowledge_graph = []
        if "knowledge_graph" in payload:
            graph = payload["knowledge_graph"]
            knowledge_graph = [
                Document.from_dict({"title": graph.get("title", ""), "content": graph.get("description", "")})
            ]

        # "People also ask" style follow-up questions, with their answers where available.
        related_questions = []
        for entry in payload.get("related_questions", []):
            related_questions.append(
                Document.from_dict(
                    {
                        "title": entry["question"],
                        "content": entry["answer"] if entry.get("answer") else entry.get("answer_highlight", ""),
                        "link": entry.get("source", {}).get("link", ""),
                    }
                )
            )

        documents = answer_box + knowledge_graph + organic_results + related_questions

        logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
        return self.score_results(documents[:top_k], len(answer_box) > 0)


class SerperDev(SearchEngine):
"""
Search engine using SerperDev API. See the [Serper Dev website](https://serper.dev/) for more details.
Expand Down
1 change: 1 addition & 0 deletions haystack/nodes/search_engine/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class WebSearch(BaseComponent):

WebSearch currently supports the following search engines providers (bridges):
- SerperDev (default)
- SearchApi
- SerpAPI
- BingAPI
- GoogleAPI
Expand Down
3 changes: 2 additions & 1 deletion haystack/preview/components/websearch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from haystack.preview.components.websearch.serper_dev import SerperDevWebSearch
from haystack.preview.components.websearch.searchapi import SearchApiWebSearch

__all__ = ["SerperDevWebSearch"]
__all__ = ["SerperDevWebSearch", "SearchApiWebSearch"]
140 changes: 140 additions & 0 deletions haystack/preview/components/websearch/searchapi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import json
import os
import logging
from typing import Dict, List, Optional, Any

import requests

from haystack.preview import Document, component, default_to_dict, ComponentError

logger = logging.getLogger(__name__)


SEARCHAPI_BASE_URL = "https://www.searchapi.io/api/v1/search"


class SearchApiError(ComponentError):
    """Raised when a request to the SearchApi API fails or returns an error response."""

    ...


@component
class SearchApiWebSearch:
    """
    Search engine using SearchApi API. Given a query, it returns a list of URLs that are the most relevant.

    See the [SearchApi website](https://www.searchapi.io/) for more details.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        top_k: Optional[int] = 10,
        allowed_domains: Optional[List[str]] = None,
        search_params: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for the SearchApi API. It can be
        explicitly provided or automatically read from the
        environment variable SEARCHAPI_API_KEY (recommended).
        :param top_k: Number of documents to return.
        :param allowed_domains: List of domains to limit the search to.
        :param search_params: Additional parameters passed to the SearchApi API.
        For example, you can set 'num' to 100 to increase the number of search results.
        See the [SearchApi website](https://www.searchapi.io/) for more details.
        :raises ValueError: If no API key is provided and SEARCHAPI_API_KEY is not set.
        """
        if api_key is None:
            try:
                api_key = os.environ["SEARCHAPI_API_KEY"]
            except KeyError as e:
                raise ValueError(
                    "SearchApiWebSearch expects an API key. "
                    "Set the SEARCHAPI_API_KEY environment variable (recommended) or pass it explicitly."
                ) from e
        self.api_key = api_key
        self.top_k = top_k
        self.allowed_domains = allowed_domains
        self.search_params = search_params or {}

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize this component to a dictionary.

        The API key is deliberately not serialized so that secrets never end up in saved pipelines.
        """
        return default_to_dict(
            self, top_k=self.top_k, allowed_domains=self.allowed_domains, search_params=self.search_params
        )

    @component.output_types(documents=List[Document], links=List[str])
    def run(self, query: str):
        """
        Search the SearchApi API for the given query and return the results as a list of Documents and a list of links.

        :param query: Query string.
        :raises TimeoutError: If the request to SearchApi times out.
        :raises SearchApiError: If the request fails or SearchApi returns an error response.
        """
        # Restrict the search to the allowed domains (if any) by prepending "site:" filters to the query.
        query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""

        # Query parameters must be passed as a dict so `requests` URL-encodes them correctly.
        # A `json.dumps` string here would be sent as a raw, malformed query string.
        params = {"q": query_prepend + " " + query, **self.search_params}
        headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "Haystack"}

        try:
            response = requests.get(SEARCHAPI_BASE_URL, headers=headers, params=params, timeout=90)
            response.raise_for_status()  # Will raise an HTTPError for bad responses
        except requests.Timeout as error:
            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error

        except requests.RequestException as e:
            raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

        # Request succeeded
        json_result = response.json()

        # organic results are the main results from the search engine
        organic_results = []
        if "organic_results" in json_result:
            for result in json_result["organic_results"]:
                organic_results.append(
                    Document.from_dict({"title": result["title"], "content": result["snippet"], "link": result["link"]})
                )

        # answer box has a direct answer to the query
        answer_box = []
        if "answer_box" in json_result:
            answer_box = [
                Document.from_dict(
                    {
                        "title": json_result["answer_box"].get("title", ""),
                        "content": json_result["answer_box"].get("answer", ""),
                        "link": json_result["answer_box"].get("link", ""),
                    }
                )
            ]

        knowledge_graph = []
        if "knowledge_graph" in json_result:
            knowledge_graph = [
                Document.from_dict(
                    {
                        "title": json_result["knowledge_graph"].get("title", ""),
                        "content": json_result["knowledge_graph"].get("description", ""),
                    }
                )
            ]

        related_questions = []
        if "related_questions" in json_result:
            for result in json_result["related_questions"]:
                related_questions.append(
                    Document.from_dict(
                        {
                            "title": result["question"],
                            "content": result["answer"] if result.get("answer") else result.get("answer_highlight", ""),
                            "link": result.get("source", {}).get("link", ""),
                        }
                    )
                )

        documents = answer_box + knowledge_graph + organic_results + related_questions

        # "organic_results" may be absent (e.g. an answer-box-only response); avoid a KeyError.
        links = [result["link"] for result in json_result.get("organic_results", [])]

        logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
        return {"documents": documents[: self.top_k], "links": links[: self.top_k]}
2 changes: 1 addition & 1 deletion proposals/text/4084-agent-demo.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ The main Agent modules/tools are:
## SearchEngine

SearchEngine is a symbolic API module allowing programmatic interaction with Google and other search engines. We'll have
multiple providers of SearchEngine including https://serper.dev and https://serpapi.com as initial providers.
multiple providers of SearchEngine including https://serper.dev, https://www.searchapi.io/ and https://serpapi.com as initial providers.

SearchEngine will return a list of results (e.g. List[Document]), the content of each document being a "snippet" of the
single search result, while all other attributes of the search results (e.g. title, url link, etc.) will
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,6 @@ dev = [
"pylint",
"farm-haystack[formatting]",
# Documentation
"pydoc-markdown",
"toml",
"reno",
# dulwich is a reno dependency, they pin it at >=0.15.0 so pip takes ton of time to resolve the dependency tree.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
preview:
- Integrate SearchApi as an additional websearch provider.
2 changes: 1 addition & 1 deletion rest_api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ classifiers = [
]
dependencies = [
"farm-haystack",
"fastapi<0.104.0", # https://github.com/deepset-ai/haystack/issues/6119
"fastapi",
"uvicorn<1",
"python-multipart<1", # optional FastAPI dependency for form data
"pynvml",
Expand Down
29 changes: 29 additions & 0 deletions test/nodes/test_web_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,35 @@ def test_web_search_with_site_keyword():
), "Some documents are not from the specified sites lifewire.com or nasa.gov."


@pytest.mark.skipif(
    not os.environ.get("SEARCHAPI_API_KEY", None),
    reason="Please export an env var called SEARCHAPI_API_KEY containing the searchapi.io API key to run this test.",
)
@pytest.mark.integration
def test_web_search_with_searchapi():
    # Run a plain query through the SearchApi provider and check we get Documents back.
    web_search = WebSearch(api_key=os.environ.get("SEARCHAPI_API_KEY", None), search_engine_provider="SearchApi")
    result, _ = web_search.run(query="What is the hometown of the reigning men's U.S. Open champion?")
    assert "documents" in result
    documents = result["documents"]
    assert len(documents) > 0
    assert isinstance(documents[0], Document)


@pytest.mark.skipif(
    not os.environ.get("SEARCHAPI_API_KEY", None),
    reason="Please export an env var called SEARCHAPI_API_KEY containing the searchapi.io API key to run this test.",
)
@pytest.mark.integration
def test_web_search_with_searchapi_with_site_keyword():
    # A query with explicit site: filters should only yield documents from those domains.
    web_search = WebSearch(api_key=os.environ.get("SEARCHAPI_API_KEY", None), search_engine_provider="SearchApi")
    result, _ = web_search.run(query='site:openai.com OR site:langchain.com "Agent types"')
    assert "documents" in result
    documents = result["documents"]
    assert len(documents) > 0
    assert isinstance(documents[0], Document)
    from_allowed_sites = ("langchain" in doc.meta["link"] or "openai" in doc.meta["link"] for doc in documents)
    assert all(from_allowed_sites), "Some documents are not from the specified sites openai.com or langchain.com."


@pytest.mark.unit
def test_web_search_with_google_api_provider():
if not googleapi_installed:
Expand Down
Loading
Loading