From 2cadee5ef6bc84fd17644712dc28d00399051b5f Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Tue, 7 May 2024 13:45:47 +0200 Subject: [PATCH] chore: remove fetch_from_http function (#7657) * remove fetch_from_http function * remove unneeded tests and workflow * fix linting * try * ignore allthethings --- .github/workflows/examples_tests.yml | 81 ------------------------ README.md | 84 ++++++------------------- examples/basic_faq_pipeline.py | 76 ----------------------- examples/basic_qa_pipeline.py | 79 ------------------------ examples/getting_started.py | 41 ------------- examples/hybrid_search_faq_pipeline.py | 85 -------------------------- examples/test_basic_faq_pipeline.py | 19 ------ examples/test_basic_qa_pipeline.py | 23 ------- examples/test_getting_started.py | 26 -------- haystack/nodes/file_converter/docx.py | 4 +- haystack/utils/__init__.py | 2 - haystack/utils/getting_started.py | 85 -------------------------- haystack/utils/import_utils.py | 62 +------------------ 13 files changed, 20 insertions(+), 647 deletions(-) delete mode 100644 .github/workflows/examples_tests.yml delete mode 100644 examples/basic_faq_pipeline.py delete mode 100644 examples/basic_qa_pipeline.py delete mode 100644 examples/getting_started.py delete mode 100644 examples/hybrid_search_faq_pipeline.py delete mode 100644 examples/test_basic_faq_pipeline.py delete mode 100644 examples/test_basic_qa_pipeline.py delete mode 100644 examples/test_getting_started.py delete mode 100644 haystack/utils/getting_started.py diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml deleted file mode 100644 index 2ec252b352..0000000000 --- a/.github/workflows/examples_tests.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: Examples tests - -on: - workflow_dispatch: # Activate this workflow manually - push: - branches: - - main - pull_request: - paths: - - "examples/**" - types: - - opened - - reopened - - synchronize - - ready_for_review - -env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }} - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - HUGGINGFACE_API_KEY: ${{ secrets.HUGGINGFACE_API_KEY }} - PYTHON_VERSION: "3.8" - -jobs: - tests: - name: Examples - runs-on: ubuntu-latest - services: - elasticsearch: - image: elasticsearch:7.17.6 - env: - discovery.type: "single-node" - ES_JAVA_OPTS: "-Xms128m -Xmx256m" - ports: - - 9200:9200 - - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: ${{ env.PYTHON_VERSION }} - - - name: Install Haystack - run: | - pip install --upgrade pip - pip install .[inference,dev,elasticsearch,preprocessing,file-conversion] - - - name: Run - run: pytest examples/ - - - name: Calculate alert data - id: calculator - if: (success() || failure()) && github.ref_name == 'main' - shell: bash - run: | - if [ "${{ job.status }}" = "success" ]; then - echo "alert_type=success" >> "$GITHUB_OUTPUT"; - else - echo "alert_type=error" >> "$GITHUB_OUTPUT"; - fi - - - name: Send event to Datadog - if: (success() || failure()) && github.ref_name == 'main' - uses: masci/datadog@v1 - with: - api-key: ${{ secrets.CORE_DATADOG_API_KEY }} - api-url: https://api.datadoghq.eu - events: | - - title: "${{ github.workflow }} workflow" - text: "Job ${{ github.job }} in branch ${{ github.ref_name }}" - alert_type: "${{ steps.calculator.outputs.alert_type }}" - source_type_name: "Github" - host: ${{ github.repository_owner }} - tags: - - "project:${{ github.repository }}" - - "job:${{ 
github.job }}" - - "run_id:${{ github.run_id }}" - - "workflow:${{ github.workflow }}" - - "branch:${{ github.ref_name }}" - - "url:https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" diff --git a/README.md b/README.md index 119515107e..71547244ef 100644 --- a/README.md +++ b/README.md @@ -1,66 +1,16 @@
Haystack -| | | -| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| | | +| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | CI/CD | [![Tests](https://github.com/deepset-ai/haystack/actions/workflows/tests.yml/badge.svg)](https://github.com/deepset-ai/haystack/actions/workflows/tests.yml) [![Docker image release](https://github.com/deepset-ai/haystack/actions/workflows/docker_release.yml/badge.svg)](https://github.com/deepset-ai/haystack/actions/workflows/docker_release.yml) [![Schemas](https://github.com/deepset-ai/haystack/actions/workflows/schemas.yml/badge.svg)](https://github.com/deepset-ai/haystack/actions/workflows/schemas.yml) [![code style - Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![types - Mypy](https://img.shields.io/badge/types-Mypy-blue.svg)](https://github.com/python/mypy) [![Coverage Status](https://coveralls.io/repos/github/deepset-ai/haystack/badge.svg?branch=main)](https://coveralls.io/github/deepset-ai/haystack?branch=main) | -| Docs | [![Sync docs with Readme](https://github.com/deepset-ai/haystack/actions/workflows/readme_sync.yml/badge.svg)](https://github.com/deepset-ai/haystack/actions/workflows/readme_sync.yml) [![Website](https://img.shields.io/website?label=documentation&up_message=online&url=https%3A%2F%2Fdocs.haystack.deepset.ai)](https://docs.haystack.deepset.ai) | -| Package | [![PyPI](https://img.shields.io/pypi/v/farm-haystack)](https://pypi.org/project/farm-haystack/) ![PyPI - Downloads](https://img.shields.io/pypi/dm/farm-haystack?color=blue&logo=pypi&logoColor=gold) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/farm-haystack?logo=python&logoColor=gold) [![GitHub](https://img.shields.io/github/license/deepset-ai/haystack?color=blue)](LICENSE) [![License Compliance](https://github.com/deepset-ai/haystack/actions/workflows/license_compliance.yml/badge.svg)](https://github.com/deepset-ai/haystack/actions/workflows/license_compliance.yml) | -| Meta | 
[![Discord](https://img.shields.io/discord/993534733298450452?logo=discord)](https://discord.gg/haystack) [![Twitter Follow](https://img.shields.io/twitter/follow/haystack_ai)](https://twitter.com/haystack_ai) | +| Docs | [![Sync docs with Readme](https://github.com/deepset-ai/haystack/actions/workflows/readme_sync.yml/badge.svg)](https://github.com/deepset-ai/haystack/actions/workflows/readme_sync.yml) [![Website](https://img.shields.io/website?label=documentation&up_message=online&url=https%3A%2F%2Fdocs.haystack.deepset.ai)](https://docs.haystack.deepset.ai) | +| Package | [![PyPI](https://img.shields.io/pypi/v/farm-haystack)](https://pypi.org/project/farm-haystack/) ![PyPI - Downloads](https://img.shields.io/pypi/dm/farm-haystack?color=blue&logo=pypi&logoColor=gold) ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/farm-haystack?logo=python&logoColor=gold) [![GitHub](https://img.shields.io/github/license/deepset-ai/haystack?color=blue)](LICENSE) [![License Compliance](https://github.com/deepset-ai/haystack/actions/workflows/license_compliance.yml/badge.svg)](https://github.com/deepset-ai/haystack/actions/workflows/license_compliance.yml) | +| Meta | [![Discord](https://img.shields.io/discord/993534733298450452?logo=discord)](https://discord.gg/haystack) [![Twitter Follow](https://img.shields.io/twitter/follow/haystack_ai)](https://twitter.com/haystack_ai) |
[Haystack](https://haystack.deepset.ai/) is an end-to-end NLP framework that enables you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform question answering, answer generation, semantic document search, or build tools that are capable of complex decision-making and query resolution, you can use state-of-the-art NLP models with Haystack to build end-to-end NLP applications to solve your use case. -## Quickstart - -Haystack is built around the concept of pipelines. A pipeline is a powerful structure that performs an NLP task. It's made up of components connected together. For example, you can connect a `Retriever` and a `PromptNode` to build a Generative Question Answering pipeline that uses your own data. - -Try out how Haystack answers questions about Game of Thrones using the Retrieval Augmented Generation (RAG) approach πŸ‘‡ - -First, run the minimal Haystack installation: - -```sh -pip install farm-haystack -``` - -Then, index your data to the DocumentStore, build a RAG pipeline, and ask a question on your data: - -```python -from haystack.document_stores import InMemoryDocumentStore -from haystack.utils import build_pipeline, add_example_data, print_answers - -# We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai". -provider = "openai" -API_KEY = "sk-..." # ADD YOUR KEY HERE - -# We support many different databases. Here, we load a simple and lightweight in-memory database. -document_store = InMemoryDocumentStore(use_bm25=True) - -# Download and add Game of Thrones TXT articles to Haystack DocumentStore. -# You can also provide a folder with your local documents. -add_example_data(document_store, "data/GoT_getting_started") - -# Build a pipeline with a Retriever to get relevant documents to the query and a PromptNode interacting with LLMs using a custom prompt. -pipeline = build_pipeline(provider, API_KEY, document_store) - -# Ask a question on the data you just added. -result = pipeline.run(query="Who is the father of Arya Stark?") - -# For details, like which documents were used to generate the answer, look into the object -print_answers(result, details="medium") -``` - -The output of the pipeline will reference the documents used to generate the answer: - -``` -'Query: Who is the father of Arya Stark?' -'Answers:' -[{'answer': 'The father of Arya Stark is Lord Eddard Stark of ' - 'Winterfell. [Document 1, Document 4, Document 5]'}] -``` - -Congratulations, you have just built your first Haystack app! - ## Core Concepts πŸƒβ€β™€οΈ **[Pipelines](https://docs.haystack.deepset.ai/docs/pipelines):** This is the standard Haystack structure that builds on top of your data to perform various NLP tasks such as retrieval augmented generation, question answering and more. The data in a Pipeline flows from one Node to the next. You define how Nodes interact with each other and how one Node pushes data to the next. @@ -95,19 +45,19 @@ An example pipeline would consist of one `Retriever` Node and one `PromptNode`. - **Continuous Learning**: Collect new training data from user feedback in production & improve your models continuously. 
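For reference, the Quickstart removed above assembled these concepts through the `build_pipeline` helper, which this patch also deletes from `haystack/utils/getting_started.py` (see further down). A minimal sketch of the equivalent explicit pipeline, condensed from that deleted helper — the model name and API key are placeholders to adapt to your setup:

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import AnswerParser, BM25Retriever, PromptNode, PromptTemplate
from haystack.pipelines import Pipeline

# Lightweight in-memory store; write your own documents here before querying.
document_store = InMemoryDocumentStore(use_bm25=True)

# A retriever selects the right documents when given a question.
retriever = BM25Retriever(document_store=document_store, top_k=5)

# Prompt that instructs the LLM to answer with document references.
template = PromptTemplate(
    prompt="deepset/question-answering-with-references",
    output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]"),
)
prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo-0301",  # placeholder; any supported provider model
    api_key="ADD KEY HERE",                   # placeholder
    default_prompt_template=template,
)

# Wire the Retriever and PromptNode into a query pipeline, then ask a question.
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])
result = pipeline.run(query="Who is the father of Arya Stark?")
```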
## Resources -| | | -| ---------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| πŸ“’ [Docs](https://docs.haystack.deepset.ai) | Components, Pipeline Nodes, Guides, API Reference | -| πŸ’Ύ [Installation](https://github.com/deepset-ai/haystack#-installation) | How to install Haystack | -| πŸŽ“ [Tutorials](https://haystack.deepset.ai/tutorials) | See what Haystack can do with our Notebooks & Scripts | -| πŸŽ‰Β [Haystack Extras](https://github.com/deepset-ai/haystack-extras) | A repository that lists extra Haystack packages and components that can be installed separately. | -| πŸ”° [Demos](https://github.com/deepset-ai/haystack-demos) | A repository containing Haystack demo applications with Docker Compose and a REST API | +| | | +| ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| πŸ“’ [Docs](https://docs.haystack.deepset.ai) | Components, Pipeline Nodes, Guides, API Reference | +| πŸ’Ύ [Installation](https://github.com/deepset-ai/haystack#-installation) | How to install Haystack | +| πŸŽ“ [Tutorials](https://haystack.deepset.ai/tutorials) | See what Haystack can do with our Notebooks & Scripts | +| πŸŽ‰Β [Haystack Extras](https://github.com/deepset-ai/haystack-extras) | A repository that lists extra Haystack packages and components that can be installed separately. | +| πŸ”° [Demos](https://github.com/deepset-ai/haystack-demos) | A repository containing Haystack demo applications with Docker Compose and a REST API | | πŸ–– [Community](https://github.com/deepset-ai/haystack#-community) | [Discord](https://discord.gg/haystack), [𝕏 (Twitter)](https://twitter.com/haystack_ai), [Stack Overflow](https://stackoverflow.com/questions/tagged/haystack), [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | -| πŸ’™ [Contributing](https://github.com/deepset-ai/haystack#-contributing) | We welcome all contributions! | -| πŸ“Š [Benchmarks](https://haystack.deepset.ai/benchmarks/) | Speed & Accuracy of Retriever, Readers and DocumentStores | -| πŸ”­ [Roadmap](https://haystack.deepset.ai/overview/roadmap) | Public roadmap of Haystack | -| πŸ“° [Blog](https://haystack.deepset.ai/blog) | Learn about the latest with Haystack and NLP | -| ☎️ [Jobs](https://www.deepset.ai/jobs) | We're hiring! Have a look at our open positions | +| πŸ’™ [Contributing](https://github.com/deepset-ai/haystack#-contributing) | We welcome all contributions! | +| πŸ“Š [Benchmarks](https://haystack.deepset.ai/benchmarks/) | Speed & Accuracy of Retriever, Readers and DocumentStores | +| πŸ”­ [Roadmap](https://haystack.deepset.ai/overview/roadmap) | Public roadmap of Haystack | +| πŸ“° [Blog](https://haystack.deepset.ai/blog) | Learn about the latest with Haystack and NLP | +| ☎️ [Jobs](https://www.deepset.ai/jobs) | We're hiring! 
Have a look at our open positions | ## πŸ’Ύ Installation diff --git a/examples/basic_faq_pipeline.py b/examples/basic_faq_pipeline.py deleted file mode 100644 index e198ca5367..0000000000 --- a/examples/basic_faq_pipeline.py +++ /dev/null @@ -1,76 +0,0 @@ -# Disable pylint errors for logging basicConfig -# pylint: disable=no-logging-basicconfig -import logging - -import pandas as pd - -from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import EmbeddingRetriever -from haystack.nodes.other.docs2answers import Docs2Answers -from haystack.pipelines import Pipeline -from haystack.utils import fetch_archive_from_http, launch_es, print_answers - -logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) -logging.getLogger("haystack").setLevel(logging.INFO) - - -def basic_faq_pipeline(): - document_store = ElasticsearchDocumentStore( - host="localhost", - username="", - password="", - index="example-document", - embedding_field="question_emb", - embedding_dim=384, - excluded_meta_data=["question_emb"], - similarity="cosine", - ) - - retriever = EmbeddingRetriever( - document_store=document_store, - embedding_model="sentence-transformers/all-MiniLM-L6-v2", - use_gpu=True, - scale_score=False, - ) - - doc_to_answers = Docs2Answers() - - doc_dir = "data/basic_faq_pipeline" - s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/small_faq_covid.csv1.zip" - fetch_archive_from_http(url=s3_url, output_dir=doc_dir) - - df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv") - - # Minimal cleaning - df.fillna(value="", inplace=True) - df["question"] = df["question"].apply(lambda x: x.strip()) - print(df.head()) - - # Get embeddings for our questions from the FAQs - questions = list(df["question"].values) - df["question_emb"] = retriever.embed_queries(queries=questions).tolist() - df = df.rename(columns={"question": "content"}) - - # Convert Dataframe to list of dicts and index them in our DocumentStore - docs_to_index = df.to_dict(orient="records") - document_store.write_documents(docs_to_index) - document_store.update_embeddings(retriever) - - # Initialize a Pipeline (this time without a reader) and ask questions - pipeline = Pipeline() - pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"]) - pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["Retriever"]) - - # Ask a question - prediction = pipeline.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}}) - - print_answers(prediction, details="medium") - - # Remove the index once we're done to save space - document_store.delete_index(index="example-document") - return prediction - - -if __name__ == "__main__": - launch_es() - basic_faq_pipeline() diff --git a/examples/basic_qa_pipeline.py b/examples/basic_qa_pipeline.py deleted file mode 100644 index 97988627ee..0000000000 --- a/examples/basic_qa_pipeline.py +++ /dev/null @@ -1,79 +0,0 @@ -# Disable pylint errors for logging basicConfig -# pylint: disable=no-logging-basicconfig -import logging -from pathlib import Path - -from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import BM25Retriever, FARMReader -from haystack.nodes.file_classifier import FileTypeClassifier -from haystack.nodes.file_converter import TextConverter -from haystack.nodes.preprocessor import PreProcessor -from haystack.pipelines import Pipeline -from haystack.utils import fetch_archive_from_http, launch_es, print_answers - 
-logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) -logging.getLogger("haystack").setLevel(logging.INFO) - - -def basic_qa_pipeline(): - # Initialize a DocumentStore - document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="example-document") - - # fetch, pre-process and write documents - doc_dir = "data/basic_qa_pipeline" - s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/wiki_gameofthrones_txt1.zip" - fetch_archive_from_http(url=s3_url, output_dir=doc_dir) - - file_paths = [p for p in Path(doc_dir).glob("**/*")] - files_metadata = [{"name": path.name} for path in file_paths] - - # Indexing Pipeline - indexing_pipeline = Pipeline() - - # Makes sure the file is a TXT file (FileTypeClassifier node) - classifier = FileTypeClassifier() - indexing_pipeline.add_node(classifier, name="Classifier", inputs=["File"]) - - # Converts a file into text and performs basic cleaning (TextConverter node) - text_converter = TextConverter(remove_numeric_tables=True) - indexing_pipeline.add_node(text_converter, name="Text_converter", inputs=["Classifier.output_1"]) - - # - Pre-processes the text by performing splits and adding metadata to the text (Preprocessor node) - preprocessor = PreProcessor( - clean_whitespace=True, - clean_empty_lines=True, - split_length=100, - split_overlap=50, - split_respect_sentence_boundary=True, - ) - indexing_pipeline.add_node(preprocessor, name="Preprocessor", inputs=["Text_converter"]) - - # - Writes the resulting documents into the document store - indexing_pipeline.add_node(document_store, name="Document_Store", inputs=["Preprocessor"]) - - # Then we run it with the documents and their metadata as input - indexing_pipeline.run(file_paths=file_paths, meta=files_metadata) - - # Initialize Retriever & Reader - retriever = BM25Retriever(document_store=document_store) - reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) - - # Query Pipeline - pipeline = Pipeline() - pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"]) - pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"]) - - prediction = pipeline.run( - query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} - ) - - print_answers(prediction, details="minimum") - - # Remove the index once we're done to save space - document_store.delete_index(index="example-document") - return prediction - - -if __name__ == "__main__": - launch_es() - basic_qa_pipeline() diff --git a/examples/getting_started.py b/examples/getting_started.py deleted file mode 100644 index afe76ffa20..0000000000 --- a/examples/getting_started.py +++ /dev/null @@ -1,41 +0,0 @@ -import logging - -from typing import Optional - -from haystack.document_stores import InMemoryDocumentStore -from haystack.utils import build_pipeline, add_example_data, print_answers - -logger = logging.getLogger(__name__) - - -def getting_started(provider, API_KEY, API_BASE: Optional[str] = None): - """ - This getting_started example shows you how to use LLMs with your data with a technique called Retrieval Augmented Generation - RAG. - - :param provider: We are model agnostic :) Here, you can choose from: "anthropic", "cohere", "huggingface", and "openai". - :param API_KEY: The API key matching the provider. - :param API_BASE: The URL to use for a custom endpoint, e.g., if using LM Studio. Only openai provider supported. 
/v1 at the end is needed (e.g., http://localhost:1234/v1) - - """ - # We support many different databases. Here we load a simple and lightweight in-memory database. - document_store = InMemoryDocumentStore(use_bm25=True) - - # Pipelines are the main abstraction in Haystack, they connect components like LLMs and databases. - pipeline = build_pipeline(provider, API_KEY, API_BASE, document_store) - - # Download and add Game of Thrones TXT articles to Haystack's database. - # You can also provide a folder with your local documents. - # You might need to install additional dependencies - look inside the function for more information. - add_example_data(document_store, "data/GoT_getting_started") - - # Ask a question on the data you just added. - result = pipeline.run(query="Who is the father of Arya Stark?", debug=True) - - # For details such as which documents were used to generate the answer, look into the object. - print_answers(result, details="medium") - return result - - -if __name__ == "__main__": - # getting_started(provider="openai", API_KEY="NOT NEEDED", API_BASE="http://192.168.1.100:1234/v1") - getting_started(provider="openai", API_KEY="ADD KEY HERE") diff --git a/examples/hybrid_search_faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py deleted file mode 100644 index d4fcba6cf0..0000000000 --- a/examples/hybrid_search_faq_pipeline.py +++ /dev/null @@ -1,85 +0,0 @@ -# import logging - -import pandas as pd - -from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import EmbeddingRetriever, BM25Retriever, JoinDocuments, SentenceTransformersRanker -from haystack.nodes.other.docs2answers import Docs2Answers -from haystack.utils import launch_es, print_answers, fetch_archive_from_http -from haystack.pipelines import Pipeline - -# logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) -# logging.getLogger("haystack").setLevel(logging.INFO) - - -def hybrid_search_faq_pipeline(): - document_store = ElasticsearchDocumentStore( - host="localhost", - username="", - password="", - index="document", - embedding_field="question_emb", - embedding_dim=384, - excluded_meta_data=["question_emb"], - similarity="cosine", - ) - - sparse_retriever = BM25Retriever(document_store=document_store) - dense_retriever = EmbeddingRetriever( - document_store=document_store, - embedding_model="sentence-transformers/all-MiniLM-L6-v2", - use_gpu=True, - scale_score=False, - ) - join_documents = JoinDocuments(join_mode="reciprocal_rank_fusion") - rerank = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2") - - doc_to_answers = Docs2Answers() - - doc_dir = "data/basic_faq_pipeline" - s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/small_faq_covid.csv1.zip" - fetch_archive_from_http(url=s3_url, output_dir=doc_dir) - - df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv") - - # Minimal cleaning - df.fillna(value="", inplace=True) - df["question"] = df["question"].apply(lambda x: x.strip()) - print(df.head()) - - # Get embeddings for our questions from the FAQs - questions = list(df["question"].values) - df["question_emb"] = dense_retriever.embed_queries(queries=questions).tolist() - df = df.rename(columns={"question": "content"}) - - # Convert Dataframe to list of dicts and index them in our DocumentStore - docs_to_index = df.to_dict(orient="records") - document_store.write_documents(docs_to_index) - document_store.update_embeddings(retriever=dense_retriever) - - # Initialize a Pipeline 
(this time without a reader) and ask questions - pipeline = Pipeline() - pipeline.add_node(component=sparse_retriever, name="SparseRetriever", inputs=["Query"]) - pipeline.add_node(component=dense_retriever, name="DenseRetriever", inputs=["Query"]) - pipeline.add_node(component=join_documents, name="JoinDocuments", inputs=["SparseRetriever", "DenseRetriever"]) - pipeline.add_node(component=rerank, name="ReRanker", inputs=["JoinDocuments"]) - pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["ReRanker"]) - - # Ask a question - prediction = pipeline.run( - query="How is the virus spreading?", - params={ - "SparseRetriever": {"top_k": 10}, - "DenseRetriever": {"top_k": 10}, - "JoinDocuments": {"top_k_join": 15}, - "ReRanker": {"top_k": 5}, - }, - ) - - print_answers(prediction, details="medium") - return prediction - - -if __name__ == "__main__": - launch_es() - hybrid_search_faq_pipeline() diff --git a/examples/test_basic_faq_pipeline.py b/examples/test_basic_faq_pipeline.py deleted file mode 100644 index b637ad7223..0000000000 --- a/examples/test_basic_faq_pipeline.py +++ /dev/null @@ -1,19 +0,0 @@ -from examples.basic_faq_pipeline import basic_faq_pipeline - -from haystack.schema import Answer - - -def test_basic_faq_pipeline(): - prediction = basic_faq_pipeline() - - assert prediction is not None - assert prediction["query"] == "How is the virus spreading?" - - assert len(prediction["answers"]) == 10 # top-k of Retriever - assert type(prediction["answers"][0]) == Answer - assert ( - prediction["answers"][0].answer - == """This virus was first detected in Wuhan City, Hubei Province, China. The first infections were linked to a live animal market, but the virus is now spreading from person-to-person. It’s important to note that person-to-person spread can happen on a continuum. Some viruses are highly contagious (like measles), while other viruses are less so.\n\nThe virus that causes COVID-19 seems to be spreading easily and sustainably in the community (β€œcommunity spread”) in some affected geographic areas. Community spread means people have been infected with the virus in an area, including some who are not sure how or where they became infected.\n\nLearn what is known about the spread of newly emerged coronaviruses.""" - ) - assert prediction["answers"][0].score <= 1 - assert prediction["answers"][0].score >= 0 diff --git a/examples/test_basic_qa_pipeline.py b/examples/test_basic_qa_pipeline.py deleted file mode 100644 index d538979822..0000000000 --- a/examples/test_basic_qa_pipeline.py +++ /dev/null @@ -1,23 +0,0 @@ -from examples.basic_qa_pipeline import basic_qa_pipeline - -from haystack.schema import Answer, Document - - -def test_basic_qa_pipeline(): - prediction = basic_qa_pipeline() - - assert prediction is not None - assert prediction["query"] == "Who is the father of Arya Stark?" 
- - assert len(prediction["answers"]) == 5 # top-k of Reader - assert type(prediction["answers"][0]) == Answer - assert prediction["answers"][0].answer == "Ned" - assert prediction["answers"][0].score <= 1 - assert prediction["answers"][0].score >= 0 - assert prediction["answers"][0].meta["name"] == "43_Arya_Stark.txt" - - assert len(prediction["documents"]) == 10 # top-k of Retriever - assert type(prediction["documents"][0]) == Document - assert prediction["documents"][0].score <= 1 - assert prediction["documents"][0].score >= 0 - assert prediction["documents"][0].meta["name"] == "450_Baelor.txt" diff --git a/examples/test_getting_started.py b/examples/test_getting_started.py deleted file mode 100644 index ee4b99aa98..0000000000 --- a/examples/test_getting_started.py +++ /dev/null @@ -1,26 +0,0 @@ -import os - -import pytest - -from examples.getting_started import getting_started -from haystack.schema import Answer, Document - - -@pytest.mark.parametrize("provider", ["cohere", "huggingface", "openai"]) -def test_getting_started(provider): - if provider == "anthropic": - api_key = os.environ.get("ANTHROPIC_API_KEY", "") - elif provider == "cohere": - api_key = os.environ.get("COHERE_API_KEY", "") - elif provider == "huggingface": - api_key = os.environ.get("HUGGINGFACE_API_KEY", "") - elif provider == "openai": - api_key = os.environ.get("OPENAI_API_KEY", "") - - if api_key: - result = getting_started(provider=provider, API_KEY=api_key) - - # Testing only for functionality. Since model predictions from APIs might change, we cannot test those directly. - assert isinstance(result, dict) - assert type(result["answers"][0]) == Answer - assert type(result["documents"][0]) == Document diff --git a/haystack/nodes/file_converter/docx.py b/haystack/nodes/file_converter/docx.py index ae59f13919..eb3ff07933 100644 --- a/haystack/nodes/file_converter/docx.py +++ b/haystack/nodes/file_converter/docx.py @@ -74,8 +74,8 @@ def convert( if id_hash_keys is None: id_hash_keys = self.id_hash_keys - file = docx.Document(file_path) # Creating word reader object. 
- paragraphs = [para.text for para in file.paragraphs] + file = docx.Document(file_path) # type: ignore + paragraphs = [para.text for para in file.paragraphs] # type: ignore text = "\n".join(paragraphs) document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys) return [document] diff --git a/haystack/utils/__init__.py b/haystack/utils/__init__.py index 15a9794082..6052262021 100644 --- a/haystack/utils/__init__.py +++ b/haystack/utils/__init__.py @@ -1,7 +1,6 @@ from haystack.utils.reflection import args_to_kwargs from haystack.utils.requests_utils import request_with_retry from haystack.utils.preprocessing import convert_files_to_docs, tika_convert_files_to_docs -from haystack.utils.import_utils import fetch_archive_from_http from haystack.utils.cleaning import clean_wiki_text from haystack.utils.doc_store import launch_es, launch_opensearch, launch_weaviate, stop_opensearch, stop_service from haystack.utils.deepsetcloud import DeepsetCloud, DeepsetCloudError, DeepsetCloudExperiments @@ -24,4 +23,3 @@ from haystack.utils.early_stopping import EarlyStopping from haystack.utils.labels import aggregate_labels from haystack.utils.batching import get_batches_from_generator -from haystack.utils.getting_started import build_pipeline, add_example_data diff --git a/haystack/utils/getting_started.py b/haystack/utils/getting_started.py deleted file mode 100644 index b20c3dc539..0000000000 --- a/haystack/utils/getting_started.py +++ /dev/null @@ -1,85 +0,0 @@ -import logging -import os - -from haystack.utils import convert_files_to_docs -from haystack.utils import fetch_archive_from_http - -logger = logging.getLogger(__name__) - - -def build_pipeline(provider, API_KEY, API_BASE, document_store): - # Importing top-level causes a circular import - from haystack.nodes import AnswerParser, PromptNode, PromptTemplate, BM25Retriever - from haystack.pipelines import Pipeline - - provider = provider.lower() - # A retriever selects the right documents when given a question. - retriever = BM25Retriever(document_store=document_store, top_k=5) - # Load prompt for doing retrieval augmented generation from https://prompthub.deepset.ai/?prompt=deepset%2Fquestion-answering-with-references - question_answering_with_references = PromptTemplate( - prompt="deepset/question-answering-with-references", - output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]"), - ) - # Load the LLM model - if provider == "anthropic": - prompt_node = PromptNode( - model_name_or_path="claude-2", api_key=API_KEY, default_prompt_template=question_answering_with_references - ) - elif provider == "cohere": - prompt_node = PromptNode( - model_name_or_path="command", api_key=API_KEY, default_prompt_template=question_answering_with_references - ) - elif provider == "huggingface": - # TODO: swap out for meta-llama/Llama-2-7b-chat-hf or the 40b model once supported in Haystack+HF API free tier - # The tiiuae/falcon-7b-instruct model cannot handle a complex prompt with references, so we use a very simple one - simple_QA = PromptTemplate( - prompt="deepset/question-answering", output_parser=AnswerParser(reference_pattern=r"Document\[(\d+)\]") - ) - prompt_node = PromptNode( - model_name_or_path="tiiuae/falcon-7b-instruct", api_key=API_KEY, default_prompt_template=simple_QA - ) - elif provider == "openai": - prompt_node = PromptNode( - model_name_or_path="gpt-3.5-turbo-0301", - api_key=API_KEY, - api_base=API_BASE, - default_prompt_template=question_answering_with_references, - ) - else: - logger.error('Given unknown. 
Please use any of "anthropic", "cohere", "huggingface", or "openai"') - # Compose the query pipeline - query_pipeline = Pipeline() - query_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"]) - query_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"]) - - return query_pipeline - - -def add_example_data(document_store, dir): - # Importing top-level causes a circular import - from haystack.nodes import TextConverter, PreProcessor - - if dir == "data/GoT_getting_started": - # Download and add Game of Thrones TXT files - fetch_archive_from_http( - url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt.zip", - output_dir=dir, - ) - files_to_index = [dir + "/" + f for f in os.listdir(dir)] - converter = TextConverter(remove_numeric_tables=True) - docs = [converter.convert(file_path=file, meta=None)[0] for file in files_to_index] - else: - # Here you can add a local folder with your files(.txt, .pdf, .docx). - # You might need to install additional packages with "pip install farm-haystack[ocr,preprocessing,file-conversion,pdf]". - # For more details, see: https://haystack.deepset.ai/tutorials/08_preprocessing. - # Be aware that some of your data will be sent to external APIs if you use this functionality! - files_to_index = [dir + "/" + f for f in os.listdir(dir)] - logger.info("Adding %s number of files from local disk at %s.", len(files_to_index), dir) - docs = convert_files_to_docs(dir_path=dir) - - preprocessor = PreProcessor( - split_by="word", split_length=200, split_overlap=0, split_respect_sentence_boundary=True - ) - docs_processed = preprocessor.process(docs) - - document_store.write_documents(documents=docs_processed) diff --git a/haystack/utils/import_utils.py b/haystack/utils/import_utils.py index a1d537ba50..5038c38bf4 100644 --- a/haystack/utils/import_utils.py +++ b/haystack/utils/import_utils.py @@ -1,17 +1,10 @@ -import io -import gzip -import tarfile -import zipfile import logging import importlib import importlib.util -from pathlib import Path -from typing import Optional, Dict, Union, Tuple, List +from typing import Optional, Tuple, List from urllib.parse import urlparse, unquote from os.path import splitext, basename -import requests - from haystack.errors import DatasetsError from haystack.schema import Document @@ -62,58 +55,5 @@ def get_filename_extension_from_url(url: str) -> Tuple[str, str]: return file_name, archive_extension -def fetch_archive_from_http( - url: str, - output_dir: str, - proxies: Optional[Dict[str, str]] = None, - timeout: Union[float, Tuple[float, float]] = 10.0, -) -> bool: - """ - Fetch an archive (zip, gz or tar.gz) from a url via http and extract content to an output directory. - - :param url: http address - :param output_dir: local path - :param proxies: proxies details as required by requests library - :param timeout: How many seconds to wait for the server to send data before giving up, - as a float, or a :ref:`(connect timeout, read timeout) ` tuple. - Defaults to 10 seconds. - :return: if anything got fetched - """ - # verify & prepare local directory - path = Path(output_dir) - if not path.exists(): - path.mkdir(parents=True) - - is_not_empty = len(list(Path(path).rglob("*"))) > 0 - if is_not_empty: - logger.info("Found data stored in '%s'. 
Delete this first if you really want to fetch new data.", output_dir) - return False - else: - logger.info("Fetching from %s to '%s'", url, output_dir) - - file_name, archive_extension = get_filename_extension_from_url(url) - request_data = requests.get(url, proxies=proxies, timeout=timeout) - - if archive_extension == "zip": - zip_archive = zipfile.ZipFile(io.BytesIO(request_data.content)) - zip_archive.extractall(output_dir) - elif archive_extension == "gz" and not "tar.gz" in url: - gzip_archive = gzip.GzipFile(fileobj=io.BytesIO(request_data.content)) - file_content = gzip_archive.read() - with open(f"{output_dir}/{file_name}", "wb") as file: - file.write(file_content) - elif archive_extension in ["gz", "bz2", "xz"]: - tar_archive = tarfile.open(fileobj=io.BytesIO(request_data.content), mode="r|*") - tar_archive.extractall(output_dir) - else: - logger.warning( - "Skipped url %s as file type is not supported here. " - "See haystack documentation for support of more file types", - url, - ) - - return True - - def is_whisper_available(): return importlib.util.find_spec("whisper") is not None
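For users who depended on the removed `fetch_archive_from_http`, a minimal standalone sketch condensed from the implementation deleted above — it dispatches on the URL suffix instead of the original `get_filename_extension_from_url` helper, and drops the plain-gzip branch and proxy support for brevity:

```python
import io
import tarfile
import zipfile
from pathlib import Path

import requests


def fetch_archive(url: str, output_dir: str, timeout: float = 10.0) -> bool:
    """Fetch a zip or tar archive via HTTP and extract it into output_dir."""
    path = Path(output_dir)
    path.mkdir(parents=True, exist_ok=True)
    if any(path.rglob("*")):
        # Mirror the original behavior: skip the download if data is already present.
        return False
    response = requests.get(url, timeout=timeout)
    if url.endswith(".zip"):
        zipfile.ZipFile(io.BytesIO(response.content)).extractall(output_dir)
    else:
        # Covers tar.gz / tar.bz2 / tar.xz via transparent compression detection.
        with tarfile.open(fileobj=io.BytesIO(response.content), mode="r|*") as tar:
            tar.extractall(output_dir)
    return True
```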