From e9ba36e8a7fdbb8c707116dc678d17b1aefc7efa Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Wed, 26 Jun 2024 15:59:22 +0200 Subject: [PATCH] feat: bring it all together --- astro.config.mjs | 10 ++- .../docs/concepts/caching-and-filtering.md | 5 -- src/content/docs/concepts/extendability.md | 5 -- src/content/docs/concepts/loading-data.md | 5 -- src/content/docs/concepts/storing-results.md | 5 -- .../concepts/streaming-and-concurrency.md | 5 -- src/content/docs/examples/hello-world.md | 71 +++++++++++++++++-- .../architecture-and-design.mdx | 41 +++++++++++ .../docs/getting-started/changelog.mdx | 1 + .../docs/getting-started/feature-flags.mdx | 1 + .../docs/getting-started/installation.mdx | 1 + .../docs/in-depth/caching-and-filtering.md | 31 ++++++++ .../docs/{concepts => in-depth}/chunking.md | 3 + .../introducing-step-by-step.mdx} | 9 ++- src/content/docs/in-depth/loading-data.md | 31 ++++++++ .../docs/in-depth/prompting-embedding.md | 43 +++++++++++ src/content/docs/in-depth/storing-results.md | 39 ++++++++++ .../in-depth/streaming-and-concurrency.mdx | 60 ++++++++++++++++ .../transforming-and-enriching.mdx | 23 +++--- src/content/docs/index.mdx | 65 +++++++++++------ src/content/docs/reference/cache-filters.md | 3 - src/content/docs/reference/embeddings.md | 3 - .../docs/reference/ingestion-pipeline.md | 5 -- src/content/docs/reference/loaders.md | 3 - src/content/docs/reference/persistance.md | 3 - src/content/docs/reference/transformers.md | 3 - src/content/docs/troubleshooting.md | 42 +++++++++++ ...{introduction.mdx => what-is-swiftide.mdx} | 11 +-- 28 files changed, 437 insertions(+), 90 deletions(-) delete mode 100644 src/content/docs/concepts/caching-and-filtering.md delete mode 100644 src/content/docs/concepts/extendability.md delete mode 100644 src/content/docs/concepts/loading-data.md delete mode 100644 src/content/docs/concepts/storing-results.md delete mode 100644 src/content/docs/concepts/streaming-and-concurrency.md create mode 100644 
src/content/docs/getting-started/architecture-and-design.mdx create mode 100644 src/content/docs/in-depth/caching-and-filtering.md rename src/content/docs/{concepts => in-depth}/chunking.md (91%) rename src/content/docs/{concepts/overview.mdx => in-depth/introducing-step-by-step.mdx} (90%) create mode 100644 src/content/docs/in-depth/loading-data.md create mode 100644 src/content/docs/in-depth/prompting-embedding.md create mode 100644 src/content/docs/in-depth/storing-results.md create mode 100644 src/content/docs/in-depth/streaming-and-concurrency.mdx rename src/content/docs/{concepts => in-depth}/transforming-and-enriching.mdx (63%) delete mode 100644 src/content/docs/reference/cache-filters.md delete mode 100644 src/content/docs/reference/embeddings.md delete mode 100644 src/content/docs/reference/ingestion-pipeline.md delete mode 100644 src/content/docs/reference/loaders.md delete mode 100644 src/content/docs/reference/persistance.md delete mode 100644 src/content/docs/reference/transformers.md rename src/content/docs/{introduction.mdx => what-is-swiftide.mdx} (77%) diff --git a/astro.config.mjs b/astro.config.mjs index 5d2a32e..a959e94 100644 --- a/astro.config.mjs +++ b/astro.config.mjs @@ -12,6 +12,9 @@ export default defineConfig({ editLink: { baseUrl: "https://github.com/bosun-ai/swiftide-website/edit/master", }, + tableOfContents: { + minHeadingLevel: 2, + }, customCss: [ // Fontsource files for to regular and semi-bold font weights. 
"@fontsource/fira-code/400.css", @@ -25,11 +28,12 @@ export default defineConfig({ }, social: { github: "https://github.com/bosun-ai/swiftide", + linkedin: "https://www.linkedin.com/company/bosun-ai/", }, sidebar: [ { - label: "Introduction", - link: "/introduction/", + label: "What is swiftide?", + link: "/what-is-swiftide/", }, { label: "Getting Started", @@ -40,7 +44,7 @@ export default defineConfig({ { label: "In depth", autogenerate: { - directory: "concepts", + directory: "in-depth", }, }, { diff --git a/src/content/docs/concepts/caching-and-filtering.md b/src/content/docs/concepts/caching-and-filtering.md deleted file mode 100644 index 4dd8810..0000000 --- a/src/content/docs/concepts/caching-and-filtering.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Caching and filtering nodes -sidebar: - order: 3 ---- diff --git a/src/content/docs/concepts/extendability.md b/src/content/docs/concepts/extendability.md deleted file mode 100644 index db3220d..0000000 --- a/src/content/docs/concepts/extendability.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Extending Swiftide -sidebar: - order: 7 ---- diff --git a/src/content/docs/concepts/loading-data.md b/src/content/docs/concepts/loading-data.md deleted file mode 100644 index 6e6eedc..0000000 --- a/src/content/docs/concepts/loading-data.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Loading Data -sidebar: - order: 2 ---- diff --git a/src/content/docs/concepts/storing-results.md b/src/content/docs/concepts/storing-results.md deleted file mode 100644 index 3c17c16..0000000 --- a/src/content/docs/concepts/storing-results.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Storing the results -sidebar: - order: 5 ---- diff --git a/src/content/docs/concepts/streaming-and-concurrency.md b/src/content/docs/concepts/streaming-and-concurrency.md deleted file mode 100644 index cc8c667..0000000 --- a/src/content/docs/concepts/streaming-and-concurrency.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Streaming and Concurrency -sidebar: - order: 
6 ---- diff --git a/src/content/docs/examples/hello-world.md b/src/content/docs/examples/hello-world.md index a45e350..ee4f45a 100644 --- a/src/content/docs/examples/hello-world.md +++ b/src/content/docs/examples/hello-world.md @@ -3,9 +3,72 @@ title: Hello World description: A simple example of an ingestion pipeline --- -Guides lead a user through a specific task they want to accomplish, often with a sequence of steps. -Writing a good guide requires thinking about what your users are trying to do. +## Ingesting code into Qdrant -## Further reading +This example demonstrates how to ingest the Swiftide codebase itself. +Note that for it to work correctly you need to have OPENAI_API_KEY set, redis and qdrant +running. -- Read [about how-to guides](https://diataxis.fr/how-to-guides/) in the Diátaxis framework +The pipeline will: + +- Load all `.rs` files from the current directory +- Skip any nodes previously processed; hashes are based on the path and chunk (not the + metadata!) +- Run metadata QA on each chunk; generating questions and answers and adding metadata +- Chunk the code into pieces of 10 to 2048 bytes +- Embed the chunks in batches of 10, Metadata is embedded by default +- Store the nodes in Qdrant + +Note that metadata is copied over to smaller chunks when chunking. When making LLM requests +with lots of small chunks, consider the rate limits of the API. 
+ +```rust + +use swiftide::{ + ingestion, + integrations::{self, qdrant::Qdrant, redis::Redis}, + loaders::FileLoader, + transformers::{ChunkCode, Embed, MetadataQACode}, +}; + +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt::init(); + + let openai_client = integrations::openai::OpenAI::builder() + .default_embed_model("text-embedding-3-small") + .default_prompt_model("gpt-3.5-turbo") + .build()?; + + let redis_url = std::env::var("REDIS_URL") + .as_deref() + .unwrap_or("redis://localhost:6379") + .to_owned(); + + let qdrant_url = std::env::var("QDRANT_URL") + .as_deref() + .unwrap_or("http://localhost:6334") + .to_owned(); + + ingestion::IngestionPipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"])) + .filter_cached(Redis::try_from_url(redis_url, "swiftide-examples")?) + .then(MetadataQACode::new(openai_client.clone())) + .then_chunk(ChunkCode::try_for_language_and_chunk_size( + "rust", + 10..2048, + )?) + .then_in_batch(10, Embed::new(openai_client.clone())) + .then_store_with( + Qdrant::try_from_url(qdrant_url)? + .batch_size(50) + .vector_size(1536) + .collection_name("swiftide-examples".to_string()) + .build()?, + ) + .run() + .await?; + Ok(()) +} +``` + +Find more examples in [our repository](https://github.com/bosun-ai/swiftide/blob/master/examples) diff --git a/src/content/docs/getting-started/architecture-and-design.mdx b/src/content/docs/getting-started/architecture-and-design.mdx new file mode 100644 index 0000000..48b0453 --- /dev/null +++ b/src/content/docs/getting-started/architecture-and-design.mdx @@ -0,0 +1,41 @@ +--- +title: Architecture and Design +description: The architecture and design principles of the Swiftide project. +--- + +## Design principles + +- **Modular**: The pipeline is built from small, composable parts. +- **Extensible**: It is easy to add new parts to the pipeline by extending straightforward traits. 
+- **Performance**: Performance and ease-of-use are the main goals of the library. Performance always has priority. +- **Traceable**: `tracing` is used throughout the pipeline. + +### When designing integrations, transformers, chunkers + +- **Simple**: The API should be simple and easy to use. +- **Sane defaults, fully configurable**: The library should have sane defaults that are easy to override. +- **Builder pattern**: The builder pattern is used to create new instances of the pipeline. + +## The-things-we-talk-about + +- **IngestionPipeline**: The main struct that holds the pipeline. It is a stream of IngestionNodes. +- **IngestionNode**: The main struct that holds the data. It has a path, chunk and metadata. +- **IngestionStream**: The internal stream of IngestionNodes in the pipeline. +- **Loader**: The starting point of the stream, creates and emits IngestionNodes. +- **Transformers**: Some behaviour that modifies the IngestionNodes. +- **BatchTransformers**: Transformers that transform multiple nodes. +- **Chunkers**: Transformers that split a node into multiple nodes. +- **Storages**: Persist the IngestionNodes. +- **NodeCache**: Filters cached nodes. +- **Integrations**: External libraries that can be used with the pipeline. + +### Pipeline structure and traits + +- from_loader (impl Loader) starting point of the stream, creates and emits IngestionNodes +- filter_cached (impl NodeCache) filters cached nodes +- then (impl Transformer) transforms the node and puts it on the stream +- then_in_batch (impl BatchTransformer) transforms multiple nodes and puts them on the stream +- then_chunk (impl ChunkerTransformer) transforms a single node and emits multiple nodes +- then_store_with (impl Storage) stores the nodes in a storage backend, this can be chained + +Additionally, several generic transformers are implemented. They take implementers of `SimplePrompt` and `EmbeddingModel` to do their things. 
diff --git a/src/content/docs/getting-started/changelog.mdx b/src/content/docs/getting-started/changelog.mdx index 352b8e1..3978b44 100644 --- a/src/content/docs/getting-started/changelog.mdx +++ b/src/content/docs/getting-started/changelog.mdx @@ -1,5 +1,6 @@ --- title: Changelog +description: The changelog of the Swiftide project. --- import Changelog from "../../../components/Changelog.astro"; diff --git a/src/content/docs/getting-started/feature-flags.mdx b/src/content/docs/getting-started/feature-flags.mdx index a543a35..58b573b 100644 --- a/src/content/docs/getting-started/feature-flags.mdx +++ b/src/content/docs/getting-started/feature-flags.mdx @@ -1,5 +1,6 @@ --- title: Feature Flags +description: Available features and integrations in Swiftide. sidebar: order: 1 --- diff --git a/src/content/docs/getting-started/installation.mdx b/src/content/docs/getting-started/installation.mdx index 0b0c747..9db9f8a 100644 --- a/src/content/docs/getting-started/installation.mdx +++ b/src/content/docs/getting-started/installation.mdx @@ -1,5 +1,6 @@ --- title: Installation +description: Installation instructions for Swiftide. sidebar: order: 0 --- diff --git a/src/content/docs/in-depth/caching-and-filtering.md b/src/content/docs/in-depth/caching-and-filtering.md new file mode 100644 index 0000000..5f37ab3 --- /dev/null +++ b/src/content/docs/in-depth/caching-and-filtering.md @@ -0,0 +1,31 @@ +--- +title: Caching and filtering nodes +description: How to cache and filter nodes in the pipeline. +sidebar: + order: 3 +--- + +When nodes have already been processed by the pipeline, they can often be skipped, speeding up the pipeline and saving costs. A node cache implements the `NodeCache` trait. 
+ +## The `NodeCache` trait + +Which is defined as follows: + +```rust +pub trait NodeCache: Send + Sync + Debug { + async fn get(&self, node: &IngestionNode) -> bool; + async fn set(&self, node: &IngestionNode); +} +``` + +Or in human language: "Given a Node, provide methods to set and get from the cache". + +## Built in caches + + + +| Name | Description | Feature Flag | +| ----- | --------------------------------------------------- | ------------ | +| Redis | Can get and set nodes using multiplexed connections | redis | + + diff --git a/src/content/docs/concepts/chunking.md b/src/content/docs/in-depth/chunking.md similarity index 91% rename from src/content/docs/concepts/chunking.md rename to src/content/docs/in-depth/chunking.md index 3ab96b5..e97cf00 100644 --- a/src/content/docs/concepts/chunking.md +++ b/src/content/docs/in-depth/chunking.md @@ -1,11 +1,14 @@ --- title: Chunking +description: How to chunk nodes in the pipeline. sidebar: order: 2 --- For quality metadata it can be important to break up text into smaller parts for both better metadata and retrieval. A chunker implements the `ChunkerTransformer` trait. +## The `ChunkerTransformer` trait + Which is defined as follows: ```rust diff --git a/src/content/docs/concepts/overview.mdx b/src/content/docs/in-depth/introducing-step-by-step.mdx similarity index 90% rename from src/content/docs/concepts/overview.mdx rename to src/content/docs/in-depth/introducing-step-by-step.mdx index 031218b..89d12bf 100644 --- a/src/content/docs/concepts/overview.mdx +++ b/src/content/docs/in-depth/introducing-step-by-step.mdx @@ -1,5 +1,6 @@ --- -title: Overview +title: Step-by-step Introduction +description: A step-by-step introduction on how to use swiftide as a data ingestion pipeline in your project. sidebar: order: 0 --- @@ -8,6 +9,8 @@ Swiftide provides a pipeline model. Troughout a pipeline, `IngestionNodes` are t import { Steps } from "@astrojs/starlight/components"; +### A pipeline step-by-step + 1. 
The pipeline starts with a loader: @@ -86,3 +89,7 @@ import { Steps } from "@astrojs/starlight/components"; ``` + +### Read more + +[Reference documentation on docs.rs](https://docs.rs/swiftide/latest/swiftide/) diff --git a/src/content/docs/in-depth/loading-data.md b/src/content/docs/in-depth/loading-data.md new file mode 100644 index 0000000..a4740a0 --- /dev/null +++ b/src/content/docs/in-depth/loading-data.md @@ -0,0 +1,31 @@ +--- +title: Loading Data +description: How to load data into the pipeline. +sidebar: + order: 1 +--- + +A pipeline starts with data and is only as good as the data it ingests. A loader implements the `Loader` trait. + +## The `Loader` trait + +Which is defined as follows: + +```rust +pub trait Loader { + fn into_stream(self) -> IngestionStream; +} +``` + +Or in human language: "I can be turned into a stream". The assumption under the hood is that Loaders will yield the data they load as a stream of `IngestionNodes`. These can be files, messages, webpages and so on. + +## Built in loaders + + + +| Name | Description | Feature Flag | +| -------------- | ------------------------------------------------------------------- | ------------ | +| FileLoader | Loads files with an optional extension filter, respecting gitignore | | +| ScrapingLoader | Scrapes a website using the `spider` crate | scraping | + + diff --git a/src/content/docs/in-depth/prompting-embedding.md b/src/content/docs/in-depth/prompting-embedding.md new file mode 100644 index 0000000..fa5a5ab --- /dev/null +++ b/src/content/docs/in-depth/prompting-embedding.md @@ -0,0 +1,43 @@ +--- +title: Prompting and Embedding +description: How to prompt and embed data in the pipeline. +sidebar: + order: 2 +--- + +Our metadata transformers are generic over the `SimplePrompt` trait. This enables different models to be used for different use cases. Similarly, the embedding transformer is generic over the `EmbeddingModel` trait. 
+ +## The `SimplePrompt` trait + +Which is defined as follows: + +```rust +pub trait SimplePrompt: Debug + Send + Sync { + async fn prompt(&self, prompt: &str) -> Result; +} +``` + +Or in human language: "Given a Prompt, give me a response". + +## The `EmbeddingModel` trait + +Which is defined as follows: + +```rust +pub trait EmbeddingModel: Send + Sync { + async fn embed(&self, input: Vec) -> Result; +} +``` + +Or in human language: "Given a list of things to Embed, give me embeddings". The embedding transformer will link back the embeddings to the original nodes by _order_. + +## Built in inference and embedding models + + + +| Name | Description | Feature Flag | +| --------- | --------------------------------------------------------- | ------------ | +| OpenAI | Implements both SimplePrompt and Embed via `async_openai` | openai | +| FastEmbed | Implements Embed via `fastembed-rs` | fastembed | + + diff --git a/src/content/docs/in-depth/storing-results.md b/src/content/docs/in-depth/storing-results.md new file mode 100644 index 0000000..f30f85b --- /dev/null +++ b/src/content/docs/in-depth/storing-results.md @@ -0,0 +1,39 @@ +--- +title: Storing the results +description: How to store the results of the pipeline. +sidebar: + order: 5 +--- + +After processing nodes in the pipeline you probably want to store the results. Pipelines support multiple storage steps, but need at least one. A storage implements the `Persist` trait. + +## The `Persist` trait + +Which is defined as follows: + +```rust +pub trait Persist: Debug + Send + Sync { + async fn setup(&self) -> Result<()>; + async fn store(&self, node: IngestionNode) -> Result; + async fn batch_store(&self, nodes: Vec) -> IngestionStream; + fn batch_size(&self) -> Option { + None + } +} +``` + +Setup functions are run right away, asynchronously when the pipeline starts. This could include setting up collections, tables, connections etcetera. 
Because more might happen after storing, both `store` and `batch_store` are expected to return the nodes they processed. + +If `batch_size` is implemented for the storage, the stream will always prefer `batch_store`. + +## Built in storage + + + +| Name | Description | Feature Flag | +| ------------- | ---------------------------------------------------- | ------------ | +| Redis | Persists nodes by default as json | redis | +| Qdrant | Persists nodes in qdrant; expects a vector to be set | qdrant | +| MemoryStorage | Persists nodes in memory; great for debugging | | + + diff --git a/src/content/docs/in-depth/streaming-and-concurrency.mdx b/src/content/docs/in-depth/streaming-and-concurrency.mdx new file mode 100644 index 0000000..c325c70 --- /dev/null +++ b/src/content/docs/in-depth/streaming-and-concurrency.mdx @@ -0,0 +1,60 @@ +--- +title: Streaming and Concurrency +description: How the ingestion pipeline handles streaming and concurrency. +sidebar: + order: 6 +--- + +The ingestion pipeline is streaming, asynchronous, unordered and concurrent. + +## Concurrency + +When transforming, chunking or storing, steps are awaited buffered. Depending on the concurrency setting of the stream, _this means that many promises are awaited concurrently_. + +### Default concurrency and overriding + +The default concurrency for a pipeline is the number of available cpus and can be overwritten by +calling `pipeline.with_concurrency(concurrency: usize)` with the desired concurrency setting. + +Transformers, chunkers and stores can also implement `concurrency` on their respective traits, allowing for fine grained control per step. + +### Throttling + +If due to rate or other limitations throughput is too high, there is also a `pipeline.throttle(duration: impl Into)`, which will limit the amount of nodes passing through to one per the given duration. 
+ +import { Aside } from "@astrojs/starlight/components"; + + + +## Ingestion Stream + +You might have seen the `IngestionStream` type mentioned a few times. It is the internal stream that is being passed around, built on top of the Rust `Stream` and `StreamExt`. By wrapping it we have more control and less boilerplate when dealing with streams. + +When building batch transformers, storage or chunkers, you will need to return an `IngestionStream`. We've tried to make that as easy as possible and there are multiple ways. + +### Using `Into` + +From a list of `IngestionNodes` using `Into`: + +```rust +let nodes: Vec>> = vec![Ok(IngestionNode::default())]; +let stream: IngestionStream = nodes.into(); +``` + +There is also an implementation of `Into` for Rust streams. + +### Converting an iterator + +You can also convert an `Iterator` into an `IngestionStream` directly. This is great, as the iterator itself will stream its results, instead of having to collect it first. + +```rust +let nodes: Vec>> = vec![IngestionNode::default()]; +let stream: IngestionStream = IngestionStream::iter(nodes.into_iter().map(|node| { + node.metadata.insert("foo".to_string(), "bar".to_string()); + Ok(node) +})); +``` diff --git a/src/content/docs/concepts/transforming-and-enriching.mdx b/src/content/docs/in-depth/transforming-and-enriching.mdx similarity index 63% rename from src/content/docs/concepts/transforming-and-enriching.mdx rename to src/content/docs/in-depth/transforming-and-enriching.mdx index 1387855..5ea6492 100644 --- a/src/content/docs/concepts/transforming-and-enriching.mdx +++ b/src/content/docs/in-depth/transforming-and-enriching.mdx @@ -1,5 +1,6 @@ --- title: Transforming and Enriching +description: How to transform and enrich nodes in the pipeline. sidebar: order: 1 --- @@ -8,6 +9,8 @@ Transformers are the bread and butter of an ingestion pipeline. They can transfo There's two ways to apply a transformer. Per node or in batch. 
+## The `Transformer` trait + The `Transformer` trait is very straightforward: ```rust @@ -26,18 +29,16 @@ In batches, the `BatchableTransformer` trait is similar, except that it needs to ## Built in transformers -import { Icon } from "@astrojs/starlight/components"; - -| Name | Description | Transformer | BatchableTransformer | Feature Flag | -| ------------------------- | ------------------------------------------------------ | --------------------- | --------------------- | ------------ | -| Embed | Generic embedding transformer, requires an LLM | | | | -| MetadataKeywords | Uses an LLM to extract keywords and add as metadata | | | | -| MetadataQACode | Uses an LLM to generate questions and answers for Code | | | | -| MetadataQAText | Uses an LLM to generate questions and answers for Text | | | | -| MetadataSummary | Uses an LLM to generate a summary | | | | -| MetadataTitle | Uses an LLM to generate a title | | | | -| HtmlToMarkdownTransformer | Converts html in a node to markdown | | | scraping | +| Name | Description | Feature Flag | +| ------------------------- | ------------------------------------------------------ | ------------ | +| Embed | Generic embedding transformer, requires an LLM | | +| MetadataKeywords | Uses an LLM to extract keywords and add as metadata | | +| MetadataQACode | Uses an LLM to generate questions and answers for Code | | +| MetadataQAText | Uses an LLM to generate questions and answers for Text | | +| MetadataSummary | Uses an LLM to generate a summary | | +| MetadataTitle | Uses an LLM to generate a title | | +| HtmlToMarkdownTransformer | Converts html in a node to markdown | scraping | diff --git a/src/content/docs/index.mdx b/src/content/docs/index.mdx index 2e3fedb..b8206e0 100644 --- a/src/content/docs/index.mdx +++ b/src/content/docs/index.mdx @@ -1,39 +1,62 @@ --- title: swiftide -description: Get started building your docs site with Starlight. 
+description: Blazing fast data pipelines for Retrieval Augmented Generation written in Rust template: splash hero: - tagline: Blazing fast document and code indexation for Retrieval Augmented Generation + tagline: Blazing fast data pipelines for Retrieval Augmented Generation written in Rust image: file: ../../assets/logo-full.png actions: - - text: Introduction - link: /introduction/ + - text: What is swiftide? + link: /what-is-swiftide/ icon: right-arrow variant: primary - - text: A simple example - link: /examples/example - icon: right-arrow + - text: View it on Github + link: https://github.com/bosun-ai/swiftide + icon: external variant: secondary --- import { Card, CardGrid } from "@astrojs/starlight/components"; -## Next steps +
+ A quick example - - - [Introduction](introduction) - + ```rust + IngestionPipeline::from_loader(FileLoader::new(".").with_extensions(&["md"])) + .then_chunk(ChunkMarkdown::with_chunk_range(10..512)) + .then(MetadataQACode::new(openai_client.clone())) + .then_in_batch(10, Embed::new(openai_client.clone())) + .then_store_with( + Qdrant::try_from_url(qdrant_url)? + .batch_size(50) + .vector_size(1536) + .collection_name("swiftide-examples".to_string()) + .build()?, + ) + .run() + .await?; + ``` - - [Installation](installation) +
+ + + Load data from various sources, transform it, enrich it with metadata, and persist it in a database. + - - - [Concepts](concepts/big-picture) - - - [Reference](reference/ingestion-pipeline) - + + Qdrant, OpenAI, Redis, FastEmbed, Spider and many more. + + + Write your own loaders, transformers, and storages by extending straight forward traits. + + + Fast, safe, and efficient. Built with Rust's async and streaming features. + + + Part of [Bosun.ai](https://bosun.ai) and actively used in production. + + + Full API documentation available on [docs.rs](https://docs.rs/swiftide/latest/swiftide/) + diff --git a/src/content/docs/reference/cache-filters.md b/src/content/docs/reference/cache-filters.md deleted file mode 100644 index d4a716d..0000000 --- a/src/content/docs/reference/cache-filters.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Cache Filters ---- diff --git a/src/content/docs/reference/embeddings.md b/src/content/docs/reference/embeddings.md deleted file mode 100644 index af42708..0000000 --- a/src/content/docs/reference/embeddings.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Embeddings ---- diff --git a/src/content/docs/reference/ingestion-pipeline.md b/src/content/docs/reference/ingestion-pipeline.md deleted file mode 100644 index ca5f24f..0000000 --- a/src/content/docs/reference/ingestion-pipeline.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Ingestion Pipeline -sidebar: - order: 0 ---- diff --git a/src/content/docs/reference/loaders.md b/src/content/docs/reference/loaders.md deleted file mode 100644 index 81b3229..0000000 --- a/src/content/docs/reference/loaders.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Loaders ---- diff --git a/src/content/docs/reference/persistance.md b/src/content/docs/reference/persistance.md deleted file mode 100644 index caf375b..0000000 --- a/src/content/docs/reference/persistance.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Persistance ---- diff --git a/src/content/docs/reference/transformers.md 
b/src/content/docs/reference/transformers.md deleted file mode 100644 index 4c7ba9a..0000000 --- a/src/content/docs/reference/transformers.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Transformers ---- diff --git a/src/content/docs/troubleshooting.md b/src/content/docs/troubleshooting.md index 9662ac0..c2e684c 100644 --- a/src/content/docs/troubleshooting.md +++ b/src/content/docs/troubleshooting.md @@ -1,3 +1,45 @@ --- title: Troubleshooting +description: Debugging and troubleshooting your pipeline. --- + +When building a pipeline, things can go wrong. We provide several tools to help you debug and troubleshoot your pipeline. + +By default, if _any_ node fails, the pipeline will stop. This is to prevent cascading failures. You can change this behaviour by using `filter_errors` after any step. + +## The `tracing` crate + +[Tracing](https://github.com/tokio-rs/tracing) quickly became the standard for logging in Rust. We use it throughout the pipeline. When you run your pipeline, you can set the log level to `debug` or `trace` to get detailed logs of what is happening. + +To enable tracing you need to configure and add a subscriber. + +```bash +cargo add tracing tracing-subscriber +``` + +Then in the entry point of your program, you can add the following code: + +```rust +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt::init(); + ... +} +``` + +When you then set `RUST_LOG=debug` or `RUST_LOG=trace` you will get detailed logs. Depending on the size of the data, it might be a lot. + +### OpenTelemetry support + +Tracing has best-in-class opentelemetry support. See the [tracing-opentelemetry](https://github.com/tokio-rs/tracing-opentelemetry) crate for more information. + +Note that currently the IngestionNode is attached to every transformation step. Beware of large amounts of tracing data. + +## Helpers and utility functions + +There are several helpers and utility functions available on the pipeline to help you debug and handle errors. 
+ +- `log_all` Logs both passing and failed nodes +- `log_errors` Logs errors only +- `log_nodes` Logs nodes only +- `filter_errors` Filters out errors, only passing nodes diff --git a/src/content/docs/introduction.mdx b/src/content/docs/what-is-swiftide.mdx similarity index 77% rename from src/content/docs/introduction.mdx rename to src/content/docs/what-is-swiftide.mdx index 2434722..77da241 100644 --- a/src/content/docs/introduction.mdx +++ b/src/content/docs/what-is-swiftide.mdx @@ -1,5 +1,6 @@ --- -title: Introduction +title: What is swiftide? +description: A brief introduction to swiftide. --- ## What is swiftide? @@ -7,12 +8,12 @@ title: Introduction import { Image } from "astro:assets"; import pipeline from "/src/assets/ingestion-pipeline.svg"; -Swiftide is a straightforward, easy-to-use, easy-to-extend asynchronous file ingestion and processing library. It is designed to be used in a RAG (Retrieval Augmented Generation) system. It is built to be fast and efficient, with a focus on parallel processing and asynchronous operations. +Swiftide is a straightforward, easy-to-use, easy-to-extend asynchronous data ingestion and processing library. It is designed to be used in a RAG (Retrieval Augmented Generation) system. It is built to be fast and efficient, with a focus on parallel processing and asynchronous operations. ingestion-pipeline -At the same time, swiftide also has a focus on developer experience and ease of use. -It is designed to be simple and intuitive, with a clear and concise API that makes -it easy to get started, build complex pipelines, and bring your own transformations. +At the same time, swiftide focusses on developer experience and ease of use. It is +designed to be simple and intuitive, with a clear and concise API that makes it easy +to get started, build complex pipelines, and bring your own transformations. ## What problem does swiftide solve?