From e9ba36e8a7fdbb8c707116dc678d17b1aefc7efa Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Wed, 26 Jun 2024 15:59:22 +0200 Subject: [PATCH] feat: bring it all together --- astro.config.mjs | 10 ++- .../docs/concepts/caching-and-filtering.md | 5 -- src/content/docs/concepts/extendability.md | 5 -- src/content/docs/concepts/loading-data.md | 5 -- src/content/docs/concepts/storing-results.md | 5 -- .../concepts/streaming-and-concurrency.md | 5 -- src/content/docs/examples/hello-world.md | 71 +++++++++++++++++-- .../architecture-and-design.mdx | 41 +++++++++++ .../docs/getting-started/changelog.mdx | 1 + .../docs/getting-started/feature-flags.mdx | 1 + .../docs/getting-started/installation.mdx | 1 + .../docs/in-depth/caching-and-filtering.md | 31 ++++++++ .../docs/{concepts => in-depth}/chunking.md | 3 + .../introducing-step-by-step.mdx} | 9 ++- src/content/docs/in-depth/loading-data.md | 31 ++++++++ .../docs/in-depth/prompting-embedding.md | 43 +++++++++++ src/content/docs/in-depth/storing-results.md | 39 ++++++++++ .../in-depth/streaming-and-concurrency.mdx | 60 ++++++++++++++++ .../transforming-and-enriching.mdx | 23 +++--- src/content/docs/index.mdx | 65 +++++++++++------ src/content/docs/reference/cache-filters.md | 3 - src/content/docs/reference/embeddings.md | 3 - .../docs/reference/ingestion-pipeline.md | 5 -- src/content/docs/reference/loaders.md | 3 - src/content/docs/reference/persistance.md | 3 - src/content/docs/reference/transformers.md | 3 - src/content/docs/troubleshooting.md | 42 +++++++++++ ...{introduction.mdx => what-is-swiftide.mdx} | 11 +-- 28 files changed, 437 insertions(+), 90 deletions(-) delete mode 100644 src/content/docs/concepts/caching-and-filtering.md delete mode 100644 src/content/docs/concepts/extendability.md delete mode 100644 src/content/docs/concepts/loading-data.md delete mode 100644 src/content/docs/concepts/storing-results.md delete mode 100644 src/content/docs/concepts/streaming-and-concurrency.md create mode 100644 
src/content/docs/getting-started/architecture-and-design.mdx create mode 100644 src/content/docs/in-depth/caching-and-filtering.md rename src/content/docs/{concepts => in-depth}/chunking.md (91%) rename src/content/docs/{concepts/overview.mdx => in-depth/introducing-step-by-step.mdx} (90%) create mode 100644 src/content/docs/in-depth/loading-data.md create mode 100644 src/content/docs/in-depth/prompting-embedding.md create mode 100644 src/content/docs/in-depth/storing-results.md create mode 100644 src/content/docs/in-depth/streaming-and-concurrency.mdx rename src/content/docs/{concepts => in-depth}/transforming-and-enriching.mdx (63%) delete mode 100644 src/content/docs/reference/cache-filters.md delete mode 100644 src/content/docs/reference/embeddings.md delete mode 100644 src/content/docs/reference/ingestion-pipeline.md delete mode 100644 src/content/docs/reference/loaders.md delete mode 100644 src/content/docs/reference/persistance.md delete mode 100644 src/content/docs/reference/transformers.md rename src/content/docs/{introduction.mdx => what-is-swiftide.mdx} (77%) diff --git a/astro.config.mjs b/astro.config.mjs index 5d2a32e..a959e94 100644 --- a/astro.config.mjs +++ b/astro.config.mjs @@ -12,6 +12,9 @@ export default defineConfig({ editLink: { baseUrl: "https://github.com/bosun-ai/swiftide-website/edit/master", }, + tableOfContents: { + minHeadingLevel: 2, + }, customCss: [ // Fontsource files for to regular and semi-bold font weights. 
"@fontsource/fira-code/400.css", @@ -25,11 +28,12 @@ export default defineConfig({ }, social: { github: "https://github.com/bosun-ai/swiftide", + linkedin: "https://www.linkedin.com/company/bosun-ai/", }, sidebar: [ { - label: "Introduction", - link: "/introduction/", + label: "What is swiftide?", + link: "/what-is-swiftide/", }, { label: "Getting Started", @@ -40,7 +44,7 @@ export default defineConfig({ { label: "In depth", autogenerate: { - directory: "concepts", + directory: "in-depth", }, }, { diff --git a/src/content/docs/concepts/caching-and-filtering.md b/src/content/docs/concepts/caching-and-filtering.md deleted file mode 100644 index 4dd8810..0000000 --- a/src/content/docs/concepts/caching-and-filtering.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Caching and filtering nodes -sidebar: - order: 3 ---- diff --git a/src/content/docs/concepts/extendability.md b/src/content/docs/concepts/extendability.md deleted file mode 100644 index db3220d..0000000 --- a/src/content/docs/concepts/extendability.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Extending Swiftide -sidebar: - order: 7 ---- diff --git a/src/content/docs/concepts/loading-data.md b/src/content/docs/concepts/loading-data.md deleted file mode 100644 index 6e6eedc..0000000 --- a/src/content/docs/concepts/loading-data.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Loading Data -sidebar: - order: 2 ---- diff --git a/src/content/docs/concepts/storing-results.md b/src/content/docs/concepts/storing-results.md deleted file mode 100644 index 3c17c16..0000000 --- a/src/content/docs/concepts/storing-results.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Storing the results -sidebar: - order: 5 ---- diff --git a/src/content/docs/concepts/streaming-and-concurrency.md b/src/content/docs/concepts/streaming-and-concurrency.md deleted file mode 100644 index cc8c667..0000000 --- a/src/content/docs/concepts/streaming-and-concurrency.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Streaming and Concurrency -sidebar: - order: 
6 ---- diff --git a/src/content/docs/examples/hello-world.md b/src/content/docs/examples/hello-world.md index a45e350..ee4f45a 100644 --- a/src/content/docs/examples/hello-world.md +++ b/src/content/docs/examples/hello-world.md @@ -3,9 +3,72 @@ title: Hello World description: A simple example of an ingestion pipeline --- -Guides lead a user through a specific task they want to accomplish, often with a sequence of steps. -Writing a good guide requires thinking about what your users are trying to do. +## Ingesting code into Qdrant -## Further reading +This example demonstrates how to ingest the Swiftide codebase itself. +Note that for it to work correctly you need to have OPENAI_API_KEY set, redis and qdrant +running. -- Read [about how-to guides](https://diataxis.fr/how-to-guides/) in the Diátaxis framework +The pipeline will: + +- Load all `.rs` files from the current directory +- Skip any nodes previously processed; hashes are based on the path and chunk (not the + metadata!) +- Run metadata QA on each chunk; generating questions and answers and adding metadata +- Chunk the code into pieces of 10 to 2048 bytes +- Embed the chunks in batches of 10, Metadata is embedded by default +- Store the nodes in Qdrant + +Note that metadata is copied over to smaller chunks when chunking. When making LLM requests +with lots of small chunks, consider the rate limits of the API. 
+ +```rust + +use swiftide::{ + ingestion, + integrations::{self, qdrant::Qdrant, redis::Redis}, + loaders::FileLoader, + transformers::{ChunkCode, Embed, MetadataQACode}, +}; + +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt::init(); + + let openai_client = integrations::openai::OpenAI::builder() + .default_embed_model("text-embedding-3-small") + .default_prompt_model("gpt-3.5-turbo") + .build()?; + + let redis_url = std::env::var("REDIS_URL") + .as_deref() + .unwrap_or("redis://localhost:6379") + .to_owned(); + + let qdrant_url = std::env::var("QDRANT_URL") + .as_deref() + .unwrap_or("http://localhost:6334") + .to_owned(); + + ingestion::IngestionPipeline::from_loader(FileLoader::new(".").with_extensions(&["rs"])) + .filter_cached(Redis::try_from_url(redis_url, "swiftide-examples")?) + .then(MetadataQACode::new(openai_client.clone())) + .then_chunk(ChunkCode::try_for_language_and_chunk_size( + "rust", + 10..2048, + )?) + .then_in_batch(10, Embed::new(openai_client.clone())) + .then_store_with( + Qdrant::try_from_url(qdrant_url)? + .batch_size(50) + .vector_size(1536) + .collection_name("swiftide-examples".to_string()) + .build()?, + ) + .run() + .await?; + Ok(()) +} +``` + +Find more examples in [our repository](https://github.com/bosun-ai/swiftide/blob/master/examples) diff --git a/src/content/docs/getting-started/architecture-and-design.mdx b/src/content/docs/getting-started/architecture-and-design.mdx new file mode 100644 index 0000000..48b0453 --- /dev/null +++ b/src/content/docs/getting-started/architecture-and-design.mdx @@ -0,0 +1,41 @@ +--- +title: Architecture and Design +description: The architecture and design principles of the Swiftide project. +--- + +## Design principles + +- **Modular**: The pipeline is built from small, composable parts. +- **Extensible**: It is easy to add new parts to the pipeline by extending straightforward traits. 
+- **Performance**: Performance and ease-of-use are the main goals of the library. Performance always has priority. +- **Traceable**: `tracing` is used throughout the pipeline. + +### When designing integrations, transformers, chunkers + +- **Simple**: The API should be simple and easy to use. +- **Sane defaults, fully configurable**: The library should have sane defaults that are easy to override. +- **Builder pattern**: The builder pattern is used to create new instances of the pipeline. + +## The-things-we-talk-about + +- **IngestionPipeline**: The main struct that holds the pipeline. It is a stream of IngestionNodes. +- **IngestionNode**: The main struct that holds the data. It has a path, chunk and metadata. +- **IngestionStream**: The internal stream of IngestionNodes in the pipeline. +- **Loader**: The starting point of the stream, creates and emits IngestionNodes. +- **Transformers**: Some behaviour that modifies the IngestionNodes. +- **BatchTransformers**: Transformers that transform multiple nodes. +- **Chunkers**: Transformers that split a node into multiple nodes. +- **Storages**: Persist the IngestionNodes. +- **NodeCache**: Filters cached nodes. +- **Integrations**: External libraries that can be used with the pipeline. + +### Pipeline structure and traits + +- from_loader (impl Loader) starting point of the stream, creates and emits IngestionNodes +- filter_cached (impl NodeCache) filters cached nodes +- then (impl Transformer) transforms the node and puts it on the stream +- then_in_batch (impl BatchTransformer) transforms multiple nodes and puts them on the stream +- then_chunk (impl ChunkerTransformer) transforms a single node and emits multiple nodes +- then_store_with (impl Storage) stores the nodes in a storage backend, this can be chained + +Additionally, several generic transformers are implemented. They take implementers of `SimplePrompt` and `EmbeddingModel` to do their things. 
diff --git a/src/content/docs/getting-started/changelog.mdx b/src/content/docs/getting-started/changelog.mdx index 352b8e1..3978b44 100644 --- a/src/content/docs/getting-started/changelog.mdx +++ b/src/content/docs/getting-started/changelog.mdx @@ -1,5 +1,6 @@ --- title: Changelog +description: The changelog of the Swiftide project. --- import Changelog from "../../../components/Changelog.astro"; diff --git a/src/content/docs/getting-started/feature-flags.mdx b/src/content/docs/getting-started/feature-flags.mdx index a543a35..58b573b 100644 --- a/src/content/docs/getting-started/feature-flags.mdx +++ b/src/content/docs/getting-started/feature-flags.mdx @@ -1,5 +1,6 @@ --- title: Feature Flags +description: Available features and integrations in Swiftide. sidebar: order: 1 --- diff --git a/src/content/docs/getting-started/installation.mdx b/src/content/docs/getting-started/installation.mdx index 0b0c747..9db9f8a 100644 --- a/src/content/docs/getting-started/installation.mdx +++ b/src/content/docs/getting-started/installation.mdx @@ -1,5 +1,6 @@ --- title: Installation +description: Installation instructions for Swiftide. sidebar: order: 0 --- diff --git a/src/content/docs/in-depth/caching-and-filtering.md b/src/content/docs/in-depth/caching-and-filtering.md new file mode 100644 index 0000000..5f37ab3 --- /dev/null +++ b/src/content/docs/in-depth/caching-and-filtering.md @@ -0,0 +1,31 @@ +--- +title: Caching and filtering nodes +description: How to cache and filter nodes in the pipeline. +sidebar: + order: 3 +--- + +When nodes have already been processed by the pipeline, they can often be skipped, speeding up the pipeline and saving costs. A node cache implements the `NodeCache` trait. 
+ +## The `NodeCache` trait + +Which is defined as follows: + +```rust +pub trait NodeCache: Send + Sync + Debug { + async fn get(&self, node: &IngestionNode) -> bool; + async fn set(&self, node: &IngestionNode); +} +``` + +Or in human language: "Given a Node, provide methods to set and get from the cache". + +## Built in caches + + + +| Name | Description | Feature Flag | +| ----- | --------------------------------------------------- | ------------ | +| Redis | Can get and set nodes using multiplexed connections | redis | + + diff --git a/src/content/docs/concepts/chunking.md b/src/content/docs/in-depth/chunking.md similarity index 91% rename from src/content/docs/concepts/chunking.md rename to src/content/docs/in-depth/chunking.md index 3ab96b5..e97cf00 100644 --- a/src/content/docs/concepts/chunking.md +++ b/src/content/docs/in-depth/chunking.md @@ -1,11 +1,14 @@ --- title: Chunking +description: How to chunk nodes in the pipeline. sidebar: order: 2 --- For quality metadata it can be important to break up text into smaller parts for both better metadata and retrieval. A chunker implements the `ChunkerTransformer` trait. +## The `ChunkerTransformer` trait + Which is defined as follows: ```rust diff --git a/src/content/docs/concepts/overview.mdx b/src/content/docs/in-depth/introducing-step-by-step.mdx similarity index 90% rename from src/content/docs/concepts/overview.mdx rename to src/content/docs/in-depth/introducing-step-by-step.mdx index 031218b..89d12bf 100644 --- a/src/content/docs/concepts/overview.mdx +++ b/src/content/docs/in-depth/introducing-step-by-step.mdx @@ -1,5 +1,6 @@ --- -title: Overview +title: Step-by-step Introduction +description: A step-by-step introduction on how to use swiftide as a data ingestion pipeline in your project. sidebar: order: 0 --- @@ -8,6 +9,8 @@ Swiftide provides a pipeline model. Troughout a pipeline, `IngestionNodes` are t import { Steps } from "@astrojs/starlight/components"; +### A pipeline step-by-step + 1. 
The pipeline starts with a loader: @@ -86,3 +89,7 @@ import { Steps } from "@astrojs/starlight/components"; ``` + +### Read more + +[Reference documentation on docs.rs](https://docs.rs/swiftide/latest/swiftide/) diff --git a/src/content/docs/in-depth/loading-data.md b/src/content/docs/in-depth/loading-data.md new file mode 100644 index 0000000..a4740a0 --- /dev/null +++ b/src/content/docs/in-depth/loading-data.md @@ -0,0 +1,31 @@ +--- +title: Loading Data +description: How to load data into the pipeline. +sidebar: + order: 1 +--- + +A pipeline starts with data and is only as good as the data it ingests. A loader implements the `Loader` trait. + +## The `Loader` trait + +Which is defined as follows: + +```rust +pub trait Loader { + fn into_stream(self) -> IngestionStream; +} +``` + +Or in human language: "I can be turned into a stream". The assumption under the hood is that Loaders will yield the data they load as a stream of `IngestionNodes`. These can be files, messages, webpages and so on. + +## Built in loaders + + + +| Name | Description | Feature Flag | +| -------------- | ------------------------------------------------------------------- | ------------ | +| FileLoader | Loads files with an optional extension filter, respecting gitignore | | +| ScrapingLoader | Scrapes a website using the `spider` crate | scraping | + + diff --git a/src/content/docs/in-depth/prompting-embedding.md b/src/content/docs/in-depth/prompting-embedding.md new file mode 100644 index 0000000..fa5a5ab --- /dev/null +++ b/src/content/docs/in-depth/prompting-embedding.md @@ -0,0 +1,43 @@ +--- +title: Prompting and Embedding +description: How to prompt and embed data in the pipeline. +sidebar: + order: 2 +--- + +Our metadata transformers are generic over the `SimplePrompt` trait. This enables different models to be used for different use cases. Similarly, the embedding transformer is generic over the `EmbeddingModel` trait. 
+ +## The `SimplePrompt` trait + +Which is defined as follows: + +```rust +pub trait SimplePrompt: Debug + Send + Sync { + async fn prompt(&self, prompt: &str) -> Result; +} +``` + +Or in human language: "Given a Prompt, give me a response". + +## The `EmbeddingModel` trait + +Which is defined as follows: + +```rust +pub trait EmbeddingModel: Send + Sync { + async fn embed(&self, input: Vec) -> Result; +} +``` + +Or in human language: "Given a list of things to Embed, give me embeddings". The embedding transformer will link back the embeddings to the original nodes by _order_. + +## Built in inference and embedding models + + + +| Name | Description | Feature Flag | +| --------- | --------------------------------------------------------- | ------------ | +| OpenAI | Implements both SimplePrompt and Embed via `async_openai` | openai | +| FastEmbed | Implements Embed via `fastembed-rs` | fastembed | + + diff --git a/src/content/docs/in-depth/storing-results.md b/src/content/docs/in-depth/storing-results.md new file mode 100644 index 0000000..f30f85b --- /dev/null +++ b/src/content/docs/in-depth/storing-results.md @@ -0,0 +1,39 @@ +--- +title: Storing the results +description: How to store the results of the pipeline. +sidebar: + order: 5 +--- + +After processing nodes in the pipeline you probably want to store the results. Pipelines support multiple storage steps, but need at least one. A storage implements the `Persist` trait. + +## The `Persist` trait + +Which is defined as follows: + +```rust +pub trait Persist: Debug + Send + Sync { + async fn setup(&self) -> Result<()>; + async fn store(&self, node: IngestionNode) -> Result; + async fn batch_store(&self, nodes: Vec) -> IngestionStream; + fn batch_size(&self) -> Option { + None + } +} +``` + +Setup functions are run right away, asynchronously when the pipeline starts. This could include setting up collections, tables, connections etcetera. 
Because more might happen after storing, both `store` and `batch_store` are expected to return the nodes they processed. + +If `batch_size` is implemented for the storage, the stream will always prefer `batch_store`. + +## Built in storage + + + +| Name | Description | Feature Flag | +| ------------- | ---------------------------------------------------- | ------------ | +| Redis | Persists nodes by default as json | redis | +| Qdrant | Persists nodes in qdrant; expects a vector to be set | qdrant | +| MemoryStorage | Persists nodes in memory; great for debugging | | + + diff --git a/src/content/docs/in-depth/streaming-and-concurrency.mdx b/src/content/docs/in-depth/streaming-and-concurrency.mdx new file mode 100644 index 0000000..c325c70 --- /dev/null +++ b/src/content/docs/in-depth/streaming-and-concurrency.mdx @@ -0,0 +1,60 @@ +--- +title: Streaming and Concurrency +description: How the ingestion pipeline handles streaming and concurrency. +sidebar: + order: 6 +--- + +The ingestion pipeline is streaming, asynchronous, unordered and concurrent. + +## Concurrency + +When transforming, chunking or storing, steps are awaited buffered. Depending on the concurrency setting of the stream, _this means that many promises are awaited concurrently_. + +### Default concurrency and overriding + +The default concurrency for a pipeline is the number of available cpus and can be overwritten by +calling `pipeline.with_concurrency(concurrency: usize)` with the desired concurrency setting. + +Transformers, chunkers and stores can also implement `concurrency` on their respective traits, allowing for fine grained control per step. + +### Throttling + +If due to rate or other limitations throughput is too high, there is also a `pipeline.throttle(duration: impl Into)`, which will limit the amount of nodes passing through to one per the given duration. 
+ +import { Aside } from "@astrojs/starlight/components"; + + + +## Ingestion Stream + +You might have seen the `IngestionStream` type mentioned a few times. It is the internal stream that is being passed around, built on top of the Rust `Stream` and `StreamExt`. By wrapping it we have more control and less boilerplate when dealing with streams. + +When building batch transformers, storage or chunkers, you will need to return an `IngestionStream`. We've tried to make that as easy as possible and there are multiple ways. + +### Using `Into` + +From a list of `IngestionNodes` using `Into`: + +```rust +let nodes: Vec>> = vec![Ok(IngestionNode::default())]; +let stream: IngestionStream = nodes.into(); +``` + +There is also an implementation of `Into` for Rust streams. + +### Converting an iterator + +You can also convert an `Iterator` into an `IngestionStream` directly. This is great, as the iterator itself will stream its results, instead of having to collect it first. + +```rust +let nodes: Vec>> = vec![IngestionNode::default()]; +let stream: IngestionStream = IngestionStream::iter(nodes.into_iter().map(|node| { + node.metadata.insert("foo".to_string(), "bar".to_string()); + Ok(node) +})); +``` diff --git a/src/content/docs/concepts/transforming-and-enriching.mdx b/src/content/docs/in-depth/transforming-and-enriching.mdx similarity index 63% rename from src/content/docs/concepts/transforming-and-enriching.mdx rename to src/content/docs/in-depth/transforming-and-enriching.mdx index 1387855..5ea6492 100644 --- a/src/content/docs/concepts/transforming-and-enriching.mdx +++ b/src/content/docs/in-depth/transforming-and-enriching.mdx @@ -1,5 +1,6 @@ --- title: Transforming and Enriching +description: How to transform and enrich nodes in the pipeline. sidebar: order: 1 --- @@ -8,6 +9,8 @@ Transformers are the bread and butter of an ingestion pipeline. They can transfo There's two ways to apply a transformer. Per node or in batch. 
+## The `Transformer` trait + The `Transformer` trait is very straightforward: ```rust @@ -26,18 +29,16 @@ In batches, the `BatchableTransformer` trait is similar, except that it needs to ## Built in transformers -import { Icon } from "@astrojs/starlight/components"; - -| Name | Description | Transformer | BatchableTransformer | Feature Flag | -| ------------------------- | ------------------------------------------------------ | --------------------- | --------------------- | ------------ | -| Embed | Generic embedding transformer, requires an LLM | | | | -| MetadataKeywords | Uses an LLM to extract keywords and add as metadata | | | | -| MetadataQACode | Uses an LLM to generate questions and answers for Code | | | | -| MetadataQAText | Uses an LLM to generate questions and answers for Text | | | | -| MetadataSummary | Uses an LLM to generate a summary | | | | -| MetadataTitle | Uses an LLM to generate a title | | | | -| HtmlToMarkdownTransformer | Converts html in a node to markdown | | | scraping | +| Name | Description | Feature Flag | +| ------------------------- | ------------------------------------------------------ | ------------ | +| Embed | Generic embedding transformer, requires an LLM | | +| MetadataKeywords | Uses an LLM to extract keywords and add as metadata | | +| MetadataQACode | Uses an LLM to generate questions and answers for Code | | +| MetadataQAText | Uses an LLM to generate questions and answers for Text | | +| MetadataSummary | Uses an LLM to generate a summary | | +| MetadataTitle | Uses an LLM to generate a title | | +| HtmlToMarkdownTransformer | Converts html in a node to markdown | scraping | diff --git a/src/content/docs/index.mdx b/src/content/docs/index.mdx index 2e3fedb..b8206e0 100644 --- a/src/content/docs/index.mdx +++ b/src/content/docs/index.mdx @@ -1,39 +1,62 @@ --- title: swiftide -description: Get started building your docs site with Starlight. 
+description: Blazing fast data pipelines for Retrieval Augmented Generation written in Rust template: splash hero: - tagline: Blazing fast document and code indexation for Retrieval Augmented Generation + tagline: Blazing fast data pipelines for Retrieval Augmented Generation written in Rust image: file: ../../assets/logo-full.png actions: - - text: Introduction - link: /introduction/ + - text: What is swiftide? + link: /what-is-swiftide/ icon: right-arrow variant: primary - - text: A simple example - link: /examples/example - icon: right-arrow + - text: View it on Github + link: https://github.com/bosun-ai/swiftide + icon: external variant: secondary --- import { Card, CardGrid } from "@astrojs/starlight/components"; -## Next steps +
+ A quick example - - - [Introduction](introduction) - + ```rust + IngestionPipeline::from_loader(FileLoader::new(".").with_extensions(&["md"])) + .then_chunk(ChunkMarkdown::with_chunk_range(10..512)) + .then(MetadataQACode::new(openai_client.clone())) + .then_in_batch(10, Embed::new(openai_client.clone())) + .then_store_with( + Qdrant::try_from_url(qdrant_url)? + .batch_size(50) + .vector_size(1536) + .collection_name("swiftide-examples".to_string()) + .build()?, + ) + .run() + .await?; + ``` - - [Installation](installation) +
+ + + Load data from various sources, transform it, enrich it with metadata, and persist it in a database. + - - - [Concepts](concepts/big-picture) - - - [Reference](reference/ingestion-pipeline) - + + Qdrant, OpenAI, Redis, FastEmbed, Spider and many more. + + + Write your own loaders, transformers, and storages by extending straight forward traits. + + + Fast, safe, and efficient. Built with Rust's async and streaming features. + + + Part of [Bosun.ai](https://bosun.ai) and actively used in production. + + + Full API documentation available on [docs.rs](https://docs.rs/swiftide/latest/swiftide/) + diff --git a/src/content/docs/reference/cache-filters.md b/src/content/docs/reference/cache-filters.md deleted file mode 100644 index d4a716d..0000000 --- a/src/content/docs/reference/cache-filters.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Cache Filters ---- diff --git a/src/content/docs/reference/embeddings.md b/src/content/docs/reference/embeddings.md deleted file mode 100644 index af42708..0000000 --- a/src/content/docs/reference/embeddings.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Embeddings ---- diff --git a/src/content/docs/reference/ingestion-pipeline.md b/src/content/docs/reference/ingestion-pipeline.md deleted file mode 100644 index ca5f24f..0000000 --- a/src/content/docs/reference/ingestion-pipeline.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Ingestion Pipeline -sidebar: - order: 0 ---- diff --git a/src/content/docs/reference/loaders.md b/src/content/docs/reference/loaders.md deleted file mode 100644 index 81b3229..0000000 --- a/src/content/docs/reference/loaders.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Loaders ---- diff --git a/src/content/docs/reference/persistance.md b/src/content/docs/reference/persistance.md deleted file mode 100644 index caf375b..0000000 --- a/src/content/docs/reference/persistance.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Persistance ---- diff --git a/src/content/docs/reference/transformers.md 
b/src/content/docs/reference/transformers.md deleted file mode 100644 index 4c7ba9a..0000000 --- a/src/content/docs/reference/transformers.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Transformers ---- diff --git a/src/content/docs/troubleshooting.md b/src/content/docs/troubleshooting.md index 9662ac0..c2e684c 100644 --- a/src/content/docs/troubleshooting.md +++ b/src/content/docs/troubleshooting.md @@ -1,3 +1,45 @@ --- title: Troubleshooting +description: Debugging and troubleshooting your pipeline. --- + +When building a pipeline, things can go wrong. We provide several tools to help you debug and troubleshoot your pipeline. + +By default, if _any_ node fails, the pipeline will stop. This is to prevent cascading failures. You can change this behaviour by using `filter_errors` after any step. + +## The `tracing` crate + +[Tracing](https://github.com/tokio-rs/tracing) quickly became the standard for logging in Rust. We use it throughout the pipeline. When you run your pipeline, you can set the log level to `debug` or `trace` to get detailed logs of what is happening. + +To enable tracing you need to configure and add a subscriber. + +```bash +cargo add tracing tracing-subscriber +``` + +Then in the entry point of your program, you can add the following code: + +```rust +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt::init(); + ... +} +``` + +When you then set `RUST_LOG=debug` or `RUST_LOG=trace` you will get detailed logs. Depending on the size of the data, it might be a lot. + +### OpenTelemetry support + +Tracing has best-in-class opentelemetry support. See the [tracing-opentelemetry](https://github.com/tokio-rs/tracing-opentelemetry) crate for more information. + +Note that currently the IngestionNode is attached to every transformation step. Beware of large amounts of tracing data. + +## Helpers and utility functions + +There are several helpers and utility functions available on the pipeline to help you debug and handle errors. 
+ +- `log_all` Logs both passing and failed nodes +- `log_errors` Logs errors only +- `log_nodes` Logs nodes only +- `filter_errors` Filters out errors, only passing nodes diff --git a/src/content/docs/introduction.mdx b/src/content/docs/what-is-swiftide.mdx similarity index 77% rename from src/content/docs/introduction.mdx rename to src/content/docs/what-is-swiftide.mdx index 2434722..77da241 100644 --- a/src/content/docs/introduction.mdx +++ b/src/content/docs/what-is-swiftide.mdx @@ -1,5 +1,6 @@ --- -title: Introduction +title: What is swiftide? +description: A brief introduction to swiftide. --- ## What is swiftide? @@ -7,12 +8,12 @@ title: Introduction import { Image } from "astro:assets"; import pipeline from "/src/assets/ingestion-pipeline.svg"; -Swiftide is a straightforward, easy-to-use, easy-to-extend asynchronous file ingestion and processing library. It is designed to be used in a RAG (Retrieval Augmented Generation) system. It is built to be fast and efficient, with a focus on parallel processing and asynchronous operations. +Swiftide is a straightforward, easy-to-use, easy-to-extend asynchronous data ingestion and processing library. It is designed to be used in a RAG (Retrieval Augmented Generation) system. It is built to be fast and efficient, with a focus on parallel processing and asynchronous operations. ingestion-pipeline -At the same time, swiftide also has a focus on developer experience and ease of use. -It is designed to be simple and intuitive, with a clear and concise API that makes -it easy to get started, build complex pipelines, and bring your own transformations. +At the same time, swiftide focusses on developer experience and ease of use. It is +designed to be simple and intuitive, with a clear and concise API that makes it easy +to get started, build complex pipelines, and bring your own transformations. ## What problem does swiftide solve?