From 12566ddd82b23cb3ab978af70b1f2a40d3592084 Mon Sep 17 00:00:00 2001
From: Ryan Williams <ryan.williams@tiledb.com>
Date: Thu, 16 May 2024 21:39:39 -0400
Subject: [PATCH] Tutorial: TileDB-SOMA append-mode (#2571)

---
 .../notebooks/tutorial_soma_append_mode.ipynb | 699 ++++++++++++++++++
 .../notebooks/tutorial_soma_append_mode.ipynb |   1 +
 2 files changed, 700 insertions(+)
 create mode 100644 apis/python/notebooks/tutorial_soma_append_mode.ipynb
 create mode 120000 doc/source/notebooks/tutorial_soma_append_mode.ipynb

diff --git a/apis/python/notebooks/tutorial_soma_append_mode.ipynb b/apis/python/notebooks/tutorial_soma_append_mode.ipynb
new file mode 100644
index 0000000000..28e22b602c
--- /dev/null
+++ b/apis/python/notebooks/tutorial_soma_append_mode.ipynb
@@ -0,0 +1,699 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "# Tutorial: TileDB-SOMA append-mode"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "As of TileDB-SOMA 1.5.0, we're excited to offer support for append mode.\n",
+        "\n",
+        "Use-cases include ingesting H5AD/AnnData from multiple sequencing runs over time, accumulating the data over time, into millions of cells."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "First, we'll do the usual package imports:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "tiledbsoma.__version__              1.11.1\n",
+            "TileDB-Py version                   0.29.0\n",
+            "TileDB core version (tiledb)        2.23.0\n",
+            "TileDB core version (libtiledbsoma) 2.23.0\n",
+            "python version                      3.11.8.final.0\n",
+            "OS version                          Linux 4.14.336-257.568.amzn2.x86_64\n"
+          ]
+        }
+      ],
+      "source": [
+        "import scanpy as sc\n",
+        "import tiledbsoma\n",
+        "import tiledbsoma.io\n",
+        "import tiledbsoma.logging\n",
+        "\n",
+        "tiledbsoma.show_package_versions()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "Next we'll set up where our data are going. Note that to change the storage backend, everything in this notebook below this URI-selection cell is completely unchanged -- whether you store your data on TileDB Cloud, cloud storage such as S3, instance-local NVME, or what have you."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "'/tmp/append-example-20240516-203638'"
+            ]
+          },
+          "execution_count": 2,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "import datetime\n",
+        "\n",
+        "stamp = datetime.datetime.today().strftime(\"%Y%m%d-%H%M%S\")\n",
+        "# experiment_uri = f\"tiledb://johnkerl-tiledb/s3://tiledb-johnkerl/scratch/append-example-{stamp}\"\n",
+        "# Self-contained demo\n",
+        "experiment_uri = f\"/tmp/append-example-{stamp}\"\n",
+        "experiment_uri"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "## Create the initial SOMA Experiment"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "Next we'll prep some input data. To make things easy for this self-contained demo, we'll use Scanpy's `pbmc3k`, with a custom column."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [],
+      "source": [
+        "ad1 = sc.datasets.pbmc3k()\n",
+        "sc.pp.calculate_qc_metrics(ad1, inplace=True)\n",
+        "ad1.obs[\"when\"] = [\"Monday\"] * len(ad1.obs)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "Now we're ready to ingest the data into a SOMA experiment. Since SOMA is multimodal, we'll specify the destination modality, or measurement name, to be \"RNA\"."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [],
+      "source": [
+        "measurement_name = \"RNA\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Registration: registering isolated AnnData object.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Wrote   /tmp/append-example-20240516-203638/obs\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Wrote   /tmp/append-example-20240516-203638/ms/RNA/var\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Writing /tmp/append-example-20240516-203638/ms/RNA/X/data\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "<frozen abc>:119: FutureWarning: SparseDataset is deprecated and will be removed in late 2024. It has been replaced by the public classes CSRDataset and CSCDataset.\n",
+            "\n",
+            "For instance checks, use `isinstance(X, (anndata.experimental.CSRDataset, anndata.experimental.CSCDataset))` instead.\n",
+            "\n",
+            "For creation, use `anndata.experimental.sparse_dataset(X)` instead.\n",
+            "\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Wrote   /tmp/append-example-20240516-203638/ms/RNA/X/data\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Wrote   /tmp/append-example-20240516-203638\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "'/tmp/append-example-20240516-203638'"
+            ]
+          },
+          "execution_count": 5,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "tiledbsoma.logging.info()\n",
+        "tiledbsoma.io.from_anndata(experiment_uri, ad1, measurement_name=measurement_name)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "Now let's read back the data. We'll take a look at `obs`, `var`, and `X`."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "**obs**: For this initial ingest, there are obs IDs ending in `-1`, the `when` is `Monday`, and there are 2700 rows. Also note that since TileDB is a columnar database, when we select certain columns, those are the only ones loaded from disk. This positively impacts performance at cloud scale."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "                obs_id  n_genes_by_counts    when\n",
+            "0     AAACATACAACCAC-1                781  Monday\n",
+            "1     AAACATTGAGCTAC-1               1352  Monday\n",
+            "2     AAACATTGATCAGC-1               1131  Monday\n",
+            "3     AAACCGTGCTTCCG-1                960  Monday\n",
+            "4     AAACCGTGTATGCG-1                522  Monday\n",
+            "...                ...                ...     ...\n",
+            "2695  TTTCGAACTCTCAT-1               1155  Monday\n",
+            "2696  TTTCTACTGAGGCA-1               1227  Monday\n",
+            "2697  TTTCTACTTCCTCG-1                622  Monday\n",
+            "2698  TTTGCATGAGAGGC-1                454  Monday\n",
+            "2699  TTTGCATGCCTCAC-1                724  Monday\n",
+            "\n",
+            "[2700 rows x 3 columns]\n"
+          ]
+        }
+      ],
+      "source": [
+        "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+        "    print(\n",
+        "        exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n",
+        "        .concat()\n",
+        "        .to_pandas()\n",
+        "    )"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "**var**: Let's also look at `var`, selecting out the join IDs (which index columns of `X`) as well as the Ensembl-format and NCBI-format gene IDs:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "       soma_joinid        var_id         gene_ids\n",
+            "0                0    MIR1302-10  ENSG00000243485\n",
+            "1                1       FAM138A  ENSG00000237613\n",
+            "2                2         OR4F5  ENSG00000186092\n",
+            "3                3  RP11-34P13.7  ENSG00000238009\n",
+            "4                4  RP11-34P13.8  ENSG00000239945\n",
+            "...            ...           ...              ...\n",
+            "32733        32733    AC145205.1  ENSG00000215635\n",
+            "32734        32734         BAGE5  ENSG00000268590\n",
+            "32735        32735    CU459201.1  ENSG00000251180\n",
+            "32736        32736    AC002321.2  ENSG00000215616\n",
+            "32737        32737    AC002321.1  ENSG00000215611\n",
+            "\n",
+            "[32738 rows x 3 columns]\n"
+          ]
+        }
+      ],
+      "source": [
+        "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+        "    print(\n",
+        "        exp.ms[\"RNA\"]\n",
+        "        .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n",
+        "        .concat()\n",
+        "        .to_pandas()\n",
+        "    )"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "**X**: Lastly let's look at the expression matrix, in COO format. (You can convert to other formats if you like.) Its rows and columns are indexed by the `soma_joinid` of the `obs` and `var` dataframes, respectively."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "         soma_dim_0  soma_dim_1  soma_data\n",
+            "0                 0          70        1.0\n",
+            "1                 0         166        1.0\n",
+            "2                 0         178        2.0\n",
+            "3                 0         326        1.0\n",
+            "4                 0         363        1.0\n",
+            "...             ...         ...        ...\n",
+            "2286879        2699       32697        1.0\n",
+            "2286880        2699       32698        7.0\n",
+            "2286881        2699       32702        1.0\n",
+            "2286882        2699       32705        1.0\n",
+            "2286883        2699       32708        3.0\n",
+            "\n",
+            "[2286884 rows x 3 columns]\n",
+            "\n",
+            "((0, 2699), (0, 32732))\n"
+          ]
+        }
+      ],
+      "source": [
+        "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+        "    X = exp.ms[\"RNA\"].X[\"data\"]\n",
+        "    print(X.read().tables().concat().to_pandas())\n",
+        "    print()\n",
+        "    print(X.used_shape())"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "## Appending a new dataset to the SOMA Experiment"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "Now, let's simiulate another day's sequencing run. For simplicity of this demo notebook, we'll mutate the previous dataset, changing the obs IDs to have a `-2` suffix, and also putting `Tuesday` in the `when` column. Also, we'll multiply the `X` values by 10."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [],
+      "source": [
+        "ad2 = ad1\n",
+        "ad2.obs.index = [e.replace(\"-1\", \"-2\") for e in ad1.obs.index]\n",
+        "ad2.obs[\"when\"] = [\"Tuesday\"] * len(ad2.obs)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [],
+      "source": [
+        "ad2.X *= 10"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "Now we simply ingest as before -- the only additional step is a black-box registration step which detects which cell IDs are new (here, all of them) and which gene IDs are new (here, none of them).\n",
+        "\n",
+        "The registration takes two forms, either of which you can use depending on your use-case: `tiledbsoma.io.register_anndatas` for in-memory AnnData objects, or `tiledbsoma.io.register_h5ads` for on-storage AnnData objects."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Registration: starting with experiment /tmp/append-example-20240516-203638\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Registration: found nobs=2700 nvar=32738 from experiment.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Registration: registering AnnData object.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Registration: accumulated to nobs=5400 nvar=32738.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Registration: complete.\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Wrote   /tmp/append-example-20240516-203638/obs\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Wrote   /tmp/append-example-20240516-203638/ms/RNA/var\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Writing /tmp/append-example-20240516-203638/ms/RNA/X/data\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Wrote   /tmp/append-example-20240516-203638/ms/RNA/X/data\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Wrote   /tmp/append-example-20240516-203638\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "'/tmp/append-example-20240516-203638'"
+            ]
+          },
+          "execution_count": 11,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "rd = tiledbsoma.io.register_anndatas(\n",
+        "    experiment_uri,\n",
+        "    [ad2],\n",
+        "    measurement_name=measurement_name,\n",
+        "    obs_field_name=\"obs_id\",\n",
+        "    var_field_name=\"var_id\",\n",
+        ")\n",
+        "\n",
+        "tiledbsoma.io.from_anndata(\n",
+        "    experiment_uri,\n",
+        "    ad2,\n",
+        "    measurement_name=measurement_name,\n",
+        "    registration_mapping=rd,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "Now let's read back the appended data. There are now obs IDs ending in `-1` as well as `-2`, the `when` includes `Monday` as well as `Tuesday`, and there are 5400 rows.\n",
+        "\n",
+        "(For `Wednesday` and onward, it'll simply be the same pattern -- we can grow our data iteratively over time, to arbitrary sizes.)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "                obs_id  n_genes_by_counts     when\n",
+            "0     AAACATACAACCAC-1                781   Monday\n",
+            "1     AAACATTGAGCTAC-1               1352   Monday\n",
+            "2     AAACATTGATCAGC-1               1131   Monday\n",
+            "3     AAACCGTGCTTCCG-1                960   Monday\n",
+            "4     AAACCGTGTATGCG-1                522   Monday\n",
+            "...                ...                ...      ...\n",
+            "5395  TTTCGAACTCTCAT-2               1155  Tuesday\n",
+            "5396  TTTCTACTGAGGCA-2               1227  Tuesday\n",
+            "5397  TTTCTACTTCCTCG-2                622  Tuesday\n",
+            "5398  TTTGCATGAGAGGC-2                454  Tuesday\n",
+            "5399  TTTGCATGCCTCAC-2                724  Tuesday\n",
+            "\n",
+            "[5400 rows x 3 columns]\n"
+          ]
+        }
+      ],
+      "source": [
+        "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+        "    print(\n",
+        "        exp.obs.read(column_names=[\"obs_id\", \"n_genes_by_counts\", \"when\"])\n",
+        "        .concat()\n",
+        "        .to_pandas()\n",
+        "    )"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "Let's also look at `var`, as before. Since we had data for more cells but for the same genes, there is nothing new here. The `obs` table grew downward with the new cells, and `X` grew downward with new rows, but `var` stayed the same.\n",
+        "\n",
+        "In real-world data, occasionally you will see a gene expressed in subsequent data which wasn't expressed in the initial data. That's fine -- you'll simply see `var` grow just a bit for those newly encountered gene IDs, with corresponding new columns for `X`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "       soma_joinid        var_id         gene_ids\n",
+            "0                0    MIR1302-10  ENSG00000243485\n",
+            "1                1       FAM138A  ENSG00000237613\n",
+            "2                2         OR4F5  ENSG00000186092\n",
+            "3                3  RP11-34P13.7  ENSG00000238009\n",
+            "4                4  RP11-34P13.8  ENSG00000239945\n",
+            "...            ...           ...              ...\n",
+            "32733        32733    AC145205.1  ENSG00000215635\n",
+            "32734        32734         BAGE5  ENSG00000268590\n",
+            "32735        32735    CU459201.1  ENSG00000251180\n",
+            "32736        32736    AC002321.2  ENSG00000215616\n",
+            "32737        32737    AC002321.1  ENSG00000215611\n",
+            "\n",
+            "[32738 rows x 3 columns]\n"
+          ]
+        }
+      ],
+      "source": [
+        "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+        "    print(\n",
+        "        exp.ms[\"RNA\"]\n",
+        "        .var.read(column_names=[\"soma_joinid\", \"var_id\", \"gene_ids\"])\n",
+        "        .concat()\n",
+        "        .to_pandas()\n",
+        "    )"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "tags": []
+      },
+      "source": [
+        "And lastly, the expression matrix which has grown downward with the new cells, while keeping the same width as we didn't introduce new genes:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "         soma_dim_0  soma_dim_1  soma_data\n",
+            "0                 0          70        1.0\n",
+            "1                 0         166        1.0\n",
+            "2                 0         178        2.0\n",
+            "3                 0         326        1.0\n",
+            "4                 0         363        1.0\n",
+            "...             ...         ...        ...\n",
+            "4573763        5399       32697       10.0\n",
+            "4573764        5399       32698       70.0\n",
+            "4573765        5399       32702       10.0\n",
+            "4573766        5399       32705       10.0\n",
+            "4573767        5399       32708       30.0\n",
+            "\n",
+            "[4573768 rows x 3 columns]\n",
+            "\n",
+            "((0, 5399), (0, 32732))\n"
+          ]
+        }
+      ],
+      "source": [
+        "with tiledbsoma.Experiment.open(experiment_uri) as exp:\n",
+        "    X = exp.ms[\"RNA\"].X[\"data\"]\n",
+        "    print(X.read().tables().concat().to_pandas())\n",
+        "    print()\n",
+        "    print(X.used_shape())"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.8"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
diff --git a/doc/source/notebooks/tutorial_soma_append_mode.ipynb b/doc/source/notebooks/tutorial_soma_append_mode.ipynb
new file mode 120000
index 0000000000..0d4def8a25
--- /dev/null
+++ b/doc/source/notebooks/tutorial_soma_append_mode.ipynb
@@ -0,0 +1 @@
+../../../apis/python/notebooks/tutorial_soma_append_mode.ipynb
\ No newline at end of file