Cell type annotation: scanorama knn workflow #884

Open · wants to merge 6 commits into base: main
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -119,6 +119,8 @@

* `metadata/copy_obs` component: Added a component to copy an .obs column from a MuData object to another (PR #874).

* `workflows/annotation/scanorama_knn` workflow: Cell type annotation based on Scanorama integration of query and reference datasets, followed by KNN label transfer (PR #884).

## MINOR CHANGES

* `resources_test_scripts/cellranger_atac_tiny_bcl.sh` script: generate counts from fastq files using CellRanger atac count (PR #726).
165 changes: 165 additions & 0 deletions src/workflows/annotation/scanorama_knn/config.vsh.yaml
@@ -0,0 +1,165 @@
name: "scanorama_knn"
namespace: "workflows/annotation"
description: "Cell type annotation workflow by performing scanorama integration of reference and query dataset followed by KNN label transfer."
authors:
- __merge__: /src/authors/dorien_roosen.yaml
roles: [ author, maintainer ]
- __merge__: /src/authors/weiwei_schultz.yaml
roles: [ contributor ]

argument_groups:
- name: Query Input
arguments:
- name: "--id"
required: true
type: string
description: ID of the sample.
example: foo
- name: "--input"
required: true
type: file
description: Input dataset consisting of the (unlabeled) query observations. The dataset is expected to be pre-processed in the same way as --reference.
example: input.h5mu
- name: "--modality"
description: Which modality to process. Should match the modality of the --reference dataset.
type: string
default: "rna"
required: false
- name: "--input_obsm_embedding"
example: "X_pca"
type: string
description: Embedding .obsm column to use as input for integration. Should match the embedding .obsm column of the --reference dataset.
- name: "--input_obs_batch_label"
type: string
description: "The .obs field in the input (query) dataset containing the batch labels."
example: "sample"
required: true

- name: Reference input
arguments:
- name: "--reference"
required: true
type: file
description: Reference dataset consisting of the labeled observations to train the KNN classifier on. The dataset is expected to be pre-processed in the same way as the --input query dataset.
example: reference.h5mu
- name: "--reference_obs_targets"
type: string
example: [ ann_level_1, ann_level_2, ann_level_3, ann_level_4, ann_level_5, ann_finest_level ]
required: true
multiple: true
description: The `.obs` key(s) of the target labels to transfer.
- name: "--reference_obs_batch_label"
type: string
description: "The .obs field in the reference dataset containing the batch labels."
example: "sample"
required: true

- name: Scanorama integration options
arguments:
- name: "--knn"
type: integer
description: "Number of nearest neighbors to use for matching during scanorama integration."
default: 20
- name: "--batch_size"
type: integer
description: "The batch size used in the alignment vector computation. Useful when integrating very large (>100k samples) datasets. Set to large value that runs within available memory."
default: 5000
- name: "--sigma"
type: double
description: "Correction smoothing parameter on Gaussian kernel."
default: 15
- name: "--approx"
type: boolean
description: "Use approximate nearest neighbors with Python annoy; greatly speeds up matching runtime."
default: True
- name: "--alpha"
type: double
description: "Alignment score minimum cutoff"
default: 0.1

- name: Leiden clustering options
arguments:
- name: "--leiden_resolution"
type: double
description: Control the coarseness of the clustering. Higher values lead to more clusters.
min: 0
default: [1]
multiple: true

- name: Neighbor classifier arguments
arguments:
- name: "--weights"
type: string
default: "uniform"
choices: ["uniform", "distance"]
description: |
Weight function used in prediction. Possible values are:
`uniform` (all points in each neighborhood are weighted equally) or
`distance` (weight points by the inverse of their distance)
- name: "--n_neighbors"
type: integer
default: 15
required: false
description: |
The number of neighbors to use in the k-neighbor graph structure used for fast approximate nearest neighbor search with PyNNDescent.
Larger values will result in more accurate search results at the cost of computation time.

- name: "Outputs"
arguments:
- name: "--output"
type: file
required: true
direction: output
description: The query data in .h5mu format with cell labels predicted by the classifier trained on the reference.
example: output.h5mu
- name: "--output_obs_predictions"
type: string
required: false
multiple: true
description: |
In which `.obs` slots to store the predicted cell labels.
If provided, must have the same length as `--reference_obs_targets`.
If empty, will default to the `reference_obs_targets` combined with the `"_pred"` suffix.
- name: "--output_obs_probability"
type: string
required: false
multiple: true
description: |
In which `.obs` slots to store the probability of the predictions.
If provided, must have the same length as `--reference_obs_targets`.
If empty, will default to the `reference_obs_targets` combined with the `"_probability"` suffix.
- name: "--output_obsm_integrated"
type: string
default: "X_integrated_scanorama"
required: false
description: "In which .obsm slot to store the integrated embedding."
- name: "--output_compression"
type: string
description: |
The compression format to be used on the output h5mu object.
choices: ["gzip", "lzf"]
required: false
example: "gzip"

dependencies:
- name: workflows/integration/scanorama_leiden
alias: scanorama_leiden_workflow
- name: labels_transfer/pynndescent_knn
- name: dataflow/split_h5mu
- name: dataflow/concatenate_h5mu
- name: metadata/add_id
- name: metadata/copy_obs

resources:
- type: nextflow_script
path: main.nf
entrypoint: run_wf

test_resources:
- type: nextflow_script
path: test.nf
entrypoint: test_wf
- path: /resources_test/scgpt

runners:
- type: nextflow
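
For reviewers, a rough illustration of the two core steps this workflow chains together (Scanorama integration of query and reference, then KNN label transfer). This is not the component code: file names and the `ann_level_1` target are placeholders, and scikit-learn's `KNeighborsClassifier` stands in for the `pynndescent_knn` component for brevity.

```python
# Conceptual sketch only -- the workflow itself wires together existing components
# (scanorama_leiden, pynndescent_knn). File names and the target column are hypothetical.
import mudata
import scanorama
from sklearn.neighbors import KNeighborsClassifier

reference = mudata.read_h5mu("reference.h5mu").mod["rna"]
query = mudata.read_h5mu("input.h5mu").mod["rna"]

# 1. Integrate reference and query into a shared embedding, written to .obsm["X_scanorama"]
scanorama.integrate_scanpy(
    [reference, query],
    knn=20, batch_size=5000, sigma=15, approx=True, alpha=0.1,
)

# 2. Train a KNN classifier on the reference embedding and transfer labels to the query
knn = KNeighborsClassifier(n_neighbors=15, weights="uniform")
knn.fit(reference.obsm["X_scanorama"], reference.obs["ann_level_1"])
query.obs["ann_level_1_pred"] = knn.predict(query.obsm["X_scanorama"])
query.obs["ann_level_1_probability"] = knn.predict_proba(query.obsm["X_scanorama"]).max(axis=1)
```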
21 changes: 21 additions & 0 deletions src/workflows/annotation/scanorama_knn/integration_test.sh
@@ -0,0 +1,21 @@
#!/bin/bash

# get the root directory of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

export NXF_VER=21.10.6

viash ns build -q scanorama_knn

nextflow \
run . \
-main-script src/workflows/annotation/scanorama_knn/test.nf \
-entry test_wf \
-resume \
-profile no_publish \
-c src/workflows/utils/labels_ci.config \
-c src/workflows/utils/integration_tests.config \
-with-trace work/trace.txt
164 changes: 164 additions & 0 deletions src/workflows/annotation/scanorama_knn/main.nf
@@ -0,0 +1,164 @@
workflow run_wf {
take:
input_ch

main:


output_ch = input_ch
// Set aside the output for this workflow to avoid conflicts
| map {id, state ->
def new_state = state + ["workflow_output": state.output]
[id, new_state]
}
// add id as _meta join_id to be able to merge with the source channel at the end of the workflow
| map{ id, state ->
def new_state = state + ["_meta": ["join_id": id]]
[id, new_state]
}
| view {"After adding join_id: $it"}
// Add 'query' id to .obs columns of query dataset
| add_id.run(
fromState: [
"input": "input",
],
args:[
"input_id": "query",
"obs_output": "dataset",
],
toState: ["input": "output"])
// Add 'reference' id to .obs columns of reference dataset
| add_id.run(
fromState:[
"input": "reference",
],
args:[
"input_id": "reference",
"obs_output": "dataset"
],
toState: ["reference": "output"])
// Make sure that the query and reference datasets have batch information in the same .obs column
// by copying the respective .obs columns to the .obs column "batch_label"
| copy_obs.run(
fromState: [
"input": "input",
"modality": "modality",
"input_obs_key": "input_obs_batch_label",
],
args: [
"output_obs_key": "batch_label"
],
toState: [
"input": "output"
]
)
| copy_obs.run(
fromState: [
"input": "reference",
"modality": "modality",
"input_obs_key": "reference_obs_batch_label",
],
args: [
"output_obs_key": "batch_label"
],
toState: [
"reference": "output"
]
)
// Concatenate query and reference datasets prior to integration
| concatenate_h5mu.run(
fromState: { id, state -> [
"input": [state.input, state.reference]
]
},
args: [
"input_id": ["query", "reference"],
"other_axis_mode": "move"
],
toState: ["input": "output"]
)
| view {"After concatenation: $it"}
// Run scanorama integration with leiden clustering
| scanorama_leiden_workflow.run(
fromState: { id, state ->
[
"id": id,
"input": state.input,
// "layer": state.layer,
"modality": state.modality,
"obsm_input": state.input_obsm_embedding, //
"obsm_output": state.output_obsm_integrated,
"leiden_resolution": state.leiden_resolution,
"knn": state.knn,
"batch_size": state.batch_size,
"sigma": state.sigma,
"approx": state.approx,
"alpha": state.alpha
]},
args: [
"uns_neighbors": "scanorama_integration_neighbors",
"obsp_neighbor_distances": "scanorama_integration_distances",
"obsp_neighbor_connectivities": "scanorama_integration_connectivities",
"obs_cluster": "scanorama_integration_leiden",
"obsm_umap": "X_leiden_scanorama_umap",
"obs_batch": "batch_label"
],
toState: ["input": "output"]
)
| view {"After integration: $it"}
// Split integrated dataset back into a separate reference and query dataset
| split_h5mu.run(
fromState: [
"input": "input",
"modality": "modality"
],
args: [
"obs_feature": "dataset",
"output_files": "sample_files.csv",
"drop_obs_nan": "true",
"output": "ref_query"
],
toState: [
"output": "output",
"output_files": "output_files"
],
auto: [ publish: true ]
)
| view {"After sample splitting: $it"}
// map the integrated query and reference datasets back to the state
| map {id, state ->
def outputDir = state.output
def files = readCsv(state.output_files.toUriString())
def query_file = files.findAll{ dat -> dat.name == 'query' }
assert query_file.size() == 1, 'there should only be one query file'
def reference_file = files.findAll{ dat -> dat.name == 'reference' }
assert reference_file.size() == 1, 'there should only be one reference file'
def integrated_query = outputDir.resolve(query_file.filename)
def integrated_reference = outputDir.resolve(reference_file.filename)
def newKeys = ["integrated_query": integrated_query, "integrated_reference": integrated_reference]
[id, state + newKeys]
}
| view {"After splitting query: $it"}
// Perform KNN label transfer from integrated reference to integrated query
| pynndescent_knn.run(
fromState: [
"input": "integrated_query",
"modality": "modality",
"input_obsm_features": "output_obsm_integrated",
"reference": "integrated_reference",
"reference_obsm_features": "output_obsm_integrated",
"reference_obs_targets": "reference_obs_targets",
"output_obs_predictions": "output_obs_predictions",
"output_obs_probability": "output_obs_probability",
"output_compression": "output_compression",
"weights": "weights",
"n_neighbors": "n_neighbors",
"output": "workflow_output"
],
toState: {id, output, state -> ["output": output.output]},
auto: [ publish: true ]
)

emit:
output_ch
}
10 changes: 10 additions & 0 deletions src/workflows/annotation/scanorama_knn/nextflow.config
@@ -0,0 +1,10 @@
manifest {
nextflowVersion = '!>=20.12.1-edge'
}

params {
rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString()
}

// include common settings
includeConfig("${params.rootDir}/src/workflows/utils/labels.config")