From 89bc8f35d641101e15e7bcd220820e75fea45da2 Mon Sep 17 00:00:00 2001
From: Sam Minot <sminot@fredhutch.org>
Date: Tue, 6 Feb 2024 11:17:11 -0800
Subject: [PATCH] Simplify normalization to aligned reads only

---
 bin/bin_metagenomes.py | 178 +++++++++++++++++------------------------
 1 file changed, 75 insertions(+), 103 deletions(-)

diff --git a/bin/bin_metagenomes.py b/bin/bin_metagenomes.py
index 0ca8933..6a6b106 100755
--- a/bin/bin_metagenomes.py
+++ b/bin/bin_metagenomes.py
@@ -170,7 +170,6 @@ def from_read_alignments(
         self.calc_bin_silhouette_score()
         self.est_genome_abund()
         self.calc_rel_abund()
-        del self.data.uns["filtered_out"]
 
         return self
 
@@ -493,14 +492,11 @@ def est_genome_abund(self):
         self.data.update()
         self.log_content()
 
-    def calc_rel_abund(self, mods=None, prefix="Relative to "):
+    def calc_rel_abund(self, mods=None):
         """
-        Compute the relative abundance in different ways.
-        1. Relative to any feature which is ubiquitously present
-        2. Relative to the total number of reads sequenced
-        3. Relative to the number of reads aligned
+        Normalize the abundance to the number of reads aligned
 
-        For each modality, a layer will be created with the given prefix
+        For each modality, a 'prop' layer will be created
         """
         if mods is None:
             mods = ["bins", "genomes"]
@@ -509,38 +505,26 @@ def calc_rel_abund(self, mods=None, prefix="Relative to "):
             # Get the values for this modality
             abund: pd.DataFrame = self.data.mod[mod].to_df()
 
-            # Identify any variables which are found in all samples
-            comp_vars = [var for var, vals in abund.items() if (vals > 0).all()]
-            logger.info(f"There are {len(comp_vars):,} variables {mod} present in all samples")
-            for var in comp_vars:
-                kw = f"{prefix}{var}"
-                logger.info(f"Computing {mod} {kw}")
-                # Calculate the ratio to the comparison variable
-                ratios = (abund.T / abund[var]).T
-                # Save the table
-                self.data.mod[mod].layers[kw] = ratios
-
-            # Run the analysis in comparison to the number of aligned reads
-            for kw, label in [
-                ("tot_reads", "Total Reads"),
-                ("aligned_reads", "Aligned Reads")
-            ]:
-                assert kw in self.data.mod["genes"].obs
-                new_kw = f"{prefix}{label}"
-                logger.info(f"Computing {mod} {new_kw}")
-                rel_abund = (
-                    abund.T /
-                    self.data.mod["genes"].obs[kw].reindex(
-                        index=abund.index.values
-                    )
-                ).T
-                self.data.mod[mod].layers[new_kw] = rel_abund
+            assert "aligned_reads" in self.data.mod["genes"].obs
+            logger.info(f"Normalizing {mod} to sequencing depth")
+            rel_abund = (
+                abund.T /
+                self.data.mod["genes"].obs["aligned_reads"].reindex(
+                    index=abund.index.values
+                )
+            ).T
+            self.data.mod[mod].layers["prop"] = rel_abund
 
     def compare_groups(self, category, mods=None):
         assert category in self.data.obs.columns.values
 
         # Get the metadata values
-        meta: pd.Series = self.data.obs[category].dropna()
+        meta: pd.Series = (
+            self.data
+            .obs[category]
+            .reindex(index=self.filtered_samples)
+            .dropna()
+        )
         logger.info(f"Samples with {category} defined: {meta.shape[0]:,}")
 
         # We can only support the comparison of two groups (at this point)
@@ -557,30 +541,16 @@ def compare_groups(self, category, mods=None):
             self.compare_groups_mod(meta, mod)
             self.log_content()
 
-    @staticmethod
-    def pack_meta_layer(meta_name, layer):
-        return f"~ {meta_name} - {layer}"
-
-    @staticmethod
-    def unpack_meta_layer(varm):
-        if varm.startswith("~ "):
-            return varm[2:].split(" - ", 1)
-        else:
-            return None, None
-
     def compare_groups_mod(self, meta: pd.Series, mod: str):
         logger.info(f"Comparing samples on the basis of {mod}")
 
-        # Use each of the different relative abundance tables
-        # which have been computed for this modality
-        for layer in self.data.mod[mod].layers:
-            label = self.pack_meta_layer(meta.name, layer)
-            logger.info(f"Comparing samples {label}")
-            abund: pd.DataFrame = self.data.mod[mod].to_df(layer)
+        # Use the relative abundance table computed for this modality
+        logger.info(f"Comparing samples using {mod} content")
+        abund: pd.DataFrame = self.data.mod[mod].to_df("prop")
 
-            self.data.mod[mod].varm[label] = (
-                self.compare_groups_single(meta, abund)
-            )
+        self.data.mod[mod].varm[f"~{meta.name}"] = (
+            self.compare_groups_single(meta, abund)
+        )
 
     def compare_groups_single(
         self,
@@ -622,11 +592,15 @@ def mannwhitneyu(var: str, vals: pd.Series, control_obs, comparison_obs):
 
     def to_h5ad(self, output_folder):
 
+        if "filtered_out" in self.data.uns:
+            del self.data.uns["filtered_out"]
         for mod in self.data.mod:
             self.data.mod[mod].write_h5ad(f"{output_folder}/metagenome.{mod}.h5ad")
 
     def to_csv(self, output_folder):
 
+        if "filtered_out" in self.data.uns:
+            del self.data.uns["filtered_out"]
         self.to_csv_obj(self.data, f"{output_folder}/metagenome")
 
     def to_csv_obj(self, obj, path):
@@ -681,23 +655,29 @@ def plot(self, output_folder):
         # and genomes extend horizontally.
 
         for bins_varm in self.data.mod["bins"].varm:
-            bins_meta, bins_layer = self.unpack_meta_layer(bins_varm)
+            assert bins_varm.startswith("~"), f"Expected ~meta, not {bins_varm}"
+            bins_meta = bins_varm[1:]
 
             for genomes_varm in self.data.mod["genomes"].varm:
-                genomes_meta, genome_layer = self.unpack_meta_layer(genomes_varm)
+                assert genomes_varm.startswith("~"), f"Expected ~meta, not {genomes_varm}"
+                genomes_meta = genomes_varm[1:]
 
                 if bins_meta == genomes_meta:
-                    output_fn = ' - '.join([bins_meta, "bins", bins_layer, "genomes", genome_layer])
+                    output_fn = genomes_meta
                     output_fp = f"{output_folder}/{output_fn}"
                     self.write_image(
                         bins_meta,
-                        bins_layer,
-                        genome_layer,
                         output_fp,
                     )
 
     @staticmethod
-    def sort_index(df):
+    def log_scale(df: pd.DataFrame):
+
+        lowest = df.apply(lambda c: c[c > 0].min()).min()
+        return df.clip(lower=lowest).apply(np.log10)
+
+    @staticmethod
+    def sort_index(df: pd.DataFrame):
         return df.index.values[
             hierarchy.leaves_list(
                 hierarchy.linkage(
@@ -711,8 +691,6 @@ def sort_index(df):
     def write_image(
         self,
         meta_cname,
-        bins_layer,
-        genome_layer,
         output_fp
     ):
         """"
@@ -796,7 +774,7 @@ def write_image(
         genomes_df: pd.DataFrame = (
             self.data
             .mod["genomes"]
-            .to_df(genome_layer)
+            .to_df("prop")
         )
         sample_order = self.sort_index(genomes_df)
         genomes_df = genomes_df.reindex(
@@ -804,19 +782,10 @@ def write_image(
             index=sample_order
         ).fillna(0)
 
-        if genome_layer.startswith("Relative to "):
-            comparitor = genome_layer[len("Relative to "):]
-            if comparitor in genomes_df.columns.values:
-                genomes_df = (
-                    genomes_df
-                    .drop(columns=[comparitor])
-                    .reindex(columns=genome_order)
-                )
-
         self.heatmap(
-            genomes_df.T,
+            self.log_scale(genomes_df.T),
             fig,
-            value_label=genome_layer,
+            value_label="Proportion of Aligned Reads (log10)",
             row=2,
             col=4,
             showscale=True,
@@ -826,7 +795,7 @@ def write_image(
             colorbar_xpad=0,
             colorbar_orientation="h",
             colorbar_title_side="top",
-            colorbar_title_text=f"Genome Group Abundance<br>{genome_layer}",
+            colorbar_title_text="Genome Group Abundance<br>Proportion of Aligned Reads (log10)",
             colorbar_y=0.45,
             colorbar_len=0.25
         )
@@ -875,7 +844,7 @@ def write_image(
             (
                 self.data
                 .mod["genomes"]
-                .varm[f"~ {meta_cname} - {genome_layer}"]
+                .varm[f"~{meta_cname}"]
                 ["neg_log10_qvalue"]
             ),
             "FDR-adjusted p-value (-log10)",
@@ -895,7 +864,8 @@ def write_image(
             row=2,
             col=6,
             orient="h",
-            showlegend=False
+            showlegend=False,
+            log=True
         )
 
         # Bins Information (extending vertically from the central heatmap)
@@ -917,26 +887,18 @@ def write_image(
         bins_df: pd.DataFrame = (
             self.data
             .mod["bins"]
-            .to_df(bins_layer)
+            .to_df("prop")
         )
         sample_order = self.sort_index(bins_df)
         bins_df = bins_df.reindex(
             columns=bin_order,
             index=sample_order
         ).fillna(0)
-        if bins_layer.startswith("Relative to "):
-            comparitor = bins_layer[len("Relative to "):]
-            if comparitor in bins_df.columns.values:
-                bins_df = (
-                    bins_df
-                    .drop(columns=[comparitor])
-                    .reindex(columns=bin_order)
-                )
 
         self.heatmap(
-            bins_df,
+            self.log_scale(bins_df),
             fig,
-            value_label=bins_layer,
+            value_label="Proportion of Aligned Reads (log10)",
             row=4,
             col=2,
             coloraxis="coloraxis4",
@@ -946,7 +908,7 @@ def write_image(
             colorbar_y=0.55,
             colorbar_len=0.25,
             colorbar_title_side="right",
-            colorbar_title_text=f"Gene Bin Abundance<br>{bins_layer}",
+            colorbar_title_text="Gene Bin Abundance<br>Proportion of Aligned Reads (log10)",
             colorbar_xpad=0
         )
 
@@ -981,7 +943,7 @@ def write_image(
             (
                 self.data
                 .mod["bins"]
-                .varm[f"~ {meta_cname} - {bins_layer}"]
+                .varm[f"~{meta_cname}"]
                 ["neg_log10_qvalue"]
             ),
             "FDR-adjusted p-value (-log10)",
@@ -990,7 +952,7 @@ def write_image(
             col=2
         )
 
-        # Boxplot comparing genome abundances by sample group
+        # Boxplot comparing gene bin abundances by sample group
         self.box(
             bins_df,
             self.data.obs[meta_cname].reindex(index=sample_order),
@@ -998,17 +960,12 @@ def write_image(
             "Gene Bin Abundance",
             fig,
             row=7,
-            col=2
+            col=2,
+            log=True
         )
 
         logger.info(fig.layout)
-        title_text = " - ".join(
-            [
-                f"Association with {meta_cname}",
-                f"Gene Bins: {bins_layer}",
-                f"Genome Groups: {genome_layer}"
-            ]
-        )
+        title_text = f"Association with {meta_cname}"
         fig.update_layout(
             title_text=title_text,
             title_xanchor="center",
@@ -1081,7 +1038,7 @@ def heatmap(
             col=col
         )
 
-    def bar(self, vals: pd.Series, label: str, fig, row, col, orient="v"):
+    def bar(self, vals: pd.Series, label: str, fig, row, col, orient="v", log=False):
         assert orient in ["v", "h"]
         kwargs = (
             dict(
@@ -1109,18 +1066,30 @@ def bar(self, vals: pd.Series, label: str, fig, row, col, orient="v"):
         )
         label = label.replace(" ", "<br>")
         self.label_axis(fig, row, col, orient, label)
+        if log:
+            self.logscale_axis(fig, row, col, orient)
+
+    def label_axis(self, fig, row, col, orient, title):
+        self.update_axis(
+            fig, row, col, orient, title=title
+        )
+
+    def logscale_axis(self, fig, row, col, orient):
+        self.update_axis(
+            fig, row, col, orient, type="log"
+        )
 
     @staticmethod
-    def label_axis(fig, row, col, orient, label):
+    def update_axis(fig, row, col, orient, **kwargs):
         if orient == "v":
             fig.for_each_yaxis(
-                lambda axis: axis.update(title=label),
+                lambda axis: axis.update(**kwargs),
                 row=row,
                 col=col
             )
         else:
             fig.for_each_xaxis(
-                lambda axis: axis.update(title=label),
+                lambda axis: axis.update(**kwargs),
                 row=row,
                 col=col
             )
@@ -1135,6 +1104,7 @@ def box(
         row,
         col,
         orient="v",
+        log=False,
         **box_kwargs
     ):
         assert orient in ["v", "h"]
@@ -1179,6 +1149,8 @@ def box(
             )
         label = label.replace(" ", "<br>")
         self.label_axis(fig, row, col, orient, label)
+        if log:
+            self.logscale_axis(fig, row, col, orient)
 
 
 def bin_metagenomes(