snorkel-team · talolard · Feb 12, 2021 · Feb 12, 2021 · Feb 12, 2021 · Feb 12, 2021
diff --git a/.gitignore b/.gitignore
@@ -129,9 +129,10 @@ dmypy.json
 # Editors
 .vscode/
 .code-workspace*
-
+.idea/
 # Dask
 dask-worker-space/
 
 # nohup
 nohup.out
+
diff --git a/snorkel/labeling/__init__.py b/snorkel/labeling/__init__.py
@@ -4,4 +4,4 @@
 from .apply.core import LFApplier  # noqa: F401
 from .apply.pandas import PandasLFApplier  # noqa: F401
 from .lf.core import LabelingFunction, labeling_function  # noqa: F401
-from .utils import filter_unlabeled_dataframe  # noqa: F401
+from .utils import filter_unlabeled_dataframe  # noqa: F401
diff --git a/snorkel/labeling/model/__init__.py b/snorkel/labeling/model/__init__.py
@@ -1,2 +1,6 @@
 from .baselines import MajorityClassVoter, MajorityLabelVoter, RandomVoter  # noqa: F401
 from .label_model import LabelModel  # noqa: F401
+from .sparse_data_helpers import (
+    train_model_from_known_objective,
+    train_model_from_sparse_event_cooccurence,
+)
diff --git a/snorkel/labeling/model/label_model.py b/snorkel/labeling/model/label_model.py
@@ -15,6 +15,7 @@
 from snorkel.labeling.model.graph_utils import get_clique_tree
 from snorkel.labeling.model.logger import Logger
 from snorkel.types import Config
+from snorkel.types.data import KnownDimensions
 from snorkel.utils.config_utils import merge_config
 from snorkel.utils.lr_schedulers import LRSchedulerConfig
 from snorkel.utils.optimizers import OptimizerConfig
@@ -188,19 +189,7 @@ def _get_augmented_label_matrix(
         # Create a helper data structure which maps cliques (as tuples of member
         # sources) --> {start_index, end_index, maximal_cliques}, where
         # the last value is a set of indices in this data structure
-        self.c_data: Dict[int, _CliqueData] = {}
-        for i in range(self.m):
-            self.c_data[i] = _CliqueData(
-                start_index=i * self.cardinality,
-                end_index=(i + 1) * self.cardinality,
-                max_cliques=set(
-                    [
-                        j
-                        for j in self.c_tree.nodes()
-                        if i in self.c_tree.node[j]["members"]
-                    ]
-                ),
-            )
+
 
         L_ind = self._create_L_ind(L)
 
@@ -225,6 +214,21 @@ def _get_augmented_label_matrix(
         else:
             return L_ind
 
+    def _calculate_clique_data(self):
+        """Populate ``self.c_data`` with one ``_CliqueData`` entry per source.
+
+        For every source index ``i`` in ``range(self.m)`` the entry records
+        ``start_index = i * cardinality``, ``end_index = (i + 1) * cardinality``
+        and the set of clique-tree nodes whose ``"members"`` contain ``i``.
+        Reads ``self.m``, ``self.cardinality`` and ``self.c_tree``.
+        """
+        width = self.cardinality
+        tree = self.c_tree
+        clique_data: Dict[int, _CliqueData] = {}
+        for source in range(self.m):
+            containing = {
+                node for node in tree.nodes() if source in tree.node[node]["members"]
+            }
+            clique_data[source] = _CliqueData(
+                start_index=source * width,
+                end_index=(source + 1) * width,
+                max_cliques=containing,
+            )
+        self.c_data = clique_data
+
     def _build_mask(self) -> None:
         """Build mask applied to O^{-1}, O for the matrix approx constraint."""
         self.mask = torch.ones(self.d, self.d).bool()
@@ -252,6 +256,12 @@ def _generate_O(self, L: np.ndarray, higher_order: bool = False) -> None:
         """
         L_aug = self._get_augmented_label_matrix(L, higher_order=higher_order)
         self.d = L_aug.shape[1]
+        self._generate_O_from_L_aug(L_aug)
+
+    def _generate_O_from_L_aug(self, L_aug):
+        """ Generates O from L_aug. Extracted to a seperate method for the sake of testing
+
+        """
         self.O = (
             torch.from_numpy(L_aug.T @ L_aug / self.n).float().to(self.config.device)
         )
@@ -371,13 +381,15 @@ def get_weights(self) -> np.ndarray:
         >>> np.around(label_model.get_weights(), 2)  # doctest: +SKIP
         array([0.99, 0.99, 0.99])
         """
+        if not hasattr(self,'coverage'):
+            raise NotImplementedError("Models Initialized from an Objective don't have statistics to generate weights")
         accs = np.zeros(self.m)
         cprobs = self.get_conditional_probs()
         for i in range(self.m):
             accs[i] = np.diag(cprobs[i, 1:, :] @ self.P.cpu().detach().numpy()).sum()
         return np.clip(accs / self.coverage, 1e-6, 1.0)
 
-    def predict_proba(self, L: np.ndarray) -> np.ndarray:
+    def predict_proba(self, L: np.ndarray,is_augmented=False) -> np.ndarray:
         r"""Return label probabilities P(Y | \lambda).
 
         Parameters
@@ -400,9 +412,14 @@ def predict_proba(self, L: np.ndarray) -> np.ndarray:
                [0., 1.],
                [0., 1.]])
         """
-        L_shift = L + 1  # convert to {0, 1, ..., k}
-        self._set_constants(L_shift)
-        L_aug = self._get_augmented_label_matrix(L_shift)
+        if not is_augmented:
+            #This is the usual mode
+            L_shift = L + 1  # convert to {0, 1, ..., k}
+            self._set_constants(L_shift) ##TODO - Why do we need this here ?
+            L_aug = self._get_augmented_label_matrix(L_shift)
+        else:
+            #The data came in augmented format, and constants are already set
+            L_aug = L
         mu = self.mu.cpu().detach().numpy()
         jtm = np.ones(L_aug.shape[1])
 
@@ -580,15 +597,35 @@ def _set_class_balance(
             )
         self.P = torch.diag(torch.from_numpy(self.p)).float().to(self.config.device)
 
-    def _set_constants(self, L: np.ndarray) -> None:
-        self.n, self.m = L.shape
+    def _set_constants(
+        self,
+        L: Optional[np.ndarray] = None,
+        known_dimensions: Optional[KnownDimensions] = None,
+    ) -> None:
+        """Set the model's size constants from explicit dimensions or from ``L``.
+
+        Parameters
+        ----------
+        L
+            Label matrix; when used, ``self.n`` and ``self.m`` are taken from
+            ``L.shape``.
+        known_dimensions
+            Explicit dimensions. Takes precedence over ``L`` when both are
+            given and additionally sets ``self.d`` and ``self.cardinality``.
+
+        Raises
+        ------
+        ValueError
+            If neither argument is provided, or if there are fewer than 3
+            labeling functions.
+        """
+        if known_dimensions is not None:
+            self.n = known_dimensions.num_examples
+            self.m = known_dimensions.num_functions
+            self.d = known_dimensions.num_events
+            self.cardinality = known_dimensions.num_classes
+        elif L is not None:
+            self.n, self.m = L.shape
+        else:
+            # Single "nothing supplied" branch: no unreachable fallback is
+            # needed and a type checker can still narrow ``L`` above.
+            raise ValueError(
+                "You must either provide a LabelMatrix or specify known_dimensions"
+            )
         if self.m < 3:
             raise ValueError("L_train should have at least 3 labeling functions")
         self.t = 1
 
     def _create_tree(self) -> None:
+        """Build ``self.c_tree`` over the ``self.m`` sources, then ``self.c_data``."""
         nodes = range(self.m)
         self.c_tree = get_clique_tree(nodes, [])
+        # Clique data is derived from the tree, so refresh it right after.
+        self._calculate_clique_data()
 
     def _execute_logging(self, loss: torch.Tensor) -> Metrics:
         self.eval()
@@ -861,13 +898,6 @@ def fit(
         >>> label_model.fit(L, class_balance=[0.7, 0.3], n_epochs=200, l2=0.4)
         """
         # Set random seed
-        self.train_config: TrainConfig = merge_config(  # type:ignore
-            TrainConfig(), kwargs  # type:ignore
-        )
-        # Update base config so that it includes all parameters
-        random.seed(self.train_config.seed)
-        np.random.seed(self.train_config.seed)
-        torch.manual_seed(self.train_config.seed)
 
         L_shift = L_train + 1  # convert to {0, 1, ..., k}
         if L_shift.max() > self.cardinality:
@@ -876,15 +906,41 @@ def fit(
             )
 
         self._set_constants(L_shift)
-        self._set_class_balance(class_balance, Y_dev)
-        self._create_tree()
+        self._common_training_preamble(
+            class_balance=class_balance, Y_dev=Y_dev, **kwargs
+        )
         lf_analysis = LFAnalysis(L_train)
         self.coverage = lf_analysis.lf_coverages()
 
         # Compute O and initialize params
         if self.config.verbose:  # pragma: no cover
             logging.info("Computing O...")
         self._generate_O(L_shift)
+        self._common_training_loop()
+
+    def _common_training_preamble(
+        self,
+        Y_dev: Optional[np.ndarray] = None,
+        class_balance: Optional[List[float]] = None,
+        **kwargs
+    ) -> None:
+        """Run the setup steps shared by every training entry point.
+
+        Merges ``kwargs`` into a fresh ``TrainConfig`` stored on
+        ``self.train_config``, seeds ``random``, ``numpy`` and ``torch`` with
+        ``train_config.seed``, sets the class balance from ``class_balance``
+        and ``Y_dev``, and builds the clique tree.
+
+        The tree construction reads ``self.m``, so ``_set_constants`` is
+        expected to have been called beforehand.
+        """
+        self.train_config: TrainConfig = merge_config(  # type:ignore
+            TrainConfig(), kwargs  # type:ignore
+        )
+        # Update base config so that it includes all parameters
+        random.seed(self.train_config.seed)
+        np.random.seed(self.train_config.seed)
+        torch.manual_seed(self.train_config.seed)
+        self._set_class_balance(class_balance, Y_dev)
+        self._create_tree()
+
+    def _common_training_loop(self):
+        """
+            Training Logic that is shared across different fit methods, irrespective of the user input format
+        """
         self._init_params()
 
         # Estimate \mu

diff --git a/snorkel/labeling/model/sparse_data_helpers.py b/snorkel/labeling/model/sparse_data_helpers.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+"""Sparse Data Helpers
+
+Indexing throughout this module is 0-based, with the assumption that "abstains" are omitted.
+
+When working with larger datasets, it can be convenient to load the data in sparse format. This module
+provides utilities to do so. We provide functions for a number of cases. 
+
+The user has the AugmentedMatrix (L_ind) in tuple form. AugmentedMatrix is of shape (num_examples, num_funcs*num_classes)
+and the user has a list of tuples (i,j) that indicate that event j occurred for example i.
+
+The user has a list of 3-tuples(i,j,k) such that for document i, labeling function j predicted class k.
+
+The user has a list of 3-tuples (i,j,c) where i and j range over [0, num_funcs*num_classes) such that
+the events i and j were observed to co-occur c times.
+
+The user has a list of 3-tuples (i,j,f) where i and j range over [0, num_funcs*num_classes) such that
+the events i and j co-occur with frequency f, where f is in (0,1].
+
+"""
+from snorkel.labeling.model.label_model import LabelModel
+from typing import List,  Tuple, Iterable, Dict
+from scipy.sparse import csr_matrix
+import numpy as np
+import torch
+from snorkel.types.data import KnownDimensions
+
+
+def predict_probs_from_cliqueset(
+    trained_model: LabelModel, cliqueset_indice_list: Iterable[Iterable[int]]
+):
+    """Predict label probabilities once per cliqueset instead of once per example.
+
+    In the L_ind representation each row is a document and each column is an
+    event "function x predicted class y". The set of 1s in a row is referred
+    to here as a cliqueset. Although num_classes^num_functions cliquesets are
+    possible, in practice only a small subset is observed (in our experiments
+    with num_functions=40 and num_classes=3 we observed 600), so predicting
+    per distinct cliqueset can make inference much faster on large datasets.
+
+    Parameters
+    ----------
+    trained_model
+        A trained model; its ``d`` attribute gives the number of event columns.
+    cliqueset_indice_list
+        Cliquesets, each an iterable of event ids
+        (``func_id * num_labels + label_id``). May be a one-shot iterable.
+
+    Returns
+    -------
+    dict
+        Maps ``tuple(cliqueset)`` to the corresponding row returned by
+        ``trained_model.predict_proba(..., is_augmented=True)``.
+    """
+    # Materialize once: the input is needed both to build the matrix and to
+    # key the results, and a generator would be exhausted after the first pass.
+    cliquesets = [tuple(cs) for cs in cliqueset_indice_list]
+    if not cliquesets:
+        return {}
+    rows = []
+    cols = []
+    for row, cs in enumerate(cliquesets):
+        rows.extend([row] * len(cs))
+        cols.extend(cs)
+    data = [1] * len(rows)
+    # One row per cliqueset. Sizing by the number of non-zero entries wastes
+    # rows and raises when empty cliquesets push a row index past that count.
+    sparse_input_l_ind = csr_matrix(
+        (data, (rows, cols)), shape=(len(cliquesets), trained_model.d)
+    )
+    # NOTE(review): ``todense`` yields ``np.matrix``; kept so the per-key
+    # values keep their current type. Consider ``toarray`` in a follow-up.
+    predicted_probs = trained_model.predict_proba(
+        sparse_input_l_ind.todense(), is_augmented=True
+    )
+    result_dict: Dict[tuple, np.array] = dict(zip(cliquesets, predicted_probs))
+    return result_dict
+
+
+def train_model_from_known_objective(
+    objective: np.array, known_dimensions: KnownDimensions, **kwargs
+):
+    """Train a ``LabelModel`` directly from a precomputed objective matrix.
+
+    Parameters
+    ----------
+    objective
+        The matrix assigned to ``model.O``.
+    known_dimensions
+        Dimensions used in place of a label matrix; ``num_classes`` becomes
+        the model cardinality.
+    **kwargs
+        Forwarded to the ``LabelModel`` constructor. They are not passed to
+        the training preamble, which runs with its defaults.
+
+    Returns
+    -------
+    LabelModel
+        The trained model.
+    """
+    model = LabelModel(cardinality=known_dimensions.num_classes, **kwargs)
+    model._set_constants(known_dimensions=known_dimensions)
+    # Match LabelModel._generate_O_from_L_aug: O is a float tensor on the
+    # configured device. ``np.asarray`` also turns ``np.matrix`` into ndarray.
+    model.O = (
+        torch.from_numpy(np.asarray(objective)).float().to(model.config.device)
+    )
+    model._common_training_preamble()
+    model._common_training_loop()
+    return model
+
+
+def train_model_from_sparse_event_cooccurence(
+    sparse_event_cooccurence: List[Tuple[int, int, int]],
+    known_dimensions: KnownDimensions,
+):
+    """Build the objective from sparse 3-tuples and train a model on it.
+
+    The tuples are converted to a dense objective by
+    ``_prepare_objective_from_sparse_event_cooccurence``; that objective and
+    ``known_dimensions`` are then handed to
+    ``train_model_from_known_objective``, whose result is returned as is.
+    No extra keyword arguments are forwarded.
+    """
+    objective = _prepare_objective_from_sparse_event_cooccurence(
+        sparse_event_cooccurence, known_dimensions
+    )
+    return train_model_from_known_objective(
+        objective=objective, known_dimensions=known_dimensions
+    )
+
+
+def _prepare_objective_from_sparse_event_cooccurence(
+    sparse_event_cooccurence: List[Tuple[int, int, int]],
+    known_dimensions: KnownDimensions,
+):
+    """Return the dense matrix ``(M.T @ M) / num_examples``.
+
+    ``M`` is the sparse matrix that ``_prepare_sparse_L_ind`` builds from the
+    input tuples; the result is produced with ``todense()``.
+
+    NOTE(review): the tuples are placed as (row, col, value) into a
+    (num_examples, num_events) matrix, so the first element is used as an
+    example index even though the parameter name suggests event pairs --
+    confirm the intended input format.
+    NOTE(review): ``num_examples`` is declared Optional on KnownDimensions;
+    presumably it must be set here since it is used as a divisor -- confirm.
+    """
+    sparse_L_ind = _prepare_sparse_L_ind(known_dimensions, sparse_event_cooccurence)
+    objective = (sparse_L_ind.T @ sparse_L_ind) / known_dimensions.num_examples
+    return objective.todense()
+
+
+def _prepare_sparse_L_ind(known_dimensions, sparse_event_cooccurence):
+    """Assemble a CSR matrix from ``(row, col, value)`` triples.
+
+    The matrix has shape ``(num_examples, num_events)`` taken from
+    ``known_dimensions``; each triple contributes ``value`` at ``[row, col]``.
+    """
+    triples = [(row, col, count) for (row, col, count) in sparse_event_cooccurence]
+    row_idx = np.array([triple[0] for triple in triples])
+    col_idx = np.array([triple[1] for triple in triples])
+    values = [triple[2] for triple in triples]
+    dims = (known_dimensions.num_examples, known_dimensions.num_events)
+    # csr_matrix takes (data, (row_ind, col_ind)): a tuple holding a tuple.
+    return csr_matrix((values, (row_idx, col_idx)), shape=dims)
diff --git a/snorkel/types/data.py b/snorkel/types/data.py
@@ -1,7 +1,18 @@
-from typing import Any, Mapping, Sequence
+from typing import Any, Mapping, Sequence, NamedTuple, Optional
 
 DataPoint = Any
 DataPoints = Sequence[DataPoint]
 
 Field = Any
 FieldMap = Mapping[str, Field]
+class KnownDimensions(NamedTuple):
+    """Problem dimensions supplied directly instead of via a label matrix."""
+
+    # Number of labeling functions.
+    num_functions: int
+    # Number of classes; used as the label model's cardinality.
+    num_classes: int
+    # Number of examples; declared Optional.
+    num_examples: Optional[int]
+
+    @property
+    def num_events(self) -> int:
+        """
+            How many indicator random variables do we have (1 per event),
+            i.e. ``num_functions * num_classes``.
+        """
+        return self.num_functions * self.num_classes