issue #8: removed all references to the utility module to simplify, except one method that is genuinely used by Swing and its child windows
bagherilab · Feb 8, 2018 · 7e79fbd · 7e79fbd
1 parent b62159d
commit 7e79fbd
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 201 deletions.
diff --git a/Swing/DionesusWindow.py b/Swing/DionesusWindow.py
@@ -5,7 +5,6 @@
 import pandas as pd
 from sklearn.decomposition import PCA
 from .Window import Window
-from .util import utility_module as utility
 from .util.pls_nipals import vipp
 
 
@@ -210,7 +209,7 @@ def initialize_params(self):
 
         explained_variances_mean = np.mean(explained_variances, axis = 0)
         test_pcs = [x for x in range(1, len(explained_variances_mean)+1)]
-        elbow_x, elbow_y = utility.elbow_criteria(test_pcs, explained_variances_mean)
+        elbow_x, elbow_y = self._elbow_criteria(test_pcs, explained_variances_mean)
         self.num_pcs = elbow_x
 
     def fit_window(self, pcs=3, crag=False, calc_mse=False):
@@ -270,6 +269,29 @@ def _fitstack_coeffs(self, n_pcs, coeff_matrix, vip_matrix, model_list, x_matrix
             self.training_scores.append(training_scores)
             self.test_scores.append(test_scores)
         return coeff_matrix, vip_matrix, model_list
+
+    def _elbow_criteria(self, x, y):
+        """
+        Locate the elbow of a curve: the data point farthest from the chord joining its endpoints
+
+        :param x: array-like
+            x-coordinates of the curve
+        :param y: array-like
+            y-coordinates of the curve, same length as x
+        :return: tuple
+            (elbow_x, elbow_y), the first data point with the largest perpendicular distance to the line
+            through the first and last points
+        """
+        x = np.array(x)
+        y = np.array(y)
+
+        # Slope and intercept of the chord between the endpoints. The slope is computed inline because the
+        # point_slope helper is neither defined nor imported in this module.
+        m1 = (y[-1] - y[0]) / float(x[-1] - x[0])
+        b1 = y[0] - m1 * x[0]
+
+        # Perpendicular distance from every point to the line y = m1*x + b1. The closed form needs no
+        # division by m1, so a flat chord (m1 == 0) no longer breaks the calculation.
+        distances = np.abs(m1 * x - y + b1) / np.sqrt(m1 ** 2 + 1)
+
+        # np.argmax returns the first index at which the maximum occurs
+        index_max = np.argmax(distances)
+        elbow_x = x[index_max]
+        elbow_y = y[index_max]
+        return elbow_x, elbow_y
 
     def get_coeffs(self, num_pcs=2, x_data=None, y_data=None, crag=False, calc_mse=False):
         """

diff --git a/Swing/LassoWindow.py b/Swing/LassoWindow.py
@@ -6,7 +6,6 @@
 from scipy import integrate
 from scipy import stats
 from sklearn.metrics import mean_squared_error
-from .util.utility_module import sum_of_squares
 from .Window import Window
 
 
@@ -353,7 +352,7 @@ def cross_validate_alpha(self, alpha, n_folds, condensed=False):
 
             # Calculate PRESS and SS
             current_press = np.sum(np.power(y_predicted - y_test, 2), axis=0)
-            current_ss = sum_of_squares(y_test)
+            current_ss = self.sum_of_squares(y_test)
 
             press += current_press
             ss += current_ss
@@ -507,4 +506,16 @@ def make_edge_table(self, calc_mse=False):
 
         return df
 
+    def _sum_of_squares(self, x, axis=0):
+        """
+        Calculate the sum of squared deviations from the mean along an axis
+        :param x: array-like
+            The data matrix for which the sum of squares is taken
+        :param axis: int or None
+            Axis along which the mean and the sum are taken; the default 0 works columnwise
+        :return: float or array-like
+            The sum of squares, columnwise or total
+        """
+        column_mean = np.mean(x, axis=axis)
+        sse = np.sum(np.power(x - column_mean, 2), axis=axis)
+        return sse
+
+    # cross_validate_alpha calls self.sum_of_squares(...), so expose the helper under that name as well
+    sum_of_squares = _sum_of_squares
+
 
diff --git a/Swing/Swing.py b/Swing/Swing.py
@@ -8,13 +8,13 @@
 from .RFRWindow import RandomForestRegressionWindow
 from .DionesusWindow import DionesusWindow
 from .LassoWindow import LassoWindow
-from .util import utility_module as utility
 from .util.Evaluator import Evaluator
+from .util import utility_module as util
 
 
 class Swing(object):
     """
-    An object that grabs different timepoints of data, can set window and step size.
+    An object that grabs different timepoints of data, can set window and step size, and makes an edge-list.
 
     """
 
@@ -519,36 +519,6 @@ def rank_edges(self, n_bootstraps=1000, permutation_n=1000):
                 window.make_edge_table(calc_mse=self.calc_mse)
         return self.window_list
 
-    def average_rank(self, rank_by, ascending):
-        """
-        Average window edge ranks
-
-        Called by:
-            pipeline
-
-
-        :param rank_by: string
-            The parameter to rank edges by
-        :param ascending: Bool
-        :return:
-        """
-        if self.window_type == "Lasso":
-            ranked_result_list = []
-            for window in self.window_list:
-                ranked_result = window.rank_results(rank_by, ascending)
-                ranked_result_list.append(ranked_result)
-        if self.window_type == "RandomForest":
-            ranked_result_list = []
-            for window in self.window_list:
-                ranked_result = window.sort_edges(rank_by)
-                ranked_result_list.append(ranked_result)
-
-        aggr_ranks = utility.average_rank(ranked_result_list, rank_by + "-rank")
-        # sort tables by mean rank in ascending order
-        mean_sorted_edge_list = aggr_ranks.sort(columns="mean-rank", axis=0)
-        self.averaged_ranks = mean_sorted_edge_list
-        return self.averaged_ranks
-
     def zscore_all_data(self):
         """
         Zscore the data in a data-frame
@@ -717,9 +687,9 @@ def make_static_edge_dict(self, true_edges, self_edges=False, lag_method='max_me
         # Calculate the full set of potential edges with TF list if it is provided.
 
         if self.tf_list is not None:
-            full_edge_set = set(utility.make_possible_edge_list(np.array(self.tf_list), self.gene_list, self_edges=self_edges))
+            full_edge_set = set(util.make_possible_edge_list(np.array(self.tf_list), self.gene_list, self_edges=self_edges))
         else:
-            full_edge_set = set(utility.make_possible_edge_list(self.gene_list, self.gene_list, self_edges=self_edges))
+            full_edge_set = set(util.make_possible_edge_list(self.gene_list, self.gene_list, self_edges=self_edges))
 
         # Identify edges that could exist, but do not appear in the inferred list
         edge_diff = full_edge_set.difference(edge_set)
@@ -755,6 +725,34 @@ def make_static_edge_dict(self, true_edges, self_edges=False, lag_method='max_me
             warnings.warn(message)
         return
 
+    def _make_possible_edge_list(self, parents, children, self_edges=True):
+        """
+        Create a list of all the possible edges between parents and children
+
+        :param parents: array-like
+            labels for parents
+        :param children: array-like
+            labels for children
+        :param self_edges: bool
+            if False, edges whose parent and child labels are equal are left out
+        :return: list, length = parents * children when self_edges is True
+            (parent, child) tuples for all possible edges, cycling through every parent for each child in turn
+        """
+        # Coerce to arrays so plain lists are accepted as well as numpy arrays
+        parents = np.asarray(parents)
+        children = np.asarray(children)
+
+        # Same ordering as flattening np.meshgrid(parents, children): parents vary fastest
+        parent_list = list(np.tile(parents, len(children)))
+        child_list = list(np.repeat(children, len(parents)))
+
+        possible_edge_list = list(zip(parent_list, child_list))
+        if not self_edges:
+            possible_edge_list = [edge for edge in possible_edge_list if edge[0] != edge[1]]
+
+        return possible_edge_list
+
+
     def make_sort_df(self, df, sort_by='mean'):
         """
         Calculate the mean for each edge
@@ -778,24 +776,13 @@ def make_sort_df(self, df, sort_by='mean'):
         print("[DONE]")
         return sort_df
 
-    def calc_edge_importance_cutoff(self, df):
-        """
-        Calculate the importance threshold to filter edges on
-        :param df:
-        :return: dict
-        """
-        x, y = utility.elbow_criteria(range(0, len(df.Importance)), df.Importance.values.astype(np.float64))
-        elbow_dict = {'num_edges':x, 'importance_threshold':y}
-
-        return elbow_dict
-
     def get_samples(self):
         df=pd.read_csv(self.file_path,sep='\t')
         node_list = df.columns.tolist()
         node_list.pop(0)
         return node_list
 
-    def get_explanatory_indices(index):
+    def get_explanatory_indices(self,index):
         # In append mode, the start index can always be 0
         if self.max_lag is None:
             start_idx = 0

diff --git a/Swing/Window.py b/Swing/Window.py
@@ -1,7 +1,7 @@
-from .util import utility_module as utility
 import numpy as np
 import pandas as pd
 from scipy import stats
+from .util import utility_module as util
 
 
 class Window(object):
@@ -46,8 +46,8 @@ def __init__(self, raw_dataframe, window_info, roller_data, td_window, explanato
         self.genes = self.df.columns.values
         self.n_genes = len(self.genes)
         self.results_table = pd.DataFrame()
-        self.edge_list = utility.make_possible_edge_list(self.explanatory_labels, self.response_labels)
 
+        self.edge_list = util.make_possible_edge_list(self.explanatory_labels, self.response_labels)
         # Add edge list to edge table
         self.results_table['regulator-target'] = self.edge_list
         self.roller_data = roller_data
@@ -320,19 +320,7 @@ def get_average(self):
         averages = self.response_data.mean(axis=0)
         return averages
 
-    def crag_window(self, model_params):
-        model = model_params['model']
-        response_train = model_params['response']
-        predictor_train = model_params['predictor']
-        response_col = model_params['col_index']
-        training_scores = utility.get_cragging_scores(model, predictor_train, response_train)
-        test_data = utility.get_test_set(self.data, self.roller_data)
 
-        response_test = test_data.ix[:, response_col].values
-        predictor_test = test_data.drop(test_data.columns[response_col],1).values
-
-        test_scores = utility.get_cragging_scores(model,predictor_test, response_test)
-        return((training_scores, test_scores))
 
     ###################################################################################################################
     # Abstract methods listed below

diff --git a/Swing/util/utility_module.py b/Swing/util/utility_module.py
@@ -3,128 +3,6 @@
 import sklearn.metrics as skmet
 import numpy as np
 
-def get_explanatory_indices(index, min_lag, max_lag):
-        # In append mode, the start index can always be 0
-        if max_lag is None:
-            start_idx = 0
-        else:
-            start_idx = max(index-max_lag, 0)
-        end_index = max(index-min_lag+1, 0)
-
-        explanatory_indices = range(start_idx, end_index)
-
-        # If the maximum lag required is greater than the index, this window must be left censored
-        if len(explanatory_indices) == 0 or max_lag > index:
-            explanatory_indices = None
-
-        return explanatory_indices
-
-def get_test_set(window_raw_data, roller_raw_data):
-    roller_vec = roller_raw_data['Time'].unique()
-    window_vec = window_raw_data['Time'].unique()
-    test_set_vec = np.setdiff1d(roller_vec, window_vec)
-    test_data = roller_raw_data.loc[roller_raw_data['Time'].isin(test_set_vec)].drop('Time', 1)
-    return(test_data)
-
-
-def get_cragging_scores(model, predictor, response_true):
-    response_pred = model.predict(predictor)
-    scores = {}
-    scores['ev'] = skmet.explained_variance_score(response_true, response_pred)
-    scores['mae'] = skmet.mean_absolute_error(response_true, response_pred)
-    scores['mse'] = skmet.mean_squared_error(response_true, response_pred)
-    #scores['medae'] = skmet.median_absolute_error(response_true, response_pred)
-    scores['r2'] = skmet.r2_score(response_true, response_pred)
-    return(scores)
-
-
-def create_3D_linked_list(labels, numpy_array_3D, value_label):
-    """returns a panel with interaction (x-axis) - value (y axis) - time (Z axis)"""
-    windows_n = numpy_array_3D.shape[2]
-    linked_list_3D ={}
-
-    for i in xrange(windows_n):
-        target_2D_array = numpy_array_3D[:,:,i]
-        linked_list = create_linked_list(labels, target_2D_array, value_label)
-        linked_list_3D[i] = linked_list
-    return pd.Panel(linked_list_3D)
-
-
-def create_linked_list(labels, numpy_array_2D, value_label):
-    """labels and array should be in row-major order"""
-    linked_list = pd.DataFrame({'regulator-target':labels, value_label:numpy_array_2D.flatten()})
-    return linked_list
-
-
-def average_rank(ranked_result_list, col_string):
-    """finds the average rank and standard deviation throughout time"""
-    aggregate_ranks = []
-    for nth_window in ranked_result_list:
-        aggregate_ranks.append(nth_window[[col_string, 'regulator-target']])
-    #now merge the panels in an interesting loop. The merge function insures the keys are always matched up correctly.
-    left_df = aggregate_ranks[0] #initialize the left_df.
-    left_df.columns= [col_string+"_0", 'regulator-target']
-    for window_index in range(1,len(aggregate_ranks)):
-        right_df = aggregate_ranks[window_index]
-        right_df.columns= [col_string+"_"+str(window_index), 'regulator-target']
-        left_df = left_df.merge(right_df,on = 'regulator-target')
-
-    aggr_ranks = left_df.drop(['regulator-target'], axis = 1)
-    #assign to temporary variables to prevent the calc columns to be involved in other calculations
-    range_col = list(zip(aggr_ranks.min(axis = 1), aggr_ranks.max(axis = 1)))
-    mean_col = aggr_ranks.mean(axis = 1)
-    median_col = aggr_ranks.median(axis = 1)
-    sd_col = aggr_ranks.std(axis = 1, ddof=1)
-
-    aggr_ranks['range'] = range_col
-    aggr_ranks['mean-rank'] = mean_col
-    aggr_ranks['median-rank'] = median_col
-    aggr_ranks['sd-rank'] = sd_col
-    aggr_ranks['regulator-target'] = left_df['regulator-target']
-    return(aggr_ranks)
-
-
-def rank_results_3D(result_list, col_string, ascending=True):
-    """input: list of result pandas dfs, column name. output: each time window is sorted by column name, most significant to least"""
-    rank_column_name = col_string + "-rank"
-    for nth_window in result_list:
-        nth_window[rank_column_name] = nth_window[col_string].rank(method="dense", ascending = ascending)
-    return result_list
-
-
-def rank_index(vector):
-        return [vector.index(x) for x in sorted(range(vector), key=vector.__getitem__)]
-
-
-def point_slope(x1,y1, x2,y2):
-    slope = (y2-y1)/float(x2-x1)
-    return slope
-
-
-def elbow_criteria(x,y):
-    x = np.array(x)
-    y = np.array(y)
-    # Slope between elbow endpoints
-    m1 = point_slope(x[0], y[0], x[-1], y[-1])
-    # Intercept
-    b1 = y[0] - m1*x[0]
-
-    # Slope for perpendicular lines
-    m2 = -1/m1
-
-    # Calculate intercepts for perpendicular lines that go through data point
-    b_array = y-m2*x
-    x_perp = (b_array-b1)/(m1-m2)
-    y_perp = m1*x_perp+b1
-
-    # Calculate where the maximum distance to a line connecting endpoints is
-    distances = np.sqrt((x_perp-x)**2+(y_perp-y)**2)
-    index_max = np.where(distances==np.max(distances))[0][0]
-    elbow_x = x[index_max]
-    elbow_y = y[index_max]
-    return elbow_x, elbow_y
-
-
 def make_possible_edge_list(parents, children, self_edges=True):
     """
     Create a list of all the possible edges between parents and children
@@ -150,18 +28,4 @@ def make_possible_edge_list(parents, children, self_edges=True):
     elif not self_edges:
         possible_edge_list = [x for x in zip(parent_list, child_list) if x[0] != x[1]]
 
-    return possible_edge_list
-
-
-def sum_of_squares(x, axis=0):
-    """
-    Calculate the sum of the squares for each column
-    :param x: array-like
-        The data matrix for which the sum of squares is taken
-    :return: float or array-like
-        The sum of squares, columnwise or total
-    """
-    column_mean = np.mean(x, axis=axis)
-    sse = np.sum(np.power(x - column_mean, 2), axis=axis)
-    return sse
-
+    return possible_edge_list