Skip to content

Commit

Permalink
issue #8: removed all references to utility module to simplify except…
Browse files Browse the repository at this point in the history
… one method is genuinely used by swing + child windows
  • Loading branch information
Jia Jie Wu committed Feb 8, 2018
1 parent b62159d commit 7e79fbd
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 201 deletions.
26 changes: 24 additions & 2 deletions Swing/DionesusWindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import pandas as pd
from sklearn.decomposition import PCA
from .Window import Window
from .util import utility_module as utility
from .util.pls_nipals import vipp


Expand Down Expand Up @@ -210,7 +209,7 @@ def initialize_params(self):

explained_variances_mean = np.mean(explained_variances, axis = 0)
test_pcs = [x for x in range(1, len(explained_variances_mean)+1)]
elbow_x, elbow_y = utility.elbow_criteria(test_pcs, explained_variances_mean)
elbow_x, elbow_y = self._elbow_criteria(test_pcs, explained_variances_mean)
self.num_pcs = elbow_x

def fit_window(self, pcs=3, crag=False, calc_mse=False):
Expand Down Expand Up @@ -270,6 +269,29 @@ def _fitstack_coeffs(self, n_pcs, coeff_matrix, vip_matrix, model_list, x_matrix
self.training_scores.append(training_scores)
self.test_scores.append(test_scores)
return coeff_matrix, vip_matrix, model_list

def _elbow_criteria(self, x, y):
    """
    Find the elbow of a curve.

    The elbow is the data point with the greatest perpendicular distance
    from the straight line (chord) joining the first and last points.

    :param x: array-like
        x coordinates of the curve
    :param y: array-like
        y coordinates of the curve, same length as x
    :return: tuple
        (elbow_x, elbow_y), taken from the input arrays. When several points
        are equally far from the chord, the first one is returned.
    :raises IndexError: if x or y is empty
    """
    x = np.array(x)
    y = np.array(y)

    # Direction vector of the chord joining the two endpoints
    dx = float(x[-1] - x[0])
    dy = float(y[-1] - y[0])
    chord_length = np.hypot(dx, dy)

    x_offset = x - x[0]
    y_offset = y - y[0]
    if chord_length == 0:
        # Endpoints coincide, so there is no chord; measure distance to that point
        distances = np.hypot(x_offset, y_offset)
    else:
        # Point-to-line distance in cross-product form. No slope is computed,
        # so horizontal and vertical chords do not divide by zero.
        distances = np.abs(dy * x_offset - dx * y_offset) / chord_length

    index_max = int(np.argmax(distances))
    return x[index_max], y[index_max]

def get_coeffs(self, num_pcs=2, x_data=None, y_data=None, crag=False, calc_mse=False):
"""
Expand Down
15 changes: 13 additions & 2 deletions Swing/LassoWindow.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from scipy import integrate
from scipy import stats
from sklearn.metrics import mean_squared_error
from .util.utility_module import sum_of_squares
from .Window import Window


Expand Down Expand Up @@ -353,7 +352,7 @@ def cross_validate_alpha(self, alpha, n_folds, condensed=False):

# Calculate PRESS and SS
current_press = np.sum(np.power(y_predicted - y_test, 2), axis=0)
current_ss = sum_of_squares(y_test)
current_ss = self.sum_of_squares(y_test)

press += current_press
ss += current_ss
Expand Down Expand Up @@ -507,4 +506,16 @@ def make_edge_table(self, calc_mse=False):

return df

def _sum_of_squares(self, x, axis=0):
    """
    Calculate the sum of squared deviations from the mean along an axis

    :param x: array-like
        The data matrix for which the sum of squares is taken
    :param axis: int or None
        Axis along which the mean and the sum are taken. 0 (default) is
        columnwise, None gives a single total over all values.
    :return: float or array-like
        The sum of squares, along the requested axis or total
    """
    data = np.asarray(x)
    # keepdims keeps the reduced axis so the mean broadcasts against the
    # data for any axis, not only axis=0
    centered = data - np.mean(data, axis=axis, keepdims=True)
    return np.sum(np.power(centered, 2), axis=axis)


# Public alias: cross_validate_alpha calls ``self.sum_of_squares``
sum_of_squares = _sum_of_squares


79 changes: 33 additions & 46 deletions Swing/Swing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
from .RFRWindow import RandomForestRegressionWindow
from .DionesusWindow import DionesusWindow
from .LassoWindow import LassoWindow
from .util import utility_module as utility
from .util.Evaluator import Evaluator
from .util import utility_module as util


class Swing(object):
"""
An object that grabs different timepoints of data, can set window and step size.
An object that grabs different timepoints of data, can set window and step size, and makes an edge-list.
"""

Expand Down Expand Up @@ -519,36 +519,6 @@ def rank_edges(self, n_bootstraps=1000, permutation_n=1000):
window.make_edge_table(calc_mse=self.calc_mse)
return self.window_list

def average_rank(self, rank_by, ascending):
    """
    Average window edge ranks

    Called by:
        pipeline

    :param rank_by: string
        The parameter to rank edges by
    :param ascending: Bool
        Passed to each window's rank_results (Lasso windows only)
    :return:
        The aggregated rank table sorted by "mean-rank" in ascending order.
        Also stored on self.averaged_ranks.
    :raises ValueError: if self.window_type is neither "Lasso" nor "RandomForest"
    """
    if self.window_type == "Lasso":
        ranked_result_list = [window.rank_results(rank_by, ascending) for window in self.window_list]
    elif self.window_type == "RandomForest":
        ranked_result_list = [window.sort_edges(rank_by) for window in self.window_list]
    else:
        # Previously this fell through and failed later on an unbound local
        raise ValueError("average_rank does not support window type %r" % (self.window_type,))

    aggr_ranks = utility.average_rank(ranked_result_list, rank_by + "-rank")
    # sort tables by mean rank in ascending order
    # (DataFrame.sort was removed from pandas; sort_values is its replacement)
    self.averaged_ranks = aggr_ranks.sort_values(by="mean-rank", axis=0)
    return self.averaged_ranks

def zscore_all_data(self):
"""
Zscore the data in a data-frame
Expand Down Expand Up @@ -717,9 +687,9 @@ def make_static_edge_dict(self, true_edges, self_edges=False, lag_method='max_me
# Calculate the full set of potential edges with TF list if it is provided.

if self.tf_list is not None:
full_edge_set = set(utility.make_possible_edge_list(np.array(self.tf_list), self.gene_list, self_edges=self_edges))
full_edge_set = set(util.make_possible_edge_list(np.array(self.tf_list), self.gene_list, self_edges=self_edges))
else:
full_edge_set = set(utility.make_possible_edge_list(self.gene_list, self.gene_list, self_edges=self_edges))
full_edge_set = set(util.make_possible_edge_list(self.gene_list, self.gene_list, self_edges=self_edges))

# Identify edges that could exist, but do not appear in the inferred list
edge_diff = full_edge_set.difference(edge_set)
Expand Down Expand Up @@ -755,6 +725,34 @@ def make_static_edge_dict(self, true_edges, self_edges=False, lag_method='max_me
warnings.warn(message)
return

def _make_possible_edge_list(self, parents, children, self_edges=True):
    """
    Create a list of all the possible edges between parents and children

    :param parents: array-like
        labels for parents
    :param children: array-like
        labels for children
    :param self_edges: bool
        if False, edges whose parent equals the child are left out
    :return: list
        (parent, child) tuples for all possible edges, ordered child by
        child with every parent listed for each child. Length is
        parents * children when self_edges is True.
    """
    # Accept plain lists as well as numpy arrays
    parents = np.asarray(parents)
    children = np.asarray(children)

    # Children in the outer loop and parents in the inner loop gives the same
    # ordering as flattening np.meshgrid(parent_index, child_index)
    pairs = ((parent, child) for child in children for parent in parents)
    if self_edges:
        return list(pairs)
    return [edge for edge in pairs if edge[0] != edge[1]]


def make_sort_df(self, df, sort_by='mean'):
"""
Calculate the mean for each edge
Expand All @@ -778,24 +776,13 @@ def make_sort_df(self, df, sort_by='mean'):
print("[DONE]")
return sort_df

def calc_edge_importance_cutoff(self, df):
    """
    Calculate the importance threshold to filter edges on

    Runs the elbow criterion over the Importance column, using the row
    position of each edge as the x coordinate.

    :param df: data-frame with an Importance column
    :return: dict
        'num_edges' is the x value at the elbow and 'importance_threshold'
        is the importance value at the elbow
    """
    edge_positions = range(0, len(df.Importance))
    importances = df.Importance.values.astype(np.float64)
    num_edges, threshold = utility.elbow_criteria(edge_positions, importances)
    return {'num_edges': num_edges, 'importance_threshold': threshold}

def get_samples(self):
    """
    Read the tab-separated file at self.file_path and list its column names

    :return: list
        every column name except the first one
    """
    header = pd.read_csv(self.file_path, sep='\t').columns.tolist()
    # Discard the first column name
    del header[0]
    return header

def get_explanatory_indices(index):
def get_explanatory_indices(self,index):
# In append mode, the start index can always be 0
if self.max_lag is None:
start_idx = 0
Expand Down
16 changes: 2 additions & 14 deletions Swing/Window.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .util import utility_module as utility
import numpy as np
import pandas as pd
from scipy import stats
from .util import utility_module as util


class Window(object):
Expand Down Expand Up @@ -46,8 +46,8 @@ def __init__(self, raw_dataframe, window_info, roller_data, td_window, explanato
self.genes = self.df.columns.values
self.n_genes = len(self.genes)
self.results_table = pd.DataFrame()
self.edge_list = utility.make_possible_edge_list(self.explanatory_labels, self.response_labels)

self.edge_list = util.make_possible_edge_list(self.explanatory_labels, self.response_labels)
# Add edge list to edge table
self.results_table['regulator-target'] = self.edge_list
self.roller_data = roller_data
Expand Down Expand Up @@ -320,19 +320,7 @@ def get_average(self):
averages = self.response_data.mean(axis=0)
return averages

def crag_window(self, model_params):
    """
    Score a fitted model on its training data and on held-out test data

    The test set is built by utility.get_test_set from self.data and
    self.roller_data.

    :param model_params: dict
        'model' is the fitted model, 'response' and 'predictor' are the
        training data, and 'col_index' is the position of the response
        column in the test data
    :return: tuple
        (training_scores, test_scores) as returned by utility.get_cragging_scores
    """
    model = model_params['model']
    response_train = model_params['response']
    predictor_train = model_params['predictor']
    response_col = model_params['col_index']
    training_scores = utility.get_cragging_scores(model, predictor_train, response_train)
    test_data = utility.get_test_set(self.data, self.roller_data)

    # .ix was removed from pandas. iloc is used because the next line already
    # treats response_col as a position (test_data.columns[response_col]).
    response_test = test_data.iloc[:, response_col].values
    # axis must be passed by keyword in current pandas
    predictor_test = test_data.drop(test_data.columns[response_col], axis=1).values

    test_scores = utility.get_cragging_scores(model, predictor_test, response_test)
    return training_scores, test_scores

###################################################################################################################
# Abstract methods listed below
Expand Down
138 changes: 1 addition & 137 deletions Swing/util/utility_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,128 +3,6 @@
import sklearn.metrics as skmet
import numpy as np

def get_explanatory_indices(index, min_lag, max_lag):
    """
    Get the indices of the explanatory windows for the window at ``index``

    :param index: int
        position of the response window
    :param min_lag: int
        smallest lag to include
    :param max_lag: int or None
        largest lag to include. None means no upper limit, so the range
        always starts at 0.
    :return: range or None
        the explanatory indices, or None when there are none or when the
        window must be left censored
    """
    # In append mode, the start index can always be 0
    if max_lag is None:
        start_idx = 0
    else:
        start_idx = max(index - max_lag, 0)
    end_index = max(index - min_lag + 1, 0)

    explanatory_indices = range(start_idx, end_index)

    # If the maximum lag required is greater than the index, this window must be left censored.
    # The None check avoids comparing None with an int, which raises TypeError on Python 3.
    left_censored = max_lag is not None and max_lag > index
    if len(explanatory_indices) == 0 or left_censored:
        return None

    return explanatory_indices

def get_test_set(window_raw_data, roller_raw_data):
    """
    Select the rows of the full data set whose time points are not in the window

    :param window_raw_data: data-frame with a 'Time' column
        the data belonging to the window
    :param roller_raw_data: data-frame with a 'Time' column
        the full data set
    :return: data-frame
        rows of roller_raw_data whose 'Time' value does not occur in
        window_raw_data, with the 'Time' column removed
    """
    roller_vec = roller_raw_data['Time'].unique()
    window_vec = window_raw_data['Time'].unique()
    test_set_vec = np.setdiff1d(roller_vec, window_vec)
    in_test_set = roller_raw_data['Time'].isin(test_set_vec)
    # axis must be passed by keyword; positional axis was removed from DataFrame.drop
    return roller_raw_data.loc[in_test_set].drop('Time', axis=1)


def get_cragging_scores(model, predictor, response_true):
    """
    Score a model's predictions against the true response

    :param model: object with a predict method
    :param predictor: data passed to model.predict
    :param response_true: the true response values
    :return: dict
        'ev' (explained variance), 'mae' (mean absolute error),
        'mse' (mean squared error) and 'r2' scores
    """
    response_pred = model.predict(predictor)
    metrics = (
        ('ev', skmet.explained_variance_score),
        ('mae', skmet.mean_absolute_error),
        ('mse', skmet.mean_squared_error),
        # 'medae' (skmet.median_absolute_error) was commented out in the original
        ('r2', skmet.r2_score),
    )
    return {label: metric(response_true, response_pred) for label, metric in metrics}


def create_3D_linked_list(labels, numpy_array_3D, value_label):
    """
    Build one linked list per window from a 3D array.

    Axes of the result: interaction (x-axis) - value (y axis) - time (Z axis).

    :param labels: labels for the interactions, passed to create_linked_list
    :param numpy_array_3D: 3D array whose last axis indexes the windows
    :param value_label: column name for the values
    :return: a pd.Panel keyed by window index where pandas still provides
        Panel; otherwise the per-window frames concatenated into one
        data-frame with the window index as the outer index level
    """
    windows_n = numpy_array_3D.shape[2]
    linked_list_3D = {}

    # range instead of xrange so this also runs on Python 3
    for i in range(windows_n):
        target_2D_array = numpy_array_3D[:, :, i]
        linked_list_3D[i] = create_linked_list(labels, target_2D_array, value_label)

    if hasattr(pd, "Panel"):
        return pd.Panel(linked_list_3D)
    # NOTE(review): Panel has been removed from pandas; concatenating the
    # dict yields a MultiIndex data-frame. Confirm callers accept this shape.
    return pd.concat(linked_list_3D)


def create_linked_list(labels, numpy_array_2D, value_label):
    """
    Pair each label with one value of the flattened array.

    Labels and array should be in row-major order.

    :param labels: labels for the 'regulator-target' column
    :param numpy_array_2D: array whose flattened values fill the value column
    :param value_label: name of the value column
    :return: data-frame with a 'regulator-target' column and a value_label column
    """
    columns = {'regulator-target': labels}
    columns[value_label] = numpy_array_2D.flatten()
    return pd.DataFrame(columns)


def average_rank(ranked_result_list, col_string):
    """
    Find the average rank and its spread for each edge across windows.

    :param ranked_result_list: list of data-frames, one per window, each
        holding a col_string column and a 'regulator-target' column
    :param col_string: name of the rank column to aggregate
    :return: data-frame with one col_string_<window> column per window plus
        'range', 'mean-rank', 'median-rank', 'sd-rank' and 'regulator-target'
    """
    key = 'regulator-target'
    per_window = [window_df[[col_string, key]] for window_df in ranked_result_list]

    # Merge window by window; merging on the key keeps edges matched up
    # regardless of row order.
    merged = per_window[0]
    merged.columns = [col_string + "_0", key]
    for position in range(1, len(per_window)):
        current = per_window[position]
        current.columns = [col_string + "_" + str(position), key]
        merged = merged.merge(current, on=key)

    ranks_only = merged.drop([key], axis=1)
    # Compute every statistic before attaching any of them, so the new
    # columns are not included in later calculations.
    summary = [
        ('range', list(zip(ranks_only.min(axis=1), ranks_only.max(axis=1)))),
        ('mean-rank', ranks_only.mean(axis=1)),
        ('median-rank', ranks_only.median(axis=1)),
        ('sd-rank', ranks_only.std(axis=1, ddof=1)),
    ]
    for column_name, column_values in summary:
        ranks_only[column_name] = column_values
    ranks_only[key] = merged[key]
    return ranks_only


def rank_results_3D(result_list, col_string, ascending=True):
    """
    Add a dense rank column to every window's result table.

    :param result_list: list of result data-frames, one per window. Each
        frame is modified in place.
    :param col_string: name of the column to rank by
    :param ascending: passed to the rank call; True ranks the smallest value first
    :return: the same list, each frame now holding a col_string + "-rank" column
    """
    rank_label = col_string + "-rank"
    for window_df in result_list:
        ranks = window_df[col_string].rank(method="dense", ascending=ascending)
        window_df[rank_label] = ranks
    return result_list


def rank_index(vector):
    """
    Return the indices that sort ``vector`` in ascending order.

    The previous body called range() on the sequence itself, which raises
    TypeError for any sequence, so it could never return a value.

    NOTE(review): argsort is assumed to be the intent, since both idioms in
    the old one-liner are argsort forms - confirm.

    :param vector: indexable sequence of comparable values
    :return: list of int
        positions into vector, ordered from smallest to largest value.
        Equal values keep their original relative order.
    """
    return sorted(range(len(vector)), key=vector.__getitem__)


def point_slope(x1, y1, x2, y2):
    """
    Slope of the line through (x1, y1) and (x2, y2).

    :return: rise over run, with the run converted to float first
    """
    rise = y2 - y1
    run = float(x2 - x1)
    return rise / run


def elbow_criteria(x, y):
    """
    Find the elbow of a curve.

    The elbow is the data point with the greatest perpendicular distance
    from the straight line (chord) joining the first and last points.

    :param x: array-like
        x coordinates of the curve
    :param y: array-like
        y coordinates of the curve, same length as x
    :return: tuple
        (elbow_x, elbow_y), taken from the input arrays. When several points
        are equally far from the chord, the first one is returned.
    :raises IndexError: if x or y is empty
    """
    x = np.array(x)
    y = np.array(y)

    # Direction vector of the chord joining the two endpoints
    dx = float(x[-1] - x[0])
    dy = float(y[-1] - y[0])
    chord_length = np.hypot(dx, dy)

    x_offset = x - x[0]
    y_offset = y - y[0]
    if chord_length == 0:
        # Endpoints coincide, so there is no chord; measure distance to that point
        distances = np.hypot(x_offset, y_offset)
    else:
        # Point-to-line distance in cross-product form. The slope-based
        # version divided by zero when the chord was horizontal.
        distances = np.abs(dy * x_offset - dx * y_offset) / chord_length

    index_max = int(np.argmax(distances))
    return x[index_max], y[index_max]


def make_possible_edge_list(parents, children, self_edges=True):
"""
Create a list of all the possible edges between parents and children
Expand All @@ -150,18 +28,4 @@ def make_possible_edge_list(parents, children, self_edges=True):
elif not self_edges:
possible_edge_list = [x for x in zip(parent_list, child_list) if x[0] != x[1]]

return possible_edge_list


def sum_of_squares(x, axis=0):
    """
    Calculate the sum of squared deviations from the mean along an axis

    :param x: array-like
        The data matrix for which the sum of squares is taken
    :param axis: int or None
        Axis along which the mean and the sum are taken. 0 (default) is
        columnwise, None gives a single total over all values.
    :return: float or array-like
        The sum of squares, along the requested axis or total
    """
    data = np.asarray(x)
    # keepdims keeps the reduced axis so the mean broadcasts against the
    # data for any axis, not only axis=0
    centered = data - np.mean(data, axis=axis, keepdims=True)
    return np.sum(np.power(centered, 2), axis=axis)

return possible_edge_list

0 comments on commit 7e79fbd

Please sign in to comment.