-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
99 lines (74 loc) · 3.29 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# -*- coding: utf-8 -*-
"""Various helper functions"""
import numpy as np
import pandas as pd
from load import load_genes_list
from constants import HORMONES, TUMORS, LABELS_CTRL, LABELS_CTRL_INVERTED
def df_to_tril(df):
    """Zero out the main diagonal and everything above it, in place.

    An entry is cleared when its row label is less than or equal to its
    column label. For a square frame whose index and columns carry the same
    labels in ascending order, only the strictly lower triangle keeps its
    values.

    The mask is applied to the frame as a whole instead of writing into the
    rows yielded by ``iterrows()``: those rows may be copies (mixed dtypes,
    copy-on-write), in which case writing to them leaves the frame untouched.

    :param df: dataframe whose index labels are comparable to its column labels
    :return: the same dataframe object, modified in place
    """
    if df.empty:
        # No cells to clear; also avoids building a zero-sized mask
        return df
    columns = df.columns
    # One boolean row per index label: True where column label >= row label
    mask = np.array([columns >= label for label in df.index], dtype=bool)
    df[mask] = 0
    return df
def gene_pairs_per_treatment():
    """Collect the gene pairs that respond to the same treatment.

    For every hormone in HORMONES, the genes selected by that hormone's
    column of the loaded gene table are paired with each other (never with
    themselves, each pair once). The per-hormone results are concatenated
    and grouped on the pair index, collecting the hormone names into lists.

    :return: Series built from the gene pairs, whose values are the lists of
        hormones recorded for each group
    """
    genes_table = load_genes_list()
    per_hormone = []
    for hormone in HORMONES:
        # Rows picked by the hormone column (presumably a boolean flag per gene -- confirm)
        names = list(genes_table[genes_table[hormone]].genes)
        # Strictly-lower-triangle positions of a len(names) x len(names) matrix:
        # every (row, col) with row > col, i.e. each pair once and no self-pairs
        rows, cols = np.tril_indices(len(names), k=-1)
        pairs = [(names[r], names[c]) for r, c in zip(rows, cols)]
        # Scalar data is broadcast: every pair is tagged with this hormone
        per_hormone.append(pd.Series(data=hormone, index=pairs, name="hormone"))
    combined = pd.concat(per_hormone, sort=False)
    return combined.groupby(level=0).apply(list)
def df_standardize_cols(df):
    """Center every column on its mean and scale it by its standard deviation.

    :param df: dataframe to standardize
    :return: new object holding ``(df - column mean) / column std``
    """
    col_means = df.mean(axis=0)
    col_stds = df.std(axis=0)
    centered = df - col_means
    return centered / col_stds
def df_log_standardize_cols(df):
    """Take the natural log of the values, then standardize each column.

    :param df: dataframe to transform
    :return: result of ``df_standardize_cols`` applied to ``log(df + 0.1)``
    """
    # Shift by a small constant first, since log(0) is undefined
    shifted = df + .1
    logged = np.log(shifted)
    return df_standardize_cols(logged)
def pdx_standardize(X_pdx):
    """Standardize the PDX feature matrix separately for each tumor.

    The rows of every tumor in TUMORS are standardized column-wise on their
    own and the pieces are concatenated again, the aim being to eliminate
    bias introduced by different tumors being injected.

    :param X_pdx: feature matrix whose index level 1 holds the tumor
    :return: concatenated per-tumor frames sorted by "treatment", then "tumor"
        (presumably index level names -- confirm)
    """
    per_tumor = [
        # drop_level=False keeps the tumor level so the pieces can be re-joined
        df_standardize_cols(X_pdx.xs(tumor, level=1, drop_level=False))
        for tumor in TUMORS
    ]
    merged = pd.concat(per_tumor)
    return merged.sort_values(["treatment", "tumor"])
def describe_prediction(predicted, actual, with_ctrl=True):
    """Print, for every predicted cluster, how many samples of each actual label it holds, ex.
    "Cluster 0 contains:
    9 dht samples
    2 e2 samples
    3 p4 samples".

    :param predicted: predicted cluster ids, one per sample (must support
        element-wise ``==`` against a single cluster id)
    :param actual: actual labels (supervised data), aligned with ``predicted``
    :param with_ctrl: boolean, True if actual labels contain control group
    """
    if with_ctrl:
        label_names = LABELS_CTRL_INVERTED
    else:
        # NOTE(review): only the *_CTRL mappings are imported at module level, so
        # the plain mapping is imported here; this assumes the constants module
        # defines LABELS_INVERTED -- confirm.
        from constants import LABELS_INVERTED
        label_names = LABELS_INVERTED

    # Clusters are the distinct values of the prediction, not of the ground truth
    for cluster in np.unique(predicted):
        print("Cluster %d contains:" % cluster)
        contains = actual[predicted == cluster]
        # Distinct actual labels inside this cluster, with their frequencies
        labels, counts = np.unique(contains, return_counts=True)
        for label, count in zip(labels, counts):
            print("%d %s samples" % (count, label_names[label]))
        print("")