From 154df380a9d534a28f58fa7e95a6a50f6632fc71 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 14:42:11 -0400
Subject: [PATCH 1/5] remove old test script

---
 test/phate_test_tree.py | 80 -----------------------------------------
 1 file changed, 80 deletions(-)
 delete mode 100644 test/phate_test_tree.py

diff --git a/test/phate_test_tree.py b/test/phate_test_tree.py
deleted file mode 100644
index 6037b91..0000000
--- a/test/phate_test_tree.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-
-# author: Daniel Burkhardt
-# (C) 2017 Krishnaswamy Lab GPLv2
-
-from __future__ import print_function
-import phate
-import matplotlib.pyplot as plt
-import sklearn.manifold
-import sys
-
-
-def main(argv=None):
-    # generate DLA tree
-    print("Running PHATE test on DLA tree...\n")
-    M, C = phate.tree.gen_dla(n_dim=100, n_branch=20, branch_length=100,
-                              n_drop=0, rand_multiplier=2, seed=37, sigma=4)
-
-    # instantiate phate_operator
-    phate_operator = phate.PHATE(n_components=2, a=10, k=5, t=30, mds='classic',
-                                 knn_dist='euclidean', mds_dist='euclidean', njobs=-2)
-
-    # run phate with classic MDS
-    Y_cmds = phate_operator.fit_transform(M)
-
-    # run phate with metric MDS
-    # change the MDS embedding without recalculating diffusion potential
-    phate_operator.reset_mds(mds="metric")
-    Y_mmds = phate_operator.fit_transform(M)
-
-    # run phate with nonmetric MDS
-    phate_operator.reset_mds(mds="nonmetric")
-    Y_nmmds = phate_operator.fit_transform(M)
-
-    pca = phate.preprocessing.pca_reduce(M, n_components=2)
-    tsne = sklearn.manifold.TSNE().fit_transform(M)
-
-    f, axes = plt.subplots(2, 3)
-
-    f.set_size_inches(12, 8)
-
-    plt.setp(axes, xticks=[], xticklabels=[],
-             yticks=[])
-
-    ax1, ax2, ax3, ax4, ax5, ax6 = axes.ravel()
-
-    # plotting PCA
-    ax1.scatter(pca[:, 0], pca[:, 1], s=10, c=C)
-    ax1.set_xlabel("PC 1")
-    ax1.set_ylabel("PC 2")
-    ax1.set_title("PCA")
-
-    # plotting tSNE
-    ax2.scatter(tsne[:, 0], tsne[:, 1], s=10, c=C)
-    ax2.set_xlabel("tSNE 1")
-    ax2.set_ylabel("tSNE 2")
-    ax2.set_title("tSNE")
-
-    # plotting PHATE - classic MDS
-    ax4.scatter(Y_cmds[:, 0], Y_cmds[:, 1], s=10, c=C)
-    ax4.set_xlabel("phate 1")
-    ax4.set_ylabel("phate 2")
-    ax4.set_title("PHATE embedding of DLA fractal tree\nClassic MDS")
-
-    # plotting PHATE - metric MDS
-    ax5.scatter(Y_mmds[:, 0], Y_mmds[:, 1], s=10, c=C)
-    ax5.set_xlabel("phate 1")
-    ax5.set_title("PHATE embedding of DLA fractal tree\nMetric MDS")
-
-    # plotting PHATE - nonmetric MDS
-    ax6.scatter(Y_nmmds[:, 0], Y_nmmds[:, 1], s=10, c=C)
-    ax6.set_xlabel("phate 1")
-    ax6.set_title("PHATE embedding of DLA fractal tree\nNonmetric MDS")
-
-    plt.tight_layout()
-    plt.savefig("./phate_tree_test.png", dpi=300)
-    print("output saved in './phate_tree_test.png'")
-
-if __name__ == '__main__':
-    sys.exit(main())

From a31c66f8789a8ed7809a08120e6015c3d1150cc4 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 14:51:56 -0400
Subject: [PATCH 2/5] Info message to notify type of precomputed matrix

---
 Python/phate/phate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Python/phate/phate.py b/Python/phate/phate.py
index d593719..f344135 100644
--- a/Python/phate/phate.py
+++ b/Python/phate/phate.py
@@ -544,6 +544,7 @@ def fit(self, X):
                 precomputed = "distance"
             else:
                 precomputed = "affinity"
+            log_info("Using precomputed {} matrix...".format(precomputed))
             n_pca = None
         else:
             precomputed = None
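
The hunk above only adds the log call; the surrounding context already classifies a precomputed input to fit() as either a "distance" or an "affinity" matrix and skips PCA for it. A minimal usage sketch of that path follows. It assumes that a square, symmetric input is what triggers the precomputed branch and that info-level log messages are visible at default verbosity; neither is shown in the hunk itself. The PHATE parameters mirror the test script removed in PATCH 1/5.

    import numpy as np
    import phate
    from scipy.spatial.distance import pdist, squareform

    # Build a square, symmetric distance matrix with a zero diagonal and pass
    # it straight to PHATE; fit() is assumed to recognise it as precomputed.
    data = np.random.normal(size=(200, 50))
    D = squareform(pdist(data))

    phate_op = phate.PHATE(n_components=2, k=5, a=10)
    Y = phate_op.fit_transform(D)
    # With this patch, an info message along the lines of
    # "Using precomputed distance matrix..." should be logged.
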
From 1e64fdb5878fc2b81706ee6d3eb77d4249ef9943 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 14:52:17 -0400
Subject: [PATCH 3/5] update submodule

---
 phateR | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phateR b/phateR
index d99ad0a..ec2241f 160000
--- a/phateR
+++ b/phateR
@@ -1 +1 @@
-Subproject commit d99ad0ac46e8edd64182d0c0c64718815bc51041
+Subproject commit ec2241f15231d1e681511bb0ee006fda502a3caf

From 5afefc365692fc8212ca415b671f3184eb179e7b Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 14:54:26 -0400
Subject: [PATCH 4/5] remove mac detritus

---
 Matlab/phate.m~ | 319 ------------------------------------------------
 1 file changed, 319 deletions(-)
 delete mode 100644 Matlab/phate.m~

diff --git a/Matlab/phate.m~ b/Matlab/phate.m~
deleted file mode 100644
index 3fe21d8..0000000
--- a/Matlab/phate.m~
+++ /dev/null
@@ -1,319 +0,0 @@
-function Y = phate(data, varargin)
-% phate Run PHATE for visualizing noisy non-linear data in lower dimensions
-% Y = phate(data) runs PHATE on data (rows: samples, columns: features)
-% with default parameter settings and returns a 2 dimensional embedding.
-%
-% If data is sparse PCA without mean centering will be done to maintain
-% low memory footprint. If data is dense then normal PCA (with mean
-% centering) is done.
-%
-% Y = phate(data, 'PARAM1',val1, 'PARAM2',val2, ...) allows you to
-% specify optional parameter name/value pairs that control further details
-% of PHATE. Parameters are:
-%
-% 'ndim' - number of (output) embedding dimensions. Common values are 2
-% or 3. Defaults to 2.
-%
-% 'k' - number of nearest neighbors for bandwidth of adaptive alpha
-% decaying kernel or, when a=[], number of nearest neighbors of the knn
-% graph. Defaults to 5.
-%
-% 'a' - alpha of alpha decaying kernel. when a=[] knn (unweighted) kernel
-% is used. Defaults to 10.
-%
-% 't' - number of diffusion steps. Defaults to [] wich autmatically picks
-% the optimal t.
-%
-% 't_max' - maximum t for finding optimal t. if t = [] optimal t will be
-% computed by computing Von Neumann Entropy for each t <= t_max and then
-% picking the kneepoint. Defaults to 100.
-%
-% 'npca' - number of pca components for computing distances. Defaults to
-% 100.
-%
-% 'mds_method' - method of multidimensional scaling. Choices are:
-%
-% 'mmds' - metric MDS (default)
-% 'cmds' - classical MDS
-% 'nmmds' - non-metric MDS
-%
-% 'distfun' - distance function. Default is 'euclidean'.
-%
-% 'distfun_mds' - distance function for MDS. Default is 'euclidean'.
-%
-% 'pot_method' - method of computing the PHATE potential dstance. Choices
-% are:
-%
-% 'log' - -log(P + eps). (default)
-%
-% 'sqrt' - sqrt(P). (not default but often produces superior
-% embeddings)
-%
-% 'gamma' - 2/(1-\gamma)*P^((1-\gamma)/2) (default, with gamma=1)
-%
-% 'gamma' - gamma value for gamma potential method. Value between -1 and
-% 1. -1 is diffusion distance. 1 is log potential. 0 is sqrt. Smaller
-% gamma is a more locally sensitive embedding whereas larger gamma is a
-% more globally sensitive embedding. Defaults to 1.
-%
-% 'pot_eps' - epsilon value added to diffusion operator prior to
-% computing potential. Only used for 'pot_method' is 'log', i.e.:
-% -log(P + pot_eps). Defaults to 1e-3.
-%
-% 'n_landmarks' - number of landmarks for fast and scalable PHATE. [] or
-% n_landmarks = npoints does no landmarking, which is slower. More
-% landmarks is more accurate but comes at the cost of speed and memory.
-% Defaults to 2000.
-%
-% 'nsvd' - number of singular vectors for spectral clustering (for
-% computing landmarks). Defaults to 100.
-%
-% 'kernel' - user supplied kernel. If not given ([]) kernel is
-% computed from the supplied data. Supplied kernel should be a square
-% (samples by samples) symmetric affinity matrix. If kernel is
-% supplied input data can be empty ([]). Defaults to [].
-
-npca = 100;
-k = 5;
-nsvd = 100;
-n_landmarks = 2000;
-ndim = 2;
-t = [];
-mds_method = 'mmds';
-distfun = 'euclidean';
-distfun_mds = 'euclidean';
-pot_method = 'gamma';
-K = [];
-a = 10;
-Pnm = [];
-t_max = 100;
-pot_eps = 1e-7;
-gamma = 1;
-
-% get input parameters
-for i=1:length(varargin)
-    % k for knn adaptive sigma
-    if(strcmp(varargin{i},'k'))
-        k = lower(varargin{i+1});
-    end
-    % a (alpha) for alpha decaying kernel
-    if(strcmp(varargin{i},'a'))
-        a = lower(varargin{i+1});
-    end
-    % diffusion time
-    if(strcmp(varargin{i},'t'))
-        t = lower(varargin{i+1});
-    end
-    % t_max for VNE
-    if(strcmp(varargin{i},'t_max'))
-        t_max = lower(varargin{i+1});
-    end
-    % Number of pca components
-    if(strcmp(varargin{i},'npca'))
-        npca = lower(varargin{i+1});
-    end
-    % Number of dimensions for the PHATE embedding
-    if(strcmp(varargin{i},'ndim'))
-        ndim = lower(varargin{i+1});
-    end
-    % Method for MDS
-    if(strcmp(varargin{i},'mds_method'))
-        mds_method = varargin{i+1};
-    end
-    % Distance function for the inputs
-    if(strcmp(varargin{i},'distfun'))
-        distfun = lower(varargin{i+1});
-    end
-    % distfun for MDS
-    if(strcmp(varargin{i},'distfun_mds'))
-        distfun_mds = lower(varargin{i+1});
-    end
-    % nsvd for spectral clustering
-    if(strcmp(varargin{i},'nsvd'))
-        nsvd = lower(varargin{i+1});
-    end
-    % n_landmarks for spectral clustering
-    if(strcmp(varargin{i},'n_landmarks'))
-        n_landmarks = lower(varargin{i+1});
-    end
-    % potential method: log, sqrt, gamma
-    if(strcmp(varargin{i},'pot_method'))
-        pot_method = lower(varargin{i+1});
-    end
-    % kernel
-    if(strcmp(varargin{i},'kernel'))
-        K = lower(varargin{i+1});
-    end
-    % kernel
-    if(strcmp(varargin{i},'gamma'))
-        gamma = lower(varargin{i+1});
-    end
-    % pot_eps
-    if(strcmp(varargin{i},'pot_eps'))
-        pot_eps = lower(varargin{i+1});
-    end
-end
-
-if isempty(a) && k <=5
-    disp 'Make sure k is not too small when using an unweighted knn kernel'
-    disp(['Currently k = ' numstr(k) ', which may be too small']);
-end
-
-tt_pca = 0;
-tt_kernel = 0;
-tt_svd = 0;
-tt_kmeans = 0;
-tt_lo = 0;
-tt_mmds = 0;
-tt_nmmds = 0;
-
-if isempty(K)
-    if ~isempty(npca) && size(data,2) > npca
-        % PCA
-        disp 'Doing PCA'
-        tic;
-        if issparse(data)
-            disp 'Data is sparse, doing SVD instead of PCA (no mean centering)'
-            pc = svdpca_sparse(data, npca, 'random');
-        else
-            pc = svdpca(data, npca, 'random');
-        end
-        tt_pca = toc;
-        disp(['PCA took ' num2str(tt_pca) ' seconds']);
-    else
-        pc = data;
-    end
-    % kernel
-    tic;
-    if isempty(a)
-        disp 'using unweighted knn kernel'
-        K = compute_kernel_sparse(pc, 'k', k, 'distfun', distfun);
-    else
-        disp 'using alpha decaying kernel'
-        K = compute_alpha_kernel_sparse(pc, 'k', k, 'a', a, 'distfun', distfun);
-    end
-    tt_kernel = toc;
-    disp(['Computing kernel took ' num2str(tt_kernel) ' seconds']);
-else
-    disp 'Using supplied kernel'
-end
-
-disp 'Make kernel row stochastic'
-P = bsxfun(@rdivide, K, sum(K,2));
-
-if ~isempty(n_landmarks) && n_landmarks < size(K,1)
-    % spectral cluster for landmarks
-    disp 'Spectral clustering for landmarks'
-    tic;
-    [U,S,~] = randPCA(P, nsvd);
-    tt_svd = toc;
-    disp(['svd took ' num2str(tt_svd) ' seconds']);
-    tic;
-    IDX = kmeans(U*S, n_landmarks);
-    tt_kmeans = toc;
-    disp(['kmeans took ' num2str(tt_kmeans) ' seconds']);
-
-    % create landmark operators
-    disp 'Computing landmark operators'
-    tic;
-    n = size(K,1);
-    m = max(IDX);
-    Pnm = nan(n,m);
-    for I=1:m
-        Pnm(:,I) = sum(K(:,IDX==I),2);
-    end
-    Pmn = Pnm';
-    Pmn = bsxfun(@rdivide, Pmn, sum(Pmn,2));
-    Pnm = bsxfun(@rdivide, Pnm, sum(Pnm,2));
-    tt_lo = toc;
-    disp(['Computing landmark operators took ' num2str(tt_lo) ' seconds']);
-
-    % Pmm
-    Pmm = Pmn * Pnm;
-else
-    disp 'Running PHATE without landmarking'
-    Pmm = bsxfun(@rdivide, K, sum(K,2));
-end
-
-% VNE
-if isempty(t)
-    disp 'Finding optimal t using VNE'
-    t = vne_optimal_t(Pmm, t_max);
-end
-
-% diffuse
-disp 'Diffusing operator'
-tic;
-P_t = Pmm^t;
-tt_diff = toc;
-disp(['Diffusion took ' num2str(tt_diff) ' seconds']);
-
-% potential distances
-tic;
-disp 'Computing potential distances'
-switch pot_method
-    case 'log'
-        disp 'using -log(P) potential distance'
-        Pot = -log(P_t + pot_eps);
-    case 'sqrt'
-        disp 'using sqrt(P) potential distance'
-        Pot = sqrt(P_t);
-    case 'gamma'
-        disp 'Pot = 2/(1-\gamma)*P^((1-\gamma)/2)'
-        disp(['gamma = ' num2str(gamma)]);
-        gamma = min(gamma, 0.95);
-        Pot = 2/(1-gamma)*P_t.^((1-gamma)/2);
-    otherwise
-        error 'potential method unknown'
-end
-PDX = squareform(pdist(Pot, distfun_mds));
-tt_pdx = toc;
-disp(['Computing potential distance took ' num2str(tt_pdx) ' seconds']);
-
-% CMDS
-disp 'Doing classical MDS'
-tic;
-Y = randmds(PDX, ndim);
-tt_cmds = toc;
-disp(['CMDS took ' num2str(tt_cmds) ' seconds']);
-
-% MMDS
-if strcmpi(mds_method, 'mmds')
-    tic;
-    disp 'Doing metric MDS:'
-    opt = statset('display','iter');
-    Y = mdscale(PDX,ndim,'options',opt,'start',Y,'Criterion','metricstress');
-    tt_mmds = toc;
-    disp(['MMDS took ' num2str(tt_mmds) ' seconds']);
-end
-
-% NMMDS
-if strcmpi(mds_method, 'nmmds')
-    tic;
-    disp 'Doing non-metric MDS:'
-    opt = statset('display','iter');
-    Y = mdscale(PDX,ndim,'options',opt,'start',Y,'Criterion','stress');
-    tt_nmmds = toc;
-    disp(['NMMDS took ' num2str(tt_nmmds) ' seconds']);
-end
-
-if ~isempty(Pnm)
-    % out of sample extension from landmarks to all points
-    disp 'Out of sample extension from landmarks to all points'
-    Y = Pnm * Y;
-end
-
-disp 'Done.'
-
-tt_total = tt_pca + tt_kernel + tt_svd + tt_kmeans + tt_lo + tt_diff + ...
-    tt_pdx + tt_cmds + tt_mmds + tt_nmmds;
-
-disp(['Total time ' num2str(tt_total) ' seconds']);
-
-end
-
-
-
-
-
-

From 78b8da411c6ef286f03b5f2be2510846e8af37d6 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 16:57:21 -0400
Subject: [PATCH 5/5] library size norm returns pandas if given pandas

---
 Python/phate/preprocessing.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/Python/phate/preprocessing.py b/Python/phate/preprocessing.py
index 3690c79..54cd680 100644
--- a/Python/phate/preprocessing.py
+++ b/Python/phate/preprocessing.py
@@ -33,11 +33,15 @@ def library_size_normalize(data, verbose=False):
     if verbose:
         print("Normalizing library sizes for %s cells" % (data.shape[0]))
 
+    # pandas support
+    columns, index = None, None
     try:
-        if isinstance(data, pd.SparseDataFrame):
-            data = data.to_coo()
-        elif pd.api.types.is_sparse(data):
+        if isinstance(data, pd.SparseDataFrame) or \
+                pd.api.types.is_sparse(data):
+            columns, index = data.columns, data.index
             data = data.to_coo()
+        elif isinstance(data, pd.DataFrame):
+            columns, index = data.columns, data.index
     except NameError:
         pass
     except AttributeError as e:
@@ -76,4 +80,12 @@ def library_size_normalize(data, verbose=False):
     # axis = 1 independently normalizes each sample
     data_norm = data_norm * median_transcript_count
 
+    if columns is not None:
+        # pandas dataframe
+        if sparse.issparse(data_norm):
+            data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0)
+        else:
+            data_norm = pd.DataFrame(data_norm)
+        data_norm.columns = columns
+        data_norm.index = index
     return data_norm
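
To illustrate the round trip this patch enables, a minimal sketch follows: a pandas DataFrame of counts goes in, and a DataFrame with the same index (cells) and columns (genes) should come back, rather than a bare array. The data are synthetic and the assertions simply restate the intent of the hunks above; only library_size_normalize and its verbose flag are taken from the diff.

    import numpy as np
    import pandas as pd
    import phate

    # Synthetic count matrix: 100 cells x 500 genes, labelled so we can check
    # that library_size_normalize now preserves the pandas metadata.
    counts = pd.DataFrame(
        np.random.poisson(2, size=(100, 500)),
        index=["cell_%d" % i for i in range(100)],
        columns=["gene_%d" % j for j in range(500)],
    )

    norm = phate.preprocessing.library_size_normalize(counts)
    assert isinstance(norm, pd.DataFrame)
    assert norm.index.equals(counts.index)
    assert norm.columns.equals(counts.columns)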