From 154df380a9d534a28f58fa7e95a6a50f6632fc71 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 14:42:11 -0400
Subject: [PATCH 1/5] remove old test script

---
 test/phate_test_tree.py | 80 -----------------------------------------
 1 file changed, 80 deletions(-)
 delete mode 100644 test/phate_test_tree.py

diff --git a/test/phate_test_tree.py b/test/phate_test_tree.py
deleted file mode 100644
index 6037b91..0000000
--- a/test/phate_test_tree.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env python3
-
-# author: Daniel Burkhardt
-# (C) 2017 Krishnaswamy Lab GPLv2
-
-from __future__ import print_function
-import phate
-import matplotlib.pyplot as plt
-import sklearn.manifold
-import sys
-
-
-def main(argv=None):
-    # generate DLA tree
-    print("Running PHATE test on DLA tree...\n")
-    M, C = phate.tree.gen_dla(n_dim=100, n_branch=20, branch_length=100,
-                              n_drop=0, rand_multiplier=2, seed=37, sigma=4)
-
-    # instantiate phate_operator
-    phate_operator = phate.PHATE(n_components=2, a=10, k=5, t=30, mds='classic',
-                                 knn_dist='euclidean', mds_dist='euclidean', njobs=-2)
-
-    # run phate with classic MDS
-    Y_cmds = phate_operator.fit_transform(M)
-
-    # run phate with metric MDS
-    # change the MDS embedding without recalculating diffusion potential
-    phate_operator.reset_mds(mds="metric")
-    Y_mmds = phate_operator.fit_transform(M)
-
-    # run phate with nonmetric MDS
-    phate_operator.reset_mds(mds="nonmetric")
-    Y_nmmds = phate_operator.fit_transform(M)
-
-    pca = phate.preprocessing.pca_reduce(M, n_components=2)
-    tsne = sklearn.manifold.TSNE().fit_transform(M)
-
-    f, axes = plt.subplots(2, 3)
-
-    f.set_size_inches(12, 8)
-
-    plt.setp(axes, xticks=[], xticklabels=[],
-             yticks=[])
-
-    ax1, ax2, ax3, ax4, ax5, ax6 = axes.ravel()
-
-    # plotting PCA
-    ax1.scatter(pca[:, 0], pca[:, 1], s=10, c=C)
-    ax1.set_xlabel("PC 1")
-    ax1.set_ylabel("PC 2")
-    ax1.set_title("PCA")
-
-    # plotting tSNE
-    ax2.scatter(tsne[:, 0], tsne[:, 1], s=10, c=C)
-    ax2.set_xlabel("tSNE 1")
-    ax2.set_ylabel("tSNE 2")
-    ax2.set_title("tSNE")
-
-    # plotting PHATE - classic MDS
-    ax4.scatter(Y_cmds[:, 0], Y_cmds[:, 1], s=10, c=C)
-    ax4.set_xlabel("phate 1")
-    ax4.set_ylabel("phate 2")
-    ax4.set_title("PHATE embedding of DLA fractal tree\nClassic MDS")
-
-    # plotting PHATE - metric MDS
-    ax5.scatter(Y_mmds[:, 0], Y_mmds[:, 1], s=10, c=C)
-    ax5.set_xlabel("phate 1")
-    ax5.set_title("PHATE embedding of DLA fractal tree\nMetric MDS")
-
-    # plotting PHATE - nonmetric MDS
-    ax6.scatter(Y_nmmds[:, 0], Y_nmmds[:, 1], s=10, c=C)
-    ax6.set_xlabel("phate 1")
-    ax6.set_title("PHATE embedding of DLA fractal tree\nNonmetric MDS")
-
-    plt.tight_layout()
-    plt.savefig("./phate_tree_test.png", dpi=300)
-    print("output saved in './phate_tree_test.png'")
-
-if __name__ == '__main__':
-    sys.exit(main())

From a31c66f8789a8ed7809a08120e6015c3d1150cc4 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 14:51:56 -0400
Subject: [PATCH 2/5] Info message to notify type of precomputed matrix

---
 Python/phate/phate.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Python/phate/phate.py b/Python/phate/phate.py
index d593719..f344135 100644
--- a/Python/phate/phate.py
+++ b/Python/phate/phate.py
@@ -544,6 +544,7 @@ def fit(self, X):
                 precomputed = "distance"
             else:
                 precomputed = "affinity"
+            log_info("Using precomputed {} matrix...".format(precomputed))
             n_pca = None
         else:
             precomputed = None
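
The hunk above only adds the log call; the surrounding context already classifies a precomputed input to fit() as either a "distance" or an "affinity" matrix and skips PCA for it. A minimal usage sketch of that path follows. It assumes that a square, symmetric input is what triggers the precomputed branch and that info-level log messages are visible at default verbosity; neither is shown in the hunk itself. The PHATE parameters mirror the test script removed in PATCH 1/5.

    import numpy as np
    import phate
    from scipy.spatial.distance import pdist, squareform

    # Build a square, symmetric distance matrix with a zero diagonal and pass
    # it straight to PHATE; fit() is assumed to recognise it as precomputed.
    data = np.random.normal(size=(200, 50))
    D = squareform(pdist(data))

    phate_op = phate.PHATE(n_components=2, k=5, a=10)
    Y = phate_op.fit_transform(D)
    # With this patch, an info message along the lines of
    # "Using precomputed distance matrix..." should be logged.
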
From 1e64fdb5878fc2b81706ee6d3eb77d4249ef9943 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 14:52:17 -0400
Subject: [PATCH 3/5] update submodule

---
 phateR | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phateR b/phateR
index d99ad0a..ec2241f 160000
--- a/phateR
+++ b/phateR
@@ -1 +1 @@
-Subproject commit d99ad0ac46e8edd64182d0c0c64718815bc51041
+Subproject commit ec2241f15231d1e681511bb0ee006fda502a3caf

From 5afefc365692fc8212ca415b671f3184eb179e7b Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 14:54:26 -0400
Subject: [PATCH 4/5] remove mac detritus

---
 Matlab/phate.m~ | 319 ------------------------------------------------
 1 file changed, 319 deletions(-)
 delete mode 100644 Matlab/phate.m~

diff --git a/Matlab/phate.m~ b/Matlab/phate.m~
deleted file mode 100644
index 3fe21d8..0000000
--- a/Matlab/phate.m~
+++ /dev/null
@@ -1,319 +0,0 @@
-function Y = phate(data, varargin)
-% phate Run PHATE for visualizing noisy non-linear data in lower dimensions
-% Y = phate(data) runs PHATE on data (rows: samples, columns: features)
-% with default parameter settings and returns a 2 dimensional embedding.
-%
-% If data is sparse PCA without mean centering will be done to maintain
-% low memory footprint. If data is dense then normal PCA (with mean
-% centering) is done.
-%
-% Y = phate(data, 'PARAM1',val1, 'PARAM2',val2, ...) allows you to
-% specify optional parameter name/value pairs that control further details
-% of PHATE. Parameters are:
-%
-% 'ndim' - number of (output) embedding dimensions. Common values are 2
-% or 3. Defaults to 2.
-%
-% 'k' - number of nearest neighbors for bandwidth of adaptive alpha
-% decaying kernel or, when a=[], number of nearest neighbors of the knn
-% graph. Defaults to 5.
-%
-% 'a' - alpha of alpha decaying kernel. when a=[] knn (unweighted) kernel
-% is used. Defaults to 10.
-%
-% 't' - number of diffusion steps. Defaults to [] wich autmatically picks
-% the optimal t.
-%
-% 't_max' - maximum t for finding optimal t. if t = [] optimal t will be
-% computed by computing Von Neumann Entropy for each t <= t_max and then
-% picking the kneepoint. Defaults to 100.
-%
-% 'npca' - number of pca components for computing distances. Defaults to
-% 100.
-%
-% 'mds_method' - method of multidimensional scaling. Choices are:
-%
-% 'mmds' - metric MDS (default)
-% 'cmds' - classical MDS
-% 'nmmds' - non-metric MDS
-%
-% 'distfun' - distance function. Default is 'euclidean'.
-%
-% 'distfun_mds' - distance function for MDS. Default is 'euclidean'.
-%
-% 'pot_method' - method of computing the PHATE potential dstance. Choices
-% are:
-%
-% 'log' - -log(P + eps). (default)
-%
-% 'sqrt' - sqrt(P). (not default but often produces superior
-% embeddings)
-%
-% 'gamma' - 2/(1-\gamma)*P^((1-\gamma)/2) (default, with gamma=1)
-%
-% 'gamma' - gamma value for gamma potential method. Value between -1 and
-% 1. -1 is diffusion distance. 1 is log potential. 0 is sqrt. Smaller
-% gamma is a more locally sensitive embedding whereas larger gamma is a
-% more globally sensitive embedding. Defaults to 1.
-%
-% 'pot_eps' - epsilon value added to diffusion operator prior to
-% computing potential. Only used for 'pot_method' is 'log', i.e.:
-% -log(P + pot_eps). Defaults to 1e-3.
-%
-% 'n_landmarks' - number of landmarks for fast and scalable PHATE. [] or
-% n_landmarks = npoints does no landmarking, which is slower. More
-% landmarks is more accurate but comes at the cost of speed and memory.
-% Defaults to 2000.
-%
-% 'nsvd' - number of singular vectors for spectral clustering (for
-% computing landmarks). Defaults to 100.
-%
-% 'kernel' - user supplied kernel. If not given ([]) kernel is
-% computed from the supplied data. Supplied kernel should be a square
-% (samples by samples) symmetric affinity matrix. If kernel is
-% supplied input data can be empty ([]). Defaults to [].
-
-npca = 100;
-k = 5;
-nsvd = 100;
-n_landmarks = 2000;
-ndim = 2;
-t = [];
-mds_method = 'mmds';
-distfun = 'euclidean';
-distfun_mds = 'euclidean';
-pot_method = 'gamma';
-K = [];
-a = 10;
-Pnm = [];
-t_max = 100;
-pot_eps = 1e-7;
-gamma = 1;
-
-% get input parameters
-for i=1:length(varargin)
-    % k for knn adaptive sigma
-    if(strcmp(varargin{i},'k'))
-        k = lower(varargin{i+1});
-    end
-    % a (alpha) for alpha decaying kernel
-    if(strcmp(varargin{i},'a'))
-        a = lower(varargin{i+1});
-    end
-    % diffusion time
-    if(strcmp(varargin{i},'t'))
-        t = lower(varargin{i+1});
-    end
-    % t_max for VNE
-    if(strcmp(varargin{i},'t_max'))
-        t_max = lower(varargin{i+1});
-    end
-    % Number of pca components
-    if(strcmp(varargin{i},'npca'))
-        npca = lower(varargin{i+1});
-    end
-    % Number of dimensions for the PHATE embedding
-    if(strcmp(varargin{i},'ndim'))
-        ndim = lower(varargin{i+1});
-    end
-    % Method for MDS
-    if(strcmp(varargin{i},'mds_method'))
-        mds_method = varargin{i+1};
-    end
-    % Distance function for the inputs
-    if(strcmp(varargin{i},'distfun'))
-        distfun = lower(varargin{i+1});
-    end
-    % distfun for MDS
-    if(strcmp(varargin{i},'distfun_mds'))
-        distfun_mds = lower(varargin{i+1});
-    end
-    % nsvd for spectral clustering
-    if(strcmp(varargin{i},'nsvd'))
-        nsvd = lower(varargin{i+1});
-    end
-    % n_landmarks for spectral clustering
-    if(strcmp(varargin{i},'n_landmarks'))
-        n_landmarks = lower(varargin{i+1});
-    end
-    % potential method: log, sqrt, gamma
-    if(strcmp(varargin{i},'pot_method'))
-        pot_method = lower(varargin{i+1});
-    end
-    % kernel
-    if(strcmp(varargin{i},'kernel'))
-        K = lower(varargin{i+1});
-    end
-    % kernel
-    if(strcmp(varargin{i},'gamma'))
-        gamma = lower(varargin{i+1});
-    end
-    % pot_eps
-    if(strcmp(varargin{i},'pot_eps'))
-        pot_eps = lower(varargin{i+1});
-    end
-end
-
-if isempty(a) && k <=5
-    disp 'Make sure k is not too small when using an unweighted knn kernel'
-    disp(['Currently k = ' numstr(k) ', which may be too small']);
-end
-
-tt_pca = 0;
-tt_kernel = 0;
-tt_svd = 0;
-tt_kmeans = 0;
-tt_lo = 0;
-tt_mmds = 0;
-tt_nmmds = 0;
-
-if isempty(K)
-    if ~isempty(npca) && size(data,2) > npca
-        % PCA
-        disp 'Doing PCA'
-        tic;
-        if issparse(data)
-            disp 'Data is sparse, doing SVD instead of PCA (no mean centering)'
-            pc = svdpca_sparse(data, npca, 'random');
-        else
-            pc = svdpca(data, npca, 'random');
-        end
-        tt_pca = toc;
-        disp(['PCA took ' num2str(tt_pca) ' seconds']);
-    else
-        pc = data;
-    end
-    % kernel
-    tic;
-    if isempty(a)
-        disp 'using unweighted knn kernel'
-        K = compute_kernel_sparse(pc, 'k', k, 'distfun', distfun);
-    else
-        disp 'using alpha decaying kernel'
-        K = compute_alpha_kernel_sparse(pc, 'k', k, 'a', a, 'distfun', distfun);
-    end
-    tt_kernel = toc;
-    disp(['Computing kernel took ' num2str(tt_kernel) ' seconds']);
-else
-    disp 'Using supplied kernel'
-end
-
-disp 'Make kernel row stochastic'
-P = bsxfun(@rdivide, K, sum(K,2));
-
-if ~isempty(n_landmarks) && n_landmarks < size(K,1)
-    % spectral cluster for landmarks
-    disp 'Spectral clustering for landmarks'
-    tic;
-    [U,S,~] = randPCA(P, nsvd);
-    tt_svd = toc;
-    disp(['svd took ' num2str(tt_svd) ' seconds']);
-    tic;
-    IDX = kmeans(U*S, n_landmarks);
-    tt_kmeans = toc;
-    disp(['kmeans took ' num2str(tt_kmeans) ' seconds']);
-
-    % create landmark operators
-    disp 'Computing landmark operators'
-    tic;
-    n = size(K,1);
-    m = max(IDX);
-    Pnm = nan(n,m);
-    for I=1:m
-        Pnm(:,I) = sum(K(:,IDX==I),2);
-    end
-    Pmn = Pnm';
-    Pmn = bsxfun(@rdivide, Pmn, sum(Pmn,2));
-    Pnm = bsxfun(@rdivide, Pnm, sum(Pnm,2));
-    tt_lo = toc;
-    disp(['Computing landmark operators took ' num2str(tt_lo) ' seconds']);
-
-    % Pmm
-    Pmm = Pmn * Pnm;
-else
-    disp 'Running PHATE without landmarking'
-    Pmm = bsxfun(@rdivide, K, sum(K,2));
-end
-
-% VNE
-if isempty(t)
-    disp 'Finding optimal t using VNE'
-    t = vne_optimal_t(Pmm, t_max);
-end
-
-% diffuse
-disp 'Diffusing operator'
-tic;
-P_t = Pmm^t;
-tt_diff = toc;
-disp(['Diffusion took ' num2str(tt_diff) ' seconds']);
-
-% potential distances
-tic;
-disp 'Computing potential distances'
-switch pot_method
-    case 'log'
-        disp 'using -log(P) potential distance'
-        Pot = -log(P_t + pot_eps);
-    case 'sqrt'
-        disp 'using sqrt(P) potential distance'
-        Pot = sqrt(P_t);
-    case 'gamma'
-        disp 'Pot = 2/(1-\gamma)*P^((1-\gamma)/2)'
-        disp(['gamma = ' num2str(gamma)]);
-        gamma = min(gamma, 0.95);
-        Pot = 2/(1-gamma)*P_t.^((1-gamma)/2);
-    otherwise
-        error 'potential method unknown'
-end
-PDX = squareform(pdist(Pot, distfun_mds));
-tt_pdx = toc;
-disp(['Computing potential distance took ' num2str(tt_pdx) ' seconds']);
-
-% CMDS
-disp 'Doing classical MDS'
-tic;
-Y = randmds(PDX, ndim);
-tt_cmds = toc;
-disp(['CMDS took ' num2str(tt_cmds) ' seconds']);
-
-% MMDS
-if strcmpi(mds_method, 'mmds')
-    tic;
-    disp 'Doing metric MDS:'
-    opt = statset('display','iter');
-    Y = mdscale(PDX,ndim,'options',opt,'start',Y,'Criterion','metricstress');
-    tt_mmds = toc;
-    disp(['MMDS took ' num2str(tt_mmds) ' seconds']);
-end
-
-% NMMDS
-if strcmpi(mds_method, 'nmmds')
-    tic;
-    disp 'Doing non-metric MDS:'
-    opt = statset('display','iter');
-    Y = mdscale(PDX,ndim,'options',opt,'start',Y,'Criterion','stress');
-    tt_nmmds = toc;
-    disp(['NMMDS took ' num2str(tt_nmmds) ' seconds']);
-end
-
-if ~isempty(Pnm)
-    % out of sample extension from landmarks to all points
-    disp 'Out of sample extension from landmarks to all points'
-    Y = Pnm * Y;
-end
-
-disp 'Done.'
-
-tt_total = tt_pca + tt_kernel + tt_svd + tt_kmeans + tt_lo + tt_diff + ...
-    tt_pdx + tt_cmds + tt_mmds + tt_nmmds;
-
-disp(['Total time ' num2str(tt_total) ' seconds']);
-
-end
-
-
-
-
-
-

From 78b8da411c6ef286f03b5f2be2510846e8af37d6 Mon Sep 17 00:00:00 2001
From: Scott Gigante
Date: Wed, 6 Jun 2018 16:57:21 -0400
Subject: [PATCH 5/5] library size norm returns pandas if given pandas

---
 Python/phate/preprocessing.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/Python/phate/preprocessing.py b/Python/phate/preprocessing.py
index 3690c79..54cd680 100644
--- a/Python/phate/preprocessing.py
+++ b/Python/phate/preprocessing.py
@@ -33,11 +33,15 @@ def library_size_normalize(data, verbose=False):
     if verbose:
         print("Normalizing library sizes for %s cells" % (data.shape[0]))
 
+    # pandas support
+    columns, index = None, None
     try:
-        if isinstance(data, pd.SparseDataFrame):
-            data = data.to_coo()
-        elif pd.api.types.is_sparse(data):
+        if isinstance(data, pd.SparseDataFrame) or \
+                pd.api.types.is_sparse(data):
+            columns, index = data.columns, data.index
             data = data.to_coo()
+        elif isinstance(data, pd.DataFrame):
+            columns, index = data.columns, data.index
     except NameError:
         pass
     except AttributeError as e:
@@ -76,4 +80,12 @@ def library_size_normalize(data, verbose=False):
     # axis = 1 independently normalizes each sample
     data_norm = data_norm * median_transcript_count
 
+    if columns is not None:
+        # pandas dataframe
+        if sparse.issparse(data_norm):
+            data_norm = pd.SparseDataFrame(data_norm, default_fill_value=0)
+        else:
+            data_norm = pd.DataFrame(data_norm)
+        data_norm.columns = columns
+        data_norm.index = index
     return data_norm
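
To illustrate the round trip this patch enables, a minimal sketch follows: a pandas DataFrame of counts goes in, and a DataFrame with the same index (cells) and columns (genes) should come back, rather than a bare array. The data are synthetic and the assertions simply restate the intent of the hunks above; only library_size_normalize and its verbose flag are taken from the diff.

    import numpy as np
    import pandas as pd
    import phate

    # Synthetic count matrix: 100 cells x 500 genes, labelled so we can check
    # that library_size_normalize now preserves the pandas metadata.
    counts = pd.DataFrame(
        np.random.poisson(2, size=(100, 500)),
        index=["cell_%d" % i for i in range(100)],
        columns=["gene_%d" % j for j in range(500)],
    )

    norm = phate.preprocessing.library_size_normalize(counts)
    assert isinstance(norm, pd.DataFrame)
    assert norm.index.equals(counts.index)
    assert norm.columns.equals(counts.columns)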