Some improvements to better match the paper

NicoMigenda · Jan 9, 2024 · e209bdd · e209bdd
1 parent 7d0ef35
commit e209bdd
Show file tree

Hide file tree

Showing 18 changed files with 127 additions and 98 deletions.
diff --git a/Example_dynamic.m b/Example_dynamic.m
@@ -25,7 +25,8 @@
 hold on;
 axis equal;
 axis manual;
-ngpca = init_units(ngpca, data, 19, 'iterations', 40000, 'PCADimensionality', 2);
+numUnits = 19;
+ngpca = init_units(ngpca, data, numUnits, 'iterations', 40000, 'PCADimensionality', 2);
 for i = 1:ngpca.iterations
     % Sample a random data point in each iteration
     ngpca = fit_single(ngpca, data(ceil(size(data,1) .* rand), :));

diff --git a/Example_dynamic.mlx b/Example_dynamic.mlx
diff --git a/Example_stationary.m b/Example_stationary.m
@@ -63,7 +63,8 @@
 hold on;
 axis equal;
 axis manual;
-ngpca = init_units(ngpca, data, 15, 'iterations', 20000, 'PCADimensionality', 2);
+numUnits = 15;
+ngpca = init_units(ngpca, data, numUnits, 'iterations', 20000, 'PCADimensionality', 2);
 for i = 1:ngpca.iterations
     % Sample a random data point in each iteration
     ngpca = fit_single(ngpca, data(ceil(size(data,1) .* rand), :));
@@ -79,7 +80,8 @@
 % For parameter description see above example for single data point
 % training. 
 ngpca = NGPCA();
-ngpca = init_units(ngpca, data, 15, 'Iterations', 20000, 'PCADimensionality', 2);
+numUnits = 15;
+ngpca = init_units(ngpca, data, numUnits, 'Iterations', 20000, 'PCADimensionality', 2);
 tic;
 ngpca = fit_multiple(ngpca, data);
 toc

diff --git a/Example_stationary.mlx b/Example_stationary.mlx
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 <h3 align="center">NGPCA: Clustering high-dimensional and non-stationary data streams</h3>
 Local PCA Clustering for streaming and high-dimensional data distributions. 
 This repo serves to reproduce the results from the publication: Adaptive local Principal Component Analysis improves the clustering of high-dimensional data.
-Please cite this code using the following DOI: https://doi.org/10.1016/j.patcog.2023.110030
+Cite as: Nico Migenda, Ralf Möller, Wolfram Schenck, "Adaptive local Principal Component Analysis improves the clustering of high-dimensional data", Pattern Recognition, Volume 146, 2024, https://doi.org/10.1016/j.patcog.2023.110030
 
 [![View NGPCA: Neural Gas Principal Component Analysis on File Exchange](https://www.mathworks.com/matlabcentral/images/matlab-file-exchange.svg)](https://de.mathworks.com/matlabcentral/fileexchange/154316-ngpca-neural-gas-principal-component-analysis)
 
@@ -27,28 +27,30 @@ Within the download you'll find the following directories and files:
   <summary>Download contents</summary>
 
   ```text
-  |-- Example_stationary.m
-  |-- Example_stationary.mlx
-  |-- README.md
-  |-- Results
-  |   `-- gif
-  |       |-- s1_G_AR_S_V.gif
-  |       |-- s2_G_AR_S_V.gif
-  |       |-- s3_G_AR_S_V.gif
-  |       `-- s4_G_AR_S_V.gif
-  |-- data
-  |   `-- s1.mat
-  `-- ngpca
-    |-- NGPCA.m
-    |-- drawunits.m
-    |-- eforrlsa.m
-    |-- init.m
-    |-- normalizedmi.m
-    |-- plot_ellipse.m
-    |-- potentialFunction.m
-    |-- update.m
-    |-- validate_CI.m
-    `-- validate_NMI_DU.m
+    |-- Example_dynamic.m
+    |-- Example_dynamic.mlx
+    |-- Example_stationary.m
+    |-- Example_stationary.mlx
+    |-- README.md
+    |-- Results
+    |   `-- gif
+    |       |-- dynamic.gif
+    |       `-- s1.gif
+    |-- data
+    |   |-- rls.mat
+    |   |-- s1.mat
+    |   `-- vortex.m
+    `-- ngpca
+        |-- NGPCA.m
+        |-- drawunits.m
+        |-- eforrlsa.m
+        |-- init.m
+        |-- normalizedmi.m
+        |-- plot_ellipse.m
+        |-- potentialFunction.m
+        |-- update.m
+        |-- validate_CI.m
+        `-- validate_NMI_DU.m
 
   ```
 </details>
@@ -57,15 +59,16 @@ Within the download you'll find the following directories and files:
 
 The latest release contains all files needed to directly run the algorithm:
 
-1. Open either `Example_dynamic.m` or `Example_stationary.m` in Matlab
-2. Compiling the scripts will automatically perform NGPCA-Clustering on the s1 or ring-line-square + vortex data set or with standard settings
+1.a Open either `Example_dynamic.m` or `Example_stationary.m` in Matlab
+1.b Alternativly use the provided live script versions (.mlx) 
+2. Running the scripts will automatically perform NGPCA-Clustering on the s1 or ring-line-square + vortex data set or with standard settings
 
 Optional:
 
 3. Change default settings or add optional parameters to the ngpca object creation or for the training process
 4. Train the model directly on a full data set using the `fit_multiple()` function or build your own training loops with `fit_single()`
 5. Visualize the clustering results with the `draw()` function
-6. Calculate validation metrics (CI, NMI, DU) by provding ground thruth and cluster shape information
+6. Calculate validation metrics (CI, NMI, DU) by providing ground thruth and cluster shape information
 
 ## Visualizations
 The following visualizations represent the learning process on selected data sets of the standard clustering benchmark database. For all data sets the default settings are used.
@@ -76,8 +79,9 @@ The following visualizations represent the learning process on selected data set
 
 ## Creators
 
-Nico Migenda
+Nico Migenda, Center for Applied Data Science Gütersloh, Bielefeld University of Applied Sciences and Arts, Germany
 
-Ralf Möller
+Ralf Möller, Computer Engineering Group, Faculty of Technology, Bielefeld University, Germany
+
+Wolfram Schenck, Center for Applied Data Science Gütersloh, Bielefeld University of Applied Sciences and Arts, Germany
 
-Wolfram Schenck
diff --git a/Results/gif/s1.gif b/Results/gif/s1.gif
diff --git a/data/README.md b/data/README.md
@@ -0,0 +1,18 @@
+<h3 align="center">Data sets</h3>
+
+## S1
+P. Fänti and S. Sieranoja
+K-means properties on six clustering benchmark datasets
+Applied Intelligence, 48 (12), 4743-4759, December 2018
+https://doi.org/10.1007/s10489-018-1238-7
+
+Static data set of 15 non-overlapping clusters
+
+## RLS
+
+Ring-Line-Square data set containing a ring and a square that are connected by a line. Continuous distribution
+
+## Vortex
+
+Vortex data set with variable noise. COntinuous distribution
+
diff --git a/data/vortex.m b/data/vortex.m
@@ -7,12 +7,11 @@
     University of Bielefeld
     www.ti.uni-bielefeld.de
 %}
-function D = vortex(a_max, r_max, noise, N);
-
-for i = 1:N
-  a = a_max .* rand(1);
-  r = r_max .* (a ./ a_max) + noise .* rand(1);
-
-  D(i,1) = r .* cos(a);
-  D(i,2) = r .* sin(a);
-end
+function D = vortex(a_max, r_max, noise, N)
+    for i = 1:N
+      a = a_max .* rand(1);
+      r = r_max .* (a ./ a_max) + noise .* rand(1);
+
+      D(i,1) = r .* cos(a);
+      D(i,2) = r .* sin(a);
+    end
diff --git a/ngpca/NGPCA.m b/ngpca/NGPCA.m
@@ -2,6 +2,9 @@
     properties
         % Optional parameters that are overwritten in Constructor if parsed
         potentialFunction = "AR"    % Defines the potential function used during the ranking process
+                                    % Allowed values: "AR", "H", "N", "AV",
+                                    % "VRV", "VRR" - We refer to our paper
+                                    % for an explanation
         softhard          = 0       % 0 = Softclustering more accurate but slower, 1 = Hardclustering  
         learningRate      = 0.99    % Defines the initial learning rate for all units. 
         activity          = 1.00    % Defines the initial activity for all units  
@@ -23,7 +26,7 @@
         NMI                         % Validation: Normalized Mutual information
         data                        % Data 
         initialized                 % Bool variable to only initialize the model once
-        dataDimensionality          % Data dimensionality set to the size of input data
+        dataDimensionality          % Data dimensionality set to the dimensionality of input data
     end
 
     methods
@@ -75,7 +78,7 @@
                 end
             end
             if size(data,2) < 2
-                error('Invalid input size. Data dimensionality lesser than two.')
+                error('Invalid input size. Data dimensionality lesser than 2.')
             end 
             obj.data = data;
             obj.dataDimensionality = size(obj.data,2);
@@ -87,6 +90,9 @@
         % Train on one data point
         %-------
         function obj = fit_single(obj, data)
+            if size(data,2) < 2
+                error('Invalid input size. Data dimensionality lesser than two.')
+            end
             obj.data = data;
             obj = update(obj);
         end
@@ -138,7 +144,8 @@
             end
             % If eigenvalues, eigenvectors and gt are provided, it is
             % possible to calculate the CI 
-            if any(strcmp(varargin,'eigenvalues')) && any(strcmp(varargin,'gt')) && any(strcmp(varargin,'eigenvectors'))
+            if any(strcmp(varargin,'eigenvalues')) && any(strcmp(varargin,'gt')) ...
+               && any(strcmp(varargin,'eigenvectors'))
                 obj = validate_CI(obj, gt, eigenvalues, eigenvectors);
             end
             % If labels exist, calculate NMI and DU

diff --git a/ngpca/drawunits.m b/ngpca/drawunits.m
@@ -9,10 +9,6 @@
     end
     obj.units{k}.H = plot_ellipse(obj.units{k}.weight(1:2,1:2), sqrt(abs(obj.units{k}.eigenvalue(1:2))), obj.units{k}.center(1:2));
 end
-%if t == p.N
-%    for a = 1 : p.M
-%        p.a(a) = text(units{a}.center(1),units{a}.center(2), sprintf('%u', a), 'Color', 'r','FontSize',14);
-%    end
-%end
+% Force the plot, equal to drawnow
 pause(0.001)
 
diff --git a/ngpca/eforrlsa.m b/ngpca/eforrlsa.m
@@ -6,19 +6,14 @@
 EFO_p = zeros(units{k}.m,1);
 EFO_q = zeros(units{k}.m,units{k}.m);
 EFO_r = zeros(units{k}.m,units{k}.m);
-%Algorithmus Gleichungn 3-12
+%Algorithmus Eq. 3-12
 for i = 1:units{k}.m
 
-    % Hilfsvariablen
-    % alpha * Eigenwert
-    % Anderes Alpha als im Hauptalgo. Dieses Alpha läuft gegen 1 während
-    % das normale alpha gegen 0 läuft. Deshalb hier 1-alpha
-    helperVariable1 = (1-units{k}.alpha)*units{k}.eigenvalue(i);
-    % beta * Output
-    % gleicher Verlauf wie Alpha aus Hauptalgo
-    helperVariable2 = units{k}.alpha * units{k}.y(i);
+    % Helper variables
+    alpha_L = (1-units{k}.alpha)*units{k}.eigenvalue(i);
+    beta_y = units{k}.alpha * units{k}.y(i);
 
-    % Init und update von t und d nach Gleichung 5+6
+    % Init und update of t and d according to eq. 5+6
     if i == 1
         EFO_t = 0;
         EFO_d = units{k}.x_c'*units{k}.x_c;
@@ -29,36 +24,35 @@
            EFO_d = 0; 
         end
     end
-    %Gleichung 7
-    EFO_s = (helperVariable1+units{k}.alpha*EFO_d)*units{k}.y(i);
-    %Gleichung 8
-    EFO_L2(i) = helperVariable1*helperVariable1 ...
-        + helperVariable2 * (helperVariable1*units{k}.y(i) + EFO_s);
-    %Gleichung 9
+    % According to eq. 7
+    EFO_s = (alpha_L+units{k}.alpha*EFO_d)*units{k}.y(i);
+    %According to eq. 8
+    EFO_L2(i) = alpha_L*alpha_L ...
+        + beta_y * (alpha_L*units{k}.y(i) + EFO_s);
+    %According to eq. 9
     EFO_n2 = EFO_L2(i) - EFO_s*EFO_s*EFO_t;
     % ensure that EFO_n2 > 0
-    if( EFO_n2 < 1e-100 )
-        EFO_n2 = 1e-100;
+    if( EFO_n2 < eps )
+        EFO_n2 = eps;
     end
     EFO_n = sqrt( EFO_n2 );
-    %Gleichung 12
-    EFO_p(i) = (helperVariable2 - EFO_s*EFO_t) / EFO_n;
-    % Berechnung vom 2 additiven Termen in Gleichung 4 ( siehe Index
-    % Bezeichnung)
+    %According to eq. 12
+    EFO_p(i) = (beta_y - EFO_s*EFO_t) / EFO_n;
+    % Calculation of the 2 additive terms in eq. 4 (notice index)
     units{k}.weight(:,i) = EFO_p(i) * units{k}.x_c;
     for i2 = 1:i
-        %Gleichung 10+11
+        %According to eq. 10+11
         if i2 < i 
             EFO_r(i,i2) = EFO_r(i-1,i2) + EFO_p(i-1)*EFO_q(i-1,i2);
-            EFO_q(i,i2) = -(helperVariable2*units{k}.y(i2)+EFO_s*EFO_r(i,i2)) / EFO_n;
+            EFO_q(i,i2) = -(beta_y*units{k}.y(i2)+EFO_s*EFO_r(i,i2)) / EFO_n;
         elseif i2 == i
             EFO_r(i,i2) = 0;
-            EFO_q(i,i2) = helperVariable1 / EFO_n;
+            EFO_q(i,i2) = alpha_L / EFO_n;
         else
             EFO_r(i,i2) = 0;
             EFO_q(i,i2) = 0;
         end
-        % Gleichung 4
+        % According to eq. 4
         units{k}.weight(:,i) = units{k}.weight(:,i) + EFO_q(i,i2) * V(:,i2);
     end
 end

diff --git a/ngpca/init.m b/ngpca/init.m
@@ -9,6 +9,9 @@
         end
 
         % Initialize centers by choosing N data points at random within the data space
+        % When only 1 data point is passed (fit_single), than the units are
+        % randomly initialized around that one data point. Otherwise around
+        % the provided data set
         if size(obj.data,1) == 1
             for i = 1:obj.dataDimensionality
                 obj.units{k}.center(i) = obj.data(:, i) * rand;
@@ -27,7 +30,7 @@
         obj.units{k}.eigenvalue = repmat(obj.lambda, obj.units{k}.m, 1);
 
         % Residual variance in the minor (n - m) eigendirections
-        obj.units{k}.sigma = obj.lambda;
+        obj.units{k}.sigma_sqr = obj.lambda;
 
         % Deviation between input and center
         obj.units{k}.x_c = zeros(obj.dataDimensionality, 1);
@@ -40,16 +43,17 @@
         obj.units{k}.activity = obj.activity;
 
         % Unit matching measure
-        obj.units{k}.y_bar = obj.lambda^2 * ones(obj.units{k}.m, 1);
+        obj.units{k}.eta_bar = obj.lambda^2 * ones(obj.units{k}.m, 1);
         obj.units{k}.l_bar = repmat(obj.lambda, obj.units{k}.m, 1);
-        obj.units{k}.mt = obj.units{k}.y_bar + obj.units{k}.l_bar;
+        obj.units{k}.gamma_bar = obj.units{k}.eta_bar + obj.units{k}.l_bar;
 
         % Unit summarized matching measure
         %obj.units{k}.D = obj.DtMax;
 
-        % Global learning rate
+        % Learning rates 
         obj.units{k}.alpha = obj.learningRate;
         obj.units{k}.epsilon = obj.learningRate;
+        % Drawing Handle
         obj.units{k}.H = 0;
     end
     obj.initialized = 1;
diff --git a/ngpca/normalizedmi.m b/ngpca/normalizedmi.m
@@ -1,4 +1,7 @@
 function z = normalizedmi(x, y)
+% Mo Chen (2024). Normalized Mutual Information 
+% (https://www.mathworks.com/matlabcentral/fileexchange/29047-normalized-mutual-information),
+% MATLAB Central File Exchange. Retrieved January 9, 2024.
 % Compute normalized mutual information I(x,y)/sqrt(H(x)*H(y)) of two discrete variables x and y.
 % Input:
 %   x, y: two integer vector of the same length 

diff --git a/ngpca/plot_ellipse.m b/ngpca/plot_ellipse.m
@@ -13,7 +13,6 @@
 DEG2RAD = pi./180.0;
 
 Color     = [0 0 0];
-FaceColor = 'none';
 
 % scale axes
 MajorAxis = L(1).*W(:,1);
@@ -28,12 +27,11 @@
 X     = norm(MajorAxis) .* cos(Theta);
 Y     = norm(MinorAxis) .* sin(Theta);
 
-% Rotationsmatrix 
+% Rotationmatrix 
 NX    = Pos(1) + cos(PA).*X - sin(PA).*Y;
 NY    = Pos(2) + sin(PA).*X + cos(PA).*Y;
 
 H1    = plot(NX,NY,'k');
-%set(H1,'EdgeColor',Color,'FaceColor',FaceColor);
 
 % draw axes
 H2 = line([Pos(1); Pos(1) + MajorAxis(1)], [Pos(2); Pos(2) + MajorAxis(2)]);

diff --git a/ngpca/potentialFunction.m b/ngpca/potentialFunction.m
@@ -1,12 +1,14 @@
 function ranking = potentialFunction(unit, dataDimensionality, potentialFunction, rmax)
 
 %% Basisterm that is always considered. For m < n the reconstruction error is added.
+% According to the first additive term of eq. (23)
 basisTerm = unit.y' * (unit.y ./ unit.eigenvalue);
 if unit.m < dataDimensionality
-    lambda_rest = unit.sigma / (dataDimensionality - unit.m);
-    if( lambda_rest <= 0.0 )
+    lambda_rest = unit.sigma_sqr / (dataDimensionality - unit.m);
+    if( lambda_rest <= eps )
         lambda_rest = eps;
     end  
+    % According to eq. (23)
     basisTerm = basisTerm + (1 ./ lambda_rest) * (unit.x_c' * unit.x_c - unit.y' * unit.y);
 end