From 6d1fddc59228993288515c849889d3191cdd0f76 Mon Sep 17 00:00:00 2001
From: Ozlem
Date: Mon, 10 Aug 2020 17:05:58 +0300
Subject: [PATCH] KMeans updated

---
 KMeans_Guide.asv | 28 ++++++++++++++++++++++++++++
 KMeans_Guide.m   | 22 +++++++++++++++++++++-
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 KMeans_Guide.asv

diff --git a/KMeans_Guide.asv b/KMeans_Guide.asv
new file mode 100644
index 0000000..1cbb836
--- /dev/null
+++ b/KMeans_Guide.asv
@@ -0,0 +1,28 @@
+%%%------------- K-Means Clustering
+% Step 1: Choose the number k of clusters
+
+% Step 2: Select k point at random called centroids
+
+% Step 3: Assign each datapoint to the nearest centroid which lead to K
+% clusters
+
+% Step 4: Compute new centroids of each cluster based on the datapoints it
+% contains
+
+% Step 5: Reassign each datapoint to the new closest centroid
+
+% Choosing the k value, search for WCSS. "The Elbow Method"
+
+% Import the dataset
+data = readtable('Datasets\Mall_Customers.csv');
+
+%Check for missing values
+missings = sum(ismissing(data));
+
+%Plot variables to check for outliers
+IncomePlot = plot(data.AnnualIncome);
+SpendingPlot = plot(data.SpendingScore);
+
+% Perform Feature Scaling (Standardization Method)
+stand_income = (data.AnnualIncome - mean(data.AnnualIncome)) / std(data.AnnualIncome);
+data.AnnualIncome = stand_I
\ No newline at end of file
diff --git a/KMeans_Guide.m b/KMeans_Guide.m
index b192237..df11939 100644
--- a/KMeans_Guide.m
+++ b/KMeans_Guide.m
@@ -13,5 +13,25 @@
 
 % Choosing the k value, search for WCSS. "The Elbow Method"
 
-
+% Import the dataset
 data = readtable('Datasets\Mall_Customers.csv');
+
+% Check for missing values (sum of the ismissing flags down each table column)
+missings = sum(ismissing(data));
+
+% Plot each variable in its own figure to check for outliers
+figure; IncomePlot = plot(data.AnnualIncome);     % separate figure: a second plot() would otherwise replace this one
+figure; SpendingPlot = plot(data.SpendingScore);
+
+% Perform Feature Scaling (standardization: subtract the mean, divide by the standard deviation)
+stand_income = (data.AnnualIncome - mean(data.AnnualIncome)) / std(data.AnnualIncome);
+data.AnnualIncome = stand_income;
+
+stand_spending = (data.SpendingScore - mean(data.SpendingScore)) / std(data.SpendingScore);
+data.SpendingScore = stand_spending;
+
+% Select columns for clustering
+selected_data = data(:,4:5); % NOTE(review): assumes columns 4-5 are AnnualIncome and SpendingScore - confirm against the CSV
+
+% Convert the selected table columns to an array for use in the clustering algorithm
+arrayed_data = table2array(selected_data);