# clustering.py

# -*- coding: utf-8 -*-
"""clustering.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1qygB1-c4ObKvMJJ9rVybhV48Z8KeZ4kk
"""

import pandas as pd
from google.colab import drive
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import kneighbors_graph
from sklearn.cluster import SpectralClustering
import scipy

# Mount Google Drive (Colab only) and load the country-level indicator table.
drive.mount('/content/gdrive')
df = pd.read_csv('gdrive/My Drive/Country-data.csv')

# Quick look at the data. A bare expression only renders inside a notebook
# cell, so print explicitly to get the same overview when run as a script.
print(df)
print(df.describe())
df.info()

# Pairwise correlations of the numeric indicators, computed once and reused.
# Non-numeric columns (presumably the `country` label used for the bar plots
# below) are excluded up front: pandas >= 2.0 raises on them instead of
# silently dropping them as older versions did.
comatrix = df.select_dtypes(include='number').corr()
corrs = comatrix  # original name kept so later cells/readers still find it

plt.figure(figsize=(10, 8))
heatmap = sns.heatmap(corrs, annot=True, cmap="coolwarm")
plt.title("correlations")
plt.show()

cogdpp = comatrix['gdpp']
print("Correlation Matrix:")
print(comatrix)
print("\nCorrelation with gdpp:")
print(cogdpp)

# Countries in the bottom quartile of `health`.
low_health = df[df["health"] < df["health"].quantile(0.25)]
plt.figure(figsize=(12, 6))
sns.barplot(x="country", y="health", data=low_health)
plt.xticks(rotation=90)
plt.title("low health")
plt.show()

# Countries in the bottom quartile of `income`.
low_income = df[df["income"] < df["income"].quantile(0.25)]
plt.figure(figsize=(12, 6))
sns.barplot(x="country", y="income", data=low_income)
plt.xticks(rotation=90)
plt.title("countries with low income")
plt.show()

# Keep only the columns used for clustering and min-max scale them.
# NOTE: despite its name, `df_std` holds min-max scaled values (MinMaxScaler),
# not standardized ones.
df = df.drop(['imports', 'country'], axis=1)
from sklearn.preprocessing import MinMaxScaler
std_features = MinMaxScaler()
df_std = std_features.fit_transform(df.values)

# Fit K-Means once per candidate k and collect both diagnostics from the same
# model: inertia for the elbow plot (k = 1..10) and the silhouette score
# (k = 2..10; it is undefined for a single cluster). The fixed random_state
# makes this equivalent to fitting the models separately for each plot.
w = []
silhouette_scores = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=500, n_init=10, random_state=42)
    kmeans.fit(df_std)
    w.append(kmeans.inertia_)
    if i >= 2:
        silhouette_scores.append(silhouette_score(df_std, kmeans.labels_))

plt.plot(range(1, 11), w, marker='o')
plt.title('Elbow')
plt.xlabel('Number of Clusters')
plt.ylabel('W')
plt.show()

plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.title('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Final K-Means model with a fixed number of clusters; the labels are stored
# on the (unscaled) frame so the scatter plots use the original units.
optimal_clusters = 2
kmeans = KMeans(
    n_clusters=optimal_clusters,
    init='k-means++',
    max_iter=500,
    n_init=10,
    random_state=42,
)
kmeans.fit(df_std)
df['Cluster'] = kmeans.labels_

feature1 = 'child_mort'
feature2 = 'income'
feature3 = 'life_expec'

# (x, y) feature pair for each panel of the 2x2 grid, in subplot order.
kmeans_panels = [
    (feature1, feature2),
    (feature1, feature3),
    (feature2, feature3),
    (feature2, feature1),
]

plt.figure(figsize=(10, 4))
for panel_index, (x_feature, y_feature) in enumerate(kmeans_panels, start=1):
    plt.subplot(2, 2, panel_index)
    for cluster in range(optimal_clusters):
        members = df[df['Cluster'] == cluster]
        plt.scatter(members[x_feature], members[y_feature], label=f'Cluster {cluster}')
    plt.title(f'{x_feature} vs {y_feature}')
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
plt.tight_layout()
plt.show()

# Assign every row to its nearest fitted centroid and print the assignments.
df['Cluster_kmeans'] = kmeans.predict(df_std)
print(df[['Cluster_kmeans']])

# Silhouette score of a Gaussian mixture for each candidate component count.
gmm_candidates = range(2, 11)
silhouette_scores = []
for n_clusters in gmm_candidates:
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm.fit(df_std)
    labels = gmm.predict(df_std)
    silhouette_scores.append(silhouette_score(df_std, labels))

plt.plot(gmm_candidates, silhouette_scores, marker='o')
plt.title('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Candidate with the highest silhouette score (first one on ties).
optimal_clusters = gmm_candidates[silhouette_scores.index(max(silhouette_scores))]
print("optimal_clusters = ", optimal_clusters)

# The value reported above is overridden: the final mixture is fitted with a
# fixed two components.
optimal_clusters = 2
gmm_optimal = GaussianMixture(n_components=optimal_clusters, random_state=42)
gmm_optimal.fit(df_std)
df['Cluster_GMM'] = gmm_optimal.predict(df_std)
print(df[['Cluster_GMM']])

feature1 = 'child_mort'
feature2 = 'income'
feature3 = 'life_expec'

# (x, y) feature pair for each panel of the 2x2 grid, in subplot order.
gmm_panels = [
    (feature1, feature2),
    (feature1, feature3),
    (feature2, feature3),
    (feature2, feature1),
]

plt.figure(figsize=(10, 8))
for panel_index, (x_feature, y_feature) in enumerate(gmm_panels, start=1):
    plt.subplot(2, 2, panel_index)
    for cluster in range(optimal_clusters):
        members = df[df['Cluster_GMM'] == cluster]
        plt.scatter(members[x_feature], members[y_feature], label=f'Cluster {cluster}')
    plt.title(f'{x_feature} vs {y_feature} (GMM)')
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
plt.tight_layout()
plt.show()

from scipy.linalg import eigh

def spectral_clustering(X, n_clusters, n_neighbors):
    """Cluster the rows of ``X`` with unnormalized spectral clustering.

    A symmetric k-nearest-neighbour connectivity graph is built over the rows
    of ``X``, the graph Laplacian ``L = D - A`` is formed, and K-Means is run
    on the eigenvectors belonging to the ``n_clusters`` smallest eigenvalues.

    Args:
        X: Array-like of shape (n_samples, n_features) accepted by
            ``sklearn.neighbors.kneighbors_graph``.
        n_clusters: Number of clusters, and number of Laplacian eigenvectors
            used as the embedding.
        n_neighbors: Number of neighbours per sample in the k-NN graph.

    Returns:
        Array of length n_samples with the K-Means label of each row.
    """
    # Step 1: Construct the affinity matrix using kneighbors_graph.
    # kneighbors_graph returns a SciPy sparse matrix. Densify it so the row
    # sums below are a plain 1-D array; on the sparse matrix np.sum yields an
    # (n, 1) matrix, for which np.diag extracts a single element instead of
    # building the diagonal degree matrix.
    connectivity = kneighbors_graph(X, n_neighbors=n_neighbors, mode='connectivity', include_self=True)
    affinity_matrix = np.asarray(connectivity.toarray(), dtype=float)
    affinity_matrix = 0.5 * (affinity_matrix + affinity_matrix.T)  # Make it symmetric

    # Step 2: Compute the (unnormalized) Laplacian matrix.
    degree_matrix = np.diag(affinity_matrix.sum(axis=1))
    laplacian_matrix = degree_matrix - affinity_matrix

    # Step 3: Compute the first k eigenvectors (smallest eigenvalues) of the
    # Laplacian. `subset_by_index` replaces the deprecated `eigvals` keyword.
    _, eigenvectors = eigh(laplacian_matrix, subset_by_index=[0, n_clusters - 1])

    # Step 4: Form clusters using k-means on the obtained eigenvectors.
    # n_init is set explicitly to match the other K-Means calls in this file
    # and to avoid depending on the version-specific default.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(eigenvectors)

    return cluster_labels

import networkx as nx
from sklearn.metrics.pairwise import euclidean_distances

# Pairwise Euclidean distances between all rows of the scaled data.
distances = euclidean_distances(df_std, df_std)

# Graph with one node per row; each node carries its row of `df` as the
# `data` attribute.
G = nx.Graph()
G.add_nodes_from((row, {'data': df.iloc[row]}) for row in range(len(df)))

# Connect every pair (i < j) whose distance is below the threshold. Only the
# strict upper triangle is inspected, so each pair is considered once and no
# self-loops are created.
distance_threshold = 1.5  # Adjust this threshold based on your data
close_pairs = np.triu(distances < distance_threshold, k=1)
sources, targets = np.nonzero(close_pairs)
G.add_edges_from(zip(sources.tolist(), targets.tolist()))

# Visualize the graph (optional)
pos = nx.spring_layout(G)
nx.draw(
    G,
    pos,
    with_labels=True,
    font_size=8,
    node_size=200,
    node_color='skyblue',
    font_color='black',
    font_weight='bold',
    edge_color='gray',
)
plt.show()

# Dense adjacency matrix of the graph, rows/columns in node insertion order.
adjacency_matrix = nx.to_numpy_array(G)

# Silhouette score of the custom spectral clustering for each candidate
# cluster count.
# NOTE(review): the rows of the thresholded adjacency matrix are used as the
# feature vectors here (both for the k-NN graph and for the silhouette score),
# not the scaled indicators in `df_std` - confirm this is intended.
spectral_candidates = range(2, 11)
silhouette_scores = []
for n_clusters in spectral_candidates:
    cluster_labels = spectral_clustering(adjacency_matrix, n_clusters=n_clusters, n_neighbors=5)  # Adjust n_neighbors as needed
    silhouette_avg = silhouette_score(adjacency_matrix, cluster_labels)
    silhouette_scores.append(silhouette_avg)

plt.plot(spectral_candidates, silhouette_scores, marker='o')
plt.title('Silhouette Score')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Map the position of the best score back to its cluster count. The candidates
# start at 2, so indexing the range avoids the off-by-one of a hand-written
# offset.
optimal_clusters = spectral_candidates[silhouette_scores.index(max(silhouette_scores))]

# Fit on the scaled features. `df` is unscaled and by now also carries the
# cluster-label columns added by the earlier models, which must not be fed
# back in as features.
spectral_model_optimal = SpectralClustering(n_clusters=optimal_clusters, affinity='nearest_neighbors', random_state=42)
cluster_labels_optimal = spectral_model_optimal.fit_predict(df_std)
print("Cluster Assignments:")
print(cluster_labels_optimal)

feature1 = 'child_mort'
feature2 = 'income'
feature3 = 'life_expec'

# (x, y) feature pair for each panel of the 2x2 grid, in subplot order.
spectral_panels = [
    (feature1, feature2),
    (feature1, feature3),
    (feature2, feature3),
    (feature2, feature1),
]

plt.figure(figsize=(10, 8))
for panel_index, (x_feature, y_feature) in enumerate(spectral_panels, start=1):
    plt.subplot(2, 2, panel_index)
    for cluster in range(optimal_clusters):
        cluster_data = df[cluster_labels_optimal == cluster]
        plt.scatter(cluster_data[x_feature], cluster_data[y_feature], label=f'Cluster {cluster}')
    plt.title(f'{x_feature} vs {y_feature} (Spectral Clustering)')
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
plt.tight_layout()
plt.show()

# Spectral clustering of the scaled data with a fixed two clusters.
optimal_clusters = 2
spectral_model = SpectralClustering(
    n_clusters=optimal_clusters,
    affinity='nearest_neighbors',
    random_state=42,
)
cluster_labels = spectral_model.fit_predict(df_std)
print("Cluster Assignments:")
print(cluster_labels)

feature1 = 'child_mort'
feature2 = 'income'
feature3 = 'life_expec'

# (x, y) feature pair for each panel of the 2x2 grid, in subplot order.
two_cluster_panels = [
    (feature1, feature2),
    (feature1, feature3),
    (feature2, feature3),
    (feature2, feature1),
]

plt.figure(figsize=(10, 8))
for panel_index, (x_feature, y_feature) in enumerate(two_cluster_panels, start=1):
    plt.subplot(2, 2, panel_index)
    for cluster in range(optimal_clusters):
        # Rows of the unscaled frame whose label equals this cluster.
        members = df[cluster_labels == cluster]
        plt.scatter(members[x_feature], members[y_feature], label=f'Cluster {cluster}')
    plt.title(f'{x_feature} vs {y_feature} (Spectral Clustering)')
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
plt.tight_layout()
plt.show()