Consider clustering customer data for market segmentation. A good clustering solution has customers within each segment that are very similar (low WCSS, high Silhouette score) and segments that are clearly distinct from each other (high Calinski-Harabasz index, low Davies-Bouldin index). If you are segmenting customers by purchasing behavior, a low WCSS and a high Silhouette score mean that customers within a cluster have very similar purchasing patterns, while a high Calinski-Harabasz index means that the segments — each characterized by its average purchasing pattern — are well separated from one another. Business context should complement the metrics: high separation and good compactness according to metrics alone may not correspond to meaningful or actionable customer segments.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.datasets import make_blobs
# Build a synthetic dataset: 300 points drawn from four well-separated blobs.
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Fit k-means with k=4 (k assumed known from prior analysis).
model = KMeans(n_clusters=4, random_state=42)
cluster_ids = model.fit_predict(X)

# Internal validation metrics for this single clustering solution.
# WCSS (inertia) comes from the fitted estimator; the other three are
# computed from the data together with the assigned labels.
metric_report = [
    ("WCSS", model.inertia_),
    ("Silhouette Score", silhouette_score(X, cluster_ids)),
    ("Calinski-Harabasz Index", calinski_harabasz_score(X, cluster_ids)),
    ("Davies-Bouldin Index", davies_bouldin_score(X, cluster_ids)),
]
for metric_name, metric_value in metric_report:
    print(f"{metric_name}: {metric_value}")

# Visualize the points, colored by their assigned cluster.
plt.scatter(X[:, 0], X[:, 1], c=cluster_ids, s=50, cmap='viridis')
plt.title("K-means Clustering")
plt.show()
# Evaluate the same four metrics over a range of k values (k=2 to k=10)
# so the elbow / score peaks can be compared visually.
# FIX: the bodies of both `for` loops below were unindented in the original,
# which is a SyntaxError in Python; proper indentation is restored here.
wcss_values = []
silhouette_scores = []
ch_scores = []
db_scores = []
k_values = range(2, 11)
for k in k_values:
    # Refit k-means for each candidate k with a fixed seed for reproducibility.
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(X)
    labels = kmeans.labels_
    # Calculate and append the metrics for this k.
    wcss_values.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X, labels))
    ch_scores.append(calinski_harabasz_score(X, labels))
    db_scores.append(davies_bouldin_score(X, labels))
# Plot the four metric curves against k in a 2x2 grid.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
axs[0, 0].plot(k_values, wcss_values, marker='o')
axs[0, 0].set_title('WCSS (Elbow Method)')
axs[0, 1].plot(k_values, silhouette_scores, marker='o')
axs[0, 1].set_title('Silhouette Coefficient')
axs[1, 0].plot(k_values, ch_scores, marker='o')
axs[1, 0].set_title('Calinski-Harabasz Index')
axs[1, 1].plot(k_values, db_scores, marker='o')
axs[1, 1].set_title('Davies-Bouldin Index')
# Shared axis labels and a grid on every subplot.
for ax in axs.flat:
    ax.set(xlabel='Number of Clusters (k)', ylabel='Score')
    ax.grid(True)
plt.tight_layout()  # Prevents overlapping labels/titles
plt.show()