Imagine grouping customers into market segments with k-means. 1. Choose k: Decide on the number of segments (e.g., k=3). 2. Initialize: Randomly select k customers (here, 3) as the initial segment representatives (centroids). 3. Assign: Assign each customer to the closest representative, measured by the distance between their purchase histories. 4. Update: Recalculate each segment's representative as the average of the purchases of the customers assigned to it. 5. Repeat: Repeat steps 3 and 4 until the segment representatives and the customer assignments no longer change.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs # For generating sample data
# 1. Generate sample data (replace with your own data).
#    make_blobs returns (samples, true_labels); the true labels are discarded
#    because clustering is unsupervised.
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)  # Example with 4 clusters

# 2. Choose the number of clusters (k).
k = 4

# 3. Initialize centroids randomly by picking k distinct data points.
#    The starting centroids are drawn explicitly (instead of using init='random')
#    so they can be kept for plotting: after fit(), `cluster_centers_` only holds
#    the *converged* centroids, and the starting positions are not exposed.
rng = np.random.default_rng(42)
initial_centroids = X[rng.choice(X.shape[0], size=k, replace=False)]

# Fit the KMeans model (the assign / update / repeat steps all happen inside fit()).
# n_init=1 because a single, explicit initialization is supplied; with random
# initialization a higher value (e.g. 10) is typically used for multiple restarts.
kmeans = KMeans(n_clusters=k, init=initial_centroids, n_init=1, random_state=42)
kmeans.fit(X)

# Cluster label of each sample and the centroids after the algorithm converges.
labels = kmeans.labels_
final_centroids = kmeans.cluster_centers_

# Plot the clustered points together with the initial (red) and final (black)
# centroids to illustrate how the centroids shift during the iterations.
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
plt.scatter(initial_centroids[:, 0], initial_centroids[:, 1], c='red', marker='X', s=200, label='Initial Centroids')
plt.scatter(final_centroids[:, 0], final_centroids[:, 1], c='black', marker='X', s=200, label='Final Centroids')
plt.title(f'K-Means Clustering Results (k={k})')
plt.legend()
plt.show()