skills/clustering-analysis/SKILL.md
Identify groups and patterns in data using k-means, hierarchical clustering, and DBSCAN for cluster discovery, customer segmentation, and unsupervised learning
npx skillsauth add aj-geddes/useful-ai-prompts Clustering Analysis
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Clustering partitions data into groups of similar observations without pre-defined labels, enabling discovery of natural patterns and structures in data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
silhouette_score, silhouette_samples, davies_bouldin_score,
calinski_harabasz_score
)
from scipy.cluster.hierarchy import dendrogram, linkage
import seaborn as sns
# Generate sample data: one 2-D Gaussian blob (unit variance) per centre.
np.random.seed(42)  # fixed seed so every run produces the same points
n_samples = 300
centers = [[0, 0], [5, 5], [-3, 4]]
# Derive the blob size from n_samples rather than hard-coding 100, so that
# changing n_samples actually changes the dataset. Integer division means
# the total is rounded down to a multiple of len(centers).
samples_per_center = n_samples // len(centers)
X = np.vstack([
    np.random.randn(samples_per_center, 2) + center
    for center in centers
])
# Standardize: learn the per-feature scaling from X, then apply it.
# The fitted scaler is kept so results can be mapped back to original units.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
# K-Means model selection: sweep k over 2..10 and record, for each k, the
# inertia (elbow method) and the mean silhouette score.
k_range = range(2, 11)
inertias = []
silhouette_scores = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# One panel per diagnostic: (values, line style, y-axis label, title).
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
panels = [
    (inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (silhouette_scores, 'go-', 'Silhouette Score', 'Silhouette Analysis'),
]
for panel_ax, (values, line_style, y_label, panel_title) in zip(axes, panels):
    panel_ax.plot(k_range, values, line_style)
    panel_ax.set_xlabel('Number of Clusters (k)')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
# Fit the final K-Means model with the chosen number of clusters (k = 3).
optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10).fit(X_scaled)
kmeans_labels = kmeans.labels_
# K-Means diagnostics in one figure: cluster scatter, per-sample silhouette
# plot, and a Ward dendrogram of the same data.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# K-Means clusters, drawn in the original (unscaled) feature space.
axes[0].scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap='viridis', alpha=0.6)
# The model was fitted on X_scaled, so cluster_centers_ are in standardized
# units. Map them back to original units before overlaying them on X,
# otherwise the centre markers do not sit on their clusters.
centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
axes[0].scatter(
    centers_original[:, 0], centers_original[:, 1],
    c='red', marker='X', s=200, edgecolors='black', linewidths=2
)
axes[0].set_title(f'K-Means (k={optimal_k})')
axes[0].set_xlabel('Feature 1')
axes[0].set_ylabel('Feature 2')

# Silhouette plot: one horizontal band per cluster, samples sorted by their
# silhouette coefficient, with a 10-row gap between bands.
ax = axes[1]
y_lower = 10
silhouette_vals = silhouette_samples(X_scaled, kmeans_labels)
for i in range(optimal_k):
    cluster_silhouette_vals = np.sort(silhouette_vals[kmeans_labels == i])
    size_cluster_i = cluster_silhouette_vals.shape[0]
    y_upper = y_lower + size_cluster_i
    ax.fill_betweenx(np.arange(y_lower, y_upper),
                     0, cluster_silhouette_vals,
                     alpha=0.7, label=f'Cluster {i}')
    y_lower = y_upper + 10
# Dashed line marks the mean silhouette score over all samples.
ax.axvline(x=silhouette_score(X_scaled, kmeans_labels), color="red", linestyle="--")
ax.set_xlabel('Silhouette Coefficient')
ax.set_ylabel('Cluster Label')
ax.set_title('Silhouette Plot')

# Hierarchical clustering: dendrogram truncated to the last 10 merges.
linkage_matrix = linkage(X_scaled, method='ward')
dendrogram(linkage_matrix, ax=axes[2], truncate_mode='lastp', p=10)
axes[2].set_title('Dendrogram (Ward)')
# With truncate_mode='lastp' a collapsed leaf is labelled with its sample
# count in parentheses rather than a sample index, so say so on the axis.
axes[2].set_xlabel('Sample Index or (Cluster Size)')

plt.tight_layout()
plt.show()
# Agglomerative clustering (Ward linkage) with the same cluster count as K-Means.
hierarchical = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
hier_labels = hierarchical.fit(X_scaled).labels_

# DBSCAN clustering; samples labelled -1 are noise, not a cluster.
dbscan = DBSCAN(eps=0.4, min_samples=5)
dbscan_labels = dbscan.fit(X_scaled).labels_
is_noise = dbscan_labels == -1
n_noise = int(is_noise.sum())
n_clusters_dbscan = len(np.unique(dbscan_labels[~is_noise]))

# Gaussian Mixture Model: hard labels plus per-component membership
# probabilities for every sample.
gmm = GaussianMixture(n_components=optimal_k, random_state=42)
gmm_labels = gmm.fit_predict(X_scaled)
gmm_proba = gmm.predict_proba(X_scaled)
# Clustering algorithm comparison: one panel per algorithm on a 2x2 grid,
# all drawn in the original feature space.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
algorithms = [
    (kmeans_labels, 'K-Means'),
    (hier_labels, 'Hierarchical'),
    (dbscan_labels, 'DBSCAN'),
    (gmm_labels, 'Gaussian Mixture'),
]
# axes.ravel() walks the grid row by row, matching the order of `algorithms`.
for ax, (labels, title) in zip(axes.ravel(), algorithms):
    # Points labelled -1 (DBSCAN noise) are excluded from the coloured scatter.
    clustered = labels != -1
    scatter = ax.scatter(
        X[clustered, 0], X[clustered, 1],
        c=labels[clustered], cmap='viridis', alpha=0.6
    )

    # Noise points are drawn separately as red crosses.
    if title == 'DBSCAN' and n_noise > 0:
        outliers = labels == -1
        ax.scatter(X[outliers, 0], X[outliers, 1], c='red', marker='x', s=100, label='Noise')
        ax.legend()

    n_found = len(set(labels[clustered]))
    ax.set_title(f'{title} (n_clusters={n_found})')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')

plt.tight_layout()
plt.show()
# Cluster validation metrics: score every labelling with three internal
# indices and collect the results in one table.


def _score_or_nan(metric, data, labels):
    """Return ``metric(data, labels)`` or NaN if the labelling cannot be scored.

    Samples labelled -1 (DBSCAN noise) are dropped first. The sklearn
    validation metrics raise ValueError unless the number of distinct labels
    is at least 2 and smaller than the number of samples, so any labelling
    outside that range (all noise, a single cluster, every point its own
    cluster) yields NaN instead of aborting the whole comparison.
    """
    keep = labels != -1
    kept_labels = labels[keep]
    n_labels = np.unique(kept_labels).size
    if not 2 <= n_labels < kept_labels.size:
        return np.nan
    return metric(data[keep], kept_labels)


# Insertion order of both dicts fixes the row and column order of the table.
labelings = {
    'K-Means': kmeans_labels,
    'Hierarchical': hier_labels,
    'DBSCAN': dbscan_labels,
    'GMM': gmm_labels,
}
metric_functions = {
    'Silhouette Score': silhouette_score,
    'Davies-Bouldin Index': davies_bouldin_score,
    'Calinski-Harabasz Index': calinski_harabasz_score,
}

validation_metrics = {'Algorithm': list(labelings)}
for metric_name, metric in metric_functions.items():
    validation_metrics[metric_name] = [
        _score_or_nan(metric, X_scaled, labels)
        for labels in labelings.values()
    ]

metrics_df = pd.DataFrame(validation_metrics)
print("Clustering Validation Metrics:")
print(metrics_df)
# Cluster size analysis: number of samples per cluster label for each of
# the algorithms that assign every sample to a cluster.
partitions = {
    'K-Means': kmeans_labels,
    'Hierarchical': hier_labels,
    'GMM': gmm_labels,
}
sizes_df = pd.DataFrame({
    name: pd.Series(labels).value_counts().sort_index()
    for name, labels in partitions.items()
})
print("\nCluster Sizes:")
print(sizes_df)
# Membership probability (GMM): colour each sample by the probability of
# its most likely component.
membership = np.max(gmm_proba, axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    X[:, 0], X[:, 1],
    c=membership, cmap='RdYlGn', alpha=0.6, s=50,
)
ax.set_title('Cluster Membership Confidence (GMM)')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
fig.colorbar(scatter, ax=ax, label='Membership Probability')
plt.show()
# Cluster characteristics: per-cluster centroid and summary statistics,
# all reported in the original (unscaled) feature units.
feature_columns = ['Feature 1', 'Feature 2']
# The K-Means centres live in standardized space; convert them back so they
# are comparable with the describe() output below.
kmeans_centers_original = scaler.inverse_transform(kmeans.cluster_centers_)
cluster_df = pd.DataFrame(X, columns=feature_columns)
cluster_df['Cluster'] = kmeans_labels

for cluster_id in range(optimal_k):
    cluster_data = cluster_df[cluster_df['Cluster'] == cluster_id]
    centroid = ", ".join(
        f"{name}={value:.3f}"
        for name, value in zip(feature_columns, kmeans_centers_original[cluster_id])
    )
    print(f"\nCluster {cluster_id} Characteristics:")
    print(f"Centroid (original units): {centroid}")
    print(cluster_data[feature_columns].describe())
development
Implement Zero Trust security model with identity verification, microsegmentation, least privilege access, and continuous monitoring. Use when building secure cloud-native applications.
development
Prevent Cross-Site Scripting (XSS) attacks through input sanitization, output encoding, and Content Security Policy. Use when handling user-generated content in web applications.
tools
Create wireframes and interactive prototypes to visualize user interfaces and gather feedback early. Use tools and techniques to communicate design ideas before development.
development
Implement real-time bidirectional communication with WebSockets including connection management, message routing, and scaling. Use when building real-time features, chat systems, live notifications, or collaborative applications.