claude-desktop-skills/monitoring-setup/SKILL.md
You are an expert at setting up monitoring, logging, and alerting systems.
npx skillsauth add ViggyV/claude-skills Monitoring SetupInstall this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
You are an expert at setting up monitoring, logging, and alerting systems.
This skill activates when the user needs help with:
Ask about:
┌─────────────────────────────────────────────────────────────────┐
│ OBSERVABILITY STACK │
├─────────────────────────────────────────────────────────────────┤
│ │
│ METRICS LOGS TRACES ALERTS │
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
│ │Prometheus│ │ Loki │ │ Jaeger │ │Alertmgr │ │
│ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ │
│ │ │ │ │ │
│ ─────┴───────────────┴───────────────┴───────────────┴───── │
│ │ │
│ ┌──────▼──────┐ │
│ │ Grafana │ │
│ │ (Dashboard) │ │
│ └─────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
rule_files:
- /etc/prometheus/rules/*.yml
scrape_configs:
# Prometheus self-monitoring
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# Application metrics
- job_name: 'app'
static_configs:
- targets: ['app:8000']
metrics_path: /metrics
# Node exporter for system metrics
- job_name: 'node'
static_configs:
- targets: ['node-exporter:9100']
# Kubernetes service discovery
- job_name: 'kubernetes-pods'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
# alerts.yml
groups:
- name: application
rules:
- alert: HighErrorRate
expr: |
sum(rate(http_requests_total{status=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m])) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)"
- alert: HighLatency
expr: |
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))
> 0.5
for: 5m
labels:
severity: warning
annotations:
summary: "High latency detected"
description: "95th percentile latency is {{ $value }}s"
- alert: PodCrashLooping
expr: |
increase(kube_pod_container_status_restarts_total[1h]) > 3
for: 5m
labels:
severity: warning
annotations:
summary: "Pod crash looping"
description: "Pod {{ $labels.pod }} restarted {{ $value }} times"
- name: infrastructure
rules:
- alert: HighCPUUsage
expr: |
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 10m
labels:
severity: warning
annotations:
summary: "High CPU usage"
description: "CPU usage is {{ $value }}%"
- alert: HighMemoryUsage
expr: |
(node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
/ node_memory_MemTotal_bytes * 100 > 85
for: 5m
labels:
severity: warning
annotations:
summary: "High memory usage"
description: "Memory usage is {{ $value }}%"
- alert: DiskSpaceLow
expr: |
(node_filesystem_avail_bytes{fstype!="tmpfs"}
/ node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 < 15
for: 5m
labels:
severity: critical
annotations:
summary: "Low disk space"
description: "Only {{ $value }}% disk space remaining"
from prometheus_client import Counter, Histogram, Gauge, generate_latest
from fastapi import FastAPI, Request
import time
app = FastAPI()
# Define metrics
REQUEST_COUNT = Counter(
'http_requests_total',
'Total HTTP requests',
['method', 'endpoint', 'status']
)
REQUEST_LATENCY = Histogram(
'http_request_duration_seconds',
'HTTP request latency',
['method', 'endpoint'],
buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0]
)
ACTIVE_REQUESTS = Gauge(
'http_requests_active',
'Active HTTP requests'
)
DB_CONNECTIONS = Gauge(
'db_connections_active',
'Active database connections'
)
# Middleware for automatic instrumentation
@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
ACTIVE_REQUESTS.inc()
start_time = time.time()
response = await call_next(request)
duration = time.time() - start_time
endpoint = request.url.path
method = request.method
status = response.status_code
REQUEST_COUNT.labels(method=method, endpoint=endpoint, status=status).inc()
REQUEST_LATENCY.labels(method=method, endpoint=endpoint).observe(duration)
ACTIVE_REQUESTS.dec()
return response
# Metrics endpoint
@app.get("/metrics")
async def metrics():
return Response(
generate_latest(),
media_type="text/plain"
)
# Structured logging
import structlog
logger = structlog.get_logger()
@app.get("/api/users/{user_id}")
async def get_user(user_id: str):
logger.info(
"user_fetch",
user_id=user_id,
action="fetch"
)
# ... implementation
{
"dashboard": {
"title": "Application Dashboard",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total[5m])) by (status)",
"legendFormat": "{{status}}"
}
]
},
{
"title": "Latency (P95)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))"
}
]
},
{
"title": "Error Rate",
"type": "singlestat",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100"
}
],
"format": "percent"
},
{
"title": "Active Connections",
"type": "gauge",
"targets": [
{
"expr": "db_connections_active"
}
]
}
]
}
}
version: '3.8'
services:
prometheus:
image: prom/prometheus:v2.47.0
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- ./alerts.yml:/etc/prometheus/rules/alerts.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.retention.time=15d'
ports:
- "9090:9090"
alertmanager:
image: prom/alertmanager:v0.26.0
volumes:
- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
ports:
- "9093:9093"
grafana:
image: grafana/grafana:10.1.0
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_USERS_ALLOW_SIGN_UP=false
ports:
- "3000:3000"
loki:
image: grafana/loki:2.9.0
volumes:
- ./loki-config.yml:/etc/loki/local-config.yaml
- loki_data:/loki
ports:
- "3100:3100"
promtail:
image: grafana/promtail:2.9.0
volumes:
- ./promtail-config.yml:/etc/promtail/config.yml
- /var/log:/var/log
command: -config.file=/etc/promtail/config.yml
volumes:
prometheus_data:
grafana_data:
loki_data:
Provide:
data-ai
Use this skill for reinforcement learning tasks including training RL agents (PPO, SAC, DQN, TD3, DDPG, A2C, etc.), creating custom Gym environments, implementing callbacks for monitoring and control,
testing
You are an expert at optimizing SQL queries for performance and efficiency.
tools
Knowledge and utilities for creating animated GIFs optimized for Slack. Provides constraints, validation tools, and animation concepts. Use when users request animated GIFs for Slack like "make me a G
tools
21 production-ready scripts for iOS app testing, building, and automation. Provides semantic UI navigation, build automation, accessibility testing, and simulator lifecycle management. Optimized for A