areas/devops/observability/skills/slo-implementation/SKILL.md
Implement SLOs end-to-end in Prometheus — recording rules, burn rate alerts, error budget dashboards, and Sloth/pyrra integration.
npx skillsauth add sawrus/agent-guides slo-implementation
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Expertise: Prometheus recording rules for SLOs, multi-window burn rate alerts, Sloth code generation, error budget Grafana panels.
Use this skill when implementing SLOs for a service in Prometheus, setting up burn rate alerts, or creating error budget dashboards.
# prometheus-rules/slo-checkout-service.yaml
# Recording rules for the checkout-service availability SLI.
# Every rule attaches service="checkout-service" so the recorded series can be
# selected by that label in alerts and dashboards.
groups:
- name: slo:checkout-service:recording
  # Rules in this group are evaluated every 30s, so each recorded series
  # receives one sample per 30s (120 samples/hour).
  interval: 30s
  rules:
  # Good requests: responses with a 2xx status. Only the availability part of
  # the SLI is applied here; no latency condition is part of this expression.
  - record: slo:http_requests_good:rate5m
    labels:
      service: checkout-service
    expr: |
      sum(rate(http_requests_total{
        service="checkout-service",
        status=~"2.."
      }[5m]))
  # For a latency SLI (< 500ms), use the histogram bucket instead:
  #   sum(rate(http_request_duration_seconds_bucket{
  #     service="checkout-service", le="0.5"}[5m]))

  # All requests, regardless of status.
  - record: slo:http_requests_total:rate5m
    labels:
      service: checkout-service
    expr: |
      sum(rate(http_requests_total{service="checkout-service"}[5m]))

  # SLI ratio (5m window), built from the two series recorded above.
  - record: slo:http_availability:ratio_rate5m
    labels:
      service: checkout-service
    expr: |
      slo:http_requests_good:rate5m{service="checkout-service"}
      / slo:http_requests_total:rate5m{service="checkout-service"}

  # Pre-computed availability ratios over the longer windows used by the
  # multi-window burn rate alerts.
  - record: slo:http_availability:ratio_rate30m
    labels:
      service: checkout-service
    expr: |
      sum(rate(http_requests_total{service="checkout-service",status=~"2.."}[30m]))
      / sum(rate(http_requests_total{service="checkout-service"}[30m]))
  - record: slo:http_availability:ratio_rate1h
    labels:
      service: checkout-service
    expr: |
      sum(rate(http_requests_total{service="checkout-service",status=~"2.."}[1h]))
      / sum(rate(http_requests_total{service="checkout-service"}[1h]))
  - record: slo:http_availability:ratio_rate6h
    labels:
      service: checkout-service
    expr: |
      sum(rate(http_requests_total{service="checkout-service",status=~"2.."}[6h]))
      / sum(rate(http_requests_total{service="checkout-service"}[6h]))
  - record: slo:http_availability:ratio_rate1d
    labels:
      service: checkout-service
    expr: |
      sum(rate(http_requests_total{service="checkout-service",status=~"2.."}[1d]))
      / sum(rate(http_requests_total{service="checkout-service"}[1d]))

  # 28d availability: the mean of the 5m ratio over the whole window.
  # avg_over_time divides by the number of samples actually present, so the
  # result does not depend on the 30s evaluation interval or on gaps in the
  # series. A hand-written divisor such as 28 * 24 * 12 assumes one sample
  # every 5 minutes and would inflate the ratio ~10x at a 30s interval.
  # Note: this is an average of ratios, i.e. not weighted by traffic volume.
  - record: slo:http_availability:ratio_rate28d
    labels:
      service: checkout-service
    expr: |
      avg_over_time(slo:http_availability:ratio_rate5m{service="checkout-service"}[28d])
- name: slo:checkout-service:alerts
  # Alerts for the 99.5% availability SLO (error budget = 0.005).
  # Each burn rate alert requires BOTH a long and a short window to be below
  # the threshold, so it fires on sustained burn and resolves quickly once the
  # short window recovers.
  rules:
  # ── Fast burn (1h + 5m windows, 14.4× burn rate) ─────────────
  # At 14.4× the budgeted error rate, one hour consumes 14.4 / (28 * 24)
  # ≈ 2.1% of the 28d budget → page immediately.
  - alert: CheckoutSLOFastBurn
    expr: |
      (slo:http_availability:ratio_rate1h{service="checkout-service"} < (1 - 14.4 * 0.005))
      and
      (slo:http_availability:ratio_rate5m{service="checkout-service"} < (1 - 14.4 * 0.005))
    for: 2m
    labels:
      severity: critical
      service: checkout-service
      slo: availability-99.5
    annotations:
      summary: "Checkout SLO fast burn — error rate > 14.4× baseline"
      # $value is the left-hand operand of `and`, i.e. the 1h ratio.
      description: "1h availability: {{ $value | humanizePercentage }}. Budget burning rapidly."
      runbook_url: "https://runbooks.internal/checkout-slo-fast-burn"

  # ── Slow burn (6h + 30m windows, 6× burn rate) ───────────────
  # At 6× the budgeted error rate, six hours consume 6 * 6 / (28 * 24)
  # ≈ 5.4% of the 28d budget → ticket, fix in business hours.
  - alert: CheckoutSLOSlowBurn
    expr: |
      (slo:http_availability:ratio_rate6h{service="checkout-service"} < (1 - 6 * 0.005))
      and
      (slo:http_availability:ratio_rate30m{service="checkout-service"} < (1 - 6 * 0.005))
    for: 15m
    labels:
      severity: warning
      service: checkout-service
      slo: availability-99.5
    annotations:
      summary: "Checkout SLO slow burn — error rate > 6× baseline"
      # $value is the left-hand operand of `and`, i.e. the 6h ratio.
      description: "6h availability: {{ $value | humanizePercentage }}. Budget burning steadily."
      runbook_url: "https://runbooks.internal/checkout-slo-slow-burn"

  # ── Budget exhaustion warning ────────────────────────────────
  # Fires once more than 75% of the 28d error budget has been consumed:
  # availability below 1 - 0.005 * 0.75 means < 25% of the budget remains.
  - alert: CheckoutSLOBudgetLow
    expr: |
      slo:http_availability:ratio_rate28d{service="checkout-service"}
        < (1 - 0.005 * 0.75)
    for: 1h
    labels:
      severity: warning
      service: checkout-service
      slo: availability-99.5
    annotations:
      summary: "Checkout error budget < 25% remaining in the rolling 28d window"
      description: "28d availability: {{ $value | humanizePercentage }}."
      runbook_url: "https://runbooks.internal/checkout-error-budget"
# slo/checkout-service.yaml
# Sloth spec (prometheus/v1) declaring the checkout-service availability SLO.
version: "prometheus/v1"
service: checkout-service
labels:
  team: backend
  tier: "1"
slos:
- name: requests-availability
  objective: 99.5
  description: "99.5% of checkout requests succeed"
  sli:
    # Event-based SLI: 5xx responses count as errors, measured against all
    # requests. {{.window}} is a template placeholder for the rate window.
    events:
      error_query: |
        sum(rate(http_requests_total{
          service="checkout-service",
          status=~"5.."}[{{.window}}]))
      total_query: |
        sum(rate(http_requests_total{
          service="checkout-service"}[{{.window}}]))
  alerting:
    name: CheckoutServiceAvailability
    # Paging alert: critical severity, links to the availability runbook.
    page_alert:
      labels:
        severity: critical
      annotations:
        runbook_url: https://runbooks.internal/checkout-availability
    # Ticket alert: warning severity.
    ticket_alert:
      labels:
        severity: warning
# Render the Sloth spec into a Prometheus rules file.
# Output: recording rules for all windows plus the multi-window burn rate alerts.
sloth generate \
  -i slo/checkout-service.yaml \
  -o rules/slo-checkout-generated.yaml
# Current error budget remaining (percent of the 28d budget).
# 100 = budget untouched, 0 = budget exhausted, negative = SLO violated.
# avg_over_time yields the mean 5m availability over the window no matter how
# often the series is sampled, so no hand-computed sample count is needed.
(
  avg_over_time(slo:http_availability:ratio_rate5m{service="checkout-service"}[28d])
  - (1 - 0.005)
)
/ 0.005 * 100
# Hours of budget remaining at the current (1h) burn rate.
# Numerator: remaining budget fraction × window length in hours, i.e. the
# hours left if the budget were spent at exactly 1× burn rate.
# Denominator: current burn rate = observed 1h error ratio / budgeted ratio.
# With zero errors in the last hour the result is +Inf (nothing is burning).
(
  (slo:http_availability:ratio_rate28d{service="checkout-service"} - (1 - 0.005))
  / 0.005
) * 28 * 24
/
(
  (1 - slo:http_availability:ratio_rate1h{service="checkout-service"}) / 0.005
)
testing
QA Expert for writing E2E tests, test scenarios, test plans, and ensuring test coverage quality.
development
Expert UI/UX design intelligence for creating distinctive, high-craft, and mobile-first interfaces. Focuses on premium aesthetics, touch-first ergonomics, and Flutter performance.
development
Code Review Expert for static analysis, security auditing, architecture review, and ensuring code quality standards.
development
Babysit a GitHub pull request after creation by continuously polling review comments, CI checks/workflow runs, and mergeability state until the PR is merged/closed or user help is required. Diagnose failures, retry likely flaky failures up to 3 times, auto-fix/push branch-related issues when appropriate, and keep watching open PRs so fresh review feedback is surfaced promptly. Use when the user asks Codex to monitor a PR, watch CI, handle review comments, or keep an eye on failures and feedback on an open PR.