devops/observability/elk-stack/SKILL.md
Deploy and manage the ELK Stack (Elasticsearch, Logstash, Kibana) for log aggregation and analysis. Configure log pipelines, create visualizations, and implement log-based monitoring. Use when centralizing logs, implementing search functionality, or building log analytics platforms.
npx skillsauth add bagelhole/devops-security-agent-skills elk-stack
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Centralize and analyze logs with Elasticsearch, Logstash, and Kibana.
Use this skill when centralizing logs, implementing search functionality, or building log analytics platforms.
# docker-compose.yml
# Single-node ELK stack: Elasticsearch, Logstash, Kibana and a Filebeat shipper.
version: '3.8'

services:
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
    environment:
      - discovery.type=single-node
      # Security (TLS + authentication) is disabled; do not expose port 9200
      # beyond a trusted network with this setting.
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms1g -Xmx1g"
    ports:
      - "9200:9200"
    volumes:
      - elasticsearch-data:/usr/share/elasticsearch/data

  logstash:
    image: docker.elastic.co/logstash/logstash:8.11.0
    volumes:
      - ./logstash/pipeline:/usr/share/logstash/pipeline
      # NOTE: this bind mount replaces the image's whole config directory, so
      # ./logstash/config must provide the settings files Logstash expects
      # (e.g. logstash.yml).
      - ./logstash/config:/usr/share/logstash/config
    ports:
      - "5044:5044"   # Beats input
      - "5000:5000"   # TCP json_lines input
    depends_on:
      - elasticsearch

  kibana:
    image: docker.elastic.co/kibana/kibana:8.11.0
    ports:
      - "5601:5601"
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    depends_on:
      - elasticsearch

  filebeat:
    image: docker.elastic.co/beats/filebeat:8.11.0
    # root is used so Filebeat can read the Docker socket and container logs.
    user: root
    volumes:
      - ./filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      # Host nginx logs, read by the /var/log/nginx/*.log input in filebeat.yml;
      # without this mount that path does not exist inside the container.
      - /var/log/nginx:/var/log/nginx:ro
    depends_on:
      - logstash

volumes:
  elasticsearch-data:
// Index template for every index matching logs-*: attaches the logs-policy
// lifecycle and fixes the mappings of the common log fields.
// "host" is mapped as an object with a keyword "name" sub-field because events
// shipped by Beats carry host as an object (host.name); mapping it as a plain
// keyword makes Elasticsearch reject those documents.
// number_of_replicas is 1; on a single-node cluster the replica cannot be
// allocated and the index stays yellow, so use 0 there.
PUT _index_template/logs-template
{
  "index_patterns": ["logs-*"],
  "template": {
    "settings": {
      "number_of_shards": 1,
      "number_of_replicas": 1,
      "index.lifecycle.name": "logs-policy"
    },
    "mappings": {
      "properties": {
        "@timestamp": { "type": "date" },
        "message": { "type": "text" },
        "level": { "type": "keyword" },
        "service": { "type": "keyword" },
        "host": {
          "properties": {
            "name": { "type": "keyword" }
          }
        },
        "trace_id": { "type": "keyword" }
      }
    }
  }
}
// Lifecycle policy for date-named log indices (logs-YYYY.MM.dd).
// Phase ages are measured from index creation:
//   hot    (0d)  - actively written and searched
//   warm   (7d)  - shrink to one shard and force-merge to one segment
//   cold   (30d) - lowest recovery priority
//   delete (90d) - index is removed
// No rollover action is used: rollover needs a write alias or data stream,
// which date-named indices do not have, and an index whose rollover step
// fails never advances to the later phases (including delete).
// The former "freeze" action is omitted because it performs no work on
// Elasticsearch 8.x.
PUT _ilm/policy/logs-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "set_priority": { "priority": 100 }
        }
      },
      "warm": {
        "min_age": "7d",
        "actions": {
          "shrink": { "number_of_shards": 1 },
          "forcemerge": { "max_num_segments": 1 }
        }
      },
      "cold": {
        "min_age": "30d",
        "actions": {
          "set_priority": { "priority": 0 }
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": {
          "delete": {}
        }
      }
    }
  }
}
# logstash/pipeline/main.conf
# Receives events from Beats (5044) and raw JSON lines over TCP (5000),
# normalises them and writes them to daily logs-YYYY.MM.dd indices.
input {
  beats {
    port => 5044
  }
  tcp {
    port => 5000
    codec => json_lines
  }
}

filter {
  # Parse JSON logs: only attempt it when the message looks like a JSON object.
  if [message] =~ /^\{/ {
    json {
      source => "message"
    }
  }

  # Grok pattern for nginx logs.
  # Must run before the date filter below, because it is what produces the
  # "timestamp" field for nginx events. Filebeat stores custom fields under
  # [fields] unless fields_under_root is set, so both locations are checked.
  if [type] == "nginx" or [fields][type] == "nginx" {
    grok {
      match => {
        "message" => '%{IPORHOST:client_ip} - %{USER:user} \[%{HTTPDATE:timestamp}\] "%{WORD:method} %{URIPATHPARAM:request} HTTP/%{NUMBER:http_version}" %{NUMBER:status} %{NUMBER:bytes}'
      }
    }
  }

  # Parse timestamp into @timestamp. The last format is the HTTPDATE layout
  # emitted by the nginx grok above (e.g. 10/Oct/2023:13:55:36 +0000).
  date {
    match => ["timestamp", "ISO8601", "yyyy-MM-dd HH:mm:ss", "dd/MMM/yyyy:HH:mm:ss Z"]
    target => "@timestamp"
  }

  # Add environment tag
  mutate {
    add_field => { "environment" => "production" }
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    # One index per day, named from the event's @timestamp.
    index => "logs-%{+YYYY.MM.dd}"
  }
}
filter {
  # Parse application logs of the form
  # "<ISO8601 timestamp> <LEVEL> [<service>] <free text>".
  grok {
    match => {
      "message" => "%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} \[%{DATA:service}\] %{GREEDYDATA:log_message}"
    }
  }

  # Drop debug logs in production.
  # Done as soon as "level" is known so dropped events skip the enrichment
  # steps below.
  if [level] == "DEBUG" and [environment] == "production" {
    drop {}
  }

  # Extract trace ID from message
  if [log_message] =~ /trace_id=/ {
    grok {
      match => { "log_message" => "trace_id=%{UUID:trace_id}" }
    }
  }

  # GeoIP lookup, only for events that carry a client address.
  if [client_ip] {
    geoip {
      source => "client_ip"
      target => "geoip"
    }
  }

  # Enrich with lookup: map the HTTP status code to a readable description.
  # "source"/"target" replace the deprecated "field"/"destination" options.
  translate {
    source => "status"
    target => "status_description"
    dictionary => {
      "200" => "OK"
      "404" => "Not Found"
      "500" => "Internal Server Error"
    }
  }
}
# filebeat/filebeat.yml
# Ships Docker container logs and nginx log files to Logstash.
filebeat.inputs:
  # Logs of all containers on the host, enriched with Docker metadata.
  - type: container
    paths:
      - '/var/lib/docker/containers/*/*.log'
    processors:
      - add_docker_metadata:
          host: "unix:///var/run/docker.sock"

  # nginx log files.
  - type: log
    enabled: true
    paths:
      - /var/log/nginx/*.log
    tags: ["nginx"]
    fields:
      type: nginx
    # Put "type" at the top level of the event instead of under "fields",
    # so the Logstash condition `[type] == "nginx"` matches.
    fields_under_root: true

output.logstash:
  hosts: ["logstash:5044"]

# Filebeat's own logs: written to files, keeping the 7 most recent.
logging.level: info
logging.to_files: true
logging.files:
  path: /var/log/filebeat
  name: filebeat
  keepfiles: 7
// Search all logs: return documents from every logs-* index with no filtering.
GET logs-*/_search
{
  "query": {
    "match_all": {}
  }
}
// Search by keyword: full-text match of "error" against the message field.
GET logs-*/_search
{
  "query": {
    "match": {
      "message": {
        "query": "error"
      }
    }
  }
}
// Filter by field: ERROR-level logs from the last hour, restricted to the
// api-gateway service.
GET logs-*/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match": { "level": "ERROR" }
        },
        {
          "range": {
            "@timestamp": { "gte": "now-1h" }
          }
        }
      ],
      "filter": [
        {
          "term": { "service": "api-gateway" }
        }
      ]
    }
  }
}
// Count by log level: a terms aggregation on "level"; "size": 0 suppresses
// the hit list so only the bucket counts are returned.
GET logs-*/_search
{
  "size": 0,
  "aggs": {
    "log_levels": {
      "terms": {
        "field": "level"
      }
    }
  }
}
// Error rate over time: 5-minute buckets on @timestamp, each with a
// sub-aggregation counting the documents whose level is ERROR.
GET logs-*/_search
{
  "size": 0,
  "aggs": {
    "errors_over_time": {
      "date_histogram": {
        "field": "@timestamp",
        "fixed_interval": "5m"
      },
      "aggs": {
        "error_count": {
          "filter": {
            "term": { "level": "ERROR" }
          }
        }
      }
    }
  }
}
Index pattern: `logs-*`; time field: `@timestamp`.
Create saved searches for common queries:
- `level:ERROR` - All errors
- `service:api-gateway AND level:ERROR` - API gateway errors
- `response_time:>1000` - Slow requests
Common visualization types:
Create dashboard with:
// Watch that runs every 5 minutes, counts ERROR-level documents from the last
// 5 minutes in logs-*, and posts to a Slack webhook when there are more than 100.
// Watcher's search input renders hits.total as a plain integer by default, so
// the count is read from ctx.payload.hits.total (there is no ".value" key).
// "size": 0 is set because only the total is used, not the hits themselves.
// Replace the "/services/xxx" path with the real Slack webhook path.
PUT _watcher/watch/error_alert
{
  "trigger": {
    "schedule": { "interval": "5m" }
  },
  "input": {
    "search": {
      "request": {
        "indices": ["logs-*"],
        "body": {
          "size": 0,
          "query": {
            "bool": {
              "must": [
                { "match": { "level": "ERROR" } },
                { "range": { "@timestamp": { "gte": "now-5m" } } }
              ]
            }
          }
        }
      }
    }
  },
  "condition": {
    "compare": { "ctx.payload.hits.total": { "gt": 100 } }
  },
  "actions": {
    "notify_slack": {
      "webhook": {
        "scheme": "https",
        "host": "hooks.slack.com",
        "port": 443,
        "method": "post",
        "path": "/services/xxx",
        "body": "{\"text\": \"High error rate detected: {{ctx.payload.hits.total}} errors in last 5 minutes\"}"
      }
    }
  }
}
Problem: Elasticsearch is consuming too much disk. Solution: Implement ILM policies and reduce retention.
Problem: Queries are taking too long. Solution: Optimize index settings, add more shards, and use filters.
Problem: Logs are not parsed correctly. Solution: Test grok patterns and check for log format changes.
Problem: Elasticsearch OOM errors. Solution: Increase heap size (max 50% of RAM) and limit field data.
development
Design and operationalize SRE dashboards that surface reliability, latency, error, saturation, and capacity signals across services. Use when building observability views for SLOs, incident response, and executive reliability reporting.
testing
Harden OpenClaw self-hosted environments with baseline host controls, auth tightening, secret handling, network segmentation, and safe update/rollback workflows. Use when deploying OpenClaw in home labs, startups, or production-like local AI infrastructure.
devops
Deploy, manage, and optimize vector databases for AI applications. Covers Qdrant, Weaviate, pgvector, and Pinecone — collection management, indexing strategies, backup, and performance tuning for production RAG and semantic search workloads.
testing
Deploy ML models on Kubernetes with KServe (formerly KFServing) and NVIDIA Triton Inference Server. Includes canary deployments, autoscaling, model versioning, A/B testing, and GPU resource management for production model serving.