areas/devops/infrastructure/skills/drift-detection/SKILL.md
Detect and classify Terraform drift, and automate drift detection in CI — scheduled plans, drift metrics, and cloud-native audit log correlation.
npx skillsauth add sawrus/agent-guides drift-detection
Install this skill globally with one command. Works with Claude Code, Cursor, and Windsurf.
3 of 9 scanners reported clean
Some scanners were skipped, did not run, or reported a non-clean status. Review each row below.
Expertise: Terraform plan-based drift detection, CI scheduling, cloud audit log correlation, drift classification.
Use this skill when setting up scheduled drift detection, investigating detected drift, or correlating drift with cloud audit events.
# .github/workflows/drift-detection.yml
#
# Runs `terraform plan -detailed-exitcode` against each production component
# on a schedule. Exit code 2 (pending changes) is treated as drift: the plan
# output is classified, a GitHub issue is opened, and security-sensitive drift
# additionally pages on-call via Slack.
name: Drift Detection

on:
  schedule:
    - cron: '0 */6 * * *' # every 6 hours
  workflow_dispatch: # manual trigger

# Token scopes required by the steps below: OIDC role assumption needs
# id-token, checkout needs contents, and issue creation needs issues.
permissions:
  id-token: write
  contents: read
  issues: write

jobs:
  detect-drift:
    runs-on: ubuntu-latest
    strategy:
      # One broken component must not cancel drift checks for the others.
      fail-fast: false
      matrix:
        component: [network, k8s-cluster, databases, iam-roles]
    steps:
      - uses: actions/checkout@v4

      - name: Configure AWS credentials (OIDC)
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ vars.DRIFT_DETECTOR_ROLE_ARN }}
          aws-region: eu-west-1

      - name: Terraform init
        working-directory: terraform/environments/production/${{ matrix.component }}
        run: terraform init -input=false -backend-config=backend.hcl

      - name: Terraform plan (drift check)
        id: plan
        working-directory: terraform/environments/production/${{ matrix.component }}
        run: |
          # The plan is piped through tee, so `$?` would report tee's status.
          # PIPESTATUS[0] is terraform's own exit code. errexit is disabled
          # around the pipeline so exit code 2 cannot abort the step early.
          set +e
          terraform plan \
            -input=false \
            -no-color \
            -var-file=terraform.tfvars \
            -detailed-exitcode \
            -out=drift-check.plan \
            2>&1 | tee drift-output.txt
          plan_exit="${PIPESTATUS[0]}"
          set -e
          echo "exit_code=${plan_exit}" >> "$GITHUB_OUTPUT"
          # 0 = no changes, 2 = changes (drift). Anything else is a genuine
          # plan failure and must fail the job instead of passing silently.
          if [ "${plan_exit}" -ne 0 ] && [ "${plan_exit}" -ne 2 ]; then
            exit "${plan_exit}"
          fi

      - name: Classify drift
        if: steps.plan.outputs.exit_code == '2'
        # Must run where the plan step wrote drift-output.txt.
        working-directory: terraform/environments/production/${{ matrix.component }}
        run: |
          # Check for security-sensitive resource changes
          if grep -qE "aws_iam|aws_security_group|aws_kms|encryption" drift-output.txt; then
            echo "SEVERITY=INVESTIGATE" >> "$GITHUB_ENV"
          else
            echo "SEVERITY=REMEDIATE" >> "$GITHUB_ENV"
          fi

      - name: Create GitHub Issue on drift
        if: steps.plan.outputs.exit_code == '2'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const plan = fs.readFileSync('terraform/environments/production/${{ matrix.component }}/drift-output.txt', 'utf8');
            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `[DRIFT] ${{ matrix.component }} — ${process.env.SEVERITY}`,
              body: `## Drift detected in \`${{ matrix.component }}\`\n\n**Severity:** ${process.env.SEVERITY}\n\n\`\`\`\n${plan.slice(0, 3000)}\n\`\`\``,
              labels: ['infrastructure', 'drift', process.env.SEVERITY.toLowerCase()]
            });

      - name: Alert on INVESTIGATE drift
        if: env.SEVERITY == 'INVESTIGATE'
        uses: slackapi/slack-github-action@v1
        with:
          payload: |
            {"text": "🚨 *INVESTIGATE drift* in `${{ matrix.component }}` — may indicate unauthorized change. <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View>"}
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_ONCALL_WEBHOOK }}
# .gitlab-ci.yml — drift detection job (triggered by schedule)
#
# Runs one `terraform plan -detailed-exitcode` per COMPONENT. The job fails
# when drift is found (exit code 2, after notifying Slack) and also when the
# plan itself errors, so a broken plan can never be reported as "no drift".
drift-detect:
  stage: audit
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule"'
  parallel:
    matrix:
      - COMPONENT: [network, k8s-cluster, databases]
  script:
    - cd "terraform/environments/production/${COMPONENT}"
    - terraform init -input=false -backend-config=backend.hcl
    - |
      # Redirect to a file instead of piping through tee so that `$?` is
      # terraform's own exit code whether or not the shell has pipefail set.
      EXITCODE=0
      terraform plan -input=false -no-color -var-file=terraform.tfvars \
        -detailed-exitcode -out=drift.plan > drift-output.txt 2>&1 || EXITCODE=$?
      cat drift-output.txt
    - |
      # -detailed-exitcode: 0 = no changes, 2 = changes, anything else = error.
      case "${EXITCODE}" in
        0)
          echo "No drift in ${COMPONENT}"
          ;;
        2)
          echo "DRIFT DETECTED in ${COMPONENT}"
          # Post to Slack, create MR, or alert via API
          curl -fsS -X POST "$SLACK_WEBHOOK" \
            -H 'Content-type: application/json' \
            --data "{\"text\":\"Drift in ${COMPONENT}\"}" \
            || echo "WARNING: Slack notification failed" >&2
          exit 1
          ;;
        *)
          echo "terraform plan failed in ${COMPONENT} (exit ${EXITCODE})" >&2
          exit "${EXITCODE}"
          ;;
      esac
# Parse terraform plan output for drift classification.
#
# Arguments:
#   $1 - full text of a `terraform plan` run
# Output:
#   Prints exactly one word on stdout:
#     INVESTIGATE - a security-sensitive resource is mentioned, or a resource
#                   is scheduled to be destroyed
#     REMEDIATE   - any other (non-security) configuration drift
# Returns:
#   0 always.
classify_drift() {
  local plan_output="$1"

  # Any IAM or KMS resource type counts as security-sensitive, not only
  # policies and keys, so role, user, and attachment changes are not missed.
  local sensitive_re='aws_iam_|aws_security_group|aws_kms_|kms_key|aws_s3_bucket_server_side_encryption|encryption_at_rest'

  # Here-strings are used instead of `echo … | grep -q`: with `set -o pipefail`
  # grep -q exiting on its first match can kill the writer with SIGPIPE, which
  # would make the condition false and misclassify large plans as REMEDIATE.

  # INVESTIGATE: security-sensitive resources changed
  if grep -qE "$sensitive_re" <<<"$plan_output"; then
    echo "INVESTIGATE"
    return 0
  fi

  # INVESTIGATE: resources deleted unexpectedly
  if grep -q "# .* will be destroyed" <<<"$plan_output"; then
    echo "INVESTIGATE"
    return 0
  fi

  # REMEDIATE: configuration drift (non-security)
  echo "REMEDIATE"
}
# AWS CloudTrail: who changed the drifted resource?
# Set RESOURCE_ID to the name or ID of the drifted resource before running.
# The command aborts with a message if it is unset, instead of the shell
# interpreting a literal <resource-id> placeholder as an input redirection.
: "${RESOURCE_ID:?set RESOURCE_ID to the drifted resource name or ID}"

# The start time is a full UTC timestamp (GNU date) so the window is exactly
# the last 24 hours rather than being truncated to a calendar date.
aws cloudtrail lookup-events \
  --lookup-attributes "AttributeKey=ResourceName,AttributeValue=${RESOURCE_ID}" \
  --start-time "$(date -u -d '24 hours ago' '+%Y-%m-%dT%H:%M:%SZ')" \
  --query 'Events[*].{User: Username, Time: EventTime, Event: EventName}' \
  --output table
# GCP Audit Logs
# Reads the last 24 hours of log entries for GCE instances whose method name
# matches "compute.instances", and prints timestamp, caller e-mail, and method
# as a table.
gcloud logging read \
  --format='table(timestamp, protoPayload.authenticationInfo.principalEmail, protoPayload.methodName)' \
  --freshness=24h \
  'resource.type="gce_instance" AND protoPayload.methodName:"compute.instances"'
# lifecycle ignore_changes for intentionally managed-outside-TF fields:
# attributes listed here are left out of the plan diff, so changes made to
# them outside Terraform are not reported as drift.
resource "aws_autoscaling_group" "workers" {
  # ...

  lifecycle {
    ignore_changes = [
      min_size,
      desired_capacity, # managed by cluster autoscaler, not TF
    ]
  }
}
testing
QA Expert for writing E2E tests, test scenarios, test plans, and ensuring test coverage quality.
development
Expert UI/UX design intelligence for creating distinctive, high-craft, and mobile-first interfaces. Focuses on premium aesthetics, touch-first ergonomics, and Flutter performance.
development
Code Review Expert for static analysis, security auditing, architecture review, and ensuring code quality standards.
development
Babysit a GitHub pull request after creation by continuously polling review comments, CI checks/workflow runs, and mergeability state until the PR is merged/closed or user help is required. Diagnose failures, retry likely flaky failures up to 3 times, auto-fix/push branch-related issues when appropriate, and keep watching open PRs so fresh review feedback is surfaced promptly. Use when the user asks Codex to monitor a PR, watch CI, handle review comments, or keep an eye on failures and feedback on an open PR.