# homelab-docs/alertmanager-config-updated.yml

# Alertmanager Configuration for Fred's Homelab - UPDATED
# Location: /etc/prometheus/alertmanager.yml
# Updated: 2026-02-03 (Reduced alert noise)
#
# Changes:
# - Only CRITICAL alerts trigger Discord notifications
# - WARNING alerts are logged but NOT sent to notification channels
# - Removed email notifications entirely

# Settings that apply to the whole Alertmanager instance
global:
  # Per the Alertmanager docs: an alert that arrives without an end time is
  # declared resolved once it has gone this long without being updated
  resolve_timeout: 5m

# Routing tree - every incoming alert starts at this root route
route:
  # Fallback receiver: anything no child route claims ends up at the
  # no-op 'null' receiver
  receiver: 'null'

  # Labels whose values define a notification group
  group_by:
    - 'alertname'
    - 'severity'
    - 'instance'

  # Hold the first notification of a new group for 30s so that related
  # alerts can be batched together
  group_wait: 30s

  # Wait 5m before notifying about new alerts added to an existing group
  group_interval: 5m

  # Re-send a notification every 12h while the group is still firing
  repeat_interval: 12h

  # Child routes, evaluated top to bottom
  routes:
    # severity=critical -> Discord, with a shorter initial wait and
    # hourly reminders
    - receiver: 'discord-critical'
      matchers:
        - 'severity="critical"'
      group_wait: 10s
      repeat_interval: 1h

    # severity=warning -> explicitly discarded via the 'null' receiver
    # (still visible in the Prometheus / Alertmanager UIs)
    - receiver: 'null'
      matchers:
        - 'severity="warning"'
      repeat_interval: 24h

# Inhibition rules - mute lower-priority alerts while a related
# higher-priority alert is firing
inhibit_rules:
  # A firing critical alert mutes warning alerts that carry the same
  # alertname AND the same instance.
  # NOTE(review): this can only match when the critical and warning rules
  # share one alertname; the critical names listed at the end of this file
  # (CriticalCPUUsage, CriticalMemoryUsage, ...) suggest they may not.
  # Confirm against the alert rule file.
  - equal:
      - 'alertname'
      - 'instance'
    source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'

  # A firing HostDown alert mutes every other alert that has the same
  # instance label value
  - equal:
      - 'instance'
    source_matchers:
      - 'alertname="HostDown"'
    target_matchers:
      - 'alertname!="HostDown"'

# Receivers - define where alerts go
receivers:
  # Null receiver - has no integrations configured, so alerts routed here
  # are not sent anywhere (used for warnings and unmatched alerts)
  - name: 'null'

  # Discord notifications for CRITICAL alerts.
  #
  # This uses Alertmanager's native Discord integration (discord_configs,
  # available since Alertmanager 0.25). A generic webhook_configs entry posts
  # Alertmanager's own JSON payload, which is not the message format that
  # Discord's webhook endpoint accepts.
  #
  # The webhook URL contains a secret token, so it is read from a file
  # instead of being stored in this config. Before reloading, create
  # /etc/prometheus/secrets/discord-webhook-url containing only the URL and
  # make it readable solely by the user Alertmanager runs as.
  # webhook_url_file needs Alertmanager >= 0.27; on older versions use
  # `webhook_url: '<url>'` and keep this file out of version control.
  #
  # SECURITY: a webhook URL was previously stored in this file in plain text.
  # Treat it as leaked - delete that webhook in Discord and create a new one.
  - name: 'discord-critical'
    discord_configs:
      - webhook_url_file: '/etc/prometheus/secrets/discord-webhook-url'
        # Also post a message when the alert clears
        send_resolved: true

# ====================================
# Deployment Instructions
# ====================================
#
# 1. Backup existing config:
#    ssh root@10.0.10.25 'cp /etc/prometheus/alertmanager.yml /etc/prometheus/alertmanager.yml.backup'
#
# 2. Upload this file:
#    scp alertmanager-config-updated.yml root@10.0.10.25:/etc/prometheus/alertmanager.yml
#
# 3. Upload updated alert rules:
#    scp prometheus-alert-rules-updated.yml root@10.0.10.25:/etc/prometheus/rules/homelab-alerts.yml
#
# 4. Reload Alertmanager:
#    ssh root@10.0.10.25 'systemctl reload prometheus-alertmanager'
#
# 5. Reload Prometheus:
#    ssh root@10.0.10.25 'systemctl reload prometheus'
#
# 6. Verify configuration:
#    curl http://10.0.10.25:9093/api/v2/status
#    curl http://10.0.10.25:9090/api/v1/rules
#
# 7. Test Discord webhook:
#    curl -X POST -H 'Content-Type: application/json' http://10.0.10.25:9093/api/v2/alerts -d '[
#      {
#        "labels": {
#          "alertname": "TestCriticalAlert",
#          "severity": "critical",
#          "instance": "test:9100"
#        },
#        "annotations": {
#          "summary": "Test alert - please ignore"
#        }
#      }
#    ]'
#
# ====================================
# Alert Flow Summary
# ====================================
#
# CRITICAL alerts:
#  Prometheus → Alertmanager → Discord Webhook → Your Discord Server
#
# WARNING alerts:
#  Prometheus → Alertmanager → null receiver (logged, not sent)
#
# You can view WARNING alerts in:
#  - Prometheus UI: http://10.0.10.25:9090/alerts
#  - Alertmanager UI: http://10.0.10.25:9093/#/alerts
#
# ====================================
# Expected Behavior After Update
# ====================================
#
# Your Discord will ONLY receive:
# ✅ Host completely down (HostDown)
# ✅ CPU >95% for 5 minutes (CriticalCPUUsage)
# ✅ Memory >95% for 5 minutes (CriticalMemoryUsage)
# ✅ Disk <5% free (DiskSpaceCritical)
# ✅ Proxmox node down (ProxmoxNodeDown)
# ✅ PostgreSQL down (PostgreSQLDown)
# ✅ VPS unreachable (VPSDown)
# ✅ Prometheus config reload failed
#
# Your inbox will receive:
# 🚫 NOTHING - all email notifications disabled
#
# Warnings (CPU 80-95%, memory 85-95%, etc.):
# 📊 Logged in Prometheus/Alertmanager UI only
#
# This should dramatically reduce notification noise while still
# catching critical issues that need immediate attention.