148 lines
4.4 KiB
YAML
148 lines
4.4 KiB
YAML
# Alertmanager Configuration for Fred's Homelab - UPDATED
|
|
# Location: /etc/prometheus/alertmanager.yml
|
|
# Updated: 2026-02-03 (Reduced alert noise)
|
|
#
|
|
# Changes:
|
|
# - Only CRITICAL alerts trigger Discord notifications
|
|
# - WARNING alerts are logged but NOT sent to notification channels
|
|
# - Removed email notifications entirely
|
|
|
|
# Settings shared by every route and receiver.
global:
  # Default time after which an alert that has stopped being updated
  # (and carries no explicit end time) is treated as resolved.
  resolve_timeout: '5m'
|
|
|
|
# Top-level routing tree: every incoming alert starts at this node.
route:
  # Fallback receiver: anything no child route claims is discarded here
  # (this is where alerts without a matching severity end up).
  receiver: 'null'

  # Alerts that share all of these label values are batched into a single
  # notification, which keeps the noise down.
  group_by:
    - alertname
    - severity
    - instance

  # Hold a brand-new group for 30s so related alerts can join the first
  # notification.
  group_wait: 30s

  # Wait 5m before notifying about alerts that were added to a group which
  # has already been notified.
  group_interval: 5m

  # Re-send the notification every 12h while the group is still firing.
  repeat_interval: 12h

  # Child routes for specific severities.
  routes:
    # CRITICAL -> Discord, with a shorter initial wait and hourly reminders.
    - receiver: 'discord-critical'
      matchers:
        - 'severity="critical"'
      group_wait: 10s
      repeat_interval: 1h

    # WARNING -> explicitly dropped: still visible in the Prometheus and
    # Alertmanager UIs, but never sent to a notification channel.
    - receiver: 'null'
      matchers:
        - 'severity="warning"'
      repeat_interval: 24h
|
|
|
|
# Inhibition rules: mute less important alerts while a more important one
# is already firing, to prevent alert spam.
inhibit_rules:
  # While a critical alert fires, mute warning alerts that carry the same
  # alertname and instance label values.
  # NOTE(review): this only takes effect when the critical and warning
  # variants share one alertname - confirm against the alert rules file.
  - equal:
      - alertname
      - instance
    source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'

  # While HostDown fires for an instance, mute every other alert that has
  # the same instance label value.
  - equal:
      - instance
    source_matchers:
      - 'alertname="HostDown"'
    target_matchers:
      - 'alertname!="HostDown"'
|
|
|
|
# Receivers: the notification targets referenced by the routing tree.
receivers:
  # Null receiver: has no integrations configured, so alerts routed here are
  # dropped (used for warnings and for anything unmatched).
  - name: 'null'

  # Discord channel for CRITICAL alerts.
  #
  # Uses Alertmanager's native Discord integration. The generic
  # webhook_configs integration posts Alertmanager's own JSON payload, which
  # is not the message format a Discord webhook endpoint accepts, so alerts
  # sent that way are not delivered.
  #
  # The webhook URL is a credential (anyone holding it can post to the
  # channel), so it is read from a file rather than stored in this config.
  # Before reloading Alertmanager:
  #   1. Rotate the webhook in Discord - the URL that used to be inlined here
  #      must be considered exposed.
  #   2. Write the new URL to the file below and make it readable only by the
  #      user the Alertmanager service runs as.
  #   3. Validate with: amtool check-config /etc/prometheus/alertmanager.yml
  # NOTE(review): requires an Alertmanager release that supports
  # discord_configs and webhook_url_file - confirm the installed version.
  - name: 'discord-critical'
    discord_configs:
      - webhook_url_file: '/etc/prometheus/secrets/discord-webhook-url'
        # Also notify when a critical alert clears.
        send_resolved: true
        http_config:
          follow_redirects: true
        # max_alerts is intentionally absent: it is a webhook_configs-only
        # option, and its previous value (0) meant "no limit" anyway.
|
|
|
|
# ====================================
|
|
# Deployment Instructions
|
|
# ====================================
|
|
#
|
|
# 1. Backup existing config:
|
|
# ssh root@10.0.10.25 'cp /etc/prometheus/alertmanager.yml /etc/prometheus/alertmanager.yml.backup'
|
|
#
|
|
# 2. Upload this file:
|
|
# scp alertmanager-config-updated.yml root@10.0.10.25:/etc/prometheus/alertmanager.yml
|
|
#
|
|
# 3. Upload updated alert rules:
|
|
# scp prometheus-alert-rules-updated.yml root@10.0.10.25:/etc/prometheus/rules/homelab-alerts.yml
|
|
#
|
|
# 4. Reload Alertmanager:
|
|
# ssh root@10.0.10.25 'systemctl reload prometheus-alertmanager'
|
|
#
|
|
# 5. Reload Prometheus:
|
|
# ssh root@10.0.10.25 'systemctl reload prometheus'
|
|
#
|
|
# 6. Verify configuration:
|
|
#    curl http://10.0.10.25:9093/api/v2/status
|
|
# curl http://10.0.10.25:9090/api/v1/rules
|
|
#
|
|
# 7. Test Discord webhook:
|
|
#    curl -X POST -H 'Content-Type: application/json' http://10.0.10.25:9093/api/v2/alerts -d '[
|
|
# {
|
|
# "labels": {
|
|
# "alertname": "TestCriticalAlert",
|
|
# "severity": "critical",
|
|
# "instance": "test:9100"
|
|
# },
|
|
# "annotations": {
|
|
# "summary": "Test alert - please ignore"
|
|
# }
|
|
# }
|
|
# ]'
|
|
#
|
|
# ====================================
|
|
# Alert Flow Summary
|
|
# ====================================
|
|
#
|
|
# CRITICAL alerts:
|
|
# Prometheus → Alertmanager → Discord Webhook → Your Discord Server
|
|
#
|
|
# WARNING alerts:
|
|
# Prometheus → Alertmanager → null receiver (logged, not sent)
|
|
#
|
|
# You can view WARNING alerts in:
|
|
# - Prometheus UI: http://10.0.10.25:9090/alerts
|
|
# - Alertmanager UI: http://10.0.10.25:9093/#/alerts
|
|
#
|
|
# ====================================
|
|
# Expected Behavior After Update
|
|
# ====================================
|
|
#
|
|
# Your Discord will ONLY receive:
|
|
# ✅ Host completely down (HostDown)
|
|
# ✅ CPU >95% for 5 minutes (CriticalCPUUsage)
|
|
# ✅ Memory >95% for 5 minutes (CriticalMemoryUsage)
|
|
# ✅ Disk <5% free (DiskSpaceCritical)
|
|
# ✅ Proxmox node down (ProxmoxNodeDown)
|
|
# ✅ PostgreSQL down (PostgreSQLDown)
|
|
# ✅ VPS unreachable (VPSDown)
|
|
# ✅ Prometheus config reload failed
|
|
#
|
|
# Your inbox will receive:
|
|
# 🚫 NOTHING - all email notifications disabled
|
|
#
|
|
# Warnings (CPU 80-95%, memory 85-95%, etc.):
|
|
# 📊 Logged in Prometheus/Alertmanager UI only
|
|
#
|
|
# This should dramatically reduce notification noise while still
|
|
# catching critical issues that need immediate attention.
|