Initial infrastructure documentation - comprehensive homelab reference
This commit is contained in:
147
alertmanager-config-updated.yml
Normal file
147
alertmanager-config-updated.yml
Normal file
@@ -0,0 +1,147 @@
|
||||
# Alertmanager Configuration for Fred's Homelab - UPDATED
|
||||
# Location: /etc/prometheus/alertmanager.yml
|
||||
# Updated: 2026-02-03 (Reduced alert noise)
|
||||
#
|
||||
# Changes:
|
||||
# - Only CRITICAL alerts trigger Discord notifications
|
||||
# - WARNING alerts are routed to the 'null' receiver: still visible in the Prometheus/Alertmanager UIs, but NOT sent to any notification channel
|
||||
# - Removed email notifications entirely
|
||||
|
||||
global:
  # Time after which an alert that carries no end time is treated as
  # resolved if it has not been updated.
  resolve_timeout: 5m
|
||||
|
||||
# Root route - every alert enters the routing tree here
route:
  # Alerts sharing these label values are batched into one notification
  group_by: ['alertname', 'severity', 'instance']

  # Wait 30s before sending the first notification for a new group
  # (gives related alerts time to arrive and be grouped together)
  group_wait: 30s

  # Wait 5m before notifying about new alerts added to an existing group
  group_interval: 5m

  # Re-send the notification every 12h while the group is still firing
  repeat_interval: 12h

  # Default receiver - has no integrations, so anything that matches no
  # child route (including warnings) is dropped without notification.
  # Quoted so YAML reads it as the string 'null', not a null value.
  receiver: 'null'

  # Child routes are evaluated in order; the first match handles the alert
  routes:
    # CRITICAL alerts - deliver to Discord, with a shorter initial wait
    # and an hourly reminder while still firing
    - matchers:
        - severity="critical"
      receiver: 'discord-critical'
      group_wait: 10s
      repeat_interval: 1h

    # WARNING alerts - explicitly dropped. They stay visible in the
    # Prometheus and Alertmanager UIs but are never sent anywhere.
    - matchers:
        - severity="warning"
      receiver: 'null'
      repeat_interval: 24h
|
||||
|
||||
# Inhibition rules - mute an alert while a related, more important one fires
inhibit_rules:
  # While a critical alert fires, mute warnings that carry the same
  # alertname AND instance labels.
  # NOTE(review): this only matches if the warning and critical rules share
  # one alertname; if they are named differently (e.g. a separate
  # "Critical..." rule), this rule never applies - confirm against the
  # alert rules file.
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'instance']

  # While HostDown fires, mute every other alert with the same instance label.
  # NOTE(review): presumably all exporters on a host report an identical
  # instance value; if instance includes a per-exporter port, alerts from
  # other exporters will not be inhibited - verify.
  - source_matchers:
      - alertname="HostDown"
    target_matchers:
      - alertname!="HostDown"
    equal: ['instance']
|
||||
|
||||
# Receivers - define where alerts go
receivers:
  # Null receiver - no integrations configured, so alerts routed here are
  # dropped without any notification
  - name: 'null'

  # Discord notifications for CRITICAL alerts.
  # Uses Alertmanager's native Discord integration (Alertmanager >= 0.25).
  # The generic webhook_configs payload is not a valid Discord message, so
  # pointing webhook_configs straight at a Discord webhook URL is rejected.
  - name: 'discord-critical'
    discord_configs:
      # The webhook URL is a credential - keep it out of version control.
      # Put the full URL on a single line in the file below, readable only
      # by the Alertmanager service user (mode 0600).
      # webhook_url_file needs Alertmanager >= 0.26; on 0.25 use webhook_url.
      # The URL previously committed in this file is exposed: rotate it in
      # Discord before deploying.
      - webhook_url_file: /etc/prometheus/secrets/discord-webhook-url
        send_resolved: true
        http_config:
          follow_redirects: true
        # max_alerts is a webhook_configs-only option and has been dropped.
|
||||
|
||||
# ====================================
|
||||
# Deployment Instructions
|
||||
# ====================================
|
||||
#
|
||||
# 1. Backup existing config:
|
||||
# ssh root@10.0.10.25 'cp /etc/prometheus/alertmanager.yml /etc/prometheus/alertmanager.yml.backup'
|
||||
#
|
||||
# 2. Upload this file:
|
||||
# scp alertmanager-config-updated.yml root@10.0.10.25:/etc/prometheus/alertmanager.yml
|
||||
#
|
||||
# 3. Upload updated alert rules:
|
||||
# scp prometheus-alert-rules-updated.yml root@10.0.10.25:/etc/prometheus/rules/homelab-alerts.yml
|
||||
#
|
||||
# 4. Reload Alertmanager:
|
||||
# ssh root@10.0.10.25 'systemctl reload prometheus-alertmanager'
|
||||
#
|
||||
# 5. Reload Prometheus:
|
||||
# ssh root@10.0.10.25 'systemctl reload prometheus'
|
||||
#
|
||||
# 6. Verify configuration:
|
||||
# curl http://10.0.10.25:9093/api/v2/status
|
||||
# curl http://10.0.10.25:9090/api/v1/rules
|
||||
#
|
||||
# 7. Test Discord webhook:
|
||||
# curl -X POST -H 'Content-Type: application/json' http://10.0.10.25:9093/api/v2/alerts -d '[
|
||||
# {
|
||||
# "labels": {
|
||||
# "alertname": "TestCriticalAlert",
|
||||
# "severity": "critical",
|
||||
# "instance": "test:9100"
|
||||
# },
|
||||
# "annotations": {
|
||||
# "summary": "Test alert - please ignore"
|
||||
# }
|
||||
# }
|
||||
# ]'
|
||||
#
|
||||
# ====================================
|
||||
# Alert Flow Summary
|
||||
# ====================================
|
||||
#
|
||||
# CRITICAL alerts:
|
||||
# Prometheus → Alertmanager → Discord Webhook → Your Discord Server
|
||||
#
|
||||
# WARNING alerts:
|
||||
# Prometheus → Alertmanager → null receiver (visible in the UIs, not sent)
|
||||
#
|
||||
# You can view WARNING alerts in:
|
||||
# - Prometheus UI: http://10.0.10.25:9090/alerts
|
||||
# - Alertmanager UI: http://10.0.10.25:9093/#/alerts
|
||||
#
|
||||
# ====================================
|
||||
# Expected Behavior After Update
|
||||
# ====================================
|
||||
#
|
||||
# Your Discord will ONLY receive:
|
||||
# ✅ Host completely down (HostDown)
|
||||
# ✅ CPU >95% for 5 minutes (CriticalCPUUsage)
|
||||
# ✅ Memory >95% for 5 minutes (CriticalMemoryUsage)
|
||||
# ✅ Disk <5% free (DiskSpaceCritical)
|
||||
# ✅ Proxmox node down (ProxmoxNodeDown)
|
||||
# ✅ PostgreSQL down (PostgreSQLDown)
|
||||
# ✅ VPS unreachable (VPSDown)
|
||||
# ✅ Prometheus config reload failed
|
||||
#
|
||||
# Your inbox will receive:
|
||||
# 🚫 NOTHING - all email notifications disabled
|
||||
#
|
||||
# Warnings (CPU 80-95%, memory 85-95%, etc.):
|
||||
# 📊 Logged in Prometheus/Alertmanager UI only
|
||||
#
|
||||
# This should dramatically reduce notification noise while still
|
||||
# catching critical issues that need immediate attention.
|
||||
Reference in New Issue
Block a user