Initial infrastructure documentation - comprehensive homelab reference

This commit is contained in:
Funky (OpenClaw)
2026-02-23 03:42:22 +00:00
commit 0682c79580
169 changed files with 63913 additions and 0 deletions

View File

@@ -0,0 +1,147 @@
# Alertmanager Configuration for Fred's Homelab - UPDATED
# Location: /etc/prometheus/alertmanager.yml
# Updated: 2026-02-03 (Reduced alert noise)
#
# Changes:
# - Only CRITICAL alerts trigger Discord notifications
# - WARNING alerts stay visible in the Prometheus/Alertmanager UIs but are NOT sent to notification channels
# - Removed email notifications entirely
# Settings that apply to Alertmanager as a whole
global:
  # Grace period after which an alert that stops being updated (and carries
  # no end time of its own) is treated as resolved
  resolve_timeout: "5m"
# Routing tree - every incoming alert starts at this top-level route
route:
  # Fallback receiver for alerts no child route claims; 'null' discards them
  # (kept quoted so YAML reads it as a receiver name, not a null value)
  receiver: "null"
  # Alerts that share all of these label values are batched together
  group_by:
    - alertname
    - severity
    - instance
  # Hold a new group for 30s so related alerts join the first notification
  group_wait: 30s
  # Wait at least 5m before notifying about changes to an existing group
  group_interval: 5m
  # Re-send a still-firing group every 12h
  repeat_interval: 12h
  # Child routes, checked in order
  routes:
    # severity=critical -> Discord, with a shorter wait and hourly reminders
    - receiver: "discord-critical"
      matchers:
        - 'severity="critical"'
      group_wait: 10s
      repeat_interval: 1h
    # severity=warning -> explicitly handed to the null receiver (not sent)
    - receiver: "null"
      matchers:
        - 'severity="warning"'
      repeat_interval: 24h
# Inhibition - mute less important alerts while a more important one fires
inhibit_rules:
  # A firing critical alert mutes warnings that carry the same alertname
  # and instance label values.
  # NOTE(review): this only takes effect when the warning and critical rules
  # share an alertname - confirm against the alert rules file.
  - equal:
      - alertname
      - instance
    source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
  # While HostDown fires for an instance, mute every other alert that has
  # the same instance label value.
  - equal:
      - instance
    source_matchers:
      - 'alertname="HostDown"'
    target_matchers:
      - 'alertname!="HostDown"'
# Receivers - notification targets referenced by name from the routing tree
receivers:
  # Receiver with no integrations: alerts routed here are dropped without
  # notifying anyone (used as the default and for warnings)
  - name: 'null'
  # Discord channel for CRITICAL alerts.
  #
  # Uses Alertmanager's native Discord integration instead of
  # webhook_configs: the generic webhook posts Alertmanager's own JSON
  # payload, which is not the message format Discord's webhook endpoint
  # expects.
  #
  # The webhook URL is a credential, so it is read from a file on the
  # Alertmanager host rather than stored in this document. Before
  # reloading, write the URL (single line) to the path below and make it
  # readable only by the user Alertmanager runs as.
  # The URL that was previously committed in plaintext should be regenerated
  # in Discord, since anyone holding it can post to the channel.
  #
  # discord_configs has no max_alerts option; the old webhook setting of 0
  # meant "no limit", so nothing is lost by omitting it.
  #
  # NOTE(review): requires an Alertmanager release that supports
  # discord_configs and webhook_url_file - confirm the installed version and
  # run `amtool check-config` before reloading.
  - name: 'discord-critical'
    discord_configs:
      - webhook_url_file: '/etc/prometheus/alertmanager-discord-webhook-url'
        # Also notify when a critical alert clears
        send_resolved: true
        http_config:
          follow_redirects: true
# ====================================
# Deployment Instructions
# ====================================
#
# 1. Backup existing config:
# ssh root@10.0.10.25 'cp /etc/prometheus/alertmanager.yml /etc/prometheus/alertmanager.yml.backup'
#
# 2. Upload this file:
# scp alertmanager-config-updated.yml root@10.0.10.25:/etc/prometheus/alertmanager.yml
#
# 3. Upload updated alert rules:
# scp prometheus-alert-rules-updated.yml root@10.0.10.25:/etc/prometheus/rules/homelab-alerts.yml
#
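# 4. Validate the new config, then reload Alertmanager (a reload with an invalid file keeps the old config running):
# ssh root@10.0.10.25 'amtool check-config /etc/prometheus/alertmanager.yml && systemctl reload prometheus-alertmanager'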
#
# 5. Reload Prometheus:
# ssh root@10.0.10.25 'systemctl reload prometheus'
#
# 6. Verify configuration:
# curl http://10.0.10.25:9093/api/v2/status
# curl http://10.0.10.25:9090/api/v1/rules
#
# 7. Test Discord webhook:
# curl -X POST -H 'Content-Type: application/json' http://10.0.10.25:9093/api/v2/alerts -d '[
# {
# "labels": {
# "alertname": "TestCriticalAlert",
# "severity": "critical",
# "instance": "test:9100"
# },
# "annotations": {
# "summary": "Test alert - please ignore"
# }
# }
# ]'
#
# ====================================
# Alert Flow Summary
# ====================================
#
# CRITICAL alerts:
# Prometheus → Alertmanager → Discord Webhook → Your Discord Server
#
# WARNING alerts:
# Prometheus → Alertmanager → null receiver (logged, not sent)
#
# You can view WARNING alerts in:
# - Prometheus UI: http://10.0.10.25:9090/alerts
# - Alertmanager UI: http://10.0.10.25:9093/#/alerts
#
# ====================================
# Expected Behavior After Update
# ====================================
#
# Your Discord will ONLY receive:
# ✅ Host completely down (HostDown)
# ✅ CPU >95% for 5 minutes (CriticalCPUUsage)
# ✅ Memory >95% for 5 minutes (CriticalMemoryUsage)
# ✅ Disk <5% free (DiskSpaceCritical)
# ✅ Proxmox node down (ProxmoxNodeDown)
# ✅ PostgreSQL down (PostgreSQLDown)
# ✅ VPS unreachable (VPSDown)
# ✅ Prometheus config reload failed
#
# Your inbox will receive:
# 🚫 NOTHING - all email notifications disabled
#
# Warnings (CPU 80-95%, memory 85-95%, etc.):
# 📊 Logged in Prometheus/Alertmanager UI only
#
# This should dramatically reduce notification noise while still
# catching critical issues that need immediate attention.