# Prometheus Alert Rules for Fred's Homelab - UPDATED
# Location: /etc/prometheus/rules/homelab-alerts.yml
# Updated: 2026-02-03 (Reduced alert noise)
#
# Changes:
# - CPU threshold: 80%+ over 5 minutes
# - Only CRITICAL alerts trigger notifications
# - WARNING alerts are logged only
#
# Conventions used by every rule below:
# - `severity` is either `critical` or `warning`.
# - `category` groups alerts by area (infrastructure, performance, storage,
#   network, database, automation, monitoring).
# - Annotations reference `hostname`, `role` and `app` labels; these are
#   presumably attached to targets by the scrape configuration - TODO confirm.
---
groups:
  # ====================================
  # Host & Infrastructure Alerts
  # ====================================
  - name: infrastructure
    interval: 30s
    rules:
      # Host is completely down
      # NOTE(review): a bare `up == 0` matches every scrape target, including
      # the service targets (Home Assistant, n8n) that are meant to be
      # warning-only, so this critical alert fires for them as well. Consider
      # narrowing the selector to the host exporter job - confirm label scheme.
      - alert: HostDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} ({{ $labels.hostname }}) has been unreachable for more than 2 minutes. Check network connectivity and host status."

      # High CPU usage (warning only - logged, not sent)
      # Busy percentage = 100 - average idle percentage across all cores.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High CPU usage on {{ $labels.hostname }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 80%)"

      # Critical CPU usage (notification sent)
      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL CPU usage on {{ $labels.hostname }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 95%)"

      # High memory usage (warning only)
      # Used percentage is derived from MemAvailable, not MemFree.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High memory usage on {{ $labels.hostname }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 85%)"

      # Critical memory usage
      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL memory usage on {{ $labels.hostname }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 95%)"

  # ====================================
  # Storage Alerts
  # ====================================
  - name: storage
    interval: 1m
    rules:
      # Disk space low (warning only)
      # Filesystems of type tmpfs, fuse.lxcfs, squashfs and overlay are excluded.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 15
        for: 5m
        labels:
          severity: warning
          category: storage
        annotations:
          summary: "Low disk space on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has {{ $value | humanize }}% free space remaining (threshold: 15%)"

      # Disk space critical
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 5
        for: 2m
        labels:
          severity: critical
          category: storage
        annotations:
          summary: "CRITICAL disk space on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has only {{ $value | humanize }}% free space remaining!"

      # Disk will fill in 24 hours (warning only)
      # Linear extrapolation of the last hour of samples, 24 * 3600 seconds ahead.
      - alert: DiskSpaceFillingFast
        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"}[1h], 24 * 3600) < 0
        for: 1h
        labels:
          severity: warning
          category: storage
        annotations:
          summary: "Disk filling rapidly on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to fill within 24 hours at current rate"

      # High disk I/O wait (warning only)
      # Averaged across cores so the host raises a single alert and the value
      # is a host-wide percentage, consistent with the CPU usage rules.
      - alert: HighDiskIOWait
        expr: avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High disk I/O wait on {{ $labels.hostname }}"
          description: "Disk I/O wait on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 10%)"

  # ====================================
  # Network Alerts
  # ====================================
  - name: network
    interval: 1m
    rules:
      # Network interface down (warning only)
      # Loopback, veth*, docker* and br-* devices are excluded.
      - alert: NetworkInterfaceDown
        expr: node_network_up{device!~"lo|veth.*|docker.*|br-.*"} == 0
        for: 2m
        labels:
          severity: warning
          category: network
        annotations:
          summary: "Network interface down on {{ $labels.hostname }}"
          description: "Network interface {{ $labels.device }} on {{ $labels.instance }} is down"

      # High network errors (warning only)
      # Fires when either the receive or the transmit error rate exceeds 10/s.
      - alert: HighNetworkErrors
        expr: rate(node_network_receive_errs_total[5m]) > 10 or rate(node_network_transmit_errs_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
          category: network
        annotations:
          summary: "High network errors on {{ $labels.hostname }}"
          description: "Network interface {{ $labels.device }} on {{ $labels.instance }} is experiencing errors ({{ $value }} errors/sec)"

  # ====================================
  # Proxmox Specific Alerts
  # ====================================
  - name: proxmox
    interval: 1m
    rules:
      # Proxmox node unreachable
      - alert: ProxmoxNodeDown
        expr: up{role="proxmox-host"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Proxmox node {{ $labels.hostname }} is down"
          description: "Proxmox host {{ $labels.instance }} has been unreachable for more than 2 minutes"

      # High load on Proxmox host (warning only)
      # 15-minute load divided by the number of CPU cores (one idle-mode
      # series per core); threshold is 2x the core count.
      - alert: ProxmoxHighLoad
        expr: node_load15{role="proxmox-host"} / count(node_cpu_seconds_total{role="proxmox-host",mode="idle"}) without (cpu, mode) > 2
        for: 15m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High load on Proxmox {{ $labels.hostname }}"
          description: "15-minute load average on {{ $labels.instance }} is {{ $value | humanize }} (threshold: 2x CPU cores)"

  # ====================================
  # Service Specific Alerts
  # ====================================
  - name: services
    interval: 1m
    rules:
      # PostgreSQL down
      - alert: PostgreSQLDown
        expr: up{app="postgres"} == 0
        for: 2m
        labels:
          severity: critical
          category: database
          service: postgresql
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL on {{ $labels.instance }} has been down for more than 2 minutes"

      # Home Assistant down (warning only)
      - alert: HomeAssistantDown
        expr: up{app="homeassistant"} == 0
        for: 5m
        labels:
          severity: warning
          category: automation
          service: homeassistant
        annotations:
          summary: "Home Assistant is down"
          description: "Home Assistant on {{ $labels.instance }} has been unreachable for more than 5 minutes"

      # n8n down (warning only)
      - alert: N8NDown
        expr: up{app="n8n"} == 0
        for: 5m
        labels:
          severity: warning
          category: automation
          service: n8n
        annotations:
          summary: "n8n is down"
          description: "n8n automation service on {{ $labels.instance }} has been down for more than 5 minutes"

      # VPS down
      - alert: VPSDown
        expr: up{role="vps"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "VPS is unreachable"
          description: "VPS at {{ $labels.instance }} has been unreachable for more than 2 minutes. External services may be affected."

  # ====================================
  # Prometheus Self-Monitoring
  # ====================================
  - name: prometheus
    interval: 1m
    rules:
      # Prometheus scrape failures (warning only)
      # NOTE(review): same selector as HostDown (`up == 0`), which fires as
      # critical after 2 minutes; this warning adds the job name only.
      - alert: PrometheusScrapeFailing
        expr: up == 0
        for: 5m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus cannot scrape {{ $labels.instance }}"
          description: "Prometheus has failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 5 minutes"

      # Prometheus config reload failed
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 5m
        labels:
          severity: critical
          category: monitoring
        annotations:
          summary: "Prometheus config reload failed"
          description: "Prometheus configuration reload has failed. Check prometheus logs for errors."

      # Prometheus running out of storage (warning only)
      # Block bytes relative to the configured size-based retention limit.
      # The previous expression divided the metric by itself (always 1), so
      # the alert fired permanently. The `> 0` filter drops the series when no
      # size limit is configured, avoiding a division by zero.
      # NOTE(review): requires --storage.tsdb.retention.size to be set; without
      # it this alert never fires. Confirm the 85% threshold against how close
      # to the limit the TSDB normally sits.
      - alert: PrometheusStorageNearFull
        expr: (prometheus_tsdb_storage_blocks_bytes / (prometheus_tsdb_retention_limit_bytes > 0)) > 0.85
        for: 10m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus storage near capacity"
          description: "Prometheus storage is {{ $value | humanizePercentage }} full"

# ====================================
# Summary of Changes
# ====================================
#
# CRITICAL alerts (trigger Discord notification):
# - HostDown
# - CriticalCPUUsage (>95% for 5min)
# - CriticalMemoryUsage (>95% for 5min)
# - DiskSpaceCritical (<5% free)
# - ProxmoxNodeDown
# - PostgreSQLDown
# - VPSDown
# - PrometheusConfigReloadFailed
#
# WARNING alerts (logged only, no notification):
# - HighCPUUsage (>80% for 5min) ← UPDATED THRESHOLD
# - HighMemoryUsage (>85% for 10min)
# - DiskSpaceLow (<15% free)
# - DiskSpaceFillingFast (filling in 24h)
# - HighDiskIOWait
# - NetworkInterfaceDown
# - HighNetworkErrors
# - ProxmoxHighLoad
# - HomeAssistantDown
# - N8NDown
# - PrometheusScrapeFailing
# - PrometheusStorageNearFull