Files
homelab-docs/prometheus-alert-rules-updated.yml

297 lines
11 KiB
YAML

# Prometheus Alert Rules for Fred's Homelab - UPDATED
# Location: /etc/prometheus/rules/homelab-alerts.yml
# Updated: 2026-02-03 (Reduced alert noise)
#
# Changes:
# - CPU threshold: 80%+ over 5 minutes
# - Only CRITICAL alerts trigger notifications
# - WARNING alerts are logged only
groups:
# ====================================
# Host & Infrastructure Alerts
# ====================================
- name: infrastructure
  # Every rule in this group is evaluated on a 30-second cycle.
  interval: 30s
  rules:
  # Critical: a scrape target's `up` series has read 0 for 2 minutes.
  - alert: HostDown
    expr: 'up == 0'
    for: 2m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: 'Host {{ $labels.instance }} is down'
      description: >-
        {{ $labels.instance }} ({{ $labels.hostname }}) has been unreachable
        for more than 2 minutes. Check network connectivity and host status.
  # Warning tier (logged, not sent): 100 minus the average idle-CPU rate,
  # grouped by instance and hostname, stays above 80% for 5 minutes.
  - alert: HighCPUUsage
    expr: >-
      100 - (avg by(instance, hostname)
      (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: 'High CPU usage on {{ $labels.hostname }}'
      description: 'CPU usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 80%)'
  # Critical tier (notification sent): same expression with a 95% threshold.
  - alert: CriticalCPUUsage
    expr: >-
      100 - (avg by(instance, hostname)
      (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
    for: 5m
    labels:
      severity: critical
      category: performance
    annotations:
      summary: 'CRITICAL CPU usage on {{ $labels.hostname }}'
      description: 'CPU usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 95%)'
  # Warning tier: used memory, computed as 1 - MemAvailable / MemTotal,
  # stays above 85% for 10 minutes.
  - alert: HighMemoryUsage
    expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85'
    for: 10m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: 'High memory usage on {{ $labels.hostname }}'
      description: 'Memory usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 85%)'
  # Critical tier: same ratio above 95%, with a shorter 5-minute hold.
  - alert: CriticalMemoryUsage
    expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95'
    for: 5m
    labels:
      severity: critical
      category: performance
    annotations:
      summary: 'CRITICAL memory usage on {{ $labels.hostname }}'
      description: 'Memory usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 95%)'
# ====================================
# Storage Alerts
# ====================================
- name: storage
  # Rules in this group are evaluated once per minute.
  interval: 1m
  rules:
  # Disk space low (warning only): less than 15% available for 5 minutes.
  # The fstype matcher skips tmpfs, fuse.lxcfs, squashfs and overlay mounts.
  - alert: DiskSpaceLow
    expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 15
    for: 5m
    labels:
      severity: warning
      category: storage
    annotations:
      summary: "Low disk space on {{ $labels.hostname }}"
      description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has {{ $value | humanize }}% free space remaining (threshold: 15%)"
  # Disk space critical: less than 5% available for 2 minutes.
  - alert: DiskSpaceCritical
    expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 5
    for: 2m
    labels:
      severity: critical
      category: storage
    annotations:
      summary: "CRITICAL disk space on {{ $labels.hostname }}"
      description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has only {{ $value | humanize }}% free space remaining!"
  # Disk will fill in 24 hours (warning only): a linear fit over the last
  # hour of available bytes, projected 24 * 3600 seconds ahead, drops below
  # zero, and that prediction has held for 1 hour.
  - alert: DiskSpaceFillingFast
    expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"}[1h], 24 * 3600) < 0
    for: 1h
    labels:
      severity: warning
      category: storage
    annotations:
      summary: "Disk filling rapidly on {{ $labels.hostname }}"
      description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to fill within 24 hours at current rate"
  # High disk I/O wait (warning only): above 10% for 5 minutes.
  # node_cpu_seconds_total carries one series per CPU core, so the iowait
  # rate is averaged per host (instance, hostname). Without the aggregation
  # the rule raised a separate alert for every core and could trip on a
  # single busy core, while the annotations describe a host-level value.
  # This mirrors the aggregation used by the CPU usage rules in the
  # infrastructure group.
  - alert: HighDiskIOWait
    expr: avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
    for: 5m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: "High disk I/O wait on {{ $labels.hostname }}"
      description: "Disk I/O wait on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 10%)"
# ====================================
# Network Alerts
# ====================================
- name: network
  # One-minute evaluation cycle for the network rules.
  interval: 1m
  rules:
  # Warning only: node_network_up reads 0 for 2 minutes on a device that is
  # not loopback and does not match veth*, docker* or br-*.
  - alert: NetworkInterfaceDown
    expr: 'node_network_up{device!~"lo|veth.*|docker.*|br-.*"} == 0'
    for: 2m
    labels:
      severity: warning
      category: network
    annotations:
      summary: 'Network interface down on {{ $labels.hostname }}'
      description: 'Network interface {{ $labels.device }} on {{ $labels.instance }} is down'
  # Warning only: receive or transmit error rate above 10 per second,
  # measured over a 5-minute window and sustained for 5 minutes.
  - alert: HighNetworkErrors
    expr: >-
      rate(node_network_receive_errs_total[5m]) > 10 or
      rate(node_network_transmit_errs_total[5m]) > 10
    for: 5m
    labels:
      severity: warning
      category: network
    annotations:
      summary: 'High network errors on {{ $labels.hostname }}'
      description: >-
        Network interface {{ $labels.device }} on {{ $labels.instance }} is
        experiencing errors ({{ $value }} errors/sec)
# ====================================
# Proxmox Specific Alerts
# ====================================
- name: proxmox
  # One-minute evaluation cycle for targets labelled role="proxmox-host".
  interval: 1m
  rules:
  # Critical: a proxmox-host target has reported up == 0 for 2 minutes.
  - alert: ProxmoxNodeDown
    expr: 'up{role="proxmox-host"} == 0'
    for: 2m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: 'Proxmox node {{ $labels.hostname }} is down'
      description: 'Proxmox host {{ $labels.instance }} has been unreachable for more than 2 minutes'
  # Warning only: 15-minute load average divided by the number of idle-mode
  # CPU series (one per core) exceeds 2 for 15 minutes.
  - alert: ProxmoxHighLoad
    expr: >-
      node_load15{role="proxmox-host"} /
      count(node_cpu_seconds_total{role="proxmox-host",mode="idle"}) without (cpu, mode) > 2
    for: 15m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: 'High load on Proxmox {{ $labels.hostname }}'
      description: >-
        15-minute load average on {{ $labels.instance }} is
        {{ $value | humanize }} (threshold: 2x CPU cores)
# ====================================
# Service Specific Alerts
# ====================================
- name: services
  # One-minute evaluation cycle; each rule watches `up` for one labelled target.
  interval: 1m
  rules:
  # Critical: the app="postgres" target has been down for 2 minutes.
  - alert: PostgreSQLDown
    expr: 'up{app="postgres"} == 0'
    for: 2m
    labels:
      severity: critical
      category: database
      service: postgresql
    annotations:
      summary: 'PostgreSQL is down'
      description: 'PostgreSQL on {{ $labels.instance }} has been down for more than 2 minutes'
  # Warning only: the app="homeassistant" target has been down for 5 minutes.
  - alert: HomeAssistantDown
    expr: 'up{app="homeassistant"} == 0'
    for: 5m
    labels:
      severity: warning
      category: automation
      service: homeassistant
    annotations:
      summary: 'Home Assistant is down'
      description: 'Home Assistant on {{ $labels.instance }} has been unreachable for more than 5 minutes'
  # Warning only: the app="n8n" target has been down for 5 minutes.
  - alert: N8NDown
    expr: 'up{app="n8n"} == 0'
    for: 5m
    labels:
      severity: warning
      category: automation
      service: n8n
    annotations:
      summary: 'n8n is down'
      description: 'n8n automation service on {{ $labels.instance }} has been down for more than 5 minutes'
  # Critical: the role="vps" target has been down for 2 minutes.
  - alert: VPSDown
    expr: 'up{role="vps"} == 0'
    for: 2m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: 'VPS is unreachable'
      description: >-
        VPS at {{ $labels.instance }} has been unreachable for more than
        2 minutes. External services may be affected.
# ====================================
# Prometheus Self-Monitoring
# ====================================
- name: prometheus
  # Rules in this group are evaluated once per minute.
  interval: 1m
  rules:
  # Prometheus scrape failures (warning only): any target has reported
  # up == 0 for 5 minutes.
  - alert: PrometheusScrapeFailing
    expr: up == 0
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "Prometheus cannot scrape {{ $labels.instance }}"
      description: "Prometheus has failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 5 minutes"
  # Prometheus config reload failed: the last-reload-successful gauge has
  # read 0 for 5 minutes.
  - alert: PrometheusConfigReloadFailed
    expr: prometheus_config_last_reload_successful == 0
    for: 5m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: "Prometheus config reload failed"
      description: "Prometheus configuration reload has failed. Check prometheus logs for errors."
  # Prometheus running out of storage (warning only): TSDB block bytes
  # exceed 85% of the configured size-based retention limit for 10 minutes.
  # The previous expression divided prometheus_tsdb_storage_blocks_bytes by
  # itself, which is always 1, so the alert fired whenever any blocks
  # existed. The `> 0` filter drops the denominator when no size limit is
  # configured (the limit metric reads 0), so the rule stays silent instead
  # of dividing by zero.
  # NOTE(review): this needs --storage.tsdb.retention.size to be set. If
  # size-based retention normally keeps usage near the limit, alert on the
  # data volume's filesystem metrics instead - confirm against the deployment.
  - alert: PrometheusStorageNearFull
    expr: (prometheus_tsdb_storage_blocks_bytes / (prometheus_tsdb_retention_limit_bytes > 0)) > 0.85
    for: 10m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: "Prometheus storage near capacity"
      description: "Prometheus storage is {{ $value | humanizePercentage }} full"
# ====================================
# Summary of Changes
# ====================================
#
# CRITICAL alerts (trigger Discord notification):
# - HostDown
# - CriticalCPUUsage (>95% for 5min)
# - CriticalMemoryUsage (>95% for 5min)
# - DiskSpaceCritical (<5% free)
# - ProxmoxNodeDown
# - PostgreSQLDown
# - VPSDown
# - PrometheusConfigReloadFailed
#
# WARNING alerts (logged only, no notification):
# - HighCPUUsage (>80% for 5min) ← UPDATED THRESHOLD
# - HighMemoryUsage (>85% for 10min)
# - DiskSpaceLow (<15% free)
# - DiskSpaceFillingFast (filling in 24h)
# - HighDiskIOWait
# - NetworkInterfaceDown
# - HighNetworkErrors
# - ProxmoxHighLoad
# - HomeAssistantDown
# - N8NDown
# - PrometheusScrapeFailing
# - PrometheusStorageNearFull