# Prometheus Alert Rules for Fred's Homelab - UPDATED
# Location: /etc/prometheus/rules/homelab-alerts.yml
# Updated: 2026-02-03 (Reduced alert noise)
#
# Changes:
# - CPU threshold: 80%+ over 5 minutes
# - Only CRITICAL alerts trigger notifications
# - WARNING alerts are logged only

groups:
# ====================================
# Host & Infrastructure Alerts
# Target reachability plus CPU and memory pressure; evaluated every 30s.
# ====================================
- name: infrastructure
  interval: 30s
  rules:
  # Any scrape target whose `up` series has stayed at 0 for 2 minutes.
  # NOTE(review): the selector is unfiltered, so it also matches targets
  # covered by the dedicated *Down rules in this file (some of which are
  # warning-only) - confirm the overlapping critical alert is intended.
  - alert: HostDown
    expr: 'up == 0'
    for: 2m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: 'Host {{ $labels.instance }} is down'
      description: >-
        {{ $labels.instance }} ({{ $labels.hostname }}) has been unreachable
        for more than 2 minutes. Check network connectivity and host status.

  # Non-idle CPU share, averaged per (instance, hostname) over a 5m rate
  # window, above 80% for 5 minutes. Warning severity (logged, not sent).
  - alert: HighCPUUsage
    expr: >-
      100 - (avg by(instance, hostname)
      (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: 'High CPU usage on {{ $labels.hostname }}'
      description: >-
        CPU usage on {{ $labels.instance }} is {{ $value | humanize }}%
        (threshold: 80%)

  # Same expression as HighCPUUsage with a 95% threshold; critical
  # severity, so a notification is sent.
  - alert: CriticalCPUUsage
    expr: >-
      100 - (avg by(instance, hostname)
      (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
    for: 5m
    labels:
      severity: critical
      category: performance
    annotations:
      summary: 'CRITICAL CPU usage on {{ $labels.hostname }}'
      description: >-
        CPU usage on {{ $labels.instance }} is {{ $value | humanize }}%
        (threshold: 95%)

  # Used memory as a percentage, derived from MemAvailable / MemTotal,
  # above 85% for 10 minutes. Warning severity only.
  - alert: HighMemoryUsage
    expr: >-
      (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
      * 100 > 85
    for: 10m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: 'High memory usage on {{ $labels.hostname }}'
      description: >-
        Memory usage on {{ $labels.instance }} is {{ $value | humanize }}%
        (threshold: 85%)

  # Same memory expression with a 95% threshold held for 5 minutes.
  - alert: CriticalMemoryUsage
    expr: >-
      (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
      * 100 > 95
    for: 5m
    labels:
      severity: critical
      category: performance
    annotations:
      summary: 'CRITICAL memory usage on {{ $labels.hostname }}'
      description: >-
        Memory usage on {{ $labels.instance }} is {{ $value | humanize }}%
        (threshold: 95%)

# ====================================
# Storage Alerts
# Filesystem free space, fill-rate prediction and I/O wait; evaluated
# every 1m. All filesystem selectors skip tmpfs, fuse.lxcfs, squashfs
# and overlay mounts.
# ====================================
- name: storage
  interval: 1m
  rules:
  # Less than 15% of the filesystem is available for 5 minutes.
  # Warning severity only.
  - alert: DiskSpaceLow
    expr: >-
      (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"}
      / node_filesystem_size_bytes) * 100 < 15
    for: 5m
    labels:
      severity: warning
      category: storage
    annotations:
      summary: 'Low disk space on {{ $labels.hostname }}'
      description: >-
        Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has
        {{ $value | humanize }}% free space remaining (threshold: 15%)

  # Less than 5% of the filesystem is available for 2 minutes.
  - alert: DiskSpaceCritical
    expr: >-
      (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"}
      / node_filesystem_size_bytes) * 100 < 5
    for: 2m
    labels:
      severity: critical
      category: storage
    annotations:
      summary: 'CRITICAL disk space on {{ $labels.hostname }}'
      description: >-
        Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has only
        {{ $value | humanize }}% free space remaining!

  # Linear extrapolation of the last hour of available bytes reaches
  # zero within 24 * 3600 seconds, and has done so for a full hour.
  # Warning severity only.
  - alert: DiskSpaceFillingFast
    expr: >-
      predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"}[1h],
      24 * 3600) < 0
    for: 1h
    labels:
      severity: warning
      category: storage
    annotations:
      summary: 'Disk filling rapidly on {{ $labels.hostname }}'
      description: >-
        Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted
        to fill within 24 hours at current rate

  # Share of CPU time spent in iowait, above 10% for 5 minutes.
  # node_cpu_seconds_total carries one series per CPU core, so the rate is
  # averaged per (instance, hostname) - the same aggregation the CPU usage
  # rules use. Without it the rule raises one alert per core and a single
  # busy core is enough to trip a host-level alert.
  - alert: HighDiskIOWait
    expr: >-
      avg by(instance, hostname)
      (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
    for: 5m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: 'High disk I/O wait on {{ $labels.hostname }}'
      description: >-
        Disk I/O wait on {{ $labels.instance }} is {{ $value | humanize }}%
        (threshold: 10%)

# ====================================
# Network Alerts
# Interface link state and error rates; evaluated every 1m.
# ====================================
- name: network
  interval: 1m
  rules:
  # An interface reports node_network_up == 0 for 2 minutes. Devices
  # matching lo, veth*, docker* and br-* are excluded by the selector.
  # Warning severity only.
  - alert: NetworkInterfaceDown
    expr: 'node_network_up{device!~"lo|veth.*|docker.*|br-.*"} == 0'
    for: 2m
    labels:
      severity: warning
      category: network
    annotations:
      summary: 'Network interface down on {{ $labels.hostname }}'
      description: >-
        Network interface {{ $labels.device }} on {{ $labels.instance }}
        is down

  # Receive or transmit error rate above 10 per second (5m rate window)
  # for 5 minutes. Warning severity only.
  - alert: HighNetworkErrors
    expr: >-
      rate(node_network_receive_errs_total[5m]) > 10
      or rate(node_network_transmit_errs_total[5m]) > 10
    for: 5m
    labels:
      severity: warning
      category: network
    annotations:
      summary: 'High network errors on {{ $labels.hostname }}'
      description: >-
        Network interface {{ $labels.device }} on {{ $labels.instance }}
        is experiencing errors ({{ $value }} errors/sec)

# ====================================
# Proxmox Specific Alerts
# Rules scoped to targets labelled role="proxmox-host"; evaluated every 1m.
# ====================================
- name: proxmox
  interval: 1m
  rules:
  # A role="proxmox-host" target has had up == 0 for 2 minutes.
  - alert: ProxmoxNodeDown
    expr: 'up{role="proxmox-host"} == 0'
    for: 2m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: 'Proxmox node {{ $labels.hostname }} is down'
      description: >-
        Proxmox host {{ $labels.instance }} has been unreachable for more
        than 2 minutes

  # 15-minute load average divided by the number of idle-mode CPU series
  # (one per core after dropping the cpu and mode labels) stays above 2
  # for 15 minutes. Warning severity only.
  - alert: ProxmoxHighLoad
    expr: >-
      node_load15{role="proxmox-host"}
      / count(node_cpu_seconds_total{role="proxmox-host",mode="idle"})
      without (cpu, mode) > 2
    for: 15m
    labels:
      severity: warning
      category: performance
    annotations:
      summary: 'High load on Proxmox {{ $labels.hostname }}'
      description: >-
        15-minute load average on {{ $labels.instance }} is
        {{ $value | humanize }} (threshold: 2x CPU cores)

# ====================================
# Service Specific Alerts
# Per-service reachability keyed on the `app` / `role` target labels;
# evaluated every 1m.
# NOTE(review): every rule here watches the scrape-level `up` series,
# i.e. whether the scraped endpoint answers - presumably an exporter or
# the service's own metrics endpoint; confirm that is the intended signal.
# ====================================
- name: services
  interval: 1m
  rules:
  # Target labelled app="postgres" has had up == 0 for 2 minutes.
  - alert: PostgreSQLDown
    expr: 'up{app="postgres"} == 0'
    for: 2m
    labels:
      severity: critical
      category: database
      service: postgresql
    annotations:
      summary: 'PostgreSQL is down'
      description: >-
        PostgreSQL on {{ $labels.instance }} has been down for more than
        2 minutes

  # Target labelled app="homeassistant" has had up == 0 for 5 minutes.
  # Warning severity only.
  - alert: HomeAssistantDown
    expr: 'up{app="homeassistant"} == 0'
    for: 5m
    labels:
      severity: warning
      category: automation
      service: homeassistant
    annotations:
      summary: 'Home Assistant is down'
      description: >-
        Home Assistant on {{ $labels.instance }} has been unreachable for
        more than 5 minutes

  # Target labelled app="n8n" has had up == 0 for 5 minutes.
  # Warning severity only.
  - alert: N8NDown
    expr: 'up{app="n8n"} == 0'
    for: 5m
    labels:
      severity: warning
      category: automation
      service: n8n
    annotations:
      summary: 'n8n is down'
      description: >-
        n8n automation service on {{ $labels.instance }} has been down for
        more than 5 minutes

  # Target labelled role="vps" has had up == 0 for 2 minutes.
  - alert: VPSDown
    expr: 'up{role="vps"} == 0'
    for: 2m
    labels:
      severity: critical
      category: infrastructure
    annotations:
      summary: 'VPS is unreachable'
      description: >-
        VPS at {{ $labels.instance }} has been unreachable for more than
        2 minutes. External services may be affected.

# ====================================
# Prometheus Self-Monitoring
# Scrape health, config reload status and TSDB size; evaluated every 1m.
# ====================================
- name: prometheus
  interval: 1m
  rules:
  # Any scrape target has had up == 0 for 5 minutes. Warning severity only.
  # NOTE(review): same selector as HostDown in the infrastructure group,
  # with a longer hold time - confirm both alerts are wanted.
  - alert: PrometheusScrapeFailing
    expr: 'up == 0'
    for: 5m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: 'Prometheus cannot scrape {{ $labels.instance }}'
      description: >-
        Prometheus has failed to scrape {{ $labels.job }} on
        {{ $labels.instance }} for more than 5 minutes

  # The last configuration reload has been reporting failure for 5 minutes.
  - alert: PrometheusConfigReloadFailed
    expr: 'prometheus_config_last_reload_successful == 0'
    for: 5m
    labels:
      severity: critical
      category: monitoring
    annotations:
      summary: 'Prometheus config reload failed'
      description: >-
        Prometheus configuration reload has failed. Check prometheus logs
        for errors.

  # On-disk block bytes as a fraction of the configured size-based
  # retention limit, above 85% for 10 minutes. Warning severity only.
  # The previous expression divided prometheus_tsdb_storage_blocks_bytes by
  # itself, which is always 1 and therefore always above 0.85.
  # The `> 0` filter drops the series when no size limit is configured
  # (limit reported as 0), so the division cannot yield +Inf and fire.
  # NOTE(review): this rule is silent unless --storage.tsdb.retention.size
  # is set; if capacity is bounded by the disk instead, alert on the data
  # volume's filesystem metrics - confirm which applies here.
  - alert: PrometheusStorageNearFull
    expr: >-
      (prometheus_tsdb_storage_blocks_bytes
      / (prometheus_tsdb_retention_limit_bytes > 0)) > 0.85
    for: 10m
    labels:
      severity: warning
      category: monitoring
    annotations:
      summary: 'Prometheus storage near capacity'
      description: 'Prometheus storage is {{ $value | humanizePercentage }} full'

# ====================================
# Summary of Changes
# ====================================
#
# CRITICAL alerts (trigger Discord notification):
# - HostDown
# - CriticalCPUUsage (>95% for 5min)
# - CriticalMemoryUsage (>95% for 5min)
# - DiskSpaceCritical (<5% free)
# - ProxmoxNodeDown
# - PostgreSQLDown
# - VPSDown
# - PrometheusConfigReloadFailed
#
# WARNING alerts (logged only, no notification):
# - HighCPUUsage (>80% for 5min) ← UPDATED THRESHOLD
# - HighMemoryUsage (>85% for 10min)
# - DiskSpaceLow (<15% free)
# - DiskSpaceFillingFast (filling in 24h)
# - HighDiskIOWait
# - NetworkInterfaceDown
# - HighNetworkErrors
# - ProxmoxHighLoad
# - HomeAssistantDown
# - N8NDown
# - PrometheusScrapeFailing
# - PrometheusStorageNearFull