# homelab-docs/prometheus-alert-rules-updated.yml

# Prometheus Alert Rules for Fred's Homelab - UPDATED
# Location: /etc/prometheus/rules/homelab-alerts.yml
# Updated: 2026-02-03 (Reduced alert noise)
#
# Changes:
# - CPU threshold: 80%+ over 5 minutes
# - Only CRITICAL alerts trigger notifications
# - WARNING alerts are logged only

groups:
  # ====================================
  # Host & Infrastructure Alerts
  # ====================================
  - name: infrastructure
    # Rules in this group are evaluated every 30 seconds.
    interval: 30s
    rules:
      # HostDown (critical): a scrape target has reported up == 0 for 2 minutes.
      # NOTE(review): the selector carries no label filter, so it matches every
      # scrape target, including those that also have a dedicated *Down rule in
      # other groups of this file - confirm that overlap is intended.
      - alert: HostDown
        expr: 'up == 0'
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: 'Host {{ $labels.instance }} is down'
          description: '{{ $labels.instance }} ({{ $labels.hostname }}) has been unreachable for more than 2 minutes. Check network connectivity and host status.'

      # HighCPUUsage (warning): non-idle CPU share, averaged per
      # instance/hostname over a 5m rate window, stays above 80% for 5 minutes.
      # Per the file header, warnings are meant to be logged rather than
      # notified; that routing is configured outside this file.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: 'High CPU usage on {{ $labels.hostname }}'
          description: 'CPU usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 80%)'

      # CriticalCPUUsage (critical): same expression as HighCPUUsage with the
      # threshold raised to 95%, also held for 5 minutes.
      - alert: CriticalCPUUsage
        expr: >-
          100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: 'CRITICAL CPU usage on {{ $labels.hostname }}'
          description: 'CPU usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 95%)'

      # HighMemoryUsage (warning): share of memory that is not MemAvailable
      # exceeds 85% of MemTotal for 10 minutes.
      - alert: HighMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: 'High memory usage on {{ $labels.hostname }}'
          description: 'Memory usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 85%)'

      # CriticalMemoryUsage (critical): same ratio as HighMemoryUsage, above
      # 95% for 5 minutes.
      - alert: CriticalMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: 'CRITICAL memory usage on {{ $labels.hostname }}'
          description: 'Memory usage on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 95%)'

  # ====================================
  # Storage Alerts
  # ====================================
  - name: storage
    # Rules in this group are evaluated once per minute.
    interval: 1m
    rules:
      # Disk space low (warning only).
      # Free-space percentage per filesystem, skipping the pseudo/overlay
      # filesystem types listed in the fstype matcher; below 15% for 5 minutes.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 15
        for: 5m
        labels:
          severity: warning
          category: storage
        annotations:
          summary: "Low disk space on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has {{ $value | humanize }}% free space remaining (threshold: 15%)"

      # Disk space critical.
      # Same ratio as DiskSpaceLow; below 5% for 2 minutes.
      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 5
        for: 2m
        labels:
          severity: critical
          category: storage
        annotations:
          summary: "CRITICAL disk space on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} has only {{ $value | humanize }}% free space remaining!"

      # Disk will fill in 24 hours (warning only).
      # Linear extrapolation of the last hour of available bytes, projected
      # 24 * 3600 seconds ahead; the prediction must stay below zero for 1h.
      - alert: DiskSpaceFillingFast
        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"}[1h], 24 * 3600) < 0
        for: 1h
        labels:
          severity: warning
          category: storage
        annotations:
          summary: "Disk filling rapidly on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is predicted to fill within 24 hours at current rate"

      # High disk I/O wait (warning only).
      # node_cpu_seconds_total is reported per CPU core, so the iowait rate is
      # averaged per instance/hostname (the same aggregation the CPU usage
      # rules use). Without it the rule would raise one alert per core and a
      # single busy core would trip the host-wide 10% threshold.
      - alert: HighDiskIOWait
        expr: avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High disk I/O wait on {{ $labels.hostname }}"
          description: "Disk I/O wait on {{ $labels.instance }} is {{ $value | humanize }}% (threshold: 10%)"

  # ====================================
  # Network Alerts
  # ====================================
  - name: network
    # Rules in this group are evaluated once per minute.
    interval: 1m
    rules:
      # Network interface down (warning only).
      # The device matcher skips loopback, veth*, docker* and br-* interfaces.
      - alert: NetworkInterfaceDown
        expr: node_network_up{device!~"lo|veth.*|docker.*|br-.*"} == 0
        for: 2m
        labels:
          severity: warning
          category: network
        annotations:
          summary: "Network interface down on {{ $labels.hostname }}"
          description: "Network interface {{ $labels.device }} on {{ $labels.instance }} is down"

      # High network errors (warning only).
      # Fires when either the receive or the transmit error rate of an
      # interface exceeds 10 errors/sec over a 5m window, held for 5 minutes.
      # $value is a per-second float, so it is passed through humanize like
      # the numeric values in the other rules of this file.
      - alert: HighNetworkErrors
        expr: rate(node_network_receive_errs_total[5m]) > 10 or rate(node_network_transmit_errs_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
          category: network
        annotations:
          summary: "High network errors on {{ $labels.hostname }}"
          description: "Network interface {{ $labels.device }} on {{ $labels.instance }} is experiencing errors ({{ $value | humanize }} errors/sec)"

  # ====================================
  # Proxmox Specific Alerts
  # ====================================
  - name: proxmox
    # Rules in this group are evaluated once per minute.
    interval: 1m
    rules:
      # Proxmox node unreachable.
      # Targets labelled role="proxmox-host" that report up == 0 for 2 minutes.
      - alert: ProxmoxNodeDown
        expr: up{role="proxmox-host"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Proxmox node {{ $labels.hostname }} is down"
          description: "Proxmox host {{ $labels.instance }} has been unreachable for more than 2 minutes"

      # High load on Proxmox host (warning only).
      # The 15-minute load average is divided by the number of CPU cores
      # (count of per-core idle series, aggregated without the cpu and mode
      # labels), so $value is load per core and the threshold 2 means
      # "twice as many runnable tasks as cores". The description reports the
      # value as a per-core figure for that reason.
      - alert: ProxmoxHighLoad
        expr: node_load15{role="proxmox-host"} / count(node_cpu_seconds_total{role="proxmox-host",mode="idle"}) without (cpu, mode) > 2
        for: 15m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High load on Proxmox {{ $labels.hostname }}"
          description: "15-minute load average per CPU core on {{ $labels.instance }} is {{ $value | humanize }} (threshold: 2, i.e. 2x CPU cores)"

  # ====================================
  # Service Specific Alerts
  # ====================================
  - name: services
    # Rules in this group are evaluated once per minute.
    interval: 1m
    rules:
      # PostgreSQLDown (critical): the target labelled app="postgres" has
      # reported up == 0 for 2 minutes.
      - alert: PostgreSQLDown
        expr: 'up{app="postgres"} == 0'
        for: 2m
        labels:
          severity: critical
          category: database
          service: postgresql
        annotations:
          summary: 'PostgreSQL is down'
          description: 'PostgreSQL on {{ $labels.instance }} has been down for more than 2 minutes'

      # HomeAssistantDown (warning): the target labelled app="homeassistant"
      # has reported up == 0 for 5 minutes.
      - alert: HomeAssistantDown
        expr: 'up{app="homeassistant"} == 0'
        for: 5m
        labels:
          severity: warning
          category: automation
          service: homeassistant
        annotations:
          summary: 'Home Assistant is down'
          description: 'Home Assistant on {{ $labels.instance }} has been unreachable for more than 5 minutes'

      # N8NDown (warning): the target labelled app="n8n" has reported
      # up == 0 for 5 minutes.
      - alert: N8NDown
        expr: 'up{app="n8n"} == 0'
        for: 5m
        labels:
          severity: warning
          category: automation
          service: n8n
        annotations:
          summary: 'n8n is down'
          description: 'n8n automation service on {{ $labels.instance }} has been down for more than 5 minutes'

      # VPSDown (critical): the target labelled role="vps" has reported
      # up == 0 for 2 minutes.
      - alert: VPSDown
        expr: 'up{role="vps"} == 0'
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: 'VPS is unreachable'
          description: 'VPS at {{ $labels.instance }} has been unreachable for more than 2 minutes. External services may be affected.'

  # ====================================
  # Prometheus Self-Monitoring
  # ====================================
  - name: prometheus
    # Rules in this group are evaluated once per minute.
    interval: 1m
    rules:
      # Prometheus scrape failures (warning only).
      # Any scrape target that has reported up == 0 for 5 minutes.
      - alert: PrometheusScrapeFailing
        expr: up == 0
        for: 5m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus cannot scrape {{ $labels.instance }}"
          description: "Prometheus has failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 5 minutes"

      # Prometheus config reload failed.
      # The last configuration reload has been marked unsuccessful for 5 minutes.
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 5m
        labels:
          severity: critical
          category: monitoring
        annotations:
          summary: "Prometheus config reload failed"
          description: "Prometheus configuration reload has failed. Check prometheus logs for errors."

      # Prometheus running out of storage (warning only).
      # Ratio of bytes held in TSDB blocks to the configured size-based
      # retention limit. The previous expression divided the blocks metric by
      # itself, which is always 1 and therefore always above 0.85.
      # prometheus_tsdb_retention_limit_bytes is 0 when no size-based retention
      # is configured; the "> 0" filter drops the series in that case, so the
      # rule stays silent instead of dividing by zero.
      - alert: PrometheusStorageNearFull
        expr: prometheus_tsdb_storage_blocks_bytes / (prometheus_tsdb_retention_limit_bytes > 0) > 0.85
        for: 10m
        labels:
          severity: warning
          category: monitoring
        annotations:
          summary: "Prometheus storage near capacity"
          description: "Prometheus storage is {{ $value | humanizePercentage }} full"

# ====================================
# Summary of Changes
# ====================================
#
# CRITICAL alerts (trigger Discord notification):
# - HostDown
# - CriticalCPUUsage (>95% for 5min)
# - CriticalMemoryUsage (>95% for 5min)
# - DiskSpaceCritical (<5% free)
# - ProxmoxNodeDown
# - PostgreSQLDown
# - VPSDown
# - PrometheusConfigReloadFailed
#
# WARNING alerts (logged only, no notification):
# - HighCPUUsage (>80% for 5min) ← UPDATED THRESHOLD
# - HighMemoryUsage (>85% for 10min)
# - DiskSpaceLow (<15% free)
# - DiskSpaceFillingFast (filling in 24h)
# - HighDiskIOWait
# - NetworkInterfaceDown
# - HighNetworkErrors
# - ProxmoxHighLoad
# - HomeAssistantDown
# - N8NDown
# - PrometheusScrapeFailing
# - PrometheusStorageNearFull