#!/bin/bash
# homelab-docs/install-on-prometheus.sh
#
# Overwrites /etc/prometheus/alertmanager.yml and
# /etc/prometheus/rules/homelab-alerts.yml with the configuration embedded in
# this script, then reloads the prometheus and prometheus-alertmanager units.
#
# Run this script INSIDE the Prometheus container (CT 125)
# Usage: curl http://10.0.10.28/workspace/fred-infrastructure/install-on-prometheus.sh | bash

# Strict mode:
#   -e           stop at the first command that fails
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails when any of its stages fails
set -euo pipefail

echo "🚀 Installing reduced alert configuration..."
echo ""

# Backup existing configs
#
# Each config that currently exists is copied into the backup directory with a
# timestamp suffix. A single timestamp is taken up front so both backups from
# one run carry the same suffix. A config that does not exist yet is skipped
# with a notice; a config that exists but cannot be copied aborts the script,
# because the following steps would overwrite it without a backup.
echo "📦 Backing up existing configs..."
backup_dir=/etc/prometheus/backups
backup_stamp=$(date +%Y%m%d-%H%M%S)
backed_up=0
mkdir -p "$backup_dir"
for config_file in \
  /etc/prometheus/alertmanager.yml \
  /etc/prometheus/rules/homelab-alerts.yml; do
  if [[ ! -f "$config_file" ]]; then
    echo "ℹ️  $config_file does not exist yet; nothing to back up"
    continue
  fi
  # -p keeps mode, ownership and mtime so a restored backup matches the original.
  if ! cp -p -- "$config_file" "$backup_dir/${config_file##*/}.$backup_stamp"; then
    echo "❌ Could not back up $config_file; aborting before it is overwritten" >&2
    exit 1
  fi
  backed_up=$((backed_up + 1))
done
if ((backed_up > 0)); then
  echo "✅ Backups saved to /etc/prometheus/backups/"
else
  echo "✅ No existing configs to back up"
fi
echo ""

# Download new configs
# NOTE: nothing is fetched over the network in this step; the configuration is
# embedded in this script as a here-document.
echo "📥 Downloading new configurations..."

# Alertmanager config
#
# The Discord webhook URL may be supplied through the DISCORD_WEBHOOK_URL
# environment variable, e.g.
#   curl ... | DISCORD_WEBHOOK_URL='https://discord.com/api/webhooks/<id>/<token>' bash
# When it is unset or empty, the URL that was previously hard-coded here is
# used, so existing invocations keep working.
#
# NOTE(review): the default below embeds the webhook token in a script that the
# usage line fetches over plain HTTP. Consider rotating the webhook and
# removing the default so the token only ever comes from the environment.
# NOTE(review): webhook_configs posts Alertmanager's generic JSON document,
# while Discord's webhook endpoint expects its own payload shape. Confirm that
# critical alerts actually arrive; newer Alertmanager releases provide
# discord_configs for this purpose.
alertmanager_config=/etc/prometheus/alertmanager.yml
discord_webhook_url=${DISCORD_WEBHOOK_URL:-https://discord.com/api/webhooks/1462667503301038285/ZVJDuek6VADA-RdI09xJDvqjveOWXgxQnMBcsQzoKwVPnNOACMCL5v-HN55-KVe4IZY0}

# The URL is placed inside a single-quoted YAML scalar, so a quote or a line
# break in it would corrupt the generated file.
if [[ "$discord_webhook_url" == *"'"* || "$discord_webhook_url" == *$'\n'* ]]; then
  echo "❌ DISCORD_WEBHOOK_URL must not contain single quotes or newlines" >&2
  exit 1
fi

# Stage the new config in a private temp file first so that a config which
# fails validation never replaces the live one. The EXIT trap removes the
# staged file (it contains the webhook token) on every failure path.
alertmanager_staged=$(mktemp)
trap 'rm -f -- "$alertmanager_staged"' EXIT

# Unquoted delimiter: ${discord_webhook_url} is expanded by the shell. The rest
# of the document contains no $, backtick or backslash, so it is written as is.
cat > "$alertmanager_staged" << EOF
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'severity', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
    - matchers:
        - severity="critical"
      receiver: 'discord-critical'
      group_wait: 10s
      repeat_interval: 1h
    - matchers:
        - severity="warning"
      receiver: 'null'
      repeat_interval: 24h

inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'instance']
  - source_matchers:
      - alertname="HostDown"
    target_matchers:
      - alertname!="HostDown"
    equal: ['instance']

receivers:
  - name: 'null'
  - name: 'discord-critical'
    webhook_configs:
      - url: '${discord_webhook_url}'
        send_resolved: true
        http_config:
          follow_redirects: true
        max_alerts: 0
EOF

# Validate before touching the live file when amtool is installed.
if command -v amtool > /dev/null 2>&1; then
  if ! amtool check-config "$alertmanager_staged"; then
    echo "❌ Alertmanager config failed validation; $alertmanager_config left unchanged" >&2
    exit 1
  fi
else
  echo "⚠️  amtool not found; installing Alertmanager config without validation" >&2
fi

# Write through a redirection (rather than mv) so an existing target keeps its
# owner and mode, and a new one is created with the normal umask instead of
# mktemp's private 0600.
cat "$alertmanager_staged" > "$alertmanager_config"
rm -f -- "$alertmanager_staged"
trap - EXIT

echo "✅ Alertmanager config updated"

# Prometheus alert rules
#
# The rules are staged in a temp file and checked with promtool (when it is
# installed) before they replace the live rules file, so a rules file that
# does not parse is never put in place. The rules directory is created if it
# is missing; without that the write below fails on a fresh host.
rules_file=/etc/prometheus/rules/homelab-alerts.yml
mkdir -p "${rules_file%/*}"

rules_staged=$(mktemp)
trap 'rm -f -- "$rules_staged"' EXIT

# Quoted delimiter: the {{ $labels... }} / {{ $value }} templates must reach
# the file literally, without shell expansion.
cat > "$rules_staged" << 'EOF'
groups:
  - name: infrastructure
    interval: 30s
    rules:
      - alert: HostDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} has been unreachable for more than 2 minutes"

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High CPU usage on {{ $labels.hostname }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanize }}%"

      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL CPU usage on {{ $labels.hostname }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanize }}%"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High memory usage on {{ $labels.hostname }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanize }}%"

      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL memory usage on {{ $labels.hostname }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanize }}%"

  - name: storage
    interval: 1m
    rules:
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 15
        for: 5m
        labels:
          severity: warning
          category: storage
        annotations:
          summary: "Low disk space on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} has {{ $value | humanize }}% free"

      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 5
        for: 2m
        labels:
          severity: critical
          category: storage
        annotations:
          summary: "CRITICAL disk space on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} has only {{ $value | humanize }}% free!"

  - name: services
    interval: 1m
    rules:
      - alert: PostgreSQLDown
        expr: up{app="postgres"} == 0
        for: 2m
        labels:
          severity: critical
          category: database
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL has been down for more than 2 minutes"

      - alert: VPSDown
        expr: up{role="vps"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "VPS is unreachable"
          description: "VPS has been unreachable for more than 2 minutes"
EOF

# Validate before touching the live file when promtool is installed.
if command -v promtool > /dev/null 2>&1; then
  if ! promtool check rules "$rules_staged"; then
    echo "❌ Alert rules failed validation; $rules_file left unchanged" >&2
    exit 1
  fi
else
  echo "⚠️  promtool not found; installing alert rules without validation" >&2
fi

# Write through a redirection (rather than mv) so an existing target keeps its
# owner and mode, and a new one is created with the normal umask instead of
# mktemp's private 0600.
cat "$rules_staged" > "$rules_file"
rm -f -- "$rules_staged"
trap - EXIT

echo "✅ Alert rules updated"
echo ""

# Reload services
# Issues `systemctl reload` for each unit in turn: prometheus first, then
# prometheus-alertmanager. A failing reload stops the script under `set -e`.
echo "🔄 Reloading Prometheus and Alertmanager..."
for unit in prometheus prometheus-alertmanager; do
  systemctl reload "$unit"
done
echo "✅ Services reloaded"
echo

# Summary
# Prints what was installed and a command for sending a test alert by hand;
# nothing is verified automatically here.
# The test command targets Alertmanager's v2 API, which requires a JSON
# content type; the v1 endpoint is no longer served by current releases.
# Quoted delimiter: the text is printed literally, so the JSON needs no escaping.
cat << 'EOF'
✅ Deployment complete!

📊 New configuration:
  • CPU warning: 80%+ over 5 min (logged only)
  • CPU critical: 95%+ over 5 min (Discord alert)
  • Only CRITICAL alerts sent to Discord
  • WARNING alerts logged but NOT sent
  • Email notifications disabled

🧪 Test with:
  curl -X POST -H 'Content-Type: application/json' http://localhost:9093/api/v2/alerts -d '[{"labels":{"alertname":"Test","severity":"critical"},"annotations":{"summary":"Test"}}]'

EOF