#!/bin/bash
#
# Installs the reduced-noise alerting configuration for Prometheus and
# Alertmanager: only severity=critical alerts are routed to Discord, while
# warnings are routed to the 'null' receiver.
#
# Run this script INSIDE the Prometheus container (CT 125).
#
# Usage:
#   curl http://10.0.10.28/workspace/fred-infrastructure/install-on-prometheus.sh | bash
#
# Optional environment:
#   DISCORD_WEBHOOK_URL  Webhook used by the 'discord-critical' receiver.
#                        Defaults to the URL that used to be hard-coded here.
#                        Example:
#                          curl .../install-on-prometheus.sh \
#                            | DISCORD_WEBHOOK_URL=https://... bash
#
# Steps: stage both configs in a temp dir -> validate them (when amtool /
# promtool are installed) -> back up the live files -> install -> reload.

set -euo pipefail

readonly PROM_DIR=/etc/prometheus
readonly RULES_DIR="${PROM_DIR}/rules"
readonly BACKUP_DIR="${PROM_DIR}/backups"
readonly ALERTMANAGER_CFG="${PROM_DIR}/alertmanager.yml"
readonly RULES_FILE="${RULES_DIR}/homelab-alerts.yml"

# TODO(review): this default is a secret committed to the script; rotate the
# webhook and supply it through DISCORD_WEBHOOK_URL instead.
readonly DISCORD_WEBHOOK_URL="${DISCORD_WEBHOOK_URL:-https://discord.com/api/webhooks/1462667503301038285/ZVJDuek6VADA-RdI09xJDvqjveOWXgxQnMBcsQzoKwVPnNOACMCL5v-HN55-KVe4IZY0}"

# Temp directory holding the staged configs; removed by cleanup() on exit.
staging_dir=''

warn() { printf 'warning: %s\n' "$*" >&2; }
die() { printf 'error: %s\n' "$*" >&2; exit 1; }

cleanup() {
  if [[ -n "${staging_dir}" ]]; then
    rm -rf -- "${staging_dir}"
  fi
}

#######################################
# Writes the Alertmanager configuration to the given path.
# Globals:   DISCORD_WEBHOOK_URL (read)
# Arguments: $1 - destination file
#######################################
write_alertmanager_config() {
  # NOTE(review): Discord webhooks expect a 'content'/'embeds' payload, which
  # the generic webhook_configs body does not provide. If notifications never
  # arrive, switch this receiver to discord_configs (Alertmanager >= 0.25) or
  # put a translating proxy in between. Confirm against the deployed version.
  #
  # The here-doc delimiter is unquoted on purpose so the webhook URL expands;
  # the document contains no other '$' or backtick.
  cat > "$1" <<EOF
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'severity', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
    - matchers:
        - severity="critical"
      receiver: 'discord-critical'
      group_wait: 10s
      repeat_interval: 1h
    - matchers:
        - severity="warning"
      receiver: 'null'
      repeat_interval: 24h

inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'instance']
  - source_matchers:
      - alertname="HostDown"
    target_matchers:
      - alertname!="HostDown"
    equal: ['instance']

receivers:
  - name: 'null'
  - name: 'discord-critical'
    webhook_configs:
      - url: '${DISCORD_WEBHOOK_URL}'
        send_resolved: true
        http_config:
          follow_redirects: true
        max_alerts: 0
EOF
}

#######################################
# Writes the Prometheus alerting rules to the given path.
# Arguments: $1 - destination file
#######################################
write_alert_rules() {
  # Quoted delimiter: '$labels' / '$value' are Prometheus template variables
  # and must reach the file literally.
  cat > "$1" <<'EOF'
groups:
  - name: infrastructure
    interval: 30s
    rules:
      - alert: HostDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} has been unreachable for more than 2 minutes"

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High CPU usage on {{ $labels.hostname }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanize }}%"

      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL CPU usage on {{ $labels.hostname }}"
          description: "CPU usage on {{ $labels.instance }} is {{ $value | humanize }}%"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High memory usage on {{ $labels.hostname }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanize }}%"

      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL memory usage on {{ $labels.hostname }}"
          description: "Memory usage on {{ $labels.instance }} is {{ $value | humanize }}%"

  - name: storage
    interval: 1m
    rules:
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 15
        for: 5m
        labels:
          severity: warning
          category: storage
        annotations:
          summary: "Low disk space on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} has {{ $value | humanize }}% free"

      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 5
        for: 2m
        labels:
          severity: critical
          category: storage
        annotations:
          summary: "CRITICAL disk space on {{ $labels.hostname }}"
          description: "Disk {{ $labels.mountpoint }} has only {{ $value | humanize }}% free!"

  - name: services
    interval: 1m
    rules:
      - alert: PostgreSQLDown
        expr: up{app="postgres"} == 0
        for: 2m
        labels:
          severity: critical
          category: database
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL has been down for more than 2 minutes"

      - alert: VPSDown
        expr: up{role="vps"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "VPS is unreachable"
          description: "VPS has been unreachable for more than 2 minutes"
EOF
}

#######################################
# Runs a validation tool against a staged file, aborting on rejection.
# A missing tool only produces a warning so the script still works on hosts
# that do not ship amtool/promtool.
# Arguments: $1 - tool name, remaining - arguments passed to the tool
#######################################
validate_with() {
  local tool=$1
  shift
  if ! command -v "${tool}" >/dev/null 2>&1; then
    warn "${tool} not found; skipping validation"
    return 0
  fi
  "${tool}" "$@" \
    || die "${tool} rejected the staged configuration; live files were not modified"
}

#######################################
# Copies a live config into BACKUP_DIR with a timestamp suffix.
# A file that does not exist yet is skipped; any other failure is fatal,
# because the live file is about to be overwritten.
# Arguments: $1 - file to back up, $2 - timestamp suffix
#######################################
backup_file() {
  local src=$1 stamp=$2
  if [[ -f "${src}" ]]; then
    cp -- "${src}" "${BACKUP_DIR}/${src##*/}.${stamp}" \
      || die "could not back up ${src}"
  else
    warn "${src} does not exist yet; nothing to back up"
  fi
}

main() {
  local stamp staged_am staged_rules

  echo "🚀 Installing reduced alert configuration..."
  echo ""

  # Stage and validate first so a broken config never reaches the live path.
  staging_dir=$(mktemp -d) || die "mktemp failed"
  trap cleanup EXIT
  staged_am="${staging_dir}/alertmanager.yml"
  staged_rules="${staging_dir}/homelab-alerts.yml"
  write_alertmanager_config "${staged_am}"
  write_alert_rules "${staged_rules}"
  validate_with amtool check-config "${staged_am}"
  validate_with promtool check rules "${staged_rules}"

  # Backup existing configs (one timestamp shared by both files).
  echo "📦 Backing up existing configs..."
  mkdir -p -- "${BACKUP_DIR}" "${RULES_DIR}"
  stamp=$(date +%Y%m%d-%H%M%S)
  backup_file "${ALERTMANAGER_CFG}" "${stamp}"
  backup_file "${RULES_FILE}" "${stamp}"
  echo "✅ Backups saved to /etc/prometheus/backups/"
  echo ""

  echo "📥 Downloading new configurations..."

  # 'cat >' (rather than mv) keeps the owner and mode of an existing target,
  # so the service user can still read it; mktemp files are mode 0600.
  cat -- "${staged_am}" > "${ALERTMANAGER_CFG}"
  echo "✅ Alertmanager config updated"

  cat -- "${staged_rules}" > "${RULES_FILE}"
  echo "✅ Alert rules updated"
  echo ""

  # Reload services
  echo "🔄 Reloading Prometheus and Alertmanager..."
  systemctl reload prometheus
  systemctl reload prometheus-alertmanager
  echo "✅ Services reloaded"
  echo ""

  echo "✅ Deployment complete!"
  echo ""
  echo "📊 New configuration:"
  echo "  • CPU warning: 80%+ over 5 min (logged only)"
  echo "  • CPU critical: 95%+ over 5 min (Discord alert)"
  echo "  • Only CRITICAL alerts sent to Discord"
  echo "  • WARNING alerts logged but NOT sent"
  echo "  • Email notifications disabled"
  echo ""
  echo "🧪 Test with:"
  # API v1 was removed from Alertmanager; v2 requires a JSON content type.
  echo "  curl -X POST -H 'Content-Type: application/json' http://localhost:9093/api/v2/alerts -d '[{\"labels\":{\"alertname\":\"Test\",\"severity\":\"critical\"},\"annotations\":{\"summary\":\"Test\"}}]'"
  echo ""
}

# Everything runs from main, invoked on the last line, so a truncated
# 'curl | bash' download cannot execute a partial script.
main "$@"