#!/bin/bash
#
# Deploy reduced alerts - run this from your PC/terminal.
#
# Streams a provisioning script over SSH into the Prometheus container on the
# Proxmox host (via `pct exec`). The remote script:
#   1. backs up the current Alertmanager config and alert rules,
#   2. stages the new files and validates them (amtool / promtool, if present),
#   3. installs them and reloads prometheus and prometheus-alertmanager.
#
# Usage: bash deploy-inline.sh
#
# Optional environment overrides (defaults match the original hard-coded values):
#   PROXMOX_HOST         SSH destination of the Proxmox host
#   PROMETHEUS_CTID      numeric container ID handed to `pct exec`
#   DISCORD_WEBHOOK_URL  Discord webhook that receives critical alerts

set -euo pipefail

readonly PROXMOX_HOST="${PROXMOX_HOST:-root@10.0.10.3}"
readonly PROMETHEUS_CTID="${PROMETHEUS_CTID:-125}"
# NOTE(review): this default embeds a live webhook secret in the script. Rotate
# it and supply DISCORD_WEBHOOK_URL from the environment instead of committing it.
readonly DISCORD_WEBHOOK_URL="${DISCORD_WEBHOOK_URL:-https://discord.com/api/webhooks/1462667503301038285/ZVJDuek6VADA-RdI09xJDvqjveOWXgxQnMBcsQzoKwVPnNOACMCL5v-HN55-KVe4IZY0}"

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Write the remote provisioning script to stdout.
# The webhook URL is injected as the first line (shell-quoted with %q) so it
# travels over stdin rather than on the remote command line; everything after
# it is a literal, unexpanded here-document.
# Globals:   DISCORD_WEBHOOK_URL (read)
# Outputs:   the bash script to be executed inside the container
#######################################
emit_remote_script() {
  printf 'DISCORD_WEBHOOK_URL=%q\n' "${DISCORD_WEBHOOK_URL}"
  cat <<'SCRIPT_END'
set -euo pipefail

: "${DISCORD_WEBHOOK_URL:?DISCORD_WEBHOOK_URL was not provided}"

readonly am_config=/etc/prometheus/alertmanager.yml
readonly rules_file=/etc/prometheus/rules/homelab-alerts.yml
readonly backup_dir=/etc/prometheus/backups

# New files are staged in a private temp dir and only copied over the live
# configs after validation, so a bad file never replaces a working one.
staging_dir=$(mktemp -d)
trap 'rm -rf -- "${staging_dir}"' EXIT
readonly staged_am="${staging_dir}/alertmanager.yml"
readonly staged_rules="${staging_dir}/homelab-alerts.yml"

echo "📦 Backing up configs..."
mkdir -p "${backup_dir}"
# One timestamp for both files so the backups form a matching pair.
stamp=$(date +%Y%m%d-%H%M%S)
for live_file in "${am_config}" "${rules_file}"; do
  # A missing file (first deployment) is fine; a failed copy is not.
  if [[ -f "${live_file}" ]]; then
    cp -- "${live_file}" "${backup_dir}/${live_file##*/}.${stamp}"
  fi
done
echo "✅ Backups saved"

echo "📝 Installing new Alertmanager config..."
# Unquoted delimiter: ${DISCORD_WEBHOOK_URL} is expanded; nothing else in this
# document contains shell metacharacters.
cat > "${staged_am}" <<EOF
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'severity', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
    - matchers:
        - severity="critical"
      receiver: 'discord-critical'
      group_wait: 10s
      repeat_interval: 1h
    - matchers:
        - severity="warning"
      receiver: 'null'
      repeat_interval: 24h

inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'instance']
  - source_matchers:
      - alertname="HostDown"
    target_matchers:
      - alertname!="HostDown"
    equal: ['instance']

receivers:
  - name: 'null'
  - name: 'discord-critical'
    # Native Discord receiver (Alertmanager >= 0.25). A generic webhook_configs
    # entry posts Alertmanager's own JSON schema, which Discord rejects.
    discord_configs:
      - webhook_url: '${DISCORD_WEBHOOK_URL}'
        send_resolved: true
        http_config:
          follow_redirects: true
EOF
# </dev/null keeps the tool from consuming the rest of this script, which bash
# itself is reading from stdin.
if command -v amtool >/dev/null; then
  amtool check-config "${staged_am}" </dev/null
else
  echo "WARNING: amtool not found; skipping Alertmanager config validation" >&2
fi

echo "📝 Installing new alert rules..."
cat > "${staged_rules}" <<'EOF'
groups:
  - name: infrastructure
    interval: 30s
    rules:
      - alert: HostDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} has been unreachable for 2+ minutes"

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High CPU on {{ $labels.hostname }}"
          description: "CPU {{ $value | humanize }}%"

      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL CPU on {{ $labels.hostname }}"
          description: "CPU {{ $value | humanize }}%"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High memory on {{ $labels.hostname }}"
          description: "Memory {{ $value | humanize }}%"

      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL memory on {{ $labels.hostname }}"
          description: "Memory {{ $value | humanize }}%"

  - name: storage
    interval: 1m
    rules:
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 15
        for: 5m
        labels:
          severity: warning
          category: storage
        annotations:
          summary: "Low disk on {{ $labels.hostname }}"
          description: "{{ $labels.mountpoint }} has {{ $value | humanize }}% free"

      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 5
        for: 2m
        labels:
          severity: critical
          category: storage
        annotations:
          summary: "CRITICAL disk on {{ $labels.hostname }}"
          description: "{{ $labels.mountpoint }} only {{ $value | humanize }}% free!"

  - name: services
    interval: 1m
    rules:
      - alert: PostgreSQLDown
        expr: up{app="postgres"} == 0
        for: 2m
        labels:
          severity: critical
          category: database
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL down for 2+ minutes"

      - alert: VPSDown
        expr: up{role="vps"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "VPS unreachable"
          description: "VPS down for 2+ minutes"

      - alert: ProxmoxNodeDown
        expr: up{role="proxmox-host"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Proxmox node down"
          description: "Proxmox host unreachable for 2+ minutes"
EOF
if command -v promtool >/dev/null; then
  promtool check rules "${staged_rules}" </dev/null
else
  echo "WARNING: promtool not found; skipping alert rule validation" >&2
fi

# Both files passed validation (or could not be checked): install them.
# `cat >` rewrites the live files in place, keeping their owner and mode.
cat "${staged_am}" > "${am_config}"
echo "✅ Alertmanager config updated"
cat "${staged_rules}" > "${rules_file}"
echo "✅ Alert rules updated"

echo "🔄 Reloading services..."
systemctl reload prometheus </dev/null
systemctl reload prometheus-alertmanager </dev/null

cat <<'SUMMARY'

✅ DEPLOYMENT COMPLETE!

📊 Changes applied:
  • CPU warning: 80%+ over 5min (logged only)
  • CPU critical: 95%+ over 5min (Discord)
  • Only CRITICAL alerts → Discord
  • WARNING alerts → Logged, not sent
  • Email notifications → DISABLED

Your inbox spam should STOP immediately!

🧪 Test Discord webhook:
   curl -X POST -H 'Content-Type: application/json' http://localhost:9093/api/v2/alerts -d '[{"labels":{"alertname":"Test","severity":"critical"},"annotations":{"summary":"Test alert"}}]'
SUMMARY
SCRIPT_END
}

# The container ID is interpolated into the remote command line, so insist on
# a plain number.
[[ "${PROMETHEUS_CTID}" =~ ^[0-9]+$ ]] \
  || die "PROMETHEUS_CTID must be numeric, got: ${PROMETHEUS_CTID}"

echo "🚀 Deploying alert spam fix to Prometheus..."
echo ""

# Execute inside the Prometheus container via Proxmox. With pipefail + errexit,
# an SSH failure or a failing remote step aborts here instead of reporting
# success below.
emit_remote_script \
  | ssh -o ConnectTimeout=10 -- "${PROXMOX_HOST}" \
    "pct exec ${PROMETHEUS_CTID} -- bash -s"

echo ""
echo "🎉 Done! Run the test command above inside the Prometheus container, then check Discord."