Initial infrastructure documentation - comprehensive homelab reference
This commit is contained in:
deploy-inline.sh (normal file, 203 lines added)
@@ -0,0 +1,203 @@
#!/bin/bash
#
# deploy-inline.sh - deploy the reduced-noise alerting config to Prometheus.
#
# Run this from your PC/terminal.
# Usage: bash deploy-inline.sh
#
# What it does:
#   1. SSHes to the Proxmox host and runs the embedded script inside the
#      Prometheus container through `pct exec`.
#   2. Backs up the current Alertmanager config and alert rules.
#   3. Stages each new file, validates it (amtool / promtool, when they are
#      installed) and only then overwrites the live file.
#   4. Reloads prometheus and prometheus-alertmanager.
#
# Optional environment overrides (defaults are the previously hard-coded
# values, so a plain `bash deploy-inline.sh` behaves as before):
#   PVE_HOST   SSH target of the Proxmox host    (default: root@10.0.10.3)
#   PROM_CTID  container ID handed to pct exec   (default: 125)

# Abort on the first failure so a broken SSH session or a failed remote step
# is never followed by the success message at the bottom.
set -euo pipefail

readonly PVE_HOST="${PVE_HOST:-root@10.0.10.3}"
readonly PROM_CTID="${PROM_CTID:-125}"

echo "🚀 Deploying alert spam fix to Prometheus..."
echo ""

# Execute inside Prometheus container via Proxmox.
# The heredoc delimiter is quoted: nothing below is expanded locally, the text
# is sent verbatim to the remote `bash -s`.
ssh "${PVE_HOST}" "pct exec ${PROM_CTID} -- bash -s" << 'SCRIPT_END'

set -euo pipefail

readonly PROM_DIR=/etc/prometheus
readonly AM_CONFIG="${PROM_DIR}/alertmanager.yml"
readonly RULES_FILE="${PROM_DIR}/rules/homelab-alerts.yml"
readonly BACKUP_DIR="${PROM_DIR}/backups"

# Private scratch directory for staged configs; removed on every exit path.
stage_dir=$(mktemp -d)
trap 'rm -rf -- "${stage_dir}"' EXIT

#######################################
# Copy a config file into BACKUP_DIR with a timestamp suffix.
# A missing source (first deployment) is reported and skipped; a failing copy
# of an existing file aborts the run before anything is overwritten.
# Arguments: $1 - file to back up
#            $2 - timestamp suffix
#######################################
backup_file() {
  local src=$1
  local stamp=$2
  if [[ -e "${src}" ]]; then
    cp -- "${src}" "${BACKUP_DIR}/${src##*/}.${stamp}"
  else
    echo "ℹ️  ${src} does not exist yet, nothing to back up"
  fi
}

#######################################
# Read new file content from stdin, validate it, then install it.
# Arguments: $1  - destination path
#            $2… - validator command; the staged file is appended as its
#                  last argument. Skipped with a warning if not installed.
# Returns:   non-zero (aborting under set -e) when validation fails, in
#            which case the destination is left untouched.
#######################################
install_config() {
  local dest=$1
  shift
  local staged="${stage_dir}/${dest##*/}"
  cat > "${staged}"
  if command -v "$1" > /dev/null 2>&1; then
    "$@" "${staged}"
  else
    echo "⚠️  $1 not found, installing ${dest} without validation" >&2
  fi
  # Overwrite in place (instead of mv) so an existing live file keeps its
  # owner and permissions.
  cat -- "${staged}" > "${dest}"
}

echo "📦 Backing up configs..."
mkdir -p "${BACKUP_DIR}"
# One timestamp for both files so the pair of backups is easy to match up.
backup_stamp=$(date +%Y%m%d-%H%M%S)
backup_file "${AM_CONFIG}" "${backup_stamp}"
backup_file "${RULES_FILE}" "${backup_stamp}"
echo "✅ Backups saved"

echo "📝 Installing new Alertmanager config..."
# NOTE(review): the 'discord-critical' receiver posts Alertmanager's generic
# webhook payload straight to a Discord webhook URL. Discord's endpoint
# expects its own JSON shape, so these notifications are likely rejected.
# TODO confirm in the Alertmanager log; consider `discord_configs`
# (needs Alertmanager >= 0.25) or a translating proxy.
# NOTE(review): the webhook URL below is a credential committed in plain
# text; rotate it and inject it at deploy time instead of hard-coding it.
install_config "${AM_CONFIG}" amtool check-config << 'EOF'
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname', 'severity', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
    - matchers:
        - severity="critical"
      receiver: 'discord-critical'
      group_wait: 10s
      repeat_interval: 1h
    - matchers:
        - severity="warning"
      receiver: 'null'
      repeat_interval: 24h

inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: ['alertname', 'instance']
  - source_matchers:
      - alertname="HostDown"
    target_matchers:
      - alertname!="HostDown"
    equal: ['instance']

receivers:
  - name: 'null'
  - name: 'discord-critical'
    webhook_configs:
      - url: 'https://discord.com/api/webhooks/1462667503301038285/ZVJDuek6VADA-RdI09xJDvqjveOWXgxQnMBcsQzoKwVPnNOACMCL5v-HN55-KVe4IZY0'
        send_resolved: true
        http_config:
          follow_redirects: true
        max_alerts: 0
EOF

echo "✅ Alertmanager config updated"

echo "📝 Installing new alert rules..."
install_config "${RULES_FILE}" promtool check rules << 'EOF'
groups:
  - name: infrastructure
    interval: 30s
    rules:
      - alert: HostDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Host {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} has been unreachable for 2+ minutes"

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High CPU on {{ $labels.hostname }}"
          description: "CPU {{ $value | humanize }}%"

      - alert: CriticalCPUUsage
        expr: 100 - (avg by(instance, hostname) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL CPU on {{ $labels.hostname }}"
          description: "CPU {{ $value | humanize }}%"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 10m
        labels:
          severity: warning
          category: performance
        annotations:
          summary: "High memory on {{ $labels.hostname }}"
          description: "Memory {{ $value | humanize }}%"

      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 5m
        labels:
          severity: critical
          category: performance
        annotations:
          summary: "CRITICAL memory on {{ $labels.hostname }}"
          description: "Memory {{ $value | humanize }}%"

  - name: storage
    interval: 1m
    rules:
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 15
        for: 5m
        labels:
          severity: warning
          category: storage
        annotations:
          summary: "Low disk on {{ $labels.hostname }}"
          description: "{{ $labels.mountpoint }} has {{ $value | humanize }}% free"

      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs|squashfs|overlay"} / node_filesystem_size_bytes) * 100 < 5
        for: 2m
        labels:
          severity: critical
          category: storage
        annotations:
          summary: "CRITICAL disk on {{ $labels.hostname }}"
          description: "{{ $labels.mountpoint }} only {{ $value | humanize }}% free!"

  - name: services
    interval: 1m
    rules:
      - alert: PostgreSQLDown
        expr: up{app="postgres"} == 0
        for: 2m
        labels:
          severity: critical
          category: database
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL down for 2+ minutes"

      - alert: VPSDown
        expr: up{role="vps"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "VPS unreachable"
          description: "VPS down for 2+ minutes"

      - alert: ProxmoxNodeDown
        expr: up{role="proxmox-host"} == 0
        for: 2m
        labels:
          severity: critical
          category: infrastructure
        annotations:
          summary: "Proxmox node down"
          description: "Proxmox host unreachable for 2+ minutes"
EOF

echo "✅ Alert rules updated"

echo "🔄 Reloading services..."
systemctl reload prometheus
systemctl reload prometheus-alertmanager

echo ""
echo "✅ DEPLOYMENT COMPLETE!"
echo ""
echo "📊 Changes applied:"
echo " • CPU warning: 80%+ over 5min (logged only)"
echo " • CPU critical: 95%+ over 5min (Discord)"
echo " • Only CRITICAL alerts → Discord"
echo " • WARNING alerts → Logged, not sent"
echo " • Email notifications → DISABLED"
echo ""
echo "Your inbox spam should STOP immediately!"
echo ""
echo "🧪 Test Discord webhook:"
# Uses the v2 API with an explicit JSON content type; the v1 endpoint has been
# removed from newer Alertmanager releases.
echo " curl -X POST -H 'Content-Type: application/json' http://localhost:9093/api/v2/alerts -d '[{\"labels\":{\"alertname\":\"Test\",\"severity\":\"critical\"},\"annotations\":{\"summary\":\"Test alert\"}}]'"

SCRIPT_END

echo ""
# This script does not send a test alert itself, so point at the manual step.
echo "🎉 Done! Run the curl command above inside the Prometheus container to send a test alert to Discord."
|
||||
Reference in New Issue
Block a user