mirror of
https://github.com/elAgala/server-initializer.git
synced 2026-02-14 13:16:17 +00:00
save
This commit is contained in:
56
templates/monitoring/alerts/infrastructure.yml
Normal file
56
templates/monitoring/alerts/infrastructure.yml
Normal file
@@ -0,0 +1,56 @@
|
||||
# Prometheus alerting rules for host-level (infrastructure) health.
# `up` is the per-target scrape-health series; the node_* metric names
# follow node_exporter naming (presumably that is the exporter in use).
groups:
  - name: infrastructure
    rules:
      # Target's `up` series has been 0 for 5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Non-idle CPU percentage: 100 minus the per-instance average of the
      # idle-mode rate over a 5m window, sustained above 80 for 5 minutes.
      - alert: HighCpuUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% on {{ $labels.instance }} for more than 5 minutes."

      # Used-memory percentage derived from MemAvailable / MemTotal,
      # sustained above 90 for 5 minutes.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 90% on {{ $labels.instance }} for more than 5 minutes."

      # Used-space percentage per filesystem series (tmpfs excluded),
      # above 85 for 5 minutes. Folded scalar only to keep lines short;
      # PromQL ignores the extra whitespace.
      - alert: DiskSpaceLow
        expr: >-
          (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"}
          / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk usage is above 85% on {{ $labels.instance }} ({{ $labels.mountpoint }})"

      # Same expression as DiskSpaceLow with a 95 threshold and a shorter
      # 2-minute hold, raised as critical.
      - alert: DiskSpaceCritical
        expr: >-
          (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"}
          / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 95
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical disk space on {{ $labels.instance }}"
          description: "Disk usage is above 95% on {{ $labels.instance }} ({{ $labels.mountpoint }})"

      # 1-minute load average above 0.8 x the number of idle-mode CPU series
      # for the instance (one such series per CPU in node_exporter, so this
      # approximates 80% of the core count), sustained for 10 minutes.
      #
      # The right-hand side is aggregated down to the `instance` label only,
      # while node_load1 still carries `job` (and any other target labels).
      # Without an explicit matching clause the two sides never share an
      # identical label set, so the comparison would return nothing and the
      # alert could never fire. `on (instance)` matches on that label alone;
      # `group_left ()` keeps node_load1's own labels on the result.
      - alert: HighLoadAverage
        expr: >-
          node_load1 > on (instance) group_left ()
          (count by (instance) (node_cpu_seconds_total{mode="idle"}) * 0.8)
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High load average on {{ $labels.instance }}"
          description: "Load average is high on {{ $labels.instance }}: {{ $value }}"
|
||||
Reference in New Issue
Block a user