This commit is contained in:
2025-08-12 18:51:44 -03:00
parent 2dd0edbd49
commit 32a09ed129
13 changed files with 710 additions and 21 deletions

View File

@@ -0,0 +1,330 @@
groups:
# Host-level alert rules, evaluated every 30s.
#
# Every rule below is a three-step pipeline:
#   A  Prometheus query (range query over `relativeTimeRange`)
#   B  reduce A to the last sample of each series
#   C  threshold on B -> 1 when breached, 0 otherwise
# `condition: C` makes the rule fire for each series whose C value is
# non-zero. The comparison must live in a `threshold` node: a `reduce`
# node ignores any `conditions`/`evaluator` block, so using a reduce node
# as the condition fires on any non-zero metric value.
#
# NOTE(review): assumes a Prometheus datasource provisioned with
# uid `prometheus` -- confirm against the datasource provisioning file.
- name: infrastructure
  title: Infrastructure Alerts
  folder: alerting
  orgId: 1
  interval: 30s
  rules:
  # One alert per scrape target whose `up` value has been below 1 for 5m.
  # Plain `up` is queried (not `up == 0`): the filtered form returns
  # samples whose value is 0 and returns nothing at all while every
  # target is healthy, so it can never yield a non-zero condition.
  - uid: instance-down
    title: Instance Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        expr: up
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: 'Instance {{ $labels.instance }} down'
      description: >-
        {{ $labels.instance }} of job {{ $labels.job }} has been down
        for more than 5 minutes.
    labels:
      severity: warning
  # Per-instance CPU utilisation (100 - idle%) above 80% for 5m.
  - uid: high-cpu-usage
    title: High CPU Usage
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [80]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: 'High CPU usage on {{ $labels.instance }}'
      # `$values.B.Value` is the reduced number; bare `$value` is a
      # debug string covering every refId in Grafana-managed rules.
      description: >-
        CPU usage is {{ printf "%.1f" $values.B.Value }}% on
        {{ $labels.instance }} for more than 5 minutes.
    labels:
      severity: warning
  # Memory utilisation (1 - MemAvailable/MemTotal) above 90% for 5m.
  - uid: high-memory-usage
    title: High Memory Usage
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [90]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: 'High memory usage on {{ $labels.instance }}'
      description: >-
        Memory usage is {{ printf "%.1f" $values.B.Value }}% on
        {{ $labels.instance }} for more than 5 minutes.
    labels:
      severity: critical
  # Per-filesystem usage (tmpfs excluded) above 95% for 2m.
  - uid: disk-space-critical
    title: Disk Space Critical
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        # Folded scalar: the two lines are joined with a single space.
        expr: >-
          (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"}
          / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [95]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 2m
    annotations:
      summary: 'Critical disk space on {{ $labels.instance }}'
      description: >-
        Disk usage is {{ printf "%.1f" $values.B.Value }}% on
        {{ $labels.instance }} ({{ $labels.mountpoint }})
    labels:
      severity: critical
# Availability rules for the web-facing services, evaluated every 30s.
#
# Each rule is a three-step pipeline:
#   A  Prometheus `up` query for one scrape job
#   B  reduce A to the last sample of each series
#   C  threshold: 1 when B < 1, otherwise 0
# `condition: C` fires while C is non-zero, i.e. while the target is
# down. The comparison must be a `threshold` node: a `reduce` node
# ignores any `conditions`/`evaluator` block and would pass the raw `up`
# value through, which alerts while the service is healthy (up = 1) and
# stays silent while it is down (up = 0).
#
# NOTE(review): assumes a Prometheus datasource provisioned with
# uid `prometheus` and scrape jobs named `caddy` and `crowdsec` --
# confirm against the datasource and scrape configuration.
- name: web-server
  title: Web Server Alerts
  folder: alerting
  orgId: 1
  interval: 30s
  rules:
  # Caddy scrape target reporting up < 1 for 1m.
  - uid: caddy-down
    title: Caddy Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 60
        to: 0
      datasourceUid: prometheus
      model:
        expr: up{job="caddy"}
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 1m
    annotations:
      summary: 'Caddy web server is down'
      description: 'Caddy web server has been down for more than 1 minute.'
    labels:
      severity: critical
  # CrowdSec scrape target reporting up < 1 for 2m.
  - uid: crowdsec-down
    title: CrowdSec Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 120
        to: 0
      datasourceUid: prometheus
      model:
        expr: up{job="crowdsec"}
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 2m
    annotations:
      summary: 'CrowdSec is down'
      description: 'CrowdSec security engine has been down for more than 2 minutes.'
    labels:
      severity: warning