mirror of
https://github.com/elAgala/server-initializer.git
synced 2026-02-14 13:16:17 +00:00
330 lines
9.2 KiB
YAML
330 lines
9.2 KiB
YAML
# Grafana alert-rule provisioning. Sequences use flush indentation throughout.
#
# Every rule is a three-step pipeline:
#   A - Prometheus query
#   B - reduce: collapse each series returned by A to its last sample
#   C - threshold on B; this is the rule's `condition`
# A series is alerting when C is non-zero for the whole `for` period.
# A bare reduce must not be the condition: a reduce expression ignores any
# `conditions`/`evaluator` list, so the raw metric value itself would be
# interpreted as the alert state (non-zero = firing).
groups:
- name: infrastructure
  # NOTE(review): `title` is not among the documented rule-group fields
  # (orgId, name, folder, interval, rules) - confirm Grafana ignores it.
  title: Infrastructure Alerts
  folder: alerting
  orgId: 1
  interval: 30s
  rules:
  # Fires for any scrape target whose `up` series stays below 1 for 5 minutes.
  - uid: instance-down
    title: Instance Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        # Plain `up` (not `up == 0`): the comparison form only returns samples
        # whose value is 0, which the condition would read as "normal", and
        # returns no data at all while every target is healthy.
        expr: up
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        # Alerting when the last `up` sample is below 1 (target down).
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
    labels:
      severity: critical

  # Fires when per-instance CPU utilisation stays above 80% for 5 minutes.
  - uid: high-cpu-usage
    title: High CPU Usage
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        # Busy percentage = 100 - idle percentage, averaged over all CPUs.
        expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [80]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      # `$values.B.Value` is the reduced number; `$value` would render a
      # debug string listing every refId instead of a percentage.
      description: 'CPU usage is {{ printf "%.1f" $values.B.Value }}% on {{ $labels.instance }} for more than 5 minutes.'
    labels:
      severity: warning

  # Fires when memory utilisation stays above 90% for 5 minutes.
  - uid: high-memory-usage
    title: High Memory Usage
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        # Used percentage derived from MemAvailable / MemTotal.
        expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [90]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: 'Memory usage is {{ printf "%.1f" $values.B.Value }}% on {{ $labels.instance }} for more than 5 minutes.'
    labels:
      severity: critical

  # Fires when any non-tmpfs filesystem stays above 95% used for 2 minutes.
  - uid: disk-space-critical
    title: Disk Space Critical
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        # Used percentage per filesystem; tmpfs mounts are excluded.
        expr: '(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100'
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [95]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 2m
    annotations:
      summary: "Critical disk space on {{ $labels.instance }}"
      description: 'Disk usage is {{ printf "%.1f" $values.B.Value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})'
    labels:
      severity: critical
|
|
|
|
# Rule group for the web-facing services (Caddy, CrowdSec).
#
# Each rule is a three-step pipeline: A (Prometheus query) -> B (reduce each
# series to its last sample) -> C (threshold on B, used as the `condition`).
# The threshold step is required: a reduce expression ignores any
# `conditions`/`evaluator` list, so using it directly as the condition would
# treat `up` = 1 (healthy) as firing and `up` = 0 (down) as normal.
- name: web-server
  # NOTE(review): `title` is not among the documented rule-group fields
  # (orgId, name, folder, interval, rules) - confirm Grafana ignores it.
  title: Web Server Alerts
  folder: alerting
  orgId: 1
  interval: 30s
  rules:
  # Fires when the Caddy scrape target reports up < 1 for 1 minute.
  - uid: caddy-down
    title: Caddy Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 60
        to: 0
      datasourceUid: prometheus
      model:
        expr: 'up{job="caddy"}'
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        # Alerting when the last `up` sample is below 1 (target down).
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 1m
    annotations:
      summary: "Caddy web server is down"
      description: "Caddy web server has been down for more than 1 minute."
    labels:
      severity: critical

  # Fires when the CrowdSec scrape target reports up < 1 for 2 minutes.
  - uid: crowdsec-down
    title: CrowdSec Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 120
        to: 0
      datasourceUid: prometheus
      model:
        expr: 'up{job="crowdsec"}'
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        # Alerting when the last `up` sample is below 1 (target down).
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 2m
    annotations:
      summary: "CrowdSec is down"
      description: "CrowdSec security engine has been down for more than 2 minutes."
    labels:
      severity: warning