mirror of
https://github.com/elAgala/server-initializer.git
synced 2026-02-14 13:16:17 +00:00
save
This commit is contained in:
@@ -0,0 +1,34 @@
|
||||
# Grafana alerting provisioning: contact points for organisation 1.
apiVersion: 1

contactPoints:
# Default e-mail contact point. The address is a placeholder and should be
# replaced with a real mailbox per deployment.
- orgId: 1
  name: grafana-default-email
  receivers:
  - uid: default-email
    type: email
    settings:
      addresses: admin@localhost
      subject: "Grafana Alert: {{ .GroupLabels.alertname }}"
      # The e-mail integration reads its custom text from `message`
      # (not `body`). The template prints the group header once, then one
      # stanza per alert in the notification.
      message: |
        Alert: {{ .GroupLabels.alertname }}
        Status: {{ .Status }}

        {{ range .Alerts }}
        Summary: {{ .Annotations.summary }}
        Description: {{ .Annotations.description }}
        Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
        {{ end }}
    # Resolved notifications are sent as well as firing ones.
    disableResolveMessage: false

# Placeholder webhook contact point. It points at a local URL with empty
# basic-auth credentials; fill these in before routing alerts to it.
- orgId: 1
  name: webhook-placeholder
  receivers:
  - uid: webhook-placeholder
    type: webhook
    settings:
      url: http://localhost:8080/webhook
      username: ""
      password: ""
      title: "Grafana Alert"
      httpMethod: POST
    disableResolveMessage: false
|
||||
@@ -0,0 +1,31 @@
|
||||
# Grafana alerting provisioning: notification policy tree for organisation 1.
apiVersion: 1

policies:
# Root policy: anything not matched by a child route below is delivered to
# the default e-mail contact point with these timings.
- orgId: 1
  receiver: grafana-default-email
  group_by:
  - alertname
  - grafana_folder
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 12h
  routes:
  # severity=critical: shortest wait and most frequent repeats.
  - receiver: grafana-default-email
    object_matchers:
    - [severity, "=", critical]
    continue: false
    group_wait: 5s
    group_interval: 1m
    repeat_interval: 30m

  # severity=warning: slower cadence than critical.
  - receiver: grafana-default-email
    object_matchers:
    - [severity, "=", warning]
    continue: false
    group_wait: 30s
    group_interval: 5m
    repeat_interval: 2h
|
||||
330
templates/monitoring/grafana/provisioning/alerting/rules.yml
Normal file
330
templates/monitoring/grafana/provisioning/alerting/rules.yml
Normal file
@@ -0,0 +1,330 @@
|
||||
# Grafana alerting provisioning: alert rules.
apiVersion: 1

groups:
# Host-level rules, evaluated every 30s.
#
# Every rule uses the same three-step pipeline:
#   A - Prometheus query
#   B - reduce A to the last sample of each series
#   C - threshold on B; this is the rule's alert condition
# The comparison lives in C so that it is actually evaluated, and B keeps the
# per-series labels used by the annotations.
- name: infrastructure
  title: Infrastructure Alerts
  folder: alerting
  orgId: 1
  interval: 30s
  rules:
  # Any scrape target whose `up` value is below 1 for 5 minutes.
  - uid: instance-down
    title: Instance Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        # Query the raw series (not `up == 0`) so healthy targets still
        # return data instead of leaving the rule in NoData.
        expr: up
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: "Instance {{ $labels.instance }} down"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
    labels:
      severity: critical

  # Per-instance CPU utilisation above 80% for 5 minutes.
  - uid: high-cpu-usage
    title: High CPU Usage
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [80]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      # $values.B.Value is the reduced number; one decimal is enough here.
      description: 'CPU usage is {{ printf "%.1f" $values.B.Value }}% on {{ $labels.instance }} for more than 5 minutes.'
    labels:
      severity: warning

  # Memory utilisation (1 - available/total) above 90% for 5 minutes.
  - uid: high-memory-usage
    title: High Memory Usage
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [90]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 5m
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: 'Memory usage is {{ printf "%.1f" $values.B.Value }}% on {{ $labels.instance }} for more than 5 minutes.'
    labels:
      severity: critical

  # Non-tmpfs filesystem utilisation above 95% for 2 minutes.
  - uid: disk-space-critical
    title: Disk Space Critical
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: prometheus
      model:
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [95]
            type: gt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 2m
    annotations:
      summary: "Critical disk space on {{ $labels.instance }}"
      description: 'Disk usage is {{ printf "%.1f" $values.B.Value }}% on {{ $labels.instance }} ({{ $labels.mountpoint }})'
    labels:
      severity: critical
|
||||
|
||||
# Availability rules for the web-facing services, evaluated every 30s.
#
# Each rule is a three-step pipeline:
#   A - Prometheus `up` series for the job
#   B - reduce A to the last sample of each series
#   C - threshold on B (`lt 1`); this is the rule's alert condition
# The comparison lives in C so the rule alerts when the target is down rather
# than on the raw value of `up`.
- name: web-server
  title: Web Server Alerts
  folder: alerting
  orgId: 1
  interval: 30s
  rules:
  # Caddy scrape target down for 1 minute.
  - uid: caddy-down
    title: Caddy Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 60
        to: 0
      datasourceUid: prometheus
      model:
        expr: up{job="caddy"}
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 1m
    annotations:
      summary: "Caddy web server is down"
      description: "Caddy web server has been down for more than 1 minute."
    labels:
      severity: critical

  # CrowdSec scrape target down for 2 minutes.
  - uid: crowdsec-down
    title: CrowdSec Down
    condition: C
    data:
    - refId: A
      queryType: ''
      relativeTimeRange:
        from: 120
        to: 0
      datasourceUid: prometheus
      model:
        expr: up{job="crowdsec"}
        interval: ''
        refId: A
    - refId: B
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        reducer: last
        refId: B
        type: reduce
    - refId: C
      queryType: ''
      relativeTimeRange:
        from: 0
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
        - evaluator:
            params: [1]
            type: lt
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        hide: false
        intervalMs: 1000
        maxDataPoints: 43200
        refId: C
        type: threshold
    noDataState: NoData
    execErrState: Alerting
    for: 2m
    annotations:
      summary: "CrowdSec is down"
      description: "CrowdSec security engine has been down for more than 2 minutes."
    labels:
      severity: warning
|
||||
Reference in New Issue
Block a user