refactor: rethink monitoring structure | add Loki && Promtail | new Grafana conf

This commit is contained in:
2025-07-09 01:30:11 -03:00
parent 2d182eaf9c
commit 130575ffd3
7 changed files with 236 additions and 62 deletions

View File

@@ -1,20 +1,35 @@
# NOTE(review): this is a diff hunk rendered without +/- markers. Lines from the
# previous `prometheus.example.com` site block and from the new `:443` site
# block are interleaved below, so the braces do not balance as shown.
# Prometheus monitoring endpoint
# Bypasses WAF for API endpoints since Prometheus scraping doesn't need WAF protection
prometheus.example.com {
# NOTE(review): Caddy's basic_auth expects a password *hash*, not plaintext;
# confirm that PROMETHEUS_PASSWORD holds a hash.
basic_auth {
agala {$PROMETHEUS_PASSWORD}
}
# Prometheus API endpoint for external Grafana access via IP
# Access via: https://YOUR_SERVER_IP/prometheus/
:443 {
# Basic auth for Prometheus path
# The inner matchers test /api/v1/*, /federate and /metrics without the
# /prometheus prefix (handle_path strips the matched prefix).
handle_path /prometheus/* {
basic_auth {
prometheus {$PROMETHEUS_PASSWORD}
}
# Matches every request that is not a Prometheus API call.
@waf {
not path /api/v1/*
}
# Only allow Prometheus API endpoints that Grafana needs
@allowed_endpoints {
path /api/v1/*
path /federate
path /metrics
}
# Requests matching @waf are sent through the Coraza WAF ruleset.
handle @waf {
coraza_waf {
directives `
Include /etc/caddy/coraza.conf
`
# Block everything else (UI, admin endpoints, etc.)
# @blocked is the complement of @allowed_endpoints: all three `not path`
# conditions must hold for the 403 to be returned.
handle {
@blocked {
not path /api/v1/*
not path /federate
not path /metrics
}
respond @blocked "API access only" 403
}
# Forward only allowed endpoints (no WAF needed for API)
handle @allowed_endpoints {
reverse_proxy * http://prometheus:9090
}
}
reverse_proxy * http://prometheus:9090
# Default response for other paths
respond "Server monitoring" 200
}

View File

@@ -1,40 +1,21 @@
# NOTE(review): diff hunk rendered without +/- markers. Where a key appears
# twice in a row (image, restart, the admin password), one line is presumably
# the removed version and the other the added one.
services:
# PORT 9090
prometheus:
image: prom/prometheus:latest
container_name: prometheus
restart: always
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
networks:
- monitoring_net
- caddy_net
# Grafana add-on for command center servers
# Use: docker compose -f docker-compose.yml -f docker-compose.grafana.yml up -d
services:
# PORT 3000
grafana:
image: grafana/grafana:latest
image: grafana/grafana:11.4.1
container_name: grafana
restart: always
restart: unless-stopped
volumes:
# Named volume mounted at Grafana's data directory.
- grafana_data:/var/lib/grafana
# Provisioning files are read from the repository checkout.
- ./grafana/provisioning:/etc/grafana/provisioning
environment:
- GF_SECURITY_ADMIN_PASSWORD=YOUR_PASSWORD
# NOTE(review): `GE_` looks like a typo for the `GF_` prefix used by the
# other Grafana variables here - confirm if this line is ever restored.
- GE_SERVER_ROOT_URL=YOUR_URL
depends_on:
- prometheus
# Password is interpolated by Compose from the GRAFANA_PASSWORD variable
# instead of being hard-coded.
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
- GF_INSTALL_PLUGINS=grafana-clock-panel,grafana-simple-json-datasource
networks:
- monitoring_net
- caddy_net
# Note: Grafana connects to external Prometheus instances via HTTPS
# PORT 9100
node_exporter:
image: prom/node-exporter:latest
container_name: node-exporter
restart: always
networks:
- monitoring_net
networks:
monitoring_net:
driver: bridge
caddy_net:
external: true
volumes:
grafana_data:

View File

@@ -1,20 +1,39 @@
# NOTE(review): diff hunk rendered with indentation stripped; the nesting must
# be restored before this can be parsed as a Compose file.
services:
# PORT 9099
# PORT 9090 (internal only)
prometheus:
image: prom/prometheus:v3.4.2
container_name: prometheus
restart: always
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
# Named volume backing the TSDB path given via --storage.tsdb.path below.
- prometheus_data:/prometheus
networks:
- monitoring_net
- caddy_net
# No ports exposed - access via Caddy only
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
# NOTE(review): enables the HTTP reload/quit endpoints; they are reachable by
# anything on monitoring_net/caddy_net - confirm this is intended.
- '--web.enable-lifecycle'
# PORT 9100
node_exporter:
image: prom/node-exporter:v1.9.1
container_name: node-exporter
restart: always
pid: host
volumes:
# Host /proc, /sys and / are mounted read-only; the --path.* flags below
# point the exporter at these mount points.
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
# `$$` is Compose's escape for a literal `$`, so the regex ends in `($|/)`.
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- monitoring_net
@@ -31,10 +50,41 @@ services:
restart: unless-stopped
networks:
- monitoring_net
command:
- '--housekeeping_interval=10s'
- '--docker_only=true'
# PORT 3100
loki:
image: grafana/loki:3.4.0
container_name: loki
volumes:
# The repository's loki.yml is mounted over the path passed to -config.file.
- ./loki/loki.yml:/etc/loki/local-config.yaml
- loki_data:/loki
restart: unless-stopped
networks:
- monitoring_net
command: -config.file=/etc/loki/local-config.yaml
# PORT 9080
promtail:
image: grafana/promtail:3.4.0
container_name: promtail
volumes:
- ./promtail/promtail.yml:/etc/promtail/config.yml
# Caddy's log directory and the host's /var/log are both mounted read-only.
- ../caddy/logs:/var/log/caddy:ro
- /var/log:/var/log:ro
# NOTE(review): no volume is mounted for Promtail's positions file, so read
# offsets are presumably lost when the container is recreated - confirm.
restart: unless-stopped
networks:
- monitoring_net
command: -config.file=/etc/promtail/config.yml
volumes:
prometheus_data:
loki_data:
networks:
# Both networks are declared external: they must already exist before
# `docker compose up`.
monitoring_net:
external: true
caddy_net:
external: true
# NOTE(review): the repeated line below looks like a diff-rendering artifact
# (old and new versions of the same line) - confirm against the real file.
external: true

View File

@@ -0,0 +1,44 @@
# Loki configuration for a single-node deployment that keeps chunks, index and
# rules on the local filesystem under /loki.

# Single-tenant mode: no X-Scope-OrgID header is required from clients.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  # One instance only, so no replication and an in-memory ring.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    # Quoted so YAML loaders never retype the value as a date.
    - from: "2020-10-24"
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  # Samples older than 7 days are rejected on ingest and deleted by retention.
  reject_old_samples: true
  reject_old_samples_max_age: 168h
  retention_period: 168h

compactor:
  working_directory: /loki/boltdb-shipper-compactor
  retention_enabled: true
  retention_delete_delay: 2h
  # Required by Loki 3.x whenever retention_enabled is true; without it the
  # config fails validation and Loki refuses to start.
  delete_request_store: filesystem

View File

@@ -1,8 +1,12 @@
# Defaults applied to every scrape job and rule group unless overridden.
global:
scrape_interval: 15s
evaluation_interval: 15s
# No rule files are loaded; the entries below are commented-out placeholders.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
scrape_configs:
# Prometheus scrapes its own metrics endpoint.
- job_name: 'prometheus'
static_configs:
- targets: ['prometheus:9090']
@@ -10,15 +14,29 @@ scrape_configs:
# NOTE(review): diff hunk rendered without +/- markers. The `crowdsec` and
# `caddy` jobs each appear twice below; one copy of each is presumably the
# removed version. Prometheus rejects duplicate job names, so confirm the real
# file contains each job only once.
- job_name: 'node'
static_configs:
- targets: ['node-exporter:9100']
- job_name: 'crowdsec'
static_configs:
- targets: ['crowdsec:6060']
- job_name: 'caddy'
static_configs:
- targets: ['caddy:2019']
scrape_interval: 5s
metrics_path: /metrics
- job_name: 'cadvisor'
static_configs:
- targets: ['cadvisor:8080']
# Overrides the 15s global interval for this job.
scrape_interval: 5s
metrics_path: /metrics
- job_name: 'caddy'
static_configs:
- targets: ['caddy:2019']
scrape_interval: 5s
metrics_path: /metrics
- job_name: 'crowdsec'
static_configs:
- targets: ['crowdsec:6060']
scrape_interval: 30s
metrics_path: /metrics
# Loki's own metrics, scraped from its HTTP port.
- job_name: 'loki'
static_configs:
- targets: ['loki:3100']
scrape_interval: 15s
metrics_path: /metrics

View File

@@ -0,0 +1,50 @@
# Promtail configuration: tails the Caddy access log and the host syslog and
# pushes both to Loki.
server:
  http_listen_port: 9080
  # 0 lets Promtail pick a random free gRPC port.
  grpc_listen_port: 0

# File where Promtail records how far it has read each log file.
# NOTE(review): /tmp is not on a mounted volume in the compose file, so the
# offsets presumably do not survive container re-creation - confirm.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Caddy access logs
  - job_name: caddy
    static_configs:
      - targets:
          - localhost
        labels:
          job: caddy
          __path__: /var/log/caddy/access.log
    pipeline_stages:
      # Pull fields out of Caddy's JSON access-log entries.
      - json:
          expressions:
            timestamp: ts
            level: level
            message: msg
            method: request.method
            uri: request.uri
            # The response status is a top-level field of the log entry;
            # `resp_headers` only holds response headers.
            status: status
            duration: duration
      # Only low-cardinality fields are promoted to labels.
      - labels:
          method:
          status:
      # Use the time Caddy logged the request, not the time Promtail read it.
      - timestamp:
          source: timestamp
          format: Unix

  # System logs
  - job_name: syslog
    static_configs:
      - targets:
          - localhost
        labels:
          job: syslog
          __path__: /var/log/syslog
    pipeline_stages:
      # Traditional syslog line: "<Mon> <day> <HH:MM:SS> <host> <tag>[pid]: msg".
      # hostname uses \S+ so names containing '-' or '.' still match; service
      # stops at '[', ':' or whitespace so tags like "systemd-logind[123]:"
      # are captured whole.
      # NOTE(review): assumes the traditional timestamp format; ISO-8601
      # syslog timestamps would not match - confirm against the host.
      - regex:
          expression: '^(?P<timestamp>\w+\s+\d+\s+\d+:\d+:\d+)\s+(?P<hostname>\S+)\s+(?P<service>[^\s\[:]+).*'
      - labels:
          hostname:
          service: