services/swarm/monitoring.yaml
traveler c3b0c0f571 df
2026-05-01 13:40:01 -05:00

266 lines
No EOL
7.9 KiB
YAML
Executable file

# Monitoring stack for Docker Swarm: Prometheus, Grafana, Alertmanager,
# Blackbox Exporter, cAdvisor and Node Exporter, all attached to the
# external "netgrimoire" network.
#
# Deploy with:
#   docker stack deploy -c monitoring.yaml monitoring
services:
# ============================================================
# PROMETHEUS — Metrics collection
# Single instance pinned to node "znas"; TSDB kept for 30 days.
# ============================================================
  prometheus:
    image: prom/prometheus:latest
    networks: [netgrimoire]
    # NOTE(review): presumably UID:GID 1964 owns the bind-mounted host
    # directories below — confirm before changing.
    user: "1964:1964"
    environment:
      TZ: America/Chicago
    volumes:
      # Time-series database (read-write).
      - /DockerVol/prometheus/data:/prometheus
      # prometheus.yml and related config files (read-only).
      - /DockerVol/prometheus/config:/etc/prometheus:ro
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=30d
      # Enables the HTTP reload/shutdown endpoints (/-/reload, /-/quit).
      - --web.enable-lifecycle
      - --web.console.libraries=/usr/share/prometheus/console_libraries
      - --web.console.templates=/usr/share/prometheus/consoles
    deploy:
      placement:
        constraints:
          - node.hostname == znas
          - node.platform.arch != aarch64
          - node.platform.arch != arm
      # Restart on any exit: up to 3 attempts, 5s apart; a restart is
      # judged successful after the task has run for 120s.
      restart_policy:
        condition: any
        delay: 5s
        max_attempts: 3
        window: 120s
      labels:
        # Caddy: public reverse proxy entry.
        caddy: prometheus.netgrimoire.com
        caddy.reverse_proxy: prometheus:9090
        caddy.import: crowdsec
        # Uptime Kuma: HTTP monitor definition.
        kuma.prometheus.http.name: Prometheus
        kuma.prometheus.http.url: https://prometheus.netgrimoire.com
        # Homepage: dashboard tile and widget.
        homepage.group: Monitoring
        homepage.name: Prometheus
        homepage.icon: prometheus.png
        homepage.href: https://prometheus.netgrimoire.com
        homepage.description: Metrics Collection
        homepage.widget.type: prometheus
        homepage.widget.url: http://prometheus:9090
        # DIUN: watch this image for updates.
        diun.enable: "true"
# ============================================================
# GRAFANA — Dashboards
# Single instance pinned to node "znas".
#
# The admin password is no longer stored in this file. Export it in
# the shell that runs the deploy; `docker stack deploy` interpolates
# ${...} from the environment and aborts if the variable is missing:
#
#   export GRAFANA_ADMIN_PASSWORD='...'
#   docker stack deploy -c monitoring.yaml monitoring
#
# NOTE(review): the password that used to be committed here must be
# treated as leaked and rotated. Grafana applies
# GF_SECURITY_ADMIN_PASSWORD only when it first creates its database,
# so an existing instance needs the password changed in Grafana itself.
# ============================================================
  grafana:
    image: grafana/grafana:latest
    # NOTE(review): presumably UID:GID 1964 owns the bind-mounted host
    # directories below — confirm before changing.
    user: "1964:1964"
    environment:
      TZ: America/Chicago
      GF_SECURITY_ADMIN_USER: admin
      # Supplied at deploy time; never commit the real value.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?export GRAFANA_ADMIN_PASSWORD before deploying}"
      GF_USERS_DEFAULT_THEME: dark
      GF_SERVER_ROOT_URL: https://grafana.netgrimoire.com
      GF_FEATURE_TOGGLES_ENABLE: publicDashboards
    volumes:
      # Grafana state: database, plugins, sessions.
      - /DockerVol/grafana/data:/var/lib/grafana
      # Provisioned datasources / dashboards.
      - /DockerVol/grafana/provisioning:/etc/grafana/provisioning
    networks:
      - netgrimoire
    deploy:
      placement:
        constraints:
          - node.hostname == znas
          - node.platform.arch != aarch64
          - node.platform.arch != arm
      # Restart on any exit: up to 3 attempts, 5s apart; a restart is
      # judged successful after the task has run for 120s.
      restart_policy:
        condition: any
        delay: 5s
        max_attempts: 3
        window: 120s
      labels:
        # --- Caddy ---
        caddy: grafana.netgrimoire.com
        caddy.reverse_proxy: grafana:3000
        caddy.import: crowdsec
        # --- Uptime Kuma ---
        kuma.grafana.http.name: Grafana
        kuma.grafana.http.url: https://grafana.netgrimoire.com
        # --- Homepage ---
        homepage.group: Monitoring
        homepage.name: Grafana
        homepage.icon: grafana.png
        homepage.href: https://grafana.netgrimoire.com
        homepage.description: Metrics Dashboards
        homepage.widget.type: grafana
        homepage.widget.url: http://grafana:3000
        homepage.widget.username: admin
        # Same deploy-time variable as above. The resolved value is still
        # visible in `docker service inspect`; consider a dedicated
        # read-only Grafana account for the widget.
        homepage.widget.password: "${GRAFANA_ADMIN_PASSWORD:?export GRAFANA_ADMIN_PASSWORD before deploying}"
        # --- DIUN ---
        diun.enable: "true"
# ============================================================
# ALERTMANAGER — Alert routing → ntfy
# Single instance pinned to node "znas".
# ============================================================
  alertmanager:
    image: prom/alertmanager:latest
    networks: [netgrimoire]
    # NOTE(review): presumably UID:GID 1964 owns the bind-mounted host
    # directories below — confirm before changing.
    user: "1964:1964"
    environment:
      TZ: America/Chicago
    volumes:
      # Alertmanager state directory (read-write).
      - /DockerVol/alertmanager/data:/alertmanager
      # alertmanager.yml and related config files (read-only).
      - /DockerVol/alertmanager/config:/etc/alertmanager:ro
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
      # Public URL, matching the Caddy hostname in the labels below.
      - --web.external-url=https://alertmanager.netgrimoire.com
    deploy:
      placement:
        constraints:
          - node.hostname == znas
          - node.platform.arch != aarch64
          - node.platform.arch != arm
      # Restart on any exit: up to 3 attempts, 5s apart; a restart is
      # judged successful after the task has run for 120s.
      restart_policy:
        condition: any
        delay: 5s
        max_attempts: 3
        window: 120s
      labels:
        # Caddy: public reverse proxy entry.
        caddy: alertmanager.netgrimoire.com
        caddy.reverse_proxy: alertmanager:9093
        caddy.import: crowdsec
        # Uptime Kuma: HTTP monitor definition.
        kuma.alertmanager.http.name: Alertmanager
        kuma.alertmanager.http.url: https://alertmanager.netgrimoire.com
        # Homepage: dashboard tile.
        homepage.group: Monitoring
        homepage.name: Alertmanager
        homepage.icon: alertmanager.png
        homepage.href: https://alertmanager.netgrimoire.com
        homepage.description: Alert Routing
        # DIUN: watch this image for updates.
        diun.enable: "true"
# ============================================================
# BLACKBOX EXPORTER — HTTP/TCP/ICMP probing
# Single instance pinned to node "znas".
# ============================================================
  blackbox:
    image: prom/blackbox-exporter:latest
    networks: [netgrimoire]
    environment:
      TZ: America/Chicago
    # Raw sockets are required for the ICMP probes.
    cap_add:
      - NET_RAW
    volumes:
      # blackbox.yml probe module definitions (read-only).
      - /DockerVol/blackbox/config:/etc/blackbox:ro
    command:
      - --config.file=/etc/blackbox/blackbox.yml
    deploy:
      placement:
        constraints:
          - node.hostname == znas
          - node.platform.arch != aarch64
          - node.platform.arch != arm
      # Restart on any exit: up to 3 attempts, 5s apart; a restart is
      # judged successful after the task has run for 120s.
      restart_policy:
        condition: any
        delay: 5s
        max_attempts: 3
        window: 120s
      labels:
        # Caddy: public reverse proxy entry.
        caddy: blackbox.netgrimoire.com
        caddy.reverse_proxy: blackbox:9115
        caddy.import: crowdsec
        # Uptime Kuma: HTTP monitor definition.
        kuma.blackbox.http.name: Blackbox Exporter
        kuma.blackbox.http.url: https://blackbox.netgrimoire.com
        # Homepage: dashboard tile (uses the Prometheus icon).
        homepage.group: Monitoring
        homepage.name: Blackbox
        homepage.icon: prometheus.png
        homepage.href: https://blackbox.netgrimoire.com
        homepage.description: HTTP/TCP Probing
        # DIUN: watch this image for updates.
        diun.enable: "true"
# ============================================================
# CADVISOR — Container metrics (one task per node)
# Multi-arch image — runs on aarch64 and x86_64
# ============================================================
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    networks: [netgrimoire]
    # Swarm template: each task takes the hostname of the node it runs on.
    hostname: "{{.Node.Hostname}}"
    environment:
      TZ: America/Chicago
    volumes:
      # Read-only views of the host that cAdvisor inspects.
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
    command:
      # Report Docker containers only.
      - --docker_only=true
      # Do not export container labels as metric labels.
      - --store_container_labels=false
      # Metric groups switched off for this deployment.
      - --disable_metrics=disk,diskIO,network,tcp,udp,percpu,sched,process
    deploy:
      # No placement constraints: global mode schedules one task on
      # every node.
      mode: global
      # Restart on any exit: up to 3 attempts, 5s apart; a restart is
      # judged successful after the task has run for 120s.
      restart_policy:
        condition: any
        delay: 5s
        max_attempts: 3
        window: 120s
      labels:
        # DIUN: watch this image for updates.
        diun.enable: "true"
# ============================================================
# NODE EXPORTER — Host metrics (all nodes including Pi)
# Multi-arch image — runs on aarch64 and x86_64
# ============================================================
  node-exporter:
    image: prom/node-exporter:latest
    networks: [netgrimoire]
    # Swarm template: each task takes the hostname of the node it runs on.
    hostname: "{{.Node.Hostname}}"
    environment:
      TZ: America/Chicago
      NODE_HOSTNAME: "{{.Node.Hostname}}"
    volumes:
      # Host root filesystem, read-only; rslave makes host mounts created
      # after container start visible inside the container.
      - /:/host:ro,rslave
      - /etc/hostname:/etc/hostname:ro
    command:
      - --path.rootfs=/host
      # "$$" is Compose escaping for a literal "$" in the regex.
      - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
      # NOTE(review): no volume above maps /etc/node-exporter into the
      # container, so this directory may not exist at runtime — confirm
      # whether /host/etc/node-exporter or an extra bind mount was intended.
      - --collector.textfile.directory=/etc/node-exporter
    deploy:
      # Global mode schedules one task on every node.
      mode: global
      # Restart on any exit: up to 3 attempts, 5s apart; a restart is
      # judged successful after the task has run for 120s.
      restart_policy:
        condition: any
        delay: 5s
        max_attempts: 3
        window: 120s
      labels:
        # DIUN: watch this image for updates.
        diun.enable: "true"
networks:
  # Created outside this stack; it must already exist when the stack
  # is deployed.
  netgrimoire:
    external: true