diff --git a/swarm/monitoring.yaml b/swarm/monitoring.yaml
new file mode 100644
index 0000000..18aab1a
--- /dev/null
+++ b/swarm/monitoring.yaml
@@ -0,0 +1,238 @@
+# Monitoring stack: Prometheus, Grafana, Alertmanager, cAdvisor, node-exporter.
+#
+# The Grafana admin password is not stored in this file; it is interpolated
+# from the deploying shell's environment. Run with:
+#
+#   export GRAFANA_ADMIN_PASSWORD='...'
+#   docker stack deploy -c monitoring.yaml monitoring
+services:
+
+  # ============================================================
+  # PROMETHEUS — Metrics collection
+  # ============================================================
+  prometheus:
+    image: prom/prometheus:latest
+    user: "1964:1964"
+    environment:
+      TZ: America/Chicago
+    command:
+      - --config.file=/etc/prometheus/prometheus.yml
+      - --storage.tsdb.path=/prometheus
+      - --storage.tsdb.retention.time=30d
+      # NOTE(review): enables the HTTP /-/reload and /-/quit endpoints; the
+      # Caddy route below has the authentik import commented out, so confirm
+      # these are not reachable without authentication.
+      - --web.enable-lifecycle
+      - --web.console.libraries=/usr/share/prometheus/console_libraries
+      - --web.console.templates=/usr/share/prometheus/consoles
+    volumes:
+      - /DockerVol/prometheus/data:/prometheus
+      - /DockerVol/prometheus/config:/etc/prometheus:ro
+    networks:
+      - netgrimoire
+    deploy:
+      placement:
+        constraints:
+          - node.hostname == znas
+          - node.platform.arch != aarch64
+          - node.platform.arch != arm
+      restart_policy:
+        condition: any
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+      labels:
+        # --- Caddy ---
+        caddy: prometheus.netgrimoire.com
+        caddy.reverse_proxy: prometheus:9090
+        caddy.import: crowdsec
+        # caddy.import_1: authentik
+
+        # --- Uptime Kuma ---
+        kuma.prometheus.http.name: Prometheus
+        kuma.prometheus.http.url: https://prometheus.netgrimoire.com
+
+        # --- Homepage ---
+        homepage.group: Monitoring
+        homepage.name: Prometheus
+        homepage.icon: prometheus.png
+        homepage.href: https://prometheus.netgrimoire.com
+        homepage.description: Metrics Collection
+        homepage.widget.type: prometheus
+        homepage.widget.url: http://prometheus:9090
+
+        # --- DIUN ---
+        diun.enable: "true"
+
+  # ============================================================
+  # GRAFANA — Dashboards
+  # ============================================================
+  grafana:
+    image: grafana/grafana:latest
+    user: "1964:1964"
+    environment:
+      TZ: America/Chicago
+      GF_SECURITY_ADMIN_USER: admin
+      # Supplied at deploy time so the secret stays out of version control.
+      # The ":?" form aborts the deploy when the variable is unset or empty.
+      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
+      GF_USERS_DEFAULT_THEME: dark
+      GF_SERVER_ROOT_URL: https://grafana.netgrimoire.com
+      GF_FEATURE_TOGGLES_ENABLE: publicDashboards
+    volumes:
+      - /DockerVol/grafana/data:/var/lib/grafana
+      - /DockerVol/grafana/provisioning:/etc/grafana/provisioning
+    networks:
+      - netgrimoire
+    deploy:
+      placement:
+        constraints:
+          - node.hostname == znas
+          - node.platform.arch != aarch64
+          - node.platform.arch != arm
+      restart_policy:
+        condition: any
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+      labels:
+        # --- Caddy ---
+        caddy: grafana.netgrimoire.com
+        caddy.reverse_proxy: grafana:3000
+        caddy.import: crowdsec
+        # caddy.import_1: authentik
+
+        # --- Uptime Kuma ---
+        kuma.grafana.http.name: Grafana
+        kuma.grafana.http.url: https://grafana.netgrimoire.com
+
+        # --- Homepage ---
+        homepage.group: Monitoring
+        homepage.name: Grafana
+        homepage.icon: grafana.png
+        homepage.href: https://grafana.netgrimoire.com
+        homepage.description: Metrics Dashboards
+        homepage.widget.type: grafana
+        homepage.widget.url: http://grafana:3000
+        homepage.widget.username: admin
+        # Same deploy-time variable as GF_SECURITY_ADMIN_PASSWORD above.
+        homepage.widget.password: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
+
+        # --- DIUN ---
+        diun.enable: "true"
+
+  # ============================================================
+  # ALERTMANAGER — Alert routing → ntfy
+  # ============================================================
+  alertmanager:
+    image: prom/alertmanager:latest
+    user: "1964:1964"
+    environment:
+      TZ: America/Chicago
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+      - --storage.path=/alertmanager
+      - --web.external-url=https://alertmanager.netgrimoire.com
+    volumes:
+      - /DockerVol/alertmanager/data:/alertmanager
+      - /DockerVol/alertmanager/config:/etc/alertmanager:ro
+    networks:
+      - netgrimoire
+    deploy:
+      placement:
+        constraints:
+          - node.hostname == znas
+          - node.platform.arch != aarch64
+          - node.platform.arch != arm
+      restart_policy:
+        condition: any
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+      labels:
+        # --- Caddy ---
+        caddy: alertmanager.netgrimoire.com
+        caddy.reverse_proxy: alertmanager:9093
+        caddy.import: crowdsec
+        # caddy.import_1: authentik
+
+        # --- Uptime Kuma ---
+        kuma.alertmanager.http.name: Alertmanager
+        kuma.alertmanager.http.url: https://alertmanager.netgrimoire.com
+
+        # --- Homepage ---
+        homepage.group: Monitoring
+        homepage.name: Alertmanager
+        homepage.icon: alertmanager.png
+        homepage.href: https://alertmanager.netgrimoire.com
+        homepage.description: Alert Routing
+
+        # --- DIUN ---
+        diun.enable: "true"
+
+  # ============================================================
+  # CADVISOR — Container metrics (all nodes)
+  # ============================================================
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:latest
+    environment:
+      TZ: America/Chicago
+    command:
+      - --docker_only=true
+      - --store_container_labels=false
+      - --disable_metrics=disk,diskIO,network,tcp,udp,percpu,sched,process
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker:/var/lib/docker:ro
+    networks:
+      - netgrimoire
+    deploy:
+      mode: global
+      restart_policy:
+        condition: any
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+      placement:
+        constraints:
+          - node.platform.arch != aarch64
+          - node.platform.arch != arm
+      labels:
+        # --- DIUN ---
+        diun.enable: "true"
+
+  # ============================================================
+  # NODE EXPORTER — Host metrics (all nodes)
+  # ============================================================
+  node-exporter:
+    image: prom/node-exporter:latest
+    environment:
+      TZ: America/Chicago
+    command:
+      - --path.rootfs=/host
+      # "$$" is a literal "$": a single "$" would start variable interpolation.
+      - --collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)
+    volumes:
+      - /:/host:ro,rslave
+    networks:
+      - netgrimoire
+    deploy:
+      mode: global
+      restart_policy:
+        condition: any
+        delay: 5s
+        max_attempts: 3
+        window: 120s
+      placement:
+        constraints:
+          - node.platform.arch != aarch64
+          - node.platform.arch != arm
+      labels:
+        # --- DIUN ---
+        diun.enable: "true"
+
+networks:
+  netgrimoire:
+    external: true