Add DockerdServiceReplicaFlapping alert
Change-Id: If388bc1091dc80f6b63f33e29384df177f55a937
Related-bug: PROD-33295
(cherry picked from commit 1a51036e93ebfd2bdab6c66aae32ffa955079308)
diff --git a/docker/meta/prometheus.yml b/docker/meta/prometheus.yml
index 6921039..0b1f345 100644
--- a/docker/meta/prometheus.yml
+++ b/docker/meta/prometheus.yml
@@ -12,6 +12,16 @@
annotations:
summary: "Dockerd process is down"
description: "The dockerd process on the {{ $labels.host }} node is down."
+ DockerdServiceReplicaFlapping:  # per service_name: docker_swarm_tasks_running changed within the last 10m
+ if: >-
+ sum(changes(docker_swarm_tasks_running[10m])) by (service_name) > 0
+ for: 15m  # expression must stay true this long before firing; keep in sync with the description text
+ labels:
+ severity: critical
+ service: docker
+ annotations:
+ summary: "{{ $labels.service_name }} is flapping"  # service_name is the only label kept by the sum() above
+ description: "Docker Swarm {{ $labels.service_name }} service replica is flapping for 15 minutes."
{%- endraw %}
DockerServiceOutage:
if: >-