Alerts rationalization for Docker service

Related-Bug: PROD-19543

Change-Id: I071a9d92527a47fdc12eb4b489aeee296d54047d
diff --git a/docker/meta/prometheus.yml b/docker/meta/prometheus.yml
index 4328310..e642481 100644
--- a/docker/meta/prometheus.yml
+++ b/docker/meta/prometheus.yml
@@ -1,5 +1,4 @@
 {% from "docker/map.jinja" import host, client, monitoring with context %}
-
 server:
   alert:
 {%- if host.get('enabled', False) %}
@@ -8,11 +7,22 @@
         procstat_running{process_name="dockerd"} == 0
   {%- raw %}
       labels:
-        severity: warning
+        severity: minor
         service: docker
       annotations:
-        summary: 'Dockerd service is down'
-        description: 'Dockerd service is down on node {{ $labels.host }}'
+        summary: "Dockerd process is down"
+        description: "The Dockerd process on the {{ $labels.host }} node is down."
+  {%- endraw %}
+    DockerServiceOutage:  # Cluster-level alert; "cluster" is the leading non-digit part of the host label
+      if: >-  # per cluster: number of dockerd series equals number of dockerd series reporting 0
+        count(label_replace(procstat_running{process_name="dockerd"}, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster) == count(label_replace(procstat_running{process_name="dockerd"} == 0, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
+  {%- raw %}
+      labels:
+        severity: critical
+        service: docker
+      annotations:
+        summary: "Docker cluster outage"
+        description: "All Dockerd processes within the {{ $labels.cluster }} cluster are down."
   {%- endraw %}
 {%- endif %}
 {%- if client.get('enabled', False) %}
@@ -23,37 +33,43 @@
         {%- set camel_case_name = full_service_name.split('_')|map('capitalize')|join('')|replace('-', '') %}
         {%- set label_selector = 'service_name="{}_{}"'.format(stack_name, service_name) %}
         {%- if service.deploy.replicas > 1 %}
-    DockerService{{ camel_case_name }}WarningReplicasNumber:
+    DockerService{{ camel_case_name }}ReplicasDownMinor:  # missing replicas >= desired replicas * warning threshold
       if: >-
-        docker_swarm_tasks_running{{ '{' + label_selector + '}' }} <= {{ service.deploy.replicas }} * {{ 1 - monitoring.replicas_failed_warning_threshold_percent }}
+        {{ service.deploy.replicas }} - min(docker_swarm_tasks_running{{ '{' + label_selector + '}' }}) >= {{ service.deploy.replicas }} * {{ monitoring.replicas_failed_warning_threshold_percent }}
+  {%- raw %}
       for: 2m
       labels:
-        severity: warning
-        service: "{{ full_service_name }}"
+        severity: minor
+        service: docker
       annotations:
-        summary: 'Docker Swarm service {{ full_service_name }} invalid number of replicas for 2 minutes'
-        description: "{%raw %}{{ $value }}{%- endraw %}/{{ service.deploy.replicas }} replicas are running for the Docker Swarn service '{{ full_service_name }}' for 2 minutes."
-    DockerService{{ camel_case_name }}CriticalReplicasNumber:
+        summary: "{%- endraw %}{{ '%g'|format(monitoring.replicas_failed_warning_threshold_percent * 100) }}%{%- raw %} of Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down"  # %g hides float noise such as 28.999999999999996
+        description: "{{ $value }} Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down for at least 2 minutes."
+  {%- endraw %}
+    DockerService{{ camel_case_name }}ReplicasDownMajor:  # missing replicas >= desired replicas * critical threshold
       if: >-
-        docker_swarm_tasks_running{{ '{' + label_selector + '}' }} <= {{ service.deploy.replicas }} * {{ 1 - monitoring.replicas_failed_critical_threshold_percent }}
+        {{ service.deploy.replicas }} - min(docker_swarm_tasks_running{{ '{' + label_selector + '}' }}) >= {{ service.deploy.replicas }} * {{ monitoring.replicas_failed_critical_threshold_percent }}
+  {%- raw %}
+      for: 2m
+      labels:
+        severity: major
+        service: docker
+      annotations:
+        summary: "{%- endraw %}{{ '%g'|format(monitoring.replicas_failed_critical_threshold_percent * 100) }}%{%- raw %} of Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down"  # %g hides float noise such as 56.99999999999999
+        description: "{{ $value }} Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down for at least 2 minutes."
+  {%- endraw %}
+        {%- endif %}
+    DockerService{{ camel_case_name }}Outage:  # fires when the service reports zero running tasks or the metric is absent
+      if: >-
+        docker_swarm_tasks_running{{ '{' + label_selector + '}' }} == 0 or absent(docker_swarm_tasks_running{{ '{' + label_selector + '}' }}) == 1
+  {%- raw %}
       for: 2m
       labels:
         severity: critical
-        service: "{{ full_service_name }}"
+        service: docker  # fixed label value; the Swarm service name appears in the annotations below
       annotations:
-        summary: 'Docker Swarm service {{ full_service_name }} invalid number of replicas for 2 minutes'
-        description: "{%raw %}{{ $value }}{%- endraw %}/{{ service.deploy.replicas }} replicas are running for the Docker Swarn service '{{ full_service_name }}' for 2 minutes."
-        {%- endif %}
-    DockerService{{ camel_case_name }}ReplicasDown:
-      if: >-
-        docker_swarm_tasks_running{{ '{' + label_selector + '}' }} == 0 or absent(docker_swarm_tasks_running{{ '{' + label_selector + '}' }}) == 1
-      for: 2m
-      labels:
-        severity: down
-        service: "{{ full_service_name }}"
-      annotations:
-        summary: 'Docker Swarm service {{ full_service_name }} down for 2 minutes'
-        description: "No replicas are running for the Docker Swarn service '{{ full_service_name }}'. for 2 minutes"
+        summary: "Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service outage"
+        description: "All Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' replicas are down for at least 2 minutes."
+  {%- endraw %}
       {%- endif %}
     {%- endfor %}
   {%- endfor %}