Add threshold for docker replicas alerts
Add three severities for *ReplicasNumber alerts
Closes-Bug: PROD-15164
Change-Id: I92da9799f7322f3313b4647c1bba00f1c8126b3d
diff --git a/docker/map.jinja b/docker/map.jinja
index 012d86d..5a9571c 100644
--- a/docker/map.jinja
+++ b/docker/map.jinja
@@ -58,6 +58,13 @@
},
}, grain='os', merge=salt['pillar.get']('docker:registry')) %}
+{% set monitoring = salt['grains.filter_by']({
+ 'default': {
+ 'replicas_failed_warning_threshold_percent': 0.3,
+ 'replicas_failed_critical_threshold_percent': 0.6,
+ },
+}, grain='os_family', merge=salt['pillar.get']('docker:monitoring')) %}
+
{%- load_yaml as blacklist %}
dockerng_running:
- name
diff --git a/docker/meta/prometheus.yml b/docker/meta/prometheus.yml
index d5fed29..6e53766 100644
--- a/docker/meta/prometheus.yml
+++ b/docker/meta/prometheus.yml
@@ -1,4 +1,4 @@
-{% from "docker/map.jinja" import host, client with context %}
+{% from "docker/map.jinja" import host, client, monitoring with context %}
server:
alert:
@@ -23,22 +23,31 @@
{%- set camel_case_name = full_service_name.split('_')|map('capitalize')|join('')|replace('-', '') %}
{%- set label_selector = 'com_docker_swarm_service_name="{}_{}"'.format(stack_name, service_name) %}
{%- if service.deploy.replicas > 1 %}
- DockerService{{ camel_case_name }}InvalidReplicasNumber:
+ DockerService{{ camel_case_name }}WarningReplicasNumber:
if: >-
- count(count_over_time(docker_container_cpu_usage_percent{{ '{' + label_selector + '}' }}[1m])) != {{ service.deploy.replicas }}
+ count(count_over_time(docker_container_cpu_usage_percent{{ '{' + label_selector + '}' }}[1m])) <= {{ service.deploy.replicas }} * {{ 1 - monitoring.replicas_failed_warning_threshold_percent }}
labels:
severity: warning
- service: docker
+ service: "{{ full_service_name }}"
+ annotations:
+ summary: 'Docker Swarm service {{ full_service_name }} invalid number of replicas'
+ description: "{%raw %}{{ $value }}{%- endraw %}/{{ service.deploy.replicas }} replicas are running for the Docker Swarn service '{{ full_service_name }}'."
+ DockerService{{ camel_case_name }}CriticalReplicasNumber:
+ if: >-
+ count(count_over_time(docker_container_cpu_usage_percent{{ '{' + label_selector + '}' }}[1m])) <= {{ service.deploy.replicas }} * {{ 1 - monitoring.replicas_failed_critical_threshold_percent }}
+ labels:
+ severity: critical
+ service: "{{ full_service_name }}"
annotations:
summary: 'Docker Swarm service {{ full_service_name }} invalid number of replicas'
description: "{%raw %}{{ $value }}{%- endraw %}/{{ service.deploy.replicas }} replicas are running for the Docker Swarn service '{{ full_service_name }}'."
{%- endif %}
- DockerService{{ camel_case_name }}NoReplica:
+ DockerService{{ camel_case_name }}ReplicasDown:
if: >-
count(count_over_time(docker_container_cpu_usage_percent{{ '{' + label_selector + '}' }}[1m])) == 0 or absent(docker_container_cpu_usage_percent{{ '{' + label_selector + '}' }}) == 1
labels:
- severity: critical
- service: docker
+ severity: down
+ service: "{{ full_service_name }}"
annotations:
summary: 'Docker Swarm service {{ full_service_name }} down'
description: "No replicas are running for the Docker Swarn service '{{ full_service_name }}'."