{#- Source blob 6921039b2990ea34b43ab5b45bb6608a1822e7d2 (web-viewer navigation links removed). #}
{% from "docker/map.jinja" import host, client, monitoring with context %}
{#- Alert rule definitions, keyed by alert name under server.alert.
    Jinja comments are used throughout so the rendered YAML stays free of
    commentary; every comment opens with a whitespace-stripping marker so it
    does not add blank lines to the output. #}
server:
  alert:
{%- if host.get('enabled', False) %}
    {#- Host-side rules, rendered only when host.enabled is set.
        This rule matches every series where procstat_running for the
        dockerd process equals 0. #}
    DockerdProcessDown:
      if: >-
        procstat_running{process_name="dockerd"} == 0
      {%- raw %}
      labels:
        severity: minor
        service: docker
      annotations:
        summary: "Dockerd process is down"
        description: "The dockerd process on the {{ $labels.host }} node is down."
      {%- endraw %}
    {#- label_replace copies the leading non-digit part of the host label
        into a cluster label. The rule then compares, per cluster, the number
        of dockerd series with the number of those series that equal 0, so it
        matches only when every dockerd series in a cluster is 0.
        The expression is split over two lines; the folded scalar joins them
        with a single space, giving the same string as a one-line form. #}
    DockerServiceOutage:
      if: >-
        count(label_replace(procstat_running{process_name="dockerd"}, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
        == count(label_replace(procstat_running{process_name="dockerd"} == 0, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
      {%- raw %}
      labels:
        severity: critical
        service: docker
      annotations:
        summary: "Docker cluster outage"
        description: "All dockerd processes within the {{ $labels.cluster }} cluster are down."
      {%- endraw %}
{%- endif %}
{%- if client.get('enabled', False) %}
  {#- Client-side rules, rendered only when client.enabled is set.
      One group of rules is generated per service found under
      client.stack.STACK.service.SERVICE, iterated in sorted key order. #}
  {%- for stack_name, stack in client.get('stack', {})|dictsort %}
    {%- for service_name, service in stack.get('service', {})|dictsort %}
      {#- full_service_name is STACK_SERVICE. camel_case_name capitalizes
          each underscore-separated part, joins them and drops hyphens, and
          is used to build the alert name. #}
      {%- set full_service_name = "{}_{}".format(stack_name, service_name) %}
      {%- set camel_case_name = full_service_name.split('_')|map('capitalize')|join('')|replace('-', '') %}
      {#- Metric selector shared by all rules of this service. #}
      {%- set tasks_running = 'docker_swarm_tasks_running{service_name="' ~ full_service_name ~ '"}' %}
      {#- Replica count defaults to 1 when deploy.replicas is not set. It is
          cast to int once so the comparison below and the rendered
          expressions agree even if the value arrives as a string. #}
      {%- set replicas = service.get('deploy', {}).get('replicas', 1)|int %}
      {%- if replicas > 1 %}
        {#- The thresholds are used as fractions in the expressions
            (replicas * threshold). For the summaries they are scaled to a
            percentage and formatted with %g, so 0.3 is shown as 30 rather
            than a raw float product such as 30.000000000000004. #}
        {%- set warning_percent = '%g'|format(monitoring.replicas_failed_warning_threshold_percent|float * 100) %}
        {%- set critical_percent = '%g'|format(monitoring.replicas_failed_critical_threshold_percent|float * 100) %}
        {#- The left-hand side of the comparison is the number of missing
            replicas, which is what $value reports in the description. #}
    DockerService{{ camel_case_name }}ReplicasDownMinor:
      if: >-
        {{ replicas }} - min({{ tasks_running }}) >= {{ replicas }} * {{ monitoring.replicas_failed_warning_threshold_percent }}
      {%- raw %}
      for: 2m
      labels:
        severity: minor
        service: docker
      annotations:
        summary: "{%- endraw %}{{ warning_percent }}%{%- raw %} of Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down"
        description: "{{ $value }} Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down for 2 minutes."
      {%- endraw %}
    DockerService{{ camel_case_name }}ReplicasDownMajor:
      if: >-
        {{ replicas }} - min({{ tasks_running }}) >= {{ replicas }} * {{ monitoring.replicas_failed_critical_threshold_percent }}
      {%- raw %}
      for: 2m
      labels:
        severity: major
        service: docker
      annotations:
        summary: "{%- endraw %}{{ critical_percent }}%{%- raw %} of Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down"
        description: "{{ $value }} Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down for 2 minutes."
      {%- endraw %}
      {%- endif %}
      {#- Rendered for every service regardless of replica count: matches
          when the running-task metric is 0 or when the series is absent. #}
    DockerService{{ camel_case_name }}Outage:
      if: >-
        {{ tasks_running }} == 0 or absent({{ tasks_running }}) == 1
      {%- raw %}
      for: 2m
      labels:
        severity: critical
        service: docker
      annotations:
        summary: "Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service outage"
        description: "All Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' replicas are down for 2 minutes."
      {%- endraw %}
    {%- endfor %}
  {%- endfor %}
{%- endif %}