| {% from "docker/map.jinja" import host, client, monitoring with context %} |
| server: |
| alert: |
| {%- if host.get('enabled', False) %} |
| {#- Minor alert: matches every procstat_running series for the dockerd process whose value is 0. #} |
| {#- Only rendered when host.enabled is truthy (see the surrounding if). #} |
| DockerdProcessDown: |
| if: >- |
| procstat_running{process_name="dockerd"} == 0 |
| {#- The raw section below stops Jinja from evaluating the $labels placeholder, so it is emitted unchanged. #} |
| {%- raw %} |
| labels: |
| severity: minor |
| service: docker |
| annotations: |
| summary: "Dockerd process is down" |
| description: "The dockerd process on the {{ $labels.host }} node is down." |
| {%- endraw %} |
| {#- Critical alert: per cluster, compares the number of dockerd series with the number of those whose value is 0; #} |
| {#- the two counts are equal only when every dockerd process in that cluster reports 0. #} |
| {#- The cluster label is synthesised with label_replace from the part of the host label captured by ([^0-9]+). #} |
| DockerServiceOutage: |
| if: >- |
| count(label_replace(procstat_running{process_name="dockerd"}, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster) == count(label_replace(procstat_running{process_name="dockerd"} == 0, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster) |
| {#- The raw section below keeps the $labels placeholder from being evaluated by Jinja. #} |
| {%- raw %} |
| labels: |
| severity: critical |
| service: docker |
| annotations: |
| summary: "Docker cluster outage" |
| description: "All dockerd processes within the {{ $labels.cluster }} cluster are down." |
| {%- endraw %} |
| {%- endif %} |
| {%- if client.get('enabled', False) %} |
| {%- for stack_name, stack in client.get('stack', {})|dictsort %} |
| {%- for service_name, service in stack.get('service', {})|dictsort %} |
| {#- Per-service names shared by the alerts rendered in this loop iteration. #} |
| {#- full_service_name: stack and service joined by an underscore; used in annotations and as the service_name label value. #} |
| {%- set full_service_name = [stack_name, service_name]|join('_') %} |
| {#- camel_case_name: underscore-separated parts capitalised and concatenated, hyphens dropped; used in the alert identifiers. #} |
| {%- set camel_case_name = full_service_name.split('_')|map('capitalize')|join|replace('-', '') %} |
| {#- label_selector: the matcher placed between braces after the docker_swarm_tasks_running metric name. #} |
| {%- set label_selector = 'service_name="' ~ full_service_name ~ '"' %} |
| {%- if service.get('deploy', {}).get('replicas', 1) > 1 %} |
| {#- Minor alert: fires when the number of missing replicas (configured replicas minus the minimum running-task sample) #} |
| {#- has been at least replicas * replicas_failed_warning_threshold_percent for 2 minutes. #} |
| {#- Only rendered for services configured with more than one replica (see the surrounding if). #} |
| {#- The summary percentage is formatted with '%g' so float artefacts of the multiplication by 100 #} |
| {#- (for example 30.000000000000004 or a trailing .0) do not leak into the alert text; the float filter #} |
| {#- lets the threshold be supplied either as a number or as a numeric string. #} |
| DockerService{{ camel_case_name }}ReplicasDownMinor: |
| if: >- |
| {{ service.deploy.replicas }} - min(docker_swarm_tasks_running{{ '{' + label_selector + '}' }}) >= {{ service.deploy.replicas }} * {{ monitoring.replicas_failed_warning_threshold_percent }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: minor |
| service: docker |
| annotations: |
| summary: "{%- endraw %}{{ '%g'|format(monitoring.replicas_failed_warning_threshold_percent|float * 100) }}%{%- raw %} of Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down" |
| description: "{{ $value }} Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down for 2 minutes." |
| {%- endraw %} |
| {#- Major alert: fires when the number of missing replicas (configured replicas minus the minimum running-task sample) #} |
| {#- has been at least replicas * replicas_failed_critical_threshold_percent for 2 minutes. #} |
| {#- Only rendered for services configured with more than one replica (see the surrounding if). #} |
| {#- The summary percentage is formatted with '%g' so float artefacts of the multiplication by 100 #} |
| {#- (for example 30.000000000000004 or a trailing .0) do not leak into the alert text; the float filter #} |
| {#- lets the threshold be supplied either as a number or as a numeric string. #} |
| DockerService{{ camel_case_name }}ReplicasDownMajor: |
| if: >- |
| {{ service.deploy.replicas }} - min(docker_swarm_tasks_running{{ '{' + label_selector + '}' }}) >= {{ service.deploy.replicas }} * {{ monitoring.replicas_failed_critical_threshold_percent }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: major |
| service: docker |
| annotations: |
| summary: "{%- endraw %}{{ '%g'|format(monitoring.replicas_failed_critical_threshold_percent|float * 100) }}%{%- raw %} of Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down" |
| description: "{{ $value }} Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service replicas are down for 2 minutes." |
| {%- endraw %} |
| {%- endif %} |
| {#- Critical alert: fires when the running-task sample for this service is 0, or the series is absent, for 2 minutes. #} |
| {#- Rendered for every service in the loop, independent of its configured replica count. #} |
| DockerService{{ camel_case_name }}Outage: |
| if: >- |
| docker_swarm_tasks_running{{ '{' + label_selector + '}' }} == 0 or absent(docker_swarm_tasks_running{{ '{' + label_selector + '}' }}) == 1 |
| {#- Annotations switch between raw text and Jinja so that only full_service_name is substituted at render time. #} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: critical |
| service: docker |
| annotations: |
| summary: "Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' service outage" |
| description: "All Docker Swarm '{%- endraw %}{{ full_service_name }}{%- raw %}' replicas are down for 2 minutes." |
| {%- endraw %} |
| {%- endfor %} |
| {%- endfor %} |
| {%- endif %} |