# blob: 2addb4c014ce1f1e4a9f31be6aa9a5918546ef01 [file] [log] [blame]
{%- from "etcd/map.jinja" import server, monitoring with context %}
# Monitoring metadata for etcd: one static scrape target plus the alert
# rules evaluated against it.  The file is rendered by Jinja before it is
# parsed as YAML, so annotation text that carries Prometheus-side
# placeholders ($value, $labels) is kept inside raw sections to reach the
# output untouched.
server:
  target:
    static:
      etcd:
        enabled: true
{#- Scrape over HTTPS only when the etcd server pillar enables SSL. #}
{%- if server.get('ssl', {}).get('enabled') %}
        scheme: https
        tls_config:
          # NOTE(review): certificate verification is switched off here;
          # confirm that this is intentional for the deployment.
          skip_verify: true
          cert_name: prometheus-server.crt
          key_name: prometheus-server.key
{%- else %}
        scheme: http
{%- endif %}
        endpoint:
{#- Only cluster members whose name equals the nodename grain are listed.
    If no member matches, the endpoint key renders with an empty value. #}
{%- set hostname = grains.get('nodename') %}
{%- for member in server.members %}
{%- if member.name == hostname %}
          - address: {{ member.host }}
            port: {{ member.port }}
{%- endif %}
{%- endfor %}
  alert:
    # Fires per HTTP method when the failed/received request ratio over the
    # last 5 minutes exceeds the configured percentage (converted to a ratio).
    EtcdRequestFailureTooHigh:
      {%- set failed_http_requests = monitoring.failed_http_requests_percentage / 100.0 %}
      if: >-
        sum by(method) (rate(etcd_http_failed_total[5m]))
        / sum by(method) (rate(etcd_http_received_total[5m])) > {{ failed_http_requests }}
      {%- raw %}
      labels:
        severity: minor
        service: etcd
      annotations:
        summary: "High number of HTTP requests are failing on etcd"
        # NOTE(review): the expression above yields a 0-1 ratio, yet the text
        # below labels $value as a percentage; confirm the intended unit.
        description: "{{ $value }}% of requests for the {{ $labels.method }} method failed on the {{ $labels.instance }} etcd instance."
      {%- endraw %}
    # Fires for any instance whose etcd_server_has_leader metric is not 1.
    EtcdInstanceNoLeader:
      if: "etcd_server_has_leader != 1"
      {%- raw %}
      labels:
        severity: major
        service: etcd
      annotations:
        summary: "Etcd instance has no leader"
        description: "The etcd {{ $labels.instance }} instance has no leader."
      {%- endraw %}
    # Fires for a single instance whose scrape has been failing for 2 minutes.
    EtcdServiceDownMinor:
      if: "up{job='etcd'} == 0"
      {%- raw %}
      for: 2m
      labels:
        severity: minor
        service: etcd
      annotations:
        summary: "Etcd instance is down"
        description: "The etcd {{ $labels.instance }} instance is down for at least 2 minutes."
      {%- endraw %}
    # Fires when the number of down instances exceeds the configured share
    # of all instances for 2 minutes.  The threshold must be rendered by
    # Jinja, so only the Prometheus $value placeholder is protected by a raw
    # section; everything else in the annotations is templated at render time.
    EtcdServiceDownMajor:
      {%- set instances_major_threshold_percent = monitoring.instances_major_threshold_percent %}
      {#- Rounded for display so float noise (for example 0.3 * 100) does not
          leak into the alert text. #}
      {%- set instances_major_threshold_display = (instances_major_threshold_percent * 100) | round(2) %}
      if: "count(up{job='etcd'} == 0) > count(up{job='etcd'}) * {{ instances_major_threshold_percent }}"
      for: 2m
      labels:
        severity: major
        service: etcd
      annotations:
        summary: "{{ instances_major_threshold_display }}% of etcd instances are down"
        description: "{% raw %}{{ $value }}{% endraw %} etcd instances are down (at least {{ instances_major_threshold_display }}%) for at least 2 minutes."
    # Fires when every etcd instance known to the job is down.
    EtcdServiceOutage:
      if: "count(up{job='etcd'} == 0) == count(up{job='etcd'})"
      {%- raw %}
      labels:
        severity: critical
        service: etcd
      annotations:
        summary: "Etcd service outage"
        description: "All etcd services within the cluster are down."
      {%- endraw %}