# blob: 1fa86432fdf2070f8153a817cde9b4fd478a200c [file] [log] [blame]
{%- from "etcd/map.jinja" import server, monitoring with context %}
# Monitoring metadata for etcd: one static scrape target plus three alert
# rules, all nested under the top-level `server` key.
# NOTE(review): nesting below is reconstructed from the key names and the
# Jinja control flow -- confirm against the consumer's expected schema.
server:
  target:
    static:
      etcd:
        enabled: true
{%- if server.get('ssl', {}).get('enabled') %}
        # Chosen when server.ssl.enabled is truthy.
        scheme: https
        tls_config:
          # NOTE(review): presumably disables verification of the etcd
          # server certificate -- confirm this is intentional.
          skip_verify: true
          cert_name: prometheus-server.crt
          key_name: prometheus-server.key
{%- else %}
        scheme: http
{%- endif %}
        # Only the cluster member whose name equals this minion's `nodename`
        # grain is listed. If no member matches, `endpoint` renders empty
        # (null) -- TODO confirm the consumer tolerates that.
        endpoint:
{%- set hostname = grains.get('nodename') %}
{%- for member in server.members %}
  {%- if member.name == hostname %}
          - address: {{ member.host }}
            port: {{ member.port }}
  {%- endif %}
{%- endfor %}
  alert:
    # In every rule below, the raw block passes the double-brace placeholders
    # ($value, $labels) through Jinja untouched.

    # Per-method share of failed HTTP requests over 5 minutes, excluding
    # responses with a 4xx code. The ratio is scaled by 100 so that $value is
    # a percentage (matching the '%' in the description) and is compared
    # directly against monitoring.failed_http_requests_percentage.
    EtcdFailedTotalIn5m:
      if: >-
        100 * (
        sum by(method) (rate(etcd_http_failed_total{code!~"4[0-9]{2}"}[5m]))
        / sum by(method) (rate(etcd_http_received_total[5m]))
        ) > {{ monitoring.failed_http_requests_percentage }}
      {% raw %}
      labels:
        severity: warning
        service: etcd
      annotations:
        summary: 'High number of HTTP requests are failing on etcd'
        description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
      {% endraw %}
    # Fires for any series whose etcd_server_has_leader value is not 1.
    EtcdServerHasLeader:
      if: 'etcd_server_has_leader != 1'
      {% raw %}
      labels:
        severity: warning
        service: etcd
      annotations:
        summary: 'Etcd instance lost leader'
        description: 'Etcd {{ $labels.instance }} lost his leader'
      {% endraw %}
    # Fires when the number of etcd targets with up == 0 exceeds
    # (total etcd targets / 2) - 1.
    EtcdClusterSmall:
      if: 'count(up{job="etcd"} == 0) > count(up{job="etcd"}) / 2 - 1'
      {% raw %}
      labels:
        severity: warning
        service: etcd
      annotations:
        summary: 'Etcd cluster small'
        description: 'If one more etcd peer goes down the cluster will be unavailable'
      {% endraw %}