{%- from "etcd/map.jinja" import server, monitoring with context %}
# Prometheus metadata for etcd: a static scrape target for the local member
# and the alert rules evaluated against the scraped metrics.
server:
  target:
    static:
      etcd:
        enabled: true
        # The scheme follows the "ssl.enabled" flag of the imported server map.
{%- if server.get('ssl', {}).get('enabled') %}
        scheme: https
        tls_config:
          # NOTE(review): skip_verify is hard-coded to true; presumably this
          # turns off server certificate verification -- confirm it is intended.
          skip_verify: true
          cert_name: prometheus-server.crt
          key_name: prometheus-server.key
{%- else %}
        scheme: http
{%- endif %}
        endpoint:
        # Only the member whose name equals this minion's "nodename" grain is
        # emitted; when no member matches, no list item is rendered.
{%- set hostname = grains.get('nodename') %}
{%- for member in server.members %}
  {%- if member.name == hostname %}
          - address: '{{ member.host }}'
            port: {{ member.port }}
  {%- endif %}
{%- endfor %}
  alert:
    # Fires when, per HTTP method, the share of failed requests (4xx codes
    # excluded) over 5 minutes exceeds
    # monitoring.failed_http_requests_percentage.
    EtcdFailedTotalIn5m:
      {%- set threshold = monitoring.failed_http_requests_percentage %}
      # The ratio is scaled by 100 so that $value is a real percentage and
      # agrees with the "%" printed in the description.
      if: >-
        100 * sum by(method) (rate(etcd_http_failed_total{code!~"4[0-9]{2}"}[5m]))
        / sum by(method) (rate(etcd_http_received_total[5m])) > {{ threshold }}
      {% raw %}
      labels:
        severity: warning
        service: etcd
      annotations:
        summary: 'High number of HTTP requests are failing on etcd'
        description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
      {% endraw %}
    # Fires for every instance whose etcd_server_has_leader metric is not 1.
    EtcdServerHasLeader:
      if: 'etcd_server_has_leader != 1'
      {% raw %}
      labels:
        severity: warning
        service: etcd
      annotations:
        summary: 'Etcd instance lost leader'
        description: 'Etcd {{ $labels.instance }} lost his leader'
      {% endraw %}
    # Fires when losing one more member would leave fewer than a quorum
    # (floor(n/2) + 1) of the n scraped members up, i.e. when the number of
    # down members exceeds ceil(n/2) - 2. Using ceil() keeps the rule correct
    # for even-sized clusters (n=4 must already alert with one member down).
    EtcdClusterSmall:
      if: 'count(up{job="etcd"} == 0) > ceil(count(up{job="etcd"}) / 2) - 2'
      {% raw %}
      labels:
        severity: warning
        service: etcd
      annotations:
        summary: 'Etcd cluster small'
        description: 'If one more etcd peer goes down the cluster will be unavailable'
      {% endraw %}