# Source blob: 9c456abd4100f71902ba40d074d93f8fc4eb68c4 (code-browser header, not part of the YAML data)
# Prometheus rules for monitoring an etcd cluster.
#
# Each key under `alert` is an alert name. Every rule carries:
#   if          - the PromQL expression that triggers the alert
#   for         - (optional) how long the expression must hold before firing
#   labels      - labels attached to the alert (severity is used here)
#   annotations - human-readable summary/description templates
#
# Alert names must be unique within the mapping: with duplicate keys most
# YAML parsers silently keep only the last one, so the earlier rule is lost.
# Where a warning and a critical tier share a base name, the warning tier
# therefore carries a `Warning` suffix and the critical tier keeps the
# original name.
parameters:
  prometheus:
    server:
      # Recording rule that feeds the FdExhaustionClose* alerts below.
      # NOTE(review): the `recording: <name>: query:` layout is assumed from
      # the Prometheus Salt formula conventions - confirm against the consumer
      # of this pillar. It previously sat as bare `record:` / `if:` keys in the
      # middle of the alert mapping, where it could not be read as a rule.
      recording:
        'instance:fd_utilization':
          query: >-
            process_open_fds / process_max_fds
      alert:
        # Fires when the number of down members exceeds half the cluster
        # size minus one.
        InsufficientMembers:
          if: >-
            count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
          for: 3m
          labels:
            severity: critical
          annotations:
            description: 'If one more etcd member goes down the cluster will be unavailable'
            summary: 'etcd cluster insufficient members'
        # Fires per member that reports having no leader.
        NoLeader:
          if: >-
            etcd_server_has_leader{job="etcd"} == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            description: 'etcd member {{ $labels.instance }} has no leader'
            summary: 'etcd member has no leader'
        # More than 3 leader changes seen within one hour; no `for`, so it
        # fires as soon as the expression matches.
        HighNumberOfLeaderChanges:
          if: >-
            increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
          labels:
            severity: warning
          annotations:
            description: 'etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour'
            summary: 'a high number of leader changes within the etcd cluster are happening'
        # Warning tier: more than 1% of gRPC requests per (service, method)
        # returned a non-OK code over 5m.
        # NOTE(review): the expression aggregates BY (grpc_service,
        # grpc_method), so `$labels.instance` is empty in the description,
        # and `$value` is a ratio (0.01 == 1%), not a percentage.
        HighNumberOfFailedGRPCRequestsWarning:
          if: >-
            sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
            / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)
            > 0.01
          for: 10m
          labels:
            severity: warning
          annotations:
            description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
            summary: 'a high number of gRPC requests are failing'
        # Critical tier of the rule above: more than 5% failures.
        HighNumberOfFailedGRPCRequests:
          if: >-
            sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
            / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)
            > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
            summary: 'a high number of gRPC requests are failing'
        # 99th percentile latency of unary gRPC requests above 0.15s.
        # NOTE(review): aggregation keeps only grpc_service/grpc_method, so
        # `$labels.instance` is empty in the description.
        GRPCRequestsSlow:
          if: >-
            histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15
          for: 10m
          labels:
            severity: critical
          annotations:
            description: 'on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow'
            summary: 'slow gRPC requests'
        # Warning tier: more than 1% of HTTP requests per method failed
        # over 5m.
        # NOTE(review): the expression aggregates BY (method), so
        # `$labels.instance` is empty, and `$value` is a ratio, not a
        # percentage.
        HighNumberOfFailedHTTPRequestsWarning:
          if: >-
            sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method)
            / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method)
            > 0.01
          for: 10m
          labels:
            severity: warning
          annotations:
            description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
            summary: 'a high number of HTTP requests are failing'
        # Critical tier of the rule above: more than 5% failures.
        HighNumberOfFailedHTTPRequests:
          if: >-
            sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method)
            / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method)
            > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}'
            summary: 'a high number of HTTP requests are failing'
        # 99th percentile latency of successful HTTP requests above 0.15s.
        HTTPRequestsSlow:
          if: >-
            histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
          for: 10m
          labels:
            severity: warning
          annotations:
            description: 'on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow'
            summary: 'slow HTTP requests'
        # Warning tier: extrapolating the last hour of fd utilization,
        # descriptors run out within 4 hours (3600 * 4 seconds).
        FdExhaustionCloseWarning:
          if: >-
            predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
          for: 10m
          labels:
            severity: warning
          annotations:
            description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon'
            summary: 'file descriptors soon exhausted'
        # Critical tier: extrapolating the last 10 minutes, descriptors run
        # out within 1 hour.
        FdExhaustionClose:
          if: >-
            predict_linear(instance:fd_utilization[10m], 3600) > 1
          for: 10m
          labels:
            severity: critical
          annotations:
            description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon'
            summary: 'file descriptors soon exhausted'
        # 99th percentile peer round-trip time above 0.15s.
        EtcdMemberCommunicationSlow:
          if: >-
            histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) > 0.15
          for: 10m
          labels:
            severity: warning
          annotations:
            description: 'etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow'
            summary: 'etcd member communication is slow'
        # More than 5 failed proposals within one hour; no `for`, so it
        # fires as soon as the expression matches.
        HighNumberOfFailedProposals:
          if: >-
            increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
          labels:
            severity: warning
          annotations:
            description: 'etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour'
            summary: 'a high number of proposals within the etcd cluster are failing'
        # 99th percentile WAL fsync duration above 0.5s.
        HighFsyncDurations:
          if: >-
            histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
          for: 10m
          labels:
            severity: warning
          annotations:
            description: 'etcd instance {{ $labels.instance }} fsync durations are high'
            summary: 'high fsync durations'
        # 99th percentile backend commit duration above 0.25s.
        HighCommitDurations:
          if: >-
            histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
          for: 10m
          labels:
            severity: warning
          annotations:
            description: 'etcd instance {{ $labels.instance }} commit durations are high'
            summary: 'high commit durations'