{#- Prometheus metadata for InfluxDB. Everything below is rendered only when
    pillar.influxdb.server is defined. server, relay and monitoring are taken
    from influxdb/map.jinja; the next section requires the server or the relay
    to be enabled, and a further part is gated on server.http.enabled. #}
1 {%- if pillar.influxdb.server is defined %}
2 {%- from "influxdb/map.jinja" import server, relay, monitoring with context %}
4 {%- if server.get('enabled', False) or relay.get('enabled') %}
7 {%- if server.get('http', {}).get('enabled', False) %}
16 summary: "InfluxDB service is down"
17 description: "The InfluxDB service on the {{ $labels.host }} node is down."
# Fires when the number of influxdb_up series equal to 0 reaches
# count(influxdb_up) multiplied by monitoring.service_failed_warning_threshold_percent.
# That setting is used as a fraction in the expression; the annotations
# multiply it by 100 for display.
18 InfluxdbServicesDownMinor:
21 count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_warning_threshold_percent }}
27 summary: "{%- endraw %}{{ monitoring.service_failed_warning_threshold_percent*100 }}{%- raw %}% of InfluxDB services are down"
28 description: "{{ $value }} InfluxDB services (>= {%- endraw %} {{ monitoring.service_failed_warning_threshold_percent*100 }}{%- raw %}%) are down."
# Same check as the minor alert, but compared against
# monitoring.service_failed_critical_threshold_percent (used as a fraction in
# the expression and multiplied by 100 in the annotations).
29 InfluxdbServicesDownMajor:
32 count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_critical_threshold_percent }}
38 summary: "{%- endraw %}{{ monitoring.service_failed_critical_threshold_percent*100 }}{%- raw %}% of InfluxDB services are down"
39 description: "{{ $value }} InfluxDB services (>= {%- endraw %} {{ monitoring.service_failed_critical_threshold_percent*100 }}{%- raw %}%) are down."
# Fires when the count of influxdb_up series equal to 0 matches the total
# count of influxdb_up series, i.e. no instance reports as up.
40 InfluxdbServiceOutage:
42 count(influxdb_up == 0) == count(influxdb_up)
47 summary: "InfluxDB service outage"
48 description: "All InfluxDB services are down."
# Fires when a database holds at least monitoring.max_series_percentage percent
# of server.data.max_series_per_database time series. The expression compares
# against the absolute series count; the summary reports the configured
# percentage, not that absolute count.
49 InfluxdbSeriesMaxNumberWarning:
51 {%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %}
53 influxdb_database_numSeries >= {{ influx_max_series_threshold }}
59 summary: "Reached {%- endraw %} {{ monitoring.max_series_percentage }}{%- raw %}% of time series in the InfluxDB {{ $labels.database }} database"
60 description: "The InfluxDB {{ $labels.database }} database contains {{ $value }} time series."
# Fires once influxdb_database_numSeries for a database reaches
# server.data.max_series_per_database.
61 InfluxdbSeriesMaxNumberCritical:
64 influxdb_database_numSeries >= {{ server.data.max_series_per_database }}
70 summary: "Reached maximum number of time series in the InfluxDB {{ $labels.database }} database"
71 description: "The InfluxDB {{ $labels.database }} database contains {{ $value }} time series. No more series can be saved."
# Fires when the 1-minute rate of influxdb_httpd_clientError, expressed as a
# percentage of the 1-minute rate of influxdb_httpd_req, exceeds
# monitoring.http_errors_percentage.
72 InfluxdbHTTPClientErrorsWarning:
74 {%- set influx_http_client_error_threshold = monitoring.http_errors_percentage %}
76 rate(influxdb_httpd_clientError[1m]) / rate(influxdb_httpd_req[1m]) * 100 > {{ influx_http_client_error_threshold }}
82 summary: "{%- endraw %}{{ influx_http_client_error_threshold }}{%- raw %}% of HTTP client errors"
83 description: "An average of {{ printf `%.1f` $value }}% of HTTP client requests on the {{ $labels.host }} node fail."
# Fires when failed point writes, as a percentage of all point writes
# (OK + failed + dropped, each taken as a 1-minute rate), exceed
# monitoring.failed_points_percentage.
84 InfluxdbHTTPPointsWritesFailWarning:
86 {%- set influx_http_points_written_fail_threshold = monitoring.failed_points_percentage %}
88 rate(influxdb_httpd_pointsWrittenFail[1m]) / (rate(influxdb_httpd_pointsWrittenOK[1m]) + rate(influxdb_httpd_pointsWrittenFail[1m]) + rate(influxdb_httpd_pointsWrittenDropped[1m])) * 100 > {{ influx_http_points_written_fail_threshold }}
94 summary: "{%- endraw %}{{ influx_http_points_written_fail_threshold }}{%- raw %}% of InfluxDB HTTP points writes fail"
95 description: "An average of {{ printf `%.1f` $value }}% of HTTP points writes on the {{ $labels.host }} node fail."
# Fires when dropped point writes, as a percentage of all point writes
# (OK + failed + dropped, each taken as a 1-minute rate), exceed
# monitoring.dropped_points_percentage.
96 InfluxdbHTTPPointsWritesDropWarning:
98 {%- set influx_http_points_written_dropped_threshold = monitoring.dropped_points_percentage %}
100 rate(influxdb_httpd_pointsWrittenDropped[1m]) / (rate(influxdb_httpd_pointsWrittenOK[1m]) + rate(influxdb_httpd_pointsWrittenFail[1m]) + rate(influxdb_httpd_pointsWrittenDropped[1m])) * 100 > {{ influx_http_points_written_dropped_threshold }}
106 summary: "{%- endraw %}{{ influx_http_points_written_dropped_threshold }}{%- raw %}% of InfluxDB HTTP points writes were dropped"
107 description: "An average of {{ printf `%.1f` $value }}% of HTTP points writes on the {{ $labels.host }} node were dropped."
{#- Relay buffer alert, rendered only when the relay and its telemetry are
    enabled. The buffer_size_mb value of every output backend of every relay
    listener is collected (0 when a backend does not set it) and the largest
    one, converted to bytes, is the reference size used in the expression.
    items() is used rather than iteritems() so the template also renders
    under Python 3. #}
110 {%- if relay.get('enabled', False) and relay.telemetry is defined and relay.telemetry.get('enabled') %}
111 {%- set buffer_sizes = [] %}
112 {%- for name, listen in relay.listen.items()|sort %}
113 {%- for backend_name, backend in listen.output.items()|sort %}
114 {%- do buffer_sizes.append(backend.get('buffer_size_mb', 0)|float) %}
117 {%- set buffer_sizes = buffer_sizes|sort %}
{#- With no listeners or backends the list is empty; fall back to 0 so the
    alert is simply skipped instead of failing on an undefined last element. #}
118 {%- set buffer_size = (buffer_sizes[-1] if buffer_sizes else 0) * 1024 * 1024 %}
119 {%- if buffer_size > 0 %}
120 InfluxdbRelayBufferFullWarning:
122 influxdb_relay_backend_buffer_bytes / {{ buffer_size }} * 100 > {{ monitoring.max_relay_buffer_percentage }}
126 service: influxdb-relay
128 summary: "InfluxDB Relay buffer is {%- endraw %} {{ monitoring.max_relay_buffer_percentage }}{%- raw %}% full"
129 description: "The InfluxDB Relay {{ $labels.host }}/{{ $labels.backend }} back-end buffer is {{ printf `%.1f` $value }}% full."
# Fires when the 1-minute rate of influxdb_relay_failed_requests_total, as a
# percentage of the 1-minute rate of influxdb_relay_requests_total, exceeds
# monitoring.relay_failed_requests_percentage.
132 InfluxdbRelayRequestsFailWarning:
134 rate(influxdb_relay_failed_requests_total[1m]) / rate(influxdb_relay_requests_total[1m]) * 100 > {{ monitoring.relay_failed_requests_percentage }}
138 service: influxdb-relay
140 summary: "{%- endraw %}{{ monitoring.relay_failed_requests_percentage }}{%- raw %}% of InfluxDB Relay requests fail"
141 description: "An average of {{ printf `%.1f` $value }}% of InfluxDB Relay requests on the {{ $labels.host }} node fail."
{#- Scrape target for the relay telemetry endpoint. The guard checks that
    relay.telemetry is defined before calling .get on it, matching the guard
    used for the relay alerts, so a relay without a telemetry section no
    longer breaks rendering. The telemetry bind address is used when it is
    set, does not start with 127 and is not 0.0.0.0; addresses from the
    fqdn_ip4 grain that do not start with 127 are collected as well
    (presumably as the fallback branch; the else line is not visible here).
    The first collected address is used for the target and for the relabel
    regex, whose replacement is the host grain. #}
145 {%- if relay.get('enabled') and relay.telemetry is defined and relay.telemetry.get('enabled') %}
146 {%- set addresses = [] %}
147 {%- if relay.telemetry.get('bind', {}).address is defined and not relay.telemetry.bind.address.startswith('127') and relay.telemetry.bind.address != '0.0.0.0' %}
148 {%- do addresses.append(relay.telemetry.bind.address) %}
150 {%- for address in grains['fqdn_ip4'] %}
151 {%- if not address.startswith('127') %}
152 {%- do addresses.append(address) %}
160 - address: {{ addresses[0] }}
161 port: {{ relay.telemetry.bind.port }}
163 - regex: {{ addresses[0] }}:{{ relay.telemetry.bind.port }}
164 replacement: {{ grains['host'] }}
165 source_labels: "__address__"