{#-
  Prometheus metadata for the InfluxDB formula.

  Renders, under the "server" key:
    * alert  - alerting rules for the InfluxDB HTTP service and, when relay
               telemetry is enabled, for InfluxDB Relay;
    * target - a static scrape target for the InfluxDB Relay telemetry
               endpoint.

  Thresholds are read from the "monitoring" mapping imported from
  influxdb/map.jinja. Text containing Prometheus template expressions
  (the $labels / $value placeholders) is wrapped in raw blocks so that
  Jinja passes it through untouched.

  NOTE(review): the relay alerts and the relay scrape target are nested
  inside the server.http.enabled condition, exactly as in the previous
  revision, so a relay-only node renders an empty "alert" mapping.
  Confirm whether that is intended.
#}
{%- if pillar.influxdb.server is defined %}
{%- from "influxdb/map.jinja" import server, relay, monitoring with context %}
{%- if server.get('enabled', False) or relay.get('enabled') %}
server:
  alert:
  {%- if server.get('http', {}).get('enabled', False) %}
  {%- raw %}
    InfluxdbServiceDown:
      if: >-
        influxdb_up == 0
      labels:
        severity: minor
        service: influxdb
      annotations:
        summary: "InfluxDB service is down"
        description: "The InfluxDB service on the {{ $labels.host }} node is down."
    InfluxdbServicesDownMinor:
  {%- endraw %}
      if: >-
        count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_warning_threshold_percent }}
  {%- raw %}
      labels:
        severity: minor
        service: influxdb
      annotations:
        summary: "{%- endraw %}{{ monitoring.service_failed_warning_threshold_percent*100 }}{%- raw %}% of InfluxDB services are down"
        description: "{{ $value }} InfluxDB services (>= {%- endraw %} {{ monitoring.service_failed_warning_threshold_percent*100 }}{%- raw %}%) are down."
    InfluxdbServicesDownMajor:
  {%- endraw %}
      if: >-
        count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_critical_threshold_percent }}
  {%- raw %}
      labels:
        severity: major
        service: influxdb
      annotations:
        summary: "{%- endraw %}{{ monitoring.service_failed_critical_threshold_percent*100 }}{%- raw %}% of InfluxDB services are down"
        description: "{{ $value }} InfluxDB services (>= {%- endraw %} {{ monitoring.service_failed_critical_threshold_percent*100 }}{%- raw %}%) are down."
    InfluxdbServiceOutage:
      if: >-
        count(influxdb_up == 0) == count(influxdb_up)
      labels:
        severity: critical
        service: influxdb
      annotations:
        summary: "InfluxDB service outage"
        description: "All InfluxDB services are down."
    InfluxdbSeriesMaxNumberWarning:
  {%- endraw %}
  {#- Warning fires at max_series_percentage percent of the per-database series limit. #}
  {%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %}
      if: >-
        influxdb_database_numSeries >= {{ influx_max_series_threshold }}
  {%- raw %}
      labels:
        severity: warning
        service: influxdb
      annotations:
        summary: "Reached{%- endraw %} {{ monitoring.max_series_percentage }}%{%- raw %} of time series in the InfluxDB {{ $labels.database }} database"
        description: "The InfluxDB {{ $labels.database }} database contains {{ $value }} time series."
    InfluxdbSeriesMaxNumberCritical:
  {%- endraw %}
      if: >-
        influxdb_database_numSeries >= {{ server.data.max_series_per_database }}
  {%- raw %}
      labels:
        severity: critical
        service: influxdb
      annotations:
        summary: "Reached maximum number of time series in the InfluxDB {{ $labels.database }} database"
        description: "The InfluxDB {{ $labels.database }} database contains {{ $value }} time series. No more series can be saved."
    InfluxdbHTTPClientErrorsWarning:
  {%- endraw %}
  {%- set influx_http_client_error_threshold = monitoring.http_errors_percentage %}
      if: >-
        rate(influxdb_httpd_clientError[1m]) / rate(influxdb_httpd_req[1m]) * 100 > {{ influx_http_client_error_threshold }}
  {%- raw %}
      labels:
        severity: warning
        service: influxdb
      annotations:
        summary: "{%- endraw %}{{ influx_http_client_error_threshold }}{%- raw %}% of HTTP client errors"
        description: "An average of {{ printf `%.1f` $value }}% of HTTP client requests on the {{ $labels.host }} node fail."
    InfluxdbHTTPPointsWritesFailWarning:
  {%- endraw %}
  {%- set influx_http_points_written_fail_threshold = monitoring.failed_points_percentage %}
      if: >-
        rate(influxdb_httpd_pointsWrittenFail[1m]) / (rate(influxdb_httpd_pointsWrittenOK[1m]) + rate(influxdb_httpd_pointsWrittenFail[1m]) + rate(influxdb_httpd_pointsWrittenDropped[1m])) * 100 > {{ influx_http_points_written_fail_threshold }}
  {%- raw %}
      labels:
        severity: warning
        service: influxdb
      annotations:
        summary: "{%- endraw %}{{ influx_http_points_written_fail_threshold }}{%- raw %}% of InfluxDB HTTP points writes fail"
        description: "An average of {{ printf `%.1f` $value }}% of HTTP points writes on the {{ $labels.host }} node fail."
    InfluxdbHTTPPointsWritesDropWarning:
  {%- endraw %}
  {%- set influx_http_points_written_dropped_threshold = monitoring.dropped_points_percentage %}
      if: >-
        rate(influxdb_httpd_pointsWrittenDropped[1m]) / (rate(influxdb_httpd_pointsWrittenOK[1m]) + rate(influxdb_httpd_pointsWrittenFail[1m]) + rate(influxdb_httpd_pointsWrittenDropped[1m])) * 100 > {{ influx_http_points_written_dropped_threshold }}
  {%- raw %}
      labels:
        severity: warning
        service: influxdb
      annotations:
        summary: "{%- endraw %}{{ influx_http_points_written_dropped_threshold }}{%- raw %}% of InfluxDB HTTP points writes were dropped"
        description: "An average of {{ printf `%.1f` $value }}% of HTTP points writes on the {{ $labels.host }} node were dropped."
  {%- endraw %}
  {%- if relay.get('enabled', False) and relay.telemetry is defined and relay.telemetry.get('enabled') %}
    {#-
      Collect buffer_size_mb of every relay output; the largest one (in
      bytes) is the denominator of the buffer-usage alert. items() is used
      instead of iteritems() so the template renders on Python 3 as well as
      Python 2, and missing listen/output mappings are treated as empty.
    #}
    {%- set buffer_sizes = [] %}
    {%- for name, listen in relay.get('listen', {}).items()|sort %}
      {%- for backend_name, backend in listen.get('output', {}).items()|sort %}
        {%- do buffer_sizes.append(backend.get('buffer_size_mb', 0)|float) %}
      {%- endfor %}
    {%- endfor %}
    {%- set buffer_sizes = buffer_sizes|sort %}
    {#- With no outputs configured the size is 0, which skips the buffer alert below. #}
    {%- set buffer_size = (buffer_sizes[-1] if buffer_sizes else 0) * 1024 * 1024 %}
    {%- if buffer_size > 0 %}
    InfluxdbRelayBufferFullWarning:
      if: >-
        influxdb_relay_backend_buffer_bytes / {{ buffer_size }} * 100 > {{ monitoring.max_relay_buffer_percentage }}
    {%- raw %}
      labels:
        severity: warning
        service: influxdb-relay
      annotations:
        summary: "InfluxDB Relay buffer is {%- endraw %} {{ monitoring.max_relay_buffer_percentage }}{%- raw %}% full"
        description: "The InfluxDB Relay {{ $labels.host }}/{{ $labels.backend }} back-end buffer is {{ $value }}% full."
    {%- endraw %}
    {%- endif %}
    InfluxdbRelayRequestsFailWarning:
      if: >-
        rate(influxdb_relay_failed_requests_total[1m]) / rate(influxdb_relay_requests_total[1m]) * 100 > {{ monitoring.relay_failed_requests_percentage }}
    {%- raw %}
      labels:
        severity: warning
        service: influxdb-relay
      annotations:
        summary: "{%- endraw %}{{ monitoring.relay_failed_requests_percentage }}{%- raw %}% of InfluxDB Relay requests fail"
        description: "An average of {{ printf `%.1f` $value }}% of InfluxDB Relay requests on the {{ $labels.host }} node fail."
    {%- endraw %}
  {%- endif %}
  {#- Same guard as the relay alerts above: telemetry must be defined before it is queried. #}
  {%- if relay.get('enabled') and relay.telemetry is defined and relay.telemetry.get('enabled') %}
    {#-
      Candidate scrape addresses: the explicit telemetry bind address when it
      is neither a loopback (127.*) nor the 0.0.0.0 wildcard, followed by
      every non-loopback address from the fqdn_ip4 grain. The first
      candidate is used.
    #}
    {%- set addresses = [] %}
    {%- if relay.telemetry.get('bind', {}).address is defined and not relay.telemetry.bind.address.startswith('127') and relay.telemetry.bind.address != '0.0.0.0' %}
      {%- do addresses.append(relay.telemetry.bind.address) %}
    {%- endif %}
    {%- for address in grains.get('fqdn_ip4', []) %}
      {%- if not address.startswith('127') %}
        {%- do addresses.append(address) %}
      {%- endif %}
    {%- endfor %}
    {#- Emit the target only when a usable address exists; otherwise it would render with an empty address. #}
    {%- if addresses %}
  target:
    static:
      influxdb_relay:
        enabled: true
        endpoint:
          - address: {{ addresses[0] }}
            port: {{ relay.telemetry.bind.port }}
        relabel_configs:
          - regex: {{ addresses[0] }}:{{ relay.telemetry.bind.port }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
    {%- endif %}
  {%- endif %}
  {%- endif %}
{%- endif %}
{%- endif %}