{%- if pillar.influxdb.server is defined %}
-{%- from "influxdb/map.jinja" import server with context %}
+{%- from "influxdb/map.jinja" import server, relay, monitoring with context %}
-{%- if server.get('enabled', False) %}
+{%- if server.get('enabled', False) or relay.get('enabled') %}
server:
alert:
- ProcstatRunningInfluxdb:
+{%- if server.get('http', {}).get('enabled', False) %}
+{%- raw %}
+ InfluxdbServiceDown:
+ if: >-
+ influxdb_up == 0
+ labels:
+ severity: minor
+ service: influxdb
+ annotations:
+ summary: "InfluxDB service is down"
+ description: "The InfluxDB service on the {{ $labels.host }} node is down."
+ InfluxdbServicesDownMinor:
+ {%- endraw %}
if: >-
- procstat_running{process_name="influxdb"} == 0
- {% raw %}
+ count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_warning_threshold_percent }}
+ {%- raw %}
+ labels:
+ severity: minor
+ service: influxdb
+ annotations:
+ summary: "{%- endraw %}{{ monitoring.service_failed_warning_threshold_percent*100 }}{%- raw %}% of InfluxDB services are down"
+ description: "{{ $value }} InfluxDB services are down (at least {%- endraw %} {{ monitoring.service_failed_warning_threshold_percent*100 }}{%- raw %}%)."
+ InfluxdbServicesDownMajor:
+ {%- endraw %}
+ if: >-
+ count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_critical_threshold_percent }}
+ {%- raw %}
+ labels:
+ severity: major
+ service: influxdb
+ annotations:
+ summary: "{%- endraw %}{{ monitoring.service_failed_critical_threshold_percent*100 }}{%- raw %}% of InfluxDB services are down"
+ description: "{{ $value }} InfluxDB services are down (at least {%- endraw %} {{ monitoring.service_failed_critical_threshold_percent*100 }}{%- raw %}%)."
+ InfluxdbServiceOutage:
+ if: >-
+ count(influxdb_up == 0) == count(influxdb_up)
+ labels:
+ severity: critical
+ service: influxdb
+ annotations:
+ summary: "InfluxDB service outage"
+ description: "All InfluxDB services are down."
+ InfluxdbSeriesMaxNumberWarning:
+ {%- endraw %}
+ {%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %}
+ if: >-
+ influxdb_database_numSeries >= {{ influx_max_series_threshold }}
+ {%- raw %}
labels:
severity: warning
service: influxdb
annotations:
- summary: 'Influxdb service is down'
- description: 'Influxdb service is down on node {{ $labels.host }}'
- {% endraw %}
-{%- if server.get('http', {}).get('enabled', False) %}
- InfluxdbHTTPClientError:
- {%- set influx_http_client_error_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPClientError', {}).get('var', {}).get('threshold', 5) %}
+ summary: "{%- endraw %}{{ influx_max_series_threshold }}{%- raw %} time series in {{ $labels.database }} database"
+ description: "The InfluxDB {{ $labels.database }} database contains {{ $value }} time series."
+ InfluxdbSeriesMaxNumberCritical:
+ {%- endraw %}
+ if: >-
+ influxdb_database_numSeries >= {{ server.data.max_series_per_database }}
+ {%- raw %}
+ labels:
+ severity: critical
+ service: influxdb
+ annotations:
+ summary: "Maximum number of time series in the {{ $labels.database }} database"
+ description: "The InfluxDB {{ $labels.database }} database contains {{ $value }} time series. No more series can be saved."
+ InfluxdbHTTPClientErrorsWarning:
+ {%- endraw %}
+ {%- set influx_http_client_error_threshold = monitoring.http_errors_percentage %}
if: >-
- rate(influxdb_httpd_clientError[2m]) / rate(influxdb_httpd_req[2m]) * 100 > {{ influx_http_client_error_threshold }}
- {% raw %}
+ rate(influxdb_httpd_clientError[1m]) / rate(influxdb_httpd_req[1m]) * 100 > {{ influx_http_client_error_threshold }}
+ {%- raw %}
labels:
severity: warning
service: influxdb
annotations:
- summary: 'Influxdb number of client errors is high'
- description: '{{ printf `%.1f` $value }}% of client requests are in error on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_client_error_threshold }}).'
- InfluxdbHTTPPointsWrittenFail:
- {%- set influx_http_points_written_fail_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPPointsWrittenFail', {}).get('var', {}).get('threshold', 5) %}
+ summary: "{%- endraw %}{{ influx_http_client_error_threshold }}{%- raw %}% of HTTP client errors"
+ description: "An average of {{ printf `%.1f` $value }}% of HTTP client requests on the {{ $labels.host }} node fail."
+ InfluxdbHTTPPointsWritesFailWarning:
+ {%- endraw %}
+ {%- set influx_http_points_written_fail_threshold = monitoring.failed_points_percentage %}
if: >-
- rate(influxdb_httpd_pointsWrittenFail[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_fail_threshold }}
- {% raw %}
+ rate(influxdb_httpd_pointsWrittenFail[1m]) / (rate(influxdb_httpd_pointsWrittenOK[1m]) + rate(influxdb_httpd_pointsWrittenFail[1m]) + rate(influxdb_httpd_pointsWrittenDropped[1m])) * 100 > {{ influx_http_points_written_fail_threshold }}
+ {%- raw %}
labels:
severity: warning
service: influxdb
annotations:
- summary: 'Influxdb too many failed writes'
- description: '{{ printf `%.1f` $value }}% of written points have failed on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_fail_threshold }}).'
- InfluxdbHTTPPointsWrittenDropped:
- {%- set influx_http_points_written_dropped_threshold = prometheus_server.get('alert', {}).get('InfluxdbHTTPPointsWrittenDropped', {}).get('var', {}).get('threshold', 5) %}
+ summary: "{%- endraw %}{{ influx_http_points_written_fail_threshold }}{%- raw %}% of HTTP points writes fail"
+ description: "An average of {{ printf `%.1f` $value }}% of HTTP points writes on the {{ $labels.host }} node fail."
+ InfluxdbHTTPPointsWritesDropWarning:
+ {%- endraw %}
+ {%- set influx_http_points_written_dropped_threshold = monitoring.dropped_points_percentage %}
if: >-
- rate(influxdb_httpd_pointsWrittenDropped[2m]) / rate(influxdb_httpd_pointsWrittenOK[2m]) * 100 > {{ influx_http_points_written_dropped_threshold }}
- {% raw %}
+ rate(influxdb_httpd_pointsWrittenDropped[1m]) / (rate(influxdb_httpd_pointsWrittenOK[1m]) + rate(influxdb_httpd_pointsWrittenFail[1m]) + rate(influxdb_httpd_pointsWrittenDropped[1m])) * 100 > {{ influx_http_points_written_dropped_threshold }}
+ {%- raw %}
labels:
severity: warning
service: influxdb
annotations:
- summary: 'Influxdb too many dropped writes'
- description: '{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_dropped_threshold }}).'
+ summary: "{%- endraw %}{{ influx_http_points_written_dropped_threshold }}{%- raw %}% of HTTP points writes were dropped"
+ description: "An average of {{ printf `%.1f` $value }}% of HTTP points writes on the {{ $labels.host }} node were dropped."
+{%- endraw %}
+
+{%- if relay.get('enabled', False) and relay.telemetry is defined and relay.telemetry.get('enabled') %}
+ {%- set buffer_sizes = [] %}
+ {%- for name, listen in relay.listen.iteritems()|sort %}
+ {%- for backend_name, backend in listen.output.iteritems()|sort %}
+ {%- do buffer_sizes.append(backend.get('buffer_size_mb', 0)|float) %}
+ {%- endfor %}
+ {%- endfor %}
+ {%- set buffer_sizes = buffer_sizes|sort %}
+ {%- set buffer_size = buffer_sizes[-1] * 1024 * 1024 %}
+ {%- if buffer_size > 0 %}
+ InfluxdbRelayBufferFullWarning:
+ if: >-
+ influxdb_relay_backend_buffer_bytes / {{ buffer_size }} * 100 > {{ monitoring.max_relay_buffer_percentage }}
+ {%- raw %}
+ labels:
+ severity: warning
+ service: influxdb-relay
+ annotations:
+ summary: "InfluxDB Relay buffer is {%- endraw %} {{ monitoring.max_relay_buffer_percentage }}{%- raw %}% full"
+ description: "The InfluxDB Relay {{ $labels.host }}/{{ $labels.backend }} back-end buffer is {{ $value }}% full."
+ {%- endraw %}
+ {%- endif %}
+ InfluxdbRelayRequestsFailWarning:
+ if: >-
+ rate(influxdb_relay_failed_requests_total[1m]) / rate(influxdb_relay_requests_total[1m]) * 100 > {{ monitoring.relay_failed_requests_percentage }}
+ {%- raw %}
+ labels:
+ severity: warning
+ service: influxdb-relay
+ annotations:
+ summary: "{%- endraw %}{{ monitoring.relay_failed_requests_percentage }}{%- raw %}% of requests fail"
+ description: "An average of {{ printf `%.1f` $value }}% of InfluxDB Relay requests on the {{ $labels.host }} node fail."
+{%- endraw %}
+{%- endif %}
+
+{%- if relay.get('enabled') and relay.telemetry.get('enabled') %}
+{%- set addresses = [] %}
+{%- if relay.telemetry.get('bind', {}).address is defined and not relay.telemetry.bind.address.startswith('127') and relay.telemetry.bind.address != '0.0.0.0' %}
+{%- do addresses.append(relay.telemetry.bind.address) %}
+{%- endif %}
+{%- for address in grains['fqdn_ip4'] %}
+{%- if not address.startswith('127') %}
+{%- do addresses.append(address) %}
+{%- endif %}
+{%- endfor %}
+ target:
+ static:
+ influxdb_relay:
+ enabled: true
+ endpoint:
+ - address: {{ addresses[0] }}
+ port: {{ relay.telemetry.bind.port }}
+ relabel_configs:
+ - regex: {{ addresses[0] }}:{{ relay.telemetry.bind.port }}
+ replacement: {{ grains['host'] }}
+ source_labels: "__address__"
+ target_label: "host"
+
+{%- endif %}
+
{%- endif %}
{%- endif %}
{%- endif %}