Add alerts on the number of series
Change-Id: Iea33775b44390876aff5e00505b74aede07e0f20
diff --git a/influxdb/files/influxdb.conf b/influxdb/files/influxdb.conf
index ecddad0..181ba43 100644
--- a/influxdb/files/influxdb.conf
+++ b/influxdb/files/influxdb.conf
@@ -60,7 +60,7 @@
wal-dir = "{{ server.data.wal_dir }}"
trace-logging-enabled = {{ server.data.trace_logging_enabled|default('false')|lower }}
query-log-enabled = {{ server.data.query_log_enabled | lower }}
- max-series-per-database = {{ server.data.max_series_per_database|default(1000000) }}
+ max-series-per-database = {{ server.data.max_series_per_database }}  # default value is defined in influxdb/map.jinja
cache-max-memory-size = {{ server.data.cache_max_memory_size|default(1048576000) }}
cache-snapshot-memory-size = {{ server.data.cache_snapshot_memory_size|default(26214400) }}
cache-snapshot-write-cold-duration = "{{ server.data.cache_snapshot_write_cold_duration|default('10m') }}"
diff --git a/influxdb/map.jinja b/influxdb/map.jinja
index c059cf4..d2d4a89 100644
--- a/influxdb/map.jinja
+++ b/influxdb/map.jinja
@@ -27,6 +27,7 @@
dir: '/var/lib/influxdb/data'
wal_dir: '/var/lib/influxdb/wal'
query_log_enabled: false
+ max_series_per_database: 1000000  # rendered as max-series-per-database in influxdb.conf; also the base for the series-count alert thresholds
meta:
enabled: true
dir: '/var/lib/influxdb/meta'
@@ -36,6 +37,7 @@
{%- set monitoring = salt['grains.filter_by']({
'default': {
+ 'max_series_percentage': 95,
'http_errors_percentage': 5,
'failed_points_percentage': 5,
'dropped_points_percentage': 5,
diff --git a/influxdb/meta/prometheus.yml b/influxdb/meta/prometheus.yml
index 02573e6..54a8b13 100644
--- a/influxdb/meta/prometheus.yml
+++ b/influxdb/meta/prometheus.yml
@@ -5,18 +5,43 @@
server:
alert:
{%- if server.get('http', {}).get('enabled', False) %}
- InfluxdbQDown:
+ InfluxdbDown:
if: >-
influxdb_up != 1
labels:
severity: warning
service: influxdb
annotations:
+ {%- raw %}
summary: 'InfluxDB service down'
- {% raw %}
description: 'InfluxDB service is down on node {{ $labels.host }}'
- {% endraw %}
- InfluxdbHTTPClientError:
+ {%- endraw %}
+ {#- InfluxDB treats max-series-per-database = 0 as "no limit", so the series alerts are only emitted for a positive limit (otherwise ">= 0" would always fire). #}
+ {%- if server.data.max_series_per_database > 0 %}
+ InfluxdbSeriesNumberHigh:
+ {#- Jinja's "/" is true division; cast to int so the threshold renders without a trailing ".0". #}
+ {%- set influx_max_series_threshold = (monitoring.max_series_percentage * server.data.max_series_per_database / 100) | int %}
+ if: >-
+ influxdb_database_numSeries >= {{ influx_max_series_threshold }}
+ labels:
+ severity: warning
+ service: influxdb
+ annotations:
+ {% raw %}
+ summary: 'InfluxDB high number of series for {{ $labels.database }}'
+ description: 'The InfluxDB {{ $labels.database }} database is getting close to the maximum number of series (value={{ $value }}{%- endraw %},threshold={{ influx_max_series_threshold }}).'
+ InfluxdbSeriesNumberTooHigh:
+ if: >-
+ influxdb_database_numSeries >= {{ server.data.max_series_per_database }}
+ labels:
+ severity: critical
+ service: influxdb
+ annotations:
+ {% raw %}
+ summary: 'InfluxDB too many series for {{ $labels.database }}'
+ description: 'The InfluxDB {{ $labels.database }} database has exceeded the maximum number of series (value={{ $value }}{%- endraw %},threshold={{ server.data.max_series_per_database }}).'
+ {%- endif %}
+ InfluxdbHTTPClientErrors:
{%- set influx_http_client_error_threshold = monitoring.http_errors_percentage %}
if: >-
rate(influxdb_httpd_clientError[2m]) / rate(influxdb_httpd_req[2m]) * 100 > {{ influx_http_client_error_threshold }}