Split between local and remote Telegraf agents
The remote agent is now in charge of collecting the cluster health
metrics. This makes the alerting rules on cluster health more robust
because they don't have to aggregate metrics anymore.
Change-Id: I1dcd1801038e5e580402a70fb9325f39fac02c85
Depends-On: I473a916178ec4f9ea44951975f55fff5ee68e687
diff --git a/elasticsearch/files/telegraf.conf b/elasticsearch/files/telegraf.conf
new file mode 100644
index 0000000..a987469
--- /dev/null
+++ b/elasticsearch/files/telegraf.conf
@@ -0,0 +1,7 @@
+[[inputs.elasticsearch]]
+ servers = [{%- for server in values.servers|default([]) %}"{{ server }}"{%- if not loop.last%}, {% endif %} {%- endfor %}]
+ http_timeout = "{{ values.http_timeout|default("5s") }}"
+ local = {%- if values.local %}true{%- else %}false{%- endif %}
+ cluster_health = {%- if values.cluster_health %}true{%- else %}false{%- endif %}
+ cluster_stats = {%- if values.cluster_stats %}true{%- else %}false{%- endif %}
+{%- include 'telegraf/files/input/_filters.conf' %}
diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml
index 464111d..f0aa983 100644
--- a/elasticsearch/meta/prometheus.yml
+++ b/elasticsearch/meta/prometheus.yml
@@ -1,19 +1,13 @@
-{%- if pillar.elasticsearch.server is defined %}
-{% raw %}
+{%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %}
+{%- from "elasticsearch/map.jinja" import server, client with context %}
+
server:
alert:
- ElasticsearchDown:
- if: >-
- elasticsearch_up != 1
- labels:
- severity: warning
- service: elasticsearch
- annotations:
- summary: 'Elasticsearch service down'
- description: 'Elasticsearch service is down on node {{ $labels.host }}'
+{%- if client.get('enabled', False) %}
+{%- raw %}
ElasticsearchClusterHealthStatusYellow:
if: >-
- max_over_time(elasticsearch_cluster_health_status[5m]) == 2
+ elasticsearch_cluster_health_status == 2
labels:
severity: warning
service: elasticsearch
@@ -23,7 +17,7 @@
The Elasticsearch cluster status is YELLOW for the last 5 minutes.
ElasticsearchClusterHealthStatusRed:
if: >-
- max_over_time(elasticsearch_cluster_health_status[5m]) == 3
+ elasticsearch_cluster_health_status == 3
labels:
severity: critical
service: elasticsearch
@@ -31,6 +25,19 @@
summary: 'Elasticsearch cluster status is RED'
description: >-
The Elasticsearch cluster status is RED for the last 5 minutes.
+{%- endraw %}
+{%- endif %}
+{%- if server.get('enabled', False) %}
+{%- raw %}
+ ElasticsearchDown:
+ if: >-
+ elasticsearch_up{host=~'.*'} != 1
+ labels:
+ severity: warning
+ service: elasticsearch
+ annotations:
+ summary: 'Elasticsearch service down'
+ description: 'Elasticsearch service is down on node {{ $labels.host }}'
ElasticsearchClusterDiskLowWaterMark:
if: >-
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
@@ -54,5 +61,6 @@
labels:
severity: critical
service: elasticsearch
-{% endraw %}
+{%- endraw %}
+{%- endif %}
{%- endif %}
diff --git a/elasticsearch/meta/telegraf.yml b/elasticsearch/meta/telegraf.yml
index 3568601..6ccac3f 100644
--- a/elasticsearch/meta/telegraf.yml
+++ b/elasticsearch/meta/telegraf.yml
@@ -1,13 +1,31 @@
-{%- from "elasticsearch/map.jinja" import server with context %}
+{%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %}
+{%- from "elasticsearch/map.jinja" import server, client with context %}
+
{%- if server.get('enabled', False) %}
-{%- set address = server.get('bind', {}).get('address', '127.0.0.1') %}
-{%- set port = server.get('bind', {}).get('port', 9200) %}
-{%- set servers = ['http://{}:{}'.format(address, port)] %}
+{%- set bind = server.get('bind', {}) %}
+{# The local agent gathers the node's metrics + cluster stats if the node is master #}
agent:
input:
elasticsearch:
- servers: {{ servers|yaml }}
- local: true
- cluster_health: true
+ template: elasticsearch/files/telegraf.conf
+ servers:
+ - "http://{{ bind.address|default('127.0.0.1') }}:{{ bind.port|default(9200) }}"
+ cluster_health: false
cluster_stats: true
+ local: true
+{%- endif %}
+
+{%- if client.get('enabled', False) %}
+{# The remote agent gathers only the cluster health metrics #}
+remote_agent:
+ input:
+ elasticsearch:
+ template: elasticsearch/files/telegraf.conf
+ servers:
+ - "http://{{ client.server.host }}:{{ client.server.get('port', 9200) }}"
+ cluster_health: true
+ cluster_stats: false
+ local: false
+ namepass: [ 'elasticsearch_cluster_health*' ]
+{%- endif %}
{%- endif %}