The remote agent is now in charge of collecting the cluster health
metrics. This makes the alerting rules on cluster health more robust
because they don't have to aggregate metrics anymore.
Change-Id: I1dcd1801038e5e580402a70fb9325f39fac02c85
Depends-On: I473a916178ec4f9ea44951975f55fff5ee68e687
--- /dev/null
+[[inputs.elasticsearch]]
+ servers = [{%- for server in values.servers|default([]) %}"{{ server }}"{%- if not loop.last%}, {% endif %} {%- endfor %}]
+ http_timeout = "{{ values.http_timeout|default("5s") }}"
+ local = {%- if values.local %}true{%- else %}false{%- endif %}
+ cluster_health = {%- if values.cluster_health %}true{%- else %}false{%- endif %}
+ cluster_stats = {%- if values.cluster_stats %}true{%- else %}false{%- endif %}
+{%- include 'telegraf/files/input/_filters.conf' %}
-{%- if pillar.elasticsearch.server is defined %}
-{% raw %}
+{%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %}
+{%- from "elasticsearch/map.jinja" import server, client with context %}
+
- ElasticsearchDown:
- if: >-
- elasticsearch_up != 1
- labels:
- severity: warning
- service: elasticsearch
- annotations:
- summary: 'Elasticsearch service down'
- description: 'Elasticsearch service is down on node {{ $labels.host }}'
+{%- if client.get('enabled', False) %}
+{%- raw %}
ElasticsearchClusterHealthStatusYellow:
if: >-
ElasticsearchClusterHealthStatusYellow:
if: >-
- max_over_time(elasticsearch_cluster_health_status[5m]) == 2
+ elasticsearch_cluster_health_status == 2
labels:
severity: warning
service: elasticsearch
labels:
severity: warning
service: elasticsearch
The Elasticsearch cluster status is YELLOW for the last 5 minutes.
ElasticsearchClusterHealthStatusRed:
if: >-
The Elasticsearch cluster status is YELLOW for the last 5 minutes.
ElasticsearchClusterHealthStatusRed:
if: >-
- max_over_time(elasticsearch_cluster_health_status[5m]) == 3
+ elasticsearch_cluster_health_status == 3
labels:
severity: critical
service: elasticsearch
labels:
severity: critical
service: elasticsearch
summary: 'Elasticsearch cluster status is RED'
description: >-
The Elasticsearch cluster status is RED for the last 5 minutes.
summary: 'Elasticsearch cluster status is RED'
description: >-
The Elasticsearch cluster status is RED for the last 5 minutes.
+{%- endraw %}
+{%- endif %}
+{%- if server.get('enabled', False) %}
+{%- raw %}
+ ElasticsearchDown:
+ if: >-
+ elasticsearch_up{host=~'.*'} != 1
+ labels:
+ severity: warning
+ service: elasticsearch
+ annotations:
+ summary: 'Elasticsearch service down'
+ description: 'Elasticsearch service is down on node {{ $labels.host }}'
ElasticsearchClusterDiskLowWaterMark:
if: >-
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
ElasticsearchClusterDiskLowWaterMark:
if: >-
(max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
labels:
severity: critical
service: elasticsearch
labels:
severity: critical
service: elasticsearch
+{%- endraw %}
+{%- endif %}
-{%- from "elasticsearch/map.jinja" import server with context %}
+{%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %}
+{%- from "elasticsearch/map.jinja" import server, client with context %}
+
{%- if server.get('enabled', False) %}
{%- if server.get('enabled', False) %}
-{%- set address = server.get('bind', {}).get('address', '127.0.0.1') %}
-{%- set port = server.get('bind', {}).get('port', 9200) %}
-{%- set servers = ['http://{}:{}'.format(address, port)] %}
+{%- set bind = server.get('bind', {}) %}
+{# The local agent gathers the node's metrics + cluster stats if the node is master #}
agent:
input:
elasticsearch:
agent:
input:
elasticsearch:
- servers: {{ servers|yaml }}
+ template: elasticsearch/files/telegraf.conf
+ servers:
+ - "http://{{ bind.address|default('127.0.0.1') }}:{{ bind.port|default(9200) }}"
+ # Set explicitly: elasticsearch/files/telegraf.conf renders "local = false"
+ # when this key is absent, which would make every node poll the stats of
+ # all cluster nodes instead of only its own.
+ local: true
+ cluster_health: false
+ cluster_stats: true
+{%- endif %}
+
+{%- if client.get('enabled', False) %}
+{# The remote agent gathers only the cluster health metrics #}
+remote_agent:
+ input:
+ elasticsearch:
+ template: elasticsearch/files/telegraf.conf
+ servers:
+ - "http://{{ client.server.host }}:{{ client.server.get('port', 9200) }}"
+ # Must be enabled explicitly: elasticsearch/files/telegraf.conf renders
+ # "cluster_health = false" when this key is absent, and the namepass
+ # filter below keeps only the cluster health metrics.
+ cluster_health: true
+ cluster_stats: false
+ local: false
+ namepass: [ 'elasticsearch_cluster_health*' ]
+{%- endif %}