From 29fb4553518f6471f322f0704d5c5fd23375afe0 Mon Sep 17 00:00:00 2001 From: Simon Pasquier Date: Thu, 13 Jul 2017 11:14:18 +0200 Subject: [PATCH] Split between local and remote Telegraf agents The remote agent is now in charge of collecting the cluster health metrics. This makes the alerting rules on cluster health more robust because they don't have to aggregate metrics anymore. Change-Id: I1dcd1801038e5e580402a70fb9325f39fac02c85 Depends-On: I473a916178ec4f9ea44951975f55fff5ee68e687 --- elasticsearch/files/telegraf.conf | 7 ++++++ elasticsearch/meta/prometheus.yml | 36 +++++++++++++++++++------------ elasticsearch/meta/telegraf.yml | 30 ++++++++++++++++++++------ 3 files changed, 53 insertions(+), 20 deletions(-) create mode 100644 elasticsearch/files/telegraf.conf diff --git a/elasticsearch/files/telegraf.conf b/elasticsearch/files/telegraf.conf new file mode 100644 index 0000000..a987469 --- /dev/null +++ b/elasticsearch/files/telegraf.conf @@ -0,0 +1,7 @@ +[[inputs.elasticsearch]] + servers = [{%- for server in values.servers|default([]) %}"{{ server }}"{%- if not loop.last%}, {% endif %} {%- endfor %}] + http_timeout = "{{ values.http_timeout|default("5s") }}" + local = {%- if values.local %}true{%- else %}false{%- endif %} + cluster_health = {%- if values.cluster_health %}true{%- else %}false{%- endif %} + cluster_stats = {%- if values.cluster_stats %}true{%- else %}false{%- endif %} +{%- include 'telegraf/files/input/_filters.conf' %} diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml index 464111d..f0aa983 100644 --- a/elasticsearch/meta/prometheus.yml +++ b/elasticsearch/meta/prometheus.yml @@ -1,19 +1,13 @@ -{%- if pillar.elasticsearch.server is defined %} -{% raw %} +{%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %} +{%- from "elasticsearch/map.jinja" import server, client with context %} + server: alert: - ElasticsearchDown: - if: >- - elasticsearch_up != 1 - labels: - severity: 
warning - service: elasticsearch - annotations: - summary: 'Elasticsearch service down' - description: 'Elasticsearch service is down on node {{ $labels.host }}' +{%- if client.get('enabled', False) %} +{%- raw %} ElasticsearchClusterHealthStatusYellow: if: >- - max_over_time(elasticsearch_cluster_health_status[5m]) == 2 + elasticsearch_cluster_health_status == 2 labels: severity: warning service: elasticsearch @@ -23,7 +17,7 @@ server: The Elasticsearch cluster status is YELLOW for the last 5 minutes. ElasticsearchClusterHealthStatusRed: if: >- - max_over_time(elasticsearch_cluster_health_status[5m]) == 3 + elasticsearch_cluster_health_status == 3 labels: severity: critical service: elasticsearch @@ -31,6 +25,19 @@ server: summary: 'Elasticsearch cluster status is RED' description: >- The Elasticsearch cluster status is RED for the last 5 minutes. +{%- endraw %} +{%- endif %} +{%- if server.get('enabled', False) %} +{%- raw %} + ElasticsearchDown: + if: >- + elasticsearch_up{host=~'.*'} != 1 + labels: + severity: warning + service: elasticsearch + annotations: + summary: 'Elasticsearch service down' + description: 'Elasticsearch service is down on node {{ $labels.host }}' ElasticsearchClusterDiskLowWaterMark: if: >- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85 @@ -54,5 +61,6 @@ server: labels: severity: critical service: elasticsearch -{% endraw %} +{%- endraw %} +{%- endif %} {%- endif %} diff --git a/elasticsearch/meta/telegraf.yml b/elasticsearch/meta/telegraf.yml index 3568601..6ccac3f 100644 --- a/elasticsearch/meta/telegraf.yml +++ b/elasticsearch/meta/telegraf.yml @@ -1,13 +1,31 @@ -{%- from "elasticsearch/map.jinja" import server with context %} +{%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %} +{%- from "elasticsearch/map.jinja" import server, 
client with context %} + {%- if server.get('enabled', False) %} -{%- set address = server.get('bind', {}).get('address', '127.0.0.1') %} -{%- set port = server.get('bind', {}).get('port', 9200) %} -{%- set servers = ['http://{}:{}'.format(address, port)] %} +{%- set bind = server.get('bind', {}) %} +{# The local agent gathers the node's metrics + cluster stats if the node is master #} agent: input: elasticsearch: - servers: {{ servers|yaml }} + template: elasticsearch/files/telegraf.conf + servers: + - "http://{{ bind.address|default('127.0.0.1') }}:{{ bind.port|default(9200) }}" + cluster_health: false + cluster_stats: true local: true +{%- endif %} + +{%- if client.get('enabled', False) %} +{# The remote agent gathers only the cluster health metrics #} +remote_agent: + input: + elasticsearch: + template: elasticsearch/files/telegraf.conf + servers: + - "http://{{ client.server.host }}:{{ client.server.get('port', 9200) }}" cluster_health: true - cluster_stats: true + cluster_stats: false + local: false + namepass: [ 'elasticsearch_cluster_health*' ] +{%- endif %} {%- endif %} -- 2.32.7