From: Petr Michalec Date: Thu, 7 Dec 2017 11:01:09 +0000 (+0100) Subject: Merge pull request #25 from Perceptyx/master X-Git-Url: https://gerrit.mcp.mirantis.com/gitweb?p=salt-formulas%2Finfluxdb.git;a=commitdiff_plain;h=a054a52f03659cbfdd2b8cd7aa9aab6fbae6ff7f;hp=9bdd2560a41df0d453cc924c2749d0ddc522f7a3 Merge pull request #25 from Perceptyx/master Ensure custom paths are created and have correct permissions --- diff --git a/.kitchen.travis.yml b/.kitchen.travis.yml new file mode 100644 index 0000000..6bcad13 --- /dev/null +++ b/.kitchen.travis.yml @@ -0,0 +1,6 @@ +suites: + + - name: <%= ENV['SUITE'] %> + provisioner: + pillars-from-files: + influxdb.sls: tests/pillar/<%= ENV['SUITE'] %>.sls diff --git a/.travis.yml b/.travis.yml index 7a77247..2e36211 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,15 +17,19 @@ install: - bundle install env: - - PLATFORM=trevorj/salty-whales:trusty - - PLATFORM=trevorj/salty-whales:xenial + - PLATFORM=trevorj/salty-whales:trusty SUITE=client + - PLATFORM=trevorj/salty-whales:xenial SUITE=client + - PLATFORM=trevorj/salty-whales:trusty SUITE=cluster + - PLATFORM=trevorj/salty-whales:xenial SUITE=cluster + - PLATFORM=trevorj/salty-whales:trusty SUITE=single + - PLATFORM=trevorj/salty-whales:xenial SUITE=single before_script: - set -o pipefail - make test | tail script: - - test ! -e .kitchen.yml || bundle exec kitchen test -t tests/integration + - KITCHEN_LOCAL_YAML=.kitchen.travis.yml bundle exec kitchen test -t tests/integration notifications: webhooks: diff --git a/README.rst b/README.rst index 2b8349d..769bb2d 100644 --- a/README.rst +++ b/README.rst @@ -264,6 +264,34 @@ InfluxDB client for configuring databases, users and retention policies: database: mydb1 privilege: all +InfluxDB relay with HTTP outputs: + +.. code-block:: yaml + + influxdb: + relay: + enabled: true + telemetry: + enabled: true + bind: + address: 127.0.0.1 + port: 9196 + listen: + http_backend: + type: http + bind: + address: 127.0.0.1 + port: 9096 + output: + server1: + location: http://server1:8086/write + timeout: 20s + buffer_size_mb: 512 + max_batch_kb: 1024 + max_delay_interval: 30s + server2: + location: http://server2:8086/write + Read more ========= diff --git a/influxdb/files/grafana_dashboards/influxdb_prometheus.json b/influxdb/files/grafana_dashboards/influxdb_prometheus.json index 9b5291f..9b3d0fa 100644 --- a/influxdb/files/grafana_dashboards/influxdb_prometheus.json +++ b/influxdb/files/grafana_dashboards/influxdb_prometheus.json @@ -11,6 +11,334 @@ "links": [], "refresh": "1m", "rows": [ + { + "collapse": false, + "height": 250, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": null, + "decimals": 0, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "count_scalar(influxdb_up)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "", + "title": "Total instances", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": null, + "decimals": 0, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "count_scalar(influxdb_up == 1)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "", + "title": "Running instances", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": null, + "decimals": 0, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "count_scalar(influxdb_up == 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "", + "title": "Stopped instances", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": null, + "decimals": 0, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 10, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "min(haproxy_active_servers{proxy=~\"influxdb-backend\", sv=\"BACKEND\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60 + } + ], + "thresholds": "", + "title": "InfluxDB backends", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "General", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -348,7 +676,7 @@ } ], "thresholds": "", - "title": "Go routines", + "title": "Goroutines", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -562,10 +890,10 @@ "multi": false, "name": "server", "options": [], - "query": "label_values(influxdb_httpd_authFail, host)", + "query": "label_values(influxdb_up, host)", "refresh": 1, "regex": "", - "sort": 0, + "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", diff --git a/influxdb/files/grafana_dashboards/influxdb_relay_prometheus.json b/influxdb/files/grafana_dashboards/influxdb_relay_prometheus.json new file mode 100644 index 0000000..f7f4e9b --- /dev/null +++ b/influxdb/files/grafana_dashboards/influxdb_relay_prometheus.json @@ -0,0 +1,439 @@ +{% raw %} +{ + "annotations": { + "list": [] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": "1m", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 0, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(influxdb_relay_requests_total{instance=~\"$instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "total ({{instance}})", + "refId": "A", + "step": 10 + }, + { + "expr": "irate(influxdb_relay_failed_requests_total{instance=~\"$instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "failed ({{instance}})", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Requests", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(influxdb_relay_received_points_total{instance=~\"$instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Received points", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "wps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Instance metrics", + "titleSize": "h6" + }, + { + "collapse": false, + "height": 250, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(influxdb_relay_backend_sent_bytes_total{backend=~\"$backend\",instance=~\"$instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "sent ({{instance}} -> {{backend}})", + "refId": "A", + "step": 10 + }, + { + "expr": "irate(influxdb_relay_backend_failed_bytes_total{backend=~\"$backend\",instance=~\"$instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "failed ({{instance}} -> {{backend}})", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 0, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "influxdb_relay_backend_buffer_bytes{backend=~\"$backend\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} -> {{backend}}", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Buffer", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Backend metrics", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(influxdb_relay_backend_buffer_bytes, instance)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "backend", + "options": [], + "query": "label_values(influxdb_relay_backend_buffer_bytes, backend)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "InfluxDB Relay", + "version": 1 +} +{% endraw %} \ No newline at end of file diff --git a/influxdb/files/influxdb-relay.conf b/influxdb/files/influxdb-relay.conf new file mode 100644 index 0000000..06fd210 --- /dev/null +++ b/influxdb/files/influxdb-relay.conf @@ -0,0 +1,59 @@ +{%- from "influxdb/map.jinja" import relay with context -%} + +{%- if relay.get('enabled') %} + +{%- if relay.telemetry.get('enabled') %} +[telemetry] +bind-addr = "{{ relay.telemetry.bind.get('address', '') }}:{{ relay.telemetry.bind.port }}" +{%- endif %} + +{%- for name, listen in relay.listen.iteritems()|sort %} + +{%- if listen.get('enabled', True) and listen.get('type', 'http') in ('http', 'udp') %} + +{%- set listen_type = listen.get('type', 'http') %} +[[{{ listen_type }}]] +name = "{{ name }}" +bind-addr = "{{ listen.bind.get('address', '') }}:{{ listen.bind.port }}" +{%- if listen_type == 'http' and listen.default_retention_policy is defined %} +default-retention-policy = "{{ listen.default_retention_policy }}" +{%- endif %} +{%- if listen_type == 'udp' and listen.precision is defined %} +precision = "{{ listen.precision }}" +{%- endif %} +{%- if listen_type == 'udp' and listen.read_buffer is defined %} +read-buffer = {{ listen.read_buffer|int }} +{%- endif %} + +{%- set outputs = [] %} +{%- for output_name, output in listen.get('output', {}).iteritems()|sort %} + {%- set tmp = ['name = "{}"'.format(output_name), 'location = "{}"'.format( output.location)] %} + {%- if listen_type == 'http' and output.timeout is defined %} + {%- do tmp.append('timeout = "{}"'.format(output.timeout)) %} + {%- endif %} + {%- if listen_type == 'http' and output.buffer_size_mb is defined %} + {%- do tmp.append('buffer-size-mb = {}'.format(output.buffer_size_mb)) %} + {%- endif %} + {%- if listen_type == 'http' and output.max_batch_kb is defined %} + {%- do tmp.append('max-batch-kb = {}'.format(output.max_batch_kb)) %} + {%- endif %} + {%- if listen_type == 'http' and output.max_delay_interval is defined %} + {%- do tmp.append('max-delay-interval = "{}"'.format(output.max_delay_interval)) %} + {%- endif %} + {%- if listen_type == 'udp' and output.mtu is defined %} + {%- do tmp.append('mtu = {}'.format(output.mtu)) %} + {%- endif %} + {%- do outputs.append(tmp) %} +{%- endfor %} + +output = [ +{%- for output in outputs %} + { {{ output|join(', ') }} }, +{%- endfor %} +] + +{%- endif %} + +{%- endfor %} + +{%- endif %} diff --git a/influxdb/init.sls b/influxdb/init.sls index a878715..95c7731 100644 --- a/influxdb/init.sls +++ b/influxdb/init.sls @@ -6,4 +6,7 @@ include: {%- if pillar.influxdb.client is defined %} - influxdb.client {%- endif %} +{%- if pillar.influxdb.relay is defined %} +- influxdb.relay +{%- endif %} {%- endif %} diff --git a/influxdb/map.jinja b/influxdb/map.jinja index 2c61594..83047b6 100644 --- a/influxdb/map.jinja +++ b/influxdb/map.jinja @@ -43,6 +43,10 @@ default: 'http_errors_percentage': 5, 'failed_points_percentage': 5, 'dropped_points_percentage': 5, + 'max_relay_buffer_percentage': 70, + 'relay_failed_requests_percentage': 5, + 'service_failed_warning_threshold_percent': 0.3, + 'service_failed_critical_threshold_percent': 0.6, }, }, grain='os_family', merge=salt['pillar.get']('influxdb:monitoring')) %} @@ -50,3 +54,12 @@ default: 'default': { }, }, merge=salt['pillar.get']('influxdb:client')) %} + +{%- set relay = salt['grains.filter_by']({ + 'default': { + 'pkgs': ['influxdb-relay'], + 'service': 'influxdb-relay', + 'listen': {}, + 'telemetry': {}, + }, +}, merge=salt['pillar.get']('influxdb:relay')) %} diff --git a/influxdb/meta/grafana.yml b/influxdb/meta/grafana.yml index 74c3f9e..0dead01 100644 --- a/influxdb/meta/grafana.yml +++ b/influxdb/meta/grafana.yml @@ -7,3 +7,22 @@ dashboard: datasource: influxdb format: json template: influxdb/files/grafana_dashboards/influxdb_influxdb.json + influxdb_relay_prometheus: + datasource: prometheus + format: json + template: influxdb/files/grafana_dashboards/influxdb_relay_prometheus.json + main_prometheus: + datasource: prometheus + row: + ost-middleware: + title: Middleware + panel: + influxdb: + title: InfluxDB + links: + - dashboard: InfluxDB + title: InfluxDB + type: dashboard + target: + cluster_status: + expr: avg(influxdb_up) by (name) \ No newline at end of file diff --git a/influxdb/meta/prometheus.yml b/influxdb/meta/prometheus.yml index 54a8b13..affbd77 100644 --- a/influxdb/meta/prometheus.yml +++ b/influxdb/meta/prometheus.yml @@ -1,21 +1,48 @@ {%- if pillar.influxdb.server is defined %} -{%- from "influxdb/map.jinja" import server, monitoring with context %} +{%- from "influxdb/map.jinja" import server, relay, monitoring with context %} -{%- if server.get('enabled', False) %} +{%- if server.get('enabled', False) or relay.get('enabled') %} server: alert: {%- if server.get('http', {}).get('enabled', False) %} - InfluxdbDown: + InfluxdbInfo: if: >- - influxdb_up != 1 + influxdb_up == 0 labels: - severity: warning + severity: info service: influxdb annotations: {%- raw %} summary: 'InfluxDB service down' description: 'InfluxDB service is down on node {{ $labels.host }}' {%- endraw %} + InfluxdbWarning: + if: >- + count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_warning_threshold_percent }} + labels: + severity: warning + service: influxdb + annotations: + summary: 'More than {{monitoring.service_failed_warning_threshold_percent*100}}% of InfluxDB services are down' + description: 'More than {{monitoring.service_failed_warning_threshold_percent*100}}% of InfluxDB services are down' + InfluxdbCritical: + if: >- + count(influxdb_up == 0) >= count(influxdb_up) * {{ monitoring.service_failed_critical_threshold_percent }} + labels: + severity: critical + service: influxdb + annotations: + summary: 'More than {{monitoring.service_failed_critical_threshold_percent*100}}% of InfluxDB services are down' + description: 'More than {{monitoring.service_failed_critical_threshold_percent*100}}% of InfluxDB services are down' + InfluxdbDown: + if: >- + count(influxdb_up == 0) == count(influxdb_up) + labels: + severity: down + service: influxdb + annotations: + summary: 'All InfluxDB services are down' + description: 'All InfluxDB services are down' InfluxdbSeriesNumberHigh: {%- set influx_max_series_threshold = monitoring.max_series_percentage * server.data.max_series_per_database / 100 %} if: >- @@ -70,6 +97,64 @@ server: annotations: summary: 'Influxdb too many dropped writes' description: '{{ printf `%.1f` $value }}% of written points have been dropped on {{ $labels.host }} (threshold={%- endraw %}{{ influx_http_points_written_dropped_threshold }}).' +{%- if relay.get('enabled', False) and relay.telemetry is defined and relay.telemetry.get('enabled') %} + {%- set buffer_sizes = [] %} + {%- for name, listen in relay.listen.iteritems()|sort %} + {%- for backend_name, backend in listen.output.iteritems()|sort %} + {%- do buffer_sizes.append(backend.get('buffer_size_mb', 0)|float) %} + {%- endfor %} + {%- endfor %} + {%- set buffer_sizes = buffer_sizes|sort %} + {%- set buffer_size = buffer_sizes[-1] * 1024 * 1024 %} + {%- if buffer_size > 0 %} + InfluxdbRelayBufferNearFull: + {%- set influx_relay_buffer_size_threshold = monitoring.max_relay_buffer_percentage %} + if: >- + influxdb_relay_backend_buffer_bytes > {{ buffer_size }} * {{ influx_relay_buffer_size_threshold }} / 100 + {% raw %} + labels: + severity: warning + service: influxdb-relay + annotations: + summary: 'InfluxDB Relay buffer almost full' + description: 'The buffer size for the {{ $labels.instance }}/{{ $labels.backend }} backend is getting full (current value={{ $value }} bytes, threshold={%- endraw %}{{ buffer_size * influx_relay_buffer_size_threshold / 100 }}).' + {%- endif %} + InfluxdbRelayFailedRequests: + {%- set influx_relay_failed_requests_threshold = monitoring.relay_failed_requests_percentage %} + if: >- + rate(influxdb_relay_failed_requests_total[5m]) / rate(influxdb_relay_requests_total[5m]) * 100 > {{ influx_relay_failed_requests_threshold }} + {% raw %} + labels: + severity: warning + service: influxdb-relay + annotations: + summary: 'InfluxDB Relay too many failed requests' + description: '{{ printf `%.1f` $value }}% of requests have been dropped on {{ $labels.instance }} (threshold={%- endraw %}{{ influx_relay_failed_requests_threshold }}).' + +{%- endif %} + +{%- if relay.get('enabled') and relay.telemetry.get('enabled') %} + +{%- set addresses = [] %} +{%- if relay.telemetry.get('bind', {}).address is defined and not relay.telemetry.bind.address.startswith('127') and relay.telemetry.bind.address != '0.0.0.0' %} +{%- do addresses.append(relay.telemetry.bind.address) %} +{%- endif %} +{%- for address in grains['fqdn_ip4'] %} +{%- if not address.startswith('127') %} +{%- do addresses.append(address) %} {%- endif %} +{%- endfor %} + + target: + static: + influxdb_relay: + enabled: true + endpoint: + - address: {{ addresses[0] }} + port: {{ relay.telemetry.bind.port }} + +{%- endif %} + {%- endif %} {%- endif %} +{%- endif %} \ No newline at end of file diff --git a/influxdb/relay.sls b/influxdb/relay.sls new file mode 100644 index 0000000..58f0413 --- /dev/null +++ b/influxdb/relay.sls @@ -0,0 +1,26 @@ +{%- from "influxdb/map.jinja" import relay with context %} +{%- if relay.get('enabled') %} + +influxdb_relay_packages: + pkg.installed: + - names: {{ relay.pkgs }} + +influxdb_relay_config: + file.managed: + - name: //etc/influxdb-relay/influxdb-relay.conf + - source: salt://influxdb/files/influxdb-relay.conf + - template: jinja + - require: + - pkg: influxdb_relay_packages + +influxdb_relay_service: + service.running: + - enable: true + - name: {{ relay.service }} +{%- if grains.get('noservices') %} + - onlyif: /bin/false +{%- endif %} + - watch: + - file: influxdb_relay_config + +{%- endif %} diff --git a/metadata/service/relay/cluster.yml b/metadata/service/relay/cluster.yml new file mode 100644 index 0000000..0dbe9b4 --- /dev/null +++ b/metadata/service/relay/cluster.yml @@ -0,0 +1,43 @@ +applications: +- influxdb +classes: +- service.influxdb.support +parameters: + _param: + influxdb_relay_timeout: 10s + influxdb_relay_buffer_size_mb: 512 + influxdb_relay_max_batch_kb: 512 + influxdb_relay_max_delay_inteval: 10s + influxdb: + relay: + enabled: true + telemetry: + enabled: true + bind: + address: ${_param:cluster_local_address} + port: 9196 + listen: + http: + type: http + bind: + address: ${_param:cluster_local_address} + port: 9096 + output: + influxdb01: + location: http://${_param:cluster_node01_address}:8086/write + timeout: ${_param:influxdb_relay_timeout} + buffer_size_mb: ${_param:influxdb_relay_buffer_size_mb} + max_batch_kb: ${_param:influxdb_relay_max_batch_kb} + max_delay_interval: ${_param:influxdb_relay_max_delay_inteval} + influxdb02: + location: http://${_param:cluster_node02_address}:8086/write + timeout: ${_param:influxdb_relay_timeout} + buffer_size_mb: ${_param:influxdb_relay_buffer_size_mb} + max_batch_kb: ${_param:influxdb_relay_max_batch_kb} + max_delay_interval: ${_param:influxdb_relay_max_delay_inteval} + influxdb03: + location: http://${_param:cluster_node03_address}:8086/write + timeout: ${_param:influxdb_relay_timeout} + buffer_size_mb: ${_param:influxdb_relay_buffer_size_mb} + max_batch_kb: ${_param:influxdb_relay_max_batch_kb} + max_delay_interval: ${_param:influxdb_relay_max_delay_inteval} diff --git a/tests/pillar/relay.sls b/tests/pillar/relay.sls new file mode 100644 index 0000000..cf7e866 --- /dev/null +++ b/tests/pillar/relay.sls @@ -0,0 +1,34 @@ +influxdb: + relay: + enabled: true + telemetry: + enabled: true + bind: + address: 127.0.0.1 + port: 9196 + listen: + http_backend: + type: http + bind: + address: 127.0.0.1 + port: 9096 + output: + server1: + location: http://server1:8086/write + timeout: 20s + buffer_size_mb: 512 + max_batch_kb: 1024 + max_delay_interval: 30s + server2: + location: http://server2:8086/write + udp_backend: + type: udp + bind: + address: 127.0.0.1 + port: 9196 + output: + server1: + location: http://server1:8086/write + mtu: 1500 + server2: + location: http://server2:8086/write diff --git a/tests/run_tests.sh b/tests/run_tests.sh index a4cac88..29fb975 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -6,11 +6,13 @@ set -e CURDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" METADATA=${CURDIR}/../metadata.yml FORMULA_NAME=$(cat $METADATA | python -c "import sys,yaml; print yaml.load(sys.stdin)['name']") +FORMULA_META_DIR=${CURDIR}/../${FORMULA_NAME}/meta ## Overrideable parameters PILLARDIR=${PILLARDIR:-${CURDIR}/pillar} BUILDDIR=${BUILDDIR:-${CURDIR}/build} VENV_DIR=${VENV_DIR:-${BUILDDIR}/virtualenv} +MOCK_BIN_DIR=${MOCK_BIN_DIR:-${CURDIR}/mock_bin} DEPSDIR=${BUILDDIR}/deps SALT_FILE_DIR=${SALT_FILE_DIR:-${BUILDDIR}/file_root} @@ -40,6 +42,15 @@ setup_virtualenv() { pip install salt${PIP_SALT_VERSION} } +setup_mock_bin() { + # If some state requires a binary, a lightweight replacement for + # such binary can be put into MOCK_BIN_DIR for test purposes + if [ -d "${MOCK_BIN_DIR}" ]; then + PATH="${MOCK_BIN_DIR}:$PATH" + export PATH + fi +} + setup_pillar() { [ ! -d ${SALT_PILLAR_DIR} ] && mkdir -p ${SALT_PILLAR_DIR} echo "base:" > ${SALT_PILLAR_DIR}/top.sls @@ -121,6 +132,7 @@ prepare() { [ -d ${BUILDDIR} ] && mkdir -p ${BUILDDIR} which salt-call || setup_virtualenv + setup_mock_bin setup_pillar setup_salt install_dependencies @@ -130,7 +142,26 @@ run() { for pillar in ${PILLARDIR}/*.sls; do grep ${FORMULA_NAME}: ${pillar} &>/dev/null || continue state_name=$(basename ${pillar%.sls}) + salt_run grains.set 'noservices' False force=True + + echo "Checking state ${FORMULA_NAME}.${state_name} ..." salt_run --id=${state_name} state.show_sls ${FORMULA_NAME} || (log_err "Execution of ${FORMULA_NAME}.${state_name} failed"; exit 1) + + # Check that all files in 'meta' folder can be rendered using any valid pillar + for meta in `find ${FORMULA_META_DIR} -type f`; do + meta_name=$(basename ${meta}) + echo "Checking meta ${meta_name} ..." + salt_run --out=quiet --id=${state_name} cp.get_template ${meta} ${SALT_CACHE_DIR}/${meta_name} \ + || (log_err "Failed to render meta ${meta} using pillar ${FORMULA_NAME}.${state_name}"; exit 1) + cat ${SALT_CACHE_DIR}/${meta_name} + done + done +} + +real_run() { + for pillar in ${PILLARDIR}/*.sls; do + state_name=$(basename ${pillar%.sls}) + salt_run --id=${state_name} state.sls ${FORMULA_NAME} || (log_err "Execution of ${FORMULA_NAME}.${state_name} failed"; exit 1) done } @@ -159,6 +190,9 @@ case $1 in run) run ;; + real-run) + real_run + ;; *) prepare run