Be able to monitor the Keepalived node State
Change-Id: I1f38472141220d937d78ac17a22268bc6dac3b39
Related-Bug: PROD-18708
diff --git a/keepalived/files/grafana_dashboards/keepalived_prometheus.json b/keepalived/files/grafana_dashboards/keepalived_prometheus.json
new file mode 100644
index 0000000..4234073
--- /dev/null
+++ b/keepalived/files/grafana_dashboards/keepalived_prometheus.json
@@ -0,0 +1,562 @@
+{% raw %}
+{
+ "annotations": {
+ "list": []
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "hideControls": false,
+ "id": null,
+ "links": [],
+ "rows": [
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": null,
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 4,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "procstat_running{process_name=\"keepalived\",host=\"$host\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": "0,1",
+ "title": "Process status",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ },
+ {
+ "op": "=",
+ "text": "UP",
+ "value": "1"
+ },
+ {
+ "op": "=",
+ "text": "DOWN",
+ "value": "0"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 1,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 9,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "procstat_running{process_name=\"keepalived\",host=\"$host\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{ host }}",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Process status",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "decimals": null,
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "prometheus",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "id": 3,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 3,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "keepalived_state{host=\"$host\"} or absent(keepalived_state{host=\"$host\"})-2",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 4
+ }
+ ],
+ "thresholds": "0.5,1.5",
+ "title": "Keepalived current State",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "UNKNOWN",
+ "value": "-1"
+ },
+ {
+ "op": "=",
+ "text": "FAILED",
+ "value": "0"
+ },
+ {
+ "op": "=",
+ "text": "BACKUP",
+ "value": "1"
+ },
+ {
+ "op": "=",
+ "text": "MASTER",
+ "value": "2"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "id": 2,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": false,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 9,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "keepalived_state{host=\"$host\"} or absent(keepalived_state{host=\"$host\"})-2",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{ host }}",
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Keepalived current state",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "-1",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ },
+ {
+ "collapse": false,
+ "height": 250,
+ "panels": [
+ {
+ "columns": [],
+ "datasource": "prometheus",
+ "fontSize": "100%",
+ "id": 5,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 0,
+ "desc": true
+ },
+ "span": 12,
+ "styles": [
+ {
+ "alias": "Time",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "date"
+ },
+ {
+ "alias": "State",
+ "colorMode": "row",
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "pattern": "Value",
+ "thresholds": [
+ "0.5",
+ "1.5"
+ ],
+ "type": "number",
+ "unit": "none"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "hide",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "__name__",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "keepalived_state",
+ "format": "table",
+ "intervalFactor": 2,
+ "refId": "A",
+ "step": 2
+ }
+ ],
+ "timeFrom": "1s",
+ "title": "Keepalived global states",
+ "transform": "table",
+ "type": "table"
+ }
+ ],
+ "repeat": null,
+ "repeatIteration": null,
+ "repeatRowId": null,
+ "showTitle": false,
+ "title": "Dashboard Row",
+ "titleSize": "h6"
+ }
+ ],
+ "schemaVersion": 14,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "",
+ "value": ""
+ },
+ "datasource": "prometheus",
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "host",
+ "options": [],
+ "query": "label_values(procstat_running{process_name=\"keepalived\"},host)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "Keepalived",
+ "version": 1
+}
+{% endraw %}
\ No newline at end of file
diff --git a/keepalived/meta/fluentd.yml b/keepalived/meta/fluentd.yml
new file mode 100644
index 0000000..bd80056
--- /dev/null
+++ b/keepalived/meta/fluentd.yml
@@ -0,0 +1,58 @@
+agent:
+ config:
+ label:
+ keepalived:
+ input:
+ keeaplived_syslog:
+ type: tail
+ tag: keepalived.syslog
+ path: /var/log/syslog
+ pos_file: {{ pillar.fluentd.agent.dir.positiondb }}/keepalived.pos
+ parser:
+ type: regexp
+ format: >-
+ '.*VRRP_Instance\(VIP\) Entering (?<state>MASTER|FAILED|BACKUP) STATE'
+ filter:
+ keepalived_state_parse:
+ tag: metric.keepalived_state
+ type: parser
+ key_name: Payload
+ parser:
+ type: regexp
+ format: '^(?<Timestamp>) (?<hostname>) .*VRRP_Instance.*(?<state>MASTER|FAILED|BACKUP) STATE'
+ keepalived_state:
+ tag: metric.keepalived_state
+ require:
+ - keepalived_state_parse
+ type: prometheus
+ metric:
+ - name: keepalived_state
+ type: gauge
+ desc: The keepalived state.
+ key: state
+ label:
+ - name: host
+ value: ${hostname}
+ match_state:
+ type: record_transformer
+ tag: keepalived.**
+ enable_ruby: true
+ record:
+ - name: state
+ value: ${ {"FAILED"=>0,"BACKUP"=>1,"MASTER"=>2}[record["state"]] }
+ match:
+ push_to_default:
+ tag: 'keepalived.**'
+ type: copy
+ store:
+ - type: relabel
+ label: default_output
+ - type: rewrite_tag_filter
+ rule:
+ - name: state
+ regexp: '.'
+ result: metric.keepalived_state
+ push_to_metric:
+ tag: 'metric.**'
+ type: relabel
+ label: default_metric
diff --git a/keepalived/meta/prometheus.yml b/keepalived/meta/prometheus.yml
index 5172ace..3c3b969 100644
--- a/keepalived/meta/prometheus.yml
+++ b/keepalived/meta/prometheus.yml
@@ -13,4 +13,15 @@
summary: 'Keepalived service is down'
description: 'Keepalived service is down on node {{ $labels.host }}'
{% endraw %}
+ KeepalivedStateFailed:
+ if: >-
+ keepalived_state == 0
+ {% raw %}
+ labels:
+ severity: warning
+ service: keepalived
+ annotations:
+ summary: 'Keepalived is in the Failed state'
+ description: 'Keepalived service is in Failed state on node {{ $labels.host }}'
+ {% endraw %}
{%- endif %}
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 989b10b..9a7866b 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -13,3 +13,7 @@
enabled: true
prometheus:
enabled: true
+ fluentd:
+ enabled: true
+ grafana:
+ enabled: true
\ No newline at end of file