Enable nstat input plugin for softnet_stat data
Since the nstat Telegraf plugin can now collect data from
`/proc/net/softnet_stat` about dropped packets and
net_rx_action (a.k.a. time squeeze), we need to enable
it globally on all hosts.
Also update the Grafana dashboard to include the new graphs and add four
new Prometheus alerts.
Related-Bug: PROD-21090
Change-Id: I9dfe87bdc8b677a51e3f305dd3c75c7d4cc4e0d4
diff --git a/linux/files/grafana_dashboards/system_network_prometheus.json b/linux/files/grafana_dashboards/system_network_prometheus.json
index 13c6827..b05c62e 100644
--- a/linux/files/grafana_dashboards/system_network_prometheus.json
+++ b/linux/files/grafana_dashboards/system_network_prometheus.json
@@ -19,7 +19,7 @@
"gnetId": null,
"graphTooltip": 1,
"id": null,
- "iteration": 1529498668709,
+ "iteration": 1532690906484,
"links": [],
"panels": [
{
@@ -430,12 +430,561 @@
}
},
{
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "decimals": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 0,
+ "y": 15
+ },
+ "id": 49,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "nstat_time_squeeze{host=~\"$host\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{host}} @{{cpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Net RX action@$host per CPU",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "decimals": 0,
+ "format": "none",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 7,
+ "w": 12,
+ "x": 12,
+ "y": 15
+ },
+ "id": 45,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "nstat_packet_drop{host=~\"$host\"}",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "{{host}} @{{cpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Dropped packets@$host per CPU",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 0,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "decimals": 0,
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [],
+ "datasource": null,
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 0,
+ "y": 22
+ },
+ "hideTimeOverride": false,
+ "id": 51,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Time",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "hidden"
+ },
+ {
+ "alias": "Increase since 1h",
+ "colorMode": "cell",
+ "colors": [
+ "transparent",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "mappingType": 1,
+ "pattern": "Value #A",
+ "preserveFormat": false,
+ "thresholds": [
+ "2",
+ "4"
+ ],
+ "type": "string",
+ "unit": "none",
+ "valueMaps": [
+ {
+ "text": "-",
+ "value": "-1"
+ }
+ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "Increase since 4h",
+ "colorMode": "cell",
+ "colors": [
+ "transparent",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #B",
+ "thresholds": [
+ "8",
+ "16"
+ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [
+ {
+ "text": "-",
+ "value": "-1"
+ }
+ ]
+ },
+ {
+ "alias": "Increase since 24h",
+ "colorMode": "cell",
+ "colors": [
+ "transparent",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #C",
+ "thresholds": [
+ "50",
+ "100"
+ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [
+ {
+ "text": "-",
+ "value": "-1"
+ }
+ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "floor(increase(nstat_time_squeeze{host=~\"$host\"}[1h])) > 0 or increase(nstat_time_squeeze{host=~\"$host\"}[1h]) - 1",
+ "format": "table",
+ "instant": false,
+ "intervalFactor": 2,
+ "refId": "A"
+ },
+ {
+ "expr": "floor(increase(nstat_time_squeeze{host=~\"$host\"}[4h])) > 0 or increase(nstat_time_squeeze{host=~\"$host\"}[4h]) - 1",
+ "format": "table",
+ "intervalFactor": 1,
+ "refId": "B"
+ },
+ {
+ "expr": "floor(increase(nstat_time_squeeze{host=~\"$host\"}[24h])) > 0 or increase(nstat_time_squeeze{host=~\"$host\"}[24h]) - 1",
+ "format": "table",
+ "intervalFactor": 1,
+ "refId": "C"
+ }
+ ],
+ "timeFrom": "1s",
+ "title": "Net RX action@$host per CPU - increased",
+ "transform": "table",
+ "transparent": false,
+ "type": "table"
+ },
+ {
+ "columns": [],
+ "datasource": null,
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 6,
+ "w": 12,
+ "x": 12,
+ "y": 22
+ },
+ "hideTimeOverride": false,
+ "id": 47,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Time",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "hidden"
+ },
+ {
+ "alias": "Increase since 1h",
+ "colorMode": "cell",
+ "colors": [
+ "transparent",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "Value #A",
+ "preserveFormat": false,
+ "rangeMaps": [
+ {
+ "from": "0",
+ "text": "asas",
+ "to": "0"
+ }
+ ],
+ "thresholds": [
+ "2",
+ "4"
+ ],
+ "type": "string",
+ "unit": "none",
+ "valueMaps": [
+ {
+ "text": "-",
+ "value": "-1"
+ }
+ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "job",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "instance",
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ },
+ {
+ "alias": "Increase since 4h",
+ "colorMode": "cell",
+ "colors": [
+ "transparent",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "mappingType": 1,
+ "pattern": "Value #B",
+ "thresholds": [
+ "8",
+ "16"
+ ],
+ "type": "string",
+ "unit": "none",
+ "valueMaps": [
+ {
+ "text": "-",
+ "value": "-1"
+ }
+ ]
+ },
+ {
+ "alias": "Increase since 24h",
+ "colorMode": "cell",
+ "colors": [
+ "transparent",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 0,
+ "mappingType": 1,
+ "pattern": "Value #C",
+ "thresholds": [
+ "50",
+ "100"
+ ],
+ "type": "string",
+ "unit": "short",
+ "valueMaps": [
+ {
+ "text": "-",
+ "value": "-1"
+ }
+ ]
+ },
+ {
+ "alias": "",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "floor(increase(nstat_packet_drop{host=~\"$host\"}[1h])) > 0 or increase(nstat_packet_drop{host=~\"$host\"}[1h]) - 1",
+ "format": "table",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "floor(increase(nstat_packet_drop{host=~\"$host\"}[4h])) > 0 or increase(nstat_packet_drop{host=~\"$host\"}[4h]) -1",
+ "format": "table",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "B"
+ },
+ {
+ "expr": "floor(increase(nstat_packet_drop{host=~\"$host\"}[24h])) > 0 or increase(nstat_packet_drop{host=~\"$host\"}[24h]) -1",
+ "format": "table",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 2,
+ "legendFormat": "",
+ "refId": "C"
+ }
+ ],
+ "timeFrom": "1s",
+ "title": "Dropped packets@$host per CPU - increased",
+ "transform": "table",
+ "transparent": false,
+ "type": "table"
+ },
+ {
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
- "y": 15
+ "y": 28
},
"id": 24,
"panels": [],
@@ -454,7 +1003,7 @@
"h": 7,
"w": 8,
"x": 0,
- "y": 16
+ "y": 29
},
"id": 8,
"legend": {
@@ -568,7 +1117,7 @@
"h": 7,
"w": 8,
"x": 8,
- "y": 16
+ "y": 29
},
"id": 15,
"legend": {
@@ -681,7 +1230,7 @@
"h": 7,
"w": 8,
"x": 16,
- "y": 16
+ "y": 29
},
"id": 13,
"legend": {
@@ -841,6 +1390,7 @@
]
},
"datasource": "prometheus",
+
"hide": 0,
"includeAll": true,
"label": null,
@@ -933,6 +1483,6 @@
"timezone": "browser",
"title": "System - Networking",
"uid": null,
- "version": 22
+ "version": 1
}
{% endraw %}
\ No newline at end of file
diff --git a/linux/map.jinja b/linux/map.jinja
index f176068..c333a89 100644
--- a/linux/map.jinja
+++ b/linux/map.jinja
@@ -348,5 +348,13 @@
'failed_auths_threshold': {
'warn': 5,
},
+ 'net_rx_action_per_cpu_threshold': {
+ 'warning': '0',
+ 'minor': '100'
+ },
+ 'packets_dropped_per_cpu_threshold': {
+ 'minor': '0',
+ 'major': '100'
+ }
},
}, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index ca4ba3d..3ca2b26 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -208,6 +208,50 @@
annotations:
summary: "{{ threshold }}{%- raw %} failed SSH logins"
description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
+ PacketsDroppedByCpuMinor:
+ {%- endraw %}
+ {%- set packets_dropped_minor_threshold = monitoring.packets_dropped_per_cpu_threshold.minor %}
+ if: >-
+ floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_minor_threshold }}
+ labels:
+ severity: minor
+ service: system
+ annotations:
+ summary: "CPU dropped {{ packets_dropped_minor_threshold }}{%- raw %} packets"
+ description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
+ PacketsDroppedByCpuMajor:
+ {%- endraw %}
+ {%- set packets_dropped_major_threshold = monitoring.packets_dropped_per_cpu_threshold.major %}
+ if: >-
+ floor(increase(nstat_packet_drop[24h])) > {{ packets_dropped_major_threshold }}
+ labels:
+ severity: major
+ service: system
+ annotations:
+ summary: "CPU dropped {{ packets_dropped_major_threshold }}{%- raw %} packets"
+ description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 24 hours."
+ NetRxActionByCpuWarning:
+ {%- endraw %}
+ {%- set net_rx_action_warning_threshold = monitoring.net_rx_action_per_cpu_threshold.warning %}
+ if: >-
+ floor(increase(nstat_time_squeeze[24h])) > {{ net_rx_action_warning_threshold }}
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: "CPU terminated {{ net_rx_action_warning_threshold }}{%- raw %} net_rx_action loops"
+ description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node terminated {{ $value }} net_rx_action loops during the last 24 hours."
+ NetRxActionByCpuMinor:
+ {%- endraw %}
+ {%- set net_rx_action_minor_threshold = monitoring.net_rx_action_per_cpu_threshold.minor %}
+ if: >-
+ floor(increase(nstat_time_squeeze[24h])) > {{ net_rx_action_minor_threshold }}
+ labels:
+ severity: minor
+ service: system
+ annotations:
+ summary: "CPU terminated {{ net_rx_action_minor_threshold }}{%- raw %} net_rx_action loops"
+ description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node terminated {{ $value }} net_rx_action loops during the last 24 hours."
{%- endraw %}
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
{%- raw %}
diff --git a/linux/meta/telegraf.yml b/linux/meta/telegraf.yml
index 0c39da1..d1cd721 100644
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -20,6 +20,10 @@
kernel:
net:
mem:
+ nstat:
+ fieldpass:
+ - packet_drop
+ - time_squeeze
processes:
swap:
system: