Improve bond interface monitoring
- enable the bond Telegraf input plugin by default on every host
where a bond interface is enabled;
- disable the SystemRxPacketsDroppedTooHigh and
SystemRxPacketsDroppedLongTermTooHigh alerts for inactive bond slaves
(interfaces for which bond_slave_active == 0).
Change-Id: I17961a3133226b0878f553635b33216a23226015
Related-PROD: PROD-25272 (PROD:25272)
diff --git a/linux/meta/grafana.yml b/linux/meta/grafana.yml
index 32b4679..b3b6c64 100644
--- a/linux/meta/grafana.yml
+++ b/linux/meta/grafana.yml
@@ -1,4 +1,4 @@
-{%- from "linux/map.jinja" import monitoring with context %}
+{%- from "linux/map.jinja" import network with context %}
dashboard:
linux_overview_prometheus:
datasource: prometheus
@@ -16,7 +16,13 @@
datasource: influxdb
format: json
template: linux/files/grafana_dashboards/system_influxdb.json
-{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
+{%- set bond_interfaces = [] %}
+{%- for interface_name, interface in network.interface.items() %}
+ {%- if interface.type == 'bond' and interface.enabled == True %}
+ {%- do bond_interfaces.append(interface_name) %}
+ {%- endif %}
+{%- endfor %}
+{%- if bond_interfaces|length > 0 %}
linux_bond:
datasource: prometheus
format: json
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 1e029f3..e89b42b 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -1,4 +1,4 @@
-{%- from "linux/map.jinja" import monitoring with context %}
+{%- from "linux/map.jinja" import monitoring, network with context %}
server:
alert:
SystemCpuFullWarning:
@@ -151,7 +151,7 @@
{%- endraw %}
{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
if: >-
- increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
+ increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }} unless on (host,interface) bond_slave_active == 0
labels:
severity: warning
service: system
@@ -160,7 +160,7 @@
description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
SystemRxPacketsDroppedLongTermTooHigh:
if: >-
- increase(net_drop_in[1m]) > 0
+ increase(net_drop_in[1m]) > 0 unless on (host,interface) bond_slave_active == 0
for: 10m
labels:
severity: major
@@ -242,9 +242,16 @@
annotations:
summary: "CPU terminated {{ squeeze_rate_threshold }}{%- raw %} net_rx_action loops per second"
description: "The rate of net_rx_action loops terminations on the {{ $labels.host }} node is {{ $value }} per second during the last 7 minutes. Modify the net.core.netdev_budget and net.core.netdev_budget_usecs kernel parameters."
-{%- endraw %}
-{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
-{%- raw %}
+{%- endraw -%}
+
+{%- set bond_interfaces = [] %}
+{%- for interface_name, interface in network.interface.items() %}
+ {%- if interface.type == 'bond' and interface.enabled == True %}
+ {%- do bond_interfaces.append(interface_name) %}
+ {%- endif %}
+{%- endfor %}
+{%- if bond_interfaces|length > 0 %}
+ {%- raw %}
BondInterfaceDown:
if: >-
bond_status < 1
@@ -272,5 +279,5 @@
annotations:
summary: "50% of bond interface slaves {{ $labels.bond }} are down"
description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
-{% endraw %}
+ {%- endraw %}
{%- endif %}
diff --git a/linux/meta/telegraf.yml b/linux/meta/telegraf.yml
index d1cd721..52b4fe7 100644
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -1,4 +1,4 @@
-{%- from "linux/map.jinja" import monitoring with context %}
+{%- from "linux/map.jinja" import network with context %}
agent:
input:
cpu:
@@ -34,13 +34,13 @@
cron:
exe: cron
linux_sysctl_fs:
-{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
+{%- set bond_interfaces = [] %}
+{%- for interface_name, interface in network.interface.items() %}
+ {%- if interface.type == 'bond' and interface.enabled == True %}
+ {%- do bond_interfaces.append(interface_name) %}
+ {%- endif %}
+{%- endfor %}
+{%- if bond_interfaces|length > 0 %}
bond:
template: linux/files/telegraf.conf
-{%- if monitoring.bond_status.interfaces is list %}
- bond_interfaces: {{ monitoring.bond_status.interfaces }}
-{%- endif %}
-{%- if monitoring.bond_status.host_proc is defined %}
- host_proc: {{ monitoring.bond_status.host_proc }}
-{%- endif %}
{%- endif %}