Add feature to gather OVS bridge information
Related-Prod: PROD-33136, PROD-30617
Change-Id: I6a3b22a123ec032ba301e7d38ec36484db2e8694
diff --git a/telegraf/files/script/ovs_parse_bridge.py b/telegraf/files/script/ovs_parse_bridge.py
new file mode 100644
index 0000000..19fc8f4
--- /dev/null
+++ b/telegraf/files/script/ovs_parse_bridge.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python2
+import subprocess
+import json
+
+
+def call_process(cmd):  # run the command string `cmd` as a child process (no shell involved)
+ p = subprocess.Popen(
+ cmd.split(),  # plain whitespace split: arguments containing spaces cannot be expressed
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ output = p.communicate()  # waits for the process to exit; gives a (stdout, stderr) tuple
+ return output  # the exit status is neither checked nor returned
+
+
+def ovsctl_json(cmd):  # run "ovs-vsctl -f json <cmd>" and return its rows as a dict keyed by row uuid
+ cmd = "ovs-vsctl -f json " + cmd
+ result = call_process(cmd)  # (stdout, stderr) tuple; only stdout (result[0]) is used below
+ headings = json.loads(result[0])["headings"]  # list of column names
+ datas = json.loads(result[0])["data"]  # list of rows; note that stdout is JSON-decoded a second time here
+
+ uuid_index = headings.index("_uuid")  # raises ValueError if the output has no "_uuid" column
+
+ obj = {}  # row uuid -> {column name: decoded cell}
+
+ for data in datas:
+ obj_data = {}
+ for d, h in zip(data, headings):  # pair each cell with its column name
+ if isinstance(d, list):  # list cells are read as [type_tag, value] pairs
+ _type, value = d[0], d[1]
+ if _type == "uuid":
+ obj_data[h] = [d]  # keep the whole [tag, id] pair, wrapped in a one-element list
+ elif _type == "map":
+ obj_data[h] = {v[0]: v[1] for v in value}  # sequence of [key, val] pairs -> dict
+ elif _type == "set":
+ obj_data[h] = value  # the set's member list is stored as-is
+ else:
+ obj_data[h] = d  # cell stored unchanged
+ obj[data[uuid_index][1]] = obj_data  # key is element [1] of the row's "_uuid" cell
+ return obj
+
+
+def gather():  # print one "ovs_bridge,<labels> status=<n>" line per bridge port
+ bridges = ovsctl_json("list bridge")  # each table is a dict: row uuid -> row dict
+ ports = ovsctl_json("list port")
+ interfaces = ovsctl_json("list interface")
+
+ for bridge_uuid, bridge in bridges.iteritems():  # iteritems() exists only on Python 2 dicts
+ for bridge_port in bridge['ports']:  # each entry is indexed as a pair; element [1] is the port key
+ port = ports[bridge_port[1]]  # KeyError if the referenced port row is absent
+ interface = interfaces[port['interfaces'][0][1]]  # only the port's first interface is inspected
+
+ label_obj = {
+ "bridge": bridge.get('name', '""'),  # the '""' fallback is used only when the key is missing
+ "port": port.get('name', '""'),
+ "interface": interface.get('name', '""'),
+ "type": interface.get('type', '""'),
+ "peer": interface.get('options', {}).get('peer', '""'),
+ }
+
+ if label_obj['peer'] == '""':  # no peer option found: omit the label entirely
+ label_obj.pop('peer')
+
+ labels = ','.join(["{}={}".format(key, value)  # key=value pairs; values are not escaped
+ for key, value in label_obj.iteritems()])
+
+ if interface['link_state'] == "up":
+ status = 1  # link is up
+ if len(interface['error']) > 0:  # NOTE(review): presumably nested under the "up" check; indentation is lost in this view
+ status = 2  # a non-empty error value is reported
+ else:
+ status = 0  # link is not up
+
+ print "ovs_bridge,{} status={}".format(labels, status)  # Python 2 print statement
+
+
+if __name__ == "__main__":  # script entry point
+ try:
+ gather()  # may already have printed some status lines before raising
+ print "ovs_bridge check=1"  # reached only when gather() returned without raising
+ except Exception:  # any failure is collapsed into check=0; the exception itself is discarded
+ print "ovs_bridge check=0"
diff --git a/telegraf/meta/prometheus.yml b/telegraf/meta/prometheus.yml
index 258cd81..decbc0d 100644
--- a/telegraf/meta/prometheus.yml
+++ b/telegraf/meta/prometheus.yml
@@ -14,6 +14,7 @@
{%- endif %}
{%- endif %}
{%- endif %}
+{%- raw %}
server:
alert:
NodeDown:
@@ -23,10 +24,8 @@
severity: critical
service: system
annotations:
-{%- raw %}
summary: "The {{ $labels.host }} node is down"
description: "The {{ $labels.host }} node is unreachable at {{ $labels.url }}, the Telegraf and Fluentd targets on the {{ $labels.host }} node are down."
-{%- endraw %}
TelegrafGatherErrors:
if: >-
rate(internal_agent_gather_errors[10m]) > 0
@@ -35,11 +34,49 @@
service: telegraf
annotations:
summary: "Telegraf failed to gather metrics"
-{%- raw %}
description: "Telegraf has gathering errors on the {{ $labels.host }} node for the last 10 minutes."
{%- endraw %}
{%- if pillar.neutron is defined %}
- {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or (pillar.neutron.get('compute',{}).get('enabled', False) == True and pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True) %}
+ {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('enabled', False) == True %}
+{%- raw %}
+ OVSTooManyPortRunningOnAgent:
+ if: >-
+ sum by (host) (ovs_bridge_status) > 1500
+ labels:
+ severity: major
+ service: neutron
+ annotations:
+ summary: "High number of ovs ports on host"
+ description: "The number of ovs port is {{ $value }} (ovs-vsctl list port ) on {{ $labels.host }} which is more than the expected limit"
+ OVSErrorOnPort:
+ if: >-
+ ovs_bridge_status == 2
+ labels:
+ severity: critical
+ service: neutron
+ annotations:
+ summary: "OVS port is reporting error"
+ description: "OVS port {{ $labels.port }} on bridge {{ $labels.bridge }} running on {{ $labels.host }} is reporting errors"
+ OVSNonInternalPortDown:
+ if: >-
+ ovs_bridge_status{type!="internal"} == 0
+ labels:
+ severity: critical
+ service: neutron
+ annotations:
+ summary: "Non internal ovs port is down"
+ description: "OVS port {{ $labels.port }} on bridge {{ $labels.bridge }} running on {{ $labels.host }} is reporting status down"
+ OVSGatherFailed:
+ if: >-
+ ovs_bridge_check == 0
+ labels:
+ severity: critical
+ service: neutron
+ annotations:
+ summary: "Failed to Gather OVS information"
+ description: "Failed to Gather OVS information on host {{ $labels.host }}"
+ {%- endraw %}
+ {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True %}
OVSInstanceArpingCheckDown:
if: instance_arping_check_up == 0
for: 2m
@@ -51,6 +88,7 @@
{%- raw %}
description: "The OVS instance arping check on the {{ $labels.host }} node is down for 2 minutes."
{%- endraw %}
+ {%- endif %}
{%- endif %}
{%- endif %}
{%- if pillar.opencontrail is defined %}
diff --git a/telegraf/meta/telegraf.yml b/telegraf/meta/telegraf.yml
index d05b4fd..6361da8 100644
--- a/telegraf/meta/telegraf.yml
+++ b/telegraf/meta/telegraf.yml
@@ -1,12 +1,18 @@
{%- if pillar.neutron is defined %}
- {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or (pillar.neutron.get('compute',{}).get('enabled', False) == True and pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True) %}
- {%- set prometheus_address = pillar._param.stacklight_monitor_address %}
+ {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('enabled', False) == True %}
agent:
input:
+ ovs_parse_bridge:
+ template: telegraf/files/input/exec.conf
+ commands: "/usr/local/bin/ovs_parse_bridge.py"
+ interval: 45s
+ {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True %}
+ {%- set prometheus_address = pillar._param.stacklight_monitor_address %}
ovs_arping_check:
template: telegraf/files/input/exec.conf
commands: "/usr/local/bin/check_ovs_arping.py --host {{ prometheus_address }} --port 15016"
interval: 45s
+ {%- endif %}
{%- endif %}
{%- endif %}
diff --git a/telegraf/script.sls b/telegraf/script.sls
index f809924..5424a8f 100644
--- a/telegraf/script.sls
+++ b/telegraf/script.sls
@@ -1,6 +1,13 @@
{%- if pillar.neutron is defined %}
- {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or (pillar.neutron.get('compute',{}).get('enabled', False) == True and pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True) %}
+ {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('enabled', False) == True %}
+ovs_parse_bridge:
+ file.managed:
+ - name: /usr/local/bin/ovs_parse_bridge.py
+ - source: salt://telegraf/files/script/ovs_parse_bridge.py
+ - template: jinja
+ - mode: 755
+ {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True %}
ovs_arping_check_telegraf_script:
file.managed:
- name: /usr/local/bin/check_ovs_arping.py
@@ -8,6 +15,7 @@
- template: jinja
- mode: 755
+ {%- endif %}
{%- endif %}
{%- endif %}