Add feature to gather OVS bridge information
Related-Prod: PROD-33136, PROD-30617

Change-Id: I6a3b22a123ec032ba301e7d38ec36484db2e8694
diff --git a/telegraf/files/script/ovs_parse_bridge.py b/telegraf/files/script/ovs_parse_bridge.py
new file mode 100644
index 0000000..19fc8f4
--- /dev/null
+++ b/telegraf/files/script/ovs_parse_bridge.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python2
+import subprocess
+import json
+
+
+def call_process(cmd):
+    """Run cmd (split on whitespace, no shell); return (stdout, stderr)."""
+    argv = cmd.split()
+    child = subprocess.Popen(
+        argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    streams = child.communicate()
+    return streams
+
+
+def ovsctl_json(cmd):
+    """Return the rows of ``ovs-vsctl -f json <cmd>`` keyed by their _uuid.
+
+    List cells are unwrapped: map -> dict, set -> member list, and a lone
+    uuid -> one-element list holding the original ["uuid", ...] pair.
+    """
+    table = json.loads(call_process("ovs-vsctl -f json " + cmd)[0])
+    headings, rows = table["headings"], table["data"]
+    uuid_index = headings.index("_uuid")
+    obj = {}
+    for row in rows:
+        obj_data = {}
+        for cell, heading in zip(row, headings):
+            if not isinstance(cell, list):
+                obj_data[heading] = cell
+                continue
+            _type, value = cell[0], cell[1]
+            if _type == "uuid":
+                obj_data[heading] = [cell]
+            elif _type == "map":
+                obj_data[heading] = dict((v[0], v[1]) for v in value)
+            elif _type == "set":
+                obj_data[heading] = value
+        obj[row[uuid_index][1]] = obj_data
+    return obj
+
+
+def gather():
+    """Print one ``ovs_bridge`` line-protocol point for every bridge port.
+
+    status is 0 when the link is not up, 1 when up, 2 when up with an error.
+    """
+    bridges = ovsctl_json("list bridge")
+    ports = ovsctl_json("list port")
+    interfaces = ovsctl_json("list interface")
+
+    for bridge in bridges.itervalues():
+        for bridge_port in bridge['ports']:
+            port = ports[bridge_port[1]]
+            # Only the first interface attached to the port is reported.
+            interface = interfaces[port['interfaces'][0][1]]
+            # `or` also swaps in the placeholder for empty strings (not just
+            # missing keys): line protocol does not allow an empty tag value.
+            label_obj = {
+                "bridge": bridge.get('name') or '""',
+                "port": port.get('name') or '""',
+                "interface": interface.get('name') or '""',
+                "type": interface.get('type') or '""',
+            }
+            peer = interface.get('options', {}).get('peer')
+            if peer:
+                label_obj['peer'] = peer
+            labels = ','.join("{}={}".format(key, value)
+                              for key, value in label_obj.iteritems())
+
+            status = 0
+            if interface['link_state'] == "up":
+                status = 2 if interface['error'] else 1
+            print "ovs_bridge,{} status={}".format(labels, status)
+
+
+if __name__ == "__main__":
+    try:
+        gather()
+        print "ovs_bridge check=1"  # reached only when gather() did not raise
+    except Exception:
+        print "ovs_bridge check=0"  # any failure above is reported as check=0
diff --git a/telegraf/meta/prometheus.yml b/telegraf/meta/prometheus.yml
index 258cd81..decbc0d 100644
--- a/telegraf/meta/prometheus.yml
+++ b/telegraf/meta/prometheus.yml
@@ -14,6 +14,7 @@
     {%- endif %}
   {%- endif %}
 {%- endif %}
+{%- raw %}
 server:
   alert:
     NodeDown:
@@ -23,10 +24,8 @@
         severity: critical
         service: system
       annotations:
-{%- raw %}
         summary: "The {{ $labels.host }} node is down"
         description: "The {{ $labels.host }} node is unreachable at {{ $labels.url }}, the Telegraf and Fluentd targets on the {{ $labels.host }} node are down."
-{%- endraw %}
     TelegrafGatherErrors:
       if: >-
         rate(internal_agent_gather_errors[10m]) > 0
@@ -35,11 +34,49 @@
         service: telegraf
       annotations:
         summary: "Telegraf failed to gather metrics"
-{%- raw %}
         description: "Telegraf has gathering errors on the {{ $labels.host }} node for the last 10 minutes."
 {%- endraw %}
 {%- if pillar.neutron is defined %}
-  {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or (pillar.neutron.get('compute',{}).get('enabled', False) == True and pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True) %}
+  {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('enabled', False) == True %}
+{%- raw %}
+    OVSTooManyPortRunningOnAgent:  # counts series; status values are 0/1/2
+      if: >-
+        count by (host) (ovs_bridge_status) > 1500
+      labels:
+        severity: major
+        service: neutron
+      annotations:
+        summary: "High number of ovs ports on host"
+        description: "The number of ovs port is {{ $value }} (ovs-vsctl list port ) on {{ $labels.host }} which is more than the expected limit"
+    OVSErrorOnPort:
+      if: >-
+        ovs_bridge_status == 2
+      labels:
+        severity: critical
+        service: neutron
+      annotations:
+        summary: "OVS port is reporting error"
+        description: "OVS port {{ $labels.port }} on bridge {{ $labels.bridge }} running on {{ $labels.host }} is reporting errors"
+    OVSNonInternalPortDown:
+      if: >-
+        ovs_bridge_status{type!="internal"} == 0
+      labels:
+        severity: critical
+        service: neutron
+      annotations:
+        summary: "Non internal ovs port is down"
+        description: "OVS port {{ $labels.port }} on bridge {{ $labels.bridge }} running on {{ $labels.host }} is reporting status down"
+    OVSGatherFailed:
+      if: >-
+        ovs_bridge_check == 0
+      labels:
+        severity: critical
+        service: neutron
+      annotations:
+        summary: "Failed to Gather OVS information"
+        description: "Failed to Gather OVS information on host {{ $labels.host }}"
+    {%- endraw %}
+    {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True %}
     OVSInstanceArpingCheckDown:
       if: instance_arping_check_up == 0
       for: 2m
@@ -51,6 +88,7 @@
         {%- raw %}
         description: "The OVS instance arping check on the {{ $labels.host }} node is down for 2 minutes."
         {%- endraw %}
+    {%- endif %}
   {%- endif %}
 {%- endif %}
 {%- if pillar.opencontrail is defined %}
diff --git a/telegraf/meta/telegraf.yml b/telegraf/meta/telegraf.yml
index d05b4fd..6361da8 100644
--- a/telegraf/meta/telegraf.yml
+++ b/telegraf/meta/telegraf.yml
@@ -1,12 +1,18 @@
 {%- if pillar.neutron is defined %}
-  {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or (pillar.neutron.get('compute',{}).get('enabled', False) == True and pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True) %}
-    {%- set prometheus_address = pillar._param.stacklight_monitor_address %}
+  {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('enabled', False) == True %}
 agent:
   input:
+    ovs_parse_bridge:
+      template: telegraf/files/input/exec.conf
+      commands: "/usr/local/bin/ovs_parse_bridge.py"
+      interval: 45s
+    {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True %}
+      {%- set prometheus_address = pillar._param.stacklight_monitor_address %}
     ovs_arping_check:
       template: telegraf/files/input/exec.conf
       commands: "/usr/local/bin/check_ovs_arping.py --host {{ prometheus_address }} --port 15016"
       interval: 45s
+    {%- endif %}
   {%- endif %}
 {%- endif %}
 
diff --git a/telegraf/script.sls b/telegraf/script.sls
index f809924..5424a8f 100644
--- a/telegraf/script.sls
+++ b/telegraf/script.sls
@@ -1,6 +1,13 @@
 {%- if pillar.neutron is defined %}
-  {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or (pillar.neutron.get('compute',{}).get('enabled', False) == True and pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True) %}
+  {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('enabled', False) == True %}
+ovs_parse_bridge:
+  file.managed:
+  - name: /usr/local/bin/ovs_parse_bridge.py
+  - source: salt://telegraf/files/script/ovs_parse_bridge.py
+  - template: jinja
+  - mode: 755
 
+    {%- if pillar.neutron.get('gateway', {}).get('enabled', False) == True or pillar.neutron.get('compute',{}).get('dhcp_agent_enabled', False) == True %}
 ovs_arping_check_telegraf_script:
   file.managed:
   - name: /usr/local/bin/check_ovs_arping.py
@@ -8,6 +15,7 @@
   - template: jinja
   - mode: 755
 
+    {%- endif %}
   {%- endif %}
 {%- endif %}