Merge "Add contrail-status and vrouter checks for telegraf"
diff --git a/opencontrail/files/3.0/check_contrail_health.py b/opencontrail/files/3.0/check_contrail_health.py
new file mode 100644
index 0000000..8449f40
--- /dev/null
+++ b/opencontrail/files/3.0/check_contrail_health.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python
+"""Telegraf exec check that reports Contrail service health.
+
+Runs CMD, parses the per-role service listing it prints and writes one
+influx line-protocol point per service.
+"""
+import subprocess
+import sys
+
+
+CMD = "contrail-status"
+
+# States other than 'active' that still count as healthy for a service.
+SERVICE_EXTRA_STATES = {'contrail-schema': 'backup',
+                        'contrail-svc-monitor': 'backup',
+                        'contrail-device-manager': 'backup'}
+
+
+def _parse_status(text):
+    """Turn status text into {role: [{'service', 'status', 'exit_code'}]}.
+
+    A line starting with '==' opens a role section (spaces in the role
+    name become underscores); any other non-empty line is read as
+    '<service>[:...] <status words>'.  exit_code is 0 when the status is
+    'active' or matches SERVICE_EXTRA_STATES for that service, else 1.
+    """
+    result = {}
+    role = None
+    for line in text.split('\n'):
+        line = line.strip()
+        if not line:
+            continue
+        if line.startswith('=='):
+            role = line.strip('=').strip().replace(' ', '_')
+            # setdefault: a repeated header must not drop services that
+            # were already collected for that role.
+            result.setdefault(role, [])
+            continue
+        fields = line.split()
+        # Skip lines that cannot be a '<service> <status>' row: anything
+        # seen before the first role header or lacking a status column.
+        if role is None or len(fields) < 2:
+            continue
+        service = fields[0].split(':')[0]
+        status = " ".join(fields[1:])
+        healthy = (status == 'active' or
+                   SERVICE_EXTRA_STATES.get(service) == status)
+        result[role].append({'service': service, 'status': status,
+                             'exit_code': 0 if healthy else 1})
+    return result
+
+
+def check(output=sys.stdout, status_text=None):
+    """Write one influx point per Contrail service and return the data.
+
+    Args:
+        output: file-like object the influx lines are written to.
+        status_text: already captured status text to parse; when None
+            (the default) CMD is executed to obtain it.
+
+    Returns:
+        dict mapping role name to a list of dicts with 'service',
+        'status' and 'exit_code' keys.
+
+    Raises:
+        subprocess.CalledProcessError: CMD exited with a non-zero code.
+        OSError: CMD could not be executed.
+    """
+    if status_text is None:
+        # universal_newlines makes check_output return text, not bytes,
+        # so the parsing below works on Python 2 and Python 3 alike.
+        status_text = subprocess.check_output(CMD.split(' '),
+                                              universal_newlines=True)
+    result = _parse_status(status_text)
+    # output all collected info
+    # NOTE(ivasilevskaya) ignore contrail database in favor of supervisor
+    # database
+    result.pop('Contrail_Database', None)
+    for role, services in result.items():
+        for info in services:
+            output.write(("%(workload)s,contrail_service=%(service)s,role=%(role)s "
+                          "exit_code=%(exit_code)s\n") %
+                         {'workload': 'contrail_health',
+                          'service': info['service'],
+                          'exit_code': info['exit_code'],
+                          'role': role})
+    return result
+
+
+if __name__ == "__main__":
+    check()
diff --git a/opencontrail/files/3.0/check_global_vrouter_config.py b/opencontrail/files/3.0/check_global_vrouter_config.py
new file mode 100644
index 0000000..12d4d8f
--- /dev/null
+++ b/opencontrail/files/3.0/check_global_vrouter_config.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+"""Telegraf exec check for the Contrail global vrouter config object."""
+import sys
+from vnc_api import vnc_api
+
+
+def check(output=sys.stdout):
+    """Write an influx point telling whether exactly one config exists.
+
+    exit_code is 0 when the API lists exactly one 'global-vrouter-configs'
+    entry; it is 1 for any other count or when the lookup raises.
+
+    Args:
+        output: file-like object the influx line is written to.
+    """
+    try:
+        client = vnc_api.VncApi(conf_file='/etc/contrail/vnc_api_lib.ini')
+        listing = client._objects_list('global-vrouter-config')
+        config_count = len(listing.get('global-vrouter-configs', []))
+    # XXX FIXME narrow down exception type
+    except Exception:
+        exit_code = 1
+    else:
+        exit_code = 1 if config_count != 1 else 0
+    # output result
+    output.write('%(workload)s exit_code=%(exit_code)s\n' %
+                 {'workload': 'contrail_global_vrouter_config',
+                  'exit_code': exit_code})
+
+
+if __name__ == "__main__":
+    check()
diff --git a/opencontrail/files/4.0/check_contrail_health.py b/opencontrail/files/4.0/check_contrail_health.py
new file mode 100644
index 0000000..3617553
--- /dev/null
+++ b/opencontrail/files/4.0/check_contrail_health.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+"""Telegraf exec check that reports Contrail service health.
+
+Runs CMD, parses the per-role service listing it prints and writes one
+influx line-protocol point per service.
+"""
+import subprocess
+import sys
+
+CMD = "doctrail all contrail-status"
+
+# States other than 'active' that still count as healthy for a service.
+SERVICE_EXTRA_STATES = {'contrail-schema': 'backup',
+                        'contrail-svc-monitor': 'backup',
+                        'contrail-device-manager': 'backup'}
+
+
+def _parse_status(text):
+    """Turn status text into {role: [{'service', 'status', 'exit_code'}]}.
+
+    Lines starting with 'FOR NODE' or '*******' are ignored.  A line
+    starting with '==' opens a role section (spaces in the role name
+    become underscores); any other non-empty line is read as
+    '<service>[:...] <status words>'.  exit_code is 0 when the status is
+    'active' or matches SERVICE_EXTRA_STATES for that service, else 1.
+    """
+    result = {}
+    role = None
+    for line in text.split('\n'):
+        line = line.strip()
+        if not line or line.startswith(("FOR NODE", "*******")):
+            continue
+        if line.startswith('=='):
+            role = line.strip('=').strip().replace(' ', '_')
+            # setdefault: a repeated header must not drop services that
+            # were already collected for that role.
+            result.setdefault(role, [])
+            continue
+        fields = line.split()
+        # Skip lines that cannot be a '<service> <status>' row: anything
+        # seen before the first role header or lacking a status column.
+        if role is None or len(fields) < 2:
+            continue
+        service = fields[0].split(':')[0]
+        status = " ".join(fields[1:])
+        healthy = (status == 'active' or
+                   SERVICE_EXTRA_STATES.get(service) == status)
+        result[role].append({'service': service, 'status': status,
+                             'exit_code': 0 if healthy else 1})
+    return result
+
+
+def check(output=sys.stdout, status_text=None):
+    """Write one influx point per Contrail service and return the data.
+
+    Args:
+        output: file-like object the influx lines are written to.
+        status_text: already captured status text to parse; when None
+            (the default) CMD is executed to obtain it.
+
+    Returns:
+        dict mapping role name to a list of dicts with 'service',
+        'status' and 'exit_code' keys.
+
+    Raises:
+        subprocess.CalledProcessError: CMD exited with a non-zero code.
+        OSError: CMD could not be executed.
+    """
+    if status_text is None:
+        # universal_newlines makes check_output return text, not bytes,
+        # so the parsing below works on Python 2 and Python 3 alike.
+        status_text = subprocess.check_output(CMD.split(' '),
+                                              universal_newlines=True)
+    result = _parse_status(status_text)
+    # output all collected info
+    # NOTE(ivasilevskaya) ignore contrail database in favor of supervisor
+    # database
+    # NOTE(review): the pop below is commented out, so the
+    # 'Contrail_Database' role IS still reported -- confirm which is meant.
+    # result.pop('Contrail_Database', None)
+    for role, services in result.items():
+        for info in services:
+            output.write(("%(workload)s,contrail_service=%(service)s,role=%(role)s "
+                          "exit_code=%(exit_code)s\n") %
+                         {'workload': 'contrail_health',
+                          'service': info['service'],
+                          'exit_code': info['exit_code'],
+                          'role': role})
+    return result
+
+
+if __name__ == "__main__":
+    check()
diff --git a/opencontrail/files/4.0/check_global_vrouter_config.py b/opencontrail/files/4.0/check_global_vrouter_config.py
new file mode 100644
index 0000000..12d4d8f
--- /dev/null
+++ b/opencontrail/files/4.0/check_global_vrouter_config.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python
+"""Telegraf exec check for the Contrail global vrouter config object."""
+import sys
+from vnc_api import vnc_api
+
+
+def check(output=sys.stdout):
+    """Write an influx point telling whether exactly one config exists.
+
+    exit_code is 0 when the API lists exactly one 'global-vrouter-configs'
+    entry; it is 1 for any other count or when the lookup raises.
+
+    Args:
+        output: file-like object the influx line is written to.
+    """
+    try:
+        client = vnc_api.VncApi(conf_file='/etc/contrail/vnc_api_lib.ini')
+        listing = client._objects_list('global-vrouter-config')
+        config_count = len(listing.get('global-vrouter-configs', []))
+    # XXX FIXME narrow down exception type
+    except Exception:
+        exit_code = 1
+    else:
+        exit_code = 1 if config_count != 1 else 0
+    # output result
+    output.write('%(workload)s exit_code=%(exit_code)s\n' %
+                 {'workload': 'contrail_global_vrouter_config',
+                  'exit_code': exit_code})
+
+
+if __name__ == "__main__":
+    check()
diff --git a/opencontrail/init.sls b/opencontrail/init.sls
index ee862a7..09a204a 100644
--- a/opencontrail/init.sls
+++ b/opencontrail/init.sls
@@ -23,4 +23,9 @@
 {%- endif %}
 {% if pillar.opencontrail.common is defined %}
 - opencontrail.common
-{% endif %}
\ No newline at end of file
+{% endif %}
+{%- if pillar.opencontrail.collector is defined or
+       pillar.opencontrail.control is defined or
+       pillar.opencontrail.database is defined %}
+- opencontrail.monitoring  # added only when a collector, control or database pillar is defined
+{%- endif %}
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
index d7cf03f..7f8ef15 100644
--- a/opencontrail/meta/prometheus.yml
+++ b/opencontrail/meta/prometheus.yml
@@ -133,6 +133,42 @@
     {%- raw %}
         summary: "{{ $labels.name }} service outage"
         description: "All {{ $labels.process_name }} processes are down."
+    ContrailHealthCheckDisabled:  # fires when no contrail_health_exit_code series is present
+      if: >-
+        absent(contrail_health_exit_code) == 1
+      labels:
+        severity: critical
+        service: contrail
+      annotations:
+        summary: "Contrail healthcheck disabled"
+        description: "Contrail healthcheck is disabled."
+    ContrailHealthCheckFailed:  # fires for every series whose exit_code value is non-zero
+      if: >-
+        contrail_health_exit_code != 0
+      labels:
+        severity: critical
+        service: contrail
+      annotations:
+        summary: "Contrail healthcheck failed"
+        description: "Contrail healthcheck failed for the {{ $labels.contrail_service }} on the {{ $labels.host }} node."
+    ContrailGlobalVrouterConfigCheckDisabled:  # fires when no contrail_global_vrouter_config_exit_code series is present
+      if: >-
+        absent(contrail_global_vrouter_config_exit_code) == 1
+      labels:
+        severity: critical
+        service: contrail
+      annotations:
+        summary: "Contrail global vrouter config check disabled"
+        description: "Contrail global vrouter config check is disabled."
+    ContrailGlobalVrouterConfigCheckFailed:  # fires when the reported exit_code value is non-zero
+      if: >-
+        contrail_global_vrouter_config_exit_code != 0
+      labels:
+        severity: critical
+        service: contrail
+      annotations:
+        summary: "Contrail global vrouter config check failed"
+        description: "Contrail global vrouter config check failed on the {{ $labels.host }} node."
     ContrailBGPSessionsNoEstablished:
       if: >-
         max(contrail_bgp_session_count) by (host) == 0
diff --git a/opencontrail/meta/telegraf.yml b/opencontrail/meta/telegraf.yml
index 118750e..21cc5d3 100644
--- a/opencontrail/meta/telegraf.yml
+++ b/opencontrail/meta/telegraf.yml
@@ -12,6 +12,24 @@
       data_format: "json"
       timeout: "10s"
     {%- endif %}
+    {%- if pillar.opencontrail.collector is defined or
+           pillar.opencontrail.control is defined or
+           pillar.opencontrail.database is defined %}
+    monitor_contrail_health:  # exec input, present on collector, control or database nodes
+      template: telegraf/files/input/exec.conf
+      commands:
+      - /usr/local/bin/check_contrail_health.py
+      data_format: influx  # the script prints "measurement,tags fields" lines
+      interval: 30s
+    {%- endif %}
+    {%- if pillar.opencontrail.control is defined %}
+    monitor_global_vrouter_config:  # exec input, present on control nodes only
+      template: telegraf/files/input/exec.conf
+      commands:
+      - /usr/local/bin/check_global_vrouter_config.py
+      data_format: influx  # the script prints a single "measurement fields" line
+      interval: 30s
+    {%- endif %}
     procstat:
       process:
     {%- if collector.get('enabled', False) %}
diff --git a/opencontrail/monitoring.sls b/opencontrail/monitoring.sls
new file mode 100644
index 0000000..30a0bf1
--- /dev/null
+++ b/opencontrail/monitoring.sls
@@ -0,0 +1,31 @@
+{%- if pillar.opencontrail is defined and pillar.telegraf is defined %}
+  {%- if pillar.opencontrail.collector is defined %}
+    {%- set version = pillar.opencontrail.collector.version %}
+  {%- endif %}
+  {%- if pillar.opencontrail.control is defined %}
+    {%- set version = pillar.opencontrail.control.version %}
+  {%- endif %}
+  {%- if pillar.opencontrail.database is defined %}
+    {%- set version = pillar.opencontrail.database.version %}
+  {%- endif %}
+
+  {%- if pillar.opencontrail.collector is defined or
+         pillar.opencontrail.control is defined or
+         pillar.opencontrail.database is defined %}
+contrail_control_status_check_telegraf_script:  # installs the contrail-status health check script
+  file.managed:
+  - name: /usr/local/bin/check_contrail_health.py
+  - source: salt://opencontrail/files/{{ version }}/check_contrail_health.py  # version: last defined of collector, control, database wins
+  - template: jinja
+  - mode: 755
+  {%- endif %}
+
+  {%- if pillar.opencontrail.control is defined %}
+contrail_control_vrouter_check_telegraf_script:  # installed only when the control pillar is defined
+  file.managed:
+  - name: /usr/local/bin/check_global_vrouter_config.py
+  - source: salt://opencontrail/files/{{ version }}/check_global_vrouter_config.py  # same version value as selected at the top of this file
+  - template: jinja
+  - mode: 755
+  {%- endif %}
+{%- endif %}