Merge "Add contrail-status and vrouter checks for telegraf"
diff --git a/opencontrail/files/3.0/check_contrail_health.py b/opencontrail/files/3.0/check_contrail_health.py
new file mode 100644
index 0000000..8449f40
--- /dev/null
+++ b/opencontrail/files/3.0/check_contrail_health.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+"""Telegraf exec check reporting the health of contrail-status services.
+
+Writes one InfluxDB line-protocol point per service parsed from the CMD
+output; exit_code is 0 for a healthy service and 1 otherwise.
+"""
+import subprocess
+import sys
+
+
+CMD = "contrail-status"
+
+# Services for which one extra state besides 'active' is also healthy.
+SERVICE_EXTRA_STATES = {'contrail-schema': 'backup',
+                        'contrail-svc-monitor': 'backup',
+                        'contrail-device-manager': 'backup'}
+
+
+def check(output=sys.stdout):
+    """Run CMD, parse its report and write one point per service.
+
+    :param output: file-like object the line-protocol points are written to.
+    :returns: dict mapping each role to a list of
+        ``{'service': ..., 'status': ..., 'exit_code': ...}`` dicts.
+    :raises subprocess.CalledProcessError: if CMD exits with non-zero status.
+    """
+    # universal_newlines makes check_output return text on Python 3 too.
+    report = subprocess.check_output(CMD.split(' '), universal_newlines=True)
+    result = {}
+    role = None
+    for line in report.splitlines():
+        line = line.strip()
+        if not line:
+            continue
+        if line.startswith('=='):
+            # Section header: the text between the '=' runs names the role.
+            role = line.strip('=').strip().replace(' ', '_')
+            # setdefault keeps already collected rows if a header repeats.
+            result.setdefault(role, [])
+            continue
+        fields = line.split()
+        if role is None or len(fields) < 2:
+            # Not a "<service> <status>" row under a section header, so
+            # there is nothing to report for it.
+            continue
+        service = fields[0].split(':')[0]
+        status = ' '.join(fields[1:])
+        healthy = (status == 'active' or
+                   SERVICE_EXTRA_STATES.get(service) == status)
+        result[role].append({'service': service, 'status': status,
+                             'exit_code': 0 if healthy else 1})
+    # NOTE(ivasilevskaya) ignore contrail database in favor of supervisor
+    # database
+    result.pop('Contrail_Database', None)
+    # items() instead of iteritems() so this runs on Python 2 and 3 alike.
+    for role_name, services in result.items():
+        for info in services:
+            output.write(("%(workload)s,contrail_service=%(service)s,"
+                          "role=%(role)s exit_code=%(exit_code)s\n") %
+                         {'workload': 'contrail_health',
+                          'service': info['service'],
+                          'exit_code': info['exit_code'],
+                          'role': role_name})
+    return result
+
+
+if __name__ == "__main__":
+    check()
diff --git a/opencontrail/files/3.0/check_global_vrouter_config.py b/opencontrail/files/3.0/check_global_vrouter_config.py
new file mode 100644
index 0000000..12d4d8f
--- /dev/null
+++ b/opencontrail/files/3.0/check_global_vrouter_config.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+import sys
+from vnc_api import vnc_api
+
+
+def check(output=sys.stdout):
+    """Report whether exactly one global-vrouter-config object is listed.
+
+    Writes a single point whose exit_code is 0 when the API returns exactly
+    one 'global-vrouter-configs' entry, and 1 otherwise or on any error.
+
+    :param output: file-like object the point is written to.
+    """
+    try:
+        client = vnc_api.VncApi(conf_file='/etc/contrail/vnc_api_lib.ini')
+        listing = client._objects_list('global-vrouter-config')
+        found = len(listing.get('global-vrouter-configs', []))
+    # XXX FIXME narrow down exception type
+    except Exception:
+        # Any failure while querying the API is reported as a failed check.
+        exit_code = 1
+    else:
+        exit_code = 0 if found == 1 else 1
+    output.write('%(workload)s exit_code=%(exit_code)s\n' %
+                 {'workload': 'contrail_global_vrouter_config',
+                  'exit_code': exit_code})
+
+
+if __name__ == "__main__":
+    check()
diff --git a/opencontrail/files/4.0/check_contrail_health.py b/opencontrail/files/4.0/check_contrail_health.py
new file mode 100644
index 0000000..3617553
--- /dev/null
+++ b/opencontrail/files/4.0/check_contrail_health.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+"""Telegraf exec check reporting the health of contrail-status services.
+
+Writes one InfluxDB line-protocol point per service parsed from the CMD
+output; exit_code is 0 for a healthy service and 1 otherwise.
+"""
+import subprocess
+import sys
+
+CMD = "doctrail all contrail-status"
+
+# Services for which one extra state besides 'active' is also healthy.
+SERVICE_EXTRA_STATES = {'contrail-schema': 'backup',
+                        'contrail-svc-monitor': 'backup',
+                        'contrail-device-manager': 'backup'}
+
+
+def check(output=sys.stdout):
+    """Run CMD, parse its report and write one point per service.
+
+    :param output: file-like object the line-protocol points are written to.
+    :returns: dict mapping each role to a list of
+        ``{'service': ..., 'status': ..., 'exit_code': ...}`` dicts.
+    :raises subprocess.CalledProcessError: if CMD exits with non-zero status.
+    """
+    # universal_newlines makes check_output return text on Python 3 too.
+    report = subprocess.check_output(CMD.split(' '), universal_newlines=True)
+    result = {}
+    role = None
+    for line in report.splitlines():
+        line = line.strip()
+        # Skip blank lines and the "FOR NODE" / "*******" banner lines.
+        if not line or line.startswith(("FOR NODE", "*******")):
+            continue
+        if line.startswith('=='):
+            # Section header: the text between the '=' runs names the role.
+            role = line.strip('=').strip().replace(' ', '_')
+            # setdefault keeps already collected rows if a header repeats.
+            result.setdefault(role, [])
+            continue
+        fields = line.split()
+        if role is None or len(fields) < 2:
+            # Not a "<service> <status>" row under a section header, so
+            # there is nothing to report for it.
+            continue
+        service = fields[0].split(':')[0]
+        status = ' '.join(fields[1:])
+        healthy = (status == 'active' or
+                   SERVICE_EXTRA_STATES.get(service) == status)
+        result[role].append({'service': service, 'status': status,
+                             'exit_code': 0 if healthy else 1})
+    # NOTE(review): the 3.0 variant of this script drops the
+    # 'Contrail_Database' role here; this one reports it - confirm intended.
+    # items() instead of iteritems() so this runs on Python 2 and 3 alike.
+    for role_name, services in result.items():
+        for info in services:
+            output.write(("%(workload)s,contrail_service=%(service)s,"
+                          "role=%(role)s exit_code=%(exit_code)s\n") %
+                         {'workload': 'contrail_health',
+                          'service': info['service'],
+                          'exit_code': info['exit_code'],
+                          'role': role_name})
+    return result
+
+
+if __name__ == "__main__":
+    check()
diff --git a/opencontrail/files/4.0/check_global_vrouter_config.py b/opencontrail/files/4.0/check_global_vrouter_config.py
new file mode 100644
index 0000000..12d4d8f
--- /dev/null
+++ b/opencontrail/files/4.0/check_global_vrouter_config.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+import sys
+from vnc_api import vnc_api
+
+
+def check(output=sys.stdout):
+    """Report whether exactly one global-vrouter-config object is listed.
+
+    Writes a single point whose exit_code is 0 when the API returns exactly
+    one 'global-vrouter-configs' entry, and 1 otherwise or on any error.
+
+    :param output: file-like object the point is written to.
+    """
+    try:
+        client = vnc_api.VncApi(conf_file='/etc/contrail/vnc_api_lib.ini')
+        listing = client._objects_list('global-vrouter-config')
+        found = len(listing.get('global-vrouter-configs', []))
+    # XXX FIXME narrow down exception type
+    except Exception:
+        # Any failure while querying the API is reported as a failed check.
+        exit_code = 1
+    else:
+        exit_code = 0 if found == 1 else 1
+    output.write('%(workload)s exit_code=%(exit_code)s\n' %
+                 {'workload': 'contrail_global_vrouter_config',
+                  'exit_code': exit_code})
+
+
+if __name__ == "__main__":
+    check()
diff --git a/opencontrail/init.sls b/opencontrail/init.sls
index ee862a7..09a204a 100644
--- a/opencontrail/init.sls
+++ b/opencontrail/init.sls
@@ -23,4 +23,9 @@
{%- endif %}
{% if pillar.opencontrail.common is defined %}
- opencontrail.common
-{% endif %}
\ No newline at end of file
+{% endif %}
+{%- if pillar.opencontrail.collector is defined or
+ pillar.opencontrail.control is defined or
+ pillar.opencontrail.database is defined %}
+- opencontrail.monitoring
+{%- endif %}
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
index d7cf03f..7f8ef15 100644
--- a/opencontrail/meta/prometheus.yml
+++ b/opencontrail/meta/prometheus.yml
@@ -133,6 +133,42 @@
{%- raw %}
summary: "{{ $labels.name }} service outage"
description: "All {{ $labels.process_name }} processes are down."
+ ContrailHealthCheckDisabled:
+ if: >-
+ absent(contrail_health_exit_code) == 1
+ labels:
+ severity: critical
+ service: contrail
+ annotations:
+ summary: "Contrail healthcheck disabled"
+ description: "Contrail healthcheck is disabled."
+ ContrailHealthCheckFailed:
+ if: >-
+ contrail_health_exit_code != 0
+ labels:
+ severity: critical
+ service: contrail
+ annotations:
+ summary: "Contrail healthcheck failed"
+ description: "Contrail healthcheck failed for the {{ $labels.contrail_service }} on the {{ $labels.host }} node."
+ ContrailGlobalVrouterConfigCheckDisabled:
+ if: >-
+ absent(contrail_global_vrouter_config_exit_code) == 1
+ labels:
+ severity: critical
+ service: contrail
+ annotations:
+ summary: "Contrail global vrouter config check disabled"
+ description: "Contrail global vrouter config check is disabled."
+ ContrailGlobalVrouterConfigCheckFailed:
+ if: >-
+ contrail_global_vrouter_config_exit_code != 0
+ labels:
+ severity: critical
+ service: contrail
+ annotations:
+ summary: "Contrail global vrouter config check failed"
+ description: "Contrail global vrouter config check failed on the {{ $labels.host }} node."
ContrailBGPSessionsNoEstablished:
if: >-
max(contrail_bgp_session_count) by (host) == 0
diff --git a/opencontrail/meta/telegraf.yml b/opencontrail/meta/telegraf.yml
index 118750e..21cc5d3 100644
--- a/opencontrail/meta/telegraf.yml
+++ b/opencontrail/meta/telegraf.yml
@@ -12,6 +12,24 @@
data_format: "json"
timeout: "10s"
{%- endif %}
+ {%- if pillar.opencontrail.collector is defined or
+ pillar.opencontrail.control is defined or
+ pillar.opencontrail.database is defined %}
+ monitor_contrail_health:
+ template: telegraf/files/input/exec.conf
+ commands:
+ - /usr/local/bin/check_contrail_health.py
+ data_format: influx
+ interval: 30s
+ {%- endif %}
+ {%- if pillar.opencontrail.control is defined %}
+ monitor_global_vrouter_config:
+ template: telegraf/files/input/exec.conf
+ commands:
+ - /usr/local/bin/check_global_vrouter_config.py
+ data_format: influx
+ interval: 30s
+ {%- endif %}
procstat:
process:
{%- if collector.get('enabled', False) %}
diff --git a/opencontrail/monitoring.sls b/opencontrail/monitoring.sls
new file mode 100644
index 0000000..30a0bf1
--- /dev/null
+++ b/opencontrail/monitoring.sls
@@ -0,0 +1,34 @@
+{#- Deploys the telegraf check scripts when telegraf is configured. #}
+{%- if pillar.opencontrail is defined and pillar.telegraf is defined %}
+  {#- The script directory is chosen by role version; when several roles are
+      defined the last assignment below (database) wins. #}
+  {%- if pillar.opencontrail.collector is defined %}
+  {%- set version = pillar.opencontrail.collector.version %}
+  {%- endif %}
+  {%- if pillar.opencontrail.control is defined %}
+  {%- set version = pillar.opencontrail.control.version %}
+  {%- endif %}
+  {%- if pillar.opencontrail.database is defined %}
+  {%- set version = pillar.opencontrail.database.version %}
+  {%- endif %}
+
+  {#- The scripts are plain Python, so they are copied verbatim instead of
+      being rendered as Jinja templates. #}
+  {%- if pillar.opencontrail.collector is defined or
+         pillar.opencontrail.control is defined or
+         pillar.opencontrail.database is defined %}
+contrail_control_status_check_telegraf_script:
+  file.managed:
+    - name: /usr/local/bin/check_contrail_health.py
+    - source: salt://opencontrail/files/{{ version }}/check_contrail_health.py
+    - mode: 755
+  {%- endif %}
+
+  {%- if pillar.opencontrail.control is defined %}
+contrail_control_vrouter_check_telegraf_script:
+  file.managed:
+    - name: /usr/local/bin/check_global_vrouter_config.py
+    - source: salt://opencontrail/files/{{ version }}/check_global_vrouter_config.py
+    - mode: 755
+  {%- endif %}
+{%- endif %}