Merge "Install contrail-openstack-vrouter for Juniper contrail"
diff --git a/opencontrail/map.jinja b/opencontrail/map.jinja
index 31790d8..921274a 100644
--- a/opencontrail/map.jinja
+++ b/opencontrail/map.jinja
@@ -322,14 +322,16 @@
'vrouter_dns_xmpp_sessions_variation_threshold': 5,
'vrouter_lls_too_high_threshold': 10,
'vrouter_lls_variation_threshold': 5,
- 'vrouter_flows_active_too_high_threshold': 1200,
- 'vrouter_flows_discard_too_high_threshold': 1200,
- 'vrouter_flows_flow_action_drop_too_high_threshold': 5,
+ 'vrouter_flows_active_too_high_threshold': 100,
+ 'vrouter_flows_discard_too_high_threshold': 0.1,
+ 'vrouter_flows_flow_action_drop_too_high_threshold': 0.2,
'vrouter_flows_frag_err_too_high_threshold': 100,
- 'vrouter_flows_invalid_nh_too_high_threshold': 104,
- 'vrouter_flows_composite_invalid_interface_too_high_threshold': 105,
+ 'vrouter_flows_invalid_nh_too_high_threshold': 0.1,
+ 'vrouter_flows_composite_invalid_interface_too_high_threshold': 0.05,
'vrouter_flows_invalid_label_too_high_threshold': 100,
- 'vrouter_flows_flow_queue_limit_exceeded_too_high_threshold': 100,
+ 'vrouter_flows_flow_queue_limit_exceeded_too_high_threshold': 0.1,
'vrouter_flows_flow_table_full_too_high_threshold': 100,
+ 'services_failed_warning_threshold_percent': 0.3,
+ 'services_failed_critical_threshold_percent': 0.6,
},
}, grain='os_family', merge=salt['pillar.get']('opencontrail:monitoring')) %}
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
index b360903..384caf0 100644
--- a/opencontrail/meta/prometheus.yml
+++ b/opencontrail/meta/prometheus.yml
@@ -1,6 +1,9 @@
{%- if pillar.opencontrail is defined %}
{%- from "opencontrail/map.jinja" import control, collector, compute, config, database, web, monitoring with context %}
+ {%- set all_contrail_processes = [] %}
+ {%- set all_contrail_apis = [] %}
+
{%- if collector.get('enabled', False) %}
{%- set collector_apis = ( 'contrail.collector', ) %}
{%- set collector_processes = (
@@ -8,6 +11,14 @@
'contrail-nodemgr', 'contrail-query-engine', 'contrail-snmp-collector',
'contrail-supervisord-analytics', 'contrail-topology',
) %}
+
+ {%- for api in collector_apis %}
+ {% do all_contrail_apis.append(api) %}
+ {% endfor %}
+
+ {%- for process in collector_processes %}
+ {% do all_contrail_processes.append(process) %}
+ {% endfor %}
{%- endif %}
{%- if compute.get('enabled', False) %}
@@ -15,6 +26,14 @@
{%- set compute_processes = (
'contrail-nodemgr-vrouter', 'contrail-supervisord-vrouter', 'contrail-vrouter-agent'
) %}
+
+ {%- for api in compute_apis %}
+ {% do all_contrail_apis.append(api) %}
+ {% endfor %}
+
+ {%- for process in compute_processes %}
+ {% do all_contrail_processes.append(process) %}
+ {% endfor %}
{%- endif %}
{%- if control.get('enabled', False) %}
@@ -27,6 +46,14 @@
'contrail-schema', 'contrail-supervisord-config',
'contrail-supervisord-control', 'contrail-svc-monitor',
) %}
+
+ {%- for api in control_apis %}
+ {% do all_contrail_apis.append(api) %}
+ {% endfor %}
+
+ {%- for process in control_processes %}
+ {% do all_contrail_processes.append(process) %}
+ {% endfor %}
{%- endif %}
{%- if database.get('enabled', False) %}
@@ -34,6 +61,10 @@
'kafka-server', 'cassandra-server',
'contrail-nodemgr-database', 'contrail-supervisord-database',
) %}
+
+ {%- for process in database_processes %}
+ {% do all_contrail_processes.append(process) %}
+ {% endfor %}
{%- endif %}
{%- if web.get('enabled', False) %}
@@ -46,6 +77,10 @@
'contrail-web-server',
) %}
{%- endif %}
+
+ {%- for process in web_processes %}
+ {% do all_contrail_processes.append(process) %}
+ {% endfor %}
{%- endif %}
{%- if database_processes is defined and
@@ -87,37 +122,105 @@
web_processes is defined %}
server:
alert:
- {%- if control_processes is defined %}
- {%- for contrail_api in control_apis %}
- {%- set words = contrail_api.split('.') %}
- {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+ {%- for contrail_api in all_contrail_apis %}
+ {%- set words = contrail_api.split('.') %}
+ {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIInfo:
if: >-
http_response_status{service=~"{{ contrail_api }}"} == 0
-{%- raw %}
+ {%- raw %}
+ for: 2m
+ labels:
+ severity: info
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "Endpoint check for '{{ $labels.service }}' is failed"
+ description: Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}
+ {%- endraw %}
+ {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIWarning:
+ if: >-
+ count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) >= count(http_response_status{service=~"{{ contrail_api }}"}) by (service) *{{ monitoring.services_failed_warning_threshold_percent }}
+ {%- raw %}
+ for: 2m
+ labels:
+ severity: warning
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
+ description: More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down
+ {%- endraw %}
+ {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APICritical:
+ if: >-
+ count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) >= count(http_response_status{service=~"{{ contrail_api }}"}) by (service) *{{ monitoring.services_failed_critical_threshold_percent }}
+ {%- raw %}
+ for: 2m
+ labels:
+ severity: critical
+ service: "{{ $labels.service }}"
+ annotations:
+ summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down"
+ description: More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of '{{ $labels.service }}' is down
+ {%- endraw %}
+ {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+ if: >-
+ count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) == count(http_response_status{service=~"{{ contrail_api }}"}) by (service)
+ {%- raw %}
for: 2m
labels:
severity: down
service: "{{ $labels.service }}"
annotations:
- summary: "Endpoint check for '{{ $labels.service }}' is down"
- description: >-
- Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
-{%- endraw %}
- {%- endfor %}
- {%- for contrail_process in control_processes %}
- {%- set words = contrail_process.split('-') %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
+ summary: "All '{{ $labels.service }}' APIs are down"
+ description: All '{{ $labels.service }}' APIs are down
+ {%- endraw %}
+ {%- endfor %}
+{%- for contrail_process in all_contrail_processes %}
+ {%- set words = contrail_process.split('-') %}
+ {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessInfo:
if: >-
procstat_running{process_name="{{ contrail_process }}"} == 0
labels:
+ severity: info
+ service: {{ contrail_process }}
+ annotations:
+ {%- raw %}
+ summary: '{{ $labels.service }} service is down'
+ description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
+ {%- endraw %}
+ {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessWarning:
+ if: >-
+ count(procstat_running{process_name="{{ contrail_process }}"} == 0) >= count(procstat_running{process_name="{{ contrail_process }}"}) *{{ monitoring.services_failed_warning_threshold_percent }}
+ labels:
+ severity: warning
+ service: {{ contrail_process }}
+ annotations:
+ {%- raw %}
+ summary: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}% of '{{ contrail_process }}' is down{%- raw %}"
+ description: "More than {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}% of '{{ contrail_process }}' is down{%- raw %}"
+ {%- endraw %}
+ {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessCritical:
+ if: >-
+ count(procstat_running{process_name="{{ contrail_process }}"} == 0) >= count(procstat_running{process_name="{{ contrail_process }}"}) *{{ monitoring.services_failed_critical_threshold_percent }}
+ labels:
+ severity: critical
+ service: {{ contrail_process }}
+ annotations:
+ {%- raw %}
+ summary: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}% of '{{ contrail_process }}' is down{%- raw %}"
+ description: "More than {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}% of '{{ contrail_process }}' is down{%- raw %}"
+ {%- endraw %}
+ {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
+ if: >-
+ count(procstat_running{process_name="{{ contrail_process }}"} == 0) == count(procstat_running{process_name="{{ contrail_process }}"})
+ labels:
severity: down
service: {{ contrail_process }}
annotations:
-{%- raw %}
- summary: '{{ $labels.service }} service is down'
- description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
-{%- endraw %}
- {%- endfor %}
+ {%- raw %}
+ summary: "All '{%- endraw %}{{ contrail_process }}{%- raw %}' services are down"
+ description: "All '{%- endraw %}{{ contrail_process }}{%- raw %}' services are down"
+ {%- endraw %}
+{%- endfor %}
+ {%- if control_processes is defined %}
{%- raw %}
ContrailBGPSessionsNoneUp:
if: >-
@@ -205,39 +308,6 @@
description: 'There are too many XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ xmpp_variation_threshold }})'
{%- endif %}
- {%- if collector_processes is defined %}
- {%- for contrail_api in collector_apis %}
- {%- set words = contrail_api.split('.') %}
- {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
- if: >-
- http_response_status{service=~"{{ contrail_api }}"} == 0
-{%- raw %}
- for: 2m
- labels:
- severity: down
- service: "{{ $labels.service }}"
- annotations:
- summary: "Endpoint check for '{{ $labels.service }}' is down"
- description: >-
- Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
-{%- endraw %}
- {%- endfor %}
- {%- for contrail_process in collector_processes %}
- {%- set words = contrail_process.split('-') %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
- if: >-
- procstat_running{process_name="{{ contrail_process }}"} == 0
- labels:
- severity: down
- service: {{ contrail_process }}
- annotations:
-{%- raw %}
- summary: '{{ $labels.service }} service is down'
- description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
-{%- endraw %}
- {%- endfor %}
- {%- endif %}
-
{%- if compute_processes is defined %}
{%- raw %}
ContrailVrouterXMPPSessionsNone:
@@ -335,7 +405,7 @@
ContrailFlowsActiveTooMany:
if: >-
{%- set vrouter_flows_active_toohigh_threshold = monitoring.vrouter_flows_active_too_high_threshold %}
- min(contrail_vrouter_flows_active) by (host) >= {{ vrouter_flows_active_toohigh_threshold }}
+ deriv(contrail_vrouter_flows_active[5m]) >= {{ vrouter_flows_active_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
@@ -347,7 +417,7 @@
ContrailFlowsDiscardTooMany:
if: >-
{%- set vrouter_flows_discard_toohigh_threshold = monitoring.vrouter_flows_discard_too_high_threshold %}
- min(contrail_vrouter_flows_discard) by (host) >= {{ vrouter_flows_discard_toohigh_threshold }}
+ rate(contrail_vrouter_flows_discard[5m]) >= {{ vrouter_flows_discard_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
@@ -383,7 +453,7 @@
ContrailFlowsInvalidNHTooMany:
if: >-
{%- set vrouter_flows_invalid_nh_toohigh_threshold = monitoring.vrouter_flows_invalid_nh_too_high_threshold %}
- min(contrail_vrouter_flows_invalid_nh) by (host) >= {{ vrouter_flows_invalid_nh_toohigh_threshold }}
+ rate(contrail_vrouter_flows_invalid_nh[5m]) >= {{ vrouter_flows_invalid_nh_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
@@ -395,7 +465,7 @@
ContrailFlowsInvalidITFTooMany:
if: >-
{%- set vrouter_flows_composite_invalid_interface_toohigh_threshold = monitoring.vrouter_flows_composite_invalid_interface_too_high_threshold %}
- min(contrail_vrouter_flows_composite_invalid_interface) by (host) >= {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}
+ rate(contrail_vrouter_flows_composite_invalid_interface[5m]) >= {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
@@ -419,7 +489,7 @@
ContrailFlowsQueueLimitExceededTooMany:
if: >-
{%- set vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold = monitoring.vrouter_flows_flow_queue_limit_exceeded_too_high_threshold %}
- min(contrail_vrouter_flows_flow_queue_limit_exceeded) by (host) >= {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}
+ rate(contrail_vrouter_flows_flow_queue_limit_exceeded[5m]) >= {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
@@ -440,83 +510,49 @@
annotations:
summary: 'Too many vRouter flows with table full'
description: 'There are too many vRouter flows with table full on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_flow_table_full_toohigh_threshold }})'
- {%- for contrail_api in compute_apis %}
- {%- set words = contrail_api.split('.') %}
- {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
- if: >-
- http_response_status{service=~"{{ contrail_api }}"} == 0
-{%- raw %}
- for: 2m
- labels:
- severity: down
- service: "{{ $labels.service }}"
- annotations:
- summary: "Endpoint check for '{{ $labels.service }}' is down"
- description: >-
- Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
-{%- endraw %}
- {%- endfor %}
- {%- for contrail_process in compute_processes %}
- {%- set words = contrail_process.split('-') %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
- if: >-
- procstat_running{process_name="{{ contrail_process }}"} == 0
- labels:
- severity: down
- service: {{ contrail_process }}
- annotations:
-{%- raw %}
- summary: '{{ $labels.service }} service is down'
- description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
-{%- endraw %}
- {%- endfor %}
- {%- endif %}
-
- {%- if database_processes is defined %}
- {%- for contrail_process in database_processes %}
- {%- set words = contrail_process.split('-') %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
- if: >-
- procstat_running{process_name="{{ contrail_process }}"} == 0
- labels:
- severity: down
- service: {{ contrail_process }}
- annotations:
-{%- raw %}
- summary: '{{ $labels.service }} service is down'
- description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
-{%- endraw %}
- {%- endfor %}
- {%- endif %}
-
- {%- if web_processes is defined %}
- {%- for contrail_process in web_processes %}
- {%- set words = contrail_process.split('-') %}
- {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
- if: >-
- procstat_running{process_name="{{ contrail_process }}"} == 0
- labels:
- severity: down
- service: {{ contrail_process }}
- annotations:
-{%- raw %}
- summary: '{{ $labels.service }} service is down'
- description: '{{ $labels.service }} service is down on node { $labels.host }}'
-{%- endraw %}
- {%- endfor %}
{%- endif %}
{%- if database.get('enabled', False) %}
- ZookeeperDown:
+ ZookeeperInfo:
if: >-
zookeeper_up != 1
for: 2m
labels:
+ severity: info
+ service: zookeeper
+ annotations:
+ summary: 'Zookeeper service down'
+ description: 'Zookeeper service is down on node {% raw %}{{ $labels.host }}{% endraw %}.'
+ ZookeeperWarning:
+ if: >-
+ count(zookeeper_up == 0) >= count(zookeeper_up) * {{ monitoring.services_failed_warning_threshold_percent }}
+ for: 2m
+ labels:
severity: warning
service: zookeeper
annotations:
- summary: 'Zookeeper service down'
- description: 'Zookeeper service is down on node {% raw %}{{ $labels.host }}{% endraw %}.'
+ summary: "More than {{monitoring.services_failed_warning_threshold_percent*100}}% of Zookeeper services are down"
+ description: "More than {{monitoring.services_failed_warning_threshold_percent*100}}% of Zookeeper services are down"
+ ZookeeperCritical:
+ if: >-
+ count(zookeeper_up == 0) >= count(zookeeper_up) * {{ monitoring.services_failed_critical_threshold_percent }}
+ for: 2m
+ labels:
+ severity: critical
+ service: zookeeper
+ annotations:
+ summary: "More than {{monitoring.services_failed_critical_threshold_percent*100}}% of Zookeeper services are down"
+ description: "More than {{monitoring.services_failed_critical_threshold_percent*100}}% of Zookeeper services are down"
+ ZookeeperDown:
+ if: >-
+ count(zookeeper_up == 0) == count(zookeeper_up)
+ for: 2m
+ labels:
+ severity: down
+ service: zookeeper
+ annotations:
+ summary: 'All Zookeeper services are down'
+ description: 'All Zookeeper services are down'
{%- endif %}
{%- if exporters is defined %}