Merge "Install contrail-openstack-vrouter for Juniper contrail"
diff --git a/opencontrail/map.jinja b/opencontrail/map.jinja
index 31790d8..921274a 100644
--- a/opencontrail/map.jinja
+++ b/opencontrail/map.jinja
@@ -322,14 +322,16 @@
         'vrouter_dns_xmpp_sessions_variation_threshold': 5,
         'vrouter_lls_too_high_threshold': 10,
         'vrouter_lls_variation_threshold': 5,
-        'vrouter_flows_active_too_high_threshold': 1200,
-        'vrouter_flows_discard_too_high_threshold': 1200,
-        'vrouter_flows_flow_action_drop_too_high_threshold': 5,
+        'vrouter_flows_active_too_high_threshold': 100,
+        'vrouter_flows_discard_too_high_threshold': 0.1,
+        'vrouter_flows_flow_action_drop_too_high_threshold': 0.2,
         'vrouter_flows_frag_err_too_high_threshold': 100,
-        'vrouter_flows_invalid_nh_too_high_threshold': 104,
-        'vrouter_flows_composite_invalid_interface_too_high_threshold': 105,
+        'vrouter_flows_invalid_nh_too_high_threshold': 0.1,
+        'vrouter_flows_composite_invalid_interface_too_high_threshold': 0.05,
         'vrouter_flows_invalid_label_too_high_threshold': 100,
-        'vrouter_flows_flow_queue_limit_exceeded_too_high_threshold': 100,
+        'vrouter_flows_flow_queue_limit_exceeded_too_high_threshold': 0.1,
         'vrouter_flows_flow_table_full_too_high_threshold': 100,
+        'services_failed_warning_threshold_percent': 0.3,
+        'services_failed_critical_threshold_percent': 0.6,
     },
 }, grain='os_family', merge=salt['pillar.get']('opencontrail:monitoring')) %}
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
index b360903..384caf0 100644
--- a/opencontrail/meta/prometheus.yml
+++ b/opencontrail/meta/prometheus.yml
@@ -1,6 +1,9 @@
 {%- if pillar.opencontrail is defined %}
   {%- from "opencontrail/map.jinja" import control, collector, compute, config, database, web, monitoring with context %}
 
+  {%- set all_contrail_processes = [] %}
+  {%- set all_contrail_apis = [] %}
+
   {%- if collector.get('enabled', False) %}
     {%- set collector_apis = ( 'contrail.collector', ) %}
     {%- set collector_processes = (
@@ -8,6 +11,14 @@
         'contrail-nodemgr', 'contrail-query-engine', 'contrail-snmp-collector',
         'contrail-supervisord-analytics', 'contrail-topology',
         ) %}
+
+    {%- for api in collector_apis %}{# feed the shared all_contrail_apis list; tags use `-` so the loop renders no text #}
+      {%- do all_contrail_apis.append(api) %}
+    {%- endfor %}
+
+    {%- for process in collector_processes %}{# feed the shared all_contrail_processes list #}
+      {%- do all_contrail_processes.append(process) %}
+    {%- endfor %}
   {%- endif %}
 
   {%- if compute.get('enabled', False) %}
@@ -15,6 +26,14 @@
     {%- set compute_processes = (
         'contrail-nodemgr-vrouter', 'contrail-supervisord-vrouter', 'contrail-vrouter-agent'
         ) %}
+
+    {%- for api in compute_apis %}{# feed the shared all_contrail_apis list; tags use `-` so the loop renders no text #}
+      {%- do all_contrail_apis.append(api) %}
+    {%- endfor %}
+
+    {%- for process in compute_processes %}{# feed the shared all_contrail_processes list #}
+      {%- do all_contrail_processes.append(process) %}
+    {%- endfor %}
   {%- endif %}
 
   {%- if control.get('enabled', False) %}
@@ -27,6 +46,14 @@
         'contrail-schema', 'contrail-supervisord-config',
         'contrail-supervisord-control', 'contrail-svc-monitor',
         ) %}
+
+    {%- for api in control_apis %}{# feed the shared all_contrail_apis list; tags use `-` so the loop renders no text #}
+      {%- do all_contrail_apis.append(api) %}
+    {%- endfor %}
+
+    {%- for process in control_processes %}{# feed the shared all_contrail_processes list #}
+      {%- do all_contrail_processes.append(process) %}
+    {%- endfor %}
   {%- endif %}
 
   {%- if database.get('enabled', False) %}
@@ -34,6 +61,10 @@
         'kafka-server', 'cassandra-server',
         'contrail-nodemgr-database', 'contrail-supervisord-database',
         ) %}
+
+    {%- for process in database_processes %}{# feed the shared all_contrail_processes list; tags use `-` so the loop renders no text #}
+      {%- do all_contrail_processes.append(process) %}
+    {%- endfor %}
   {%- endif %}
 
   {%- if web.get('enabled', False) %}
@@ -46,6 +77,10 @@
           'contrail-web-server',
           ) %}
     {%- endif %}
+
+    {%- for process in web_processes %}{# feed the shared all_contrail_processes list; tags use `-` so the loop renders no text #}
+      {%- do all_contrail_processes.append(process) %}
+    {%- endfor %}
   {%- endif %}
 
   {%- if database_processes is defined and
@@ -87,37 +122,105 @@
          web_processes is defined %}
 server:
   alert:
-    {%- if control_processes is defined %}
-      {%- for contrail_api in control_apis %}
-        {%- set words = contrail_api.split('.') %}
-    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+    {%- for contrail_api in all_contrail_apis %}{# per API: Info = any endpoint failing; Warning/Critical = failing share reaches the configured fraction; Down = every endpoint failing. Percentages are rounded so overridden fractions do not print float artifacts. #}
+      {%- set words = contrail_api.split('.') %}
+    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIInfo:
       if: >-
         http_response_status{service=~"{{ contrail_api }}"} == 0
-{%- raw %}
+      {%- raw %}
+      for: 2m
+      labels:
+        severity: info
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "Endpoint check for '{{ $labels.service }}' is failed"
+        description: "Endpoint check for '{{ $labels.service }}' is failed for 2 minutes on node {{ $labels.host }}"
+      {%- endraw %}
+    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIWarning:
+      if: >-
+        count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) >= count(http_response_status{service=~"{{ contrail_api }}"}) by (service) * {{ monitoring.services_failed_warning_threshold_percent }}
+      {%- raw %}
+      for: 2m
+      labels:
+        severity: warning
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "More than {%- endraw %} {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(1) }}%{%- raw %} of '{{ $labels.service }}' is down"
+        description: "More than {%- endraw %} {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(1) }}%{%- raw %} of '{{ $labels.service }}' is down"
+      {%- endraw %}
+    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APICritical:
+      if: >-
+        count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) >= count(http_response_status{service=~"{{ contrail_api }}"}) by (service) * {{ monitoring.services_failed_critical_threshold_percent }}
+      {%- raw %}
+      for: 2m
+      labels:
+        severity: critical
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "More than {%- endraw %} {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(1) }}%{%- raw %} of '{{ $labels.service }}' is down"
+        description: "More than {%- endraw %} {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(1) }}%{%- raw %} of '{{ $labels.service }}' is down"
+      {%- endraw %}
+    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+      if: >-
+        count(http_response_status{service=~"{{ contrail_api }}"} == 0) by (service) == count(http_response_status{service=~"{{ contrail_api }}"}) by (service)
+      {%- raw %}
       for: 2m
       labels:
         severity: down
         service: "{{ $labels.service }}"
       annotations:
-        summary: "Endpoint check for '{{ $labels.service }}' is down"
-        description: >-
-            Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
-{%- endraw %}
-      {%- endfor %}
-      {%- for contrail_process in control_processes %}
-        {%- set words = contrail_process.split('-') %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
+        summary: "All '{{ $labels.service }}' APIs are down"
+        description: "All '{{ $labels.service }}' APIs are down"
+      {%- endraw %}
+    {%- endfor %}
+{%- for contrail_process in all_contrail_processes %}{# per process: Info = any instance stopped; Warning/Critical = stopped share reaches the configured fraction; Down = every instance stopped. The aggregated rules use count() without `by`, which drops the series labels, so their annotations render the process name through Jinja instead of $labels. #}
+      {%- set words = contrail_process.split('-') %}
+    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessInfo:
       if: >-
         procstat_running{process_name="{{ contrail_process }}"} == 0
       labels:
+        severity: info
+        service: {{ contrail_process }}
+      annotations:
+    {%- raw %}
+        summary: '{{ $labels.service }} service is down'
+        description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
+    {%- endraw %}
+    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessWarning:
+      if: >-
+        count(procstat_running{process_name="{{ contrail_process }}"} == 0) >= count(procstat_running{process_name="{{ contrail_process }}"}) * {{ monitoring.services_failed_warning_threshold_percent }}
+      labels:
+        severity: warning
+        service: {{ contrail_process }}
+      annotations:
+    {%- raw %}
+        summary: "More than {%- endraw %} {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(1) }}% of '{{ contrail_process }}' is down{%- raw %}"
+        description: "More than {%- endraw %} {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(1) }}% of '{{ contrail_process }}' is down{%- raw %}"
+    {%- endraw %}
+    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessCritical:
+      if: >-
+        count(procstat_running{process_name="{{ contrail_process }}"} == 0) >= count(procstat_running{process_name="{{ contrail_process }}"}) * {{ monitoring.services_failed_critical_threshold_percent }}
+      labels:
+        severity: critical
+        service: {{ contrail_process }}
+      annotations:
+    {%- raw %}
+        summary: "More than {%- endraw %} {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(1) }}% of '{{ contrail_process }}' is down{%- raw %}"
+        description: "More than {%- endraw %} {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(1) }}% of '{{ contrail_process }}' is down{%- raw %}"
+    {%- endraw %}
+    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
+      if: >-
+        count(procstat_running{process_name="{{ contrail_process }}"} == 0) == count(procstat_running{process_name="{{ contrail_process }}"})
+      labels:
         severity: down
         service: {{ contrail_process }}
       annotations:
-{%- raw %}
-        summary: '{{ $labels.service }} service is down'
-        description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
-{%- endraw %}
-      {%- endfor %}
+    {%- raw %}
+        summary: "All '{%- endraw %}{{ contrail_process }}{%- raw %}' services are down"
+        description: "All '{%- endraw %}{{ contrail_process }}{%- raw %}' services are down"
+    {%- endraw %}
+{%- endfor %}
+    {%- if control_processes is defined %}
 {%- raw %}
     ContrailBGPSessionsNoneUp:
       if: >-
@@ -205,39 +308,6 @@
         description: 'There are too many XMPP sessions changes on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ xmpp_variation_threshold }})'
     {%- endif %}
 
-    {%- if collector_processes is defined %}
-      {%- for contrail_api in collector_apis %}
-        {%- set words = contrail_api.split('.') %}
-    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
-      if: >-
-        http_response_status{service=~"{{ contrail_api }}"} == 0
-{%- raw %}
-      for: 2m
-      labels:
-        severity: down
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "Endpoint check for '{{ $labels.service }}' is down"
-        description: >-
-            Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
-{%- endraw %}
-      {%- endfor %}
-      {%- for contrail_process in collector_processes %}
-        {%- set words = contrail_process.split('-') %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
-      if: >-
-        procstat_running{process_name="{{ contrail_process }}"} == 0
-      labels:
-        severity: down
-        service: {{ contrail_process }}
-      annotations:
-{%- raw %}
-        summary: '{{ $labels.service }} service is down'
-        description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
-{%- endraw %}
-      {%- endfor %}
-    {%- endif %}
-
     {%- if compute_processes is defined %}
 {%- raw %}
     ContrailVrouterXMPPSessionsNone:
@@ -335,7 +405,7 @@
     ContrailFlowsActiveTooMany:
       if: >-
     {%- set vrouter_flows_active_toohigh_threshold = monitoring.vrouter_flows_active_too_high_threshold %}
-        min(contrail_vrouter_flows_active) by (host) >= {{ vrouter_flows_active_toohigh_threshold }}
+        deriv(contrail_vrouter_flows_active[5m]) >= {{ vrouter_flows_active_toohigh_threshold }}
 {%- raw %}
       for: 2m
       labels:
@@ -347,7 +417,7 @@
     ContrailFlowsDiscardTooMany:
       if: >-
     {%- set vrouter_flows_discard_toohigh_threshold = monitoring.vrouter_flows_discard_too_high_threshold %}
-        min(contrail_vrouter_flows_discard) by (host) >= {{ vrouter_flows_discard_toohigh_threshold }}
+        rate(contrail_vrouter_flows_discard[5m]) >= {{ vrouter_flows_discard_toohigh_threshold }}
 {%- raw %}
       for: 2m
       labels:
@@ -383,7 +453,7 @@
     ContrailFlowsInvalidNHTooMany:
       if: >-
     {%- set vrouter_flows_invalid_nh_toohigh_threshold = monitoring.vrouter_flows_invalid_nh_too_high_threshold %}
-        min(contrail_vrouter_flows_invalid_nh) by (host) >= {{ vrouter_flows_invalid_nh_toohigh_threshold }}
+        rate(contrail_vrouter_flows_invalid_nh[5m]) >= {{ vrouter_flows_invalid_nh_toohigh_threshold }}
 {%- raw %}
       for: 2m
       labels:
@@ -395,7 +465,7 @@
     ContrailFlowsInvalidITFTooMany:
       if: >-
     {%- set vrouter_flows_composite_invalid_interface_toohigh_threshold = monitoring.vrouter_flows_composite_invalid_interface_too_high_threshold %}
-        min(contrail_vrouter_flows_composite_invalid_interface) by (host) >= {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}
+        rate(contrail_vrouter_flows_composite_invalid_interface[5m]) >= {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}
 {%- raw %}
       for: 2m
       labels:
@@ -419,7 +489,7 @@
     ContrailFlowsQueueLimitExceededTooMany:
       if: >-
     {%- set vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold = monitoring.vrouter_flows_flow_queue_limit_exceeded_too_high_threshold %}
-        min(contrail_vrouter_flows_flow_queue_limit_exceeded) by (host) >= {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}
+        rate(contrail_vrouter_flows_flow_queue_limit_exceeded[5m]) >= {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}
 {%- raw %}
       for: 2m
       labels:
@@ -440,83 +510,49 @@
       annotations:
         summary: 'Too many vRouter flows with table full'
         description: 'There are too many vRouter flows with table full on node {{ $labels.host }} (current value={{ $value }}, threshold={%- endraw %}{{ vrouter_flows_flow_table_full_toohigh_threshold }})'
-      {%- for contrail_api in compute_apis %}
-        {%- set words = contrail_api.split('.') %}
-    {% for word in words %}{% if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
-      if: >-
-        http_response_status{service=~"{{ contrail_api }}"} == 0
-{%- raw %}
-      for: 2m
-      labels:
-        severity: down
-        service: "{{ $labels.service }}"
-      annotations:
-        summary: "Endpoint check for '{{ $labels.service }}' is down"
-        description: >-
-            Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
-{%- endraw %}
-      {%- endfor %}
-      {%- for contrail_process in compute_processes %}
-        {%- set words = contrail_process.split('-') %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
-      if: >-
-        procstat_running{process_name="{{ contrail_process }}"} == 0
-      labels:
-        severity: down
-        service: {{ contrail_process }}
-      annotations:
-{%- raw %}
-        summary: '{{ $labels.service }} service is down'
-        description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
-{%- endraw %}
-      {%- endfor %}
-    {%- endif %}
-
-    {%- if database_processes is defined %}
-      {%- for contrail_process in database_processes %}
-        {%- set words = contrail_process.split('-') %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
-      if: >-
-        procstat_running{process_name="{{ contrail_process }}"} == 0
-      labels:
-        severity: down
-        service: {{ contrail_process }}
-      annotations:
-{%- raw %}
-        summary: '{{ $labels.service }} service is down'
-        description: '{{ $labels.service }} service is down on node {{ $labels.host }}'
-{%- endraw %}
-      {%- endfor %}
-    {%- endif %}
-
-    {%- if web_processes is defined %}
-      {%- for contrail_process in web_processes %}
-        {%- set words = contrail_process.split('-') %}
-    {% for word in words %}{{ word | capitalize }}{% endfor %}ProcessDown:
-      if: >-
-        procstat_running{process_name="{{ contrail_process }}"} == 0
-      labels:
-        severity: down
-        service: {{ contrail_process }}
-      annotations:
-{%- raw %}
-        summary: '{{ $labels.service }} service is down'
-        description: '{{ $labels.service }} service is down on node { $labels.host }}'
-{%- endraw %}
-      {%- endfor %}
     {%- endif %}
 
     {%- if database.get('enabled', False) %}
-    ZookeeperDown:
+    ZookeeperInfo:  # any single zookeeper_up series that is not 1
       if: >-
         zookeeper_up != 1
       for: 2m
       labels:
+        severity: info
+        service: zookeeper
+      annotations:
+        summary: 'Zookeeper service down'
+        description: 'Zookeeper service is down on node {% raw %}{{ $labels.host }}{% endraw %}.'
+    ZookeeperWarning:  # share of zookeeper_up series at 0 reaches the warning fraction
+      if: >-
+        count(zookeeper_up == 0) >= count(zookeeper_up) * {{ monitoring.services_failed_warning_threshold_percent }}
+      for: 2m
+      labels:
         severity: warning
         service: zookeeper
       annotations:
-        summary: 'Zookeeper service down'
-        description: 'Zookeeper service is down on node {% raw %}{{ $labels.host }}{% endraw %}.'
+        summary: "More than {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(1) }}% of Zookeeper services are down"
+        description: "More than {{ (monitoring.services_failed_warning_threshold_percent * 100) | round(1) }}% of Zookeeper services are down"
+    ZookeeperCritical:  # share of zookeeper_up series at 0 reaches the critical fraction
+      if: >-
+        count(zookeeper_up == 0) >= count(zookeeper_up) * {{ monitoring.services_failed_critical_threshold_percent }}
+      for: 2m
+      labels:
+        severity: critical
+        service: zookeeper
+      annotations:
+        summary: "More than {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(1) }}% of Zookeeper services are down"
+        description: "More than {{ (monitoring.services_failed_critical_threshold_percent * 100) | round(1) }}% of Zookeeper services are down"
+    ZookeeperDown:  # every zookeeper_up series is at 0
+      if: >-
+        count(zookeeper_up == 0) == count(zookeeper_up)
+      for: 2m
+      labels:
+        severity: down
+        service: zookeeper
+      annotations:
+        summary: 'All Zookeeper services are down'
+        description: 'All Zookeeper services are down'
     {%- endif %}
 
     {%- if exporters is defined %}