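Fix Contrail metrics, alarms and Nagios dashboard state

collectd: anchor the contrail-control and contrail-nodemgr process
matches, add per-role nodemgr and supervisord entries, rename the
zookeeper/kafka-zookeeper entries to zookeeper-server/kafka-server, and
only watch contrail-web-server when the web role is enabled.

heka: define the monitored process lists once (control_processes and
compute_processes), switch the process triggers to `field` and
`function: last`, add local API endpoint triggers and alarms, and
replace the contrail_processes_* alarm clusters with per-endpoint
clusters plus top-level contrail_control/contrail_compute clusters
that carry nagios_host dimensions.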

Change-Id: I711511bc4982d2a7dbce834faf67a8dc98fc3c12
diff --git a/opencontrail/meta/collectd.yml b/opencontrail/meta/collectd.yml
index e849738..1faf907 100644
--- a/opencontrail/meta/collectd.yml
+++ b/opencontrail/meta/collectd.yml
@@ -1,6 +1,6 @@
 {%- if pillar.opencontrail is defined %}
 {%- if pillar.opencontrail.control is defined %}
-{%- from "opencontrail/map.jinja" import control, collector, config, database with context %}
+{%- from "opencontrail/map.jinja" import control, collector, config, database, web with context %}
 local_plugin:
 {%- if control.get('enabled', False) %}
 {%- if database is defined and database.get('cassandra', False) %}
@@ -65,7 +65,7 @@
       contrail-collector:
         match: 'contrail-collector'
       contrail-control:
-        match: 'contrail-control'
+        match: '[^=]contrail-control$'  # end-anchored and must not directly follow '='
       contrail-device-manager:
         match: 'python.*contrail-device-manager'
       contrail-discovery:
@@ -81,24 +81,38 @@
       contrail-named:
         match: 'contrail-named'
       contrail-nodemgr:
-        match: 'python.*contrail-nodemgr'
+        match: 'python.*contrail-nodemgr$'  # end-anchored: nothing may follow the name
+      contrail-nodemgr-config:  # per-role entries: nodemgr name followed later by "-<role>"
+        match: 'python.*contrail-nodemgr.*-config'
+      contrail-nodemgr-control:
+        match: 'python.*contrail-nodemgr.*-control'
+      contrail-nodemgr-database:
+        match: 'python.*contrail-nodemgr.*-database'
       contrail-query-engine:
         match: 'contrail-query-engine'
       contrail-schema:
         match: 'python.*contrail-schema'
       contrail-snmp-collector:
         match: 'python.*contrail-snmp-collector'
-      contrail-supervisord:
-        match: 'python.*supervisord'
+      contrail-supervisord-analytics:  # one entry per role, keyed on "_<role>" after "supervisord"
+        match: 'python.*supervisord.*_analytics'
+      contrail-supervisord-config:
+        match: 'python.*supervisord.*_config'
+      contrail-supervisord-control:
+        match: 'python.*supervisord.*_control'
+      contrail-supervisord-database:
+        match: 'python.*supervisord.*_database'
       contrail-svc-monitor:
         match: 'python.*contrail-svc-monitor'
       contrail-topology:
         match: 'python.*contrail-topology'
+{%- if web.get('enabled', False) %}
       contrail-web-server:
         match: 'node.*webServerStart'
-      zookeeper:
+{%- endif %}
+      zookeeper-server:
         match: 'java.*zookeeper.server'
-      kafka-zookeeper:
+      kafka-server:
         match: 'java.*kafka.Kafka'
       redis-server:
         match: 'redis-server'
@@ -181,10 +195,10 @@
         url: "http://127.0.0.1:8085/"
   collectd_processes:
     process:
-      contrail-nodemgr:
-        match: 'python.*contrail-nodemgr'
-      contrail-supervisord:
-        match: 'python.*supervisord'
+      contrail-nodemgr-vrouter:  # name also listed in compute_processes (heka.yml); presumably must stay in sync - confirm
+        match: 'python.*contrail-nodemgr.*-vrouter'
+      contrail-supervisord-vrouter:  # name also listed in compute_processes (heka.yml)
+        match: 'python.*supervisord.*_vrouter'
       contrail-vrouter-agent:
         match: 'contrail-vrouter-agent'
 {%- endif %}
diff --git a/opencontrail/meta/heka.yml b/opencontrail/meta/heka.yml
index 1f09ef5..0b04c0a 100644
--- a/opencontrail/meta/heka.yml
+++ b/opencontrail/meta/heka.yml
@@ -3,11 +3,26 @@
     {%- from "opencontrail/map.jinja" import control with context %}
     {%- if control.get('enabled', False) %}
       {%- set controller_ref = control %}
+      {%- set control_processes = (
+            'cassandra-server', 'contrail-alarm-gen', 'contrail-analytics-api',
+            'contrail-api', 'contrail-collector', 'contrail-control',
+            'contrail-device-manager', 'contrail-discovery', 'contrail-dns',
+            'contrail-ifmap-server', 'contrail-irond', 'contrail-job-server',
+            'contrail-named', 'contrail-nodemgr', 'contrail-nodemgr-config',
+            'contrail-nodemgr-control', 'contrail-nodemgr-database', 'contrail-query-engine',
+            'contrail-schema', 'contrail-snmp-collector', 'contrail-supervisord-analytics',
+            'contrail-supervisord-config', 'contrail-supervisord-control', 'contrail-supervisord-database',
+            'contrail-svc-monitor', 'contrail-topology',
+            'kafka-server', 'redis-server', 'zookeeper-server'
+          ) %}
     {%- endif %}
   {%- elif pillar.opencontrail.compute is defined %}
     {%- from "opencontrail/map.jinja" import compute with context %}
     {%- if compute.get('enabled', False) %}
       {%- set compute_ref = compute %}
+      {%- set compute_processes = (
+            'contrail-nodemgr-vrouter', 'contrail-supervisord-vrouter', 'contrail-vrouter-agent'
+          ) %}
     {%- endif %}
   {%- endif %}
   {%- if pillar.opencontrail.web is defined %}
@@ -152,25 +167,55 @@
 metric_collector:
   trigger:
   {%- if controller_ref is defined %}
-    {%- for contrail_process in ('contrail-control', 'contrail-control-nodemgr',
-    'contrail-dns', 'contrail-named', 'contrail-alarm-gen', 'contrail-analytics-api',
-    'contrail-analytics-nodemgr', 'contrail-collector', 'contrail-query-engine',
-    'contrail-snmp-collector', 'contrail-topology', 'contrail-api', 'contrail-config-nodemgr',
-    'contrail-device-manager', 'contrail-discovery', 'contrail-schema', 'contrail-svc-monitor',
-    'contrail-database-nodemgr', 'ifmap', 'kafka', 'cassandra', 'zookeeper'
-    ) %}
+    contrail_api_local_endpoint:  # latest openstack_check_local_api sample for contrail-api equals 0
+      description: 'Contrail API is locally down'
+      severity: down
+      rules:
+      - metric: openstack_check_local_api
+        field:
+          service: contrail-api
+        relational_operator: '=='
+        threshold: 0
+        window: 60  # NOTE(review): unit is not visible here, presumably seconds - confirm
+        periods: 0
+        function: last
+    contrail_discovery_api_local_endpoint:  # same rule, filtered on service contrail-discovery
+      description: 'Contrail Discovery API is locally down'
+      severity: down
+      rules:
+      - metric: openstack_check_local_api
+        field:
+          service: contrail-discovery
+        relational_operator: '=='
+        threshold: 0
+        window: 60
+        periods: 0
+        function: last
+    contrail_collector_api_local_endpoint:  # same rule, filtered on service contrail-collector
+      description: 'Contrail Collector API is locally down'
+      severity: down
+      rules:
+      - metric: openstack_check_local_api
+        field:
+          service: contrail-collector
+        relational_operator: '=='
+        threshold: 0
+        window: 60
+        periods: 0
+        function: last
+    {%- for contrail_process in control_processes %}
     {{ contrail_process|replace("-", "_") }}:
       description: "There is no {{ contrail_process }} process running"
       severity: critical
       rules:
       - metric: lma_components_processes
-        fields:
-          service: '{{ contrail_process }}'
+        field:
+          service: '{{ contrail_process }}'  # quoted so the rendered name always parses as a string
         relational_operator: '=='
         threshold: 0
         window: 60
         periods: 0
-        function: min
+        function: last
     {%- endfor %}
     xmpp_number_of_sessions_lo:
       description: "There are no xmpp-number-of-sessions"
@@ -473,129 +518,185 @@
         periods: 0
         function: min
   {%- endif %}
-  {%- if web_ref is defined %}
-    {%- for contrail_process in ('contrail-webui', 'contrail-webui-middleware'
-    ) %}
-    {{ contrail_process|replace("-", "_") }}:
-      description: "There is no {{ contrail_process }} process running"
-      severity: critical
-      rules:
-      - metric: lma_components_processes
-        fields:
-          service: '{{ contrail_process }}'
-        relational_operator: '=='
-        threshold: 0
-        window: 60
-        periods: 0
-        function: min
-    {%- endfor %}
-  {%- endif %}
   {%- if compute_ref is defined %}
-    {%- for contrail_process in ('contrail-vrouter-agent', 'contrail-vrouter-nodemgr'
-    ) %}
+    contrail_node_manager_api_local_endpoint:  # latest openstack_check_local_api sample for contrail-node-manager equals 0
+      description: 'Contrail Node Manager API is locally down'
+      severity: down
+      rules:
+      - metric: openstack_check_local_api
+        field:
+          service: contrail-node-manager
+        relational_operator: '=='
+        threshold: 0
+        window: 60  # NOTE(review): unit is not visible here, presumably seconds - confirm
+        periods: 0
+        function: last
+    contrail_vrouter_api_local_endpoint:  # same rule, filtered on service contrail-vrouter
+      description: 'Contrail vrouter API is locally down'
+      severity: down
+      rules:
+      - metric: openstack_check_local_api
+        field:
+          service: contrail-vrouter
+        relational_operator: '=='
+        threshold: 0
+        window: 60
+        periods: 0
+        function: last
+    {%- for contrail_process in compute_processes %}
     {{ contrail_process|replace("-", "_") }}:
       description: "There is no {{ contrail_process }} process running"
       severity: critical
       rules:
       - metric: lma_components_processes
-        fields:
-          service: '{{ contrail_process|replace("-", "_") }}'
+        field:
+          service: '{{ contrail_process }}'  # quoted so the rendered name always parses as a string
         relational_operator: '=='
         threshold: 0
         window: 60
         periods: 0
-        function: min
+        function: last
     {%- endfor %}
   {%- endif %}
   alarm:
   {%- if controller_ref is defined %}
-    {%- for contrail_process in ('contrail-control', 'contrail-control-nodemgr',
-    'contrail-dns', 'contrail-named', 'contrail-alarm-gen', 'contrail-analytics-api',
-    'contrail-analytics-nodemgr', 'contrail-collector', 'contrail-query-engine',
-    'contrail-snmp-collector', 'contrail-topology', 'contrail-api', 'contrail-config-nodemgr',
-    'contrail-device-manager', 'contrail-discovery', 'contrail-schema', 'contrail-svc-monitor',
-    'contrail-database-nodemgr', 'ifmap', 'kafka', 'cassandra', 'zookeeper'
-    ) %}
+    contrail_api_endpoint:
+      alerting: enabled
+      triggers:
+      - contrail_api_local_endpoint
+      dimension:
+        service: contrail-api-endpoint  # value the contrail_api_endpoint alarm_cluster matches on
+    contrail_discovery_api_endpoint:
+      alerting: enabled
+      triggers:
+      - contrail_discovery_api_local_endpoint
+      dimension:
+        service: contrail-discovery-api-endpoint  # value the contrail_discovery_api_endpoint alarm_cluster matches on
+    contrail_collector_api_endpoint:
+      alerting: enabled
+      triggers:
+      - contrail_collector_api_local_endpoint
+      dimension:
+        service: contrail-collector-api-endpoint  # value the contrail_collector_api_endpoint alarm_cluster matches on
+    {%- for contrail_process in control_processes %}
     {{ contrail_process|replace("-", "_") }}:
       alerting: enabled
       triggers:
       - {{ contrail_process|replace("-", "_") }}
       dimension:
-        service: contrail-processes-control
-    {%- endfor %}
-  {%- endif %}
-  {%- if web_ref is defined %}
-    {%- for contrail_process in ('contrail-webui', 'contrail-webui-middleware'
-    ) %}
-    {{ contrail_process|replace("-", "_") }}:
-      alerting: enabled
-      triggers:
-      - {{ contrail_process|replace("-", "_") }}
-      dimension:
-        service: contrail-processes-control
+        service: contrail-control  # value the contrail_control alarm_cluster matches on
     {%- endfor %}
   {%- endif %}
   {%- if compute_ref is defined %}
-    {%- for contrail_process in ('contrail-vrouter-agent', 'contrail-vrouter-nodemgr'
-    ) %}
+    contrail_node_manager_api_endpoint:
+      alerting: enabled
+      triggers:
+      - contrail_node_manager_api_local_endpoint
+      dimension:
+        service: contrail-node-manager-api-endpoint  # value the contrail_node_manager_api_endpoint alarm_cluster matches on
+    contrail_vrouter_api_endpoint:
+      alerting: enabled
+      triggers:
+      - contrail_vrouter_api_local_endpoint
+      dimension:
+        service: contrail-vrouter-api-endpoint  # value the contrail_vrouter_api_endpoint alarm_cluster matches on
+    {%- for contrail_process in compute_processes %}
     {{ contrail_process|replace("-", "_") }}:
       alerting: enabled
       triggers:
       - {{ contrail_process|replace("-", "_") }}
       dimension:
-        service: contrail-processes-compute
+        service: contrail-compute  # value the contrail_compute alarm_cluster matches on
     {%- endfor %}
   {%- endif %}
 aggregator:
   alarm_cluster:
   {%- if controller_ref is defined %}
-    contrail_processes_control:
-      policy: highest_severity
-      group_by: hostname
+    contrail_api_endpoint:  # per-hostname cluster over the contrail_api_endpoint alarm; member of contrail_control
+      policy: availability_of_members
       alerting: enabled
+      group_by: hostname
       match:
-        service: contrail-processes-control
+        service: contrail-api-endpoint
       members:
-    {%- for contrail_process in ('contrail-control', 'contrail-control-nodemgr',
-    'contrail-dns', 'contrail-named', 'contrail-alarm-gen', 'contrail-analytics-api',
-    'contrail-analytics-nodemgr', 'contrail-collector', 'contrail-query-engine',
-    'contrail-snmp-collector', 'contrail-topology', 'contrail-api', 'contrail-config-nodemgr',
-    'contrail-device-manager', 'contrail-discovery', 'contrail-schema', 'contrail-svc-monitor',
-    'contrail-database-nodemgr', 'ifmap', 'kafka', 'cassandra', 'zookeeper'
-    ) %}
-      - {{ contrail_process|replace("-", "_") }}:
-    {%- endfor %}
+      - contrail_api_endpoint
       dimension:
         service: contrail-control
-  {%- endif %}
-  {%- if web_ref is defined %}
-    contrail_processes_compute:
-      policy: highest_severity
-      group_by: hostname
+        nagios_host: 01-service-clusters
+    contrail_discovery_api_endpoint:  # per-hostname cluster over the contrail_discovery_api_endpoint alarm
+      policy: availability_of_members
       alerting: enabled
+      group_by: hostname
       match:
-        service: contrail-processes-control
+        service: contrail-discovery-api-endpoint
       members:
-    {%- for contrail_process in ('contrail-webui', 'contrail-webui-middleware'
-    ) %}
-      - {{ contrail_process|replace("-", "_") }}:
-    {%- endfor %}
+      - contrail_discovery_api_endpoint
       dimension:
         service: contrail-control
+        nagios_host: 01-service-clusters
+    contrail_collector_api_endpoint:  # per-hostname cluster over the contrail_collector_api_endpoint alarm
+      policy: availability_of_members
+      alerting: enabled
+      group_by: hostname
+      match:
+        service: contrail-collector-api-endpoint
+      members:
+      - contrail_collector_api_endpoint
+      dimension:
+        service: contrail-control  # same service value the contrail_control cluster matches on
+        nagios_host: 01-service-clusters
+    contrail_control:  # combines the three endpoint clusters with every control_processes alarm
+      policy: highest_severity
+      alerting: enabled
+      match:
+        service: contrail-control
+      members:
+      - contrail_api_endpoint
+      - contrail_discovery_api_endpoint
+      - contrail_collector_api_endpoint
+    {%- for contrail_process in control_processes %}
+      - {{ contrail_process|replace("-", "_") }}
+    {%- endfor %}
+      dimension:
+        cluster_name: contrail-control
+        nagios_host: 00-top-clusters  # endpoint clusters use 01-service-clusters instead
   {%- endif %}
   {%- if compute_ref is defined %}
-    contrail_processes_compute:
-      policy: highest_severity
-      group_by: hostname
+    contrail_node_manager_api_endpoint:  # per-hostname cluster over the contrail_node_manager_api_endpoint alarm
+      policy: availability_of_members
       alerting: enabled
+      group_by: hostname
       match:
-        service: contrail-processes-compute
+        service: contrail-node-manager-api-endpoint
       members:
-    {%- for contrail_process in ('contrail-vrouter-agent', 'contrail-vrouter-nodemgr'
-    ) %}
-      - {{ contrail_process|replace("-", "_") }}:
-    {%- endfor %}
+      - contrail_node_manager_api_endpoint
       dimension:
         service: contrail-compute
+        nagios_host: 01-service-clusters
+    contrail_vrouter_api_endpoint:  # per-hostname cluster over the contrail_vrouter_api_endpoint alarm
+      policy: availability_of_members
+      alerting: enabled
+      group_by: hostname
+      match:
+        service: contrail-vrouter-api-endpoint
+      members:
+      - contrail_vrouter_api_endpoint
+      dimension:
+        service: contrail-compute  # same service value the contrail_compute cluster matches on
+        nagios_host: 01-service-clusters
+    contrail_compute:  # combines the two endpoint clusters with every compute_processes alarm
+      policy: highest_severity
+      alerting: enabled
+      match:
+        service: contrail-compute
+      members:
+      - contrail_node_manager_api_endpoint
+      - contrail_vrouter_api_endpoint
+    {%- for contrail_process in compute_processes %}
+      - {{ contrail_process|replace("-", "_") }}
+    {%- endfor %}
+      dimension:
+        cluster_name: contrail-compute
+        nagios_host: 00-top-clusters  # endpoint clusters use 01-service-clusters instead
   {%- endif %}
 {%- endif %}