Add Prometheus alerts

Change-Id: I80b43221d39048cf2268b59e8b9cd70f2b693b78
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 3a18832..3895e61 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -15,3 +15,5 @@
         enabled: true
         enabled: true
+      prometheus:
+        enabled: true
diff --git a/opencontrail/meta/prometheus.yml b/opencontrail/meta/prometheus.yml
new file mode 100644
index 0000000..2453538
--- /dev/null
+++ b/opencontrail/meta/prometheus.yml
@@ -0,0 +1,462 @@
+{%- if pillar.opencontrail is defined %}
+  {%- from "opencontrail/map.jinja" import control, collector, compute, config, database, web with context %}{# role objects imported from map.jinja; each is queried with .get() below #}
+  {%- if collector.get('enabled', False) %}{# collector role: endpoint check names and process names consumed by the alert loops further down #}
+    {%- set collector_apis = ( 'contrail.collector', ) %}
+    {%- set collector_processes = (
+        'contrail-alarm-gen', 'contrail-analytics-api', 'contrail-collector',
+        'contrail-nodemgr', 'contrail-query-engine', 'contrail-snmp-collector',
+        'contrail-supervisord-analytics', 'contrail-topology',
+        ) %}
+  {%- endif %}
+  {%- if compute.get('enabled', False) %}{# compute role: endpoint check names and process names #}
+    {%- set compute_apis = ( 'contrail.vrouter', 'contrail.node.manager' ) %}
+    {%- set compute_processes = (
+        'contrail-nodemgr-vrouter', 'contrail-supervisord-vrouter', 'contrail-vrouter-agent'
+        ) %}
+  {%- endif %}
+  {%- if control.get('enabled', False) %}{# control role: endpoint check names and process names #}
+    {%- set control_apis = ( 'contrail.api', 'contrail.discovery' ) %}
+    {%- set control_processes = (
+        'contrail-api', 'contrail-control', 'contrail-device-manager',
+        'contrail-discovery', 'contrail-dns', 'contrail-ifmap-server',
+        'contrail-irond', 'contrail-job-server', 'contrail-named',
+        'contrail-nodemgr-config', 'contrail-nodemgr-control',
+        'contrail-schema', 'contrail-supervisord-config',
+        'contrail-supervisord-control', 'contrail-svc-monitor',
+        ) %}
+  {%- endif %}
+  {%- if database.get('enabled', False) %}{# database role: process names only, no endpoint checks are defined for it #}
+    {%- set database_processes = (
+        'zookeeper-server', 'kafka-server', 'cassandra-server',
+        'contrail-nodemgr-database', 'contrail-supervisord-database',
+        ) %}
+  {%- endif %}
+  {%- if web.get('enabled', False) %}{# web role: process names only #}
+    {%- if web.get('cache', {}).get('engine', '') == 'redis' %}{# redis-server is watched only when the web cache engine is redis #}
+      {%- set web_processes = (
+          'contrail-web-server', 'redis-server'
+          ) %}
+    {%- else %}
+      {%- set web_processes = (
+          'contrail-web-server',
+          ) %}
+    {%- endif %}
+  {%- endif %}
+  {%- if control_processes is defined or
+         collector_processes is defined or
+         compute_processes is defined or
+         database_processes is defined or
+         web_processes is defined %}
+  alert:
+    {%- if control_processes is defined %}
+      {%- for contrail_api in control_apis %}{# one APIDown rule per control endpoint check; rule name = capitalized dotted parts, skipping the part equal to api #}
+        {%- set words = contrail_api.split('.') %}
+    {% for word in words %}{%- if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+      if: >-
+        http_response_status{service=~"{{ contrail_api }}"} == 0
+{%- raw %}
+      for: 2m
+      labels:
+        severity: down
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "Endpoint check for '{{ $labels.service }}' is down"
+        description: >-
+            Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
+{%- endraw %}
+      {%- endfor %}
+      {%- for contrail_process in control_processes %}{# one ProcstatRunning rule per control process; rule name = capitalized dash-separated parts #}
+        {%- set words = contrail_process.split('-') %}
+    ProcstatRunning{% for word in words %}{{ word | capitalize }}{% endfor %}:
+      if: >-
+        procstat_running{process_name="{{ contrail_process }}"} == 0
+      labels:
+        severity: down
+        service: "{{ contrail_process }}"
+      annotations:
+        summary: '{{ contrail_process }} service is down'
+        description: '{{ contrail_process }} service is down on node {% raw %}{{ $labels.host }}{% endraw %}'
+      {%- endfor %}
+{%- raw %}
+    ContrailBGPSessionsNoneUp:
+      if: >-
+        max(contrail_bgp_session_up_count) == 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-control
+      annotations:
+        summary: 'no active BGP sessions'
+        description: 'There are no active BGP sessions on node {{ $ }}'
+    ContrailBGPSessionsSomeDown:  # fires while the smallest reported BGP down-session count is above zero
+      if: >-
+        min(contrail_bgp_session_down_count) > 0
+      for: 2m  # the condition must hold for two minutes before the alert fires
+      labels:
+        severity: warning
+        service: contrail-control
+      annotations:
+        summary: 'inactive BGP sessions'
+        description: 'There are inactive BGP sessions on node {{ $ }}'
+    ContrailBGPSessionsNone:
+      if: >-
+        max(contrail_bgp_session_count) == 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-control
+      annotations:
+        summary: 'No BGP sessions'
+        description: 'There are no BGP sessions on node {{ $ }}'
+    ContrailXMPPSessionsNoneUp:
+      if: >-
+        max(contrail_xmpp_session_up_count) == 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-control
+      annotations:
+        summary: 'no active XMPP sessions'
+        description: 'There are no active XMPP sessions on node {{ $ }}'
+    ContrailXMPPSessionsSomeDown:  # fires while the smallest reported XMPP down-session count is above zero
+      if: >-
+        min(contrail_xmpp_session_down_count) > 0
+      for: 2m  # the condition must hold for two minutes before the alert fires
+      labels:
+        severity: warning
+        service: contrail-control
+      annotations:
+        summary: 'inactive XMPP sessions'
+        description: 'There are inactive XMPP sessions on node {{ $ }}'
+    ContrailXMPPSessionsNone:
+      if: >-
+        max(contrail_xmpp_session_count) == 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-control
+      annotations:
+        summary: 'No XMPP sessions'
+        description: 'There are no XMPP sessions on node {{ $ }}'
+    ContrailXMPPSessionsTooMany:
+      if: >-
+        min(contrail_xmpp_session_count) >= 500
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-control
+      annotations:
+        summary: 'Too many XMPP sessions'
+        description: 'There are too many XMPP sessions on node {{ $ }}'
+    ContrailXMPPSessionsTooManyVariations:
+      if: >-
+        abs(delta(contrail_xmpp_session_count[2m])) >= 100
+      labels:
+        severity: warning
+        service: contrail-control
+      annotations:
+        summary: 'Number of XMPP sessions changed between checks is too high'
+        description: 'There are too many XMPP sessions changes on node {{ $ }}'
+{%- endraw %}
+    {%- endif %}
+    {%- if collector_processes is defined %}
+      {%- for contrail_api in collector_apis %}{# one APIDown rule per collector endpoint check; rule name = capitalized dotted parts, skipping the part equal to api #}
+        {%- set words = contrail_api.split('.') %}
+    {% for word in words %}{%- if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+      if: >-
+        http_response_status{service=~"{{ contrail_api }}"} == 0
+{%- raw %}
+      for: 2m
+      labels:
+        severity: down
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "Endpoint check for '{{ $labels.service }}' is down"
+        description: >-
+            Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
+{%- endraw %}
+      {%- endfor %}
+      {%- for contrail_process in collector_processes %}{# one ProcstatRunning rule per collector process; rule name = capitalized dash-separated parts #}
+        {%- set words = contrail_process.split('-') %}
+    ProcstatRunning{% for word in words %}{{ word | capitalize }}{% endfor %}:
+      if: >-
+        procstat_running{process_name="{{ contrail_process }}"} == 0
+      labels:
+        severity: down
+        service: "{{ contrail_process }}"
+      annotations:
+        summary: '{{ contrail_process }} service is down'
+        description: '{{ contrail_process }} service is down on node {% raw %}{{ $labels.host }}{% endraw %}'
+      {%- endfor %}
+    {%- endif %}
+    {%- if compute_processes is defined %}
+{%- raw %}
+    ContrailVrouterXMPPSessionsNone:
+      if: >-
+        max(contrail_vrouter_xmpp) == 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'No vRouter XMPP sessions'
+        description: 'There are no vRouter XMPP sessions on node {{ $ }}'
+    ContrailVrouterXMPPSessionsTooMany:
+      if: >-
+        min(contrail_vrouter_xmpp) >= 10
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter XMPP sessions'
+        description: 'There are too many vRouter XMPP sessions on node {{ $ }}'
+    ContrailVrouterXMPPSessionsTooManyVariations:
+      if: >-
+        abs(delta(contrail_vrouter_xmpp[2m])) >= 5
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Number of vRouter XMPP sessions changed between checks is too high'
+        description: 'There are too many vRouter XMPP sessions changes on node {{ $ }}'
+    ContrailVrouterDNSXMPPSessionsNone:
+      if: >-
+        max(contrail_vrouter_dns_xmpp) == 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'No vRouter DNS-XMPP sessions'
+        description: 'There are no vRouter DNS-XMPP sessions on node {{ $ }}'
+    ContrailVrouterDNSXMPPSessionsTooMany:
+      if: >-
+        min(contrail_vrouter_dns_xmpp) >= 10
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter DNS-XMPP sessions'
+        description: 'There are too many vRouter DNS-XMPP sessions on node {{ $ }}'
+    ContrailVrouterDNSXMPPSessionsTooManyVariations:
+      if: >-
+        abs(delta(contrail_vrouter_dns_xmpp[2m])) >= 5
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Number of vRouter DNS-XMPP sessions changed between checks is too high'
+        description: 'There are too many vRouter DNS-XMPP sessions changes on node {{ $ }}'
+    ContrailVrouterLLSSessionsNone:
+      if: >-
+        max(contrail_vrouter_lls) == 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'No vRouter LLS sessions'
+        description: 'There are no vRouter LLS sessions on node {{ $ }}'
+    ContrailVrouterLLSSessionsTooMany:
+      if: >-
+        min(contrail_vrouter_lls) >= 10
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter LLS sessions'
+        description: 'There are too many vRouter LLS sessions on node {{ $ }}'
+    ContrailVrouterLLSSessionsTooManyVariations:
+      if: >-
+        abs(delta(contrail_vrouter_lls[2m])) >= 5
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Number of vRouter LLS sessions changed between checks is too high'
+        description: 'There are too many vRouter LLS sessions changes on node {{ $ }}'
+    ContrailFlowsActiveNone:
+      if: >-
+        min(contrail_vrouter_flows_active) == 0
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'No active vRouter flows'
+        description: 'There are no active vRouter flows on node {{ $ }}'
+    ContrailFlowsActiveTooMany:
+      if: >-
+        min(contrail_vrouter_flows_active) >= 1200
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter active flows'
+        description: 'There are too many active vRouter flows on node {{ $ }}'
+    ContrailFlowsCreatedTooMany:
+      if: >-
+        min(contrail_vrouter_flows_created) >= 1000
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter created flows'
+        description: 'There are too many created vRouter flows on node {{ $ }}'
+    ContrailFlowsDiscardTooMany:
+      if: >-
+        min(contrail_vrouter_flows_discard) >= 1200
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter discarded flows'
+        description: 'There are too many discarded vRouter flows on node {{ $ }}'
+    ContrailFlowsDropTooMany:
+      if: >-
+        min(contrail_vrouter_flows_flow_action_drop) >= 100
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter dropped flows'
+        description: 'There are too many dropped vRouter flows on node {{ $ }}'
+    ContrailFlowsFragErrTooMany:
+      if: >-
+        min(contrail_vrouter_flows_frag_err) >= 100
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter flows with fragment errors'
+        description: 'There are too many vRouter flows with fragment errors on node {{ $ }}'
+    ContrailFlowsInvalidNHTooMany:
+      if: >-
+        min(contrail_vrouter_flows_invalid_nh) >= 100
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter flows with invalid next hop'
+        description: 'There are too many vRouter flows with invalid next hop on node {{ $ }}'
+    ContrailFlowsInvalidITFTooMany:
+      if: >-
+        min(contrail_vrouter_flows_composite_invalid_interface) >= 100
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter flows with composite invalid interface'
+        description: 'There are too many vRouter flows with composite invalid interface on node {{ $ }}'
+    ContrailFlowsInvalidLabelTooMany:
+      if: >-
+        min(contrail_vrouter_flows_invalid_label) >= 100
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter flows with invalid label'
+        description: 'There are too many vRouter flows with invalid label on node {{ $ }}'
+    ContrailFlowsQueueLimitExeededTooMany:
+      if: >-
+        min(contrail_vrouter_flows_flow_queue_limit_exceeded) >= 100
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter flows with queue limit exceeded'
+        description: 'There are too many vRouter flows with queue limit exceeded on node {{ $ }}'
+    ContrailFlowsTableFullTooMany:
+      if: >-
+        min(contrail_vrouter_flows_flow_table_full) >= 100
+      for: 2m
+      labels:
+        severity: warning
+        service: contrail-compute
+      annotations:
+        summary: 'Too many vRouter flows with table full'
+        description: 'There are too many vRouter flows with table full on node {{ $ }}'
+{%- endraw %}
+      {%- for contrail_api in compute_apis %}{# one APIDown rule per compute endpoint check; rule name = capitalized dotted parts, skipping the part equal to api #}
+        {%- set words = contrail_api.split('.') %}
+    {% for word in words %}{%- if word != 'api' %}{{ word | capitalize }}{% endif %}{% endfor %}APIDown:
+      if: >-
+        http_response_status{service=~"{{ contrail_api }}"} == 0
+{%- raw %}
+      for: 2m
+      labels:
+        severity: down
+        service: "{{ $labels.service }}"
+      annotations:
+        summary: "Endpoint check for '{{ $labels.service }}' is down"
+        description: >-
+            Endpoint check for '{{ $labels.service }}' is down for 2 minutes on node {{ $labels.host }}
+{%- endraw %}
+      {%- endfor %}
+      {%- for contrail_process in compute_processes %}{# one ProcstatRunning rule per compute process; rule name = capitalized dash-separated parts #}
+        {%- set words = contrail_process.split('-') %}
+    ProcstatRunning{% for word in words %}{{ word | capitalize }}{% endfor %}:
+      if: >-
+        procstat_running{process_name="{{ contrail_process }}"} == 0
+      labels:
+        severity: down
+        service: "{{ contrail_process }}"
+      annotations:
+        summary: '{{ contrail_process }} service is down'
+        description: '{{ contrail_process }} service is down on node {% raw %}{{ $labels.host }}{% endraw %}'
+      {%- endfor %}
+    {%- endif %}
+    {%- if database_processes is defined %}
+      {%- for contrail_process in database_processes %}{# one ProcstatRunning rule per database process; rule name = capitalized dash-separated parts #}
+        {%- set words = contrail_process.split('-') %}
+    ProcstatRunning{% for word in words %}{{ word | capitalize }}{% endfor %}:
+      if: >-
+        procstat_running{process_name="{{ contrail_process }}"} == 0
+      labels:
+        severity: down
+        service: "{{ contrail_process }}"
+      annotations:
+        summary: '{{ contrail_process }} service is down'
+        description: '{{ contrail_process }} service is down on node {% raw %}{{ $labels.host }}{% endraw %}'
+      {%- endfor %}
+    {%- endif %}
+    {%- if web_processes is defined %}
+      {%- for contrail_process in web_processes %}{# one ProcstatRunning rule per web process; rule name = capitalized dash-separated parts #}
+        {%- set words = contrail_process.split('-') %}
+    ProcstatRunning{% for word in words %}{{ word | capitalize }}{% endfor %}:
+      if: >-
+        procstat_running{process_name="{{ contrail_process }}"} == 0
+      labels:
+        severity: down
+        service: "{{ contrail_process }}"
+      annotations:
+        summary: '{{ contrail_process }} service is down'
+        description: '{{ contrail_process }} service is down on node {% raw %}{{ $labels.host }}{% endraw %}'
+      {%- endfor %}
+    {%- endif %}
+  {%- endif %}
+{%- endif %}
diff --git a/opencontrail/meta/telegraf.yml b/opencontrail/meta/telegraf.yml
index 72d079b..32989f1 100644
--- a/opencontrail/meta/telegraf.yml
+++ b/opencontrail/meta/telegraf.yml
@@ -2,6 +2,89 @@
   {%- from "opencontrail/map.jinja" import control, collector, compute, config, database, web with context %}
+  {%- if collector.get('enabled', False) or database.get('enabled', False) or control.get('enabled', False)
+      or web.get('enabled', False) or compute.get('enabled', False) %}{# emit the procstat section only when at least one of these roles is enabled #}
+    procstat:  # process checks; the keys below mirror the process names listed in opencontrail/meta/prometheus.yml
+      process:
+    {%- if collector.get('enabled', False) %}{# collector role processes #}
+        contrail-alarm-gen:
+          pattern: 'python.*contrail-alarm-gen'
+        contrail-analytics-api:
+          pattern: 'python.*contrail-analytics-api'
+        contrail-collector:
+          pattern: 'contrail-collector'
+        contrail-nodemgr:
+          pattern: 'python.*contrail-nodemgr$'  # NOTE(review): the trailing '$' presumably keeps this from matching the role-suffixed nodemgr entries - confirm
+        contrail-query-engine:
+          pattern: 'contrail-query-engine'
+        contrail-snmp-collector:
+          pattern: 'python.*contrail-snmp-collector'
+        contrail-supervisord-analytics:
+          pattern: 'python.*supervisord.*_analytics'
+        contrail-topology:
+          pattern: 'python.*contrail-topology'
+    {%- endif %}
+    {%- if database.get('enabled', False) %}{# database role processes #}
+        zookeeper-server:
+          pattern: 'java.*zookeeper.server'
+        kafka-server:
+          pattern: 'java.*kafka.Kafka'
+        cassandra-server:
+          pattern: 'java.*service.CassandraDaemon'
+        contrail-nodemgr-database:
+          pattern: 'python.*contrail-nodemgr.*-database'
+        contrail-supervisord-database:
+          pattern: 'python.*supervisord.*_database'
+    {%- endif %}
+    {%- if control.get('enabled', False) %}{# control role processes #}
+        contrail-api:
+          pattern: 'python.*contrail-api'
+        contrail-control:
+          pattern: '[^=]contrail-control$'  # NOTE(review): '[^=]' and '$' presumably exclude command lines that only mention contrail-control as an option value - confirm
+        contrail-device-manager:
+          pattern: 'python.*contrail-device-manager'
+        contrail-discovery:
+          pattern: 'python.*contrail-discovery'
+        contrail-dns:
+          pattern: 'contrail-dns'
+        contrail-ifmap-server:
+          pattern: 'sh.*ifmap-server'
+        contrail-irond:
+          pattern: 'java.*irond'
+        contrail-job-server:
+          pattern: 'node.*jobServerStart'
+        contrail-named:
+          pattern: 'contrail-named'
+        contrail-nodemgr-config:
+          pattern: 'python.*contrail-nodemgr.*-config'
+        contrail-nodemgr-control:
+          pattern: 'python.*contrail-nodemgr.*-control'
+        contrail-schema:
+          pattern: 'python.*contrail-schema'
+        contrail-supervisord-config:
+          pattern: 'python.*supervisord.*_config'
+        contrail-supervisord-control:
+          pattern: 'python.*supervisord.*_control'
+        contrail-svc-monitor:
+          pattern: 'python.*contrail-svc-monitor'
+    {%- endif %}
+    {%- if web.get('enabled', False) %}{# web role processes #}
+        contrail-web-server:
+          pattern: 'node.*webServerStart'
+      {%- if web.get('cache', {}).get('engine', '') == 'redis' %}{# redis-server is watched only when the web cache engine is redis #}
+        redis-server:
+          pattern: 'redis-server'
+      {%- endif %}
+    {%- endif %}
+    {%- if compute.get('enabled', False) %}{# compute role processes #}
+        contrail-nodemgr-vrouter:
+          pattern: 'python.*contrail-nodemgr.*-vrouter'
+        contrail-supervisord-vrouter:
+          pattern: 'python.*supervisord.*_vrouter'
+        contrail-vrouter-agent:
+          pattern: 'contrail-vrouter-agent'
+    {%- endif %}
+  {%- endif %}
   {%- if config.get('enabled', False) or collector.get('enabled', False) or compute.get('enabled', False) %}
     {%- if config.get('enabled', False) %}