Add sf-notifier monitoring:

- add alerts for checking service health
- add Prometheus DNS target
- add fluentd configuration
- drop Salt 2016.3 platforms from the Travis CI matrix

Change-Id: Id61f613cf3d853d1a37b0834ab6149fb2e9b10b5
Related-bug: PROD-25099 (PROD:25099)
diff --git a/.travis.yml b/.travis.yml
index 516d0f6..e21a264 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,8 +15,6 @@
   - bundle install
 
 env:
-  - PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2016.3/salt:2018_11_19 SUITE=alertmanager
-  - PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2016.3/salt:2018_11_19 SUITE=server
   - PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2017.7/salt:2018_11_19 SUITE=alertmanager
   - PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2017.7/salt:2018_11_19 SUITE=server
   - PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-stable/salt:2018_11_19 SUITE=alertmanager
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index e4d3cb4..bbbbf18 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -7,3 +7,5 @@
         enabled: true
       telegraf:
         enabled: true
+      fluentd:
+        enabled: true
diff --git a/prometheus/meta/fluentd.yml b/prometheus/meta/fluentd.yml
new file mode 100644
index 0000000..ea8b052
--- /dev/null
+++ b/prometheus/meta/fluentd.yml
@@ -0,0 +1,46 @@
+{%- if pillar.get('fluentd', {}).get('agent', {}).get('enabled', False) %}
+{% from "prometheus/map.jinja" import alertmanager, sf_notifier with context %}
+{%- if alertmanager.enabled and sf_notifier.enabled %}
+agent:
+  config:
+    label:
+      sfnotifier:
+        input:
+          tail_sfnotifier:
+            type: tail
+            tag: sfnotifier.log
+            path: {{ sf_notifier.dir.logs }}/sf-notifier.log
+            pos_file: {{ pillar.fluentd.agent.dir.positiondb }}/sfnotifier.pos
+            parser:
+              type: multi_format
+              patterns:
+                # sf-notifier application log lines, "[<Timestamp>] <Severity> in <Module>: <Payload>": https://regex101.com/r/JsYcJU/1
+                - type: regexp
+                  time_key: Timestamp
+                  time_format: '%F %H:%M:%S'
+                  keep_time_key: false
+                  format: '/^\[(?<Timestamp>.*)\] (?<Severity>.*?) in (?<Module>.*?)\: (?<Payload>.+)$/'
+                # HTTP request log lines (client IP, bracketed timestamp, quoted request line, status; no Severity field): https://regex101.com/r/8i6u1v/2
+                - type: regexp
+                  time_key: Timestamp
+                  time_format: '%d/%b/%Y %H:%M:%S'
+                  keep_time_key: false
+                  format: '/^(?<http_client_ip_address>[^ ]*) - - \[(?<Timestamp>.*)\] "(?<http_method>\S+) (?<http_url>[^ ]*) \S*" (?<http_status>[^ ]*) -$/'
+        filter:
+          sfnotifier_record:  # adds service and severity_label fields to every sfnotifier.log record
+            type: record_transformer
+            tag: sfnotifier.log
+            enable_ruby: true
+            remove_keys: priority
+            record:
+              - name: service
+                value: sfnotifier
+              - name: severity_label  # parsed Severity name -> numeric level; 6 (INFO) when unknown or absent
+                value: '${ {"TRACE"=>8,"DEBUG"=>7,"INFO"=>6,"NOTICE"=>5,"WARNING"=>4,"ERROR"=>3,"CRITICAL"=>2,"ALERT"=>1,"EMERGENCY"=>0}.fetch(record["Severity"].to_s.upcase, 6) }'
+        match:
+          push_to_default:  # hands events matching the tag below over to another label
+            tag: sfnotifier.**
+            type: relabel
+            label: default_output  # not defined in this file; presumably provided by the fluentd formula - verify
+{%- endif %}
+{%- endif %}
diff --git a/prometheus/meta/prometheus.yml b/prometheus/meta/prometheus.yml
index 9d2755b..f53d66d 100644
--- a/prometheus/meta/prometheus.yml
+++ b/prometheus/meta/prometheus.yml
@@ -1,5 +1,5 @@
 {%- if pillar.prometheus is defined %}
-{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}
+{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay, sf_notifier with context %}
 server:
   alert:
 {%- if server.get('enabled', False) %}
@@ -185,17 +185,62 @@
         description: "All Prometheus Long Term Storage services are down for 2 minutes."
 {%- endraw %}
 {%- endif %}
-{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
-  {%- set addresses = [] %}
-  {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
-    {%- do addresses.append(server.bind.address) %}
-  {%- endif %}
-  {%- for address in grains['fqdn_ip4'] %}
-    {%- if not address.startswith('127') %}
-      {%- do addresses.append(address) %}
-    {%- endif %}
-  {%- endfor %}
+{%- if sf_notifier.get('enabled', False) %}
+{%- raw %}
+    SfNotifierDown:  # no sf_auth_ok series has been present for 2 minutes
+      if: >-
+        absent(sf_auth_ok) == 1
+      for: 2m
+      labels:
+        severity: critical
+        service: sf-notifier
+      annotations:
+        summary: "Sf-notifier service is down"
+        description: "The sf-notifier service is down for 2 minutes."
+    SfNotifierAuthFailure:  # sf_auth_ok has reported 0 for 2 minutes
+      if: >-
+        sf_auth_ok == 0
+      for: 2m
+      labels:
+        severity: critical
+        service: sf-notifier
+      annotations:
+        summary: "Sf-notifier authentication failure"
+        description: "The sf-notifier service fails to authenticate to Salesforce for 2 minutes."
+    SfNotifierErrorsWarning:  # sf_error_count_total grew within each 2-minute window, sustained for 2 minutes
+      if: >-
+        increase(sf_error_count_total[2m]) > 0
+      for: 2m
+      labels:
+        severity: warning
+        service: sf-notifier
+      annotations:
+        summary: "Sf-notifier error rate increase"
+        description: "An average of {{ $value }} sf-notifier error requests appear for 2 minutes."
+{%- endraw %}
+{%- endif %}
+{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) or sf_notifier.get('enabled', False) %}
   target:
+  {%- if sf_notifier.get('enabled', False) %}
+    dns:  # DNS-based Prometheus target for sf-notifier, rendered only when sf_notifier is enabled
+      enabled: true
+      endpoint:
+        - name: 'sf_notifier'
+          domain:
+            - 'tasks.monitoring_sf_notifier'
+          type: A
+          port: {{ sf_notifier.uwsgi.bind_port }}
+  {%- endif %}
+  {%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
+    {%- set addresses = [] %}
+    {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
+      {%- do addresses.append(server.bind.address) %}
+    {%- endif %}
+    {%- for address in grains['fqdn_ip4'] %}
+      {%- if not address.startswith('127') %}
+        {%- do addresses.append(address) %}
+      {%- endif %}
+    {%- endfor %}
     static:
       prometheus_lts:
         enabled: True
@@ -217,5 +262,6 @@
             replacement: {{ grains['host'] }}
             source_labels: "__address__"
             target_label: "host"
+  {%- endif %}
 {%- endif %}
 {%- endif %}