Add sf-notifier monitoring:
- add alerts for checking service health
- add Prometheus dns target
- add fluentd configuration
Change-Id: Id61f613cf3d853d1a37b0834ab6149fb2e9b10b5
Related-bug: PROD-25099 (PROD:25099)
diff --git a/.travis.yml b/.travis.yml
index 516d0f6..e21a264 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,8 +15,6 @@
- bundle install
env:
- - PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2016.3/salt:2018_11_19 SUITE=alertmanager
- - PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2016.3/salt:2018_11_19 SUITE=server
- PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2017.7/salt:2018_11_19 SUITE=alertmanager
- PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-2017.7/salt:2018_11_19 SUITE=server
- PLATFORM=docker-dev-local.docker.mirantis.net/epcim/salt/saltstack-ubuntu-xenial-salt-stable/salt:2018_11_19 SUITE=alertmanager
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index e4d3cb4..bbbbf18 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -7,3 +7,5 @@
enabled: true
telegraf:
enabled: true
+ fluentd:
+ enabled: true
diff --git a/prometheus/meta/fluentd.yml b/prometheus/meta/fluentd.yml
new file mode 100644
index 0000000..ea8b052
--- /dev/null
+++ b/prometheus/meta/fluentd.yml
@@ -0,0 +1,46 @@
+{%- if pillar.get('fluentd', {}).get('agent', {}).get('enabled', False) %}
+{% from "prometheus/map.jinja" import alertmanager, sf_notifier with context %}
+{%- if alertmanager.enabled and sf_notifier.enabled %}
+agent:
+ config:
+ label:
+ sfnotifier:
+ input:
+ tail_sfnotifier:
+ type: tail
+ tag: sfnotifier.log
+ path: {{ sf_notifier.dir.logs }}/sf-notifier.log
+ pos_file: {{ pillar.fluentd.agent.dir.positiondb }}/sfnotifier.pos
+ parser:
+ type: multi_format
+ patterns:
+ # sfnotifier log format: https://regex101.com/r/JsYcJU/1
+ - type: regexp
+ time_key: Timestamp
+ time_format: '%F %H:%M:%S'
+ keep_time_key: false
+ format: '/^\[(?<Timestamp>.*)\] (?<Severity>.*?) in (?<Module>.*?)\: (?<Payload>.+)$/'
+ # http log format: https://regex101.com/r/8i6u1v/2
+ - type: regexp
+ time_key: Timestamp
+ time_format: '%d/%b/%Y %H:%M:%S'
+ keep_time_key: false
+ format: '/^(?<http_client_ip_address>[^ ]*) - - \[(?<Timestamp>.*)\] "(?<http_method>\S+) (?<http_url>[^ ]*) \S*" (?<http_status>[^ ]*) -$/'
+ filter:
+ sfnotifier_record:
+ type: record_transformer
+ tag: sfnotifier.log
+ enable_ruby: true
+ remove_keys: priority
+ record:
+ - name: service
+ value: sfnotifier
+ - name: severity_label  # numeric syslog severity for the parsed level name; 6 (INFO) when the field is absent or unknown
+ value: '${ {"TRACE"=>8,"DEBUG"=>7,"INFO"=>6,"NOTICE"=>5,"WARNING"=>4,"ERROR"=>3,"CRITICAL"=>2,"ALERT"=>1,"EMERGENCY"=>0}.fetch(record["Severity"], 6) }'
+ match:
+ push_to_default:
+ tag: sfnotifier.**
+ type: relabel
+ label: default_output
+{%- endif %}
+{%- endif %}
diff --git a/prometheus/meta/prometheus.yml b/prometheus/meta/prometheus.yml
index 9d2755b..f53d66d 100644
--- a/prometheus/meta/prometheus.yml
+++ b/prometheus/meta/prometheus.yml
@@ -1,5 +1,5 @@
{%- if pillar.prometheus is defined %}
-{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}
+{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay, sf_notifier with context %}
server:
alert:
{%- if server.get('enabled', False) %}
@@ -185,17 +185,62 @@
description: "All Prometheus Long Term Storage services are down for 2 minutes."
{%- endraw %}
{%- endif %}
-{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
- {%- set addresses = [] %}
- {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
- {%- do addresses.append(server.bind.address) %}
- {%- endif %}
- {%- for address in grains['fqdn_ip4'] %}
- {%- if not address.startswith('127') %}
- {%- do addresses.append(address) %}
- {%- endif %}
- {%- endfor %}
+{%- if sf_notifier.get('enabled', False) %}
+{%- raw %}
+ SfNotifierDown:
+ if: >-
+ absent(sf_auth_ok) == 1
+ for: 2m
+ labels:
+ severity: critical
+ service: sf-notifier
+ annotations:
+ summary: "Sf-notifier service is down"
+ description: "The sf-notifier service is down for 2 minutes."
+ SfNotifierAuthFailure:
+ if: >-
+ sf_auth_ok == 0
+ for: 2m
+ labels:
+ severity: critical
+ service: sf-notifier
+ annotations:
+ summary: "Sf-notifier authentication failure"
+ description: "The sf-notifier service fails to authenticate to Salesforce for 2 minutes."
+ SfNotifierErrorsWarning:
+ if: >-
+ increase(sf_error_count_total[2m]) > 0
+ for: 2m
+ labels:
+ severity: warning
+ service: sf-notifier
+ annotations:
+ summary: "Sf-notifier error rate increase"
+ description: "The sf-notifier error counter increased by {{ $value }} over the last 2 minutes."
+{%- endraw %}
+{%- endif %}
+{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) or sf_notifier.get('enabled', False) %}
target:
+ {%- if sf_notifier.get('enabled', False) %}
+ dns:
+ enabled: true
+ endpoint:
+ - name: 'sf_notifier'
+ domain:
+ - 'tasks.monitoring_sf_notifier'
+ type: A
+ port: {{ sf_notifier.uwsgi.bind_port }}
+ {%- endif %}
+ {%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
+ {%- set addresses = [] %}
+ {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
+ {%- do addresses.append(server.bind.address) %}
+ {%- endif %}
+ {%- for address in grains['fqdn_ip4'] %}
+ {%- if not address.startswith('127') %}
+ {%- do addresses.append(address) %}
+ {%- endif %}
+ {%- endfor %}
static:
prometheus_lts:
enabled: True
@@ -217,5 +262,6 @@
replacement: {{ grains['host'] }}
source_labels: "__address__"
target_label: "host"
+ {%- endif %}
{%- endif %}
{%- endif %}