Add Prometheus scraping endpoint
Metrics are exposed on the /metrics endpoint, which is mounted
only when the app is run via uWSGI with the app_dispatch callable:
uwsgi --http 127.0.0.1:5000 \
--wsgi-file sf_notifier/server.py \
--callable app_dispatch
Change-Id: I3aa8f715e81f6032d6c14e1b764324ea05473e45
Related-bug: PROD-25099 (PROD:25099)
diff --git a/README.md b/README.md
index 04b5e7d..8129986 100644
--- a/README.md
+++ b/README.md
@@ -41,11 +41,11 @@
Run server:
```
-$ venv/bin/flask run
+$ venv/bin/uwsgi --http 127.0.0.1:5000 --wsgi-file sf_notifier/server.py --callable app_dispatch
```
Check in browser:
```
-http://127.0.0.1:5000/health
+http://127.0.0.1:5000/metrics
```
diff --git a/requirements.txt b/requirements.txt
index d2154e7..94ceb82 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@
itsdangerous==1.1.0
Jinja2==2.10
MarkupSafe==1.1.0
+prometheus-client==0.4.2
pycparser==2.19
pyOpenSSL==18.0.0
PyYAML==3.13
@@ -19,4 +20,5 @@
simple-settings==0.13.0
six==1.11.0
urllib3==1.24.1
+uWSGI==2.0.17.1
Werkzeug==0.14.1
diff --git a/sf_notifier/helpers.py b/sf_notifier/helpers.py
index 8d9d7d5..8a5bf8d 100644
--- a/sf_notifier/helpers.py
+++ b/sf_notifier/helpers.py
@@ -15,20 +15,22 @@
RESOLVED_STATUSES = ('UP', 'OK', 'resolved')
+SUBJECT_FMT = '[{}][{}] {}'
def alert_fields_and_action(alert):
fields = []
- action = 'create_case'
if alert['status'] in RESOLVED_STATUSES:
- fields.append(alert['labels'])
action = 'close_case'
+ fields.append(alert['labels'])
else:
+ action = 'create_case'
# Order matters
- fields.append('[sf-notifier] {}'.format(
- alert['annotations']['summary'])
- )
+ subject = SUBJECT_FMT.format(alert['labels']['environment'],
+ alert['labels'].get('host', 'none'),
+ alert['annotations']['summary'])
+ fields.append(subject)
fields.append(alert['annotations']['description'])
fields.append(alert['status'])
fields.append(alert['labels'])
diff --git a/sf_notifier/salesforce/client.py b/sf_notifier/salesforce/client.py
index 319488d..79a6467 100644
--- a/sf_notifier/salesforce/client.py
+++ b/sf_notifier/salesforce/client.py
@@ -18,6 +18,8 @@
import os
import uuid
+from prometheus_client import Counter, Gauge
+
import requests
from simple_salesforce import Salesforce
@@ -66,6 +68,11 @@
class SalesforceClient(object):
def __init__(self, config):
+ self.metrics = {
+ 'sf_auth_ok': Gauge('sf_auth_ok', 'sf-notifier'),
+ 'sf_error_count': Counter('sf_error_count', 'sf-notifier'),
+ 'sf_request_count': Counter('sf_request_count', 'sf-notifier')
+ }
self.session = requests.Session()
self.config = self._validate_config(config)
self.environment = self.config.pop('environment_id')
@@ -103,8 +110,10 @@
self.sf = Salesforce(**kwargs)
except sf_exceptions.SalesforceAuthenticationFailed:
logger.error('Salesforce authentication failure.')
+ self.metrics['sf_auth_ok'].set(0)
return
logger.info('Salesforce authentication successful.')
+ self.metrics['sf_auth_ok'].set(1)
@staticmethod
def _get_alert_id(labels):
@@ -117,7 +126,7 @@
def _create_case(self, subject, body, labels, alert_id):
if alert_id in self._registered_alerts:
- logger.info('Duplicate case for alert: {}.'.format(alert_id))
+ logger.warning('Duplicate case for alert: {}.'.format(alert_id))
return 1, self._registered_alerts[alert_id]['Id']
severity = labels.get('severity', 'unknown').upper()
@@ -133,9 +142,11 @@
'Environment2__c': self.environment,
'Alert_ID__c': alert_id,
}
- logger.info('Try to create case: {}'.format(payload))
+ logger.info('Try to create case: {}.'.format(payload))
try:
+ self.metrics['sf_request_count'].inc()
case = self.sf.Case.create(payload)
+ logger.info('Created case: {}.'.format(case))
except sf_exceptions.SalesforceMalformedRequest as ex:
msg = ex.content[0]['message']
err_code = ex.content[0]['errorCode']
@@ -146,22 +157,16 @@
self._registered_alerts[alert_id] = {'Id': case_id}
return 1, case_id
else:
+ self.metrics['sf_error_count'].inc()
raise
self._registered_alerts[alert_id] = {'Id': case['id']}
return 0, case['id']
@sf_auth_retry
- def _get_case(self, case_id):
- return self.sf.Case.get(case_id)
-
- @sf_auth_retry
- def _update_case(self, case_id, data):
- return self.sf.Case.update(case_id, data)
-
- @sf_auth_retry
def _close_case(self, case_id):
logger.info('Try to close case: {}.'.format(case_id))
+ self.metrics['sf_request_count'].inc()
update = self.sf.Case.update(
case_id,
{'Status': 'Auto-solved', 'Alert_ID__c': uuid.uuid4().hex}
diff --git a/sf_notifier/server.py b/sf_notifier/server.py
index 658061d..a42c26a 100644
--- a/sf_notifier/server.py
+++ b/sf_notifier/server.py
@@ -18,6 +18,8 @@
from flask import Flask, Response, jsonify, request
+from prometheus_client import make_wsgi_app
+
from sf_notifier.helpers import alert_fields_and_action
from sf_notifier.salesforce.client import SalesforceClient
@@ -25,19 +27,19 @@
from simple_settings import settings
+from werkzeug.wsgi import DispatcherMiddleware
+
dictConfig(settings.LOGGING)
-app = Flask('__name__')
+app = Flask(__name__)
+app_dispatch = DispatcherMiddleware(app, {
+ '/metrics': make_wsgi_app()
+})
+
sf_cli = SalesforceClient(settings.SF_CONFIG)
-@app.route('/health', methods=['GET'])
-def health():
- app.logger.info('Health: OK!')
- return 'OK!'
-
-
@app.route('/hook', methods=['POST'])
def webhook_receiver():
@@ -66,8 +68,9 @@
if fields:
try:
cases.append(getattr(sf_cli, action)(*fields))
- except SalesforceMalformedRequest:
- msg = 'Salesforce request failure.'
+ except SalesforceMalformedRequest as err:
+ msg = 'Salesforce request failure: {}.'.format(err)
+ sf_cli.metrics['sf_error_count'].inc()
app.logger.error(msg)
return Response(json.dumps({'error': msg}),
status=500,
diff --git a/sf_notifier/settings/production.py b/sf_notifier/settings/production.py
index b886190..d98036f 100644
--- a/sf_notifier/settings/production.py
+++ b/sf_notifier/settings/production.py
@@ -20,11 +20,11 @@
},
'loggers': {
'sf_notifier.server': {
- 'level': 'WARN',
+ 'level': 'INFO',
'handlers': ['file', 'wsgi']
},
'sf_notifier.salesforce.client': {
- 'level': 'WARN',
+ 'level': 'INFO',
'handlers': ['file', 'wsgi']
}
}