Add script to filter logs for ERRORs based on a whitelist

The script takes a directory or URL containing log files.
For now, all non-whitelisted errors are dumped to the console but
the script always returns success. Once we are convinced it is
reliable enough, we can change it to fail on non-whitelisted errors.
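
For example (directory path and URL are illustrative):

    tools/check_logs.py -d /opt/stack/logs
    tools/check_logs.py -u http://logs.example.org/<job-id>/logs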

Partially implements blueprint fail-gate-on-log-errors

Change-Id: I30b0eee1055f47aaad7984d886c739ccf5aa6186
diff --git a/etc/whitelist.yaml b/etc/whitelist.yaml
new file mode 100644
index 0000000..6762f9f
--- /dev/null
+++ b/etc/whitelist.yaml
@@ -0,0 +1,125 @@
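+# Known-benign ERROR/CRITICAL message regexps, keyed by screen log name.
+# check_logs.py whitelists a line when both module and message match.
+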
+n-cpu:
+    - module: "nova.virt.libvirt.driver"
+      message: "During wait destroy, instance disappeared"
+    - module: "glanceclient.common.http"
+      message: "Request returned failure status"
+    - module: "nova.openstack.common.periodic_task"
+      message: "Error during ComputeManager\\.update_available_resource: \
+        'NoneType' object is not iterable"
+    - module: "nova.compute.manager"
+      message: "Possibly task preempted"
+    - module: "nova.openstack.common.rpc.amqp"
+      message: "Exception during message handling"
+    - module: "nova.network.api"
+      message: "Failed storing info cache"
+    - module: "nova.compute.manager"
+      message: "Error while trying to clean up image"
+    - module: "nova.virt.libvirt.driver"
+      message: "Error injecting data into image.*\\(Unexpected error while \
+        running command"
+    - module: "nova.compute.manager"
+      message: "Instance failed to spawn"
+    - module: "nova.compute.manager"
+      message: "Error: Unexpected error while running command"
+
+g-api:
+    - module: "glance.store.sheepdog"
+      message: "Error in store configuration: Unexpected error while \
+        running command"
+    - module: "swiftclient"
+      message: "Container HEAD failed: .*404 Not Found"
+
+ceilometer-acompute:
+    - module: "ceilometer.compute.pollsters.disk"
+      message: "Requested operation is not valid: domain is not running"
+    - module: "ceilometer.compute.pollsters.disk"
+      message: "Domain not found: no domain with matching uuid"
+
+h-api:
+    - module: "root"
+      message: "Returning 400 to user: The server could not comply with \
+        the request since it is either malformed or otherwise incorrect"
+    - module: "root"
+      message: "Unexpected error occurred serving API: Request limit \
+        exceeded: Template exceeds maximum allowed size"
+    - module: "root"
+      message: "Unexpected error occurred serving API: The Stack \
+        .*could not be found"
+
+h-eng:
+    - module: "heat.openstack.common.rpc.amqp"
+      message: "Exception during message handling"
+    - module: "heat.openstack.common.rpc.common"
+      message: "The Stack .* could not be found"
+
+n-api:
+    - module: "glanceclient.common.http"
+      message: "Request returned failure status"
+    - module: "nova.api.openstack"
+      message: "Caught error: Quota exceeded for"
+    - module: "nova.compute.api"
+      message: "ServerDiskConfigTest"
+    - module: "nova.compute.api"
+      message: "ServersTest"
+    - module: "nova.compute.api"
+      message: "\\{u'kernel_id'.*u'ramdisk_id':"
+
+n-cond:
+    - module: "nova.notifications"
+      message: "Failed to send state update notification"
+
+n-sch:
+    - module: "nova.scheduler.filter_scheduler"
+      message: "Error from last host: "
+
+c-api:
+    - module: "cinder.api.middleware.fault"
+      message: "Caught error: Volume .* could not be found"
+
+ceilometer-collector:
+    - module: "stevedore.extension"
+      message: ".*"
+    - module: "ceilometer.collector.dispatcher.database"
+      message: "duplicate key value violates unique constraint"
+
+q-agt:
+    - module: "neutron.agent.linux.ovs_lib"
+      message: "Unable to execute.*Exception:"
+
+q-dhcp:
+    - module: "neutron.common.legacy"
+      message: "Skipping unknown group key: firewall_driver"
+    - module: "neutron.agent.dhcp_agent"
+      message: "Unable to enable dhcp"
+    - module: "neutron.agent.dhcp_agent"
+      message: "Network .* RPC info call failed"
+
+q-l3:
+    - module: "neutron.common.legacy"
+      message: "Skipping unknown group key: firewall_driver"
+    - module: "neutron.agent.l3_agent"
+      message: "Failed synchronizing routers"
+
+q-lbaas:
+    - module: "neutron.common.legacy"
+      message: "Skipping unknown group key: firewall_driver"
+    - module: "neutron.services.loadbalancer.drivers.haproxy.agent_manager"
+      message: "Error upating stats"
+    - module: "neutron.services.loadbalancer.drivers.haproxy.agent_manager"
+      message: "Unable to destroy device for pool"
+
+q-svc:
+    - module: "neutron.common.legacy"
+      message: "Skipping unknown group key: firewall_driver"
+    - module: "neutron.openstack.common.rpc.amqp"
+      message: "Exception during message handling"
+    - module: "neutron.openstack.common.rpc.common"
+      message: "Network .* could not be found"
+    - module: "neutron.openstack.common.rpc.common"
+      message: "Pool .* could not be found"
+    - module: "neutron.api.v2.resource"
+      message: "show failed"
+
diff --git a/tools/check_logs.py b/tools/check_logs.py
index 0cc3677..2ad4f70 100755
--- a/tools/check_logs.py
+++ b/tools/check_logs.py
@@ -16,7 +16,157 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
+import argparse
+import gzip
+import os
+import re
+import StringIO
 import sys
+import urllib2
+
+import yaml
+
+
+def process_files(file_specs, url_specs, whitelists):
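+    """Scan local and remote log files for non-whitelisted errors.
+
+    file_specs and url_specs are lists of (log-name, location) pairs;
+    whitelists maps a log name to its list of module/message entries.
+    Returns True if any non-whitelisted ERROR/CRITICAL line is found.
+    """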
+    regexp = re.compile(r"^.*(ERROR|CRITICAL).*\[.*\-.*\]")
+    had_errors = False
+    for (name, filename) in file_specs:
+        whitelist = whitelists.get(name, [])
+        with open(filename) as content:
+            if scan_content(name, content, regexp, whitelist):
+                had_errors = True
+    for (name, url) in url_specs:
+        whitelist = whitelists.get(name, [])
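+        # Logs on the server are stored gzipped; request gzip and
+        # decompress the body in memory before scanning.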
+        req = urllib2.Request(url)
+        req.add_header('Accept-Encoding', 'gzip')
+        page = urllib2.urlopen(req)
+        buf = StringIO.StringIO(page.read())
+        f = gzip.GzipFile(fileobj=buf)
+        if scan_content(name, f.read().splitlines(), regexp, whitelist):
+            had_errors = True
+    return had_errors
+
+
+def scan_content(name, content, regexp, whitelist):
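+    """Print any line that matches regexp but no whitelist entry.
+
+    Lines beginning with "Stderr:" are ignored. A whitelist entry
+    matches when the escaped module name followed by the message
+    regexp appears in the line. Returns True if anything was printed.
+    """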
+    had_errors = False
+    for line in content:
+        if not line.startswith("Stderr:") and regexp.match(line):
+            whitelisted = False
+            for w in whitelist:
+                pat = ".*%s.*%s.*" % (re.escape(w['module']),
+                                      w['message'])
+                if re.match(pat, line):
+                    whitelisted = True
+                    break
+            if not whitelisted:
+                if not had_errors:
+                    print("Log File: %s" % name)
+                had_errors = True
+                print(line)
+    return had_errors
+
+
+def collect_url_logs(url):
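+    """Scrape the index page at url for gzipped screen log file names."""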
+    page = urllib2.urlopen(url)
+    content = page.read()
+    logs = re.findall(r'(screen-[\w-]+\.txt\.gz)</a>', content)
+    return logs
+
+
+def main(opts):
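+    """Validate args, build the lists of logs to check, and scan them."""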
+    if (opts.directory and opts.url) or not (opts.directory or opts.url):
+        print("Must provide exactly one of -d or -u")
+        return 1
+    print("Checking logs...")
+    WHITELIST_FILE = os.path.join(
+        os.path.abspath(os.path.dirname(os.path.dirname(__file__))),
+        "etc", "whitelist.yaml")
+
+    file_matcher = re.compile(r".*screen-([\w-]+)\.log")
+    files = []
+    if opts.directory:
+        d = opts.directory
+        for f in os.listdir(d):
+            files.append(os.path.join(d, f))
+    files_to_process = []
+    for f in files:
+        m = file_matcher.match(f)
+        if m:
+            files_to_process.append((m.group(1), f))
+
+    url_matcher = re.compile(r".*screen-([\w-]+)\.txt\.gz")
+    urls = []
+    if opts.url:
+        for logfile in collect_url_logs(opts.url):
+            urls.append("%s/%s" % (opts.url, logfile))
+    urls_to_process = []
+    for u in urls:
+        m = url_matcher.match(u)
+        if m:
+            urls_to_process.append((m.group(1), u))
+
+    whitelists = {}
+    with open(WHITELIST_FILE) as stream:
+        loaded = yaml.safe_load(stream)
+        if loaded:
+            for (name, l) in loaded.iteritems():
+                for w in l:
+                    assert 'module' in w, 'no module in %s' % name
+                    assert 'message' in w, 'no message in %s' % name
+            whitelists = loaded
+    if process_files(files_to_process, urls_to_process, whitelists):
+        print("Logs have errors")
+        # Return non-zero to start failing builds
+        return 0
+    else:
+        print("ok")
+        return 0
+
+usage = """
+Find non-whitelisted log errors in log files from a devstack-gate run.
+Log files are searched for ERROR or CRITICAL messages; any message
+that does not match a whitelist entry in etc/whitelist.yaml is printed
+to the console. For now the script always exits successfully; it will
+be changed to fail once the whitelist is proven reliable. Provide a
+directory of log files or the URL of an OpenStack gate job's logs.
+
+The whitelist yaml looks like:
+
+log-name:
+    - module: "a.b.c"
+      message: "regexp"
+    - module: "a.b.c"
+      message: "regexp"
+
+repeated for each log file with a whitelist.
+"""
+
+parser = argparse.ArgumentParser(description=usage)
+parser.add_argument('-d', '--directory',
+                    help="Directory containing log files")
+parser.add_argument('-u', '--url',
+                    help="url containing logs from an OpenStack gate job")
 
 if __name__ == "__main__":
-    sys.exit(0)
+    try:
+        sys.exit(main(parser.parse_args()))
+    except Exception as e:
+        print("Failure in script: %s" % e)
+        # Don't fail if there is a problem with the script.
+        sys.exit(0)