Parse XML results to JSON files

After running XCCDF or OVAL scans, parse the results.xml file into a
results.json file in the format accepted by our `worp` API.

Also vendors in the `untangle` lib v1.1.0
(https://github.com/stchris/untangle, MIT License)

Change-Id: I87f106c4b8b678e1b125ffab832f80ee261a4781
Related-Issue: https://mirantis.jira.com/browse/PROD-23159
Related-Issue: https://mirantis.jira.com/browse/PROD-23160
diff --git a/_modules/oscap/commands.py b/_modules/oscap/commands.py
index 44c703d..6682ae3 100644
--- a/_modules/oscap/commands.py
+++ b/_modules/oscap/commands.py
@@ -2,6 +2,7 @@
 import tempfile
 import os
 from oscap.utils import build_tailoring, normalize_id, run
+from oscap.utils import xccdf_xml_to_json, oval_xml_to_json
 
 def oscap_has_sce():
     (stdout, _, _) = run('oscap -V')
@@ -40,6 +41,9 @@
             f.write(build_tailoring(pillar_data, tailoring_id))
 
     stdout, stderr, rc = run(cmd, tempdir)
+    res_file = os.path.join(tempdir, 'results.xml')
+    if os.path.isfile(res_file):
+        xccdf_xml_to_json(res_file)
     return stdout, stderr, rc, tempdir
 
 def oval(benchmark):
@@ -48,4 +52,7 @@
     cmd = cmd.format(benchmark)
 
     stdout, stderr, rc = run(cmd, tempdir)
+    res_file = os.path.join(tempdir, 'results.xml')
+    if os.path.isfile(res_file):
+        oval_xml_to_json(res_file)
     return stdout, stderr, rc, tempdir
diff --git a/_modules/oscap/untangle.py b/_modules/oscap/untangle.py
new file mode 100644
index 0000000..6f0b22b
--- /dev/null
+++ b/_modules/oscap/untangle.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+
+"""
+ untangle
+
+ Converts xml to python objects.
+
+ The only method you need to call is parse()
+
+ Partially inspired by xml2obj
+ (http://code.activestate.com/recipes/149368-xml2obj/)
+
+ Author: Christian Stefanescu (http://0chris.com)
+ License: MIT License - http://www.opensource.org/licenses/mit-license.php
+"""
+
+import os
+from xml.sax import make_parser, handler
+try:
+    from StringIO import StringIO
+except ImportError:
+    from io import StringIO
+
+__version__ = '1.1.0'
+
+
+class Element():
+    """
+    Representation of an XML element: tag name, attributes, children, cdata.
+    """
+    def __init__(self, name, attributes):
+        self._name = name
+        self._attributes = attributes
+        self.children = []  # child Elements, in the order they were added
+        self.is_root = False
+        self.cdata = ''  # character data accumulated through add_cdata()
+
+    def add_child(self, element):
+        self.children.append(element)
+
+    def add_cdata(self, cdata):
+        self.cdata = self.cdata + cdata
+
+    def get_attribute(self, key):
+        return self._attributes.get(key)
+
+    def get_elements(self, name=None):
+        if name:
+            return [e for e in self.children if e._name == name]
+        else:
+            return self.children
+
+    def __getitem__(self, key):  # element['attr'] is get_attribute('attr')
+        return self.get_attribute(key)
+
+    def __getattr__(self, key):  # element.child: one Element, or a list of them
+        matching_children = [x for x in self.children if x._name == key]
+        if matching_children:
+            if len(matching_children) == 1:
+                self.__dict__[key] = matching_children[0]  # cache the lookup
+                return matching_children[0]
+            else:
+                self.__dict__[key] = matching_children  # cache the lookup
+                return matching_children
+        else:
+            raise IndexError('Unknown key <%s>' % key)  # NOTE: not AttributeError
+
+    def __iter__(self):  # a single child can be looped over like a list
+        yield self
+
+    def __str__(self):
+        return (
+            "Element <%s> with attributes %s and children %s" %
+            (self._name, self._attributes, self.children)
+        )
+
+    def __repr__(self):
+        return (
+            "Element(name = %s, attributes = %s, cdata = %s)" %
+            (self._name, self._attributes, self.cdata)
+        )
+
+    def __nonzero__(self):  # truth-value hook under its Python 2 name
+        return self.is_root or self._name is not None
+
+    def __eq__(self, val):  # equality compares the element's cdata with val
+        return self.cdata == val
+
+    def __dir__(self):  # lists the names of the child elements
+        children_names = [x._name for x in self.children]
+        return children_names
+
+
+class Handler(handler.ContentHandler):
+    """
+    SAX handler which creates the Python object structure out of ``Element``s
+    """
+    def __init__(self):
+        self.root = Element(None, None)  # nameless container for the document
+        self.root.is_root = True
+        self.elements = []  # stack of the elements that are currently open
+
+    def startElement(self, name, attributes):
+        name = name.replace('-', '_')  # '-', '.' and ':' are replaced with '_'
+        name = name.replace('.', '_')  # so that the tag name can be used as a
+        name = name.replace(':', '_')  # Python attribute name on the parent
+        attrs = dict()
+        for k, v in attributes.items():
+            attrs[k] = v
+        element = Element(name, attrs)
+        if len(self.elements) > 0:
+            self.elements[-1].add_child(element)  # child of the innermost open tag
+        else:
+            self.root.add_child(element)  # top-level element of the document
+        self.elements.append(element)
+
+    def endElement(self, name):
+        self.elements.pop()
+
+    def characters(self, cdata):  # text is appended to the innermost open tag
+        self.elements[-1].add_cdata(cdata)
+
+
+def parse(filename):
+    """
+    Interprets the given string as a filename, URL or XML data string,
+    parses it and returns a Python object which represents the given
+    document.
+
+    Raises ``ValueError`` if the argument is None / empty or blank string.
+
+    Raises ``xml.sax.SAXParseException`` if something goes wrong
+    during parsing.
+    """
+    if filename is None or filename.strip() == '':
+        raise ValueError('parse() takes a filename, URL or XML string')
+    parser = make_parser()
+    sax_handler = Handler()
+    parser.setContentHandler(sax_handler)
+    if os.path.exists(filename) or is_url(filename):
+        parser.parse(filename)
+    else:
+        parser.parse(StringIO(filename))  # anything else is taken as XML text
+
+    return sax_handler.root
+
+
+def is_url(string):  # True when ``string`` starts with http:// or https://
+    return string.startswith('http://') or string.startswith('https://')
+
+# vim: set expandtab ts=4 sw=4:
diff --git a/_modules/oscap/utils.py b/_modules/oscap/utils.py
index 164949d..f026fdc 100644
--- a/_modules/oscap/utils.py
+++ b/_modules/oscap/utils.py
@@ -1,11 +1,17 @@
+import collections
+import datetime
+import json
 from lxml.etree import Element, SubElement, tostring
+import os.path
+import re
 from subprocess import Popen, PIPE
 import shlex
-import re
-import datetime
 
 import salt.ext.six as six
 
+from oscap import untangle
+
+
 def normalize_id(id,
                  xccdf_version='1.2',
                  typeof='profile',
@@ -16,6 +22,7 @@
             return 'xccdf_org.{0}.content_{1}_{2}'.format(vendor, typeof, id)
     return id
 
+
 def build_tailoring(data, id):
     xccdf_version = data.get('xccdf_version', '1.2')
     ns = {None: 'http://checklists.nist.gov/xccdf/{}'.format(xccdf_version)}
@@ -26,12 +33,11 @@
     tailoring.append(Element('benchmark', {'href': ext}))
 
     now = datetime.datetime.now().isoformat()
-    version = SubElement(tailoring, 'version', time=now).text = '1'
+    SubElement(tailoring, 'version', time=now).text = '1'
 
     profile = SubElement(tailoring, 'Profile', id=pid, extends=ext)
 
-    title = SubElement(profile, 'title').text = \
-        'Extends {}'.format(ext)
+    SubElement(profile, 'title').text = 'Extends {}'.format(ext)
 
     for key, value in six.iteritems(data.get('values', {})):
         idref = normalize_id(key, xccdf_version, typeof='value')
@@ -39,8 +45,122 @@
         elem.text = str(value)
     return tostring(tailoring, pretty_print=True)
 
+
 def run(cmd, cwd=None):
     # The Popen used here because the __salt__['cmd.run'] returns only stdout
     proc = Popen(shlex.split(cmd), stdout=PIPE, stderr=PIPE, cwd=cwd)
     (stdout, stderr) = proc.communicate()
     return stdout, stderr, proc.returncode
+
+
+def _get_flatten_groups(document, groups=None):  # -> flat list of Group elements
+    groups = groups if groups else []  # new list when none (or an empty one) given
+    if hasattr(document, 'Group'):
+        for group in document.Group:
+            groups.append(group)  # the group itself first ...
+            groups = _get_flatten_groups(group, groups)  # ... then nested ones
+    return groups
+
+
+def _get_rules(groups):
+    rules = {}  # Rule id -> {'title', 'severity', 'description'}
+    for group in groups:
+        if hasattr(group, 'Rule'):  # only groups that have Rule children
+            for rule in group.Rule:
+                rules[rule['id']] = {
+                    'title': rule.title.cdata,
+                    'severity': rule['severity'],
+                    'description': rule.description.cdata}
+    return rules
+
+
+def _parse_xccdf_doc(document):
+    """Return one dict per ``rule-result`` of the parsed XCCDF document."""
+    # Index the Rule metadata (title, description) of all nested Groups by id.
+    rules = _get_rules(_get_flatten_groups(document.Benchmark))
+    results = []
+    for result in document.Benchmark.TestResult.rule_result:
+        rule = rules[result['idref']]
+        results.append({
+            'rule': result['idref'],
+            'result': result.result.cdata,
+            'severity': result['severity'],
+            'weight': result['weight'],
+            'title': rule['title'],
+            'description': rule['description'],
+        })
+    return results
+
+
+def _sanitize_xccdf_xml(data):  # removes the listed html:* tags, keeps the text
+    data = data.replace(
+        '<html:code xmlns:html="http://www.w3.org/1999/xhtml">', '')
+    data = data.replace('</html:code>', '')
+    data = data.replace(
+        '<html:pre xmlns:html="http://www.w3.org/1999/xhtml">', '')
+    data = data.replace('<html:pre>', '')
+    data = data.replace('</html:pre>', '')
+    data = data.replace('<html:code>', '')
+    data = data.replace('<html:li>', '')
+    data = data.replace('</html:li>', '')
+    data = data.replace(
+        '<html:pre xmlns:html="http://www.w3.org/1999/xhtml" '
+        'xmlns:ns0="http://checklists.nist.gov/xccdf/1.1">',
+        '')
+    data = data.replace(
+        '<html:br xmlns:html="http://www.w3.org/1999/xhtml"/>', '')
+    data = data.replace(
+        '<html:code xmlns:html="http://www.w3.org/1999/xhtml" '
+        'xmlns:ns0="http://checklists.nist.gov/xccdf/1.1">',
+        '')
+    return data
+
+
+def xccdf_xml_to_json(xml_file):  # writes <xml_file without extension>.json
+    with open(xml_file) as in_file:
+        raw_xml = in_file.read()
+    doc = untangle.parse(_sanitize_xccdf_xml(raw_xml))  # parses the text itself
+    results = _parse_xccdf_doc(doc)
+    with open(os.path.splitext(xml_file)[0] + '.json', 'w') as json_file:
+        # NOTE(pas-ha) the src/com/mirantis/mk.Common.parseJSON method
+        # from mk/pipeline-library that is used in our Jenkins pipelines
+        # cannot parse the string representation of a list!
+        # Only a dict is supported as the top-level structure there.
+        json.dump({"results": results}, json_file)
+
+
+def _parse_oval_definitions(document):
+    """Map each OVAL definition id to the metadata found for it."""
+    definitions = {}
+    def_list = document.oval_results.oval_definitions.definitions.definition
+    for definition in def_list:
+        def_dict = {'class': definition['class']}
+        definitions[definition['id']] = def_dict
+        try:
+            def_dict['title'] = definition.metadata.title.cdata
+            def_dict['description'] = definition.metadata.description.cdata
+            def_dict['ref_id'] = definition.metadata.reference['ref_id']
+            def_dict['link'] = definition.metadata.reference['ref_url']
+            def_dict['severity'] = definition.metadata.advisory.severity.cdata
+        except (AttributeError, IndexError):
+            # NOTE(e0ne): inventory doesn't have definition.metadata.reference
+            pass  # untangle's Element raises IndexError for a missing child
+    return definitions
+
+
+def oval_xml_to_json(xml_file):  # writes <xml_file without extension>.json
+    document = untangle.parse(xml_file)
+    definitions = _parse_oval_definitions(document)
+    results = []
+    for defn in document.oval_results.results.system.definitions.definition:
+        res = collections.defaultdict(lambda: None)
+        res['id'] = defn['definition_id']
+        res['result'] = defn['result']
+        res.update(definitions[defn['definition_id']])  # add definition metadata
+        results.append(res)
+    with open(os.path.splitext(xml_file)[0] + '.json', 'w') as json_file:
+        # NOTE(pas-ha) the src/com/mirantis/mk.Common.parseJSON method
+        # from mk/pipeline-library that is used in our Jenkins pipelines
+        # cannot parse the string representation of a list!
+        # Only a dict is supported as the top-level structure there.
+        json.dump({"results": results}, json_file)