Revamp storage classes, add node subdirs and caching

This is a mammoth commit, sorry about that. In trying to add node subdir
support, I ended up refactoring a lot of the logic from yaml_fs into the
storage base class, and then caching was added along the way; I almost
didn't notice. ;)

Signed-off-by: martin f. krafft <madduck@madduck.net>
diff --git a/doc/source/changelog.rst b/doc/source/changelog.rst
index 3553051..1bd522a 100644
--- a/doc/source/changelog.rst
+++ b/doc/source/changelog.rst
@@ -5,6 +5,10 @@
 ========= ========== ========================================================
 Version   Date       Changes
 ========= ========== ========================================================
+                     * Caching of classes for performance reasons, especially
+                       during inventory runs
+                     * yaml_fs: nodes may be defined in subdirectories
+                       (closes: #10).
                      * Classes and nodes URI must not overlap anymore
                      * Class names must not contain spaces
 1.1       2013-08-28 Salt adapter: fix interface to include minion_id, filter
diff --git a/doc/source/operations.rst b/doc/source/operations.rst
index 049fed1..671b8b0 100644
--- a/doc/source/operations.rst
+++ b/doc/source/operations.rst
@@ -40,6 +40,12 @@
                     permit_root_login: no
 ============ ================================================================
 
+Nodes may be defined in subdirectories. However, node names (the filename
+without its extension) must be unique across all subdirectories, and |reclass|
+will exit with an error if a node is defined multiple times. Subdirectories
+therefore really only exist for the administrator's sanity (and may be used in
+the future to tag additional classes onto nodes).
+
 Data merging
 ------------
 |reclass| has two modes of operation: node information retrieval and inventory
diff --git a/doc/source/todo.rst b/doc/source/todo.rst
index f1733c5..5aa12c7 100644
--- a/doc/source/todo.rst
+++ b/doc/source/todo.rst
@@ -74,9 +74,4 @@
 a list of clients that define it as their master. That would short-circuit
 Puppet's ``storeconfigs`` and Salt's ``mine``.
 
-Caching of classes in yaml\_fs
-------------------------------
-Right now, ``yaml\_fs`` opens each class file dozens of times during an
-inventory run. A class could be cached.
-
 .. include:: substs.inc
diff --git a/reclass/errors.py b/reclass/errors.py
index 8f097c6..ddc09ad 100644
--- a/reclass/errors.py
+++ b/reclass/errors.py
@@ -140,3 +140,12 @@
         msg = "Invalid character '{0}' in class name '{1}'."
         msg = msg.format(invalid_character, classname)
         super(InvalidClassnameError, self).__init__(msg)
+
+
+class DuplicateNodeNameError(NameError):
+
+    def __init__(self, storage, name, uri1, uri2):
+        msg = "{0}: Definition of node '{1}' in '{2}' collides with " \
+              "definition in '{3}'. Nodes can only be defined once per inventory."
+        msg = msg.format(storage, name, uri2, uri1)
+        super(DuplicateNodeNameError, self).__init__(msg)
diff --git a/reclass/storage/__init__.py b/reclass/storage/__init__.py
index b13826b..8bb64e4 100644
--- a/reclass/storage/__init__.py
+++ b/reclass/storage/__init__.py
@@ -8,6 +8,7 @@
 #
 
 import time, sys
+from reclass.datatypes import Entity
 
 def _get_timestamp():
     return time.strftime('%c')
@@ -21,31 +22,87 @@
     def __init__(self, nodes_uri, classes_uri):
         self._nodes_uri = nodes_uri
         self._classes_uri = classes_uri
+        self._classes_cache = {}
 
     nodes_uri = property(lambda self: self._nodes_uri)
     classes_uri = property(lambda self: self._classes_uri)
 
-    def _read_entity(self, node, base_uri, seen={}):
-        raise NotImplementedError, "Storage class not implement node info retrieval"
+    def _get_storage_name(self):
+        raise NotImplementedError, "Storage class does not have a name"
 
-    def nodeinfo(self, node):
-        entity, uri = self._read_entity(node, self.nodes_uri, {})
-        entity.interpolate()
-        return {'__reclass__' : {'node': node, 'node_uri': uri,
-                                 'timestamp': _get_timestamp()
+    def _get_node(self, name, merge_base=None):  # abstract: return (entity, uri)
+        raise NotImplementedError, "Storage class does not implement node entity retrieval"
+
+    def _get_class(self, name):  # abstract: return (entity, uri)
+        raise NotImplementedError, "Storage class does not implement class entity retrieval"
+
+    def _recurse_entity(self, entity, merge_base=None, seen=None, nodename=None):
+        """Merge entity's classes depth-first, then entity, into merge_base."""
+        if seen is None:
+            seen = {}  # fresh per top-level call; a {} default would be shared
+        if merge_base is None:
+            merge_base = Entity(name='empty (@{0})'.format(nodename))
+        for klass in entity.classes.as_list():
+            if klass not in seen:
+                try:
+                    class_entity = self._classes_cache[klass]
+                except KeyError:
+                    class_entity, uri = self._get_class(klass)
+                    self._classes_cache[klass] = class_entity
+                descent = self._recurse_entity(class_entity, seen=seen,
+                                               nodename=nodename)
+                # on every iteration, we merge the result of the recursive
+                # descent into what we have so far…
+                merge_base.merge(descent)
+                seen[klass] = True
+        # … and finally, we merge what we have at this level into the
+        # result of the iteration, so that elements at the current level
+        # overwrite stuff defined by parents
+        merge_base.merge(entity)
+        return merge_base
+
+    def _nodeinfo(self, nodename):
+        node_entity, uri = self._get_node(nodename)
+        merge_base = Entity(name='merge base for {0}'.format(nodename))
+        ret = self._recurse_entity(node_entity, merge_base, nodename=nodename)
+        ret.interpolate()
+        return ret, uri
+
+    def _nodeinfo_as_dict(self, nodename, entity, uri):
+        ret = {'__reclass__' : {'node': nodename, 'uri': uri,
+                                'timestamp': _get_timestamp()
                                 },
-                'classes': entity.classes.as_list(),
-                'applications': entity.applications.as_list(),
-                'parameters': entity.parameters.as_dict()
-               }
+              }
+        ret.update(entity.as_dict())
+        return ret
+
+    def nodeinfo(self, nodename):
+        return self._nodeinfo_as_dict(nodename, *self._nodeinfo(nodename))
 
     def _list_inventory(self):
         raise NotImplementedError, "Storage class does not implement inventory listing"
 
     def inventory(self):
-        entities, applications, classes = self._list_inventory()
+        entities = self._list_inventory()
+
+        nodes = {}
+        applications = {}
+        classes = {}
+        for f, (nodeinfo, uri) in entities.iteritems():
+            d = nodes[f] = self._nodeinfo_as_dict(f, nodeinfo, uri)
+            for a in d['applications']:
+                if a in applications:
+                    applications[a].append(f)
+                else:
+                    applications[a] = [f]
+            for c in d['classes']:
+                if c in classes:
+                    classes[c].append(f)
+                else:
+                    classes[c] = [f]
+
         return {'__reclass__' : {'timestamp': _get_timestamp()},
-                'nodes': entities,
+                'nodes': nodes,
                 'classes': classes,
                 'applications': applications
                }
diff --git a/reclass/storage/yaml_fs/__init__.py b/reclass/storage/yaml_fs/__init__.py
index d6f3cbd..798847e 100644
--- a/reclass/storage/yaml_fs/__init__.py
+++ b/reclass/storage/yaml_fs/__init__.py
@@ -7,6 +7,7 @@
 # Released under the terms of the Artistic Licence 2.0
 #
 import os, sys
+import fnmatch
 from reclass.storage import NodeStorageBase
 from yamlfile import YamlFile
 from directory import Directory
@@ -24,65 +25,55 @@
     def __init__(self, nodes_uri, classes_uri):
         super(ExternalNodeStorage, self).__init__(nodes_uri, classes_uri)
 
-    def _handle_read_error(self, exc, name, base_uri, nodename):
-        if base_uri == self.classes_uri:
-            raise reclass.errors.ClassNotFound('yaml_fs', name, base_uri, nodename)
-        else:
-            raise reclass.errors.NodeNotFound('yaml_fs', name, base_uri)
+        def _handle_node_duplicates(name, uri1, uri2):
+            raise reclass.errors.DuplicateNodeNameError(self._get_storage_name(),
+                                                        name, uri1, uri2)
+        self._nodes = self._enumerate_inventory(nodes_uri,
+                                                duplicate_handler=_handle_node_duplicates)
+        self._classes = self._enumerate_inventory(classes_uri)
 
-    def _read_entity(self, name, base_uri, seen, nodename=None):
-        path = os.path.join(base_uri, name + FILE_EXTENSION)
+    def _get_storage_name(self):
+        return 'yaml_fs'
+
+    def _enumerate_inventory(self, basedir, duplicate_handler=None):
+        ret = {}
+        def register_fn(dirpath, filenames):
+            filenames = fnmatch.filter(filenames, '*{0}'.format(FILE_EXTENSION))
+            vvv('REGISTER {0} in path {1}'.format(filenames, dirpath))
+            for f in filenames:
+                name = os.path.splitext(f)[0]
+                uri = os.path.join(dirpath, f)
+                if name in ret and callable(duplicate_handler):
+                    duplicate_handler(name, os.path.join(basedir, ret[name]), uri)
+                ret[name] = os.path.relpath(uri, basedir)
+
+        d = Directory(basedir)
+        d.walk(register_fn)
+        return ret
+
+    def _get_node(self, name):
+        vvv('GET NODE {0}'.format(name))
         try:
-            entity = YamlFile(path).entity
-            seen[name] = True
+            path = os.path.join(self.nodes_uri, self._nodes[name])
+        except KeyError, e:
+            raise reclass.errors.NodeNotFound(self._get_storage_name(),
+                                              name, self.nodes_uri)
+        entity = YamlFile(path).entity
+        return entity, 'file://{0}'.format(path)
 
-            merge_base = Entity()
-            for klass in entity.classes.as_list():
-                if klass not in seen:
-                    ret = self._read_entity(klass, self.classes_uri, seen,
-                                              name if nodename is None else nodename)[0]
-                    # on every iteration, we merge the result of the
-                    # recursive descend into what we have so far…
-                    merge_base.merge(ret)
-
-            # … and finally, we merge what we have at this level into the
-            # result of the iteration, so that elements at the current level
-            # overwrite stuff defined by parents
-            merge_base.merge(entity)
-            return merge_base, 'file://{0}'.format(path)
-
-        except reclass.errors.NotFoundError, e:
-            self._handle_read_error(e, name, base_uri, nodename)
-
-        except IOError, e:
-            self._handle_read_error(e, name, base_uri, nodename)
+    def _get_class(self, name, nodename=None):
+        vvv('GET CLASS {0}'.format(name))
+        try:
+            path = os.path.join(self.classes_uri, self._classes[name])
+        except KeyError, e:
+            raise reclass.errors.ClassNotFound(self._get_storage_name(),
+                                               name, self.classes_uri,
+                                               nodename)
+        entity = YamlFile(path).entity
+        return entity, 'file://{0}'.format(path)
 
     def _list_inventory(self):
-        d = Directory(self.nodes_uri)
-
         entities = {}
-
-        def register_fn(dirpath, filenames):
-            vvv('REGISTER {0} in path {1}'.format(filenames, dirpath))
-            for f in filter(lambda f: f.endswith(FILE_EXTENSION), filenames):
-                name = f[:-len(FILE_EXTENSION)]
-                nodeinfo = self.nodeinfo(name)
-                entities[name] = nodeinfo
-
-        d.walk(register_fn)
-
-        applications = {}
-        classes = {}
-        for f, nodeinfo in entities.iteritems():
-            for a in nodeinfo['applications']:
-                if a in applications:
-                    applications[a].append(f)
-                else:
-                    applications[a] = [f]
-            for c in nodeinfo['classes']:
-                if c in classes:
-                    classes[c].append(f)
-                else:
-                    classes[c] = [f]
-
-        return entities, applications, classes
+        for n in self._nodes.iterkeys():
+            entities[n] = self._nodeinfo(n)
+        return entities