Patchwork [v2] package: Add cachedpath optimisation

login
register
mail settings
Submitter Richard Purdie
Date March 18, 2013, 4:47 p.m.
Message ID <1363625236.16482.59.camel@ted>
Download mbox | patch
Permalink /patch/46403/
State New
Headers show

Comments

Richard Purdie - March 18, 2013, 4:47 p.m.
Currently, various standard library operations like os.walk(),
os.path.isdir() and os.path.islink() each call stat or lstat which
involves a syscall into the kernel. There is no caching since they could
conceivably have changed on disk. The result is that for something like
the do_package task of the kernel we're spending over two minutes making
868,000 individual stat calls for 23,000 files. This is suboptimal.

This patch adds lib/oe/cachedpath.py which are a set of replacement
functions for these operations which use cached stat data rather than
hitting the kernel each time. It gives a nice performance improvement
halving the build time of the kernel do_package.

Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>

---
Chris Larson - March 18, 2013, 5 p.m.
On Mon, Mar 18, 2013 at 9:47 AM, Richard Purdie <
richard.purdie@linuxfoundation.org> wrote:

> Currently, various standard library operations like os.walk(),
> os.path.isdir() and os.path.islink() each call stat or lstat which
> involves a syscall into the kernel. There is no caching since they could
> conceivably have changed on disk. The result is that for something like
> the do_package task of the kernel we're spending over two minutes making
> 868,000 individual stat calls for 23,000 files. This is suboptimal.
>
> This patch adds lib/oe/cachedpath.py which are a set of replacement
> functions for these operations which use cached stat data rather than
> hitting the kernel each time. It gives a nice performance improvement
> halving the build time of the kernel do_package.
>
> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
>

Looks good, nice job.

Signed-off-by: Christopher Larson <chris_larson@mentor.com>

Patch

diff --git a/meta/classes/base.bbclass b/meta/classes/base.bbclass
index 4ec1eda..5fe9a84 100644
--- a/meta/classes/base.bbclass
+++ b/meta/classes/base.bbclass
@@ -10,7 +10,7 @@  inherit utility-tasks
 inherit metadata_scm
 inherit logging
 
-OE_IMPORTS += "os sys time oe.path oe.utils oe.data oe.package oe.packagegroup oe.sstatesig oe.lsb"
+OE_IMPORTS += "os sys time oe.path oe.utils oe.data oe.package oe.packagegroup oe.sstatesig oe.lsb oe.cachedpath"
 OE_IMPORTS[type] = "list"
 
 def oe_import(d):
diff --git a/meta/classes/package.bbclass b/meta/classes/package.bbclass
index f3a6bc7..fde4e9d 100644
--- a/meta/classes/package.bbclass
+++ b/meta/classes/package.bbclass
@@ -288,7 +288,7 @@  def copydebugsources(debugsrcdir, d):
         basepath = dvar
         for p in debugsrcdir.split("/"):
             basepath = basepath + "/" + p
-            if not os.path.exists(basepath):
+            if not cpath.exists(basepath):
                 nosuchdir.append(basepath)
         bb.utils.mkdirhier(basepath)
 
@@ -388,7 +388,7 @@  python package_do_split_locales() {
 
     localedir = os.path.join(dvar + datadir, 'locale')
 
-    if not os.path.isdir(localedir):
+    if not cpath.isdir(localedir):
         bb.debug(1, "No locale files in this package")
         return
 
@@ -628,7 +628,7 @@  python fixup_perms () {
             continue
 
         origin = dvar + dir
-        if not (os.path.exists(origin) and os.path.isdir(origin) and not os.path.islink(origin)):
+        if not (cpath.exists(origin) and cpath.isdir(origin) and not cpath.islink(origin)):
             continue
 
         link = fs_perms_table[dir].link
@@ -654,7 +654,7 @@  python fixup_perms () {
             continue
 
         origin = dvar + dir
-        if not (os.path.exists(origin) and os.path.isdir(origin)):
+        if not (cpath.exists(origin) and cpath.isdir(origin)):
             continue
 
         fix_perms(origin, fs_perms_table[dir].mode, fs_perms_table[dir].uid, fs_perms_table[dir].gid, dir)
@@ -735,7 +735,7 @@  python split_and_strip_files () {
     baselibdir = os.path.abspath(dvar + os.sep + d.getVar("base_libdir", True))
     if (d.getVar('INHIBIT_PACKAGE_DEBUG_SPLIT', True) != '1') and \
             (d.getVar('INHIBIT_PACKAGE_STRIP', True) != '1'):
-        for root, dirs, files in os.walk(dvar):
+        for root, dirs, files in cpath.walk(dvar):
             for f in files:
                 file = os.path.join(root, f)
                 if file.endswith(".ko") and file.find("/lib/modules/") != -1:
@@ -749,18 +749,20 @@  python split_and_strip_files () {
                     continue
 
                 try:
-                    ltarget = oe.path.realpath(file, dvar, False)
-                    s = os.lstat(ltarget)
+                    ltarget = cpath.realpath(file, dvar, False)
+                    s = cpath.lstat(ltarget)
                 except OSError, (err, strerror):
                     if err != errno.ENOENT:
                         raise
                     # Skip broken symlinks
                     continue
+                if not s:
+                    continue
                 # Check its an excutable
                 if (s[stat.ST_MODE] & stat.S_IXUSR) or (s[stat.ST_MODE] & stat.S_IXGRP) or (s[stat.ST_MODE] & stat.S_IXOTH) \
                         or ((file.startswith(libdir) or file.startswith(baselibdir)) and ".so" in f):
                     # If it's a symlink, and points to an ELF file, we capture the readlink target
-                    if os.path.islink(file):
+                    if cpath.islink(file):
                         target = os.readlink(file)
                         if isELF(ltarget):
                             #bb.note("Sym: %s (%d)" % (ltarget, isELF(ltarget)))
@@ -918,8 +920,8 @@  python populate_packages () {
         for file in files:
             if os.path.isabs(file):
                 file = '.' + file
-            if not os.path.islink(file):
-                if os.path.isdir(file):
+            if not cpath.islink(file):
+                if cpath.isdir(file):
                     newfiles =  [ os.path.join(file,x) for x in os.listdir(file) ]
                     if newfiles:
                         files += newfiles
@@ -929,7 +931,7 @@  python populate_packages () {
                 if [ file ] != globbed:
                     files += globbed
                     continue
-            if (not os.path.islink(file)) and (not os.path.exists(file)):
+            if (not cpath.islink(file)) and (not cpath.exists(file)):
                 continue
             if file in seen:
                 continue
@@ -938,33 +940,35 @@  python populate_packages () {
             def mkdir(src, dest, p):
                 src = os.path.join(src, p)
                 dest = os.path.join(dest, p)
-                bb.utils.mkdirhier(dest)
-                fstat = os.stat(src)
-                os.chmod(dest, fstat.st_mode)
+                fstat = cpath.stat(src)
+                if not fstat:
+                    bb.error("No file for %s" % src)
+                os.mkdir(dest, fstat.st_mode)
                 os.chown(dest, fstat.st_uid, fstat.st_gid)
                 if p not in seen:
                     seen.append(p)
+                cpath.updatecache(dest)
 
             def mkdir_recurse(src, dest, paths):
-                if os.path.exists(dest + '/' + paths):
+                if cpath.exists(dest + '/' + paths):
                     return
                 while paths.startswith("./"):
                     paths = paths[2:]
                 p = "."
                 for c in paths.split("/"):
                     p = os.path.join(p, c)
-                    if not os.path.exists(os.path.join(dest, p)):
+                    if not cpath.exists(os.path.join(dest, p)):
                         mkdir(src, dest, p)
 
-            if os.path.isdir(file) and not os.path.islink(file):
+            if cpath.isdir(file) and not cpath.islink(file):
                 mkdir_recurse(dvar, root, file)
                 continue
 
             mkdir_recurse(dvar, root, os.path.dirname(file))
             fpath = os.path.join(root,file)
-            if not os.path.islink(file):
+            if not cpath.islink(file):
                 os.link(file, fpath)
-                fstat = os.stat(file)
+                fstat = cpath.stat(file)
                 os.chmod(fpath, fstat.st_mode)
                 os.chown(fpath, fstat.st_uid, fstat.st_gid)
                 continue
@@ -975,7 +979,7 @@  python populate_packages () {
     os.chdir(workdir)
 
     unshipped = []
-    for root, dirs, files in os.walk(dvar):
+    for root, dirs, files in cpath.walk(dvar):
         dir = root[len(dvar):]
         if not dir:
             dir = os.sep
@@ -1009,8 +1013,8 @@  python package_fixsymlinks () {
         for path in pkgfiles[pkg]:
                 rpath = path[len(inst_root):]
                 pkg_files[pkg].append(rpath)
-                rtarget = oe.path.realpath(path, inst_root, True, assume_dir = True)
-                if not os.path.lexists(rtarget):
+                rtarget = cpath.realpath(path, inst_root, True, assume_dir = True)
+                if not cpath.lexists(rtarget):
                     dangling_links[pkg].append(os.path.normpath(rtarget[len(inst_root):]))
 
     newrdepends = {}
@@ -1394,7 +1398,7 @@  python package_do_shlibs() {
         renames = list()
         for file in pkgfiles[pkg]:
                 soname = None
-                if os.path.islink(file):
+                if cpath.islink(file):
                     continue
                 if targetos == "darwin" or targetos == "darwin8":
                     darwin_so(file)
@@ -1781,6 +1785,10 @@  python do_package () {
     # as any change to rpmdeps requires this to be rerun.
     # PACKAGE_BBCLASS_VERSION = "1"
 
+    # Init cachedpath
+    global cpath
+    cpath = oe.cachedpath.CachedPath()
+
     ###########################################################################
     # Sanity test the setup
     ###########################################################################
@@ -1827,6 +1835,8 @@  python do_package () {
     # Split up PKGD into PKGDEST
     ###########################################################################
 
+    cpath = oe.cachedpath.CachedPath()
+
     for f in (d.getVar('PACKAGESPLITFUNCS', True) or '').split():
         bb.build.exec_func(f, d)
 
@@ -1841,7 +1851,7 @@  python do_package () {
     pkgdest = d.getVar('PKGDEST', True)
     for pkg in packages:
         pkgfiles[pkg] = []
-        for walkroot, dirs, files in os.walk(pkgdest + "/" + pkg):
+        for walkroot, dirs, files in cpath.walk(pkgdest + "/" + pkg):
             for file in files:
                 pkgfiles[pkg].append(walkroot + os.sep + file)
 
diff --git a/meta/lib/oe/cachedpath.py b/meta/lib/oe/cachedpath.py
new file mode 100644
index 0000000..dd7d335
--- /dev/null
+++ b/meta/lib/oe/cachedpath.py
@@ -0,0 +1,236 @@ 
+#
+# Based on standard python library functions but avoid
+# repeated stat calls. Its assumed the files will not change from under us
+# so we can cache stat calls.
+#
+
+import os
+import errno
+import stat as statmod
+
+class CachedPath(object):
+    statcache = {}
+    lstatcache = {}
+    normpathcache = {}
+
+    def __init__(self):
+        return
+
+    def updatecache(self, x):
+        x = self.normpath(x)
+        if x in self.statcache:
+            del self.statcache[x]
+        if x in self.lstatcache:
+            del self.lstatcache[x]
+
+    def normpath(self, path):
+        if path in self.normpathcache:
+            return self.normpathcache[path]
+        newpath = os.path.normpath(path)
+        self.normpathcache[path] = newpath
+        return newpath
+
+    def _callstat(self, path):
+        if path in self.statcache:
+            return self.statcache[path]
+        try:
+            st = os.stat(path)
+            self.statcache[path] = st
+            return st
+        except os.error:
+            self.statcache[path] = False
+            return False
+
+    # We might as well call lstat and then only 
+    # call stat as well in the symbolic link case
+    # since this turns out to be much more optimal
+    # in real world usage of this cache
+    def callstat(self, path):
+        path = self.normpath(path)
+        self.calllstat(path)
+        return self.statcache[path]
+
+    def calllstat(self, path):
+        path = self.normpath(path)
+        if path in self.lstatcache:
+            return self.lstatcache[path]
+        #bb.error("LStatpath:" + path)
+        try:
+            lst = os.lstat(path)
+            self.lstatcache[path] = lst
+            if not statmod.S_ISLNK(lst.st_mode):
+                self.statcache[path] = lst
+            else:
+                self._callstat(path)
+            return lst
+        except (os.error, AttributeError):
+            self.lstatcache[path] = False
+            self.statcache[path] = False
+            return False
+
+    # This follows symbolic links, so both islink() and isdir() can be true
+    # for the same path ono systems that support symlinks
+    def isfile(self, path):
+        """Test whether a path is a regular file"""
+        st = self.callstat(path)
+        if not st:
+            return False
+        return statmod.S_ISREG(st.st_mode)
+
+    # Is a path a directory?
+    # This follows symbolic links, so both islink() and isdir()
+    # can be true for the same path on systems that support symlinks
+    def isdir(self, s):
+        """Return true if the pathname refers to an existing directory."""
+        st = self.callstat(s)
+        if not st:
+            return False
+        return statmod.S_ISDIR(st.st_mode)
+
+    def islink(self, path):
+        """Test whether a path is a symbolic link"""
+        st = self.calllstat(path)
+        if not st:
+            return False
+        return statmod.S_ISLNK(st.st_mode)
+
+    # Does a path exist?
+    # This is false for dangling symbolic links on systems that support them.
+    def exists(self, path):
+        """Test whether a path exists.  Returns False for broken symbolic links"""
+        if self.callstat(path):
+            return True
+        return False
+
+    def lexists(self, path):
+        """Test whether a path exists.  Returns True for broken symbolic links"""
+        if self.calllstat(path):
+            return True
+        return False
+
+    def stat(self, path):
+        return self.callstat(path)
+
+    def lstat(self, path):
+        return self.calllstat(path)
+
+    def walk(self, top, topdown=True, onerror=None, followlinks=False):
+        # Matches os.walk, not os.path.walk()
+
+        # We may not have read permission for top, in which case we can't
+        # get a list of the files the directory contains.  os.path.walk
+        # always suppressed the exception then, rather than blow up for a
+        # minor reason when (say) a thousand readable directories are still
+        # left to visit.  That logic is copied here.
+        try:
+            # Note that listdir and error are globals in this module due
+            # to earlier import-*.
+            names = os.listdir(top)
+        except error, err:
+            if onerror is not None:
+                onerror(err)
+            return
+
+        dirs, nondirs = [], []
+        for name in names:
+            if self.isdir(os.path.join(top, name)):
+                dirs.append(name)
+            else:
+                nondirs.append(name)
+
+        if topdown:
+            yield top, dirs, nondirs
+        for name in dirs:
+            new_path = os.path.join(top, name)
+            if followlinks or not self.islink(new_path):
+                for x in self.walk(new_path, topdown, onerror, followlinks):
+                    yield x
+        if not topdown:
+            yield top, dirs, nondirs
+
+    ## realpath() related functions
+    def __is_path_below(self, file, root):
+        return (file + os.path.sep).startswith(root)
+
+    def __realpath_rel(self, start, rel_path, root, loop_cnt, assume_dir):
+        """Calculates real path of symlink 'start' + 'rel_path' below
+        'root'; no part of 'start' below 'root' must contain symlinks. """
+        have_dir = True
+
+        for d in rel_path.split(os.path.sep):
+            if not have_dir and not assume_dir:
+                raise OSError(errno.ENOENT, "no such directory %s" % start)
+
+            if d == os.path.pardir: # '..'
+                if len(start) >= len(root):
+                    # do not follow '..' before root
+                    start = os.path.dirname(start)
+                else:
+                    # emit warning?
+                    pass
+            else:
+                (start, have_dir) = self.__realpath(os.path.join(start, d),
+                                                    root, loop_cnt, assume_dir)
+
+            assert(self.__is_path_below(start, root))
+
+        return start
+
+    def __realpath(self, file, root, loop_cnt, assume_dir):
+        while self.islink(file) and len(file) >= len(root):
+            if loop_cnt == 0:
+                raise OSError(errno.ELOOP, file)
+
+            loop_cnt -= 1
+            target = os.path.normpath(os.readlink(file))
+    
+            if not os.path.isabs(target):
+                tdir = os.path.dirname(file)
+                assert(self.__is_path_below(tdir, root))
+            else:
+                tdir = root
+
+            file = self.__realpath_rel(tdir, target, root, loop_cnt, assume_dir)
+
+        try:
+            is_dir = self.isdir(file)
+        except:
+            is_dir = False
+
+        return (file, is_dir)
+
+    def realpath(self, file, root, use_physdir = True, loop_cnt = 100, assume_dir = False):
+        """ Returns the canonical path of 'file' with assuming a
+        toplevel 'root' directory. When 'use_physdir' is set, all
+        preceding path components of 'file' will be resolved first;
+        this flag should be set unless it is guaranteed that there is
+        no symlink in the path. When 'assume_dir' is not set, missing
+        path components will raise an ENOENT error"""
+
+        root = os.path.normpath(root)
+        file = os.path.normpath(file)
+
+        if not root.endswith(os.path.sep):
+            # letting root end with '/' makes some things easier
+            root = root + os.path.sep
+
+        if not self.__is_path_below(file, root):
+            raise OSError(errno.EINVAL, "file '%s' is not below root" % file)
+
+        try:
+            if use_physdir:
+                file = self.__realpath_rel(root, file[(len(root) - 1):], root, loop_cnt, assume_dir)
+            else:
+                file = self.__realpath(file, root, loop_cnt, assume_dir)[0]
+        except OSError, e:
+            if e.errno == errno.ELOOP:
+                # make ELOOP more readable; without catching it, there will
+                # be printed a backtrace with 100s of OSError exceptions
+                # else
+                raise OSError(errno.ELOOP,
+                              "too much recursions while resolving '%s'; loop in '%s'" %
+                              (file, e.strerror))
+
+            raise
+
+        return file