aboutsummaryrefslogtreecommitdiffstats
path: root/extras/fast_partial.py
diff options
context:
space:
mode:
authorDavid Robillard <d@drobilla.net>2018-09-15 14:51:02 +0200
committerDavid Robillard <d@drobilla.net>2018-09-15 14:51:02 +0200
commit8c96b17a5393bffee0c521c4872a9fa999048032 (patch)
treeff0760d5f3f0e994413c8c1dff7a4fed37914bee /extras/fast_partial.py
Squashed 'waflib/' content from commit 982416b
git-subtree-dir: waflib git-subtree-split: 982416b8a6c6728e200243e1be3ab60435c08830
Diffstat (limited to 'extras/fast_partial.py')
-rw-r--r--extras/fast_partial.py518
1 files changed, 518 insertions, 0 deletions
diff --git a/extras/fast_partial.py b/extras/fast_partial.py
new file mode 100644
index 0000000..b3af513
--- /dev/null
+++ b/extras/fast_partial.py
@@ -0,0 +1,518 @@
+#! /usr/bin/env python
+# encoding: utf-8
+# Thomas Nagy, 2017-2018 (ita)
+
+"""
+A system for fast partial rebuilds
+
+Creating a large amount of task objects up front can take some time.
+By making a few assumptions, it is possible to avoid posting creating
+task objects for targets that are already up-to-date.
+
+On a silly benchmark the gain observed for 1M tasks can be 5m->10s
+for a single file change.
+
+Usage::
+
+ def options(opt):
+ opt.load('fast_partial')
+
+Assuptions:
+* Mostly for C/C++/Fortran targets with link tasks (object-only targets are not handled)
+* For full project builds: no --targets and no pruning from subfolders
+* The installation phase is ignored
+* `use=` dependencies are specified up front even across build groups
+* Task generator source files are not obtained from globs
+
+Implementation details:
+* The first layer obtains file timestamps to recalculate file hashes only
+ when necessary (similar to md5_tstamp); the timestamps are then stored
+ in a dedicated pickle file
+* A second layer associates each task generator to a file set to help
+ detecting changes. Task generators are to create their tasks only when
+ the related files have been modified. A specific db file is created
+ to store such data (5m -> 1m10)
+* A third layer binds build context proxies onto task generators, replacing
+ the default context. While loading data for the full build uses more memory
+ (4GB -> 9GB), partial builds are then much faster (1m10 -> 13s)
+* A fourth layer enables a 2-level cache on file signatures to
+ reduce the size of the main pickle file (13s -> 10s)
+"""
+
+import os
+from waflib import Build, Context, Errors, Logs, Task, TaskGen, Utils
+from waflib.TaskGen import feature, after_method, taskgen_method
+import waflib.Node
+
+DONE = 0
+DIRTY = 1
+NEEDED = 2
+
+SKIPPABLE = ['cshlib', 'cxxshlib', 'cstlib', 'cxxstlib', 'cprogram', 'cxxprogram']
+
+TSTAMP_DB = '.wafpickle_tstamp_db_file'
+
+SAVED_ATTRS = 'root node_sigs task_sigs imp_sigs raw_deps node_deps'.split()
+
+class bld_proxy(object):
+ def __init__(self, bld):
+ object.__setattr__(self, 'bld', bld)
+
+ object.__setattr__(self, 'node_class', type('Nod3', (waflib.Node.Node,), {}))
+ self.node_class.__module__ = 'waflib.Node'
+ self.node_class.ctx = self
+
+ object.__setattr__(self, 'root', self.node_class('', None))
+ for x in SAVED_ATTRS:
+ if x != 'root':
+ object.__setattr__(self, x, {})
+
+ self.fix_nodes()
+
+ def __setattr__(self, name, value):
+ bld = object.__getattribute__(self, 'bld')
+ setattr(bld, name, value)
+
+ def __delattr__(self, name):
+ bld = object.__getattribute__(self, 'bld')
+ delattr(bld, name)
+
+ def __getattribute__(self, name):
+ try:
+ return object.__getattribute__(self, name)
+ except AttributeError:
+ bld = object.__getattribute__(self, 'bld')
+ return getattr(bld, name)
+
+ def __call__(self, *k, **kw):
+ return self.bld(*k, **kw)
+
+ def fix_nodes(self):
+ for x in ('srcnode', 'path', 'bldnode'):
+ node = self.root.find_dir(getattr(self.bld, x).abspath())
+ object.__setattr__(self, x, node)
+
+ def set_key(self, store_key):
+ object.__setattr__(self, 'store_key', store_key)
+
+ def fix_tg_path(self, *tgs):
+ # changing Node objects on task generators is possible
+ # yet, all Node objects must belong to the same parent
+ for tg in tgs:
+ tg.path = self.root.make_node(tg.path.abspath())
+
+ def restore(self):
+ dbfn = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)
+ Logs.debug('rev_use: reading %s', dbfn)
+ try:
+ data = Utils.readf(dbfn, 'rb')
+ except (EnvironmentError, EOFError):
+ # handle missing file/empty file
+ Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
+ else:
+ try:
+ waflib.Node.pickle_lock.acquire()
+ waflib.Node.Nod3 = self.node_class
+ try:
+ data = Build.cPickle.loads(data)
+ except Exception as e:
+ Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
+ else:
+ for x in SAVED_ATTRS:
+ object.__setattr__(self, x, data.get(x, {}))
+ finally:
+ waflib.Node.pickle_lock.release()
+ self.fix_nodes()
+
+ def store(self):
+ data = {}
+ for x in Build.SAVED_ATTRS:
+ data[x] = getattr(self, x)
+ db = os.path.join(self.variant_dir, Context.DBFILE + self.store_key)
+
+ try:
+ waflib.Node.pickle_lock.acquire()
+ waflib.Node.Nod3 = self.node_class
+ x = Build.cPickle.dumps(data, Build.PROTOCOL)
+ finally:
+ waflib.Node.pickle_lock.release()
+
+ Logs.debug('rev_use: storing %s', db)
+ Utils.writef(db + '.tmp', x, m='wb')
+ try:
+ st = os.stat(db)
+ os.remove(db)
+ if not Utils.is_win32:
+ os.chown(db + '.tmp', st.st_uid, st.st_gid)
+ except (AttributeError, OSError):
+ pass
+ os.rename(db + '.tmp', db)
+
+class bld(Build.BuildContext):
+ def __init__(self, **kw):
+ super(bld, self).__init__(**kw)
+ self.hashes_md5_tstamp = {}
+
+ def __call__(self, *k, **kw):
+ # this is one way of doing it, one could use a task generator method too
+ bld = kw['bld'] = bld_proxy(self)
+ ret = TaskGen.task_gen(*k, **kw)
+ self.task_gen_cache_names = {}
+ self.add_to_group(ret, group=kw.get('group'))
+ ret.bld = bld
+ bld.set_key(ret.path.abspath().replace(os.sep, '') + str(ret.idx))
+ return ret
+
+ def is_dirty(self):
+ return True
+
+ def store_tstamps(self):
+ # Called after a build is finished
+ # For each task generator, record all files involved in task objects
+ # optimization: done only if there was something built
+ do_store = False
+ try:
+ f_deps = self.f_deps
+ except AttributeError:
+ f_deps = self.f_deps = {}
+ self.f_tstamps = {}
+
+ allfiles = set()
+ for g in self.groups:
+ for tg in g:
+ try:
+ staleness = tg.staleness
+ except AttributeError:
+ staleness = DIRTY
+
+ if staleness != DIRTY:
+ # DONE case: there was nothing built
+ # NEEDED case: the tg was brought in because of 'use' propagation
+ # but nothing really changed for them, there may be incomplete
+ # tasks (object files) and in this case it is best to let the next build
+ # figure out if an input/output file changed
+ continue
+
+ do_cache = False
+ for tsk in tg.tasks:
+ if tsk.hasrun == Task.SUCCESS:
+ do_cache = True
+ pass
+ elif tsk.hasrun == Task.SKIPPED:
+ pass
+ else:
+ # one failed task, clear the cache for this tg
+ try:
+ del f_deps[(tg.path.abspath(), tg.idx)]
+ except KeyError:
+ pass
+ else:
+ # just store the new state because there is a change
+ do_store = True
+
+ # skip the rest because there is no valid cache possible
+ break
+ else:
+ if not do_cache:
+ # all skipped, but is there anything in cache?
+ try:
+ f_deps[(tg.path.abspath(), tg.idx)]
+ except KeyError:
+ # probably cleared because a wscript file changed
+ # store it
+ do_cache = True
+
+ if do_cache:
+
+ # there was a rebuild, store the data structure too
+ tg.bld.store()
+
+ # all tasks skipped but no cache
+ # or a successful task build
+ do_store = True
+ st = set()
+ for tsk in tg.tasks:
+ st.update(tsk.inputs)
+ st.update(self.node_deps.get(tsk.uid(), []))
+
+ # TODO do last/when loading the tgs?
+ lst = []
+ for k in ('wscript', 'wscript_build'):
+ n = tg.path.find_node(k)
+ if n:
+ n.get_bld_sig()
+ lst.append(n.abspath())
+
+ lst.extend(sorted(x.abspath() for x in st))
+ allfiles.update(lst)
+ f_deps[(tg.path.abspath(), tg.idx)] = lst
+
+ for x in allfiles:
+ # f_tstamps has everything, while md5_tstamp can be relatively empty on partial builds
+ self.f_tstamps[x] = self.hashes_md5_tstamp[x][0]
+
+ if do_store:
+ dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
+ Logs.debug('rev_use: storing %s', dbfn)
+ dbfn_tmp = dbfn + '.tmp'
+ x = Build.cPickle.dumps([self.f_tstamps, f_deps], Build.PROTOCOL)
+ Utils.writef(dbfn_tmp, x, m='wb')
+ os.rename(dbfn_tmp, dbfn)
+ Logs.debug('rev_use: stored %s', dbfn)
+
+ def store(self):
+ self.store_tstamps()
+ if self.producer.dirty:
+ Build.BuildContext.store(self)
+
+ def compute_needed_tgs(self):
+ # assume the 'use' keys are not modified during the build phase
+
+ dbfn = os.path.join(self.variant_dir, TSTAMP_DB)
+ Logs.debug('rev_use: Loading %s', dbfn)
+ try:
+ data = Utils.readf(dbfn, 'rb')
+ except (EnvironmentError, EOFError):
+ Logs.debug('rev_use: Could not load the build cache %s (missing)', dbfn)
+ self.f_deps = {}
+ self.f_tstamps = {}
+ else:
+ try:
+ self.f_tstamps, self.f_deps = Build.cPickle.loads(data)
+ except Exception as e:
+ Logs.debug('rev_use: Could not pickle the build cache %s: %r', dbfn, e)
+ self.f_deps = {}
+ self.f_tstamps = {}
+ else:
+ Logs.debug('rev_use: Loaded %s', dbfn)
+
+
+ # 1. obtain task generators that contain rebuilds
+ # 2. obtain the 'use' graph and its dual
+ stales = set()
+ reverse_use_map = Utils.defaultdict(list)
+ use_map = Utils.defaultdict(list)
+
+ for g in self.groups:
+ for tg in g:
+ if tg.is_stale():
+ stales.add(tg)
+
+ try:
+ lst = tg.use = Utils.to_list(tg.use)
+ except AttributeError:
+ pass
+ else:
+ for x in lst:
+ try:
+ xtg = self.get_tgen_by_name(x)
+ except Errors.WafError:
+ pass
+ else:
+ use_map[tg].append(xtg)
+ reverse_use_map[xtg].append(tg)
+
+ Logs.debug('rev_use: found %r stale tgs', len(stales))
+
+ # 3. dfs to post downstream tg as stale
+ visited = set()
+ def mark_down(tg):
+ if tg in visited:
+ return
+ visited.add(tg)
+ Logs.debug('rev_use: marking down %r as stale', tg.name)
+ tg.staleness = DIRTY
+ for x in reverse_use_map[tg]:
+ mark_down(x)
+ for tg in stales:
+ mark_down(tg)
+
+ # 4. dfs to find ancestors tg to mark as needed
+ self.needed_tgs = needed_tgs = set()
+ def mark_needed(tg):
+ if tg in needed_tgs:
+ return
+ needed_tgs.add(tg)
+ if tg.staleness == DONE:
+ Logs.debug('rev_use: marking up %r as needed', tg.name)
+ tg.staleness = NEEDED
+ for x in use_map[tg]:
+ mark_needed(x)
+ for xx in visited:
+ mark_needed(xx)
+
+ # so we have the whole tg trees to post in the set "needed"
+ # load their build trees
+ for tg in needed_tgs:
+ tg.bld.restore()
+ tg.bld.fix_tg_path(tg)
+
+ # the stale ones should be fully build, while the needed ones
+ # may skip a few tasks, see create_compiled_task and apply_link_after below
+ Logs.debug('rev_use: amount of needed task gens: %r', len(needed_tgs))
+
+ def post_group(self):
+ # assumption: we can ignore the folder/subfolders cuts
+ def tgpost(tg):
+ try:
+ f = tg.post
+ except AttributeError:
+ pass
+ else:
+ f()
+
+ if not self.targets or self.targets == '*':
+ for tg in self.groups[self.current_group]:
+ # this can cut quite a lot of tg objects
+ if tg in self.needed_tgs:
+ tgpost(tg)
+ else:
+ # default implementation
+ return Build.BuildContext.post_group()
+
+ def get_build_iterator(self):
+ if not self.targets or self.targets == '*':
+ self.compute_needed_tgs()
+ return Build.BuildContext.get_build_iterator(self)
+
+@taskgen_method
+def is_stale(self):
+ # assume no globs
+ self.staleness = DIRTY
+
+ # 1. the case of always stale targets
+ if getattr(self, 'always_stale', False):
+ return True
+
+ # 2. check if the db file exists
+ db = os.path.join(self.bld.variant_dir, Context.DBFILE)
+ try:
+ dbstat = os.stat(db).st_mtime
+ except OSError:
+ Logs.debug('rev_use: must post %r because this is a clean build')
+ return True
+
+ # 3. check if the configuration changed
+ if os.stat(self.bld.bldnode.find_node('c4che/build.config.py').abspath()).st_mtime > dbstat:
+ Logs.debug('rev_use: must post %r because the configuration has changed', self.name)
+ return True
+
+ # 3.a any tstamp data?
+ try:
+ f_deps = self.bld.f_deps
+ except AttributeError:
+ Logs.debug('rev_use: must post %r because there is no f_deps', self.name)
+ return True
+
+ # 4. check if this is the first build (no cache)
+ try:
+ lst = f_deps[(self.path.abspath(), self.idx)]
+ except KeyError:
+ Logs.debug('rev_use: must post %r because there it has no cached data', self.name)
+ return True
+
+ try:
+ cache = self.bld.cache_tstamp_rev_use
+ except AttributeError:
+ cache = self.bld.cache_tstamp_rev_use = {}
+
+ # 5. check the timestamp of each dependency files listed is unchanged
+ f_tstamps = self.bld.f_tstamps
+ for x in lst:
+ try:
+ old_ts = f_tstamps[x]
+ except KeyError:
+ Logs.debug('rev_use: must post %r because %r is not in cache', self.name, x)
+ return True
+
+ try:
+ try:
+ ts = cache[x]
+ except KeyError:
+ ts = cache[x] = os.stat(x).st_mtime
+ except OSError:
+ del f_deps[(self.path.abspath(), self.idx)]
+ Logs.debug('rev_use: must post %r because %r does not exist anymore', self.name, x)
+ return True
+ else:
+ if ts != old_ts:
+ Logs.debug('rev_use: must post %r because the timestamp on %r changed %r %r', self.name, x, old_ts, ts)
+ return True
+
+ self.staleness = DONE
+ return False
+
+@taskgen_method
+def create_compiled_task(self, name, node):
+ # skip the creation of object files
+ # assumption: object-only targets are not skippable
+ if self.staleness == NEEDED:
+ # only libraries/programs can skip object files
+ for x in SKIPPABLE:
+ if x in self.features:
+ return None
+
+ out = '%s.%d.o' % (node.name, self.idx)
+ task = self.create_task(name, node, node.parent.find_or_declare(out))
+ try:
+ self.compiled_tasks.append(task)
+ except AttributeError:
+ self.compiled_tasks = [task]
+ return task
+
+@feature(*SKIPPABLE)
+@after_method('apply_link')
+def apply_link_after(self):
+ # cprogram/cxxprogram might be unnecessary
+ if self.staleness != NEEDED:
+ return
+ for tsk in self.tasks:
+ tsk.hasrun = Task.SKIPPED
+
+def path_from(self, node):
+ # handle nodes of distinct types
+ if node.ctx is not self.ctx:
+ node = self.ctx.root.make_node(node.abspath())
+ return self.default_path_from(node)
+waflib.Node.Node.default_path_from = waflib.Node.Node.path_from
+waflib.Node.Node.path_from = path_from
+
+def h_file(self):
+ # similar to md5_tstamp.py, but with 2-layer cache
+ # global_cache for the build context common for all task generators
+ # local_cache for the build context proxy (one by task generator)
+ #
+ # the global cache is not persistent
+ # the local cache is persistent and meant for partial builds
+ #
+ # assume all calls are made from a single thread
+ #
+ filename = self.abspath()
+ st = os.stat(filename)
+
+ global_cache = self.ctx.bld.hashes_md5_tstamp
+ local_cache = self.ctx.hashes_md5_tstamp
+
+ if filename in global_cache:
+ # value already calculated in this build
+ cval = global_cache[filename]
+
+ # the value in global cache is assumed to be calculated once
+ # reverifying it could cause task generators
+ # to get distinct tstamp values, thus missing rebuilds
+ local_cache[filename] = cval
+ return cval[1]
+
+ if filename in local_cache:
+ cval = local_cache[filename]
+ if cval[0] == st.st_mtime:
+ # correct value from a previous build
+ # put it in the global cache
+ global_cache[filename] = cval
+ return cval[1]
+
+ ret = Utils.h_file(filename)
+ local_cache[filename] = global_cache[filename] = (st.st_mtime, ret)
+ return ret
+waflib.Node.Node.h_file = h_file
+