[v4,1/3] fetch2: Add support for upstream source tracing

Message ID 20230824155655.448755-1-alberto@pianon.eu
State New
Series [v4,1/3] fetch2: Add support for upstream source tracing

Commit Message

Alberto Pianon Aug. 24, 2023, 3:56 p.m. UTC
From: Alberto Pianon <alberto@pianon.eu>

License compliance and SBoM generation require the ability to trace each
source file back to its corresponding upstream source. The current
implementation of bb.fetch2 makes this difficult, especially when
multiple upstream sources are combined.

This patch provides an interface to solve the issue by implementing a
process that unpacks each SRC_URI element into a temporary directory,
provides an entry point to collect relevant provenance metadata on each
source file, moves everything to the recipe rootdir, and saves the
collected metadata in a JSON file.
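
From a recipe's perspective nothing changes, since the tracing happens
transparently inside the existing fetcher API. A rough sketch of the
resulting flow (rootdir here is just a placeholder):

    import bb.fetch2

    fetcher = bb.fetch2.Fetch(d.getVar("SRC_URI").split(), d)
    fetcher.download()
    # unpack() now stages each SRC_URI element in a temporary directory
    # inside rootdir, traces it, moves it into rootdir, and finally
    # writes rootdir/temp/<PN>-<PV>.unpack.trace.json.zst
    fetcher.unpack(rootdir)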

This patch contains the required modifications to the fetchers' code,
plus a TraceUnpackBase class that implements the process described
above. Data collection logic should be implemented separately by
subclassing TraceUnpackBase and overriding its _collect_data() and
_process_data() methods.
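
For illustration, a minimal (hypothetical) subclass might look like the
following; the per-file data recorded here is only an example:

    import os
    from bb.fetch2.trace_base import TraceUnpackBase

    class SimpleTrace(TraceUnpackBase):

        def _collect_data(self, u, ud, files, links, destdir, md=None):
            # record each traced file/link path relative to its unpack
            # dir, keyed by the SRC_URI (or module) url it came from
            entry = self.td.setdefault(u, {"files": [], "links": []})
            entry["files"] += [os.path.relpath(f, destdir) for f in files]
            entry["links"] += [os.path.relpath(l, destdir) for l in links]

        def _process_data(self):
            # post-process self.td before write_data() dumps it to JSON
            pass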

Splitting the solution across multiple patches and modules/classes aims
to ease the review and merge process, and to decouple the development of
the data collection logic from the process that enables it.

Signed-off-by: Alberto Pianon <alberto@pianon.eu>
---
 bin/bitbake-selftest        |   1 +
 lib/bb/fetch2/__init__.py   |  55 +++++++-
 lib/bb/fetch2/crate.py      |   2 +
 lib/bb/fetch2/gitsm.py      |  24 +++-
 lib/bb/fetch2/hg.py         |   1 +
 lib/bb/fetch2/npm.py        |   1 +
 lib/bb/fetch2/npmsw.py      |  25 +++-
 lib/bb/fetch2/trace_base.py | 256 ++++++++++++++++++++++++++++++++++++
 lib/bb/tests/trace_base.py  | 227 ++++++++++++++++++++++++++++++++
 9 files changed, 583 insertions(+), 9 deletions(-)
 create mode 100644 lib/bb/fetch2/trace_base.py
 create mode 100644 lib/bb/tests/trace_base.py

Comments

Alberto Pianon Aug. 24, 2023, 4:03 p.m. UTC | #1
Hi,
I'm submitting the third patch of this series by just sending the link
to the commit, since the patch contains some big compressed JSON files
that would be too large for an email message. The tests and test data
should make the final goal/result of this patch series clear.

http://cgit.openembedded.org/bitbake-contrib/commit/?h=alpianon/srctrace3&id=43707aefb22c59a6508af04c02219423f42806ba

This patch series also works with the new format of npm-shrinkwrap.json
files supported by the latest revision of the npmsw fetcher; I have
included a specific test case for that as well.

Cheers,
Alberto
Richard Purdie Aug. 24, 2023, 10:12 p.m. UTC | #2
On Thu, 2023-08-24 at 17:56 +0200, alberto@pianon.eu wrote:
> From: Alberto Pianon <alberto@pianon.eu>
> 
> [commit message and diffstat snipped]
> 
> diff --git a/bin/bitbake-selftest b/bin/bitbake-selftest
> index f25f23b1..6d60a5d2 100755
> --- a/bin/bitbake-selftest
> +++ b/bin/bitbake-selftest
> @@ -31,6 +31,7 @@ tests = ["bb.tests.codeparser",
>           "bb.tests.runqueue",
>           "bb.tests.siggen",
>           "bb.tests.utils",
> +         "bb.tests.trace_base",
>           "bb.tests.compression",
>           "hashserv.tests",
>           "layerindexlib.tests.layerindexobj",
> diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
> index e4c1d206..aef5fcb4 100644
> --- a/lib/bb/fetch2/__init__.py
> +++ b/lib/bb/fetch2/__init__.py
> @@ -28,6 +28,8 @@ import bb.checksum
>  import bb.process
>  import bb.event
>  
> +from .trace_base import TraceUnpackBase
> +
>  __version__ = "2"
>  _checksum_cache = bb.checksum.FileChecksumCache()
>  
> @@ -1279,6 +1281,7 @@ class FetchData(object):
>          if not self.pswd and "pswd" in self.parm:
>              self.pswd = self.parm["pswd"]
>          self.setup = False
> +        self.destdir = None
>  
>          def configure_checksum(checksum_id):
>              if "name" in self.parm:
> @@ -1557,6 +1560,8 @@ class FetchMethod(object):
>              bb.utils.mkdirhier(unpackdir)
>          else:
>              unpackdir = rootdir
> +        urldata.destdir = unpackdir
> +        urldata.is_unpacked_archive = unpack and cmd
>  
>          if not unpack or not cmd:
>              # If file == dest, then avoid any copies, as we already put the file into dest!
> @@ -1572,6 +1577,7 @@ class FetchMethod(object):
>                      if urlpath.find("/") != -1:
>                          destdir = urlpath.rsplit("/", 1)[0] + '/'
>                          bb.utils.mkdirhier("%s/%s" % (unpackdir, destdir))
> +                        urldata.destdir = "%s/%s" % (unpackdir, destdir)
>                  cmd = 'cp -fpPRH "%s" "%s"' % (file, destdir)
>  
>          if not cmd:
> @@ -1855,26 +1861,69 @@ class Fetch(object):
>              if not ret:
>                  raise FetchError("URL %s doesn't work" % u, u)
>  
> -    def unpack(self, root, urls=None):
> +    def unpack(self, root, urls=None, is_module=False, checkout_destdir=None):
>          """
> -        Unpack urls to root
> +        Unpack urls to a tmp dir, trace, and then move everything to root
> +
> +        is_module needs to be set to true when this method is recursively called
> +        by a fetcher's unpack method to unpack (sub)modules (gitsm, npmsw)
> +
> +        checkout_destdir needs to be passed when this method is recursively
> +        called by gitsm fetcher
>          """
>  
>          if not urls:
>              urls = self.urls
> +        if is_module:
> +            destdir = root
> +        else:
> +            trace = TraceUnpackBase(root, self.d)
> +            destdir = trace.tmpdir
>  
>          for u in urls:
>              ud = self.ud[u]
> +            # absolute subdir, destsuffix and subpath params wouldn't work when
> +            # unpacking in the tmp dir, convert them to relative paths
> +            realroot = os.path.realpath(root)
> +            params = [ 'subdir', 'destsuffix', 'subpath' ]
> +            for p in params:
> +                if not ud.parm.get(p):
> +                    continue
> +                if os.path.isabs(ud.parm[p]):
> +                    realpath = os.path.realpath(ud.parm[p])
> +                    if realpath.startswith(realroot):
> +                        ud.parm[p] = os.path.relpath(realpath, realroot)
>              ud.setup_localpath(self.d)
> +            ud.rootdir = root
> +
> +            if hasattr(ud, "checkout_destdir"):
> +                ud.checkout_destdir = checkout_destdir
>  
>              if ud.lockfile:
>                  lf = bb.utils.lockfile(ud.lockfile)
>  
> -            ud.method.unpack(ud, root, self.d)
> +            ud.method.unpack(ud, destdir, self.d)
>  
>              if ud.lockfile:
>                  bb.utils.unlockfile(lf)
>  
> +            if is_module:
> +                continue
> +
> +            if hasattr(ud, "nocheckout") and ud.nocheckout:
> +                logger.warning(
> +                    "Can't trace sources for"
> +                    " %s because repo has not been checked out" % u)
> +            else:
> +                trace.commit(u, ud)
> +
> +            trace.move2root()
> +
> +        if is_module:
> +            return
> +        trace.write_data()
> +        trace.close()
> +
>      def clean(self, urls=None):
>          """
>          Clean files that the fetcher gets or places

This patch takes a relatively clean function and adds logic which isn't
particularly clear or easy to follow. For example, why should the
function skip the trace.commit() if it is a module? Surely you need the
trace data for modules as well? I guess you collect this differently
for things registered as modules?

Other things which concern me:

* backend-specific parameters such as nocheckout are now appearing in
the core function
* we're switching absolute paths to relative paths for very unclear
reasons, which will make the code more fragile to changes
* gitsm-specific handling is being added to the core

I was willing to accept intercept hooks on this code but this rewrites
the code paths and the fetcher behaviour significantly in ways most of
our users don't need or want.

Whilst I know I did say to submit it, getting the code one working day
before the feature freeze does also limit my options somewhat in what I
can do with it during review.


I'm basically worried that this convolutes the unpack process so badly
that nobody is going to be able to follow, maintain or modify this code
in future. I know I can barely follow what this is doing and it won't
encourage anyone else to touch it :(.


I do understand why you want to do this but I'm not convinced that the
project should be taking changes like this for what I personally think
are very marginal gains.

I appreciate that from some legal perspectives you feel you absolutely
must resolve every file back to its original source, but in reality
99.9% of our users just don't need to do this.

This puts me in a really difficult position. I appreciate the work
you've put into this and I do see what you want to do, but in its
current form I don't think I'll do the project any favours by
accepting it.

I do think there must be a way of improving the interface so that my
concerns are alleviated and the risk is pushed to the people enabling
it, but without diving into the code myself to try and illustrate that,
I'm not sure how I can help us get there.

Cheers,

Richard
Ross Burton Aug. 29, 2023, 4:43 p.m. UTC | #3
On 24 Aug 2023, at 16:56, Alberto Pianon via lists.openembedded.org <alberto=pianon.eu@lists.openembedded.org> wrote:
> License compliance and SBoM generation require to be able to trace each
> source file back to its corresponding upstream source. The current
> implementation of bb.fetch2 makes it difficult, especially when multiple
> upstream sources are combined together.

I’ll have a look at the actual patches shortly, but just so that it is clear: this won’t be merged for the next release as it’s invasive, non-trivial, and didn’t arrive in time for review before the freeze.

This isn’t a rejection, but a simple matter of the release cycle.  We encourage large and invasive patches to land during M1 or M2 so that they have time to settle in and be proven.  The discussion on these patches can happen now so that, assuming they get approved, the patches can be merged after the next release.

Ross

Patch

diff --git a/bin/bitbake-selftest b/bin/bitbake-selftest
index f25f23b1..6d60a5d2 100755
--- a/bin/bitbake-selftest
+++ b/bin/bitbake-selftest
@@ -31,6 +31,7 @@  tests = ["bb.tests.codeparser",
          "bb.tests.runqueue",
          "bb.tests.siggen",
          "bb.tests.utils",
+         "bb.tests.trace_base",
          "bb.tests.compression",
          "hashserv.tests",
          "layerindexlib.tests.layerindexobj",
diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
index e4c1d206..aef5fcb4 100644
--- a/lib/bb/fetch2/__init__.py
+++ b/lib/bb/fetch2/__init__.py
@@ -28,6 +28,8 @@  import bb.checksum
 import bb.process
 import bb.event
 
+from .trace_base import TraceUnpackBase
+
 __version__ = "2"
 _checksum_cache = bb.checksum.FileChecksumCache()
 
@@ -1279,6 +1281,7 @@  class FetchData(object):
         if not self.pswd and "pswd" in self.parm:
             self.pswd = self.parm["pswd"]
         self.setup = False
+        self.destdir = None
 
         def configure_checksum(checksum_id):
             if "name" in self.parm:
@@ -1557,6 +1560,8 @@  class FetchMethod(object):
             bb.utils.mkdirhier(unpackdir)
         else:
             unpackdir = rootdir
+        urldata.destdir = unpackdir
+        urldata.is_unpacked_archive = unpack and cmd
 
         if not unpack or not cmd:
             # If file == dest, then avoid any copies, as we already put the file into dest!
@@ -1572,6 +1577,7 @@  class FetchMethod(object):
                     if urlpath.find("/") != -1:
                         destdir = urlpath.rsplit("/", 1)[0] + '/'
                         bb.utils.mkdirhier("%s/%s" % (unpackdir, destdir))
+                        urldata.destdir = "%s/%s" % (unpackdir, destdir)
                 cmd = 'cp -fpPRH "%s" "%s"' % (file, destdir)
 
         if not cmd:
@@ -1855,26 +1861,69 @@  class Fetch(object):
             if not ret:
                 raise FetchError("URL %s doesn't work" % u, u)
 
-    def unpack(self, root, urls=None):
+    def unpack(self, root, urls=None, is_module=False, checkout_destdir=None):
         """
-        Unpack urls to root
+        Unpack urls to a tmp dir, trace, and then move everything to root
+
+        is_module needs to be set to true when this method is recursively called
+        by a fetcher's unpack method to unpack (sub)modules (gitsm, npmsw)
+
+        checkout_destdir needs to be passed when this method is recursively
+        called by gitsm fetcher
         """
 
         if not urls:
             urls = self.urls
+        if is_module:
+            destdir = root
+        else:
+            trace = TraceUnpackBase(root, self.d)
+            destdir = trace.tmpdir
 
         for u in urls:
             ud = self.ud[u]
+            # absolute subdir, destsuffix and subpath params wouldn't work when
+            # unpacking in the tmp dir, convert them to relative paths
+            realroot = os.path.realpath(root)
+            params = [ 'subdir', 'destsuffix', 'subpath' ]
+            for p in params:
+                if not ud.parm.get(p):
+                    continue
+                if os.path.isabs(ud.parm[p]):
+                    realpath = os.path.realpath(ud.parm[p])
+                    if realpath.startswith(realroot):
+                        ud.parm[p] = os.path.relpath(realpath, realroot)
             ud.setup_localpath(self.d)
+            ud.rootdir = root
+
+            if hasattr(ud, "checkout_destdir"):
+                ud.checkout_destdir = checkout_destdir
 
             if ud.lockfile:
                 lf = bb.utils.lockfile(ud.lockfile)
 
-            ud.method.unpack(ud, root, self.d)
+            ud.method.unpack(ud, destdir, self.d)
 
             if ud.lockfile:
                 bb.utils.unlockfile(lf)
 
+            if is_module:
+                continue
+
+            if hasattr(ud, "nocheckout") and ud.nocheckout:
+                logger.warning(
+                    "Can't trace sources for"
+                    " %s because repo has not been checked out" % u)
+            else:
+                trace.commit(u, ud)
+
+            trace.move2root()
+
+        if is_module:
+            return
+        trace.write_data()
+        trace.close()
+
     def clean(self, urls=None):
         """
         Clean files that the fetcher gets or places
diff --git a/lib/bb/fetch2/crate.py b/lib/bb/fetch2/crate.py
index 3310ed00..68250974 100644
--- a/lib/bb/fetch2/crate.py
+++ b/lib/bb/fetch2/crate.py
@@ -101,8 +101,10 @@  class Crate(Wget):
         bp = d.getVar('BP')
         if bp == ud.parm.get('name'):
             cmd = "tar -xz --no-same-owner -f %s" % thefile
+            ud.destdir = rootdir
         else:
             cargo_bitbake = self._cargo_bitbake_path(rootdir)
+            ud.destdir = cargo_bitbake
 
             cmd = "tar -xz --no-same-owner -f %s -C %s" % (thefile, cargo_bitbake)
 
diff --git a/lib/bb/fetch2/gitsm.py b/lib/bb/fetch2/gitsm.py
index 47225b97..ed7ad39e 100644
--- a/lib/bb/fetch2/gitsm.py
+++ b/lib/bb/fetch2/gitsm.py
@@ -34,6 +34,11 @@  class GitSM(Git):
         """
         return ud.type in ['gitsm']
 
+    def urldata_init(self, ud, d):
+        super(GitSM, self).urldata_init(ud, d)
+        ud.module_data = []
+        ud.checkout_destdir = None
+
     def process_submodules(self, ud, workdir, function, d):
         """
         Iterate over all of the submodules in this repository and execute
@@ -144,6 +149,15 @@  class GitSM(Git):
 
             function(ud, url, module, paths[module], workdir, ld)
 
+            if function.__name__ == "unpack_submodules":
+                destdir = os.path.join(ud.checkout_destdir, paths[module])
+                ud.module_data.append({
+                    "url": url,
+                    "destdir": destdir.rstrip("/"),
+                    "parent_destdir": ud.checkout_destdir.rstrip("/"),
+                    "revision": subrevision[module]
+                })
+
         return submodules != []
 
     def need_update(self, ud, d):
@@ -215,9 +229,13 @@  class GitSM(Git):
             else:
                 repo_conf = os.path.join(ud.destdir, '.git')
 
+            checkout_destdir = os.path.join(ud.checkout_destdir, modpath)
+
             try:
                 newfetch = Fetch([url], d, cache=False)
-                newfetch.unpack(root=os.path.dirname(os.path.join(repo_conf, 'modules', module)))
+                newfetch.unpack(root=os.path.dirname(os.path.join(repo_conf, 'modules', module)), is_module=True, checkout_destdir=checkout_destdir)
+                # add nested submodules' data
+                ud.module_data += newfetch.ud[url].module_data
             except Exception as e:
                 logger.error('gitsm: submodule unpack failed: %s %s' % (type(e).__name__, str(e)))
                 raise
@@ -239,6 +257,10 @@  class GitSM(Git):
 
         Git.unpack(self, ud, destdir, d)
 
+        if not ud.checkout_destdir:
+            # for main git repo, checkout destdir corresponds with unpack destdir
+            ud.checkout_destdir = ud.destdir
+
         ret = self.process_submodules(ud, ud.destdir, unpack_submodules, d)
 
         if not ud.bareclone and ret:
diff --git a/lib/bb/fetch2/hg.py b/lib/bb/fetch2/hg.py
index 063e1300..0fd69db7 100644
--- a/lib/bb/fetch2/hg.py
+++ b/lib/bb/fetch2/hg.py
@@ -242,6 +242,7 @@  class Hg(FetchMethod):
         revflag = "-r %s" % ud.revision
         subdir = ud.parm.get("destsuffix", ud.module)
         codir = "%s/%s" % (destdir, subdir)
+        ud.destdir = codir
 
         scmdata = ud.parm.get("scmdata", "")
         if scmdata != "nokeep":
diff --git a/lib/bb/fetch2/npm.py b/lib/bb/fetch2/npm.py
index f83485ad..4ddb53e7 100644
--- a/lib/bb/fetch2/npm.py
+++ b/lib/bb/fetch2/npm.py
@@ -298,6 +298,7 @@  class Npm(FetchMethod):
         destsuffix = ud.parm.get("destsuffix", "npm")
         destdir = os.path.join(rootdir, destsuffix)
         npm_unpack(ud.localpath, destdir, d)
+        ud.destdir = destdir
 
     def clean(self, ud, d):
         """Clean any existing full or partial download"""
diff --git a/lib/bb/fetch2/npmsw.py b/lib/bb/fetch2/npmsw.py
index 4ff2c8ff..2516aef4 100644
--- a/lib/bb/fetch2/npmsw.py
+++ b/lib/bb/fetch2/npmsw.py
@@ -80,6 +80,9 @@  class NpmShrinkWrap(FetchMethod):
     def urldata_init(self, ud, d):
         """Init npmsw specific variables within url data"""
 
+        # initialize module_data (for module source tracing)
+        ud.module_data = []
+
         # Get the 'shrinkwrap' parameter
         ud.shrinkwrap_file = re.sub(r"^npmsw://", "", ud.url.split(";")[0])
 
@@ -192,6 +195,7 @@  class NpmShrinkWrap(FetchMethod):
                 raise ParameterError("Unsupported dependency: %s" % name, ud.url)
 
             ud.deps.append({
+                "name": name,
                 "url": url,
                 "localpath": localpath,
                 "extrapaths": extrapaths,
@@ -266,20 +270,31 @@  class NpmShrinkWrap(FetchMethod):
 
     def unpack(self, ud, rootdir, d):
         """Unpack the downloaded dependencies"""
-        destdir = d.getVar("S")
-        destsuffix = ud.parm.get("destsuffix")
-        if destsuffix:
-            destdir = os.path.join(rootdir, destsuffix)
+        # rootdir param is a temporary dir. The real rootdir, where sources are
+        # moved after being traced, is stored in ud.rootdir.
+        destsuffix = ud.parm.get("destsuffix") or os.path.relpath(d.getVar("S"), ud.rootdir)
+        destdir = os.path.join(rootdir, destsuffix)
+        ud.destdir = destdir
 
         bb.utils.mkdirhier(destdir)
         bb.utils.copyfile(ud.shrinkwrap_file,
                           os.path.join(destdir, "npm-shrinkwrap.json"))
 
+        for dep in ud.deps:
+            dep_destdir = os.path.join(destdir, dep["destsuffix"])
+            dep_parent_destdir = re.sub("/node_modules/"+dep["name"]+"$", "", dep_destdir) # this works also with scoped package names, like @foo/bar
+            ud.module_data.append({
+                "url": dep["url"] or dep["localpath"],
+                "destdir": dep_destdir.rstrip("/"),
+                "parent_destdir": dep_parent_destdir.rstrip("/"),
+                "revision": None
+            })
+
         auto = [dep["url"] for dep in ud.deps if not dep["localpath"]]
         manual = [dep for dep in ud.deps if dep["localpath"]]
 
         if auto:
-            ud.proxy.unpack(destdir, auto)
+            ud.proxy.unpack(destdir, auto, is_module=True)
 
         for dep in manual:
             depdestdir = os.path.join(destdir, dep["destsuffix"])
diff --git a/lib/bb/fetch2/trace_base.py b/lib/bb/fetch2/trace_base.py
new file mode 100644
index 00000000..49823f84
--- /dev/null
+++ b/lib/bb/fetch2/trace_base.py
@@ -0,0 +1,256 @@ 
+"""Module implementing a base process for upstream source tracing
+for bb.fetch2.Fetch.unpack()
+
+The process consists of:
+
+- creating a temporary directory where each SRC_URI element is unpacked
+
+- collecting relevant metadata (provenance) for each source file and for every
+  upstream source component, which can be used later for Software Composition
+  Analysis, SBoM generation, etc.;
+
+- moving everything from the temporary directory to root, and iterating with
+  the next SRC_URI element;
+
+- saving metadata in a JSON file after all elements have been processed.
+
+It assumes that:
+
+- fetchers store unpack destination dir in urldata.destdir;
+- gitsm and npmsw fetchers store module metadata in urldata.module_data, as a
+  list of dict elements in the following format:
+    [
+        {
+            "url": "<module url>",
+            "destdir": "<module destination path>",
+            "parent_destdir": "<parent module destination path>"
+            "revision": "<git submodule revision (only for gitsm, else None)>"
+        }, ...
+    ]
+- urldata.is_unpacked_archive (boolean) is set to True or False for "file"
+  SRC_URI entries.
+"""
+
+# Copyright (C) 2023 Alberto Pianon <pianon@array.eu>
+#
+# SPDX-License-Identifier: GPL-2.0-only
+#
+
+import os
+import json
+import tempfile
+
+import bb.utils
+import bb.compress.zstd
+
+class TraceException(Exception):
+    pass
+
+def scandir(path):
+    with os.scandir(path) as scan:
+        return { e.name: e for e in scan }
+
+def is_real_dir(e):
+    return e.is_dir() and not e.is_symlink()
+
+def is_real_and_nonempty_dir(e):
+    return is_real_dir(e) and scandir(e.path)
+
+def is_file_or_symlink(e):
+    return e.is_file() or e.is_symlink()
+
+def is_git_dir(e):
+    path_scandir = scandir(e.path)
+    if ".git" in path_scandir:
+        try:
+            bb.process.run(
+                ["git", "rev-parse", "--is-inside-work-tree"], cwd=e.path)
+            return True
+        except bb.process.ExecutionError:
+            return False
+    return False
+
+def check_is_real_dir(path, name):
+    if not os.path.exists(path) or os.path.islink(path) or os.path.isfile(path):
+        raise TraceException(
+            "%s path %s is not a directory" % (name, path))
+
+def move_contents(src_dir, dst_dir):
+    """Move and merge contents from src_dir to dst_dir
+
+    Conflict resolution criteria are explained in bb.tests.trace_base
+
+    It's optimized for fast execution time by using os.scandir and os.rename, so
+    it requires that both src_dir and dst_dir reside in the same filesystem.
+    """
+
+    check_is_real_dir(src_dir, "Source")
+    check_is_real_dir(dst_dir, "Destination")
+
+    if os.lstat(src_dir).st_dev != os.lstat(dst_dir).st_dev:
+        raise TraceException(
+            "Source %s and destination %s must be in the same filesystem" %
+            (src_dir, dst_dir)
+        )
+
+    src_scandir = scandir(src_dir)
+    dst_scandir = scandir(dst_dir)
+
+    for src_name, src in src_scandir.items():
+        dst = dst_scandir.get(src_name)
+        if dst:
+            # handle conflicts
+            if is_real_dir(src) and is_real_and_nonempty_dir(dst):
+                if is_git_dir(src):
+                    bb.utils.prunedir(dst.path)
+                else:
+                    move_contents(src.path, dst.path)
+                    os.rmdir(src.path)
+                    continue
+            elif is_real_dir(src) and is_file_or_symlink(dst):
+                os.remove(dst.path)
+            elif is_file_or_symlink(src) and is_real_dir(dst):
+                try:
+                    os.rmdir(dst.path)
+                except OSError as e:
+                    if e.errno == 39:  # ENOTEMPTY
+                        raise TraceException(
+                            "Error while moving %s contents to %s, cannot move"
+                            " %s to %s: source is a file or a symlink, while"
+                            " destination is a non-empty directory."
+                            % (src_dir, dst_dir, src.path, dst.path)
+                        )
+                    else:
+                        raise e
+        dst_path = dst.path if dst else os.path.join(dst_dir, src_name)
+        os.rename(src.path, dst_path)
+
+def findall_files_and_links(path, exclude=(), skip_git_submodules=False):
+    """recusively find all files and links in path, excluding dir and file names
+    in exclude, and excluding git dirs if skip_git_submodules is set to True.
+
+    Returns tuple of sorted lists of file and link paths (sorting is for
+    reproducibility in tests)
+    """
+    files = []
+    links = []
+    with os.scandir(path) as scan:
+        for e in scan:
+            if e.name in exclude:
+                continue
+            if e.is_symlink():
+                links.append(e.path)
+            elif e.is_file():
+                files.append(e.path)
+            elif e.is_dir():
+                if skip_git_submodules and is_git_dir(e):
+                    continue
+                _files, _links = findall_files_and_links(
+                        e.path, exclude, skip_git_submodules)
+                files += _files
+                links += _links
+    return sorted(files), sorted(links)
+
+class TraceUnpackBase:
+    """base class for implementing a process for upstream source tracing
+    See this module's help for more details on the process.
+
+    This base class implements the process but does not collect any data. It is
+    intended to be subclassed in a separate 'trace' module, implementing
+    _collect_data() and _process_data() methods.
+
+    Method call order:
+        - __init__(): initialize tmpdir and td (trace data)
+        - for each SRC_URI entry unpack:
+          - commit(): go through all files in tmpdir (and in each module subdir
+            in case of gitsm and npmsw fecthers) and commit collected metadata
+            to td
+          - move2root(): moves all files from tmpdir to root
+        - write_data()
+        - close(): delete tmpdir and cache
+    """
+
+    def __init__(self, root, d):
+        """initialize properties and create temporary directory in root
+
+        Temporary unpack dir is created in 'root' to ensure they are in the
+        same filesystem, so files can be quickly moved to 'root' after tracing
+        """
+
+        self.root = root
+        self.d = d
+        self.td = {}
+        if not os.path.exists(root):
+            bb.utils.mkdirhier(root)
+        self.tmpdir = tempfile.mkdtemp(dir=root)
+
+    def commit(self, u, ud):
+        """go through all files in tmpdir and commit collected metadata to td.
+        dive into module subdirs in case of gitsm and npmsw fetchers
+
+        Params are:
+        - u -> str: src uri of the upstream repo/package that is being processed
+        - ud -> bb.fetch2.FetchData: src uri fetch data object; ud.url and u do not correspond when git/npm modules are being processed, so we need both
+        """
+
+        exclude = ['.git', '.hg', '.svn']
+
+        # exclude node_modules subdirs (will be separately parsed)
+        if ud.type in ['npm', 'npmsw']:
+            exclude.append('node_modules')
+        # exclude git submodules (will be separately parsed)
+        skip_git_submodules = (ud.type == 'gitsm')
+
+        files, links = findall_files_and_links(
+            ud.destdir, exclude, skip_git_submodules)
+        self._collect_data(u, ud, files, links, ud.destdir)
+
+        if ud.type in ['gitsm', 'npmsw'] and ud.module_data:
+            self._process_module_data(ud)
+            for md in ud.module_data:
+                files, links = findall_files_and_links(
+                   md["destdir"], exclude, skip_git_submodules)
+                self._collect_data(
+                    md["url"], ud, files, links, md["destdir"], md)
+
+    def _process_module_data(self, ud):
+        """add parent module data to each module data item, to map dependencies
+        """
+        revision = ud.revisions[ud.names[0]] if ud.type == 'gitsm' else None
+        indexed_md = { md["destdir"]: md for md in ud.module_data }
+        # add main git repo (gitsm) or npm-shrinkwrap.json (npmsw)
+        indexed_md.update({
+                ud.destdir.rstrip("/"): {"url": ud.url, "revision": revision}
+        })
+        for md in ud.module_data:
+            md["parent_md"] = indexed_md[md["parent_destdir"]]
+
+    def move2root(self):
+        """move all files from temporary directory to root"""
+        move_contents(self.tmpdir, self.root)
+
+    def write_data(self):
+        self._process_data()
+        if not self.d.getVar("PN"):
+            return
+        if not os.path.exists("%s/temp" % self.root):
+            bb.utils.mkdirhier("%s/temp" % self.root)
+        path = "%s/temp/%s-%s.unpack.trace.json.zst" % (
+            self.root, self.d.getVar("PN"), self.d.getVar("PV"))
+        with bb.compress.zstd.open(path, "wt", encoding="utf-8") as f:
+            json.dump(self.td, f)
+            f.flush()
+
+    def close(self):
+        os.rmdir(self.tmpdir)
+        del self.td
+
+    def _collect_data(self, u, ud, files, links, destdir, md=None):
+        """
+        collect provenance metadata on the committed files. Not implemented
+        """
+        pass
+
+    def _process_data(self):
+        """post-process self.td. Not implemented"""
+        pass
diff --git a/lib/bb/tests/trace_base.py b/lib/bb/tests/trace_base.py
new file mode 100644
index 00000000..d96fb2c7
--- /dev/null
+++ b/lib/bb/tests/trace_base.py
@@ -0,0 +1,227 @@ 
+
+# Copyright (C) 2023 Alberto Pianon <pianon@array.eu>
+#
+# SPDX-License-Identifier: GPL-2.0-only
+#
+
+import os
+import re
+import unittest
+import tempfile
+from pathlib import Path
+import subprocess
+
+import bb
+
+def create_src_dst(tmpdir):
+    src_dir = os.path.join(tmpdir, "src/")
+    dst_dir = os.path.join(tmpdir, "dst/")
+    os.makedirs(src_dir)
+    os.makedirs(dst_dir)
+    return Path(src_dir), Path(dst_dir)
+
+def make_dirname(path):
+    dirname = os.path.dirname(path)
+    if dirname:
+        os.makedirs(dirname, exist_ok=True)
+
+def create_file(path, content):
+    make_dirname(path)
+    with open(path, "w") as f:
+        f.write(content)
+
+def create_link(path, target):
+    make_dirname(path)
+    os.symlink(target, path)
+
+def get_tree(path):
+    curdir = os.getcwd()
+    os.chdir(path)
+    tree = []
+    for root, dirs, files in os.walk("."):
+        for f in dirs + files:
+            tree.append(re.sub(r"^\.\/", "", os.path.join(root, f)))
+    os.chdir(curdir)
+    return sorted(tree)
+
+def read_file(path):
+    with open(path) as f:
+        return f.read()
+
+class MoveContentsTest(unittest.TestCase):
+    """
+    Test the following conflict resolution criteria:
+
+    - if a file (or symlink) exists both in src_dir and in dst_dir, the
+      file/symlink in dst_dir will be overwritten;
+
+    - if a subdirectory exists both in src_dir and in dst_dir, their contents
+      will be merged, and in case of file/symlink conflicts, files/symlinks in
+      dst_dir will be overwritten - unless src_dir is a git repo; in such a
+      case, dst_dir will be pruned and src_dir will be moved to dst_dir, for
+      consistency with bb.fetch2.git.Git.unpack method's behavior (which prunes
+      clone dir if already existing, before cloning)
+
+    - if the same relative path exists both in src_dir and in dst_dir, but the
+      path in src_dir is a directory and the path in dst_dir is a file/symlink,
+      the latter will be overwritten;
+
+    - if instead the path in src_dir is a file and the path in dst_dir is a
+      directory, the latter will be overwritten only if it is empty, otherwise
+      an exception will be raised.
+    """
+
+    def test_dir_merge_and_file_overwrite(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+            create_file(src_dir / "dir/subdir/file.txt", "new")
+            create_file(dst_dir / "dir/subdir/file.txt", "old")
+            create_file(dst_dir / "dir/subdir/file1.txt", "old")
+            bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+            expected_dst_tree = [
+                "dir",
+                "dir/subdir",
+                "dir/subdir/file.txt",
+                "dir/subdir/file1.txt"
+            ]
+            self.assertEqual(get_tree(src_dir), [])
+            self.assertEqual(get_tree(dst_dir), expected_dst_tree)
+            self.assertEqual(read_file(dst_dir / "dir/subdir/file.txt"), "new")
+            self.assertEqual(read_file(dst_dir / "dir/subdir/file1.txt"), "old")
+
+    def test_file_vs_symlink_conflicts(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+
+            create_file(src_dir / "dir/subdir/fileA.txt", "new")
+            create_file(src_dir / "dir/fileB.txt", "new")
+            create_link(src_dir / "file.txt", "dir/subdir/fileA.txt")
+
+            create_file(dst_dir / "dir/subdir/fileA.txt", "old")
+            create_link(dst_dir / "dir/fileB.txt", "subdir/fileA.txt")
+            create_file(dst_dir / "file.txt", "old")
+
+            bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+            self.assertEqual(get_tree(src_dir), [])
+            self.assertTrue(os.path.islink(dst_dir / "file.txt"))
+            self.assertEqual(
+                os.readlink(dst_dir / "file.txt"),
+                "dir/subdir/fileA.txt"
+            )
+            self.assertFalse(os.path.islink(dst_dir / "dir/fileB.txt"))
+            self.assertEqual(read_file(dst_dir / "dir/fileB.txt"), "new")
+
+    def test_dir_vs_file_conflict(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+            create_file(src_dir / "items/item0/content.txt", "hello")
+            create_file(dst_dir / "items/item0", "there")
+            bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+            self.assertEqual(get_tree(src_dir), [])
+            self.assertTrue(os.path.isdir(dst_dir / "items/item0"))
+            self.assertEqual(
+                read_file(dst_dir / "items/item0/content.txt"), "hello")
+
+    def test_dir_vs_symlink_conflict(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+            create_file(src_dir / "items/item0/content.txt", "hello")
+            create_file(dst_dir / "items/item1/content.txt", "there")
+            create_link(dst_dir / "items/item0", "item1")
+            bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+            self.assertEqual(get_tree(src_dir), [])
+            self.assertFalse(os.path.islink(dst_dir / "items/item0"))
+            self.assertEqual(
+                read_file(dst_dir / "items/item0/content.txt"), "hello")
+            self.assertEqual(
+                read_file(dst_dir / "items/item1/content.txt"), "there")
+
+    def test_symlink_vs_empty_dir_conflict(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+            create_file(src_dir / "items/item1/content.txt", "there")
+            create_link(src_dir / "items/item0", "item1")
+            os.makedirs(dst_dir / "items/item0")
+            bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+            self.assertEqual(get_tree(src_dir), [])
+            self.assertTrue(os.path.islink(dst_dir / "items/item0"))
+            self.assertEqual(read_file(dst_dir / "items/item0/content.txt"), "there")
+
+    def test_symlink_vs_nonempty_dir_conflict(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+            create_file(src_dir / "items/item1/content.txt", "there")
+            create_link(src_dir / "items/item0", "item1")
+            create_file(dst_dir / "items/item0/content.txt", "hello")
+            with self.assertRaises(bb.fetch2.trace_base.TraceException) as context:
+                bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+
+    def test_file_vs_empty_dir_conflict(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+            create_file(src_dir / "items/item0", "test")
+            os.makedirs(dst_dir / "items/item0")
+            bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+            self.assertEqual(get_tree(src_dir), [])
+            self.assertTrue(os.path.isfile(dst_dir / "items/item0"))
+
+    def test_file_vs_nonempty_dir_conflict(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+            create_file(src_dir / "items/item0", "test")
+            create_file(dst_dir / "items/item0/content.txt", "test")
+            with self.assertRaises(bb.fetch2.trace_base.TraceException) as context:
+                bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+
+    def test_git_dir(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            src_dir, dst_dir = create_src_dst(tmpdir)
+            git_repo = src_dir / "src/my_git_repo"
+            create_file(git_repo / "foo.txt", "hello")
+            subprocess.check_output(["git", "init"], cwd=git_repo)
+            create_file(dst_dir / "src/my_git_repo/content.txt", "there")
+            bb.fetch2.trace_base.move_contents(src_dir, dst_dir)
+            # git clone dir should be pruned if already existing
+            self.assertFalse(
+                os.path.exists(dst_dir / "src/my_git_repo/content.txt"))
+            self.assertEqual(
+                read_file(dst_dir / "src/my_git_repo/foo.txt"), "hello")
+            self.assertTrue(os.path.isdir(dst_dir / "src/my_git_repo/.git"))
+
+
+class FindAllFilesAndLinksTest(unittest.TestCase):
+    """test if all files and links are correctly returned, and if specific
+    file/dir names and git subdirs are correctly excluded"""
+
+    def test_findall_files_and_links(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            tmpdir = Path(tmpdir)
+            files = {
+                str(tmpdir/"foo/example/example.txt"): "example",
+                str(tmpdir/"foo/foo.txt"): "foo",
+                str(tmpdir/"foo/foo2.txt"): "foo2",
+                str(tmpdir/"README"): "hello",
+            }
+            ignored = {
+                str(tmpdir/".git"): "fake",
+                str(tmpdir/"foo2/dummy"): "dummy"
+            }
+            allfiles = files.copy()
+            allfiles.update(ignored)
+            links = {
+                str(tmpdir/"example"): "foo/example", # link to dir
+                str(tmpdir/"example.txt"): "foo/example/example.txt", # link to file
+            }
+            for path, content in allfiles.items():
+                create_file(path, content)
+            for path, target in links.items():
+                create_link(path, target)
+            subprocess.check_output(["git", "init"], cwd=tmpdir/"foo2")
+            res_files, res_links = bb.fetch2.trace_base.findall_files_and_links(
+                    tmpdir, exclude=['.git'], skip_git_submodules=True)
+            self.assertEqual(res_files, sorted(list(files.keys())))
+            self.assertEqual(res_links, sorted(list(links.keys())))
+
+
+if __name__ == '__main__':
+    unittest.main()