diff mbox series

[v5] fetch2: Add API for upstream source tracing

Message ID 20231001075225.1054512-1-alberto@pianon.eu
State Accepted, archived
Commit 05051152cc42acc52bcf9af9a696f632fac4307f
Headers show
Series [v5] fetch2: Add API for upstream source tracing | expand

Commit Message

Alberto Pianon Oct. 1, 2023, 7:52 a.m. UTC
From: Alberto Pianon <alberto@pianon.eu>

This patch adds an API to bb.fetch2 to enable users to plug in an unpack
tracer that can trace each source file back to its corresponding
upstream source url, even when multiple upstream sources are combined
together in the same unpack directory. This may be required for software
composition analysis, license compliance, and detailed SBoM generation.

This patch provides only the needed hooks in bb.fetch2 code and a dummy
abstract class defining the API; users may load their own unpack tracer
class by setting the BB_UNPACK_TRACER_CLASS config parameter.

Signed-off-by: Alberto Pianon <alberto@pianon.eu>
---
 lib/bb/fetch2/__init__.py | 78 +++++++++++++++++++++++++++++++++++++++
 lib/bb/fetch2/crate.py    |  2 +
 lib/bb/fetch2/git.py      |  2 +
 lib/bb/fetch2/gitsm.py    |  4 ++
 lib/bb/fetch2/hg.py       |  1 +
 lib/bb/fetch2/npm.py      |  1 +
 lib/bb/fetch2/npmsw.py    |  3 ++
 7 files changed, 91 insertions(+)

Comments

Alberto Pianon Oct. 11, 2023, 8:02 a.m. UTC | #1
Hi Richard,
I just noticed that the patch has been merged!
Thanks for your availability and patience :)
Cheers,
Alberto

On 2023-10-01 09:52, alberto@pianon.eu wrote:
> From: Alberto Pianon <alberto@pianon.eu>
> 
> This patch adds an API to bb.fetch2 to enable users to plug in an 
> unpack
> tracer that can trace each source file back to its corresponding
> upstream source url, even when multiple upstream sources are combined
> together in the same unpack directory. This may be required for 
> software
> composition analysis, license compliance, and detailed SBoM generation.
> 
> This patch provides only the needed hooks in bb.fetch2 code and a dummy
> abstract class defining the API; users may load their own unpack tracer
> class by setting the BB_UNPACK_TRACER_CLASS config parameter.
> 
> Signed-off-by: Alberto Pianon <alberto@pianon.eu>
> ---
>  lib/bb/fetch2/__init__.py | 78 +++++++++++++++++++++++++++++++++++++++
>  lib/bb/fetch2/crate.py    |  2 +
>  lib/bb/fetch2/git.py      |  2 +
>  lib/bb/fetch2/gitsm.py    |  4 ++
>  lib/bb/fetch2/hg.py       |  1 +
>  lib/bb/fetch2/npm.py      |  1 +
>  lib/bb/fetch2/npmsw.py    |  3 ++
>  7 files changed, 91 insertions(+)
> 
> diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
> index ffb1a92b..35e9ca96 100644
> --- a/lib/bb/fetch2/__init__.py
> +++ b/lib/bb/fetch2/__init__.py
> @@ -1579,6 +1579,7 @@ class FetchMethod(object):
>              unpackdir = rootdir
> 
>          if not unpack or not cmd:
> +            urldata.unpack_tracer.unpack("file-copy", unpackdir)
>              # If file == dest, then avoid any copies, as we already 
> put the file into dest!
>              dest = os.path.join(unpackdir, os.path.basename(file))
>              if file != dest and not (os.path.exists(dest) and 
> os.path.samefile(file, dest)):
> @@ -1593,6 +1594,8 @@ class FetchMethod(object):
>                          destdir = urlpath.rsplit("/", 1)[0] + '/'
>                          bb.utils.mkdirhier("%s/%s" % (unpackdir, 
> destdir))
>                  cmd = 'cp -fpPRH "%s" "%s"' % (file, destdir)
> +        else:
> +            urldata.unpack_tracer.unpack("archive-extract", unpackdir)
> 
>          if not cmd:
>              return
> @@ -1684,6 +1687,55 @@ class FetchMethod(object):
>          """
>          return []
> 
> +
> +class DummyUnpackTracer(object):
> +    """
> +    Abstract API definition for a class that traces unpacked source 
> files back
> +    to their respective upstream SRC_URI entries, for software 
> composition
> +    analysis, license compliance and detailed SBOM generation 
> purposes.
> +    Users may load their own unpack tracer class (instead of the dummy
> +    one) by setting the BB_UNPACK_TRACER_CLASS config parameter.
> +    """
> +    def start(self, unpackdir, urldata_dict, d):
> +        """
> +        Start tracing the core Fetch.unpack process, using an index to 
> map
> +        unpacked files to each SRC_URI entry.
> +        This method is called by Fetch.unpack and it may receive 
> nested calls by
> +        gitsm and npmsw fetchers, that expand SRC_URI entries by 
> adding implicit
> +        URLs and by recursively calling Fetch.unpack from new (nested) 
> Fetch
> +        instances.
> +        """
> +        return
> +    def start_url(self, url):
> +        """Start tracing url unpack process.
> +        This method is called by Fetch.unpack before the 
> fetcher-specific unpack
> +        method starts, and it may receive nested calls by gitsm and 
> npmsw
> +        fetchers.
> +        """
> +        return
> +    def unpack(self, unpack_type, destdir):
> +        """
> +        Set unpack_type and destdir for current url.
> +        This method is called by the fetcher-specific unpack method 
> after url
> +        tracing started.
> +        """
> +        return
> +    def finish_url(self, url):
> +        """Finish tracing url unpack process and update the file 
> index.
> +        This method is called by Fetch.unpack after the 
> fetcher-specific unpack
> +        method finished its job, and it may receive nested calls by 
> gitsm
> +        and npmsw fetchers.
> +        """
> +        return
> +    def complete(self):
> +        """
> +        Finish tracing the Fetch.unpack process, and check if all 
> nested
> +        Fetch.unpack calls (if any) have been completed; if so, save 
> collected
> +        metadata.
> +        """
> +        return
> +
> +
>  class Fetch(object):
>      def __init__(self, urls, d, cache = True, localonly = False, 
> connection_cache = None):
>          if localonly and cache:
> @@ -1704,10 +1756,30 @@ class Fetch(object):
>          if key in urldata_cache:
>              self.ud = urldata_cache[key]
> 
> +        # the unpack_tracer object needs to be made available to 
> possible nested
> +        # Fetch instances (when those are created by gitsm and npmsw 
> fetchers)
> +        # so we set it as a global variable
> +        global unpack_tracer
> +        try:
> +            unpack_tracer
> +        except NameError:
> +            class_path = d.getVar("BB_UNPACK_TRACER_CLASS")
> +            if class_path:
> +                # use user-defined unpack tracer class
> +                import importlib
> +                module_name, _, class_name = 
> class_path.rpartition(".")
> +                module = importlib.import_module(module_name)
> +                class_ = getattr(module, class_name)
> +                unpack_tracer = class_()
> +            else:
> +                # fall back to the dummy/abstract class
> +                unpack_tracer = DummyUnpackTracer()
> +
>          for url in urls:
>              if url not in self.ud:
>                  try:
>                      self.ud[url] = FetchData(url, d, localonly)
> +                    self.ud[url].unpack_tracer = unpack_tracer
>                  except NonLocalMethod:
>                      if localonly:
>                          self.ud[url] = None
> @@ -1883,6 +1955,8 @@ class Fetch(object):
>          if not urls:
>              urls = self.urls
> 
> +        unpack_tracer.start(root, self.ud, self.d)
> +
>          for u in urls:
>              ud = self.ud[u]
>              ud.setup_localpath(self.d)
> @@ -1890,11 +1964,15 @@ class Fetch(object):
>              if ud.lockfile:
>                  lf = bb.utils.lockfile(ud.lockfile)
> 
> +            unpack_tracer.start_url(u)
>              ud.method.unpack(ud, root, self.d)
> +            unpack_tracer.finish_url(u)
> 
>              if ud.lockfile:
>                  bb.utils.unlockfile(lf)
> 
> +        unpack_tracer.complete()
> +
>      def clean(self, urls=None):
>          """
>          Clean files that the fetcher gets or places
> diff --git a/lib/bb/fetch2/crate.py b/lib/bb/fetch2/crate.py
> index 3310ed00..01d49435 100644
> --- a/lib/bb/fetch2/crate.py
> +++ b/lib/bb/fetch2/crate.py
> @@ -101,8 +101,10 @@ class Crate(Wget):
>          bp = d.getVar('BP')
>          if bp == ud.parm.get('name'):
>              cmd = "tar -xz --no-same-owner -f %s" % thefile
> +            ud.unpack_tracer.unpack("crate-extract", rootdir)
>          else:
>              cargo_bitbake = self._cargo_bitbake_path(rootdir)
> +            ud.unpack_tracer.unpack("cargo-extract", cargo_bitbake)
> 
>              cmd = "tar -xz --no-same-owner -f %s -C %s" % (thefile, 
> cargo_bitbake)
> 
> diff --git a/lib/bb/fetch2/git.py b/lib/bb/fetch2/git.py
> index 4385d0b3..c7ed1f03 100644
> --- a/lib/bb/fetch2/git.py
> +++ b/lib/bb/fetch2/git.py
> @@ -589,6 +589,8 @@ class Git(FetchMethod):
>          destdir = ud.destdir = os.path.join(destdir, destsuffix)
>          if os.path.exists(destdir):
>              bb.utils.prunedir(destdir)
> +        if not ud.bareclone:
> +            ud.unpack_tracer.unpack("git", destdir)
> 
>          need_lfs = self._need_lfs(ud)
> 
> diff --git a/lib/bb/fetch2/gitsm.py b/lib/bb/fetch2/gitsm.py
> index a87361cc..f7f3af72 100644
> --- a/lib/bb/fetch2/gitsm.py
> +++ b/lib/bb/fetch2/gitsm.py
> @@ -218,6 +218,10 @@ class GitSM(Git):
> 
>              try:
>                  newfetch = Fetch([url], d, cache=False)
> +                # modpath is needed by unpack tracer to calculate 
> submodule
> +                # checkout dir
> +                new_ud = newfetch.ud[url]
> +                new_ud.modpath = modpath
>                  
> newfetch.unpack(root=os.path.dirname(os.path.join(repo_conf, 'modules', 
> module)))
>              except Exception as e:
>                  logger.error('gitsm: submodule unpack failed: %s %s' % 
> (type(e).__name__, str(e)))
> diff --git a/lib/bb/fetch2/hg.py b/lib/bb/fetch2/hg.py
> index 063e1300..cbff8c49 100644
> --- a/lib/bb/fetch2/hg.py
> +++ b/lib/bb/fetch2/hg.py
> @@ -242,6 +242,7 @@ class Hg(FetchMethod):
>          revflag = "-r %s" % ud.revision
>          subdir = ud.parm.get("destsuffix", ud.module)
>          codir = "%s/%s" % (destdir, subdir)
> +        ud.unpack_tracer.unpack("hg", codir)
> 
>          scmdata = ud.parm.get("scmdata", "")
>          if scmdata != "nokeep":
> diff --git a/lib/bb/fetch2/npm.py b/lib/bb/fetch2/npm.py
> index f83485ad..15f3f19b 100644
> --- a/lib/bb/fetch2/npm.py
> +++ b/lib/bb/fetch2/npm.py
> @@ -298,6 +298,7 @@ class Npm(FetchMethod):
>          destsuffix = ud.parm.get("destsuffix", "npm")
>          destdir = os.path.join(rootdir, destsuffix)
>          npm_unpack(ud.localpath, destdir, d)
> +        ud.unpack_tracer.unpack("npm", destdir)
> 
>      def clean(self, ud, d):
>          """Clean any existing full or partial download"""
> diff --git a/lib/bb/fetch2/npmsw.py b/lib/bb/fetch2/npmsw.py
> index 4ff2c8ff..ff5f8dc7 100644
> --- a/lib/bb/fetch2/npmsw.py
> +++ b/lib/bb/fetch2/npmsw.py
> @@ -191,7 +191,9 @@ class NpmShrinkWrap(FetchMethod):
>              else:
>                  raise ParameterError("Unsupported dependency: %s" % 
> name, ud.url)
> 
> +            # name is needed by unpack tracer for module mapping
>              ud.deps.append({
> +                "name": name,
>                  "url": url,
>                  "localpath": localpath,
>                  "extrapaths": extrapaths,
> @@ -270,6 +272,7 @@ class NpmShrinkWrap(FetchMethod):
>          destsuffix = ud.parm.get("destsuffix")
>          if destsuffix:
>              destdir = os.path.join(rootdir, destsuffix)
> +        ud.unpack_tracer.unpack("npm-shrinkwrap", destdir)
> 
>          bb.utils.mkdirhier(destdir)
>          bb.utils.copyfile(ud.shrinkwrap_file,
Richard Purdie Oct. 17, 2023, 12:04 p.m. UTC | #2
On Wed, 2023-10-11 at 10:02 +0200, Alberto Pianon wrote:
> I just noticed that the patch has been merged!
> Thanks for your availability and patience :)
> 

No problem, I'm glad we finally got there!

Cheers,

Richard
diff mbox series

Patch

diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
index ffb1a92b..35e9ca96 100644
--- a/lib/bb/fetch2/__init__.py
+++ b/lib/bb/fetch2/__init__.py
@@ -1579,6 +1579,7 @@  class FetchMethod(object):
             unpackdir = rootdir
 
         if not unpack or not cmd:
+            urldata.unpack_tracer.unpack("file-copy", unpackdir)
             # If file == dest, then avoid any copies, as we already put the file into dest!
             dest = os.path.join(unpackdir, os.path.basename(file))
             if file != dest and not (os.path.exists(dest) and os.path.samefile(file, dest)):
@@ -1593,6 +1594,8 @@  class FetchMethod(object):
                         destdir = urlpath.rsplit("/", 1)[0] + '/'
                         bb.utils.mkdirhier("%s/%s" % (unpackdir, destdir))
                 cmd = 'cp -fpPRH "%s" "%s"' % (file, destdir)
+        else:
+            urldata.unpack_tracer.unpack("archive-extract", unpackdir)
 
         if not cmd:
             return
@@ -1684,6 +1687,55 @@  class FetchMethod(object):
         """
         return []
 
+
+class DummyUnpackTracer(object):
+    """
+    Abstract API definition for a class that traces unpacked source files back
+    to their respective upstream SRC_URI entries, for software composition
+    analysis, license compliance and detailed SBOM generation purposes.
+    Users may load their own unpack tracer class (instead of the dummy
+    one) by setting the BB_UNPACK_TRACER_CLASS config parameter.
+    """
+    def start(self, unpackdir, urldata_dict, d):
+        """
+        Start tracing the core Fetch.unpack process, using an index to map
+        unpacked files to each SRC_URI entry.
+        This method is called by Fetch.unpack and it may receive nested calls by
+        gitsm and npmsw fetchers, that expand SRC_URI entries by adding implicit
+        URLs and by recursively calling Fetch.unpack from new (nested) Fetch
+        instances.
+        """
+        return
+    def start_url(self, url):
+        """Start tracing url unpack process.
+        This method is called by Fetch.unpack before the fetcher-specific unpack
+        method starts, and it may receive nested calls by gitsm and npmsw
+        fetchers.
+        """
+        return
+    def unpack(self, unpack_type, destdir):
+        """
+        Set unpack_type and destdir for current url.
+        This method is called by the fetcher-specific unpack method after url
+        tracing started.
+        """
+        return
+    def finish_url(self, url):
+        """Finish tracing url unpack process and update the file index.
+        This method is called by Fetch.unpack after the fetcher-specific unpack
+        method finished its job, and it may receive nested calls by gitsm
+        and npmsw fetchers.
+        """
+        return
+    def complete(self):
+        """
+        Finish tracing the Fetch.unpack process, and check if all nested
+        Fetch.unpack calls (if any) have been completed; if so, save collected
+        metadata.
+        """
+        return
+
+
 class Fetch(object):
     def __init__(self, urls, d, cache = True, localonly = False, connection_cache = None):
         if localonly and cache:
@@ -1704,10 +1756,30 @@  class Fetch(object):
         if key in urldata_cache:
             self.ud = urldata_cache[key]
 
+        # the unpack_tracer object needs to be made available to possible nested
+        # Fetch instances (when those are created by gitsm and npmsw fetchers)
+        # so we set it as a global variable
+        global unpack_tracer
+        try:
+            unpack_tracer
+        except NameError:
+            class_path = d.getVar("BB_UNPACK_TRACER_CLASS")
+            if class_path:
+                # use user-defined unpack tracer class
+                import importlib
+                module_name, _, class_name = class_path.rpartition(".")
+                module = importlib.import_module(module_name)
+                class_ = getattr(module, class_name)
+                unpack_tracer = class_()
+            else:
+                # fall back to the dummy/abstract class
+                unpack_tracer = DummyUnpackTracer()
+
         for url in urls:
             if url not in self.ud:
                 try:
                     self.ud[url] = FetchData(url, d, localonly)
+                    self.ud[url].unpack_tracer = unpack_tracer
                 except NonLocalMethod:
                     if localonly:
                         self.ud[url] = None
@@ -1883,6 +1955,8 @@  class Fetch(object):
         if not urls:
             urls = self.urls
 
+        unpack_tracer.start(root, self.ud, self.d)
+
         for u in urls:
             ud = self.ud[u]
             ud.setup_localpath(self.d)
@@ -1890,11 +1964,15 @@  class Fetch(object):
             if ud.lockfile:
                 lf = bb.utils.lockfile(ud.lockfile)
 
+            unpack_tracer.start_url(u)
             ud.method.unpack(ud, root, self.d)
+            unpack_tracer.finish_url(u)
 
             if ud.lockfile:
                 bb.utils.unlockfile(lf)
 
+        unpack_tracer.complete()
+
     def clean(self, urls=None):
         """
         Clean files that the fetcher gets or places
diff --git a/lib/bb/fetch2/crate.py b/lib/bb/fetch2/crate.py
index 3310ed00..01d49435 100644
--- a/lib/bb/fetch2/crate.py
+++ b/lib/bb/fetch2/crate.py
@@ -101,8 +101,10 @@  class Crate(Wget):
         bp = d.getVar('BP')
         if bp == ud.parm.get('name'):
             cmd = "tar -xz --no-same-owner -f %s" % thefile
+            ud.unpack_tracer.unpack("crate-extract", rootdir)
         else:
             cargo_bitbake = self._cargo_bitbake_path(rootdir)
+            ud.unpack_tracer.unpack("cargo-extract", cargo_bitbake)
 
             cmd = "tar -xz --no-same-owner -f %s -C %s" % (thefile, cargo_bitbake)
 
diff --git a/lib/bb/fetch2/git.py b/lib/bb/fetch2/git.py
index 4385d0b3..c7ed1f03 100644
--- a/lib/bb/fetch2/git.py
+++ b/lib/bb/fetch2/git.py
@@ -589,6 +589,8 @@  class Git(FetchMethod):
         destdir = ud.destdir = os.path.join(destdir, destsuffix)
         if os.path.exists(destdir):
             bb.utils.prunedir(destdir)
+        if not ud.bareclone:
+            ud.unpack_tracer.unpack("git", destdir)
 
         need_lfs = self._need_lfs(ud)
 
diff --git a/lib/bb/fetch2/gitsm.py b/lib/bb/fetch2/gitsm.py
index a87361cc..f7f3af72 100644
--- a/lib/bb/fetch2/gitsm.py
+++ b/lib/bb/fetch2/gitsm.py
@@ -218,6 +218,10 @@  class GitSM(Git):
 
             try:
                 newfetch = Fetch([url], d, cache=False)
+                # modpath is needed by unpack tracer to calculate submodule
+                # checkout dir
+                new_ud = newfetch.ud[url]
+                new_ud.modpath = modpath
                 newfetch.unpack(root=os.path.dirname(os.path.join(repo_conf, 'modules', module)))
             except Exception as e:
                 logger.error('gitsm: submodule unpack failed: %s %s' % (type(e).__name__, str(e)))
diff --git a/lib/bb/fetch2/hg.py b/lib/bb/fetch2/hg.py
index 063e1300..cbff8c49 100644
--- a/lib/bb/fetch2/hg.py
+++ b/lib/bb/fetch2/hg.py
@@ -242,6 +242,7 @@  class Hg(FetchMethod):
         revflag = "-r %s" % ud.revision
         subdir = ud.parm.get("destsuffix", ud.module)
         codir = "%s/%s" % (destdir, subdir)
+        ud.unpack_tracer.unpack("hg", codir)
 
         scmdata = ud.parm.get("scmdata", "")
         if scmdata != "nokeep":
diff --git a/lib/bb/fetch2/npm.py b/lib/bb/fetch2/npm.py
index f83485ad..15f3f19b 100644
--- a/lib/bb/fetch2/npm.py
+++ b/lib/bb/fetch2/npm.py
@@ -298,6 +298,7 @@  class Npm(FetchMethod):
         destsuffix = ud.parm.get("destsuffix", "npm")
         destdir = os.path.join(rootdir, destsuffix)
         npm_unpack(ud.localpath, destdir, d)
+        ud.unpack_tracer.unpack("npm", destdir)
 
     def clean(self, ud, d):
         """Clean any existing full or partial download"""
diff --git a/lib/bb/fetch2/npmsw.py b/lib/bb/fetch2/npmsw.py
index 4ff2c8ff..ff5f8dc7 100644
--- a/lib/bb/fetch2/npmsw.py
+++ b/lib/bb/fetch2/npmsw.py
@@ -191,7 +191,9 @@  class NpmShrinkWrap(FetchMethod):
             else:
                 raise ParameterError("Unsupported dependency: %s" % name, ud.url)
 
+            # name is needed by unpack tracer for module mapping
             ud.deps.append({
+                "name": name,
                 "url": url,
                 "localpath": localpath,
                 "extrapaths": extrapaths,
@@ -270,6 +272,7 @@  class NpmShrinkWrap(FetchMethod):
         destsuffix = ud.parm.get("destsuffix")
         if destsuffix:
             destdir = os.path.join(rootdir, destsuffix)
+        ud.unpack_tracer.unpack("npm-shrinkwrap", destdir)
 
         bb.utils.mkdirhier(destdir)
         bb.utils.copyfile(ud.shrinkwrap_file,