diff mbox series

fetch2: add Google Cloud Platform (GCP) fetcher

Message ID 20230731133444.713728-1-eekmecic@snap.com
State Accepted, archived
Commit 8e7e5719c1de79eb488732818871add3a6fc238b
Headers show
Series fetch2: add Google Cloud Platform (GCP) fetcher | expand

Commit Message

Emil Ekmečić July 31, 2023, 1:34 p.m. UTC
From: Emil Ekmečić <eekmecic@snap.com>

This fetcher allows BitBake to fetch from a Google Cloud Storage
bucket. The fetcher expects a gs:// URI of the following form:

SSTATE_MIRRORS = "file://.* gs://<bucket name>/PATH"

In addition, a GCP project name must be specified using the
GCP_PROJECT_NAME variable like so:

GCP_PROJECT_NAME = "my-project"

The fetcher uses the Google Cloud Storage Python Client, and
expects it to be installed, configured, and authenticated prior
to use.

Signed-off-by: Emil Ekmečić <eekmecic@snap.com>
---
 lib/bb/fetch2/__init__.py |   4 +-
 lib/bb/fetch2/gcp.py      | 108 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 lib/bb/fetch2/gcp.py

Comments

Alexander Kanavin July 31, 2023, 2:08 p.m. UTC | #1
There are a number of expectations that any new fetcher should either
fulfil, or explain why it doesn't apply:
https://git.yoctoproject.org/poky/tree/bitbake/lib/bb/fetch2/README

Can you please go over that list, and provide answers to the points in it?

Alex

On Mon, 31 Jul 2023 at 15:34, eekmecic via lists.openembedded.org
<eekmecic=snap.com@lists.openembedded.org> wrote:
>
> From: Emil Ekmečić <eekmecic@snap.com>
>
> This fetcher allows BitBake to fetch from a Google Cloud Storage
> bucket. The fetcher expects a gs:// URI of the following form:
>
> SSTATE_MIRRORS = "file://.* gs://<bucket name>/PATH"
>
> In addition, a GCP project name must be specified using the
> GCP_PROJECT_NAME variable like so:
>
> GCP_PROJECT_NAME = "my-project"
>
> The fetcher uses the Google Cloud Storage Python Client, and
> expects it to be installed, configured, and authenticated prior
> to use.
>
> Signed-off-by: Emil Ekmečić <eekmecic@snap.com>
> ---
>  lib/bb/fetch2/__init__.py |   4 +-
>  lib/bb/fetch2/gcp.py      | 108 ++++++++++++++++++++++++++++++++++++++
>  2 files changed, 111 insertions(+), 1 deletion(-)
>  create mode 100644 lib/bb/fetch2/gcp.py
>
> diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
> index 8afe012e..0a3d7a58 100644
> --- a/lib/bb/fetch2/__init__.py
> +++ b/lib/bb/fetch2/__init__.py
> @@ -1290,7 +1290,7 @@ class FetchData(object):
>
>              if checksum_name in self.parm:
>                  checksum_expected = self.parm[checksum_name]
> -            elif self.type not in ["http", "https", "ftp", "ftps", "sftp", "s3", "az", "crate"]:
> +            elif self.type not in ["http", "https", "ftp", "ftps", "sftp", "s3", "az", "crate", "gs"]:
>                  checksum_expected = None
>              else:
>                  checksum_expected = d.getVarFlag("SRC_URI", checksum_name)
> @@ -1973,6 +1973,7 @@ from . import npm
>  from . import npmsw
>  from . import az
>  from . import crate
> +from . import gcp
>
>  methods.append(local.Local())
>  methods.append(wget.Wget())
> @@ -1994,3 +1995,4 @@ methods.append(npm.Npm())
>  methods.append(npmsw.NpmShrinkWrap())
>  methods.append(az.Az())
>  methods.append(crate.Crate())
> +methods.append(gcp.GCP())
> diff --git a/lib/bb/fetch2/gcp.py b/lib/bb/fetch2/gcp.py
> new file mode 100644
> index 00000000..7431ea4d
> --- /dev/null
> +++ b/lib/bb/fetch2/gcp.py
> @@ -0,0 +1,108 @@
> +"""
> +BitBake 'Fetch' implementation for Google Cloup Platform Storage.
> +
> +Class for fetching files from Google Cloud Storage using the
> +Google Cloud Storage Python Client. The GCS Python Client must
> +be correctly installed, configured and authenticated prior to use.
> +Additionally, gsutil must also be installed.
> +
> +"""
> +
> +# Copyright (C) 2023, Snap Inc.
> +#
> +# Based in part on bb.fetch2.s3:
> +#    Copyright (C) 2017 Andre McCurdy
> +#
> +# SPDX-License-Identifier: GPL-2.0-only
> +#
> +# Based on functions from the base bb module, Copyright 2003 Holger Schurig
> +
> +import os
> +import bb
> +import urllib.parse, urllib.error
> +from bb.fetch2 import FetchMethod
> +from bb.fetch2 import FetchError
> +from bb.fetch2 import logger
> +from google.cloud import storage
> +
> +class GCP(FetchMethod):
> +    """
> +    Class to fetch urls via GCP's Python API.
> +    """
> +    def __init__(self):
> +        self.gcp_client = None
> +
> +    def init(self, d):
> +        """
> +        Initialize GCP client with the correct project name.
> +        """
> +        self.get_gcp_client(d)
> +
> +    def supports(self, ud, d):
> +        """
> +        Check to see if a given url can be fetched with GCP.
> +        """
> +        return ud.type in ['gs']
> +
> +    def recommends_checksum(self, urldata):
> +        return True
> +
> +    def urldata_init(self, ud, d):
> +        if 'downloadfilename' in ud.parm:
> +            ud.basename = ud.parm['downloadfilename']
> +        else:
> +            ud.basename = os.path.basename(ud.path)
> +
> +        ud.localfile = d.expand(urllib.parse.unquote(ud.basename))
> +
> +    def get_gcp_client(self, d):
> +        project = d.getVar("GCP_PROJECT_NAME") or ""
> +        if project == "":
> +            raise FetchError(f"No GCP project was specified using the GCP_PROJECT_NAME variable, unable to initialize GCP client!")
> +        logger.debug2(f"Trying to get GCP client for GCP project '{project}'")
> +        self.gcp_client = storage.Client(project=project)
> +
> +    def download(self, ud, d):
> +        """
> +        Fetch urls using the GCP API.
> +        Assumes localpath was called first.
> +        """
> +        logger.debug2(f"Trying to download gs://{ud.host}{ud.path} to {ud.localpath}")
> +        if self.gcp_client is None:
> +            self.get_gcp_client(d)
> +
> +        bb.fetch2.check_network_access(d, "gsutil stat", ud.url)
> +
> +        # Path sometimes has leading slash, so strip it
> +        path = ud.path.lstrip("/")
> +        blob = self.gcp_client.bucket(ud.host).blob(path)
> +        blob.download_to_filename(ud.localpath)
> +
> +        # Additional sanity checks copied from the wget class (although there
> +        # are no known issues which mean these are required, treat the GCP API
> +        # tool with a little healthy suspicion).
> +        if not os.path.exists(ud.localpath):
> +            raise FetchError(f"The GCP API returned success for gs://{ud.host}{ud.path} but {ud.localpath} doesn't exist?!")
> +
> +        if os.path.getsize(ud.localpath) == 0:
> +            os.remove(ud.localpath)
> +            raise FetchError(f"The downloaded file for gs://{ud.host}{ud.path} resulted in a zero size file?! Deleting and failing since this isn't right.")
> +
> +        return True
> +
> +    def checkstatus(self, fetch, ud, d):
> +        """
> +        Check the status of a URL.
> +        """
> +        logger.debug2(f"Checking status of gs://{ud.host}{ud.path}")
> +        if self.gcp_client is None:
> +            self.get_gcp_client(d)
> +
> +        bb.fetch2.check_network_access(d, "gsutil stat", ud.url)
> +
> +        # Path sometimes has leading slash, so strip it
> +        path = ud.path.lstrip("/")
> +        if self.gcp_client.bucket(ud.host).blob(path).exists() == False:
> +            raise FetchError(f"The GCP API reported that gs://{ud.host}{ud.path} does not exist")
> +        else:
> +            return True
> --
> 2.40.1
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#14894): https://lists.openembedded.org/g/bitbake-devel/message/14894
> Mute This Topic: https://lists.openembedded.org/mt/100461456/1686489
> Group Owner: bitbake-devel+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/bitbake-devel/unsub [alex.kanavin@gmail.com]
> -=-=-=-=-=-=-=-=-=-=-=-
>
Richard Purdie July 31, 2023, 2:18 p.m. UTC | #2
On Mon, 2023-07-31 at 06:34 -0700, eekmecic via lists.openembedded.org
wrote:
> From: Emil Ekmečić <eekmecic@snap.com>
> 
> This fetcher allows BitBake to fetch from a Google Cloud Storage
> bucket. The fetcher expects a gs:// URI of the following form:
> 
> SSTATE_MIRRORS = "file://.* gs://<bucket name>/PATH"
> 
> In addition, a GCP project name must be specified using the
> GCP_PROJECT_NAME variable like so:
> 
> GCP_PROJECT_NAME = "my-project"

In addition to Alexander Kanavin's very valid question/comment, I
wondered about this name. It means you can only have one cloud storage
bucket enabled project wide. Would you expect to have different gs://
url namespaces in different recipes?

In other words, should the project name not be part of the url?

Variables separate from the URLs tend to hint at API issues.

Cheers,

Richard
diff mbox series

Patch

diff --git a/lib/bb/fetch2/__init__.py b/lib/bb/fetch2/__init__.py
index 8afe012e..0a3d7a58 100644
--- a/lib/bb/fetch2/__init__.py
+++ b/lib/bb/fetch2/__init__.py
@@ -1290,7 +1290,7 @@  class FetchData(object):
 
             if checksum_name in self.parm:
                 checksum_expected = self.parm[checksum_name]
-            elif self.type not in ["http", "https", "ftp", "ftps", "sftp", "s3", "az", "crate"]:
+            elif self.type not in ["http", "https", "ftp", "ftps", "sftp", "s3", "az", "crate", "gs"]:
                 checksum_expected = None
             else:
                 checksum_expected = d.getVarFlag("SRC_URI", checksum_name)
@@ -1973,6 +1973,7 @@  from . import npm
 from . import npmsw
 from . import az
 from . import crate
+from . import gcp
 
 methods.append(local.Local())
 methods.append(wget.Wget())
@@ -1994,3 +1995,4 @@  methods.append(npm.Npm())
 methods.append(npmsw.NpmShrinkWrap())
 methods.append(az.Az())
 methods.append(crate.Crate())
+methods.append(gcp.GCP())
diff --git a/lib/bb/fetch2/gcp.py b/lib/bb/fetch2/gcp.py
new file mode 100644
index 00000000..7431ea4d
--- /dev/null
+++ b/lib/bb/fetch2/gcp.py
@@ -0,0 +1,108 @@ 
+"""
+BitBake 'Fetch' implementation for Google Cloud Platform Storage.
+
+Class for fetching files from Google Cloud Storage using the
+Google Cloud Storage Python Client. The GCS Python Client must
+be correctly installed, configured and authenticated prior to use.
+Additionally, gsutil must also be installed.
+
+"""
+
+# Copyright (C) 2023, Snap Inc.
+#
+# Based in part on bb.fetch2.s3:
+#    Copyright (C) 2017 Andre McCurdy
+#
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Based on functions from the base bb module, Copyright 2003 Holger Schurig
+
+import os
+import bb
+import urllib.parse, urllib.error
+from bb.fetch2 import FetchMethod
+from bb.fetch2 import FetchError
+from bb.fetch2 import logger
+from google.cloud import storage
+
+class GCP(FetchMethod):
+    """
+    Class to fetch urls via GCP's Python API.
+    """
+    def __init__(self):
+        self.gcp_client = None
+
+    def init(self, d):
+        """
+        Initialize GCP client with the correct project name.
+        """
+        self.get_gcp_client(d)
+
+    def supports(self, ud, d):
+        """
+        Check to see if a given url can be fetched with GCP.
+        """
+        return ud.type in ['gs']
+
+    def recommends_checksum(self, urldata):
+        return True
+
+    def urldata_init(self, ud, d):
+        if 'downloadfilename' in ud.parm:
+            ud.basename = ud.parm['downloadfilename']
+        else:
+            ud.basename = os.path.basename(ud.path)
+
+        ud.localfile = d.expand(urllib.parse.unquote(ud.basename))
+
+    def get_gcp_client(self, d):
+        project = d.getVar("GCP_PROJECT_NAME") or ""
+        if project == "":
+            raise FetchError(f"No GCP project was specified using the GCP_PROJECT_NAME variable, unable to initialize GCP client!")
+        logger.debug2(f"Trying to get GCP client for GCP project '{project}'")
+        self.gcp_client = storage.Client(project=project)
+
+    def download(self, ud, d):
+        """
+        Fetch urls using the GCP API.
+        Assumes localpath was called first.
+        """
+        logger.debug2(f"Trying to download gs://{ud.host}{ud.path} to {ud.localpath}")
+        if self.gcp_client is None:
+            self.get_gcp_client(d)
+
+        bb.fetch2.check_network_access(d, "gsutil stat", ud.url)
+
+        # Path sometimes has leading slash, so strip it
+        path = ud.path.lstrip("/")
+        blob = self.gcp_client.bucket(ud.host).blob(path)
+        blob.download_to_filename(ud.localpath)
+
+        # Additional sanity checks copied from the wget class (although there
+        # are no known issues which mean these are required, treat the GCP API
+        # tool with a little healthy suspicion).
+        if not os.path.exists(ud.localpath):
+            raise FetchError(f"The GCP API returned success for gs://{ud.host}{ud.path} but {ud.localpath} doesn't exist?!")
+
+        if os.path.getsize(ud.localpath) == 0:
+            os.remove(ud.localpath)
+            raise FetchError(f"The downloaded file for gs://{ud.host}{ud.path} resulted in a zero size file?! Deleting and failing since this isn't right.")
+
+        return True
+
+    def checkstatus(self, fetch, ud, d):
+        """
+        Check the status of a URL.
+        """
+        logger.debug2(f"Checking status of gs://{ud.host}{ud.path}")
+        if self.gcp_client is None:
+            self.get_gcp_client(d)
+
+        bb.fetch2.check_network_access(d, "gsutil stat", ud.url)
+
+        # Path sometimes has leading slash, so strip it
+        path = ud.path.lstrip("/")
+        if self.gcp_client.bucket(ud.host).blob(path).exists() == False:
+            raise FetchError(f"The GCP API reported that gs://{ud.host}{ud.path} does not exist")
+        else:
+            return True