diff mbox series

sstate-cache-cleaner.py: Add a script for sstate cache cleaning

Message ID 20221121111102.5556-1-tomasz.dziendzielski@gmail.com
State New
Headers show
Series sstate-cache-cleaner.py: Add a script for sstate cache cleaning | expand

Commit Message

Tomasz Dziendzielski Nov. 21, 2022, 11:11 a.m. UTC
From: Mikolaj Lasota <mikolaj.lasota@protonmail.com>

The Bash script used at the moment takes too much time to calculate
obsolete sstate cache files. Let's try to rewrite the necessary logic in
Python and store intermediate data in memory rather than in temporary files.

Signed-off-by: Mikolaj Lasota <mikolaj.lasota@protonmail.com>
Signed-off-by: Tomasz Dziendzielski <tomasz.dziendzielski@gmail.com>
---
 scripts/sstate-cache-cleaner.py | 166 ++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100755 scripts/sstate-cache-cleaner.py

Comments

Randy MacLeod Jan. 26, 2023, 4:31 p.m. UTC | #1
On 2022-11-21 06:11, Tomasz Dziendzielski via lists.openembedded.org wrote:
> From: Mikolaj Lasota<mikolaj.lasota@protonmail.com>
>
> Bash script used at the moment takes too much time to calculate obsolete
> sstate cache files. Let's try to rewrite necessary logic in python and
> store intermediate data in memory rather than temporary files.


This seems like a nice improvement that got missed.

I'd test it and report on the speed-up but I have a million other
things to do so I'm just replying instead. What sort of performance
difference are you seeing?

Should we keep the old scripts/sstate-cache-management.sh
but make it a wrapper for this script or just remove it?

>
> Signed-off-by: Mikolaj Lasota<mikolaj.lasota@protonmail.com>
> Signed-off-by: Tomasz Dziendzielski<tomasz.dziendzielski@gmail.com>
> ---
>   scripts/sstate-cache-cleaner.py | 166 ++++++++++++++++++++++++++++++++
>   1 file changed, 166 insertions(+)
>   create mode 100755 scripts/sstate-cache-cleaner.py
>
> diff --git a/scripts/sstate-cache-cleaner.py b/scripts/sstate-cache-cleaner.py
> new file mode 100755
> index 0000000000..f01db35775
> --- /dev/null
> +++ b/scripts/sstate-cache-cleaner.py
> @@ -0,0 +1,166 @@
> +#!/usr/bin/env python3
> +
> +"""
> +This script is a python rewrite of poky based scripts/sstate-cache-management.sh
> +It has a subset of original script features - namely the ability to filter cache files by stamp files references.
> +The output is a list of unreferenced sstate-cache files - which are obsolete and can be removed.
> +
> +To test the script agains the original one (shell) one might create a small test environment:

  * against

../Randy


> + - create a local sstate-cache directory
> + - run two or more separate builds (different hashes/machines) using above dir (SSTATE_DIR)
> + - run original shell script using stamp dir from one of the above builds and the common cache dir
> + - run this script with the same arguments (same stamp & cache dirs)
> +"""
> +
> +import argparse
> +import fnmatch
> +import logging
> +import os
> +import re
> +import time
> +from functools import reduce
> +
> +formatter = logging.Formatter('%(asctime)s - %(funcName)s - %(levelname)s - %(message)s')
> +logger = logging.getLogger('sstate-cache-cleaner')
> +logger.setLevel(logging.DEBUG)
> +fh = logging.FileHandler('sstate-cache-cleaner.log', 'w')
> +fh.setLevel(logging.DEBUG)
> +fh.setFormatter(formatter)
> +ch = logging.StreamHandler()
> +ch.setLevel(logging.INFO)
> +ch.setFormatter(formatter)
> +logger.addHandler(fh)
> +logger.addHandler(ch)
> +
> +TIME = time.time()
> +ONE_DAY_IN_SECONDS = 86400
> +
> +def collect_sstate_cache_files(cache_dir):
> +    """ Collect all sstate-cache files form cache_dir and figure out accelerated tasks for cleaning. """
> +
> +    logger.info('Collecting sstate-cache files...')
> +
> +    sstate_tasks = set()
> +    cache_files = dict()
> +    cache_file_regex = re.compile(r'sstate.*:([^_]*)_(.*)\.tgz.*')
> +    for root, dirs, files in os.walk(cache_dir):
> +        for filename in files:
> +            if fnmatch.fnmatch(filename, 'sstate*'):
> +                match = cache_file_regex.match(filename)
> +                if match:
> +                    _hash = match.group(1)
> +                    _task = match.group(2)
> +                    sstate_tasks.add(_task)
> +                    f = os.path.join(root, filename)
> +                    try:
> +                        if os.stat(f).st_ctime < TIME - ONE_DAY_IN_SECONDS:
> +                            if _hash in cache_files:
> +                                cache_files[_hash].append(f)
> +                            else:
> +                                cache_files[_hash] = [f]
> +                    except FileNotFoundError as err:
> +                        logger.error(err)
> +
> +    num_of_files = reduce(lambda count, element: count + len(element), cache_files.values(), 0)
> +    num_of_hashes = len(cache_files)
> +    logger.info(f'Found {num_of_files} sstate files ({num_of_hashes} hashes)')
> +    return cache_files, sstate_tasks
> +
> +def collect_stamps(stamps_dirs_list, tasks):
> +    """ Collect hashes from the stamp files (only for tasks which were found in sstate-cache) """
> +
> +    logger.info('Collecting stamps...')
> +
> +    stamps = set()
> +    for stamps_dir in stamps_dirs_list:
> +        logger.debug(f'Looking for stamps in {stamps_dir}')
> +        for root, dirs, files in os.walk(stamps_dir):
> +            for filename in files:
> +                for task in tasks:
> +                    if fnmatch.fnmatch(filename, f'*.do_{task}_setscene.*'):
> +                        match = re.match(rf'.*\.do_{task}_setscene\.([^\.]*).*', filename)
> +                        if match:
> +                            stamps.add(match.group(1))
> +                    elif fnmatch.fnmatch(filename, f'*.do_{task}.*'):
> +                        match = re.match(rf'.*do_{task}(\.sigdata)?\.([^\.]*).*', filename)
> +                        if match:
> +                            stamps.add(match.group(2))
> +                    continue
> +
> +    logger.info(f'Found {len(stamps)} stamps')
> +    return stamps
> +
> +def compute_obsolete_sstate_cache_files(stamps, cache):
> +    """ Figure out which cache files are obsolete.
> +
> +    Check if a cache file is referenced by a stamp file. If yes - it is needed - and therefore should be filtered out
> +    from the processed list. The list which is returned is a list of files to be removed.
> +    """
> +
> +    logger.info('Filtering sstate-cache list for unreferenced (obsolete) files...')
> +
> +    num_stamps = len(stamps) - 1
> +    progress = -1
> +    for i, stamp in enumerate(stamps):
> +        _progress = int(i / num_stamps * 100)
> +        if _progress % 5 == 0 and _progress > progress:
> +            progress = _progress
> +            logger.debug(f'[{progress:3d}%] Cleaning stamp {i}/{num_stamps}')
> +        if stamp in cache:
> +            del cache[stamp]
> +
> +    num_of_files = reduce(lambda count, element: count + len(element), cache.values(), 0)
> +    logger.info(f'Found {num_of_files} sstate files to be removed')
> +    return cache
> +
> +def parse_arguments():
> +    """ Parse arguments for cache & stamp directories and output file name """
> +
> +    parser = argparse.ArgumentParser(
> +                        description='Sstate cache cleanup script. \
> +                                     Cache files which are not referenced by stamp files will be listed for removal.',
> +                        epilog='This is a python re-write of poky provided sstate-cache-management.sh script. \
> +                                Only stamp based cleaning is implemented.')
> +    parser.add_argument('--cache-dir', required=True,
> +                        help='Specify sstate-cache directory')
> +    parser.add_argument('--stamps-dir', required=True, nargs='+',
> +                        help='Specify stamps directories')
> +    parser.add_argument('--output-file', '-f', required=True,
> +                        help='Specify a file for script output - a list of obsolete sstate-cache files.')
> +
> +    logger.debug('Parsing arguments...')
> +    return parser.parse_args()
> +
> +def main():
> +    args = parse_arguments()
> +
> +    stamps_dirs_list = args.stamps_dir
> +    for i, path in enumerate(stamps_dirs_list):
> +        abs_path = os.path.abspath(path)
> +        if not os.path.isdir(abs_path):
> +            raise ValueError(f'Stamps directory doesn\'t exist: {abs_path} !')
> +        stamps_dirs_list[i] = abs_path
> +
> +    cache_dir = os.path.abspath(args.cache_dir)
> +    if not os.path.isdir(cache_dir):
> +        raise ValueError(f'Cache directory doesn\'t exist: {cache_dir} !')
> +
> +    output_file_path = os.path.abspath(args.output_file)
> +
> +    cache, tasks = collect_sstate_cache_files(cache_dir)
> +    stamps = collect_stamps(stamps_dirs_list, tasks)
> +
> +    obsolete_sstate = compute_obsolete_sstate_cache_files(stamps, cache)
> +    obsolete_sstate_files = [item for sublist in obsolete_sstate.values() for item in sublist]
> +
> +    if not os.path.isdir(os.path.dirname(output_file_path)):
> +        logger.warning(f'Output directory doesn\'t exist and will be created: {output_file_path}')
> +        os.makedirs(os.path.dirname(output_file_path))
> +
> +    with open(output_file_path, 'w') as out:
> +        out.write('\n'.join(obsolete_sstate_files))
> +
> +    logger.info(f'List of obsolete sstate-cache files saved: {output_file_path}')
> +
> +if __name__ == "__main__":
> +    main()
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#173649):https://lists.openembedded.org/g/openembedded-core/message/173649
> Mute This Topic:https://lists.openembedded.org/mt/95169760/3616765
> Group Owner:openembedded-core+owner@lists.openembedded.org
> Unsubscribe:https://lists.openembedded.org/g/openembedded-core/unsub  [randy.macleod@windriver.com]
> -=-=-=-=-=-=-=-=-=-=-=-
>
Richard Purdie Jan. 26, 2023, 4:40 p.m. UTC | #2
On Thu, 2023-01-26 at 11:31 -0500, Randy MacLeod wrote:
> On 2022-11-21 06:11, Tomasz Dziendzielski via lists.openembedded.org wrote:
>  
> 
> 
> 
> > From: Mikolaj Lasota <mikolaj.lasota@protonmail.com>
> > 
> > Bash script used at the moment takes too much time to calculate obsolete
> > sstate cache files. Let's try to rewrite necessary logic in python and
> > store intermediate data in memory rather than temporary files.
> 
> This seems like a nice improvement that got missed.
> I'd test it and report on the speed-up but I have a million other
>  things to do so I'm just replying instead. What sort of performance
>  difference are you seeing?
> Should we keep the old scripts/sstate-cache-management.sh
>  but make it a wrapper for this script or just remove it?

I've been asked about this a couple of times. I'm a lot happier with
this script in python so I like the idea. What worries me is this:

"""
This script is a python rewrite of poky based scripts/sstate-cache-
management.sh. It has a subset of original script features
"""

I really don't like having two of something, each with some set of
good/bad.

If the patch deletes the original script and suggests people add the
missing functionality, that improves things. It would be helpful to
know which functionality is missing too.

Cheers,

Richard
Tomasz Dziendzielski Feb. 2, 2023, 11:56 a.m. UTC | #3
Hi,

>As for the achieved performance improvement, I don't have access to exact
data (time, sstate cache size) anymore - maybe Tomasz will be able to find
original findings.
>However, from the top of my head I can say that in our case the original
version of the script was not able to finish the cleanup in a couple of days
(and finally got killed), while the proposed version took just a couple of
hours (~4h).

The internal ticket only has a comment "CI execution time reduced from ~16
days to 5 minutes". This looks like a big improvement.

Best regards,
Tomasz Dziendzielski
diff mbox series

Patch

diff --git a/scripts/sstate-cache-cleaner.py b/scripts/sstate-cache-cleaner.py
new file mode 100755
index 0000000000..f01db35775
--- /dev/null
+++ b/scripts/sstate-cache-cleaner.py
@@ -0,0 +1,166 @@ 
#!/usr/bin/env python3

"""
This script is a python rewrite of poky based scripts/sstate-cache-management.sh
It has a subset of original script features - namely the ability to filter cache files by stamp files references.
The output is a list of unreferenced sstate-cache files - which are obsolete and can be removed.

To test the script against the original one (shell) one might create a small test environment:
 - create a local sstate-cache directory
 - run two or more separate builds (different hashes/machines) using above dir (SSTATE_DIR)
 - run original shell script using stamp dir from one of the above builds and the common cache dir
 - run this script with the same arguments (same stamp & cache dirs)
"""
+
import argparse
import fnmatch
import logging
import os
import re
import time
from functools import reduce

# Logging setup: everything (DEBUG and up) goes to a log file in the current
# working directory, while only INFO and up is echoed to the console.
formatter = logging.Formatter('%(asctime)s - %(funcName)s - %(levelname)s - %(message)s')
logger = logging.getLogger('sstate-cache-cleaner')
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler('sstate-cache-cleaner.log', 'w')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(ch)

# Reference time, taken once at import; cache files modified less than one
# day before this moment are never considered for removal (see
# collect_sstate_cache_files), so archives still being written by a
# concurrent build are left alone.
TIME = time.time()
ONE_DAY_IN_SECONDS = 86400
+
def collect_sstate_cache_files(cache_dir):
    """ Collect all sstate-cache files from cache_dir and figure out accelerated tasks for cleaning.

    Walks cache_dir recursively and groups cache files by their signature
    hash.  Files modified within the last day are skipped so that archives a
    concurrent build may still be writing are never flagged for removal.

    Returns:
        (cache_files, sstate_tasks) where cache_files maps hash -> list of
        file paths sharing that hash and sstate_tasks is the set of task
        names seen in the cache.
    """

    logger.info('Collecting sstate-cache files...')

    sstate_tasks = set()
    cache_files = dict()
    # Hash and task are encoded in the file name, e.g.
    # sstate:<recipe>:<...>:<hash>_<task>.tgz[.siginfo]
    cache_file_regex = re.compile(r'sstate.*:([^_]*)_(.*)\.tgz.*')
    # Loop-invariant: only files older than this timestamp are candidates.
    age_threshold = TIME - ONE_DAY_IN_SECONDS
    for root, _dirs, files in os.walk(cache_dir):
        for filename in files:
            # The anchored regex already requires the 'sstate' prefix, so the
            # former fnmatch('sstate*') pre-filter was redundant and has been
            # dropped.
            match = cache_file_regex.match(filename)
            if not match:
                continue
            _hash = match.group(1)
            _task = match.group(2)
            sstate_tasks.add(_task)
            f = os.path.join(root, filename)
            try:
                if os.stat(f).st_ctime < age_threshold:
                    cache_files.setdefault(_hash, []).append(f)
            except FileNotFoundError as err:
                # The file may vanish between os.walk() and os.stat()
                # (e.g. a parallel cleanup); log and carry on.
                logger.error(err)

    num_of_files = sum(len(paths) for paths in cache_files.values())
    num_of_hashes = len(cache_files)
    logger.info(f'Found {num_of_files} sstate files ({num_of_hashes} hashes)')
    return cache_files, sstate_tasks
+
def collect_stamps(stamps_dirs_list, tasks):
    """ Collect hashes from the stamp files (only for tasks which were found in sstate-cache)

    Args:
        stamps_dirs_list: stamp directories to walk recursively.
        tasks: set of task names (without the 'do_' prefix) to look for.

    Returns:
        Set of signature hashes referenced by the stamp files.
    """

    logger.info('Collecting stamps...')

    # Pre-compile one regex pair per task: the patterns are loop-invariant
    # and were previously rebuilt for every (file, task) combination.
    matchers = [
        (task,
         re.compile(rf'.*\.do_{task}_setscene\.([^\.]*).*'),
         re.compile(rf'.*do_{task}(\.sigdata)?\.([^\.]*).*'))
        for task in tasks
    ]

    stamps = set()
    for stamps_dir in stamps_dirs_list:
        logger.debug(f'Looking for stamps in {stamps_dir}')
        for root, _dirs, files in os.walk(stamps_dir):
            for filename in files:
                for task, setscene_regex, plain_regex in matchers:
                    # Setscene stamps are checked first: their hash sits in a
                    # different position than in regular task stamps.
                    if fnmatch.fnmatch(filename, f'*.do_{task}_setscene.*'):
                        match = setscene_regex.match(filename)
                        if match:
                            stamps.add(match.group(1))
                    elif fnmatch.fnmatch(filename, f'*.do_{task}.*'):
                        match = plain_regex.match(filename)
                        if match:
                            stamps.add(match.group(2))
                    # NOTE: the original trailing 'continue' here was a no-op
                    # (last statement of the loop body) and has been removed.

    logger.info(f'Found {len(stamps)} stamps')
    return stamps
+
def compute_obsolete_sstate_cache_files(stamps, cache):
    """ Figure out which cache files are obsolete.

    Check if a cache file is referenced by a stamp file. If yes - it is needed - and therefore should be filtered out
    from the processed list. The list which is returned is a list of files to be removed.

    Args:
        stamps: set of signature hashes referenced by stamp files.
        cache: dict mapping hash -> list of cache file paths (mutated in place).

    Returns:
        The same dict, with every referenced hash removed.
    """

    logger.info('Filtering sstate-cache list for unreferenced (obsolete) files...')

    # max(..., 1) guards against ZeroDivisionError in the progress
    # computation when there is exactly one stamp (len(stamps) - 1 == 0).
    num_stamps = max(len(stamps) - 1, 1)
    progress = -1
    for i, stamp in enumerate(stamps):
        _progress = int(i / num_stamps * 100)
        # Log at most once per 5% step to keep the debug log small.
        if _progress % 5 == 0 and _progress > progress:
            progress = _progress
            logger.debug(f'[{progress:3d}%] Cleaning stamp {i}/{num_stamps}')
        # A hash referenced by a stamp means its cache files are still needed.
        cache.pop(stamp, None)

    num_of_files = sum(len(paths) for paths in cache.values())
    logger.info(f'Found {num_of_files} sstate files to be removed')
    return cache
+
def parse_arguments():
    """ Parse arguments for cache & stamp directories and output file name

    Returns:
        argparse.Namespace with cache_dir (str), stamps_dir (list of str)
        and output_file (str) attributes; all three options are mandatory.
    """

    parser = argparse.ArgumentParser(
                        description='Sstate cache cleanup script. \
                                     Cache files which are not referenced by stamp files will be listed for removal.',
                        epilog='This is a python re-write of poky provided sstate-cache-management.sh script. \
                                Only stamp based cleaning is implemented.')
    parser.add_argument('--cache-dir', required=True,
                        help='Specify sstate-cache directory')
    # nargs='+' allows passing several stamp directories, e.g. from multiple
    # builds that share one sstate cache.
    parser.add_argument('--stamps-dir', required=True, nargs='+',
                        help='Specify stamps directories')
    parser.add_argument('--output-file', '-f', required=True,
                        help='Specify a file for script output - a list of obsolete sstate-cache files.')

    logger.debug('Parsing arguments...')
    return parser.parse_args()
+
def main():
    """ Entry point: validate arguments, collect cache/stamp data and write
    the list of obsolete sstate-cache files to the requested output file. """
    args = parse_arguments()

    # Normalize every stamp directory to an absolute path and fail early if
    # any of them does not exist.
    stamps_dirs_list = args.stamps_dir
    for i, path in enumerate(stamps_dirs_list):
        abs_path = os.path.abspath(path)
        if not os.path.isdir(abs_path):
            raise ValueError(f'Stamps directory doesn\'t exist: {abs_path} !')
        stamps_dirs_list[i] = abs_path

    cache_dir = os.path.abspath(args.cache_dir)
    if not os.path.isdir(cache_dir):
        raise ValueError(f'Cache directory doesn\'t exist: {cache_dir} !')

    output_file_path = os.path.abspath(args.output_file)

    cache, tasks = collect_sstate_cache_files(cache_dir)
    stamps = collect_stamps(stamps_dirs_list, tasks)

    obsolete_sstate = compute_obsolete_sstate_cache_files(stamps, cache)
    # Flatten the hash -> [files] mapping into a plain list of paths.
    obsolete_sstate_files = [item for sublist in obsolete_sstate.values() for item in sublist]

    output_dir = os.path.dirname(output_file_path)
    if not os.path.isdir(output_dir):
        # Fix: log the directory that will be created, not the output file
        # path (the original message printed output_file_path here).
        logger.warning(f'Output directory doesn\'t exist and will be created: {output_dir}')
        # exist_ok avoids a race if the directory appears in the meantime.
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file_path, 'w') as out:
        out.write('\n'.join(obsolete_sstate_files))

    logger.info(f'List of obsolete sstate-cache files saved: {output_file_path}')

if __name__ == "__main__":
    main()