diff mbox series

[v4,15/35] tests/functional: enable pre-emptive caching of assets

Message ID 20240821082748.65853-16-thuth@redhat.com
State New
Headers show
Series Convert avocado tests to normal Python unittests | expand

Commit Message

Thomas Huth Aug. 21, 2024, 8:27 a.m. UTC
From: Daniel P. Berrangé <berrange@redhat.com>

Many tests need to access assets stored on remote sites. We don't want
to download these during test execution when run by meson, since this
risks hitting test timeouts when data transfers are slow.

Add support for pre-emptive caching of assets by setting the env var
QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
instead of running the test, the assets will be downloaded and saved
to the cache, then the timestamp file created.

A meson custom target is created as a dependency of each test suite
to trigger the pre-emptive caching logic before the test runs.

When run in caching mode, it will locate assets by looking for class
level variables with a name prefix "ASSET_", and type "Asset".

At the ninja level:

   ninja test --suite functional

will speculatively download any assets that are not already cached,
so it is advisable to set a timeout multiplier.

   QEMU_TEST_NO_DOWNLOAD=1 ninja test --suite functional

will fail the test if a required asset is not already cached.

   ninja precache-functional

will download and cache all assets required by the functional
tests.

At the make level, precaching is always done by:

   make check-functional

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
Tested-by: Richard Henderson <richard.henderson@linaro.org>
[thuth: Remove the duplicated "path = os.path.basename(...)" line]
Signed-off-by: Thomas Huth <thuth@redhat.com>
---
 tests/Makefile.include                 |  3 ++-
 tests/functional/meson.build           | 33 +++++++++++++++++++++++--
 tests/functional/qemu_test/asset.py    | 34 ++++++++++++++++++++++++++
 tests/functional/qemu_test/testcase.py |  7 ++++++
 4 files changed, 74 insertions(+), 3 deletions(-)

Comments

Philippe Mathieu-Daudé Aug. 23, 2024, 7:28 a.m. UTC | #1
Hi,

On 21/8/24 10:27, Thomas Huth wrote:
> From: Daniel P. Berrangé <berrange@redhat.com>
> 
> Many tests need to access assets stored on remote sites. We don't want
> to download these during test execution when run by meson, since this
> risks hitting test timeouts when data transfers are slow.
> 
> Add support for pre-emptive caching of assets by setting the env var
> QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
> instead of running the test, the assets will be downloaded and saved
> to the cache, then the timestamp file created.
> 
> A meson custom target is created as a dependency of each test suite
> to trigger the pre-emptive caching logic before the test runs.
> 
> When run in caching mode, it will locate assets by looking for class
> level variables with a name prefix "ASSET_", and type "Asset".
> 
> At the ninja level
> 
>     ninja test --suite functional
> 
> will speculatively download any assets that are not already cached,
> so it is advisable to set a timeout multiplier.
> 
>     QEMU_TEST_NO_DOWNLOAD=1 ninja test --suite functional
> 
> will fail the test if a required asset is not already cached
> 
>     ninja precache-functional
> 
> will download and cache all assets required by the functional
> tests
> 
> At the make level, precaching is always done by
> 
>     make check-functional
> 
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> Tested-by: Richard Henderson <richard.henderson@linaro.org>
> [thuth: Remove the duplicated "path = os.path.basename(...)" line]
> Signed-off-by: Thomas Huth <thuth@redhat.com>
> ---
>   tests/Makefile.include                 |  3 ++-
>   tests/functional/meson.build           | 33 +++++++++++++++++++++++--
>   tests/functional/qemu_test/asset.py    | 34 ++++++++++++++++++++++++++
>   tests/functional/qemu_test/testcase.py |  7 ++++++
>   4 files changed, 74 insertions(+), 3 deletions(-)
> 
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index 66c8cc3123..010369bd3a 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -161,7 +161,8 @@ $(FUNCTIONAL_TARGETS):
>   
>   .PHONY: check-functional
>   check-functional:
> -	@$(MAKE) SPEED=thorough check-func check-func-quick
> +	@$(NINJA) precache-functional
> +	@QEMU_TEST_NO_DOWNLOAD=1 $(MAKE) SPEED=thorough check-func check-func-quick
>   
>   # Consolidated targets
>   
> diff --git a/tests/functional/meson.build b/tests/functional/meson.build
> index 8a8fa0ab99..cef74b82a9 100644
> --- a/tests/functional/meson.build
> +++ b/tests/functional/meson.build
> @@ -33,6 +33,7 @@ tests_x86_64_quick = [
>   tests_x86_64_thorough = [
>   ]
>   
> +precache_all = []
>   foreach speed : ['quick', 'thorough']
>     foreach dir : target_dirs
>       if not dir.endswith('-softmmu')
> @@ -63,11 +64,35 @@ foreach speed : ['quick', 'thorough']
>                                  meson.current_source_dir())
>   
>       foreach test : target_tests
> -      test('func-@0@/@1@'.format(target_base, test),
> +      testname = '@0@-@1@'.format(target_base, test)
> +      testfile = 'test_' + test + '.py'
> +      testpath = meson.current_source_dir() / testfile
> +      teststamp = testname + '.tstamp'
> +      test_precache_env = environment()
> +      test_precache_env.set('QEMU_TEST_PRECACHE', meson.current_build_dir() / teststamp)
> +      test_precache_env.set('PYTHONPATH', meson.project_source_root() / 'python:' +
> +                                          meson.current_source_dir())
> +      precache = custom_target('func-precache-' + testname,
> +                               output: teststamp,
> +                               command: [python, testpath],
> +                               depend_files: files(testpath),
> +                               build_by_default: false,
> +                               env: test_precache_env)
> +      precache_all += precache
> +
> +      # Ideally we would add 'precache' to 'depends' here, such that
> +      # 'build_by_default: false' lets the pre-caching automatically
> +      # run immediately before the test runs. In practice this is
> +      # broken in meson, with it running the pre-caching in the normal
> +      # compile phase https://github.com/mesonbuild/meson/issues/2518
> +      # If the above bug ever gets fixed, when QEMU changes the min
> +      # meson version, add the 'depends' and remove the custom
> +      # 'run_target' logic below & in Makefile.include
> +      test('func-' + testname,
>              python,
>              depends: [test_deps, test_emulator, emulator_modules],
>              env: test_env,
> -           args: [meson.current_source_dir() / 'test_' + test + '.py'],
> +           args: [testpath],
>              protocol: 'tap',
>              timeout: test_timeouts.get(test, 60),
>              priority: test_timeouts.get(test, 60),
> @@ -75,3 +100,7 @@ foreach speed : ['quick', 'thorough']
>       endforeach
>     endforeach
>   endforeach
> +
> +run_target('precache-functional',
> +           depends: precache_all,
> +           command: ['true'])
> diff --git a/tests/functional/qemu_test/asset.py b/tests/functional/qemu_test/asset.py
> index cbeb6278af..9250ff2b06 100644
> --- a/tests/functional/qemu_test/asset.py
> +++ b/tests/functional/qemu_test/asset.py
> @@ -9,6 +9,8 @@
>   import logging
>   import os
>   import subprocess
> +import sys
> +import unittest
>   import urllib.request
>   from pathlib import Path
>   from shutil import copyfileobj
> @@ -62,6 +64,9 @@ def fetch(self):
>                              self.cache_file, self.url)
>               return str(self.cache_file)
>   
> +        if os.environ.get("QEMU_TEST_NO_DOWNLOAD", False):
> +            raise Exception("Asset cache is invalid and downloads disabled")
> +
>           self.log.info("Downloading %s to %s...", self.url, self.cache_file)
>           tmp_cache_file = self.cache_file.with_suffix(".download")
>   
> @@ -95,3 +100,32 @@ def fetch(self):
>   
>           self.log.info("Cached %s at %s" % (self.url, self.cache_file))
>           return str(self.cache_file)
> +
> +    def precache_test(test):
> +        log = logging.getLogger('qemu-test')
> +        log.setLevel(logging.DEBUG)
> +        handler = logging.StreamHandler(sys.stdout)
> +        handler.setLevel(logging.DEBUG)
> +        formatter = logging.Formatter(
> +            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
> +        handler.setFormatter(formatter)
> +        log.addHandler(handler)
> +        for name, asset in vars(test.__class__).items():
> +            if name.startswith("ASSET_") and type(asset) == Asset:
> +                log.info("Attempting to cache '%s'" % asset)
> +                asset.fetch()
> +        log.removeHandler(handler)
> +
> +    def precache_suite(suite):
> +        for test in suite:
> +            if isinstance(test, unittest.TestSuite):
> +                Asset.precache_suite(test)
> +            elif isinstance(test, unittest.TestCase):
> +                Asset.precache_test(test)
> +
> +    def precache_suites(path, cacheTstamp):
> +        loader = unittest.loader.defaultTestLoader
> +        tests = loader.loadTestsFromNames([path], None)
> +
> +        with open(cacheTstamp, "w") as fh:
> +            Asset.precache_suite(tests)


When using multiple jobs (-jN) I'm observing some hangs,
apparently multiple threads trying to download the same file.
The files are eventually downloaded successfully but it takes
longer. Should we acquire some exclusive lock somewhere?

> diff --git a/tests/functional/qemu_test/testcase.py b/tests/functional/qemu_test/testcase.py
> index b2dd863c6e..18314be9d1 100644
> --- a/tests/functional/qemu_test/testcase.py
> +++ b/tests/functional/qemu_test/testcase.py
> @@ -21,6 +21,7 @@
>   from qemu.machine import QEMUMachine
>   from qemu.utils import kvm_available, tcg_available
>   
> +from .asset import Asset
>   from .cmd import run_cmd
>   from .config import BUILD_DIR
>   
> @@ -58,6 +59,12 @@ def tearDown(self):
>   
>       def main():
>           path = os.path.basename(sys.argv[0])[:-3]
> +
> +        cache = os.environ.get("QEMU_TEST_PRECACHE", None)
> +        if cache is not None:
> +            Asset.precache_suites(path, cache)
> +            return
> +
>           tr = pycotap.TAPTestRunner(message_log = pycotap.LogMode.LogToError,
>                                      test_output_log = pycotap.LogMode.LogToError)
>           unittest.main(module = None, testRunner = tr, argv=["__dummy__", path])
Thomas Huth Aug. 27, 2024, 1:16 p.m. UTC | #2
On 23/08/2024 09.28, Philippe Mathieu-Daudé wrote:
> Hi,
> 
> On 21/8/24 10:27, Thomas Huth wrote:
>> From: Daniel P. Berrangé <berrange@redhat.com>
>>
>> Many tests need to access assets stored on remote sites. We don't want
>> to download these during test execution when run by meson, since this
>> risks hitting test timeouts when data transfers are slow.
>>
>> Add support for pre-emptive caching of assets by setting the env var
>> QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
>> instead of running the test, the assets will be downloaded and saved
>> to the cache, then the timestamp file created.
>>
>> A meson custom target is created as a dependency of each test suite
>> to trigger the pre-emptive caching logic before the test runs.
>>
>> When run in caching mode, it will locate assets by looking for class
>> level variables with a name prefix "ASSET_", and type "Asset".
>>
>> At the ninja level
>>
>>     ninja test --suite functional
>>
>> will speculatively download any assets that are not already cached,
>> so it is advisable to set a timeout multiplier.
>>
>>     QEMU_TEST_NO_DOWNLOAD=1 ninja test --suite functional
>>
>> will fail the test if a required asset is not already cached
>>
>>     ninja precache-functional
>>
>> will download and cache all assets required by the functional
>> tests
>>
>> At the make level, precaching is always done by
>>
>>     make check-functional
>>
>> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
>> Tested-by: Richard Henderson <richard.henderson@linaro.org>
>> [thuth: Remove the duplicated "path = os.path.basename(...)" line]
>> Signed-off-by: Thomas Huth <thuth@redhat.com>
>> ---
>>   tests/Makefile.include                 |  3 ++-
>>   tests/functional/meson.build           | 33 +++++++++++++++++++++++--
>>   tests/functional/qemu_test/asset.py    | 34 ++++++++++++++++++++++++++
>>   tests/functional/qemu_test/testcase.py |  7 ++++++
>>   4 files changed, 74 insertions(+), 3 deletions(-)
>>
>> diff --git a/tests/Makefile.include b/tests/Makefile.include
>> index 66c8cc3123..010369bd3a 100644
>> --- a/tests/Makefile.include
>> +++ b/tests/Makefile.include
>> @@ -161,7 +161,8 @@ $(FUNCTIONAL_TARGETS):
>>   .PHONY: check-functional
>>   check-functional:
>> -    @$(MAKE) SPEED=thorough check-func check-func-quick
>> +    @$(NINJA) precache-functional
>> +    @QEMU_TEST_NO_DOWNLOAD=1 $(MAKE) SPEED=thorough check-func 
>> check-func-quick
>>   # Consolidated targets
>> diff --git a/tests/functional/meson.build b/tests/functional/meson.build
>> index 8a8fa0ab99..cef74b82a9 100644
>> --- a/tests/functional/meson.build
>> +++ b/tests/functional/meson.build
>> @@ -33,6 +33,7 @@ tests_x86_64_quick = [
>>   tests_x86_64_thorough = [
>>   ]
>> +precache_all = []
>>   foreach speed : ['quick', 'thorough']
>>     foreach dir : target_dirs
>>       if not dir.endswith('-softmmu')
>> @@ -63,11 +64,35 @@ foreach speed : ['quick', 'thorough']
>>                                  meson.current_source_dir())
>>       foreach test : target_tests
>> -      test('func-@0@/@1@'.format(target_base, test),
>> +      testname = '@0@-@1@'.format(target_base, test)
>> +      testfile = 'test_' + test + '.py'
>> +      testpath = meson.current_source_dir() / testfile
>> +      teststamp = testname + '.tstamp'
>> +      test_precache_env = environment()
>> +      test_precache_env.set('QEMU_TEST_PRECACHE', 
>> meson.current_build_dir() / teststamp)
>> +      test_precache_env.set('PYTHONPATH', meson.project_source_root() / 
>> 'python:' +
>> +                                          meson.current_source_dir())
>> +      precache = custom_target('func-precache-' + testname,
>> +                               output: teststamp,
>> +                               command: [python, testpath],
>> +                               depend_files: files(testpath),
>> +                               build_by_default: false,
>> +                               env: test_precache_env)
>> +      precache_all += precache
>> +
>> +      # Ideally we would add 'precache' to 'depends' here, such that
>> +      # 'build_by_default: false' lets the pre-caching automatically
>> +      # run immediately before the test runs. In practice this is
>> +      # broken in meson, with it running the pre-caching in the normal
>> +      # compile phase https://github.com/mesonbuild/meson/issues/2518
>> +      # If the above bug ever gets fixed, when QEMU changes the min
>> +      # meson version, add the 'depends' and remove the custom
>> +      # 'run_target' logic below & in Makefile.include
>> +      test('func-' + testname,
>>              python,
>>              depends: [test_deps, test_emulator, emulator_modules],
>>              env: test_env,
>> -           args: [meson.current_source_dir() / 'test_' + test + '.py'],
>> +           args: [testpath],
>>              protocol: 'tap',
>>              timeout: test_timeouts.get(test, 60),
>>              priority: test_timeouts.get(test, 60),
>> @@ -75,3 +100,7 @@ foreach speed : ['quick', 'thorough']
>>       endforeach
>>     endforeach
>>   endforeach
>> +
>> +run_target('precache-functional',
>> +           depends: precache_all,
>> +           command: ['true'])
>> diff --git a/tests/functional/qemu_test/asset.py 
>> b/tests/functional/qemu_test/asset.py
>> index cbeb6278af..9250ff2b06 100644
>> --- a/tests/functional/qemu_test/asset.py
>> +++ b/tests/functional/qemu_test/asset.py
>> @@ -9,6 +9,8 @@
>>   import logging
>>   import os
>>   import subprocess
>> +import sys
>> +import unittest
>>   import urllib.request
>>   from pathlib import Path
>>   from shutil import copyfileobj
>> @@ -62,6 +64,9 @@ def fetch(self):
>>                              self.cache_file, self.url)
>>               return str(self.cache_file)
>> +        if os.environ.get("QEMU_TEST_NO_DOWNLOAD", False):
>> +            raise Exception("Asset cache is invalid and downloads disabled")
>> +
>>           self.log.info("Downloading %s to %s...", self.url, self.cache_file)
>>           tmp_cache_file = self.cache_file.with_suffix(".download")
>> @@ -95,3 +100,32 @@ def fetch(self):
>>           self.log.info("Cached %s at %s" % (self.url, self.cache_file))
>>           return str(self.cache_file)
>> +
>> +    def precache_test(test):
>> +        log = logging.getLogger('qemu-test')
>> +        log.setLevel(logging.DEBUG)
>> +        handler = logging.StreamHandler(sys.stdout)
>> +        handler.setLevel(logging.DEBUG)
>> +        formatter = logging.Formatter(
>> +            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
>> +        handler.setFormatter(formatter)
>> +        log.addHandler(handler)
>> +        for name, asset in vars(test.__class__).items():
>> +            if name.startswith("ASSET_") and type(asset) == Asset:
>> +                log.info("Attempting to cache '%s'" % asset)
>> +                asset.fetch()
>> +        log.removeHandler(handler)
>> +
>> +    def precache_suite(suite):
>> +        for test in suite:
>> +            if isinstance(test, unittest.TestSuite):
>> +                Asset.precache_suite(test)
>> +            elif isinstance(test, unittest.TestCase):
>> +                Asset.precache_test(test)
>> +
>> +    def precache_suites(path, cacheTstamp):
>> +        loader = unittest.loader.defaultTestLoader
>> +        tests = loader.loadTestsFromNames([path], None)
>> +
>> +        with open(cacheTstamp, "w") as fh:
>> +            Asset.precache_suite(tests)
> 
> 
> When using multiple jobs (-jN) I'm observing some hangs,
> apparently multiple threads trying to download the same file.
> The files are eventually downloaded successfully but it takes
> longer. Should we acquire some exclusive lock somewhere?

I haven't seen that yet ... what did you exactly run? "make check-functional 
-jN" ? Or "make check-functional-<target> -jN" ?

  Thomas
Thomas Huth Aug. 27, 2024, 2:24 p.m. UTC | #3
On 27/08/2024 15.16, Thomas Huth wrote:
> On 23/08/2024 09.28, Philippe Mathieu-Daudé wrote:
>> Hi,
>>
>> On 21/8/24 10:27, Thomas Huth wrote:
>>> From: Daniel P. Berrangé <berrange@redhat.com>
>>>
>>> Many tests need to access assets stored on remote sites. We don't want
>>> to download these during test execution when run by meson, since this
>>> risks hitting test timeouts when data transfers are slow.
>>>
>>> Add support for pre-emptive caching of assets by setting the env var
>>> QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
>>> instead of running the test, the assets will be downloaded and saved
>>> to the cache, then the timestamp file created.
>>>
>>> A meson custom target is created as a dependency of each test suite
>>> to trigger the pre-emptive caching logic before the test runs.
>>>
>>> When run in caching mode, it will locate assets by looking for class
>>> level variables with a name prefix "ASSET_", and type "Asset".
>>>
>>> At the ninja level
>>>
>>>     ninja test --suite functional
>>>
>>> will speculatively download any assets that are not already cached,
>>> so it is advisable to set a timeout multiplier.
>>>
>>>     QEMU_TEST_NO_DOWNLOAD=1 ninja test --suite functional
>>>
>>> will fail the test if a required asset is not already cached
>>>
>>>     ninja precache-functional
>>>
>>> will download and cache all assets required by the functional
>>> tests
>>>
>>> At the make level, precaching is always done by
>>>
>>>     make check-functional
>>>
>>> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
>>> Tested-by: Richard Henderson <richard.henderson@linaro.org>
>>> [thuth: Remove the duplicated "path = os.path.basename(...)" line]
>>> Signed-off-by: Thomas Huth <thuth@redhat.com>
>>> ---
>>>   tests/Makefile.include                 |  3 ++-
>>>   tests/functional/meson.build           | 33 +++++++++++++++++++++++--
>>>   tests/functional/qemu_test/asset.py    | 34 ++++++++++++++++++++++++++
>>>   tests/functional/qemu_test/testcase.py |  7 ++++++
>>>   4 files changed, 74 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/tests/Makefile.include b/tests/Makefile.include
>>> index 66c8cc3123..010369bd3a 100644
>>> --- a/tests/Makefile.include
>>> +++ b/tests/Makefile.include
>>> @@ -161,7 +161,8 @@ $(FUNCTIONAL_TARGETS):
>>>   .PHONY: check-functional
>>>   check-functional:
>>> -    @$(MAKE) SPEED=thorough check-func check-func-quick
>>> +    @$(NINJA) precache-functional
>>> +    @QEMU_TEST_NO_DOWNLOAD=1 $(MAKE) SPEED=thorough check-func 
>>> check-func-quick
>>>   # Consolidated targets
>>> diff --git a/tests/functional/meson.build b/tests/functional/meson.build
>>> index 8a8fa0ab99..cef74b82a9 100644
>>> --- a/tests/functional/meson.build
>>> +++ b/tests/functional/meson.build
>>> @@ -33,6 +33,7 @@ tests_x86_64_quick = [
>>>   tests_x86_64_thorough = [
>>>   ]
>>> +precache_all = []
>>>   foreach speed : ['quick', 'thorough']
>>>     foreach dir : target_dirs
>>>       if not dir.endswith('-softmmu')
>>> @@ -63,11 +64,35 @@ foreach speed : ['quick', 'thorough']
>>>                                  meson.current_source_dir())
>>>       foreach test : target_tests
>>> -      test('func-@0@/@1@'.format(target_base, test),
>>> +      testname = '@0@-@1@'.format(target_base, test)
>>> +      testfile = 'test_' + test + '.py'
>>> +      testpath = meson.current_source_dir() / testfile
>>> +      teststamp = testname + '.tstamp'
>>> +      test_precache_env = environment()
>>> +      test_precache_env.set('QEMU_TEST_PRECACHE', 
>>> meson.current_build_dir() / teststamp)
>>> +      test_precache_env.set('PYTHONPATH', meson.project_source_root() / 
>>> 'python:' +
>>> +                                          meson.current_source_dir())
>>> +      precache = custom_target('func-precache-' + testname,
>>> +                               output: teststamp,
>>> +                               command: [python, testpath],
>>> +                               depend_files: files(testpath),
>>> +                               build_by_default: false,
>>> +                               env: test_precache_env)
>>> +      precache_all += precache
>>> +
>>> +      # Ideally we would add 'precache' to 'depends' here, such that
>>> +      # 'build_by_default: false' lets the pre-caching automatically
>>> +      # run immediately before the test runs. In practice this is
>>> +      # broken in meson, with it running the pre-caching in the normal
>>> +      # compile phase https://github.com/mesonbuild/meson/issues/2518
>>> +      # If the above bug ever gets fixed, when QEMU changes the min
>>> +      # meson version, add the 'depends' and remove the custom
>>> +      # 'run_target' logic below & in Makefile.include
>>> +      test('func-' + testname,
>>>              python,
>>>              depends: [test_deps, test_emulator, emulator_modules],
>>>              env: test_env,
>>> -           args: [meson.current_source_dir() / 'test_' + test + '.py'],
>>> +           args: [testpath],
>>>              protocol: 'tap',
>>>              timeout: test_timeouts.get(test, 60),
>>>              priority: test_timeouts.get(test, 60),
>>> @@ -75,3 +100,7 @@ foreach speed : ['quick', 'thorough']
>>>       endforeach
>>>     endforeach
>>>   endforeach
>>> +
>>> +run_target('precache-functional',
>>> +           depends: precache_all,
>>> +           command: ['true'])
>>> diff --git a/tests/functional/qemu_test/asset.py 
>>> b/tests/functional/qemu_test/asset.py
>>> index cbeb6278af..9250ff2b06 100644
>>> --- a/tests/functional/qemu_test/asset.py
>>> +++ b/tests/functional/qemu_test/asset.py
>>> @@ -9,6 +9,8 @@
>>>   import logging
>>>   import os
>>>   import subprocess
>>> +import sys
>>> +import unittest
>>>   import urllib.request
>>>   from pathlib import Path
>>>   from shutil import copyfileobj
>>> @@ -62,6 +64,9 @@ def fetch(self):
>>>                              self.cache_file, self.url)
>>>               return str(self.cache_file)
>>> +        if os.environ.get("QEMU_TEST_NO_DOWNLOAD", False):
>>> +            raise Exception("Asset cache is invalid and downloads 
>>> disabled")
>>> +
>>>           self.log.info("Downloading %s to %s...", self.url, 
>>> self.cache_file)
>>>           tmp_cache_file = self.cache_file.with_suffix(".download")
>>> @@ -95,3 +100,32 @@ def fetch(self):
>>>           self.log.info("Cached %s at %s" % (self.url, self.cache_file))
>>>           return str(self.cache_file)
>>> +
>>> +    def precache_test(test):
>>> +        log = logging.getLogger('qemu-test')
>>> +        log.setLevel(logging.DEBUG)
>>> +        handler = logging.StreamHandler(sys.stdout)
>>> +        handler.setLevel(logging.DEBUG)
>>> +        formatter = logging.Formatter(
>>> +            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
>>> +        handler.setFormatter(formatter)
>>> +        log.addHandler(handler)
>>> +        for name, asset in vars(test.__class__).items():
>>> +            if name.startswith("ASSET_") and type(asset) == Asset:
>>> +                log.info("Attempting to cache '%s'" % asset)
>>> +                asset.fetch()
>>> +        log.removeHandler(handler)
>>> +
>>> +    def precache_suite(suite):
>>> +        for test in suite:
>>> +            if isinstance(test, unittest.TestSuite):
>>> +                Asset.precache_suite(test)
>>> +            elif isinstance(test, unittest.TestCase):
>>> +                Asset.precache_test(test)
>>> +
>>> +    def precache_suites(path, cacheTstamp):
>>> +        loader = unittest.loader.defaultTestLoader
>>> +        tests = loader.loadTestsFromNames([path], None)
>>> +
>>> +        with open(cacheTstamp, "w") as fh:
>>> +            Asset.precache_suite(tests)
>>
>>
>> When using multiple jobs (-jN) I'm observing some hangs,
>> apparently multiple threads trying to download the same file.
>> The files are eventually downloaded successfully but it takes
>> longer. Should we acquire some exclusive lock somewhere?
> 
> I haven't seen that yet ... what did you exactly run? "make check-functional 
> -jN" ? Or "make check-functional-<target> -jN" ?

After applying some of your patches, I think I've run now into this problem, 
too: It's because test_aarch64_sbsaref.py and test_aarch64_virt.py try to 
download the same asset in parallel (alpine-standard-3.17.2-aarch64.iso).

Daniel, any ideas how to fix this in the Asset code?

  Thomas
Daniel P. Berrangé Aug. 29, 2024, 10:15 a.m. UTC | #4
On Tue, Aug 27, 2024 at 04:24:59PM +0200, Thomas Huth wrote:
> On 27/08/2024 15.16, Thomas Huth wrote:
> > On 23/08/2024 09.28, Philippe Mathieu-Daudé wrote:
> > > Hi,
> > > 
> > > On 21/8/24 10:27, Thomas Huth wrote:
> > > > From: Daniel P. Berrangé <berrange@redhat.com>
> > > > 
> > > > Many tests need to access assets stored on remote sites. We don't want
> > > > to download these during test execution when run by meson, since this
> > > > risks hitting test timeouts when data transfers are slow.
> > > > 
> > > > Add support for pre-emptive caching of assets by setting the env var
> > > > QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
> > > > instead of running the test, the assets will be downloaded and saved
> > > > to the cache, then the timestamp file created.
> > > > 
> > > > A meson custom target is created as a dependency of each test suite
> > > > to trigger the pre-emptive caching logic before the test runs.
> > > > 
> > > > When run in caching mode, it will locate assets by looking for class
> > > > level variables with a name prefix "ASSET_", and type "Asset".
> > > > 
> > > > At the ninja level
> > > > 
> > > >     ninja test --suite functional
> > > > 
> > > > will speculatively download any assets that are not already cached,
> > > > so it is advisable to set a timeout multiplier.
> > > > 
> > > >     QEMU_TEST_NO_DOWNLOAD=1 ninja test --suite functional
> > > > 
> > > > will fail the test if a required asset is not already cached
> > > > 
> > > >     ninja precache-functional
> > > > 
> > > > will download and cache all assets required by the functional
> > > > tests
> > > > 
> > > > At the make level, precaching is always done by
> > > > 
> > > >     make check-functional
> > > > 
> > > > Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> > > > Tested-by: Richard Henderson <richard.henderson@linaro.org>
> > > > [thuth: Remove the duplicated "path = os.path.basename(...)" line]
> > > > Signed-off-by: Thomas Huth <thuth@redhat.com>
> > > > ---
> > > >   tests/Makefile.include                 |  3 ++-
> > > >   tests/functional/meson.build           | 33 +++++++++++++++++++++++--
> > > >   tests/functional/qemu_test/asset.py    | 34 ++++++++++++++++++++++++++
> > > >   tests/functional/qemu_test/testcase.py |  7 ++++++
> > > >   4 files changed, 74 insertions(+), 3 deletions(-)
> > > > 
> > > > diff --git a/tests/Makefile.include b/tests/Makefile.include
> > > > index 66c8cc3123..010369bd3a 100644
> > > > --- a/tests/Makefile.include
> > > > +++ b/tests/Makefile.include
> > > > @@ -161,7 +161,8 @@ $(FUNCTIONAL_TARGETS):
> > > >   .PHONY: check-functional
> > > >   check-functional:
> > > > -    @$(MAKE) SPEED=thorough check-func check-func-quick
> > > > +    @$(NINJA) precache-functional
> > > > +    @QEMU_TEST_NO_DOWNLOAD=1 $(MAKE) SPEED=thorough check-func
> > > > check-func-quick
> > > >   # Consolidated targets
> > > > diff --git a/tests/functional/meson.build b/tests/functional/meson.build
> > > > index 8a8fa0ab99..cef74b82a9 100644
> > > > --- a/tests/functional/meson.build
> > > > +++ b/tests/functional/meson.build
> > > > @@ -33,6 +33,7 @@ tests_x86_64_quick = [
> > > >   tests_x86_64_thorough = [
> > > >   ]
> > > > +precache_all = []
> > > >   foreach speed : ['quick', 'thorough']
> > > >     foreach dir : target_dirs
> > > >       if not dir.endswith('-softmmu')
> > > > @@ -63,11 +64,35 @@ foreach speed : ['quick', 'thorough']
> > > >                                  meson.current_source_dir())
> > > >       foreach test : target_tests
> > > > -      test('func-@0@/@1@'.format(target_base, test),
> > > > +      testname = '@0@-@1@'.format(target_base, test)
> > > > +      testfile = 'test_' + test + '.py'
> > > > +      testpath = meson.current_source_dir() / testfile
> > > > +      teststamp = testname + '.tstamp'
> > > > +      test_precache_env = environment()
> > > > +      test_precache_env.set('QEMU_TEST_PRECACHE',
> > > > meson.current_build_dir() / teststamp)
> > > > +      test_precache_env.set('PYTHONPATH',
> > > > meson.project_source_root() / 'python:' +
> > > > +                                          meson.current_source_dir())
> > > > +      precache = custom_target('func-precache-' + testname,
> > > > +                               output: teststamp,
> > > > +                               command: [python, testpath],
> > > > +                               depend_files: files(testpath),
> > > > +                               build_by_default: false,
> > > > +                               env: test_precache_env)
> > > > +      precache_all += precache
> > > > +
> > > > +      # Ideally we would add 'precache' to 'depends' here, such that
> > > > +      # 'build_by_default: false' lets the pre-caching automatically
> > > > +      # run immediately before the test runs. In practice this is
> > > > +      # broken in meson, with it running the pre-caching in the normal
> > > > +      # compile phase https://github.com/mesonbuild/meson/issues/2518
> > > > +      # If the above bug ever gets fixed, when QEMU changes the min
> > > > +      # meson version, add the 'depends' and remove the custom
> > > > +      # 'run_target' logic below & in Makefile.include
> > > > +      test('func-' + testname,
> > > >              python,
> > > >              depends: [test_deps, test_emulator, emulator_modules],
> > > >              env: test_env,
> > > > -           args: [meson.current_source_dir() / 'test_' + test + '.py'],
> > > > +           args: [testpath],
> > > >              protocol: 'tap',
> > > >              timeout: test_timeouts.get(test, 60),
> > > >              priority: test_timeouts.get(test, 60),
> > > > @@ -75,3 +100,7 @@ foreach speed : ['quick', 'thorough']
> > > >       endforeach
> > > >     endforeach
> > > >   endforeach
> > > > +
> > > > +run_target('precache-functional',
> > > > +           depends: precache_all,
> > > > +           command: ['true'])
> > > > diff --git a/tests/functional/qemu_test/asset.py
> > > > b/tests/functional/qemu_test/asset.py
> > > > index cbeb6278af..9250ff2b06 100644
> > > > --- a/tests/functional/qemu_test/asset.py
> > > > +++ b/tests/functional/qemu_test/asset.py
> > > > @@ -9,6 +9,8 @@
> > > >   import logging
> > > >   import os
> > > >   import subprocess
> > > > +import sys
> > > > +import unittest
> > > >   import urllib.request
> > > >   from pathlib import Path
> > > >   from shutil import copyfileobj
> > > > @@ -62,6 +64,9 @@ def fetch(self):
> > > >                              self.cache_file, self.url)
> > > >               return str(self.cache_file)
> > > > +        if os.environ.get("QEMU_TEST_NO_DOWNLOAD", False):
> > > > +            raise Exception("Asset cache is invalid and
> > > > downloads disabled")
> > > > +
> > > >           self.log.info("Downloading %s to %s...", self.url,
> > > > self.cache_file)
> > > >           tmp_cache_file = self.cache_file.with_suffix(".download")
> > > > @@ -95,3 +100,32 @@ def fetch(self):
> > > >           self.log.info("Cached %s at %s" % (self.url, self.cache_file))
> > > >           return str(self.cache_file)
> > > > +
> > > > +    def precache_test(test):
> > > > +        log = logging.getLogger('qemu-test')
> > > > +        log.setLevel(logging.DEBUG)
> > > > +        handler = logging.StreamHandler(sys.stdout)
> > > > +        handler.setLevel(logging.DEBUG)
> > > > +        formatter = logging.Formatter(
> > > > +            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
> > > > +        handler.setFormatter(formatter)
> > > > +        log.addHandler(handler)
> > > > +        for name, asset in vars(test.__class__).items():
> > > > +            if name.startswith("ASSET_") and type(asset) == Asset:
> > > > +                log.info("Attempting to cache '%s'" % asset)
> > > > +                asset.fetch()
> > > > +        log.removeHandler(handler)
> > > > +
> > > > +    def precache_suite(suite):
> > > > +        for test in suite:
> > > > +            if isinstance(test, unittest.TestSuite):
> > > > +                Asset.precache_suite(test)
> > > > +            elif isinstance(test, unittest.TestCase):
> > > > +                Asset.precache_test(test)
> > > > +
> > > > +    def precache_suites(path, cacheTstamp):
> > > > +        loader = unittest.loader.defaultTestLoader
> > > > +        tests = loader.loadTestsFromNames([path], None)
> > > > +
> > > > +        with open(cacheTstamp, "w") as fh:
> > > > +            Asset.precache_suite(tests)
> > > 
> > > 
> > > When using multiple jobs (-jN) I'm observing some hangs,
> > > apparently multiple threads trying to download the same file.
> > > The files are eventually downloaded successfully but it takes
> > > longer. Should we acquire some exclusive lock somewhere?
> > 
> > I haven't seen that yet ... what did you exactly run? "make
> > check-functional -jN" ? Or "make check-functional-<target> -jN" ?
> 
> After applying some of your patches, I think I've run now into this problem,
> too: It's because test_aarch64_sbsaref.py and test_aarch64_virt.py try to
> download the same asset in parallel (alpine-standard-3.17.2-aarch64.iso).
> 
> Daniel, any ideas how to fix this in the Asset code?

So when downloading we open a file with a ".download" suffix, write to
that, and then rename it to the final filename.

If we have concurrent usage, both will open the same file and try to
write to it. Assuming both are downloading the same content we would
probably "get lucky" and have a consistent file at the end, but clearly
it is bad to rely on luck.

The lame option is to use NamedTemporaryFile for the temporary file.
This ensures both processes will write to different temp files, and
the final rename is atomic. This guarantees safety, but still has
the double download penalty.

The serious option is to use fcntl.lockf(..., fcntl.LOCK_EX) on the
temp file. If we can't acquire the lock then just immediately close
the temp file (don't delete it) and assume another thread is going to
finish its download.

On Windows we'll need msvcrt.locking(..., msvcrt.LK_WLCK, ...)
instead of fcntl.

With regards,
Daniel
Thomas Huth Aug. 30, 2024, 7:38 a.m. UTC | #5
On 29/08/2024 12.15, Daniel P. Berrangé wrote:
> On Tue, Aug 27, 2024 at 04:24:59PM +0200, Thomas Huth wrote:
>> On 27/08/2024 15.16, Thomas Huth wrote:
>>> On 23/08/2024 09.28, Philippe Mathieu-Daudé wrote:
>>>> Hi,
>>>>
>>>> On 21/8/24 10:27, Thomas Huth wrote:
>>>>> From: Daniel P. Berrangé <berrange@redhat.com>
>>>>>
>>>>> Many tests need to access assets stored on remote sites. We don't want
>>>>> to download these during test execution when run by meson, since this
>>>>> risks hitting test timeouts when data transfers are slow.
>>>>>
>>>>> Add support for pre-emptive caching of assets by setting the env var
>>>>> QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
>>>>> instead of running the test, the assets will be downloaded and saved
>>>>> to the cache, then the timestamp file created.
...
>>>>
>>>> When using multiple jobs (-jN) I'm observing some hangs,
>>>> apparently multiple threads trying to download the same file.
>>>> The files are eventually downloaded successfully but it takes
>>>> longer. Should we acquire some exclusive lock somewhere?
>>>
>>> I haven't seen that yet ... what did you exactly run? "make
>>> check-functional -jN" ? Or "make check-functional-<target> -jN" ?
>>
>> After applying some of your patches, I think I've run now into this problem,
>> too: It's because test_aarch64_sbsaref.py and test_aarch64_virt.py try to
>> download the same asset in parallel (alpine-standard-3.17.2-aarch64.iso).
>>
>> Daniel, any ideas how to fix this in the Asset code?
> 
> So when downloading we open a file with a ".download" suffix, write to
> that, and then rename it to the final filename.
> 
> If we have concurrent usage, both will open the same file and try to
> write to it. Assuming both are downloading the same content we would
> probably "get lucky" and have a consistent file at the end, but clearly
> it is bad to rely on luck.
> 
> The lame option is to use NamedTemporaryFile for the teporary file.
> This ensures both processes will write to different temp files, and
> the final rename is atomic. This guarantees safety, but still has
> the double download penalty.
> 
> The serious option is to use fcntl.lockf(..., fcntl.LOCK_EX) on the
> temp file. If we can't acquire the lock then just immediately close
> the temp file (don't delete it) and assume another thread is going to
> finish its download.
> 
> On windows  we'll need msvcrt.locking(..., msvcrt.LK_WLCK, ...)
> instead of fcntl.

While looking for portable solutions, I noticed that newer versions
of Python have an "x" mode for creating files only if they do not
exist yet. So I think something like this could be a solution:

@@ -71,17 +72,26 @@ def fetch(self):
          tmp_cache_file = self.cache_file.with_suffix(".download")
  
          try:
-            resp = urllib.request.urlopen(self.url)
+            with tmp_cache_file.open("xb") as dst:
+                with urllib.request.urlopen(self.url) as resp:
+                    copyfileobj(resp, dst)
+        except FileExistsError:
+            # Another thread already seems to download this asset,
+            # so wait until it is done
+            self.log.debug("%s already exists, waiting for other thread to finish...",
+                           tmp_cache_file)
+            i = 0
+            while i < 600 and os.path.exists(tmp_cache_file):
+                sleep(1)
+                i += 1
+            if os.path.exists(self.cache_file):
+                return str(self.cache_file)
+            raise
          except Exception as e:
              self.log.error("Unable to download %s: %s", self.url, e)
-            raise
-
-        try:
-            with tmp_cache_file.open("wb+") as dst:
-                copyfileobj(resp, dst)
-        except:
              tmp_cache_file.unlink()
              raise
+
          try:
              # Set these just for informational purposes
              os.setxattr(str(tmp_cache_file), "user.qemu-asset-url",

What do you think, does it look reasonable?

  Thomas
Daniel P. Berrangé Aug. 30, 2024, 7:42 a.m. UTC | #6
On Fri, Aug 30, 2024 at 09:38:17AM +0200, Thomas Huth wrote:
> On 29/08/2024 12.15, Daniel P. Berrangé wrote:
> > On Tue, Aug 27, 2024 at 04:24:59PM +0200, Thomas Huth wrote:
> > > On 27/08/2024 15.16, Thomas Huth wrote:
> > > > On 23/08/2024 09.28, Philippe Mathieu-Daudé wrote:
> > > > > Hi,
> > > > > 
> > > > > On 21/8/24 10:27, Thomas Huth wrote:
> > > > > > From: Daniel P. Berrangé <berrange@redhat.com>
> > > > > > 
> > > > > > Many tests need to access assets stored on remote sites. We don't want
> > > > > > to download these during test execution when run by meson, since this
> > > > > > risks hitting test timeouts when data transfers are slow.
> > > > > > 
> > > > > > Add support for pre-emptive caching of assets by setting the env var
> > > > > > QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
> > > > > > instead of running the test, the assets will be downloaded and saved
> > > > > > to the cache, then the timestamp file created.
> ...
> > > > > 
> > > > > When using multiple jobs (-jN) I'm observing some hangs,
> > > > > apparently multiple threads trying to download the same file.
> > > > > The files are eventually downloaded successfully but it takes
> > > > > longer. Should we acquire some exclusive lock somewhere?
> > > > 
> > > > I haven't seen that yet ... what did you exactly run? "make
> > > > check-functional -jN" ? Or "make check-functional-<target> -jN" ?
> > > 
> > > After applying some of your patches, I think I've run now into this problem,
> > > too: It's because test_aarch64_sbsaref.py and test_aarch64_virt.py try to
> > > download the same asset in parallel (alpine-standard-3.17.2-aarch64.iso).
> > > 
> > > Daniel, any ideas how to fix this in the Asset code?
> > 
> > So when downloading we open a file with a ".download" suffix, write to
> > that, and then rename it to the final filename.
> > 
> > If we have concurrent usage, both will open the same file and try to
> > write to it. Assuming both are downloading the same content we would
> > probably "get lucky" and have a consistent file at the end, but clearly
> > it is bad to rely on luck.
> > 
> > The lame option is to use NamedTemporaryFile for the teporary file.
> > This ensures both processes will write to different temp files, and
> > the final rename is atomic. This guarantees safety, but still has
> > the double download penalty.
> > 
> > The serious option is to use fcntl.lockf(..., fcntl.LOCK_EX) on the
> > temp file. If we can't acquire the lock then just immediately close
> > the temp file (don't delete it) and assume another thread is going to
> > finish its download.
> > 
> > On windows  we'll need msvcrt.locking(..., msvcrt.LK_WLCK, ...)
> > instead of fcntl.
> 
> While looking for portable solutions, I noticed that newer versions
> of Python have a "x" mode for creating files only if they do not
> exist yet. So I think something like this could be a solution:
> 
> @@ -71,17 +72,26 @@ def fetch(self):
>          tmp_cache_file = self.cache_file.with_suffix(".download")
>          try:
> -            resp = urllib.request.urlopen(self.url)
> +            with tmp_cache_file.open("xb") as dst:
> +                with urllib.request.urlopen(self.url) as resp:
> +                    copyfileobj(resp, dst)
> +        except FileExistsError:
> +            # Another thread already seems to download this asset,
> +            # so wait until it is done
> +            self.log.debug("%s already exists, waiting for other thread to finish...",
> +                           tmp_cache_file)
> +            i = 0
> +            while i < 600 and os.path.exists(tmp_cache_file):
> +                sleep(1)
> +                i += 1
> +            if os.path.exists(self.cache_file):
> +                return str(self.cache_file)
> +            raise
>          except Exception as e:
>              self.log.error("Unable to download %s: %s", self.url, e)
> -            raise
> -
> -        try:
> -            with tmp_cache_file.open("wb+") as dst:
> -                copyfileobj(resp, dst)
> -        except:
>              tmp_cache_file.unlink()
>              raise
> +
>          try:
>              # Set these just for informational purposes
>              os.setxattr(str(tmp_cache_file), "user.qemu-asset-url",
> 
> What do you think, does it look reasonable?

The main risk with this, as opposed to fcntl locking, is that it is not
crash-safe. If a download is interrupted, subsequent cache runs will
wait for a process that doesn't exist to finish downloading and then
raise an exception, requiring manual user cleanup of the partial
download.

Perhaps if we see the tmp_cache_file, and it doesn't change in size
after N seconds, we could force unlink it, and create a new download,
so we gracefully recover ?

With regards,
Daniel
Thomas Huth Aug. 30, 2024, 11:27 a.m. UTC | #7
On 30/08/2024 09.42, Daniel P. Berrangé wrote:
> On Fri, Aug 30, 2024 at 09:38:17AM +0200, Thomas Huth wrote:
>> On 29/08/2024 12.15, Daniel P. Berrangé wrote:
>>> On Tue, Aug 27, 2024 at 04:24:59PM +0200, Thomas Huth wrote:
>>>> On 27/08/2024 15.16, Thomas Huth wrote:
>>>>> On 23/08/2024 09.28, Philippe Mathieu-Daudé wrote:
>>>>>> Hi,
>>>>>>
>>>>>> On 21/8/24 10:27, Thomas Huth wrote:
>>>>>>> From: Daniel P. Berrangé <berrange@redhat.com>
>>>>>>>
>>>>>>> Many tests need to access assets stored on remote sites. We don't want
>>>>>>> to download these during test execution when run by meson, since this
>>>>>>> risks hitting test timeouts when data transfers are slow.
>>>>>>>
>>>>>>> Add support for pre-emptive caching of assets by setting the env var
>>>>>>> QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
>>>>>>> instead of running the test, the assets will be downloaded and saved
>>>>>>> to the cache, then the timestamp file created.
>> ...
>>>>>>
>>>>>> When using multiple jobs (-jN) I'm observing some hangs,
>>>>>> apparently multiple threads trying to download the same file.
>>>>>> The files are eventually downloaded successfully but it takes
>>>>>> longer. Should we acquire some exclusive lock somewhere?
>>>>>
>>>>> I haven't seen that yet ... what did you exactly run? "make
>>>>> check-functional -jN" ? Or "make check-functional-<target> -jN" ?
>>>>
>>>> After applying some of your patches, I think I've run now into this problem,
>>>> too: It's because test_aarch64_sbsaref.py and test_aarch64_virt.py try to
>>>> download the same asset in parallel (alpine-standard-3.17.2-aarch64.iso).
>>>>
>>>> Daniel, any ideas how to fix this in the Asset code?
>>>
>>> So when downloading we open a file with a ".download" suffix, write to
>>> that, and then rename it to the final filename.
>>>
>>> If we have concurrent usage, both will open the same file and try to
>>> write to it. Assuming both are downloading the same content we would
>>> probably "get lucky" and have a consistent file at the end, but clearly
>>> it is bad to rely on luck.
>>>
>>> The lame option is to use NamedTemporaryFile for the teporary file.
>>> This ensures both processes will write to different temp files, and
>>> the final rename is atomic. This guarantees safety, but still has
>>> the double download penalty.
>>>
>>> The serious option is to use fcntl.lockf(..., fcntl.LOCK_EX) on the
>>> temp file. If we can't acquire the lock then just immediately close
>>> the temp file (don't delete it) and assume another thread is going to
>>> finish its download.
>>>
>>> On windows  we'll need msvcrt.locking(..., msvcrt.LK_WLCK, ...)
>>> instead of fcntl.
>>
>> While looking for portable solutions, I noticed that newer versions
>> of Python have a "x" mode for creating files only if they do not
>> exist yet. So I think something like this could be a solution:
>>
>> @@ -71,17 +72,26 @@ def fetch(self):
>>           tmp_cache_file = self.cache_file.with_suffix(".download")
>>           try:
>> -            resp = urllib.request.urlopen(self.url)
>> +            with tmp_cache_file.open("xb") as dst:
>> +                with urllib.request.urlopen(self.url) as resp:
>> +                    copyfileobj(resp, dst)
>> +        except FileExistsError:
>> +            # Another thread already seems to download this asset,
>> +            # so wait until it is done
>> +            self.log.debug("%s already exists, waiting for other thread to finish...",
>> +                           tmp_cache_file)
>> +            i = 0
>> +            while i < 600 and os.path.exists(tmp_cache_file):
>> +                sleep(1)
>> +                i += 1
>> +            if os.path.exists(self.cache_file):
>> +                return str(self.cache_file)
>> +            raise
>>           except Exception as e:
>>               self.log.error("Unable to download %s: %s", self.url, e)
>> -            raise
>> -
>> -        try:
>> -            with tmp_cache_file.open("wb+") as dst:
>> -                copyfileobj(resp, dst)
>> -        except:
>>               tmp_cache_file.unlink()
>>               raise
>> +
>>           try:
>>               # Set these just for informational purposes
>>               os.setxattr(str(tmp_cache_file), "user.qemu-asset-url",
>>
>> What do you think, does it look reasonable?
> 
> The main risk with this, as opposed to fcntl locking, is that it is not
> crash-safe. If a download is interrupted, subsequent cache runs will
> wait for a process that doesn't exist to finish downloading and then
> raise an exception, requiring manual user cleanup of the partial
> download.
> 
> Perhaps if we see the tmp_cache_file, and it doesn't change in size
> after N seconds, we could force unlink it, and create a new download,
> so we gracefully recover ?

Sounds like a plan ... does this look acceptable:

@@ -70,18 +71,52 @@ def fetch(self):
          self.log.info("Downloading %s to %s...", self.url, self.cache_file)
          tmp_cache_file = self.cache_file.with_suffix(".download")

-        try:
-            resp = urllib.request.urlopen(self.url)
-        except Exception as e:
-            self.log.error("Unable to download %s: %s", self.url, e)
-            raise
+        for retries in range(3):
+            try:
+                with tmp_cache_file.open("xb") as dst:
+                    with urllib.request.urlopen(self.url) as resp:
+                        copyfileobj(resp, dst)
+                break
+            except FileExistsError:
+                # Another thread already seems to download this asset,
+                # so wait until it is done
+                self.log.debug("%s already exists, "
+                               "waiting for other thread to finish...",
+                               tmp_cache_file)
+                try:
+                    current_size = tmp_cache_file.stat().st_size
+                    new_size = current_size
+                except:
+                    if os.path.exists(self.cache_file):
+                        return str(self.cache_file)
+                    raise
+                waittime = lastchange = 600
+                while waittime > 0:
+                    sleep(1)
+                    waittime -= 1
+                    try:
+                        new_size = tmp_cache_file.stat().st_size
+                    except:
+                        if os.path.exists(self.cache_file):
+                            return str(self.cache_file)
+                        raise
+                    if new_size != current_size:
+                        lastchange = waittime
+                        current_size = new_size
+                    elif lastchange - waittime > 90:
+                       self.log.debug("%s seems to be stale ... "
+                                      "deleting and retrying download...",
+                                      tmp_cache_file)
+                       tmp_cache_file.unlink()
+                       break
+                if waittime > 0:
+                    continue
+                raise
+            except Exception as e:
+                self.log.error("Unable to download %s: %s", self.url, e)
+                tmp_cache_file.unlink()
+                raise

-        try:
-            with tmp_cache_file.open("wb+") as dst:
-                copyfileobj(resp, dst)
-        except:
-            tmp_cache_file.unlink()
-            raise
          try:
              # Set these just for informational purposes
              os.setxattr(str(tmp_cache_file), "user.qemu-asset-url",

?

I tried it with a stale file in my cache, and it seems to work - after 90 
seconds, one of the threads is properly trying to redownload the file.

  Thomas
Daniel P. Berrangé Aug. 30, 2024, 11:37 a.m. UTC | #8
On Fri, Aug 30, 2024 at 01:27:15PM +0200, Thomas Huth wrote:
> On 30/08/2024 09.42, Daniel P. Berrangé wrote:
> > On Fri, Aug 30, 2024 at 09:38:17AM +0200, Thomas Huth wrote:
> > > On 29/08/2024 12.15, Daniel P. Berrangé wrote:
> > > > On Tue, Aug 27, 2024 at 04:24:59PM +0200, Thomas Huth wrote:
> > > > > On 27/08/2024 15.16, Thomas Huth wrote:
> > > > > > On 23/08/2024 09.28, Philippe Mathieu-Daudé wrote:
> > > > > > > Hi,
> > > > > > > 
> > > > > > > On 21/8/24 10:27, Thomas Huth wrote:
> > > > > > > > From: Daniel P. Berrangé <berrange@redhat.com>
> > > > > > > > 
> > > > > > > > Many tests need to access assets stored on remote sites. We don't want
> > > > > > > > to download these during test execution when run by meson, since this
> > > > > > > > risks hitting test timeouts when data transfers are slow.
> > > > > > > > 
> > > > > > > > Add support for pre-emptive caching of assets by setting the env var
> > > > > > > > QEMU_TEST_PRECACHE to point to a timestamp file. When this is set,
> > > > > > > > instead of running the test, the assets will be downloaded and saved
> > > > > > > > to the cache, then the timestamp file created.
> > > ...
> > > > > > > 
> > > > > > > When using multiple jobs (-jN) I'm observing some hangs,
> > > > > > > apparently multiple threads trying to download the same file.
> > > > > > > The files are eventually downloaded successfully but it takes
> > > > > > > longer. Should we acquire some exclusive lock somewhere?
> > > > > > 
> > > > > > I haven't seen that yet ... what did you exactly run? "make
> > > > > > check-functional -jN" ? Or "make check-functional-<target> -jN" ?
> > > > > 
> > > > > After applying some of your patches, I think I've run now into this problem,
> > > > > too: It's because test_aarch64_sbsaref.py and test_aarch64_virt.py try to
> > > > > download the same asset in parallel (alpine-standard-3.17.2-aarch64.iso).
> > > > > 
> > > > > Daniel, any ideas how to fix this in the Asset code?
> > > > 
> > > > So when downloading we open a file with a ".download" suffix, write to
> > > > that, and then rename it to the final filename.
> > > > 
> > > > If we have concurrent usage, both will open the same file and try to
> > > > write to it. Assuming both are downloading the same content we would
> > > > probably "get lucky" and have a consistent file at the end, but clearly
> > > > it is bad to rely on luck.
> > > > 
> > > > The lame option is to use NamedTemporaryFile for the teporary file.
> > > > This ensures both processes will write to different temp files, and
> > > > the final rename is atomic. This guarantees safety, but still has
> > > > the double download penalty.
> > > > 
> > > > The serious option is to use fcntl.lockf(..., fcntl.LOCK_EX) on the
> > > > temp file. If we can't acquire the lock then just immediately close
> > > > the temp file (don't delete it) and assume another thread is going to
> > > > finish its download.
> > > > 
> > > > On windows  we'll need msvcrt.locking(..., msvcrt.LK_WLCK, ...)
> > > > instead of fcntl.
> > > 
> > > While looking for portable solutions, I noticed that newer versions
> > > of Python have a "x" mode for creating files only if they do not
> > > exist yet. So I think something like this could be a solution:
> > > 
> > > @@ -71,17 +72,26 @@ def fetch(self):
> > >           tmp_cache_file = self.cache_file.with_suffix(".download")
> > >           try:
> > > -            resp = urllib.request.urlopen(self.url)
> > > +            with tmp_cache_file.open("xb") as dst:
> > > +                with urllib.request.urlopen(self.url) as resp:
> > > +                    copyfileobj(resp, dst)
> > > +        except FileExistsError:
> > > +            # Another thread already seems to download this asset,
> > > +            # so wait until it is done
> > > +            self.log.debug("%s already exists, waiting for other thread to finish...",
> > > +                           tmp_cache_file)
> > > +            i = 0
> > > +            while i < 600 and os.path.exists(tmp_cache_file):
> > > +                sleep(1)
> > > +                i += 1
> > > +            if os.path.exists(self.cache_file):
> > > +                return str(self.cache_file)
> > > +            raise
> > >           except Exception as e:
> > >               self.log.error("Unable to download %s: %s", self.url, e)
> > > -            raise
> > > -
> > > -        try:
> > > -            with tmp_cache_file.open("wb+") as dst:
> > > -                copyfileobj(resp, dst)
> > > -        except:
> > >               tmp_cache_file.unlink()
> > >               raise
> > > +
> > >           try:
> > >               # Set these just for informational purposes
> > >               os.setxattr(str(tmp_cache_file), "user.qemu-asset-url",
> > > 
> > > What do you think, does it look reasonable?
> > 
> > The main risk with this, as opposed to fcntl locking, is that it is not
> > crash-safe. If a download is interrupted, subsequent cache runs will
> > wait for a process that doesn't exist to finish downloading and then
> > raise an exception, requiring manual user cleanup of the partial
> > download.
> > 
> > Perhaps if we see the tmp_cache_file, and it doesn't change in size
> > after N seconds, we could force unlink it, and create a new download,
> > so we gracefully recover ?
> 
> Sounds like a plan ... does this look acceptable:
> 
> @@ -70,18 +71,52 @@ def fetch(self):
>          self.log.info("Downloading %s to %s...", self.url, self.cache_file)
>          tmp_cache_file = self.cache_file.with_suffix(".download")
> 
> -        try:
> -            resp = urllib.request.urlopen(self.url)
> -        except Exception as e:
> -            self.log.error("Unable to download %s: %s", self.url, e)
> -            raise
> +        for retries in range(3):
> +            try:
> +                with tmp_cache_file.open("xb") as dst:
> +                    with urllib.request.urlopen(self.url) as resp:
> +                        copyfileobj(resp, dst)
> +                break
> +            except FileExistsError:
> +                # Another thread already seems to download this asset,
> +                # so wait until it is done
> +                self.log.debug("%s already exists, "
> +                               "waiting for other thread to finish...",
> +                               tmp_cache_file)
> +                try:
> +                    current_size = tmp_cache_file.stat().st_size
> +                    new_size = current_size
> +                except:
> +                    if os.path.exists(self.cache_file):
> +                        return str(self.cache_file)
> +                    raise
> +                waittime = lastchange = 600
> +                while waittime > 0:
> +                    sleep(1)
> +                    waittime -= 1
> +                    try:
> +                        new_size = tmp_cache_file.stat().st_size
> +                    except:
> +                        if os.path.exists(self.cache_file):
> +                            return str(self.cache_file)
> +                        raise
> +                    if new_size != current_size:
> +                        lastchange = waittime
> +                        current_size = new_size
> +                    elif lastchange - waittime > 90:
> +                       self.log.debug("%s seems to be stale ... "
> +                                      "deleting and retrying download...",
> +                                      tmp_cache_file)
> +                       tmp_cache_file.unlink()
> +                       break
> +                if waittime > 0:
> +                    continue
> +                raise
> +            except Exception as e:
> +                self.log.error("Unable to download %s: %s", self.url, e)
> +                tmp_cache_file.unlink()
> +                raise
> 
> -        try:
> -            with tmp_cache_file.open("wb+") as dst:
> -                copyfileobj(resp, dst)
> -        except:
> -            tmp_cache_file.unlink()
> -            raise
>          try:
>              # Set these just for informational purposes
>              os.setxattr(str(tmp_cache_file), "user.qemu-asset-url",
> 
> ?
> 
> I tried it with a stale file in my cache, and it seems to work - after 90
> seconds, one of the threads is properly trying to redownload the file.

Yeah, I think that'll work ok 

With regards,
Daniel
diff mbox series

Patch

diff --git a/tests/Makefile.include b/tests/Makefile.include
index 66c8cc3123..010369bd3a 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -161,7 +161,8 @@  $(FUNCTIONAL_TARGETS):
 
 .PHONY: check-functional
 check-functional:
-	@$(MAKE) SPEED=thorough check-func check-func-quick
+	@$(NINJA) precache-functional
+	@QEMU_TEST_NO_DOWNLOAD=1 $(MAKE) SPEED=thorough check-func check-func-quick
 
 # Consolidated targets
 
diff --git a/tests/functional/meson.build b/tests/functional/meson.build
index 8a8fa0ab99..cef74b82a9 100644
--- a/tests/functional/meson.build
+++ b/tests/functional/meson.build
@@ -33,6 +33,7 @@  tests_x86_64_quick = [
 tests_x86_64_thorough = [
 ]
 
+precache_all = []
 foreach speed : ['quick', 'thorough']
   foreach dir : target_dirs
     if not dir.endswith('-softmmu')
@@ -63,11 +64,35 @@  foreach speed : ['quick', 'thorough']
                                meson.current_source_dir())
 
     foreach test : target_tests
-      test('func-@0@/@1@'.format(target_base, test),
+      testname = '@0@-@1@'.format(target_base, test)
+      testfile = 'test_' + test + '.py'
+      testpath = meson.current_source_dir() / testfile
+      teststamp = testname + '.tstamp'
+      test_precache_env = environment()
+      test_precache_env.set('QEMU_TEST_PRECACHE', meson.current_build_dir() / teststamp)
+      test_precache_env.set('PYTHONPATH', meson.project_source_root() / 'python:' +
+                                          meson.current_source_dir())
+      precache = custom_target('func-precache-' + testname,
+                               output: teststamp,
+                               command: [python, testpath],
+                               depend_files: files(testpath),
+                               build_by_default: false,
+                               env: test_precache_env)
+      precache_all += precache
+
+      # Ideally we would add 'precache' to 'depends' here, such that
+      # 'build_by_default: false' lets the pre-caching automatically
+      # run immediately before the test runs. In practice this is
+      # broken in meson, with it running the pre-caching in the normal
+      # compile phase https://github.com/mesonbuild/meson/issues/2518
+      # If the above bug ever gets fixed, when QEMU changes the min
+      # meson version, add the 'depends' and remove the custom
+      # 'run_target' logic below & in Makefile.include
+      test('func-' + testname,
            python,
            depends: [test_deps, test_emulator, emulator_modules],
            env: test_env,
-           args: [meson.current_source_dir() / 'test_' + test + '.py'],
+           args: [testpath],
            protocol: 'tap',
            timeout: test_timeouts.get(test, 60),
            priority: test_timeouts.get(test, 60),
@@ -75,3 +100,7 @@  foreach speed : ['quick', 'thorough']
     endforeach
   endforeach
 endforeach
+
+run_target('precache-functional',
+           depends: precache_all,
+           command: ['true'])
diff --git a/tests/functional/qemu_test/asset.py b/tests/functional/qemu_test/asset.py
index cbeb6278af..9250ff2b06 100644
--- a/tests/functional/qemu_test/asset.py
+++ b/tests/functional/qemu_test/asset.py
@@ -9,6 +9,8 @@ 
 import logging
 import os
 import subprocess
+import sys
+import unittest
 import urllib.request
 from pathlib import Path
 from shutil import copyfileobj
@@ -62,6 +64,9 @@  def fetch(self):
                            self.cache_file, self.url)
             return str(self.cache_file)
 
+        if os.environ.get("QEMU_TEST_NO_DOWNLOAD", False):
+            raise Exception("Asset cache is invalid and downloads disabled")
+
         self.log.info("Downloading %s to %s...", self.url, self.cache_file)
         tmp_cache_file = self.cache_file.with_suffix(".download")
 
@@ -95,3 +100,32 @@  def fetch(self):
 
         self.log.info("Cached %s at %s" % (self.url, self.cache_file))
         return str(self.cache_file)
+
+    def precache_test(test):
+        log = logging.getLogger('qemu-test')
+        log.setLevel(logging.DEBUG)
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setLevel(logging.DEBUG)
+        formatter = logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        log.addHandler(handler)
+        for name, asset in vars(test.__class__).items():
+            if name.startswith("ASSET_") and type(asset) == Asset:
+                log.info("Attempting to cache '%s'" % asset)
+                asset.fetch()
+        log.removeHandler(handler)
+
+    def precache_suite(suite):
+        for test in suite:
+            if isinstance(test, unittest.TestSuite):
+                Asset.precache_suite(test)
+            elif isinstance(test, unittest.TestCase):
+                Asset.precache_test(test)
+
+    def precache_suites(path, cacheTstamp):
+        loader = unittest.loader.defaultTestLoader
+        tests = loader.loadTestsFromNames([path], None)
+
+        with open(cacheTstamp, "w") as fh:
+            Asset.precache_suite(tests)
diff --git a/tests/functional/qemu_test/testcase.py b/tests/functional/qemu_test/testcase.py
index b2dd863c6e..18314be9d1 100644
--- a/tests/functional/qemu_test/testcase.py
+++ b/tests/functional/qemu_test/testcase.py
@@ -21,6 +21,7 @@ 
 from qemu.machine import QEMUMachine
 from qemu.utils import kvm_available, tcg_available
 
+from .asset import Asset
 from .cmd import run_cmd
 from .config import BUILD_DIR
 
@@ -58,6 +59,12 @@  def tearDown(self):
 
     def main():
         path = os.path.basename(sys.argv[0])[:-3]
+
+        cache = os.environ.get("QEMU_TEST_PRECACHE", None)
+        if cache is not None:
+            Asset.precache_suites(path, cache)
+            return
+
         tr = pycotap.TAPTestRunner(message_log = pycotap.LogMode.LogToError,
                                    test_output_log = pycotap.LogMode.LogToError)
         unittest.main(module = None, testRunner = tr, argv=["__dummy__", path])