diff mbox series

[v3,10/10] tests/migration-test: Add a test for postcopy hangs during RECOVER

Message ID 20231004220240.167175-11-peterx@redhat.com
State New
Headers show
Series migration: Better error handling in rp thread, allow failures in recover | expand

Commit Message

Peter Xu Oct. 4, 2023, 10:02 p.m. UTC
From: Fabiano Rosas <farosas@suse.de>

To do so, create two socket pairs that do not carry any real data.  Feed
those fake sockets to the src/dst QEMUs for recovery, so that both enter
the RECOVER stage and stay stuck there.  Test that we can always kick them
out of it and then recover again with the right ports.

This patch is based on Fabiano's version here:

https://lore.kernel.org/r/877cowmdu0.fsf@suse.de

Signed-off-by: Fabiano Rosas <farosas@suse.de>
[peterx: write commit message, remove case 1, fix bugs, and more]
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 tests/qtest/migration-test.c | 94 ++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

Comments

Fabiano Rosas Oct. 5, 2023, 1:24 p.m. UTC | #1
Peter Xu <peterx@redhat.com> writes:

> From: Fabiano Rosas <farosas@suse.de>
>
> To do so, create two paired sockets, but make them not providing real data.
> Feed those fake sockets to src/dst QEMUs for recovery to let them go into
> RECOVER stage without going out.  Test that we can always kick it out and
> recover again with the right ports.
>
> This patch is based on Fabiano's version here:
>
> https://lore.kernel.org/r/877cowmdu0.fsf@suse.de
>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> [peterx: write commit message, remove case 1, fix bugs, and more]
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  tests/qtest/migration-test.c | 94 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 94 insertions(+)
>
> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
> index 46f1c275a2..fb7a3765e4 100644
> --- a/tests/qtest/migration-test.c
> +++ b/tests/qtest/migration-test.c
> @@ -729,6 +729,7 @@ typedef struct {
>      /* Postcopy specific fields */
>      void *postcopy_data;
>      bool postcopy_preempt;
> +    bool postcopy_recovery_test_fail;
>  } MigrateCommon;
>  
>  static int test_migrate_start(QTestState **from, QTestState **to,
> @@ -1381,6 +1382,78 @@ static void test_postcopy_preempt_tls_psk(void)
>  }
>  #endif
>  
> +static void wait_for_postcopy_status(QTestState *one, const char *status)
> +{
> +    wait_for_migration_status(one, status,
> +                              (const char * []) { "failed", "active",
> +                                                  "completed", NULL });
> +}
> +
> +static void postcopy_recover_fail(QTestState *from, QTestState *to)
> +{
> +    int ret, pair1[2], pair2[2];
> +    char c;
> +
> +    /* Create two unrelated socketpairs */
> +    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair1);
> +    g_assert_cmpint(ret, ==, 0);
> +
> +    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair2);
> +    g_assert_cmpint(ret, ==, 0);
> +
> +    /*
> +     * Give the guests unpaired ends of the sockets, so they'll all blocked
> +     * at reading.  This mimics a wrong channel established.
> +     */
> +    qtest_qmp_fds_assert_success(from, &pair1[0], 1,
> +                                 "{ 'execute': 'getfd',"
> +                                 "  'arguments': { 'fdname': 'fd-mig' }}");
> +    qtest_qmp_fds_assert_success(to, &pair2[0], 1,
> +                                 "{ 'execute': 'getfd',"
> +                                 "  'arguments': { 'fdname': 'fd-mig' }}");
> +
> +    /*
> +     * Write the 1st byte as QEMU_VM_COMMAND (0x8) for the dest socket, to
> +     * emulate the 1st byte of a real recovery, but stops from there to
> +     * keep dest QEMU in RECOVER.  This is needed so that we can kick off
> +     * the recover process on dest QEMU (by triggering the G_IO_IN event).
> +     *
> +     * NOTE: this trick is not needed on src QEMUs, because src doesn't
> +     * rely on an pre-existing G_IO_IN event, so it will always trigger the
> +     * upcoming recovery anyway even if it can read nothing.
> +     */
> +#define QEMU_VM_COMMAND              0x08
> +    c = QEMU_VM_COMMAND;
> +    ret = send(pair2[1], &c, 1, 0);
> +    g_assert_cmpint(ret, ==, 1);
> +
> +    migrate_recover(to, "fd:fd-mig");
> +    migrate_qmp(from, "fd:fd-mig", "{'resume': true}");
> +
> +    /*
> +     * Make sure both QEMU instances will go into RECOVER stage, then test
> +     * kicking them out using migrate-pause.
> +     */
> +    wait_for_postcopy_status(from, "postcopy-recover");
> +    wait_for_postcopy_status(to, "postcopy-recover");

Is this wait out of place? I think we're trying to resume too fast after
migrate_recover():

# {                        
#     "error": {                                                                                                                                                                               
#         "class": "GenericError",                                                                                                                                                             
#         "desc": "Cannot resume if there is no paused migration"
#     }                                                                                                                                                                                        
# }  

> +
> +    /*
> +     * This would be issued by the admin upon noticing the hang, we should
> +     * make sure we're able to kick this out.
> +     */
> +    migrate_pause(from);
> +    wait_for_postcopy_status(from, "postcopy-paused");
> +
> +    /* Do the same test on dest */
> +    migrate_pause(to);
> +    wait_for_postcopy_status(to, "postcopy-paused");
> +
> +    close(pair1[0]);
> +    close(pair1[1]);
> +    close(pair2[0]);
> +    close(pair2[1]);
> +}
> +
>  static void test_postcopy_recovery_common(MigrateCommon *args)
>  {
>      QTestState *from, *to;
> @@ -1420,6 +1493,15 @@ static void test_postcopy_recovery_common(MigrateCommon *args)
>                                (const char * []) { "failed", "active",
>                                                    "completed", NULL });
>  
> +    if (args->postcopy_recovery_test_fail) {
> +        /*
> +         * Test when a wrong socket specified for recover, and then the
> +         * ability to kick it out, and continue with a correct socket.
> +         */
> +        postcopy_recover_fail(from, to);
> +        /* continue with a good recovery */
> +    }
> +
>      /*
>       * Create a new socket to emulate a new channel that is different
>       * from the broken migration channel; tell the destination to
> @@ -1459,6 +1541,15 @@ static void test_postcopy_recovery_compress(void)
>      test_postcopy_recovery_common(&args);
>  }
>  
> +static void test_postcopy_recovery_double_fail(void)
> +{
> +    MigrateCommon args = {
> +        .postcopy_recovery_test_fail = true,
> +    };
> +
> +    test_postcopy_recovery_common(&args);
> +}
> +
>  #ifdef CONFIG_GNUTLS
>  static void test_postcopy_recovery_tls_psk(void)
>  {
> @@ -2841,6 +2932,9 @@ int main(int argc, char **argv)
>              qtest_add_func("/migration/postcopy/recovery/compress/plain",
>                             test_postcopy_recovery_compress);
>          }
> +        qtest_add_func("/migration/postcopy/recovery/double-failures",
> +                       test_postcopy_recovery_double_fail);
> +
>      }
>  
>      qtest_add_func("/migration/bad_dest", test_baddest);
Fabiano Rosas Oct. 5, 2023, 1:37 p.m. UTC | #2
Fabiano Rosas <farosas@suse.de> writes:

> Peter Xu <peterx@redhat.com> writes:
>
>> From: Fabiano Rosas <farosas@suse.de>
>>
>> To do so, create two paired sockets, but make them not providing real data.
>> Feed those fake sockets to src/dst QEMUs for recovery to let them go into
>> RECOVER stage without going out.  Test that we can always kick it out and
>> recover again with the right ports.
>>
>> This patch is based on Fabiano's version here:
>>
>> https://lore.kernel.org/r/877cowmdu0.fsf@suse.de
>>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> [peterx: write commit message, remove case 1, fix bugs, and more]
>> Signed-off-by: Peter Xu <peterx@redhat.com>
>> ---
>>  tests/qtest/migration-test.c | 94 ++++++++++++++++++++++++++++++++++++
>>  1 file changed, 94 insertions(+)
>>
>> diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
>> index 46f1c275a2..fb7a3765e4 100644
>> --- a/tests/qtest/migration-test.c
>> +++ b/tests/qtest/migration-test.c
>> @@ -729,6 +729,7 @@ typedef struct {
>>      /* Postcopy specific fields */
>>      void *postcopy_data;
>>      bool postcopy_preempt;
>> +    bool postcopy_recovery_test_fail;
>>  } MigrateCommon;
>>  
>>  static int test_migrate_start(QTestState **from, QTestState **to,
>> @@ -1381,6 +1382,78 @@ static void test_postcopy_preempt_tls_psk(void)
>>  }
>>  #endif
>>  
>> +static void wait_for_postcopy_status(QTestState *one, const char *status)
>> +{
>> +    wait_for_migration_status(one, status,
>> +                              (const char * []) { "failed", "active",
>> +                                                  "completed", NULL });
>> +}
>> +
>> +static void postcopy_recover_fail(QTestState *from, QTestState *to)
>> +{
>> +    int ret, pair1[2], pair2[2];
>> +    char c;
>> +
>> +    /* Create two unrelated socketpairs */
>> +    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair1);
>> +    g_assert_cmpint(ret, ==, 0);
>> +
>> +    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair2);
>> +    g_assert_cmpint(ret, ==, 0);
>> +
>> +    /*
>> +     * Give the guests unpaired ends of the sockets, so they'll all blocked
>> +     * at reading.  This mimics a wrong channel established.
>> +     */
>> +    qtest_qmp_fds_assert_success(from, &pair1[0], 1,
>> +                                 "{ 'execute': 'getfd',"
>> +                                 "  'arguments': { 'fdname': 'fd-mig' }}");
>> +    qtest_qmp_fds_assert_success(to, &pair2[0], 1,
>> +                                 "{ 'execute': 'getfd',"
>> +                                 "  'arguments': { 'fdname': 'fd-mig' }}");
>> +
>> +    /*
>> +     * Write the 1st byte as QEMU_VM_COMMAND (0x8) for the dest socket, to
>> +     * emulate the 1st byte of a real recovery, but stops from there to
>> +     * keep dest QEMU in RECOVER.  This is needed so that we can kick off
>> +     * the recover process on dest QEMU (by triggering the G_IO_IN event).
>> +     *
>> +     * NOTE: this trick is not needed on src QEMUs, because src doesn't
>> +     * rely on an pre-existing G_IO_IN event, so it will always trigger the
>> +     * upcoming recovery anyway even if it can read nothing.
>> +     */
>> +#define QEMU_VM_COMMAND              0x08
>> +    c = QEMU_VM_COMMAND;
>> +    ret = send(pair2[1], &c, 1, 0);
>> +    g_assert_cmpint(ret, ==, 1);
>> +
>> +    migrate_recover(to, "fd:fd-mig");
>> +    migrate_qmp(from, "fd:fd-mig", "{'resume': true}");
>> +
>> +    /*
>> +     * Make sure both QEMU instances will go into RECOVER stage, then test
>> +     * kicking them out using migrate-pause.
>> +     */
>> +    wait_for_postcopy_status(from, "postcopy-recover");
>> +    wait_for_postcopy_status(to, "postcopy-recover");
>
> Is this wait out of place? I think we're trying to resume too fast after
> migrate_recover():
>
> # {
> #     "error": {
> #         "class": "GenericError",
> #         "desc": "Cannot resume if there is no paused migration"
> #     }
> # }
>

Ugh, sorry about the long lines:

{
    "error": {
        "class": "GenericError",
        "desc": "Cannot resume if there is no paused migration"
    }
}
Peter Xu Oct. 5, 2023, 8:55 p.m. UTC | #3
On Thu, Oct 05, 2023 at 10:37:56AM -0300, Fabiano Rosas wrote:
> >> +    /*
> >> +     * Make sure both QEMU instances will go into RECOVER stage, then test
> >> +     * kicking them out using migrate-pause.
> >> +     */
> >> +    wait_for_postcopy_status(from, "postcopy-recover");
> >> +    wait_for_postcopy_status(to, "postcopy-recover");
> >
> > Is this wait out of place? I think we're trying to resume too fast after
> > migrate_recover():
> >
> > # {
> > #     "error": {
> > #         "class": "GenericError",
> > #         "desc": "Cannot resume if there is no paused migration"
> > #     }
> > # }
> >
> 
> Ugh, sorry about the long lines:
> 
> {
>     "error": {
>         "class": "GenericError",
>         "desc": "Cannot resume if there is no paused migration"
>     }
> }

Sorry, I didn't get you here.  Could you elaborate on your question?

Here we wait on both sides and make sure they'll all be in RECOVER stage,
and we should know that they won't proceed further because the pipes
contain mostly nothing so they'll just block at the pipes.  What did I
miss?

Thanks,
Fabiano Rosas Oct. 5, 2023, 9:10 p.m. UTC | #4
Peter Xu <peterx@redhat.com> writes:

> On Thu, Oct 05, 2023 at 10:37:56AM -0300, Fabiano Rosas wrote:
>> >> +    /*
>> >> +     * Make sure both QEMU instances will go into RECOVER stage, then test
>> >> +     * kicking them out using migrate-pause.
>> >> +     */
>> >> +    wait_for_postcopy_status(from, "postcopy-recover");
>> >> +    wait_for_postcopy_status(to, "postcopy-recover");
>> >
>> > Is this wait out of place? I think we're trying to resume too fast after
>> > migrate_recover():
>> >
>> > # {
>> > #     "error": {
>> > #         "class": "GenericError",
>> > #         "desc": "Cannot resume if there is no paused migration"
>> > #     }
>> > # }
>> >
>> 
>> Ugh, sorry about the long lines:
>> 
>> {
>>     "error": {
>>         "class": "GenericError",
>>         "desc": "Cannot resume if there is no paused migration"
>>     }
>> }
>
> Sorry I didn't get you here.  Could you elaborate your question?
>

The test is sometimes failing with the above message.

But indeed my question doesn't make sense. I forgot migrate_recover
happens on the destination. Nevermind.

The bug is still present nonetheless. We're going into migrate_prepare
in some state other than POSTCOPY_PAUSED.
Peter Xu Oct. 5, 2023, 9:44 p.m. UTC | #5
On Thu, Oct 05, 2023 at 06:10:20PM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Thu, Oct 05, 2023 at 10:37:56AM -0300, Fabiano Rosas wrote:
> >> >> +    /*
> >> >> +     * Make sure both QEMU instances will go into RECOVER stage, then test
> >> >> +     * kicking them out using migrate-pause.
> >> >> +     */
> >> >> +    wait_for_postcopy_status(from, "postcopy-recover");
> >> >> +    wait_for_postcopy_status(to, "postcopy-recover");
> >> >
> >> > Is this wait out of place? I think we're trying to resume too fast after
> >> > migrate_recover():
> >> >
> >> > # {
> >> > #     "error": {
> >> > #         "class": "GenericError",
> >> > #         "desc": "Cannot resume if there is no paused migration"
> >> > #     }
> >> > # }
> >> >
> >> 
> >> Ugh, sorry about the long lines:
> >> 
> >> {
> >>     "error": {
> >>         "class": "GenericError",
> >>         "desc": "Cannot resume if there is no paused migration"
> >>     }
> >> }
> >
> > Sorry I didn't get you here.  Could you elaborate your question?
> >
> 
> The test is sometimes failing with the above message.
> 
> But indeed my question doesn't make sense. I forgot migrate_recover
> happens on the destination. Nevermind.
> 
> The bug is still present nonetheless. We're going into migrate_prepare
> in some state other than POSTCOPY_PAUSED.

Oh I see.  Interestingly I cannot reproduce on my host, just like last
time..

What is your setup for running the test?  Anything special?  Here's my
cmdline:

$ cat reproduce.sh 
index=$1
loop=0

while :; do
        echo "Starting loop=$loop..."
        QTEST_QEMU_BINARY=./qemu-system-x86_64 ./tests/qtest/migration-test -p /x86_64/migration/postcopy/recovery/double-failures
        if [[ $? != 0 ]]; then
                echo "index $index REPRODUCED (loop=$loop) !"
                break
        fi
        loop=$(( loop + 1 ))
done

Survives 200+ loops and kept going.

However, I think I see what's wrong here; could you help try the fixup below?

Thanks,

===8<===
From 52bd2cd5ddf472e0bb99789dba3660a626382630 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Thu, 5 Oct 2023 17:38:42 -0400
Subject: [PATCH] fixup! tests/migration-test: Add a test for postcopy hangs
 during RECOVER

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 tests/qtest/migration-test.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index fb7a3765e4..1bdae0a579 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1489,9 +1489,8 @@ static void test_postcopy_recovery_common(MigrateCommon *args)
      * migrate-recover command can only succeed if destination machine
      * is in the paused state
      */
-    wait_for_migration_status(to, "postcopy-paused",
-                              (const char * []) { "failed", "active",
-                                                  "completed", NULL });
+    wait_for_postcopy_status(to, "postcopy-paused");
+    wait_for_postcopy_status(from, "postcopy-paused");
 
     if (args->postcopy_recovery_test_fail) {
         /*
@@ -1514,9 +1513,6 @@ static void test_postcopy_recovery_common(MigrateCommon *args)
      * Try to rebuild the migration channel using the resume flag and
      * the newly created channel
      */
-    wait_for_migration_status(from, "postcopy-paused",
-                              (const char * []) { "failed", "active",
-                                                  "completed", NULL });
     migrate_qmp(from, uri, "{'resume': true}");
 
     /* Restore the postcopy bandwidth to unlimited */
Fabiano Rosas Oct. 5, 2023, 10:01 p.m. UTC | #6
Peter Xu <peterx@redhat.com> writes:

> On Thu, Oct 05, 2023 at 06:10:20PM -0300, Fabiano Rosas wrote:
>> Peter Xu <peterx@redhat.com> writes:
>> 
>> > On Thu, Oct 05, 2023 at 10:37:56AM -0300, Fabiano Rosas wrote:
>> >> >> +    /*
>> >> >> +     * Make sure both QEMU instances will go into RECOVER stage, then test
>> >> >> +     * kicking them out using migrate-pause.
>> >> >> +     */
>> >> >> +    wait_for_postcopy_status(from, "postcopy-recover");
>> >> >> +    wait_for_postcopy_status(to, "postcopy-recover");
>> >> >
>> >> > Is this wait out of place? I think we're trying to resume too fast after
>> >> > migrate_recover():
>> >> >
>> >> > # {
>> >> > #     "error": {
>> >> > #         "class": "GenericError",
>> >> > #         "desc": "Cannot resume if there is no paused migration"
>> >> > #     }
>> >> > # }
>> >> >
>> >> 
>> >> Ugh, sorry about the long lines:
>> >> 
>> >> {
>> >>     "error": {
>> >>         "class": "GenericError",
>> >>         "desc": "Cannot resume if there is no paused migration"
>> >>     }
>> >> }
>> >
>> > Sorry I didn't get you here.  Could you elaborate your question?
>> >
>> 
>> The test is sometimes failing with the above message.
>> 
>> But indeed my question doesn't make sense. I forgot migrate_recover
>> happens on the destination. Nevermind.
>> 
>> The bug is still present nonetheless. We're going into migrate_prepare
>> in some state other than POSTCOPY_PAUSED.
>
> Oh I see.  Interestingly I cannot reproduce on my host, just like last
> time..
>
> What is your setup for running the test?  Anything special?  Here's my
> cmdline:

The crudest oneliner:

for i in $(seq 1 9999); do echo "$i ============="; \
QTEST_QEMU_BINARY=./qemu-system-x86_64 \
./tests/qtest/migration-test -r /x86_64/migration/postcopy/recovery || break ; done

I suspect my system has something specific to it that affects the timing
of the tests. But I have no idea what it could be.

$ lscpu       
Architecture:            x86_64      
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         39 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  16
  On-line CPU(s) list:   0-15
Vendor ID:               GenuineIntel
  Model name:            11th Gen Intel(R) Core(TM) i7-11850H @ 2.50GHz
    CPU family:          6
    Model:               141
    Thread(s) per core:  2
    Core(s) per socket:  8
    Socket(s):           1
    Stepping:            1
    CPU max MHz:         4800.0000
    CPU min MHz:         800.0000
    BogoMIPS:            4992.00

>
> $ cat reproduce.sh 
> index=$1
> loop=0
>
> while :; do
>         echo "Starting loop=$loop..."
>         QTEST_QEMU_BINARY=./qemu-system-x86_64 ./tests/qtest/migration-test -p /x86_64/migration/postcopy/recovery/double-failures
>         if [[ $? != 0 ]]; then
>                 echo "index $index REPRODUCED (loop=$loop) !"
>                 break
>         fi
>         loop=$(( loop + 1 ))
> done
>
> Survives 200+ loops and kept going.
>
> However I think I saw what's wrong here, could you help try below fixup?
>

Sure. I won't get to it until tomorrow though.
Fabiano Rosas Oct. 9, 2023, 4:50 p.m. UTC | #7
Fabiano Rosas <farosas@suse.de> writes:

> Peter Xu <peterx@redhat.com> writes:
>
>> On Thu, Oct 05, 2023 at 06:10:20PM -0300, Fabiano Rosas wrote:
>>> Peter Xu <peterx@redhat.com> writes:
>>> 
>>> > On Thu, Oct 05, 2023 at 10:37:56AM -0300, Fabiano Rosas wrote:
>>> >> >> +    /*
>>> >> >> +     * Make sure both QEMU instances will go into RECOVER stage, then test
>>> >> >> +     * kicking them out using migrate-pause.
>>> >> >> +     */
>>> >> >> +    wait_for_postcopy_status(from, "postcopy-recover");
>>> >> >> +    wait_for_postcopy_status(to, "postcopy-recover");
>>> >> >
>>> >> > Is this wait out of place? I think we're trying to resume too fast after
>>> >> > migrate_recover():
>>> >> >
>>> >> > # {
>>> >> > #     "error": {
>>> >> > #         "class": "GenericError",
>>> >> > #         "desc": "Cannot resume if there is no paused migration"
>>> >> > #     }
>>> >> > # }
>>> >> >
>>> >> 
>>> >> Ugh, sorry about the long lines:
>>> >> 
>>> >> {
>>> >>     "error": {
>>> >>         "class": "GenericError",
>>> >>         "desc": "Cannot resume if there is no paused migration"
>>> >>     }
>>> >> }
>>> >
>>> > Sorry I didn't get you here.  Could you elaborate your question?
>>> >
>>> 
>>> The test is sometimes failing with the above message.
>>> 
>>> But indeed my question doesn't make sense. I forgot migrate_recover
>>> happens on the destination. Nevermind.
>>> 
>>> The bug is still present nonetheless. We're going into migrate_prepare
>>> in some state other than POSTCOPY_PAUSED.
>>
>> Oh I see.  Interestingly I cannot reproduce on my host, just like last
>> time..
>>
>> What is your setup for running the test?  Anything special?  Here's my
>> cmdline:
>
> The crudest oneliner:
>
> for i in $(seq 1 9999); do echo "$i ============="; \
> QTEST_QEMU_BINARY=./qemu-system-x86_64 \
> ./tests/qtest/migration-test -r /x86_64/migration/postcopy/recovery || break ; done
>
> I suspect my system has something specific to it that affects the timing
> of the tests. But I have no idea what it could be.
>
> $ lscpu       
> Architecture:            x86_64      
>   CPU op-mode(s):        32-bit, 64-bit
>   Address sizes:         39 bits physical, 48 bits virtual
>   Byte Order:            Little Endian
> CPU(s):                  16
>   On-line CPU(s) list:   0-15
> Vendor ID:               GenuineIntel
>   Model name:            11th Gen Intel(R) Core(TM) i7-11850H @ 2.50GHz
>     CPU family:          6
>     Model:               141
>     Thread(s) per core:  2
>     Core(s) per socket:  8
>     Socket(s):           1
>     Stepping:            1
>     CPU max MHz:         4800.0000
>     CPU min MHz:         800.0000
>     BogoMIPS:            4992.00
>
>>
>> $ cat reproduce.sh 
>> index=$1
>> loop=0
>>
>> while :; do
>>         echo "Starting loop=$loop..."
>>         QTEST_QEMU_BINARY=./qemu-system-x86_64 ./tests/qtest/migration-test -p /x86_64/migration/postcopy/recovery/double-failures
>>         if [[ $? != 0 ]]; then
>>                 echo "index $index REPRODUCED (loop=$loop) !"
>>                 break
>>         fi
>>         loop=$(( loop + 1 ))
>> done
>>
>> Survives 200+ loops and kept going.
>>
>> However I think I saw what's wrong here, could you help try below fixup?
>>
>
> Sure. I won't get to it until tomorrow though.

It seems to have fixed the issue. 3500 iterations and still going.
Peter Xu Oct. 10, 2023, 4 p.m. UTC | #8
On Mon, Oct 09, 2023 at 01:50:08PM -0300, Fabiano Rosas wrote:
> It seems to have fixed the issue. 3500 iterations and still going.

I'll go with that then, but feel free to report whenever that's hit again.

Thanks a lot.
diff mbox series

Patch

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 46f1c275a2..fb7a3765e4 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -729,6 +729,7 @@  typedef struct {
     /* Postcopy specific fields */
     void *postcopy_data;
     bool postcopy_preempt;
+    bool postcopy_recovery_test_fail;
 } MigrateCommon;
 
 static int test_migrate_start(QTestState **from, QTestState **to,
@@ -1381,6 +1382,78 @@  static void test_postcopy_preempt_tls_psk(void)
 }
 #endif
 
+static void wait_for_postcopy_status(QTestState *one, const char *status)
+{
+    wait_for_migration_status(one, status,
+                              (const char * []) { "failed", "active",
+                                                  "completed", NULL });
+}
+
+static void postcopy_recover_fail(QTestState *from, QTestState *to)
+{
+    int ret, pair1[2], pair2[2];
+    char c;
+
+    /* Create two unrelated socketpairs */
+    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair1);
+    g_assert_cmpint(ret, ==, 0);
+
+    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair2);
+    g_assert_cmpint(ret, ==, 0);
+
+    /*
+     * Give the guests unpaired ends of the sockets, so they'll all be
+     * blocked at reading.  This mimics a wrong channel being established.
+     */
+    qtest_qmp_fds_assert_success(from, &pair1[0], 1,
+                                 "{ 'execute': 'getfd',"
+                                 "  'arguments': { 'fdname': 'fd-mig' }}");
+    qtest_qmp_fds_assert_success(to, &pair2[0], 1,
+                                 "{ 'execute': 'getfd',"
+                                 "  'arguments': { 'fdname': 'fd-mig' }}");
+
+    /*
+     * Write the 1st byte as QEMU_VM_COMMAND (0x8) for the dest socket, to
+     * emulate the 1st byte of a real recovery, but stop there to keep
+     * dest QEMU in RECOVER.  This is needed so that we can kick off
+     * the recover process on dest QEMU (by triggering the G_IO_IN event).
+     *
+     * NOTE: this trick is not needed on src QEMUs, because src doesn't
+     * rely on a pre-existing G_IO_IN event, so it will always trigger the
+     * upcoming recovery anyway even if it can read nothing.
+     */
+#define QEMU_VM_COMMAND              0x08
+    c = QEMU_VM_COMMAND;
+    ret = send(pair2[1], &c, 1, 0);
+    g_assert_cmpint(ret, ==, 1);
+
+    migrate_recover(to, "fd:fd-mig");
+    migrate_qmp(from, "fd:fd-mig", "{'resume': true}");
+
+    /*
+     * Make sure both QEMU instances will go into RECOVER stage, then test
+     * kicking them out using migrate-pause.
+     */
+    wait_for_postcopy_status(from, "postcopy-recover");
+    wait_for_postcopy_status(to, "postcopy-recover");
+
+    /*
+     * This would be issued by the admin upon noticing the hang, we should
+     * make sure we're able to kick this out.
+     */
+    migrate_pause(from);
+    wait_for_postcopy_status(from, "postcopy-paused");
+
+    /* Do the same test on dest */
+    migrate_pause(to);
+    wait_for_postcopy_status(to, "postcopy-paused");
+
+    close(pair1[0]);
+    close(pair1[1]);
+    close(pair2[0]);
+    close(pair2[1]);
+}
+
 static void test_postcopy_recovery_common(MigrateCommon *args)
 {
     QTestState *from, *to;
@@ -1420,6 +1493,15 @@  static void test_postcopy_recovery_common(MigrateCommon *args)
                               (const char * []) { "failed", "active",
                                                   "completed", NULL });
 
+    if (args->postcopy_recovery_test_fail) {
+        /*
+         * Test that when a wrong socket is specified for recovery, we are
+         * able to kick it out and continue with a correct socket.
+         */
+        postcopy_recover_fail(from, to);
+        /* continue with a good recovery */
+    }
+
     /*
      * Create a new socket to emulate a new channel that is different
      * from the broken migration channel; tell the destination to
@@ -1459,6 +1541,15 @@  static void test_postcopy_recovery_compress(void)
     test_postcopy_recovery_common(&args);
 }
 
+static void test_postcopy_recovery_double_fail(void)
+{
+    MigrateCommon args = {
+        .postcopy_recovery_test_fail = true,
+    };
+
+    test_postcopy_recovery_common(&args);
+}
+
 #ifdef CONFIG_GNUTLS
 static void test_postcopy_recovery_tls_psk(void)
 {
@@ -2841,6 +2932,9 @@  int main(int argc, char **argv)
             qtest_add_func("/migration/postcopy/recovery/compress/plain",
                            test_postcopy_recovery_compress);
         }
+        qtest_add_func("/migration/postcopy/recovery/double-failures",
+                       test_postcopy_recovery_double_fail);
+
     }
 
     qtest_add_func("/migration/bad_dest", test_baddest);