diff mbox

[ovs-dev] ovn-northd: Add native active-standby HA.

Message ID 20170801161918.1426-1-russell@ovn.org
State Accepted
Headers show

Commit Message

Russell Bryant Aug. 1, 2017, 4:19 p.m. UTC
Add native support for active-standby HA in ovn-northd by having each
instance attempt to acquire an OVSDB lock.  Only the instance of
ovn-northd that currently holds the lock will make active changes to
the OVN databases.

Signed-off-by: Russell Bryant <russell@ovn.org>
---
 NEWS                        |  1 +
 ovn/northd/ovn-northd.8.xml |  9 +++++++++
 ovn/northd/ovn-northd.c     | 40 +++++++++++++++++++++++++++++++---------
 3 files changed, 41 insertions(+), 9 deletions(-)

Comments

Han Zhou Aug. 1, 2017, 7:26 p.m. UTC | #1
On Tue, Aug 1, 2017 at 9:19 AM, Russell Bryant <russell@ovn.org> wrote:
>
> Add native support for active-standby HA in ovn-northd by having each
> instance attempt to acquire an OVSDB lock.  Only the instance of
> ovn-northd that currently holds the lock will make active changes to
> the OVN databases.
>
> Signed-off-by: Russell Bryant <russell@ovn.org>
> ---
>  NEWS                        |  1 +
>  ovn/northd/ovn-northd.8.xml |  9 +++++++++
>  ovn/northd/ovn-northd.c     | 40 +++++++++++++++++++++++++++++++---------
>  3 files changed, 41 insertions(+), 9 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index facea0228..f3cdd2443 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -49,6 +49,7 @@ Post-v2.7.0
>         one chassis is specified, OVN will manage high availability for
that
>         gateway.
>       * Add support for ACL logging.
> +     * ovn-northd now has native support for active-standby high
availability.
>     - Tracing with ofproto/trace now traces through recirculation.
>     - OVSDB:
>       * New support for role-based access control (see ovsdb-server(1)).
> diff --git a/ovn/northd/ovn-northd.8.xml b/ovn/northd/ovn-northd.8.xml
> index 1527e8a60..0d85ec0d2 100644
> --- a/ovn/northd/ovn-northd.8.xml
> +++ b/ovn/northd/ovn-northd.8.xml
> @@ -72,6 +72,15 @@
>        </dl>
>      </p>
>
> +    <h1>Active-Standby for High Availability</h1>
> +    <p>
> +      You may run <code>ovn-northd</code> more than once in an OVN
deployment.
> +      OVN will automatically ensure that only one of them is active at a
time.
> +      If multiple instances of <code>ovn-northd</code> are running and
the
> +      active <code>ovn-northd</code> fails, one of the hot standby
instances
> +      of <code>ovn-northd</code> will automatically take over.
> +    </p>
> +
>      <h1>Logical Flow Table Structure</h1>
>
>      <p>
> diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
> index 10e0c7ce0..3d2be4267 100644
> --- a/ovn/northd/ovn-northd.c
> +++ b/ovn/northd/ovn-northd.c
> @@ -6531,6 +6531,12 @@ main(int argc, char *argv[])
>      ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);
>      ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_name);
>
> +    /* Ensure that only a single ovn-northd is active in the deployment
by
> +     * acquiring a lock called "ovn_northd" on the southbound database
> +     * and then only performing DB transactions if the lock is held. */
> +    ovsdb_idl_set_lock(ovnsb_idl_loop.idl, "ovn_northd");
> +    bool had_lock = false;
> +
>      /* Main loop. */
>      exiting = false;
>      while (!exiting) {
> @@ -6541,15 +6547,29 @@ main(int argc, char *argv[])
>              .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
>          };
>
> -        struct chassis_index chassis_index;
> -        chassis_index_init(&chassis_index, ctx.ovnsb_idl);
> +        if (!had_lock && ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
> +            VLOG_INFO("ovn-northd lock acquired. "
> +                      "This ovn-northd instance is now active.");
> +            had_lock = true;
> +        } else if (had_lock && !ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
> +            VLOG_INFO("ovn-northd lock lost. "
> +                      "This ovn-northd instance is now on standby.");

Should it try lock again, if we want it to be standby? Otherwise, this
instance won't have a chance to be active any more.

> +            had_lock = false;
> +        }
>
> -        ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
> -        ovnsb_db_run(&ctx, &ovnsb_idl_loop);
> -        if (ctx.ovnsb_txn) {
> -            check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
> -            check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
> -            check_and_update_rbac(&ctx);
> +        struct chassis_index chassis_index;
> +        bool destroy_chassis_index = false;
> +        if (ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
> +            chassis_index_init(&chassis_index, ctx.ovnsb_idl);
> +            destroy_chassis_index = true;
> +
> +            ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
> +            ovnsb_db_run(&ctx, &ovnsb_idl_loop);
> +            if (ctx.ovnsb_txn) {
> +                check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
> +                check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
> +                check_and_update_rbac(&ctx);
> +            }
>          }
>
>          unixctl_server_run(unixctl);
> @@ -6565,7 +6585,9 @@ main(int argc, char *argv[])
>              exiting = true;
>          }
>
> -        chassis_index_destroy(&chassis_index);
> +        if (destroy_chassis_index) {
> +            chassis_index_destroy(&chassis_index);
> +        }
>      }
>
>      unixctl_server_destroy(unixctl);
> --
> 2.13.3
>

Acked-by: Han Zhou <zhouhan@gmail.com>
Russell Bryant Aug. 1, 2017, 7:48 p.m. UTC | #2
On Tue, Aug 1, 2017 at 3:26 PM, Han Zhou <zhouhan@gmail.com> wrote:
>
>
> On Tue, Aug 1, 2017 at 9:19 AM, Russell Bryant <russell@ovn.org> wrote:
>>
>> Add native support for active-standby HA in ovn-northd by having each
>> instance attempt to acquire an OVSDB lock.  Only the instance of
>> ovn-northd that currently holds the lock will make active changes to
>> the OVN databases.
>>
>> Signed-off-by: Russell Bryant <russell@ovn.org>
>> ---
>>  NEWS                        |  1 +
>>  ovn/northd/ovn-northd.8.xml |  9 +++++++++
>>  ovn/northd/ovn-northd.c     | 40 +++++++++++++++++++++++++++++++---------
>>  3 files changed, 41 insertions(+), 9 deletions(-)
>>
>> diff --git a/NEWS b/NEWS
>> index facea0228..f3cdd2443 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -49,6 +49,7 @@ Post-v2.7.0
>>         one chassis is specified, OVN will manage high availability for
>> that
>>         gateway.
>>       * Add support for ACL logging.
>> +     * ovn-northd now has native support for active-standby high
>> availability.
>>     - Tracing with ofproto/trace now traces through recirculation.
>>     - OVSDB:
>>       * New support for role-based access control (see ovsdb-server(1)).
>> diff --git a/ovn/northd/ovn-northd.8.xml b/ovn/northd/ovn-northd.8.xml
>> index 1527e8a60..0d85ec0d2 100644
>> --- a/ovn/northd/ovn-northd.8.xml
>> +++ b/ovn/northd/ovn-northd.8.xml
>> @@ -72,6 +72,15 @@
>>        </dl>
>>      </p>
>>
>> +    <h1>Active-Standby for High Availability</h1>
>> +    <p>
>> +      You may run <code>ovn-northd</code> more than once in an OVN
>> deployment.
>> +      OVN will automatically ensure that only one of them is active at a
>> time.
>> +      If multiple instances of <code>ovn-northd</code> are running and
>> the
>> +      active <code>ovn-northd</code> fails, one of the hot standby
>> instances
>> +      of <code>ovn-northd</code> will automatically take over.
>> +    </p>
>> +
>>      <h1>Logical Flow Table Structure</h1>
>>
>>      <p>
>> diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
>> index 10e0c7ce0..3d2be4267 100644
>> --- a/ovn/northd/ovn-northd.c
>> +++ b/ovn/northd/ovn-northd.c
>> @@ -6531,6 +6531,12 @@ main(int argc, char *argv[])
>>      ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);
>>      ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_name);
>>
>> +    /* Ensure that only a single ovn-northd is active in the deployment
>> by
>> +     * acquiring a lock called "ovn_northd" on the southbound database
>> +     * and then only performing DB transactions if the lock is held. */
>> +    ovsdb_idl_set_lock(ovnsb_idl_loop.idl, "ovn_northd");
>> +    bool had_lock = false;
>> +
>>      /* Main loop. */
>>      exiting = false;
>>      while (!exiting) {
>> @@ -6541,15 +6547,29 @@ main(int argc, char *argv[])
>>              .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
>>          };
>>
>> -        struct chassis_index chassis_index;
>> -        chassis_index_init(&chassis_index, ctx.ovnsb_idl);
>> +        if (!had_lock && ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
>> +            VLOG_INFO("ovn-northd lock acquired. "
>> +                      "This ovn-northd instance is now active.");
>> +            had_lock = true;
>> +        } else if (had_lock && !ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
>> +            VLOG_INFO("ovn-northd lock lost. "
>> +                      "This ovn-northd instance is now on standby.");
>
> Should it try lock again, if we want it to be standby? Otherwise, this
> instance won't have a chance to be active any more.

Good question ... I was assuming this scenario was due to a lost
connection, and that the IDL would automatically try to re-acquire the
lock.

I tested to make sure I saw a second ovn-northd go from standby to
active, but I have not tested active -> standby -> active again.

I'll take a closer look at this before applying the patch.

>
>> +            had_lock = false;
>> +        }
>>
>> -        ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
>> -        ovnsb_db_run(&ctx, &ovnsb_idl_loop);
>> -        if (ctx.ovnsb_txn) {
>> -            check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
>> -            check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
>> -            check_and_update_rbac(&ctx);
>> +        struct chassis_index chassis_index;
>> +        bool destroy_chassis_index = false;
>> +        if (ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
>> +            chassis_index_init(&chassis_index, ctx.ovnsb_idl);
>> +            destroy_chassis_index = true;
>> +
>> +            ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
>> +            ovnsb_db_run(&ctx, &ovnsb_idl_loop);
>> +            if (ctx.ovnsb_txn) {
>> +                check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
>> +                check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
>> +                check_and_update_rbac(&ctx);
>> +            }
>>          }
>>
>>          unixctl_server_run(unixctl);
>> @@ -6565,7 +6585,9 @@ main(int argc, char *argv[])
>>              exiting = true;
>>          }
>>
>> -        chassis_index_destroy(&chassis_index);
>> +        if (destroy_chassis_index) {
>> +            chassis_index_destroy(&chassis_index);
>> +        }
>>      }
>>
>>      unixctl_server_destroy(unixctl);
>> --
>> 2.13.3
>>
>
> Acked-by: Han Zhou <zhouhan@gmail.com>

Thanks for the review!
Numan Siddique Aug. 2, 2017, 1:21 a.m. UTC | #3
On Wed, Aug 2, 2017 at 1:18 AM, Russell Bryant <russell@ovn.org> wrote:

> On Tue, Aug 1, 2017 at 3:26 PM, Han Zhou <zhouhan@gmail.com> wrote:
> >
> >
> > On Tue, Aug 1, 2017 at 9:19 AM, Russell Bryant <russell@ovn.org> wrote:
> >>
> >> Add native support for active-standby HA in ovn-northd by having each
> >> instance attempt to acquire an OVSDB lock.  Only the instance of
> >> ovn-northd that currently holds the lock will make active changes to
> >> the OVN databases.
> >>
> >> Signed-off-by: Russell Bryant <russell@ovn.org>
> >> ---
> >>  NEWS                        |  1 +
> >>  ovn/northd/ovn-northd.8.xml |  9 +++++++++
> >>  ovn/northd/ovn-northd.c     | 40 ++++++++++++++++++++++++++++++
> +---------
> >>  3 files changed, 41 insertions(+), 9 deletions(-)
> >>
> >> diff --git a/NEWS b/NEWS
> >> index facea0228..f3cdd2443 100644
> >> --- a/NEWS
> >> +++ b/NEWS
> >> @@ -49,6 +49,7 @@ Post-v2.7.0
> >>         one chassis is specified, OVN will manage high availability for
> >> that
> >>         gateway.
> >>       * Add support for ACL logging.
> >> +     * ovn-northd now has native support for active-standby high
> >> availability.
> >>     - Tracing with ofproto/trace now traces through recirculation.
> >>     - OVSDB:
> >>       * New support for role-based access control (see ovsdb-server(1)).
> >> diff --git a/ovn/northd/ovn-northd.8.xml b/ovn/northd/ovn-northd.8.xml
> >> index 1527e8a60..0d85ec0d2 100644
> >> --- a/ovn/northd/ovn-northd.8.xml
> >> +++ b/ovn/northd/ovn-northd.8.xml
> >> @@ -72,6 +72,15 @@
> >>        </dl>
> >>      </p>
> >>
> >> +    <h1>Active-Standby for High Availability</h1>
> >> +    <p>
> >> +      You may run <code>ovn-northd</code> more than once in an OVN
> >> deployment.
> >> +      OVN will automatically ensure that only one of them is active at
> a
> >> time.
> >> +      If multiple instances of <code>ovn-northd</code> are running and
> >> the
> >> +      active <code>ovn-northd</code> fails, one of the hot standby
> >> instances
> >> +      of <code>ovn-northd</code> will automatically take over.
> >> +    </p>
> >> +
> >>      <h1>Logical Flow Table Structure</h1>
> >>
> >>      <p>
> >> diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
> >> index 10e0c7ce0..3d2be4267 100644
> >> --- a/ovn/northd/ovn-northd.c
> >> +++ b/ovn/northd/ovn-northd.c
> >> @@ -6531,6 +6531,12 @@ main(int argc, char *argv[])
> >>      ovsdb_idl_add_column(ovnsb_idl_loop.idl,
> &sbrec_chassis_col_nb_cfg);
> >>      ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_name);
> >>
> >> +    /* Ensure that only a single ovn-northd is active in the deployment
> >> by
> >> +     * acquiring a lock called "ovn_northd" on the southbound database
> >> +     * and then only performing DB transactions if the lock is held. */
> >> +    ovsdb_idl_set_lock(ovnsb_idl_loop.idl, "ovn_northd");
> >> +    bool had_lock = false;
> >> +
> >>      /* Main loop. */
> >>      exiting = false;
> >>      while (!exiting) {
> >> @@ -6541,15 +6547,29 @@ main(int argc, char *argv[])
> >>              .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
> >>          };
> >>
> >> -        struct chassis_index chassis_index;
> >> -        chassis_index_init(&chassis_index, ctx.ovnsb_idl);
> >> +        if (!had_lock && ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
> >> +            VLOG_INFO("ovn-northd lock acquired. "
> >> +                      "This ovn-northd instance is now active.");
> >> +            had_lock = true;
> >> +        } else if (had_lock && !ovsdb_idl_has_lock(ovnsb_idl_loop.idl))
> {
> >> +            VLOG_INFO("ovn-northd lock lost. "
> >> +                      "This ovn-northd instance is now on standby.");
> >
> > Should it try lock again, if we want it to be standby? Otherwise, this
> > instance won't have a chance to be active any more.
>
> Good question ... I was assuming this scenario was due to a lost
> connection, and that the IDL would automatically try to re-acquire the
> lock.
>
> I tested to make sure I saw a second ovn-northd go from standby to
> active, but I have not tested active -> standby -> active again.
>
> I'll take a closer look at this before applying the patch.
>
>
I tested it and it works fine. active -> standby -> active scenario also
works fine.
I also tested by restarting southbound ovsdb-server. Once ovsdb-server is
up again, the idl clients  try to
acquire the lock and one of the ovn-northd instance becomes active again.
I don't think it is required to try lock again as idl client takes care of
it.

How about starting another instance of ovn-northd in the sandbox/test
environment so that active/standby
scenario gets tested for all the ovn tests ?

Acked-by: Numan Siddique <nusiddiq@redhat.com>

Thanks
Numan

>
> >> +            had_lock = false;
> >> +        }
> >>
> >> -        ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
> >> -        ovnsb_db_run(&ctx, &ovnsb_idl_loop);
> >> -        if (ctx.ovnsb_txn) {
> >> -            check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
> >> -            check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
> >> -            check_and_update_rbac(&ctx);
> >> +        struct chassis_index chassis_index;
> >> +        bool destroy_chassis_index = false;
> >> +        if (ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
> >> +            chassis_index_init(&chassis_index, ctx.ovnsb_idl);
> >> +            destroy_chassis_index = true;
> >> +
> >> +            ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
> >> +            ovnsb_db_run(&ctx, &ovnsb_idl_loop);
> >> +            if (ctx.ovnsb_txn) {
> >> +                check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
> >> +                check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
> >> +                check_and_update_rbac(&ctx);
> >> +            }
> >>          }
> >>
> >>          unixctl_server_run(unixctl);
> >> @@ -6565,7 +6585,9 @@ main(int argc, char *argv[])
> >>              exiting = true;
> >>          }
> >>
> >> -        chassis_index_destroy(&chassis_index);
> >> +        if (destroy_chassis_index) {
> >> +            chassis_index_destroy(&chassis_index);
> >> +        }
> >>      }
> >>
> >>      unixctl_server_destroy(unixctl);
> >> --
> >> 2.13.3
> >>
> >
> > Acked-by: Han Zhou <zhouhan@gmail.com>
>
> Thanks for the review!
>
> --
> Russell Bryant
>
Russell Bryant Aug. 2, 2017, 6:05 p.m. UTC | #4
On Tue, Aug 1, 2017 at 9:21 PM, Numan Siddique <nusiddiq@redhat.com> wrote:
>
>
> On Wed, Aug 2, 2017 at 1:18 AM, Russell Bryant <russell@ovn.org> wrote:
>>
>> On Tue, Aug 1, 2017 at 3:26 PM, Han Zhou <zhouhan@gmail.com> wrote:
>> >
>> >
>> > On Tue, Aug 1, 2017 at 9:19 AM, Russell Bryant <russell@ovn.org> wrote:
>> >>
>> >> Add native support for active-standby HA in ovn-northd by having each
>> >> instance attempt to acquire an OVSDB lock.  Only the instance of
>> >> ovn-northd that currently holds the lock will make active changes to
>> >> the OVN databases.
>> >>
>> >> Signed-off-by: Russell Bryant <russell@ovn.org>
>> >> ---
>> >>  NEWS                        |  1 +
>> >>  ovn/northd/ovn-northd.8.xml |  9 +++++++++
>> >>  ovn/northd/ovn-northd.c     | 40
>> >> +++++++++++++++++++++++++++++++---------
>> >>  3 files changed, 41 insertions(+), 9 deletions(-)
>> >>
>> >> diff --git a/NEWS b/NEWS
>> >> index facea0228..f3cdd2443 100644
>> >> --- a/NEWS
>> >> +++ b/NEWS
>> >> @@ -49,6 +49,7 @@ Post-v2.7.0
>> >>         one chassis is specified, OVN will manage high availability for
>> >> that
>> >>         gateway.
>> >>       * Add support for ACL logging.
>> >> +     * ovn-northd now has native support for active-standby high
>> >> availability.
>> >>     - Tracing with ofproto/trace now traces through recirculation.
>> >>     - OVSDB:
>> >>       * New support for role-based access control (see
>> >> ovsdb-server(1)).
>> >> diff --git a/ovn/northd/ovn-northd.8.xml b/ovn/northd/ovn-northd.8.xml
>> >> index 1527e8a60..0d85ec0d2 100644
>> >> --- a/ovn/northd/ovn-northd.8.xml
>> >> +++ b/ovn/northd/ovn-northd.8.xml
>> >> @@ -72,6 +72,15 @@
>> >>        </dl>
>> >>      </p>
>> >>
>> >> +    <h1>Active-Standby for High Availability</h1>
>> >> +    <p>
>> >> +      You may run <code>ovn-northd</code> more than once in an OVN
>> >> deployment.
>> >> +      OVN will automatically ensure that only one of them is active at
>> >> a
>> >> time.
>> >> +      If multiple instances of <code>ovn-northd</code> are running and
>> >> the
>> >> +      active <code>ovn-northd</code> fails, one of the hot standby
>> >> instances
>> >> +      of <code>ovn-northd</code> will automatically take over.
>> >> +    </p>
>> >> +
>> >>      <h1>Logical Flow Table Structure</h1>
>> >>
>> >>      <p>
>> >> diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
>> >> index 10e0c7ce0..3d2be4267 100644
>> >> --- a/ovn/northd/ovn-northd.c
>> >> +++ b/ovn/northd/ovn-northd.c
>> >> @@ -6531,6 +6531,12 @@ main(int argc, char *argv[])
>> >>      ovsdb_idl_add_column(ovnsb_idl_loop.idl,
>> >> &sbrec_chassis_col_nb_cfg);
>> >>      ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_name);
>> >>
>> >> +    /* Ensure that only a single ovn-northd is active in the
>> >> deployment
>> >> by
>> >> +     * acquiring a lock called "ovn_northd" on the southbound database
>> >> +     * and then only performing DB transactions if the lock is held.
>> >> */
>> >> +    ovsdb_idl_set_lock(ovnsb_idl_loop.idl, "ovn_northd");
>> >> +    bool had_lock = false;
>> >> +
>> >>      /* Main loop. */
>> >>      exiting = false;
>> >>      while (!exiting) {
>> >> @@ -6541,15 +6547,29 @@ main(int argc, char *argv[])
>> >>              .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
>> >>          };
>> >>
>> >> -        struct chassis_index chassis_index;
>> >> -        chassis_index_init(&chassis_index, ctx.ovnsb_idl);
>> >> +        if (!had_lock && ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
>> >> +            VLOG_INFO("ovn-northd lock acquired. "
>> >> +                      "This ovn-northd instance is now active.");
>> >> +            had_lock = true;
>> >> +        } else if (had_lock &&
>> >> !ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
>> >> +            VLOG_INFO("ovn-northd lock lost. "
>> >> +                      "This ovn-northd instance is now on standby.");
>> >
>> > Should it try lock again, if we want it to be standby? Otherwise, this
>> > instance won't have a chance to be active any more.
>>
>> Good question ... I was assuming this scenario was due to a lost
>> connection, and that the IDL would automatically try to re-acquire the
>> lock.
>>
>> I tested to make sure I saw a second ovn-northd go from standby to
>> active, but I have not tested active -> standby -> active again.
>>
>> I'll take a closer look at this before applying the patch.
>>
>
> I tested it and it works fine. active -> standby -> active scenario also
> works fine.
> I also tested by restarting southbound ovsdb-server. Once ovsdb-server is up
> again, the idl clients  try to
> acquire the lock and one of the ovn-northd instance becomes active again.
> I don't think it is required to try lock again as idl client takes care of
> it.
>
> How about starting another instance of ovn-northd in the sandbox/test
> environment so that active/standby
> scenario gets tested for all the ovn tests ?
>
> Acked-by: Numan Siddique <nusiddiq@redhat.com>

Thanks!  I updated the test suite to always run a backup ovn-northd to
help ensure it doesn't break anything.  I then applied this to master.
Miguel Angel Ajo Aug. 9, 2017, 9:01 a.m. UTC | #5
Nice idea, I have btw some comments/thoughts/questions regarding this:

1) Does OVSDB have any heartbeat protocol? (to detect that one northd has
died even during inactive periods).

     Otherwise we can document the need to tweak the tcp_keepalive settings
of the system to have some sensible settings that will make TCP detect the
connection failure in a reasonable amount of time.

2) We need to consider that in some cases the master ovsdb server and the
northd process will be colocated and therefore fall together. I guess that
in that case the lock is replicated to the slave ovsdb server, we need to
make sure that the lock will be dropped once the old slave(backup) becomes
master.


Best regards,


On Wed, Aug 2, 2017 at 8:05 PM, Russell Bryant <russell@ovn.org> wrote:

> On Tue, Aug 1, 2017 at 9:21 PM, Numan Siddique <nusiddiq@redhat.com>
> wrote:
> >
> >
> > On Wed, Aug 2, 2017 at 1:18 AM, Russell Bryant <russell@ovn.org> wrote:
> >>
> >> On Tue, Aug 1, 2017 at 3:26 PM, Han Zhou <zhouhan@gmail.com> wrote:
> >> >
> >> >
> >> > On Tue, Aug 1, 2017 at 9:19 AM, Russell Bryant <russell@ovn.org>
> wrote:
> >> >>
> >> >> Add native support for active-standby HA in ovn-northd by having each
> >> >> instance attempt to acquire an OVSDB lock.  Only the instance of
> >> >> ovn-northd that currently holds the lock will make active changes to
> >> >> the OVN databases.
> >> >>
> >> >> Signed-off-by: Russell Bryant <russell@ovn.org>
> >> >> ---
> >> >>  NEWS                        |  1 +
> >> >>  ovn/northd/ovn-northd.8.xml |  9 +++++++++
> >> >>  ovn/northd/ovn-northd.c     | 40
> >> >> +++++++++++++++++++++++++++++++---------
> >> >>  3 files changed, 41 insertions(+), 9 deletions(-)
> >> >>
> >> >> diff --git a/NEWS b/NEWS
> >> >> index facea0228..f3cdd2443 100644
> >> >> --- a/NEWS
> >> >> +++ b/NEWS
> >> >> @@ -49,6 +49,7 @@ Post-v2.7.0
> >> >>         one chassis is specified, OVN will manage high availability
> for
> >> >> that
> >> >>         gateway.
> >> >>       * Add support for ACL logging.
> >> >> +     * ovn-northd now has native support for active-standby high
> >> >> availability.
> >> >>     - Tracing with ofproto/trace now traces through recirculation.
> >> >>     - OVSDB:
> >> >>       * New support for role-based access control (see
> >> >> ovsdb-server(1)).
> >> >> diff --git a/ovn/northd/ovn-northd.8.xml
> b/ovn/northd/ovn-northd.8.xml
> >> >> index 1527e8a60..0d85ec0d2 100644
> >> >> --- a/ovn/northd/ovn-northd.8.xml
> >> >> +++ b/ovn/northd/ovn-northd.8.xml
> >> >> @@ -72,6 +72,15 @@
> >> >>        </dl>
> >> >>      </p>
> >> >>
> >> >> +    <h1>Active-Standby for High Availability</h1>
> >> >> +    <p>
> >> >> +      You may run <code>ovn-northd</code> more than once in an OVN
> >> >> deployment.
> >> >> +      OVN will automatically ensure that only one of them is active
> at
> >> >> a
> >> >> time.
> >> >> +      If multiple instances of <code>ovn-northd</code> are running
> and
> >> >> the
> >> >> +      active <code>ovn-northd</code> fails, one of the hot standby
> >> >> instances
> >> >> +      of <code>ovn-northd</code> will automatically take over.
> >> >> +    </p>
> >> >> +
> >> >>      <h1>Logical Flow Table Structure</h1>
> >> >>
> >> >>      <p>
> >> >> diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
> >> >> index 10e0c7ce0..3d2be4267 100644
> >> >> --- a/ovn/northd/ovn-northd.c
> >> >> +++ b/ovn/northd/ovn-northd.c
> >> >> @@ -6531,6 +6531,12 @@ main(int argc, char *argv[])
> >> >>      ovsdb_idl_add_column(ovnsb_idl_loop.idl,
> >> >> &sbrec_chassis_col_nb_cfg);
> >> >>      ovsdb_idl_add_column(ovnsb_idl_loop.idl,
> &sbrec_chassis_col_name);
> >> >>
> >> >> +    /* Ensure that only a single ovn-northd is active in the
> >> >> deployment
> >> >> by
> >> >> +     * acquiring a lock called "ovn_northd" on the southbound
> database
> >> >> +     * and then only performing DB transactions if the lock is held.
> >> >> */
> >> >> +    ovsdb_idl_set_lock(ovnsb_idl_loop.idl, "ovn_northd");
> >> >> +    bool had_lock = false;
> >> >> +
> >> >>      /* Main loop. */
> >> >>      exiting = false;
> >> >>      while (!exiting) {
> >> >> @@ -6541,15 +6547,29 @@ main(int argc, char *argv[])
> >> >>              .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
> >> >>          };
> >> >>
> >> >> -        struct chassis_index chassis_index;
> >> >> -        chassis_index_init(&chassis_index, ctx.ovnsb_idl);
> >> >> +        if (!had_lock && ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
> >> >> +            VLOG_INFO("ovn-northd lock acquired. "
> >> >> +                      "This ovn-northd instance is now active.");
> >> >> +            had_lock = true;
> >> >> +        } else if (had_lock &&
> >> >> !ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
> >> >> +            VLOG_INFO("ovn-northd lock lost. "
> >> >> +                      "This ovn-northd instance is now on
> standby.");
> >> >
> >> > Should it try lock again, if we want it to be standby? Otherwise, this
> >> > instance won't have a chance to be active any more.
> >>
> >> Good question ... I was assuming this scenario was due to a lost
> >> connection, and that the IDL would automatically try to re-acquire the
> >> lock.
> >>
> >> I tested to make sure I saw a second ovn-northd go from standby to
> >> active, but I have not tested active -> standby -> active again.
> >>
> >> I'll take a closer look at this before applying the patch.
> >>
> >
> > I tested it and it works fine. active -> standby -> active scenario also
> > works fine.
> > I also tested by restarting southbound ovsdb-server. Once ovsdb-server
> is up
> > again, the idl clients  try to
> > acquire the lock and one of the ovn-northd instance becomes active again.
> > I don't think it is required to try lock again as idl client takes care
> of
> > it.
> >
> > How about starting another instance of ovn-northd in the sandbox/test
> > environment so that active/standby
> > scenario gets tested for all the ovn tests ?
> >
> > Acked-by: Numan Siddique <nusiddiq@redhat.com>
>
> Thanks!  I updated the test suite to always run a backup ovn-northd to
> help ensure it doesn't break anything.  I then applied this to master.
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Russell Bryant Aug. 9, 2017, 1 p.m. UTC | #6
On Wed, Aug 9, 2017 at 5:01 AM, Miguel Angel Ajo Pelayo
<majopela@redhat.com> wrote:
> Nice idea, I have btw some comments/thoughts/questions regarding this:
>
> 1) Does OVSDB have any heartbeat protocol? (to detect that one northd has
> died even during inactive periods).

Yes, it does.  By deafult, both ends of an OVSDB connection send a
regular keepalive message every 5 seconds to help detect a dead
connection.  It's configurable, and the default has caused extra
reconnects when doing performance testing and services get too busy to
keep up with the default keepalive settings.

>      Otherwise we can document the need to tweak the tcp_keepalive settings
> of the system to have some sensible settings that will make TCP detect the
> connection failure in a reasonable amount of time.

Indeed

> 2) We need to consider that in some cases the master ovsdb server and the
> northd process will be colocated and therefore fall together. I guess that
> in that case the lock is replicated to the slave ovsdb server, we need to
> make sure that the lock will be dropped once the old slave(backup) becomes
> master.

Good question.  I didn't look into how this behaves with
active/passive ovsdb-server.  I assume all locks are dropped when
services have to reconnect to the new master.  We should test it.

Also note that our Pacemaker config currently still manages
ovn-northd, so we're not exclusively relying on this behavior.  At a
minimum, if we still let Pacemaker drive the HA (because we're running
Pacemaker for ovsdb still anyway), this addition helps ensure that
only a single ovn-northd is active if somehow more than one was
accidentally started.
Ben Pfaff Aug. 9, 2017, 5:29 p.m. UTC | #7
On Tue, Aug 01, 2017 at 12:19:18PM -0400, Russell Bryant wrote:
> Add native support for active-standby HA in ovn-northd by having each
> instance attempt to acquire an OVSDB lock.  Only the instance of
> ovn-northd that currently holds the lock will make active changes to
> the OVN databases.
> 
> Signed-off-by: Russell Bryant <russell@ovn.org>
> ---
>  NEWS                        |  1 +
>  ovn/northd/ovn-northd.8.xml |  9 +++++++++
>  ovn/northd/ovn-northd.c     | 40 +++++++++++++++++++++++++++++++---------
>  3 files changed, 41 insertions(+), 9 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index facea0228..f3cdd2443 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -49,6 +49,7 @@ Post-v2.7.0
>         one chassis is specified, OVN will manage high availability for that
>         gateway.
>       * Add support for ACL logging.
> +     * ovn-northd now has native support for active-standby high availability.
>     - Tracing with ofproto/trace now traces through recirculation.
>     - OVSDB:
>       * New support for role-based access control (see ovsdb-server(1)).

Are you trying to still get this into 2.8.0?  If not, then this should
move to the new post-v2.8.0 section.
Russell Bryant Aug. 9, 2017, 7:25 p.m. UTC | #8
On Wed, Aug 9, 2017 at 1:29 PM, Ben Pfaff <blp@ovn.org> wrote:
> On Tue, Aug 01, 2017 at 12:19:18PM -0400, Russell Bryant wrote:
>> Add native support for active-standby HA in ovn-northd by having each
>> instance attempt to acquire an OVSDB lock.  Only the instance of
>> ovn-northd that currently holds the lock will make active changes to
>> the OVN databases.
>>
>> Signed-off-by: Russell Bryant <russell@ovn.org>
>> ---
>>  NEWS                        |  1 +
>>  ovn/northd/ovn-northd.8.xml |  9 +++++++++
>>  ovn/northd/ovn-northd.c     | 40 +++++++++++++++++++++++++++++++---------
>>  3 files changed, 41 insertions(+), 9 deletions(-)
>>
>> diff --git a/NEWS b/NEWS
>> index facea0228..f3cdd2443 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -49,6 +49,7 @@ Post-v2.7.0
>>         one chassis is specified, OVN will manage high availability for that
>>         gateway.
>>       * Add support for ACL logging.
>> +     * ovn-northd now has native support for active-standby high availability.
>>     - Tracing with ofproto/trace now traces through recirculation.
>>     - OVSDB:
>>       * New support for role-based access control (see ovsdb-server(1)).
>
> Are you trying to still get this into 2.8.0?  If not, then this should
> move to the new post-v2.8.0 section.

I already applied this to both master and branch-2.8, actually
diff mbox

Patch

diff --git a/NEWS b/NEWS
index facea0228..f3cdd2443 100644
--- a/NEWS
+++ b/NEWS
@@ -49,6 +49,7 @@  Post-v2.7.0
        one chassis is specified, OVN will manage high availability for that
        gateway.
      * Add support for ACL logging.
+     * ovn-northd now has native support for active-standby high availability.
    - Tracing with ofproto/trace now traces through recirculation.
    - OVSDB:
      * New support for role-based access control (see ovsdb-server(1)).
diff --git a/ovn/northd/ovn-northd.8.xml b/ovn/northd/ovn-northd.8.xml
index 1527e8a60..0d85ec0d2 100644
--- a/ovn/northd/ovn-northd.8.xml
+++ b/ovn/northd/ovn-northd.8.xml
@@ -72,6 +72,15 @@ 
       </dl>
     </p>
 
+    <h1>Active-Standby for High Availability</h1>
+    <p>
+      You may run <code>ovn-northd</code> more than once in an OVN deployment.
+      OVN will automatically ensure that only one of them is active at a time.
+      If multiple instances of <code>ovn-northd</code> are running and the
+      active <code>ovn-northd</code> fails, one of the hot standby instances
+      of <code>ovn-northd</code> will automatically take over.
+    </p>
+
     <h1>Logical Flow Table Structure</h1>
 
     <p>
diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
index 10e0c7ce0..3d2be4267 100644
--- a/ovn/northd/ovn-northd.c
+++ b/ovn/northd/ovn-northd.c
@@ -6531,6 +6531,12 @@  main(int argc, char *argv[])
     ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_nb_cfg);
     ovsdb_idl_add_column(ovnsb_idl_loop.idl, &sbrec_chassis_col_name);
 
+    /* Ensure that only a single ovn-northd is active in the deployment by
+     * acquiring a lock called "ovn_northd" on the southbound database
+     * and then only performing DB transactions if the lock is held. */
+    ovsdb_idl_set_lock(ovnsb_idl_loop.idl, "ovn_northd");
+    bool had_lock = false;
+
     /* Main loop. */
     exiting = false;
     while (!exiting) {
@@ -6541,15 +6547,29 @@  main(int argc, char *argv[])
             .ovnsb_txn = ovsdb_idl_loop_run(&ovnsb_idl_loop),
         };
 
-        struct chassis_index chassis_index;
-        chassis_index_init(&chassis_index, ctx.ovnsb_idl);
+        if (!had_lock && ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
+            VLOG_INFO("ovn-northd lock acquired. "
+                      "This ovn-northd instance is now active.");
+            had_lock = true;
+        } else if (had_lock && !ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
+            VLOG_INFO("ovn-northd lock lost. "
+                      "This ovn-northd instance is now on standby.");
+            had_lock = false;
+        }
 
-        ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
-        ovnsb_db_run(&ctx, &ovnsb_idl_loop);
-        if (ctx.ovnsb_txn) {
-            check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
-            check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
-            check_and_update_rbac(&ctx);
+        struct chassis_index chassis_index;
+        bool destroy_chassis_index = false;
+        if (ovsdb_idl_has_lock(ovnsb_idl_loop.idl)) {
+            chassis_index_init(&chassis_index, ctx.ovnsb_idl);
+            destroy_chassis_index = true;
+
+            ovnnb_db_run(&ctx, &chassis_index, &ovnsb_idl_loop);
+            ovnsb_db_run(&ctx, &ovnsb_idl_loop);
+            if (ctx.ovnsb_txn) {
+                check_and_add_supported_dhcp_opts_to_sb_db(&ctx);
+                check_and_add_supported_dhcpv6_opts_to_sb_db(&ctx);
+                check_and_update_rbac(&ctx);
+            }
         }
 
         unixctl_server_run(unixctl);
@@ -6565,7 +6585,9 @@  main(int argc, char *argv[])
             exiting = true;
         }
 
-        chassis_index_destroy(&chassis_index);
+        if (destroy_chassis_index) {
+            chassis_index_destroy(&chassis_index);
+        }
     }
 
     unixctl_server_destroy(unixctl);