diff mbox series

[ovs-dev,v13] ofproto-dpif-upcall: Don't set statistics to 0 when they jump back

Message ID 20230526120338.339696-1-bnemeth@redhat.com
State Accepted
Commit 59c908410570e852be0b5c04c85093179924410c
Headers show
Series [ovs-dev,v13] ofproto-dpif-upcall: Don't set statistics to 0 when they jump back | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

Balazs Nemeth May 26, 2023, 12:03 p.m. UTC
The only way that stats->{n_packets,n_bytes} would decrease is due to an
overflow, or if there are bugs in how statistics are handled. In the
past, there were multiple issues that caused a jump backward. A
workaround was in place to set the statistics to 0 in that case. When
this happened while the revalidator was under heavy load, the workaround
had an unintended side effect where should_revalidate returned false
causing the flow to be removed because the metric it calculated was
based on a bogus value. Since many of those bugs have now been
identified and resolved, there is no need to set the statistics to 0. In
addition, the (unlikely) overflow still needs to be handled
appropriately. If an unexpected jump does happen, just log it as a
warning.

Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
---
 ofproto/ofproto-dpif-upcall.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

Comments

Eelco Chaudron May 26, 2023, 1:09 p.m. UTC | #1
On 26 May 2023, at 14:03, Balazs Nemeth wrote:

> The only way that stats->{n_packets,n_bytes} would decrease is due to an
> overflow, or if there are bugs in how statistics are handled. In the
> past, there were multiple issues that caused a jump backward. A
> workaround was in place to set the statistics to 0 in that case. When
> this happened while the revalidator was under heavy load, the workaround
> had an unintended side effect where should_revalidate returned false
> causing the flow to be removed because the metric it calculated was
> based on a bogus value. Since many of those bugs have now been
> identified and resolved, there is no need to set the statistics to 0. In
> addition, the (unlikely) overflow still needs to be handled
> appropriately. If an unexpected jump does happen, just log it as a
> warning.
>
> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>

Thanks for making the final change! Here is an example of the log message for others reviewing:

2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3

One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.

Acked-by: Eelco Chaudron <echaudro@redhat.com>

//Eelco


> ---
>  ofproto/ofproto-dpif-upcall.c | 30 ++++++++++++++++++++++++------
>  1 file changed, 24 insertions(+), 6 deletions(-)
>
> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
> index cd57fdbd9..819f1eb4e 100644
> --- a/ofproto/ofproto-dpif-upcall.c
> +++ b/ofproto/ofproto-dpif-upcall.c
> @@ -2339,6 +2339,27 @@ exit:
>      return result;
>  }
>
> +static void
> +log_unexpected_stats_jump(struct udpif_key *ukey,
> +                          const struct dpif_flow_stats *stats)
> +    OVS_REQUIRES(ukey->mutex)
> +{
> +    static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
> +    struct ds ds = DS_EMPTY_INITIALIZER;
> +    struct ofpbuf *actions;
> +
> +    odp_format_ufid(&ukey->ufid, &ds);
> +    ds_put_cstr(&ds, " ");

nit: I forgot to add a , here, so it should probably be “, “.


> +    odp_flow_key_format(ukey->key, ukey->key_len, &ds);
> +    ds_put_cstr(&ds, ", actions:");
> +    actions = ovsrcu_get(struct ofpbuf *, &ukey->actions);
> +    format_odp_actions(&ds, actions->data, actions->size, NULL);
> +    VLOG_WARN_RL(&rll, "Unexpected jump in packet stats from %"PRIu64
> +            " to %"PRIu64" when handling ukey %s",
> +            ukey->stats.n_packets, stats->n_packets, ds_cstr(&ds));
> +    ds_destroy(&ds);
> +}
> +
>  /* Verifies that the datapath actions of 'ukey' are still correct, and pushes
>   * 'stats' for it.
>   *
> @@ -2372,18 +2393,15 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
>
>      push.used = stats->used;
>      push.tcp_flags = stats->tcp_flags;
> -    push.n_packets = (stats->n_packets > ukey->stats.n_packets
> -                      ? stats->n_packets - ukey->stats.n_packets
> -                      : 0);
> -    push.n_bytes = (stats->n_bytes > ukey->stats.n_bytes
> -                    ? stats->n_bytes - ukey->stats.n_bytes
> -                    : 0);
> +    push.n_packets = stats->n_packets - ukey->stats.n_packets;
> +    push.n_bytes = stats->n_bytes - ukey->stats.n_bytes;
>
>      if (stats->n_packets < ukey->stats.n_packets &&
>          ukey->stats.n_packets < UINT64_THREE_QUARTERS) {
>          /* Report cases where the packet counter is lower than the previous
>           * instance, but exclude the potential wrapping of an uint64_t. */
>          COVERAGE_INC(ukey_invalid_stat_reset);
> +        log_unexpected_stats_jump(ukey, stats);
>      }
>
>      if (need_revalidate) {
> -- 
> 2.40.1
Marcelo Leitner May 26, 2023, 1:40 p.m. UTC | #2
On Fri, May 26, 2023 at 03:09:02PM +0200, Eelco Chaudron wrote:
>
>
> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
>
> > The only way that stats->{n_packets,n_bytes} would decrease is due to an
> > overflow, or if there are bugs in how statistics are handled. In the
> > past, there were multiple issues that caused a jump backward. A
> > workaround was in place to set the statistics to 0 in that case. When
> > this happened while the revalidator was under heavy load, the workaround
> > had an unintended side effect where should_revalidate returned false
> > causing the flow to be removed because the metric it calculated was
> > based on a bogus value. Since many of those bugs have now been
> > identified and resolved, there is no need to set the statistics to 0. In
> > addition, the (unlikely) overflow still needs to be handled
> > appropriately. If an unexpected jump does happen, just log it as a
> > warning.
> >
> > Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
>
> Thanks for making the final change! Here is an example of the log message for others reviewing:
>
> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)

I've seen the multiple revisions here but I didn't read all of them,
so I'm pretty much out of context here already.  That said, I'm
confused with "Unexpected jump ... from 0 to 1" here.  Did you tweak
th test condition in order to trigger the log msg maybe?  Because I
would expect "0 to 1" to be a normal situation, so I just wanted to
make sure.

Thanks,
Marcelo

> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
>
> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
>
> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>
> //Eelco
>
>
> > ---
> >  ofproto/ofproto-dpif-upcall.c | 30 ++++++++++++++++++++++++------
> >  1 file changed, 24 insertions(+), 6 deletions(-)
> >
> > diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
> > index cd57fdbd9..819f1eb4e 100644
> > --- a/ofproto/ofproto-dpif-upcall.c
> > +++ b/ofproto/ofproto-dpif-upcall.c
> > @@ -2339,6 +2339,27 @@ exit:
> >      return result;
> >  }
> >
> > +static void
> > +log_unexpected_stats_jump(struct udpif_key *ukey,
> > +                          const struct dpif_flow_stats *stats)
> > +    OVS_REQUIRES(ukey->mutex)
> > +{
> > +    static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
> > +    struct ds ds = DS_EMPTY_INITIALIZER;
> > +    struct ofpbuf *actions;
> > +
> > +    odp_format_ufid(&ukey->ufid, &ds);
> > +    ds_put_cstr(&ds, " ");
>
> nit: I forgot to add a , here, so it should probably be “, “.
>
>
> > +    odp_flow_key_format(ukey->key, ukey->key_len, &ds);
> > +    ds_put_cstr(&ds, ", actions:");
> > +    actions = ovsrcu_get(struct ofpbuf *, &ukey->actions);
> > +    format_odp_actions(&ds, actions->data, actions->size, NULL);
> > +    VLOG_WARN_RL(&rll, "Unexpected jump in packet stats from %"PRIu64
> > +            " to %"PRIu64" when handling ukey %s",
> > +            ukey->stats.n_packets, stats->n_packets, ds_cstr(&ds));
> > +    ds_destroy(&ds);
> > +}
> > +
> >  /* Verifies that the datapath actions of 'ukey' are still correct, and pushes
> >   * 'stats' for it.
> >   *
> > @@ -2372,18 +2393,15 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
> >
> >      push.used = stats->used;
> >      push.tcp_flags = stats->tcp_flags;
> > -    push.n_packets = (stats->n_packets > ukey->stats.n_packets
> > -                      ? stats->n_packets - ukey->stats.n_packets
> > -                      : 0);
> > -    push.n_bytes = (stats->n_bytes > ukey->stats.n_bytes
> > -                    ? stats->n_bytes - ukey->stats.n_bytes
> > -                    : 0);
> > +    push.n_packets = stats->n_packets - ukey->stats.n_packets;
> > +    push.n_bytes = stats->n_bytes - ukey->stats.n_bytes;
> >
> >      if (stats->n_packets < ukey->stats.n_packets &&
> >          ukey->stats.n_packets < UINT64_THREE_QUARTERS) {
> >          /* Report cases where the packet counter is lower than the previous
> >           * instance, but exclude the potential wrapping of an uint64_t. */
> >          COVERAGE_INC(ukey_invalid_stat_reset);
> > +        log_unexpected_stats_jump(ukey, stats);
> >      }
> >
> >      if (need_revalidate) {
> > --
> > 2.40.1
>
Eelco Chaudron May 26, 2023, 1:42 p.m. UTC | #3
On 26 May 2023, at 15:40, Marcelo Ricardo Leitner wrote:

> On Fri, May 26, 2023 at 03:09:02PM +0200, Eelco Chaudron wrote:
>>
>>
>> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
>>
>>> The only way that stats->{n_packets,n_bytes} would decrease is due to an
>>> overflow, or if there are bugs in how statistics are handled. In the
>>> past, there were multiple issues that caused a jump backward. A
>>> workaround was in place to set the statistics to 0 in that case. When
>>> this happened while the revalidator was under heavy load, the workaround
>>> had an unintended side effect where should_revalidate returned false
>>> causing the flow to be removed because the metric it calculated was
>>> based on a bogus value. Since many of those bugs have now been
>>> identified and resolved, there is no need to set the statistics to 0. In
>>> addition, the (unlikely) overflow still needs to be handled
>>> appropriately. If an unexpected jump does happen, just log it as a
>>> warning.
>>>
>>> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
>>
>> Thanks for making the final change! Here is an example of the log message for others reviewing:
>>
>> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
>
> I've seen the multiple revisions here but I didn't read all of them,
> so I'm pretty much out of context here already.  That said, I'm
> confused with "Unexpected jump ... from 0 to 1" here.  Did you tweak
> th test condition in order to trigger the log msg maybe?  Because I
> would expect "0 to 1" to be a normal situation, so I just wanted to
> make sure.

Yes I did :)

>> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
>>
>> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
>>
>> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>>
>> //Eelco
>>
>>
>>> ---
>>>  ofproto/ofproto-dpif-upcall.c | 30 ++++++++++++++++++++++++------
>>>  1 file changed, 24 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
>>> index cd57fdbd9..819f1eb4e 100644
>>> --- a/ofproto/ofproto-dpif-upcall.c
>>> +++ b/ofproto/ofproto-dpif-upcall.c
>>> @@ -2339,6 +2339,27 @@ exit:
>>>      return result;
>>>  }
>>>
>>> +static void
>>> +log_unexpected_stats_jump(struct udpif_key *ukey,
>>> +                          const struct dpif_flow_stats *stats)
>>> +    OVS_REQUIRES(ukey->mutex)
>>> +{
>>> +    static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
>>> +    struct ds ds = DS_EMPTY_INITIALIZER;
>>> +    struct ofpbuf *actions;
>>> +
>>> +    odp_format_ufid(&ukey->ufid, &ds);
>>> +    ds_put_cstr(&ds, " ");
>>
>> nit: I forgot to add a , here, so it should probably be “, “.
>>
>>
>>> +    odp_flow_key_format(ukey->key, ukey->key_len, &ds);
>>> +    ds_put_cstr(&ds, ", actions:");
>>> +    actions = ovsrcu_get(struct ofpbuf *, &ukey->actions);
>>> +    format_odp_actions(&ds, actions->data, actions->size, NULL);
>>> +    VLOG_WARN_RL(&rll, "Unexpected jump in packet stats from %"PRIu64
>>> +            " to %"PRIu64" when handling ukey %s",
>>> +            ukey->stats.n_packets, stats->n_packets, ds_cstr(&ds));
>>> +    ds_destroy(&ds);
>>> +}
>>> +
>>>  /* Verifies that the datapath actions of 'ukey' are still correct, and pushes
>>>   * 'stats' for it.
>>>   *
>>> @@ -2372,18 +2393,15 @@ revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
>>>
>>>      push.used = stats->used;
>>>      push.tcp_flags = stats->tcp_flags;
>>> -    push.n_packets = (stats->n_packets > ukey->stats.n_packets
>>> -                      ? stats->n_packets - ukey->stats.n_packets
>>> -                      : 0);
>>> -    push.n_bytes = (stats->n_bytes > ukey->stats.n_bytes
>>> -                    ? stats->n_bytes - ukey->stats.n_bytes
>>> -                    : 0);
>>> +    push.n_packets = stats->n_packets - ukey->stats.n_packets;
>>> +    push.n_bytes = stats->n_bytes - ukey->stats.n_bytes;
>>>
>>>      if (stats->n_packets < ukey->stats.n_packets &&
>>>          ukey->stats.n_packets < UINT64_THREE_QUARTERS) {
>>>          /* Report cases where the packet counter is lower than the previous
>>>           * instance, but exclude the potential wrapping of an uint64_t. */
>>>          COVERAGE_INC(ukey_invalid_stat_reset);
>>> +        log_unexpected_stats_jump(ukey, stats);
>>>      }
>>>
>>>      if (need_revalidate) {
>>> --
>>> 2.40.1
>>
Ilya Maximets May 26, 2023, 8:51 p.m. UTC | #4
On 5/26/23 15:09, Eelco Chaudron wrote:
> 
> 
> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
> 
>> The only way that stats->{n_packets,n_bytes} would decrease is due to an
>> overflow, or if there are bugs in how statistics are handled. In the
>> past, there were multiple issues that caused a jump backward. A
>> workaround was in place to set the statistics to 0 in that case. When
>> this happened while the revalidator was under heavy load, the workaround
>> had an unintended side effect where should_revalidate returned false
>> causing the flow to be removed because the metric it calculated was
>> based on a bogus value. Since many of those bugs have now been
>> identified and resolved, there is no need to set the statistics to 0. In
>> addition, the (unlikely) overflow still needs to be handled
>> appropriately. If an unexpected jump does happen, just log it as a
>> warning.
>>
>> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
> 
> Thanks for making the final change! Here is an example of the log message for others reviewing:
> 
> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
> 
> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
> 
> Acked-by: Eelco Chaudron <echaudro@redhat.com>

Thanks!  I added the comma and applied the patch.

Best regards, Ilya Maximets.
Eelco Chaudron May 30, 2023, 7:32 a.m. UTC | #5
On 26 May 2023, at 22:51, Ilya Maximets wrote:

> On 5/26/23 15:09, Eelco Chaudron wrote:
>>
>>
>> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
>>
>>> The only way that stats->{n_packets,n_bytes} would decrease is due to an
>>> overflow, or if there are bugs in how statistics are handled. In the
>>> past, there were multiple issues that caused a jump backward. A
>>> workaround was in place to set the statistics to 0 in that case. When
>>> this happened while the revalidator was under heavy load, the workaround
>>> had an unintended side effect where should_revalidate returned false
>>> causing the flow to be removed because the metric it calculated was
>>> based on a bogus value. Since many of those bugs have now been
>>> identified and resolved, there is no need to set the statistics to 0. In
>>> addition, the (unlikely) overflow still needs to be handled
>>> appropriately. If an unexpected jump does happen, just log it as a
>>> warning.
>>>
>>> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
>>
>> Thanks for making the final change! Here is an example of the log message for others reviewing:
>>
>> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
>> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
>>
>> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
>>
>> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>
> Thanks!  I added the comma and applied the patch.

Are you planning on back porting this, as all other relevant patches where?

Cheers,

Eelco
Eelco Chaudron June 22, 2023, 8:50 a.m. UTC | #6
On 30 May 2023, at 9:32, Eelco Chaudron wrote:

> On 26 May 2023, at 22:51, Ilya Maximets wrote:
>
>> On 5/26/23 15:09, Eelco Chaudron wrote:
>>>
>>>
>>> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
>>>
>>>> The only way that stats->{n_packets,n_bytes} would decrease is due to an
>>>> overflow, or if there are bugs in how statistics are handled. In the
>>>> past, there were multiple issues that caused a jump backward. A
>>>> workaround was in place to set the statistics to 0 in that case. When
>>>> this happened while the revalidator was under heavy load, the workaround
>>>> had an unintended side effect where should_revalidate returned false
>>>> causing the flow to be removed because the metric it calculated was
>>>> based on a bogus value. Since many of those bugs have now been
>>>> identified and resolved, there is no need to set the statistics to 0. In
>>>> addition, the (unlikely) overflow still needs to be handled
>>>> appropriately. If an unexpected jump does happen, just log it as a
>>>> warning.
>>>>
>>>> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
>>>
>>> Thanks for making the final change! Here is an example of the log message for others reviewing:
>>>
>>> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
>>> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
>>>
>>> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
>>>
>>> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>>
>> Thanks!  I added the comma and applied the patch.
>
> Are you planning on back porting this, as all other relevant patches where?

Ilya did you maybe miss this email? Question still applies ;)


> Cheers,
>
> Eelco
Ilya Maximets June 22, 2023, 9:15 a.m. UTC | #7
On 6/22/23 10:50, Eelco Chaudron wrote:
> 
> 
> On 30 May 2023, at 9:32, Eelco Chaudron wrote:
> 
>> On 26 May 2023, at 22:51, Ilya Maximets wrote:
>>
>>> On 5/26/23 15:09, Eelco Chaudron wrote:
>>>>
>>>>
>>>> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
>>>>
>>>>> The only way that stats->{n_packets,n_bytes} would decrease is due to an
>>>>> overflow, or if there are bugs in how statistics are handled. In the
>>>>> past, there were multiple issues that caused a jump backward. A
>>>>> workaround was in place to set the statistics to 0 in that case. When
>>>>> this happened while the revalidator was under heavy load, the workaround
>>>>> had an unintended side effect where should_revalidate returned false
>>>>> causing the flow to be removed because the metric it calculated was
>>>>> based on a bogus value. Since many of those bugs have now been
>>>>> identified and resolved, there is no need to set the statistics to 0. In
>>>>> addition, the (unlikely) overflow still needs to be handled
>>>>> appropriately. If an unexpected jump does happen, just log it as a
>>>>> warning.
>>>>>
>>>>> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
>>>>
>>>> Thanks for making the final change! Here is an example of the log message for others reviewing:
>>>>
>>>> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
>>>> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
>>>>
>>>> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
>>>>
>>>> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>>>
>>> Thanks!  I added the comma and applied the patch.
>>
>> Are you planning on back porting this, as all other relevant patches where?
> 
> Ilya did you maybe miss this email? Question still applies ;)

Yeah, sorry, it fell through the cracks.

I'm a bit worried about backporting this one, because the
original reason for the stats jump is still not clear and
so might still exist.  I guess, we could backport to 3.1,
but I'd like not to backport to LTS.

What do you think?

Best regards, Ilya Maximets.
Eelco Chaudron June 22, 2023, 9:30 a.m. UTC | #8
On 22 Jun 2023, at 11:15, Ilya Maximets wrote:

> On 6/22/23 10:50, Eelco Chaudron wrote:
>>
>>
>> On 30 May 2023, at 9:32, Eelco Chaudron wrote:
>>
>>> On 26 May 2023, at 22:51, Ilya Maximets wrote:
>>>
>>>> On 5/26/23 15:09, Eelco Chaudron wrote:
>>>>>
>>>>>
>>>>> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
>>>>>
>>>>>> The only way that stats->{n_packets,n_bytes} would decrease is due to an
>>>>>> overflow, or if there are bugs in how statistics are handled. In the
>>>>>> past, there were multiple issues that caused a jump backward. A
>>>>>> workaround was in place to set the statistics to 0 in that case. When
>>>>>> this happened while the revalidator was under heavy load, the workaround
>>>>>> had an unintended side effect where should_revalidate returned false
>>>>>> causing the flow to be removed because the metric it calculated was
>>>>>> based on a bogus value. Since many of those bugs have now been
>>>>>> identified and resolved, there is no need to set the statistics to 0. In
>>>>>> addition, the (unlikely) overflow still needs to be handled
>>>>>> appropriately. If an unexpected jump does happen, just log it as a
>>>>>> warning.
>>>>>>
>>>>>> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
>>>>>
>>>>> Thanks for making the final change! Here is an example of the log message for others reviewing:
>>>>>
>>>>> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
>>>>> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
>>>>>
>>>>> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
>>>>>
>>>>> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>>>>
>>>> Thanks!  I added the comma and applied the patch.
>>>
>>> Are you planning on back porting this, as all other relevant patches where?
>>
>> Ilya did you maybe miss this email? Question still applies ;)
>
> Yeah, sorry, it fell through the cracks.
>
> I'm a bit worried about backporting this one, because the
> original reason for the stats jump is still not clear and
> so might still exist.  I guess, we could backport to 3.1,
> but I'd like not to backport to LTS.
>
> What do you think?

I’m happy with getting this in 3.1 ;)

Thanks,

Eelco
Ilya Maximets June 22, 2023, 11:29 a.m. UTC | #9
On 6/22/23 11:30, Eelco Chaudron wrote:
> 
> 
> On 22 Jun 2023, at 11:15, Ilya Maximets wrote:
> 
>> On 6/22/23 10:50, Eelco Chaudron wrote:
>>>
>>>
>>> On 30 May 2023, at 9:32, Eelco Chaudron wrote:
>>>
>>>> On 26 May 2023, at 22:51, Ilya Maximets wrote:
>>>>
>>>>> On 5/26/23 15:09, Eelco Chaudron wrote:
>>>>>>
>>>>>>
>>>>>> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
>>>>>>
>>>>>>> The only way that stats->{n_packets,n_bytes} would decrease is due to an
>>>>>>> overflow, or if there are bugs in how statistics are handled. In the
>>>>>>> past, there were multiple issues that caused a jump backward. A
>>>>>>> workaround was in place to set the statistics to 0 in that case. When
>>>>>>> this happened while the revalidator was under heavy load, the workaround
>>>>>>> had an unintended side effect where should_revalidate returned false
>>>>>>> causing the flow to be removed because the metric it calculated was
>>>>>>> based on a bogus value. Since many of those bugs have now been
>>>>>>> identified and resolved, there is no need to set the statistics to 0. In
>>>>>>> addition, the (unlikely) overflow still needs to be handled
>>>>>>> appropriately. If an unexpected jump does happen, just log it as a
>>>>>>> warning.
>>>>>>>
>>>>>>> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
>>>>>>
>>>>>> Thanks for making the final change! Here is an example of the log message for others reviewing:
>>>>>>
>>>>>> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
>>>>>> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
>>>>>>
>>>>>> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
>>>>>>
>>>>>> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>>>>>
>>>>> Thanks!  I added the comma and applied the patch.
>>>>
>>>> Are you planning on back porting this, as all other relevant patches where?
>>>
>>> Ilya did you maybe miss this email? Question still applies ;)
>>
>> Yeah, sorry, it fell through the cracks.
>>
>> I'm a bit worried about backporting this one, because the
>> original reason for the stats jump is still not clear and
>> so might still exist.  I guess, we could backport to 3.1,
>> but I'd like not to backport to LTS.
>>
>> What do you think?
> 
> I’m happy with getting this in 3.1 ;)

OK.  Backported to 3.1 now.

Best regards, Ilya Maximets.
Eelco Chaudron June 22, 2023, 12:36 p.m. UTC | #10
On 22 Jun 2023, at 13:29, Ilya Maximets wrote:

> On 6/22/23 11:30, Eelco Chaudron wrote:
>>
>>
>> On 22 Jun 2023, at 11:15, Ilya Maximets wrote:
>>
>>> On 6/22/23 10:50, Eelco Chaudron wrote:
>>>>
>>>>
>>>> On 30 May 2023, at 9:32, Eelco Chaudron wrote:
>>>>
>>>>> On 26 May 2023, at 22:51, Ilya Maximets wrote:
>>>>>
>>>>>> On 5/26/23 15:09, Eelco Chaudron wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 26 May 2023, at 14:03, Balazs Nemeth wrote:
>>>>>>>
>>>>>>>> The only way that stats->{n_packets,n_bytes} would decrease is due to an
>>>>>>>> overflow, or if there are bugs in how statistics are handled. In the
>>>>>>>> past, there were multiple issues that caused a jump backward. A
>>>>>>>> workaround was in place to set the statistics to 0 in that case. When
>>>>>>>> this happened while the revalidator was under heavy load, the workaround
>>>>>>>> had an unintended side effect where should_revalidate returned false
>>>>>>>> causing the flow to be removed because the metric it calculated was
>>>>>>>> based on a bogus value. Since many of those bugs have now been
>>>>>>>> identified and resolved, there is no need to set the statistics to 0. In
>>>>>>>> addition, the (unlikely) overflow still needs to be handled
>>>>>>>> appropriately. If an unexpected jump does happen, just log it as a
>>>>>>>> warning.
>>>>>>>>
>>>>>>>> Signed-off-by: Balazs Nemeth <bnemeth@redhat.com>
>>>>>>>
>>>>>>> Thanks for making the final change! Here is an example of the log message for others reviewing:
>>>>>>>
>>>>>>> 2023-05-26T12:55:07.590Z|00003|ofproto_dpif_upcall(revalidator14)|WARN|Unexpected jump in packet stats from 0 to 1 when handling ukey ufid:90c082b0-7aa6-442a-9b86-04d10fe9ede6 recirc_id(0),dp_hash(0),skb_priority(0),in_port(2),skb_mark(0),ct_state(0),ct_zone(0)
>>>>>>> ,ct_mark(0),ct_label(0),eth(src=6e:48:c8:77:d3:8c,dst=33:33:00:00:00:16),eth_type(0x86dd),ipv6(src=::,dst=ff02::16,label=0,proto=58,tclass=0,hlimit=1,frag=no),key32(40 00),icmpv6(type=143,code=0), actions:1,3
>>>>>>>
>>>>>>> One nit on a missing “,” compared to the dp flow dump, but I think Ilya can add this on commit.
>>>>>>>
>>>>>>> Acked-by: Eelco Chaudron <echaudro@redhat.com>
>>>>>>
>>>>>> Thanks!  I added the comma and applied the patch.
>>>>>
>>>>> Are you planning on back porting this, as all other relevant patches where?
>>>>
>>>> Ilya did you maybe miss this email? Question still applies ;)
>>>
>>> Yeah, sorry, it fell through the cracks.
>>>
>>> I'm a bit worried about backporting this one, because the
>>> original reason for the stats jump is still not clear and
>>> so might still exist.  I guess, we could backport to 3.1,
>>> but I'd like not to backport to LTS.
>>>
>>> What do you think?
>>
>> I’m happy with getting this in 3.1 ;)
>
> OK.  Backported to 3.1 now.
>
> Best regards, Ilya Maximets.

Thanks!!

//Eelco
diff mbox series

Patch

diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index cd57fdbd9..819f1eb4e 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -2339,6 +2339,27 @@  exit:
     return result;
 }
 
+static void
+log_unexpected_stats_jump(struct udpif_key *ukey,
+                          const struct dpif_flow_stats *stats)
+    OVS_REQUIRES(ukey->mutex)
+{
+    static struct vlog_rate_limit rll = VLOG_RATE_LIMIT_INIT(1, 5);
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    struct ofpbuf *actions;
+
+    odp_format_ufid(&ukey->ufid, &ds);
+    ds_put_cstr(&ds, " ");
+    odp_flow_key_format(ukey->key, ukey->key_len, &ds);
+    ds_put_cstr(&ds, ", actions:");
+    actions = ovsrcu_get(struct ofpbuf *, &ukey->actions);
+    format_odp_actions(&ds, actions->data, actions->size, NULL);
+    VLOG_WARN_RL(&rll, "Unexpected jump in packet stats from %"PRIu64
+            " to %"PRIu64" when handling ukey %s",
+            ukey->stats.n_packets, stats->n_packets, ds_cstr(&ds));
+    ds_destroy(&ds);
+}
+
 /* Verifies that the datapath actions of 'ukey' are still correct, and pushes
  * 'stats' for it.
  *
@@ -2372,18 +2393,15 @@  revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey,
 
     push.used = stats->used;
     push.tcp_flags = stats->tcp_flags;
-    push.n_packets = (stats->n_packets > ukey->stats.n_packets
-                      ? stats->n_packets - ukey->stats.n_packets
-                      : 0);
-    push.n_bytes = (stats->n_bytes > ukey->stats.n_bytes
-                    ? stats->n_bytes - ukey->stats.n_bytes
-                    : 0);
+    push.n_packets = stats->n_packets - ukey->stats.n_packets;
+    push.n_bytes = stats->n_bytes - ukey->stats.n_bytes;
 
     if (stats->n_packets < ukey->stats.n_packets &&
         ukey->stats.n_packets < UINT64_THREE_QUARTERS) {
         /* Report cases where the packet counter is lower than the previous
          * instance, but exclude the potential wrapping of an uint64_t. */
         COVERAGE_INC(ukey_invalid_stat_reset);
+        log_unexpected_stats_jump(ukey, stats);
     }
 
     if (need_revalidate) {