diff mbox

[v11,09/12] netfilter: add a netbuffer filter

Message ID 1442405768-23019-10-git-send-email-yanghy@cn.fujitsu.com
State New
Headers show

Commit Message

Yang Hongyang Sept. 16, 2015, 12:16 p.m. UTC
This filter is to buffer/release packets, this feature can be used
when using MicroCheckpointing, or other Remus like VM FT solutions, you
can also use it to simulate the network delay.
It has an interval option, if supplied, this filter will release
packets by interval.

Usage:
 -netdev tap,id=bn0
 -object filter-buffer,id=f0,netdev=bn0,chain=in,interval=1000

NOTE:
 the scale of interval is microsecond.

Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
---
v11: add a fixme comment from Jason
v10: use NetQueue flush api to flush packets
     sent_cb can not be called when we already return size
v9: adjustment due to the qapi change
v7: use QTAILQ_FOREACH_SAFE() when flush packets
v6: move the interval check earlier and some comment adjust
v5: remove dummy sent_cb
    change interval type from int64 to uint32
    check interval!=0 when initialise
    rename FILTERBUFFERState to FilterBufferState
v4: remove bh
    pass the packet to next filter instead of receiver
v3: check packet's sender and sender->peer when flush it
---
 net/Makefile.objs   |   1 +
 net/filter-buffer.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-options.hx     |  18 ++++++
 vl.c                |   7 ++-
 4 files changed, 195 insertions(+), 1 deletion(-)
 create mode 100644 net/filter-buffer.c

Comments

Markus Armbruster Sept. 24, 2015, 9:12 a.m. UTC | #1
Yang Hongyang <yanghy@cn.fujitsu.com> writes:

> This filter is to buffer/release packets, this feature can be used
> when using MicroCheckpointing, or other Remus like VM FT solutions, you

What's "Remus"?

> can also use it to simulate the network delay.
> It has an interval option, if supplied, this filter will release
> packets by interval.

Suggest "will delay packets by that time interval."

Is interval really optional?

>
> Usage:
>  -netdev tap,id=bn0
>  -object filter-buffer,id=f0,netdev=bn0,chain=in,interval=1000
>
> NOTE:
>  the scale of interval is microsecond.

Perhaps "interval is in microseconds".

>
> Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
> ---
> v11: add a fixme comment from Jason
> v10: use NetQueue flush api to flush packets
>      sent_cb can not be called when we already return size
> v9: adjustment due to the qapi change
> v7: use QTAILQ_FOREACH_SAFE() when flush packets
> v6: move the interval check earlier and some comment adjust
> v5: remove dummy sent_cb
>     change interval type from int64 to uint32
>     check interval!=0 when initialise
>     rename FILTERBUFFERState to FilterBufferState
> v4: remove bh
>     pass the packet to next filter instead of receiver
> v3: check packet's sender and sender->peer when flush it
> ---
>  net/Makefile.objs   |   1 +
>  net/filter-buffer.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  qemu-options.hx     |  18 ++++++
>  vl.c                |   7 ++-
>  4 files changed, 195 insertions(+), 1 deletion(-)
>  create mode 100644 net/filter-buffer.c
>
> diff --git a/net/Makefile.objs b/net/Makefile.objs
> index 914aec0..5fa2f97 100644
> --- a/net/Makefile.objs
> +++ b/net/Makefile.objs
> @@ -14,3 +14,4 @@ common-obj-$(CONFIG_SLIRP) += slirp.o
>  common-obj-$(CONFIG_VDE) += vde.o
>  common-obj-$(CONFIG_NETMAP) += netmap.o
>  common-obj-y += filter.o
> +common-obj-y += filter-buffer.o
> diff --git a/net/filter-buffer.c b/net/filter-buffer.c
> new file mode 100644
> index 0000000..ef94e91
> --- /dev/null
> +++ b/net/filter-buffer.c
> @@ -0,0 +1,170 @@
> +/*
> + * Copyright (c) 2015 FUJITSU LIMITED
> + * Author: Yang Hongyang <yanghy@cn.fujitsu.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * later.  See the COPYING file in the top-level directory.
> + */
> +
> +#include "net/filter.h"
> +#include "net/queue.h"
> +#include "qemu-common.h"
> +#include "qemu/timer.h"
> +#include "qemu/iov.h"
> +#include "qapi/qmp/qerror.h"
> +#include "qapi-visit.h"
> +#include "qom/object.h"
> +
> +#define TYPE_FILTER_BUFFER "filter-buffer"
> +
> +#define FILTER_BUFFER(obj) \
> +    OBJECT_CHECK(FilterBufferState, (obj), TYPE_FILTER_BUFFER)
> +
> +struct FilterBufferState {
> +    NetFilterState parent_obj;
> +
> +    NetQueue *incoming_queue;
> +    uint32_t interval;
> +    QEMUTimer release_timer;
> +};
> +typedef struct FilterBufferState FilterBufferState;

Again, not splitting the declaration is more concise.

> +
> +static void filter_buffer_flush(NetFilterState *nf)
> +{
> +    FilterBufferState *s = FILTER_BUFFER(nf);
> +
> +    if (!qemu_net_queue_flush(s->incoming_queue)) {
> +        /* Unable to empty the queue, purge remaining packets */
> +        qemu_net_queue_purge(s->incoming_queue, nf->netdev);
> +    }
> +}

This either flushes or purges incoming_queue, where "purge" means
dropping packets.  Correct?

> +
> +static void filter_buffer_release_timer(void *opaque)
> +{
> +    NetFilterState *nf = opaque;
> +    FilterBufferState *s = FILTER_BUFFER(nf);

Style nit: blank line between declarations and statements, please.

> +    filter_buffer_flush(nf);

Is purging correct here?

> +    timer_mod(&s->release_timer,
> +              qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);

Timer rearmed to fire again in s->interval microseconds.

> +}
> +
> +/* filter APIs */
> +static ssize_t filter_buffer_receive_iov(NetFilterState *nf,
> +                                         NetClientState *sender,
> +                                         unsigned flags,
> +                                         const struct iovec *iov,
> +                                         int iovcnt,
> +                                         NetPacketSent *sent_cb)
> +{
> +    FilterBufferState *s = FILTER_BUFFER(nf);
> +
> +    /*
> +     * we return size when buffer a packet, the sender will take it as
> +     * a already sent packet, so sent_cb should not be called later

Humor me: when a comment has multiple sentences, start each one with a
capital letter, and end it with punctuation.

> +     * FIXME: even if guest can't receive packet for some reasons. Filter
> +     * can still accept packet until its internal queue is full.
> +     */

I'm not sure I understand the comment.

> +    qemu_net_queue_append_iov(s->incoming_queue, sender, flags,
> +                              iov, iovcnt, NULL);
> +    return iov_size(iov, iovcnt);
> +}
> +
> +static void filter_buffer_cleanup(NetFilterState *nf)
> +{
> +    FilterBufferState *s = FILTER_BUFFER(nf);
> +
> +    if (s->interval) {
> +        timer_del(&s->release_timer);
> +    }
> +
> +    /* flush packets */
> +    if (s->incoming_queue) {
> +        filter_buffer_flush(nf);

I guess purging is correct here.

> +        g_free(s->incoming_queue);
> +    }
> +}
> +
> +static void filter_buffer_setup(NetFilterState *nf, Error **errp)
> +{
> +    FilterBufferState *s = FILTER_BUFFER(nf);
> +
> +    /*
> +     * this check should be dropped when there're VM FT solutions like MC
> +     * or COLO use this filter to release packets on demand.
> +     */

If you end a sentence with a period, you get to start it with a capital
letter :)

"there're" is odd.  Perhaps something like "We may want to accept zero
interval when VM FT solutions like MC or COLO use this filter to release
packets on demand."

> +    if (!s->interval) {
> +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "interval",
> +                   "a non-zero interval");

How can this happen?  Doesn't filter_buffer_set_interval() catch zero
intervals already?

> +        return;
> +    }
> +
> +    s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
> +    if (s->interval) {

Condition cannot be false.  Same in filter_buffer_cleanup().

> +        timer_init_us(&s->release_timer, QEMU_CLOCK_VIRTUAL,
> +                      filter_buffer_release_timer, nf);
> +        timer_mod(&s->release_timer,
> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);

Timer armed to fire in s->interval microseconds.

Unless I misunderstand something, this doesn't actually delay each
packet by s->interval microseconds, it batches packet delivery: all
packets arriving in a given interval are delayed until the end of the
interval.  Correct?

> +    }
> +}
> +
> +static void filter_buffer_class_init(ObjectClass *oc, void *data)
> +{
> +    NetFilterClass *nfc = NETFILTER_CLASS(oc);
> +
> +    nfc->setup = filter_buffer_setup;
> +    nfc->cleanup = filter_buffer_cleanup;
> +    nfc->receive_iov = filter_buffer_receive_iov;
> +}
> +
> +static void filter_buffer_get_interval(Object *obj, Visitor *v, void *opaque,
> +                                       const char *name, Error **errp)
> +{
> +    FilterBufferState *s = FILTER_BUFFER(obj);
> +    uint32_t value = s->interval;
> +
> +    visit_type_uint32(v, &value, name, errp);
> +}
> +
> +static void filter_buffer_set_interval(Object *obj, Visitor *v, void *opaque,
> +                                       const char *name, Error **errp)
> +{
> +    FilterBufferState *s = FILTER_BUFFER(obj);
> +    Error *local_err = NULL;
> +    uint32_t value;
> +
> +    visit_type_uint32(v, &value, name, &local_err);
> +    if (local_err) {
> +        goto out;
> +    }
> +    if (!value) {
> +        error_setg(&local_err, "Property '%s.%s' doesn't take value '%"
> +                   PRIu32 "'", object_get_typename(obj), name, value);

What does it take?  That's what the user wants to know...  Perhaps
"Property '%s.%s' requires a positive value".

> +        goto out;
> +    }
> +    s->interval = value;
> +
> +out:
> +    error_propagate(errp, local_err);
> +}
> +
> +static void filter_buffer_init(Object *obj)
> +{
> +    object_property_add(obj, "interval", "int",
> +                        filter_buffer_get_interval,
> +                        filter_buffer_set_interval, NULL, NULL, NULL);
> +}
> +
> +static const TypeInfo filter_buffer_info = {
> +    .name = TYPE_FILTER_BUFFER,
> +    .parent = TYPE_NETFILTER,
> +    .class_init = filter_buffer_class_init,
> +    .instance_init = filter_buffer_init,
> +    .instance_size = sizeof(FilterBufferState),
> +};
> +
> +static void register_types(void)
> +{
> +    type_register_static(&filter_buffer_info);
> +}
> +
> +type_init(register_types);
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 7e147b8..b09f97f 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -3642,6 +3642,24 @@ in PEM format, in filenames @var{ca-cert.pem}, @var{ca-crl.pem} (optional),
>  @var{server-cert.pem} (only servers), @var{server-key.pem} (only servers),
>  @var{client-cert.pem} (only clients), and @var{client-key.pem} (only clients).
>  
> +@item -object filter-buffer,id=@var{id},netdev=@var{netdevid}[,chain=@var{all|in|out}][,interval=@var{t}]
> +
> +Buffer network packets on netdev @var{netdevid}.
> +If interval @var{t} provided, will release packets by interval.
> +Interval scale: microsecond.
> +
> +If interval @var{t} not provided, you have to make sure the packets can be

Are you sure omitting t works?  filter_buffer_set_interval() rejects
zero...

> +released, either by manually remove this filter or call the release buffer API,
> +otherwise, the packets will be buffered forever. Use with caution.

Please wrap your lines a bit earlier.

> +
> +chain @var{all|in|out} is an option that can be applied to any netfilter, default is @option{all}.
> +
> +@option{all} means this filter will receive packets both sent to/from the netdev
> +
> +@option{in} means this filter will receive packets sent to the netdev
> +
> +@option{out} means this filter will receive packets sent from the netdev
> +

I'd describe this like "filter is inserted in the receive / transmit
queue".

>  @end table
>  
>  ETEXI
> diff --git a/vl.c b/vl.c
> index ec589e2..3cf89d5 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char *type)
>      if (g_str_equal(type, "rng-egd")) {
>          return false;
>      }
> -    /* TODO: return false for concrete netfilters */
> +
> +    /* return false for concrete netfilters */

I find this comment useless, please drop it :)

> +    if (g_str_equal(type, "filter-buffer")) {
> +        return false;
> +    }
> +
>      return true;
>  }
Yang Hongyang Sept. 25, 2015, 7:18 a.m. UTC | #2
On 09/24/2015 05:12 PM, Markus Armbruster wrote:
> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>
>> This filter is to buffer/release packets, this feature can be used
>> when using MicroCheckpointing, or other Remus like VM FT solutions, you
>
> What's "Remus"?

Remus is an opensource VM FT solution:
paper: http://http//www.cs.ubc.ca/~andy/papers/remus-nsdi-final.pdf
First implemented on Xen:
http://wiki.xen.org/wiki/Remus

MicroCheckpointing in QEMU is another Remus implementation.

>
>> can also use it to simulate the network delay.
>> It has an interval option, if supplied, this filter will release
>> packets by interval.
>
> Suggest "will delay packets by that time interval."
>
> Is interval really optional?

It supposed to be optional. When the buffer filter has a user:
http://lists.nongnu.org/archive/html/qemu-devel/2015-09/msg00363.html

In this patchset, the zero interval check is removed. packets are
released on demand through filter_buffer_release_all() api call.

>
>>
>> Usage:
>>   -netdev tap,id=bn0
>>   -object filter-buffer,id=f0,netdev=bn0,chain=in,interval=1000
>>
>> NOTE:
>>   the scale of interval is microsecond.
>
> Perhaps "interval is in microseconds".

Better, thanks.

>
>>
>> Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
>> ---
>> v11: add a fixme comment from Jason
>> v10: use NetQueue flush api to flush packets
>>       sent_cb can not be called when we already return size
>> v9: adjustment due to the qapi change
>> v7: use QTAILQ_FOREACH_SAFE() when flush packets
>> v6: move the interval check earlier and some comment adjust
>> v5: remove dummy sent_cb
>>      change interval type from int64 to uint32
>>      check interval!=0 when initialise
>>      rename FILTERBUFFERState to FilterBufferState
>> v4: remove bh
>>      pass the packet to next filter instead of receiver
>> v3: check packet's sender and sender->peer when flush it
>> ---
>>   net/Makefile.objs   |   1 +
>>   net/filter-buffer.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   qemu-options.hx     |  18 ++++++
>>   vl.c                |   7 ++-
>>   4 files changed, 195 insertions(+), 1 deletion(-)
>>   create mode 100644 net/filter-buffer.c
>>
>> diff --git a/net/Makefile.objs b/net/Makefile.objs
>> index 914aec0..5fa2f97 100644
>> --- a/net/Makefile.objs
>> +++ b/net/Makefile.objs
>> @@ -14,3 +14,4 @@ common-obj-$(CONFIG_SLIRP) += slirp.o
>>   common-obj-$(CONFIG_VDE) += vde.o
>>   common-obj-$(CONFIG_NETMAP) += netmap.o
>>   common-obj-y += filter.o
>> +common-obj-y += filter-buffer.o
>> diff --git a/net/filter-buffer.c b/net/filter-buffer.c
>> new file mode 100644
>> index 0000000..ef94e91
>> --- /dev/null
>> +++ b/net/filter-buffer.c
>> @@ -0,0 +1,170 @@
>> +/*
>> + * Copyright (c) 2015 FUJITSU LIMITED
>> + * Author: Yang Hongyang <yanghy@cn.fujitsu.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>> + * later.  See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "net/filter.h"
>> +#include "net/queue.h"
>> +#include "qemu-common.h"
>> +#include "qemu/timer.h"
>> +#include "qemu/iov.h"
>> +#include "qapi/qmp/qerror.h"
>> +#include "qapi-visit.h"
>> +#include "qom/object.h"
>> +
>> +#define TYPE_FILTER_BUFFER "filter-buffer"
>> +
>> +#define FILTER_BUFFER(obj) \
>> +    OBJECT_CHECK(FilterBufferState, (obj), TYPE_FILTER_BUFFER)
>> +
>> +struct FilterBufferState {
>> +    NetFilterState parent_obj;
>> +
>> +    NetQueue *incoming_queue;
>> +    uint32_t interval;
>> +    QEMUTimer release_timer;
>> +};
>> +typedef struct FilterBufferState FilterBufferState;
>
> Again, not splitting the declaration is more concise.
>
>> +
>> +static void filter_buffer_flush(NetFilterState *nf)
>> +{
>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>> +
>> +    if (!qemu_net_queue_flush(s->incoming_queue)) {
>> +        /* Unable to empty the queue, purge remaining packets */
>> +        qemu_net_queue_purge(s->incoming_queue, nf->netdev);
>> +    }
>> +}
>
> This either flushes or purges incoming_queue, where "purge" means
> dropping packets.  Correct?

I think it is correct. Dropping packets is allowed, even on real
hardware, packets may lose.

>
>> +
>> +static void filter_buffer_release_timer(void *opaque)
>> +{
>> +    NetFilterState *nf = opaque;
>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>
> Style nit: blank line between declarations and statements, please.
>
>> +    filter_buffer_flush(nf);
>
> Is purging correct here?
>
>> +    timer_mod(&s->release_timer,
>> +              qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);
>
> Timer rearmed to fire again in s->interval microseconds.

Yes.

>
>> +}
>> +
>> +/* filter APIs */
>> +static ssize_t filter_buffer_receive_iov(NetFilterState *nf,
>> +                                         NetClientState *sender,
>> +                                         unsigned flags,
>> +                                         const struct iovec *iov,
>> +                                         int iovcnt,
>> +                                         NetPacketSent *sent_cb)
>> +{
>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>> +
>> +    /*
>> +     * we return size when buffer a packet, the sender will take it as
>> +     * a already sent packet, so sent_cb should not be called later
>
> Humor me: when a comment has multiple sentences, start each one with a
> capital letter, and end it with punctuation.

Ok.

>
>> +     * FIXME: even if guest can't receive packet for some reasons. Filter
>> +     * can still accept packet until its internal queue is full.
>> +     */
>
> I'm not sure I understand the comment.

This is taken from Jason's comments, may be he can have a better explain?

>
>> +    qemu_net_queue_append_iov(s->incoming_queue, sender, flags,
>> +                              iov, iovcnt, NULL);
>> +    return iov_size(iov, iovcnt);
>> +}
>> +
>> +static void filter_buffer_cleanup(NetFilterState *nf)
>> +{
>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>> +
>> +    if (s->interval) {
>> +        timer_del(&s->release_timer);
>> +    }
>> +
>> +    /* flush packets */
>> +    if (s->incoming_queue) {
>> +        filter_buffer_flush(nf);
>
> I guess purging is correct here.
>
>> +        g_free(s->incoming_queue);
>> +    }
>> +}
>> +
>> +static void filter_buffer_setup(NetFilterState *nf, Error **errp)
>> +{
>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>> +
>> +    /*
>> +     * this check should be dropped when there're VM FT solutions like MC
>> +     * or COLO use this filter to release packets on demand.
>> +     */
>
> If you end a sentence with a period, you get to start it with a capital
> letter :)

Ok, thanks.

>
> "there're" is odd.  Perhaps something like "We may want to accept zero
> interval when VM FT solutions like MC or COLO use this filter to release
> packets on demand."
>
>> +    if (!s->interval) {
>> +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "interval",
>> +                   "a non-zero interval");
>
> How can this happen?  Doesn't filter_buffer_set_interval() catch zero
> intervals already?

When user do not supply an interval parameter, filter_buffer_set_interval()
won't be called, and the interval is 0.
When we have actual user like I mentioned above, we should allow user to omit
the interval option.

>
>> +        return;
>> +    }
>> +
>> +    s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
>> +    if (s->interval) {
>
> Condition cannot be false.  Same in filter_buffer_cleanup().
>
>> +        timer_init_us(&s->release_timer, QEMU_CLOCK_VIRTUAL,
>> +                      filter_buffer_release_timer, nf);
>> +        timer_mod(&s->release_timer,
>> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);
>
> Timer armed to fire in s->interval microseconds.
>
> Unless I misunderstand something, this doesn't actually delay each
> packet by s->interval microseconds, it batches packet delivery: all
> packets arriving in a given interval are delayed until the end of the
> interval.  Correct?

Correct.

>
>> +    }
>> +}
>> +
>> +static void filter_buffer_class_init(ObjectClass *oc, void *data)
>> +{
>> +    NetFilterClass *nfc = NETFILTER_CLASS(oc);
>> +
>> +    nfc->setup = filter_buffer_setup;
>> +    nfc->cleanup = filter_buffer_cleanup;
>> +    nfc->receive_iov = filter_buffer_receive_iov;
>> +}
>> +
>> +static void filter_buffer_get_interval(Object *obj, Visitor *v, void *opaque,
>> +                                       const char *name, Error **errp)
>> +{
>> +    FilterBufferState *s = FILTER_BUFFER(obj);
>> +    uint32_t value = s->interval;
>> +
>> +    visit_type_uint32(v, &value, name, errp);
>> +}
>> +
>> +static void filter_buffer_set_interval(Object *obj, Visitor *v, void *opaque,
>> +                                       const char *name, Error **errp)
>> +{
>> +    FilterBufferState *s = FILTER_BUFFER(obj);
>> +    Error *local_err = NULL;
>> +    uint32_t value;
>> +
>> +    visit_type_uint32(v, &value, name, &local_err);
>> +    if (local_err) {
>> +        goto out;
>> +    }
>> +    if (!value) {
>> +        error_setg(&local_err, "Property '%s.%s' doesn't take value '%"
>> +                   PRIu32 "'", object_get_typename(obj), name, value);
>
> What does it take?  That's what the user wants to know...  Perhaps
> "Property '%s.%s' requires a positive value".

Maybe better, thanks.

>
>> +        goto out;
>> +    }
>> +    s->interval = value;
>> +
>> +out:
>> +    error_propagate(errp, local_err);
>> +}
>> +
>> +static void filter_buffer_init(Object *obj)
>> +{
>> +    object_property_add(obj, "interval", "int",
>> +                        filter_buffer_get_interval,
>> +                        filter_buffer_set_interval, NULL, NULL, NULL);
>> +}
>> +
>> +static const TypeInfo filter_buffer_info = {
>> +    .name = TYPE_FILTER_BUFFER,
>> +    .parent = TYPE_NETFILTER,
>> +    .class_init = filter_buffer_class_init,
>> +    .instance_init = filter_buffer_init,
>> +    .instance_size = sizeof(FilterBufferState),
>> +};
>> +
>> +static void register_types(void)
>> +{
>> +    type_register_static(&filter_buffer_info);
>> +}
>> +
>> +type_init(register_types);
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index 7e147b8..b09f97f 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -3642,6 +3642,24 @@ in PEM format, in filenames @var{ca-cert.pem}, @var{ca-crl.pem} (optional),
>>   @var{server-cert.pem} (only servers), @var{server-key.pem} (only servers),
>>   @var{client-cert.pem} (only clients), and @var{client-key.pem} (only clients).
>>
>> +@item -object filter-buffer,id=@var{id},netdev=@var{netdevid}[,chain=@var{all|in|out}][,interval=@var{t}]
>> +
>> +Buffer network packets on netdev @var{netdevid}.
>> +If interval @var{t} provided, will release packets by interval.
>> +Interval scale: microsecond.
>> +
>> +If interval @var{t} not provided, you have to make sure the packets can be
>
> Are you sure omitting t works?  filter_buffer_set_interval() rejects
> zero...

When we omit t, filter_buffer_set_interval() won't be called.

>
>> +released, either by manually remove this filter or call the release buffer API,
>> +otherwise, the packets will be buffered forever. Use with caution.
>
> Please wrap your lines a bit earlier.
>
>> +
>> +chain @var{all|in|out} is an option that can be applied to any netfilter, default is @option{all}.
>> +
>> +@option{all} means this filter will receive packets both sent to/from the netdev
>> +
>> +@option{in} means this filter will receive packets sent to the netdev
>> +
>> +@option{out} means this filter will receive packets sent from the netdev
>> +
>
> I'd describe this like "filter is inserted in the receive / transmit
> queue".

thanks.

>
>>   @end table
>>
>>   ETEXI
>> diff --git a/vl.c b/vl.c
>> index ec589e2..3cf89d5 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char *type)
>>       if (g_str_equal(type, "rng-egd")) {
>>           return false;
>>       }
>> -    /* TODO: return false for concrete netfilters */
>> +
>> +    /* return false for concrete netfilters */
>
> I find this comment useless, please drop it :)

Ok, thanks.

>
>> +    if (g_str_equal(type, "filter-buffer")) {
>> +        return false;
>> +    }
>> +
>>       return true;
>>   }
> .
>
Yang Hongyang Sept. 25, 2015, 8:03 a.m. UTC | #3
On 09/24/2015 05:12 PM, Markus Armbruster wrote:
> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
[...]
>> diff --git a/vl.c b/vl.c
>> index ec589e2..3cf89d5 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char *type)
>>       if (g_str_equal(type, "rng-egd")) {
>>           return false;
>>       }
>> -    /* TODO: return false for concrete netfilters */
>> +
>> +    /* return false for concrete netfilters */
>
> I find this comment useless, please drop it :)

This might be useful for reminding others who wants to implement other filters.

>
>> +    if (g_str_equal(type, "filter-buffer")) {
>> +        return false;
>> +    }
>> +
>>       return true;
>>   }
> .
>
Thomas Huth Sept. 25, 2015, 8:18 a.m. UTC | #4
On 25/09/15 10:03, Yang Hongyang wrote:
> 
> 
> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
> [...]
>>> diff --git a/vl.c b/vl.c
>>> index ec589e2..3cf89d5 100644
>>> --- a/vl.c
>>> +++ b/vl.c
>>> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char
>>> *type)
>>>       if (g_str_equal(type, "rng-egd")) {
>>>           return false;
>>>       }
>>> -    /* TODO: return false for concrete netfilters */
>>> +
>>> +    /* return false for concrete netfilters */
>>
>> I find this comment useless, please drop it :)
> 
> This might be useful for reminding others who wants to implement other
> filters.

I think the comment should explain why the code is return false here,
not what the code is doing (which is obvious). So maybe something like:

    /*
     * netfilters require that the corresponding
     * netdevs are already existing
     */

?

 Thomas
Jason Wang Sept. 25, 2015, 8:18 a.m. UTC | #5
On 09/25/2015 03:18 PM, Yang Hongyang wrote:
>
>
> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>>
>>> This filter is to buffer/release packets, this feature can be used
>>> when using MicroCheckpointing, or other Remus like VM FT solutions, you
>>
>> What's "Remus"?
>
> Remus is an opensource VM FT solution:
> paper: http://http//www.cs.ubc.ca/~andy/papers/remus-nsdi-final.pdf
> First implemented on Xen:
> http://wiki.xen.org/wiki/Remus
>
> MicroCheckpointing in QEMU is another Remus implementation.
>
>

[...]

>>
>>> +     * FIXME: even if guest can't receive packet for some reasons.
>>> Filter
>>> +     * can still accept packet until its internal queue is full.
>>> +     */
>>
>> I'm not sure I understand the comment.
>
> This is taken from Jason's comments, may be he can have a better explain?

For example. For some reason, receiver could not receive more packets
(.can_receive() returns zero). Without a filter, at most one packet will
be queued in incoming queue and sender's poll will be disabled unit its
sent_cb() was called. With a filter, it will keep receive the packets
without caring about the receiver. This is suboptimal. May need more
thoughts (e.g keeping sent_cb).

[...]
Yang Hongyang Sept. 25, 2015, 8:22 a.m. UTC | #6
On 09/25/2015 04:18 PM, Thomas Huth wrote:
> On 25/09/15 10:03, Yang Hongyang wrote:
>>
>>
>> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>> [...]
>>>> diff --git a/vl.c b/vl.c
>>>> index ec589e2..3cf89d5 100644
>>>> --- a/vl.c
>>>> +++ b/vl.c
>>>> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char
>>>> *type)
>>>>        if (g_str_equal(type, "rng-egd")) {
>>>>            return false;
>>>>        }
>>>> -    /* TODO: return false for concrete netfilters */
>>>> +
>>>> +    /* return false for concrete netfilters */
>>>
>>> I find this comment useless, please drop it :)
>>
>> This might be useful for reminding others who wants to implement other
>> filters.
>
> I think the comment should explain why the code is return false here,
> not what the code is doing (which is obvious). So maybe something like:
>
>      /*
>       * netfilters require that the corresponding
>       * netdevs are already existing
>       */

Makes more sense, thanks:)

>
> ?
>
>   Thomas
>
> .
>
Markus Armbruster Sept. 25, 2015, 3:07 p.m. UTC | #7
Yang Hongyang <yanghy@cn.fujitsu.com> writes:

> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>>
>>> This filter is to buffer/release packets, this feature can be used
>>> when using MicroCheckpointing, or other Remus like VM FT solutions, you
>>
>> What's "Remus"?
>
> Remus is an opensource VM FT solution:
> paper: http://http//www.cs.ubc.ca/~andy/papers/remus-nsdi-final.pdf
> First implemented on Xen:
> http://wiki.xen.org/wiki/Remus
>
> MicroCheckpointing in QEMU is another Remus implementation.
>
>>
>>> can also use it to simulate the network delay.
>>> It has an interval option, if supplied, this filter will release
>>> packets by interval.
>>
>> Suggest "will delay packets by that time interval."
>>
>> Is interval really optional?
>
> It supposed to be optional. When the buffer filter has a user:
> http://lists.nongnu.org/archive/html/qemu-devel/2015-09/msg00363.html
>
> In this patchset, the zero interval check is removed. packets are
> released on demand through filter_buffer_release_all() api call.

Understand.  But is interval optional right at this point in this patch
series?

>>
>>>
>>> Usage:
>>>   -netdev tap,id=bn0
>>>   -object filter-buffer,id=f0,netdev=bn0,chain=in,interval=1000
>>>
>>> NOTE:
>>>   the scale of interval is microsecond.
>>
>> Perhaps "interval is in microseconds".
>
> Better, thanks.
>
>>
>>>
>>> Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
>>> ---
>>> v11: add a fixme comment from Jason
>>> v10: use NetQueue flush api to flush packets
>>>       sent_cb can not be called when we already return size
>>> v9: adjustment due to the qapi change
>>> v7: use QTAILQ_FOREACH_SAFE() when flush packets
>>> v6: move the interval check earlier and some comment adjust
>>> v5: remove dummy sent_cb
>>>      change interval type from int64 to uint32
>>>      check interval!=0 when initialise
>>>      rename FILTERBUFFERState to FilterBufferState
>>> v4: remove bh
>>>      pass the packet to next filter instead of receiver
>>> v3: check packet's sender and sender->peer when flush it
>>> ---
>>>   net/Makefile.objs   |   1 +
>>>   net/filter-buffer.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>   qemu-options.hx     |  18 ++++++
>>>   vl.c                |   7 ++-
>>>   4 files changed, 195 insertions(+), 1 deletion(-)
>>>   create mode 100644 net/filter-buffer.c
>>>
>>> diff --git a/net/Makefile.objs b/net/Makefile.objs
>>> index 914aec0..5fa2f97 100644
>>> --- a/net/Makefile.objs
>>> +++ b/net/Makefile.objs
>>> @@ -14,3 +14,4 @@ common-obj-$(CONFIG_SLIRP) += slirp.o
>>>   common-obj-$(CONFIG_VDE) += vde.o
>>>   common-obj-$(CONFIG_NETMAP) += netmap.o
>>>   common-obj-y += filter.o
>>> +common-obj-y += filter-buffer.o
>>> diff --git a/net/filter-buffer.c b/net/filter-buffer.c
>>> new file mode 100644
>>> index 0000000..ef94e91
>>> --- /dev/null
>>> +++ b/net/filter-buffer.c
>>> @@ -0,0 +1,170 @@
>>> +/*
>>> + * Copyright (c) 2015 FUJITSU LIMITED
>>> + * Author: Yang Hongyang <yanghy@cn.fujitsu.com>
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>>> + * later.  See the COPYING file in the top-level directory.
>>> + */
>>> +
>>> +#include "net/filter.h"
>>> +#include "net/queue.h"
>>> +#include "qemu-common.h"
>>> +#include "qemu/timer.h"
>>> +#include "qemu/iov.h"
>>> +#include "qapi/qmp/qerror.h"
>>> +#include "qapi-visit.h"
>>> +#include "qom/object.h"
>>> +
>>> +#define TYPE_FILTER_BUFFER "filter-buffer"
>>> +
>>> +#define FILTER_BUFFER(obj) \
>>> +    OBJECT_CHECK(FilterBufferState, (obj), TYPE_FILTER_BUFFER)
>>> +
>>> +struct FilterBufferState {
>>> +    NetFilterState parent_obj;
>>> +
>>> +    NetQueue *incoming_queue;
>>> +    uint32_t interval;
>>> +    QEMUTimer release_timer;
>>> +};
>>> +typedef struct FilterBufferState FilterBufferState;
>>
>> Again, not splitting the declaration is more concise.
>>
>>> +
>>> +static void filter_buffer_flush(NetFilterState *nf)
>>> +{
>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>> +
>>> +    if (!qemu_net_queue_flush(s->incoming_queue)) {
>>> +        /* Unable to empty the queue, purge remaining packets */
>>> +        qemu_net_queue_purge(s->incoming_queue, nf->netdev);
>>> +    }
>>> +}
>>
>> This either flushes or purges incoming_queue, where "purge" means
>> dropping packets.  Correct?
>
> I think it is correct. Dropping packets is allowed, even on real
> hardware, packets may lose.
>
>>
>>> +
>>> +static void filter_buffer_release_timer(void *opaque)
>>> +{
>>> +    NetFilterState *nf = opaque;
>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>
>> Style nit: blank line between declarations and statements, please.
>>
>>> +    filter_buffer_flush(nf);
>>
>> Is purging correct here?

When the timer expires, we flush as many buffered packets as we can,
then throw away the rest.  Why throw them away?  Shouldn't we leave them
in the buffer, and only throw away packets when the buffer is full?

>>> +    timer_mod(&s->release_timer,
>>> +              qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);
>>
>> Timer rearmed to fire again in s->interval microseconds.
>
> Yes.
>
>>
>>> +}
>>> +
>>> +/* filter APIs */
>>> +static ssize_t filter_buffer_receive_iov(NetFilterState *nf,
>>> +                                         NetClientState *sender,
>>> +                                         unsigned flags,
>>> +                                         const struct iovec *iov,
>>> +                                         int iovcnt,
>>> +                                         NetPacketSent *sent_cb)
>>> +{
>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>> +
>>> +    /*
>>> +     * we return size when buffer a packet, the sender will take it as
>>> +     * a already sent packet, so sent_cb should not be called later
>>
>> Humor me: when a comment has multiple sentences, start each one with a
>> capital letter, and end it with punctuation.
>
> Ok.
>
>>
>>> +     * FIXME: even if guest can't receive packet for some reasons. Filter
>>> +     * can still accept packet until its internal queue is full.
>>> +     */
>>
>> I'm not sure I understand the comment.
>
> This is taken from Jason's comments, may be he can have a better explain?
>
>>
>>> +    qemu_net_queue_append_iov(s->incoming_queue, sender, flags,
>>> +                              iov, iovcnt, NULL);
>>> +    return iov_size(iov, iovcnt);
>>> +}
>>> +
>>> +static void filter_buffer_cleanup(NetFilterState *nf)
>>> +{
>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>> +
>>> +    if (s->interval) {
>>> +        timer_del(&s->release_timer);
>>> +    }
>>> +
>>> +    /* flush packets */
>>> +    if (s->incoming_queue) {
>>> +        filter_buffer_flush(nf);
>>
>> I guess purging is correct here.
>>
>>> +        g_free(s->incoming_queue);
>>> +    }
>>> +}
>>> +
>>> +static void filter_buffer_setup(NetFilterState *nf, Error **errp)
>>> +{
>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>> +
>>> +    /*
>>> +     * this check should be dropped when there're VM FT solutions like MC
>>> +     * or COLO use this filter to release packets on demand.
>>> +     */
>>
>> If you end a sentence with a period, you get to start it with a capital
>> letter :)
>
> Ok, thanks.
>
>>
>> "there're" is odd.  Perhaps something like "We may want to accept zero
>> interval when VM FT solutions like MC or COLO use this filter to release
>> packets on demand."
>>
>>> +    if (!s->interval) {
>>> +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "interval",
>>> +                   "a non-zero interval");
>>
>> How can this happen?  Doesn't filter_buffer_set_interval() catch zero
>> intervals already?
>
> When user do not supply an interval parameter, filter_buffer_set_interval()
> won't be called, and the interval is 0.
> When we have actual user like I mentioned above, we should allow user to omit
> the interval option.

Let's see whether I understand.

If you specify a non-zero interval, filter_buffer_set_interval() stores
it.

If you specify a zero interval, filter_buffer_set_interval() rejects it.

If you specify no interval, it defaults to zero, and
filter_buffer_setup() fails right here.

Therefore, interval is not optional right now.

Correct?

>>> +        return;
>>> +    }
>>> +
>>> +    s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
>>> +    if (s->interval) {
>>
>> Condition cannot be false.  Same in filter_buffer_cleanup().
>>
>>> +        timer_init_us(&s->release_timer, QEMU_CLOCK_VIRTUAL,
>>> +                      filter_buffer_release_timer, nf);
>>> +        timer_mod(&s->release_timer,
>>> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);
>>
>> Timer armed to fire in s->interval microseconds.
>>
>> Unless I misunderstand something, this doesn't actually delay each
>> packet by s->interval microseconds, it batches packet delivery: all
>> packets arriving in a given interval are delayed until the end of the
>> interval.  Correct?
>
> Correct.

Documentation needs to spell that out.

>>> +    }
>>> +}
>>> +
>>> +static void filter_buffer_class_init(ObjectClass *oc, void *data)
>>> +{
>>> +    NetFilterClass *nfc = NETFILTER_CLASS(oc);
>>> +
>>> +    nfc->setup = filter_buffer_setup;
>>> +    nfc->cleanup = filter_buffer_cleanup;
>>> +    nfc->receive_iov = filter_buffer_receive_iov;
>>> +}
>>> +
>>> +static void filter_buffer_get_interval(Object *obj, Visitor *v,
>>> void *opaque,
>>> +                                       const char *name, Error **errp)
>>> +{
>>> +    FilterBufferState *s = FILTER_BUFFER(obj);
>>> +    uint32_t value = s->interval;
>>> +
>>> +    visit_type_uint32(v, &value, name, errp);
>>> +}
>>> +
>>> +static void filter_buffer_set_interval(Object *obj, Visitor *v,
>>> void *opaque,
>>> +                                       const char *name, Error **errp)
>>> +{
>>> +    FilterBufferState *s = FILTER_BUFFER(obj);
>>> +    Error *local_err = NULL;
>>> +    uint32_t value;
>>> +
>>> +    visit_type_uint32(v, &value, name, &local_err);
>>> +    if (local_err) {
>>> +        goto out;
>>> +    }
>>> +    if (!value) {
>>> +        error_setg(&local_err, "Property '%s.%s' doesn't take value '%"
>>> +                   PRIu32 "'", object_get_typename(obj), name, value);
>>
>> What does it take?  That's what the user wants to know...  Perhaps
>> "Property '%s.%s' requires a positive value".
>
> Maybe better, thanks.
>
>>
>>> +        goto out;
>>> +    }
>>> +    s->interval = value;
>>> +
>>> +out:
>>> +    error_propagate(errp, local_err);
>>> +}
>>> +
>>> +static void filter_buffer_init(Object *obj)
>>> +{
>>> +    object_property_add(obj, "interval", "int",
>>> +                        filter_buffer_get_interval,
>>> +                        filter_buffer_set_interval, NULL, NULL, NULL);
>>> +}
>>> +
>>> +static const TypeInfo filter_buffer_info = {
>>> +    .name = TYPE_FILTER_BUFFER,
>>> +    .parent = TYPE_NETFILTER,
>>> +    .class_init = filter_buffer_class_init,
>>> +    .instance_init = filter_buffer_init,
>>> +    .instance_size = sizeof(FilterBufferState),
>>> +};
>>> +
>>> +static void register_types(void)
>>> +{
>>> +    type_register_static(&filter_buffer_info);
>>> +}
>>> +
>>> +type_init(register_types);
>>> diff --git a/qemu-options.hx b/qemu-options.hx
>>> index 7e147b8..b09f97f 100644
>>> --- a/qemu-options.hx
>>> +++ b/qemu-options.hx
>>> @@ -3642,6 +3642,24 @@ in PEM format, in filenames
>>> @var{ca-cert.pem}, @var{ca-crl.pem} (optional),
>>>   @var{server-cert.pem} (only servers), @var{server-key.pem} (only servers),
>>>   @var{client-cert.pem} (only clients), and @var{client-key.pem}
>>> (only clients).
>>>
>>> +@item -object
>>> filter-buffer,id=@var{id},netdev=@var{netdevid}[,chain=@var{all|in|out}][,interval=@var{t}]
>>> +
>>> +Buffer network packets on netdev @var{netdevid}.
>>> +If interval @var{t} provided, will release packets by interval.
>>> +Interval scale: microsecond.
>>> +
>>> +If interval @var{t} not provided, you have to make sure the packets can be
>>
>> Are you sure omitting t works?  filter_buffer_set_interval() rejects
>> zero...
>
> When we omit t, filter_buffer_set_interval() won't be called.

But filter_buffer_setup() will be, and it will fail, won't it?

>>> +released, either by manually remove this filter or call the
>>> release buffer API,
>>> +otherwise, the packets will be buffered forever. Use with caution.
>>
>> Please wrap your lines a bit earlier.
>>
>>> +
>>> +chain @var{all|in|out} is an option that can be applied to any
>>> netfilter, default is @option{all}.
>>> +
>>> +@option{all} means this filter will receive packets both sent
>>> to/from the netdev
>>> +
>>> +@option{in} means this filter will receive packets sent to the netdev
>>> +
>>> +@option{out} means this filter will receive packets sent from the netdev
>>> +
>>
>> I'd describe this like "filter is inserted in the receive / transmit
>> queue".
>
> thanks.
>
>>
>>>   @end table
>>>
>>>   ETEXI
>>> diff --git a/vl.c b/vl.c
>>> index ec589e2..3cf89d5 100644
>>> --- a/vl.c
>>> +++ b/vl.c
>>> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char *type)
>>>       if (g_str_equal(type, "rng-egd")) {
>>>           return false;
>>>       }
>>> -    /* TODO: return false for concrete netfilters */
>>> +
>>> +    /* return false for concrete netfilters */
>>
>> I find this comment useless, please drop it :)
>
> Ok, thanks.
>
>>
>>> +    if (g_str_equal(type, "filter-buffer")) {
>>> +        return false;
>>> +    }
>>> +
>>>       return true;
>>>   }
>> .
>>
Markus Armbruster Sept. 25, 2015, 3:13 p.m. UTC | #8
Jason Wang <jasowang@redhat.com> writes:

> On 09/25/2015 03:18 PM, Yang Hongyang wrote:
>>
>>
>> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
[...]
>>>> +static ssize_t filter_buffer_receive_iov(NetFilterState *nf,
>>>> +                                         NetClientState *sender,
>>>> +                                         unsigned flags,
>>>> +                                         const struct iovec *iov,
>>>> +                                         int iovcnt,
>>>> +                                         NetPacketSent *sent_cb)
>>>> +{
>>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>>> +
>>>> +    /*
>>>> +     * we return size when buffer a packet, the sender will take it as
>>>> +     * a already sent packet, so sent_cb should not be called later
[...]
>>>> +     * FIXME: even if guest can't receive packet for some reasons.
>>>> Filter
>>>> +     * can still accept packet until its internal queue is full.
>>>> +     */
>>>
>>> I'm not sure I understand the comment.
>>
>> This is taken from Jason's comments, may be he can have a better explain?
>
> For example. For some reason, receiver could not receive more packets
> (.can_receive() returns zero). Without a filter, at most one packet will
> be queued in incoming queue and sender's poll will be disabled unit its
> sent_cb() was called. With a filter, it will keep receive the packets
> without caring about the receiver. This is suboptimal. May need more
> thoughts (e.g keeping sent_cb).

Aha.  Perhaps you can work that explanation into the comment.
Markus Armbruster Sept. 25, 2015, 3:26 p.m. UTC | #9
Thomas Huth <thuth@redhat.com> writes:

> On 25/09/15 10:03, Yang Hongyang wrote:
>> 
>> 
>> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>> [...]
>>>> diff --git a/vl.c b/vl.c
>>>> index ec589e2..3cf89d5 100644
>>>> --- a/vl.c
>>>> +++ b/vl.c
>>>> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char
>>>> *type)
      /*
       * Initial object creation happens before all other
       * QEMU data types are created. The majority of objects
       * can be created at this point. The rng-egd object
       * cannot be created here, as it depends on the chardev
       * already existing.
       */
      static bool object_create_initial(const char *type)
      {
>>>>      if (g_str_equal(type, "rng-egd")) {
>>>>          return false;
>>>>      }
>>>> -    /* TODO: return false for concrete netfilters */
>>>> +
>>>> +    /* return false for concrete netfilters */
     +    if (g_str_equal(type, "filter-buffer")) {
     +        return false;
     +    }
     +
          return true;
      }
>>>
>>> I find this comment useless, please drop it :)
>> 
>> This might be useful for reminding others who wants to implement other
>> filters.

I think as soon as there's one, how to add more is obvious enough.

> I think the comment should explain why the code is return false here,
> not what the code is doing (which is obvious). So maybe something like:
>
>     /*
>      * netfilters require that the corresponding
>      * netdevs are already existing
>      */
>
> ?

That explains *why* netfilters need to be initialized late.  More
useful.

The pre-existing case is explained in the function comment.  Which
becomes misleading at this point.

    /*
     * Can @type objects be created during initial object creation?
     * Initial object creation happens before all other QEMU data types
     * are created.  The majority of objects can be created at this
     * point.  Some can't, because the depend on other things already
     * existing.
     */
    static bool object_create_initial(const char *type)
    {
        /* rng-egd depends on its chardev already existing */
        if (g_str_equal(type, "rng-egd")) {
            return false;
        }

        /* Net filters depend on their netdev */
        if (g_str_equal(type, "filter-buffer")) {
            return false;
        }

        return true;
    }

Having to enumerate alll the net filter names here isn't nice.  An "is
subtype of TYPE_NETFILTER" test would be better.  Can't say whether it's
practical.

Of course, the real solution is to create objects either in topological
or in command line order, but that's not in the cards right now.
Jason Wang Sept. 28, 2015, 6:12 a.m. UTC | #10
On 09/25/2015 11:07 PM, Markus Armbruster wrote:
> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>
>> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>>>
>>>> This filter is to buffer/release packets, this feature can be used
>>>> when using MicroCheckpointing, or other Remus like VM FT solutions, you
>>> What's "Remus"?

[...]

>>>
>>>> +
>>>> +static void filter_buffer_release_timer(void *opaque)
>>>> +{
>>>> +    NetFilterState *nf = opaque;
>>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>> Style nit: blank line between declarations and statements, please.
>>>
>>>> +    filter_buffer_flush(nf);
>>> Is purging correct here?
> When the timer expires, we flush as many buffered packets as we can,
> then throw away the rest.  Why throw them away?  Shouldn't we leave them
> in the buffer, and only throw away packets when the buffer is full?

May need a "FIXME" or "TODO" here. I think this is for simplicity. We
could queue the packet if the receiver or next filter could not receive
packets. But currently there's no way for the next filter or recivier to
notify us that it can receive more packet. This could be done in the future.
Yang Hongyang Sept. 28, 2015, 6:40 a.m. UTC | #11
On 09/25/2015 11:26 PM, Markus Armbruster wrote:
> Thomas Huth <thuth@redhat.com> writes:
>
>> On 25/09/15 10:03, Yang Hongyang wrote:
>>>
>>>
>>> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>>>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>>> [...]
>>>>> diff --git a/vl.c b/vl.c
>>>>> index ec589e2..3cf89d5 100644
>>>>> --- a/vl.c
>>>>> +++ b/vl.c
>>>>> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char
>>>>> *type)
>        /*
>         * Initial object creation happens before all other
>         * QEMU data types are created. The majority of objects
>         * can be created at this point. The rng-egd object
>         * cannot be created here, as it depends on the chardev
>         * already existing.
>         */
>        static bool object_create_initial(const char *type)
>        {
>>>>>       if (g_str_equal(type, "rng-egd")) {
>>>>>           return false;
>>>>>       }
>>>>> -    /* TODO: return false for concrete netfilters */
>>>>> +
>>>>> +    /* return false for concrete netfilters */
>       +    if (g_str_equal(type, "filter-buffer")) {
>       +        return false;
>       +    }
>       +
>            return true;
>        }
>>>>
>>>> I find this comment useless, please drop it :)
>>>
>>> This might be useful for reminding others who wants to implement other
>>> filters.
>
> I think as soon as there's one, how to add more is obvious enough.
>
>> I think the comment should explain why the code is return false here,
>> not what the code is doing (which is obvious). So maybe something like:
>>
>>      /*
>>       * netfilters require that the corresponding
>>       * netdevs are already existing
>>       */
>>
>> ?
>
> That explains *why* netfilters need to be initialized late.  More
> useful.
>
> The pre-existing case is explained in the function comment.  Which
> becomes misleading at this point.
>
>      /*
>       * Can @type objects be created during initial object creation?
>       * Initial object creation happens before all other QEMU data types
>       * are created.  The majority of objects can be created at this
>       * point.  Some can't, because the depend on other things already
>       * existing.
>       */
>      static bool object_create_initial(const char *type)
>      {
>          /* rng-egd depends on its chardev already existing */
>          if (g_str_equal(type, "rng-egd")) {
>              return false;
>          }
>
>          /* Net filters depend on their netdev */
>          if (g_str_equal(type, "filter-buffer")) {
>              return false;
>          }
>
>          return true;
>      }
>
> Having to enumerate alll the net filter names here isn't nice.  An "is
> subtype of TYPE_NETFILTER" test would be better.  Can't say whether it's
> practical.
>
> Of course, the real solution is to create objects either in topological
> or in command line order, but that's not in the cards right now.

Further work should needed for this, currently, I will remove the comment
as you wish, thanks.

> .
>
Yang Hongyang Sept. 28, 2015, 6:42 a.m. UTC | #12
On 09/25/2015 11:07 PM, Markus Armbruster wrote:
> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>
>> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>>>
>>>> This filter is to buffer/release packets, this feature can be used
>>>> when using MicroCheckpointing, or other Remus like VM FT solutions, you
>>>
>>> What's "Remus"?
>>
>> Remus is an opensource VM FT solution:
>> paper: http://http//www.cs.ubc.ca/~andy/papers/remus-nsdi-final.pdf
>> First implemented on Xen:
>> http://wiki.xen.org/wiki/Remus
>>
>> MicroCheckpointing in QEMU is another Remus implementation.
>>
>>>
>>>> can also use it to simulate the network delay.
>>>> It has an interval option, if supplied, this filter will release
>>>> packets by interval.
>>>
>>> Suggest "will delay packets by that time interval."
>>>
>>> Is interval really optional?
>>
>> It supposed to be optional. When the buffer filter has a user:
>> http://lists.nongnu.org/archive/html/qemu-devel/2015-09/msg00363.html
>>
>> In this patchset, the zero interval check is removed. packets are
>> released on demand through filter_buffer_release_all() api call.
>
> Understand.  But is interval optional right at this point in this patch
> series?

It's not, will remove this optional description although we
will make it optional soon.

>
>>>
>>>>
>>>> Usage:
>>>>    -netdev tap,id=bn0
>>>>    -object filter-buffer,id=f0,netdev=bn0,chain=in,interval=1000
>>>>
>>>> NOTE:
>>>>    the scale of interval is microsecond.
>>>
>>> Perhaps "interval is in microseconds".
>>
>> Better, thanks.
>>
>>>
>>>>
>>>> Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
>>>> ---
>>>> v11: add a fixme comment from Jason
>>>> v10: use NetQueue flush api to flush packets
>>>>        sent_cb can not be called when we already return size
>>>> v9: adjustment due to the qapi change
>>>> v7: use QTAILQ_FOREACH_SAFE() when flush packets
>>>> v6: move the interval check earlier and some comment adjust
>>>> v5: remove dummy sent_cb
>>>>       change interval type from int64 to uint32
>>>>       check interval!=0 when initialise
>>>>       rename FILTERBUFFERState to FilterBufferState
>>>> v4: remove bh
>>>>       pass the packet to next filter instead of receiver
>>>> v3: check packet's sender and sender->peer when flush it
>>>> ---
>>>>    net/Makefile.objs   |   1 +
>>>>    net/filter-buffer.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>    qemu-options.hx     |  18 ++++++
>>>>    vl.c                |   7 ++-
>>>>    4 files changed, 195 insertions(+), 1 deletion(-)
>>>>    create mode 100644 net/filter-buffer.c
>>>>
>>>> diff --git a/net/Makefile.objs b/net/Makefile.objs
>>>> index 914aec0..5fa2f97 100644
>>>> --- a/net/Makefile.objs
>>>> +++ b/net/Makefile.objs
>>>> @@ -14,3 +14,4 @@ common-obj-$(CONFIG_SLIRP) += slirp.o
>>>>    common-obj-$(CONFIG_VDE) += vde.o
>>>>    common-obj-$(CONFIG_NETMAP) += netmap.o
>>>>    common-obj-y += filter.o
>>>> +common-obj-y += filter-buffer.o
>>>> diff --git a/net/filter-buffer.c b/net/filter-buffer.c
>>>> new file mode 100644
>>>> index 0000000..ef94e91
>>>> --- /dev/null
>>>> +++ b/net/filter-buffer.c
>>>> @@ -0,0 +1,170 @@
>>>> +/*
>>>> + * Copyright (c) 2015 FUJITSU LIMITED
>>>> + * Author: Yang Hongyang <yanghy@cn.fujitsu.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>>>> + * later.  See the COPYING file in the top-level directory.
>>>> + */
>>>> +
>>>> +#include "net/filter.h"
>>>> +#include "net/queue.h"
>>>> +#include "qemu-common.h"
>>>> +#include "qemu/timer.h"
>>>> +#include "qemu/iov.h"
>>>> +#include "qapi/qmp/qerror.h"
>>>> +#include "qapi-visit.h"
>>>> +#include "qom/object.h"
>>>> +
>>>> +#define TYPE_FILTER_BUFFER "filter-buffer"
>>>> +
>>>> +#define FILTER_BUFFER(obj) \
>>>> +    OBJECT_CHECK(FilterBufferState, (obj), TYPE_FILTER_BUFFER)
>>>> +
>>>> +struct FilterBufferState {
>>>> +    NetFilterState parent_obj;
>>>> +
>>>> +    NetQueue *incoming_queue;
>>>> +    uint32_t interval;
>>>> +    QEMUTimer release_timer;
>>>> +};
>>>> +typedef struct FilterBufferState FilterBufferState;
>>>
>>> Again, not splitting the declaration is more concise.
>>>
>>>> +
>>>> +static void filter_buffer_flush(NetFilterState *nf)
>>>> +{
>>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>>> +
>>>> +    if (!qemu_net_queue_flush(s->incoming_queue)) {
>>>> +        /* Unable to empty the queue, purge remaining packets */
>>>> +        qemu_net_queue_purge(s->incoming_queue, nf->netdev);
>>>> +    }
>>>> +}
>>>
>>> This either flushes or purges incoming_queue, where "purge" means
>>> dropping packets.  Correct?
>>
>> I think it is correct. Dropping packets is allowed, even on real
>> hardware, packets may lose.
>>
>>>
>>>> +
>>>> +static void filter_buffer_release_timer(void *opaque)
>>>> +{
>>>> +    NetFilterState *nf = opaque;
>>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>>
>>> Style nit: blank line between declarations and statements, please.
>>>
>>>> +    filter_buffer_flush(nf);
>>>
>>> Is purging correct here?
>
> When the timer expires, we flush as many buffered packets as we can,
> then throw away the rest.  Why throw them away?  Shouldn't we leave them
> in the buffer, and only throw away packets when the buffer is full?
>
>>>> +    timer_mod(&s->release_timer,
>>>> +              qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);
>>>
>>> Timer rearmed to fire again in s->interval microseconds.
>>
>> Yes.
>>
>>>
>>>> +}
>>>> +
>>>> +/* filter APIs */
>>>> +static ssize_t filter_buffer_receive_iov(NetFilterState *nf,
>>>> +                                         NetClientState *sender,
>>>> +                                         unsigned flags,
>>>> +                                         const struct iovec *iov,
>>>> +                                         int iovcnt,
>>>> +                                         NetPacketSent *sent_cb)
>>>> +{
>>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>>> +
>>>> +    /*
>>>> +     * we return size when buffer a packet, the sender will take it as
>>>> +     * a already sent packet, so sent_cb should not be called later
>>>
>>> Humor me: when a comment has multiple sentences, start each one with a
>>> capital letter, and end it with punctuation.
>>
>> Ok.
>>
>>>
>>>> +     * FIXME: even if guest can't receive packet for some reasons. Filter
>>>> +     * can still accept packet until its internal queue is full.
>>>> +     */
>>>
>>> I'm not sure I understand the comment.
>>
>> This is taken from Jason's comments, may be he can have a better explain?
>>
>>>
>>>> +    qemu_net_queue_append_iov(s->incoming_queue, sender, flags,
>>>> +                              iov, iovcnt, NULL);
>>>> +    return iov_size(iov, iovcnt);
>>>> +}
>>>> +
>>>> +static void filter_buffer_cleanup(NetFilterState *nf)
>>>> +{
>>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>>> +
>>>> +    if (s->interval) {
>>>> +        timer_del(&s->release_timer);
>>>> +    }
>>>> +
>>>> +    /* flush packets */
>>>> +    if (s->incoming_queue) {
>>>> +        filter_buffer_flush(nf);
>>>
>>> I guess purging is correct here.
>>>
>>>> +        g_free(s->incoming_queue);
>>>> +    }
>>>> +}
>>>> +
>>>> +static void filter_buffer_setup(NetFilterState *nf, Error **errp)
>>>> +{
>>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>>> +
>>>> +    /*
>>>> +     * this check should be dropped when there're VM FT solutions like MC
>>>> +     * or COLO use this filter to release packets on demand.
>>>> +     */
>>>
>>> If you end a sentence with a period, you get to start it with a capital
>>> letter :)
>>
>> Ok, thanks.
>>
>>>
>>> "there're" is odd.  Perhaps something like "We may want to accept zero
>>> interval when VM FT solutions like MC or COLO use this filter to release
>>> packets on demand."
>>>
>>>> +    if (!s->interval) {
>>>> +        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "interval",
>>>> +                   "a non-zero interval");
>>>
>>> How can this happen?  Doesn't filter_buffer_set_interval() catch zero
>>> intervals already?
>>
>> When user do not supply an interval parameter, filter_buffer_set_interval()
>> won't be called, and the interval is 0.
>> When we have actual user like I mentioned above, we should allow user to omit
>> the interval option.
>
> Let's see whether I understand.
>
> If you specify a non-zero interval, filter_buffer_set_interval() stores
> it.
>
> If you specify a zero interval, filter_buffer_set_interval() rejects it.
>
> If you specify no interval, it defaults to zero, and
> filter_buffer_setup() fails right here.
>
> Therefore, interval is not optional right now.
>
> Correct?
>
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
>>>> +    if (s->interval) {
>>>
>>> Condition cannot be false.  Same in filter_buffer_cleanup().
>>>
>>>> +        timer_init_us(&s->release_timer, QEMU_CLOCK_VIRTUAL,
>>>> +                      filter_buffer_release_timer, nf);
>>>> +        timer_mod(&s->release_timer,
>>>> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);
>>>
>>> Timer armed to fire in s->interval microseconds.
>>>
>>> Unless I misunderstand something, this doesn't actually delay each
>>> packet by s->interval microseconds, it batches packet delivery: all
>>> packets arriving in a given interval are delayed until the end of the
>>> interval.  Correct?
>>
>> Correct.
>
> Documentation needs to spell that out.
>
>>>> +    }
>>>> +}
>>>> +
>>>> +static void filter_buffer_class_init(ObjectClass *oc, void *data)
>>>> +{
>>>> +    NetFilterClass *nfc = NETFILTER_CLASS(oc);
>>>> +
>>>> +    nfc->setup = filter_buffer_setup;
>>>> +    nfc->cleanup = filter_buffer_cleanup;
>>>> +    nfc->receive_iov = filter_buffer_receive_iov;
>>>> +}
>>>> +
>>>> +static void filter_buffer_get_interval(Object *obj, Visitor *v,
>>>> void *opaque,
>>>> +                                       const char *name, Error **errp)
>>>> +{
>>>> +    FilterBufferState *s = FILTER_BUFFER(obj);
>>>> +    uint32_t value = s->interval;
>>>> +
>>>> +    visit_type_uint32(v, &value, name, errp);
>>>> +}
>>>> +
>>>> +static void filter_buffer_set_interval(Object *obj, Visitor *v,
>>>> void *opaque,
>>>> +                                       const char *name, Error **errp)
>>>> +{
>>>> +    FilterBufferState *s = FILTER_BUFFER(obj);
>>>> +    Error *local_err = NULL;
>>>> +    uint32_t value;
>>>> +
>>>> +    visit_type_uint32(v, &value, name, &local_err);
>>>> +    if (local_err) {
>>>> +        goto out;
>>>> +    }
>>>> +    if (!value) {
>>>> +        error_setg(&local_err, "Property '%s.%s' doesn't take value '%"
>>>> +                   PRIu32 "'", object_get_typename(obj), name, value);
>>>
>>> What does it take?  That's what the user wants to know...  Perhaps
>>> "Property '%s.%s' requires a positive value".
>>
>> Maybe better, thanks.
>>
>>>
>>>> +        goto out;
>>>> +    }
>>>> +    s->interval = value;
>>>> +
>>>> +out:
>>>> +    error_propagate(errp, local_err);
>>>> +}
>>>> +
>>>> +static void filter_buffer_init(Object *obj)
>>>> +{
>>>> +    object_property_add(obj, "interval", "int",
>>>> +                        filter_buffer_get_interval,
>>>> +                        filter_buffer_set_interval, NULL, NULL, NULL);
>>>> +}
>>>> +
>>>> +static const TypeInfo filter_buffer_info = {
>>>> +    .name = TYPE_FILTER_BUFFER,
>>>> +    .parent = TYPE_NETFILTER,
>>>> +    .class_init = filter_buffer_class_init,
>>>> +    .instance_init = filter_buffer_init,
>>>> +    .instance_size = sizeof(FilterBufferState),
>>>> +};
>>>> +
>>>> +static void register_types(void)
>>>> +{
>>>> +    type_register_static(&filter_buffer_info);
>>>> +}
>>>> +
>>>> +type_init(register_types);
>>>> diff --git a/qemu-options.hx b/qemu-options.hx
>>>> index 7e147b8..b09f97f 100644
>>>> --- a/qemu-options.hx
>>>> +++ b/qemu-options.hx
>>>> @@ -3642,6 +3642,24 @@ in PEM format, in filenames
>>>> @var{ca-cert.pem}, @var{ca-crl.pem} (optional),
>>>>    @var{server-cert.pem} (only servers), @var{server-key.pem} (only servers),
>>>>    @var{client-cert.pem} (only clients), and @var{client-key.pem}
>>>> (only clients).
>>>>
>>>> +@item -object
>>>> filter-buffer,id=@var{id},netdev=@var{netdevid}[,chain=@var{all|in|out}][,interval=@var{t}]
>>>> +
>>>> +Buffer network packets on netdev @var{netdevid}.
>>>> +If interval @var{t} provided, will release packets by interval.
>>>> +Interval scale: microsecond.
>>>> +
>>>> +If interval @var{t} not provided, you have to make sure the packets can be
>>>
>>> Are you sure omitting t works?  filter_buffer_set_interval() rejects
>>> zero...
>>
>> When we omit t, filter_buffer_set_interval() won't be called.
>
> But filter_buffer_setup() will be, and it will fail, won't it?
>
>>>> +released, either by manually remove this filter or call the
>>>> release buffer API,
>>>> +otherwise, the packets will be buffered forever. Use with caution.
>>>
>>> Please wrap your lines a bit earlier.
>>>
>>>> +
>>>> +chain @var{all|in|out} is an option that can be applied to any
>>>> netfilter, default is @option{all}.
>>>> +
>>>> +@option{all} means this filter will receive packets both sent
>>>> to/from the netdev
>>>> +
>>>> +@option{in} means this filter will receive packets sent to the netdev
>>>> +
>>>> +@option{out} means this filter will receive packets sent from the netdev
>>>> +
>>>
>>> I'd describe this like "filter is inserted in the receive / transmit
>>> queue".
>>
>> thanks.
>>
>>>
>>>>    @end table
>>>>
>>>>    ETEXI
>>>> diff --git a/vl.c b/vl.c
>>>> index ec589e2..3cf89d5 100644
>>>> --- a/vl.c
>>>> +++ b/vl.c
>>>> @@ -2794,7 +2794,12 @@ static bool object_create_initial(const char *type)
>>>>        if (g_str_equal(type, "rng-egd")) {
>>>>            return false;
>>>>        }
>>>> -    /* TODO: return false for concrete netfilters */
>>>> +
>>>> +    /* return false for concrete netfilters */
>>>
>>> I find this comment useless, please drop it :)
>>
>> Ok, thanks.
>>
>>>
>>>> +    if (g_str_equal(type, "filter-buffer")) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>>        return true;
>>>>    }
>>> .
>>>
> .
>
Markus Armbruster Sept. 28, 2015, 7:38 a.m. UTC | #13
Jason Wang <jasowang@redhat.com> writes:

> On 09/25/2015 11:07 PM, Markus Armbruster wrote:
>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>>
>>> On 09/24/2015 05:12 PM, Markus Armbruster wrote:
>>>> Yang Hongyang <yanghy@cn.fujitsu.com> writes:
>>>>
>>>>> This filter is to buffer/release packets, this feature can be used
>>>>> when using MicroCheckpointing, or other Remus like VM FT solutions, you
>>>> What's "Remus"?
>
> [...]
>
>>>>
>>>>> +
>>>>> +static void filter_buffer_release_timer(void *opaque)
>>>>> +{
>>>>> +    NetFilterState *nf = opaque;
>>>>> +    FilterBufferState *s = FILTER_BUFFER(nf);
>>>> Style nit: blank line between declarations and statements, please.
>>>>
>>>>> +    filter_buffer_flush(nf);
>>>> Is purging correct here?
>> When the timer expires, we flush as many buffered packets as we can,
>> then throw away the rest.  Why throw them away?  Shouldn't we leave them
>> in the buffer, and only throw away packets when the buffer is full?
>
> May need a "FIXME" or "TODO" here. I think this is for simplicity. We
> could queue the packet if the receiver or next filter could not receive
> packets. But currently there's no way for the next filter or recivier to
> notify us that it can receive more packet. This could be done in the future.

Good enough for me.  Make it FIXME if purging packets is actually wrong,
else TODO.
diff mbox

Patch

diff --git a/net/Makefile.objs b/net/Makefile.objs
index 914aec0..5fa2f97 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -14,3 +14,4 @@  common-obj-$(CONFIG_SLIRP) += slirp.o
 common-obj-$(CONFIG_VDE) += vde.o
 common-obj-$(CONFIG_NETMAP) += netmap.o
 common-obj-y += filter.o
+common-obj-y += filter-buffer.o
diff --git a/net/filter-buffer.c b/net/filter-buffer.c
new file mode 100644
index 0000000..ef94e91
--- /dev/null
+++ b/net/filter-buffer.c
@@ -0,0 +1,170 @@ 
+/*
+ * Copyright (c) 2015 FUJITSU LIMITED
+ * Author: Yang Hongyang <yanghy@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "net/filter.h"
+#include "net/queue.h"
+#include "qemu-common.h"
+#include "qemu/timer.h"
+#include "qemu/iov.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi-visit.h"
+#include "qom/object.h"
+
+#define TYPE_FILTER_BUFFER "filter-buffer"
+
+#define FILTER_BUFFER(obj) \
+    OBJECT_CHECK(FilterBufferState, (obj), TYPE_FILTER_BUFFER)
+
+struct FilterBufferState {
+    NetFilterState parent_obj;
+
+    NetQueue *incoming_queue;
+    uint32_t interval;
+    QEMUTimer release_timer;
+};
+typedef struct FilterBufferState FilterBufferState;
+
+static void filter_buffer_flush(NetFilterState *nf)
+{
+    FilterBufferState *s = FILTER_BUFFER(nf);
+
+    if (!qemu_net_queue_flush(s->incoming_queue)) {
+        /* Unable to empty the queue, purge remaining packets */
+        qemu_net_queue_purge(s->incoming_queue, nf->netdev);
+    }
+}
+
+static void filter_buffer_release_timer(void *opaque)
+{
+    NetFilterState *nf = opaque;
+    FilterBufferState *s = FILTER_BUFFER(nf);
+    filter_buffer_flush(nf);
+    timer_mod(&s->release_timer,
+              qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);
+}
+
+/* filter APIs */
+static ssize_t filter_buffer_receive_iov(NetFilterState *nf,
+                                         NetClientState *sender,
+                                         unsigned flags,
+                                         const struct iovec *iov,
+                                         int iovcnt,
+                                         NetPacketSent *sent_cb)
+{
+    FilterBufferState *s = FILTER_BUFFER(nf);
+
+    /*
+     * we return size when buffer a packet, the sender will take it as
+     * a already sent packet, so sent_cb should not be called later
+     * FIXME: even if guest can't receive packet for some reasons. Filter
+     * can still accept packet until its internal queue is full.
+     */
+    qemu_net_queue_append_iov(s->incoming_queue, sender, flags,
+                              iov, iovcnt, NULL);
+    return iov_size(iov, iovcnt);
+}
+
+static void filter_buffer_cleanup(NetFilterState *nf)
+{
+    FilterBufferState *s = FILTER_BUFFER(nf);
+
+    if (s->interval) {
+        timer_del(&s->release_timer);
+    }
+
+    /* flush packets */
+    if (s->incoming_queue) {
+        filter_buffer_flush(nf);
+        g_free(s->incoming_queue);
+    }
+}
+
+static void filter_buffer_setup(NetFilterState *nf, Error **errp)
+{
+    FilterBufferState *s = FILTER_BUFFER(nf);
+
+    /*
+     * this check should be dropped when there're VM FT solutions like MC
+     * or COLO use this filter to release packets on demand.
+     */
+    if (!s->interval) {
+        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "interval",
+                   "a non-zero interval");
+        return;
+    }
+
+    s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
+    if (s->interval) {
+        timer_init_us(&s->release_timer, QEMU_CLOCK_VIRTUAL,
+                      filter_buffer_release_timer, nf);
+        timer_mod(&s->release_timer,
+                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + s->interval);
+    }
+}
+
+static void filter_buffer_class_init(ObjectClass *oc, void *data)
+{
+    NetFilterClass *nfc = NETFILTER_CLASS(oc);
+
+    nfc->setup = filter_buffer_setup;
+    nfc->cleanup = filter_buffer_cleanup;
+    nfc->receive_iov = filter_buffer_receive_iov;
+}
+
+static void filter_buffer_get_interval(Object *obj, Visitor *v, void *opaque,
+                                       const char *name, Error **errp)
+{
+    FilterBufferState *s = FILTER_BUFFER(obj);
+    uint32_t value = s->interval;
+
+    visit_type_uint32(v, &value, name, errp);
+}
+
+static void filter_buffer_set_interval(Object *obj, Visitor *v, void *opaque,
+                                       const char *name, Error **errp)
+{
+    FilterBufferState *s = FILTER_BUFFER(obj);
+    Error *local_err = NULL;
+    uint32_t value;
+
+    visit_type_uint32(v, &value, name, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    if (!value) {
+        error_setg(&local_err, "Property '%s.%s' doesn't take value '%"
+                   PRIu32 "'", object_get_typename(obj), name, value);
+        goto out;
+    }
+    s->interval = value;
+
+out:
+    error_propagate(errp, local_err);
+}
+
+static void filter_buffer_init(Object *obj)
+{
+    object_property_add(obj, "interval", "int",
+                        filter_buffer_get_interval,
+                        filter_buffer_set_interval, NULL, NULL, NULL);
+}
+
+static const TypeInfo filter_buffer_info = {
+    .name = TYPE_FILTER_BUFFER,
+    .parent = TYPE_NETFILTER,
+    .class_init = filter_buffer_class_init,
+    .instance_init = filter_buffer_init,
+    .instance_size = sizeof(FilterBufferState),
+};
+
+static void register_types(void)
+{
+    type_register_static(&filter_buffer_info);
+}
+
+type_init(register_types);
diff --git a/qemu-options.hx b/qemu-options.hx
index 7e147b8..b09f97f 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3642,6 +3642,24 @@  in PEM format, in filenames @var{ca-cert.pem}, @var{ca-crl.pem} (optional),
 @var{server-cert.pem} (only servers), @var{server-key.pem} (only servers),
 @var{client-cert.pem} (only clients), and @var{client-key.pem} (only clients).
 
+@item -object filter-buffer,id=@var{id},netdev=@var{netdevid}[,chain=@var{all|in|out}][,interval=@var{t}]
+
+Buffer network packets on netdev @var{netdevid}.
+If interval @var{t} provided, will release packets by interval.
+Interval scale: microsecond.
+
+If interval @var{t} not provided, you have to make sure the packets can be
+released, either by manually remove this filter or call the release buffer API,
+otherwise, the packets will be buffered forever. Use with caution.
+
+chain @var{all|in|out} is an option that can be applied to any netfilter, default is @option{all}.
+
+@option{all} means this filter will receive packets both sent to/from the netdev
+
+@option{in} means this filter will receive packets sent to the netdev
+
+@option{out} means this filter will receive packets sent from the netdev
+
 @end table
 
 ETEXI
diff --git a/vl.c b/vl.c
index ec589e2..3cf89d5 100644
--- a/vl.c
+++ b/vl.c
@@ -2794,7 +2794,12 @@  static bool object_create_initial(const char *type)
     if (g_str_equal(type, "rng-egd")) {
         return false;
     }
-    /* TODO: return false for concrete netfilters */
+
+    /* return false for concrete netfilters */
+    if (g_str_equal(type, "filter-buffer")) {
+        return false;
+    }
+
     return true;
 }