diff mbox

[1/2] af_packet: use vmalloc_to_page() instead for the address returned by vmalloc()

Message ID 1291125408-14389-1-git-send-email-xiaosuo@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Changli Gao Nov. 30, 2010, 1:56 p.m. UTC
The following commit causes the pgv->buffer may point to the memory
returned by vmalloc(). And we can't use virt_to_page() for the vmalloc
address.

This patch introduces a new inline function pgv_to_page(), which calls
vmalloc_to_page() for the vmalloc address, and virt_to_page() for the
__get_free_pages address.

    commit 0e3125c755445664f00ad036e4fc2cd32fd52877
    Author: Neil Horman <nhorman@tuxdriver.com>
    Date:   Tue Nov 16 10:26:47 2010 -0800

    packet: Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 net/packet/af_packet.c |   21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric Dumazet Nov. 30, 2010, 2:12 p.m. UTC | #1
Le mardi 30 novembre 2010 à 21:56 +0800, Changli Gao a écrit :
> The following commit causes the pgv->buffer may point to the memory
> returned by vmalloc(). And we can't use virt_to_page() for the vmalloc
> address.
> 
> This patch introduces a new inline function pgv_to_page(), which calls
> vmalloc_to_page() for the vmalloc address, and virt_to_page() for the
> __get_free_pages address.
> 
>     commit 0e3125c755445664f00ad036e4fc2cd32fd52877
>     Author: Neil Horman <nhorman@tuxdriver.com>
>     Date:   Tue Nov 16 10:26:47 2010 -0800
> 
>     packet: Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
> 

nice catch.

> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ---
>  net/packet/af_packet.c |   21 ++++++++++++++-------
>  1 file changed, 14 insertions(+), 7 deletions(-)
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index 422705d..0171b20 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -224,6 +224,13 @@ struct packet_skb_cb {
>  
>  #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
>  
> +static inline struct page *pgv_to_page(void *addr)
> +{
> +	if (is_vmalloc_addr(addr))
> +		return vmalloc_to_page(addr);

Hmm, I am wondering if calling vmalloc_to_page(addr) several times for
each packet is not too expensive ? I believe it is.

What about caching "struct page *" pointer somewhere ?

Then later we have :

> -		p_start = virt_to_page(h.raw);
> -		p_end = virt_to_page(h_end);
> +		p_start = pgv_to_page(h.raw);
> +		p_end = pgv_to_page(h_end);
>  		while (p_start <= p_end) {
>  			flush_dcache_page(p_start);
>  			p_start++;

This was OK before Neil patch... after vmalloc(), assumption that
p_start can be incremented is completely wrong.

To fix this, we need something else than your patch.




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Nov. 30, 2010, 2:27 p.m. UTC | #2
On Tue, Nov 30, 2010 at 10:12 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mardi 30 novembre 2010 à 21:56 +0800, Changli Gao a écrit :
>> The following commit causes the pgv->buffer may point to the memory
>> returned by vmalloc(). And we can't use virt_to_page() for the vmalloc
>> address.
>>
>> This patch introduces a new inline function pgv_to_page(), which calls
>> vmalloc_to_page() for the vmalloc address, and virt_to_page() for the
>> __get_free_pages address.
>>
>>     commit 0e3125c755445664f00ad036e4fc2cd32fd52877
>>     Author: Neil Horman <nhorman@tuxdriver.com>
>>     Date:   Tue Nov 16 10:26:47 2010 -0800
>>
>>     packet: Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
>>
>
> nice catch.
>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>> ---
>>  net/packet/af_packet.c |   21 ++++++++++++++-------
>>  1 file changed, 14 insertions(+), 7 deletions(-)
>> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
>> index 422705d..0171b20 100644
>> --- a/net/packet/af_packet.c
>> +++ b/net/packet/af_packet.c
>> @@ -224,6 +224,13 @@ struct packet_skb_cb {
>>
>>  #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
>>
>> +static inline struct page *pgv_to_page(void *addr)
>> +{
>> +     if (is_vmalloc_addr(addr))
>> +             return vmalloc_to_page(addr);
>
> Hmm, I am wondering if calling vmalloc_to_page(addr) several times for
> each packet is not too expensive ? I believe it is.
>
> What about caching "struct page *" pointer somewhere ?
>
> Then later we have :
>
>> -             p_start = virt_to_page(h.raw);
>> -             p_end = virt_to_page(h_end);
>> +             p_start = pgv_to_page(h.raw);
>> +             p_end = pgv_to_page(h_end);
>>               while (p_start <= p_end) {
>>                       flush_dcache_page(p_start);
>>                       p_start++;
>
> This was OK before Neil patch... after vmalloc(), assumption that
> p_start can be incremented is completely wrong.
>
> To fix this, we need something else than your patch.
>

Yes, you are right, and tpacket_fill_skb() has the same issue. Thanks.
Neil Horman Nov. 30, 2010, 2:37 p.m. UTC | #3
On Tue, Nov 30, 2010 at 03:12:21PM +0100, Eric Dumazet wrote:
> Le mardi 30 novembre 2010 à 21:56 +0800, Changli Gao a écrit :
> > The following commit causes the pgv->buffer may point to the memory
> > returned by vmalloc(). And we can't use virt_to_page() for the vmalloc
> > address.
> > 
> > This patch introduces a new inline function pgv_to_page(), which calls
> > vmalloc_to_page() for the vmalloc address, and virt_to_page() for the
> > __get_free_pages address.
> > 
> >     commit 0e3125c755445664f00ad036e4fc2cd32fd52877
> >     Author: Neil Horman <nhorman@tuxdriver.com>
> >     Date:   Tue Nov 16 10:26:47 2010 -0800
> > 
> >     packet: Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4)
> > 
> 
> nice catch.
> 
Ouch, yes, thanks.

> > Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> > ---
> >  net/packet/af_packet.c |   21 ++++++++++++++-------
> >  1 file changed, 14 insertions(+), 7 deletions(-)
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index 422705d..0171b20 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -224,6 +224,13 @@ struct packet_skb_cb {
> >  
> >  #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
> >  
> > +static inline struct page *pgv_to_page(void *addr)
> > +{
> > +	if (is_vmalloc_addr(addr))
> > +		return vmalloc_to_page(addr);
> 
> Hmm, I am wondering if calling vmalloc_to_page(addr) several times for
> each packet is not too expensive ? I believe it is.
> 
> What about caching "struct page *" pointer somewhere ?
> 
> Then later we have :
> 
> > -		p_start = virt_to_page(h.raw);
> > -		p_end = virt_to_page(h_end);
> > +		p_start = pgv_to_page(h.raw);
> > +		p_end = pgv_to_page(h_end);
> >  		while (p_start <= p_end) {
> >  			flush_dcache_page(p_start);
> >  			p_start++;
> 
> This was OK before Neil patch... after vmalloc(), assumption that
> p_start can be incremented is completely wrong.
> 
> To fix this, we need something else than your patch.
> 
Off the top of my head, I would think that pgv_to_page could be prototyped such
that it could accept addr, offset and struct page ** arguments.  That way we can
track the current page that we're mapped to, lowering the number of calls to
vmalloc_to_page, and we can still use an increment like we do above (as long as
its wrapped in a subsequent call to pgv_to_page)
Neil

> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Dec. 1, 2010, 1:05 p.m. UTC | #4
On Tue, Nov 30, 2010 at 10:37 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> Off the top of my head, I would think that pgv_to_page could be prototyped such
> that it could accept addr, offset and struct page ** arguments.  That way we can
> track the current page that we're mapped to, lowering the number of calls to
> vmalloc_to_page, and we can still use an increment like we do above (as long as
> its wrapped in a subsequent call to pgv_to_page)

I'll try to optimize pgv_to_page() after this patch series merged. I
am planning to call vmalloc_to_page() previously, and cache its result
in a per pgv array for future use. Thanks.
Eric Dumazet Dec. 1, 2010, 1:38 p.m. UTC | #5
Le mercredi 01 décembre 2010 à 21:05 +0800, Changli Gao a écrit :
> On Tue, Nov 30, 2010 at 10:37 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> > Off the top of my head, I would think that pgv_to_page could be prototyped such
> > that it could accept addr, offset and struct page ** arguments.  That way we can
> > track the current page that we're mapped to, lowering the number of calls to
> > vmalloc_to_page, and we can still use an increment like we do above (as long as
> > its wrapped in a subsequent call to pgv_to_page)
> 
> I'll try to optimize pgv_to_page() after this patch series merged. I
> am planning to call vmalloc_to_page() previously, and cache its result
> in a per pgv array for future use. Thanks.
> 
> 

Hmm... fact is flush_dcache_page() is void on some arches.

Maybe the only thing to do is avoid pgv_to_page() calls if
ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is 0

    The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE symbol was introduced to avoid
    pointless empty cache-thrashing loops on architectures for which
    flush_dcache_page() is a no-op.  Every architecture was provided with this
    flush pages on architectures where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is
    equal 1 or do nothing otherwise.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Dec. 1, 2010, 1:43 p.m. UTC | #6
Le mercredi 01 décembre 2010 à 14:38 +0100, Eric Dumazet a écrit :
> Le mercredi 01 décembre 2010 à 21:05 +0800, Changli Gao a écrit :
> > On Tue, Nov 30, 2010 at 10:37 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
> > > Off the top of my head, I would think that pgv_to_page could be prototyped such
> > > that it could accept addr, offset and struct page ** arguments.  That way we can
> > > track the current page that we're mapped to, lowering the number of calls to
> > > vmalloc_to_page, and we can still use an increment like we do above (as long as
> > > its wrapped in a subsequent call to pgv_to_page)
> > 
> > I'll try to optimize pgv_to_page() after this patch series merged. I
> > am planning to call vmalloc_to_page() previously, and cache its result
> > in a per pgv array for future use. Thanks.
> > 
> > 
> 
> Hmm... fact is flush_dcache_page() is void on some arches.
> 
> Maybe the only thing to do is avoid pgv_to_page() calls if
> ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is 0
> 
>     The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE symbol was introduced to avoid
>     pointless empty cache-thrashing loops on architectures for which
>     flush_dcache_page() is a no-op.  Every architecture was provided with this
>     flush pages on architectures where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is
>     equal 1 or do nothing otherwise.
> 
> 

I guess using __pure attribute on pgv_to_page() should be enough ;)

static inline __pure struct page *pgv_to_page(void *addr)
{
       if (is_vmalloc_addr(addr))
               return vmalloc_to_page(addr);
       return virt_to_page(addr);
}

Compiler then should optimize away

flush_dcache_page(pgv_to_page(addr));


But the pgv_to_page() done in packet_sendmsg() will stay, of course.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao Dec. 1, 2010, 2:16 p.m. UTC | #7
On Wed, Dec 1, 2010 at 9:38 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mercredi 01 décembre 2010 à 21:05 +0800, Changli Gao a écrit :
>> On Tue, Nov 30, 2010 at 10:37 PM, Neil Horman <nhorman@tuxdriver.com> wrote:
>> > Off the top of my head, I would think that pgv_to_page could be prototyped such
>> > that it could accept addr, offset and struct page ** arguments.  That way we can
>> > track the current page that we're mapped to, lowering the number of calls to
>> > vmalloc_to_page, and we can still use an increment like we do above (as long as
>> > its wrapped in a subsequent call to pgv_to_page)
>>
>> I'll try to optimize pgv_to_page() after this patch series merged. I
>> am planning to call vmalloc_to_page() previously, and cache its result
>> in a per pgv array for future use. Thanks.
>>
>>
>
> Hmm... fact is flush_dcache_page() is void on some arches.
>
> Maybe the only thing to do is avoid pgv_to_page() calls if
> ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is 0
>

Even then, tpacket_fill_skb() is called for every skb, and
pgv_to_page() is used in it. We have to optimize pgv_to_page().
Thanks.
Eric Dumazet Dec. 1, 2010, 3:13 p.m. UTC | #8
Le mercredi 01 décembre 2010 à 22:16 +0800, Changli Gao a écrit :

> Even then, tpacket_fill_skb() is called for every skb, and
> pgv_to_page() is used in it. We have to optimize pgv_to_page().

With the __pure trick I gave, pgv_to_page() is _not_ called for the
typical use case of af_packet : packet sniffing.

Compiler is able to remove the call completely, since

static inline void flush_dcache_page(struct page *page) { }

The only remaining pgv_to_page() call is the one done in mmap packet
send, since we have to do :

page = pgv_to_page(data);
get_page(page);

I personally dont use this path, its known to be buggy...

Optimize if you want, but make all this
ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE conditional.

Its not needed to maintain an array of 'struct page *' if its not needed
at all.

# vi +2448 block/blk-core.c

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
 * rq_flush_dcache_pages - Helper function to flush all pages in a request
 * @rq: the request to be flushed
 *
 * Description:
 *     Flush all pages in @rq.
 */
void rq_flush_dcache_pages(struct request *rq)
{
        struct req_iterator iter;
        struct bio_vec *bvec;

        rq_for_each_segment(bvec, rq, iter)
                flush_dcache_page(bvec->bv_page);
}
EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
#endif



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Dec. 1, 2010, 8:44 p.m. UTC | #9
On Wed, Dec 01, 2010 at 04:13:16PM +0100, Eric Dumazet wrote:
> Le mercredi 01 décembre 2010 à 22:16 +0800, Changli Gao a écrit :
> 
> > Even then, tpacket_fill_skb() is called for every skb, and
> > pgv_to_page() is used in it. We have to optimize pgv_to_page().
> 
> With the __pure trick I gave, pgv_to_page() is _not_ called for the
> typical use case of af_packet : packet sniffing.
> 
> Compiler is able to remove the call completely, since
> 
> static inline void flush_dcache_page(struct page *page) { }
> 
> The only remaining pgv_to_page() call is the one done in mmap packet
> send, since we have to do :
> 
> page = pgv_to_page(data);
> get_page(page);
> 
> I personally dont use this path, its known to be buggy...
> 
> Optimize if you want, but make all this
> ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE conditional.
> 
> Its not needed to maintain an array of 'struct page *' if its not needed
> at all.
> 
+1 to this approach, its much cleaner than maintaining an array of struct pages to
do fast mappings.  I just spent a few hours putting a test patch together to do
such a mapping, and it requires either doing a lengthly search of the ring
buffer, or passing around the struct pgv that the potentially vmalloced address
came from, which fans out quite a lot and gets tricky in several places (the tx
paths skb destructor for instance).  At any rate, letting the compiler
eliminate code using __pure here seems much more efficient
Neil

> # vi +2448 block/blk-core.c
> 
> #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
> /**
>  * rq_flush_dcache_pages - Helper function to flush all pages in a request
>  * @rq: the request to be flushed
>  *
>  * Description:
>  *     Flush all pages in @rq.
>  */
> void rq_flush_dcache_pages(struct request *rq)
> {
>         struct req_iterator iter;
>         struct bio_vec *bvec;
> 
>         rq_for_each_segment(bvec, rq, iter)
>                 flush_dcache_page(bvec->bv_page);
> }
> EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
> #endif
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 422705d..0171b20 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -224,6 +224,13 @@  struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
 
+static inline struct page *pgv_to_page(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	return virt_to_page(addr);
+}
+
 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
 	union {
@@ -236,11 +243,11 @@  static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 	switch (po->tp_version) {
 	case TPACKET_V1:
 		h.h1->tp_status = status;
-		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		break;
 	case TPACKET_V2:
 		h.h2->tp_status = status;
-		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
 	default:
 		pr_err("TPACKET version not supported\n");
@@ -263,10 +270,10 @@  static int __packet_get_status(struct packet_sock *po, void *frame)
 	h.raw = frame;
 	switch (po->tp_version) {
 	case TPACKET_V1:
-		flush_dcache_page(virt_to_page(&h.h1->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		return h.h1->tp_status;
 	case TPACKET_V2:
-		flush_dcache_page(virt_to_page(&h.h2->tp_status));
+		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
 	default:
 		pr_err("TPACKET version not supported\n");
@@ -803,8 +810,8 @@  static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		struct page *p_start, *p_end;
 		u8 *h_end = h.raw + macoff + snaplen - 1;
 
-		p_start = virt_to_page(h.raw);
-		p_end = virt_to_page(h_end);
+		p_start = pgv_to_page(h.raw);
+		p_end = pgv_to_page(h_end);
 		while (p_start <= p_end) {
 			flush_dcache_page(p_start);
 			p_start++;
@@ -915,7 +922,7 @@  static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 	}
 
 	err = -EFAULT;
-	page = virt_to_page(data);
+	page = pgv_to_page(data);
 	offset = offset_in_page(data);
 	len_max = PAGE_SIZE - offset;
 	len = ((to_write > len_max) ? len_max : to_write);