
Multicast packet loss

Message ID 49BED109.3020504@cosmosbay.com
State RFC, archived
Delegated to: David Miller

Commit Message

Eric Dumazet March 16, 2009, 10:22 p.m. UTC
David Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Fri, 13 Mar 2009 23:30:31 +0100
> 
>> David Miller wrote:
>>>> Also, when an event was queued for later invocation, I also needed to keep
>>>> a reference on "struct socket" to make sure it doesn't disappear before
>>>> the invocation. Not all sockets are RCU guarded (we added RCU only for
>>>> some protocols (TCP, UDP ...)). So I found keeping a read_lock
>>>> on the callback was the easiest thing to do. I now realize we might
>>>> overflow preempt_count, so special care is needed.
>>> You're using this in UDP so... make the rule that you can't use
>>> this with a non-RCU-quiescent protocol.
>> UDP/TCP only? I thought many other protocols (not all using RCU) were
>> using sock_def_readable() too...
> 
> Maybe create an inet_def_readable() just for this purpose :-)


Here is the latest incarnation of the patch, which of course should be
split in two parts and given a better changelog for further discussion on lkml.

We need to take a reference on the sock when it is queued on a softirq delay
list. RCU won't help here because of the SLAB_DESTROY_BY_RCU thing:
another CPU could free/reuse the socket before we have a chance to
call softirq_delay_exec().

UDP & UDPLite use this delayed wakeup feature.

Thank you

[PATCH] softirq: Introduce mechanism to defer wakeups

Some network workloads need to call the scheduler too many times. For example,
each received multicast frame can wake up many threads. ksoftirqd is then
not able to drain NIC RX queues in time, and we get frame losses and high
latencies.

This patch adds an infrastructure to delay work done in
sock_def_readable() until the end of do_softirq(). This requires making
current->softirq_context available even if !CONFIG_TRACE_IRQFLAGS.


Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
 include/linux/interrupt.h |   18 +++++++++++++++
 include/linux/irqflags.h  |   11 ++++-----
 include/linux/sched.h     |    2 -
 include/net/sock.h        |    2 +
 include/net/udplite.h     |    1
 kernel/lockdep.c          |    2 -
 kernel/softirq.c          |   42 ++++++++++++++++++++++++++++++++++--
 lib/locking-selftest.c    |    4 +--
 net/core/sock.c           |   41 +++++++++++++++++++++++++++++++++++
 net/ipv4/udp.c            |    7 ++++++
 net/ipv6/udp.c            |    7 ++++++
 11 files changed, 125 insertions(+), 12 deletions(-)



Comments

Peter Zijlstra March 17, 2009, 10:11 a.m. UTC | #1
On Mon, 2009-03-16 at 23:22 +0100, Eric Dumazet wrote:

> Here is the latest incarnation of the patch, which of course should be
> split in two parts and given a better changelog for further discussion on lkml.

I read the entire thread up to now, and I still don't really understand
the Changelog, sorry :(

> [PATCH] softirq: Introduce mechanism to defer wakeups
> 
> Some network workloads need to call the scheduler too many times. For example,
> each received multicast frame can wake up many threads. ksoftirqd is then
> not able to drain NIC RX queues in time, and we get frame losses and high
> latencies.
> 
> This patch adds an infrastructure to delay work done in
> sock_def_readable() until the end of do_softirq(). This requires making
> current->softirq_context available even if !CONFIG_TRACE_IRQFLAGS.

How does that solve the wakeup issue?

> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---

> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -158,6 +158,42 @@ void local_bh_enable_ip(unsigned long ip)
>  }
>  EXPORT_SYMBOL(local_bh_enable_ip);
>  
> +
> +#define SOFTIRQ_DELAY_END (struct softirq_delay *)1L
> +static DEFINE_PER_CPU(struct softirq_delay *, softirq_delay_head) = {
> +	SOFTIRQ_DELAY_END
> +};

Why the magic termination value? Can't we NULL-terminate the list?

> +
> +/*
> + * Caller must disable preemption, and take care of appropriate
> + * locking and refcounting
> + */

Shouldn't we call it __softirq_delay_queue() if the caller needs to
disable preemption?

Furthermore, don't we always require the caller to take care of lifetime
issues when we queue something?

> +int softirq_delay_queue(struct softirq_delay *sdel)
> +{
> +	if (!sdel->next) {
> +		sdel->next = __get_cpu_var(softirq_delay_head);
> +		__get_cpu_var(softirq_delay_head) = sdel;
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * Because locking is provided by subsystem, please note
> + * that sdel->func(sdel) is responsible for setting sdel->next to NULL
> + */
> +static void softirq_delay_exec(void)
> +{
> +	struct softirq_delay *sdel;
> +
> +	while ((sdel = __get_cpu_var(softirq_delay_head)) != SOFTIRQ_DELAY_END) {
> +		__get_cpu_var(softirq_delay_head) = sdel->next;
> +		sdel->func(sdel);	/*	sdel->next = NULL;*/
> +		}
> +}

Why can't we write:

  struct softirq_delay *sdel, *next;

  sdel = __get_cpu_var(softirq_delay_head);
  __get_cpu_var(softirq_delay_head) = NULL;

  while (sdel) {
    next = sdel->next;
    sdel->func(sdel);
    sdel = next;
  }

Why does it matter what happens to sdel->next? We've done the callback.

Aah, the crux is in the re-use policy.. that most certainly does deserve
a comment.

How about we make sdel->next point to itself in the init case?

Then we can write:

  while (sdel) {
    next = sdel->next;
    sdel->next = sdel;
    sdel->func(sdel);
    sdel = next;
  }

and have the enqueue bit look like:

int __softirq_delay_queue(struct softirq_delay *sdel)
{
  struct softirq_delay **head;

  if (sdel->next != sdel)
    return 0;

  head = &__get_cpu_var(softirq_delay_head);
  sdel->next = *head;
  *head = sdel;
  return 1;
}
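
For completeness, the init helper would then presumably become (just a
sketch, mirroring softirq_delay_init() from the patch):

static inline void softirq_delay_init(struct softirq_delay *sdel,
				      void (*func)(struct softirq_delay *))
{
  sdel->next = sdel;  /* self pointer == not queued */
  sdel->func = func;
}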
     
> @@ -1691,6 +1694,43 @@ static void sock_def_readable(struct sock *sk, int len)
>  	read_unlock(&sk->sk_callback_lock);
>  }
>  
> +/*
> + * helper function called by softirq_delay_exec(),
> + * if inet_def_readable() queued us.
> + */
> +static void sock_readable_defer(struct softirq_delay *sdel)
> +{
> +	struct sock *sk = container_of(sdel, struct sock, sk_delay);
> +
> +	sdel->next = NULL;
> +	/*
> +	 * At this point, we dont own a lock on socket, only a reference.
> +	 * We must commit above write, or another cpu could miss a wakeup
> +	 */
> +	smp_wmb();

Where's the matching barrier?

> +	sock_def_readable(sk, 0);
> +	sock_put(sk);
> +}
> +
> +/*
> + * Custom version of sock_def_readable()
> + * We want to defer scheduler processing at the end of do_softirq()
> + * Called with socket locked.
> + */
> +void inet_def_readable(struct sock *sk, int len)
> +{
> +	if (running_from_softirq()) {
> +		if (softirq_delay_queue(&sk->sk_delay))
> +			/*
> +			 * If we queued this socket, take a reference on it
> +			 * Caller owns socket lock, so write to sk_delay.next
> +			 * will be committed before unlock.
> +			 */
> +			sock_hold(sk);
> +	} else
> +		sock_def_readable(sk, len);
> +}

OK, so the idea is to handle a bunch of packets and instead of waking N
threads for each packet, only wake them once at the end of the batch?

Sounds like a sensible idea.. 

Eric Dumazet March 17, 2009, 11:08 a.m. UTC | #2
Peter Zijlstra wrote:
> On Mon, 2009-03-16 at 23:22 +0100, Eric Dumazet wrote:
> 
>> Here is the latest incarnation of the patch, which of course should be
>> split in two parts and given a better changelog for further discussion on lkml.
> 
> I read the entire thread up to now, and I still don't really understand
> the Changelog, sorry :(

Sure, I should have taken more time, will repost this in a couple of hours,
with nice changelogs and split patches.

> 
>> [PATCH] softirq: Introduce mechanism to defer wakeups
>>
>> Some network workloads need to call the scheduler too many times. For example,
>> each received multicast frame can wake up many threads. ksoftirqd is then
>> not able to drain NIC RX queues in time, and we get frame losses and high
>> latencies.
>>
>> This patch adds an infrastructure to delay work done in
>> sock_def_readable() until the end of do_softirq(). This requires making
>> current->softirq_context available even if !CONFIG_TRACE_IRQFLAGS.
> 
> How does that solve the wakeup issue?

Apparently, on SMP machines this actually helps a lot in the case of multicast
traffic handled by many subscribers. skb cloning involves atomic ops on
route cache entries, and if we wake up threads as we currently do, they
start to consume skbs while the feeder is still doing skb clones for
other sockets. Many cache-line ping-pongs slow down the softirq.

I will post the test program to reproduce the problem.
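
In the meantime, here is a minimal sketch of that kind of workload (this is
NOT the actual test program; the group address, port and thread count are
made up): many threads join the same multicast group, so every received
frame triggers one wakeup per subscriber.

/* mcast-stress.c - illustrative sketch only.
 * Build: gcc -O2 -pthread mcast-stress.c -o mcast-stress
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <pthread.h>
#include <string.h>
#include <sys/socket.h>

#define GROUP    "239.0.0.1"
#define PORT     12345
#define NTHREADS 64

static void *receiver(void *arg)
{
	struct sockaddr_in addr;
	struct ip_mreq mreq;
	char buf[2048];
	int fd, on = 1;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(PORT);
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	mreq.imr_multiaddr.s_addr = inet_addr(GROUP);
	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
	setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));

	for (;;)	/* each frame wakes all NTHREADS receivers at once */
		recv(fd, buf, sizeof(buf), 0);
	return NULL;
}

int main(void)
{
	pthread_t tid[NTHREADS];
	int i;

	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, receiver, NULL);
	pthread_join(tid[0], NULL);	/* receivers loop forever */
	return 0;
}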

> 
>> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
>> ---
> 
>> --- a/kernel/softirq.c
>> +++ b/kernel/softirq.c
>> @@ -158,6 +158,42 @@ void local_bh_enable_ip(unsigned long ip)
>>  }
>>  EXPORT_SYMBOL(local_bh_enable_ip);
>>  
>> +
>> +#define SOFTIRQ_DELAY_END (struct softirq_delay *)1L
>> +static DEFINE_PER_CPU(struct softirq_delay *, softirq_delay_head) = {
>> +	SOFTIRQ_DELAY_END
>> +};
> 
> Why the magic termination value? Can't we NULL-terminate the list?

Yes we can, you are right.

> 
>> +
>> +/*
>> + * Caller must disable preemption, and take care of appropriate
>> + * locking and refcounting
>> + */
> 
> Shouldn't we call it __softirq_delay_queue() if the caller needs to
> disable preemption?

I was wondering if some BUG_ON() could be added to crash if preemption is enabled
at this point. I could not find an existing check; doing the
'if (running_from_softirq())' test again might be overkill.
Should I document that the caller should do the following?

skeleton:

    lock_my_data(data); /* barrier here */
    sdel = &data->sdel;
    if (running_from_softirq()) {
	if (softirq_delay_queue(sdel)) {
		hold a refcount on data;
	} else {
		/* already queued, nothing to do */
	}
    } else {
	/* cannot queue the work, must do it right now */
	do_work(data);
    }
    release_my_data(data);

> 
> Furthermore, don't we always require the caller to take care of lifetime
> issues when we queue something?

You mean the comment is too verbose... or?

> 
>> +int softirq_delay_queue(struct softirq_delay *sdel)
>> +{
>> +	if (!sdel->next) {
>> +		sdel->next = __get_cpu_var(softirq_delay_head);
>> +		__get_cpu_var(softirq_delay_head) = sdel;
>> +		return 1;
>> +	}
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Because locking is provided by subsystem, please note
>> + * that sdel->func(sdel) is responsible for setting sdel->next to NULL
>> + */
>> +static void softirq_delay_exec(void)
>> +{
>> +	struct softirq_delay *sdel;
>> +
>> +	while ((sdel = __get_cpu_var(softirq_delay_head)) != SOFTIRQ_DELAY_END) {
>> +		__get_cpu_var(softirq_delay_head) = sdel->next;
>> +		sdel->func(sdel);	/*	sdel->next = NULL;*/
>> +		}
>> +}
> 
> Why can't we write:
> 
>   struct softirq_delay *sdel, *next;
> 
>   sdel = __get_cpu_var(softirq_delay_head);
>   __get_cpu_var(softirq_delay_head) = NULL;
> 
>   while (sdel) {
>     next = sdel->next;
>     sdel->func(sdel);
>     sdel = next;
>   }
> 
> Why does it matter what happens to sdel->next? We've done the callback.
> 
> Aah, the crux is in the re-use policy.. that most certainly does deserve
> a comment.

Hum, so my comment was not verbose enough :)

> 
> How about we make sdel->next point to itself in the init case?
> 
> Then we can write:
> 
>   while (sdel) {
>     next = sdel->next;
>     sdel->next = sdel;
>     sdel->func(sdel);
>     sdel = next;
>   }
> 
> and have the enqueue bit look like:
> 
> int __softirq_delay_queue(struct softirq_delay *sdel)
> {
>   struct softirq_delay **head;
> 
>   if (sdel->next != sdel)
>     return 0;

Yes, we could do that.

> 
>   head = &__get_cpu_var(softirq_delay_head);
>   sdel->next = *head;
>   *head = sdel;
>   return 1;
> }
>      
>> @@ -1691,6 +1694,43 @@ static void sock_def_readable(struct sock *sk, int len)
>>  	read_unlock(&sk->sk_callback_lock);
>>  }
>>  
>> +/*
>> + * helper function called by softirq_delay_exec(),
>> + * if inet_def_readable() queued us.
>> + */
>> +static void sock_readable_defer(struct softirq_delay *sdel)
>> +{
>> +	struct sock *sk = container_of(sdel, struct sock, sk_delay);
>> +
>> +	sdel->next = NULL;
>> +	/*
>> +	 * At this point, we dont own a lock on socket, only a reference.
>> +	 * We must commit above write, or another cpu could miss a wakeup
>> +	 */
>> +	smp_wmb();
> 
> Where's the matching barrier?

Check the softirq_delay_exec() comment, where I stated that synchronization had
to be done by the subsystem.

In this socket case, the caller of softirq_delay_exec() has a lock on the socket.

The problem is I don't want to take this lock again in the sock_readable_defer() callback.

If sdel->next is not committed, another CPU could call __softirq_delay_queue() and
find sdel->next non-NULL (or != sdel with your suggestion). Then next->func()
won't be called as it should (or will be called a little bit too soon).

So the matching barrier is the "lock_my_data(data)" in the previous skeleton?

> 
>> +	sock_def_readable(sk, 0);
>> +	sock_put(sk);
>> +}
>> +
>> +/*
>> + * Custom version of sock_def_readable()
>> + * We want to defer scheduler processing at the end of do_softirq()
>> + * Called with socket locked.
>> + */
>> +void inet_def_readable(struct sock *sk, int len)
>> +{
>> +	if (running_from_softirq()) {
>> +		if (softirq_delay_queue(&sk->sk_delay))
>> +			/*
>> +			 * If we queued this socket, take a reference on it
>> +			 * Caller owns socket lock, so write to sk_delay.next
>> +			 * will be committed before unlock.
>> +			 */
>> +			sock_hold(sk);
>> +	} else
>> +		sock_def_readable(sk, len);
>> +}
> 
> OK, so the idea is to handle a bunch of packets and instead of waking N
> threads for each packet, only wake them once at the end of the batch?
> 
> Sounds like a sensible idea.. 

The idea is to batch wakeups, yes, and if we receive several packets for
the same socket(s), we reduce the number of wakeups to one. In the multicast stress
situation of Athena CR, it really helps: no packets dropped, instead of
30%.

Thanks Peter

Peter Zijlstra March 17, 2009, 11:57 a.m. UTC | #3
On Tue, 2009-03-17 at 12:08 +0100, Eric Dumazet wrote:

> >> +
> >> +/*
> >> + * Caller must disable preemption, and take care of appropriate
> >> + * locking and refcounting
> >> + */
> > 
> > Shouldn't we call it __softirq_delay_queue() if the caller needs to
> > disabled preemption?
> 
> I was wondering if some BUG_ON() could be added to crash if preemption is enabled
> at this point.

__get_cpu_var() has a preemption check and will generate BUGs when
CONFIG_DEBUG_PREEMPT is set, similar to smp_processor_id().

> I could not find an existing check; doing the
> 'if (running_from_softirq())' test again might be overkill.
> Should I document that the caller should do the following?
> 
> skeleton:
> 
>     lock_my_data(data); /* barrier here */
>     sdel = &data->sdel;
>     if (running_from_softirq()) {

Small nit: I don't particularly like the running_from_softirq() name,
but in_softirq() is already taken, and sadly means something slightly
different.

> 	if (softirq_delay_queue(sdel)) {
> 		hold a refcount on data;
> 	} else {
> 		/* already queued, nothing to do */
> 	}
>     } else {
> 	/* cannot queue the work, must do it right now */
> 	do_work(data);
>     }
>     release_my_data(data);
> 
> > 
> > Futhermore, don't we always require the caller to take care of lifetime
> > issues when we queue something?
> 
> You mean the comment is too verbose... or?

Yeah.

> > Aah, the crux is in the re-use policy.. that most certainly does deserve
> > a comment.
> 
> Hum, so my comment was not verbose enough :)

That too :-) 

> >> +static void sock_readable_defer(struct softirq_delay *sdel)
> >> +{
> >> +	struct sock *sk = container_of(sdel, struct sock, sk_delay);
> >> +
> >> +	sdel->next = NULL;
> >> +	/*
> >> +	 * At this point, we dont own a lock on socket, only a reference.
> >> +	 * We must commit above write, or another cpu could miss a wakeup
> >> +	 */
> >> +	smp_wmb();
> > 
> > Where's the matching barrier?
> 
> Check the softirq_delay_exec() comment, where I stated that synchronization had
> to be done by the subsystem.

AFAIU the memory-barrier semantics, you cannot pair a wmb with a lock
barrier; it must be paired with a read barrier, read_barrier_depends(), or a full barrier.

> In this socket case, the caller of softirq_delay_exec() has a lock on the socket.
> 
> The problem is I don't want to take this lock again in the sock_readable_defer() callback.
> 
> If sdel->next is not committed, another CPU could call __softirq_delay_queue() and
> find sdel->next non-NULL (or != sdel with your suggestion). Then next->func()
> won't be called as it should (or will be called a little bit too soon).

Right, what we can do is put the wmb in the callback and the rmb right
before the __queue op, or simply integrate it into the framework.
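
Integrated, it might look something like this (a rough sketch on top of
the self-pointing variant from before, not tested):

static void softirq_delay_exec(void)
{
  struct softirq_delay *sdel, *next;

  sdel = __get_cpu_var(softirq_delay_head);
  __get_cpu_var(softirq_delay_head) = NULL;

  while (sdel) {
    next = sdel->next;
    sdel->next = sdel;  /* mark free for re-use */
    smp_wmb();          /* commit ->next before func() can trigger a re-queue */
    sdel->func(sdel);
    sdel = next;
  }
}

int __softirq_delay_queue(struct softirq_delay *sdel)
{
  struct softirq_delay **head;

  smp_rmb();            /* pairs with the wmb in softirq_delay_exec() */
  if (sdel->next != sdel)
    return 0;

  head = &__get_cpu_var(softirq_delay_head);
  sdel->next = *head;
  *head = sdel;
  return 1;
}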

> > OK, so the idea is to handle a bunch of packets and instead of waking N
> > threads for each packet, only wake them once at the end of the batch?
> > 
> > Sounds like a sensible idea.. 
> 
> The idea is to batch wakeups, yes, and if we receive several packets for
> the same socket(s), we reduce the number of wakeups to one. In the multicast stress
> situation of Athena CR, it really helps: no packets dropped, instead of
> 30%.

Yes I can see that helping tremendously.

Brian Bloniarz March 17, 2009, 3 p.m. UTC | #4
Eric Dumazet wrote:
> Sure, I should have taken more time, will repost this in a couple of hours,
> with nice changelogs and split patches.

One small thing: with CONFIG_IPV6=m, inet_def_readable needs to be exported,
right?
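
i.e., presumably just something like this next to its definition (sketch):

/* in net/core/sock.c */
EXPORT_SYMBOL(inet_def_readable);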

Thanks,
Brian Bloniarz
Eric Dumazet March 17, 2009, 3:16 p.m. UTC | #5
Brian Bloniarz wrote:
> Eric Dumazet wrote:
>> Sure, I should have taken more time, will repost this in a couple of
>> hours,
>> with nice changelogs and split patches.
> 
> One small thing: with CONFIG_IPV6=m, inet_def_readable needs to be
> exported,
> right?
> 

Absolutely, thank you!

David Stevens March 17, 2009, 7:39 p.m. UTC | #6
I did some testing with this and see at least a 20% improvement,
with no drops.

I agree with Peter's recommended changes (esp. sentinel vs. NULL),
and also with fixing the trivial brace indentation in softirq_delay_exec(),
but otherwise it looks good to me. Nice work.

                                        +-DLS

Eric Dumazet March 17, 2009, 9:19 p.m. UTC | #7
David Stevens wrote:
> I did some testing with this and see at least a 20% improvement,
> with no drops.
> 
> I agree with Peter's recommended changes (esp. sentinel vs. NULL),
> and also with fixing the trivial brace indentation in softirq_delay_exec(),
> but otherwise it looks good to me. Nice work.
> 
>                                         +-DLS
> 
> 

Still, I don't like all the softirq.c changes very much. I feel very
uncomfortable justifying one extra call in do_softirq(), and the
interface is not very clean (stuff about locking, barriers...).

An easy way out could be to add a new SOFTIRQ, but it's not very wise.

I was wondering if we could use the infrastructure added in commit
54514a70adefe356afe854e2d3912d46668068e6
(softirq: Add support for triggering softirq work on softirqs.)
But I don't understand how it can work...
(softirq_work_list is fed, but never processed)

Alternatively, we could use a framework dedicated to
network use, with well-defined semantics:

Calling softirq_delay_exec() from net_rx_action(): from this function,
we know if time_squeeze was incremented or all of netdev_budget was
consumed, and in this stress case we could try to give the wakeup job
to another CPU.



Brian Bloniarz April 3, 2009, 7:28 p.m. UTC | #8
Hi Eric,

We've been experimenting with this softirq-delay patch in production, and
have seen some hard-to-reproduce crashes. We finally managed to capture a
kexec crashdump this morning.

This is the dmesg:

[53417.592868] Unable to handle kernel NULL pointer dereference at 0000000000000000 RIP:
[53417.598377]  [<ffffffff80243643>] __do_softirq+0xc3/0x150
[53417.606300] PGD 32abb8067 PUD 32faf5067 PMD 0
[53417.610829] Oops: 0000 [1] SMP
[53417.614032] CPU 2
[53417.616083] Modules linked in: nfs lockd nfs_acl sunrpc openafs(P) autofs4 ipv6 ac sbs sbshc video output dock battery container iptable_filter ip_tables x_tables parport_pc lp parport loop joydev iTCO_wdt iTCO_vendor_support evdev button i5000_edac psmouse serio_raw pcspkr shpchp pci_hotplug edac_core ext3 jbd mbcache sr_mod cdrom ata_generic usbhid hid ata_piix sg sd_mod ehci_hcd pata_acpi uhci_hcd libata bnx2 aacraid usbcore scsi_mod thermal processor fan fbcon tileblit font bitblit softcursor fuse
[53417.662067] Pid: 13039, comm: gball Tainted: P        2.6.24-19acr2-generic #1
[53417.669219] RIP: 0010:[<ffffffff80243643>]  [<ffffffff80243643>] __do_softirq+0xc3/0x150
[53417.677368] RSP: 0018:ffff8103314f3f20  EFLAGS: 00010297
[53417.682697] RAX: ffff810084a1b000 RBX: ffffffff805ba530 RCX: 0000000000000000
[53417.689843] RDX: ffff8103305811e0 RSI: 0000000000000282 RDI: ffff810332ada580
[53417.696993] RBP: 0000000000000000 R08: ffff81032fad9f08 R09: ffff810332382000
[53417.704144] R10: 0000000000000000 R11: ffffffff80316ec0 R12: ffffffff8062b3d8
[53417.711294] R13: ffffffff8062b480 R14: 0000000000000002 R15: 000000000000000a
[53417.718447] FS:  00007fab0d7b8750(0000) GS:ffff810334401b80(0000) knlGS:0000000000000000
[53417.726568] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[53417.732332] CR2: 0000000000000000 CR3: 0000000329e2d000 CR4: 00000000000006e0
[53417.739476] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[53417.746637] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[53417.753787] Process gball (pid: 13039, threadinfo ffff81032adde000, task ffff810329ff77d0)
[53417.761991] Stack:  ffffffff8062b3d8 0000000000000046 ffff8103314f3f68 0000000000000000
[53417.770146]  00000000000000a0 ffff81032addfee8 0000000000000000 ffffffff8020d50c
[53417.777660]  ffff8103314f3f68 00000000000000c1 ffffffff8020ed25 ffffffff8062c870
[53417.784961] Call Trace:
[53417.787635]  <IRQ>  [<ffffffff8020d50c>] call_softirq+0x1c/0x30
[53417.793597]  [<ffffffff8020ed25>] do_softirq+0x35/0x90
[53417.798747]  [<ffffffff80243578>] irq_exit+0x88/0x90
[53417.803727]  [<ffffffff8020ef70>] do_IRQ+0x80/0x100
[53417.808624]  [<ffffffff8020c891>] ret_from_intr+0x0/0xa
[53417.813862]  <EOI>  [<ffffffff803e53c8>] skb_release_all+0x18/0x150
[53417.820164]  [<ffffffff803e4ad9>] __kfree_skb+0x9/0x90
[53417.825327]  [<ffffffff80437612>] udp_recvmsg+0x222/0x260
[53417.830744]  [<ffffffff80231264>] source_load+0x34/0x70
[53417.835984]  [<ffffffff80232a9a>] find_busiest_group+0x1fa/0x850
[53417.842019]  [<ffffffff803e0100>] sock_common_recvmsg+0x30/0x50
[53417.847958]  [<ffffffff803de1ca>] sock_recvmsg+0x14a/0x160
[53417.853462]  [<ffffffff80231c21>] update_curr+0x71/0x100
[53419.858789]  [<ffffffff802320fd>] __dequeue_entity+0x3d/0x50
[53417.864469]  [<ffffffff80253ab0>] autoremove_wake_function+0x0/0x30
[53417.870758]  [<ffffffff8046662f>] thread_return+0x3a/0x57b
[53417.876262]  [<ffffffff803df73e>] sys_recvfrom+0xfe/0x190
[53417.881680]  [<ffffffff802e2a95>] sys_epoll_wait+0x245/0x4e0
[53417.887358]  [<ffffffff80233e20>] default_wake_function+0x0/0x10
[53417.893384]  [<ffffffff8020c37e>] system_call+0x7e/0x83
[53417.898628]
[53417.900134]
[53417.900134] Code: 48 8b 11 48 89 cf 65 48 8b 04 25 08 00 00 00 4a 89 14 20 ff
[53417.909430] RIP  [<ffffffff80243643>] __do_softirq+0xc3/0x150
[53417.915210]  RSP <ffff8103314f3f20>

The disassembly where it crashed:
/local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:273
ffffffff8024361b:       d1 ed                   shr    %ebp
rcu_bh_qsctr_inc():
/local/home/bmb/doc/kernels/linux-hardy-eric/include/linux/rcupdate.h:130
ffffffff8024361d:       48 8b 40 08             mov    0x8(%rax),%rax
ffffffff80243621:       41 c7 44 05 08 01 00    movl   $0x1,0x8(%r13,%rax,1)
ffffffff80243628:       00 00
__do_softirq():
/local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:273
ffffffff8024362a:       75 d8                   jne    ffffffff80243604 <__do_softirq+0x84>
softirq_delay_exec():
/local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:225
ffffffff8024362c:       48 8b 14 24             mov    (%rsp),%rdx
ffffffff80243630:       65 48 8b 04 25 08 00    mov    %gs:0x8,%rax
ffffffff80243637:       00 00
ffffffff80243639:       48 8b 0c 10             mov    (%rax,%rdx,1),%rcx
ffffffff8024363d:       48 83 f9 01             cmp    $0x1,%rcx
ffffffff80243641:       74 29                   je     ffffffff8024366c <__do_softirq+0xec>
/local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:226
ffffffff80243643:       48 8b 11                mov    (%rcx),%rdx
/local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:227
ffffffff80243646:       48 89 cf                mov    %rcx,%rdi
/local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:226
ffffffff80243649:       65 48 8b 04 25 08 00    mov    %gs:0x8,%rax
ffffffff80243650:       00 00
ffffffff80243652:       4a 89 14 20             mov    %rdx,(%rax,%r12,1)
/local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:227
ffffffff80243656:       ff 51 08                callq  *0x8(%rcx)
/local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:225
ffffffff80243659:       65 48 8b 04 25 08 00    mov    %gs:0x8,%rax
ffffffff80243660:       00 00
ffffffff80243662:       4a 8b 0c 20             mov    (%rax,%r12,1),%rcx
ffffffff80243666:       48 83 f9 01             cmp    $0x1,%rcx
ffffffff8024366a:       75 d7                   jne    ffffffff80243643 <__do_softirq+0xc3>
raw_local_irq_disable():
/local/home/bmb/doc/kernels/linux-hardy-eric/debian/build/build-generic/include2/asm/irqflags_64.h:76
ffffffff8024366c:       fa                      cli

And softirq.c line numbers:
    218   * Because locking is provided by subsystem, please note
    219   * that sdel->func(sdel) is responsible for setting sdel->next to NULL
    220   */
    221  static void softirq_delay_exec(void)
    222  {
    223          struct softirq_delay *sdel;
    224
    225          while ((sdel = __get_cpu_var(softirq_delay_head)) != SOFTIRQ_DELAY_END) {
    226                  __get_cpu_var(softirq_delay_head) = sdel->next;
    227                  sdel->func(sdel);       /*      sdel->next = NULL;*/
    228                  }
    229  }

So it's crashing because __get_cpu_var(softirq_delay_head) is NULL somehow.

We aren't running a recent kernel -- we're running Ubuntu Hardy's 2.6.24-19,
with a backported version of this patch. One more atypical thing is that
we run openafs, 1.4.6.dfsg1-2.

Like I said, I have a full vmcore (3, actually) and would be happy to post any
more information you'd like to know.

Thanks,
Brian Bloniarz

Eric Dumazet wrote:
> David Miller wrote:
>> From: Eric Dumazet <dada1@cosmosbay.com>
>> Date: Fri, 13 Mar 2009 23:30:31 +0100
>>
>>> David Miller wrote:
>>>>> Also, when an event was queued for later invocation, I also needed to keep
>>>>> a reference on "struct socket" to make sure it doesn't disappear before
>>>>> the invocation. Not all sockets are RCU guarded (we added RCU only for
>>>>> some protocols (TCP, UDP ...)). So I found keeping a read_lock
>>>>> on the callback was the easiest thing to do. I now realize we might
>>>>> overflow preempt_count, so special care is needed.
>>>> You're using this in UDP so... make the rule that you can't use
>>>> this with a non-RCU-quiescent protocol.
>>> UDP/TCP only? I thought many other protocols (not all using RCU) were
>>> using sock_def_readable() too...
>> Maybe create an inet_def_readable() just for this purpose :-)
> 
> 
> Here is the latest incarnation of the patch, which of course should be
> split in two parts and given a better changelog for further discussion on lkml.
> 
> We need to take a reference on the sock when it is queued on a softirq delay
> list. RCU won't help here because of the SLAB_DESTROY_BY_RCU thing:
> another CPU could free/reuse the socket before we have a chance to
> call softirq_delay_exec().
> 
> UDP & UDPLite use this delayed wakeup feature.
> 
> Thank you
> 
> [PATCH] softirq: Introduce mechanism to defer wakeups
> 
> Some network workloads need to call the scheduler too many times. For example,
> each received multicast frame can wake up many threads. ksoftirqd is then
> not able to drain NIC RX queues in time, and we get frame losses and high
> latencies.
> 
> This patch adds an infrastructure to delay work done in
> sock_def_readable() until the end of do_softirq(). This requires making
> current->softirq_context available even if !CONFIG_TRACE_IRQFLAGS.
> 
> 
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> ---
>  include/linux/interrupt.h |   18 +++++++++++++++
>  include/linux/irqflags.h  |   11 ++++-----
>  include/linux/sched.h     |    2 -
>  include/net/sock.h        |    2 +
>  include/net/udplite.h     |    1
>  kernel/lockdep.c          |    2 -
>  kernel/softirq.c          |   42 ++++++++++++++++++++++++++++++++++--
>  lib/locking-selftest.c    |    4 +--
>  net/core/sock.c           |   41 +++++++++++++++++++++++++++++++++++
>  net/ipv4/udp.c            |    7 ++++++
>  net/ipv6/udp.c            |    7 ++++++
>  11 files changed, 125 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 9127f6b..a773d0c 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -295,6 +295,24 @@ extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softir
>  extern void __send_remote_softirq(struct call_single_data *cp, int cpu,
>  				  int this_cpu, int softirq);
>  
> +/*
> + * softirq delayed works : should be delayed at do_softirq() end
> + */
> +struct softirq_delay {
> +	struct softirq_delay	*next;
> +	void 			(*func)(struct softirq_delay *);
> +};
> +
> +int softirq_delay_queue(struct softirq_delay *sdel);
> +
> +static inline void softirq_delay_init(struct softirq_delay *sdel,
> +				      void (*func)(struct softirq_delay *))
> +{
> +	sdel->next = NULL;
> +	sdel->func = func;
> +}
> +
> +
>  /* Tasklets --- multithreaded analogue of BHs.
>  
>     Main feature differing them of generic softirqs: tasklet
> diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
> index 74bde13..30c1e01 100644
> --- a/include/linux/irqflags.h
> +++ b/include/linux/irqflags.h
> @@ -13,19 +13,21 @@
>  
>  #include <linux/typecheck.h>
>  
> +#define softirq_enter()	do { current->softirq_context++; } while (0)
> +#define softirq_exit()	do { current->softirq_context--; } while (0)
> +#define softirq_context(p)	((p)->softirq_context)
> +#define running_from_softirq()  (softirq_context(current) > 0)
> +
>  #ifdef CONFIG_TRACE_IRQFLAGS
>    extern void trace_softirqs_on(unsigned long ip);
>    extern void trace_softirqs_off(unsigned long ip);
>    extern void trace_hardirqs_on(void);
>    extern void trace_hardirqs_off(void);
>  # define trace_hardirq_context(p)	((p)->hardirq_context)
> -# define trace_softirq_context(p)	((p)->softirq_context)
>  # define trace_hardirqs_enabled(p)	((p)->hardirqs_enabled)
>  # define trace_softirqs_enabled(p)	((p)->softirqs_enabled)
>  # define trace_hardirq_enter()	do { current->hardirq_context++; } while (0)
>  # define trace_hardirq_exit()	do { current->hardirq_context--; } while (0)
> -# define trace_softirq_enter()	do { current->softirq_context++; } while (0)
> -# define trace_softirq_exit()	do { current->softirq_context--; } while (0)
>  # define INIT_TRACE_IRQFLAGS	.softirqs_enabled = 1,
>  #else
>  # define trace_hardirqs_on()		do { } while (0)
> @@ -33,13 +35,10 @@
>  # define trace_softirqs_on(ip)		do { } while (0)
>  # define trace_softirqs_off(ip)		do { } while (0)
>  # define trace_hardirq_context(p)	0
> -# define trace_softirq_context(p)	0
>  # define trace_hardirqs_enabled(p)	0
>  # define trace_softirqs_enabled(p)	0
>  # define trace_hardirq_enter()		do { } while (0)
>  # define trace_hardirq_exit()		do { } while (0)
> -# define trace_softirq_enter()		do { } while (0)
> -# define trace_softirq_exit()		do { } while (0)
>  # define INIT_TRACE_IRQFLAGS
>  #endif
>  
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 8c216e0..5dd8487 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1320,8 +1320,8 @@ struct task_struct {
>  	unsigned long softirq_enable_ip;
>  	unsigned int softirq_enable_event;
>  	int hardirq_context;
> -	int softirq_context;
>  #endif
> +	int softirq_context;
>  #ifdef CONFIG_LOCKDEP
>  # define MAX_LOCK_DEPTH 48UL
>  	u64 curr_chain_key;
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 4bb1ff9..0160a83 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -260,6 +260,7 @@ struct sock {
>  	unsigned long	        sk_lingertime;
>  	struct sk_buff_head	sk_error_queue;
>  	struct proto		*sk_prot_creator;
> +	struct softirq_delay	sk_delay;
>  	rwlock_t		sk_callback_lock;
>  	int			sk_err,
>  				sk_err_soft;
> @@ -960,6 +961,7 @@ extern void *sock_kmalloc(struct sock *sk, int size,
>  			  gfp_t priority);
>  extern void sock_kfree_s(struct sock *sk, void *mem, int size);
>  extern void sk_send_sigurg(struct sock *sk);
> +extern void inet_def_readable(struct sock *sk, int len);
>  
>  /*
>   * Functions to fill in entries in struct proto_ops when a protocol
> diff --git a/include/net/udplite.h b/include/net/udplite.h
> index afdffe6..7ce0ee0 100644
> --- a/include/net/udplite.h
> +++ b/include/net/udplite.h
> @@ -25,6 +25,7 @@ static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
>  /* Designate sk as UDP-Lite socket */
>  static inline int udplite_sk_init(struct sock *sk)
>  {
> +	sk->sk_data_ready = inet_def_readable;
>  	udp_sk(sk)->pcflag = UDPLITE_BIT;
>  	return 0;
>  }
> diff --git a/kernel/lockdep.c b/kernel/lockdep.c
> index 06b0c35..9873b40 100644
> --- a/kernel/lockdep.c
> +++ b/kernel/lockdep.c
> @@ -1807,7 +1807,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
>  	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
>  		curr->comm, task_pid_nr(curr),
>  		trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
> -		trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
> +		softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
>  		trace_hardirqs_enabled(curr),
>  		trace_softirqs_enabled(curr));
>  	print_lock(this);
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index bdbe9de..91a1714 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -158,6 +158,42 @@ void local_bh_enable_ip(unsigned long ip)
>  }
>  EXPORT_SYMBOL(local_bh_enable_ip);
>  
> +
> +#define SOFTIRQ_DELAY_END (struct softirq_delay *)1L
> +static DEFINE_PER_CPU(struct softirq_delay *, softirq_delay_head) = {
> +	SOFTIRQ_DELAY_END
> +};
> +
> +/*
> + * Caller must disable preemption, and take care of appropriate
> + * locking and refcounting
> + */
> +int softirq_delay_queue(struct softirq_delay *sdel)
> +{
> +	if (!sdel->next) {
> +		sdel->next = __get_cpu_var(softirq_delay_head);
> +		__get_cpu_var(softirq_delay_head) = sdel;
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * Because locking is provided by subsystem, please note
> + * that sdel->func(sdel) is responsible for setting sdel->next to NULL
> + */
> +static void softirq_delay_exec(void)
> +{
> +	struct softirq_delay *sdel;
> +
> +	while ((sdel = __get_cpu_var(softirq_delay_head)) != SOFTIRQ_DELAY_END) {
> +		__get_cpu_var(softirq_delay_head) = sdel->next;
> +		sdel->func(sdel);	/*	sdel->next = NULL;*/
> +		}
> +}
> +
> +
> +
>  /*
>   * We restart softirq processing MAX_SOFTIRQ_RESTART times,
>   * and we fall back to softirqd after that.
> @@ -180,7 +216,7 @@ asmlinkage void __do_softirq(void)
>  	account_system_vtime(current);
>  
>  	__local_bh_disable((unsigned long)__builtin_return_address(0));
> -	trace_softirq_enter();
> +	softirq_enter();
>  
>  	cpu = smp_processor_id();
>  restart:
> @@ -211,6 +247,8 @@ restart:
>  		pending >>= 1;
>  	} while (pending);
>  
> +	softirq_delay_exec();
> +
>  	local_irq_disable();
>  
>  	pending = local_softirq_pending();
> @@ -220,7 +258,7 @@ restart:
>  	if (pending)
>  		wakeup_softirqd();
>  
> -	trace_softirq_exit();
> +	softirq_exit();
>  
>  	account_system_vtime(current);
>  	_local_bh_enable();
> diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
> index 280332c..1aa7351 100644
> --- a/lib/locking-selftest.c
> +++ b/lib/locking-selftest.c
> @@ -157,11 +157,11 @@ static void init_shared_classes(void)
>  #define SOFTIRQ_ENTER()				\
>  		local_bh_disable();		\
>  		local_irq_disable();		\
> -		trace_softirq_enter();		\
> +		softirq_enter();		\
>  		WARN_ON(!in_softirq());
>  
>  #define SOFTIRQ_EXIT()				\
> -		trace_softirq_exit();		\
> +		softirq_exit();		\
>  		local_irq_enable();		\
>  		local_bh_enable();
>  
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 0620046..c8745d1 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -213,6 +213,8 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
>  /* Maximal space eaten by iovec or ancilliary data plus some space */
>  int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
>  
> +static void sock_readable_defer(struct softirq_delay *sdel);
> +
>  static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
>  {
>  	struct timeval tv;
> @@ -1074,6 +1076,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
>  #endif
>  
>  		rwlock_init(&newsk->sk_dst_lock);
> +		softirq_delay_init(&newsk->sk_delay, sock_readable_defer);
>  		rwlock_init(&newsk->sk_callback_lock);
>  		lockdep_set_class_and_name(&newsk->sk_callback_lock,
>  				af_callback_keys + newsk->sk_family,
> @@ -1691,6 +1694,43 @@ static void sock_def_readable(struct sock *sk, int len)
>  	read_unlock(&sk->sk_callback_lock);
>  }
>  
> +/*
> + * helper function called by softirq_delay_exec(),
> + * if inet_def_readable() queued us.
> + */
> +static void sock_readable_defer(struct softirq_delay *sdel)
> +{
> +	struct sock *sk = container_of(sdel, struct sock, sk_delay);
> +
> +	sdel->next = NULL;
> +	/*
> +	 * At this point, we dont own a lock on socket, only a reference.
> +	 * We must commit above write, or another cpu could miss a wakeup
> +	 */
> +	smp_wmb();
> +	sock_def_readable(sk, 0);
> +	sock_put(sk);
> +}
> +
> +/*
> + * Custom version of sock_def_readable()
> + * We want to defer scheduler processing at the end of do_softirq()
> + * Called with socket locked.
> + */
> +void inet_def_readable(struct sock *sk, int len)
> +{
> +	if (running_from_softirq()) {
> +		if (softirq_delay_queue(&sk->sk_delay))
> +			/*
> +			 * If we queued this socket, take a reference on it
> +			 * Caller owns socket lock, so write to sk_delay.next
> +			 * will be committed before unlock.
> +			 */
> +			sock_hold(sk);
> +	} else
> +		sock_def_readable(sk, len);
> +}
> +
>  static void sock_def_write_space(struct sock *sk)
>  {
>  	read_lock(&sk->sk_callback_lock);
> @@ -1768,6 +1808,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
>  		sk->sk_sleep	=	NULL;
>  
>  	rwlock_init(&sk->sk_dst_lock);
> +	softirq_delay_init(&sk->sk_delay, sock_readable_defer);
>  	rwlock_init(&sk->sk_callback_lock);
>  	lockdep_set_class_and_name(&sk->sk_callback_lock,
>  			af_callback_keys + sk->sk_family,
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 05b7abb..1cc0907 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1342,6 +1342,12 @@ void udp_destroy_sock(struct sock *sk)
>  	release_sock(sk);
>  }
>  
> +static int udp_init_sock(struct sock *sk)
> +{
> +	sk->sk_data_ready = inet_def_readable;
> +	return 0;
> +}
> +
>  /*
>   *	Socket option code for UDP
>   */
> @@ -1559,6 +1565,7 @@ struct proto udp_prot = {
>  	.connect	   = ip4_datagram_connect,
>  	.disconnect	   = udp_disconnect,
>  	.ioctl		   = udp_ioctl,
> +	.init		   = udp_init_sock,
>  	.destroy	   = udp_destroy_sock,
>  	.setsockopt	   = udp_setsockopt,
>  	.getsockopt	   = udp_getsockopt,
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 84b1a29..1a9f8d4 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -960,6 +960,12 @@ void udpv6_destroy_sock(struct sock *sk)
>  	inet6_destroy_sock(sk);
>  }
>  
> +static int udpv6_init_sock(struct sock *sk)
> +{
> +	sk->sk_data_ready = inet_def_readable;
> +	return 0;
> +}
> +
>  /*
>   *	Socket option code for UDP
>   */
> @@ -1084,6 +1090,7 @@ struct proto udpv6_prot = {
>  	.connect	   = ip6_datagram_connect,
>  	.disconnect	   = udp_disconnect,
>  	.ioctl		   = udp_ioctl,
> +	.init 		   = udpv6_init_sock,
>  	.destroy	   = udpv6_destroy_sock,
>  	.setsockopt	   = udpv6_setsockopt,
>  	.getsockopt	   = udpv6_getsockopt,
> 

Eric Dumazet April 5, 2009, 1:49 p.m. UTC | #9
Brian Bloniarz wrote:
> Hi Eric,
> 
> We've been experimenting with this softirq-delay patch in production, and
> have seen some hard-to-reproduce crashes. We finally managed to capture a
> kexec crashdump this morning.
> 
> This is the dmesg:
> 
> [53417.592868] Unable to handle kernel NULL pointer dereference at
> 0000000000000000 RIP:
> [53417.598377]  [<ffffffff80243643>] __do_softirq+0xc3/0x150
> [53417.606300] PGD 32abb8067 PUD 32faf5067 PMD 0
> [53417.610829] Oops: 0000 [1] SMP
> [53417.614032] CPU 2
> [53417.616083] Modules linked in: nfs lockd nfs_acl sunrpc openafs(P)
> autofs4 ipv6 ac sbs sbshc video output dock battery container
> iptable_filter ip_tables x_tables parport_pc lp parport loop joydev
> iTCO_wdt iTCO_vendor_support evdev button i5000_edac psmouse serio_raw
> pcspkr shpchp pci_hotplug edac_core ext3 jbd mbcache sr_mod cdrom
> ata_generic usbhid hid ata_piix sg sd_mod ehci_hcd pata_acpi uhci_hcd
> libata bnx2 aacraid usbcore scsi_mod thermal processor fan fbcon
> tileblit font bitblit softcursor fuse
> [53417.662067] Pid: 13039, comm: gball Tainted: P       
> 2.6.24-19acr2-generic #1
> [53417.669219] RIP: 0010:[<ffffffff80243643>]  [<ffffffff80243643>]
> __do_softirq+0xc3/0x150
> [53417.677368] RSP: 0018:ffff8103314f3f20  EFLAGS: 00010297
> [53417.682697] RAX: ffff810084a1b000 RBX: ffffffff805ba530 RCX:
> 0000000000000000
> [53417.689843] RDX: ffff8103305811e0 RSI: 0000000000000282 RDI:
> ffff810332ada580
> [53417.696993] RBP: 0000000000000000 R08: ffff81032fad9f08 R09:
> ffff810332382000
> [53417.704144] R10: 0000000000000000 R11: ffffffff80316ec0 R12:
> ffffffff8062b3d8
> [53417.711294] R13: ffffffff8062b480 R14: 0000000000000002 R15:
> 000000000000000a
> [53417.718447] FS:  00007fab0d7b8750(0000) GS:ffff810334401b80(0000)
> knlGS:0000000000000000
> [53417.726568] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
> [53417.732332] CR2: 0000000000000000 CR3: 0000000329e2d000 CR4:
> 00000000000006e0
> [53417.739476] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
> 0000000000000000
> [53417.746637] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
> 0000000000000400
> [53417.753787] Process gball (pid: 13039, threadinfo ffff81032adde000,
> task ffff810329ff77d0)
> [53417.761991] Stack:  ffffffff8062b3d8 0000000000000046
> ffff8103314f3f68 0000000000000000
> [53417.770146]  00000000000000a0 ffff81032addfee8 0000000000000000
> ffffffff8020d50c
> [53417.777660]  ffff8103314f3f68 00000000000000c1 ffffffff8020ed25
> ffffffff8062c870
> [53417.784961] Call Trace:
> [53417.787635]  <IRQ>  [<ffffffff8020d50c>] call_softirq+0x1c/0x30
> [53417.793597]  [<ffffffff8020ed25>] do_softirq+0x35/0x90
> [53417.798747]  [<ffffffff80243578>] irq_exit+0x88/0x90
> [53417.803727]  [<ffffffff8020ef70>] do_IRQ+0x80/0x100
> [53417.808624]  [<ffffffff8020c891>] ret_from_intr+0x0/0xa
> [53417.813862]  <EOI>  [<ffffffff803e53c8>] skb_release_all+0x18/0x150
> [53417.820164]  [<ffffffff803e4ad9>] __kfree_skb+0x9/0x90
> [53417.825327]  [<ffffffff80437612>] udp_recvmsg+0x222/0x260
> [53417.830744]  [<ffffffff80231264>] source_load+0x34/0x70
> [53417.835984]  [<ffffffff80232a9a>] find_busiest_group+0x1fa/0x850
> [53417.842019]  [<ffffffff803e0100>] sock_common_recvmsg+0x30/0x50
> [53417.847958]  [<ffffffff803de1ca>] sock_recvmsg+0x14a/0x160
> [53417.853462]  [<ffffffff80231c21>] update_curr+0x71/0x100
> [53419.858789]  [<ffffffff802320fd>] __dequeue_entity+0x3d/0x50
> [53417.864469]  [<ffffffff80253ab0>] autoremove_wake_function+0x0/0x30
> [53417.870758]  [<ffffffff8046662f>] thread_return+0x3a/0x57b
> [53417.876262]  [<ffffffff803df73e>] sys_recvfrom+0xfe/0x190
> [53417.881680]  [<ffffffff802e2a95>] sys_epoll_wait+0x245/0x4e0
> [53417.887358]  [<ffffffff80233e20>] default_wake_function+0x0/0x10
> [53417.893384]  [<ffffffff8020c37e>] system_call+0x7e/0x83
> [53417.898628]
> [53417.900134]
> [53417.900134] Code: 48 8b 11 48 89 cf 65 48 8b 04 25 08 00 00 00 4a 89
> 14 20 ff
> [53417.909430] RIP  [<ffffffff80243643>] __do_softirq+0xc3/0x150
> [53417.915210]  RSP <ffff8103314f3f20>
> 
> The disassembly where it crashed:
> /local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:273
> ffffffff8024361b:       d1 ed                   shr    %ebp
> rcu_bh_qsctr_inc():
> /local/home/bmb/doc/kernels/linux-hardy-eric/include/linux/rcupdate.h:130
> ffffffff8024361d:       48 8b 40 08             mov    0x8(%rax),%rax
> ffffffff80243621:       41 c7 44 05 08 01 00    movl  
> $0x1,0x8(%r13,%rax,1)
> ffffffff80243628:       00 00
> __do_softirq():
> /local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:273
> ffffffff8024362a:       75 d8                   jne    ffffffff80243604
> <__do_softirq+0x84>
> softirq_delay_exec():
> /local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:225
> ffffffff8024362c:       48 8b 14 24             mov    (%rsp),%rdx
> ffffffff80243630:       65 48 8b 04 25 08 00    mov    %gs:0x8,%rax
> ffffffff80243637:       00 00
> ffffffff80243639:       48 8b 0c 10             mov    (%rax,%rdx,1),%rcx
> ffffffff8024363d:       48 83 f9 01             cmp    $0x1,%rcx
> ffffffff80243641:       74 29                   je     ffffffff8024366c
> <__do_softirq+0xec>
> /local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:226
> ffffffff80243643:       48 8b 11                mov    (%rcx),%rdx
> /local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:227
> ffffffff80243646:       48 89 cf                mov    %rcx,%rdi
> /local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:226
> ffffffff80243649:       65 48 8b 04 25 08 00    mov    %gs:0x8,%rax
> ffffffff80243650:       00 00
> ffffffff80243652:       4a 89 14 20             mov    %rdx,(%rax,%r12,1)
> /local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:227
> ffffffff80243656:       ff 51 08                callq  *0x8(%rcx)
> /local/home/bmb/doc/kernels/linux-hardy-eric/kernel/softirq.c:225
> ffffffff80243659:       65 48 8b 04 25 08 00    mov    %gs:0x8,%rax
> ffffffff80243660:       00 00
> ffffffff80243662:       4a 8b 0c 20             mov    (%rax,%r12,1),%rcx
> ffffffff80243666:       48 83 f9 01             cmp    $0x1,%rcx
> ffffffff8024366a:       75 d7                   jne    ffffffff80243643
> <__do_softirq+0xc3>
> raw_local_irq_disable():
> /local/home/bmb/doc/kernels/linux-hardy-eric/debian/build/build-generic/include2/asm/irqflags_64.h:76
> 
> ffffffff8024366c:       fa                      cli
> 
> And softirq.c line numbers:
>    218   * Because locking is provided by subsystem, please note
>    219   * that sdel->func(sdel) is responsible for setting sdel->next
> to NULL
>    220   */
>    221  static void softirq_delay_exec(void)
>    222  {
>    223          struct softirq_delay *sdel;
>    224
>    225          while ((sdel = __get_cpu_var(softirq_delay_head)) !=
> SOFTIRQ_DELAY_END) {
>    226                  __get_cpu_var(softirq_delay_head) = sdel->next;
>    227                  sdel->func(sdel);       /*      sdel->next =
> NULL;*/
>    228                  }
>    229  }
> 
> So it's crashing because __get_cpu_var(softirq_delay_head) is NULL
> somehow.
> 
> We aren't running a recent kernel -- we're running Ubuntu Hardy's
> 2.6.24-19,
> with a backported version of this patch. One more atypical thing is that
> we run openafs, 1.4.6.dfsg1-2.
> 
> Like I said, I have a full vmcore (3, actually) and would be happy to
> post any
> more information you'd like to know.
> 
> Thanks,
> Brian Bloniarz

Hi Brian

2.6.24-19 kernel... hmm...

Could you please send me the diff of your backport against this kernel?

I take it you use the Ubuntu Hardy 8.04 LTS server edition?

The pointer being NULL might tell us that we managed to call inet_def_readable()
without the socket lock held...

Brian Bloniarz April 6, 2009, 9:53 p.m. UTC | #10
Eric Dumazet wrote:
> The pointer being NULL might tell us that we managed to call inet_def_readable()
> without the socket lock held...

Trying to track this down: I added:
	BUG_ON(!spin_is_locked(&sk->sk_lock.slock));
to the top of inet_def_readable. This gives me the following panic:

[ 2528.745311] kernel BUG at net/core/sock.c:1674!
[ 2528.745311] invalid opcode: 0000 [#1] PREEMPT SMP
[ 2528.745311] last sysfs file: /sys/devices/system/cpu/cpu7/crash_notes
[ 2528.745311] CPU 6
[ 2528.745311] Modules linked in: iptable_filter ip_tables x_tables parport_pc lp parport loop iTCO_wdt iTCO_vendor_support serio_raw psmouse pcspkr i5k_amb shpchp i5000_edac pci_hotplug button edac_core ipv6 ibmpex joydev ipmi_msghandler evdev ext3 jbd mbcache usbhid hid sr_mod cdrom pata_acpi ata_generic sg sd_mod ata_piix ehci_hcd uhci_hcd libata aacraid usbcore scsi_mod bnx2 thermal processor fan thermal_sys fuse
[ 2528.745311] Pid: 14507, comm: signalgen Not tainted 2.6.29.1-eric2-lowlat-lockdep #3 IBM System x3550 -[7978AC1]-
[ 2528.745311] RIP: 0010:[<ffffffff80444ec2>]  [<ffffffff80444ec2>] inet_def_readable+0x52/0x60
[ 2528.745311] RSP: 0018:ffff88043b985b58  EFLAGS: 00010246
[ 2528.745311] RAX: 0000000000000019 RBX: ffff88043b90c280 RCX: 0000000000000000
[ 2528.745311] RDX: 0000000000001919 RSI: 0000000000000068 RDI: ffff88043b90c280
[ 2528.745311] RBP: ffff88043b985b68 R08: 0000000000000000 R09: 0000000000000000
[ 2528.745311] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88043b811400
[ 2528.745311] R13: 0000000000000000 R14: 0000000000000068 R15: 0000000000000000
[ 2528.745311] FS:  00007f82f0742750(0000) GS:ffff88043dbc8280(0000) knlGS:0000000000000000
[ 2528.745311] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2528.745311] CR2: 000000000057f1a0 CR3: 000000043915e000 CR4: 00000000000406e0
[ 2528.745311] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 2528.745311] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 2528.745311] Process signalgen (pid: 14507, threadinfo ffff88043b984000, task ffff8804309a9ef0)
[ 2528.745311] Stack:
[ 2528.745311]  ffff88043b811400 ffff88043b90c280 ffff88043b985b98 ffffffff80444ff6
[ 2528.745311]  ffff88043b90c280 ffff88043b811400 0000000000000000 ffff88043b90c2c0
[ 2528.745311]  ffff88043b985bc8 ffffffff8049ee67 ffff88043b985bc8 ffff88043b811400
[ 2528.745311] Call Trace:
[ 2528.745311]  [<ffffffff80444ff6>] sock_queue_rcv_skb+0xd6/0x120
[ 2528.745311]  [<ffffffff8049ee67>] __udp_queue_rcv_skb+0x27/0xe0
[ 2528.745311]  [<ffffffff8044406a>] release_sock+0x7a/0xe0
[ 2528.745311]  [<ffffffff804a1d0d>] udp_recvmsg+0x1ed/0x330
[ 2528.745311]  [<ffffffff804437e2>] sock_common_recvmsg+0x32/0x50
[ 2528.745311]  [<ffffffff80441449>] sock_recvmsg+0x139/0x150
[ 2528.745311]  [<ffffffff8025a590>] ? autoremove_wake_function+0x0/0x40
[ 2528.745311]  [<ffffffff8026c4d9>] ? validate_chain+0x469/0x1270
[ 2528.745311]  [<ffffffff8026d60e>] ? __lock_acquire+0x32e/0xa40
[ 2528.745311]  [<ffffffff804429df>] sys_recvfrom+0xaf/0x110
[ 2528.745311]  [<ffffffff804e6109>] ? mutex_unlock+0x9/0x10
[ 2528.745311]  [<ffffffff80310041>] ? sys_epoll_wait+0x4a1/0x510
[ 2528.745311]  [<ffffffff8020c55b>] system_call_fastpath+0x16/0x1b
[ 2528.745311] Code: 85 c0 7e 1b 48 8d bf 98 02 00 00 e8 29 34 e0 ff 85 c0 74 04 f0 ff 43 28 48 83 c4 08 5b c9 c3 e8 15 f3 ff ff 48 83 c4 08 5b c9 c3 <0f> 0b eb fe 66 2e 0f 1f 84 00 00 00 00 00 55 48 89 e5 48 83 ec
[ 2528.745311] RIP  [<ffffffff80444ec2>] inet_def_readable+0x52/0x60
[ 2528.745311]  RSP <ffff88043b985b58>

Looks to me like __release_sock will call sk_backlog_rcv() with
the socket unlocked -- does that help at all?

Thanks,
Brian Bloniarz
Brian Bloniarz April 6, 2009, 10:12 p.m. UTC | #11
Brian Bloniarz wrote:
>     BUG_ON(!spin_is_locked(&sk->sk_lock.slock));

Oh, sorry, I think I'm just misunderstanding how the socket
lock works. This doesn't actually check that the socket is locked,
right?
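
If I understand it now, the socket can be "locked" in two ways -- the
owner flag in process context, or the slock in bh context -- so a check
would presumably have to cover both, something like (untested sketch):

	BUG_ON(!sock_owned_by_user(sk) &&
	       !spin_is_locked(&sk->sk_lock.slock));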

-Brian
Brian Bloniarz April 7, 2009, 8:08 p.m. UTC | #12
Eric Dumazet wrote:
> Brian Bloniarz wrote:
 >> We've been experimenting with this softirq-delay patch in production, and
 >> have seen some hard-to-reproduce crashes. We finally managed to capture a
 >> kexec crashdump this morning.
 >
> The pointer being NULL might tell us that we managed to call inet_def_readable()
> without the socket lock held...

False alarm -- I think I did the backport to 2.6.24 incorrectly. 2.6.24 was
before the UDP receive path started taking the socket lock, so
inet_def_readable's assumption doesn't hold.

Sorry to waste everyone's time.

Thanks,
Brian Bloniarz
Eric Dumazet April 8, 2009, 8:12 a.m. UTC | #13
Brian Bloniarz wrote:
> Eric Dumazet wrote:
>> Brian Bloniarz wrote:
>>> We've been experimenting with this softirq-delay patch in production, and
>>> have seen some hard-to-reproduce crashes. We finally managed to capture a
>>> kexec crashdump this morning.
>>
>> The pointer being NULL might tell us that we managed to call
>> inet_def_readable() without the socket lock held...
> 
> False alarm -- I think I did the backport to 2.6.24 incorrectly. 2.6.24 was
> before the UDP receive path started taking the socket lock, so
> inet_def_readable's assumption doesn't hold.
> 
> Sorry to waste everyone's time.
> 

Thanks for doing this discovery work and analysis.

I am currently away from computers and could not have done this until next week.

So, if you want to use 2.6.24, do we need to backport other patches as well?


Patch

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b..a773d0c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -295,6 +295,24 @@  extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softir
 extern void __send_remote_softirq(struct call_single_data *cp, int cpu,
 				  int this_cpu, int softirq);
 
+/*
+ * softirq delayed works : should be delayed at do_softirq() end
+ */
+struct softirq_delay {
+	struct softirq_delay	*next;
+	void 			(*func)(struct softirq_delay *);
+};
+
+int softirq_delay_queue(struct softirq_delay *sdel);
+
+static inline void softirq_delay_init(struct softirq_delay *sdel,
+				      void (*func)(struct softirq_delay *))
+{
+	sdel->next = NULL;
+	sdel->func = func;
+}
+
+
 /* Tasklets --- multithreaded analogue of BHs.
 
    Main feature differing them of generic softirqs: tasklet
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 74bde13..30c1e01 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -13,19 +13,21 @@ 
 
 #include <linux/typecheck.h>
 
+#define softirq_enter()	do { current->softirq_context++; } while (0)
+#define softirq_exit()	do { current->softirq_context--; } while (0)
+#define softirq_context(p)	((p)->softirq_context)
+#define running_from_softirq()  (softirq_context(current) > 0)
+
 #ifdef CONFIG_TRACE_IRQFLAGS
   extern void trace_softirqs_on(unsigned long ip);
   extern void trace_softirqs_off(unsigned long ip);
   extern void trace_hardirqs_on(void);
   extern void trace_hardirqs_off(void);
 # define trace_hardirq_context(p)	((p)->hardirq_context)
-# define trace_softirq_context(p)	((p)->softirq_context)
 # define trace_hardirqs_enabled(p)	((p)->hardirqs_enabled)
 # define trace_softirqs_enabled(p)	((p)->softirqs_enabled)
 # define trace_hardirq_enter()	do { current->hardirq_context++; } while (0)
 # define trace_hardirq_exit()	do { current->hardirq_context--; } while (0)
-# define trace_softirq_enter()	do { current->softirq_context++; } while (0)
-# define trace_softirq_exit()	do { current->softirq_context--; } while (0)
 # define INIT_TRACE_IRQFLAGS	.softirqs_enabled = 1,
 #else
 # define trace_hardirqs_on()		do { } while (0)
@@ -33,13 +35,10 @@ 
 # define trace_softirqs_on(ip)		do { } while (0)
 # define trace_softirqs_off(ip)		do { } while (0)
 # define trace_hardirq_context(p)	0
-# define trace_softirq_context(p)	0
 # define trace_hardirqs_enabled(p)	0
 # define trace_softirqs_enabled(p)	0
 # define trace_hardirq_enter()		do { } while (0)
 # define trace_hardirq_exit()		do { } while (0)
-# define trace_softirq_enter()		do { } while (0)
-# define trace_softirq_exit()		do { } while (0)
 # define INIT_TRACE_IRQFLAGS
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c216e0..5dd8487 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1320,8 +1320,8 @@  struct task_struct {
 	unsigned long softirq_enable_ip;
 	unsigned int softirq_enable_event;
 	int hardirq_context;
-	int softirq_context;
 #endif
+	int softirq_context;
 #ifdef CONFIG_LOCKDEP
 # define MAX_LOCK_DEPTH 48UL
 	u64 curr_chain_key;
diff --git a/include/net/sock.h b/include/net/sock.h
index 4bb1ff9..0160a83 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -260,6 +260,7 @@  struct sock {
 	unsigned long	        sk_lingertime;
 	struct sk_buff_head	sk_error_queue;
 	struct proto		*sk_prot_creator;
+	struct softirq_delay	sk_delay;
 	rwlock_t		sk_callback_lock;
 	int			sk_err,
 				sk_err_soft;
@@ -960,6 +961,7 @@  extern void *sock_kmalloc(struct sock *sk, int size,
 			  gfp_t priority);
 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
 extern void sk_send_sigurg(struct sock *sk);
+extern void inet_def_readable(struct sock *sk, int len);
 
 /*
  * Functions to fill in entries in struct proto_ops when a protocol
diff --git a/include/net/udplite.h b/include/net/udplite.h
index afdffe6..7ce0ee0 100644
--- a/include/net/udplite.h
+++ b/include/net/udplite.h
@@ -25,6 +25,7 @@  static __inline__ int udplite_getfrag(void *from, char *to, int  offset,
 /* Designate sk as UDP-Lite socket */
 static inline int udplite_sk_init(struct sock *sk)
 {
+	sk->sk_data_ready = inet_def_readable;
 	udp_sk(sk)->pcflag = UDPLITE_BIT;
 	return 0;
 }
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 06b0c35..9873b40 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1807,7 +1807,7 @@  print_usage_bug(struct task_struct *curr, struct held_lock *this,
 	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
 		curr->comm, task_pid_nr(curr),
 		trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
-		trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+		softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
 		trace_hardirqs_enabled(curr),
 		trace_softirqs_enabled(curr));
 	print_lock(this);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de..91a1714 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -158,6 +158,42 @@  void local_bh_enable_ip(unsigned long ip)
 }
 EXPORT_SYMBOL(local_bh_enable_ip);
 
+
+#define SOFTIRQ_DELAY_END (struct softirq_delay *)1L
+static DEFINE_PER_CPU(struct softirq_delay *, softirq_delay_head) = {
+	SOFTIRQ_DELAY_END
+};
+
+/*
+ * Caller must disable preemption, and take care of appropriate
+ * locking and refcounting
+ */
+int softirq_delay_queue(struct softirq_delay *sdel)
+{
+	if (!sdel->next) {
+		sdel->next = __get_cpu_var(softirq_delay_head);
+		__get_cpu_var(softirq_delay_head) = sdel;
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Locking is provided by the calling subsystem; note that
+ * sdel->func(sdel) is responsible for setting sdel->next back to NULL
+ */
+static void softirq_delay_exec(void)
+{
+	struct softirq_delay *sdel;
+
+	while ((sdel = __get_cpu_var(softirq_delay_head)) != SOFTIRQ_DELAY_END) {
+		__get_cpu_var(softirq_delay_head) = sdel->next;
+		sdel->func(sdel);	/* func() must reset sdel->next to NULL */
+	}
+}
+
+
+
 /*
  * We restart softirq processing MAX_SOFTIRQ_RESTART times,
  * and we fall back to softirqd after that.
@@ -180,7 +216,7 @@  asmlinkage void __do_softirq(void)
 	account_system_vtime(current);
 
 	__local_bh_disable((unsigned long)__builtin_return_address(0));
-	trace_softirq_enter();
+	softirq_enter();
 
 	cpu = smp_processor_id();
 restart:
@@ -211,6 +247,8 @@  restart:
 		pending >>= 1;
 	} while (pending);
 
+	softirq_delay_exec();
+
 	local_irq_disable();
 
 	pending = local_softirq_pending();
@@ -220,7 +258,7 @@  restart:
 	if (pending)
 		wakeup_softirqd();
 
-	trace_softirq_exit();
+	softirq_exit();
 
 	account_system_vtime(current);
 	_local_bh_enable();
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 280332c..1aa7351 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -157,11 +157,11 @@  static void init_shared_classes(void)
 #define SOFTIRQ_ENTER()				\
 		local_bh_disable();		\
 		local_irq_disable();		\
-		trace_softirq_enter();		\
+		softirq_enter();		\
 		WARN_ON(!in_softirq());
 
 #define SOFTIRQ_EXIT()				\
-		trace_softirq_exit();		\
+		softirq_exit();		\
 		local_irq_enable();		\
 		local_bh_enable();
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 0620046..c8745d1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -213,6 +213,8 @@  __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 /* Maximal space eaten by iovec or ancilliary data plus some space */
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 
+static void sock_readable_defer(struct softirq_delay *sdel);
+
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
 	struct timeval tv;
@@ -1074,6 +1076,7 @@  struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 #endif
 
 		rwlock_init(&newsk->sk_dst_lock);
+		softirq_delay_init(&newsk->sk_delay, sock_readable_defer);
 		rwlock_init(&newsk->sk_callback_lock);
 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
 				af_callback_keys + newsk->sk_family,
@@ -1691,6 +1694,43 @@  static void sock_def_readable(struct sock *sk, int len)
 	read_unlock(&sk->sk_callback_lock);
 }
 
+/*
+ * helper function called by softirq_delay_exec(),
+ * if inet_def_readable() queued us.
+ */
+static void sock_readable_defer(struct softirq_delay *sdel)
+{
+	struct sock *sk = container_of(sdel, struct sock, sk_delay);
+
+	sdel->next = NULL;
+	/*
+	 * At this point we do not own a lock on the socket, only a reference.
+	 * We must commit the write above, or another CPU could miss a wakeup.
+	 */
+	smp_wmb();
+	sock_def_readable(sk, 0);
+	sock_put(sk);
+}
+
+/*
+ * Custom version of sock_def_readable()
+ * We want to defer scheduler processing until the end of do_softirq().
+ * Called with socket locked.
+ */
+void inet_def_readable(struct sock *sk, int len)
+{
+	if (running_from_softirq()) {
+		if (softirq_delay_queue(&sk->sk_delay))
+			/*
+			 * If we queued this socket, take a reference on it.
+			 * The caller owns the socket lock, so the write to sk_delay.next
+			 * will be committed before unlock.
+			 */
+			sock_hold(sk);
+	} else
+		sock_def_readable(sk, len);
+}
+
 static void sock_def_write_space(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
@@ -1768,6 +1808,7 @@  void sock_init_data(struct socket *sock, struct sock *sk)
 		sk->sk_sleep	=	NULL;
 
 	rwlock_init(&sk->sk_dst_lock);
+	softirq_delay_init(&sk->sk_delay, sock_readable_defer);
 	rwlock_init(&sk->sk_callback_lock);
 	lockdep_set_class_and_name(&sk->sk_callback_lock,
 			af_callback_keys + sk->sk_family,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 05b7abb..1cc0907 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1342,6 +1342,12 @@  void udp_destroy_sock(struct sock *sk)
 	release_sock(sk);
 }
 
+static int udp_init_sock(struct sock *sk)
+{
+	sk->sk_data_ready = inet_def_readable;
+	return 0;
+}
+
 /*
  *	Socket option code for UDP
  */
@@ -1559,6 +1565,7 @@  struct proto udp_prot = {
 	.connect	   = ip4_datagram_connect,
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = udp_ioctl,
+	.init		   = udp_init_sock,
 	.destroy	   = udp_destroy_sock,
 	.setsockopt	   = udp_setsockopt,
 	.getsockopt	   = udp_getsockopt,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 84b1a29..1a9f8d4 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -960,6 +960,12 @@  void udpv6_destroy_sock(struct sock *sk)
 	inet6_destroy_sock(sk);
 }
 
+static int udpv6_init_sock(struct sock *sk)
+{
+	sk->sk_data_ready = inet_def_readable;
+	return 0;
+}
+
 /*
  *	Socket option code for UDP
  */
@@ -1084,6 +1090,7 @@  struct proto udpv6_prot = {
 	.connect	   = ip6_datagram_connect,
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = udp_ioctl,
+	.init 		   = udpv6_init_sock,
 	.destroy	   = udpv6_destroy_sock,
 	.setsockopt	   = udpv6_setsockopt,
 	.getsockopt	   = udpv6_getsockopt,
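
To close, a hedged usage sketch of the API this patch adds; my_obj,
my_deferred_work and my_event are illustrative names, not part of the
patch. The pattern mirrors inet_def_readable(): defer expensive work to
the end of do_softirq() and keep the object alive meanwhile.

struct my_obj {
	struct softirq_delay	delay;
	atomic_t		refcnt;
};

static void my_deferred_work(struct softirq_delay *sdel)
{
	struct my_obj *obj = container_of(sdel, struct my_obj, delay);

	sdel->next = NULL;	/* the callback must re-arm the entry */

	/* ... batched work: runs once per softirq round, not per event ... */

	if (atomic_dec_and_test(&obj->refcnt))
		kfree(obj);	/* drop the reference taken when queueing */
}

static void my_obj_init(struct my_obj *obj)
{
	softirq_delay_init(&obj->delay, my_deferred_work);
	atomic_set(&obj->refcnt, 1);
}

static void my_event(struct my_obj *obj)
{
	if (running_from_softirq()) {
		/* preemption is off here, as softirq_delay_queue() requires */
		if (softirq_delay_queue(&obj->delay))
			atomic_inc(&obj->refcnt);	/* queued: keep obj alive */
	} else {
		/* process context: do the work directly */
	}
}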