Message ID | 20130108224313.GA13304@suse.de |
---|---|
State | Not Applicable, archived |
Delegated to: | David Miller |
Headers | show |
Mel Gorman <mgorman@suse.de> wrote: > Please try the following patch. However, even if it works the benefit of > capture may be so marginal that partially reverting it and simplifying > compaction.c is the better decision. I already got my VM stuck on this one. I had two twosleepy instances, 2774 was the one that got stuck (also confirmed by watching top). Btw, have you been able to reproduce this on your end? I think the easiest reproduction on my 2-core VM is by running 2 twosleepy processes and doing the following to dirty a lot of pages: while time find $LARGISH_NFS_MOUNT -type f -print0 | \ xargs -0 -n1 -P4 sh -c 'cat "$1" >> /tmp/z; > /tmp/z' --; do date; done I've updated git://bogomips.org/toosleepy.git to automate the reporting for me. ===> /proc/vmstat <=== nr_free_pages 2035 nr_inactive_anon 4044 nr_active_anon 3913 nr_inactive_file 98877 nr_active_file 4373 nr_unevictable 0 nr_mlock 0 nr_anon_pages 7839 nr_mapped 2350 nr_file_pages 103382 nr_dirty 512 nr_writeback 0 nr_slab_reclaimable 1578 nr_slab_unreclaimable 5642 nr_page_table_pages 800 nr_kernel_stack 170 nr_unstable 0 nr_bounce 0 nr_vmscan_write 0 nr_vmscan_immediate_reclaim 0 nr_writeback_temp 0 nr_isolated_anon 0 nr_isolated_file 0 nr_shmem 115 nr_dirtied 889731 nr_written 25225 nr_anon_transparent_hugepages 0 nr_free_cma 0 nr_dirty_threshold 22336 nr_dirty_background_threshold 11168 pgpgin 45284 pgpgout 101948 pswpin 0 pswpout 0 pgalloc_dma 299007 pgalloc_dma32 24235925 pgalloc_normal 0 pgalloc_movable 0 pgfree 24539843 pgactivate 5440 pgdeactivate 4476 pgfault 1072378 pgmajfault 338 pgrefill_dma 508 pgrefill_dma32 3968 pgrefill_normal 0 pgrefill_movable 0 pgsteal_kswapd_dma 22463 pgsteal_kswapd_dma32 553340 pgsteal_kswapd_normal 0 pgsteal_kswapd_movable 0 pgsteal_direct_dma 3956 pgsteal_direct_dma32 220354 pgsteal_direct_normal 0 pgsteal_direct_movable 0 pgscan_kswapd_dma 22463 pgscan_kswapd_dma32 554313 pgscan_kswapd_normal 0 pgscan_kswapd_movable 0 pgscan_direct_dma 3956 pgscan_direct_dma32 
220397 pgscan_direct_normal 0 pgscan_direct_movable 0 pgscan_direct_throttle 0 pginodesteal 0 slabs_scanned 4096 kswapd_inodesteal 0 kswapd_low_wmark_hit_quickly 1726 kswapd_high_wmark_hit_quickly 21 kswapd_skip_congestion_wait 0 pageoutrun 9065 allocstall 4004 pgrotated 0 pgmigrate_success 1242 pgmigrate_fail 0 compact_migrate_scanned 141232 compact_free_scanned 181666 compact_isolated 52638 compact_stall 2024 compact_fail 1450 compact_success 574 unevictable_pgs_culled 1063 unevictable_pgs_scanned 0 unevictable_pgs_rescued 1653 unevictable_pgs_mlocked 1653 unevictable_pgs_munlocked 1652 unevictable_pgs_cleared 1 unevictable_pgs_stranded 0 thp_fault_alloc 0 thp_fault_fallback 0 thp_collapse_alloc 0 thp_collapse_alloc_failed 0 thp_split 0 thp_zero_page_alloc 0 thp_zero_page_alloc_failed 0 ===> 2724[2724]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 [<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2725]/stack <=== [<ffffffff810f5944>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d94>] do_sys_poll+0x374/0x4b0 [<ffffffff810f71ce>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2726]/stack <=== [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2727]/stack <=== [<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a99>] sys_sendto+0x119/0x160 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2724[2728]/stack <=== [<ffffffff8105d02c>] hrtimer_nanosleep+0x9c/0x150 [<ffffffff8105d13e>] sys_nanosleep+0x5e/0x80 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 
2774[2774]/stack <=== [<ffffffff81077300>] futex_wait_queue_me+0xc0/0xf0 [<ffffffff81077a9d>] futex_wait+0x17d/0x280 [<ffffffff8107988c>] do_futex+0x11c/0xae0 [<ffffffff8107a2d8>] sys_futex+0x88/0x180 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2775]/stack <=== [<ffffffff810f5944>] poll_schedule_timeout+0x44/0x60 [<ffffffff810f6d94>] do_sys_poll+0x374/0x4b0 [<ffffffff810f71ce>] sys_ppoll+0x19e/0x1b0 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2776]/stack <=== [<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a99>] sys_sendto+0x119/0x160 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2777]/stack <=== [<ffffffff81310098>] sk_stream_wait_memory+0x1b8/0x250 [<ffffffff8134bea7>] tcp_sendmsg+0x697/0xd80 [<ffffffff81370d0e>] inet_sendmsg+0x5e/0xa0 [<ffffffff81300ab7>] sock_sendmsg+0x87/0xa0 [<ffffffff81303a99>] sys_sendto+0x119/0x160 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff ===> 2774[2778]/stack <=== [<ffffffff810400b8>] do_wait+0x1f8/0x220 [<ffffffff81040ea0>] sys_wait4+0x70/0xf0 [<ffffffff813b0729>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff SysRq : Show Memory Mem-Info: DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 108 CPU 1: hi: 186, btch: 31 usd: 162 active_anon:3990 inactive_anon:4042 isolated_anon:0 active_file:4362 inactive_file:98536 isolated_file:0 unevictable:0 dirty:513 writeback:0 unstable:0 free:1896 slab_reclaimable:1530 slab_unreclaimable:5661 mapped:2342 shmem:115 pagetables:784 bounce:0 free_cma:0 DMA free:2080kB min:84kB low:104kB high:124kB active_anon:0kB inactive_anon:0kB 
active_file:0kB inactive_file:12168kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15644kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:16kB slab_unreclaimable:192kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 488 488 488 DMA32 free:5568kB min:2784kB low:3480kB high:4176kB active_anon:15960kB inactive_anon:16168kB active_file:17448kB inactive_file:381976kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:499960kB managed:491256kB mlocked:0kB dirty:2052kB writeback:0kB mapped:9368kB shmem:460kB slab_reclaimable:6104kB slab_unreclaimable:22452kB kernel_stack:1416kB pagetables:3136kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 DMA: 6*4kB (UR) 5*8kB (UR) 2*16kB (U) 0*32kB 11*64kB (R) 2*128kB (R) 0*256kB 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB = 2080kB DMA32: 280*4kB (UEM) 66*8kB (UEM) 99*16kB (U) 40*32kB (UM) 16*64kB (M) 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 5536kB 103002 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 392188kB Total swap = 392188kB 131054 pages RAM 3820 pages reserved 411919 pages shared 116133 pages non-shared -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote: > Mel Gorman <mgorman@suse.de> wrote: > > Please try the following patch. However, even if it works the benefit of > > capture may be so marginal that partially reverting it and simplifying > > compaction.c is the better decision. > > I already got my VM stuck on this one. I had two twosleepy instances, > 2774 was the one that got stuck (also confirmed by watching top). > > Btw, have you been able to reproduce this on your end? > > I think the easiest reproduction on my 2-core VM is by running 2 > twosleepy processes and doing the following to dirty a lot of pages: Given the persistent sk_stream_wait_memory() traces I suspect a plain TCP bug, triggered by some extra wait somewhere. Please mm guys don't spend too much time right now, I'll try to reproduce the problem. Don't be confused by sk_stream_wait_memory() name. A thread is stuck here because TCP stack is failing to wake it. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2013-01-08 at 18:14 -0800, Eric Dumazet wrote: > On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote: > > Mel Gorman <mgorman@suse.de> wrote: > > > Please try the following patch. However, even if it works the benefit of > > > capture may be so marginal that partially reverting it and simplifying > > > compaction.c is the better decision. > > > > I already got my VM stuck on this one. I had two twosleepy instances, > > 2774 was the one that got stuck (also confirmed by watching top). > > > > Btw, have you been able to reproduce this on your end? > > > > I think the easiest reproduction on my 2-core VM is by running 2 > > twosleepy processes and doing the following to dirty a lot of pages: > > Given the persistent sk_stream_wait_memory() traces I suspect a plain > TCP bug, triggered by some extra wait somewhere. > > Please mm guys don't spend too much time right now, I'll try to > reproduce the problem. > > Don't be confused by sk_stream_wait_memory() name. > A thread is stuck here because TCP stack is failing to wake it. > Hmm, it seems sk_filter() can return -ENOMEM because skb has the pfmemalloc() set. It seems nobody really tested this stuff under memory stress. Mel, it looks like you are the guy who could fix this, after all ;) One TCP socket keeps retransmitting an SKB via loopback, and TCP stack drops the packet again and again. commit c93bdd0e03e848555d144eb44a1f275b871a8dd5 Author: Mel Gorman <mgorman@suse.de> Date: Tue Jul 31 16:44:19 2012 -0700 netvm: allow skb allocation to use PFMEMALLOC reserves Change the skb allocation API to indicate RX usage and use this to fall back to the PFMEMALLOC reserve when needed. SKBs allocated from the reserve are tagged in skb->pfmemalloc. If an SKB is allocated from the reserve and the socket is later found to be unrelated to page reclaim, the packet is dropped so that the memory remains available for page reclaim. Network protocols are expected to recover from this packet loss. 
[a.p.zijlstra@chello.nl: Ideas taken from various patches] [davem@davemloft.net: Use static branches, coding style corrections] [sebastian@breakpoint.cc: Avoid unnecessary cast, fix !CONFIG_NET build] Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: David S. Miller <davem@davemloft.net> Cc: Neil Brown <neilb@suse.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Christie <michaelc@cs.wisc.edu> Cc: Eric B Munson <emunson@mgebm.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Mel Gorman <mgorman@suse.de> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 08, 2013 at 06:32:29PM -0800, Eric Dumazet wrote: > On Tue, 2013-01-08 at 18:14 -0800, Eric Dumazet wrote: > > On Tue, 2013-01-08 at 23:23 +0000, Eric Wong wrote: > > > Mel Gorman <mgorman@suse.de> wrote: > > > > Please try the following patch. However, even if it works the benefit of > > > > capture may be so marginal that partially reverting it and simplifying > > > > compaction.c is the better decision. > > > > > > I already got my VM stuck on this one. I had two twosleepy instances, > > > 2774 was the one that got stuck (also confirmed by watching top). > > > > > > Btw, have you been able to reproduce this on your end? > > > > > > I think the easiest reproduction on my 2-core VM is by running 2 > > > twosleepy processes and doing the following to dirty a lot of pages: > > > > Given the persistent sk_stream_wait_memory() traces I suspect a plain > > TCP bug, triggered by some extra wait somewhere. > > > > Please mm guys don't spend too much time right now, I'll try to > > reproduce the problem. > > > > Don't be confused by sk_stream_wait_memory() name. > > A thread is stuck here because TCP stack is failing to wake it. > > > > Hmm, it seems sk_filter() can return -ENOMEM because skb has the > pfmemalloc() set. > The skb should not have pfmemalloc set in most cases, particularly after cfd19c5a (mm: only set page->pfmemalloc when ALLOC_NO_WATERMARKS was used) but the capture patch also failed to clear pfmemalloc properly so it could be set in error.
Mel Gorman <mgorman@suse.de> wrote: > When I looked at it for long enough I found a number of problems. Most > affect timing but two serious issues are in there. One affects how long > kswapd spends compacting versus reclaiming and the other increases lock > contention meaning that async compaction can abort early. Both are serious > and could explain why a driver would fail high-order allocations. > > Please try the following patch. However, even if it works the benefit of > capture may be so marginal that partially reverting it and simplifying > compaction.c is the better decision. Btw, I'm still testing this patch with the "page->pfmemalloc = false" change on top of it. > diff --git a/mm/compaction.c b/mm/compaction.c > index 6b807e4..03c82c0 100644 > --- a/mm/compaction.c > +++ b/mm/compaction.c > @@ -857,7 +857,8 @@ static int compact_finished(struct zone *zone, > } else { > unsigned int order; > for (order = cc->order; order < MAX_ORDER; order++) { > - struct free_area *area = &zone->free_area[cc->order]; > + struct free_area *area = &zone->free_area[order]; I noticed something like this hunk wasn't in your latest partial revert (<20130109135010.GB13475@suse.de>) I admit I don't understand this code, but this jumped out at me. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/mm/compaction.c b/mm/compaction.c index 6b807e4..03c82c0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -857,7 +857,8 @@ static int compact_finished(struct zone *zone, } else { unsigned int order; for (order = cc->order; order < MAX_ORDER; order++) { - struct free_area *area = &zone->free_area[cc->order]; + struct free_area *area = &zone->free_area[order]; + /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[cc->migratetype])) return COMPACT_PARTIAL; @@ -929,6 +930,11 @@ static void compact_capture_page(struct compact_control *cc) if (!cc->page || *cc->page) return; + /* Check that watermarks are satisfied before acquiring locks */ + if (!zone_watermark_ok(cc->zone, cc->order, low_wmark_pages(cc->zone), + 0, 0)) + return; + /* * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP * regardless of the migratetype of the freelist is is captured from. @@ -941,7 +947,7 @@ static void compact_capture_page(struct compact_control *cc) */ if (cc->migratetype == MIGRATE_MOVABLE) { mtype_low = 0; - mtype_high = MIGRATE_PCPTYPES; + mtype_high = MIGRATE_PCPTYPES + 1; } else { mtype_low = cc->migratetype; mtype_high = cc->migratetype + 1; @@ -1118,7 +1124,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; - int alloc_flags = 0; /* Check if the GFP flags allow compaction */ if (!order || !may_enter_fs || !may_perform_io) @@ -1126,10 +1131,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, count_compact_event(COMPACTSTALL); -#ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1139,9 +1140,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, contended, page); rc = max(status, rc); - /* If a normal allocation would succeed, stop 
compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, - alloc_flags)) + /* If a page was captured, stop compacting */ + if (*page) break; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ba5e37..9d20c13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2180,10 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags &= ~PF_MEMALLOC; /* If compaction captured a page, prep and use it */ - if (page) { - prep_new_page(page, order, gfp_mask); + if (page && !prep_new_page(page, order, gfp_mask)) goto got_page; - } if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */