| Message ID | 1504894024-2750-5-git-send-email-ldufour@linux.vnet.ibm.com (mailing list archive) |
|---|---|
| State | Not Applicable |
| Series | Speculative page faults |
Hi,

On (09/08/17 20:06), Laurent Dufour wrote:
[..]
> @@ -903,6 +910,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  		mm->map_count--;
>  		mpol_put(vma_policy(next));
>  		kmem_cache_free(vm_area_cachep, next);
> +		write_seqcount_end(&next->vm_sequence);
>  		/*
>  		 * In mprotect's case 6 (see comments on vma_merge),
>  		 * we must remove another next too. It would clutter
> @@ -932,11 +940,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  		if (remove_next == 2) {
>  			remove_next = 1;
>  			end = next->vm_end;
> +			write_seqcount_end(&vma->vm_sequence);
>  			goto again;
> -		}
> -		else if (next)
> +		} else if (next) {
> +			if (next != vma)
> +				write_seqcount_begin_nested(&next->vm_sequence,
> +					SINGLE_DEPTH_NESTING);
>  			vma_gap_update(next);
> -		else {
> +		} else {
>  			/*
>  			 * If remove_next == 2 we obviously can't
>  			 * reach this path.
> @@ -962,6 +973,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  	if (insert && file)
>  		uprobe_mmap(insert);
> 
> +	if (next && next != vma)
> +		write_seqcount_end(&next->vm_sequence);
> +	write_seqcount_end(&vma->vm_sequence);

ok, so what I got on my box is:

vm_munmap()  -> down_write_killable(&mm->mmap_sem)
 do_munmap()
  __split_vma()
   __vma_adjust()  -> write_seqcount_begin(&vma->vm_sequence)
                   -> write_seqcount_begin_nested(&next->vm_sequence,
                                                  SINGLE_DEPTH_NESTING)

so this gives 3 dependencies:

	->mmap_sem  ->  ->vm_seq
	->vm_seq    ->  ->vm_seq/1
	->mmap_sem  ->  ->vm_seq/1

SyS_mremap()  -> down_write_killable(&current->mm->mmap_sem)
 move_vma()   -> write_seqcount_begin(&vma->vm_sequence)
              -> write_seqcount_begin_nested(&new_vma->vm_sequence,
                                             SINGLE_DEPTH_NESTING);
  move_page_tables()
   __pte_alloc()
    pte_alloc_one()
     __alloc_pages_nodemask()
      fs_reclaim_acquire()

I think here we have a prepare_alloc_pages() call, which does

	-> fs_reclaim_acquire(gfp_mask)
	-> fs_reclaim_release(gfp_mask)

so that adds one more dependency:

	->mmap_sem  ->  ->vm_seq    ->  fs_reclaim
	->mmap_sem  ->  ->vm_seq/1  ->  fs_reclaim

now, under memory pressure we hit the slow path and perform direct
reclaim. direct reclaim is done under the fs_reclaim lock, so we end up
with the following call chain:

__alloc_pages_nodemask()
 __alloc_pages_slowpath()
  __perform_reclaim()  -> fs_reclaim_acquire(gfp_mask);
   try_to_free_pages()
    shrink_node()
     shrink_active_list()
      rmap_walk_file()  -> i_mmap_lock_read(mapping);

and this breaks the existing dependency, since we now take the leaf lock
(fs_reclaim) first and then the root lock (->mmap_sem).

well, seems to be the case.

	-ss
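For context, the ->vm_seq and ->vm_seq/1 entries above exist because lockdep
treats the write side of a seqcount as a lock. A rough paraphrase of
include/linux/seqlock.h around v4.13 (lockdep-enabled variant, with
raw_write_seqcount_end() inlined into write_seqcount_end() for brevity):

	/*
	 * Rough paraphrase of include/linux/seqlock.h (~v4.13): the
	 * write_seqcount_*() helpers report a lock acquire/release to
	 * lockdep, which is how the ->vm_seq and ->vm_seq/1 chains above
	 * get recorded.
	 */
	static inline void raw_write_seqcount_begin(seqcount_t *s)
	{
		s->sequence++;		/* count becomes odd: writer active */
		smp_wmb();
	}

	static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
	{
		raw_write_seqcount_begin(s);
		seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);	/* lockdep */
	}

	static inline void write_seqcount_begin(seqcount_t *s)
	{
		write_seqcount_begin_nested(s, 0);
	}

	static inline void write_seqcount_end(seqcount_t *s)
	{
		seqcount_release(&s->dep_map, 1, _RET_IP_);		/* lockdep */
		smp_wmb();
		s->sequence++;		/* count becomes even again */
	}

Note that the raw_* variants touch no lockdep state; this is what the later
suggestion in the thread to use the raw primitives relies on.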
Hi Sergey,

On 13/09/2017 13:53, Sergey Senozhatsky wrote:
> Hi,
>
> On (09/08/17 20:06), Laurent Dufour wrote:
> [..]
>> @@ -903,6 +910,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>>  		mm->map_count--;
>>  		mpol_put(vma_policy(next));
>>  		kmem_cache_free(vm_area_cachep, next);
>> +		write_seqcount_end(&next->vm_sequence);
>>  		/*
>>  		 * In mprotect's case 6 (see comments on vma_merge),
>>  		 * we must remove another next too. It would clutter
>> @@ -932,11 +940,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>>  		if (remove_next == 2) {
>>  			remove_next = 1;
>>  			end = next->vm_end;
>> +			write_seqcount_end(&vma->vm_sequence);
>>  			goto again;
>> -		}
>> -		else if (next)
>> +		} else if (next) {
>> +			if (next != vma)
>> +				write_seqcount_begin_nested(&next->vm_sequence,
>> +					SINGLE_DEPTH_NESTING);
>>  			vma_gap_update(next);
>> -		else {
>> +		} else {
>>  			/*
>>  			 * If remove_next == 2 we obviously can't
>>  			 * reach this path.
>> @@ -962,6 +973,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>>  	if (insert && file)
>>  		uprobe_mmap(insert);
>>
>> +	if (next && next != vma)
>> +		write_seqcount_end(&next->vm_sequence);
>> +	write_seqcount_end(&vma->vm_sequence);
>
>
> ok, so what I got on my box is:
>
> vm_munmap()  -> down_write_killable(&mm->mmap_sem)
>  do_munmap()
>   __split_vma()
>    __vma_adjust()  -> write_seqcount_begin(&vma->vm_sequence)
>                    -> write_seqcount_begin_nested(&next->vm_sequence,
>                                                   SINGLE_DEPTH_NESTING)
>
> so this gives 3 dependencies:
>
> 	->mmap_sem  ->  ->vm_seq
> 	->vm_seq    ->  ->vm_seq/1
> 	->mmap_sem  ->  ->vm_seq/1
>
> SyS_mremap()  -> down_write_killable(&current->mm->mmap_sem)
>  move_vma()   -> write_seqcount_begin(&vma->vm_sequence)
>               -> write_seqcount_begin_nested(&new_vma->vm_sequence,
>                                              SINGLE_DEPTH_NESTING);
>   move_page_tables()
>    __pte_alloc()
>     pte_alloc_one()
>      __alloc_pages_nodemask()
>       fs_reclaim_acquire()
>
> I think here we have a prepare_alloc_pages() call, which does
>
> 	-> fs_reclaim_acquire(gfp_mask)
> 	-> fs_reclaim_release(gfp_mask)
>
> so that adds one more dependency:
>
> 	->mmap_sem  ->  ->vm_seq    ->  fs_reclaim
> 	->mmap_sem  ->  ->vm_seq/1  ->  fs_reclaim
>
> now, under memory pressure we hit the slow path and perform direct
> reclaim. direct reclaim is done under the fs_reclaim lock, so we end up
> with the following call chain:
>
> __alloc_pages_nodemask()
>  __alloc_pages_slowpath()
>   __perform_reclaim()  -> fs_reclaim_acquire(gfp_mask);
>    try_to_free_pages()
>     shrink_node()
>      shrink_active_list()
>       rmap_walk_file()  -> i_mmap_lock_read(mapping);
>
> and this breaks the existing dependency, since we now take the leaf lock
> (fs_reclaim) first and then the root lock (->mmap_sem).

Thanks for looking at this.
I'm sorry, I must have missed something.

My understanding is that there are 3 chains of locks:
 1. from __vma_adjust()            mmap_sem -> i_mmap_rwsem -> vm_seq
 2. from move_vma()                mmap_sem -> vm_seq -> fs_reclaim
 3. from __alloc_pages_nodemask()  fs_reclaim -> i_mmap_rwsem

So the solution would be to have, in __vma_adjust():
	mmap_sem -> vm_seq -> i_mmap_rwsem

But this will raise the following dependency from unmap_mapping_range():

unmap_mapping_range()		-> i_mmap_rwsem
 unmap_mapping_range_tree()
  unmap_mapping_range_vma()
   zap_page_range_single()
    unmap_single_vma()
     unmap_page_range()		-> vm_seq

And there is no way to get rid of it easily, as in unmap_mapping_range()
there is no VMA identified yet.

That being said, I can't see any clear way to get the lock dependency
cleaned up here.
Furthermore, it is not clear to me how a deadlock could happen, as vm_seq
is a sequence lock and there is no way to get blocked here.

Cheers,
Laurent.

> well, seems to be the case.
>
> 	-ss
Hi,

On (09/13/17 18:56), Laurent Dufour wrote:
> Hi Sergey,
>
> On 13/09/2017 13:53, Sergey Senozhatsky wrote:
> > Hi,
> >
> > On (09/08/17 20:06), Laurent Dufour wrote:
[..]
> > ok, so what I got on my box is:
> >
> > vm_munmap()  -> down_write_killable(&mm->mmap_sem)
> >  do_munmap()
> >   __split_vma()
> >    __vma_adjust()  -> write_seqcount_begin(&vma->vm_sequence)
> >                    -> write_seqcount_begin_nested(&next->vm_sequence,
> >                                                   SINGLE_DEPTH_NESTING)
> >
> > so this gives 3 dependencies:
> >
> > 	->mmap_sem  ->  ->vm_seq
> > 	->vm_seq    ->  ->vm_seq/1
> > 	->mmap_sem  ->  ->vm_seq/1
> >
> > SyS_mremap()  -> down_write_killable(&current->mm->mmap_sem)
> >  move_vma()   -> write_seqcount_begin(&vma->vm_sequence)
> >               -> write_seqcount_begin_nested(&new_vma->vm_sequence,
> >                                              SINGLE_DEPTH_NESTING);
> >   move_page_tables()
> >    __pte_alloc()
> >     pte_alloc_one()
> >      __alloc_pages_nodemask()
> >       fs_reclaim_acquire()
> >
> > I think here we have a prepare_alloc_pages() call, which does
> >
> > 	-> fs_reclaim_acquire(gfp_mask)
> > 	-> fs_reclaim_release(gfp_mask)
> >
> > so that adds one more dependency:
> >
> > 	->mmap_sem  ->  ->vm_seq    ->  fs_reclaim
> > 	->mmap_sem  ->  ->vm_seq/1  ->  fs_reclaim
> >
> > now, under memory pressure we hit the slow path and perform direct
> > reclaim. direct reclaim is done under the fs_reclaim lock, so we end up
> > with the following call chain:
> >
> > __alloc_pages_nodemask()
> >  __alloc_pages_slowpath()
> >   __perform_reclaim()  -> fs_reclaim_acquire(gfp_mask);
> >    try_to_free_pages()
> >     shrink_node()
> >      shrink_active_list()
> >       rmap_walk_file()  -> i_mmap_lock_read(mapping);
> >
> > and this breaks the existing dependency, since we now take the leaf lock
> > (fs_reclaim) first and then the root lock (->mmap_sem).
>
> Thanks for looking at this.
> I'm sorry, I must have missed something.

no prob :)

> My understanding is that there are 3 chains of locks:
>  1. from __vma_adjust()            mmap_sem -> i_mmap_rwsem -> vm_seq
>  2. from move_vma()                mmap_sem -> vm_seq -> fs_reclaim
>  3. from __alloc_pages_nodemask()  fs_reclaim -> i_mmap_rwsem

yes, as far as the lockdep warning suggests.

> So the solution would be to have, in __vma_adjust():
> 	mmap_sem -> vm_seq -> i_mmap_rwsem
>
> But this will raise the following dependency from unmap_mapping_range():
>
> unmap_mapping_range()		-> i_mmap_rwsem
>  unmap_mapping_range_tree()
>   unmap_mapping_range_vma()
>    zap_page_range_single()
>     unmap_single_vma()
>      unmap_page_range()		-> vm_seq
>
> And there is no way to get rid of it easily, as in unmap_mapping_range()
> there is no VMA identified yet.
>
> That being said, I can't see any clear way to get the lock dependency
> cleaned up here.
> Furthermore, it is not clear to me how a deadlock could happen, as vm_seq
> is a sequence lock and there is no way to get blocked here.

as far as I understand,
seq locks can deadlock, technically. not on the write() side, but on
the read() side:

   read_seqcount_begin()
    raw_read_seqcount_begin()
     __read_seqcount_begin()

and __read_seqcount_begin() spins for ever

	__read_seqcount_begin()
	{
	repeat:
		ret = READ_ONCE(s->sequence);
		if (unlikely(ret & 1)) {
			cpu_relax();
			goto repeat;
		}
		return ret;
	}

so if there are two CPUs, one doing write_seqcount() and the other one
doing read_seqcount() then what can happen is something like this

	CPU0				CPU1

					fs_reclaim_acquire()
	write_seqcount_begin()
	fs_reclaim_acquire()		read_seqcount_begin()
	write_seqcount_end()

CPU0 can't write_seqcount_end() because of fs_reclaim_acquire() from
CPU1, CPU1 can't read_seqcount_begin() because CPU0 did write_seqcount_begin()
and now waits for fs_reclaim_acquire().

makes sense?

	-ss
Hi,

On 14/09/2017 02:31, Sergey Senozhatsky wrote:
> Hi,
>
> On (09/13/17 18:56), Laurent Dufour wrote:
>> Hi Sergey,
>>
>> On 13/09/2017 13:53, Sergey Senozhatsky wrote:
>>> Hi,
>>>
>>> On (09/08/17 20:06), Laurent Dufour wrote:
> [..]
>>> ok, so what I got on my box is:
>>>
>>> vm_munmap()  -> down_write_killable(&mm->mmap_sem)
>>>  do_munmap()
>>>   __split_vma()
>>>    __vma_adjust()  -> write_seqcount_begin(&vma->vm_sequence)
>>>                    -> write_seqcount_begin_nested(&next->vm_sequence,
>>>                                                   SINGLE_DEPTH_NESTING)
>>>
>>> so this gives 3 dependencies:
>>>
>>> 	->mmap_sem  ->  ->vm_seq
>>> 	->vm_seq    ->  ->vm_seq/1
>>> 	->mmap_sem  ->  ->vm_seq/1
>>>
>>> SyS_mremap()  -> down_write_killable(&current->mm->mmap_sem)
>>>  move_vma()   -> write_seqcount_begin(&vma->vm_sequence)
>>>               -> write_seqcount_begin_nested(&new_vma->vm_sequence,
>>>                                              SINGLE_DEPTH_NESTING);
>>>   move_page_tables()
>>>    __pte_alloc()
>>>     pte_alloc_one()
>>>      __alloc_pages_nodemask()
>>>       fs_reclaim_acquire()
>>>
>>> I think here we have a prepare_alloc_pages() call, which does
>>>
>>> 	-> fs_reclaim_acquire(gfp_mask)
>>> 	-> fs_reclaim_release(gfp_mask)
>>>
>>> so that adds one more dependency:
>>>
>>> 	->mmap_sem  ->  ->vm_seq    ->  fs_reclaim
>>> 	->mmap_sem  ->  ->vm_seq/1  ->  fs_reclaim
>>>
>>> now, under memory pressure we hit the slow path and perform direct
>>> reclaim. direct reclaim is done under the fs_reclaim lock, so we end up
>>> with the following call chain:
>>>
>>> __alloc_pages_nodemask()
>>>  __alloc_pages_slowpath()
>>>   __perform_reclaim()  -> fs_reclaim_acquire(gfp_mask);
>>>    try_to_free_pages()
>>>     shrink_node()
>>>      shrink_active_list()
>>>       rmap_walk_file()  -> i_mmap_lock_read(mapping);
>>>
>>> and this breaks the existing dependency, since we now take the leaf lock
>>> (fs_reclaim) first and then the root lock (->mmap_sem).
>>
>> Thanks for looking at this.
>> I'm sorry, I must have missed something.
>
> no prob :)
>
>> My understanding is that there are 3 chains of locks:
>>  1. from __vma_adjust()            mmap_sem -> i_mmap_rwsem -> vm_seq
>>  2. from move_vma()                mmap_sem -> vm_seq -> fs_reclaim
>>  3. from __alloc_pages_nodemask()  fs_reclaim -> i_mmap_rwsem
>
> yes, as far as the lockdep warning suggests.
>
>> So the solution would be to have, in __vma_adjust():
>> 	mmap_sem -> vm_seq -> i_mmap_rwsem
>>
>> But this will raise the following dependency from unmap_mapping_range():
>>
>> unmap_mapping_range()		-> i_mmap_rwsem
>>  unmap_mapping_range_tree()
>>   unmap_mapping_range_vma()
>>    zap_page_range_single()
>>     unmap_single_vma()
>>      unmap_page_range()		-> vm_seq
>>
>> And there is no way to get rid of it easily, as in unmap_mapping_range()
>> there is no VMA identified yet.
>>
>> That being said, I can't see any clear way to get the lock dependency
>> cleaned up here.
>> Furthermore, it is not clear to me how a deadlock could happen, as vm_seq
>> is a sequence lock and there is no way to get blocked here.
>
> as far as I understand,
> seq locks can deadlock, technically. not on the write() side, but on
> the read() side:
>
>    read_seqcount_begin()
>     raw_read_seqcount_begin()
>      __read_seqcount_begin()
>
> and __read_seqcount_begin() spins for ever
>
> 	__read_seqcount_begin()
> 	{
> 	repeat:
> 		ret = READ_ONCE(s->sequence);
> 		if (unlikely(ret & 1)) {
> 			cpu_relax();
> 			goto repeat;
> 		}
> 		return ret;
> 	}
>
> so if there are two CPUs, one doing write_seqcount() and the other one
> doing read_seqcount() then what can happen is something like this
>
> 	CPU0				CPU1
>
> 					fs_reclaim_acquire()
> 	write_seqcount_begin()
> 	fs_reclaim_acquire()		read_seqcount_begin()
> 	write_seqcount_end()
>
> CPU0 can't write_seqcount_end() because of fs_reclaim_acquire() from
> CPU1, CPU1 can't read_seqcount_begin() because CPU0 did write_seqcount_begin()
> and now waits for fs_reclaim_acquire().
>
> makes sense?

Yes, this makes sense.

But in the case of this series, there is no call to
__read_seqcount_begin(); the reader (the speculative page fault handler)
just checks for (vm_seq & 1) and, if this is true, simply exits the
speculative path without waiting.
So there is no deadlock possibility.

The bad case would be to have 2 concurrent threads calling
write_seqcount_begin() on the same VMA, leading to a corrupted sequence
count, but this can't happen because mmap_sem is held for write in such
a case.

Cheers,
Laurent.
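To illustrate the non-blocking reader Laurent describes, here is a minimal
sketch; the function name spf_try_fault() and its structure are hypothetical,
not taken from the SPF series:

	/*
	 * Hypothetical sketch of the non-blocking reader described above;
	 * spf_try_fault() is an illustrative name, not from the patch set.
	 */
	static int spf_try_fault(struct vm_area_struct *vma)
	{
		unsigned int seq;

		/* raw_read_seqcount() samples once; it never spins */
		seq = raw_read_seqcount(&vma->vm_sequence);
		if (seq & 1)
			/* writer in progress: give up instead of waiting */
			return VM_FAULT_RETRY;

		/* ... speculative fault work against a snapshot of the VMA ... */

		if (read_seqcount_retry(&vma->vm_sequence, seq))
			/* VMA changed under us: fall back to the slow path */
			return VM_FAULT_RETRY;

		return 0;
	}

Because the reader only samples and retries, the write side can never wait
on it; the deadlock scenario above requires a spinning read_seqcount_begin().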
Hi,

On (09/14/17 09:55), Laurent Dufour wrote:
[..]
> > so if there are two CPUs, one doing write_seqcount() and the other one
> > doing read_seqcount() then what can happen is something like this
> >
> > 	CPU0				CPU1
> >
> > 					fs_reclaim_acquire()
> > 	write_seqcount_begin()
> > 	fs_reclaim_acquire()		read_seqcount_begin()
> > 	write_seqcount_end()
> >
> > CPU0 can't write_seqcount_end() because of fs_reclaim_acquire() from
> > CPU1, CPU1 can't read_seqcount_begin() because CPU0 did write_seqcount_begin()
> > and now waits for fs_reclaim_acquire().
> >
> > makes sense?
>
> Yes, this makes sense.
>
> But in the case of this series, there is no call to
> __read_seqcount_begin(); the reader (the speculative page fault handler)
> just checks for (vm_seq & 1) and, if this is true, simply exits the
> speculative path without waiting.
> So there is no deadlock possibility.

probably lockdep just knows that those locks interleave at some
point.

by the way, I think there is one path that can spin

   find_vma_srcu()
    read_seqbegin()
     read_seqcount_begin()
      raw_read_seqcount_begin()
       __read_seqcount_begin()

	-ss
On 14/09/2017 10:13, Sergey Senozhatsky wrote:
> Hi,
>
> On (09/14/17 09:55), Laurent Dufour wrote:
> [..]
>>> so if there are two CPUs, one doing write_seqcount() and the other one
>>> doing read_seqcount() then what can happen is something like this
>>>
>>> 	CPU0				CPU1
>>>
>>> 					fs_reclaim_acquire()
>>> 	write_seqcount_begin()
>>> 	fs_reclaim_acquire()		read_seqcount_begin()
>>> 	write_seqcount_end()
>>>
>>> CPU0 can't write_seqcount_end() because of fs_reclaim_acquire() from
>>> CPU1, CPU1 can't read_seqcount_begin() because CPU0 did write_seqcount_begin()
>>> and now waits for fs_reclaim_acquire().
>>>
>>> makes sense?
>>
>> Yes, this makes sense.
>>
>> But in the case of this series, there is no call to
>> __read_seqcount_begin(); the reader (the speculative page fault handler)
>> just checks for (vm_seq & 1) and, if this is true, simply exits the
>> speculative path without waiting.
>> So there is no deadlock possibility.
>
> probably lockdep just knows that those locks interleave at some
> point.
>
> by the way, I think there is one path that can spin
>
>    find_vma_srcu()
>     read_seqbegin()
>      read_seqcount_begin()
>       raw_read_seqcount_begin()
>        __read_seqcount_begin()

That's right, but here it is the sequence counter mm->mm_seq, not the
vm_seq one. This one is held to protect walking the VMA list
"locklessly"...

Cheers,
Laurent.
On (09/14/17 10:58), Laurent Dufour wrote:
[..]
> That's right, but here it is the sequence counter mm->mm_seq, not the
> vm_seq one.

d'oh... you are right.

	-ss
On 14/09/2017 11:11, Sergey Senozhatsky wrote:
> On (09/14/17 10:58), Laurent Dufour wrote:
> [..]
>> That's right, but here it is the sequence counter mm->mm_seq, not the
>> vm_seq one.
>
> d'oh... you are right.

So I have doubts about the probability of a deadlock here, but I don't
like to see lockdep complaining. Is there an easy way to make it happy?
On (09/14/17 11:15), Laurent Dufour wrote:
> On 14/09/2017 11:11, Sergey Senozhatsky wrote:
> > On (09/14/17 10:58), Laurent Dufour wrote:
> > [..]
> >> That's right, but here it is the sequence counter mm->mm_seq, not the
> >> vm_seq one.
> >
> > d'oh... you are right.
>
> So I have doubts about the probability of a deadlock here, but I don't
> like to see lockdep complaining. Is there an easy way to make it happy?

	/*
	 * well... answering your question - it seems raw versions of seqcount
	 * functions don't call lockdep's lock_acquire/lock_release...
	 *
	 * but I have never told you that. never.
	 */

lockdep, perhaps, can be wrong sometimes, and may be it's one of those
cases. may be not... I'm not a MM guy myself.

below is a lockdep splat I got yesterday. that's v3 of the SPF patch set.

[ 2763.365898] ======================================================
[ 2763.365899] WARNING: possible circular locking dependency detected
[ 2763.365902] 4.13.0-next-20170913-dbg-00039-ge3c06ea4b028-dirty #1837 Not tainted
[ 2763.365903] ------------------------------------------------------
[ 2763.365905] khugepaged/42 is trying to acquire lock:
[ 2763.365906]  (&mapping->i_mmap_rwsem){++++}, at: [<ffffffff811181cc>] rmap_walk_file+0x5a/0x142
[ 2763.365913]
               but task is already holding lock:
[ 2763.365915]  (fs_reclaim){+.+.}, at: [<ffffffff810e99dc>] fs_reclaim_acquire+0x12/0x35
[ 2763.365920]
               which lock already depends on the new lock.

[ 2763.365922]
               the existing dependency chain (in reverse order) is:
[ 2763.365924]
               -> #3 (fs_reclaim){+.+.}:
[ 2763.365930]        lock_acquire+0x176/0x19e
[ 2763.365932]        fs_reclaim_acquire+0x32/0x35
[ 2763.365934]        __alloc_pages_nodemask+0x6d/0x1f9
[ 2763.365937]        pte_alloc_one+0x17/0x62
[ 2763.365940]        __pte_alloc+0x1f/0x83
[ 2763.365943]        move_page_tables+0x2c3/0x5a2
[ 2763.365944]        move_vma.isra.25+0xff/0x29f
[ 2763.365946]        SyS_mremap+0x41b/0x49e
[ 2763.365949]        entry_SYSCALL_64_fastpath+0x18/0xad
[ 2763.365951]
               -> #2 (&vma->vm_sequence/1){+.+.}:
[ 2763.365955]        lock_acquire+0x176/0x19e
[ 2763.365958]        write_seqcount_begin_nested+0x1b/0x1d
[ 2763.365959]        __vma_adjust+0x1c4/0x5f1
[ 2763.365961]        __split_vma+0x12c/0x181
[ 2763.365963]        do_munmap+0x128/0x2af
[ 2763.365965]        vm_munmap+0x5a/0x73
[ 2763.365968]        elf_map+0xb1/0xce
[ 2763.365970]        load_elf_binary+0x91e/0x137a
[ 2763.365973]        search_binary_handler+0x70/0x1f3
[ 2763.365974]        do_execveat_common+0x45e/0x68e
[ 2763.365978]        call_usermodehelper_exec_async+0xf7/0x11f
[ 2763.365980]        ret_from_fork+0x27/0x40
[ 2763.365981]
               -> #1 (&vma->vm_sequence){+.+.}:
[ 2763.365985]        lock_acquire+0x176/0x19e
[ 2763.365987]        write_seqcount_begin_nested+0x1b/0x1d
[ 2763.365989]        __vma_adjust+0x1a9/0x5f1
[ 2763.365991]        __split_vma+0x12c/0x181
[ 2763.365993]        do_munmap+0x128/0x2af
[ 2763.365994]        vm_munmap+0x5a/0x73
[ 2763.365996]        elf_map+0xb1/0xce
[ 2763.365998]        load_elf_binary+0x91e/0x137a
[ 2763.365999]        search_binary_handler+0x70/0x1f3
[ 2763.366001]        do_execveat_common+0x45e/0x68e
[ 2763.366003]        call_usermodehelper_exec_async+0xf7/0x11f
[ 2763.366005]        ret_from_fork+0x27/0x40
[ 2763.366006]
               -> #0 (&mapping->i_mmap_rwsem){++++}:
[ 2763.366010]        __lock_acquire+0xa72/0xca0
[ 2763.366012]        lock_acquire+0x176/0x19e
[ 2763.366015]        down_read+0x3b/0x55
[ 2763.366017]        rmap_walk_file+0x5a/0x142
[ 2763.366018]        page_referenced+0xfc/0x134
[ 2763.366022]        shrink_active_list+0x1ac/0x37d
[ 2763.366024]        shrink_node_memcg.constprop.72+0x3ca/0x567
[ 2763.366026]        shrink_node+0x3f/0x14c
[ 2763.366028]        try_to_free_pages+0x288/0x47a
[ 2763.366030]        __alloc_pages_slowpath+0x3a7/0xa49
[ 2763.366032]        __alloc_pages_nodemask+0xf1/0x1f9
[ 2763.366035]        khugepaged+0xc8/0x167c
[ 2763.366037]        kthread+0x133/0x13b
[ 2763.366039]        ret_from_fork+0x27/0x40
[ 2763.366040]
               other info that might help us debug this:

[ 2763.366042] Chain exists of:
                 &mapping->i_mmap_rwsem --> &vma->vm_sequence/1 --> fs_reclaim

[ 2763.366048]  Possible unsafe locking scenario:

[ 2763.366049]        CPU0                    CPU1
[ 2763.366050]        ----                    ----
[ 2763.366051]   lock(fs_reclaim);
[ 2763.366054]                                lock(&vma->vm_sequence/1);
[ 2763.366056]                                lock(fs_reclaim);
[ 2763.366058]   lock(&mapping->i_mmap_rwsem);
[ 2763.366061]
                *** DEADLOCK ***

[ 2763.366063] 1 lock held by khugepaged/42:
[ 2763.366064]  #0:  (fs_reclaim){+.+.}, at: [<ffffffff810e99dc>] fs_reclaim_acquire+0x12/0x35
[ 2763.366068]
               stack backtrace:
[ 2763.366071] CPU: 2 PID: 42 Comm: khugepaged Not tainted 4.13.0-next-20170913-dbg-00039-ge3c06ea4b028-dirty #1837
[ 2763.366073] Call Trace:
[ 2763.366077]  dump_stack+0x67/0x8e
[ 2763.366080]  print_circular_bug+0x2a1/0x2af
[ 2763.366083]  ? graph_unlock+0x69/0x69
[ 2763.366085]  check_prev_add+0x76/0x20d
[ 2763.366087]  ? graph_unlock+0x69/0x69
[ 2763.366090]  __lock_acquire+0xa72/0xca0
[ 2763.366093]  ? __save_stack_trace+0xa3/0xbf
[ 2763.366096]  lock_acquire+0x176/0x19e
[ 2763.366098]  ? rmap_walk_file+0x5a/0x142
[ 2763.366100]  down_read+0x3b/0x55
[ 2763.366102]  ? rmap_walk_file+0x5a/0x142
[ 2763.366103]  rmap_walk_file+0x5a/0x142
[ 2763.366106]  page_referenced+0xfc/0x134
[ 2763.366108]  ? page_vma_mapped_walk_done.isra.17+0xb/0xb
[ 2763.366109]  ? page_get_anon_vma+0x6d/0x6d
[ 2763.366112]  shrink_active_list+0x1ac/0x37d
[ 2763.366115]  shrink_node_memcg.constprop.72+0x3ca/0x567
[ 2763.366118]  ? ___might_sleep+0xd5/0x234
[ 2763.366121]  shrink_node+0x3f/0x14c
[ 2763.366123]  try_to_free_pages+0x288/0x47a
[ 2763.366126]  __alloc_pages_slowpath+0x3a7/0xa49
[ 2763.366128]  ? ___might_sleep+0xd5/0x234
[ 2763.366131]  __alloc_pages_nodemask+0xf1/0x1f9
[ 2763.366133]  khugepaged+0xc8/0x167c
[ 2763.366138]  ? remove_wait_queue+0x47/0x47
[ 2763.366140]  ? collapse_shmem.isra.45+0x828/0x828
[ 2763.366142]  kthread+0x133/0x13b
[ 2763.366145]  ? __list_del_entry+0x1d/0x1d
[ 2763.366147]  ret_from_fork+0x27/0x40

	-ss
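A note on fs_reclaim, since it appears as a lock in the splat above: it is
not a lock anyone can block on, but a lockdep-only map recording "this
allocation may recurse into reclaim". A rough paraphrase of the annotation
in linux-next of that time (mm/page_alloc.c); the gfp test is simplified
here, the real __need_fs_reclaim() excludes a few more cases:

	/*
	 * Rough paraphrase of the fs_reclaim lockdep annotation from
	 * linux-next mm/page_alloc.c of that era. Simplified: the real
	 * code uses __need_fs_reclaim(), which filters out more cases.
	 */
	static struct lockdep_map __fs_reclaim_map =
		STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);

	void fs_reclaim_acquire(gfp_t gfp_mask)
	{
		if (gfp_mask & __GFP_DIRECT_RECLAIM)	/* simplified check */
			lock_map_acquire(&__fs_reclaim_map);
	}

	void fs_reclaim_release(gfp_t gfp_mask)
	{
		if (gfp_mask & __GFP_DIRECT_RECLAIM)	/* simplified check */
			lock_map_release(&__fs_reclaim_map);
	}

prepare_alloc_pages() does an acquire/release pair purely so that lockdep
records "allocating under these locks may reach direct reclaim", even when
no reclaim actually happens on that allocation.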
Hi,

On 14/09/2017 11:40, Sergey Senozhatsky wrote:
> On (09/14/17 11:15), Laurent Dufour wrote:
>> On 14/09/2017 11:11, Sergey Senozhatsky wrote:
>>> On (09/14/17 10:58), Laurent Dufour wrote:
>>> [..]
>>>> That's right, but here it is the sequence counter mm->mm_seq, not the
>>>> vm_seq one.
>>>
>>> d'oh... you are right.
>>
>> So I have doubts about the probability of a deadlock here, but I don't
>> like to see lockdep complaining. Is there an easy way to make it happy?
>
> 	/*
> 	 * well... answering your question - it seems raw versions of seqcount
> 	 * functions don't call lockdep's lock_acquire/lock_release...
> 	 *
> 	 * but I have never told you that. never.
> 	 */

Hum... I'm not sure that would be the best way, since in other cases the
lockdep checks are valid, but if getting rid of lockdep's warning is
required to get this series upstream, I'd use the raw versions... Please
advise...

> lockdep, perhaps, can be wrong sometimes, and may be it's one of those
> cases. may be not... I'm not a MM guy myself.

From the code reading I can't see any valid reason for a circular lock
dependency.

> below is a lockdep splat I got yesterday. that's v3 of the SPF patch set.

This is exactly the same as you got previously, and I still can't see how
the chain "&mapping->i_mmap_rwsem --> &vma->vm_sequence/1 --> fs_reclaim"
could happen.

Cheers,
Laurent.

> [ 2763.365898] ======================================================
> [ 2763.365899] WARNING: possible circular locking dependency detected
> [ 2763.365902] 4.13.0-next-20170913-dbg-00039-ge3c06ea4b028-dirty #1837 Not tainted
> [ 2763.365903] ------------------------------------------------------
> [ 2763.365905] khugepaged/42 is trying to acquire lock:
> [ 2763.365906]  (&mapping->i_mmap_rwsem){++++}, at: [<ffffffff811181cc>] rmap_walk_file+0x5a/0x142
> [ 2763.365913]
>                but task is already holding lock:
> [ 2763.365915]  (fs_reclaim){+.+.}, at: [<ffffffff810e99dc>] fs_reclaim_acquire+0x12/0x35
> [ 2763.365920]
>                which lock already depends on the new lock.
>
> [ 2763.365922]
>                the existing dependency chain (in reverse order) is:
> [ 2763.365924]
>                -> #3 (fs_reclaim){+.+.}:
> [ 2763.365930]        lock_acquire+0x176/0x19e
> [ 2763.365932]        fs_reclaim_acquire+0x32/0x35
> [ 2763.365934]        __alloc_pages_nodemask+0x6d/0x1f9
> [ 2763.365937]        pte_alloc_one+0x17/0x62
> [ 2763.365940]        __pte_alloc+0x1f/0x83
> [ 2763.365943]        move_page_tables+0x2c3/0x5a2
> [ 2763.365944]        move_vma.isra.25+0xff/0x29f
> [ 2763.365946]        SyS_mremap+0x41b/0x49e
> [ 2763.365949]        entry_SYSCALL_64_fastpath+0x18/0xad
> [ 2763.365951]
>                -> #2 (&vma->vm_sequence/1){+.+.}:
> [ 2763.365955]        lock_acquire+0x176/0x19e
> [ 2763.365958]        write_seqcount_begin_nested+0x1b/0x1d
> [ 2763.365959]        __vma_adjust+0x1c4/0x5f1
> [ 2763.365961]        __split_vma+0x12c/0x181
> [ 2763.365963]        do_munmap+0x128/0x2af
> [ 2763.365965]        vm_munmap+0x5a/0x73
> [ 2763.365968]        elf_map+0xb1/0xce
> [ 2763.365970]        load_elf_binary+0x91e/0x137a
> [ 2763.365973]        search_binary_handler+0x70/0x1f3
> [ 2763.365974]        do_execveat_common+0x45e/0x68e
> [ 2763.365978]        call_usermodehelper_exec_async+0xf7/0x11f
> [ 2763.365980]        ret_from_fork+0x27/0x40
> [ 2763.365981]
>                -> #1 (&vma->vm_sequence){+.+.}:
> [ 2763.365985]        lock_acquire+0x176/0x19e
> [ 2763.365987]        write_seqcount_begin_nested+0x1b/0x1d
> [ 2763.365989]        __vma_adjust+0x1a9/0x5f1
> [ 2763.365991]        __split_vma+0x12c/0x181
> [ 2763.365993]        do_munmap+0x128/0x2af
> [ 2763.365994]        vm_munmap+0x5a/0x73
> [ 2763.365996]        elf_map+0xb1/0xce
> [ 2763.365998]        load_elf_binary+0x91e/0x137a
> [ 2763.365999]        search_binary_handler+0x70/0x1f3
> [ 2763.366001]        do_execveat_common+0x45e/0x68e
> [ 2763.366003]        call_usermodehelper_exec_async+0xf7/0x11f
> [ 2763.366005]        ret_from_fork+0x27/0x40
> [ 2763.366006]
>                -> #0 (&mapping->i_mmap_rwsem){++++}:
> [ 2763.366010]        __lock_acquire+0xa72/0xca0
> [ 2763.366012]        lock_acquire+0x176/0x19e
> [ 2763.366015]        down_read+0x3b/0x55
> [ 2763.366017]        rmap_walk_file+0x5a/0x142
> [ 2763.366018]        page_referenced+0xfc/0x134
> [ 2763.366022]        shrink_active_list+0x1ac/0x37d
> [ 2763.366024]        shrink_node_memcg.constprop.72+0x3ca/0x567
> [ 2763.366026]        shrink_node+0x3f/0x14c
> [ 2763.366028]        try_to_free_pages+0x288/0x47a
> [ 2763.366030]        __alloc_pages_slowpath+0x3a7/0xa49
> [ 2763.366032]        __alloc_pages_nodemask+0xf1/0x1f9
> [ 2763.366035]        khugepaged+0xc8/0x167c
> [ 2763.366037]        kthread+0x133/0x13b
> [ 2763.366039]        ret_from_fork+0x27/0x40
> [ 2763.366040]
>                other info that might help us debug this:
>
> [ 2763.366042] Chain exists of:
>                  &mapping->i_mmap_rwsem --> &vma->vm_sequence/1 --> fs_reclaim
>
> [ 2763.366048]  Possible unsafe locking scenario:
>
> [ 2763.366049]        CPU0                    CPU1
> [ 2763.366050]        ----                    ----
> [ 2763.366051]   lock(fs_reclaim);
> [ 2763.366054]                                lock(&vma->vm_sequence/1);
> [ 2763.366056]                                lock(fs_reclaim);
> [ 2763.366058]   lock(&mapping->i_mmap_rwsem);
> [ 2763.366061]
>                 *** DEADLOCK ***
>
> [ 2763.366063] 1 lock held by khugepaged/42:
> [ 2763.366064]  #0:  (fs_reclaim){+.+.}, at: [<ffffffff810e99dc>] fs_reclaim_acquire+0x12/0x35
> [ 2763.366068]
>                stack backtrace:
> [ 2763.366071] CPU: 2 PID: 42 Comm: khugepaged Not tainted 4.13.0-next-20170913-dbg-00039-ge3c06ea4b028-dirty #1837
> [ 2763.366073] Call Trace:
> [ 2763.366077]  dump_stack+0x67/0x8e
> [ 2763.366080]  print_circular_bug+0x2a1/0x2af
> [ 2763.366083]  ? graph_unlock+0x69/0x69
> [ 2763.366085]  check_prev_add+0x76/0x20d
> [ 2763.366087]  ? graph_unlock+0x69/0x69
> [ 2763.366090]  __lock_acquire+0xa72/0xca0
> [ 2763.366093]  ? __save_stack_trace+0xa3/0xbf
> [ 2763.366096]  lock_acquire+0x176/0x19e
> [ 2763.366098]  ? rmap_walk_file+0x5a/0x142
> [ 2763.366100]  down_read+0x3b/0x55
> [ 2763.366102]  ? rmap_walk_file+0x5a/0x142
> [ 2763.366103]  rmap_walk_file+0x5a/0x142
> [ 2763.366106]  page_referenced+0xfc/0x134
> [ 2763.366108]  ? page_vma_mapped_walk_done.isra.17+0xb/0xb
> [ 2763.366109]  ? page_get_anon_vma+0x6d/0x6d
> [ 2763.366112]  shrink_active_list+0x1ac/0x37d
> [ 2763.366115]  shrink_node_memcg.constprop.72+0x3ca/0x567
> [ 2763.366118]  ? ___might_sleep+0xd5/0x234
> [ 2763.366121]  shrink_node+0x3f/0x14c
> [ 2763.366123]  try_to_free_pages+0x288/0x47a
> [ 2763.366126]  __alloc_pages_slowpath+0x3a7/0xa49
> [ 2763.366128]  ? ___might_sleep+0xd5/0x234
> [ 2763.366131]  __alloc_pages_nodemask+0xf1/0x1f9
> [ 2763.366133]  khugepaged+0xc8/0x167c
> [ 2763.366138]  ? remove_wait_queue+0x47/0x47
> [ 2763.366140]  ? collapse_shmem.isra.45+0x828/0x828
> [ 2763.366142]  kthread+0x133/0x13b
> [ 2763.366145]  ? __list_del_entry+0x1d/0x1d
> [ 2763.366147]  ret_from_fork+0x27/0x40
>
> 	-ss
On Fri, Sep 15, 2017 at 02:38:51PM +0200, Laurent Dufour wrote:
> > /*
> >  * well... answering your question - it seems raw versions of seqcount
> >  * functions don't call lockdep's lock_acquire/lock_release...
> >  *
> >  * but I have never told you that. never.
> >  */
>
> Hum... I'm not sure that would be the best way, since in other cases the
> lockdep checks are valid, but if getting rid of lockdep's warning is
> required to get this series upstream, I'd use the raw versions... Please
> advise...

No sensible other way to shut it up comes to mind though. Might be best to
use the raw primitives with a comment explaining why.
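A sketch of what that suggestion might look like in __vma_adjust(), assuming
the v3 hunks quoted below; the comment wording is illustrative, not from any
posted patch. Note that the raw helpers need no _nested variant, since no
lockdep subclass is involved:

	/*
	 * Illustrative sketch only, applied to the v3 hunks below.
	 * The write side of vm_sequence cannot block: speculative readers
	 * never spin on it, they bail out when the count is odd. The
	 * lockdep chain through fs_reclaim is therefore a false positive,
	 * so use the raw, un-annotated seqcount primitives.
	 */
	raw_write_seqcount_begin(&vma->vm_sequence);
	if (next && next != vma)
		raw_write_seqcount_begin(&next->vm_sequence);

	/* ... adjust the VMAs as before ... */

	if (next && next != vma)
		raw_write_seqcount_end(&next->vm_sequence);
	raw_write_seqcount_end(&vma->vm_sequence);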
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 46f4ecf5479a..df9a530c8ca1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -344,6 +344,7 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+	seqcount_t vm_sequence;
 } __randomize_layout;
 
 struct core_thread {
diff --git a/mm/memory.c b/mm/memory.c
index 530d887ca885..f250e7c92948 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1499,6 +1499,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 	unsigned long next;
 
 	BUG_ON(addr >= end);
+	write_seqcount_begin(&vma->vm_sequence);
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -1508,6 +1509,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
+	write_seqcount_end(&vma->vm_sequence);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 680506faceae..0a0012c7e50c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -558,6 +558,8 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	else
 		mm->highest_vm_end = vm_end_gap(vma);
 
+	seqcount_init(&vma->vm_sequence);
+
 	/*
 	 * vma->vm_prev wasn't known when we followed the rbtree to find the
 	 * correct insertion point for that vma. As a result, we could not
@@ -799,6 +801,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		}
 	}
 
+	write_seqcount_begin(&vma->vm_sequence);
+	if (next && next != vma)
+		write_seqcount_begin_nested(&next->vm_sequence,
+					    SINGLE_DEPTH_NESTING);
+
 	anon_vma = vma->anon_vma;
 	if (!anon_vma && adjust_next)
 		anon_vma = next->anon_vma;
@@ -903,6 +910,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		mm->map_count--;
 		mpol_put(vma_policy(next));
 		kmem_cache_free(vm_area_cachep, next);
+		write_seqcount_end(&next->vm_sequence);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -932,11 +940,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		if (remove_next == 2) {
 			remove_next = 1;
 			end = next->vm_end;
+			write_seqcount_end(&vma->vm_sequence);
 			goto again;
-		}
-		else if (next)
+		} else if (next) {
+			if (next != vma)
+				write_seqcount_begin_nested(&next->vm_sequence,
+					SINGLE_DEPTH_NESTING);
 			vma_gap_update(next);
-		else {
+		} else {
 			/*
 			 * If remove_next == 2 we obviously can't
 			 * reach this path.
@@ -962,6 +973,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	if (insert && file)
 		uprobe_mmap(insert);
 
+	if (next && next != vma)
+		write_seqcount_end(&next->vm_sequence);
+	write_seqcount_end(&vma->vm_sequence);
+
 	validate_mm(mm);
 
 	return 0;
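One detail worth spelling out from the __vma_adjust() hunks above:
vma->vm_sequence and next->vm_sequence are two instances of the same lockdep
class, so the second write section has to pass a subclass
(SINGLE_DEPTH_NESTING, i.e. 1) to avoid triggering lockdep's
recursive-locking check. That subclass is exactly what the splat earlier in
the thread prints as "&vma->vm_sequence/1":

	/*
	 * From the hunk above: the same lockdep class taken twice,
	 * distinguished by subclass for the nested write section.
	 */
	write_seqcount_begin(&vma->vm_sequence);	  /* "&vma->vm_sequence"   */
	write_seqcount_begin_nested(&next->vm_sequence,	  /* "&vma->vm_sequence/1" */
				    SINGLE_DEPTH_NESTING);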