From 677941da036e27de0418fa601b49f8c8c6ccf594 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 31 Jul 2012 16:46:20 -0700
Subject: [PATCH 56/70] mm: hugetlbfs: close race during teardown of hugetlbfs
 shared page tables

commit d833352a4338dc31295ed832a30c9ccff5c7a183 upstream.

If a process creates a large hugetlbfs mapping that is eligible for page
table sharing and forks heavily, with some children faulting and others
destroying the mapping, then it is possible for page tables to get
corrupted.  Some teardowns of the mapping encounter a "bad pmd" and
output a message to the kernel log.  The final teardown will trigger a
BUG_ON in mm/filemap.c.

This was reproduced in 3.4 but is known to have existed for a long time
and goes back at least as far as 2.6.37.  It was probably introduced in
2.6.20 by [39dde65c: shared page table for hugetlb page].  The messages
look like this:

[  ..........] Lots of bad pmd messages followed by this
[  127.164256] mm/memory.c:391: bad pmd ffff880412e04fe8(80000003de4000e7).
[  127.164257] mm/memory.c:391: bad pmd ffff880412e04ff0(80000003de6000e7).
[  127.164258] mm/memory.c:391: bad pmd ffff880412e04ff8(80000003de0000e7).
[  127.186778] ------------[ cut here ]------------
[  127.186781] kernel BUG at mm/filemap.c:134!
[  127.186782] invalid opcode: 0000 [#1] SMP
[  127.186783] CPU 7
[  127.186784] Modules linked in: af_packet cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd dm_mod coretemp crc32c_intel usb_storage ghash_clmulni_intel aesni_intel i2c_i801 r8169 mii uas sr_mod cdrom sg iTCO_wdt iTCO_vendor_support shpchp serio_raw cryptd aes_x86_64 e1000e pci_hotplug dcdbas aes_generic container microcode ext4 mbcache jbd2 crc16 sd_mod crc_t10dif i915 drm_kms_helper drm i2c_algo_bit ehci_hcd ahci libahci usbcore rtc_cmos usb_common button i2c_core intel_agp video intel_gtt fan processor thermal thermal_sys hwmon ata_generic pata_atiixp libata scsi_mod
[  127.186801]
[  127.186802] Pid: 9017, comm: hugetlbfs-test Not tainted 3.4.0-autobuild #53 Dell Inc. OptiPlex 990/06D7TR
[  127.186804] RIP: 0010:[<ffffffff810ed6ce>]  [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[  127.186809] RSP: 0000:ffff8804144b5c08  EFLAGS: 00010002
[  127.186810] RAX: 0000000000000001 RBX: ffffea000a5c9000 RCX: 00000000ffffffc0
[  127.186811] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff88042dfdad00
[  127.186812] RBP: ffff8804144b5c18 R08: 0000000000000009 R09: 0000000000000003
[  127.186813] R10: 0000000000000000 R11: 000000000000002d R12: ffff880412ff83d8
[  127.186814] R13: ffff880412ff83d8 R14: 0000000000000000 R15: ffff880412ff83d8
[  127.186815] FS:  00007fe18ed2c700(0000) GS:ffff88042dce0000(0000) knlGS:0000000000000000
[  127.186816] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  127.186817] CR2: 00007fe340000503 CR3: 0000000417a14000 CR4: 00000000000407e0
[  127.186818] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  127.186819] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  127.186820] Process hugetlbfs-test (pid: 9017, threadinfo ffff8804144b4000, task ffff880417f803c0)
[  127.186821] Stack:
[  127.186822]  ffffea000a5c9000 0000000000000000 ffff8804144b5c48 ffffffff810ed83b
[  127.186824]  ffff8804144b5c48 000000000000138a 0000000000001387 ffff8804144b5c98
[  127.186825]  ffff8804144b5d48 ffffffff811bc925 ffff8804144b5cb8 0000000000000000
[  127.186827] Call Trace:
[  127.186829]  [<ffffffff810ed83b>] delete_from_page_cache+0x3b/0x80
[  127.186832]  [<ffffffff811bc925>] truncate_hugepages+0x115/0x220
[  127.186834]  [<ffffffff811bca43>] hugetlbfs_evict_inode+0x13/0x30
[  127.186837]  [<ffffffff811655c7>] evict+0xa7/0x1b0
[  127.186839]  [<ffffffff811657a3>] iput_final+0xd3/0x1f0
[  127.186840]  [<ffffffff811658f9>] iput+0x39/0x50
[  127.186842]  [<ffffffff81162708>] d_kill+0xf8/0x130
[  127.186843]  [<ffffffff81162812>] dput+0xd2/0x1a0
[  127.186845]  [<ffffffff8114e2d0>] __fput+0x170/0x230
[  127.186848]  [<ffffffff81236e0e>] ? rb_erase+0xce/0x150
[  127.186849]  [<ffffffff8114e3ad>] fput+0x1d/0x30
[  127.186851]  [<ffffffff81117db7>] remove_vma+0x37/0x80
[  127.186853]  [<ffffffff81119182>] do_munmap+0x2d2/0x360
[  127.186855]  [<ffffffff811cc639>] sys_shmdt+0xc9/0x170
[  127.186857]  [<ffffffff81410a39>] system_call_fastpath+0x16/0x1b
[  127.186858] Code: 0f 1f 44 00 00 48 8b 43 08 48 8b 00 48 8b 40 28 8b b0 40 03 00 00 85 f6 0f 88 df fe ff ff 48 89 df e8 e7 cb 05 00 e9 d2 fe ff ff <0f> 0b 55 83 e2 fd 48 89 e5 48 83 ec 30 48 89 5d d8 4c 89 65 e0
[  127.186868] RIP  [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[  127.186870]  RSP <ffff8804144b5c08>
[  127.186871] ---[ end trace 7cbac5d1db69f426 ]---

The bug is a race and not always easy to reproduce.  To reproduce it I was
doing the following on a single socket I7-based machine with 16G of RAM.

$ hugeadm --pool-pages-max DEFAULT:13G
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmmax
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmall
$ for i in `seq 1 9000`; do ./hugetlbfs-test; done
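
(hugetlbfs-test is the reproducer quoted in full at the end of this
message.  The commit does not record a build step; assuming the listing
is saved as hugetlbfs-test.c, something along the lines of
"gcc -o hugetlbfs-test hugetlbfs-test.c" should suffice, as the program
needs no special libraries.)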

On my particular machine, it usually triggers within 10 minutes, but
enabling debug options can change the timing such that it never hits.
Once the bug is triggered, the machine is in trouble and needs to be
rebooted.  The machine will respond, but processes accessing proc, such
as "ps aux", will hang due to the BUG_ON.  shutdown will also hang and
needs a hard reset or a sysrq-b.

The basic problem is a race between page table sharing and teardown.  For
the most part, page table sharing depends on i_mmap_mutex.  In some cases
the mm->page_table_lock is also taken for the PTE updates, but with
shared page tables it is the i_mmap_mutex that matters most.
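
For reference, the eligibility test that i_mmap_mutex is meant to protect
looks roughly like the following.  This is a simplified sketch of the
3.2-era arch/x86/mm/hugetlbpage.c logic, not the verbatim kernel source;
the fix below works precisely because clearing VM_MAYSHARE makes this
test fail for a VMA under teardown.

	/*
	 * Sketch of the sharing-eligibility check performed on the
	 * hugetlb_fault path before a PMD page is shared: only VMAs
	 * marked VM_MAYSHARE that span an entire PUD-aligned range
	 * are candidates for page table sharing.
	 */
	static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
	{
		unsigned long base = addr & PUD_MASK;
		unsigned long end = base + PUD_SIZE;

		if (vma->vm_flags & VM_MAYSHARE &&
		    vma->vm_start <= base && end <= vma->vm_end)
			return 1;
		return 0;
	}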

Unfortunately, it also appears to be insufficient.  Consider the
following situation:

Process A                                       Process B
---------                                       ---------
hugetlb_fault                                   shmdt
                                                LockWrite(mmap_sem)
                                                  do_munmap
                                                    unmap_region
                                                      unmap_vmas
                                                        unmap_single_vma
                                                          unmap_hugepage_range
                                                            Lock(i_mmap_mutex)
                                                            Lock(mm->page_table_lock)
                                                            huge_pmd_unshare/unmap tables <--- (1)
                                                            Unlock(mm->page_table_lock)
                                                            Unlock(i_mmap_mutex)
  huge_pte_alloc                                      ...
    Lock(i_mmap_mutex)                                ...
    vma_prio_walk, find svma, spte                    ...
    Lock(mm->page_table_lock)                         ...
    share spte                                        ...
    Unlock(mm->page_table_lock)                       ...
    Unlock(i_mmap_mutex)                              ...
  hugetlb_no_page                                                             <--- (2)
                                                      free_pgtables
                                                        unlink_file_vma
                                                        hugetlb_free_pgd_range
                                                    remove_vma_list

In this scenario, it is possible for Process A to share page tables with
Process B while Process B is trying to tear them down.  The i_mmap_mutex
on its own does not prevent Process A from walking Process B's page
tables.  At (1) above, the page tables are not shared yet, so Process B
unmaps the PMDs.  Process A sets up page table sharing and at (2) faults
in a new entry.  Process B then trips up on it in free_pgtables.
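
This also appears to be where the "bad pmd" messages in the log above
come from: mm/memory.c:391 in a 3.4-era tree lands in the generic
"clear bad pmd" helpers, which have roughly the following shape
(abbreviated from the kernels of that era; details may differ).  A pmd
entry that is neither empty nor valid is reported and cleared so that
the page table walk can continue.

	/* Reports "<file>:<line>: bad pmd ..." via pmd_ERROR and zaps
	 * the entry so teardown can proceed. */
	void pmd_clear_bad(pmd_t *pmd)
	{
		pmd_ERROR(*pmd);
		pmd_clear(pmd);
	}

	static inline int pmd_none_or_clear_bad(pmd_t *pmd)
	{
		if (pmd_none(*pmd))
			return 1;
		if (unlikely(pmd_bad(*pmd))) {
			pmd_clear_bad(pmd);
			return 1;
		}
		return 0;
	}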

This patch fixes the problem by adding a new function,
__unmap_hugepage_range_final, that is only called when the VMA is about to
be destroyed.  This function clears VM_MAYSHARE during
unmap_hugepage_range() under the i_mmap_mutex.  This makes the VMA
ineligible for sharing and avoids the race.  Superficially this looks like
it would then be vulnerable to truncate and madvise issues, but hugetlbfs
has its own truncate handlers, so it does not use unmap_mapping_range()
and does not support madvise(DONTNEED).

This should be treated as a -stable candidate if it is merged.

The test program is as follows.  The test case was mostly written by
Michal Hocko, with a few minor changes to reproduce this bug.

==== CUT HERE ====

#include <stdio.h>      /* perror */
#include <stdlib.h>     /* srandom, random */
#include <unistd.h>     /* fork */
#include <sys/time.h>   /* gettimeofday */
#include <sys/ipc.h>
#include <sys/shm.h>    /* shmget, shmat, shmdt, shmctl, SHM_HUGETLB */
#include <sys/wait.h>   /* wait */

static size_t huge_page_size = (2UL << 20);
static size_t nr_huge_page_A = 512;
static size_t nr_huge_page_B = 5632;

unsigned int get_random(unsigned int max)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        srandom(tv.tv_usec);
        return random() % max;
}

static void play(void *addr, size_t size)
{
        unsigned char *start = addr,
                      *end = start + size,
                      *a;
        start += get_random(size/2);

        /* we could iterate on huge pages but let's give it more time. */
        for (a = start; a < end; a += 4096)
                *a = 0;
}

int main(int argc, char **argv)
{
        key_t key = IPC_PRIVATE;
        size_t sizeA = nr_huge_page_A * huge_page_size;
        size_t sizeB = nr_huge_page_B * huge_page_size;
        int shmidA, shmidB;
        void *addrA = NULL, *addrB = NULL;
        int nr_children = 300, n = 0;

        if ((shmidA = shmget(key, sizeA, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
                perror("shmget:");
                return 1;
        }

        if ((addrA = shmat(shmidA, addrA, SHM_R|SHM_W)) == (void *)-1UL) {
                perror("shmat");
                return 1;
        }
        if ((shmidB = shmget(key, sizeB, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
                perror("shmget:");
                return 1;
        }

        if ((addrB = shmat(shmidB, addrB, SHM_R|SHM_W)) == (void *)-1UL) {
                perror("shmat");
                return 1;
        }

fork_child:
        switch(fork()) {
                case 0:
                        switch (n%3) {
                        case 0:
                                play(addrA, sizeA);
                                break;
                        case 1:
                                play(addrB, sizeB);
                                break;
                        case 2:
                                break;
                        }
                        break;
                case -1:
                        perror("fork:");
                        break;
                default:
                        if (++n < nr_children)
                                goto fork_child;
                        play(addrA, sizeA);
                        break;
        }
        shmdt(addrA);
        shmdt(addrB);
        do {
                wait(NULL);
        } while (--n > 0);
        shmctl(shmidA, IPC_RMID, NULL);
        shmctl(shmidB, IPC_RMID, NULL);
        return 0;
}

[akpm@linux-foundation.org: name the declaration's args, fix CONFIG_HUGETLBFS=n build]
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[bwh: Backported to 3.2:
 - Adjust context
 - Drop the mmu_gather * parameters]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 include/linux/hugetlb.h |   10 ++++++++++
 mm/hugetlb.c            |   28 ++++++++++++++++++++++++++--
 mm/memory.c             |    7 +++++--
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c5ed2f1..a2227f7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -41,6 +41,9 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			unsigned long *, int *, int, unsigned int flags);
 void unmap_hugepage_range(struct vm_area_struct *,
 			unsigned long, unsigned long, struct page *);
+void __unmap_hugepage_range_final(struct vm_area_struct *vma,
+			  unsigned long start, unsigned long end,
+			  struct page *ref_page);
 void __unmap_hugepage_range(struct vm_area_struct *,
 			unsigned long, unsigned long, struct page *);
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
@@ -99,6 +102,13 @@ static inline unsigned long hugetlb_total_pages(void)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
 #define unmap_hugepage_range(vma, start, end, page)	BUG()
+static inline void __unmap_hugepage_range_final(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end,
+			struct page *ref_page)
+{
+	BUG();
+}
+
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b1e1bad..0f897b8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2382,6 +2382,25 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	}
 }
 
+void __unmap_hugepage_range_final(struct vm_area_struct *vma,
+			  unsigned long start, unsigned long end,
+			  struct page *ref_page)
+{
+	__unmap_hugepage_range(vma, start, end, ref_page);
+
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out.  We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * because in the context this is called, the VMA is about to be
+	 * destroyed and the i_mmap_mutex is held.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
+}
+
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
@@ -2939,9 +2958,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
diff --git a/mm/memory.c b/mm/memory.c
index 1b1ca17..70f5daf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1358,8 +1358,11 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
 				 * Since no pte has actually been setup, it is
 				 * safe to do nothing in this case.
 				 */
-				if (vma->vm_file)
-					unmap_hugepage_range(vma, start, end, NULL);
+				if (vma->vm_file) {
+					mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+					__unmap_hugepage_range_final(vma, start, end, NULL);
+					mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				}
 
 				start = end;
 			} else
-- 
1.7.7.6