From 677941da036e27de0418fa601b49f8c8c6ccf594 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 31 Jul 2012 16:46:20 -0700
Subject: [PATCH 56/70] mm: hugetlbfs: close race during teardown of hugetlbfs
 shared page tables

commit d833352a4338dc31295ed832a30c9ccff5c7a183 upstream.

If a process creates a large hugetlbfs mapping that is eligible for page
table sharing and forks heavily, with some children faulting and others
destroying the mapping, then it is possible for page tables to get
corrupted.  Some teardowns of the mapping encounter a "bad pmd" and
output a message to the kernel log.  The final teardown will trigger a
BUG_ON in mm/filemap.c.

This was reproduced in 3.4 but is known to have existed for a long time
and goes back at least as far as 2.6.37.  It was probably introduced in
2.6.20 by [39dde65c: shared page table for hugetlb page].  The messages
look like this:

[  ..........] Lots of bad pmd messages followed by this
[  127.164256] mm/memory.c:391: bad pmd ffff880412e04fe8(80000003de4000e7).
[  127.164257] mm/memory.c:391: bad pmd ffff880412e04ff0(80000003de6000e7).
[  127.164258] mm/memory.c:391: bad pmd ffff880412e04ff8(80000003de0000e7).
[  127.186778] ------------[ cut here ]------------
[  127.186781] kernel BUG at mm/filemap.c:134!
[  127.186782] invalid opcode: 0000 [#1] SMP
[  127.186783] CPU 7
[  127.186784] Modules linked in: af_packet cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd dm_mod coretemp crc32c_intel usb_storage ghash_clmulni_intel aesni_intel i2c_i801 r8169 mii uas sr_mod cdrom sg iTCO_wdt iTCO_vendor_support shpchp serio_raw cryptd aes_x86_64 e1000e pci_hotplug dcdbas aes_generic container microcode ext4 mbcache jbd2 crc16 sd_mod crc_t10dif i915 drm_kms_helper drm i2c_algo_bit ehci_hcd ahci libahci usbcore rtc_cmos usb_common button i2c_core intel_agp video intel_gtt fan processor thermal thermal_sys hwmon ata_generic pata_atiixp libata scsi_mod
[  127.186801]
[  127.186802] Pid: 9017, comm: hugetlbfs-test Not tainted 3.4.0-autobuild #53 Dell Inc. OptiPlex 990/06D7TR
[  127.186804] RIP: 0010:[<ffffffff810ed6ce>]  [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[  127.186809] RSP: 0000:ffff8804144b5c08  EFLAGS: 00010002
[  127.186810] RAX: 0000000000000001 RBX: ffffea000a5c9000 RCX: 00000000ffffffc0
[  127.186811] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff88042dfdad00
[  127.186812] RBP: ffff8804144b5c18 R08: 0000000000000009 R09: 0000000000000003
[  127.186813] R10: 0000000000000000 R11: 000000000000002d R12: ffff880412ff83d8
[  127.186814] R13: ffff880412ff83d8 R14: 0000000000000000 R15: ffff880412ff83d8
[  127.186815] FS:  00007fe18ed2c700(0000) GS:ffff88042dce0000(0000) knlGS:0000000000000000
[  127.186816] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[  127.186817] CR2: 00007fe340000503 CR3: 0000000417a14000 CR4: 00000000000407e0
[  127.186818] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  127.186819] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  127.186820] Process hugetlbfs-test (pid: 9017, threadinfo ffff8804144b4000, task ffff880417f803c0)
[  127.186821] Stack:
[  127.186822]  ffffea000a5c9000 0000000000000000 ffff8804144b5c48 ffffffff810ed83b
[  127.186824]  ffff8804144b5c48 000000000000138a 0000000000001387 ffff8804144b5c98
[  127.186825]  ffff8804144b5d48 ffffffff811bc925 ffff8804144b5cb8 0000000000000000
[  127.186827] Call Trace:
[  127.186829]  [<ffffffff810ed83b>] delete_from_page_cache+0x3b/0x80
[  127.186832]  [<ffffffff811bc925>] truncate_hugepages+0x115/0x220
[  127.186834]  [<ffffffff811bca43>] hugetlbfs_evict_inode+0x13/0x30
[  127.186837]  [<ffffffff811655c7>] evict+0xa7/0x1b0
[  127.186839]  [<ffffffff811657a3>] iput_final+0xd3/0x1f0
[  127.186840]  [<ffffffff811658f9>] iput+0x39/0x50
[  127.186842]  [<ffffffff81162708>] d_kill+0xf8/0x130
[  127.186843]  [<ffffffff81162812>] dput+0xd2/0x1a0
[  127.186845]  [<ffffffff8114e2d0>] __fput+0x170/0x230
[  127.186848]  [<ffffffff81236e0e>] ? rb_erase+0xce/0x150
[  127.186849]  [<ffffffff8114e3ad>] fput+0x1d/0x30
[  127.186851]  [<ffffffff81117db7>] remove_vma+0x37/0x80
[  127.186853]  [<ffffffff81119182>] do_munmap+0x2d2/0x360
[  127.186855]  [<ffffffff811cc639>] sys_shmdt+0xc9/0x170
[  127.186857]  [<ffffffff81410a39>] system_call_fastpath+0x16/0x1b
[  127.186858] Code: 0f 1f 44 00 00 48 8b 43 08 48 8b 00 48 8b 40 28 8b b0 40 03 00 00 85 f6 0f 88 df fe ff ff 48 89 df e8 e7 cb 05 00 e9 d2 fe ff ff <0f> 0b 55 83 e2 fd 48 89 e5 48 83 ec 30 48 89 5d d8 4c 89 65 e0
[  127.186868] RIP  [<ffffffff810ed6ce>] __delete_from_page_cache+0x15e/0x160
[  127.186870]  RSP <ffff8804144b5c08>
[  127.186871] ---[ end trace 7cbac5d1db69f426 ]---

The bug is a race and not always easy to reproduce.  To reproduce it I was
doing the following on a single socket I7-based machine with 16G of RAM.

$ hugeadm --pool-pages-max DEFAULT:13G
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmmax
$ echo $((18*1048576*1024)) > /proc/sys/kernel/shmall
$ for i in `seq 1 9000`; do ./hugetlbfs-test; done
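
(hugetlbfs-test is the reproducer quoted in full at the end of this
message.  The commit does not record a build step; assuming the listing
is saved as hugetlbfs-test.c, something along the lines of
"gcc -o hugetlbfs-test hugetlbfs-test.c" should suffice, as the program
needs no special libraries.)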

On my particular machine, it usually triggers within 10 minutes, but
enabling debug options can change the timing such that it never hits.
Once the bug is triggered, the machine is in trouble and needs to be
rebooted.  The machine will respond, but processes accessing proc, such
as "ps aux", will hang due to the BUG_ON.  shutdown will also hang and
needs a hard reset or a sysrq-b.

The basic problem is a race between page table sharing and teardown.  For
the most part, page table sharing depends on i_mmap_mutex.  In some cases
the mm->page_table_lock is also taken for the PTE updates, but with
shared page tables it is the i_mmap_mutex that matters most.
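
For reference, the eligibility test that i_mmap_mutex is meant to protect
looks roughly like the following.  This is a simplified sketch of the
3.2-era arch/x86/mm/hugetlbpage.c logic, not the verbatim kernel source;
the fix below works precisely because clearing VM_MAYSHARE makes this
test fail for a VMA under teardown.

	/*
	 * Sketch of the sharing-eligibility check performed on the
	 * hugetlb_fault path before a PMD page is shared: only VMAs
	 * marked VM_MAYSHARE that span an entire PUD-aligned range
	 * are candidates for page table sharing.
	 */
	static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
	{
		unsigned long base = addr & PUD_MASK;
		unsigned long end = base + PUD_SIZE;

		if (vma->vm_flags & VM_MAYSHARE &&
		    vma->vm_start <= base && end <= vma->vm_end)
			return 1;
		return 0;
	}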

Unfortunately, it also appears to be insufficient.  Consider the
following situation:

Process A                                       Process B
---------                                       ---------
hugetlb_fault                                   shmdt
                                                LockWrite(mmap_sem)
                                                  do_munmap
                                                    unmap_region
                                                      unmap_vmas
                                                        unmap_single_vma
                                                          unmap_hugepage_range
                                                            Lock(i_mmap_mutex)
                                                            Lock(mm->page_table_lock)
                                                            huge_pmd_unshare/unmap tables <--- (1)
                                                            Unlock(mm->page_table_lock)
                                                            Unlock(i_mmap_mutex)
  huge_pte_alloc                                      ...
    Lock(i_mmap_mutex)                                ...
    vma_prio_walk, find svma, spte                    ...
    Lock(mm->page_table_lock)                         ...
    share spte                                        ...
    Unlock(mm->page_table_lock)                       ...
    Unlock(i_mmap_mutex)                              ...
  hugetlb_no_page                                                             <--- (2)
                                                      free_pgtables
                                                        unlink_file_vma
                                                        hugetlb_free_pgd_range
                                                    remove_vma_list

In this scenario, it is possible for Process A to share page tables with
Process B while Process B is trying to tear them down.  The i_mmap_mutex
on its own does not prevent Process A from walking Process B's page
tables.  At (1) above, the page tables are not shared yet, so Process B
unmaps the PMDs.  Process A sets up page table sharing and at (2) faults
in a new entry.  Process B then trips up on it in free_pgtables.
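
This also appears to be where the "bad pmd" messages in the log above
come from: mm/memory.c:391 in a 3.4-era tree lands in the generic
"clear bad pmd" helpers, which have roughly the following shape
(abbreviated from the kernels of that era; details may differ).  A pmd
entry that is neither empty nor valid is reported and cleared so that
the page table walk can continue.

	/* Reports "<file>:<line>: bad pmd ..." via pmd_ERROR and zaps
	 * the entry so teardown can proceed. */
	void pmd_clear_bad(pmd_t *pmd)
	{
		pmd_ERROR(*pmd);
		pmd_clear(pmd);
	}

	static inline int pmd_none_or_clear_bad(pmd_t *pmd)
	{
		if (pmd_none(*pmd))
			return 1;
		if (unlikely(pmd_bad(*pmd))) {
			pmd_clear_bad(pmd);
			return 1;
		}
		return 0;
	}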

This patch fixes the problem by adding a new function,
__unmap_hugepage_range_final, that is only called when the VMA is about to
be destroyed.  This function clears VM_MAYSHARE during
unmap_hugepage_range() under the i_mmap_mutex.  This makes the VMA
ineligible for sharing and avoids the race.  Superficially this looks like
it would then be vulnerable to truncate and madvise issues, but hugetlbfs
has its own truncate handlers, so it does not use unmap_mapping_range()
and does not support madvise(DONTNEED).

This should be treated as a -stable candidate if it is merged.

The test program is as follows.  The test case was mostly written by
Michal Hocko, with a few minor changes to reproduce this bug.

==== CUT HERE ====

#include <stdio.h>      /* perror */
#include <stdlib.h>     /* srandom, random */
#include <unistd.h>     /* fork */
#include <sys/time.h>   /* gettimeofday */
#include <sys/ipc.h>
#include <sys/shm.h>    /* shmget, shmat, shmdt, shmctl, SHM_HUGETLB */
#include <sys/wait.h>   /* wait */

static size_t huge_page_size = (2UL << 20);
static size_t nr_huge_page_A = 512;
static size_t nr_huge_page_B = 5632;

unsigned int get_random(unsigned int max)
{
        struct timeval tv;

        gettimeofday(&tv, NULL);
        srandom(tv.tv_usec);
        return random() % max;
}

static void play(void *addr, size_t size)
{
        unsigned char *start = addr,
                      *end = start + size,
                      *a;
        start += get_random(size/2);

        /* we could iterate on huge pages but let's give it more time. */
        for (a = start; a < end; a += 4096)
                *a = 0;
}

int main(int argc, char **argv)
{
        key_t key = IPC_PRIVATE;
        size_t sizeA = nr_huge_page_A * huge_page_size;
        size_t sizeB = nr_huge_page_B * huge_page_size;
        int shmidA, shmidB;
        void *addrA = NULL, *addrB = NULL;
        int nr_children = 300, n = 0;

        if ((shmidA = shmget(key, sizeA, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
                perror("shmget:");
                return 1;
        }

        if ((addrA = shmat(shmidA, addrA, SHM_R|SHM_W)) == (void *)-1UL) {
                perror("shmat");
                return 1;
        }
        if ((shmidB = shmget(key, sizeB, IPC_CREAT|SHM_HUGETLB|0660)) == -1) {
                perror("shmget:");
                return 1;
        }

        if ((addrB = shmat(shmidB, addrB, SHM_R|SHM_W)) == (void *)-1UL) {
                perror("shmat");
                return 1;
        }

fork_child:
        switch(fork()) {
                case 0:
                        switch (n%3) {
                        case 0:
                                play(addrA, sizeA);
                                break;
                        case 1:
                                play(addrB, sizeB);
                                break;
                        case 2:
                                break;
                        }
                        break;
                case -1:
                        perror("fork:");
                        break;
                default:
                        if (++n < nr_children)
                                goto fork_child;
                        play(addrA, sizeA);
                        break;
        }
        shmdt(addrA);
        shmdt(addrB);
        do {
                wait(NULL);
        } while (--n > 0);
        shmctl(shmidA, IPC_RMID, NULL);
        shmctl(shmidB, IPC_RMID, NULL);
        return 0;
}

[akpm@linux-foundation.org: name the declaration's args, fix CONFIG_HUGETLBFS=n build]
Signed-off-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[bwh: Backported to 3.2:
 - Adjust context
 - Drop the mmu_gather * parameters]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
---
 include/linux/hugetlb.h |   10 ++++++++++
 mm/hugetlb.c            |   28 ++++++++++++++++++++++++++--
 mm/memory.c             |    7 +++++--
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c5ed2f1..a2227f7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -41,6 +41,9 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			unsigned long *, int *, int, unsigned int flags);
 void unmap_hugepage_range(struct vm_area_struct *,
 			unsigned long, unsigned long, struct page *);
+void __unmap_hugepage_range_final(struct vm_area_struct *vma,
+			  unsigned long start, unsigned long end,
+			  struct page *ref_page);
 void __unmap_hugepage_range(struct vm_area_struct *,
 			unsigned long, unsigned long, struct page *);
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
@@ -99,6 +102,13 @@ static inline unsigned long hugetlb_total_pages(void)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
 #define unmap_hugepage_range(vma, start, end, page)	BUG()
+static inline void __unmap_hugepage_range_final(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end,
+			struct page *ref_page)
+{
+	BUG();
+}
+
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b1e1bad..0f897b8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2382,6 +2382,25 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	}
 }
 
+void __unmap_hugepage_range_final(struct vm_area_struct *vma,
+			  unsigned long start, unsigned long end,
+			  struct page *ref_page)
+{
+	__unmap_hugepage_range(vma, start, end, ref_page);
+
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out.  We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * because in the context this is called, the VMA is about to be
+	 * destroyed and the i_mmap_mutex is held.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
+}
+
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
@@ -2939,9 +2958,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
diff --git a/mm/memory.c b/mm/memory.c
index 1b1ca17..70f5daf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1358,8 +1358,11 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
 				 * Since no pte has actually been setup, it is
 				 * safe to do nothing in this case.
 				 */
-				if (vma->vm_file)
-					unmap_hugepage_range(vma, start, end, NULL);
+				if (vma->vm_file) {
+					mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+					__unmap_hugepage_range_final(vma, start, end, NULL);
+					mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+				}
 
 				start = end;
 			} else
-- 
1.7.7.6