author     Todd Poynor  2012-11-01 15:36:34 -0500
committer  Todd Poynor  2012-11-01 15:36:34 -0500
commit     925d49abc38dcc7ef1cbfe125c6f0b2202ae3df3 (patch)
tree       a56506710f0340db055191e3cf0a207699c1b849 /mm
parent     834029ac9d0ad8dea4e6a21bc34877dc3740b9f4 (diff)
parent     27d0858dbcf199838b8c50a3e94d397bf326d986 (diff)
Merge remote-tracking branch 'stable/linux-3.0.y' into android-3.0
Change-Id: I9685feb9277b450da10d78a455b3c0674d6cfe18
Signed-off-by: Todd Poynor <toddpoynor@google.com>
Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c      |  31
-rw-r--r--  mm/filemap.c         |  11
-rw-r--r--  mm/hugetlb.c         |  68
-rw-r--r--  mm/madvise.c         |  16
-rw-r--r--  mm/memcontrol.c      |   9
-rw-r--r--  mm/memory-failure.c  |   6
-rw-r--r--  mm/memory_hotplug.c  |  18
-rw-r--r--  mm/mempolicy.c       | 148
-rw-r--r--  mm/migrate.c         | 240
-rw-r--r--  mm/mmu_notifier.c    |  45
-rw-r--r--  mm/nobootmem.c       |   3
-rw-r--r--  mm/page_alloc.c      | 120
-rw-r--r--  mm/percpu.c          |  10
-rw-r--r--  mm/shmem.c           |   6
-rw-r--r--  mm/slab.c            |  13
-rw-r--r--  mm/slub.c            |  40
-rw-r--r--  mm/truncate.c        |   3
-rw-r--r--  mm/vmalloc.c         |  11
-rw-r--r--  mm/vmscan.c          | 305
-rw-r--r--  mm/vmstat.c          |   2
20 files changed, 771 insertions, 334 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index c4bc5acf865..8ea7308601b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
         unsigned long migrate_pfn;      /* isolate_migratepages search base */
         bool sync;                      /* Synchronous migration */
 
-        /* Account for isolated anon and file pages */
-        unsigned long nr_anon;
-        unsigned long nr_file;
-
         unsigned int order;             /* order a direct compactor needs */
         int migratetype;                /* MOVABLE, RECLAIMABLE etc */
         struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
 static void acct_isolated(struct zone *zone, struct compact_control *cc)
 {
         struct page *page;
-        unsigned int count[NR_LRU_LISTS] = { 0, };
+        unsigned int count[2] = { 0, };
 
-        list_for_each_entry(page, &cc->migratepages, lru) {
-                int lru = page_lru_base_type(page);
-                count[lru]++;
-        }
+        list_for_each_entry(page, &cc->migratepages, lru)
+                count[!!page_is_file_cache(page)]++;
 
-        cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-        cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-        __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
-        __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+        __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+        __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
         unsigned long last_pageblock_nr = 0, pageblock_nr;
         unsigned long nr_scanned = 0, nr_isolated = 0;
         struct list_head *migratelist = &cc->migratepages;
+        isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
 
         /* Do not scan outside zone boundaries */
         low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -378,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                         continue;
                 }
 
+                if (!cc->sync)
+                        mode |= ISOLATE_ASYNC_MIGRATE;
+
                 /* Try isolate the page */
-                if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+                if (__isolate_lru_page(page, mode, 0) != 0)
                         continue;
 
                 VM_BUG_ON(PageTransCompound(page));
@@ -581,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                 nr_migrate = cc->nr_migratepages;
                 err = migrate_pages(&cc->migratepages, compaction_alloc,
                                 (unsigned long)cc, false,
-                                cc->sync);
+                                cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
                 update_nr_listpages(cc);
                 nr_remaining = cc->nr_migratepages;
 
@@ -596,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                 if (err) {
                         putback_lru_pages(&cc->migratepages);
                         cc->nr_migratepages = 0;
+                        if (err == -ENOMEM) {
+                                ret = COMPACT_PARTIAL;
+                                goto out;
+                        }
                 }
-
         }
 
 out:
diff --git a/mm/filemap.c b/mm/filemap.c
index b7d860390f3..10481ebd96c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -516,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
         struct page *page;
 
         if (cpuset_do_page_mem_spread()) {
-                get_mems_allowed();
-                n = cpuset_mem_spread_node();
-                page = alloc_pages_exact_node(n, gfp, 0);
-                put_mems_allowed();
+                unsigned int cpuset_mems_cookie;
+                do {
+                        cpuset_mems_cookie = get_mems_allowed();
+                        n = cpuset_mem_spread_node();
+                        page = alloc_pages_exact_node(n, gfp, 0);
+                } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
                 return page;
         }
         return alloc_pages(gfp, 0);
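
The hunk above is the filemap piece of the cpuset mems_allowed cookie scheme: read a cookie, attempt the allocation, and loop only when the cookie went stale and the allocation also failed. A rough userspace sketch of that retry shape, with a plain C11 atomic counter standing in for the kernel's cpuset generation and invented helper names (this is not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_uint mems_generation;   /* bumped by a hypothetical concurrent updater */

static unsigned int read_mems_cookie(void)
{
        return atomic_load_explicit(&mems_generation, memory_order_acquire);
}

static bool mems_cookie_valid(unsigned int cookie)
{
        return atomic_load_explicit(&mems_generation, memory_order_acquire) == cookie;
}

static void *alloc_from_current_nodes(size_t size)
{
        return malloc(size);           /* stand-in for the real node-aware allocator */
}

int main(void)
{
        void *page;
        unsigned int cookie;

        do {
                cookie = read_mems_cookie();
                page = alloc_from_current_nodes(4096);
                /* retry only if the generation moved AND the attempt failed */
        } while (!mems_cookie_valid(cookie) && !page);

        printf("allocated: %p\n", page);
        free(page);
        return 0;
}

The "&& !page" test is the important part: a successful allocation is kept even if the allowed-nodes set changed underneath it, which is why the loop rarely runs twice.
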
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f7001ac53b3..037f077b986 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
         struct zonelist *zonelist;
         struct zone *zone;
         struct zoneref *z;
+        unsigned int cpuset_mems_cookie;
 
-        get_mems_allowed();
+retry_cpuset:
+        cpuset_mems_cookie = get_mems_allowed();
         zonelist = huge_zonelist(vma, address,
                                         htlb_alloc_mask, &mpol, &nodemask);
         /*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                         }
                 }
         }
-err:
+
         mpol_cond_put(mpol);
-        put_mems_allowed();
+        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                goto retry_cpuset;
         return page;
+
+err:
+        mpol_cond_put(mpol);
+        return NULL;
 }
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -2060,6 +2067,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
         kref_get(&reservations->refs);
 }
 
+static void resv_map_put(struct vm_area_struct *vma)
+{
+        struct resv_map *reservations = vma_resv_map(vma);
+
+        if (!reservations)
+                return;
+        kref_put(&reservations->refs, resv_map_release);
+}
+
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
         struct hstate *h = hstate_vma(vma);
@@ -2075,7 +2091,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
                 reserve = (end - start) -
                         region_count(&reservations->regions, start, end);
 
-                kref_put(&reservations->refs, resv_map_release);
+                resv_map_put(vma);
 
                 if (reserve) {
                         hugetlb_acct_memory(h, -reserve);
@@ -2285,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
         mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
         __unmap_hugepage_range(vma, start, end, ref_page);
+        /*
+         * Clear this flag so that x86's huge_pmd_share page_table_shareable
+         * test will fail on a vma being torn down, and not grab a page table
+         * on its way out. We're lucky that the flag has such an appropriate
+         * name, and can in fact be safely cleared here. We could clear it
+         * before the __unmap_hugepage_range above, but all that's necessary
+         * is to clear it before releasing the i_mmap_mutex below.
+         *
+         * This works because in the contexts this is called, the VMA is
+         * going to be destroyed. It is not vunerable to madvise(DONTNEED)
+         * because madvise is not supported on hugetlbfs. The same applies
+         * for direct IO. unmap_hugepage_range() is only being called just
+         * before free_pgtables() so clearing VM_MAYSHARE will not cause
+         * surprises later.
+         */
+        vma->vm_flags &= ~VM_MAYSHARE;
         mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
@@ -2398,7 +2430,6 @@ retry_avoidcopy:
         if (outside_reserve) {
                 BUG_ON(huge_pte_none(pte));
                 if (unmap_ref_private(mm, vma, old_page, address)) {
-                        BUG_ON(page_count(old_page) != 1);
                         BUG_ON(huge_pte_none(pte));
                         spin_lock(&mm->page_table_lock);
                         goto retry_avoidcopy;
@@ -2838,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
                 }
         }
         spin_unlock(&mm->page_table_lock);
-        mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+        /*
+         * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+         * may have cleared our pud entry and done put_page on the page table:
+         * once we release i_mmap_mutex, another task can do the final put_page
+         * and that page table be reused and filled with junk.
+         */
         flush_tlb_range(vma, start, end);
+        mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
@@ -2878,12 +2914,16 @@ int hugetlb_reserve_pages(struct inode *inode,
                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
         }
 
-        if (chg < 0)
-                return chg;
+        if (chg < 0) {
+                ret = chg;
+                goto out_err;
+        }
 
         /* There must be enough filesystem quota for the mapping */
-        if (hugetlb_get_quota(inode->i_mapping, chg))
-                return -ENOSPC;
+        if (hugetlb_get_quota(inode->i_mapping, chg)) {
+                ret = -ENOSPC;
+                goto out_err;
+        }
 
         /*
          * Check enough hugepages are available for the reservation.
@@ -2892,7 +2932,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         ret = hugetlb_acct_memory(h, chg);
         if (ret < 0) {
                 hugetlb_put_quota(inode->i_mapping, chg);
-                return ret;
+                goto out_err;
         }
 
         /*
@@ -2909,6 +2949,10 @@ int hugetlb_reserve_pages(struct inode *inode,
         if (!vma || vma->vm_flags & VM_MAYSHARE)
                 region_add(&inode->i_mapping->private_list, from, to);
         return 0;
+out_err:
+        if (vma)
+                resv_map_put(vma);
+        return ret;
 }
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
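
The hugetlb error paths above all funnel through the new resv_map_put() helper, so the out_err label can drop the reservation-map reference without repeating the kref logic and without caring whether a map was ever attached. A minimal userspace sketch of that shape (simplified refcount and invented names, not the kernel data structure):

#include <stdio.h>
#include <stdlib.h>

struct resv_map_demo {
        int refs;
        /* ... region list would live here ... */
};

static void resv_map_release_demo(struct resv_map_demo *map)
{
        printf("releasing map %p\n", (void *)map);
        free(map);
}

static void resv_map_put_demo(struct resv_map_demo *map)
{
        if (!map)
                return;                 /* safe on paths that never attached a map */
        if (--map->refs == 0)
                resv_map_release_demo(map);
}

int main(void)
{
        struct resv_map_demo *map = malloc(sizeof(*map));
        if (!map)
                return 1;
        map->refs = 1;
        resv_map_put_demo(map);         /* normal teardown path */
        resv_map_put_demo(NULL);        /* error path with no map: still fine */
        return 0;
}
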
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed50..deabe5f603a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
+#include <linux/file.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma,
         struct address_space *mapping;
         loff_t offset, endoff;
         int error;
+        struct file *f;
 
         *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
 
         if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
                 return -EINVAL;
 
-        if (!vma->vm_file || !vma->vm_file->f_mapping
-                || !vma->vm_file->f_mapping->host) {
+        f = vma->vm_file;
+
+        if (!f || !f->f_mapping || !f->f_mapping->host) {
                 return -EINVAL;
         }
 
@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma,
         endoff = (loff_t)(end - vma->vm_start - 1)
                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-        /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+        /*
+         * vmtruncate_range may need to take i_mutex and i_alloc_sem.
+         * We need to explicitly grab a reference because the vma (and
+         * hence the vma's reference to the file) can go away as soon as
+         * we drop mmap_sem.
+         */
+        get_file(f);
         up_read(&current->mm->mmap_sem);
         error = vmtruncate_range(mapping->host, offset, endoff);
+        fput(f);
         down_read(&current->mm->mmap_sem);
         return error;
 }
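
The madvise_remove() change pins the struct file with get_file() before mmap_sem is dropped, because the vma, and with it the vma's reference to the file, may disappear as soon as the lock is released. A small pthread-based sketch of the same take-a-reference-before-unlocking idea, with made-up names and a plain atomic counter rather than the kernel's file refcounting:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct pinned_file {
        atomic_int refs;
        const char *name;
};

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pinned_file demo_file = { 1, "demo" };

static void file_get(struct pinned_file *f)
{
        atomic_fetch_add(&f->refs, 1);
}

static void file_put(struct pinned_file *f)
{
        atomic_fetch_sub(&f->refs, 1);  /* a real implementation would free at zero */
}

static void truncate_range(struct pinned_file *f)
{
        printf("truncating %s\n", f->name);
}

int main(void)
{
        pthread_mutex_lock(&map_lock);          /* analogue of holding mmap_sem */
        struct pinned_file *f = &demo_file;
        file_get(f);                            /* pin before dropping the lock */
        pthread_mutex_unlock(&map_lock);

        truncate_range(f);                      /* safe: our reference keeps f alive */
        file_put(f);

        pthread_mutex_lock(&map_lock);          /* re-acquire, as madvise_remove re-takes mmap_sem */
        pthread_mutex_unlock(&map_lock);
        return 0;
}
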
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 283068f5af9..57cdf5ad692 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                         struct list_head *dst,
                                         unsigned long *scanned, int order,
-                                        int mode, struct zone *z,
+                                        isolate_mode_t mode,
+                                        struct zone *z,
                                         struct mem_cgroup *mem_cont,
                                         int active, int file)
 {
@@ -4605,6 +4606,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 swap_buffers:
         /* Swap primary and spare array */
         thresholds->spare = thresholds->primary;
+        /* If all events are unregistered, free the spare array */
+        if (!new) {
+                kfree(thresholds->spare);
+                thresholds->spare = NULL;
+        }
+
         rcu_assign_pointer(thresholds->primary, new);
 
         /* To be sure that nobody uses thresholds */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059..2f49dcf4f47 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1334,8 +1334,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
         /* Keep page count to indicate a given hugepage is isolated. */
 
         list_add(&hpage->lru, &pagelist);
-        ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
-                                true);
+        ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
+                                MIGRATE_SYNC);
         if (ret) {
                 struct page *page1, *page2;
                 list_for_each_entry_safe(page1, page2, &pagelist, lru)
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
                                 page_is_file_cache(page));
         list_add(&page->lru, &pagelist);
         ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-                                                0, true);
+                                                false, MIGRATE_SYNC);
         if (ret) {
                 putback_lru_pages(&pagelist);
                 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11..e0a3e51d519 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -116,9 +116,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
         struct mem_section *ms;
         struct page *page, *memmap;
 
-        if (!pfn_valid(start_pfn))
-                return;
-
         section_nr = pfn_to_section_nr(start_pfn);
         ms = __nr_to_section(section_nr);
 
@@ -177,9 +174,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
         end_pfn = pfn + pgdat->node_spanned_pages;
 
         /* register_section info */
-        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
-                register_page_bootmem_info_section(pfn);
-
+        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+                /*
+                 * Some platforms can assign the same pfn to multiple nodes - on
+                 * node0 as well as nodeN. To avoid registering a pfn against
+                 * multiple nodes we check that this pfn does not already
+                 * reside in some other node.
+                 */
+                if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+                        register_page_bootmem_info_section(pfn);
+        }
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
@@ -747,7 +751,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                 }
                 /* this function returns # of failed pages */
                 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
-                                                                true, true);
+                                                                true, MIGRATE_SYNC);
                 if (ret)
                         putback_lru_pages(&source);
         }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a85171de5d0..5dce7d46f79 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -606,24 +606,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         return first;
 }
 
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+                                                struct mempolicy *pol)
 {
-        int err = 0;
-        struct mempolicy *old = vma->vm_policy;
+        int err;
+        struct mempolicy *old;
+        struct mempolicy *new;
 
         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
                  vma->vm_ops, vma->vm_file,
                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 
-        if (vma->vm_ops && vma->vm_ops->set_policy)
+        new = mpol_dup(pol);
+        if (IS_ERR(new))
+                return PTR_ERR(new);
+
+        if (vma->vm_ops && vma->vm_ops->set_policy) {
                 err = vma->vm_ops->set_policy(vma, new);
-        if (!err) {
-                mpol_get(new);
-                vma->vm_policy = new;
-                mpol_put(old);
+                if (err)
+                        goto err_out;
         }
+
+        old = vma->vm_policy;
+        vma->vm_policy = new; /* protected by mmap_sem */
+        mpol_put(old);
+
+        return 0;
+ err_out:
+        mpol_put(new);
         return err;
 }
 
@@ -666,7 +681,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
                         if (err)
                                 goto out;
                 }
-                err = policy_vma(vma, new_pol);
+                err = vma_replace_policy(vma, new_pol);
                 if (err)
                         goto out;
         }
@@ -933,7 +948,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
         if (!list_empty(&pagelist)) {
                 err = migrate_pages(&pagelist, new_node_page, dest,
-                                                        false, true);
+                                                        false, MIGRATE_SYNC);
                 if (err)
                         putback_lru_pages(&pagelist);
         }
@@ -1496,8 +1511,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
                                                                         addr);
                 if (vpol)
                         pol = vpol;
-        } else if (vma->vm_policy)
+        } else if (vma->vm_policy) {
                 pol = vma->vm_policy;
+
+                /*
+                 * shmem_alloc_page() passes MPOL_F_SHARED policy with
+                 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+                 * count on these policies which will be dropped by
+                 * mpol_cond_put() later
+                 */
+                if (mpol_needs_cond_ref(pol))
+                        mpol_get(pol);
+        }
         }
         if (!pol)
                 pol = &default_policy;
@@ -1817,18 +1842,24 @@ struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                 unsigned long addr, int node)
 {
-        struct mempolicy *pol = get_vma_policy(current, vma, addr);
+        struct mempolicy *pol;
         struct zonelist *zl;
         struct page *page;
+        unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+        pol = get_vma_policy(current, vma, addr);
+        cpuset_mems_cookie = get_mems_allowed();
 
-        get_mems_allowed();
         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
                 unsigned nid;
 
                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
                 mpol_cond_put(pol);
                 page = alloc_page_interleave(gfp, order, nid);
-                put_mems_allowed();
+                if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                        goto retry_cpuset;
+
                 return page;
         }
         zl = policy_zonelist(gfp, pol, node);
@@ -1839,7 +1870,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                 struct page *page = __alloc_pages_nodemask(gfp, order,
                                                 zl, policy_nodemask(gfp, pol));
                 __mpol_put(pol);
-                put_mems_allowed();
+                if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                        goto retry_cpuset;
                 return page;
         }
         /*
@@ -1847,7 +1879,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
          */
         page = __alloc_pages_nodemask(gfp, order, zl,
                       policy_nodemask(gfp, pol));
-        put_mems_allowed();
+        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                goto retry_cpuset;
         return page;
 }
 
@@ -1874,11 +1907,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
         struct mempolicy *pol = current->mempolicy;
         struct page *page;
+        unsigned int cpuset_mems_cookie;
 
         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
                 pol = &default_policy;
 
-        get_mems_allowed();
+retry_cpuset:
+        cpuset_mems_cookie = get_mems_allowed();
+
         /*
          * No reference counting needed for current->mempolicy
          * nor system default_policy
@@ -1889,7 +1925,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
                 page = __alloc_pages_nodemask(gfp, order,
                                 policy_zonelist(gfp, pol, numa_node_id()),
                                 policy_nodemask(gfp, pol));
-        put_mems_allowed();
+
+        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                goto retry_cpuset;
+
         return page;
 }
 EXPORT_SYMBOL(alloc_pages_current);
@@ -1992,7 +2031,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  */
 
 /* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
 static struct sp_node *
 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 {
@@ -2056,36 +2095,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
         if (!sp->root.rb_node)
                 return NULL;
-        spin_lock(&sp->lock);
+        mutex_lock(&sp->mutex);
         sn = sp_lookup(sp, idx, idx+1);
         if (sn) {
                 mpol_get(sn->policy);
                 pol = sn->policy;
         }
-        spin_unlock(&sp->lock);
+        mutex_unlock(&sp->mutex);
         return pol;
 }
 
+static void sp_free(struct sp_node *n)
+{
+        mpol_put(n->policy);
+        kmem_cache_free(sn_cache, n);
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
         rb_erase(&n->nd, &sp->root);
-        mpol_put(n->policy);
-        kmem_cache_free(sn_cache, n);
+        sp_free(n);
 }
 
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                 struct mempolicy *pol)
 {
-        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+        struct sp_node *n;
+        struct mempolicy *newpol;
 
+        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
         if (!n)
                 return NULL;
+
+        newpol = mpol_dup(pol);
+        if (IS_ERR(newpol)) {
+                kmem_cache_free(sn_cache, n);
+                return NULL;
+        }
+        newpol->flags |= MPOL_F_SHARED;
+
         n->start = start;
         n->end = end;
-        mpol_get(pol);
-        pol->flags |= MPOL_F_SHARED;    /* for unref */
-        n->policy = pol;
+        n->policy = newpol;
+
         return n;
 }
 
@@ -2093,10 +2146,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                  unsigned long end, struct sp_node *new)
 {
-        struct sp_node *n, *new2 = NULL;
+        struct sp_node *n;
+        int ret = 0;
 
-restart:
-        spin_lock(&sp->lock);
+        mutex_lock(&sp->mutex);
         n = sp_lookup(sp, start, end);
         /* Take care of old policies in the same range. */
         while (n && n->start < end) {
@@ -2109,16 +2162,14 @@ restart:
                 } else {
                         /* Old policy spanning whole new range. */
                         if (n->end > end) {
+                                struct sp_node *new2;
+                                new2 = sp_alloc(end, n->end, n->policy);
                                 if (!new2) {
-                                        spin_unlock(&sp->lock);
-                                        new2 = sp_alloc(end, n->end, n->policy);
-                                        if (!new2)
-                                                return -ENOMEM;
-                                        goto restart;
+                                        ret = -ENOMEM;
+                                        goto out;
                                 }
                                 n->end = start;
                                 sp_insert(sp, new2);
-                                new2 = NULL;
                                 break;
                         } else
                                 n->end = start;
@@ -2129,12 +2180,9 @@ restart:
         }
         if (new)
                 sp_insert(sp, new);
-        spin_unlock(&sp->lock);
-        if (new2) {
-                mpol_put(new2->policy);
-                kmem_cache_free(sn_cache, new2);
-        }
-        return 0;
+out:
+        mutex_unlock(&sp->mutex);
+        return ret;
 }
 
 /**
@@ -2152,7 +2200,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
         int ret;
 
         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
-        spin_lock_init(&sp->lock);
+        mutex_init(&sp->mutex);
 
         if (mpol) {
                 struct vm_area_struct pvma;
@@ -2206,7 +2254,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
         }
         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
         if (err && new)
-                kmem_cache_free(sn_cache, new);
+                sp_free(new);
         return err;
 }
 
@@ -2218,16 +2266,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
 
         if (!p->root.rb_node)
                 return;
-        spin_lock(&p->lock);
+        mutex_lock(&p->mutex);
         next = rb_first(&p->root);
         while (next) {
                 n = rb_entry(next, struct sp_node, nd);
                 next = rb_next(&n->nd);
-                rb_erase(&n->nd, &p->root);
-                mpol_put(n->policy);
-                kmem_cache_free(sn_cache, n);
+                sp_delete(p, n);
         }
-        spin_unlock(&p->lock);
+        mutex_unlock(&p->mutex);
 }
 
 /* assumes fs == KERNEL_DS */
@@ -2493,7 +2539,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
                 break;
 
         default:
-                BUG();
+                return -EINVAL;
         }
 
         l = strlen(policy_modes[mode]);
diff --git a/mm/migrate.c b/mm/migrate.c
index 14d0a6a632f..480714b6f3f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -220,6 +220,56 @@ out:
         pte_unmap_unlock(ptep, ptl);
 }
 
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+                                                        enum migrate_mode mode)
+{
+        struct buffer_head *bh = head;
+
+        /* Simple case, sync compaction */
+        if (mode != MIGRATE_ASYNC) {
+                do {
+                        get_bh(bh);
+                        lock_buffer(bh);
+                        bh = bh->b_this_page;
+
+                } while (bh != head);
+
+                return true;
+        }
+
+        /* async case, we cannot block on lock_buffer so use trylock_buffer */
+        do {
+                get_bh(bh);
+                if (!trylock_buffer(bh)) {
+                        /*
+                         * We failed to lock the buffer and cannot stall in
+                         * async migration. Release the taken locks
+                         */
+                        struct buffer_head *failed_bh = bh;
+                        put_bh(failed_bh);
+                        bh = head;
+                        while (bh != failed_bh) {
+                                unlock_buffer(bh);
+                                put_bh(bh);
+                                bh = bh->b_this_page;
+                        }
+                        return false;
+                }
+
+                bh = bh->b_this_page;
+        } while (bh != head);
+        return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+                                                        enum migrate_mode mode)
+{
+        return true;
+}
+#endif /* CONFIG_BLOCK */
+
 /*
  * Replace the page in the mapping.
  *
@@ -229,7 +279,8 @@ out:
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
 static int migrate_page_move_mapping(struct address_space *mapping,
-                struct page *newpage, struct page *page)
+                struct page *newpage, struct page *page,
+                struct buffer_head *head, enum migrate_mode mode)
 {
         int expected_count;
         void **pslot;
@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
         }
 
         /*
+         * In the async migration case of moving a page with buffers, lock the
+         * buffers using trylock before the mapping is moved. If the mapping
+         * was moved, we later failed to lock the buffers and could not move
+         * the mapping back due to an elevated page count, we would have to
+         * block waiting on other references to be dropped.
+         */
+        if (mode == MIGRATE_ASYNC && head &&
+                        !buffer_migrate_lock_buffers(head, mode)) {
+                page_unfreeze_refs(page, expected_count);
+                spin_unlock_irq(&mapping->tree_lock);
+                return -EAGAIN;
+        }
+
+        /*
          * Now we know that no one else is looking at the page.
          */
         get_page(newpage);      /* add cache reference */
@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page);
  * Pages are locked upon entry and exit.
  */
 int migrate_page(struct address_space *mapping,
-                struct page *newpage, struct page *page)
+                struct page *newpage, struct page *page,
+                enum migrate_mode mode)
 {
         int rc;
 
         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 
-        rc = migrate_page_move_mapping(mapping, newpage, page);
+        rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
 
         if (rc)
                 return rc;
@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page);
  * exist.
  */
 int buffer_migrate_page(struct address_space *mapping,
-                struct page *newpage, struct page *page)
+                struct page *newpage, struct page *page, enum migrate_mode mode)
 {
         struct buffer_head *bh, *head;
         int rc;
 
         if (!page_has_buffers(page))
-                return migrate_page(mapping, newpage, page);
+                return migrate_page(mapping, newpage, page, mode);
 
         head = page_buffers(page);
 
-        rc = migrate_page_move_mapping(mapping, newpage, page);
+        rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
 
         if (rc)
                 return rc;
 
-        bh = head;
-        do {
-                get_bh(bh);
-                lock_buffer(bh);
-                bh = bh->b_this_page;
-
-        } while (bh != head);
+        /*
+         * In the async case, migrate_page_move_mapping locked the buffers
+         * with an IRQ-safe spinlock held. In the sync case, the buffers
+         * need to be locked now
+         */
+        if (mode != MIGRATE_ASYNC)
+                BUG_ON(!buffer_migrate_lock_buffers(head, mode));
 
         ClearPagePrivate(page);
         set_page_private(newpage, page_private(page));
@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page)
  * Default handling if a filesystem does not provide a migration function.
  */
 static int fallback_migrate_page(struct address_space *mapping,
-        struct page *newpage, struct page *page)
+        struct page *newpage, struct page *page, enum migrate_mode mode)
 {
-        if (PageDirty(page))
+        if (PageDirty(page)) {
+                /* Only writeback pages in full synchronous migration */
+                if (mode != MIGRATE_SYNC)
+                        return -EBUSY;
                 return writeout(mapping, page);
+        }
 
         /*
          * Buffers may be managed in a filesystem specific way.
@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping,
             !try_to_release_page(page, GFP_KERNEL))
                 return -EAGAIN;
 
-        return migrate_page(mapping, newpage, page);
+        return migrate_page(mapping, newpage, page, mode);
 }
 
 /*
@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *   == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-                                int remap_swapcache, bool sync)
+                                int remap_swapcache, enum migrate_mode mode)
 {
         struct address_space *mapping;
         int rc;
@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 
         mapping = page_mapping(page);
         if (!mapping)
-                rc = migrate_page(mapping, newpage, page);
-        else {
+                rc = migrate_page(mapping, newpage, page, mode);
+        else if (mapping->a_ops->migratepage)
                 /*
-                 * Do not writeback pages if !sync and migratepage is
-                 * not pointing to migrate_page() which is nonblocking
-                 * (swapcache/tmpfs uses migratepage = migrate_page).
+                 * Most pages have a mapping and most filesystems provide a
+                 * migratepage callback. Anonymous pages are part of swap
+                 * space which also has its own migratepage callback. This
+                 * is the most common path for page migration.
                  */
-                if (PageDirty(page) && !sync &&
-                    mapping->a_ops->migratepage != migrate_page)
-                        rc = -EBUSY;
-                else if (mapping->a_ops->migratepage)
-                        /*
-                         * Most pages have a mapping and most filesystems
-                         * should provide a migration function. Anonymous
-                         * pages are part of swap space which also has its
-                         * own migration function. This is the most common
-                         * path for page migration.
-                         */
-                        rc = mapping->a_ops->migratepage(mapping,
-                                                        newpage, page);
-                else
-                        rc = fallback_migrate_page(mapping, newpage, page);
-        }
+                rc = mapping->a_ops->migratepage(mapping,
+                                                newpage, page, mode);
+        else
+                rc = fallback_migrate_page(mapping, newpage, page, mode);
 
         if (rc) {
                 newpage->mapping = NULL;
@@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
         return rc;
 }
 
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-                        struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+                        int force, bool offlining, enum migrate_mode mode)
 {
-        int rc = 0;
-        int *result = NULL;
-        struct page *newpage = get_new_page(page, private, &result);
+        int rc = -EAGAIN;
         int remap_swapcache = 1;
         int charge = 0;
         struct mem_cgroup *mem;
         struct anon_vma *anon_vma = NULL;
 
-        if (!newpage)
-                return -ENOMEM;
-
-        if (page_count(page) == 1) {
-                /* page was freed from under us. So we are done. */
-                goto move_newpage;
-        }
-        if (unlikely(PageTransHuge(page)))
-                if (unlikely(split_huge_page(page)))
-                        goto move_newpage;
-
-        /* prepare cgroup just returns 0 or -ENOMEM */
-        rc = -EAGAIN;
-
         if (!trylock_page(page)) {
-                if (!force || !sync)
-                        goto move_newpage;
+                if (!force || mode == MIGRATE_ASYNC)
+                        goto out;
 
                 /*
                  * It's not safe for direct compaction to call lock_page.
@@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                  * altogether.
                  */
                 if (current->flags & PF_MEMALLOC)
-                        goto move_newpage;
+                        goto out;
 
                 lock_page(page);
         }
@@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 
         if (PageWriteback(page)) {
                 /*
-                 * For !sync, there is no point retrying as the retry loop
-                 * is expected to be too short for PageWriteback to be cleared
+                 * Only in the case of a full syncronous migration is it
+                 * necessary to wait for PageWriteback. In the async case,
+                 * the retry loop is too short and in the sync-light case,
+                 * the overhead of stalling is too much
                  */
-                if (!sync) {
+                if (mode != MIGRATE_SYNC) {
                         rc = -EBUSY;
                         goto uncharge;
                 }
@@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 
 skip_unmap:
         if (!page_mapped(page))
-                rc = move_to_new_page(newpage, page, remap_swapcache, sync);
+                rc = move_to_new_page(newpage, page, remap_swapcache, mode);
 
         if (rc && remap_swapcache)
                 remove_migration_ptes(page, page);
@@ -785,27 +826,53 @@ uncharge:
         mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 unlock:
         unlock_page(page);
+out:
+        return rc;
+}
 
-move_newpage:
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+                        struct page *page, int force, bool offlining,
+                        enum migrate_mode mode)
+{
+        int rc = 0;
+        int *result = NULL;
+        struct page *newpage = get_new_page(page, private, &result);
+
+        if (!newpage)
+                return -ENOMEM;
+
+        if (page_count(page) == 1) {
+                /* page was freed from under us. So we are done. */
+                goto out;
+        }
+
+        if (unlikely(PageTransHuge(page)))
+                if (unlikely(split_huge_page(page)))
+                        goto out;
+
+        rc = __unmap_and_move(page, newpage, force, offlining, mode);
+out:
         if (rc != -EAGAIN) {
                 /*
                  * A page that has been migrated has all references
                  * removed and will be freed. A page that has not been
                  * migrated will have kepts its references and be
                  * restored.
                  */
                 list_del(&page->lru);
                 dec_zone_page_state(page, NR_ISOLATED_ANON +
                                 page_is_file_cache(page));
                 putback_lru_page(page);
         }
-
         /*
          * Move the new page to the LRU. If migration was not successful
          * then this will free the page.
          */
         putback_lru_page(newpage);
-
         if (result) {
                 if (rc)
                         *result = rc;
@@ -835,7 +902,8 @@ move_newpage:
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
                                 unsigned long private, struct page *hpage,
-                                int force, bool offlining, bool sync)
+                                int force, bool offlining,
+                                enum migrate_mode mode)
 {
         int rc = 0;
         int *result = NULL;
@@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
         rc = -EAGAIN;
 
         if (!trylock_page(hpage)) {
-                if (!force || !sync)
+                if (!force || mode != MIGRATE_SYNC)
                         goto out;
                 lock_page(hpage);
         }
@@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
         try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 
         if (!page_mapped(hpage))
-                rc = move_to_new_page(new_hpage, hpage, 1, sync);
+                rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
         if (rc)
                 remove_migration_ptes(hpage, hpage);
@@ -902,7 +970,7 @@ out:
  */
 int migrate_pages(struct list_head *from,
                 new_page_t get_new_page, unsigned long private, bool offlining,
-                bool sync)
+                enum migrate_mode mode)
 {
         int retry = 1;
         int nr_failed = 0;
@@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from,
 
                         rc = unmap_and_move(get_new_page, private,
                                                 page, pass > 2, offlining,
-                                                sync);
+                                                mode);
 
                         switch(rc) {
                         case -ENOMEM:
@@ -953,7 +1021,7 @@ out:
 
 int migrate_huge_pages(struct list_head *from,
                 new_page_t get_new_page, unsigned long private, bool offlining,
-                bool sync)
+                enum migrate_mode mode)
 {
         int retry = 1;
         int nr_failed = 0;
@@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from,
 
                         rc = unmap_and_move_huge_page(get_new_page,
                                         private, page, pass > 2, offlining,
-                                        sync);
+                                        mode);
 
                         switch(rc) {
                         case -ENOMEM:
@@ -1099,7 +1167,7 @@ set_status:
         err = 0;
         if (!list_empty(&pagelist)) {
                 err = migrate_pages(&pagelist, new_page_node,
-                                (unsigned long)pm, 0, true);
+                                (unsigned long)pm, 0, MIGRATE_SYNC);
                 if (err)
                         putback_lru_pages(&pagelist);
         }
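
A recurring pattern in the migrate.c changes above is buffer_migrate_lock_buffers() in MIGRATE_ASYNC mode: try-lock every buffer, and on the first failure unlock the ones already taken and report failure instead of blocking. A self-contained userspace sketch of that all-or-nothing trylock, using pthread mutexes in place of buffer_head locks (illustrative only, not kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NLOCKS 4
static pthread_mutex_t locks[NLOCKS];

static bool trylock_all(void)
{
        for (int i = 0; i < NLOCKS; i++) {
                if (pthread_mutex_trylock(&locks[i]) != 0) {
                        /* Roll back: unlock everything taken so far. */
                        for (int j = 0; j < i; j++)
                                pthread_mutex_unlock(&locks[j]);
                        return false;
                }
        }
        return true;
}

static void unlock_all(void)
{
        for (int i = 0; i < NLOCKS; i++)
                pthread_mutex_unlock(&locks[i]);
}

int main(void)
{
        for (int i = 0; i < NLOCKS; i++)
                pthread_mutex_init(&locks[i], NULL);

        if (trylock_all()) {
                printf("got every lock without blocking\n");
                unlock_all();
        } else {
                printf("would return -EAGAIN, as async migration does\n");
        }
        return 0;
}
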
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088..71c78115c45 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
 void __mmu_notifier_release(struct mm_struct *mm)
 {
         struct mmu_notifier *mn;
+        struct hlist_node *n;
+
+        /*
+         * RCU here will block mmu_notifier_unregister until
+         * ->release returns.
+         */
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
+                /*
+                 * if ->release runs before mmu_notifier_unregister it
+                 * must be handled as it's the only way for the driver
+                 * to flush all existing sptes and stop the driver
+                 * from establishing any more sptes before all the
+                 * pages in the mm are freed.
+                 */
+                if (mn->ops->release)
+                        mn->ops->release(mn, mm);
+        rcu_read_unlock();
 
         spin_lock(&mm->mmu_notifier_mm->lock);
         while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
                  * mmu_notifier_unregister to return.
                  */
                 hlist_del_init_rcu(&mn->hlist);
-                /*
-                 * RCU here will block mmu_notifier_unregister until
-                 * ->release returns.
-                 */
-                rcu_read_lock();
-                spin_unlock(&mm->mmu_notifier_mm->lock);
-                /*
-                 * if ->release runs before mmu_notifier_unregister it
-                 * must be handled as it's the only way for the driver
-                 * to flush all existing sptes and stop the driver
-                 * from establishing any more sptes before all the
-                 * pages in the mm are freed.
-                 */
-                if (mn->ops->release)
-                        mn->ops->release(mn, mm);
-                rcu_read_unlock();
-                spin_lock(&mm->mmu_notifier_mm->lock);
         }
         spin_unlock(&mm->mmu_notifier_mm->lock);
 
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
         BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
-        spin_lock(&mm->mmu_notifier_mm->lock);
         if (!hlist_unhashed(&mn->hlist)) {
-                hlist_del_rcu(&mn->hlist);
-
                 /*
                  * RCU here will force exit_mmap to wait ->release to finish
                  * before freeing the pages.
                  */
                 rcu_read_lock();
-                spin_unlock(&mm->mmu_notifier_mm->lock);
+
                 /*
                  * exit_mmap will block in mmu_notifier_release to
                  * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
                 if (mn->ops->release)
                         mn->ops->release(mn, mm);
                 rcu_read_unlock();
-        } else
+
+                spin_lock(&mm->mmu_notifier_mm->lock);
+                hlist_del_rcu(&mn->hlist);
                 spin_unlock(&mm->mmu_notifier_mm->lock);
+        }
 
         /*
          * Wait any running method to finish, of course including
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7f258..e39e3efe4a4 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -83,8 +83,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 
 static void __init __free_pages_memory(unsigned long start, unsigned long end)
 {
-        int i;
-        unsigned long start_aligned, end_aligned;
+        unsigned long i, start_aligned, end_aligned;
         int order = ilog2(BITS_PER_LONG);
 
         start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f474da7ee..bfe789472b4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -555,7 +555,7 @@ static inline void __free_one_page(struct page *page,
                 combined_idx = buddy_idx & page_idx;
                 higher_page = page + (combined_idx - page_idx);
                 buddy_idx = __find_buddy_index(combined_idx, order + 1);
-                higher_buddy = page + (buddy_idx - combined_idx);
+                higher_buddy = higher_page + (buddy_idx - combined_idx);
                 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                         list_add_tail(&page->lru,
                                 &zone->free_area[order].free_list[migratetype]);
@@ -1912,14 +1912,20 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
         nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-        int migratetype, unsigned long *did_some_progress,
-        bool sync_migration)
+        int migratetype, bool sync_migration,
+        bool *deferred_compaction,
+        unsigned long *did_some_progress)
 {
         struct page *page;
 
-        if (!order || compaction_deferred(preferred_zone))
+        if (!order)
                 return NULL;
 
+        if (compaction_deferred(preferred_zone)) {
+                *deferred_compaction = true;
+                return NULL;
+        }
+
         current->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                 nodemask, sync_migration);
@@ -1947,7 +1953,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                  * but not enough to satisfy watermarks.
                  */
                 count_vm_event(COMPACTFAIL);
-                defer_compaction(preferred_zone);
+
+                /*
+                 * As async compaction considers a subset of pageblocks, only
+                 * defer if the failure was a sync compaction failure.
+                 */
+                if (sync_migration)
+                        defer_compaction(preferred_zone);
 
                 cond_resched();
         }
@@ -1959,8 +1971,9 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
         nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-        int migratetype, unsigned long *did_some_progress,
-        bool sync_migration)
+        int migratetype, bool sync_migration,
+        bool *deferred_compaction,
+        unsigned long *did_some_progress)
 {
         return NULL;
 }
@@ -2110,6 +2123,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2110 unsigned long pages_reclaimed = 0; 2123 unsigned long pages_reclaimed = 0;
2111 unsigned long did_some_progress; 2124 unsigned long did_some_progress;
2112 bool sync_migration = false; 2125 bool sync_migration = false;
2126 bool deferred_compaction = false;
2113 2127
2114 /* 2128 /*
2115 * In the slowpath, we sanity check order to avoid ever trying to 2129 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2190,12 +2204,22 @@ rebalance:
2190 zonelist, high_zoneidx, 2204 zonelist, high_zoneidx,
2191 nodemask, 2205 nodemask,
2192 alloc_flags, preferred_zone, 2206 alloc_flags, preferred_zone,
2193 migratetype, &did_some_progress, 2207 migratetype, sync_migration,
2194 sync_migration); 2208 &deferred_compaction,
2209 &did_some_progress);
2195 if (page) 2210 if (page)
2196 goto got_pg; 2211 goto got_pg;
2197 sync_migration = true; 2212 sync_migration = true;
2198 2213
2214 /*
2215 * If compaction is deferred for high-order allocations, it is because
 2216 * sync compaction recently failed. If this is the case and the caller
2217 * has requested the system not be heavily disrupted, fail the
2218 * allocation now instead of entering direct reclaim
2219 */
2220 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2221 goto nopage;
2222
2199 /* Try direct reclaim and then allocating */ 2223 /* Try direct reclaim and then allocating */
2200 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2224 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2201 zonelist, high_zoneidx, 2225 zonelist, high_zoneidx,
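The hunks above thread a deferred_compaction flag out of __alloc_pages_direct_compact() into the slow path so it can fail fast for callers that asked for low disruption. A small sketch of that decision, with an illustrative flag bit standing in for the real __GFP_NO_KSWAPD definition in gfp.h:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the __GFP_NO_KSWAPD bit; the real value lives in
 * include/linux/gfp.h. */
#define DEMO_GFP_NO_KSWAPD (1u << 0)

/* Mirrors the check added above: when async compaction was skipped because a
 * recent sync compaction failed, and the caller does not want the system
 * heavily disrupted, give up instead of entering direct reclaim. */
static bool fail_before_reclaim(bool deferred_compaction, unsigned int gfp_mask)
{
        return deferred_compaction && (gfp_mask & DEMO_GFP_NO_KSWAPD);
}

int main(void)
{
        printf("%d\n", fail_before_reclaim(true, DEMO_GFP_NO_KSWAPD));  /* 1 */
        printf("%d\n", fail_before_reclaim(true, 0));                   /* 0 */
        printf("%d\n", fail_before_reclaim(false, DEMO_GFP_NO_KSWAPD)); /* 0 */
        return 0;
}
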
@@ -2266,8 +2290,9 @@ rebalance:
2266 zonelist, high_zoneidx, 2290 zonelist, high_zoneidx,
2267 nodemask, 2291 nodemask,
2268 alloc_flags, preferred_zone, 2292 alloc_flags, preferred_zone,
2269 migratetype, &did_some_progress, 2293 migratetype, sync_migration,
2270 sync_migration); 2294 &deferred_compaction,
2295 &did_some_progress);
2271 if (page) 2296 if (page)
2272 goto got_pg; 2297 goto got_pg;
2273 } 2298 }
@@ -2291,8 +2316,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2291{ 2316{
2292 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2317 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2293 struct zone *preferred_zone; 2318 struct zone *preferred_zone;
2294 struct page *page; 2319 struct page *page = NULL;
2295 int migratetype = allocflags_to_migratetype(gfp_mask); 2320 int migratetype = allocflags_to_migratetype(gfp_mask);
2321 unsigned int cpuset_mems_cookie;
2296 2322
2297 gfp_mask &= gfp_allowed_mask; 2323 gfp_mask &= gfp_allowed_mask;
2298 2324
@@ -2311,15 +2337,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2311 if (unlikely(!zonelist->_zonerefs->zone)) 2337 if (unlikely(!zonelist->_zonerefs->zone))
2312 return NULL; 2338 return NULL;
2313 2339
2314 get_mems_allowed(); 2340retry_cpuset:
2341 cpuset_mems_cookie = get_mems_allowed();
2342
2315 /* The preferred zone is used for statistics later */ 2343 /* The preferred zone is used for statistics later */
2316 first_zones_zonelist(zonelist, high_zoneidx, 2344 first_zones_zonelist(zonelist, high_zoneidx,
2317 nodemask ? : &cpuset_current_mems_allowed, 2345 nodemask ? : &cpuset_current_mems_allowed,
2318 &preferred_zone); 2346 &preferred_zone);
2319 if (!preferred_zone) { 2347 if (!preferred_zone)
2320 put_mems_allowed(); 2348 goto out;
2321 return NULL;
2322 }
2323 2349
2324 /* First allocation attempt */ 2350 /* First allocation attempt */
2325 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2351 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2329,9 +2355,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2329 page = __alloc_pages_slowpath(gfp_mask, order, 2355 page = __alloc_pages_slowpath(gfp_mask, order,
2330 zonelist, high_zoneidx, nodemask, 2356 zonelist, high_zoneidx, nodemask,
2331 preferred_zone, migratetype); 2357 preferred_zone, migratetype);
2332 put_mems_allowed();
2333 2358
2334 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2359 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2360
2361out:
2362 /*
2363 * When updating a task's mems_allowed, it is possible to race with
2364 * parallel threads in such a way that an allocation can fail while
2365 * the mask is being updated. If a page allocation is about to fail,
2366 * check if the cpuset changed during allocation and if so, retry.
2367 */
2368 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2369 goto retry_cpuset;
2370
2335 return page; 2371 return page;
2336} 2372}
2337EXPORT_SYMBOL(__alloc_pages_nodemask); 2373EXPORT_SYMBOL(__alloc_pages_nodemask);
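The retry_cpuset logic above replaces a bare get_mems_allowed()/put_mems_allowed() pair with a sequence-cookie protocol: sample a cookie, attempt the allocation, and retry only if the cookie went stale and the attempt failed. A toy single-threaded sketch of that read-retry shape; the names and the seqcount here are illustrative, not the kernel's cpuset implementation:

#include <stdbool.h>
#include <stdio.h>

static unsigned int mems_seq;       /* bumped by a (not shown) mask updater */
static int current_node = 1;        /* the data the cookie protects */

static unsigned int demo_get_mems_allowed(void)
{
        return mems_seq;            /* read side: sample the sequence */
}

static bool demo_put_mems_allowed(unsigned int cookie)
{
        return mems_seq == cookie;  /* true when no update raced with us */
}

/* Returns a node id, or -1 for a failed attempt. */
static int try_alloc(void)
{
        return current_node;
}

int main(void)
{
        unsigned int cookie;
        bool stale;
        int node;

        do {
                cookie = demo_get_mems_allowed();
                node = try_alloc();
                stale = !demo_put_mems_allowed(cookie);
                /* Retry only if the mask changed underneath us AND the
                 * attempt failed, matching the unlikely() test above. */
        } while (stale && node < 0);

        printf("allocated from node %d\n", node);
        return 0;
}
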
@@ -2555,13 +2591,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2555bool skip_free_areas_node(unsigned int flags, int nid) 2591bool skip_free_areas_node(unsigned int flags, int nid)
2556{ 2592{
2557 bool ret = false; 2593 bool ret = false;
2594 unsigned int cpuset_mems_cookie;
2558 2595
2559 if (!(flags & SHOW_MEM_FILTER_NODES)) 2596 if (!(flags & SHOW_MEM_FILTER_NODES))
2560 goto out; 2597 goto out;
2561 2598
2562 get_mems_allowed(); 2599 do {
2563 ret = !node_isset(nid, cpuset_current_mems_allowed); 2600 cpuset_mems_cookie = get_mems_allowed();
2564 put_mems_allowed(); 2601 ret = !node_isset(nid, cpuset_current_mems_allowed);
2602 } while (!put_mems_allowed(cpuset_mems_cookie));
2565out: 2603out:
2566 return ret; 2604 return ret;
2567} 2605}
@@ -3441,25 +3479,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3441 if (page_to_nid(page) != zone_to_nid(zone)) 3479 if (page_to_nid(page) != zone_to_nid(zone))
3442 continue; 3480 continue;
3443 3481
3444 /* Blocks with reserved pages will never free, skip them. */
3445 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3446 if (pageblock_is_reserved(pfn, block_end_pfn))
3447 continue;
3448
3449 block_migratetype = get_pageblock_migratetype(page); 3482 block_migratetype = get_pageblock_migratetype(page);
3450 3483
3451 /* If this block is reserved, account for it */ 3484 /* Only test what is necessary when the reserves are not met */
3452 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 3485 if (reserve > 0) {
3453 reserve--; 3486 /*
3454 continue; 3487 * Blocks with reserved pages will never free, skip
3455 } 3488 * them.
3489 */
3490 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3491 if (pageblock_is_reserved(pfn, block_end_pfn))
3492 continue;
3456 3493
3457 /* Suitable for reserving if this block is movable */ 3494 /* If this block is reserved, account for it */
3458 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 3495 if (block_migratetype == MIGRATE_RESERVE) {
3459 set_pageblock_migratetype(page, MIGRATE_RESERVE); 3496 reserve--;
3460 move_freepages_block(zone, page, MIGRATE_RESERVE); 3497 continue;
3461 reserve--; 3498 }
3462 continue; 3499
3500 /* Suitable for reserving if this block is movable */
3501 if (block_migratetype == MIGRATE_MOVABLE) {
3502 set_pageblock_migratetype(page,
3503 MIGRATE_RESERVE);
3504 move_freepages_block(zone, page,
3505 MIGRATE_RESERVE);
3506 reserve--;
3507 continue;
3508 }
3463 } 3509 }
3464 3510
3465 /* 3511 /*
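The setup_zone_migrate_reserve() hunk above moves the pageblock_is_reserved() walk under the reserve > 0 branch, so the per-pfn scan is only paid for while reserve blocks are still being sought. A schematic of that gating, with a hypothetical expensive_check() standing in for the walk; the accounting is simplified and is not the function's real migratetype logic:

#include <stdbool.h>
#include <stdio.h>

static int checks_run;

/* Hypothetical stand-in for pageblock_is_reserved(): pretend it walks pfns. */
static bool expensive_check(int block)
{
        checks_run++;
        return block % 7 == 0;
}

int main(void)
{
        int reserve = 2;

        for (int block = 0; block < 100; block++) {
                /* Only pay for the walk while reserves are still needed. */
                if (reserve > 0) {
                        if (expensive_check(block))
                                continue;
                        reserve--;      /* pretend this block was reserved */
                }
        }
        printf("expensive checks run: %d\n", checks_run);
        return 0;
}
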
diff --git a/mm/percpu.c b/mm/percpu.c
index 0ae7a09141e..af0cc7a58f9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1630,6 +1630,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1630 areas[group] = ptr; 1630 areas[group] = ptr;
1631 1631
1632 base = min(ptr, base); 1632 base = min(ptr, base);
1633 }
1634
1635 /*
1636 * Copy data and free unused parts. This should happen after all
1637 * allocations are complete; otherwise, we may end up with
1638 * overlapping groups.
1639 */
1640 for (group = 0; group < ai->nr_groups; group++) {
1641 struct pcpu_group_info *gi = &ai->groups[group];
1642 void *ptr = areas[group];
1633 1643
1634 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { 1644 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1635 if (gi->cpu_map[i] == NR_CPUS) { 1645 if (gi->cpu_map[i] == NR_CPUS) {
diff --git a/mm/shmem.c b/mm/shmem.c
index 883e98f78ca..df31a443293 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2348,12 +2348,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2348{ 2348{
2349 struct inode *inode; 2349 struct inode *inode;
2350 struct dentry *dentry = NULL; 2350 struct dentry *dentry = NULL;
2351 u64 inum = fid->raw[2]; 2351 u64 inum;
2352 inum = (inum << 32) | fid->raw[1];
2353 2352
2354 if (fh_len < 3) 2353 if (fh_len < 3)
2355 return NULL; 2354 return NULL;
2356 2355
2356 inum = fid->raw[2];
2357 inum = (inum << 32) | fid->raw[1];
2358
2357 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2359 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2358 shmem_match, fid->raw); 2360 shmem_match, fid->raw);
2359 if (inode) { 2361 if (inode) {
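The shmem hunk above validates fh_len before reading fid->raw[1] and fid->raw[2]. A sketch of that bounds-check-before-read ordering; the fid layout here is a simplified stand-in, not the real export file handle:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the file handle layout used above. */
struct demo_fid {
        uint32_t raw[3];
};

/* Reject short handles before touching raw[1]/raw[2], which is the
 * reordering the hunk above performs. Returns 0 on success. */
static int decode_inum(const struct demo_fid *fid, int fh_len, uint64_t *inum)
{
        if (fh_len < 3)
                return -1;

        *inum = fid->raw[2];
        *inum = (*inum << 32) | fid->raw[1];
        return 0;
}

int main(void)
{
        struct demo_fid fid = { { 7, 2, 3 } };
        uint64_t inum;

        if (decode_inum(&fid, 2, &inum))
                printf("short handle rejected before any field access\n");
        if (!decode_inum(&fid, 3, &inum))
                printf("inum = 0x%llx\n", (unsigned long long)inum);
        return 0;
}
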
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de77..a67f8121ce5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3218 if (in_interrupt() || (flags & __GFP_THISNODE)) 3218 if (in_interrupt() || (flags & __GFP_THISNODE))
3219 return NULL; 3219 return NULL;
3220 nid_alloc = nid_here = numa_mem_id(); 3220 nid_alloc = nid_here = numa_mem_id();
3221 get_mems_allowed();
3222 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3221 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3223 nid_alloc = cpuset_slab_spread_node(); 3222 nid_alloc = cpuset_slab_spread_node();
3224 else if (current->mempolicy) 3223 else if (current->mempolicy)
3225 nid_alloc = slab_node(current->mempolicy); 3224 nid_alloc = slab_node(current->mempolicy);
3226 put_mems_allowed();
3227 if (nid_alloc != nid_here) 3225 if (nid_alloc != nid_here)
3228 return ____cache_alloc_node(cachep, flags, nid_alloc); 3226 return ____cache_alloc_node(cachep, flags, nid_alloc);
3229 return NULL; 3227 return NULL;
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3246 enum zone_type high_zoneidx = gfp_zone(flags); 3244 enum zone_type high_zoneidx = gfp_zone(flags);
3247 void *obj = NULL; 3245 void *obj = NULL;
3248 int nid; 3246 int nid;
3247 unsigned int cpuset_mems_cookie;
3249 3248
3250 if (flags & __GFP_THISNODE) 3249 if (flags & __GFP_THISNODE)
3251 return NULL; 3250 return NULL;
3252 3251
3253 get_mems_allowed();
3254 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3255 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3252 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3256 3253
3254retry_cpuset:
3255 cpuset_mems_cookie = get_mems_allowed();
3256 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3257
3257retry: 3258retry:
3258 /* 3259 /*
3259 * Look through allowed nodes for objects available 3260 * Look through allowed nodes for objects available
@@ -3306,7 +3307,9 @@ retry:
3306 } 3307 }
3307 } 3308 }
3308 } 3309 }
3309 put_mems_allowed(); 3310
3311 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3312 goto retry_cpuset;
3310 return obj; 3313 return obj;
3311} 3314}
3312 3315
diff --git a/mm/slub.c b/mm/slub.c
index 10ab2335e2e..ae6e80ed1e5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1457 struct zone *zone; 1457 struct zone *zone;
1458 enum zone_type high_zoneidx = gfp_zone(flags); 1458 enum zone_type high_zoneidx = gfp_zone(flags);
1459 struct page *page; 1459 struct page *page;
1460 unsigned int cpuset_mems_cookie;
1460 1461
1461 /* 1462 /*
1462 * The defrag ratio allows a configuration of the tradeoffs between 1463 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1480 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1481 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1481 return NULL; 1482 return NULL;
1482 1483
1483 get_mems_allowed(); 1484 do {
1484 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1485 cpuset_mems_cookie = get_mems_allowed();
1485 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1486 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1486 struct kmem_cache_node *n; 1487 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1487 1488 struct kmem_cache_node *n;
1488 n = get_node(s, zone_to_nid(zone)); 1489
1489 1490 n = get_node(s, zone_to_nid(zone));
1490 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1491
1491 n->nr_partial > s->min_partial) { 1492 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1492 page = get_partial_node(n); 1493 n->nr_partial > s->min_partial) {
1493 if (page) { 1494 page = get_partial_node(n);
1494 put_mems_allowed(); 1495 if (page) {
1495 return page; 1496 /*
1497 * Return the object even if
1498 * put_mems_allowed indicated that
1499 * the cpuset mems_allowed was
1500 * updated in parallel. It's a
1501 * harmless race between the alloc
1502 * and the cpuset update.
1503 */
1504 put_mems_allowed(cpuset_mems_cookie);
1505 return page;
1506 }
1496 } 1507 }
1497 } 1508 }
1498 } 1509 } while (!put_mems_allowed(cpuset_mems_cookie));
1499 put_mems_allowed();
1500#endif 1510#endif
1501 return NULL; 1511 return NULL;
1502} 1512}
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad..3e9829f3988 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -398,11 +398,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
398 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) 398 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
399 return 0; 399 return 0;
400 400
401 clear_page_mlock(page);
402
401 spin_lock_irq(&mapping->tree_lock); 403 spin_lock_irq(&mapping->tree_lock);
402 if (PageDirty(page)) 404 if (PageDirty(page))
403 goto failed; 405 goto failed;
404 406
405 clear_page_mlock(page);
406 BUG_ON(page_has_private(page)); 407 BUG_ON(page_has_private(page));
407 __delete_from_page_cache(page); 408 __delete_from_page_cache(page);
408 spin_unlock_irq(&mapping->tree_lock); 409 spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 43b44dbadda..bdb70042c12 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,7 +256,7 @@ struct vmap_area {
256 struct rb_node rb_node; /* address sorted rbtree */ 256 struct rb_node rb_node; /* address sorted rbtree */
257 struct list_head list; /* address sorted list */ 257 struct list_head list; /* address sorted list */
258 struct list_head purge_list; /* "lazy purge" list */ 258 struct list_head purge_list; /* "lazy purge" list */
259 void *private; 259 struct vm_struct *vm;
260 struct rcu_head rcu_head; 260 struct rcu_head rcu_head;
261}; 261};
262 262
@@ -1174,9 +1174,10 @@ void __init vmalloc_init(void)
1174 /* Import existing vmlist entries. */ 1174 /* Import existing vmlist entries. */
1175 for (tmp = vmlist; tmp; tmp = tmp->next) { 1175 for (tmp = vmlist; tmp; tmp = tmp->next) {
1176 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1176 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1177 va->flags = tmp->flags | VM_VM_AREA; 1177 va->flags = VM_VM_AREA;
1178 va->va_start = (unsigned long)tmp->addr; 1178 va->va_start = (unsigned long)tmp->addr;
1179 va->va_end = va->va_start + tmp->size; 1179 va->va_end = va->va_start + tmp->size;
1180 va->vm = tmp;
1180 __insert_vmap_area(va); 1181 __insert_vmap_area(va);
1181 } 1182 }
1182 1183
@@ -1274,7 +1275,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1274 vm->addr = (void *)va->va_start; 1275 vm->addr = (void *)va->va_start;
1275 vm->size = va->va_end - va->va_start; 1276 vm->size = va->va_end - va->va_start;
1276 vm->caller = caller; 1277 vm->caller = caller;
1277 va->private = vm; 1278 va->vm = vm;
1278 va->flags |= VM_VM_AREA; 1279 va->flags |= VM_VM_AREA;
1279} 1280}
1280 1281
@@ -1397,7 +1398,7 @@ static struct vm_struct *find_vm_area(const void *addr)
1397 1398
1398 va = find_vmap_area((unsigned long)addr); 1399 va = find_vmap_area((unsigned long)addr);
1399 if (va && va->flags & VM_VM_AREA) 1400 if (va && va->flags & VM_VM_AREA)
1400 return va->private; 1401 return va->vm;
1401 1402
1402 return NULL; 1403 return NULL;
1403} 1404}
@@ -1416,7 +1417,7 @@ struct vm_struct *remove_vm_area(const void *addr)
1416 1417
1417 va = find_vmap_area((unsigned long)addr); 1418 va = find_vmap_area((unsigned long)addr);
1418 if (va && va->flags & VM_VM_AREA) { 1419 if (va && va->flags & VM_VM_AREA) {
1419 struct vm_struct *vm = va->private; 1420 struct vm_struct *vm = va->vm;
1420 1421
1421 if (!(vm->flags & VM_UNLIST)) { 1422 if (!(vm->flags & VM_UNLIST)) {
1422 struct vm_struct *tmp, **p; 1423 struct vm_struct *tmp, **p;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6072d74a16f..5326f98f506 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink,
248 248
249 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
250 unsigned long long delta; 250 unsigned long long delta;
251 unsigned long total_scan; 251 long total_scan;
252 unsigned long max_pass; 252 long max_pass;
253 int shrink_ret = 0;
254 long nr;
255 long new_nr;
253 256
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 257 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
258 if (max_pass <= 0)
259 continue;
260
261 /*
262 * copy the current shrinker scan count into a local variable
263 * and zero it so that other concurrent shrinker invocations
264 * don't also do this scanning work.
265 */
266 do {
267 nr = shrinker->nr;
268 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
269
270 total_scan = nr;
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 271 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 272 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 273 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 274 total_scan += delta;
259 if (shrinker->nr < 0) { 275 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 276 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 277 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 278 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 279 total_scan = max_pass;
264 } 280 }
265 281
266 /* 282 /*
283 * We need to avoid excessive windup on filesystem shrinkers
284 * due to large numbers of GFP_NOFS allocations causing the
285 * shrinkers to return -1 all the time. This results in a large
286 * nr being built up so when a shrink that can do some work
287 * comes along it empties the entire cache due to nr >>>
288 * max_pass. This is bad for sustaining a working set in
289 * memory.
290 *
291 * Hence only allow the shrinker to scan the entire cache when
292 * a large delta change is calculated directly.
293 */
294 if (delta < max_pass / 4)
295 total_scan = min(total_scan, max_pass / 2);
296
297 /*
267 * Avoid risking looping forever due to too large nr value: 298 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 299 * never try to free more than twice the estimate number of
269 * freeable entries. 300 * freeable entries.
270 */ 301 */
271 if (shrinker->nr > max_pass * 2) 302 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 303 total_scan = max_pass * 2;
273 304
274 total_scan = shrinker->nr; 305 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 306 nr_pages_scanned, lru_pages,
307 max_pass, delta, total_scan);
276 308
277 while (total_scan >= SHRINK_BATCH) { 309 while (total_scan >= SHRINK_BATCH) {
278 long this_scan = SHRINK_BATCH; 310 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 311 int nr_before;
281 312
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 313 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
@@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
292 cond_resched(); 323 cond_resched();
293 } 324 }
294 325
295 shrinker->nr += total_scan; 326 /*
327 * move the unused scan count back into the shrinker in a
328 * manner that handles concurrent updates. If we exhausted the
329 * scan, there is no need to do an update.
330 */
331 do {
332 nr = shrinker->nr;
333 new_nr = total_scan + nr;
334 if (total_scan <= 0)
335 break;
336 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
337
338 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 339 }
297 up_read(&shrinker_rwsem); 340 up_read(&shrinker_rwsem);
298out: 341out:
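The shrink_slab() hunk above makes shrinker->nr private to the caller by atomically swapping it to zero, then atomically returns whatever was not scanned. A sketch of that claim-and-return pattern, using the GCC/Clang __sync_val_compare_and_swap builtin in place of the kernel's cmpxchg():

#include <stdio.h>

static long shrinker_nr;

/* Take the whole counter for ourselves so concurrent callers see zero. */
static long claim_all(long *counter)
{
        long nr;

        do {
                nr = *counter;
        } while (__sync_val_compare_and_swap(counter, nr, 0) != nr);
        return nr;
}

/* Put the unused portion back; skip the update entirely when nothing is
 * left, as the hunk above does when the scan was exhausted. */
static void give_back(long *counter, long unused)
{
        long nr, new_nr;

        do {
                if (unused <= 0)
                        return;
                nr = *counter;
                new_nr = nr + unused;
        } while (__sync_val_compare_and_swap(counter, nr, new_nr) != nr);
}

int main(void)
{
        shrinker_nr = 128;

        long total_scan = claim_all(&shrinker_nr);   /* now private to us */
        long scanned = 100;

        give_back(&shrinker_nr, total_scan - scanned);
        printf("leftover returned to shrinker_nr: %ld\n", shrinker_nr); /* 28 */
        return 0;
}
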
@@ -665,7 +708,7 @@ static enum page_references page_check_references(struct page *page,
665 return PAGEREF_RECLAIM; 708 return PAGEREF_RECLAIM;
666 709
667 if (referenced_ptes) { 710 if (referenced_ptes) {
668 if (PageAnon(page)) 711 if (PageSwapBacked(page))
669 return PAGEREF_ACTIVATE; 712 return PAGEREF_ACTIVATE;
670 /* 713 /*
671 * All mapped pages start out with page table 714 * All mapped pages start out with page table
@@ -683,7 +726,13 @@ static enum page_references page_check_references(struct page *page,
683 */ 726 */
684 SetPageReferenced(page); 727 SetPageReferenced(page);
685 728
686 if (referenced_page) 729 if (referenced_page || referenced_ptes > 1)
730 return PAGEREF_ACTIVATE;
731
732 /*
733 * Activate file-backed executable pages after first usage.
734 */
735 if (vm_flags & VM_EXEC)
687 return PAGEREF_ACTIVATE; 736 return PAGEREF_ACTIVATE;
688 737
689 return PAGEREF_KEEP; 738 return PAGEREF_KEEP;
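The page_check_references() hunk above tightens activation: swap-backed pages, pages referenced from more than one pte, and executable file pages are activated on reference; other referenced file pages stay on the inactive list. A condensed decision helper covering only the referenced-page path shown above, with DEMO_VM_EXEC standing in for the real VM_EXEC flag:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_VM_EXEC 0x4   /* stand-in for VM_EXEC from mm.h */

static const char *classify_referenced(bool swap_backed, int referenced_ptes,
                                       bool referenced_page,
                                       unsigned long vm_flags)
{
        if (swap_backed)
                return "activate";
        if (referenced_page || referenced_ptes > 1)
                return "activate";
        if (vm_flags & DEMO_VM_EXEC)
                return "activate";
        return "keep";
}

int main(void)
{
        printf("%s\n", classify_referenced(false, 1, false, DEMO_VM_EXEC));
        printf("%s\n", classify_referenced(false, 2, false, 0));
        printf("%s\n", classify_referenced(false, 1, false, 0));
        return 0;
}
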
@@ -972,23 +1021,27 @@ keep_lumpy:
972 * 1021 *
973 * returns 0 on success, -ve errno on failure. 1022 * returns 0 on success, -ve errno on failure.
974 */ 1023 */
975int __isolate_lru_page(struct page *page, int mode, int file) 1024int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
976{ 1025{
1026 bool all_lru_mode;
977 int ret = -EINVAL; 1027 int ret = -EINVAL;
978 1028
979 /* Only take pages on the LRU. */ 1029 /* Only take pages on the LRU. */
980 if (!PageLRU(page)) 1030 if (!PageLRU(page))
981 return ret; 1031 return ret;
982 1032
1033 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1034 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1035
983 /* 1036 /*
984 * When checking the active state, we need to be sure we are 1037 * When checking the active state, we need to be sure we are
 985 * dealing with comparable boolean values. Take the logical not 1038 * dealing with comparable boolean values. Take the logical not
986 * of each. 1039 * of each.
987 */ 1040 */
988 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1041 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
989 return ret; 1042 return ret;
990 1043
991 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1044 if (!all_lru_mode && !!page_is_file_cache(page) != file)
992 return ret; 1045 return ret;
993 1046
994 /* 1047 /*
@@ -1001,6 +1054,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1001 1054
1002 ret = -EBUSY; 1055 ret = -EBUSY;
1003 1056
1057 /*
1058 * To minimise LRU disruption, the caller can indicate that it only
1059 * wants to isolate pages it will be able to operate on without
1060 * blocking - clean pages for the most part.
1061 *
1062 * ISOLATE_CLEAN means that only clean pages should be isolated. This
 1063 * is used by reclaim when it cannot write to backing storage
1064 *
 1065 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1066 * that it is possible to migrate without blocking
1067 */
1068 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1069 /* All the caller can do on PageWriteback is block */
1070 if (PageWriteback(page))
1071 return ret;
1072
1073 if (PageDirty(page)) {
1074 struct address_space *mapping;
1075
1076 /* ISOLATE_CLEAN means only clean pages */
1077 if (mode & ISOLATE_CLEAN)
1078 return ret;
1079
1080 /*
1081 * Only pages without mappings or that have a
1082 * ->migratepage callback are possible to migrate
1083 * without blocking
1084 */
1085 mapping = page_mapping(page);
1086 if (mapping && !mapping->a_ops->migratepage)
1087 return ret;
1088 }
1089 }
1090
1091 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1092 return ret;
1093
1004 if (likely(get_page_unless_zero(page))) { 1094 if (likely(get_page_unless_zero(page))) {
1005 /* 1095 /*
1006 * Be careful not to clear PageLRU until after we're 1096 * Be careful not to clear PageLRU until after we're
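The __isolate_lru_page() hunk above replaces the old three-way mode integer with an isolate_mode_t bitmask so callers can combine LRU-list selection with CLEAN/UNMAPPED/ASYNC_MIGRATE constraints. An illustrative reconstruction of such a flag set; the real definitions and values live in include/linux/mmzone.h and may differ:

#include <stdio.h>

enum {
        DEMO_ISOLATE_INACTIVE      = 1 << 0,
        DEMO_ISOLATE_ACTIVE        = 1 << 1,
        DEMO_ISOLATE_CLEAN         = 1 << 2,
        DEMO_ISOLATE_UNMAPPED      = 1 << 3,
        DEMO_ISOLATE_ASYNC_MIGRATE = 1 << 4,
};

int main(void)
{
        /* A lumpy-reclaim style request: take both LRU lists, but only
         * clean pages, mirroring how shrink_inactive_list() builds its
         * reclaim_mode in the later hunks. */
        unsigned int mode = DEMO_ISOLATE_INACTIVE | DEMO_ISOLATE_ACTIVE |
                            DEMO_ISOLATE_CLEAN;

        int all_lru_mode = (mode & (DEMO_ISOLATE_ACTIVE | DEMO_ISOLATE_INACTIVE)) ==
                           (DEMO_ISOLATE_ACTIVE | DEMO_ISOLATE_INACTIVE);

        printf("all_lru_mode=%d clean_only=%d unmapped_only=%d\n",
               all_lru_mode, !!(mode & DEMO_ISOLATE_CLEAN),
               !!(mode & DEMO_ISOLATE_UNMAPPED));
        return 0;
}
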
@@ -1036,7 +1126,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1036 */ 1126 */
1037static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1127static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1038 struct list_head *src, struct list_head *dst, 1128 struct list_head *src, struct list_head *dst,
1039 unsigned long *scanned, int order, int mode, int file) 1129 unsigned long *scanned, int order, isolate_mode_t mode,
1130 int file)
1040{ 1131{
1041 unsigned long nr_taken = 0; 1132 unsigned long nr_taken = 0;
1042 unsigned long nr_lumpy_taken = 0; 1133 unsigned long nr_lumpy_taken = 0;
@@ -1111,7 +1202,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1111 * anon page which don't already have a swap slot is 1202 * anon page which don't already have a swap slot is
1112 * pointless. 1203 * pointless.
1113 */ 1204 */
1114 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1205 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1115 !PageSwapCache(cursor_page)) 1206 !PageSwapCache(cursor_page))
1116 break; 1207 break;
1117 1208
@@ -1161,8 +1252,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1161static unsigned long isolate_pages_global(unsigned long nr, 1252static unsigned long isolate_pages_global(unsigned long nr,
1162 struct list_head *dst, 1253 struct list_head *dst,
1163 unsigned long *scanned, int order, 1254 unsigned long *scanned, int order,
1164 int mode, struct zone *z, 1255 isolate_mode_t mode,
1165 int active, int file) 1256 struct zone *z, int active, int file)
1166{ 1257{
1167 int lru = LRU_BASE; 1258 int lru = LRU_BASE;
1168 if (active) 1259 if (active)
@@ -1408,6 +1499,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1408 unsigned long nr_taken; 1499 unsigned long nr_taken;
1409 unsigned long nr_anon; 1500 unsigned long nr_anon;
1410 unsigned long nr_file; 1501 unsigned long nr_file;
1502 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1411 1503
1412 while (unlikely(too_many_isolated(zone, file, sc))) { 1504 while (unlikely(too_many_isolated(zone, file, sc))) {
1413 congestion_wait(BLK_RW_ASYNC, HZ/10); 1505 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1418,15 +1510,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1418 } 1510 }
1419 1511
1420 set_reclaim_mode(priority, sc, false); 1512 set_reclaim_mode(priority, sc, false);
1513 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1514 reclaim_mode |= ISOLATE_ACTIVE;
1515
1421 lru_add_drain(); 1516 lru_add_drain();
1517
1518 if (!sc->may_unmap)
1519 reclaim_mode |= ISOLATE_UNMAPPED;
1520 if (!sc->may_writepage)
1521 reclaim_mode |= ISOLATE_CLEAN;
1522
1422 spin_lock_irq(&zone->lru_lock); 1523 spin_lock_irq(&zone->lru_lock);
1423 1524
1424 if (scanning_global_lru(sc)) { 1525 if (scanning_global_lru(sc)) {
1425 nr_taken = isolate_pages_global(nr_to_scan, 1526 nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1426 &page_list, &nr_scanned, sc->order, 1527 &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1429 zone, 0, file);
1430 zone->pages_scanned += nr_scanned; 1528 zone->pages_scanned += nr_scanned;
1431 if (current_is_kswapd()) 1529 if (current_is_kswapd())
1432 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1530 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1435,12 +1533,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1435 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1533 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1436 nr_scanned); 1534 nr_scanned);
1437 } else { 1535 } else {
1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1536 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1439 &page_list, &nr_scanned, sc->order, 1537 &nr_scanned, sc->order, reclaim_mode, zone,
1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1538 sc->mem_cgroup, 0, file);
1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1442 zone, sc->mem_cgroup,
1443 0, file);
1444 /* 1539 /*
1445 * mem_cgroup_isolate_pages() keeps track of 1540 * mem_cgroup_isolate_pages() keeps track of
1446 * scanned pages on its own. 1541 * scanned pages on its own.
@@ -1542,19 +1637,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1542 struct page *page; 1637 struct page *page;
1543 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1638 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1544 unsigned long nr_rotated = 0; 1639 unsigned long nr_rotated = 0;
1640 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1545 1641
1546 lru_add_drain(); 1642 lru_add_drain();
1643
1644 if (!sc->may_unmap)
1645 reclaim_mode |= ISOLATE_UNMAPPED;
1646 if (!sc->may_writepage)
1647 reclaim_mode |= ISOLATE_CLEAN;
1648
1547 spin_lock_irq(&zone->lru_lock); 1649 spin_lock_irq(&zone->lru_lock);
1548 if (scanning_global_lru(sc)) { 1650 if (scanning_global_lru(sc)) {
1549 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1651 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1550 &pgscanned, sc->order, 1652 &pgscanned, sc->order,
1551 ISOLATE_ACTIVE, zone, 1653 reclaim_mode, zone,
1552 1, file); 1654 1, file);
1553 zone->pages_scanned += pgscanned; 1655 zone->pages_scanned += pgscanned;
1554 } else { 1656 } else {
1555 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1657 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1556 &pgscanned, sc->order, 1658 &pgscanned, sc->order,
1557 ISOLATE_ACTIVE, zone, 1659 reclaim_mode, zone,
1558 sc->mem_cgroup, 1, file); 1660 sc->mem_cgroup, 1, file);
1559 /* 1661 /*
1560 * mem_cgroup_isolate_pages() keeps track of 1662 * mem_cgroup_isolate_pages() keeps track of
@@ -1747,23 +1849,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 u64 fraction[2], denominator; 1849 u64 fraction[2], denominator;
1748 enum lru_list l; 1850 enum lru_list l;
1749 int noswap = 0; 1851 int noswap = 0;
1750 int force_scan = 0; 1852 bool force_scan = false;
1751 unsigned long nr_force_scan[2]; 1853 unsigned long nr_force_scan[2];
1752 1854
1753 1855 /* kswapd does zone balancing and needs to scan this zone */
1754 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1856 if (scanning_global_lru(sc) && current_is_kswapd() &&
1755 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1857 zone->all_unreclaimable)
1756 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1858 force_scan = true;
1757 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1859 /* memcg may have small limit and need to avoid priority drop */
1758 1860 if (!scanning_global_lru(sc))
1759 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1861 force_scan = true;
1760 /* kswapd does zone balancing and need to scan this zone */
1761 if (scanning_global_lru(sc) && current_is_kswapd())
1762 force_scan = 1;
1763 /* memcg may have small limit and need to avoid priority drop */
1764 if (!scanning_global_lru(sc))
1765 force_scan = 1;
1766 }
1767 1862
1768 /* If we have no swap space, do not bother scanning anon pages. */ 1863 /* If we have no swap space, do not bother scanning anon pages. */
1769 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1864 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1776,6 +1871,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1776 goto out; 1871 goto out;
1777 } 1872 }
1778 1873
1874 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1875 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1876 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1877 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1878
1779 if (scanning_global_lru(sc)) { 1879 if (scanning_global_lru(sc)) {
1780 free = zone_page_state(zone, NR_FREE_PAGES); 1880 free = zone_page_state(zone, NR_FREE_PAGES);
1781 /* If we have very few page cache pages, 1881 /* If we have very few page cache pages,
@@ -1912,8 +2012,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
1912 * inactive lists are large enough, continue reclaiming 2012 * inactive lists are large enough, continue reclaiming
1913 */ 2013 */
1914 pages_for_compaction = (2UL << sc->order); 2014 pages_for_compaction = (2UL << sc->order);
1915 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + 2015 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1916 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 2016 if (nr_swap_pages > 0)
2017 inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1917 if (sc->nr_reclaimed < pages_for_compaction && 2018 if (sc->nr_reclaimed < pages_for_compaction &&
1918 inactive_lru_pages > pages_for_compaction) 2019 inactive_lru_pages > pages_for_compaction)
1919 return true; 2020 return true;
@@ -1985,6 +2086,42 @@ restart:
1985 throttle_vm_writeout(sc->gfp_mask); 2086 throttle_vm_writeout(sc->gfp_mask);
1986} 2087}
1987 2088
2089/* Returns true if compaction should go ahead for a high-order request */
2090static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2091{
2092 unsigned long balance_gap, watermark;
2093 bool watermark_ok;
2094
2095 /* Do not consider compaction for orders reclaim is meant to satisfy */
2096 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2097 return false;
2098
2099 /*
2100 * Compaction takes time to run and there are potentially other
2101 * callers using the pages just freed. Continue reclaiming until
2102 * there is a buffer of free pages available to give compaction
2103 * a reasonable chance of completing and allocating the page
2104 */
2105 balance_gap = min(low_wmark_pages(zone),
2106 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2107 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2108 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2109 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2110
2111 /*
2112 * If compaction is deferred, reclaim up to a point where
2113 * compaction will have a chance of success when re-enabled
2114 */
2115 if (compaction_deferred(zone))
2116 return watermark_ok;
2117
2118 /* If compaction is not ready to start, keep reclaiming */
2119 if (!compaction_suitable(zone, sc->order))
2120 return false;
2121
2122 return watermark_ok;
2123}
2124
1988/* 2125/*
1989 * This is the direct reclaim path, for page-allocating processes. We only 2126 * This is the direct reclaim path, for page-allocating processes. We only
1990 * try to reclaim pages from zones which will satisfy the caller's allocation 2127 * try to reclaim pages from zones which will satisfy the caller's allocation
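compaction_ready(), added above, keeps reclaiming until a buffer of high_wmark plus a balance gap plus 2<<order pages is free, then hands over to compaction. A standalone sketch of that arithmetic with made-up zone numbers; the divisor of 100 mirrors KSWAPD_ZONE_BALANCE_GAP_RATIO but is hard-coded here purely for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long present_pages = 262144;  /* ~1 GiB zone of 4 KiB pages */
        unsigned long low_wmark = 2048, high_wmark = 3072;
        unsigned int order = 9;                /* a THP-sized request */
        unsigned long ratio = 100;             /* illustrative gap divisor */

        unsigned long balance_gap = (present_pages + ratio - 1) / ratio;
        if (balance_gap > low_wmark)
                balance_gap = low_wmark;       /* min(low_wmark, gap) */

        unsigned long watermark = high_wmark + balance_gap + (2UL << order);

        printf("keep reclaiming until %lu pages are free, then compact\n",
               watermark);
        return 0;
}
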
@@ -2000,14 +2137,20 @@ restart:
2000 * 2137 *
2001 * If a zone is deemed to be full of pinned pages then just give it a light 2138 * If a zone is deemed to be full of pinned pages then just give it a light
2002 * scan then give up on it. 2139 * scan then give up on it.
2140 *
2141 * This function returns true if a zone is being reclaimed for a costly
2142 * high-order allocation and compaction is ready to begin. This indicates to
2143 * the caller that it should consider retrying the allocation instead of
2144 * further reclaim.
2003 */ 2145 */
2004static void shrink_zones(int priority, struct zonelist *zonelist, 2146static bool shrink_zones(int priority, struct zonelist *zonelist,
2005 struct scan_control *sc) 2147 struct scan_control *sc)
2006{ 2148{
2007 struct zoneref *z; 2149 struct zoneref *z;
2008 struct zone *zone; 2150 struct zone *zone;
2009 unsigned long nr_soft_reclaimed; 2151 unsigned long nr_soft_reclaimed;
2010 unsigned long nr_soft_scanned; 2152 unsigned long nr_soft_scanned;
2153 bool aborted_reclaim = false;
2011 2154
2012 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2155 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2013 gfp_zone(sc->gfp_mask), sc->nodemask) { 2156 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2022,6 +2165,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2022 continue; 2165 continue;
2023 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2166 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2024 continue; /* Let kswapd poll it */ 2167 continue; /* Let kswapd poll it */
2168 if (COMPACTION_BUILD) {
2169 /*
2170 * If we already have plenty of memory free for
2171 * compaction in this zone, don't free any more.
2172 * Even though compaction is invoked for any
2173 * non-zero order, only frequent costly order
2174 * reclamation is disruptive enough to become a
 2175 * noticeable problem, like transparent huge page
2176 * allocations.
2177 */
2178 if (compaction_ready(zone, sc)) {
2179 aborted_reclaim = true;
2180 continue;
2181 }
2182 }
2025 /* 2183 /*
2026 * This steals pages from memory cgroups over softlimit 2184 * This steals pages from memory cgroups over softlimit
2027 * and returns the number of reclaimed pages and 2185 * and returns the number of reclaimed pages and
@@ -2039,6 +2197,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2039 2197
2040 shrink_zone(priority, zone, sc); 2198 shrink_zone(priority, zone, sc);
2041 } 2199 }
2200
2201 return aborted_reclaim;
2042} 2202}
2043 2203
2044static bool zone_reclaimable(struct zone *zone) 2204static bool zone_reclaimable(struct zone *zone)
@@ -2092,8 +2252,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2092 struct zoneref *z; 2252 struct zoneref *z;
2093 struct zone *zone; 2253 struct zone *zone;
2094 unsigned long writeback_threshold; 2254 unsigned long writeback_threshold;
2255 bool aborted_reclaim;
2095 2256
2096 get_mems_allowed();
2097 delayacct_freepages_start(); 2257 delayacct_freepages_start();
2098 2258
2099 if (scanning_global_lru(sc)) 2259 if (scanning_global_lru(sc))
@@ -2103,7 +2263,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2103 sc->nr_scanned = 0; 2263 sc->nr_scanned = 0;
2104 if (!priority) 2264 if (!priority)
2105 disable_swap_token(sc->mem_cgroup); 2265 disable_swap_token(sc->mem_cgroup);
2106 shrink_zones(priority, zonelist, sc); 2266 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2267
2107 /* 2268 /*
2108 * Don't shrink slabs when reclaiming memory from 2269 * Don't shrink slabs when reclaiming memory from
2109 * over limit cgroups 2270 * over limit cgroups
@@ -2155,7 +2316,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2155 2316
2156out: 2317out:
2157 delayacct_freepages_end(); 2318 delayacct_freepages_end();
2158 put_mems_allowed();
2159 2319
2160 if (sc->nr_reclaimed) 2320 if (sc->nr_reclaimed)
2161 return sc->nr_reclaimed; 2321 return sc->nr_reclaimed;
@@ -2168,6 +2328,10 @@ out:
2168 if (oom_killer_disabled) 2328 if (oom_killer_disabled)
2169 return 0; 2329 return 0;
2170 2330
2331 /* Aborted reclaim to try compaction? don't OOM, then */
2332 if (aborted_reclaim)
2333 return 1;
2334
2171 /* top priority shrink_zones still had more to do? don't OOM, then */ 2335 /* top priority shrink_zones still had more to do? don't OOM, then */
2172 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2336 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2173 return 1; 2337 return 1;
@@ -2459,6 +2623,9 @@ loop_again:
2459 high_wmark_pages(zone), 0, 0)) { 2623 high_wmark_pages(zone), 0, 0)) {
2460 end_zone = i; 2624 end_zone = i;
2461 break; 2625 break;
2626 } else {
2627 /* If balanced, clear the congested flag */
2628 zone_clear_flag(zone, ZONE_CONGESTED);
2462 } 2629 }
2463 } 2630 }
2464 if (i < 0) 2631 if (i < 0)
@@ -2695,7 +2862,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2695 * them before going back to sleep. 2862 * them before going back to sleep.
2696 */ 2863 */
2697 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2864 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2698 schedule(); 2865
2866 if (!kthread_should_stop())
2867 schedule();
2868
2699 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2869 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2700 } else { 2870 } else {
2701 if (remaining) 2871 if (remaining)
@@ -2722,7 +2892,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2722static int kswapd(void *p) 2892static int kswapd(void *p)
2723{ 2893{
2724 unsigned long order, new_order; 2894 unsigned long order, new_order;
2895 unsigned balanced_order;
2725 int classzone_idx, new_classzone_idx; 2896 int classzone_idx, new_classzone_idx;
2897 int balanced_classzone_idx;
2726 pg_data_t *pgdat = (pg_data_t*)p; 2898 pg_data_t *pgdat = (pg_data_t*)p;
2727 struct task_struct *tsk = current; 2899 struct task_struct *tsk = current;
2728 2900
@@ -2753,7 +2925,9 @@ static int kswapd(void *p)
2753 set_freezable(); 2925 set_freezable();
2754 2926
2755 order = new_order = 0; 2927 order = new_order = 0;
2928 balanced_order = 0;
2756 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2929 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2930 balanced_classzone_idx = classzone_idx;
2757 for ( ; ; ) { 2931 for ( ; ; ) {
2758 int ret; 2932 int ret;
2759 2933
@@ -2762,7 +2936,8 @@ static int kswapd(void *p)
2762 * new request of a similar or harder type will succeed soon 2936 * new request of a similar or harder type will succeed soon
2763 * so consider going to sleep on the basis we reclaimed at 2937 * so consider going to sleep on the basis we reclaimed at
2764 */ 2938 */
2765 if (classzone_idx >= new_classzone_idx && order == new_order) { 2939 if (balanced_classzone_idx >= new_classzone_idx &&
2940 balanced_order == new_order) {
2766 new_order = pgdat->kswapd_max_order; 2941 new_order = pgdat->kswapd_max_order;
2767 new_classzone_idx = pgdat->classzone_idx; 2942 new_classzone_idx = pgdat->classzone_idx;
2768 pgdat->kswapd_max_order = 0; 2943 pgdat->kswapd_max_order = 0;
@@ -2777,9 +2952,12 @@ static int kswapd(void *p)
2777 order = new_order; 2952 order = new_order;
2778 classzone_idx = new_classzone_idx; 2953 classzone_idx = new_classzone_idx;
2779 } else { 2954 } else {
2780 kswapd_try_to_sleep(pgdat, order, classzone_idx); 2955 kswapd_try_to_sleep(pgdat, balanced_order,
2956 balanced_classzone_idx);
2781 order = pgdat->kswapd_max_order; 2957 order = pgdat->kswapd_max_order;
2782 classzone_idx = pgdat->classzone_idx; 2958 classzone_idx = pgdat->classzone_idx;
2959 new_order = order;
2960 new_classzone_idx = classzone_idx;
2783 pgdat->kswapd_max_order = 0; 2961 pgdat->kswapd_max_order = 0;
2784 pgdat->classzone_idx = pgdat->nr_zones - 1; 2962 pgdat->classzone_idx = pgdat->nr_zones - 1;
2785 } 2963 }
@@ -2794,7 +2972,9 @@ static int kswapd(void *p)
2794 */ 2972 */
2795 if (!ret) { 2973 if (!ret) {
2796 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2974 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2797 order = balance_pgdat(pgdat, order, &classzone_idx); 2975 balanced_classzone_idx = classzone_idx;
2976 balanced_order = balance_pgdat(pgdat, order,
2977 &balanced_classzone_idx);
2798 } 2978 }
2799 } 2979 }
2800 return 0; 2980 return 0;
@@ -2952,14 +3132,17 @@ int kswapd_run(int nid)
2952} 3132}
2953 3133
2954/* 3134/*
2955 * Called by memory hotplug when all memory in a node is offlined. 3135 * Called by memory hotplug when all memory in a node is offlined. Caller must
3136 * hold lock_memory_hotplug().
2956 */ 3137 */
2957void kswapd_stop(int nid) 3138void kswapd_stop(int nid)
2958{ 3139{
2959 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3140 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2960 3141
2961 if (kswapd) 3142 if (kswapd) {
2962 kthread_stop(kswapd); 3143 kthread_stop(kswapd);
3144 NODE_DATA(nid)->kswapd = NULL;
3145 }
2963} 3146}
2964 3147
2965static int __init kswapd_init(void) 3148static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b..6559013c5a1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
78 * 78 *
79 * vm_stat contains the global counters 79 * vm_stat contains the global counters
80 */ 80 */
81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
82EXPORT_SYMBOL(vm_stat); 82EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP