author		Todd Poynor	2012-11-01 15:36:34 -0500
committer	Todd Poynor	2012-11-01 15:36:34 -0500
commit		925d49abc38dcc7ef1cbfe125c6f0b2202ae3df3 (patch)
tree		a56506710f0340db055191e3cf0a207699c1b849 /mm
parent		834029ac9d0ad8dea4e6a21bc34877dc3740b9f4 (diff)
parent		27d0858dbcf199838b8c50a3e94d397bf326d986 (diff)
Merge remote-tracking branch 'stable/linux-3.0.y' into android-3.0
Change-Id: I9685feb9277b450da10d78a455b3c0674d6cfe18
Signed-off-by: Todd Poynor <toddpoynor@google.com>
Diffstat (limited to 'mm')
-rw-r--r--	mm/compaction.c		 31
-rw-r--r--	mm/filemap.c		 11
-rw-r--r--	mm/hugetlb.c		 68
-rw-r--r--	mm/madvise.c		 16
-rw-r--r--	mm/memcontrol.c		  9
-rw-r--r--	mm/memory-failure.c	  6
-rw-r--r--	mm/memory_hotplug.c	 18
-rw-r--r--	mm/mempolicy.c		148
-rw-r--r--	mm/migrate.c		240
-rw-r--r--	mm/mmu_notifier.c	 45
-rw-r--r--	mm/nobootmem.c		  3
-rw-r--r--	mm/page_alloc.c		120
-rw-r--r--	mm/percpu.c		 10
-rw-r--r--	mm/shmem.c		  6
-rw-r--r--	mm/slab.c		 13
-rw-r--r--	mm/slub.c		 40
-rw-r--r--	mm/truncate.c		  3
-rw-r--r--	mm/vmalloc.c		 11
-rw-r--r--	mm/vmscan.c		305
-rw-r--r--	mm/vmstat.c		  2
20 files changed, 771 insertions, 334 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index c4bc5acf865..8ea7308601b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
 
-	/* Account for isolated anon and file pages */
-	unsigned long nr_anon;
-	unsigned long nr_file;
-
 	unsigned int order;		/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
 static void acct_isolated(struct zone *zone, struct compact_control *cc)
 {
 	struct page *page;
-	unsigned int count[NR_LRU_LISTS] = { 0, };
+	unsigned int count[2] = { 0, };
 
-	list_for_each_entry(page, &cc->migratepages, lru) {
-		int lru = page_lru_base_type(page);
-		count[lru]++;
-	}
+	list_for_each_entry(page, &cc->migratepages, lru)
+		count[!!page_is_file_cache(page)]++;
 
-	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
-	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct list_head *migratelist = &cc->migratepages;
+	isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
 
 	/* Do not scan outside zone boundaries */
 	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -378,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			continue;
 		}
 
+		if (!cc->sync)
+			mode |= ISOLATE_ASYNC_MIGRATE;
+
 		/* Try isolate the page */
-		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+		if (__isolate_lru_page(page, mode, 0) != 0)
 			continue;
 
 		VM_BUG_ON(PageTransCompound(page));
@@ -581,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
 				(unsigned long)cc, false,
-				cc->sync);
+				cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
 
@@ -596,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		if (err) {
 			putback_lru_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
+			if (err == -ENOMEM) {
+				ret = COMPACT_PARTIAL;
+				goto out;
+			}
 		}
-
 	}
 
 out:
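Note: the MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT and MIGRATE_SYNC values used in the hunks above and below replace the old "bool sync" argument to migrate_pages(). The enum itself lives outside mm/ (include/linux/migrate_mode.h upstream), so it does not appear in this diffstat; a minimal sketch of the assumed definition, for reference only:

	/* sketch, not part of this diff: assumed contents of include/linux/migrate_mode.h */
	enum migrate_mode {
		MIGRATE_ASYNC,		/* never block, bail out instead of stalling */
		MIGRATE_SYNC_LIGHT,	/* may block on most locks, but not on writeback */
		MIGRATE_SYNC,		/* may block and wait for/issue writeback */
	};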
diff --git a/mm/filemap.c b/mm/filemap.c
index b7d860390f3..10481ebd96c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -516,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
 	struct page *page;
 
 	if (cpuset_do_page_mem_spread()) {
-		get_mems_allowed();
-		n = cpuset_mem_spread_node();
-		page = alloc_pages_exact_node(n, gfp, 0);
-		put_mems_allowed();
+		unsigned int cpuset_mems_cookie;
+		do {
+			cpuset_mems_cookie = get_mems_allowed();
+			n = cpuset_mem_spread_node();
+			page = alloc_pages_exact_node(n, gfp, 0);
+		} while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
 		return page;
 	}
 	return alloc_pages(gfp, 0);
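The do/while above is the cpuset mems_allowed retry pattern this merge applies to several allocators (see mm/hugetlb.c and mm/mempolicy.c below): get_mems_allowed() now returns a cookie and put_mems_allowed() reports whether the allowed node mask changed while the allocation ran, so a NULL result is retried instead of being returned spuriously. The cookie API itself is assumed to come from the matching include/linux/cpuset.h change, which is outside this mm-only view; a hedged sketch of a caller:

	/* sketch only; alloc_spread_page is illustrative, not part of this diff */
	static struct page *alloc_spread_page(gfp_t gfp)
	{
		unsigned int cookie;
		struct page *page;

		do {
			cookie = get_mems_allowed();	/* begin read side of the mems_allowed seqcount */
			page = alloc_pages_exact_node(cpuset_mem_spread_node(), gfp, 0);
		} while (!put_mems_allowed(cookie) && !page);	/* retry only on a concurrent mask change */

		return page;
	}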
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f7001ac53b3..037f077b986 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
+	unsigned int cpuset_mems_cookie;
 
-	get_mems_allowed();
+retry_cpuset:
+	cpuset_mems_cookie = get_mems_allowed();
 	zonelist = huge_zonelist(vma, address,
 					htlb_alloc_mask, &mpol, &nodemask);
 	/*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 			}
 		}
 	}
-err:
+
 	mpol_cond_put(mpol);
-	put_mems_allowed();
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
 	return page;
+
+err:
+	mpol_cond_put(mpol);
+	return NULL;
 }
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -2060,6 +2067,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 		kref_get(&reservations->refs);
 }
 
+static void resv_map_put(struct vm_area_struct *vma)
+{
+	struct resv_map *reservations = vma_resv_map(vma);
+
+	if (!reservations)
+		return;
+	kref_put(&reservations->refs, resv_map_release);
+}
+
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
 	struct hstate *h = hstate_vma(vma);
@@ -2075,7 +2091,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		reserve = (end - start) -
 			region_count(&reservations->regions, start, end);
 
-		kref_put(&reservations->refs, resv_map_release);
+		resv_map_put(vma);
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
@@ -2285,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
 	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
 	__unmap_hugepage_range(vma, start, end, ref_page);
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out. We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex below.
+	 *
+	 * This works because in the contexts this is called, the VMA is
+	 * going to be destroyed. It is not vunerable to madvise(DONTNEED)
+	 * because madvise is not supported on hugetlbfs. The same applies
+	 * for direct IO. unmap_hugepage_range() is only being called just
+	 * before free_pgtables() so clearing VM_MAYSHARE will not cause
+	 * surprises later.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
 	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
@@ -2398,7 +2430,6 @@ retry_avoidcopy:
 		if (outside_reserve) {
 			BUG_ON(huge_pte_none(pte));
 			if (unmap_ref_private(mm, vma, old_page, address)) {
-				BUG_ON(page_count(old_page) != 1);
 				BUG_ON(huge_pte_none(pte));
 				spin_lock(&mm->page_table_lock);
 				goto retry_avoidcopy;
@@ -2838,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
@@ -2878,12 +2914,16 @@ int hugetlb_reserve_pages(struct inode *inode,
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
-	if (chg < 0)
-		return chg;
+	if (chg < 0) {
+		ret = chg;
+		goto out_err;
+	}
 
 	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
-		return -ENOSPC;
+	if (hugetlb_get_quota(inode->i_mapping, chg)) {
+		ret = -ENOSPC;
+		goto out_err;
+	}
 
 	/*
 	 * Check enough hugepages are available for the reservation.
@@ -2892,7 +2932,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
-		return ret;
+		goto out_err;
 	}
 
 	/*
@@ -2909,6 +2949,10 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (!vma || vma->vm_flags & VM_MAYSHARE)
 		region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
+out_err:
+	if (vma)
+		resv_map_put(vma);
+	return ret;
 }
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed50..deabe5f603a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
+#include <linux/file.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma,
 	struct address_space *mapping;
 	loff_t offset, endoff;
 	int error;
+	struct file *f;
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
 
 	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
 		return -EINVAL;
 
-	if (!vma->vm_file || !vma->vm_file->f_mapping
-		|| !vma->vm_file->f_mapping->host) {
+	f = vma->vm_file;
+
+	if (!f || !f->f_mapping || !f->f_mapping->host) {
 		return -EINVAL;
 	}
 
@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma,
 	endoff = (loff_t)(end - vma->vm_start - 1)
 			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	/* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+	/*
+	 * vmtruncate_range may need to take i_mutex and i_alloc_sem.
+	 * We need to explicitly grab a reference because the vma (and
+	 * hence the vma's reference to the file) can go away as soon as
+	 * we drop mmap_sem.
+	 */
+	get_file(f);
 	up_read(&current->mm->mmap_sem);
 	error = vmtruncate_range(mapping->host, offset, endoff);
+	fput(f);
 	down_read(&current->mm->mmap_sem);
 	return error;
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 283068f5af9..57cdf5ad692 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
-					int mode, struct zone *z,
+					isolate_mode_t mode,
+					struct zone *z,
 					struct mem_cgroup *mem_cont,
 					int active, int file)
 {
@@ -4605,6 +4606,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 swap_buffers:
 	/* Swap primary and spare array */
 	thresholds->spare = thresholds->primary;
+	/* If all events are unregistered, free the spare array */
+	if (!new) {
+		kfree(thresholds->spare);
+		thresholds->spare = NULL;
+	}
+
 	rcu_assign_pointer(thresholds->primary, new);
 
 	/* To be sure that nobody uses thresholds */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059..2f49dcf4f47 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1334,8 +1334,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	/* Keep page count to indicate a given hugepage is isolated. */
 
 	list_add(&hpage->lru, &pagelist);
-	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
-				true);
+	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
+				MIGRATE_SYNC);
 	if (ret) {
 		struct page *page1, *page2;
 		list_for_each_entry_safe(page1, page2, &pagelist, lru)
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
 					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-							0, true);
+							false, MIGRATE_SYNC);
 		if (ret) {
 			putback_lru_pages(&pagelist);
 			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11..e0a3e51d519 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -116,9 +116,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 	struct mem_section *ms;
 	struct page *page, *memmap;
 
-	if (!pfn_valid(start_pfn))
-		return;
-
 	section_nr = pfn_to_section_nr(start_pfn);
 	ms = __nr_to_section(section_nr);
 
@@ -177,9 +174,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 	end_pfn = pfn + pgdat->node_spanned_pages;
 
 	/* register_section info */
-	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
-		register_page_bootmem_info_section(pfn);
-
+	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		/*
+		 * Some platforms can assign the same pfn to multiple nodes - on
+		 * node0 as well as nodeN. To avoid registering a pfn against
+		 * multiple nodes we check that this pfn does not already
+		 * reside in some other node.
+		 */
+		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+			register_page_bootmem_info_section(pfn);
+	}
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
@@ -747,7 +751,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		}
 		/* this function returns # of failed pages */
 		ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
-								true, true);
+								true, MIGRATE_SYNC);
 		if (ret)
 			putback_lru_pages(&source);
 	}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a85171de5d0..5dce7d46f79 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -606,24 +606,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	return first;
 }
 
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+						struct mempolicy *pol)
 {
-	int err = 0;
-	struct mempolicy *old = vma->vm_policy;
+	int err;
+	struct mempolicy *old;
+	struct mempolicy *new;
 
 	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 		 vma->vm_ops, vma->vm_file,
 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 
-	if (vma->vm_ops && vma->vm_ops->set_policy)
+	new = mpol_dup(pol);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	if (vma->vm_ops && vma->vm_ops->set_policy) {
 		err = vma->vm_ops->set_policy(vma, new);
-	if (!err) {
-		mpol_get(new);
-		vma->vm_policy = new;
-		mpol_put(old);
+		if (err)
+			goto err_out;
 	}
+
+	old = vma->vm_policy;
+	vma->vm_policy = new;	/* protected by mmap_sem */
+	mpol_put(old);
+
+	return 0;
+ err_out:
+	mpol_put(new);
 	return err;
 }
 
@@ -666,7 +681,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
 			if (err)
 				goto out;
 		}
-		err = policy_vma(vma, new_pol);
+		err = vma_replace_policy(vma, new_pol);
 		if (err)
 			goto out;
 	}
@@ -933,7 +948,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, new_node_page, dest,
-							false, true);
+							false, MIGRATE_SYNC);
 		if (err)
 			putback_lru_pages(&pagelist);
 	}
@@ -1496,8 +1511,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
 									addr);
 		if (vpol)
 			pol = vpol;
-	} else if (vma->vm_policy)
+	} else if (vma->vm_policy) {
 		pol = vma->vm_policy;
+
+		/*
+		 * shmem_alloc_page() passes MPOL_F_SHARED policy with
+		 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+		 * count on these policies which will be dropped by
+		 * mpol_cond_put() later
+		 */
+		if (mpol_needs_cond_ref(pol))
+			mpol_get(pol);
+	}
 	}
 	if (!pol)
 		pol = &default_policy;
@@ -1817,18 +1842,24 @@ struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, int node)
 {
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	struct mempolicy *pol;
 	struct zonelist *zl;
 	struct page *page;
+	unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+	pol = get_vma_policy(current, vma, addr);
+	cpuset_mems_cookie = get_mems_allowed();
 
-	get_mems_allowed();
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
-		put_mems_allowed();
+		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+			goto retry_cpuset;
+
 		return page;
 	}
 	zl = policy_zonelist(gfp, pol, node);
@@ -1839,7 +1870,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		struct page *page = __alloc_pages_nodemask(gfp, order,
 						zl, policy_nodemask(gfp, pol));
 		__mpol_put(pol);
-		put_mems_allowed();
+		if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+			goto retry_cpuset;
 		return page;
 	}
 	/*
@@ -1847,7 +1879,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	 */
 	page = __alloc_pages_nodemask(gfp, order, zl,
 				      policy_nodemask(gfp, pol));
-	put_mems_allowed();
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
 	return page;
 }
 
@@ -1874,11 +1907,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
 	struct mempolicy *pol = current->mempolicy;
 	struct page *page;
+	unsigned int cpuset_mems_cookie;
 
 	if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
 		pol = &default_policy;
 
-	get_mems_allowed();
+retry_cpuset:
+	cpuset_mems_cookie = get_mems_allowed();
+
 	/*
 	 * No reference counting needed for current->mempolicy
 	 * nor system default_policy
@@ -1889,7 +1925,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		page = __alloc_pages_nodemask(gfp, order,
 			policy_zonelist(gfp, pol, numa_node_id()),
 			policy_nodemask(gfp, pol));
-	put_mems_allowed();
+
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
+
 	return page;
 }
 EXPORT_SYMBOL(alloc_pages_current);
@@ -1992,7 +2031,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  */
 
 /* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
 static struct sp_node *
 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 {
@@ -2056,36 +2095,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
 	if (!sp->root.rb_node)
 		return NULL;
-	spin_lock(&sp->lock);
+	mutex_lock(&sp->mutex);
 	sn = sp_lookup(sp, idx, idx+1);
 	if (sn) {
 		mpol_get(sn->policy);
 		pol = sn->policy;
 	}
-	spin_unlock(&sp->lock);
+	mutex_unlock(&sp->mutex);
 	return pol;
 }
 
+static void sp_free(struct sp_node *n)
+{
+	mpol_put(n->policy);
+	kmem_cache_free(sn_cache, n);
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
 	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
 	rb_erase(&n->nd, &sp->root);
-	mpol_put(n->policy);
-	kmem_cache_free(sn_cache, n);
+	sp_free(n);
 }
 
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 				struct mempolicy *pol)
 {
-	struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+	struct sp_node *n;
+	struct mempolicy *newpol;
 
+	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
 	if (!n)
 		return NULL;
+
+	newpol = mpol_dup(pol);
+	if (IS_ERR(newpol)) {
+		kmem_cache_free(sn_cache, n);
+		return NULL;
+	}
+	newpol->flags |= MPOL_F_SHARED;
+
 	n->start = start;
 	n->end = end;
-	mpol_get(pol);
-	pol->flags |= MPOL_F_SHARED;	/* for unref */
-	n->policy = pol;
+	n->policy = newpol;
+
 	return n;
 }
 
@@ -2093,10 +2146,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
 				 unsigned long end, struct sp_node *new)
 {
-	struct sp_node *n, *new2 = NULL;
+	struct sp_node *n;
+	int ret = 0;
 
-restart:
-	spin_lock(&sp->lock);
+	mutex_lock(&sp->mutex);
 	n = sp_lookup(sp, start, end);
 	/* Take care of old policies in the same range. */
 	while (n && n->start < end) {
@@ -2109,16 +2162,14 @@ restart:
 		} else {
 			/* Old policy spanning whole new range. */
 			if (n->end > end) {
+				struct sp_node *new2;
+				new2 = sp_alloc(end, n->end, n->policy);
 				if (!new2) {
-					spin_unlock(&sp->lock);
-					new2 = sp_alloc(end, n->end, n->policy);
-					if (!new2)
-						return -ENOMEM;
-					goto restart;
+					ret = -ENOMEM;
+					goto out;
 				}
 				n->end = start;
 				sp_insert(sp, new2);
-				new2 = NULL;
 				break;
 			} else
 				n->end = start;
@@ -2129,12 +2180,9 @@ restart:
 	}
 	if (new)
 		sp_insert(sp, new);
-	spin_unlock(&sp->lock);
-	if (new2) {
-		mpol_put(new2->policy);
-		kmem_cache_free(sn_cache, new2);
-	}
-	return 0;
+out:
+	mutex_unlock(&sp->mutex);
+	return ret;
 }
 
 /**
@@ -2152,7 +2200,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 	int ret;
 
 	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
-	spin_lock_init(&sp->lock);
+	mutex_init(&sp->mutex);
 
 	if (mpol) {
 		struct vm_area_struct pvma;
@@ -2206,7 +2254,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	}
 	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
 	if (err && new)
-		kmem_cache_free(sn_cache, new);
+		sp_free(new);
 	return err;
 }
 
@@ -2218,16 +2266,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
 
 	if (!p->root.rb_node)
 		return;
-	spin_lock(&p->lock);
+	mutex_lock(&p->mutex);
 	next = rb_first(&p->root);
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
-		rb_erase(&n->nd, &p->root);
-		mpol_put(n->policy);
-		kmem_cache_free(sn_cache, n);
+		sp_delete(p, n);
 	}
-	spin_unlock(&p->lock);
+	mutex_unlock(&p->mutex);
 }
 
 /* assumes fs == KERNEL_DS */
@@ -2493,7 +2539,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
 		break;
 
 	default:
-		BUG();
+		return -EINVAL;
 	}
 
 	l = strlen(policy_modes[mode]);
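The sp->lock to sp->mutex conversion above relies on a matching change to struct shared_policy in include/linux/mempolicy.h, which is not visible in this mm-limited diffstat. The mutex lets shared_policy_replace() and sp_alloc() do GFP_KERNEL allocations (mpol_dup()) while the tree is held, removing the old unlock-and-restart dance. A sketch of the assumed structure change, for reference only:

	/* sketch, not part of this diff: assumed include/linux/mempolicy.h change */
	struct shared_policy {
		struct rb_root root;
		struct mutex mutex;	/* was: spinlock_t lock; */
	};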
diff --git a/mm/migrate.c b/mm/migrate.c index 14d0a6a632f..480714b6f3f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -220,6 +220,56 @@ out: | |||
220 | pte_unmap_unlock(ptep, ptl); | 220 | pte_unmap_unlock(ptep, ptl); |
221 | } | 221 | } |
222 | 222 | ||
223 | #ifdef CONFIG_BLOCK | ||
224 | /* Returns true if all buffers are successfully locked */ | ||
225 | static bool buffer_migrate_lock_buffers(struct buffer_head *head, | ||
226 | enum migrate_mode mode) | ||
227 | { | ||
228 | struct buffer_head *bh = head; | ||
229 | |||
230 | /* Simple case, sync compaction */ | ||
231 | if (mode != MIGRATE_ASYNC) { | ||
232 | do { | ||
233 | get_bh(bh); | ||
234 | lock_buffer(bh); | ||
235 | bh = bh->b_this_page; | ||
236 | |||
237 | } while (bh != head); | ||
238 | |||
239 | return true; | ||
240 | } | ||
241 | |||
242 | /* async case, we cannot block on lock_buffer so use trylock_buffer */ | ||
243 | do { | ||
244 | get_bh(bh); | ||
245 | if (!trylock_buffer(bh)) { | ||
246 | /* | ||
247 | * We failed to lock the buffer and cannot stall in | ||
248 | * async migration. Release the taken locks | ||
249 | */ | ||
250 | struct buffer_head *failed_bh = bh; | ||
251 | put_bh(failed_bh); | ||
252 | bh = head; | ||
253 | while (bh != failed_bh) { | ||
254 | unlock_buffer(bh); | ||
255 | put_bh(bh); | ||
256 | bh = bh->b_this_page; | ||
257 | } | ||
258 | return false; | ||
259 | } | ||
260 | |||
261 | bh = bh->b_this_page; | ||
262 | } while (bh != head); | ||
263 | return true; | ||
264 | } | ||
265 | #else | ||
266 | static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, | ||
267 | enum migrate_mode mode) | ||
268 | { | ||
269 | return true; | ||
270 | } | ||
271 | #endif /* CONFIG_BLOCK */ | ||
272 | |||
223 | /* | 273 | /* |
224 | * Replace the page in the mapping. | 274 | * Replace the page in the mapping. |
225 | * | 275 | * |
@@ -229,7 +279,8 @@ out: | |||
229 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. | 279 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. |
230 | */ | 280 | */ |
231 | static int migrate_page_move_mapping(struct address_space *mapping, | 281 | static int migrate_page_move_mapping(struct address_space *mapping, |
232 | struct page *newpage, struct page *page) | 282 | struct page *newpage, struct page *page, |
283 | struct buffer_head *head, enum migrate_mode mode) | ||
233 | { | 284 | { |
234 | int expected_count; | 285 | int expected_count; |
235 | void **pslot; | 286 | void **pslot; |
@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
259 | } | 310 | } |
260 | 311 | ||
261 | /* | 312 | /* |
313 | * In the async migration case of moving a page with buffers, lock the | ||
314 | * buffers using trylock before the mapping is moved. If the mapping | ||
315 | * was moved, we later failed to lock the buffers and could not move | ||
316 | * the mapping back due to an elevated page count, we would have to | ||
317 | * block waiting on other references to be dropped. | ||
318 | */ | ||
319 | if (mode == MIGRATE_ASYNC && head && | ||
320 | !buffer_migrate_lock_buffers(head, mode)) { | ||
321 | page_unfreeze_refs(page, expected_count); | ||
322 | spin_unlock_irq(&mapping->tree_lock); | ||
323 | return -EAGAIN; | ||
324 | } | ||
325 | |||
326 | /* | ||
262 | * Now we know that no one else is looking at the page. | 327 | * Now we know that no one else is looking at the page. |
263 | */ | 328 | */ |
264 | get_page(newpage); /* add cache reference */ | 329 | get_page(newpage); /* add cache reference */ |
@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page); | |||
415 | * Pages are locked upon entry and exit. | 480 | * Pages are locked upon entry and exit. |
416 | */ | 481 | */ |
417 | int migrate_page(struct address_space *mapping, | 482 | int migrate_page(struct address_space *mapping, |
418 | struct page *newpage, struct page *page) | 483 | struct page *newpage, struct page *page, |
484 | enum migrate_mode mode) | ||
419 | { | 485 | { |
420 | int rc; | 486 | int rc; |
421 | 487 | ||
422 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 488 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
423 | 489 | ||
424 | rc = migrate_page_move_mapping(mapping, newpage, page); | 490 | rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); |
425 | 491 | ||
426 | if (rc) | 492 | if (rc) |
427 | return rc; | 493 | return rc; |
@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page); | |||
438 | * exist. | 504 | * exist. |
439 | */ | 505 | */ |
440 | int buffer_migrate_page(struct address_space *mapping, | 506 | int buffer_migrate_page(struct address_space *mapping, |
441 | struct page *newpage, struct page *page) | 507 | struct page *newpage, struct page *page, enum migrate_mode mode) |
442 | { | 508 | { |
443 | struct buffer_head *bh, *head; | 509 | struct buffer_head *bh, *head; |
444 | int rc; | 510 | int rc; |
445 | 511 | ||
446 | if (!page_has_buffers(page)) | 512 | if (!page_has_buffers(page)) |
447 | return migrate_page(mapping, newpage, page); | 513 | return migrate_page(mapping, newpage, page, mode); |
448 | 514 | ||
449 | head = page_buffers(page); | 515 | head = page_buffers(page); |
450 | 516 | ||
451 | rc = migrate_page_move_mapping(mapping, newpage, page); | 517 | rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); |
452 | 518 | ||
453 | if (rc) | 519 | if (rc) |
454 | return rc; | 520 | return rc; |
455 | 521 | ||
456 | bh = head; | 522 | /* |
457 | do { | 523 | * In the async case, migrate_page_move_mapping locked the buffers |
458 | get_bh(bh); | 524 | * with an IRQ-safe spinlock held. In the sync case, the buffers |
459 | lock_buffer(bh); | 525 | * need to be locked now |
460 | bh = bh->b_this_page; | 526 | */ |
461 | 527 | if (mode != MIGRATE_ASYNC) | |
462 | } while (bh != head); | 528 | BUG_ON(!buffer_migrate_lock_buffers(head, mode)); |
463 | 529 | ||
464 | ClearPagePrivate(page); | 530 | ClearPagePrivate(page); |
465 | set_page_private(newpage, page_private(page)); | 531 | set_page_private(newpage, page_private(page)); |
@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page) | |||
536 | * Default handling if a filesystem does not provide a migration function. | 602 | * Default handling if a filesystem does not provide a migration function. |
537 | */ | 603 | */ |
538 | static int fallback_migrate_page(struct address_space *mapping, | 604 | static int fallback_migrate_page(struct address_space *mapping, |
539 | struct page *newpage, struct page *page) | 605 | struct page *newpage, struct page *page, enum migrate_mode mode) |
540 | { | 606 | { |
541 | if (PageDirty(page)) | 607 | if (PageDirty(page)) { |
608 | /* Only writeback pages in full synchronous migration */ | ||
609 | if (mode != MIGRATE_SYNC) | ||
610 | return -EBUSY; | ||
542 | return writeout(mapping, page); | 611 | return writeout(mapping, page); |
612 | } | ||
543 | 613 | ||
544 | /* | 614 | /* |
545 | * Buffers may be managed in a filesystem specific way. | 615 | * Buffers may be managed in a filesystem specific way. |
@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
549 | !try_to_release_page(page, GFP_KERNEL)) | 619 | !try_to_release_page(page, GFP_KERNEL)) |
550 | return -EAGAIN; | 620 | return -EAGAIN; |
551 | 621 | ||
552 | return migrate_page(mapping, newpage, page); | 622 | return migrate_page(mapping, newpage, page, mode); |
553 | } | 623 | } |
554 | 624 | ||
555 | /* | 625 | /* |
@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
564 | * == 0 - success | 634 | * == 0 - success |
565 | */ | 635 | */ |
566 | static int move_to_new_page(struct page *newpage, struct page *page, | 636 | static int move_to_new_page(struct page *newpage, struct page *page, |
567 | int remap_swapcache, bool sync) | 637 | int remap_swapcache, enum migrate_mode mode) |
568 | { | 638 | { |
569 | struct address_space *mapping; | 639 | struct address_space *mapping; |
570 | int rc; | 640 | int rc; |
@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
585 | 655 | ||
586 | mapping = page_mapping(page); | 656 | mapping = page_mapping(page); |
587 | if (!mapping) | 657 | if (!mapping) |
588 | rc = migrate_page(mapping, newpage, page); | 658 | rc = migrate_page(mapping, newpage, page, mode); |
589 | else { | 659 | else if (mapping->a_ops->migratepage) |
590 | /* | 660 | /* |
591 | * Do not writeback pages if !sync and migratepage is | 661 | * Most pages have a mapping and most filesystems provide a |
592 | * not pointing to migrate_page() which is nonblocking | 662 | * migratepage callback. Anonymous pages are part of swap |
593 | * (swapcache/tmpfs uses migratepage = migrate_page). | 663 | * space which also has its own migratepage callback. This |
664 | * is the most common path for page migration. | ||
594 | */ | 665 | */ |
595 | if (PageDirty(page) && !sync && | 666 | rc = mapping->a_ops->migratepage(mapping, |
596 | mapping->a_ops->migratepage != migrate_page) | 667 | newpage, page, mode); |
597 | rc = -EBUSY; | 668 | else |
598 | else if (mapping->a_ops->migratepage) | 669 | rc = fallback_migrate_page(mapping, newpage, page, mode); |
599 | /* | ||
600 | * Most pages have a mapping and most filesystems | ||
601 | * should provide a migration function. Anonymous | ||
602 | * pages are part of swap space which also has its | ||
603 | * own migration function. This is the most common | ||
604 | * path for page migration. | ||
605 | */ | ||
606 | rc = mapping->a_ops->migratepage(mapping, | ||
607 | newpage, page); | ||
608 | else | ||
609 | rc = fallback_migrate_page(mapping, newpage, page); | ||
610 | } | ||
611 | 670 | ||
612 | if (rc) { | 671 | if (rc) { |
613 | newpage->mapping = NULL; | 672 | newpage->mapping = NULL; |
@@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
621 | return rc; | 680 | return rc; |
622 | } | 681 | } |
623 | 682 | ||
624 | /* | 683 | static int __unmap_and_move(struct page *page, struct page *newpage, |
625 | * Obtain the lock on page, remove all ptes and migrate the page | 684 | int force, bool offlining, enum migrate_mode mode) |
626 | * to the newly allocated page in newpage. | ||
627 | */ | ||
628 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
629 | struct page *page, int force, bool offlining, bool sync) | ||
630 | { | 685 | { |
631 | int rc = 0; | 686 | int rc = -EAGAIN; |
632 | int *result = NULL; | ||
633 | struct page *newpage = get_new_page(page, private, &result); | ||
634 | int remap_swapcache = 1; | 687 | int remap_swapcache = 1; |
635 | int charge = 0; | 688 | int charge = 0; |
636 | struct mem_cgroup *mem; | 689 | struct mem_cgroup *mem; |
637 | struct anon_vma *anon_vma = NULL; | 690 | struct anon_vma *anon_vma = NULL; |
638 | 691 | ||
639 | if (!newpage) | ||
640 | return -ENOMEM; | ||
641 | |||
642 | if (page_count(page) == 1) { | ||
643 | /* page was freed from under us. So we are done. */ | ||
644 | goto move_newpage; | ||
645 | } | ||
646 | if (unlikely(PageTransHuge(page))) | ||
647 | if (unlikely(split_huge_page(page))) | ||
648 | goto move_newpage; | ||
649 | |||
650 | /* prepare cgroup just returns 0 or -ENOMEM */ | ||
651 | rc = -EAGAIN; | ||
652 | |||
653 | if (!trylock_page(page)) { | 692 | if (!trylock_page(page)) { |
654 | if (!force || !sync) | 693 | if (!force || mode == MIGRATE_ASYNC) |
655 | goto move_newpage; | 694 | goto out; |
656 | 695 | ||
657 | /* | 696 | /* |
658 | * It's not safe for direct compaction to call lock_page. | 697 | * It's not safe for direct compaction to call lock_page. |
@@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
668 | * altogether. | 707 | * altogether. |
669 | */ | 708 | */ |
670 | if (current->flags & PF_MEMALLOC) | 709 | if (current->flags & PF_MEMALLOC) |
671 | goto move_newpage; | 710 | goto out; |
672 | 711 | ||
673 | lock_page(page); | 712 | lock_page(page); |
674 | } | 713 | } |
@@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
697 | 736 | ||
698 | if (PageWriteback(page)) { | 737 | if (PageWriteback(page)) { |
699 | /* | 738 | /* |
700 | * For !sync, there is no point retrying as the retry loop | 739 | * Only in the case of a full syncronous migration is it |
701 | * is expected to be too short for PageWriteback to be cleared | 740 | * necessary to wait for PageWriteback. In the async case, |
741 | * the retry loop is too short and in the sync-light case, | ||
742 | * the overhead of stalling is too much | ||
702 | */ | 743 | */ |
703 | if (!sync) { | 744 | if (mode != MIGRATE_SYNC) { |
704 | rc = -EBUSY; | 745 | rc = -EBUSY; |
705 | goto uncharge; | 746 | goto uncharge; |
706 | } | 747 | } |
@@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
771 | 812 | ||
772 | skip_unmap: | 813 | skip_unmap: |
773 | if (!page_mapped(page)) | 814 | if (!page_mapped(page)) |
774 | rc = move_to_new_page(newpage, page, remap_swapcache, sync); | 815 | rc = move_to_new_page(newpage, page, remap_swapcache, mode); |
775 | 816 | ||
776 | if (rc && remap_swapcache) | 817 | if (rc && remap_swapcache) |
777 | remove_migration_ptes(page, page); | 818 | remove_migration_ptes(page, page); |
@@ -785,27 +826,53 @@ uncharge: | |||
785 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | 826 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
786 | unlock: | 827 | unlock: |
787 | unlock_page(page); | 828 | unlock_page(page); |
829 | out: | ||
830 | return rc; | ||
831 | } | ||
788 | 832 | ||
789 | move_newpage: | 833 | /* |
834 | * Obtain the lock on page, remove all ptes and migrate the page | ||
835 | * to the newly allocated page in newpage. | ||
836 | */ | ||
837 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
838 | struct page *page, int force, bool offlining, | ||
839 | enum migrate_mode mode) | ||
840 | { | ||
841 | int rc = 0; | ||
842 | int *result = NULL; | ||
843 | struct page *newpage = get_new_page(page, private, &result); | ||
844 | |||
845 | if (!newpage) | ||
846 | return -ENOMEM; | ||
847 | |||
848 | if (page_count(page) == 1) { | ||
849 | /* page was freed from under us. So we are done. */ | ||
850 | goto out; | ||
851 | } | ||
852 | |||
853 | if (unlikely(PageTransHuge(page))) | ||
854 | if (unlikely(split_huge_page(page))) | ||
855 | goto out; | ||
856 | |||
857 | rc = __unmap_and_move(page, newpage, force, offlining, mode); | ||
858 | out: | ||
790 | if (rc != -EAGAIN) { | 859 | if (rc != -EAGAIN) { |
791 | /* | 860 | /* |
792 | * A page that has been migrated has all references | 861 | * A page that has been migrated has all references |
793 | * removed and will be freed. A page that has not been | 862 | * removed and will be freed. A page that has not been |
794 | * migrated will have kepts its references and be | 863 | * migrated will have kepts its references and be |
795 | * restored. | 864 | * restored. |
796 | */ | 865 | */ |
797 | list_del(&page->lru); | 866 | list_del(&page->lru); |
798 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 867 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
799 | page_is_file_cache(page)); | 868 | page_is_file_cache(page)); |
800 | putback_lru_page(page); | 869 | putback_lru_page(page); |
801 | } | 870 | } |
802 | |||
803 | /* | 871 | /* |
804 | * Move the new page to the LRU. If migration was not successful | 872 | * Move the new page to the LRU. If migration was not successful |
805 | * then this will free the page. | 873 | * then this will free the page. |
806 | */ | 874 | */ |
807 | putback_lru_page(newpage); | 875 | putback_lru_page(newpage); |
808 | |||
809 | if (result) { | 876 | if (result) { |
810 | if (rc) | 877 | if (rc) |
811 | *result = rc; | 878 | *result = rc; |
@@ -835,7 +902,8 @@ move_newpage: | |||
835 | */ | 902 | */ |
836 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 903 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
837 | unsigned long private, struct page *hpage, | 904 | unsigned long private, struct page *hpage, |
838 | int force, bool offlining, bool sync) | 905 | int force, bool offlining, |
906 | enum migrate_mode mode) | ||
839 | { | 907 | { |
840 | int rc = 0; | 908 | int rc = 0; |
841 | int *result = NULL; | 909 | int *result = NULL; |
@@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
848 | rc = -EAGAIN; | 916 | rc = -EAGAIN; |
849 | 917 | ||
850 | if (!trylock_page(hpage)) { | 918 | if (!trylock_page(hpage)) { |
851 | if (!force || !sync) | 919 | if (!force || mode != MIGRATE_SYNC) |
852 | goto out; | 920 | goto out; |
853 | lock_page(hpage); | 921 | lock_page(hpage); |
854 | } | 922 | } |
@@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
859 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); | 927 | try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
860 | 928 | ||
861 | if (!page_mapped(hpage)) | 929 | if (!page_mapped(hpage)) |
862 | rc = move_to_new_page(new_hpage, hpage, 1, sync); | 930 | rc = move_to_new_page(new_hpage, hpage, 1, mode); |
863 | 931 | ||
864 | if (rc) | 932 | if (rc) |
865 | remove_migration_ptes(hpage, hpage); | 933 | remove_migration_ptes(hpage, hpage); |
@@ -902,7 +970,7 @@ out: | |||
902 | */ | 970 | */ |
903 | int migrate_pages(struct list_head *from, | 971 | int migrate_pages(struct list_head *from, |
904 | new_page_t get_new_page, unsigned long private, bool offlining, | 972 | new_page_t get_new_page, unsigned long private, bool offlining, |
905 | bool sync) | 973 | enum migrate_mode mode) |
906 | { | 974 | { |
907 | int retry = 1; | 975 | int retry = 1; |
908 | int nr_failed = 0; | 976 | int nr_failed = 0; |
@@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from, | |||
923 | 991 | ||
924 | rc = unmap_and_move(get_new_page, private, | 992 | rc = unmap_and_move(get_new_page, private, |
925 | page, pass > 2, offlining, | 993 | page, pass > 2, offlining, |
926 | sync); | 994 | mode); |
927 | 995 | ||
928 | switch(rc) { | 996 | switch(rc) { |
929 | case -ENOMEM: | 997 | case -ENOMEM: |
@@ -953,7 +1021,7 @@ out: | |||
953 | 1021 | ||
954 | int migrate_huge_pages(struct list_head *from, | 1022 | int migrate_huge_pages(struct list_head *from, |
955 | new_page_t get_new_page, unsigned long private, bool offlining, | 1023 | new_page_t get_new_page, unsigned long private, bool offlining, |
956 | bool sync) | 1024 | enum migrate_mode mode) |
957 | { | 1025 | { |
958 | int retry = 1; | 1026 | int retry = 1; |
959 | int nr_failed = 0; | 1027 | int nr_failed = 0; |
@@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from, | |||
970 | 1038 | ||
971 | rc = unmap_and_move_huge_page(get_new_page, | 1039 | rc = unmap_and_move_huge_page(get_new_page, |
972 | private, page, pass > 2, offlining, | 1040 | private, page, pass > 2, offlining, |
973 | sync); | 1041 | mode); |
974 | 1042 | ||
975 | switch(rc) { | 1043 | switch(rc) { |
976 | case -ENOMEM: | 1044 | case -ENOMEM: |
@@ -1099,7 +1167,7 @@ set_status: | |||
1099 | err = 0; | 1167 | err = 0; |
1100 | if (!list_empty(&pagelist)) { | 1168 | if (!list_empty(&pagelist)) { |
1101 | err = migrate_pages(&pagelist, new_page_node, | 1169 | err = migrate_pages(&pagelist, new_page_node, |
1102 | (unsigned long)pm, 0, true); | 1170 | (unsigned long)pm, 0, MIGRATE_SYNC); |
1103 | if (err) | 1171 | if (err) |
1104 | putback_lru_pages(&pagelist); | 1172 | putback_lru_pages(&pagelist); |
1105 | } | 1173 | } |
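
The mm/migrate.c hunks above convert the migration entry points from a bool sync flag to an enum migrate_mode, so callers can express more than two behaviours. Only MIGRATE_SYNC is visible in this diff; the other enumerators in the sketch below are assumptions made for illustration. A minimal user-space sketch of the interface change:

    #include <stdio.h>
    #include <stdbool.h>

    /* Hypothetical stand-in for enum migrate_mode; only MIGRATE_SYNC appears
     * in the hunks above, the remaining values are assumed. */
    enum migrate_mode {
        MIGRATE_ASYNC,      /* never block */
        MIGRATE_SYNC_LIGHT, /* may block on some operations */
        MIGRATE_SYNC,       /* may block on anything, e.g. lock_page() */
    };

    /* Before: "sync or not" is all a bool can say. */
    static bool may_wait_old(bool force, bool sync)
    {
        return force && sync;
    }

    /* After: only a fully synchronous caller waits for the page lock,
     * mirroring the "mode != MIGRATE_SYNC" tests in the diff. */
    static bool may_wait_new(bool force, enum migrate_mode mode)
    {
        return force && mode == MIGRATE_SYNC;
    }

    int main(void)
    {
        printf("old: %d  new: %d\n",
               may_wait_old(true, true),
               may_wait_new(true, MIGRATE_SYNC_LIGHT));
        return 0;
    }
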
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8d032de4088..71c78115c45 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -33,6 +33,24 @@ | |||
33 | void __mmu_notifier_release(struct mm_struct *mm) | 33 | void __mmu_notifier_release(struct mm_struct *mm) |
34 | { | 34 | { |
35 | struct mmu_notifier *mn; | 35 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | ||
37 | |||
38 | /* | ||
39 | * RCU here will block mmu_notifier_unregister until | ||
40 | * ->release returns. | ||
41 | */ | ||
42 | rcu_read_lock(); | ||
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
44 | /* | ||
45 | * if ->release runs before mmu_notifier_unregister it | ||
46 | * must be handled as it's the only way for the driver | ||
47 | * to flush all existing sptes and stop the driver | ||
48 | * from establishing any more sptes before all the | ||
49 | * pages in the mm are freed. | ||
50 | */ | ||
51 | if (mn->ops->release) | ||
52 | mn->ops->release(mn, mm); | ||
53 | rcu_read_unlock(); | ||
36 | 54 | ||
37 | spin_lock(&mm->mmu_notifier_mm->lock); | 55 | spin_lock(&mm->mmu_notifier_mm->lock); |
38 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
46 | * mmu_notifier_unregister to return. | 64 | * mmu_notifier_unregister to return. |
47 | */ | 65 | */ |
48 | hlist_del_init_rcu(&mn->hlist); | 66 | hlist_del_init_rcu(&mn->hlist); |
49 | /* | ||
50 | * RCU here will block mmu_notifier_unregister until | ||
51 | * ->release returns. | ||
52 | */ | ||
53 | rcu_read_lock(); | ||
54 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
55 | /* | ||
56 | * if ->release runs before mmu_notifier_unregister it | ||
57 | * must be handled as it's the only way for the driver | ||
58 | * to flush all existing sptes and stop the driver | ||
59 | * from establishing any more sptes before all the | ||
60 | * pages in the mm are freed. | ||
61 | */ | ||
62 | if (mn->ops->release) | ||
63 | mn->ops->release(mn, mm); | ||
64 | rcu_read_unlock(); | ||
65 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
66 | } | 67 | } |
67 | spin_unlock(&mm->mmu_notifier_mm->lock); | 68 | spin_unlock(&mm->mmu_notifier_mm->lock); |
68 | 69 | ||
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
284 | { | 285 | { |
285 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 286 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
286 | 287 | ||
287 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 288 | if (!hlist_unhashed(&mn->hlist)) { |
289 | hlist_del_rcu(&mn->hlist); | ||
290 | |||
291 | /* | 289 | /* |
292 | * RCU here will force exit_mmap to wait ->release to finish | 290 | * RCU here will force exit_mmap to wait ->release to finish |
293 | * before freeing the pages. | 291 | * before freeing the pages. |
294 | */ | 292 | */ |
295 | rcu_read_lock(); | 293 | rcu_read_lock(); |
296 | spin_unlock(&mm->mmu_notifier_mm->lock); | 294 | |
297 | /* | 295 | /* |
298 | * exit_mmap will block in mmu_notifier_release to | 296 | * exit_mmap will block in mmu_notifier_release to |
299 | * guarantee ->release is called before freeing the | 297 | * guarantee ->release is called before freeing the |
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
302 | if (mn->ops->release) | 300 | if (mn->ops->release) |
303 | mn->ops->release(mn, mm); | 301 | mn->ops->release(mn, mm); |
304 | rcu_read_unlock(); | 302 | rcu_read_unlock(); |
305 | } else | 303 | |
304 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
305 | hlist_del_rcu(&mn->hlist); | ||
306 | spin_unlock(&mm->mmu_notifier_mm->lock); | 306 | spin_unlock(&mm->mmu_notifier_mm->lock); |
307 | } | ||
307 | 308 | ||
308 | /* | 309 | /* |
309 | * Wait for any running method to finish, of course including | 310 | * Wait for any running method to finish, of course including |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 6e93dc7f258..e39e3efe4a4 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -83,8 +83,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
83 | 83 | ||
84 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | 84 | static void __init __free_pages_memory(unsigned long start, unsigned long end) |
85 | { | 85 | { |
86 | int i; | 86 | unsigned long i, start_aligned, end_aligned; |
87 | unsigned long start_aligned, end_aligned; | ||
88 | int order = ilog2(BITS_PER_LONG); | 87 | int order = ilog2(BITS_PER_LONG); |
89 | 88 | ||
90 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | 89 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2f474da7ee..bfe789472b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -555,7 +555,7 @@ static inline void __free_one_page(struct page *page, | |||
555 | combined_idx = buddy_idx & page_idx; | 555 | combined_idx = buddy_idx & page_idx; |
556 | higher_page = page + (combined_idx - page_idx); | 556 | higher_page = page + (combined_idx - page_idx); |
557 | buddy_idx = __find_buddy_index(combined_idx, order + 1); | 557 | buddy_idx = __find_buddy_index(combined_idx, order + 1); |
558 | higher_buddy = page + (buddy_idx - combined_idx); | 558 | higher_buddy = higher_page + (buddy_idx - combined_idx); |
559 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | 559 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { |
560 | list_add_tail(&page->lru, | 560 | list_add_tail(&page->lru, |
561 | &zone->free_area[order].free_list[migratetype]); | 561 | &zone->free_area[order].free_list[migratetype]); |
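
The one-line fix above computes higher_buddy relative to higher_page (the start of the just-merged block) instead of relative to page. A small worked example, assuming the usual buddy rule that __find_buddy_index() XORs the page index with (1 << order):

    #include <stdio.h>

    /* Classic buddy rule, assumed here to match __find_buddy_index(). */
    static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
    {
        return page_idx ^ (1UL << order);
    }

    int main(void)
    {
        unsigned int order = 2;
        unsigned long page_idx = 4;                                    /* block being freed */
        unsigned long buddy_idx = find_buddy_index(page_idx, order);   /* 0 */
        unsigned long combined_idx = buddy_idx & page_idx;             /* 0: merged block start */
        unsigned long higher_page = combined_idx;                      /* index of the merged page */

        buddy_idx = find_buddy_index(combined_idx, order + 1);         /* 8 */

        unsigned long old_result = page_idx + (buddy_idx - combined_idx);    /* 12: wrong page */
        unsigned long new_result = higher_page + (buddy_idx - combined_idx); /* 8: the real buddy */

        printf("old=%lu new=%lu expected=%lu\n", old_result, new_result, buddy_idx);
        return 0;
    }

The old arithmetic only happened to be right when page_idx equalled combined_idx, that is, when the freed block was already the lower half of the merged pair.
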
@@ -1912,14 +1912,20 @@ static struct page * | |||
1912 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1912 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1913 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1913 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1914 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1914 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1915 | int migratetype, unsigned long *did_some_progress, | 1915 | int migratetype, bool sync_migration, |
1916 | bool sync_migration) | 1916 | bool *deferred_compaction, |
1917 | unsigned long *did_some_progress) | ||
1917 | { | 1918 | { |
1918 | struct page *page; | 1919 | struct page *page; |
1919 | 1920 | ||
1920 | if (!order || compaction_deferred(preferred_zone)) | 1921 | if (!order) |
1921 | return NULL; | 1922 | return NULL; |
1922 | 1923 | ||
1924 | if (compaction_deferred(preferred_zone)) { | ||
1925 | *deferred_compaction = true; | ||
1926 | return NULL; | ||
1927 | } | ||
1928 | |||
1923 | current->flags |= PF_MEMALLOC; | 1929 | current->flags |= PF_MEMALLOC; |
1924 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 1930 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
1925 | nodemask, sync_migration); | 1931 | nodemask, sync_migration); |
@@ -1947,7 +1953,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
1947 | * but not enough to satisfy watermarks. | 1953 | * but not enough to satisfy watermarks. |
1948 | */ | 1954 | */ |
1949 | count_vm_event(COMPACTFAIL); | 1955 | count_vm_event(COMPACTFAIL); |
1950 | defer_compaction(preferred_zone); | 1956 | |
1957 | /* | ||
1958 | * As async compaction considers a subset of pageblocks, only | ||
1959 | * defer if the failure was a sync compaction failure. | ||
1960 | */ | ||
1961 | if (sync_migration) | ||
1962 | defer_compaction(preferred_zone); | ||
1951 | 1963 | ||
1952 | cond_resched(); | 1964 | cond_resched(); |
1953 | } | 1965 | } |
@@ -1959,8 +1971,9 @@ static inline struct page * | |||
1959 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1971 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1960 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1972 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1961 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1973 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1962 | int migratetype, unsigned long *did_some_progress, | 1974 | int migratetype, bool sync_migration, |
1963 | bool sync_migration) | 1975 | bool *deferred_compaction, |
1976 | unsigned long *did_some_progress) | ||
1964 | { | 1977 | { |
1965 | return NULL; | 1978 | return NULL; |
1966 | } | 1979 | } |
@@ -2110,6 +2123,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2110 | unsigned long pages_reclaimed = 0; | 2123 | unsigned long pages_reclaimed = 0; |
2111 | unsigned long did_some_progress; | 2124 | unsigned long did_some_progress; |
2112 | bool sync_migration = false; | 2125 | bool sync_migration = false; |
2126 | bool deferred_compaction = false; | ||
2113 | 2127 | ||
2114 | /* | 2128 | /* |
2115 | * In the slowpath, we sanity check order to avoid ever trying to | 2129 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -2190,12 +2204,22 @@ rebalance: | |||
2190 | zonelist, high_zoneidx, | 2204 | zonelist, high_zoneidx, |
2191 | nodemask, | 2205 | nodemask, |
2192 | alloc_flags, preferred_zone, | 2206 | alloc_flags, preferred_zone, |
2193 | migratetype, &did_some_progress, | 2207 | migratetype, sync_migration, |
2194 | sync_migration); | 2208 | &deferred_compaction, |
2209 | &did_some_progress); | ||
2195 | if (page) | 2210 | if (page) |
2196 | goto got_pg; | 2211 | goto got_pg; |
2197 | sync_migration = true; | 2212 | sync_migration = true; |
2198 | 2213 | ||
2214 | /* | ||
2215 | * If compaction is deferred for high-order allocations, it is because | ||
2216 | * sync compaction recently failed. If this is the case and the caller | ||
2217 | * has requested the system not be heavily disrupted, fail the | ||
2218 | * allocation now instead of entering direct reclaim | ||
2219 | */ | ||
2220 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) | ||
2221 | goto nopage; | ||
2222 | |||
2199 | /* Try direct reclaim and then allocating */ | 2223 | /* Try direct reclaim and then allocating */ |
2200 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2224 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
2201 | zonelist, high_zoneidx, | 2225 | zonelist, high_zoneidx, |
@@ -2266,8 +2290,9 @@ rebalance: | |||
2266 | zonelist, high_zoneidx, | 2290 | zonelist, high_zoneidx, |
2267 | nodemask, | 2291 | nodemask, |
2268 | alloc_flags, preferred_zone, | 2292 | alloc_flags, preferred_zone, |
2269 | migratetype, &did_some_progress, | 2293 | migratetype, sync_migration, |
2270 | sync_migration); | 2294 | &deferred_compaction, |
2295 | &did_some_progress); | ||
2271 | if (page) | 2296 | if (page) |
2272 | goto got_pg; | 2297 | goto got_pg; |
2273 | } | 2298 | } |
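
Taken together, the page_alloc.c changes above mean a failed async compaction no longer defers future compaction attempts, and a caller that set __GFP_NO_KSWAPD fails fast when compaction is currently deferred instead of falling through to direct reclaim. A rough sketch of that decision logic, with the flag bit made up for illustration:

    #include <stdio.h>
    #include <stdbool.h>

    #define GFP_NO_KSWAPD (1u << 0)  /* illustrative stand-in for __GFP_NO_KSWAPD */

    /* Only a synchronous compaction failure is meaningful enough to defer
     * future attempts, since async compaction only looked at a subset of
     * pageblocks. */
    static bool should_defer(bool sync_migration)
    {
        return sync_migration;
    }

    /* Slow-path decision after the first (async) compaction attempt. */
    static bool fail_allocation_early(bool deferred_compaction, unsigned gfp_mask)
    {
        return deferred_compaction && (gfp_mask & GFP_NO_KSWAPD);
    }

    int main(void)
    {
        printf("defer after async failure: %d\n", should_defer(false));
        printf("THP-style caller bails:    %d\n",
               fail_allocation_early(true, GFP_NO_KSWAPD));
        return 0;
    }
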
@@ -2291,8 +2316,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2291 | { | 2316 | { |
2292 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2317 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2293 | struct zone *preferred_zone; | 2318 | struct zone *preferred_zone; |
2294 | struct page *page; | 2319 | struct page *page = NULL; |
2295 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2320 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2321 | unsigned int cpuset_mems_cookie; | ||
2296 | 2322 | ||
2297 | gfp_mask &= gfp_allowed_mask; | 2323 | gfp_mask &= gfp_allowed_mask; |
2298 | 2324 | ||
@@ -2311,15 +2337,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2311 | if (unlikely(!zonelist->_zonerefs->zone)) | 2337 | if (unlikely(!zonelist->_zonerefs->zone)) |
2312 | return NULL; | 2338 | return NULL; |
2313 | 2339 | ||
2314 | get_mems_allowed(); | 2340 | retry_cpuset: |
2341 | cpuset_mems_cookie = get_mems_allowed(); | ||
2342 | |||
2315 | /* The preferred zone is used for statistics later */ | 2343 | /* The preferred zone is used for statistics later */ |
2316 | first_zones_zonelist(zonelist, high_zoneidx, | 2344 | first_zones_zonelist(zonelist, high_zoneidx, |
2317 | nodemask ? : &cpuset_current_mems_allowed, | 2345 | nodemask ? : &cpuset_current_mems_allowed, |
2318 | &preferred_zone); | 2346 | &preferred_zone); |
2319 | if (!preferred_zone) { | 2347 | if (!preferred_zone) |
2320 | put_mems_allowed(); | 2348 | goto out; |
2321 | return NULL; | ||
2322 | } | ||
2323 | 2349 | ||
2324 | /* First allocation attempt */ | 2350 | /* First allocation attempt */ |
2325 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2351 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
@@ -2329,9 +2355,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2329 | page = __alloc_pages_slowpath(gfp_mask, order, | 2355 | page = __alloc_pages_slowpath(gfp_mask, order, |
2330 | zonelist, high_zoneidx, nodemask, | 2356 | zonelist, high_zoneidx, nodemask, |
2331 | preferred_zone, migratetype); | 2357 | preferred_zone, migratetype); |
2332 | put_mems_allowed(); | ||
2333 | 2358 | ||
2334 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2359 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2360 | |||
2361 | out: | ||
2362 | /* | ||
2363 | * When updating a task's mems_allowed, it is possible to race with | ||
2364 | * parallel threads in such a way that an allocation can fail while | ||
2365 | * the mask is being updated. If a page allocation is about to fail, | ||
2366 | * check if the cpuset changed during allocation and if so, retry. | ||
2367 | */ | ||
2368 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
2369 | goto retry_cpuset; | ||
2370 | |||
2335 | return page; | 2371 | return page; |
2336 | } | 2372 | } |
2337 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2373 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
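
The retry_cpuset logic above turns get_mems_allowed()/put_mems_allowed() into a snapshot-and-check pair: take a cookie, attempt the allocation, and retry only if the allocation failed while the cpuset's mems_allowed was concurrently changed. A user-space sketch of the same seqcount-style pattern; the names and the counter are illustrative, not the kernel implementation:

    #include <stdio.h>
    #include <stdbool.h>

    static unsigned int mems_seq;   /* bumped by a (not shown) mems_allowed updater */
    static bool node_allowed = true;

    static unsigned int get_allowed_cookie(void)   { return mems_seq; }
    /* true if nothing changed since the cookie was taken */
    static bool put_allowed_cookie(unsigned int c) { return c == mems_seq; }

    static void *try_alloc(void)   /* stand-in for the real allocation attempt */
    {
        return node_allowed ? (void *)0x1 : NULL;
    }

    static void *alloc_with_retry(void)
    {
        unsigned int cookie;
        void *page;

    retry:
        cookie = get_allowed_cookie();
        page = try_alloc();
        /*
         * A failure with a stable mask is a real failure; a failure that
         * raced with a mask update is retried so the caller never sees a
         * spurious NULL (a spurious OOM in the kernel case).
         */
        if (!page && !put_allowed_cookie(cookie))
            goto retry;
        return page;
    }

    int main(void)
    {
        printf("page = %p\n", alloc_with_retry());
        return 0;
    }

The same cookie pattern appears below in skip_free_areas_node(), in mm/slab.c's fallback_alloc() and in mm/slub.c's get_any_partial().
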
@@ -2555,13 +2591,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2555 | bool skip_free_areas_node(unsigned int flags, int nid) | 2591 | bool skip_free_areas_node(unsigned int flags, int nid) |
2556 | { | 2592 | { |
2557 | bool ret = false; | 2593 | bool ret = false; |
2594 | unsigned int cpuset_mems_cookie; | ||
2558 | 2595 | ||
2559 | if (!(flags & SHOW_MEM_FILTER_NODES)) | 2596 | if (!(flags & SHOW_MEM_FILTER_NODES)) |
2560 | goto out; | 2597 | goto out; |
2561 | 2598 | ||
2562 | get_mems_allowed(); | 2599 | do { |
2563 | ret = !node_isset(nid, cpuset_current_mems_allowed); | 2600 | cpuset_mems_cookie = get_mems_allowed(); |
2564 | put_mems_allowed(); | 2601 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
2602 | } while (!put_mems_allowed(cpuset_mems_cookie)); | ||
2565 | out: | 2603 | out: |
2566 | return ret; | 2604 | return ret; |
2567 | } | 2605 | } |
@@ -3441,25 +3479,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3441 | if (page_to_nid(page) != zone_to_nid(zone)) | 3479 | if (page_to_nid(page) != zone_to_nid(zone)) |
3442 | continue; | 3480 | continue; |
3443 | 3481 | ||
3444 | /* Blocks with reserved pages will never free, skip them. */ | ||
3445 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | ||
3446 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3447 | continue; | ||
3448 | |||
3449 | block_migratetype = get_pageblock_migratetype(page); | 3482 | block_migratetype = get_pageblock_migratetype(page); |
3450 | 3483 | ||
3451 | /* If this block is reserved, account for it */ | 3484 | /* Only test what is necessary when the reserves are not met */ |
3452 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | 3485 | if (reserve > 0) { |
3453 | reserve--; | 3486 | /* |
3454 | continue; | 3487 | * Blocks with reserved pages will never free, skip |
3455 | } | 3488 | * them. |
3489 | */ | ||
3490 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | ||
3491 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3492 | continue; | ||
3456 | 3493 | ||
3457 | /* Suitable for reserving if this block is movable */ | 3494 | /* If this block is reserved, account for it */ |
3458 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | 3495 | if (block_migratetype == MIGRATE_RESERVE) { |
3459 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | 3496 | reserve--; |
3460 | move_freepages_block(zone, page, MIGRATE_RESERVE); | 3497 | continue; |
3461 | reserve--; | 3498 | } |
3462 | continue; | 3499 | |
3500 | /* Suitable for reserving if this block is movable */ | ||
3501 | if (block_migratetype == MIGRATE_MOVABLE) { | ||
3502 | set_pageblock_migratetype(page, | ||
3503 | MIGRATE_RESERVE); | ||
3504 | move_freepages_block(zone, page, | ||
3505 | MIGRATE_RESERVE); | ||
3506 | reserve--; | ||
3507 | continue; | ||
3508 | } | ||
3463 | } | 3509 | } |
3464 | 3510 | ||
3465 | /* | 3511 | /* |
diff --git a/mm/percpu.c b/mm/percpu.c index 0ae7a09141e..af0cc7a58f9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1630,6 +1630,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, | |||
1630 | areas[group] = ptr; | 1630 | areas[group] = ptr; |
1631 | 1631 | ||
1632 | base = min(ptr, base); | 1632 | base = min(ptr, base); |
1633 | } | ||
1634 | |||
1635 | /* | ||
1636 | * Copy data and free unused parts. This should happen after all | ||
1637 | * allocations are complete; otherwise, we may end up with | ||
1638 | * overlapping groups. | ||
1639 | */ | ||
1640 | for (group = 0; group < ai->nr_groups; group++) { | ||
1641 | struct pcpu_group_info *gi = &ai->groups[group]; | ||
1642 | void *ptr = areas[group]; | ||
1633 | 1643 | ||
1634 | for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { | 1644 | for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { |
1635 | if (gi->cpu_map[i] == NR_CPUS) { | 1645 | if (gi->cpu_map[i] == NR_CPUS) { |
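
The mm/percpu.c hunk splits one loop into two passes: allocate every group's area first, and only then copy data and free the unused parts. With the boot-time allocator used here, freeing part of one group's area while later groups are still being allocated can hand that range back out and produce overlapping groups. A structural sketch, with malloc standing in for the boot allocator and all names illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_GROUPS 3
    #define AREA_SIZE 4096

    int main(void)
    {
        void *areas[NR_GROUPS];
        int group;

        /* Pass 1: reserve every group's area before anything is released. */
        for (group = 0; group < NR_GROUPS; group++) {
            areas[group] = malloc(AREA_SIZE);
            if (!areas[group])
                return 1;
        }

        /*
         * Pass 2: populate each area and trim what is unused.  Doing this
         * inside the first loop would let the allocator reuse the freed
         * tail of group N for group N+1's allocation.
         */
        for (group = 0; group < NR_GROUPS; group++)
            printf("group %d at %p populated\n", group, areas[group]);

        for (group = 0; group < NR_GROUPS; group++)
            free(areas[group]);
        return 0;
    }
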
diff --git a/mm/shmem.c b/mm/shmem.c index 883e98f78ca..df31a443293 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -2348,12 +2348,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, | |||
2348 | { | 2348 | { |
2349 | struct inode *inode; | 2349 | struct inode *inode; |
2350 | struct dentry *dentry = NULL; | 2350 | struct dentry *dentry = NULL; |
2351 | u64 inum = fid->raw[2]; | 2351 | u64 inum; |
2352 | inum = (inum << 32) | fid->raw[1]; | ||
2353 | 2352 | ||
2354 | if (fh_len < 3) | 2353 | if (fh_len < 3) |
2355 | return NULL; | 2354 | return NULL; |
2356 | 2355 | ||
2356 | inum = fid->raw[2]; | ||
2357 | inum = (inum << 32) | fid->raw[1]; | ||
2358 | |||
2357 | inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), | 2359 | inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), |
2358 | shmem_match, fid->raw); | 2360 | shmem_match, fid->raw); |
2359 | if (inode) { | 2361 | if (inode) { |
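
The mm/shmem.c fix simply moves the decode of the file handle after the fh_len check, so raw[1] and raw[2] are never read from a handle that is too short. A generic validate-before-decode sketch with a made-up handle type:

    #include <stdio.h>
    #include <stdint.h>

    struct fake_fid {        /* illustrative stand-in for the exported file handle */
        uint32_t raw[3];
    };

    static int fh_to_inum(const struct fake_fid *fid, int fh_len, uint64_t *inum)
    {
        /* Reject short handles before touching raw[1] or raw[2]. */
        if (fh_len < 3)
            return -1;

        *inum = fid->raw[2];
        *inum = (*inum << 32) | fid->raw[1];
        return 0;
    }

    int main(void)
    {
        struct fake_fid f = { { 7, 0x89abcdefu, 0x01234567u } };
        uint64_t inum;

        if (!fh_to_inum(&f, 3, &inum))
            printf("inum = 0x%016llx\n", (unsigned long long)inum);
        return 0;
    }
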
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3218 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 3218 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
3219 | return NULL; | 3219 | return NULL; |
3220 | nid_alloc = nid_here = numa_mem_id(); | 3220 | nid_alloc = nid_here = numa_mem_id(); |
3221 | get_mems_allowed(); | ||
3222 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3221 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3223 | nid_alloc = cpuset_slab_spread_node(); | 3222 | nid_alloc = cpuset_slab_spread_node(); |
3224 | else if (current->mempolicy) | 3223 | else if (current->mempolicy) |
3225 | nid_alloc = slab_node(current->mempolicy); | 3224 | nid_alloc = slab_node(current->mempolicy); |
3226 | put_mems_allowed(); | ||
3227 | if (nid_alloc != nid_here) | 3225 | if (nid_alloc != nid_here) |
3228 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3226 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3229 | return NULL; | 3227 | return NULL; |
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3246 | enum zone_type high_zoneidx = gfp_zone(flags); | 3244 | enum zone_type high_zoneidx = gfp_zone(flags); |
3247 | void *obj = NULL; | 3245 | void *obj = NULL; |
3248 | int nid; | 3246 | int nid; |
3247 | unsigned int cpuset_mems_cookie; | ||
3249 | 3248 | ||
3250 | if (flags & __GFP_THISNODE) | 3249 | if (flags & __GFP_THISNODE) |
3251 | return NULL; | 3250 | return NULL; |
3252 | 3251 | ||
3253 | get_mems_allowed(); | ||
3254 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3255 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3252 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3256 | 3253 | ||
3254 | retry_cpuset: | ||
3255 | cpuset_mems_cookie = get_mems_allowed(); | ||
3256 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3257 | |||
3257 | retry: | 3258 | retry: |
3258 | /* | 3259 | /* |
3259 | * Look through allowed nodes for objects available | 3260 | * Look through allowed nodes for objects available |
@@ -3306,7 +3307,9 @@ retry: | |||
3306 | } | 3307 | } |
3307 | } | 3308 | } |
3308 | } | 3309 | } |
3309 | put_mems_allowed(); | 3310 | |
3311 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) | ||
3312 | goto retry_cpuset; | ||
3310 | return obj; | 3313 | return obj; |
3311 | } | 3314 | } |
3312 | 3315 | ||
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1457 | struct zone *zone; | 1457 | struct zone *zone; |
1458 | enum zone_type high_zoneidx = gfp_zone(flags); | 1458 | enum zone_type high_zoneidx = gfp_zone(flags); |
1459 | struct page *page; | 1459 | struct page *page; |
1460 | unsigned int cpuset_mems_cookie; | ||
1460 | 1461 | ||
1461 | /* | 1462 | /* |
1462 | * The defrag ratio allows a configuration of the tradeoffs between | 1463 | * The defrag ratio allows a configuration of the tradeoffs between |
@@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1480 | get_cycles() % 1024 > s->remote_node_defrag_ratio) | 1481 | get_cycles() % 1024 > s->remote_node_defrag_ratio) |
1481 | return NULL; | 1482 | return NULL; |
1482 | 1483 | ||
1483 | get_mems_allowed(); | 1484 | do { |
1484 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1485 | cpuset_mems_cookie = get_mems_allowed(); |
1485 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1486 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
1486 | struct kmem_cache_node *n; | 1487 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1487 | 1488 | struct kmem_cache_node *n; | |
1488 | n = get_node(s, zone_to_nid(zone)); | 1489 | |
1489 | 1490 | n = get_node(s, zone_to_nid(zone)); | |
1490 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1491 | |
1491 | n->nr_partial > s->min_partial) { | 1492 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1492 | page = get_partial_node(n); | 1493 | n->nr_partial > s->min_partial) { |
1493 | if (page) { | 1494 | page = get_partial_node(n); |
1494 | put_mems_allowed(); | 1495 | if (page) { |
1495 | return page; | 1496 | /* |
1497 | * Return the object even if | ||
1498 | * put_mems_allowed indicated that | ||
1499 | * the cpuset mems_allowed was | ||
1500 | * updated in parallel. It's a | ||
1501 | * harmless race between the alloc | ||
1502 | * and the cpuset update. | ||
1503 | */ | ||
1504 | put_mems_allowed(cpuset_mems_cookie); | ||
1505 | return page; | ||
1506 | } | ||
1496 | } | 1507 | } |
1497 | } | 1508 | } |
1498 | } | 1509 | } while (!put_mems_allowed(cpuset_mems_cookie)); |
1499 | put_mems_allowed(); | ||
1500 | #endif | 1510 | #endif |
1501 | return NULL; | 1511 | return NULL; |
1502 | } | 1512 | } |
diff --git a/mm/truncate.c b/mm/truncate.c index e13f22efaad..3e9829f3988 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -398,11 +398,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
398 | if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) | 398 | if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) |
399 | return 0; | 399 | return 0; |
400 | 400 | ||
401 | clear_page_mlock(page); | ||
402 | |||
401 | spin_lock_irq(&mapping->tree_lock); | 403 | spin_lock_irq(&mapping->tree_lock); |
402 | if (PageDirty(page)) | 404 | if (PageDirty(page)) |
403 | goto failed; | 405 | goto failed; |
404 | 406 | ||
405 | clear_page_mlock(page); | ||
406 | BUG_ON(page_has_private(page)); | 407 | BUG_ON(page_has_private(page)); |
407 | __delete_from_page_cache(page); | 408 | __delete_from_page_cache(page); |
408 | spin_unlock_irq(&mapping->tree_lock); | 409 | spin_unlock_irq(&mapping->tree_lock); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 43b44dbadda..bdb70042c12 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -256,7 +256,7 @@ struct vmap_area { | |||
256 | struct rb_node rb_node; /* address sorted rbtree */ | 256 | struct rb_node rb_node; /* address sorted rbtree */ |
257 | struct list_head list; /* address sorted list */ | 257 | struct list_head list; /* address sorted list */ |
258 | struct list_head purge_list; /* "lazy purge" list */ | 258 | struct list_head purge_list; /* "lazy purge" list */ |
259 | void *private; | 259 | struct vm_struct *vm; |
260 | struct rcu_head rcu_head; | 260 | struct rcu_head rcu_head; |
261 | }; | 261 | }; |
262 | 262 | ||
@@ -1174,9 +1174,10 @@ void __init vmalloc_init(void) | |||
1174 | /* Import existing vmlist entries. */ | 1174 | /* Import existing vmlist entries. */ |
1175 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1175 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
1176 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); | 1176 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
1177 | va->flags = tmp->flags | VM_VM_AREA; | 1177 | va->flags = VM_VM_AREA; |
1178 | va->va_start = (unsigned long)tmp->addr; | 1178 | va->va_start = (unsigned long)tmp->addr; |
1179 | va->va_end = va->va_start + tmp->size; | 1179 | va->va_end = va->va_start + tmp->size; |
1180 | va->vm = tmp; | ||
1180 | __insert_vmap_area(va); | 1181 | __insert_vmap_area(va); |
1181 | } | 1182 | } |
1182 | 1183 | ||
@@ -1274,7 +1275,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1274 | vm->addr = (void *)va->va_start; | 1275 | vm->addr = (void *)va->va_start; |
1275 | vm->size = va->va_end - va->va_start; | 1276 | vm->size = va->va_end - va->va_start; |
1276 | vm->caller = caller; | 1277 | vm->caller = caller; |
1277 | va->private = vm; | 1278 | va->vm = vm; |
1278 | va->flags |= VM_VM_AREA; | 1279 | va->flags |= VM_VM_AREA; |
1279 | } | 1280 | } |
1280 | 1281 | ||
@@ -1397,7 +1398,7 @@ static struct vm_struct *find_vm_area(const void *addr) | |||
1397 | 1398 | ||
1398 | va = find_vmap_area((unsigned long)addr); | 1399 | va = find_vmap_area((unsigned long)addr); |
1399 | if (va && va->flags & VM_VM_AREA) | 1400 | if (va && va->flags & VM_VM_AREA) |
1400 | return va->private; | 1401 | return va->vm; |
1401 | 1402 | ||
1402 | return NULL; | 1403 | return NULL; |
1403 | } | 1404 | } |
@@ -1416,7 +1417,7 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1416 | 1417 | ||
1417 | va = find_vmap_area((unsigned long)addr); | 1418 | va = find_vmap_area((unsigned long)addr); |
1418 | if (va && va->flags & VM_VM_AREA) { | 1419 | if (va && va->flags & VM_VM_AREA) { |
1419 | struct vm_struct *vm = va->private; | 1420 | struct vm_struct *vm = va->vm; |
1420 | 1421 | ||
1421 | if (!(vm->flags & VM_UNLIST)) { | 1422 | if (!(vm->flags & VM_UNLIST)) { |
1422 | struct vm_struct *tmp, **p; | 1423 | struct vm_struct *tmp, **p; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 6072d74a16f..5326f98f506 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink, | |||
248 | 248 | ||
249 | list_for_each_entry(shrinker, &shrinker_list, list) { | 249 | list_for_each_entry(shrinker, &shrinker_list, list) { |
250 | unsigned long long delta; | 250 | unsigned long long delta; |
251 | unsigned long total_scan; | 251 | long total_scan; |
252 | unsigned long max_pass; | 252 | long max_pass; |
253 | int shrink_ret = 0; | ||
254 | long nr; | ||
255 | long new_nr; | ||
253 | 256 | ||
254 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); | 257 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
258 | if (max_pass <= 0) | ||
259 | continue; | ||
260 | |||
261 | /* | ||
262 | * copy the current shrinker scan count into a local variable | ||
263 | * and zero it so that other concurrent shrinker invocations | ||
264 | * don't also do this scanning work. | ||
265 | */ | ||
266 | do { | ||
267 | nr = shrinker->nr; | ||
268 | } while (cmpxchg(&shrinker->nr, nr, 0) != nr); | ||
269 | |||
270 | total_scan = nr; | ||
255 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 271 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
256 | delta *= max_pass; | 272 | delta *= max_pass; |
257 | do_div(delta, lru_pages + 1); | 273 | do_div(delta, lru_pages + 1); |
258 | shrinker->nr += delta; | 274 | total_scan += delta; |
259 | if (shrinker->nr < 0) { | 275 | if (total_scan < 0) { |
260 | printk(KERN_ERR "shrink_slab: %pF negative objects to " | 276 | printk(KERN_ERR "shrink_slab: %pF negative objects to " |
261 | "delete nr=%ld\n", | 277 | "delete nr=%ld\n", |
262 | shrinker->shrink, shrinker->nr); | 278 | shrinker->shrink, total_scan); |
263 | shrinker->nr = max_pass; | 279 | total_scan = max_pass; |
264 | } | 280 | } |
265 | 281 | ||
266 | /* | 282 | /* |
283 | * We need to avoid excessive windup on filesystem shrinkers | ||
284 | * due to large numbers of GFP_NOFS allocations causing the | ||
285 | * shrinkers to return -1 all the time. This results in a large | ||
286 | * nr being built up so when a shrink that can do some work | ||
287 | * comes along it empties the entire cache due to nr >>> | ||
288 | * max_pass. This is bad for sustaining a working set in | ||
289 | * memory. | ||
290 | * | ||
291 | * Hence only allow the shrinker to scan the entire cache when | ||
292 | * a large delta change is calculated directly. | ||
293 | */ | ||
294 | if (delta < max_pass / 4) | ||
295 | total_scan = min(total_scan, max_pass / 2); | ||
296 | |||
297 | /* | ||
267 | * Avoid risking looping forever due to too large nr value: | 298 | * Avoid risking looping forever due to too large nr value: |
268 | * never try to free more than twice the estimated number of | 299 | * never try to free more than twice the estimated number of |
269 | * freeable entries. | 300 | * freeable entries. |
270 | */ | 301 | */ |
271 | if (shrinker->nr > max_pass * 2) | 302 | if (total_scan > max_pass * 2) |
272 | shrinker->nr = max_pass * 2; | 303 | total_scan = max_pass * 2; |
273 | 304 | ||
274 | total_scan = shrinker->nr; | 305 | trace_mm_shrink_slab_start(shrinker, shrink, nr, |
275 | shrinker->nr = 0; | 306 | nr_pages_scanned, lru_pages, |
307 | max_pass, delta, total_scan); | ||
276 | 308 | ||
277 | while (total_scan >= SHRINK_BATCH) { | 309 | while (total_scan >= SHRINK_BATCH) { |
278 | long this_scan = SHRINK_BATCH; | 310 | long this_scan = SHRINK_BATCH; |
279 | int shrink_ret; | ||
280 | int nr_before; | 311 | int nr_before; |
281 | 312 | ||
282 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); | 313 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
@@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink, | |||
292 | cond_resched(); | 323 | cond_resched(); |
293 | } | 324 | } |
294 | 325 | ||
295 | shrinker->nr += total_scan; | 326 | /* |
327 | * move the unused scan count back into the shrinker in a | ||
328 | * manner that handles concurrent updates. If we exhausted the | ||
329 | * scan, there is no need to do an update. | ||
330 | */ | ||
331 | do { | ||
332 | nr = shrinker->nr; | ||
333 | new_nr = total_scan + nr; | ||
334 | if (total_scan <= 0) | ||
335 | break; | ||
336 | } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); | ||
337 | |||
338 | trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); | ||
296 | } | 339 | } |
297 | up_read(&shrinker_rwsem); | 340 | up_read(&shrinker_rwsem); |
298 | out: | 341 | out: |
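
The shrink_slab() changes above stop adjusting shrinker->nr in place; instead each caller atomically claims the whole pending count with cmpxchg, works on a private copy (total_scan), and atomically returns whatever it did not scan, so concurrent callers never scan the same work twice and unused work is not lost. A user-space sketch of the claim/return pattern using GCC atomic builtins as a stand-in for the kernel's cmpxchg():

    #include <stdio.h>

    static long pending;    /* plays the role of shrinker->nr */

    /* Atomically take the entire pending count for this caller. */
    static long claim_pending(void)
    {
        long nr = __atomic_load_n(&pending, __ATOMIC_RELAXED);

        while (!__atomic_compare_exchange_n(&pending, &nr, 0, 0,
                                            __ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
            ;    /* nr is refreshed on failure, just retry */
        return nr;
    }

    /* Atomically give back work that was claimed but not done. */
    static void return_unused(long leftover)
    {
        long nr, new_nr;

        if (leftover <= 0)
            return;
        nr = __atomic_load_n(&pending, __ATOMIC_RELAXED);
        do {
            new_nr = nr + leftover;
        } while (!__atomic_compare_exchange_n(&pending, &nr, new_nr, 0,
                                              __ATOMIC_ACQ_REL, __ATOMIC_RELAXED));
    }

    int main(void)
    {
        pending = 100;
        long total_scan = claim_pending();    /* this caller owns all 100 units */

        total_scan -= 64;                     /* pretend one batch was scanned */
        return_unused(total_scan);            /* 36 goes back for the next caller */
        printf("pending now %ld\n", pending); /* 36 */
        return 0;
    }
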
@@ -665,7 +708,7 @@ static enum page_references page_check_references(struct page *page, | |||
665 | return PAGEREF_RECLAIM; | 708 | return PAGEREF_RECLAIM; |
666 | 709 | ||
667 | if (referenced_ptes) { | 710 | if (referenced_ptes) { |
668 | if (PageAnon(page)) | 711 | if (PageSwapBacked(page)) |
669 | return PAGEREF_ACTIVATE; | 712 | return PAGEREF_ACTIVATE; |
670 | /* | 713 | /* |
671 | * All mapped pages start out with page table | 714 | * All mapped pages start out with page table |
@@ -683,7 +726,13 @@ static enum page_references page_check_references(struct page *page, | |||
683 | */ | 726 | */ |
684 | SetPageReferenced(page); | 727 | SetPageReferenced(page); |
685 | 728 | ||
686 | if (referenced_page) | 729 | if (referenced_page || referenced_ptes > 1) |
730 | return PAGEREF_ACTIVATE; | ||
731 | |||
732 | /* | ||
733 | * Activate file-backed executable pages after first usage. | ||
734 | */ | ||
735 | if (vm_flags & VM_EXEC) | ||
687 | return PAGEREF_ACTIVATE; | 736 | return PAGEREF_ACTIVATE; |
688 | 737 | ||
689 | return PAGEREF_KEEP; | 738 | return PAGEREF_KEEP; |
@@ -972,23 +1021,27 @@ keep_lumpy: | |||
972 | * | 1021 | * |
973 | * returns 0 on success, -ve errno on failure. | 1022 | * returns 0 on success, -ve errno on failure. |
974 | */ | 1023 | */ |
975 | int __isolate_lru_page(struct page *page, int mode, int file) | 1024 | int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) |
976 | { | 1025 | { |
1026 | bool all_lru_mode; | ||
977 | int ret = -EINVAL; | 1027 | int ret = -EINVAL; |
978 | 1028 | ||
979 | /* Only take pages on the LRU. */ | 1029 | /* Only take pages on the LRU. */ |
980 | if (!PageLRU(page)) | 1030 | if (!PageLRU(page)) |
981 | return ret; | 1031 | return ret; |
982 | 1032 | ||
1033 | all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == | ||
1034 | (ISOLATE_ACTIVE|ISOLATE_INACTIVE); | ||
1035 | |||
983 | /* | 1036 | /* |
984 | * When checking the active state, we need to be sure we are | 1037 | * When checking the active state, we need to be sure we are |
985 | * dealing with comparible boolean values. Take the logical not | 1038 | * dealing with comparible boolean values. Take the logical not |
986 | * of each. | 1039 | * of each. |
987 | */ | 1040 | */ |
988 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) | 1041 | if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) |
989 | return ret; | 1042 | return ret; |
990 | 1043 | ||
991 | if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) | 1044 | if (!all_lru_mode && !!page_is_file_cache(page) != file) |
992 | return ret; | 1045 | return ret; |
993 | 1046 | ||
994 | /* | 1047 | /* |
@@ -1001,6 +1054,43 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1001 | 1054 | ||
1002 | ret = -EBUSY; | 1055 | ret = -EBUSY; |
1003 | 1056 | ||
1057 | /* | ||
1058 | * To minimise LRU disruption, the caller can indicate that it only | ||
1059 | * wants to isolate pages it will be able to operate on without | ||
1060 | * blocking - clean pages for the most part. | ||
1061 | * | ||
1062 | * ISOLATE_CLEAN means that only clean pages should be isolated. This | ||
1063 | * is used by reclaim when it cannot write to backing storage | ||
1064 | * | ||
1065 | * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages | ||
1066 | * that it is possible to migrate without blocking | ||
1067 | */ | ||
1068 | if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) { | ||
1069 | /* All the caller can do on PageWriteback is block */ | ||
1070 | if (PageWriteback(page)) | ||
1071 | return ret; | ||
1072 | |||
1073 | if (PageDirty(page)) { | ||
1074 | struct address_space *mapping; | ||
1075 | |||
1076 | /* ISOLATE_CLEAN means only clean pages */ | ||
1077 | if (mode & ISOLATE_CLEAN) | ||
1078 | return ret; | ||
1079 | |||
1080 | /* | ||
1081 | * Only pages without mappings or that have a | ||
1082 | * ->migratepage callback are possible to migrate | ||
1083 | * without blocking | ||
1084 | */ | ||
1085 | mapping = page_mapping(page); | ||
1086 | if (mapping && !mapping->a_ops->migratepage) | ||
1087 | return ret; | ||
1088 | } | ||
1089 | } | ||
1090 | |||
1091 | if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) | ||
1092 | return ret; | ||
1093 | |||
1004 | if (likely(get_page_unless_zero(page))) { | 1094 | if (likely(get_page_unless_zero(page))) { |
1005 | /* | 1095 | /* |
1006 | * Be careful not to clear PageLRU until after we're | 1096 | * Be careful not to clear PageLRU until after we're |
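
__isolate_lru_page() now takes a bit mask (isolate_mode_t) rather than a three-valued int, so independent constraints compose: which LRU lists may be taken from, and whether dirty, writeback or mapped pages are acceptable to this caller. A simplified sketch of such a predicate; the flag values are assumptions and the checks are condensed relative to the real function:

    #include <stdio.h>
    #include <stdbool.h>

    /* Illustrative flag bits mirroring the usage above; not the kernel values. */
    #define ISOLATE_INACTIVE  (1u << 0)
    #define ISOLATE_ACTIVE    (1u << 1)
    #define ISOLATE_CLEAN     (1u << 2)
    #define ISOLATE_UNMAPPED  (1u << 3)

    struct fake_page {
        bool active, dirty, writeback, mapped;
    };

    static bool can_isolate(const struct fake_page *p, unsigned int mode)
    {
        bool all_lru = (mode & (ISOLATE_ACTIVE | ISOLATE_INACTIVE)) ==
                       (ISOLATE_ACTIVE | ISOLATE_INACTIVE);

        /* Wrong LRU list for this caller. */
        if (!all_lru && p->active != !!(mode & ISOLATE_ACTIVE))
            return false;
        /* Caller cannot block on writeback or writeout. */
        if ((mode & ISOLATE_CLEAN) && (p->writeback || p->dirty))
            return false;
        /* Caller does not want to unmap pages. */
        if ((mode & ISOLATE_UNMAPPED) && p->mapped)
            return false;
        return true;
    }

    int main(void)
    {
        struct fake_page dirty_inactive = { .dirty = true };

        printf("%d\n", can_isolate(&dirty_inactive, ISOLATE_INACTIVE));                  /* 1 */
        printf("%d\n", can_isolate(&dirty_inactive, ISOLATE_INACTIVE | ISOLATE_CLEAN));  /* 0 */
        return 0;
    }
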
@@ -1036,7 +1126,8 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
1036 | */ | 1126 | */ |
1037 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1127 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1038 | struct list_head *src, struct list_head *dst, | 1128 | struct list_head *src, struct list_head *dst, |
1039 | unsigned long *scanned, int order, int mode, int file) | 1129 | unsigned long *scanned, int order, isolate_mode_t mode, |
1130 | int file) | ||
1040 | { | 1131 | { |
1041 | unsigned long nr_taken = 0; | 1132 | unsigned long nr_taken = 0; |
1042 | unsigned long nr_lumpy_taken = 0; | 1133 | unsigned long nr_lumpy_taken = 0; |
@@ -1111,7 +1202,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1111 | * anon page which don't already have a swap slot is | 1202 | * anon page which don't already have a swap slot is |
1112 | * pointless. | 1203 | * pointless. |
1113 | */ | 1204 | */ |
1114 | if (nr_swap_pages <= 0 && PageAnon(cursor_page) && | 1205 | if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && |
1115 | !PageSwapCache(cursor_page)) | 1206 | !PageSwapCache(cursor_page)) |
1116 | break; | 1207 | break; |
1117 | 1208 | ||
@@ -1161,8 +1252,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1161 | static unsigned long isolate_pages_global(unsigned long nr, | 1252 | static unsigned long isolate_pages_global(unsigned long nr, |
1162 | struct list_head *dst, | 1253 | struct list_head *dst, |
1163 | unsigned long *scanned, int order, | 1254 | unsigned long *scanned, int order, |
1164 | int mode, struct zone *z, | 1255 | isolate_mode_t mode, |
1165 | int active, int file) | 1256 | struct zone *z, int active, int file) |
1166 | { | 1257 | { |
1167 | int lru = LRU_BASE; | 1258 | int lru = LRU_BASE; |
1168 | if (active) | 1259 | if (active) |
@@ -1408,6 +1499,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1408 | unsigned long nr_taken; | 1499 | unsigned long nr_taken; |
1409 | unsigned long nr_anon; | 1500 | unsigned long nr_anon; |
1410 | unsigned long nr_file; | 1501 | unsigned long nr_file; |
1502 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | ||
1411 | 1503 | ||
1412 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1504 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1413 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1505 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1418,15 +1510,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1418 | } | 1510 | } |
1419 | 1511 | ||
1420 | set_reclaim_mode(priority, sc, false); | 1512 | set_reclaim_mode(priority, sc, false); |
1513 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
1514 | reclaim_mode |= ISOLATE_ACTIVE; | ||
1515 | |||
1421 | lru_add_drain(); | 1516 | lru_add_drain(); |
1517 | |||
1518 | if (!sc->may_unmap) | ||
1519 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1520 | if (!sc->may_writepage) | ||
1521 | reclaim_mode |= ISOLATE_CLEAN; | ||
1522 | |||
1422 | spin_lock_irq(&zone->lru_lock); | 1523 | spin_lock_irq(&zone->lru_lock); |
1423 | 1524 | ||
1424 | if (scanning_global_lru(sc)) { | 1525 | if (scanning_global_lru(sc)) { |
1425 | nr_taken = isolate_pages_global(nr_to_scan, | 1526 | nr_taken = isolate_pages_global(nr_to_scan, &page_list, |
1426 | &page_list, &nr_scanned, sc->order, | 1527 | &nr_scanned, sc->order, reclaim_mode, zone, 0, file); |
1427 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | ||
1428 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1429 | zone, 0, file); | ||
1430 | zone->pages_scanned += nr_scanned; | 1528 | zone->pages_scanned += nr_scanned; |
1431 | if (current_is_kswapd()) | 1529 | if (current_is_kswapd()) |
1432 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | 1530 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, |
@@ -1435,12 +1533,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1435 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1533 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1436 | nr_scanned); | 1534 | nr_scanned); |
1437 | } else { | 1535 | } else { |
1438 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1536 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, |
1439 | &page_list, &nr_scanned, sc->order, | 1537 | &nr_scanned, sc->order, reclaim_mode, zone, |
1440 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? | 1538 | sc->mem_cgroup, 0, file); |
1441 | ISOLATE_BOTH : ISOLATE_INACTIVE, | ||
1442 | zone, sc->mem_cgroup, | ||
1443 | 0, file); | ||
1444 | /* | 1539 | /* |
1445 | * mem_cgroup_isolate_pages() keeps track of | 1540 | * mem_cgroup_isolate_pages() keeps track of |
1446 | * scanned pages on its own. | 1541 | * scanned pages on its own. |
@@ -1542,19 +1637,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1542 | struct page *page; | 1637 | struct page *page; |
1543 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1638 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1544 | unsigned long nr_rotated = 0; | 1639 | unsigned long nr_rotated = 0; |
1640 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | ||
1545 | 1641 | ||
1546 | lru_add_drain(); | 1642 | lru_add_drain(); |
1643 | |||
1644 | if (!sc->may_unmap) | ||
1645 | reclaim_mode |= ISOLATE_UNMAPPED; | ||
1646 | if (!sc->may_writepage) | ||
1647 | reclaim_mode |= ISOLATE_CLEAN; | ||
1648 | |||
1547 | spin_lock_irq(&zone->lru_lock); | 1649 | spin_lock_irq(&zone->lru_lock); |
1548 | if (scanning_global_lru(sc)) { | 1650 | if (scanning_global_lru(sc)) { |
1549 | nr_taken = isolate_pages_global(nr_pages, &l_hold, | 1651 | nr_taken = isolate_pages_global(nr_pages, &l_hold, |
1550 | &pgscanned, sc->order, | 1652 | &pgscanned, sc->order, |
1551 | ISOLATE_ACTIVE, zone, | 1653 | reclaim_mode, zone, |
1552 | 1, file); | 1654 | 1, file); |
1553 | zone->pages_scanned += pgscanned; | 1655 | zone->pages_scanned += pgscanned; |
1554 | } else { | 1656 | } else { |
1555 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, | 1657 | nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, |
1556 | &pgscanned, sc->order, | 1658 | &pgscanned, sc->order, |
1557 | ISOLATE_ACTIVE, zone, | 1659 | reclaim_mode, zone, |
1558 | sc->mem_cgroup, 1, file); | 1660 | sc->mem_cgroup, 1, file); |
1559 | /* | 1661 | /* |
1560 | * mem_cgroup_isolate_pages() keeps track of | 1662 | * mem_cgroup_isolate_pages() keeps track of |
@@ -1747,23 +1849,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1747 | u64 fraction[2], denominator; | 1849 | u64 fraction[2], denominator; |
1748 | enum lru_list l; | 1850 | enum lru_list l; |
1749 | int noswap = 0; | 1851 | int noswap = 0; |
1750 | int force_scan = 0; | 1852 | bool force_scan = false; |
1751 | unsigned long nr_force_scan[2]; | 1853 | unsigned long nr_force_scan[2]; |
1752 | 1854 | ||
1753 | 1855 | /* kswapd does zone balancing and needs to scan this zone */ | |
1754 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1856 | if (scanning_global_lru(sc) && current_is_kswapd() && |
1755 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1857 | zone->all_unreclaimable) |
1756 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1858 | force_scan = true; |
1757 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 1859 | /* memcg may have small limit and need to avoid priority drop */ |
1758 | 1860 | if (!scanning_global_lru(sc)) | |
1759 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | 1861 | force_scan = true; |
1760 | /* kswapd does zone balancing and need to scan this zone */ | ||
1761 | if (scanning_global_lru(sc) && current_is_kswapd()) | ||
1762 | force_scan = 1; | ||
1763 | /* memcg may have small limit and need to avoid priority drop */ | ||
1764 | if (!scanning_global_lru(sc)) | ||
1765 | force_scan = 1; | ||
1766 | } | ||
1767 | 1862 | ||
1768 | /* If we have no swap space, do not bother scanning anon pages. */ | 1863 | /* If we have no swap space, do not bother scanning anon pages. */ |
1769 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1864 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1776,6 +1871,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1776 | goto out; | 1871 | goto out; |
1777 | } | 1872 | } |
1778 | 1873 | ||
1874 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1875 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1876 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1877 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1878 | |||
1779 | if (scanning_global_lru(sc)) { | 1879 | if (scanning_global_lru(sc)) { |
1780 | free = zone_page_state(zone, NR_FREE_PAGES); | 1880 | free = zone_page_state(zone, NR_FREE_PAGES); |
1781 | /* If we have very few page cache pages, | 1881 | /* If we have very few page cache pages, |
@@ -1912,8 +2012,9 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
1912 | * inactive lists are large enough, continue reclaiming | 2012 | * inactive lists are large enough, continue reclaiming |
1913 | */ | 2013 | */ |
1914 | pages_for_compaction = (2UL << sc->order); | 2014 | pages_for_compaction = (2UL << sc->order); |
1915 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | 2015 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); |
1916 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 2016 | if (nr_swap_pages > 0) |
2017 | inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1917 | if (sc->nr_reclaimed < pages_for_compaction && | 2018 | if (sc->nr_reclaimed < pages_for_compaction && |
1918 | inactive_lru_pages > pages_for_compaction) | 2019 | inactive_lru_pages > pages_for_compaction) |
1919 | return true; | 2020 | return true; |
@@ -1985,6 +2086,42 @@ restart: | |||
1985 | throttle_vm_writeout(sc->gfp_mask); | 2086 | throttle_vm_writeout(sc->gfp_mask); |
1986 | } | 2087 | } |
1987 | 2088 | ||
2089 | /* Returns true if compaction should go ahead for a high-order request */ | ||
2090 | static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | ||
2091 | { | ||
2092 | unsigned long balance_gap, watermark; | ||
2093 | bool watermark_ok; | ||
2094 | |||
2095 | /* Do not consider compaction for orders reclaim is meant to satisfy */ | ||
2096 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) | ||
2097 | return false; | ||
2098 | |||
2099 | /* | ||
2100 | * Compaction takes time to run and there are potentially other | ||
2101 | * callers using the pages just freed. Continue reclaiming until | ||
2102 | * there is a buffer of free pages available to give compaction | ||
2103 | * a reasonable chance of completing and allocating the page | ||
2104 | */ | ||
2105 | balance_gap = min(low_wmark_pages(zone), | ||
2106 | (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | ||
2107 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2108 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | ||
2109 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | ||
2110 | |||
2111 | /* | ||
2112 | * If compaction is deferred, reclaim up to a point where | ||
2113 | * compaction will have a chance of success when re-enabled | ||
2114 | */ | ||
2115 | if (compaction_deferred(zone)) | ||
2116 | return watermark_ok; | ||
2117 | |||
2118 | /* If compaction is not ready to start, keep reclaiming */ | ||
2119 | if (!compaction_suitable(zone, sc->order)) | ||
2120 | return false; | ||
2121 | |||
2122 | return watermark_ok; | ||
2123 | } | ||
2124 | |||
1988 | /* | 2125 | /* |
1989 | * This is the direct reclaim path, for page-allocating processes. We only | 2126 | * This is the direct reclaim path, for page-allocating processes. We only |
1990 | * try to reclaim pages from zones which will satisfy the caller's allocation | 2127 | * try to reclaim pages from zones which will satisfy the caller's allocation |
@@ -2000,14 +2137,20 @@ restart: | |||
2000 | * | 2137 | * |
2001 | * If a zone is deemed to be full of pinned pages then just give it a light | 2138 | * If a zone is deemed to be full of pinned pages then just give it a light |
2002 | * scan then give up on it. | 2139 | * scan then give up on it. |
2140 | * | ||
2141 | * This function returns true if a zone is being reclaimed for a costly | ||
2142 | * high-order allocation and compaction is ready to begin. This indicates to | ||
2143 | * the caller that it should consider retrying the allocation instead of | ||
2144 | * further reclaim. | ||
2003 | */ | 2145 | */ |
2004 | static void shrink_zones(int priority, struct zonelist *zonelist, | 2146 | static bool shrink_zones(int priority, struct zonelist *zonelist, |
2005 | struct scan_control *sc) | 2147 | struct scan_control *sc) |
2006 | { | 2148 | { |
2007 | struct zoneref *z; | 2149 | struct zoneref *z; |
2008 | struct zone *zone; | 2150 | struct zone *zone; |
2009 | unsigned long nr_soft_reclaimed; | 2151 | unsigned long nr_soft_reclaimed; |
2010 | unsigned long nr_soft_scanned; | 2152 | unsigned long nr_soft_scanned; |
2153 | bool aborted_reclaim = false; | ||
2011 | 2154 | ||
2012 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2155 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2013 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2156 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2022,6 +2165,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2022 | continue; | 2165 | continue; |
2023 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2166 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2024 | continue; /* Let kswapd poll it */ | 2167 | continue; /* Let kswapd poll it */ |
2168 | if (COMPACTION_BUILD) { | ||
2169 | /* | ||
2170 | * If we already have plenty of memory free for | ||
2171 | * compaction in this zone, don't free any more. | ||
2172 | * Even though compaction is invoked for any | ||
2173 | * non-zero order, only frequent costly order | ||
2174 | * reclamation is disruptive enough to become a | ||
2175 | * noticeable problem, like transparent huge page | ||
2176 | * allocations. | ||
2177 | */ | ||
2178 | if (compaction_ready(zone, sc)) { | ||
2179 | aborted_reclaim = true; | ||
2180 | continue; | ||
2181 | } | ||
2182 | } | ||
2025 | /* | 2183 | /* |
2026 | * This steals pages from memory cgroups over softlimit | 2184 | * This steals pages from memory cgroups over softlimit |
2027 | * and returns the number of reclaimed pages and | 2185 | * and returns the number of reclaimed pages and |
@@ -2039,6 +2197,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
2039 | 2197 | ||
2040 | shrink_zone(priority, zone, sc); | 2198 | shrink_zone(priority, zone, sc); |
2041 | } | 2199 | } |
2200 | |||
2201 | return aborted_reclaim; | ||
2042 | } | 2202 | } |
2043 | 2203 | ||
2044 | static bool zone_reclaimable(struct zone *zone) | 2204 | static bool zone_reclaimable(struct zone *zone) |
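
compaction_ready(), added above and consulted from shrink_zones(), lets reclaim for a costly high-order request stop early once the zone has enough free memory for compaction to have a realistic chance. The threshold is the high watermark plus a balance gap (the smaller of the low watermark and present_pages divided by KSWAPD_ZONE_BALANCE_GAP_RATIO, rounded up) plus twice the request size. A worked-number sketch with hypothetical zone values; the ratio of 100 is an assumption here:

    #include <stdio.h>

    #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100    /* assumed value, for illustration */

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        /* Hypothetical zone numbers, in pages. */
        unsigned long present_pages = 262144;    /* 1 GiB of 4 KiB pages */
        unsigned long low_wmark  = 1536;
        unsigned long high_wmark = 2048;
        unsigned int  order = 9;                 /* a THP-sized request */

        unsigned long balance_gap = min_ul(low_wmark,
                (present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                KSWAPD_ZONE_BALANCE_GAP_RATIO);
        unsigned long watermark = high_wmark + balance_gap + (2UL << order);

        /* Reclaim for this zone may stop, and compaction take over, once
         * free pages climb above 'watermark'. */
        printf("balance_gap=%lu watermark=%lu\n", balance_gap, watermark);
        return 0;
    }
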
@@ -2092,8 +2252,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2092 | struct zoneref *z; | 2252 | struct zoneref *z; |
2093 | struct zone *zone; | 2253 | struct zone *zone; |
2094 | unsigned long writeback_threshold; | 2254 | unsigned long writeback_threshold; |
2255 | bool aborted_reclaim; | ||
2095 | 2256 | ||
2096 | get_mems_allowed(); | ||
2097 | delayacct_freepages_start(); | 2257 | delayacct_freepages_start(); |
2098 | 2258 | ||
2099 | if (scanning_global_lru(sc)) | 2259 | if (scanning_global_lru(sc)) |
@@ -2103,7 +2263,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2103 | sc->nr_scanned = 0; | 2263 | sc->nr_scanned = 0; |
2104 | if (!priority) | 2264 | if (!priority) |
2105 | disable_swap_token(sc->mem_cgroup); | 2265 | disable_swap_token(sc->mem_cgroup); |
2106 | shrink_zones(priority, zonelist, sc); | 2266 | aborted_reclaim = shrink_zones(priority, zonelist, sc); |
2267 | |||
2107 | /* | 2268 | /* |
2108 | * Don't shrink slabs when reclaiming memory from | 2269 | * Don't shrink slabs when reclaiming memory from |
2109 | * over limit cgroups | 2270 | * over limit cgroups |
@@ -2155,7 +2316,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2155 | 2316 | ||
2156 | out: | 2317 | out: |
2157 | delayacct_freepages_end(); | 2318 | delayacct_freepages_end(); |
2158 | put_mems_allowed(); | ||
2159 | 2319 | ||
2160 | if (sc->nr_reclaimed) | 2320 | if (sc->nr_reclaimed) |
2161 | return sc->nr_reclaimed; | 2321 | return sc->nr_reclaimed; |
@@ -2168,6 +2328,10 @@ out: | |||
2168 | if (oom_killer_disabled) | 2328 | if (oom_killer_disabled) |
2169 | return 0; | 2329 | return 0; |
2170 | 2330 | ||
2331 | /* Aborted reclaim to try compaction? don't OOM, then */ | ||
2332 | if (aborted_reclaim) | ||
2333 | return 1; | ||
2334 | |||
2171 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 2335 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
2172 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) | 2336 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) |
2173 | return 1; | 2337 | return 1; |
@@ -2459,6 +2623,9 @@ loop_again: | |||
2459 | high_wmark_pages(zone), 0, 0)) { | 2623 | high_wmark_pages(zone), 0, 0)) { |
2460 | end_zone = i; | 2624 | end_zone = i; |
2461 | break; | 2625 | break; |
2626 | } else { | ||
2627 | /* If balanced, clear the congested flag */ | ||
2628 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2462 | } | 2629 | } |
2463 | } | 2630 | } |
2464 | if (i < 0) | 2631 | if (i < 0) |
@@ -2695,7 +2862,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2695 | * them before going back to sleep. | 2862 | * them before going back to sleep. |
2696 | */ | 2863 | */ |
2697 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2864 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
2698 | schedule(); | 2865 | |
2866 | if (!kthread_should_stop()) | ||
2867 | schedule(); | ||
2868 | |||
2699 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | 2869 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); |
2700 | } else { | 2870 | } else { |
2701 | if (remaining) | 2871 | if (remaining) |
@@ -2722,7 +2892,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2722 | static int kswapd(void *p) | 2892 | static int kswapd(void *p) |
2723 | { | 2893 | { |
2724 | unsigned long order, new_order; | 2894 | unsigned long order, new_order; |
2895 | unsigned balanced_order; | ||
2725 | int classzone_idx, new_classzone_idx; | 2896 | int classzone_idx, new_classzone_idx; |
2897 | int balanced_classzone_idx; | ||
2726 | pg_data_t *pgdat = (pg_data_t*)p; | 2898 | pg_data_t *pgdat = (pg_data_t*)p; |
2727 | struct task_struct *tsk = current; | 2899 | struct task_struct *tsk = current; |
2728 | 2900 | ||
@@ -2753,7 +2925,9 @@ static int kswapd(void *p) | |||
2753 | set_freezable(); | 2925 | set_freezable(); |
2754 | 2926 | ||
2755 | order = new_order = 0; | 2927 | order = new_order = 0; |
2928 | balanced_order = 0; | ||
2756 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | 2929 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2930 | balanced_classzone_idx = classzone_idx; | ||
2757 | for ( ; ; ) { | 2931 | for ( ; ; ) { |
2758 | int ret; | 2932 | int ret; |
2759 | 2933 | ||
@@ -2762,7 +2936,8 @@ static int kswapd(void *p) | |||
2762 | * new request of a similar or harder type will succeed soon | 2936 | * new request of a similar or harder type will succeed soon |
2763 | * so consider going to sleep on the basis we reclaimed at | 2937 | * so consider going to sleep on the basis we reclaimed at |
2764 | */ | 2938 | */ |
2765 | if (classzone_idx >= new_classzone_idx && order == new_order) { | 2939 | if (balanced_classzone_idx >= new_classzone_idx && |
2940 | balanced_order == new_order) { | ||
2766 | new_order = pgdat->kswapd_max_order; | 2941 | new_order = pgdat->kswapd_max_order; |
2767 | new_classzone_idx = pgdat->classzone_idx; | 2942 | new_classzone_idx = pgdat->classzone_idx; |
2768 | pgdat->kswapd_max_order = 0; | 2943 | pgdat->kswapd_max_order = 0; |
@@ -2777,9 +2952,12 @@ static int kswapd(void *p) | |||
2777 | order = new_order; | 2952 | order = new_order; |
2778 | classzone_idx = new_classzone_idx; | 2953 | classzone_idx = new_classzone_idx; |
2779 | } else { | 2954 | } else { |
2780 | kswapd_try_to_sleep(pgdat, order, classzone_idx); | 2955 | kswapd_try_to_sleep(pgdat, balanced_order, |
2956 | balanced_classzone_idx); | ||
2781 | order = pgdat->kswapd_max_order; | 2957 | order = pgdat->kswapd_max_order; |
2782 | classzone_idx = pgdat->classzone_idx; | 2958 | classzone_idx = pgdat->classzone_idx; |
2959 | new_order = order; | ||
2960 | new_classzone_idx = classzone_idx; | ||
2783 | pgdat->kswapd_max_order = 0; | 2961 | pgdat->kswapd_max_order = 0; |
2784 | pgdat->classzone_idx = pgdat->nr_zones - 1; | 2962 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
2785 | } | 2963 | } |
@@ -2794,7 +2972,9 @@ static int kswapd(void *p) | |||
2794 | */ | 2972 | */ |
2795 | if (!ret) { | 2973 | if (!ret) { |
2796 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2974 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2797 | order = balance_pgdat(pgdat, order, &classzone_idx); | 2975 | balanced_classzone_idx = classzone_idx; |
2976 | balanced_order = balance_pgdat(pgdat, order, | ||
2977 | &balanced_classzone_idx); | ||
2798 | } | 2978 | } |
2799 | } | 2979 | } |
2800 | return 0; | 2980 | return 0; |
@@ -2952,14 +3132,17 @@ int kswapd_run(int nid) | |||
2952 | } | 3132 | } |
2953 | 3133 | ||
2954 | /* | 3134 | /* |
2955 | * Called by memory hotplug when all memory in a node is offlined. | 3135 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
3136 | * hold lock_memory_hotplug(). | ||
2956 | */ | 3137 | */ |
2957 | void kswapd_stop(int nid) | 3138 | void kswapd_stop(int nid) |
2958 | { | 3139 | { |
2959 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | 3140 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; |
2960 | 3141 | ||
2961 | if (kswapd) | 3142 | if (kswapd) { |
2962 | kthread_stop(kswapd); | 3143 | kthread_stop(kswapd); |
3144 | NODE_DATA(nid)->kswapd = NULL; | ||
3145 | } | ||
2963 | } | 3146 | } |
2964 | 3147 | ||
2965 | static int __init kswapd_init(void) | 3148 | static int __init kswapd_init(void) |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b..6559013c5a1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu) | |||
78 | * | 78 | * |
79 | * vm_stat contains the global counters | 79 | * vm_stat contains the global counters |
80 | */ | 80 | */ |
81 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | 81 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; |
82 | EXPORT_SYMBOL(vm_stat); | 82 | EXPORT_SYMBOL(vm_stat); |
83 | 83 | ||
84 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |