author     Dan Murphy    2014-10-09 08:46:37 -0500
committer  Dan Murphy    2014-10-09 08:46:37 -0500
commit     68f449afccf6d1fb6b38bddfc3a40d9e97b53bdf (patch)
tree       5dd817815fc283a5a5629b937e4c3c2cf9cf8b17
parent     595e0e568639ef203725532e9f4a767e8a7e3281 (diff)
parent     b0807bc10a6ac95ab8bf3bbf57703a0f2edd9aa9 (diff)
Merge tag 'v3.12.30' of http://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable into ti-linux-3.12.y
This is the 3.12.30 stable release

* tag 'v3.12.30' of http://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable: (97 commits)
  Linux 3.12.30
  mm: page_alloc: reduce cost of the fair zone allocation policy
  mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered
  mm: vmscan: only update per-cpu thresholds for online CPU
  mm: move zone->pages_scanned into a vmstat counter
  mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines
  mm: pagemap: avoid unnecessary overhead when tracepoints are deactivated
  memcg, vmscan: Fix forced scan of anonymous pages
  vmalloc: use rcu list iterator to reduce vmap_area_lock contention
  mm: make copy_pte_range static again
  mm, thp: only collapse hugepages to nodes with affinity for zone_reclaim_mode
  mm/memory.c: use entry = ACCESS_ONCE(*pte) in handle_pte_fault()
  shmem: fix init_page_accessed use to stop !PageLRU bug
  mm: avoid unnecessary atomic operations during end_page_writeback()
  mm: non-atomically mark page accessed during page cache allocation where possible
  fs: buffer: do not use unnecessary atomic operations when discarding buffers
  mm: do not use unnecessary atomic operations when adding pages to the LRU
  mm: do not use atomic operations when releasing pages
  mm: shmem: avoid atomic operation during shmem_getpage_gfp
  mm: page_alloc: lookup pageblock migratetype with IRQs enabled during free
  ...

Signed-off-by: Dan Murphy <DMurphy@ti.com>
-rw-r--r--  Makefile | 2
-rw-r--r--  arch/tile/mm/homecache.c | 2
-rw-r--r--  arch/unicore32/include/asm/mmu_context.h | 4
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 4
-rw-r--r--  arch/x86/mm/pgtable.c | 21
-rw-r--r--  arch/x86/mm/tlb.c | 52
-rw-r--r--  fs/btrfs/compression.c | 2
-rw-r--r--  fs/btrfs/extent_io.c | 15
-rw-r--r--  fs/btrfs/file.c | 10
-rw-r--r--  fs/buffer.c | 28
-rw-r--r--  fs/cramfs/inode.c | 3
-rw-r--r--  fs/exec.c | 5
-rw-r--r--  fs/ext4/mballoc.c | 14
-rw-r--r--  fs/f2fs/checkpoint.c | 1
-rw-r--r--  fs/f2fs/node.c | 2
-rw-r--r--  fs/fuse/dev.c | 2
-rw-r--r--  fs/fuse/file.c | 4
-rw-r--r--  fs/gfs2/aops.c | 1
-rw-r--r--  fs/gfs2/meta_io.c | 4
-rw-r--r--  fs/hugetlbfs/inode.c | 5
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/ntfs/attrib.c | 1
-rw-r--r--  fs/ntfs/file.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 3
-rw-r--r--  fs/super.c | 16
-rw-r--r--  include/linux/compaction.h | 20
-rw-r--r--  include/linux/cpuset.h | 56
-rw-r--r--  include/linux/gfp.h | 4
-rw-r--r--  include/linux/huge_mm.h | 4
-rw-r--r--  include/linux/hugetlb.h | 10
-rw-r--r--  include/linux/jump_label.h | 20
-rw-r--r--  include/linux/migrate.h | 11
-rw-r--r--  include/linux/mm.h | 11
-rw-r--r--  include/linux/mm_types.h | 4
-rw-r--r--  include/linux/mmzone.h | 233
-rw-r--r--  include/linux/page-flags.h | 6
-rw-r--r--  include/linux/pageblock-flags.h | 33
-rw-r--r--  include/linux/pagemap.h | 131
-rw-r--r--  include/linux/pagevec.h | 5
-rw-r--r--  include/linux/plist.h | 45
-rw-r--r--  include/linux/radix-tree.h | 5
-rw-r--r--  include/linux/sched.h | 7
-rw-r--r--  include/linux/shmem_fs.h | 1
-rw-r--r--  include/linux/swap.h | 30
-rw-r--r--  include/linux/swapfile.h | 2
-rw-r--r--  include/linux/vm_event_item.h | 4
-rw-r--r--  include/linux/vmacache.h | 38
-rw-r--r--  include/linux/vmstat.h | 8
-rw-r--r--  include/trace/events/compaction.h | 67
-rw-r--r--  include/trace/events/kmem.h | 10
-rw-r--r--  include/trace/events/pagemap.h | 16
-rw-r--r--  kernel/cpuset.c | 16
-rw-r--r--  kernel/debug/debug_core.c | 14
-rw-r--r--  kernel/fork.c | 7
-rw-r--r--  lib/plist.c | 52
-rw-r--r--  lib/radix-tree.c | 106
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/compaction.c | 347
-rw-r--r--  mm/filemap.c | 470
-rw-r--r--  mm/fremap.c | 28
-rw-r--r--  mm/frontswap.c | 13
-rw-r--r--  mm/huge_memory.c | 93
-rw-r--r--  mm/hugetlb.c | 17
-rw-r--r--  mm/internal.h | 22
-rw-r--r--  mm/madvise.c | 2
-rw-r--r--  mm/memory-failure.c | 4
-rw-r--r--  mm/memory.c | 4
-rw-r--r--  mm/memory_hotplug.c | 2
-rw-r--r--  mm/mempolicy.c | 16
-rw-r--r--  mm/migrate.c | 56
-rw-r--r--  mm/mincore.c | 20
-rw-r--r--  mm/mmap.c | 55
-rw-r--r--  mm/nommu.c | 24
-rw-r--r--  mm/page_alloc.c | 423
-rw-r--r--  mm/readahead.c | 37
-rw-r--r--  mm/shmem.c | 133
-rw-r--r--  mm/slab.c | 12
-rw-r--r--  mm/slub.c | 16
-rw-r--r--  mm/swap.c | 101
-rw-r--r--  mm/swap_state.c | 65
-rw-r--r--  mm/swapfile.c | 224
-rw-r--r--  mm/truncate.c | 74
-rw-r--r--  mm/vmacache.c | 114
-rw-r--r--  mm/vmalloc.c | 6
-rw-r--r--  mm/vmscan.c | 144
-rw-r--r--  mm/vmstat.c | 13
88 files changed, 2367 insertions(+), 1358 deletions(-)
diff --git a/Makefile b/Makefile
index 67cec33d00c7..1ad1566225ca 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 3
 PATCHLEVEL = 12
-SUBLEVEL = 29
+SUBLEVEL = 30
 EXTRAVERSION =
 NAME = One Giant Leap for Frogkind
 
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 004ba568d93f..33294fdc402e 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order)
 	if (put_page_testzero(page)) {
 		homecache_change_page_home(page, order, PAGE_HOME_HASH);
 		if (order == 0) {
-			free_hot_cold_page(page, 0);
+			free_hot_cold_page(page, false);
 		} else {
 			init_page_count(page);
 			__free_pages(page, order);
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index fb5e4c658f7a..ef470a7a3d0f 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -14,6 +14,8 @@
 
 #include <linux/compiler.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/io.h>
 
 #include <asm/cacheflush.h>
@@ -73,7 +75,7 @@ do { \
 	else \
 		mm->mmap = NULL; \
 	rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-	mm->mmap_cache = NULL; \
+	vmacache_invalidate(mm); \
 	mm->map_count--; \
 	remove_vma(high_vma); \
 	} \
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e6d90babc245..04905bfc508b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void)
 
 static inline void __flush_tlb_one(unsigned long addr)
 {
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 	__flush_tlb_single(addr);
 }
 
@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr)
  */
 static inline void __flush_tlb_up(void)
 {
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 }
 
 static inline void flush_tlb_all(void)
 {
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb_all();
 }
 
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index ce2d0a2c3e4f..0e25a1bc5ab5 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	}
 
 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 
 	/* Save MTRR state */
@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 static void post_set(void) __releases(set_atomicity_lock)
 {
 	/* Flush TLBs (no need to flush caches - they are disabled) */
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 
 	/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfa537a03be1..5da29d04de2f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -386,13 +386,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 int ptep_clear_flush_young(struct vm_area_struct *vma,
 			   unsigned long address, pte_t *ptep)
 {
-	int young;
-
-	young = ptep_test_and_clear_young(vma, address, ptep);
-	if (young)
-		flush_tlb_page(vma, address);
-
-	return young;
+	/*
+	 * On x86 CPUs, clearing the accessed bit without a TLB flush
+	 * doesn't cause data corruption. [ It could cause incorrect
+	 * page aging and the (mistaken) reclaim of hot pages, but the
+	 * chance of that should be relatively low. ]
+	 *
+	 * So as a performance optimization don't flush the TLB when
+	 * clearing the accessed bit, it will eventually be flushed by
+	 * a context switch or a VM operation anyway. [ In the rare
+	 * event of it not getting flushed for a long time the delay
+	 * shouldn't really matter because there's no real memory
+	 * pressure for swapout to react to. ]
+	 */
+	return ptep_test_and_clear_young(vma, address, ptep);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
 	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
 		return;
 
-	count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
 		if (f->flush_end == TLB_FLUSH_ALL)
 			local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 	info.flush_start = start;
 	info.flush_end = end;
 
-	count_vm_event(NR_TLB_REMOTE_FLUSH);
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (is_uv_system()) {
 		unsigned int cpu;
 
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
 
 	preempt_disable();
 
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	local_flush_tlb();
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
-				unsigned long start, unsigned long end)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	unsigned long addr = ALIGN(start, HPAGE_SIZE);
-	for (; addr < end; addr += HPAGE_SIZE) {
-		pgd = pgd_offset(mm, addr);
-		if (likely(!pgd_none(*pgd))) {
-			pud = pud_offset(pgd, addr);
-			if (likely(!pud_none(*pud))) {
-				pmd = pmd_offset(pud, addr);
-				if (likely(!pmd_none(*pmd)))
-					if (pmd_large(*pmd))
-						return addr;
-			}
-		}
-	}
-	return 0;
-}
-
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned long vmflag)
 {
 	unsigned long addr;
 	unsigned act_entries, tlb_entries = 0;
+	unsigned long nr_base_pages;
 
 	preempt_disable();
 	if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		tlb_entries = tlb_lli_4k[ENTRIES];
 	else
 		tlb_entries = tlb_lld_4k[ENTRIES];
+
 	/* Assume all of TLB entries was occupied by this task */
-	act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+	act_entries = tlb_entries >> tlb_flushall_shift;
+	act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+	nr_base_pages = (end - start) >> PAGE_SHIFT;
 
 	/* tlb_flushall_shift is on balance point, details in commit log */
-	if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
-		count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	if (nr_base_pages > act_entries) {
+		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 		local_flush_tlb();
 	} else {
-		if (has_large_page(mm, start, end)) {
-			local_flush_tlb();
-			goto flush_all;
-		}
 		/* flush range by one by one 'invlpg' */
 		for (addr = start; addr < end; addr += PAGE_SIZE) {
-			count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 			__flush_tlb_single(addr);
 		}
 
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 
 static void do_flush_tlb_all(void *info)
 {
-	count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 	__flush_tlb_all();
 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
 		leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
 
 void flush_tlb_all(void)
 {
-	count_vm_event(NR_TLB_REMOTE_FLUSH);
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6e9ff8fac75a..6357298932bf 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->page_tree, pg_index);
 		rcu_read_unlock();
-		if (page) {
+		if (page && !radix_tree_exceptional_entry(page)) {
 			misses++;
 			if (misses > 4)
 				break;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 594bbfd4996e..7015d9079bd1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4446,7 +4446,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	spin_unlock(&eb->refs_lock);
 }
 
-static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+					struct page *accessed)
 {
 	unsigned long num_pages, i;
 
@@ -4455,7 +4456,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
-		mark_page_accessed(p);
+		if (p != accessed)
+			mark_page_accessed(p);
 	}
 }
 
@@ -4476,7 +4478,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
-		mark_extent_buffer_accessed(eb);
+		mark_extent_buffer_accessed(eb, NULL);
 		return eb;
 	}
 	rcu_read_unlock();
@@ -4504,7 +4506,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 			spin_unlock(&mapping->private_lock);
 			unlock_page(p);
 			page_cache_release(p);
-			mark_extent_buffer_accessed(exists);
+			mark_extent_buffer_accessed(exists, p);
 			goto free_eb;
 		}
 
@@ -4519,7 +4521,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		attach_extent_buffer_page(eb, p);
 		spin_unlock(&mapping->private_lock);
 		WARN_ON(PageDirty(p));
-		mark_page_accessed(p);
 		eb->pages[i] = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
@@ -4549,7 +4550,7 @@ again:
 		}
 		spin_unlock(&tree->buffer_lock);
 		radix_tree_preload_end();
-		mark_extent_buffer_accessed(exists);
+		mark_extent_buffer_accessed(exists, NULL);
 		goto free_eb;
 	}
 	/* add one reference for the tree */
@@ -4595,7 +4596,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
-		mark_extent_buffer_accessed(eb);
+		mark_extent_buffer_accessed(eb, NULL);
 		return eb;
 	}
 	rcu_read_unlock();
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 72da4df53c9a..ad80dfa6cf91 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -426,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 		struct page *page = prepared_pages[pg];
 		/*
 		 * Copy data from userspace to the current page
-		 *
-		 * Disable pagefault to avoid recursive lock since
-		 * the pages are already locked
 		 */
-		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
-		pagefault_enable();
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
@@ -476,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 	for (i = 0; i < num_pages; i++) {
 		/* page checked is some magic around finding pages that
 		 * have been modified without going through btrfs_set_page_dirty
-		 * clear it here
+		 * clear it here. There should be no need to mark the pages
+		 * accessed as prepare_pages should have marked them accessed
+		 * in prepare_pages via find_or_create_page()
 		 */
 		ClearPageChecked(pages[i]);
 		unlock_page(pages[i]);
-		mark_page_accessed(pages[i]);
 		page_cache_release(pages[i]);
 	}
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index aeeea6529bcd..b7888527f7c3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	int all_mapped = 1;
 
 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
-	page = find_get_page(bd_mapping, index);
+	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
 	if (!page)
 		goto out;
 
@@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
 
 	if (bh == NULL) {
+		/* __find_get_block_slow will mark the page accessed */
 		bh = __find_get_block_slow(bdev, block);
 		if (bh)
 			bh_lru_install(bh);
-	}
-	if (bh)
+	} else
 		touch_buffer(bh);
+
 	return bh;
 }
 EXPORT_SYMBOL(__find_get_block);
@@ -1483,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page);
 /*
  * Called when truncating a buffer on a page completely.
  */
+
+/* Bits that are cleared during an invalidate */
+#define BUFFER_FLAGS_DISCARD \
+	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
+	 1 << BH_Delay | 1 << BH_Unwritten)
+
 static void discard_buffer(struct buffer_head * bh)
 {
+	unsigned long b_state, b_state_old;
+
 	lock_buffer(bh);
 	clear_buffer_dirty(bh);
 	bh->b_bdev = NULL;
-	clear_buffer_mapped(bh);
-	clear_buffer_req(bh);
-	clear_buffer_new(bh);
-	clear_buffer_delay(bh);
-	clear_buffer_unwritten(bh);
+	b_state = bh->b_state;
+	for (;;) {
+		b_state_old = cmpxchg(&bh->b_state, b_state,
+				      (b_state & ~BUFFER_FLAGS_DISCARD));
+		if (b_state_old == b_state)
+			break;
+		b_state = b_state_old;
+	}
 	unlock_buffer(bh);
 }
 
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index e501ac3a49ff..2f6cfcaa55fd 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -179,8 +179,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_mapping_page_async(mapping, blocknr + i,
-							NULL);
+			page = read_mapping_page(mapping, blocknr + i, NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
diff --git a/fs/exec.c b/fs/exec.c
index 95eef54de2b6..26bb91bf203b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
@@ -818,7 +819,7 @@ EXPORT_SYMBOL(read_code);
 static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
-	struct mm_struct * old_mm, *active_mm;
+	struct mm_struct *old_mm, *active_mm;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
@@ -844,6 +845,8 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	tsk->mm->vmacache_seqnum = 0;
+	vmacache_flush(tsk);
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 242226a87be7..7620133f78bf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	 * allocating. If we are looking at the buddy cache we would
 	 * have taken a reference using ext4_mb_load_buddy and that
 	 * would have pinned buddy page to page cache.
+	 * The call to ext4_mb_get_buddy_page_lock will mark the
+	 * page accessed.
 	 */
 	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
 	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 		ret = -EIO;
 		goto err;
 	}
-	mark_page_accessed(page);
 
 	if (e4b.bd_buddy_page == NULL) {
 		/*
@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 		ret = -EIO;
 		goto err;
 	}
-	mark_page_accessed(page);
 err:
 	ext4_mb_put_buddy_page_lock(&e4b);
 	return ret;
@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 
 	/* we could use find_or_create_page(), but it locks page
 	 * what we'd like to avoid in fast path ... */
-	page = find_get_page(inode->i_mapping, pnum);
+	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
 	if (page == NULL || !PageUptodate(page)) {
 		if (page)
 			/*
@@ -1172,15 +1172,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 		ret = -EIO;
 		goto err;
 	}
+
+	/* Pages marked accessed already */
 	e4b->bd_bitmap_page = page;
 	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
-	mark_page_accessed(page);
 
 	block++;
 	pnum = block / blocks_per_page;
 	poff = block % blocks_per_page;
 
-	page = find_get_page(inode->i_mapping, pnum);
+	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
 	if (page == NULL || !PageUptodate(page)) {
 		if (page)
 			page_cache_release(page);
@@ -1201,9 +1202,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 		ret = -EIO;
 		goto err;
 	}
+
+	/* Pages marked accessed already */
 	e4b->bd_buddy_page = page;
 	e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
-	mark_page_accessed(page);
 
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index bb312201ca95..15a29af63e20 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -70,7 +70,6 @@ repeat:
 		goto repeat;
 	}
 out:
-	mark_page_accessed(page);
 	return page;
 }
 
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 51ef27894433..d0335bdb65b4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -970,7 +970,6 @@ repeat:
 	}
 got_it:
 	BUG_ON(nid != nid_of_node(page));
-	mark_page_accessed(page);
 	return page;
 }
 
@@ -1026,7 +1025,6 @@ page_hit:
 		f2fs_put_page(page, 1);
 		return ERR_PTR(-EIO);
 	}
-	mark_page_accessed(page);
 	return page;
 }
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fa8cb4b7b8fe..fc8e4991736a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1613,7 +1613,7 @@ out_finish:
 
 static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	release_pages(req->pages, req->num_pages, 0);
+	release_pages(req->pages, req->num_pages, false);
 }
 
 static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4598345ab87d..d08c108065e1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -985,13 +985,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
-		pagefault_disable();
 		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
-		pagefault_enable();
 		flush_dcache_page(page);
 
-		mark_page_accessed(page);
-
 		if (!tmp) {
 			unlock_page(page);
 			page_cache_release(page);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1253c2006029..f3aee0bbe886 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -517,7 +517,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
 		p = kmap_atomic(page);
 		memcpy(buf + copied, p + offset, amt);
 		kunmap_atomic(p);
-		mark_page_accessed(page);
 		page_cache_release(page);
 		copied += amt;
 		index++;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 52f177be3bf8..89afe3a8f626 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -128,7 +128,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 				yield();
 		}
 	} else {
-		page = find_lock_page(mapping, index);
+		page = find_get_page_flags(mapping, index,
+					   FGP_LOCK|FGP_ACCESSED);
 		if (!page)
 			return NULL;
 	}
@@ -145,7 +146,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 	map_bh(bh, sdp->sd_vfs, blkno);
 
 	unlock_page(page);
-	mark_page_accessed(page);
 	page_cache_release(page);
 
 	return bh;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf1..a4a8ed56e438 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
 	int error;
 	int i;
 
+	if (!hugepages_supported()) {
+		pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
+		return -ENOTSUPP;
+	}
+
 	error = bdi_init(&hugetlbfs_backing_dev_info);
 	if (error)
 		return error;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 02003f02dd92..5a2c26525cfc 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -682,7 +682,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
 	struct inode *inode = OFNI_EDONI_2SFFJ(f);
 	struct page *pg;
 
-	pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+	pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
 			     (void *)jffs2_do_readpage_unlock, inode);
 	if (IS_ERR(pg))
 		return (void *)pg;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e242bbf72972..fdb74cbb9e0c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1220,7 +1220,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
 	if (end != NFS_I(inode)->npages) {
 		rcu_read_lock();
-		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
 		rcu_read_unlock();
 	}
 
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index a27e3fecefaf..250ed5b20c8f 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 	if (page) {
 		set_page_dirty(page);
 		unlock_page(page);
-		mark_page_accessed(page);
 		page_cache_release(page);
 	}
 	ntfs_debug("Done.");
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ea4ba9daeb47..a0b2f345da2b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 		}
 		do {
 			unlock_page(pages[--do_pages]);
-			mark_page_accessed(pages[do_pages]);
 			page_cache_release(pages[do_pages]);
 		} while (do_pages);
 		if (unlikely(status))
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ad4df869c907..7724fbdf443f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>
 #include <linux/mount.h>
@@ -159,7 +160,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 
 	/*
 	 * We remember last_addr rather than next_addr to hit with
-	 * mmap_cache most of the time. We have zero last_addr at
+	 * vmacache most of the time. We have zero last_addr at
 	 * the beginning and also after lseek. We will have -1 last_addr
 	 * after the end of the vmas.
 	 */
diff --git a/fs/super.c b/fs/super.c
index d127de207376..fb68a4c90c98 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 
 	sb = container_of(shrink, struct super_block, s_shrink);
 
-	if (!grab_super_passive(sb))
-		return 0;
-
+	/*
+	 * Don't call grab_super_passive as it is a potential
+	 * scalability bottleneck. The counts could get updated
+	 * between super_cache_count and super_cache_scan anyway.
+	 * Call to super_cache_count with shrinker_rwsem held
+	 * ensures the safety of call to list_lru_count_node() and
+	 * s_op->nr_cached_objects().
+	 */
 	if (sb->s_op && sb->s_op->nr_cached_objects)
 		total_objects = sb->s_op->nr_cached_objects(sb,
 						 sc->nid);
@@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 						 sc->nid);
 
 	total_objects = vfs_pressure_ratio(total_objects);
-	drop_super(sb);
 	return total_objects;
 }
 
@@ -321,10 +325,8 @@ void deactivate_locked_super(struct super_block *s)
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
 		cleancache_invalidate_fs(s);
-		fs->kill_sb(s);
-
-		/* caches are now gone, we can safely kill the shrinker now */
 		unregister_shrinker(&s->s_shrink);
+		fs->kill_sb(s);
 
 		put_filesystem(fs);
 		put_super(s);
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 091d72e70d8a..01e3132820da 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended);
+			enum migrate_mode mode, bool *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order)
 	return zone->compact_considered < defer_limit;
 }
 
+/*
+ * Update defer tracking counters after successful compaction of given order,
+ * which means an allocation either succeeded (alloc_success == true) or is
+ * expected to succeed.
+ */
+static inline void compaction_defer_reset(struct zone *zone, int order,
+		bool alloc_success)
+{
+	if (alloc_success) {
+		zone->compact_considered = 0;
+		zone->compact_defer_shift = 0;
+	}
+	if (order >= zone->compact_order_failed)
+		zone->compact_order_failed = order + 1;
+}
+
 /* Returns true if restarting compaction after many failures */
 static inline bool compaction_restarting(struct zone *zone, int order)
 {
@@ -75,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode mode, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index cc1b01cf2035..a7ebb89ae9fb 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -12,10 +12,31 @@
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/mm.h>
+#include <linux/jump_label.h>
 
 #ifdef CONFIG_CPUSETS
 
-extern int number_of_cpusets;	/* How many cpusets are defined in system? */
+extern struct static_key cpusets_enabled_key;
+static inline bool cpusets_enabled(void)
+{
+	return static_key_false(&cpusets_enabled_key);
+}
+
+static inline int nr_cpusets(void)
+{
+	/* jump label reference count + the top-level cpuset */
+	return static_key_count(&cpusets_enabled_key) + 1;
+}
+
+static inline void cpuset_inc(void)
+{
+	static_key_slow_inc(&cpusets_enabled_key);
+}
+
+static inline void cpuset_dec(void)
+{
+	static_key_slow_dec(&cpusets_enabled_key);
+}
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
 
 static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 {
-	return number_of_cpusets <= 1 ||
+	return nr_cpusets() <= 1 ||
 		__cpuset_node_allowed_softwall(node, gfp_mask);
 }
 
 static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 {
-	return number_of_cpusets <= 1 ||
+	return nr_cpusets() <= 1 ||
 		__cpuset_node_allowed_hardwall(node, gfp_mask);
 }
 
@@ -87,25 +108,26 @@ extern void rebuild_sched_domains(void);
 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
 /*
- * get_mems_allowed is required when making decisions involving mems_allowed
- * such as during page allocation. mems_allowed can be updated in parallel
- * and depending on the new value an operation can fail potentially causing
- * process failure. A retry loop with get_mems_allowed and put_mems_allowed
- * prevents these artificial failures.
+ * read_mems_allowed_begin is required when making decisions involving
+ * mems_allowed such as during page allocation. mems_allowed can be updated in
+ * parallel and depending on the new value an operation can fail potentially
+ * causing process failure. A retry loop with read_mems_allowed_begin and
+ * read_mems_allowed_retry prevents these artificial failures.
  */
-static inline unsigned int get_mems_allowed(void)
+static inline unsigned int read_mems_allowed_begin(void)
 {
 	return read_seqcount_begin(&current->mems_allowed_seq);
 }
 
 /*
- * If this returns false, the operation that took place after get_mems_allowed
- * may have failed. It is up to the caller to retry the operation if
+ * If this returns true, the operation that took place after
+ * read_mems_allowed_begin may have failed artificially due to a concurrent
+ * update of mems_allowed. It is up to the caller to retry the operation if
  * appropriate.
  */
-static inline bool put_mems_allowed(unsigned int seq)
+static inline bool read_mems_allowed_retry(unsigned int seq)
 {
-	return !read_seqcount_retry(&current->mems_allowed_seq, seq);
+	return read_seqcount_retry(&current->mems_allowed_seq, seq);
 }
 
 static inline void set_mems_allowed(nodemask_t nodemask)
@@ -119,6 +141,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 
 #else /* !CONFIG_CPUSETS */
 
+static inline bool cpusets_enabled(void) { return false; }
+
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
@@ -221,14 +245,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 {
 }
 
-static inline unsigned int get_mems_allowed(void)
+static inline unsigned int read_mems_allowed_begin(void)
 {
 	return 0;
 }
 
-static inline bool put_mems_allowed(unsigned int seq)
+static inline bool read_mems_allowed_retry(unsigned int seq)
 {
-	return true;
+	return false;
 }
 
 #endif /* !CONFIG_CPUSETS */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 9b4dd491f7e8..fa7ac989ff56 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -364,8 +364,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
-extern void free_hot_cold_page(struct page *page, int cold);
-extern void free_hot_cold_page_list(struct list_head *list, int cold);
+extern void free_hot_cold_page(struct page *page, bool cold);
+extern void free_hot_cold_page_list(struct list_head *list, bool cold);
 
 extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
 extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a291552ab767..aac671be9581 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -92,10 +92,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 #endif /* CONFIG_DEBUG_VM */
 
 extern unsigned long transparent_hugepage_flags;
-extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-			  pmd_t *dst_pmd, pmd_t *src_pmd,
-			  struct vm_area_struct *vma,
-			  unsigned long addr, unsigned long end);
 extern int split_huge_page_to_list(struct page *page, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5214ff63c351..511b1a0d6cc2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -396,6 +396,16 @@ static inline int hugepage_migration_support(struct hstate *h)
 #endif
 }
 
+static inline bool hugepages_supported(void)
+{
+	/*
+	 * Some platform decide whether they support huge pages at boot
+	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+	 * there is no such support
+	 */
+	return HPAGE_SHIFT != 0;
+}
+
 #else /* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page_node(h, nid) NULL
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index a5079072da66..9216e465289a 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -62,6 +62,10 @@ struct static_key {
 
 # include <asm/jump_label.h>
 # define HAVE_JUMP_LABEL
+#else
+struct static_key {
+	atomic_t enabled;
+};
 #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
 
 enum jump_label_type {
@@ -72,6 +76,12 @@ enum jump_label_type {
 struct module;
 
 #include <linux/atomic.h>
+
+static inline int static_key_count(struct static_key *key)
+{
+	return atomic_read(&key->enabled);
+}
+
 #ifdef HAVE_JUMP_LABEL
 
 #define JUMP_LABEL_TRUE_BRANCH 1UL
@@ -122,24 +132,20 @@ extern void jump_label_apply_nops(struct module *mod);
 
 #else /* !HAVE_JUMP_LABEL */
 
-struct static_key {
-	atomic_t enabled;
-};
-
 static __always_inline void jump_label_init(void)
 {
 }
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
-	if (unlikely(atomic_read(&key->enabled)) > 0)
+	if (unlikely(static_key_count(key) > 0))
 		return true;
 	return false;
 }
 
 static __always_inline bool static_key_true(struct static_key *key)
 {
-	if (likely(atomic_read(&key->enabled)) > 0)
+	if (likely(static_key_count(key) > 0))
 		return true;
 	return false;
 }
@@ -179,7 +185,7 @@ static inline int jump_label_apply_nops(struct module *mod)
 
 static inline bool static_key_enabled(struct static_key *key)
 {
-	return (atomic_read(&key->enabled) > 0);
+	return static_key_count(key) > 0;
 }
 
 #endif /* _LINUX_JUMP_LABEL_H */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ee8b14ae4f3f..449905ebcab3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -5,7 +5,9 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate_mode.h>
 
-typedef struct page *new_page_t(struct page *, unsigned long private, int **);
+typedef struct page *new_page_t(struct page *page, unsigned long private,
+				int **reason);
+typedef void free_page_t(struct page *page, unsigned long private);
 
 /*
  * Return values from addresss_space_operations.migratepage():
@@ -39,7 +41,7 @@ extern void putback_lru_pages(struct list_head *l);
 extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
-extern int migrate_pages(struct list_head *l, new_page_t x,
+extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
 		unsigned long private, enum migrate_mode mode, int reason);
 
 extern int fail_migrate_page(struct address_space *,
@@ -61,8 +63,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping,
 
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline void putback_movable_pages(struct list_head *l) {}
-static inline int migrate_pages(struct list_head *l, new_page_t x,
-		unsigned long private, enum migrate_mode mode, int reason)
+static inline int migrate_pages(struct list_head *l, new_page_t new,
+		free_page_t free, unsigned long private, enum migrate_mode mode,
+		int reason)
 	{ return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 073734339583..2b3a5330dcf2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -919,6 +919,14 @@ extern void show_free_areas(unsigned int flags);
 extern bool skip_free_areas_node(unsigned int flags, int nid);
 
 int shmem_zero_setup(struct vm_area_struct *);
+#ifdef CONFIG_SHMEM
+bool shmem_mapping(struct address_space *mapping);
+#else
+static inline bool shmem_mapping(struct address_space *mapping)
+{
+	return false;
+}
+#endif
 
 extern int can_do_mlock(void);
 extern int user_shm_lock(size_t, struct user_struct *);
@@ -1623,9 +1631,6 @@ void page_cache_async_readahead(struct address_space *mapping,
 			   unsigned long size);
 
 unsigned long max_sane_readahead(unsigned long nr);
-unsigned long ra_submit(struct file_ra_state *ra,
-			struct address_space *mapping,
-			struct file *filp);
 
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8e082f18fb6a..b8131e7d6eda 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -324,9 +324,9 @@ struct mm_rss_stat {
 
 struct kioctx_table;
 struct mm_struct {
-	struct vm_area_struct * mmap;		/* list of VMAs */
+	struct vm_area_struct *mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
-	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+	u32 vmacache_seqnum;			/* per-thread vmacache */
 #ifdef CONFIG_MMU
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 56482904a676..450f19c5c865 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled;
 #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1)
 #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1)
 
-static inline int get_pageblock_migratetype(struct page *page)
+#define get_pageblock_migratetype(page) \
+	get_pfnblock_flags_mask(page, page_to_pfn(page), \
+			PB_migrate_end, MIGRATETYPE_MASK)
+
+static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
 {
 	BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
-	return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK);
+	return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
+					MIGRATETYPE_MASK);
 }
 
 struct free_area {
@@ -138,6 +143,7 @@ enum zone_stat_item {
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 	NR_DIRTIED,		/* page dirtyings since bootup */
 	NR_WRITTEN,		/* page writings since bootup */
+	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
 #ifdef CONFIG_NUMA
 	NUMA_HIT,		/* allocated in intended node */
 	NUMA_MISS,		/* allocated in non intended node */
@@ -316,19 +322,12 @@ enum zone_type {
 #ifndef __GENERATING_BOUNDS_H
 
 struct zone {
-	/* Fields commonly accessed by the page allocator */
+	/* Read-mostly fields */
 
 	/* zone watermarks, access with *_wmark_pages(zone) macros */
 	unsigned long watermark[NR_WMARK];
 
 	/*
-	 * When free pages are below this point, additional steps are taken
-	 * when reading the number of free pages to avoid per-cpu counter
-	 * drift allowing watermarks to be breached
-	 */
-	unsigned long percpu_drift_mark;
-
-	/*
 	 * We don't know if the memory that we're going to allocate will be freeable
 	 * or/and it will be released eventually, so to avoid totally wasting several
 	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -336,40 +335,26 @@ struct zone {
 	 * on the higher zones). This array is recalculated at runtime if the
 	 * sysctl_lowmem_reserve_ratio sysctl changes.
 	 */
-	unsigned long lowmem_reserve[MAX_NR_ZONES];
-
-	/*
-	 * This is a per-zone reserve of pages that should not be
-	 * considered dirtyable memory.
-	 */
-	unsigned long dirty_balance_reserve;
+	long lowmem_reserve[MAX_NR_ZONES];
 
 #ifdef CONFIG_NUMA
 	int node;
+#endif
+
 	/*
-	 * zone reclaim becomes active if more unmapped pages exist.
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this zone's LRU. Maintained by the pageout code.
 	 */
-	unsigned long		min_unmapped_pages;
-	unsigned long		min_slab_pages;
-#endif
+	unsigned int inactive_ratio;
+
+	struct pglist_data	*zone_pgdat;
355 struct per_cpu_pageset __percpu *pageset; 351 struct per_cpu_pageset __percpu *pageset;
352
356 /* 353 /*
357 * free areas of different sizes 354 * This is a per-zone reserve of pages that should not be
355 * considered dirtyable memory.
358 */ 356 */
359 spinlock_t lock; 357 unsigned long dirty_balance_reserve;
360#if defined CONFIG_COMPACTION || defined CONFIG_CMA
361 /* Set to true when the PG_migrate_skip bits should be cleared */
362 bool compact_blockskip_flush;
363
364 /* pfns where compaction scanners should start */
365 unsigned long compact_cached_free_pfn;
366 unsigned long compact_cached_migrate_pfn;
367#endif
368#ifdef CONFIG_MEMORY_HOTPLUG
369 /* see spanned/present_pages for more description */
370 seqlock_t span_seqlock;
371#endif
372 struct free_area free_area[MAX_ORDER];
373 358
374#ifndef CONFIG_SPARSEMEM 359#ifndef CONFIG_SPARSEMEM
375 /* 360 /*
@@ -379,71 +364,14 @@ struct zone {
379 unsigned long *pageblock_flags; 364 unsigned long *pageblock_flags;
380#endif /* CONFIG_SPARSEMEM */ 365#endif /* CONFIG_SPARSEMEM */
381 366
382#ifdef CONFIG_COMPACTION 367#ifdef CONFIG_NUMA
383 /*
384 * On compaction failure, 1<<compact_defer_shift compactions
385 * are skipped before trying again. The number attempted since
386 * last failure is tracked with compact_considered.
387 */
388 unsigned int compact_considered;
389 unsigned int compact_defer_shift;
390 int compact_order_failed;
391#endif
392
393 ZONE_PADDING(_pad1_)
394
395 /* Fields commonly accessed by the page reclaim scanner */
396 spinlock_t lru_lock;
397 struct lruvec lruvec;
398
399 unsigned long pages_scanned; /* since last reclaim */
400 unsigned long flags; /* zone flags, see below */
401
402 /* Zone statistics */
403 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
404
405 /*
406 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
407 * this zone's LRU. Maintained by the pageout code.
408 */
409 unsigned int inactive_ratio;
410
411
412 ZONE_PADDING(_pad2_)
413 /* Rarely used or read-mostly fields */
414
415 /* 368 /*
416 * wait_table -- the array holding the hash table 369 * zone reclaim becomes active if more unmapped pages exist.
417 * wait_table_hash_nr_entries -- the size of the hash table array
418 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
419 *
420 * The purpose of all these is to keep track of the people
421 * waiting for a page to become available and make them
422 * runnable again when possible. The trouble is that this
423 * consumes a lot of space, especially when so few things
424 * wait on pages at a given time. So instead of using
425 * per-page waitqueues, we use a waitqueue hash table.
426 *
427 * The bucket discipline is to sleep on the same queue when
428 * colliding and wake all in that wait queue when removing.
429 * When something wakes, it must check to be sure its page is
430 * truly available, a la thundering herd. The cost of a
431 * collision is great, but given the expected load of the
432 * table, they should be so rare as to be outweighed by the
433 * benefits from the saved space.
434 *
435 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
436 * primary users of these fields, and in mm/page_alloc.c
437 * free_area_init_core() performs the initialization of them.
438 */ 370 */
439 wait_queue_head_t * wait_table; 371 unsigned long min_unmapped_pages;
440 unsigned long wait_table_hash_nr_entries; 372 unsigned long min_slab_pages;
441 unsigned long wait_table_bits; 373#endif /* CONFIG_NUMA */
442 374
443 /*
444 * Discontig memory support fields.
445 */
446 struct pglist_data *zone_pgdat;
447 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ 375 /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
448 unsigned long zone_start_pfn; 376 unsigned long zone_start_pfn;
449 377
@@ -489,14 +417,103 @@ struct zone {
489 * adjust_managed_page_count() should be used instead of directly 417 * adjust_managed_page_count() should be used instead of directly
490 * touching zone->managed_pages and totalram_pages. 418 * touching zone->managed_pages and totalram_pages.
491 */ 419 */
420 unsigned long managed_pages;
492 unsigned long spanned_pages; 421 unsigned long spanned_pages;
493 unsigned long present_pages; 422 unsigned long present_pages;
494 unsigned long managed_pages; 423
424 const char *name;
495 425
496 /* 426 /*
497 * rarely used fields: 427 * Number of MIGRATE_RESEVE page block. To maintain for just
428 * optimization. Protected by zone->lock.
498 */ 429 */
499 const char *name; 430 int nr_migrate_reserve_block;
431
432#ifdef CONFIG_MEMORY_HOTPLUG
433 /* see spanned/present_pages for more description */
434 seqlock_t span_seqlock;
435#endif
436
437 /*
438 * wait_table -- the array holding the hash table
439 * wait_table_hash_nr_entries -- the size of the hash table array
440 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
441 *
442 * The purpose of all these is to keep track of the people
443 * waiting for a page to become available and make them
444 * runnable again when possible. The trouble is that this
445 * consumes a lot of space, especially when so few things
446 * wait on pages at a given time. So instead of using
447 * per-page waitqueues, we use a waitqueue hash table.
448 *
449 * The bucket discipline is to sleep on the same queue when
450 * colliding and wake all in that wait queue when removing.
451 * When something wakes, it must check to be sure its page is
452 * truly available, a la thundering herd. The cost of a
453 * collision is great, but given the expected load of the
454 * table, they should be so rare as to be outweighed by the
455 * benefits from the saved space.
456 *
457 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
458 * primary users of these fields, and in mm/page_alloc.c
459 * free_area_init_core() performs the initialization of them.
460 */
461 wait_queue_head_t *wait_table;
462 unsigned long wait_table_hash_nr_entries;
463 unsigned long wait_table_bits;
464
465 ZONE_PADDING(_pad1_)
466
467 /* Write-intensive fields used from the page allocator */
468 spinlock_t lock;
469
470 /* free areas of different sizes */
471 struct free_area free_area[MAX_ORDER];
472
473 /* zone flags, see below */
474 unsigned long flags;
475
476 ZONE_PADDING(_pad2_)
477
478 /* Write-intensive fields used by page reclaim */
479
480 /* Fields commonly accessed by the page reclaim scanner */
481 spinlock_t lru_lock;
482 struct lruvec lruvec;
483
484 /*
485 * When free pages are below this point, additional steps are taken
486 * when reading the number of free pages to avoid per-cpu counter
487 * drift allowing watermarks to be breached
488 */
489 unsigned long percpu_drift_mark;
490
491#if defined CONFIG_COMPACTION || defined CONFIG_CMA
492 /* pfn where compaction free scanner should start */
493 unsigned long compact_cached_free_pfn;
494 /* pfn where async and sync compaction migration scanner should start */
495 unsigned long compact_cached_migrate_pfn[2];
496#endif
497
498#ifdef CONFIG_COMPACTION
499 /*
500 * On compaction failure, 1<<compact_defer_shift compactions
501 * are skipped before trying again. The number attempted since
502 * last failure is tracked with compact_considered.
503 */
504 unsigned int compact_considered;
505 unsigned int compact_defer_shift;
506 int compact_order_failed;
507#endif
508
509#if defined CONFIG_COMPACTION || defined CONFIG_CMA
510 /* Set to true when the PG_migrate_skip bits should be cleared */
511 bool compact_blockskip_flush;
512#endif
513
514 ZONE_PADDING(_pad3_)
515 /* Zone statistics */
516 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
500} ____cacheline_internodealigned_in_smp; 517} ____cacheline_internodealigned_in_smp;
501 518
502typedef enum { 519typedef enum {
@@ -512,6 +529,7 @@ typedef enum {
512 ZONE_WRITEBACK, /* reclaim scanning has recently found 529 ZONE_WRITEBACK, /* reclaim scanning has recently found
513 * many pages under writeback 530 * many pages under writeback
514 */ 531 */
532 ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
515} zone_flags_t; 533} zone_flags_t;
516 534
517static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) 535static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -549,6 +567,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone)
549 return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); 567 return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
550} 568}
551 569
570static inline int zone_is_fair_depleted(const struct zone *zone)
571{
572 return test_bit(ZONE_FAIR_DEPLETED, &zone->flags);
573}
574
552static inline int zone_is_oom_locked(const struct zone *zone) 575static inline int zone_is_oom_locked(const struct zone *zone)
553{ 576{
554 return test_bit(ZONE_OOM_LOCKED, &zone->flags); 577 return test_bit(ZONE_OOM_LOCKED, &zone->flags);
@@ -803,10 +826,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
803extern struct mutex zonelists_mutex; 826extern struct mutex zonelists_mutex;
804void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); 827void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
805void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); 828void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
806bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 829bool zone_watermark_ok(struct zone *z, unsigned int order,
807 int classzone_idx, int alloc_flags); 830 unsigned long mark, int classzone_idx, int alloc_flags);
808bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 831bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
809 int classzone_idx, int alloc_flags); 832 unsigned long mark, int classzone_idx, int alloc_flags);
810enum memmap_context { 833enum memmap_context {
811 MEMMAP_EARLY, 834 MEMMAP_EARLY,
812 MEMMAP_HOTPLUG, 835 MEMMAP_HOTPLUG,
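
The pfn-based variant added above exists so that hot paths which already know the pfn can skip a second page-to-pfn conversion inside the pageblock lookup. A small illustrative sketch (the demo_* name is hypothetical); in a real caller, such as the page-free path, the pfn is typically already at hand rather than recomputed:

#include <linux/mm.h>
#include <linux/mmzone.h>

static int demo_block_migratetype(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	/* same result as get_pageblock_migratetype(page), without a
	 * second page_to_pfn() conversion inside the lookup */
	return get_pfnblock_migratetype(page, pfn);
}
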
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index dd7d45b5c496..2284ea62c6cc 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -198,6 +198,7 @@ struct page; /* forward declaration */
198TESTPAGEFLAG(Locked, locked) 198TESTPAGEFLAG(Locked, locked)
199PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) 199PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error)
200PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) 200PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
201 __SETPAGEFLAG(Referenced, referenced)
201PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) 202PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
202PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) 203PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
203PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) 204PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
@@ -208,6 +209,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */
208PAGEFLAG(SavePinned, savepinned); /* Xen */ 209PAGEFLAG(SavePinned, savepinned); /* Xen */
209PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) 210PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
210PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) 211PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
212 __SETPAGEFLAG(SwapBacked, swapbacked)
211 213
212__PAGEFLAG(SlobFree, slob_free) 214__PAGEFLAG(SlobFree, slob_free)
213 215
@@ -228,9 +230,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
228TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) 230TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
229PAGEFLAG(MappedToDisk, mappedtodisk) 231PAGEFLAG(MappedToDisk, mappedtodisk)
230 232
231/* PG_readahead is only used for file reads; PG_reclaim is only for writes */ 233/* PG_readahead is only used for reads; PG_reclaim is only for writes */
232PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) 234PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
233PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ 235PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
234 236
235#ifdef CONFIG_HIGHMEM 237#ifdef CONFIG_HIGHMEM
236/* 238/*
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index c08730c10c7a..2baeee12f48e 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -65,33 +65,26 @@ extern int pageblock_order;
65/* Forward declaration */ 65/* Forward declaration */
66struct page; 66struct page;
67 67
68unsigned long get_pageblock_flags_mask(struct page *page, 68unsigned long get_pfnblock_flags_mask(struct page *page,
69 unsigned long pfn,
69 unsigned long end_bitidx, 70 unsigned long end_bitidx,
70 unsigned long mask); 71 unsigned long mask);
71void set_pageblock_flags_mask(struct page *page, 72
73void set_pfnblock_flags_mask(struct page *page,
72 unsigned long flags, 74 unsigned long flags,
75 unsigned long pfn,
73 unsigned long end_bitidx, 76 unsigned long end_bitidx,
74 unsigned long mask); 77 unsigned long mask);
75 78
76/* Declarations for getting and setting flags. See mm/page_alloc.c */ 79/* Declarations for getting and setting flags. See mm/page_alloc.c */
77static inline unsigned long get_pageblock_flags_group(struct page *page, 80#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \
78 int start_bitidx, int end_bitidx) 81 get_pfnblock_flags_mask(page, page_to_pfn(page), \
79{ 82 end_bitidx, \
80 unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; 83 (1 << (end_bitidx - start_bitidx + 1)) - 1)
81 unsigned long mask = (1 << nr_flag_bits) - 1; 84#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \
82 85 set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \
83 return get_pageblock_flags_mask(page, end_bitidx, mask); 86 end_bitidx, \
84} 87 (1 << (end_bitidx - start_bitidx + 1)) - 1)
85
86static inline void set_pageblock_flags_group(struct page *page,
87 unsigned long flags,
88 int start_bitidx, int end_bitidx)
89{
90 unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1;
91 unsigned long mask = (1 << nr_flag_bits) - 1;
92
93 set_pageblock_flags_mask(page, flags, end_bitidx, mask);
94}
95 88
96#ifdef CONFIG_COMPACTION 89#ifdef CONFIG_COMPACTION
97#define get_pageblock_skip(page) \ 90#define get_pageblock_skip(page) \
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e3dea75a078b..d57a02a9747b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -99,7 +99,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
99 99
100#define page_cache_get(page) get_page(page) 100#define page_cache_get(page) get_page(page)
101#define page_cache_release(page) put_page(page) 101#define page_cache_release(page) put_page(page)
102void release_pages(struct page **pages, int nr, int cold); 102void release_pages(struct page **pages, int nr, bool cold);
103 103
104/* 104/*
105 * speculatively take a reference to a page. 105 * speculatively take a reference to a page.
@@ -243,12 +243,117 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x)
243 243
244typedef int filler_t(void *, struct page *); 244typedef int filler_t(void *, struct page *);
245 245
246extern struct page * find_get_page(struct address_space *mapping, 246pgoff_t page_cache_next_hole(struct address_space *mapping,
247 pgoff_t index); 247 pgoff_t index, unsigned long max_scan);
248extern struct page * find_lock_page(struct address_space *mapping, 248pgoff_t page_cache_prev_hole(struct address_space *mapping,
249 pgoff_t index); 249 pgoff_t index, unsigned long max_scan);
250extern struct page * find_or_create_page(struct address_space *mapping, 250
251 pgoff_t index, gfp_t gfp_mask); 251#define FGP_ACCESSED 0x00000001
252#define FGP_LOCK 0x00000002
253#define FGP_CREAT 0x00000004
254#define FGP_WRITE 0x00000008
255#define FGP_NOFS 0x00000010
256#define FGP_NOWAIT 0x00000020
257
258struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
259 int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask);
260
261/**
262 * find_get_page - find and get a page reference
263 * @mapping: the address_space to search
264 * @offset: the page index
265 *
266 * Looks up the page cache slot at @mapping & @offset. If there is a
267 * page cache page, it is returned with an increased refcount.
268 *
269 * Otherwise, %NULL is returned.
270 */
271static inline struct page *find_get_page(struct address_space *mapping,
272 pgoff_t offset)
273{
274 return pagecache_get_page(mapping, offset, 0, 0, 0);
275}
276
277static inline struct page *find_get_page_flags(struct address_space *mapping,
278 pgoff_t offset, int fgp_flags)
279{
280 return pagecache_get_page(mapping, offset, fgp_flags, 0, 0);
281}
282
283/**
284 * find_lock_page - locate, pin and lock a pagecache page
286 * @mapping: the address_space to search
287 * @offset: the page index
288 *
289 * Looks up the page cache slot at @mapping & @offset. If there is a
290 * page cache page, it is returned locked and with an increased
291 * refcount.
292 *
293 * Otherwise, %NULL is returned.
294 *
295 * find_lock_page() may sleep.
296 */
297static inline struct page *find_lock_page(struct address_space *mapping,
298 pgoff_t offset)
299{
300 return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0);
301}
302
303/**
304 * find_or_create_page - locate or add a pagecache page
305 * @mapping: the page's address_space
306 * @index: the page's index into the mapping
307 * @gfp_mask: page allocation mode
308 *
309 * Looks up the page cache slot at @mapping & @offset. If there is a
310 * page cache page, it is returned locked and with an increased
311 * refcount.
312 *
313 * If the page is not present, a new page is allocated using @gfp_mask
314 * and added to the page cache and the VM's LRU list. The page is
315 * returned locked and with an increased refcount.
316 *
317 * On memory exhaustion, %NULL is returned.
318 *
 319 * find_or_create_page() may sleep, even if @gfp_mask specifies an
320 * atomic allocation!
321 */
322static inline struct page *find_or_create_page(struct address_space *mapping,
323 pgoff_t offset, gfp_t gfp_mask)
324{
325 return pagecache_get_page(mapping, offset,
326 FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
327 gfp_mask, gfp_mask & GFP_RECLAIM_MASK);
328}
329
330/**
331 * grab_cache_page_nowait - returns locked page at given index in given cache
332 * @mapping: target address_space
333 * @index: the page index
334 *
335 * Same as grab_cache_page(), but do not wait if the page is unavailable.
336 * This is intended for speculative data generators, where the data can
337 * be regenerated if the page couldn't be grabbed. This routine should
338 * be safe to call while holding the lock for another page.
339 *
340 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
341 * and deadlock against the caller's locked page.
342 */
343static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
344 pgoff_t index)
345{
346 return pagecache_get_page(mapping, index,
347 FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
348 mapping_gfp_mask(mapping),
349 GFP_NOFS);
350}
351
352struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
353struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
354unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
355 unsigned int nr_entries, struct page **entries,
356 pgoff_t *indices);
252unsigned find_get_pages(struct address_space *mapping, pgoff_t start, 357unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
253 unsigned int nr_pages, struct page **pages); 358 unsigned int nr_pages, struct page **pages);
254unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, 359unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
@@ -268,10 +373,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping,
268 return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); 373 return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
269} 374}
270 375
271extern struct page * grab_cache_page_nowait(struct address_space *mapping,
272 pgoff_t index);
273extern struct page * read_cache_page_async(struct address_space *mapping,
274 pgoff_t index, filler_t *filler, void *data);
275extern struct page * read_cache_page(struct address_space *mapping, 376extern struct page * read_cache_page(struct address_space *mapping,
276 pgoff_t index, filler_t *filler, void *data); 377 pgoff_t index, filler_t *filler, void *data);
277extern struct page * read_cache_page_gfp(struct address_space *mapping, 378extern struct page * read_cache_page_gfp(struct address_space *mapping,
@@ -279,14 +380,6 @@ extern struct page * read_cache_page_gfp(struct address_space *mapping,
279extern int read_cache_pages(struct address_space *mapping, 380extern int read_cache_pages(struct address_space *mapping,
280 struct list_head *pages, filler_t *filler, void *data); 381 struct list_head *pages, filler_t *filler, void *data);
281 382
282static inline struct page *read_mapping_page_async(
283 struct address_space *mapping,
284 pgoff_t index, void *data)
285{
286 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
287 return read_cache_page_async(mapping, index, filler, data);
288}
289
290static inline struct page *read_mapping_page(struct address_space *mapping, 383static inline struct page *read_mapping_page(struct address_space *mapping,
291 pgoff_t index, void *data) 384 pgoff_t index, void *data)
292{ 385{
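
With find_get_page(), find_lock_page() and find_or_create_page() now thin wrappers around pagecache_get_page(), callers can fold an accessed-bit update into the lookup itself instead of a separate mark_page_accessed() call. A minimal sketch of that pattern (the demo_* name is hypothetical):

#include <linux/pagemap.h>

static struct page *demo_lookup_accessed(struct address_space *mapping,
					 pgoff_t index)
{
	/* FGP_ACCESSED folds the mark_page_accessed() step into the lookup */
	return find_get_page_flags(mapping, index, FGP_ACCESSED);
}
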
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index e4dbfab37729..b45d391b4540 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -22,6 +22,11 @@ struct pagevec {
22 22
23void __pagevec_release(struct pagevec *pvec); 23void __pagevec_release(struct pagevec *pvec);
24void __pagevec_lru_add(struct pagevec *pvec); 24void __pagevec_lru_add(struct pagevec *pvec);
25unsigned pagevec_lookup_entries(struct pagevec *pvec,
26 struct address_space *mapping,
27 pgoff_t start, unsigned nr_entries,
28 pgoff_t *indices);
29void pagevec_remove_exceptionals(struct pagevec *pvec);
25unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, 30unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
26 pgoff_t start, unsigned nr_pages); 31 pgoff_t start, unsigned nr_pages);
27unsigned pagevec_lookup_tag(struct pagevec *pvec, 32unsigned pagevec_lookup_tag(struct pagevec *pvec,
diff --git a/include/linux/plist.h b/include/linux/plist.h
index aa0fb390bd29..8b6c970cff6c 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -98,6 +98,13 @@ struct plist_node {
98} 98}
99 99
100/** 100/**
101 * PLIST_HEAD - declare and init plist_head
102 * @head: name for struct plist_head variable
103 */
104#define PLIST_HEAD(head) \
105 struct plist_head head = PLIST_HEAD_INIT(head)
106
107/**
101 * PLIST_NODE_INIT - static struct plist_node initializer 108 * PLIST_NODE_INIT - static struct plist_node initializer
102 * @node: struct plist_node variable name 109 * @node: struct plist_node variable name
103 * @__prio: initial node priority 110 * @__prio: initial node priority
@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio)
134extern void plist_add(struct plist_node *node, struct plist_head *head); 141extern void plist_add(struct plist_node *node, struct plist_head *head);
135extern void plist_del(struct plist_node *node, struct plist_head *head); 142extern void plist_del(struct plist_node *node, struct plist_head *head);
136 143
144extern void plist_requeue(struct plist_node *node, struct plist_head *head);
145
137/** 146/**
138 * plist_for_each - iterate over the plist 147 * plist_for_each - iterate over the plist
139 * @pos: the type * to use as a loop counter 148 * @pos: the type * to use as a loop counter
@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
143 list_for_each_entry(pos, &(head)->node_list, node_list) 152 list_for_each_entry(pos, &(head)->node_list, node_list)
144 153
145/** 154/**
155 * plist_for_each_continue - continue iteration over the plist
156 * @pos: the type * to use as a loop cursor
157 * @head: the head for your list
158 *
159 * Continue to iterate over plist, continuing after the current position.
160 */
161#define plist_for_each_continue(pos, head) \
162 list_for_each_entry_continue(pos, &(head)->node_list, node_list)
163
164/**
146 * plist_for_each_safe - iterate safely over a plist of given type 165 * plist_for_each_safe - iterate safely over a plist of given type
147 * @pos: the type * to use as a loop counter 166 * @pos: the type * to use as a loop counter
148 * @n: another type * to use as temporary storage 167 * @n: another type * to use as temporary storage
@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head);
163 list_for_each_entry(pos, &(head)->node_list, mem.node_list) 182 list_for_each_entry(pos, &(head)->node_list, mem.node_list)
164 183
165/** 184/**
185 * plist_for_each_entry_continue - continue iteration over list of given type
186 * @pos: the type * to use as a loop cursor
187 * @head: the head for your list
188 * @m: the name of the list_struct within the struct
189 *
190 * Continue to iterate over list of given type, continuing after
191 * the current position.
192 */
193#define plist_for_each_entry_continue(pos, head, m) \
194 list_for_each_entry_continue(pos, &(head)->node_list, m.node_list)
195
196/**
166 * plist_for_each_entry_safe - iterate safely over list of given type 197 * plist_for_each_entry_safe - iterate safely over list of given type
167 * @pos: the type * to use as a loop counter 198 * @pos: the type * to use as a loop counter
168 * @n: another type * to use as temporary storage 199 * @n: another type * to use as temporary storage
@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node)
229#endif 260#endif
230 261
231/** 262/**
263 * plist_next - get the next entry in list
264 * @pos: the type * to cursor
265 */
266#define plist_next(pos) \
267 list_next_entry(pos, node_list)
268
269/**
270 * plist_prev - get the prev entry in list
271 * @pos: the type * to cursor
272 */
273#define plist_prev(pos) \
274 list_prev_entry(pos, node_list)
275
276/**
232 * plist_first - return the first node (and thus, highest priority) 277 * plist_first - return the first node (and thus, highest priority)
233 * @head: the &struct plist_head pointer 278 * @head: the &struct plist_head pointer
234 * 279 *
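
The new plist helpers above enable a simple round-robin over equal-priority entries: take the first node, then requeue it behind its same-priority peers so the next caller gets a different one. A self-contained sketch under that reading; the demo_* names are illustrative only:

#include <linux/plist.h>

struct demo_item {
	int value;
	struct plist_node node;
};

static PLIST_HEAD(demo_head);

static struct demo_item *demo_pick_round_robin(void)
{
	struct demo_item *item;

	if (plist_head_empty(&demo_head))
		return NULL;

	/* highest-priority entry is always at the front */
	item = plist_first_entry(&demo_head, struct demo_item, node);

	/* rotate it behind any peers of equal priority so successive
	 * calls cycle through same-priority entries in turn */
	plist_requeue(&item->node, &demo_head);
	return item;
}
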
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 403940787be1..e8be53ecfc45 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, void *item)
219int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); 219int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
220void *radix_tree_lookup(struct radix_tree_root *, unsigned long); 220void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
221void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); 221void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
222void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
222void *radix_tree_delete(struct radix_tree_root *, unsigned long); 223void *radix_tree_delete(struct radix_tree_root *, unsigned long);
223unsigned int 224unsigned int
224radix_tree_gang_lookup(struct radix_tree_root *root, void **results, 225radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
@@ -226,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
226unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, 227unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
227 void ***results, unsigned long *indices, 228 void ***results, unsigned long *indices,
228 unsigned long first_index, unsigned int max_items); 229 unsigned long first_index, unsigned int max_items);
229unsigned long radix_tree_next_hole(struct radix_tree_root *root,
230 unsigned long index, unsigned long max_scan);
231unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
232 unsigned long index, unsigned long max_scan);
233int radix_tree_preload(gfp_t gfp_mask); 230int radix_tree_preload(gfp_t gfp_mask);
234int radix_tree_maybe_preload(gfp_t gfp_mask); 231int radix_tree_maybe_preload(gfp_t gfp_mask);
235void radix_tree_init(void); 232void radix_tree_init(void);
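
radix_tree_delete_item(), declared above, takes the item the caller expects to find; its body continues past the end of this hunk, but the intent is a conditional delete that leaves the slot alone if it has since been replaced. A minimal sketch under that assumption (the demo_* name is hypothetical):

#include <linux/types.h>
#include <linux/radix-tree.h>

static bool demo_clear_if_unchanged(struct radix_tree_root *root,
				    unsigned long index, void *expected)
{
	/* assumed semantics: returns the deleted item on a match,
	 * NULL if the slot was empty or held something else */
	return radix_tree_delete_item(root, index, expected) == expected;
}
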
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0827bec7d82f..cb67b4e2dba2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -63,6 +63,10 @@ struct fs_struct;
63struct perf_event_context; 63struct perf_event_context;
64struct blk_plug; 64struct blk_plug;
65 65
66#define VMACACHE_BITS 2
67#define VMACACHE_SIZE (1U << VMACACHE_BITS)
68#define VMACACHE_MASK (VMACACHE_SIZE - 1)
69
66/* 70/*
67 * List of flags we want to share for kernel threads, 71 * List of flags we want to share for kernel threads,
68 * if only because they are not used by them anyway. 72 * if only because they are not used by them anyway.
@@ -1093,6 +1097,9 @@ struct task_struct {
1093#ifdef CONFIG_COMPAT_BRK 1097#ifdef CONFIG_COMPAT_BRK
1094 unsigned brk_randomized:1; 1098 unsigned brk_randomized:1;
1095#endif 1099#endif
1100 /* per-thread vma caching */
1101 u32 vmacache_seqnum;
1102 struct vm_area_struct *vmacache[VMACACHE_SIZE];
1096#if defined(SPLIT_RSS_COUNTING) 1103#if defined(SPLIT_RSS_COUNTING)
1097 struct task_rss_stat rss_stat; 1104 struct task_rss_stat rss_stat;
1098#endif 1105#endif
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 30aa0dc60d75..deb49609cd36 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name,
49 loff_t size, unsigned long flags); 49 loff_t size, unsigned long flags);
50extern int shmem_zero_setup(struct vm_area_struct *); 50extern int shmem_zero_setup(struct vm_area_struct *);
51extern int shmem_lock(struct file *file, int lock, struct user_struct *user); 51extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
52extern bool shmem_mapping(struct address_space *mapping);
52extern void shmem_unlock_mapping(struct address_space *mapping); 53extern void shmem_unlock_mapping(struct address_space *mapping);
53extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 54extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
54 pgoff_t index, gfp_t gfp_mask); 55 pgoff_t index, gfp_t gfp_mask);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 46ba0c6c219f..241bf0922770 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,8 +214,9 @@ struct percpu_cluster {
214struct swap_info_struct { 214struct swap_info_struct {
215 unsigned long flags; /* SWP_USED etc: see above */ 215 unsigned long flags; /* SWP_USED etc: see above */
216 signed short prio; /* swap priority of this type */ 216 signed short prio; /* swap priority of this type */
217 struct plist_node list; /* entry in swap_active_head */
218 struct plist_node avail_list; /* entry in swap_avail_head */
217 signed char type; /* strange name for an index */ 219 signed char type; /* strange name for an index */
218 signed char next; /* next type on the swap list */
219 unsigned int max; /* extent of the swap_map */ 220 unsigned int max; /* extent of the swap_map */
220 unsigned char *swap_map; /* vmalloc'ed array of usage counts */ 221 unsigned char *swap_map; /* vmalloc'ed array of usage counts */
221 struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ 222 struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
@@ -255,11 +256,6 @@ struct swap_info_struct {
255 struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ 256 struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */
256}; 257};
257 258
258struct swap_list_t {
259 int head; /* head of priority-ordered swapfile list */
260 int next; /* swapfile to be used next */
261};
262
263/* linux/mm/page_alloc.c */ 259/* linux/mm/page_alloc.c */
264extern unsigned long totalram_pages; 260extern unsigned long totalram_pages;
265extern unsigned long totalreserve_pages; 261extern unsigned long totalreserve_pages;
@@ -272,12 +268,14 @@ extern unsigned long nr_free_pagecache_pages(void);
272 268
273 269
274/* linux/mm/swap.c */ 270/* linux/mm/swap.c */
275extern void __lru_cache_add(struct page *);
276extern void lru_cache_add(struct page *); 271extern void lru_cache_add(struct page *);
272extern void lru_cache_add_anon(struct page *page);
273extern void lru_cache_add_file(struct page *page);
277extern void lru_add_page_tail(struct page *page, struct page *page_tail, 274extern void lru_add_page_tail(struct page *page, struct page *page_tail,
278 struct lruvec *lruvec, struct list_head *head); 275 struct lruvec *lruvec, struct list_head *head);
279extern void activate_page(struct page *); 276extern void activate_page(struct page *);
280extern void mark_page_accessed(struct page *); 277extern void mark_page_accessed(struct page *);
278extern void init_page_accessed(struct page *page);
281extern void lru_add_drain(void); 279extern void lru_add_drain(void);
282extern void lru_add_drain_cpu(int cpu); 280extern void lru_add_drain_cpu(int cpu);
283extern void lru_add_drain_all(void); 281extern void lru_add_drain_all(void);
@@ -287,22 +285,6 @@ extern void swap_setup(void);
287 285
288extern void add_page_to_unevictable_list(struct page *page); 286extern void add_page_to_unevictable_list(struct page *page);
289 287
290/**
291 * lru_cache_add: add a page to the page lists
292 * @page: the page to add
293 */
294static inline void lru_cache_add_anon(struct page *page)
295{
296 ClearPageActive(page);
297 __lru_cache_add(page);
298}
299
300static inline void lru_cache_add_file(struct page *page)
301{
302 ClearPageActive(page);
303 __lru_cache_add(page);
304}
305
306/* linux/mm/vmscan.c */ 288/* linux/mm/vmscan.c */
307extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 289extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
308 gfp_t gfp_mask, nodemask_t *mask); 290 gfp_t gfp_mask, nodemask_t *mask);
@@ -460,7 +442,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
460#define free_page_and_swap_cache(page) \ 442#define free_page_and_swap_cache(page) \
461 page_cache_release(page) 443 page_cache_release(page)
462#define free_pages_and_swap_cache(pages, nr) \ 444#define free_pages_and_swap_cache(pages, nr) \
463 release_pages((pages), (nr), 0); 445 release_pages((pages), (nr), false);
464 446
465static inline void show_swap_cache_info(void) 447static inline void show_swap_cache_info(void)
466{ 448{
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e282624e8c10..388293a91e8c 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
6 * want to expose them to the dozens of source files that include swap.h 6 * want to expose them to the dozens of source files that include swap.h
7 */ 7 */
8extern spinlock_t swap_lock; 8extern spinlock_t swap_lock;
9extern struct swap_list_t swap_list; 9extern struct plist_head swap_active_head;
10extern struct swap_info_struct *swap_info[]; 10extern struct swap_info_struct *swap_info[];
11extern int try_to_unuse(unsigned int, bool, unsigned long); 11extern int try_to_unuse(unsigned int, bool, unsigned long);
12 12
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index c557c6d096de..3a712e2e7d76 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
71 THP_ZERO_PAGE_ALLOC, 71 THP_ZERO_PAGE_ALLOC,
72 THP_ZERO_PAGE_ALLOC_FAILED, 72 THP_ZERO_PAGE_ALLOC_FAILED,
73#endif 73#endif
74#ifdef CONFIG_DEBUG_TLBFLUSH
74#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
75 NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ 76 NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
76 NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ 77 NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
77#endif 78#endif /* CONFIG_SMP */
78 NR_TLB_LOCAL_FLUSH_ALL, 79 NR_TLB_LOCAL_FLUSH_ALL,
79 NR_TLB_LOCAL_FLUSH_ONE, 80 NR_TLB_LOCAL_FLUSH_ONE,
81#endif /* CONFIG_DEBUG_TLBFLUSH */
80 NR_VM_EVENT_ITEMS 82 NR_VM_EVENT_ITEMS
81}; 83};
82 84
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h
new file mode 100644
index 000000000000..c3fa0fd43949
--- /dev/null
+++ b/include/linux/vmacache.h
@@ -0,0 +1,38 @@
1#ifndef __LINUX_VMACACHE_H
2#define __LINUX_VMACACHE_H
3
4#include <linux/sched.h>
5#include <linux/mm.h>
6
7/*
8 * Hash based on the page number. Provides a good hit rate for
9 * workloads with good locality and those with random accesses as well.
10 */
11#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK)
12
13static inline void vmacache_flush(struct task_struct *tsk)
14{
15 memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
16}
17
18extern void vmacache_flush_all(struct mm_struct *mm);
19extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma);
20extern struct vm_area_struct *vmacache_find(struct mm_struct *mm,
21 unsigned long addr);
22
23#ifndef CONFIG_MMU
24extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
25 unsigned long start,
26 unsigned long end);
27#endif
28
29static inline void vmacache_invalidate(struct mm_struct *mm)
30{
31 mm->vmacache_seqnum++;
32
33 /* deal with overflows */
34 if (unlikely(mm->vmacache_seqnum == 0))
35 vmacache_flush_all(mm);
36}
37
38#endif /* __LINUX_VMACACHE_H */
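
The header above only declares the cache operations; the implementation lands in mm/vmacache.c, outside this hunk. The intended lookup protocol is: probe the per-thread cache, fall back to the rbtree walk on a miss, then record the result for the next lookup. A sketch of that pattern; slow_rbtree_lookup() is a hypothetical stand-in for the existing mm->mm_rb walk:

#include <linux/mm_types.h>
#include <linux/vmacache.h>

/* hypothetical stand-in for the rbtree walk inside find_vma() */
struct vm_area_struct *slow_rbtree_lookup(struct mm_struct *mm,
					  unsigned long addr);

static struct vm_area_struct *demo_find_vma(struct mm_struct *mm,
					    unsigned long addr)
{
	struct vm_area_struct *vma;

	/* O(1) hit path: per-thread cache keyed by VMACACHE_HASH(addr) */
	vma = vmacache_find(mm, addr);
	if (vma)
		return vma;

	/* slow path, then remember the result for the next lookup */
	vma = slow_rbtree_lookup(mm, addr);
	if (vma)
		vmacache_update(addr, vma);
	return vma;
}
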
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index a67b38415768..67ce70c8279b 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu)
83#define count_vm_numa_events(x, y) do { (void)(y); } while (0) 83#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
84#endif /* CONFIG_NUMA_BALANCING */ 84#endif /* CONFIG_NUMA_BALANCING */
85 85
86#ifdef CONFIG_DEBUG_TLBFLUSH
87#define count_vm_tlb_event(x) count_vm_event(x)
88#define count_vm_tlb_events(x, y) count_vm_events(x, y)
89#else
90#define count_vm_tlb_event(x) do {} while (0)
91#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
92#endif
93
86#define __count_zone_vm_events(item, zone, delta) \ 94#define __count_zone_vm_events(item, zone, delta) \
87 __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ 95 __count_vm_events(item##_NORMAL - ZONE_NORMAL + \
88 zone_idx(zone), delta) 96 zone_idx(zone), delta)
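
With the wrappers above, TLB flush paths can account events unconditionally: when CONFIG_DEBUG_TLBFLUSH is disabled the macro expands to an empty statement, so its argument never reaches the compiler. A minimal sketch (the demo_* name is hypothetical):

#include <linux/vmstat.h>
#include <linux/vm_event_item.h>

static void demo_account_local_flush(void)
{
	/* compiles away entirely when CONFIG_DEBUG_TLBFLUSH is off */
	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
}
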
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index fde1b3e94c7d..c6814b917bdf 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -5,6 +5,7 @@
5#define _TRACE_COMPACTION_H 5#define _TRACE_COMPACTION_H
6 6
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/list.h>
8#include <linux/tracepoint.h> 9#include <linux/tracepoint.h>
9#include <trace/events/gfpflags.h> 10#include <trace/events/gfpflags.h>
10 11
@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
47 48
48TRACE_EVENT(mm_compaction_migratepages, 49TRACE_EVENT(mm_compaction_migratepages,
49 50
50 TP_PROTO(unsigned long nr_migrated, 51 TP_PROTO(unsigned long nr_all,
51 unsigned long nr_failed), 52 int migrate_rc,
53 struct list_head *migratepages),
52 54
53 TP_ARGS(nr_migrated, nr_failed), 55 TP_ARGS(nr_all, migrate_rc, migratepages),
54 56
55 TP_STRUCT__entry( 57 TP_STRUCT__entry(
56 __field(unsigned long, nr_migrated) 58 __field(unsigned long, nr_migrated)
@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages,
58 ), 60 ),
59 61
60 TP_fast_assign( 62 TP_fast_assign(
61 __entry->nr_migrated = nr_migrated; 63 unsigned long nr_failed = 0;
64 struct list_head *page_lru;
65
66 /*
67 * migrate_pages() returns either a non-negative number
68 * with the number of pages that failed migration, or an
69 * error code, in which case we need to count the remaining
70 * pages manually
71 */
72 if (migrate_rc >= 0)
73 nr_failed = migrate_rc;
74 else
75 list_for_each(page_lru, migratepages)
76 nr_failed++;
77
78 __entry->nr_migrated = nr_all - nr_failed;
62 __entry->nr_failed = nr_failed; 79 __entry->nr_failed = nr_failed;
63 ), 80 ),
64 81
@@ -67,6 +84,48 @@ TRACE_EVENT(mm_compaction_migratepages,
67 __entry->nr_failed) 84 __entry->nr_failed)
68); 85);
69 86
87TRACE_EVENT(mm_compaction_begin,
88 TP_PROTO(unsigned long zone_start, unsigned long migrate_start,
89 unsigned long free_start, unsigned long zone_end),
90
91 TP_ARGS(zone_start, migrate_start, free_start, zone_end),
92
93 TP_STRUCT__entry(
94 __field(unsigned long, zone_start)
95 __field(unsigned long, migrate_start)
96 __field(unsigned long, free_start)
97 __field(unsigned long, zone_end)
98 ),
99
100 TP_fast_assign(
101 __entry->zone_start = zone_start;
102 __entry->migrate_start = migrate_start;
103 __entry->free_start = free_start;
104 __entry->zone_end = zone_end;
105 ),
106
107 TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu",
108 __entry->zone_start,
109 __entry->migrate_start,
110 __entry->free_start,
111 __entry->zone_end)
112);
113
114TRACE_EVENT(mm_compaction_end,
115 TP_PROTO(int status),
116
117 TP_ARGS(status),
118
119 TP_STRUCT__entry(
120 __field(int, status)
121 ),
122
123 TP_fast_assign(
124 __entry->status = status;
125 ),
126
127 TP_printk("status=%d", __entry->status)
128);
70 129
71#endif /* _TRACE_COMPACTION_H */ 130#endif /* _TRACE_COMPACTION_H */
72 131
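
The reworked mm_compaction_migratepages tracepoint now derives nr_failed itself, so its caller (in mm/compaction.c, outside this hunk) is expected to hand over the total page count, the raw migrate_pages() return value, and the leftover list. A hedged sketch of that call shape; the demo_* name is illustrative:

#include <linux/list.h>
#include <trace/events/compaction.h>

static void demo_trace_migration(unsigned long nr_all, int migrate_rc,
				 struct list_head *remaining)
{
	/* nr_failed is computed inside TP_fast_assign() from migrate_rc
	 * and the pages still sitting on @remaining */
	trace_mm_compaction_migratepages(nr_all, migrate_rc, remaining);
}
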
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index d0c613476620..aece1346ceb7 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -267,14 +267,12 @@ DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain,
267TRACE_EVENT(mm_page_alloc_extfrag, 267TRACE_EVENT(mm_page_alloc_extfrag,
268 268
269 TP_PROTO(struct page *page, 269 TP_PROTO(struct page *page,
270 int alloc_order, int fallback_order, 270 int alloc_order, int fallback_order,
271 int alloc_migratetype, int fallback_migratetype, 271 int alloc_migratetype, int fallback_migratetype, int new_migratetype),
272 int change_ownership),
273 272
274 TP_ARGS(page, 273 TP_ARGS(page,
275 alloc_order, fallback_order, 274 alloc_order, fallback_order,
276 alloc_migratetype, fallback_migratetype, 275 alloc_migratetype, fallback_migratetype, new_migratetype),
277 change_ownership),
278 276
279 TP_STRUCT__entry( 277 TP_STRUCT__entry(
280 __field( struct page *, page ) 278 __field( struct page *, page )
@@ -291,7 +289,7 @@ TRACE_EVENT(mm_page_alloc_extfrag,
291 __entry->fallback_order = fallback_order; 289 __entry->fallback_order = fallback_order;
292 __entry->alloc_migratetype = alloc_migratetype; 290 __entry->alloc_migratetype = alloc_migratetype;
293 __entry->fallback_migratetype = fallback_migratetype; 291 __entry->fallback_migratetype = fallback_migratetype;
294 __entry->change_ownership = change_ownership; 292 __entry->change_ownership = (new_migratetype == alloc_migratetype);
295 ), 293 ),
296 294
297 TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", 295 TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h
index 1c9fabde69e4..ce0803b8d05f 100644
--- a/include/trace/events/pagemap.h
+++ b/include/trace/events/pagemap.h
@@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion,
28 28
29 TP_PROTO( 29 TP_PROTO(
30 struct page *page, 30 struct page *page,
31 unsigned long pfn, 31 int lru
32 int lru,
33 unsigned long flags
34 ), 32 ),
35 33
36 TP_ARGS(page, pfn, lru, flags), 34 TP_ARGS(page, lru),
37 35
38 TP_STRUCT__entry( 36 TP_STRUCT__entry(
39 __field(struct page *, page ) 37 __field(struct page *, page )
@@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion,
44 42
45 TP_fast_assign( 43 TP_fast_assign(
46 __entry->page = page; 44 __entry->page = page;
47 __entry->pfn = pfn; 45 __entry->pfn = page_to_pfn(page);
48 __entry->lru = lru; 46 __entry->lru = lru;
49 __entry->flags = flags; 47 __entry->flags = trace_pagemap_flags(page);
50 ), 48 ),
51 49
52 /* Flag format is based on page-types.c formatting for pagemap */ 50 /* Flag format is based on page-types.c formatting for pagemap */
@@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion,
64 62
65TRACE_EVENT(mm_lru_activate, 63TRACE_EVENT(mm_lru_activate,
66 64
67 TP_PROTO(struct page *page, unsigned long pfn), 65 TP_PROTO(struct page *page),
68 66
69 TP_ARGS(page, pfn), 67 TP_ARGS(page),
70 68
71 TP_STRUCT__entry( 69 TP_STRUCT__entry(
72 __field(struct page *, page ) 70 __field(struct page *, page )
@@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate,
75 73
76 TP_fast_assign( 74 TP_fast_assign(
77 __entry->page = page; 75 __entry->page = page;
78 __entry->pfn = pfn; 76 __entry->pfn = page_to_pfn(page);
79 ), 77 ),
80 78
81 /* Flag format is based on page-types.c formatting for pagemap */ 79 /* Flag format is based on page-types.c formatting for pagemap */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b29c52479a6..c8289138cad4 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,12 +61,7 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h> 62#include <linux/wait.h>
63 63
64/* 64struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
65 * Tracks how many cpusets are currently defined in system.
66 * When there is only one cpuset (the root cpuset) we can
67 * short circuit some hooks.
68 */
69int number_of_cpusets __read_mostly;
70 65
71/* See "Frequency meter" comments, below. */ 66/* See "Frequency meter" comments, below. */
72 67
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
611 goto done; 606 goto done;
612 } 607 }
613 608
614 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 609 csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
615 if (!csa) 610 if (!csa)
616 goto done; 611 goto done;
617 csn = 0; 612 csn = 0;
@@ -1022,7 +1017,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1022 task_lock(tsk); 1017 task_lock(tsk);
1023 /* 1018 /*
1024 * Determine if a loop is necessary if another thread is doing 1019 * Determine if a loop is necessary if another thread is doing
1025 * get_mems_allowed(). If at least one node remains unchanged and 1020 * read_mems_allowed_begin(). If at least one node remains unchanged and
1026 * tsk does not have a mempolicy, then an empty nodemask will not be 1021 * tsk does not have a mempolicy, then an empty nodemask will not be
1027 * possible when mems_allowed is larger than a word. 1022 * possible when mems_allowed is larger than a word.
1028 */ 1023 */
@@ -1986,7 +1981,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
1986 if (is_spread_slab(parent)) 1981 if (is_spread_slab(parent))
1987 set_bit(CS_SPREAD_SLAB, &cs->flags); 1982 set_bit(CS_SPREAD_SLAB, &cs->flags);
1988 1983
1989 number_of_cpusets++; 1984 cpuset_inc();
1990 1985
1991 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) 1986 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
1992 goto out_unlock; 1987 goto out_unlock;
@@ -2037,7 +2032,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
2037 if (is_sched_load_balance(cs)) 2032 if (is_sched_load_balance(cs))
2038 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 2033 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2039 2034
2040 number_of_cpusets--; 2035 cpuset_dec();
2041 clear_bit(CS_ONLINE, &cs->flags); 2036 clear_bit(CS_ONLINE, &cs->flags);
2042 2037
2043 mutex_unlock(&cpuset_mutex); 2038 mutex_unlock(&cpuset_mutex);
@@ -2092,7 +2087,6 @@ int __init cpuset_init(void)
2092 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) 2087 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
2093 BUG(); 2088 BUG();
2094 2089
2095 number_of_cpusets = 1;
2096 return 0; 2090 return 0;
2097} 2091}
2098 2092
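
The cpuset.c changes above rely on cpuset_inc(), cpuset_dec() and nr_cpusets() helpers that live in include/linux/cpuset.h, which is not part of this hunk. The sketch below shows one plausible shape for them, built on the cpusets_enabled_key static key; static_key_count() and the "+ 1" for the always-present root cpuset are assumptions for illustration, not quotes of the patch:

#include <linux/types.h>
#include <linux/jump_label.h>

extern struct static_key cpusets_enabled_key;

static inline bool cpusets_enabled(void)
{
	/* cheap single-branch check: does more than the root cpuset exist? */
	return static_key_false(&cpusets_enabled_key);
}

static inline int nr_cpusets(void)
{
	/* root cpuset is assumed never to be accounted in the key */
	return static_key_count(&cpusets_enabled_key) + 1;
}

static inline void cpuset_inc(void)
{
	static_key_slow_inc(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)
{
	static_key_slow_dec(&cpusets_enabled_key);
}

The point of the conversion is that allocator fast paths can branch on the static key instead of reading a shared counter on every allocation.
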
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0506d447aed2..e911ec662d03 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -49,6 +49,7 @@
49#include <linux/pid.h> 49#include <linux/pid.h>
50#include <linux/smp.h> 50#include <linux/smp.h>
51#include <linux/mm.h> 51#include <linux/mm.h>
52#include <linux/vmacache.h>
52#include <linux/rcupdate.h> 53#include <linux/rcupdate.h>
53 54
54#include <asm/cacheflush.h> 55#include <asm/cacheflush.h>
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
224 if (!CACHE_FLUSH_IS_SAFE) 225 if (!CACHE_FLUSH_IS_SAFE)
225 return; 226 return;
226 227
227 if (current->mm && current->mm->mmap_cache) { 228 if (current->mm) {
228 flush_cache_range(current->mm->mmap_cache, 229 int i;
229 addr, addr + BREAK_INSTR_SIZE); 230
231 for (i = 0; i < VMACACHE_SIZE; i++) {
232 if (!current->vmacache[i])
233 continue;
234 flush_cache_range(current->vmacache[i],
235 addr, addr + BREAK_INSTR_SIZE);
236 }
230 } 237 }
238
231 /* Force flush instruction cache if it was outside the mm */ 239 /* Force flush instruction cache if it was outside the mm */
232 flush_icache_range(addr, addr + BREAK_INSTR_SIZE); 240 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
233} 241}
diff --git a/kernel/fork.c b/kernel/fork.c
index 143962949bed..29a1b0283d3b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -28,6 +28,8 @@
28#include <linux/mman.h> 28#include <linux/mman.h>
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/fs.h> 30#include <linux/fs.h>
31#include <linux/mm.h>
32#include <linux/vmacache.h>
31#include <linux/nsproxy.h> 33#include <linux/nsproxy.h>
32#include <linux/capability.h> 34#include <linux/capability.h>
33#include <linux/cpu.h> 35#include <linux/cpu.h>
@@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
363 365
364 mm->locked_vm = 0; 366 mm->locked_vm = 0;
365 mm->mmap = NULL; 367 mm->mmap = NULL;
366 mm->mmap_cache = NULL; 368 mm->vmacache_seqnum = 0;
367 mm->map_count = 0; 369 mm->map_count = 0;
368 cpumask_clear(mm_cpumask(mm)); 370 cpumask_clear(mm_cpumask(mm));
369 mm->mm_rb = RB_ROOT; 371 mm->mm_rb = RB_ROOT;
@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
882 if (!oldmm) 884 if (!oldmm)
883 return 0; 885 return 0;
884 886
887 /* initialize the new vmacache entries */
888 vmacache_flush(tsk);
889
885 if (clone_flags & CLONE_VM) { 890 if (clone_flags & CLONE_VM) {
886 atomic_inc(&oldmm->mm_users); 891 atomic_inc(&oldmm->mm_users);
887 mm = oldmm; 892 mm = oldmm;
diff --git a/lib/plist.c b/lib/plist.c
index 1ebc95f7a46f..0f2084d30798 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head)
134 plist_check_head(head); 134 plist_check_head(head);
135} 135}
136 136
137/**
138 * plist_requeue - Requeue @node at end of same-prio entries.
139 *
140 * This is essentially an optimized plist_del() followed by
141 * plist_add(). It moves an entry already in the plist to
142 * after any other same-priority entries.
143 *
144 * @node: &struct plist_node pointer - entry to be moved
145 * @head: &struct plist_head pointer - list head
146 */
147void plist_requeue(struct plist_node *node, struct plist_head *head)
148{
149 struct plist_node *iter;
150 struct list_head *node_next = &head->node_list;
151
152 plist_check_head(head);
153 BUG_ON(plist_head_empty(head));
154 BUG_ON(plist_node_empty(node));
155
156 if (node == plist_last(head))
157 return;
158
159 iter = plist_next(node);
160
161 if (node->prio != iter->prio)
162 return;
163
164 plist_del(node, head);
165
166 plist_for_each_continue(iter, head) {
167 if (node->prio != iter->prio) {
168 node_next = &iter->node_list;
169 break;
170 }
171 }
172 list_add_tail(&node->node_list, node_next);
173
174 plist_check_head(head);
175}
176
137#ifdef CONFIG_DEBUG_PI_LIST 177#ifdef CONFIG_DEBUG_PI_LIST
138#include <linux/sched.h> 178#include <linux/sched.h>
139#include <linux/module.h> 179#include <linux/module.h>
@@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect)
170 BUG_ON(prio_pos->prio_list.next != &first->prio_list); 210 BUG_ON(prio_pos->prio_list.next != &first->prio_list);
171} 211}
172 212
213static void __init plist_test_requeue(struct plist_node *node)
214{
215 plist_requeue(node, &test_head);
216
217 if (node != plist_last(&test_head))
218 BUG_ON(node->prio == plist_next(node)->prio);
219}
220
173static int __init plist_test(void) 221static int __init plist_test(void)
174{ 222{
175 int nr_expect = 0, i, loop; 223 int nr_expect = 0, i, loop;
@@ -193,6 +241,10 @@ static int __init plist_test(void)
193 nr_expect--; 241 nr_expect--;
194 } 242 }
195 plist_test_check(nr_expect); 243 plist_test_check(nr_expect);
244 if (!plist_node_empty(test_node + i)) {
245 plist_test_requeue(test_node + i);
246 plist_test_check(nr_expect);
247 }
196 } 248 }
197 249
198 for (i = 0; i < ARRAY_SIZE(test_node); i++) { 250 for (i = 0; i < ARRAY_SIZE(test_node); i++) {
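plist_requeue() as added above moves an entry behind the other nodes that share its priority, essentially an optimized plist_del() followed by plist_add(), which later patches use to round-robin among equal-priority entries. A self-contained analogue of the same algorithm on a priority-sorted array, purely for illustration:

#include <stdio.h>
#include <string.h>

/* Toy analogue of plist_requeue(): in an array kept sorted by prio,
 * move the entry at position pos behind the run of equal priorities. */
struct entry { int prio; char name; };

static void requeue(struct entry *a, int n, int pos)
{
    struct entry moved = a[pos];
    int last = pos;

    /* Find the last entry that shares this priority. */
    while (last + 1 < n && a[last + 1].prio == moved.prio)
        last++;
    if (last == pos)
        return;                      /* already last of its priority */

    /* Equivalent of plist_del() plus re-add after the equal-prio run. */
    memmove(&a[pos], &a[pos + 1], (last - pos) * sizeof(a[0]));
    a[last] = moved;
}

int main(void)
{
    struct entry a[] = {
        { 1, 'A' }, { 1, 'B' }, { 1, 'C' }, { 2, 'D' },
    };

    requeue(a, 4, 0);                /* A goes behind B and C, not D */
    for (int i = 0; i < 4; i++)
        printf("%c(prio %d) ", a[i].name, a[i].prio);
    printf("\n");
    return 0;
}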
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 7811ed3b4e70..e8adb5d8a184 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -946,81 +946,6 @@ next:
946} 946}
947EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); 947EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
948 948
949
950/**
951 * radix_tree_next_hole - find the next hole (not-present entry)
952 * @root: tree root
953 * @index: index key
954 * @max_scan: maximum range to search
955 *
956 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest
957 * indexed hole.
958 *
959 * Returns: the index of the hole if found, otherwise returns an index
960 * outside of the set specified (in which case 'return - index >= max_scan'
961 * will be true). In rare cases of index wrap-around, 0 will be returned.
962 *
963 * radix_tree_next_hole may be called under rcu_read_lock. However, like
964 * radix_tree_gang_lookup, this will not atomically search a snapshot of
965 * the tree at a single point in time. For example, if a hole is created
966 * at index 5, then subsequently a hole is created at index 10,
967 * radix_tree_next_hole covering both indexes may return 10 if called
968 * under rcu_read_lock.
969 */
970unsigned long radix_tree_next_hole(struct radix_tree_root *root,
971 unsigned long index, unsigned long max_scan)
972{
973 unsigned long i;
974
975 for (i = 0; i < max_scan; i++) {
976 if (!radix_tree_lookup(root, index))
977 break;
978 index++;
979 if (index == 0)
980 break;
981 }
982
983 return index;
984}
985EXPORT_SYMBOL(radix_tree_next_hole);
986
987/**
988 * radix_tree_prev_hole - find the prev hole (not-present entry)
989 * @root: tree root
990 * @index: index key
991 * @max_scan: maximum range to search
992 *
993 * Search backwards in the range [max(index-max_scan+1, 0), index]
994 * for the first hole.
995 *
996 * Returns: the index of the hole if found, otherwise returns an index
997 * outside of the set specified (in which case 'index - return >= max_scan'
998 * will be true). In rare cases of wrap-around, ULONG_MAX will be returned.
999 *
1000 * radix_tree_next_hole may be called under rcu_read_lock. However, like
1001 * radix_tree_gang_lookup, this will not atomically search a snapshot of
1002 * the tree at a single point in time. For example, if a hole is created
1003 * at index 10, then subsequently a hole is created at index 5,
1004 * radix_tree_prev_hole covering both indexes may return 5 if called under
1005 * rcu_read_lock.
1006 */
1007unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
1008 unsigned long index, unsigned long max_scan)
1009{
1010 unsigned long i;
1011
1012 for (i = 0; i < max_scan; i++) {
1013 if (!radix_tree_lookup(root, index))
1014 break;
1015 index--;
1016 if (index == ULONG_MAX)
1017 break;
1018 }
1019
1020 return index;
1021}
1022EXPORT_SYMBOL(radix_tree_prev_hole);
1023
1024/** 949/**
1025 * radix_tree_gang_lookup - perform multiple lookup on a radix tree 950 * radix_tree_gang_lookup - perform multiple lookup on a radix tree
1026 * @root: radix tree root 951 * @root: radix tree root
@@ -1335,15 +1260,18 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
1335} 1260}
1336 1261
1337/** 1262/**
1338 * radix_tree_delete - delete an item from a radix tree 1263 * radix_tree_delete_item - delete an item from a radix tree
1339 * @root: radix tree root 1264 * @root: radix tree root
1340 * @index: index key 1265 * @index: index key
1266 * @item: expected item
1341 * 1267 *
1342 * Remove the item at @index from the radix tree rooted at @root. 1268 * Remove @item at @index from the radix tree rooted at @root.
1343 * 1269 *
1344 * Returns the address of the deleted item, or NULL if it was not present. 1270 * Returns the address of the deleted item, or NULL if it was not present
1271 * or the entry at the given @index was not @item.
1345 */ 1272 */
1346void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) 1273void *radix_tree_delete_item(struct radix_tree_root *root,
1274 unsigned long index, void *item)
1347{ 1275{
1348 struct radix_tree_node *node = NULL; 1276 struct radix_tree_node *node = NULL;
1349 struct radix_tree_node *slot = NULL; 1277 struct radix_tree_node *slot = NULL;
@@ -1378,6 +1306,11 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1378 if (slot == NULL) 1306 if (slot == NULL)
1379 goto out; 1307 goto out;
1380 1308
1309 if (item && slot != item) {
1310 slot = NULL;
1311 goto out;
1312 }
1313
1381 /* 1314 /*
1382 * Clear all tags associated with the item to be deleted. 1315 * Clear all tags associated with the item to be deleted.
1383 * This way of doing it would be inefficient, but seldom is any set. 1316 * This way of doing it would be inefficient, but seldom is any set.
@@ -1422,6 +1355,21 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1422out: 1355out:
1423 return slot; 1356 return slot;
1424} 1357}
1358EXPORT_SYMBOL(radix_tree_delete_item);
1359
1360/**
1361 * radix_tree_delete - delete an item from a radix tree
1362 * @root: radix tree root
1363 * @index: index key
1364 *
1365 * Remove the item at @index from the radix tree rooted at @root.
1366 *
1367 * Returns the address of the deleted item, or NULL if it was not present.
1368 */
1369void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
1370{
1371 return radix_tree_delete_item(root, index, NULL);
1372}
1425EXPORT_SYMBOL(radix_tree_delete); 1373EXPORT_SYMBOL(radix_tree_delete);
1426 1374
1427/** 1375/**
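radix_tree_delete_item() above only removes the entry at @index when it still matches the expected @item, and plain radix_tree_delete() becomes a wrapper that passes NULL to match anything. The compare-before-delete shape guards against the slot being replaced between a lookup and the removal. A toy, array-backed illustration of that semantic (not the radix tree itself; names are made up):

#include <stddef.h>
#include <stdio.h>

#define SLOTS 8

/* Toy analogue of radix_tree_delete_item(): clear slots[index] only if it
 * still holds the pointer the caller expects, or unconditionally when
 * item == NULL, mirroring how radix_tree_delete() wraps it. */
static void *table_delete_item(void **slots, unsigned long index, void *item)
{
    void *cur = slots[index];

    if (!cur)
        return NULL;                 /* nothing there */
    if (item && cur != item)
        return NULL;                 /* raced: someone replaced the entry */
    slots[index] = NULL;
    return cur;
}

int main(void)
{
    void *slots[SLOTS] = { 0 };
    int a = 1, b = 2;

    slots[3] = &a;
    printf("delete wrong item: %p\n", table_delete_item(slots, 3, &b));
    printf("delete right item: %p\n", table_delete_item(slots, 3, &a));
    return 0;
}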
diff --git a/mm/Makefile b/mm/Makefile
index 305d10acd081..fb51bc61d80a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o balloon_compaction.o \ 19 compaction.o balloon_compaction.o vmacache.o \
20 interval_tree.o list_lru.o $(mmu-y) 20 interval_tree.o list_lru.o $(mmu-y)
21 21
22obj-y += init-mm.o 22obj-y += init-mm.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 6441083e76d3..adb6d0560e96 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone)
89 unsigned long end_pfn = zone_end_pfn(zone); 89 unsigned long end_pfn = zone_end_pfn(zone);
90 unsigned long pfn; 90 unsigned long pfn;
91 91
92 zone->compact_cached_migrate_pfn = start_pfn; 92 zone->compact_cached_migrate_pfn[0] = start_pfn;
93 zone->compact_cached_migrate_pfn[1] = start_pfn;
93 zone->compact_cached_free_pfn = end_pfn; 94 zone->compact_cached_free_pfn = end_pfn;
94 zone->compact_blockskip_flush = false; 95 zone->compact_blockskip_flush = false;
95 96
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat)
131 */ 132 */
132static void update_pageblock_skip(struct compact_control *cc, 133static void update_pageblock_skip(struct compact_control *cc,
133 struct page *page, unsigned long nr_isolated, 134 struct page *page, unsigned long nr_isolated,
134 bool migrate_scanner) 135 bool set_unsuitable, bool migrate_scanner)
135{ 136{
136 struct zone *zone = cc->zone; 137 struct zone *zone = cc->zone;
138 unsigned long pfn;
137 139
138 if (cc->ignore_skip_hint) 140 if (cc->ignore_skip_hint)
139 return; 141 return;
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc,
141 if (!page) 143 if (!page)
142 return; 144 return;
143 145
144 if (!nr_isolated) { 146 if (nr_isolated)
145 unsigned long pfn = page_to_pfn(page); 147 return;
148
149 /*
 150 * Only skip pageblocks when all forms of compaction are expected to
151 * fail in the near future.
152 */
153 if (set_unsuitable)
146 set_pageblock_skip(page); 154 set_pageblock_skip(page);
147 155
148 /* Update where compaction should restart */ 156 pfn = page_to_pfn(page);
149 if (migrate_scanner) { 157
150 if (!cc->finished_update_migrate && 158 /* Update where async and sync compaction should restart */
151 pfn > zone->compact_cached_migrate_pfn) 159 if (migrate_scanner) {
152 zone->compact_cached_migrate_pfn = pfn; 160 if (cc->finished_update_migrate)
153 } else { 161 return;
154 if (!cc->finished_update_free && 162 if (pfn > zone->compact_cached_migrate_pfn[0])
155 pfn < zone->compact_cached_free_pfn) 163 zone->compact_cached_migrate_pfn[0] = pfn;
156 zone->compact_cached_free_pfn = pfn; 164 if (cc->mode != MIGRATE_ASYNC &&
157 } 165 pfn > zone->compact_cached_migrate_pfn[1])
166 zone->compact_cached_migrate_pfn[1] = pfn;
167 } else {
168 if (cc->finished_update_free)
169 return;
170 if (pfn < zone->compact_cached_free_pfn)
171 zone->compact_cached_free_pfn = pfn;
158 } 172 }
159} 173}
160#else 174#else
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
166 180
167static void update_pageblock_skip(struct compact_control *cc, 181static void update_pageblock_skip(struct compact_control *cc,
168 struct page *page, unsigned long nr_isolated, 182 struct page *page, unsigned long nr_isolated,
169 bool migrate_scanner) 183 bool set_unsuitable, bool migrate_scanner)
170{ 184{
171} 185}
172#endif /* CONFIG_COMPACTION */ 186#endif /* CONFIG_COMPACTION */
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
195 } 209 }
196 210
197 /* async aborts if taking too long or contended */ 211 /* async aborts if taking too long or contended */
198 if (!cc->sync) { 212 if (cc->mode == MIGRATE_ASYNC) {
199 cc->contended = true; 213 cc->contended = true;
200 return false; 214 return false;
201 } 215 }
@@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
208 return true; 222 return true;
209} 223}
210 224
211static inline bool compact_trylock_irqsave(spinlock_t *lock, 225/*
212 unsigned long *flags, struct compact_control *cc) 226 * Aside from avoiding lock contention, compaction also periodically checks
227 * need_resched() and either schedules in sync compaction or aborts async
228 * compaction. This is similar to what compact_checklock_irqsave() does, but
229 * is used where no lock is concerned.
230 *
231 * Returns false when no scheduling was needed, or sync compaction scheduled.
232 * Returns true when async compaction should abort.
233 */
234static inline bool compact_should_abort(struct compact_control *cc)
213{ 235{
214 return compact_checklock_irqsave(lock, flags, false, cc); 236 /* async compaction aborts if contended */
237 if (need_resched()) {
238 if (cc->mode == MIGRATE_ASYNC) {
239 cc->contended = true;
240 return true;
241 }
242
243 cond_resched();
244 }
245
246 return false;
215} 247}
216 248
217/* Returns true if the page is within a block suitable for migration to */ 249/* Returns true if the page is within a block suitable for migration to */
218static bool suitable_migration_target(struct page *page) 250static bool suitable_migration_target(struct page *page)
219{ 251{
220 int migratetype = get_pageblock_migratetype(page); 252 /* If the page is a large free page, then disallow migration */
221
222 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
223 if (migratetype == MIGRATE_RESERVE)
224 return false;
225
226 if (is_migrate_isolate(migratetype))
227 return false;
228
229 /* If the page is a large free page, then allow migration */
230 if (PageBuddy(page) && page_order(page) >= pageblock_order) 253 if (PageBuddy(page) && page_order(page) >= pageblock_order)
231 return true; 254 return false;
232 255
233 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 256 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
234 if (migrate_async_suitable(migratetype)) 257 if (migrate_async_suitable(get_pageblock_migratetype(page)))
235 return true; 258 return true;
236 259
237 /* Otherwise skip the block */ 260 /* Otherwise skip the block */
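compact_should_abort(), introduced in the hunk above, encodes the new scheduling policy for lockless paths: when a reschedule is pending, async compaction records contention and bails out, while sync compaction simply yields and carries on. A standalone sketch of that decision; the names are illustrative and the yield is reduced to a comment:

#include <stdbool.h>
#include <stdio.h>

enum mode { ASYNC, SYNC };

/* Toy analogue of compact_should_abort(): on a pending reschedule, async
 * work gives up (and records contention), sync work would just yield. */
static bool should_abort(enum mode mode, bool resched_wanted, bool *contended)
{
    if (resched_wanted) {
        if (mode == ASYNC) {
            *contended = true;
            return true;
        }
        /* sync callers would yield here, cond_resched() in the kernel */
    }
    return false;
}

int main(void)
{
    bool contended = false;

    printf("async: abort=%d\n", should_abort(ASYNC, true, &contended));
    printf("sync:  abort=%d\n", should_abort(SYNC, true, &contended));
    printf("contended=%d\n", contended);
    return 0;
}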
@@ -254,6 +277,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
254 struct page *cursor, *valid_page = NULL; 277 struct page *cursor, *valid_page = NULL;
255 unsigned long flags; 278 unsigned long flags;
256 bool locked = false; 279 bool locked = false;
280 bool checked_pageblock = false;
257 281
258 cursor = pfn_to_page(blockpfn); 282 cursor = pfn_to_page(blockpfn);
259 283
@@ -285,8 +309,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
285 break; 309 break;
286 310
287 /* Recheck this is a suitable migration target under lock */ 311 /* Recheck this is a suitable migration target under lock */
288 if (!strict && !suitable_migration_target(page)) 312 if (!strict && !checked_pageblock) {
289 break; 313 /*
314 * We need to check suitability of pageblock only once
315 * and this isolate_freepages_block() is called with
316 * pageblock range, so just check once is sufficient.
317 */
318 checked_pageblock = true;
319 if (!suitable_migration_target(page))
320 break;
321 }
290 322
291 /* Recheck this is a buddy page under lock */ 323 /* Recheck this is a buddy page under lock */
292 if (!PageBuddy(page)) 324 if (!PageBuddy(page))
@@ -330,7 +362,8 @@ isolate_fail:
330 362
331 /* Update the pageblock-skip if the whole pageblock was scanned */ 363 /* Update the pageblock-skip if the whole pageblock was scanned */
332 if (blockpfn == end_pfn) 364 if (blockpfn == end_pfn)
333 update_pageblock_skip(cc, valid_page, total_isolated, false); 365 update_pageblock_skip(cc, valid_page, total_isolated, true,
366 false);
334 367
335 count_compact_events(COMPACTFREE_SCANNED, nr_scanned); 368 count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
336 if (total_isolated) 369 if (total_isolated)
@@ -461,11 +494,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
461 unsigned long last_pageblock_nr = 0, pageblock_nr; 494 unsigned long last_pageblock_nr = 0, pageblock_nr;
462 unsigned long nr_scanned = 0, nr_isolated = 0; 495 unsigned long nr_scanned = 0, nr_isolated = 0;
463 struct list_head *migratelist = &cc->migratepages; 496 struct list_head *migratelist = &cc->migratepages;
464 isolate_mode_t mode = 0;
465 struct lruvec *lruvec; 497 struct lruvec *lruvec;
466 unsigned long flags; 498 unsigned long flags;
467 bool locked = false; 499 bool locked = false;
468 struct page *page = NULL, *valid_page = NULL; 500 struct page *page = NULL, *valid_page = NULL;
501 bool set_unsuitable = true;
502 const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
503 ISOLATE_ASYNC_MIGRATE : 0) |
504 (unevictable ? ISOLATE_UNEVICTABLE : 0);
469 505
470 /* 506 /*
471 * Ensure that there are not too many pages isolated from the LRU 507 * Ensure that there are not too many pages isolated from the LRU
@@ -474,7 +510,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
474 */ 510 */
475 while (unlikely(too_many_isolated(zone))) { 511 while (unlikely(too_many_isolated(zone))) {
476 /* async migration should just abort */ 512 /* async migration should just abort */
477 if (!cc->sync) 513 if (cc->mode == MIGRATE_ASYNC)
478 return 0; 514 return 0;
479 515
480 congestion_wait(BLK_RW_ASYNC, HZ/10); 516 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -483,11 +519,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
483 return 0; 519 return 0;
484 } 520 }
485 521
522 if (compact_should_abort(cc))
523 return 0;
524
486 /* Time to isolate some pages for migration */ 525 /* Time to isolate some pages for migration */
487 cond_resched();
488 for (; low_pfn < end_pfn; low_pfn++) { 526 for (; low_pfn < end_pfn; low_pfn++) {
489 /* give a chance to irqs before checking need_resched() */ 527 /* give a chance to irqs before checking need_resched() */
490 if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { 528 if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
491 if (should_release_lock(&zone->lru_lock)) { 529 if (should_release_lock(&zone->lru_lock)) {
492 spin_unlock_irqrestore(&zone->lru_lock, flags); 530 spin_unlock_irqrestore(&zone->lru_lock, flags);
493 locked = false; 531 locked = false;
@@ -526,25 +564,31 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
526 564
527 /* If isolation recently failed, do not retry */ 565 /* If isolation recently failed, do not retry */
528 pageblock_nr = low_pfn >> pageblock_order; 566 pageblock_nr = low_pfn >> pageblock_order;
529 if (!isolation_suitable(cc, page)) 567 if (last_pageblock_nr != pageblock_nr) {
530 goto next_pageblock; 568 int mt;
569
570 last_pageblock_nr = pageblock_nr;
571 if (!isolation_suitable(cc, page))
572 goto next_pageblock;
573
574 /*
575 * For async migration, also only scan in MOVABLE
576 * blocks. Async migration is optimistic to see if
577 * the minimum amount of work satisfies the allocation
578 */
579 mt = get_pageblock_migratetype(page);
580 if (cc->mode == MIGRATE_ASYNC &&
581 !migrate_async_suitable(mt)) {
582 set_unsuitable = false;
583 goto next_pageblock;
584 }
585 }
531 586
532 /* Skip if free */ 587 /* Skip if free */
533 if (PageBuddy(page)) 588 if (PageBuddy(page))
534 continue; 589 continue;
535 590
536 /* 591 /*
537 * For async migration, also only scan in MOVABLE blocks. Async
538 * migration is optimistic to see if the minimum amount of work
539 * satisfies the allocation
540 */
541 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
542 !migrate_async_suitable(get_pageblock_migratetype(page))) {
543 cc->finished_update_migrate = true;
544 goto next_pageblock;
545 }
546
547 /*
548 * Check may be lockless but that's ok as we recheck later. 592 * Check may be lockless but that's ok as we recheck later.
549 * It's possible to migrate LRU pages and balloon pages 593 * It's possible to migrate LRU pages and balloon pages
550 * Skip any other type of page 594 * Skip any other type of page
@@ -553,11 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
553 if (unlikely(balloon_page_movable(page))) { 597 if (unlikely(balloon_page_movable(page))) {
554 if (locked && balloon_page_isolate(page)) { 598 if (locked && balloon_page_isolate(page)) {
555 /* Successfully isolated */ 599 /* Successfully isolated */
556 cc->finished_update_migrate = true; 600 goto isolate_success;
557 list_add(&page->lru, migratelist);
558 cc->nr_migratepages++;
559 nr_isolated++;
560 goto check_compact_cluster;
561 } 601 }
562 } 602 }
563 continue; 603 continue;
@@ -580,6 +620,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
580 continue; 620 continue;
581 } 621 }
582 622
623 /*
624 * Migration will fail if an anonymous page is pinned in memory,
625 * so avoid taking lru_lock and isolating it unnecessarily in an
626 * admittedly racy check.
627 */
628 if (!page_mapping(page) &&
629 page_count(page) > page_mapcount(page))
630 continue;
631
583 /* Check if it is ok to still hold the lock */ 632 /* Check if it is ok to still hold the lock */
584 locked = compact_checklock_irqsave(&zone->lru_lock, &flags, 633 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
585 locked, cc); 634 locked, cc);
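The added check above skips anonymous pages whose reference count exceeds their map count: an extra pin (for example from direct I/O) would make migration fail anyway, so the admittedly racy test is done before taking lru_lock to avoid pointless isolation work. A toy illustration of the heuristic with made-up field names:

#include <stdbool.h>
#include <stdio.h>

/* Toy analogue of the "pinned anon page" heuristic: if something besides
 * the page tables holds a reference (refcount > mapcount) on a page with
 * no mapping, migration would fail, so skip it before taking any locks. */
struct toy_page { int refcount; int mapcount; void *mapping; };

static bool skip_as_pinned(const struct toy_page *p)
{
    return !p->mapping && p->refcount > p->mapcount;
}

int main(void)
{
    struct toy_page anon_mapped = { .refcount = 1, .mapcount = 1 };
    struct toy_page anon_pinned = { .refcount = 3, .mapcount = 1 };

    printf("mapped-only: skip=%d\n", skip_as_pinned(&anon_mapped));
    printf("extra pins:  skip=%d\n", skip_as_pinned(&anon_pinned));
    return 0;
}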
@@ -594,12 +643,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
594 continue; 643 continue;
595 } 644 }
596 645
597 if (!cc->sync)
598 mode |= ISOLATE_ASYNC_MIGRATE;
599
600 if (unevictable)
601 mode |= ISOLATE_UNEVICTABLE;
602
603 lruvec = mem_cgroup_page_lruvec(page, zone); 646 lruvec = mem_cgroup_page_lruvec(page, zone);
604 647
605 /* Try isolate the page */ 648 /* Try isolate the page */
@@ -609,13 +652,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
609 VM_BUG_ON(PageTransCompound(page)); 652 VM_BUG_ON(PageTransCompound(page));
610 653
611 /* Successfully isolated */ 654 /* Successfully isolated */
612 cc->finished_update_migrate = true;
613 del_page_from_lru_list(page, lruvec, page_lru(page)); 655 del_page_from_lru_list(page, lruvec, page_lru(page));
656
657isolate_success:
658 cc->finished_update_migrate = true;
614 list_add(&page->lru, migratelist); 659 list_add(&page->lru, migratelist);
615 cc->nr_migratepages++; 660 cc->nr_migratepages++;
616 nr_isolated++; 661 nr_isolated++;
617 662
618check_compact_cluster:
619 /* Avoid isolating too much */ 663 /* Avoid isolating too much */
620 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 664 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
621 ++low_pfn; 665 ++low_pfn;
@@ -626,7 +670,6 @@ check_compact_cluster:
626 670
627next_pageblock: 671next_pageblock:
628 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; 672 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
629 last_pageblock_nr = pageblock_nr;
630 } 673 }
631 674
632 acct_isolated(zone, locked, cc); 675 acct_isolated(zone, locked, cc);
@@ -634,9 +677,13 @@ next_pageblock:
634 if (locked) 677 if (locked)
635 spin_unlock_irqrestore(&zone->lru_lock, flags); 678 spin_unlock_irqrestore(&zone->lru_lock, flags);
636 679
637 /* Update the pageblock-skip if the whole pageblock was scanned */ 680 /*
681 * Update the pageblock-skip information and cached scanner pfn,
682 * if the whole pageblock was scanned without isolating any page.
683 */
638 if (low_pfn == end_pfn) 684 if (low_pfn == end_pfn)
639 update_pageblock_skip(cc, valid_page, nr_isolated, true); 685 update_pageblock_skip(cc, valid_page, nr_isolated,
686 set_unsuitable, true);
640 687
641 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 688 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
642 689
@@ -657,7 +704,9 @@ static void isolate_freepages(struct zone *zone,
657 struct compact_control *cc) 704 struct compact_control *cc)
658{ 705{
659 struct page *page; 706 struct page *page;
660 unsigned long high_pfn, low_pfn, pfn, z_end_pfn; 707 unsigned long block_start_pfn; /* start of current pageblock */
708 unsigned long block_end_pfn; /* end of current pageblock */
709 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
661 int nr_freepages = cc->nr_freepages; 710 int nr_freepages = cc->nr_freepages;
662 struct list_head *freelist = &cc->freepages; 711 struct list_head *freelist = &cc->freepages;
663 712
@@ -665,41 +714,38 @@ static void isolate_freepages(struct zone *zone,
665 * Initialise the free scanner. The starting point is where we last 714 * Initialise the free scanner. The starting point is where we last
666 * successfully isolated from, zone-cached value, or the end of the 715 * successfully isolated from, zone-cached value, or the end of the
667 * zone when isolating for the first time. We need this aligned to 716 * zone when isolating for the first time. We need this aligned to
668 * the pageblock boundary, because we do pfn -= pageblock_nr_pages 717 * the pageblock boundary, because we do
669 * in the for loop. 718 * block_start_pfn -= pageblock_nr_pages in the for loop.
719 * For ending point, take care when isolating in last pageblock of a
 720 * zone which ends in the middle of a pageblock.
670 * The low boundary is the end of the pageblock the migration scanner 721 * The low boundary is the end of the pageblock the migration scanner
671 * is using. 722 * is using.
672 */ 723 */
673 pfn = cc->free_pfn & ~(pageblock_nr_pages-1); 724 block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
725 block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
726 zone_end_pfn(zone));
674 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); 727 low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
675 728
676 /* 729 /*
677 * Take care that if the migration scanner is at the end of the zone
678 * that the free scanner does not accidentally move to the next zone
679 * in the next isolation cycle.
680 */
681 high_pfn = min(low_pfn, pfn);
682
683 z_end_pfn = zone_end_pfn(zone);
684
685 /*
686 * Isolate free pages until enough are available to migrate the 730 * Isolate free pages until enough are available to migrate the
687 * pages on cc->migratepages. We stop searching if the migrate 731 * pages on cc->migratepages. We stop searching if the migrate
688 * and free page scanners meet or enough free pages are isolated. 732 * and free page scanners meet or enough free pages are isolated.
689 */ 733 */
690 for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 734 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
691 pfn -= pageblock_nr_pages) { 735 block_end_pfn = block_start_pfn,
736 block_start_pfn -= pageblock_nr_pages) {
692 unsigned long isolated; 737 unsigned long isolated;
693 unsigned long end_pfn;
694 738
695 /* 739 /*
696 * This can iterate a massively long zone without finding any 740 * This can iterate a massively long zone without finding any
697 * suitable migration targets, so periodically check if we need 741 * suitable migration targets, so periodically check if we need
698 * to schedule. 742 * to schedule, or even abort async compaction.
699 */ 743 */
700 cond_resched(); 744 if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
745 && compact_should_abort(cc))
746 break;
701 747
702 if (!pfn_valid(pfn)) 748 if (!pfn_valid(block_start_pfn))
703 continue; 749 continue;
704 750
705 /* 751 /*
@@ -709,7 +755,7 @@ static void isolate_freepages(struct zone *zone,
709 * i.e. it's possible that all pages within a zones range of 755 * i.e. it's possible that all pages within a zones range of
710 * pages do not belong to a single zone. 756 * pages do not belong to a single zone.
711 */ 757 */
712 page = pfn_to_page(pfn); 758 page = pfn_to_page(block_start_pfn);
713 if (page_zone(page) != zone) 759 if (page_zone(page) != zone)
714 continue; 760 continue;
715 761
@@ -722,26 +768,26 @@ static void isolate_freepages(struct zone *zone,
722 continue; 768 continue;
723 769
724 /* Found a block suitable for isolating free pages from */ 770 /* Found a block suitable for isolating free pages from */
725 isolated = 0; 771 cc->free_pfn = block_start_pfn;
772 isolated = isolate_freepages_block(cc, block_start_pfn,
773 block_end_pfn, freelist, false);
774 nr_freepages += isolated;
726 775
727 /* 776 /*
728 * Take care when isolating in last pageblock of a zone which 777 * Set a flag that we successfully isolated in this pageblock.
729 * ends in the middle of a pageblock. 778 * In the next loop iteration, zone->compact_cached_free_pfn
779 * will not be updated and thus it will effectively contain the
780 * highest pageblock we isolated pages from.
730 */ 781 */
731 end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); 782 if (isolated)
732 isolated = isolate_freepages_block(cc, pfn, end_pfn, 783 cc->finished_update_free = true;
733 freelist, false);
734 nr_freepages += isolated;
735 784
736 /* 785 /*
737 * Record the highest PFN we isolated pages from. When next 786 * isolate_freepages_block() might have aborted due to async
738 * looking for free pages, the search will restart here as 787 * compaction being contended
739 * page migration may have returned some pages to the allocator
740 */ 788 */
741 if (isolated) { 789 if (cc->contended)
742 cc->finished_update_free = true; 790 break;
743 high_pfn = max(high_pfn, pfn);
744 }
745 } 791 }
746 792
747 /* split_free_page does not map the pages */ 793 /* split_free_page does not map the pages */
@@ -751,10 +797,9 @@ static void isolate_freepages(struct zone *zone,
751 * If we crossed the migrate scanner, we want to keep it that way 797 * If we crossed the migrate scanner, we want to keep it that way
752 * so that compact_finished() may detect this 798 * so that compact_finished() may detect this
753 */ 799 */
754 if (pfn < low_pfn) 800 if (block_start_pfn < low_pfn)
755 cc->free_pfn = max(pfn, zone->zone_start_pfn); 801 cc->free_pfn = cc->migrate_pfn;
756 else 802
757 cc->free_pfn = high_pfn;
758 cc->nr_freepages = nr_freepages; 803 cc->nr_freepages = nr_freepages;
759} 804}
760 805
@@ -769,9 +814,13 @@ static struct page *compaction_alloc(struct page *migratepage,
769 struct compact_control *cc = (struct compact_control *)data; 814 struct compact_control *cc = (struct compact_control *)data;
770 struct page *freepage; 815 struct page *freepage;
771 816
772 /* Isolate free pages if necessary */ 817 /*
818 * Isolate free pages if necessary, and if we are not aborting due to
819 * contention.
820 */
773 if (list_empty(&cc->freepages)) { 821 if (list_empty(&cc->freepages)) {
774 isolate_freepages(cc->zone, cc); 822 if (!cc->contended)
823 isolate_freepages(cc->zone, cc);
775 824
776 if (list_empty(&cc->freepages)) 825 if (list_empty(&cc->freepages))
777 return NULL; 826 return NULL;
@@ -785,23 +834,16 @@ static struct page *compaction_alloc(struct page *migratepage,
785} 834}
786 835
787/* 836/*
788 * We cannot control nr_migratepages and nr_freepages fully when migration is 837 * This is a migrate-callback that "frees" freepages back to the isolated
789 * running as migrate_pages() has no knowledge of compact_control. When 838 * freelist. All pages on the freelist are from the same zone, so there is no
790 * migration is complete, we count the number of pages on the lists by hand. 839 * special handling needed for NUMA.
791 */ 840 */
792static void update_nr_listpages(struct compact_control *cc) 841static void compaction_free(struct page *page, unsigned long data)
793{ 842{
794 int nr_migratepages = 0; 843 struct compact_control *cc = (struct compact_control *)data;
795 int nr_freepages = 0;
796 struct page *page;
797
798 list_for_each_entry(page, &cc->migratepages, lru)
799 nr_migratepages++;
800 list_for_each_entry(page, &cc->freepages, lru)
801 nr_freepages++;
802 844
803 cc->nr_migratepages = nr_migratepages; 845 list_add(&page->lru, &cc->freepages);
804 cc->nr_freepages = nr_freepages; 846 cc->nr_freepages++;
805} 847}
806 848
807/* possible outcome of isolate_migratepages */ 849/* possible outcome of isolate_migratepages */
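compaction_free() above is the other half of the new migrate_pages() callback pair: pages that compaction_alloc() handed out but migration never consumed go straight back to cc->freepages, so the counters stay exact and update_nr_listpages() can be dropped. A minimal userspace analogue of the alloc/free pair over a toy pool, for illustration only:

#include <stdio.h>

/* Toy pool standing in for cc->freepages / cc->nr_freepages. */
struct pool { int free[8]; int nr_free; };

/* Analogue of compaction_alloc(): hand out one isolated free "page". */
static int pool_alloc(struct pool *p)
{
    if (!p->nr_free)
        return -1;
    return p->free[--p->nr_free];
}

/* Analogue of compaction_free(): a migrate callback returning an unused
 * "page" to the pool, so the count stays exact without recounting lists. */
static void pool_free(struct pool *p, int page)
{
    p->free[p->nr_free++] = page;
}

int main(void)
{
    struct pool p = { .free = { 10, 11, 12 }, .nr_free = 3 };
    int page = pool_alloc(&p);

    pool_free(&p, page);             /* migration did not need it after all */
    printf("nr_free back to %d\n", p.nr_free);
    return 0;
}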
@@ -848,11 +890,16 @@ static int compact_finished(struct zone *zone,
848 unsigned int order; 890 unsigned int order;
849 unsigned long watermark; 891 unsigned long watermark;
850 892
851 if (fatal_signal_pending(current)) 893 if (cc->contended || fatal_signal_pending(current))
852 return COMPACT_PARTIAL; 894 return COMPACT_PARTIAL;
853 895
854 /* Compaction run completes if the migrate and free scanner meet */ 896 /* Compaction run completes if the migrate and free scanner meet */
855 if (cc->free_pfn <= cc->migrate_pfn) { 897 if (cc->free_pfn <= cc->migrate_pfn) {
898 /* Let the next compaction start anew. */
899 zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
900 zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
901 zone->compact_cached_free_pfn = zone_end_pfn(zone);
902
856 /* 903 /*
857 * Mark that the PG_migrate_skip information should be cleared 904 * Mark that the PG_migrate_skip information should be cleared
858 * by kswapd when it goes to sleep. kswapd does not set the 905 * by kswapd when it goes to sleep. kswapd does not set the
@@ -950,6 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
950 int ret; 997 int ret;
951 unsigned long start_pfn = zone->zone_start_pfn; 998 unsigned long start_pfn = zone->zone_start_pfn;
952 unsigned long end_pfn = zone_end_pfn(zone); 999 unsigned long end_pfn = zone_end_pfn(zone);
1000 const bool sync = cc->mode != MIGRATE_ASYNC;
953 1001
954 ret = compaction_suitable(zone, cc->order); 1002 ret = compaction_suitable(zone, cc->order);
955 switch (ret) { 1003 switch (ret) {
@@ -975,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
975 * information on where the scanners should start but check that it 1023 * information on where the scanners should start but check that it
976 * is initialised by ensuring the values are within zone boundaries. 1024 * is initialised by ensuring the values are within zone boundaries.
977 */ 1025 */
978 cc->migrate_pfn = zone->compact_cached_migrate_pfn; 1026 cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
979 cc->free_pfn = zone->compact_cached_free_pfn; 1027 cc->free_pfn = zone->compact_cached_free_pfn;
980 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { 1028 if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
981 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); 1029 cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
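With cc->mode replacing cc->sync, each zone now caches two migrate-scanner positions, and the hunk above selects one by indexing with a bool derived from the mode, so async and sync runs resume independently. A small illustration of that indexing, with a toy structure standing in for struct zone:

#include <stdbool.h>
#include <stdio.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

struct toy_zone {
    unsigned long cached_migrate_pfn[2];   /* [0] = async, [1] = sync */
};

/* A C bool used as an index is guaranteed to be 0 or 1, which is what the
 * compact_cached_migrate_pfn[sync] lookup in the hunk relies on. */
static unsigned long resume_pfn(const struct toy_zone *z, enum migrate_mode mode)
{
    const bool sync = mode != MIGRATE_ASYNC;
    return z->cached_migrate_pfn[sync];
}

int main(void)
{
    struct toy_zone z = { .cached_migrate_pfn = { 0x2000, 0x3000 } };

    printf("async resumes at %#lx\n", resume_pfn(&z, MIGRATE_ASYNC));
    printf("sync resumes at  %#lx\n", resume_pfn(&z, MIGRATE_SYNC_LIGHT));
    return 0;
}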
@@ -983,13 +1031,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
983 } 1031 }
984 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { 1032 if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
985 cc->migrate_pfn = start_pfn; 1033 cc->migrate_pfn = start_pfn;
986 zone->compact_cached_migrate_pfn = cc->migrate_pfn; 1034 zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1035 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
987 } 1036 }
988 1037
1038 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
1039
989 migrate_prep_local(); 1040 migrate_prep_local();
990 1041
991 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 1042 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
992 unsigned long nr_migrate, nr_remaining;
993 int err; 1043 int err;
994 1044
995 switch (isolate_migratepages(zone, cc)) { 1045 switch (isolate_migratepages(zone, cc)) {
@@ -1004,21 +1054,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1004 ; 1054 ;
1005 } 1055 }
1006 1056
1007 nr_migrate = cc->nr_migratepages; 1057 if (!cc->nr_migratepages)
1058 continue;
1059
1008 err = migrate_pages(&cc->migratepages, compaction_alloc, 1060 err = migrate_pages(&cc->migratepages, compaction_alloc,
1009 (unsigned long)cc, 1061 compaction_free, (unsigned long)cc, cc->mode,
1010 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
1011 MR_COMPACTION); 1062 MR_COMPACTION);
1012 update_nr_listpages(cc);
1013 nr_remaining = cc->nr_migratepages;
1014 1063
1015 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 1064 trace_mm_compaction_migratepages(cc->nr_migratepages, err,
1016 nr_remaining); 1065 &cc->migratepages);
1017 1066
1018 /* Release isolated pages not migrated */ 1067 /* All pages were either migrated or will be released */
1068 cc->nr_migratepages = 0;
1019 if (err) { 1069 if (err) {
1020 putback_movable_pages(&cc->migratepages); 1070 putback_movable_pages(&cc->migratepages);
1021 cc->nr_migratepages = 0;
1022 /* 1071 /*
1023 * migrate_pages() may return -ENOMEM when scanners meet 1072 * migrate_pages() may return -ENOMEM when scanners meet
1024 * and we want compact_finished() to detect it 1073 * and we want compact_finished() to detect it
@@ -1035,12 +1084,13 @@ out:
1035 cc->nr_freepages -= release_freepages(&cc->freepages); 1084 cc->nr_freepages -= release_freepages(&cc->freepages);
1036 VM_BUG_ON(cc->nr_freepages != 0); 1085 VM_BUG_ON(cc->nr_freepages != 0);
1037 1086
1087 trace_mm_compaction_end(ret);
1088
1038 return ret; 1089 return ret;
1039} 1090}
1040 1091
1041static unsigned long compact_zone_order(struct zone *zone, 1092static unsigned long compact_zone_order(struct zone *zone, int order,
1042 int order, gfp_t gfp_mask, 1093 gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
1043 bool sync, bool *contended)
1044{ 1094{
1045 unsigned long ret; 1095 unsigned long ret;
1046 struct compact_control cc = { 1096 struct compact_control cc = {
@@ -1049,7 +1099,7 @@ static unsigned long compact_zone_order(struct zone *zone,
1049 .order = order, 1099 .order = order,
1050 .migratetype = allocflags_to_migratetype(gfp_mask), 1100 .migratetype = allocflags_to_migratetype(gfp_mask),
1051 .zone = zone, 1101 .zone = zone,
1052 .sync = sync, 1102 .mode = mode,
1053 }; 1103 };
1054 INIT_LIST_HEAD(&cc.freepages); 1104 INIT_LIST_HEAD(&cc.freepages);
1055 INIT_LIST_HEAD(&cc.migratepages); 1105 INIT_LIST_HEAD(&cc.migratepages);
@@ -1071,7 +1121,7 @@ int sysctl_extfrag_threshold = 500;
1071 * @order: The order of the current allocation 1121 * @order: The order of the current allocation
1072 * @gfp_mask: The GFP mask of the current allocation 1122 * @gfp_mask: The GFP mask of the current allocation
1073 * @nodemask: The allowed nodes to allocate from 1123 * @nodemask: The allowed nodes to allocate from
1074 * @sync: Whether migration is synchronous or not 1124 * @mode: The migration mode for async, sync light, or sync migration
1075 * @contended: Return value that is true if compaction was aborted due to lock contention 1125 * @contended: Return value that is true if compaction was aborted due to lock contention
1076 * @page: Optionally capture a free page of the requested order during compaction 1126 * @page: Optionally capture a free page of the requested order during compaction
1077 * 1127 *
@@ -1079,7 +1129,7 @@ int sysctl_extfrag_threshold = 500;
1079 */ 1129 */
1080unsigned long try_to_compact_pages(struct zonelist *zonelist, 1130unsigned long try_to_compact_pages(struct zonelist *zonelist,
1081 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1131 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1082 bool sync, bool *contended) 1132 enum migrate_mode mode, bool *contended)
1083{ 1133{
1084 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1134 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1085 int may_enter_fs = gfp_mask & __GFP_FS; 1135 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1104,7 +1154,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1104 nodemask) { 1154 nodemask) {
1105 int status; 1155 int status;
1106 1156
1107 status = compact_zone_order(zone, order, gfp_mask, sync, 1157 status = compact_zone_order(zone, order, gfp_mask, mode,
1108 contended); 1158 contended);
1109 rc = max(status, rc); 1159 rc = max(status, rc);
1110 1160
@@ -1140,13 +1190,9 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1140 compact_zone(zone, cc); 1190 compact_zone(zone, cc);
1141 1191
1142 if (cc->order > 0) { 1192 if (cc->order > 0) {
1143 int ok = zone_watermark_ok(zone, cc->order, 1193 if (zone_watermark_ok(zone, cc->order,
1144 low_wmark_pages(zone), 0, 0); 1194 low_wmark_pages(zone), 0, 0))
1145 if (ok && cc->order >= zone->compact_order_failed) 1195 compaction_defer_reset(zone, cc->order, false);
1146 zone->compact_order_failed = cc->order + 1;
1147 /* Currently async compaction is never deferred. */
1148 else if (!ok && cc->sync)
1149 defer_compaction(zone, cc->order);
1150 } 1196 }
1151 1197
1152 VM_BUG_ON(!list_empty(&cc->freepages)); 1198 VM_BUG_ON(!list_empty(&cc->freepages));
@@ -1158,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order)
1158{ 1204{
1159 struct compact_control cc = { 1205 struct compact_control cc = {
1160 .order = order, 1206 .order = order,
1161 .sync = false, 1207 .mode = MIGRATE_ASYNC,
1162 }; 1208 };
1163 1209
1164 if (!order) 1210 if (!order)
@@ -1171,7 +1217,8 @@ static void compact_node(int nid)
1171{ 1217{
1172 struct compact_control cc = { 1218 struct compact_control cc = {
1173 .order = -1, 1219 .order = -1,
1174 .sync = true, 1220 .mode = MIGRATE_SYNC,
1221 .ignore_skip_hint = true,
1175 }; 1222 };
1176 1223
1177 __compact_pgdat(NODE_DATA(nid), &cc); 1224 __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index ae4846ff4849..b012daefc2d7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping)
192{ 192{
193 int ret = 0; 193 int ret = 0;
194 /* Check for outstanding write errors */ 194 /* Check for outstanding write errors */
195 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) 195 if (test_bit(AS_ENOSPC, &mapping->flags) &&
196 test_and_clear_bit(AS_ENOSPC, &mapping->flags))
196 ret = -ENOSPC; 197 ret = -ENOSPC;
197 if (test_and_clear_bit(AS_EIO, &mapping->flags)) 198 if (test_bit(AS_EIO, &mapping->flags) &&
199 test_and_clear_bit(AS_EIO, &mapping->flags))
198 ret = -EIO; 200 ret = -EIO;
199 return ret; 201 return ret;
200} 202}
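The filemap_check_errors() change above is the recurring check-before-atomic pattern in this series: a plain test_bit() read filters the common clear case, and the locked test_and_clear_bit() only runs when the flag is actually set. A standalone analogue using C11 atomics, assuming a single flags word; the helper name is made up:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Peek with a plain load first, and only pay for the atomic
 * read-modify-write when the flag is actually set. */
static bool check_and_clear(atomic_uint *flags, unsigned int bit)
{
    if (!(atomic_load_explicit(flags, memory_order_relaxed) & bit))
        return false;                        /* cheap path, no RMW */
    return atomic_fetch_and(flags, ~bit) & bit;
}

int main(void)
{
    atomic_uint flags = 0;

    printf("clear flag: %d\n", check_and_clear(&flags, 1u << 0));
    atomic_fetch_or(&flags, 1u << 0);
    printf("set flag:   %d\n", check_and_clear(&flags, 1u << 0));
    printf("again:      %d\n", check_and_clear(&flags, 1u << 0));
    return 0;
}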
@@ -446,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
446} 448}
447EXPORT_SYMBOL_GPL(replace_page_cache_page); 449EXPORT_SYMBOL_GPL(replace_page_cache_page);
448 450
451static int page_cache_tree_insert(struct address_space *mapping,
452 struct page *page)
453{
454 void **slot;
455 int error;
456
457 slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
458 if (slot) {
459 void *p;
460
461 p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
462 if (!radix_tree_exceptional_entry(p))
463 return -EEXIST;
464 radix_tree_replace_slot(slot, page);
465 mapping->nrpages++;
466 return 0;
467 }
468 error = radix_tree_insert(&mapping->page_tree, page->index, page);
469 if (!error)
470 mapping->nrpages++;
471 return error;
472}
473
449/** 474/**
450 * add_to_page_cache_locked - add a locked page to the pagecache 475 * add_to_page_cache_locked - add a locked page to the pagecache
451 * @page: page to add 476 * @page: page to add
@@ -480,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
480 page->index = offset; 505 page->index = offset;
481 506
482 spin_lock_irq(&mapping->tree_lock); 507 spin_lock_irq(&mapping->tree_lock);
483 error = radix_tree_insert(&mapping->page_tree, offset, page); 508 error = page_cache_tree_insert(mapping, page);
484 radix_tree_preload_end(); 509 radix_tree_preload_end();
485 if (unlikely(error)) 510 if (unlikely(error))
486 goto err_insert; 511 goto err_insert;
487 mapping->nrpages++;
488 __inc_zone_page_state(page, NR_FILE_PAGES); 512 __inc_zone_page_state(page, NR_FILE_PAGES);
489 spin_unlock_irq(&mapping->tree_lock); 513 spin_unlock_irq(&mapping->tree_lock);
490 trace_mm_filemap_add_to_page_cache(page); 514 trace_mm_filemap_add_to_page_cache(page);
@@ -520,10 +544,10 @@ struct page *__page_cache_alloc(gfp_t gfp)
520 if (cpuset_do_page_mem_spread()) { 544 if (cpuset_do_page_mem_spread()) {
521 unsigned int cpuset_mems_cookie; 545 unsigned int cpuset_mems_cookie;
522 do { 546 do {
523 cpuset_mems_cookie = get_mems_allowed(); 547 cpuset_mems_cookie = read_mems_allowed_begin();
524 n = cpuset_mem_spread_node(); 548 n = cpuset_mem_spread_node();
525 page = alloc_pages_exact_node(n, gfp, 0); 549 page = alloc_pages_exact_node(n, gfp, 0);
526 } while (!put_mems_allowed(cpuset_mems_cookie) && !page); 550 } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
527 551
528 return page; 552 return page;
529 } 553 }
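read_mems_allowed_begin()/read_mems_allowed_retry() above follow a seqcount-style protocol: snapshot a cookie, attempt the allocation, and retry only if the attempt failed and the cookie changed in the meantime. A simplified, single-threaded sketch of that retry loop; the counter and helper names are illustrative, not the kernel's:

#include <stdio.h>

static unsigned int mems_seq;                /* bumped by "writers" */

static unsigned int snapshot_begin(void) { return mems_seq; }
static int snapshot_retry(unsigned int cookie) { return cookie != mems_seq; }

static void *try_alloc(int *attempts)
{
    /* Pretend the first attempt fails because the allowed set changed. */
    if ((*attempts)++ == 0) {
        mems_seq++;
        return NULL;
    }
    static int fake_page;
    return &fake_page;
}

int main(void)
{
    unsigned int cookie;
    void *page;
    int attempts = 0;

    do {
        cookie = snapshot_begin();
        page = try_alloc(&attempts);
    } while (!page && snapshot_retry(cookie));

    printf("allocated after %d attempt(s)\n", attempts);
    return 0;
}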
@@ -620,8 +644,17 @@ EXPORT_SYMBOL(unlock_page);
620 */ 644 */
621void end_page_writeback(struct page *page) 645void end_page_writeback(struct page *page)
622{ 646{
623 if (TestClearPageReclaim(page)) 647 /*
648 * TestClearPageReclaim could be used here but it is an atomic
649 * operation and overkill in this particular case. Failing to
650 * shuffle a page marked for immediate reclaim is too mild to
651 * justify taking an atomic operation penalty at the end of
 652 * every page writeback.
653 */
654 if (PageReclaim(page)) {
655 ClearPageReclaim(page);
624 rotate_reclaimable_page(page); 656 rotate_reclaimable_page(page);
657 }
625 658
626 if (!test_clear_page_writeback(page)) 659 if (!test_clear_page_writeback(page))
627 BUG(); 660 BUG();
@@ -686,14 +719,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
686} 719}
687 720
688/** 721/**
689 * find_get_page - find and get a page reference 722 * page_cache_next_hole - find the next hole (not-present entry)
723 * @mapping: mapping
724 * @index: index
725 * @max_scan: maximum range to search
726 *
727 * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
728 * lowest indexed hole.
729 *
730 * Returns: the index of the hole if found, otherwise returns an index
731 * outside of the set specified (in which case 'return - index >=
732 * max_scan' will be true). In rare cases of index wrap-around, 0 will
733 * be returned.
734 *
735 * page_cache_next_hole may be called under rcu_read_lock. However,
736 * like radix_tree_gang_lookup, this will not atomically search a
737 * snapshot of the tree at a single point in time. For example, if a
738 * hole is created at index 5, then subsequently a hole is created at
739 * index 10, page_cache_next_hole covering both indexes may return 10
740 * if called under rcu_read_lock.
741 */
742pgoff_t page_cache_next_hole(struct address_space *mapping,
743 pgoff_t index, unsigned long max_scan)
744{
745 unsigned long i;
746
747 for (i = 0; i < max_scan; i++) {
748 struct page *page;
749
750 page = radix_tree_lookup(&mapping->page_tree, index);
751 if (!page || radix_tree_exceptional_entry(page))
752 break;
753 index++;
754 if (index == 0)
755 break;
756 }
757
758 return index;
759}
760EXPORT_SYMBOL(page_cache_next_hole);
761
762/**
763 * page_cache_prev_hole - find the prev hole (not-present entry)
764 * @mapping: mapping
765 * @index: index
766 * @max_scan: maximum range to search
767 *
768 * Search backwards in the range [max(index-max_scan+1, 0), index] for
769 * the first hole.
770 *
771 * Returns: the index of the hole if found, otherwise returns an index
772 * outside of the set specified (in which case 'index - return >=
773 * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
774 * will be returned.
775 *
776 * page_cache_prev_hole may be called under rcu_read_lock. However,
777 * like radix_tree_gang_lookup, this will not atomically search a
778 * snapshot of the tree at a single point in time. For example, if a
779 * hole is created at index 10, then subsequently a hole is created at
780 * index 5, page_cache_prev_hole covering both indexes may return 5 if
781 * called under rcu_read_lock.
782 */
783pgoff_t page_cache_prev_hole(struct address_space *mapping,
784 pgoff_t index, unsigned long max_scan)
785{
786 unsigned long i;
787
788 for (i = 0; i < max_scan; i++) {
789 struct page *page;
790
791 page = radix_tree_lookup(&mapping->page_tree, index);
792 if (!page || radix_tree_exceptional_entry(page))
793 break;
794 index--;
795 if (index == ULONG_MAX)
796 break;
797 }
798
799 return index;
800}
801EXPORT_SYMBOL(page_cache_prev_hole);
802
803/**
804 * find_get_entry - find and get a page cache entry
690 * @mapping: the address_space to search 805 * @mapping: the address_space to search
691 * @offset: the page index 806 * @offset: the page cache index
807 *
808 * Looks up the page cache slot at @mapping & @offset. If there is a
809 * page cache page, it is returned with an increased refcount.
692 * 810 *
693 * Is there a pagecache struct page at the given (mapping, offset) tuple? 811 * If the slot holds a shadow entry of a previously evicted page, it
694 * If yes, increment its refcount and return it; if no, return NULL. 812 * is returned.
813 *
814 * Otherwise, %NULL is returned.
695 */ 815 */
696struct page *find_get_page(struct address_space *mapping, pgoff_t offset) 816struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
697{ 817{
698 void **pagep; 818 void **pagep;
699 struct page *page; 819 struct page *page;
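page_cache_next_hole() and page_cache_prev_hole() above replace the radix-tree hole helpers removed earlier in this diff, treating shadow (exceptional) entries as holes and stopping on index wrap-around. A self-contained sketch of the forward scan over a plain pointer array, ignoring shadow entries for brevity:

#include <stdio.h>

/* Toy analogue of page_cache_next_hole(): starting at index, scan at most
 * max_scan slots for the first empty one, stopping if the index wraps. */
static unsigned long next_hole(void **slots, unsigned long nslots,
                               unsigned long index, unsigned long max_scan)
{
    for (unsigned long i = 0; i < max_scan; i++) {
        if (index >= nslots || !slots[index])
            break;
        index++;
        if (index == 0)              /* wrapped around */
            break;
    }
    return index;                    /* outside the set if no hole found */
}

int main(void)
{
    int a, b;
    void *slots[6] = { &a, &b, NULL, &a, NULL, NULL };

    printf("first hole from 0: %lu\n", next_hole(slots, 6, 0, 6));
    printf("first hole from 3: %lu\n", next_hole(slots, 6, 3, 6));
    return 0;
}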
@@ -734,24 +854,30 @@ out:
734 854
735 return page; 855 return page;
736} 856}
737EXPORT_SYMBOL(find_get_page); 857EXPORT_SYMBOL(find_get_entry);
738 858
739/** 859/**
740 * find_lock_page - locate, pin and lock a pagecache page 860 * find_lock_entry - locate, pin and lock a page cache entry
741 * @mapping: the address_space to search 861 * @mapping: the address_space to search
742 * @offset: the page index 862 * @offset: the page cache index
863 *
864 * Looks up the page cache slot at @mapping & @offset. If there is a
865 * page cache page, it is returned locked and with an increased
866 * refcount.
743 * 867 *
744 * Locates the desired pagecache page, locks it, increments its reference 868 * If the slot holds a shadow entry of a previously evicted page, it
745 * count and returns its address. 869 * is returned.
746 * 870 *
747 * Returns zero if the page was not present. find_lock_page() may sleep. 871 * Otherwise, %NULL is returned.
872 *
873 * find_lock_entry() may sleep.
748 */ 874 */
749struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) 875struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
750{ 876{
751 struct page *page; 877 struct page *page;
752 878
753repeat: 879repeat:
754 page = find_get_page(mapping, offset); 880 page = find_get_entry(mapping, offset);
755 if (page && !radix_tree_exception(page)) { 881 if (page && !radix_tree_exception(page)) {
756 lock_page(page); 882 lock_page(page);
757 /* Has the page been truncated? */ 883 /* Has the page been truncated? */
@@ -764,44 +890,87 @@ repeat:
764 } 890 }
765 return page; 891 return page;
766} 892}
767EXPORT_SYMBOL(find_lock_page); 893EXPORT_SYMBOL(find_lock_entry);
768 894
769/** 895/**
770 * find_or_create_page - locate or add a pagecache page 896 * pagecache_get_page - find and get a page reference
771 * @mapping: the page's address_space 897 * @mapping: the address_space to search
772 * @index: the page's index into the mapping 898 * @offset: the page index
773 * @gfp_mask: page allocation mode 899 * @fgp_flags: PCG flags
900 * @gfp_mask: gfp mask to use if a page is to be allocated
901 *
902 * Looks up the page cache slot at @mapping & @offset.
903 *
904 * PCG flags modify how the page is returned
774 * 905 *
775 * Locates a page in the pagecache. If the page is not present, a new page 906 * FGP_ACCESSED: the page will be marked accessed
 776 * is allocated using @gfp_mask and is added to the pagecache and to the VM's 907 * FGP_LOCK: Page is returned locked
777 * LRU list. The returned page is locked and has its reference count 908 * FGP_CREAT: If page is not present then a new page is allocated using
778 * incremented. 909 * @gfp_mask and added to the page cache and the VM's LRU
910 * list. The page is returned locked and with an increased
911 * refcount. Otherwise, %NULL is returned.
779 * 912 *
780 * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic 913 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
781 * allocation! 914 * if the GFP flags specified for FGP_CREAT are atomic.
782 * 915 *
783 * find_or_create_page() returns the desired page's address, or zero on 916 * If there is a page cache page, it is returned with an increased refcount.
784 * memory exhaustion.
785 */ 917 */
786struct page *find_or_create_page(struct address_space *mapping, 918struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
787 pgoff_t index, gfp_t gfp_mask) 919 int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
788{ 920{
789 struct page *page; 921 struct page *page;
790 int err; 922
791repeat: 923repeat:
792 page = find_lock_page(mapping, index); 924 page = find_get_entry(mapping, offset);
793 if (!page) { 925 if (radix_tree_exceptional_entry(page))
794 page = __page_cache_alloc(gfp_mask); 926 page = NULL;
927 if (!page)
928 goto no_page;
929
930 if (fgp_flags & FGP_LOCK) {
931 if (fgp_flags & FGP_NOWAIT) {
932 if (!trylock_page(page)) {
933 page_cache_release(page);
934 return NULL;
935 }
936 } else {
937 lock_page(page);
938 }
939
940 /* Has the page been truncated? */
941 if (unlikely(page->mapping != mapping)) {
942 unlock_page(page);
943 page_cache_release(page);
944 goto repeat;
945 }
946 VM_BUG_ON(page->index != offset);
947 }
948
949 if (page && (fgp_flags & FGP_ACCESSED))
950 mark_page_accessed(page);
951
952no_page:
953 if (!page && (fgp_flags & FGP_CREAT)) {
954 int err;
955 if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
956 cache_gfp_mask |= __GFP_WRITE;
957 if (fgp_flags & FGP_NOFS) {
958 cache_gfp_mask &= ~__GFP_FS;
959 radix_gfp_mask &= ~__GFP_FS;
960 }
961
962 page = __page_cache_alloc(cache_gfp_mask);
795 if (!page) 963 if (!page)
796 return NULL; 964 return NULL;
797 /* 965
798 * We want a regular kernel memory (not highmem or DMA etc) 966 if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
799 * allocation for the radix tree nodes, but we need to honour 967 fgp_flags |= FGP_LOCK;
800 * the context-specific requirements the caller has asked for. 968
 801 * GFP_RECLAIM_MASK collects those requirements. 969 /* Init accessed so avoid atomic mark_page_accessed later */
802 */ 970 if (fgp_flags & FGP_ACCESSED)
803 err = add_to_page_cache_lru(page, mapping, index, 971 init_page_accessed(page);
804 (gfp_mask & GFP_RECLAIM_MASK)); 972
973 err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
805 if (unlikely(err)) { 974 if (unlikely(err)) {
806 page_cache_release(page); 975 page_cache_release(page);
807 page = NULL; 976 page = NULL;
@@ -809,9 +978,80 @@ repeat:
809 goto repeat; 978 goto repeat;
810 } 979 }
811 } 980 }
981
812 return page; 982 return page;
813} 983}
814EXPORT_SYMBOL(find_or_create_page); 984EXPORT_SYMBOL(pagecache_get_page);
985
986/**
987 * find_get_entries - gang pagecache lookup
988 * @mapping: The address_space to search
989 * @start: The starting page cache index
990 * @nr_entries: The maximum number of entries
991 * @entries: Where the resulting entries are placed
992 * @indices: The cache indices corresponding to the entries in @entries
993 *
994 * find_get_entries() will search for and return a group of up to
995 * @nr_entries entries in the mapping. The entries are placed at
996 * @entries. find_get_entries() takes a reference against any actual
997 * pages it returns.
998 *
999 * The search returns a group of mapping-contiguous page cache entries
1000 * with ascending indexes. There may be holes in the indices due to
1001 * not-present pages.
1002 *
1003 * Any shadow entries of evicted pages are included in the returned
1004 * array.
1005 *
1006 * find_get_entries() returns the number of pages and shadow entries
1007 * which were found.
1008 */
1009unsigned find_get_entries(struct address_space *mapping,
1010 pgoff_t start, unsigned int nr_entries,
1011 struct page **entries, pgoff_t *indices)
1012{
1013 void **slot;
1014 unsigned int ret = 0;
1015 struct radix_tree_iter iter;
1016
1017 if (!nr_entries)
1018 return 0;
1019
1020 rcu_read_lock();
1021restart:
1022 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1023 struct page *page;
1024repeat:
1025 page = radix_tree_deref_slot(slot);
1026 if (unlikely(!page))
1027 continue;
1028 if (radix_tree_exception(page)) {
1029 if (radix_tree_deref_retry(page))
1030 goto restart;
1031 /*
1032 * Otherwise, we must be storing a swap entry
1033 * here as an exceptional entry: so return it
1034 * without attempting to raise page count.
1035 */
1036 goto export;
1037 }
1038 if (!page_cache_get_speculative(page))
1039 goto repeat;
1040
1041 /* Has the page moved? */
1042 if (unlikely(page != *slot)) {
1043 page_cache_release(page);
1044 goto repeat;
1045 }
1046export:
1047 indices[ret] = iter.index;
1048 entries[ret] = page;
1049 if (++ret == nr_entries)
1050 break;
1051 }
1052 rcu_read_unlock();
1053 return ret;
1054}
815 1055
816/** 1056/**
817 * find_get_pages - gang pagecache lookup 1057 * find_get_pages - gang pagecache lookup
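pagecache_get_page(), completed in the hunks above, folds find_get_page()/find_lock_page()/find_or_create_page() into one helper driven by FGP_* flags: FGP_LOCK returns the page locked, FGP_ACCESSED marks it accessed, and FGP_CREAT allocates and inserts it when missing. The toy lookup-or-create table below mirrors that flag-driven flow; the FGP_LOCK/FGP_CREAT values and every structure here are illustrative, not the kernel's definitions:

#include <stdio.h>
#include <stdlib.h>

#define FGP_LOCK  0x1                /* made-up values for the demo */
#define FGP_CREAT 0x2

struct tpage { int locked; long index; };

/* Toy analogue of the flag-driven lookup: find a slot, optionally "lock"
 * it, and optionally create it when missing. */
static struct tpage *get_page(struct tpage **slots, long index, int fgp_flags)
{
    struct tpage *page = slots[index];

    if (!page && (fgp_flags & FGP_CREAT)) {
        page = calloc(1, sizeof(*page));
        if (!page)
            return NULL;
        page->index = index;
        slots[index] = page;
    }
    if (page && (fgp_flags & FGP_LOCK))
        page->locked = 1;            /* stand-in for lock_page() */
    return page;
}

int main(void)
{
    struct tpage *slots[4] = { 0 };
    struct tpage *p = get_page(slots, 2, FGP_LOCK | FGP_CREAT);

    printf("index=%ld locked=%d\n", p->index, p->locked);
    printf("plain lookup of hole: %p\n", (void *)get_page(slots, 3, 0));
    free(p);
    return 0;
}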
@@ -1031,39 +1271,6 @@ repeat:
1031} 1271}
1032EXPORT_SYMBOL(find_get_pages_tag); 1272EXPORT_SYMBOL(find_get_pages_tag);
1033 1273
1034/**
1035 * grab_cache_page_nowait - returns locked page at given index in given cache
1036 * @mapping: target address_space
1037 * @index: the page index
1038 *
1039 * Same as grab_cache_page(), but do not wait if the page is unavailable.
1040 * This is intended for speculative data generators, where the data can
1041 * be regenerated if the page couldn't be grabbed. This routine should
1042 * be safe to call while holding the lock for another page.
1043 *
1044 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
1045 * and deadlock against the caller's locked page.
1046 */
1047struct page *
1048grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
1049{
1050 struct page *page = find_get_page(mapping, index);
1051
1052 if (page) {
1053 if (trylock_page(page))
1054 return page;
1055 page_cache_release(page);
1056 return NULL;
1057 }
1058 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
1059 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
1060 page_cache_release(page);
1061 page = NULL;
1062 }
1063 return page;
1064}
1065EXPORT_SYMBOL(grab_cache_page_nowait);
1066
1067/* 1274/*
1068 * CD/DVDs are error prone. When a medium error occurs, the driver may fail 1275 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
1069 * a _large_ part of the i/o request. Imagine the worst scenario: 1276 * a _large_ part of the i/o request. Imagine the worst scenario:
@@ -1797,6 +2004,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1797EXPORT_SYMBOL(generic_file_mmap); 2004EXPORT_SYMBOL(generic_file_mmap);
1798EXPORT_SYMBOL(generic_file_readonly_mmap); 2005EXPORT_SYMBOL(generic_file_readonly_mmap);
1799 2006
2007static struct page *wait_on_page_read(struct page *page)
2008{
2009 if (!IS_ERR(page)) {
2010 wait_on_page_locked(page);
2011 if (!PageUptodate(page)) {
2012 page_cache_release(page);
2013 page = ERR_PTR(-EIO);
2014 }
2015 }
2016 return page;
2017}
2018
1800static struct page *__read_cache_page(struct address_space *mapping, 2019static struct page *__read_cache_page(struct address_space *mapping,
1801 pgoff_t index, 2020 pgoff_t index,
1802 int (*filler)(void *, struct page *), 2021 int (*filler)(void *, struct page *),
@@ -1823,6 +2042,8 @@ repeat:
1823 if (err < 0) { 2042 if (err < 0) {
1824 page_cache_release(page); 2043 page_cache_release(page);
1825 page = ERR_PTR(err); 2044 page = ERR_PTR(err);
2045 } else {
2046 page = wait_on_page_read(page);
1826 } 2047 }
1827 } 2048 }
1828 return page; 2049 return page;
@@ -1859,6 +2080,10 @@ retry:
1859 if (err < 0) { 2080 if (err < 0) {
1860 page_cache_release(page); 2081 page_cache_release(page);
1861 return ERR_PTR(err); 2082 return ERR_PTR(err);
2083 } else {
2084 page = wait_on_page_read(page);
2085 if (IS_ERR(page))
2086 return page;
1862 } 2087 }
1863out: 2088out:
1864 mark_page_accessed(page); 2089 mark_page_accessed(page);
@@ -1866,40 +2091,25 @@ out:
1866} 2091}
1867 2092
1868/** 2093/**
1869 * read_cache_page_async - read into page cache, fill it if needed 2094 * read_cache_page - read into page cache, fill it if needed
1870 * @mapping: the page's address_space 2095 * @mapping: the page's address_space
1871 * @index: the page index 2096 * @index: the page index
1872 * @filler: function to perform the read 2097 * @filler: function to perform the read
1873 * @data: first arg to filler(data, page) function, often left as NULL 2098 * @data: first arg to filler(data, page) function, often left as NULL
1874 * 2099 *
1875 * Same as read_cache_page, but don't wait for page to become unlocked
1876 * after submitting it to the filler.
1877 *
1878 * Read into the page cache. If a page already exists, and PageUptodate() is 2100 * Read into the page cache. If a page already exists, and PageUptodate() is
1879 * not set, try to fill the page but don't wait for it to become unlocked. 2101 * not set, try to fill the page and wait for it to become unlocked.
1880 * 2102 *
1881 * If the page does not get brought uptodate, return -EIO. 2103 * If the page does not get brought uptodate, return -EIO.
1882 */ 2104 */
1883struct page *read_cache_page_async(struct address_space *mapping, 2105struct page *read_cache_page(struct address_space *mapping,
1884 pgoff_t index, 2106 pgoff_t index,
1885 int (*filler)(void *, struct page *), 2107 int (*filler)(void *, struct page *),
1886 void *data) 2108 void *data)
1887{ 2109{
1888 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 2110 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
1889} 2111}
1890EXPORT_SYMBOL(read_cache_page_async); 2112EXPORT_SYMBOL(read_cache_page);
1891
1892static struct page *wait_on_page_read(struct page *page)
1893{
1894 if (!IS_ERR(page)) {
1895 wait_on_page_locked(page);
1896 if (!PageUptodate(page)) {
1897 page_cache_release(page);
1898 page = ERR_PTR(-EIO);
1899 }
1900 }
1901 return page;
1902}
1903 2113
1904/** 2114/**
1905 * read_cache_page_gfp - read into page cache, using specified page allocation flags. 2115 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
@@ -1918,31 +2128,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
1918{ 2128{
1919 filler_t *filler = (filler_t *)mapping->a_ops->readpage; 2129 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
1920 2130
1921 return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); 2131 return do_read_cache_page(mapping, index, filler, NULL, gfp);
1922} 2132}
1923EXPORT_SYMBOL(read_cache_page_gfp); 2133EXPORT_SYMBOL(read_cache_page_gfp);
1924 2134
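
A hedged caller sketch (not from this patch; example_get_page is an invented name): after this rename, read_cache_page() itself waits for the read, so callers get back either an uptodate page or an ERR_PTR(). The filler cast mirrors the one read_cache_page_gfp() uses just above.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/pagemap.h>

static struct page *example_get_page(struct address_space *mapping,
				     pgoff_t index)
{
	struct page *page;

	/* Use the mapping's own ->readpage as the filler, as
	 * read_cache_page_gfp() does above. */
	page = read_cache_page(mapping, index,
			       (filler_t *)mapping->a_ops->readpage, NULL);
	if (IS_ERR(page))
		return page;	/* typically ERR_PTR(-EIO) from a failed read */

	/* The page is already uptodate here; no extra wait is needed. */
	return page;
}
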
1925/**
1926 * read_cache_page - read into page cache, fill it if needed
1927 * @mapping: the page's address_space
1928 * @index: the page index
1929 * @filler: function to perform the read
1930 * @data: first arg to filler(data, page) function, often left as NULL
1931 *
1932 * Read into the page cache. If a page already exists, and PageUptodate() is
1933 * not set, try to fill the page then wait for it to become unlocked.
1934 *
1935 * If the page does not get brought uptodate, return -EIO.
1936 */
1937struct page *read_cache_page(struct address_space *mapping,
1938 pgoff_t index,
1939 int (*filler)(void *, struct page *),
1940 void *data)
1941{
1942 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
1943}
1944EXPORT_SYMBOL(read_cache_page);
1945
1946static size_t __iovec_copy_from_user_inatomic(char *vaddr, 2135static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1947 const struct iovec *iov, size_t base, size_t bytes) 2136 const struct iovec *iov, size_t base, size_t bytes)
1948{ 2137{
@@ -1976,7 +2165,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
1976 char *kaddr; 2165 char *kaddr;
1977 size_t copied; 2166 size_t copied;
1978 2167
1979 BUG_ON(!in_atomic());
1980 kaddr = kmap_atomic(page); 2168 kaddr = kmap_atomic(page);
1981 if (likely(i->nr_segs == 1)) { 2169 if (likely(i->nr_segs == 1)) {
1982 int left; 2170 int left;
@@ -2186,7 +2374,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
2186{ 2374{
2187 const struct address_space_operations *aops = mapping->a_ops; 2375 const struct address_space_operations *aops = mapping->a_ops;
2188 2376
2189 mark_page_accessed(page);
2190 return aops->write_end(file, mapping, pos, len, copied, page, fsdata); 2377 return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
2191} 2378}
2192EXPORT_SYMBOL(pagecache_write_end); 2379EXPORT_SYMBOL(pagecache_write_end);
@@ -2268,34 +2455,18 @@ EXPORT_SYMBOL(generic_file_direct_write);
2268struct page *grab_cache_page_write_begin(struct address_space *mapping, 2455struct page *grab_cache_page_write_begin(struct address_space *mapping,
2269 pgoff_t index, unsigned flags) 2456 pgoff_t index, unsigned flags)
2270{ 2457{
2271 int status;
2272 gfp_t gfp_mask;
2273 struct page *page; 2458 struct page *page;
2274 gfp_t gfp_notmask = 0; 2459 int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
2275 2460
2276 gfp_mask = mapping_gfp_mask(mapping);
2277 if (mapping_cap_account_dirty(mapping))
2278 gfp_mask |= __GFP_WRITE;
2279 if (flags & AOP_FLAG_NOFS) 2461 if (flags & AOP_FLAG_NOFS)
2280 gfp_notmask = __GFP_FS; 2462 fgp_flags |= FGP_NOFS;
2281repeat: 2463
2282 page = find_lock_page(mapping, index); 2464 page = pagecache_get_page(mapping, index, fgp_flags,
2465 mapping_gfp_mask(mapping),
2466 GFP_KERNEL);
2283 if (page) 2467 if (page)
2284 goto found; 2468 wait_for_stable_page(page);
2285 2469
2286 page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
2287 if (!page)
2288 return NULL;
2289 status = add_to_page_cache_lru(page, mapping, index,
2290 GFP_KERNEL & ~gfp_notmask);
2291 if (unlikely(status)) {
2292 page_cache_release(page);
2293 if (status == -EEXIST)
2294 goto repeat;
2295 return NULL;
2296 }
2297found:
2298 wait_for_stable_page(page);
2299 return page; 2470 return page;
2300} 2471}
2301EXPORT_SYMBOL(grab_cache_page_write_begin); 2472EXPORT_SYMBOL(grab_cache_page_write_begin);
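
A rough sketch (not from this patch; demo_write_begin is invented) of the usual consumer of grab_cache_page_write_begin(): a filesystem's ->write_begin hook. A real implementation would also bring the page uptodate for partial writes, much as simple_write_begin() in fs/libfs.c does; the FGP_* handling converted above stays internal to the helper.

#include <linux/fs.h>
#include <linux/pagemap.h>

static int demo_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	/* Returned locked, marked accessed and write-stable by the helper. */
	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;

	*pagep = page;
	return 0;
}
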
@@ -2344,18 +2515,15 @@ again:
2344 2515
2345 status = a_ops->write_begin(file, mapping, pos, bytes, flags, 2516 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2346 &page, &fsdata); 2517 &page, &fsdata);
2347 if (unlikely(status)) 2518 if (unlikely(status < 0))
2348 break; 2519 break;
2349 2520
2350 if (mapping_writably_mapped(mapping)) 2521 if (mapping_writably_mapped(mapping))
2351 flush_dcache_page(page); 2522 flush_dcache_page(page);
2352 2523
2353 pagefault_disable();
2354 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); 2524 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2355 pagefault_enable();
2356 flush_dcache_page(page); 2525 flush_dcache_page(page);
2357 2526
2358 mark_page_accessed(page);
2359 status = a_ops->write_end(file, mapping, pos, bytes, copied, 2527 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2360 page, fsdata); 2528 page, fsdata);
2361 if (unlikely(status < 0)) 2529 if (unlikely(status < 0))
diff --git a/mm/fremap.c b/mm/fremap.c
index bbc4d660221a..34feba60a17e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -23,28 +23,44 @@
23 23
24#include "internal.h" 24#include "internal.h"
25 25
26static int mm_counter(struct page *page)
27{
28 return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
29}
30
26static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, 31static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
27 unsigned long addr, pte_t *ptep) 32 unsigned long addr, pte_t *ptep)
28{ 33{
29 pte_t pte = *ptep; 34 pte_t pte = *ptep;
35 struct page *page;
36 swp_entry_t entry;
30 37
31 if (pte_present(pte)) { 38 if (pte_present(pte)) {
32 struct page *page;
33
34 flush_cache_page(vma, addr, pte_pfn(pte)); 39 flush_cache_page(vma, addr, pte_pfn(pte));
35 pte = ptep_clear_flush(vma, addr, ptep); 40 pte = ptep_clear_flush(vma, addr, ptep);
36 page = vm_normal_page(vma, addr, pte); 41 page = vm_normal_page(vma, addr, pte);
37 if (page) { 42 if (page) {
38 if (pte_dirty(pte)) 43 if (pte_dirty(pte))
39 set_page_dirty(page); 44 set_page_dirty(page);
45 update_hiwater_rss(mm);
46 dec_mm_counter(mm, mm_counter(page));
40 page_remove_rmap(page); 47 page_remove_rmap(page);
41 page_cache_release(page); 48 page_cache_release(page);
49 }
50 } else { /* zap_pte() is not called when pte_none() */
51 if (!pte_file(pte)) {
42 update_hiwater_rss(mm); 52 update_hiwater_rss(mm);
43 dec_mm_counter(mm, MM_FILEPAGES); 53 entry = pte_to_swp_entry(pte);
54 if (non_swap_entry(entry)) {
55 if (is_migration_entry(entry)) {
56 page = migration_entry_to_page(entry);
57 dec_mm_counter(mm, mm_counter(page));
58 }
59 } else {
60 free_swap_and_cache(entry);
61 dec_mm_counter(mm, MM_SWAPENTS);
62 }
44 } 63 }
45 } else {
46 if (!pte_file(pte))
47 free_swap_and_cache(pte_to_swp_entry(pte));
48 pte_clear_not_present_full(mm, addr, ptep, 0); 64 pte_clear_not_present_full(mm, addr, ptep, 0);
49 } 65 }
50} 66}
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1b24bdcb3197..c30eec536f03 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
327 327
328static unsigned long __frontswap_curr_pages(void) 328static unsigned long __frontswap_curr_pages(void)
329{ 329{
330 int type;
331 unsigned long totalpages = 0; 330 unsigned long totalpages = 0;
332 struct swap_info_struct *si = NULL; 331 struct swap_info_struct *si = NULL;
333 332
334 assert_spin_locked(&swap_lock); 333 assert_spin_locked(&swap_lock);
335 for (type = swap_list.head; type >= 0; type = si->next) { 334 plist_for_each_entry(si, &swap_active_head, list)
336 si = swap_info[type];
337 totalpages += atomic_read(&si->frontswap_pages); 335 totalpages += atomic_read(&si->frontswap_pages);
338 }
339 return totalpages; 336 return totalpages;
340} 337}
341 338
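
The conversion above walks swap_active_head with plist_for_each_entry() instead of chasing swap_list indices. As a generic illustration of that macro (not from this patch; struct demo_item and demo_sum are invented), assuming a plist_node member named "node":

#include <linux/plist.h>

struct demo_item {
	struct plist_node node;
	unsigned long weight;
};

/* Walk a priority-sorted list in priority order, the same shape as the
 * swap_active_head walk above. */
static unsigned long demo_sum(struct plist_head *head)
{
	struct demo_item *it;
	unsigned long total = 0;

	plist_for_each_entry(it, head, node)
		total += it->weight;

	return total;
}
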
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
347 int si_frontswap_pages; 344 int si_frontswap_pages;
348 unsigned long total_pages_to_unuse = total; 345 unsigned long total_pages_to_unuse = total;
349 unsigned long pages = 0, pages_to_unuse = 0; 346 unsigned long pages = 0, pages_to_unuse = 0;
350 int type;
351 347
352 assert_spin_locked(&swap_lock); 348 assert_spin_locked(&swap_lock);
353 for (type = swap_list.head; type >= 0; type = si->next) { 349 plist_for_each_entry(si, &swap_active_head, list) {
354 si = swap_info[type];
355 si_frontswap_pages = atomic_read(&si->frontswap_pages); 350 si_frontswap_pages = atomic_read(&si->frontswap_pages);
356 if (total_pages_to_unuse < si_frontswap_pages) { 351 if (total_pages_to_unuse < si_frontswap_pages) {
357 pages = pages_to_unuse = total_pages_to_unuse; 352 pages = pages_to_unuse = total_pages_to_unuse;
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
366 } 361 }
367 vm_unacct_memory(pages); 362 vm_unacct_memory(pages);
368 *unused = pages_to_unuse; 363 *unused = pages_to_unuse;
369 *swapid = type; 364 *swapid = si->type;
370 ret = 0; 365 ret = 0;
371 break; 366 break;
372 } 367 }
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
413 /* 408 /*
414 * we don't want to hold swap_lock while doing a very 409 * we don't want to hold swap_lock while doing a very
415 * lengthy try_to_unuse, but swap_list may change 410 * lengthy try_to_unuse, but swap_list may change
416 * so restart scan from swap_list.head each time 411 * so restart scan from swap_active_head each time
417 */ 412 */
418 spin_lock(&swap_lock); 413 spin_lock(&swap_lock);
419 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); 414 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 389973fd6bb7..2ee53749eb48 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -758,14 +758,6 @@ static inline struct page *alloc_hugepage_vma(int defrag,
758 HPAGE_PMD_ORDER, vma, haddr, nd); 758 HPAGE_PMD_ORDER, vma, haddr, nd);
759} 759}
760 760
761#ifndef CONFIG_NUMA
762static inline struct page *alloc_hugepage(int defrag)
763{
764 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
765 HPAGE_PMD_ORDER);
766}
767#endif
768
769static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 761static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
770 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 762 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
771 struct page *zero_page) 763 struct page *zero_page)
@@ -2197,7 +2189,58 @@ static void khugepaged_alloc_sleep(void)
2197 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2189 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2198} 2190}
2199 2191
2192static int khugepaged_node_load[MAX_NUMNODES];
2193
2194static bool khugepaged_scan_abort(int nid)
2195{
2196 int i;
2197
2198 /*
2199 * If zone_reclaim_mode is disabled, then no extra effort is made to
2200 * allocate memory locally.
2201 */
2202 if (!zone_reclaim_mode)
2203 return false;
2204
2205 /* If there is a count for this node already, it must be acceptable */
2206 if (khugepaged_node_load[nid])
2207 return false;
2208
2209 for (i = 0; i < MAX_NUMNODES; i++) {
2210 if (!khugepaged_node_load[i])
2211 continue;
2212 if (node_distance(nid, i) > RECLAIM_DISTANCE)
2213 return true;
2214 }
2215 return false;
2216}
2217
2200#ifdef CONFIG_NUMA 2218#ifdef CONFIG_NUMA
2219static int khugepaged_find_target_node(void)
2220{
2221 static int last_khugepaged_target_node = NUMA_NO_NODE;
2222 int nid, target_node = 0, max_value = 0;
2223
2224 /* find first node with max normal pages hit */
2225 for (nid = 0; nid < MAX_NUMNODES; nid++)
2226 if (khugepaged_node_load[nid] > max_value) {
2227 max_value = khugepaged_node_load[nid];
2228 target_node = nid;
2229 }
2230
2231 /* do some balance if several nodes have the same hit record */
2232 if (target_node <= last_khugepaged_target_node)
2233 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
2234 nid++)
2235 if (max_value == khugepaged_node_load[nid]) {
2236 target_node = nid;
2237 break;
2238 }
2239
2240 last_khugepaged_target_node = target_node;
2241 return target_node;
2242}
2243
2201static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2244static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2202{ 2245{
2203 if (IS_ERR(*hpage)) { 2246 if (IS_ERR(*hpage)) {
@@ -2231,9 +2274,8 @@ static struct page
2231 * mmap_sem in read mode is good idea also to allow greater 2274 * mmap_sem in read mode is good idea also to allow greater
2232 * scalability. 2275 * scalability.
2233 */ 2276 */
2234 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 2277 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
2235 node, __GFP_OTHER_NODE); 2278 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2236
2237 /* 2279 /*
2238 * After allocating the hugepage, release the mmap_sem read lock in 2280 * After allocating the hugepage, release the mmap_sem read lock in
2239 * preparation for taking it in write mode. 2281 * preparation for taking it in write mode.
@@ -2249,6 +2291,17 @@ static struct page
2249 return *hpage; 2291 return *hpage;
2250} 2292}
2251#else 2293#else
2294static int khugepaged_find_target_node(void)
2295{
2296 return 0;
2297}
2298
2299static inline struct page *alloc_hugepage(int defrag)
2300{
2301 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
2302 HPAGE_PMD_ORDER);
2303}
2304
2252static struct page *khugepaged_alloc_hugepage(bool *wait) 2305static struct page *khugepaged_alloc_hugepage(bool *wait)
2253{ 2306{
2254 struct page *hpage; 2307 struct page *hpage;
@@ -2455,6 +2508,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2455 if (pmd_trans_huge(*pmd)) 2508 if (pmd_trans_huge(*pmd))
2456 goto out; 2509 goto out;
2457 2510
2511 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
2458 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2512 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2459 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2513 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2460 _pte++, _address += PAGE_SIZE) { 2514 _pte++, _address += PAGE_SIZE) {
@@ -2471,12 +2525,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2471 if (unlikely(!page)) 2525 if (unlikely(!page))
2472 goto out_unmap; 2526 goto out_unmap;
2473 /* 2527 /*
2474 * Chose the node of the first page. This could 2528 * Record which node the original page is from and save this
2475 * be more sophisticated and look at more pages, 2529 * information to khugepaged_node_load[].
 2476	 * but isn't for now. 2530	 * Khugepaged will allocate hugepage from the node that has the max
2531 * hit record.
2477 */ 2532 */
2478 if (node == NUMA_NO_NODE) 2533 node = page_to_nid(page);
2479 node = page_to_nid(page); 2534 if (khugepaged_scan_abort(node))
2535 goto out_unmap;
2536 khugepaged_node_load[node]++;
2480 VM_BUG_ON(PageCompound(page)); 2537 VM_BUG_ON(PageCompound(page));
2481 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2538 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2482 goto out_unmap; 2539 goto out_unmap;
@@ -2491,9 +2548,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2491 ret = 1; 2548 ret = 1;
2492out_unmap: 2549out_unmap:
2493 pte_unmap_unlock(pte, ptl); 2550 pte_unmap_unlock(pte, ptl);
2494 if (ret) 2551 if (ret) {
2552 node = khugepaged_find_target_node();
2495 /* collapse_huge_page will return with the mmap_sem released */ 2553 /* collapse_huge_page will return with the mmap_sem released */
2496 collapse_huge_page(mm, address, hpage, vma, node); 2554 collapse_huge_page(mm, address, hpage, vma, node);
2555 }
2497out: 2556out:
2498 return ret; 2557 return ret;
2499} 2558}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f80b17106d24..c33d8a65298c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -574,7 +574,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
574 goto err; 574 goto err;
575 575
576retry_cpuset: 576retry_cpuset:
577 cpuset_mems_cookie = get_mems_allowed(); 577 cpuset_mems_cookie = read_mems_allowed_begin();
578 zonelist = huge_zonelist(vma, address, 578 zonelist = huge_zonelist(vma, address,
579 htlb_alloc_mask(h), &mpol, &nodemask); 579 htlb_alloc_mask(h), &mpol, &nodemask);
580 580
@@ -596,7 +596,7 @@ retry_cpuset:
596 } 596 }
597 597
598 mpol_cond_put(mpol); 598 mpol_cond_put(mpol);
599 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 599 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
600 goto retry_cpuset; 600 goto retry_cpuset;
601 return page; 601 return page;
602 602
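
The retry_cpuset logic above follows the read_mems_allowed_begin()/read_mems_allowed_retry() pattern that replaces get_mems_allowed()/put_mems_allowed() throughout this series. A condensed sketch of the same pattern (not from this patch; demo_alloc is invented):

#include <linux/cpuset.h>
#include <linux/gfp.h>

static struct page *demo_alloc(gfp_t gfp, unsigned int order)
{
	struct page *page;
	unsigned int cookie;

	do {
		/* Snapshot the cpuset mems_allowed sequence counter. */
		cookie = read_mems_allowed_begin();
		page = alloc_pages(gfp, order);
		/* Retry only if the allocation failed *and* mems_allowed
		 * changed while we were allocating. */
	} while (!page && read_mems_allowed_retry(cookie));

	return page;
}
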
@@ -2114,6 +2114,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
2114 unsigned long tmp; 2114 unsigned long tmp;
2115 int ret; 2115 int ret;
2116 2116
2117 if (!hugepages_supported())
2118 return -ENOTSUPP;
2119
2117 tmp = h->max_huge_pages; 2120 tmp = h->max_huge_pages;
2118 2121
2119 if (write && h->order >= MAX_ORDER) 2122 if (write && h->order >= MAX_ORDER)
@@ -2167,6 +2170,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2167 unsigned long tmp; 2170 unsigned long tmp;
2168 int ret; 2171 int ret;
2169 2172
2173 if (!hugepages_supported())
2174 return -ENOTSUPP;
2175
2170 tmp = h->nr_overcommit_huge_pages; 2176 tmp = h->nr_overcommit_huge_pages;
2171 2177
2172 if (write && h->order >= MAX_ORDER) 2178 if (write && h->order >= MAX_ORDER)
@@ -2192,6 +2198,8 @@ out:
2192void hugetlb_report_meminfo(struct seq_file *m) 2198void hugetlb_report_meminfo(struct seq_file *m)
2193{ 2199{
2194 struct hstate *h = &default_hstate; 2200 struct hstate *h = &default_hstate;
2201 if (!hugepages_supported())
2202 return;
2195 seq_printf(m, 2203 seq_printf(m,
2196 "HugePages_Total: %5lu\n" 2204 "HugePages_Total: %5lu\n"
2197 "HugePages_Free: %5lu\n" 2205 "HugePages_Free: %5lu\n"
@@ -2208,6 +2216,8 @@ void hugetlb_report_meminfo(struct seq_file *m)
2208int hugetlb_report_node_meminfo(int nid, char *buf) 2216int hugetlb_report_node_meminfo(int nid, char *buf)
2209{ 2217{
2210 struct hstate *h = &default_hstate; 2218 struct hstate *h = &default_hstate;
2219 if (!hugepages_supported())
2220 return 0;
2211 return sprintf(buf, 2221 return sprintf(buf,
2212 "Node %d HugePages_Total: %5u\n" 2222 "Node %d HugePages_Total: %5u\n"
2213 "Node %d HugePages_Free: %5u\n" 2223 "Node %d HugePages_Free: %5u\n"
@@ -2222,6 +2232,9 @@ void hugetlb_show_meminfo(void)
2222 struct hstate *h; 2232 struct hstate *h;
2223 int nid; 2233 int nid;
2224 2234
2235 if (!hugepages_supported())
2236 return;
2237
2225 for_each_node_state(nid, N_MEMORY) 2238 for_each_node_state(nid, N_MEMORY)
2226 for_each_hstate(h) 2239 for_each_hstate(h)
2227 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", 2240 pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
diff --git a/mm/internal.h b/mm/internal.h
index fdddbc83ac5f..d610f7ce4e9c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
11#ifndef __MM_INTERNAL_H 11#ifndef __MM_INTERNAL_H
12#define __MM_INTERNAL_H 12#define __MM_INTERNAL_H
13 13
14#include <linux/fs.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15 16
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 17void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
21 atomic_set(&page->_count, v); 22 atomic_set(&page->_count, v);
22} 23}
23 24
25extern int __do_page_cache_readahead(struct address_space *mapping,
26 struct file *filp, pgoff_t offset, unsigned long nr_to_read,
27 unsigned long lookahead_size);
28
29/*
30 * Submit IO for the read-ahead request in file_ra_state.
31 */
32static inline unsigned long ra_submit(struct file_ra_state *ra,
33 struct address_space *mapping, struct file *filp)
34{
35 return __do_page_cache_readahead(mapping, filp,
36 ra->start, ra->size, ra->async_size);
37}
38
24/* 39/*
25 * Turn a non-refcounted page (->_count == 0) into refcounted with 40 * Turn a non-refcounted page (->_count == 0) into refcounted with
26 * a count of one. 41 * a count of one.
@@ -120,7 +135,7 @@ struct compact_control {
120 unsigned long nr_migratepages; /* Number of pages to migrate */ 135 unsigned long nr_migratepages; /* Number of pages to migrate */
121 unsigned long free_pfn; /* isolate_freepages search base */ 136 unsigned long free_pfn; /* isolate_freepages search base */
122 unsigned long migrate_pfn; /* isolate_migratepages search base */ 137 unsigned long migrate_pfn; /* isolate_migratepages search base */
123 bool sync; /* Synchronous migration */ 138 enum migrate_mode mode; /* Async or sync migration mode */
124 bool ignore_skip_hint; /* Scan blocks even if marked skip */ 139 bool ignore_skip_hint; /* Scan blocks even if marked skip */
125 bool finished_update_free; /* True when the zone cached pfns are 140 bool finished_update_free; /* True when the zone cached pfns are
126 * no longer being updated 141 * no longer being updated
@@ -130,7 +145,10 @@ struct compact_control {
130 int order; /* order a direct compactor needs */ 145 int order; /* order a direct compactor needs */
131 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 146 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 struct zone *zone; 147 struct zone *zone;
133 bool contended; /* True if a lock was contended */ 148 bool contended; /* True if a lock was contended, or
149 * need_resched() true during async
150 * compaction
151 */
134}; 152};
135 153
136unsigned long 154unsigned long
diff --git a/mm/madvise.c b/mm/madvise.c
index 539eeb96b323..a402f8fdc68e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
195 for (; start < end; start += PAGE_SIZE) { 195 for (; start < end; start += PAGE_SIZE) {
196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
197 197
198 page = find_get_page(mapping, index); 198 page = find_get_entry(mapping, index);
199 if (!radix_tree_exceptional_entry(page)) { 199 if (!radix_tree_exceptional_entry(page)) {
200 if (page) 200 if (page)
201 page_cache_release(page); 201 page_cache_release(page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6e3f9c39bc22..4ab233d4714a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1554,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1554 1554
1555 /* Keep page count to indicate a given hugepage is isolated. */ 1555 /* Keep page count to indicate a given hugepage is isolated. */
1556 list_move(&hpage->lru, &pagelist); 1556 list_move(&hpage->lru, &pagelist);
1557 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1557 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1558 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1558 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1559 if (ret) { 1559 if (ret) {
1560 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1560 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
@@ -1635,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags)
1635 inc_zone_page_state(page, NR_ISOLATED_ANON + 1635 inc_zone_page_state(page, NR_ISOLATED_ANON +
1636 page_is_file_cache(page)); 1636 page_is_file_cache(page));
1637 list_add(&page->lru, &pagelist); 1637 list_add(&page->lru, &pagelist);
1638 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1638 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1639 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1639 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1640 if (ret) { 1640 if (ret) {
1641 putback_lru_pages(&pagelist); 1641 putback_lru_pages(&pagelist);
diff --git a/mm/memory.c b/mm/memory.c
index 99fe3aa1035c..b5901068495f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -878,7 +878,7 @@ out_set_pte:
878 return 0; 878 return 0;
879} 879}
880 880
881int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 881static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
882 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 882 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
883 unsigned long addr, unsigned long end) 883 unsigned long addr, unsigned long end)
884{ 884{
@@ -3698,7 +3698,7 @@ static int handle_pte_fault(struct mm_struct *mm,
3698 pte_t entry; 3698 pte_t entry;
3699 spinlock_t *ptl; 3699 spinlock_t *ptl;
3700 3700
3701 entry = *pte; 3701 entry = ACCESS_ONCE(*pte);
3702 if (!pte_present(entry)) { 3702 if (!pte_present(entry)) {
3703 if (pte_none(entry)) { 3703 if (pte_none(entry)) {
3704 if (vma->vm_ops) { 3704 if (vma->vm_ops) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..d31730564617 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1321,7 +1321,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1321 * alloc_migrate_target should be improooooved!! 1321 * alloc_migrate_target should be improooooved!!
1322 * migrate_pages returns # of failed pages. 1322 * migrate_pages returns # of failed pages.
1323 */ 1323 */
1324 ret = migrate_pages(&source, alloc_migrate_target, 0, 1324 ret = migrate_pages(&source, alloc_migrate_target, NULL, 0,
1325 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1325 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1326 if (ret) 1326 if (ret)
1327 putback_movable_pages(&source); 1327 putback_movable_pages(&source);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0437f3595b32..cc61c7a7d6a1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1060,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1060 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1060 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1061 1061
1062 if (!list_empty(&pagelist)) { 1062 if (!list_empty(&pagelist)) {
1063 err = migrate_pages(&pagelist, new_node_page, dest, 1063 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1064 MIGRATE_SYNC, MR_SYSCALL); 1064 MIGRATE_SYNC, MR_SYSCALL);
1065 if (err) 1065 if (err)
1066 putback_movable_pages(&pagelist); 1066 putback_movable_pages(&pagelist);
@@ -1306,7 +1306,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1306 1306
1307 if (!list_empty(&pagelist)) { 1307 if (!list_empty(&pagelist)) {
1308 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1308 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1309 nr_failed = migrate_pages(&pagelist, new_page, 1309 nr_failed = migrate_pages(&pagelist, new_page, NULL,
1310 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1310 start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1311 if (nr_failed) 1311 if (nr_failed)
1312 putback_movable_pages(&pagelist); 1312 putback_movable_pages(&pagelist);
@@ -1873,7 +1873,7 @@ int node_random(const nodemask_t *maskp)
1873 * If the effective policy is 'BIND, returns a pointer to the mempolicy's 1873 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1874 * @nodemask for filtering the zonelist. 1874 * @nodemask for filtering the zonelist.
1875 * 1875 *
1876 * Must be protected by get_mems_allowed() 1876 * Must be protected by read_mems_allowed_begin()
1877 */ 1877 */
1878struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1878struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1879 gfp_t gfp_flags, struct mempolicy **mpol, 1879 gfp_t gfp_flags, struct mempolicy **mpol,
@@ -2037,7 +2037,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2037 2037
2038retry_cpuset: 2038retry_cpuset:
2039 pol = get_vma_policy(current, vma, addr); 2039 pol = get_vma_policy(current, vma, addr);
2040 cpuset_mems_cookie = get_mems_allowed(); 2040 cpuset_mems_cookie = read_mems_allowed_begin();
2041 2041
2042 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 2042 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2043 unsigned nid; 2043 unsigned nid;
@@ -2045,7 +2045,7 @@ retry_cpuset:
2045 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 2045 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2046 mpol_cond_put(pol); 2046 mpol_cond_put(pol);
2047 page = alloc_page_interleave(gfp, order, nid); 2047 page = alloc_page_interleave(gfp, order, nid);
2048 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2048 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2049 goto retry_cpuset; 2049 goto retry_cpuset;
2050 2050
2051 return page; 2051 return page;
@@ -2055,7 +2055,7 @@ retry_cpuset:
2055 policy_nodemask(gfp, pol)); 2055 policy_nodemask(gfp, pol));
2056 if (unlikely(mpol_needs_cond_ref(pol))) 2056 if (unlikely(mpol_needs_cond_ref(pol)))
2057 __mpol_put(pol); 2057 __mpol_put(pol);
2058 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2058 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2059 goto retry_cpuset; 2059 goto retry_cpuset;
2060 return page; 2060 return page;
2061} 2061}
@@ -2089,7 +2089,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2089 pol = &default_policy; 2089 pol = &default_policy;
2090 2090
2091retry_cpuset: 2091retry_cpuset:
2092 cpuset_mems_cookie = get_mems_allowed(); 2092 cpuset_mems_cookie = read_mems_allowed_begin();
2093 2093
2094 /* 2094 /*
2095 * No reference counting needed for current->mempolicy 2095 * No reference counting needed for current->mempolicy
@@ -2102,7 +2102,7 @@ retry_cpuset:
2102 policy_zonelist(gfp, pol, numa_node_id()), 2102 policy_zonelist(gfp, pol, numa_node_id()),
2103 policy_nodemask(gfp, pol)); 2103 policy_nodemask(gfp, pol));
2104 2104
2105 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2105 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2106 goto retry_cpuset; 2106 goto retry_cpuset;
2107 2107
2108 return page; 2108 return page;
diff --git a/mm/migrate.c b/mm/migrate.c
index e3cf71dd1288..96d4d814ae2f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -867,8 +867,9 @@ out:
867 * Obtain the lock on page, remove all ptes and migrate the page 867 * Obtain the lock on page, remove all ptes and migrate the page
868 * to the newly allocated page in newpage. 868 * to the newly allocated page in newpage.
869 */ 869 */
870static int unmap_and_move(new_page_t get_new_page, unsigned long private, 870static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
871 struct page *page, int force, enum migrate_mode mode) 871 unsigned long private, struct page *page, int force,
872 enum migrate_mode mode)
872{ 873{
873 int rc = 0; 874 int rc = 0;
874 int *result = NULL; 875 int *result = NULL;
@@ -912,11 +913,18 @@ out:
912 page_is_file_cache(page)); 913 page_is_file_cache(page));
913 putback_lru_page(page); 914 putback_lru_page(page);
914 } 915 }
916
915 /* 917 /*
916 * Move the new page to the LRU. If migration was not successful 918 * If migration was not successful and there's a freeing callback, use
917 * then this will free the page. 919 * it. Otherwise, putback_lru_page() will drop the reference grabbed
920 * during isolation.
918 */ 921 */
919 putback_lru_page(newpage); 922 if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
923 ClearPageSwapBacked(newpage);
924 put_new_page(newpage, private);
925 } else
926 putback_lru_page(newpage);
927
920 if (result) { 928 if (result) {
921 if (rc) 929 if (rc)
922 *result = rc; 930 *result = rc;
@@ -945,8 +953,9 @@ out:
945 * will wait in the page fault for migration to complete. 953 * will wait in the page fault for migration to complete.
946 */ 954 */
947static int unmap_and_move_huge_page(new_page_t get_new_page, 955static int unmap_and_move_huge_page(new_page_t get_new_page,
948 unsigned long private, struct page *hpage, 956 free_page_t put_new_page, unsigned long private,
949 int force, enum migrate_mode mode) 957 struct page *hpage, int force,
958 enum migrate_mode mode)
950{ 959{
951 int rc = 0; 960 int rc = 0;
952 int *result = NULL; 961 int *result = NULL;
@@ -982,20 +991,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
982 if (!page_mapped(hpage)) 991 if (!page_mapped(hpage))
983 rc = move_to_new_page(new_hpage, hpage, 1, mode); 992 rc = move_to_new_page(new_hpage, hpage, 1, mode);
984 993
985 if (rc) 994 if (rc != MIGRATEPAGE_SUCCESS)
986 remove_migration_ptes(hpage, hpage); 995 remove_migration_ptes(hpage, hpage);
987 996
988 if (anon_vma) 997 if (anon_vma)
989 put_anon_vma(anon_vma); 998 put_anon_vma(anon_vma);
990 999
991 if (!rc) 1000 if (rc == MIGRATEPAGE_SUCCESS)
992 hugetlb_cgroup_migrate(hpage, new_hpage); 1001 hugetlb_cgroup_migrate(hpage, new_hpage);
993 1002
994 unlock_page(hpage); 1003 unlock_page(hpage);
995out: 1004out:
996 if (rc != -EAGAIN) 1005 if (rc != -EAGAIN)
997 putback_active_hugepage(hpage); 1006 putback_active_hugepage(hpage);
998 put_page(new_hpage); 1007
1008 /*
1009 * If migration was not successful and there's a freeing callback, use
1010 * it. Otherwise, put_page() will drop the reference grabbed during
1011 * isolation.
1012 */
1013 if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
1014 put_new_page(new_hpage, private);
1015 else
1016 put_page(new_hpage);
1017
999 if (result) { 1018 if (result) {
1000 if (rc) 1019 if (rc)
1001 *result = rc; 1020 *result = rc;
@@ -1012,6 +1031,8 @@ out:
1012 * @from: The list of pages to be migrated. 1031 * @from: The list of pages to be migrated.
1013 * @get_new_page: The function used to allocate free pages to be used 1032 * @get_new_page: The function used to allocate free pages to be used
1014 * as the target of the page migration. 1033 * as the target of the page migration.
1034 * @put_new_page: The function used to free target pages if migration
1035 * fails, or NULL if no special handling is necessary.
1015 * @private: Private data to be passed on to get_new_page() 1036 * @private: Private data to be passed on to get_new_page()
1016 * @mode: The migration mode that specifies the constraints for 1037 * @mode: The migration mode that specifies the constraints for
1017 * page migration, if any. 1038 * page migration, if any.
@@ -1025,7 +1046,8 @@ out:
1025 * Returns the number of pages that were not migrated, or an error code. 1046 * Returns the number of pages that were not migrated, or an error code.
1026 */ 1047 */
1027int migrate_pages(struct list_head *from, new_page_t get_new_page, 1048int migrate_pages(struct list_head *from, new_page_t get_new_page,
1028 unsigned long private, enum migrate_mode mode, int reason) 1049 free_page_t put_new_page, unsigned long private,
1050 enum migrate_mode mode, int reason)
1029{ 1051{
1030 int retry = 1; 1052 int retry = 1;
1031 int nr_failed = 0; 1053 int nr_failed = 0;
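
A hedged sketch of the callback pair a migrate_pages() caller can now supply (not from this patch; the demo_ names are invented, and the (page, private) arguments of the free callback are assumed from the put_new_page() calls shown above):

#include <linux/gfp.h>
#include <linux/migrate.h>

/* Allocate a target page on the node passed via @private. */
static struct page *demo_alloc_target(struct page *page, unsigned long private,
				      int **result)
{
	return alloc_pages_node((int)private, GFP_HIGHUSER_MOVABLE, 0);
}

/* Called only for target pages whose migration failed. */
static void demo_free_target(struct page *page, unsigned long private)
{
	__free_page(page);
}

/* A caller would then do something like (nid being the target node id):
 *	err = migrate_pages(&pagelist, demo_alloc_target, demo_free_target,
 *			    nid, MIGRATE_SYNC, MR_SYSCALL);
 */
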
@@ -1047,10 +1069,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1047 1069
1048 if (PageHuge(page)) 1070 if (PageHuge(page))
1049 rc = unmap_and_move_huge_page(get_new_page, 1071 rc = unmap_and_move_huge_page(get_new_page,
1050 private, page, pass > 2, mode); 1072 put_new_page, private, page,
1073 pass > 2, mode);
1051 else 1074 else
1052 rc = unmap_and_move(get_new_page, private, 1075 rc = unmap_and_move(get_new_page, put_new_page,
1053 page, pass > 2, mode); 1076 private, page, pass > 2, mode);
1054 1077
1055 switch(rc) { 1078 switch(rc) {
1056 case -ENOMEM: 1079 case -ENOMEM:
@@ -1194,7 +1217,7 @@ set_status:
1194 1217
1195 err = 0; 1218 err = 0;
1196 if (!list_empty(&pagelist)) { 1219 if (!list_empty(&pagelist)) {
1197 err = migrate_pages(&pagelist, new_page_node, 1220 err = migrate_pages(&pagelist, new_page_node, NULL,
1198 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); 1221 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1199 if (err) 1222 if (err)
1200 putback_movable_pages(&pagelist); 1223 putback_movable_pages(&pagelist);
@@ -1643,7 +1666,8 @@ int migrate_misplaced_page(struct page *page, int node)
1643 1666
1644 list_add(&page->lru, &migratepages); 1667 list_add(&page->lru, &migratepages);
1645 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, 1668 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1646 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); 1669 NULL, node, MIGRATE_ASYNC,
1670 MR_NUMA_MISPLACED);
1647 if (nr_remaining) { 1671 if (nr_remaining) {
1648 putback_lru_pages(&migratepages); 1672 putback_lru_pages(&migratepages);
1649 isolated = 0; 1673 isolated = 0;
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56a7b8f..06cb81005c77 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
70 * any other file mapping (ie. marked !present and faulted in with 70 * any other file mapping (ie. marked !present and faulted in with
71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
72 */ 72 */
73 page = find_get_page(mapping, pgoff);
74#ifdef CONFIG_SWAP 73#ifdef CONFIG_SWAP
75 /* shmem/tmpfs may return swap: account for swapcache page too. */ 74 if (shmem_mapping(mapping)) {
76 if (radix_tree_exceptional_entry(page)) { 75 page = find_get_entry(mapping, pgoff);
77 swp_entry_t swap = radix_to_swp_entry(page); 76 /*
78 page = find_get_page(swap_address_space(swap), swap.val); 77 * shmem/tmpfs may return swap: account for swapcache
79 } 78 * page too.
79 */
80 if (radix_tree_exceptional_entry(page)) {
81 swp_entry_t swp = radix_to_swp_entry(page);
82 page = find_get_page(swap_address_space(swp), swp.val);
83 }
84 } else
85 page = find_get_page(mapping, pgoff);
86#else
87 page = find_get_page(mapping, pgoff);
80#endif 88#endif
81 if (page) { 89 if (page) {
82 present = PageUptodate(page); 90 present = PageUptodate(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index af99b9ed2007..c1249cb7dc15 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -10,6 +10,7 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/backing-dev.h> 11#include <linux/backing-dev.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/vmacache.h>
13#include <linux/shm.h> 14#include <linux/shm.h>
14#include <linux/mman.h> 15#include <linux/mman.h>
15#include <linux/pagemap.h> 16#include <linux/pagemap.h>
@@ -682,8 +683,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
682 prev->vm_next = next = vma->vm_next; 683 prev->vm_next = next = vma->vm_next;
683 if (next) 684 if (next)
684 next->vm_prev = prev; 685 next->vm_prev = prev;
685 if (mm->mmap_cache == vma) 686
686 mm->mmap_cache = prev; 687 /* Kill the cache */
688 vmacache_invalidate(mm);
687} 689}
688 690
689/* 691/*
@@ -1980,34 +1982,33 @@ EXPORT_SYMBOL(get_unmapped_area);
1980/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1982/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1981struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 1983struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1982{ 1984{
1983 struct vm_area_struct *vma = NULL; 1985 struct rb_node *rb_node;
1986 struct vm_area_struct *vma;
1984 1987
1985 /* Check the cache first. */ 1988 /* Check the cache first. */
1986 /* (Cache hit rate is typically around 35%.) */ 1989 vma = vmacache_find(mm, addr);
1987 vma = ACCESS_ONCE(mm->mmap_cache); 1990 if (likely(vma))
1988 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1991 return vma;
1989 struct rb_node *rb_node;
1990 1992
1991 rb_node = mm->mm_rb.rb_node; 1993 rb_node = mm->mm_rb.rb_node;
1992 vma = NULL; 1994 vma = NULL;
1993 1995
1994 while (rb_node) { 1996 while (rb_node) {
1995 struct vm_area_struct *vma_tmp; 1997 struct vm_area_struct *tmp;
1996 1998
1997 vma_tmp = rb_entry(rb_node, 1999 tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1998 struct vm_area_struct, vm_rb); 2000
1999 2001 if (tmp->vm_end > addr) {
2000 if (vma_tmp->vm_end > addr) { 2002 vma = tmp;
2001 vma = vma_tmp; 2003 if (tmp->vm_start <= addr)
2002 if (vma_tmp->vm_start <= addr) 2004 break;
2003 break; 2005 rb_node = rb_node->rb_left;
2004 rb_node = rb_node->rb_left; 2006 } else
2005 } else 2007 rb_node = rb_node->rb_right;
2006 rb_node = rb_node->rb_right;
2007 }
2008 if (vma)
2009 mm->mmap_cache = vma;
2010 } 2008 }
2009
2010 if (vma)
2011 vmacache_update(addr, vma);
2011 return vma; 2012 return vma;
2012} 2013}
2013 2014
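
The mmap_cache replacement above splits VMA-lookup caching into vmacache_find()/vmacache_update() on the read side and vmacache_invalidate() wherever the VMA set changes. A condensed read-side sketch (not from this patch; demo_find_vma is invented and mirrors the shape of the nommu find_vma() conversion later in this diff):

#include <linux/mm.h>
#include <linux/vmacache.h>

/* Caller must hold mmap_sem. */
static struct vm_area_struct *demo_find_vma(struct mm_struct *mm,
					    unsigned long addr)
{
	struct vm_area_struct *vma;

	/* Fast path: per-thread cache; hits only if addr lies inside the vma. */
	vma = vmacache_find(mm, addr);
	if (vma)
		return vma;

	/* Slow path: walk the authoritative VMA list. */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->vm_start > addr)
			return NULL;
		if (vma->vm_end > addr) {
			vmacache_update(addr, vma);	/* refresh the cache */
			return vma;
		}
	}
	return NULL;
}
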
@@ -2379,7 +2380,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2379 } else 2380 } else
2380 mm->highest_vm_end = prev ? prev->vm_end : 0; 2381 mm->highest_vm_end = prev ? prev->vm_end : 0;
2381 tail_vma->vm_next = NULL; 2382 tail_vma->vm_next = NULL;
2382 mm->mmap_cache = NULL; /* Kill the cache. */ 2383
2384 /* Kill the cache */
2385 vmacache_invalidate(mm);
2383} 2386}
2384 2387
2385/* 2388/*
diff --git a/mm/nommu.c b/mm/nommu.c
index ecd1f158548e..1221d2b66e97 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/export.h> 16#include <linux/export.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/vmacache.h>
18#include <linux/mman.h> 19#include <linux/mman.h>
19#include <linux/swap.h> 20#include <linux/swap.h>
20#include <linux/file.h> 21#include <linux/file.h>
@@ -767,16 +768,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
767 */ 768 */
768static void delete_vma_from_mm(struct vm_area_struct *vma) 769static void delete_vma_from_mm(struct vm_area_struct *vma)
769{ 770{
771 int i;
770 struct address_space *mapping; 772 struct address_space *mapping;
771 struct mm_struct *mm = vma->vm_mm; 773 struct mm_struct *mm = vma->vm_mm;
774 struct task_struct *curr = current;
772 775
773 kenter("%p", vma); 776 kenter("%p", vma);
774 777
775 protect_vma(vma, 0); 778 protect_vma(vma, 0);
776 779
777 mm->map_count--; 780 mm->map_count--;
778 if (mm->mmap_cache == vma) 781 for (i = 0; i < VMACACHE_SIZE; i++) {
779 mm->mmap_cache = NULL; 782 /* if the vma is cached, invalidate the entire cache */
783 if (curr->vmacache[i] == vma) {
784 vmacache_invalidate(curr->mm);
785 break;
786 }
787 }
780 788
781 /* remove the VMA from the mapping */ 789 /* remove the VMA from the mapping */
782 if (vma->vm_file) { 790 if (vma->vm_file) {
@@ -824,8 +832,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
824 struct vm_area_struct *vma; 832 struct vm_area_struct *vma;
825 833
826 /* check the cache first */ 834 /* check the cache first */
827 vma = ACCESS_ONCE(mm->mmap_cache); 835 vma = vmacache_find(mm, addr);
828 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 836 if (likely(vma))
829 return vma; 837 return vma;
830 838
831 /* trawl the list (there may be multiple mappings in which addr 839 /* trawl the list (there may be multiple mappings in which addr
@@ -834,7 +842,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
834 if (vma->vm_start > addr) 842 if (vma->vm_start > addr)
835 return NULL; 843 return NULL;
836 if (vma->vm_end > addr) { 844 if (vma->vm_end > addr) {
837 mm->mmap_cache = vma; 845 vmacache_update(addr, vma);
838 return vma; 846 return vma;
839 } 847 }
840 } 848 }
@@ -873,8 +881,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
873 unsigned long end = addr + len; 881 unsigned long end = addr + len;
874 882
875 /* check the cache first */ 883 /* check the cache first */
876 vma = mm->mmap_cache; 884 vma = vmacache_find_exact(mm, addr, end);
877 if (vma && vma->vm_start == addr && vma->vm_end == end) 885 if (vma)
878 return vma; 886 return vma;
879 887
880 /* trawl the list (there may be multiple mappings in which addr 888 /* trawl the list (there may be multiple mappings in which addr
@@ -885,7 +893,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
885 if (vma->vm_start > addr) 893 if (vma->vm_start > addr)
886 return NULL; 894 return NULL;
887 if (vma->vm_end == end) { 895 if (vma->vm_end == end) {
888 mm->mmap_cache = vma; 896 vmacache_update(addr, vma);
889 return vma; 897 return vma;
890 } 898 }
891 } 899 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a280f772bc66..2f91223dbe93 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -405,7 +405,8 @@ static int destroy_compound_page(struct page *page, unsigned long order)
405 return bad; 405 return bad;
406} 406}
407 407
408static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 408static inline void prep_zero_page(struct page *page, unsigned int order,
409 gfp_t gfp_flags)
409{ 410{
410 int i; 411 int i;
411 412
@@ -449,7 +450,7 @@ static inline void set_page_guard_flag(struct page *page) { }
449static inline void clear_page_guard_flag(struct page *page) { } 450static inline void clear_page_guard_flag(struct page *page) { }
450#endif 451#endif
451 452
452static inline void set_page_order(struct page *page, int order) 453static inline void set_page_order(struct page *page, unsigned int order)
453{ 454{
454 set_page_private(page, order); 455 set_page_private(page, order);
455 __SetPageBuddy(page); 456 __SetPageBuddy(page);
@@ -500,21 +501,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
500 * For recording page's order, we use page_private(page). 501 * For recording page's order, we use page_private(page).
501 */ 502 */
502static inline int page_is_buddy(struct page *page, struct page *buddy, 503static inline int page_is_buddy(struct page *page, struct page *buddy,
503 int order) 504 unsigned int order)
504{ 505{
505 if (!pfn_valid_within(page_to_pfn(buddy))) 506 if (!pfn_valid_within(page_to_pfn(buddy)))
506 return 0; 507 return 0;
507 508
508 if (page_zone_id(page) != page_zone_id(buddy))
509 return 0;
510
511 if (page_is_guard(buddy) && page_order(buddy) == order) { 509 if (page_is_guard(buddy) && page_order(buddy) == order) {
512 VM_BUG_ON(page_count(buddy) != 0); 510 VM_BUG_ON(page_count(buddy) != 0);
511
512 if (page_zone_id(page) != page_zone_id(buddy))
513 return 0;
514
513 return 1; 515 return 1;
514 } 516 }
515 517
516 if (PageBuddy(buddy) && page_order(buddy) == order) { 518 if (PageBuddy(buddy) && page_order(buddy) == order) {
517 VM_BUG_ON(page_count(buddy) != 0); 519 VM_BUG_ON(page_count(buddy) != 0);
520
521 /*
522 * zone check is done late to avoid uselessly
523 * calculating zone/node ids for pages that could
524 * never merge.
525 */
526 if (page_zone_id(page) != page_zone_id(buddy))
527 return 0;
528
518 return 1; 529 return 1;
519 } 530 }
520 return 0; 531 return 0;
@@ -546,6 +557,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
546 */ 557 */
547 558
548static inline void __free_one_page(struct page *page, 559static inline void __free_one_page(struct page *page,
560 unsigned long pfn,
549 struct zone *zone, unsigned int order, 561 struct zone *zone, unsigned int order,
550 int migratetype) 562 int migratetype)
551{ 563{
@@ -562,7 +574,7 @@ static inline void __free_one_page(struct page *page,
562 574
563 VM_BUG_ON(migratetype == -1); 575 VM_BUG_ON(migratetype == -1);
564 576
565 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 577 page_idx = pfn & ((1 << MAX_ORDER) - 1);
566 578
567 VM_BUG_ON(page_idx & ((1 << order) - 1)); 579 VM_BUG_ON(page_idx & ((1 << order) - 1));
568 VM_BUG_ON(bad_range(zone, page)); 580 VM_BUG_ON(bad_range(zone, page));
@@ -652,9 +664,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
652 int migratetype = 0; 664 int migratetype = 0;
653 int batch_free = 0; 665 int batch_free = 0;
654 int to_free = count; 666 int to_free = count;
667 unsigned long nr_scanned;
655 668
656 spin_lock(&zone->lock); 669 spin_lock(&zone->lock);
657 zone->pages_scanned = 0; 670 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
671 if (nr_scanned)
672 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
658 673
659 while (to_free) { 674 while (to_free) {
660 struct page *page; 675 struct page *page;
@@ -686,7 +701,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
686 list_del(&page->lru); 701 list_del(&page->lru);
687 mt = get_freepage_migratetype(page); 702 mt = get_freepage_migratetype(page);
688 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 703 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
689 __free_one_page(page, zone, 0, mt); 704 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
690 trace_mm_page_pcpu_drain(page, 0, mt); 705 trace_mm_page_pcpu_drain(page, 0, mt);
691 if (likely(!is_migrate_isolate_page(page))) { 706 if (likely(!is_migrate_isolate_page(page))) {
692 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 707 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
@@ -698,13 +713,18 @@ static void free_pcppages_bulk(struct zone *zone, int count,
698 spin_unlock(&zone->lock); 713 spin_unlock(&zone->lock);
699} 714}
700 715
701static void free_one_page(struct zone *zone, struct page *page, int order, 716static void free_one_page(struct zone *zone,
717 struct page *page, unsigned long pfn,
718 unsigned int order,
702 int migratetype) 719 int migratetype)
703{ 720{
721 unsigned long nr_scanned;
704 spin_lock(&zone->lock); 722 spin_lock(&zone->lock);
705 zone->pages_scanned = 0; 723 nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
724 if (nr_scanned)
725 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
706 726
707 __free_one_page(page, zone, order, migratetype); 727 __free_one_page(page, pfn, zone, order, migratetype);
708 if (unlikely(!is_migrate_isolate(migratetype))) 728 if (unlikely(!is_migrate_isolate(migratetype)))
709 __mod_zone_freepage_state(zone, 1 << order, migratetype); 729 __mod_zone_freepage_state(zone, 1 << order, migratetype);
710 spin_unlock(&zone->lock); 730 spin_unlock(&zone->lock);
@@ -741,15 +761,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
741{ 761{
742 unsigned long flags; 762 unsigned long flags;
743 int migratetype; 763 int migratetype;
764 unsigned long pfn = page_to_pfn(page);
744 765
745 if (!free_pages_prepare(page, order)) 766 if (!free_pages_prepare(page, order))
746 return; 767 return;
747 768
769 migratetype = get_pfnblock_migratetype(page, pfn);
748 local_irq_save(flags); 770 local_irq_save(flags);
749 __count_vm_events(PGFREE, 1 << order); 771 __count_vm_events(PGFREE, 1 << order);
750 migratetype = get_pageblock_migratetype(page);
751 set_freepage_migratetype(page, migratetype); 772 set_freepage_migratetype(page, migratetype);
752 free_one_page(page_zone(page), page, order, migratetype); 773 free_one_page(page_zone(page), page, pfn, order, migratetype);
753 local_irq_restore(flags); 774 local_irq_restore(flags);
754} 775}
755 776
@@ -869,7 +890,7 @@ static inline int check_new_page(struct page *page)
869 return 0; 890 return 0;
870} 891}
871 892
872static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 893static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
873{ 894{
874 int i; 895 int i;
875 896
@@ -918,6 +939,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
918 rmv_page_order(page); 939 rmv_page_order(page);
919 area->nr_free--; 940 area->nr_free--;
920 expand(zone, page, order, current_order, area, migratetype); 941 expand(zone, page, order, current_order, area, migratetype);
942 set_freepage_migratetype(page, migratetype);
921 return page; 943 return page;
922 } 944 }
923 945
@@ -1042,6 +1064,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1042{ 1064{
1043 int current_order = page_order(page); 1065 int current_order = page_order(page);
1044 1066
1067 /*
1068 * When borrowing from MIGRATE_CMA, we need to release the excess
1069 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1070 * is set to CMA so it is returned to the correct freelist in case
1071 * the page ends up being not actually allocated from the pcp lists.
1072 */
1045 if (is_migrate_cma(fallback_type)) 1073 if (is_migrate_cma(fallback_type))
1046 return fallback_type; 1074 return fallback_type;
1047 1075
@@ -1073,16 +1101,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1073 1101
1074/* Remove an element from the buddy allocator from the fallback list */ 1102/* Remove an element from the buddy allocator from the fallback list */
1075static inline struct page * 1103static inline struct page *
1076__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1104__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1077{ 1105{
1078 struct free_area *area; 1106 struct free_area *area;
1079 int current_order; 1107 unsigned int current_order;
1080 struct page *page; 1108 struct page *page;
1081 int migratetype, new_type, i; 1109 int migratetype, new_type, i;
1082 1110
1083 /* Find the largest possible block of pages in the other list */ 1111 /* Find the largest possible block of pages in the other list */
1084 for (current_order = MAX_ORDER-1; current_order >= order; 1112 for (current_order = MAX_ORDER-1;
1085 --current_order) { 1113 current_order >= order && current_order <= MAX_ORDER-1;
1114 --current_order) {
1086 for (i = 0;; i++) { 1115 for (i = 0;; i++) {
1087 migratetype = fallbacks[start_migratetype][i]; 1116 migratetype = fallbacks[start_migratetype][i];
1088 1117
@@ -1106,21 +1135,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1106 list_del(&page->lru); 1135 list_del(&page->lru);
1107 rmv_page_order(page); 1136 rmv_page_order(page);
1108 1137
1109 /*
1110 * Borrow the excess buddy pages as well, irrespective
1111 * of whether we stole freepages, or took ownership of
1112 * the pageblock or not.
1113 *
1114 * Exception: When borrowing from MIGRATE_CMA, release
1115 * the excess buddy pages to CMA itself.
1116 */
1117 expand(zone, page, order, current_order, area, 1138 expand(zone, page, order, current_order, area,
1118 is_migrate_cma(migratetype) 1139 new_type);
1119 ? migratetype : start_migratetype); 1140 /* The freepage_migratetype may differ from pageblock's
1141 * migratetype depending on the decisions in
1142 * try_to_steal_freepages. This is OK as long as it does
1143 * not differ for MIGRATE_CMA type.
1144 */
1145 set_freepage_migratetype(page, new_type);
1120 1146
1121 trace_mm_page_alloc_extfrag(page, order, 1147 trace_mm_page_alloc_extfrag(page, order, current_order,
1122 current_order, start_migratetype, migratetype, 1148 start_migratetype, migratetype, new_type);
1123 new_type == start_migratetype);
1124 1149
1125 return page; 1150 return page;
1126 } 1151 }
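
The hunks above make __rmqueue_fallback() record the migratetype decided by try_to_steal_freepages() on the page itself, with CMA fallbacks always keeping MIGRATE_CMA. The following is a minimal user-space sketch of that decision only, not the kernel code: the migratetype names and the simplified "large borrows take the block" rule are stand-ins for the kernel's fuller policy.

#include <stdio.h>
#include <stdbool.h>

/* Simplified stand-ins for the kernel's migratetype enum. */
enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_CMA };

static bool is_cma(int mt) { return mt == MT_CMA; }

/*
 * Toy version of the try_to_steal_freepages() decision: CMA blocks are
 * never converted, otherwise a large enough borrow lets the requester
 * take ownership of the whole block.
 */
static int steal_decision(int start_mt, int fallback_mt,
                          int order, int pageblock_order)
{
    if (is_cma(fallback_mt))
        return fallback_mt;    /* excess pages must go back to CMA */
    if (order >= pageblock_order / 2)
        return start_mt;       /* steal the block for the requester */
    return fallback_mt;        /* leave ownership alone */
}

int main(void)
{
    /* The freelist a freed page later returns to follows this decision. */
    printf("CMA fallback keeps type %d\n",
           steal_decision(MT_MOVABLE, MT_CMA, 3, 9));
    printf("large unmovable borrow becomes type %d\n",
           steal_decision(MT_UNMOVABLE, MT_MOVABLE, 5, 9));
    return 0;
}
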
@@ -1166,9 +1191,9 @@ retry_reserve:
1166 */ 1191 */
1167static int rmqueue_bulk(struct zone *zone, unsigned int order, 1192static int rmqueue_bulk(struct zone *zone, unsigned int order,
1168 unsigned long count, struct list_head *list, 1193 unsigned long count, struct list_head *list,
1169 int migratetype, int cold) 1194 int migratetype, bool cold)
1170{ 1195{
1171 int mt = migratetype, i; 1196 int i;
1172 1197
1173 spin_lock(&zone->lock); 1198 spin_lock(&zone->lock);
1174 for (i = 0; i < count; ++i) { 1199 for (i = 0; i < count; ++i) {
@@ -1185,18 +1210,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1185 * merge IO requests if the physical pages are ordered 1210 * merge IO requests if the physical pages are ordered
1186 * properly. 1211 * properly.
1187 */ 1212 */
1188 if (likely(cold == 0)) 1213 if (likely(!cold))
1189 list_add(&page->lru, list); 1214 list_add(&page->lru, list);
1190 else 1215 else
1191 list_add_tail(&page->lru, list); 1216 list_add_tail(&page->lru, list);
1192 if (IS_ENABLED(CONFIG_CMA)) {
1193 mt = get_pageblock_migratetype(page);
1194 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1195 mt = migratetype;
1196 }
1197 set_freepage_migratetype(page, mt);
1198 list = &page->lru; 1217 list = &page->lru;
1199 if (is_migrate_cma(mt)) 1218 if (is_migrate_cma(get_freepage_migratetype(page)))
1200 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1219 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1201 -(1 << order)); 1220 -(1 << order));
1202 } 1221 }
@@ -1320,7 +1339,7 @@ void mark_free_pages(struct zone *zone)
1320{ 1339{
1321 unsigned long pfn, max_zone_pfn; 1340 unsigned long pfn, max_zone_pfn;
1322 unsigned long flags; 1341 unsigned long flags;
1323 int order, t; 1342 unsigned int order, t;
1324 struct list_head *curr; 1343 struct list_head *curr;
1325 1344
1326 if (zone_is_empty(zone)) 1345 if (zone_is_empty(zone))
@@ -1352,19 +1371,20 @@ void mark_free_pages(struct zone *zone)
1352 1371
1353/* 1372/*
1354 * Free a 0-order page 1373 * Free a 0-order page
1355 * cold == 1 ? free a cold page : free a hot page 1374 * cold == true ? free a cold page : free a hot page
1356 */ 1375 */
1357void free_hot_cold_page(struct page *page, int cold) 1376void free_hot_cold_page(struct page *page, bool cold)
1358{ 1377{
1359 struct zone *zone = page_zone(page); 1378 struct zone *zone = page_zone(page);
1360 struct per_cpu_pages *pcp; 1379 struct per_cpu_pages *pcp;
1361 unsigned long flags; 1380 unsigned long flags;
1381 unsigned long pfn = page_to_pfn(page);
1362 int migratetype; 1382 int migratetype;
1363 1383
1364 if (!free_pages_prepare(page, 0)) 1384 if (!free_pages_prepare(page, 0))
1365 return; 1385 return;
1366 1386
1367 migratetype = get_pageblock_migratetype(page); 1387 migratetype = get_pfnblock_migratetype(page, pfn);
1368 set_freepage_migratetype(page, migratetype); 1388 set_freepage_migratetype(page, migratetype);
1369 local_irq_save(flags); 1389 local_irq_save(flags);
1370 __count_vm_event(PGFREE); 1390 __count_vm_event(PGFREE);
@@ -1378,17 +1398,17 @@ void free_hot_cold_page(struct page *page, int cold)
1378 */ 1398 */
1379 if (migratetype >= MIGRATE_PCPTYPES) { 1399 if (migratetype >= MIGRATE_PCPTYPES) {
1380 if (unlikely(is_migrate_isolate(migratetype))) { 1400 if (unlikely(is_migrate_isolate(migratetype))) {
1381 free_one_page(zone, page, 0, migratetype); 1401 free_one_page(zone, page, pfn, 0, migratetype);
1382 goto out; 1402 goto out;
1383 } 1403 }
1384 migratetype = MIGRATE_MOVABLE; 1404 migratetype = MIGRATE_MOVABLE;
1385 } 1405 }
1386 1406
1387 pcp = &this_cpu_ptr(zone->pageset)->pcp; 1407 pcp = &this_cpu_ptr(zone->pageset)->pcp;
1388 if (cold) 1408 if (!cold)
1389 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1390 else
1391 list_add(&page->lru, &pcp->lists[migratetype]); 1409 list_add(&page->lru, &pcp->lists[migratetype]);
1410 else
1411 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1392 pcp->count++; 1412 pcp->count++;
1393 if (pcp->count >= pcp->high) { 1413 if (pcp->count >= pcp->high) {
1394 unsigned long batch = ACCESS_ONCE(pcp->batch); 1414 unsigned long batch = ACCESS_ONCE(pcp->batch);
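
free_hot_cold_page() now takes a bool, and the reordered branch above puts hot pages at the head of the per-cpu list and cold pages at the tail, so hot pages are reallocated first. A rough user-space model of that placement follows; the doubly linked list here is a hand-rolled stand-in for the kernel's list_head, not the real per-cpu pageset.

#include <stdio.h>
#include <stdbool.h>

/* Minimal doubly linked list standing in for the per-cpu free list. */
struct node { int page_id; struct node *prev, *next; };
struct list { struct node head; };

static void list_init(struct list *l) { l->head.prev = l->head.next = &l->head; }

static void insert_between(struct node *n, struct node *prev, struct node *next)
{
    n->prev = prev; n->next = next; prev->next = n; next->prev = n;
}

/* Hot pages are queued at the head so they are handed out first. */
static void free_page_to_pcp(struct list *l, struct node *n, bool cold)
{
    if (!cold)
        insert_between(n, &l->head, l->head.next);   /* list_add()      */
    else
        insert_between(n, l->head.prev, &l->head);   /* list_add_tail() */
}

int main(void)
{
    struct list pcp; list_init(&pcp);
    struct node a = { .page_id = 1 }, b = { .page_id = 2 }, c = { .page_id = 3 };

    free_page_to_pcp(&pcp, &a, false);   /* hot  */
    free_page_to_pcp(&pcp, &b, true);    /* cold */
    free_page_to_pcp(&pcp, &c, false);   /* hot  */

    /* Allocation takes from the head: expect 3, 1, 2. */
    for (struct node *n = pcp.head.next; n != &pcp.head; n = n->next)
        printf("%d ", n->page_id);
    printf("\n");
    return 0;
}
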
@@ -1403,7 +1423,7 @@ out:
1403/* 1423/*
1404 * Free a list of 0-order pages 1424 * Free a list of 0-order pages
1405 */ 1425 */
1406void free_hot_cold_page_list(struct list_head *list, int cold) 1426void free_hot_cold_page_list(struct list_head *list, bool cold)
1407{ 1427{
1408 struct page *page, *next; 1428 struct page *page, *next;
1409 1429
@@ -1515,12 +1535,12 @@ int split_free_page(struct page *page)
1515 */ 1535 */
1516static inline 1536static inline
1517struct page *buffered_rmqueue(struct zone *preferred_zone, 1537struct page *buffered_rmqueue(struct zone *preferred_zone,
1518 struct zone *zone, int order, gfp_t gfp_flags, 1538 struct zone *zone, unsigned int order,
1519 int migratetype) 1539 gfp_t gfp_flags, int migratetype)
1520{ 1540{
1521 unsigned long flags; 1541 unsigned long flags;
1522 struct page *page; 1542 struct page *page;
1523 int cold = !!(gfp_flags & __GFP_COLD); 1543 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1524 1544
1525again: 1545again:
1526 if (likely(order == 0)) { 1546 if (likely(order == 0)) {
@@ -1565,10 +1585,13 @@ again:
1565 if (!page) 1585 if (!page)
1566 goto failed; 1586 goto failed;
1567 __mod_zone_freepage_state(zone, -(1 << order), 1587 __mod_zone_freepage_state(zone, -(1 << order),
1568 get_pageblock_migratetype(page)); 1588 get_freepage_migratetype(page));
1569 } 1589 }
1570 1590
1571 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1591 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1592 if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
1593 !zone_is_fair_depleted(zone))
1594 zone_set_flag(zone, ZONE_FAIR_DEPLETED);
1572 1595
1573 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1596 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1574 zone_statistics(preferred_zone, zone, gfp_flags); 1597 zone_statistics(preferred_zone, zone, gfp_flags);
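
buffered_rmqueue() above marks a zone as fair-depleted the moment NR_ALLOC_BATCH reaches zero, so the fair pass in the zonelist walk only has to test a flag. A toy model of that accounting, under the assumption that a plain counter and bool replace the kernel's vmstat counter and zone flag:

#include <stdio.h>
#include <stdbool.h>

struct toy_zone {
    long nr_alloc_batch;   /* stands in for NR_ALLOC_BATCH            */
    bool fair_depleted;    /* stands in for the ZONE_FAIR_DEPLETED bit */
};

/* Charge an order-N allocation against the zone's fairness batch. */
static void charge_alloc_batch(struct toy_zone *z, unsigned int order)
{
    z->nr_alloc_batch -= 1L << order;
    if (z->nr_alloc_batch <= 0 && !z->fair_depleted)
        z->fair_depleted = true;   /* checked cheaply by the fair pass */
}

int main(void)
{
    struct toy_zone z = { .nr_alloc_batch = 8 };

    for (int i = 0; i < 5; i++) {
        charge_alloc_batch(&z, 1);   /* order-1: two pages per call */
        printf("batch=%ld depleted=%d\n", z.nr_alloc_batch, z.fair_depleted);
    }
    return 0;
}
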
@@ -1665,12 +1688,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1665 * Return true if free pages are above 'mark'. This takes into account the order 1688 * Return true if free pages are above 'mark'. This takes into account the order
1666 * of the allocation. 1689 * of the allocation.
1667 */ 1690 */
1668static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1691static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1669 int classzone_idx, int alloc_flags, long free_pages) 1692 unsigned long mark, int classzone_idx, int alloc_flags,
1693 long free_pages)
1670{ 1694{
1671 /* free_pages my go negative - that's OK */ 1695 /* free_pages my go negative - that's OK */
1671	/* free_pages may go negative - that's OK */	1695	/* free_pages may go negative - that's OK */
1672 long min = mark; 1696 long min = mark;
1673 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1674 int o; 1697 int o;
1675 long free_cma = 0; 1698 long free_cma = 0;
1676 1699
@@ -1685,7 +1708,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1685 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); 1708 free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
1686#endif 1709#endif
1687 1710
1688 if (free_pages - free_cma <= min + lowmem_reserve) 1711 if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
1689 return false; 1712 return false;
1690 for (o = 0; o < order; o++) { 1713 for (o = 0; o < order; o++) {
1691 /* At the next order, this order's pages become unavailable */ 1714 /* At the next order, this order's pages become unavailable */
@@ -1700,15 +1723,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1700 return true; 1723 return true;
1701} 1724}
1702 1725
1703bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1726bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
1704 int classzone_idx, int alloc_flags) 1727 int classzone_idx, int alloc_flags)
1705{ 1728{
1706 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1729 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1707 zone_page_state(z, NR_FREE_PAGES)); 1730 zone_page_state(z, NR_FREE_PAGES));
1708} 1731}
1709 1732
1710bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, 1733bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
1711 int classzone_idx, int alloc_flags) 1734 unsigned long mark, int classzone_idx, int alloc_flags)
1712{ 1735{
1713 long free_pages = zone_page_state(z, NR_FREE_PAGES); 1736 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1714 1737
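
The watermark helpers above switch to unsigned int order and read lowmem_reserve[] in place. The check itself is plain arithmetic over per-order free counts; a simplified, self-contained version follows, with the CMA adjustment and ALLOC_* flag handling deliberately left out.

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/* Per-order free page counts for one zone; index 0 is order-0. */
struct toy_zone {
    long nr_free[MAX_ORDER];
    long lowmem_reserve;
};

/* True if, after holding back 'mark' plus the lowmem reserve, an
 * order-'order' block could still be satisfied from higher orders. */
static bool watermark_ok(const struct toy_zone *z, unsigned int order, long mark)
{
    long free_pages = 0;
    long min = mark;

    for (unsigned int o = 0; o < MAX_ORDER; o++)
        free_pages += z->nr_free[o] << o;

    if (free_pages <= min + z->lowmem_reserve)
        return false;

    for (unsigned int o = 0; o < order; o++) {
        /* At the next order, this order's pages become unavailable. */
        free_pages -= z->nr_free[o] << o;
        min >>= 1;                 /* require less at higher orders */
        if (free_pages <= min)
            return false;
    }
    return true;
}

int main(void)
{
    struct toy_zone z = { .nr_free = { 512, 64, 16, 4 }, .lowmem_reserve = 32 };

    printf("order-0 ok: %d\n", watermark_ok(&z, 0, 128));
    printf("order-3 ok: %d\n", watermark_ok(&z, 3, 128));
    return 0;
}
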
@@ -1850,7 +1873,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
1850{ 1873{
1851 int i; 1874 int i;
1852 1875
1853 for_each_online_node(i) 1876 for_each_node_state(i, N_MEMORY)
1854 if (node_distance(nid, i) <= RECLAIM_DISTANCE) 1877 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1855 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1878 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1856 else 1879 else
@@ -1893,6 +1916,18 @@ static inline void init_zone_allows_reclaim(int nid)
1893} 1916}
1894#endif /* CONFIG_NUMA */ 1917#endif /* CONFIG_NUMA */
1895 1918
1919static void reset_alloc_batches(struct zone *preferred_zone)
1920{
1921 struct zone *zone = preferred_zone->zone_pgdat->node_zones;
1922
1923 do {
1924 mod_zone_page_state(zone, NR_ALLOC_BATCH,
1925 high_wmark_pages(zone) - low_wmark_pages(zone) -
1926 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
1927 zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
1928 } while (zone++ != preferred_zone);
1929}
1930
1896/* 1931/*
1897 * get_page_from_freelist goes through the zonelist trying to allocate 1932 * get_page_from_freelist goes through the zonelist trying to allocate
1898 * a page. 1933 * a page.
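
The new reset_alloc_batches() above walks the node's zone array from node_zones up to and including the preferred zone, topping each batch back up to high_wmark - low_wmark and clearing the depleted flag. A small sketch of that walk-to-the-preferred-element pattern; watermarks are plain numbers here and the batch is set directly rather than via a vmstat delta.

#include <stdio.h>
#include <stdbool.h>

struct toy_zone {
    long nr_alloc_batch;
    long high_wmark, low_wmark;
    bool fair_depleted;
};

/* Reset every zone from the start of the per-node array up to and
 * including the preferred zone, mirroring:
 *     do { ... } while (zone++ != preferred_zone);               */
static void reset_alloc_batches(struct toy_zone *node_zones,
                                struct toy_zone *preferred_zone)
{
    struct toy_zone *zone = node_zones;

    do {
        zone->nr_alloc_batch = zone->high_wmark - zone->low_wmark;
        zone->fair_depleted = false;
    } while (zone++ != preferred_zone);
}

int main(void)
{
    struct toy_zone zones[3] = {
        { .nr_alloc_batch = 0, .high_wmark = 100, .low_wmark = 60,  .fair_depleted = true },
        { .nr_alloc_batch = 3, .high_wmark = 400, .low_wmark = 250, .fair_depleted = true },
        { .nr_alloc_batch = 7, .high_wmark = 900, .low_wmark = 600 },
    };

    reset_alloc_batches(zones, &zones[1]);   /* preferred zone is index 1 */
    for (int i = 0; i < 3; i++)
        printf("zone%d batch=%ld depleted=%d\n",
               i, zones[i].nr_alloc_batch, zones[i].fair_depleted);
    return 0;
}
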
@@ -1900,18 +1935,22 @@ static inline void init_zone_allows_reclaim(int nid)
1900static struct page * 1935static struct page *
1901get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1936get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1902 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 1937 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1903 struct zone *preferred_zone, int migratetype) 1938 struct zone *preferred_zone, int classzone_idx, int migratetype)
1904{ 1939{
1905 struct zoneref *z; 1940 struct zoneref *z;
1906 struct page *page = NULL; 1941 struct page *page = NULL;
1907 int classzone_idx;
1908 struct zone *zone; 1942 struct zone *zone;
1909 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1943 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1910 int zlc_active = 0; /* set if using zonelist_cache */ 1944 int zlc_active = 0; /* set if using zonelist_cache */
1911 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1945 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1946 bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
1947 (gfp_mask & __GFP_WRITE);
1948 int nr_fair_skipped = 0;
1949 bool zonelist_rescan;
1912 1950
1913 classzone_idx = zone_idx(preferred_zone);
1914zonelist_scan: 1951zonelist_scan:
1952 zonelist_rescan = false;
1953
1915 /* 1954 /*
1916 * Scan zonelist, looking for a zone with enough free. 1955 * Scan zonelist, looking for a zone with enough free.
1917 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 1956 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1923,12 +1962,10 @@ zonelist_scan:
1923 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1962 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1924 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1963 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1925 continue; 1964 continue;
1926 if ((alloc_flags & ALLOC_CPUSET) && 1965 if (cpusets_enabled() &&
1966 (alloc_flags & ALLOC_CPUSET) &&
1927 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1967 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1928 continue; 1968 continue;
1929 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1930 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1931 goto try_this_zone;
1932 /* 1969 /*
1933 * Distribute pages in proportion to the individual 1970 * Distribute pages in proportion to the individual
1934 * zone size to ensure fair page aging. The zone a 1971 * zone size to ensure fair page aging. The zone a
@@ -1937,9 +1974,11 @@ zonelist_scan:
1937 */ 1974 */
1938 if (alloc_flags & ALLOC_FAIR) { 1975 if (alloc_flags & ALLOC_FAIR) {
1939 if (!zone_local(preferred_zone, zone)) 1976 if (!zone_local(preferred_zone, zone))
1977 break;
1978 if (zone_is_fair_depleted(zone)) {
1979 nr_fair_skipped++;
1940 continue; 1980 continue;
1941 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) 1981 }
1942 continue;
1943 } 1982 }
1944 /* 1983 /*
1945 * When allocating a page cache page for writing, we 1984 * When allocating a page cache page for writing, we
@@ -1967,15 +2006,19 @@ zonelist_scan:
1967 * will require awareness of zones in the 2006 * will require awareness of zones in the
1968 * dirty-throttling and the flusher threads. 2007 * dirty-throttling and the flusher threads.
1969 */ 2008 */
1970 if ((alloc_flags & ALLOC_WMARK_LOW) && 2009 if (consider_zone_dirty && !zone_dirty_ok(zone))
1971 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 2010 continue;
1972 goto this_zone_full;
1973 2011
1974 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 2012 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1975 if (!zone_watermark_ok(zone, order, mark, 2013 if (!zone_watermark_ok(zone, order, mark,
1976 classzone_idx, alloc_flags)) { 2014 classzone_idx, alloc_flags)) {
1977 int ret; 2015 int ret;
1978 2016
2017 /* Checked here to keep the fast path fast */
2018 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
2019 if (alloc_flags & ALLOC_NO_WATERMARKS)
2020 goto try_this_zone;
2021
1979 if (IS_ENABLED(CONFIG_NUMA) && 2022 if (IS_ENABLED(CONFIG_NUMA) &&
1980 !did_zlc_setup && nr_online_nodes > 1) { 2023 !did_zlc_setup && nr_online_nodes > 1) {
1981 /* 2024 /*
@@ -2037,17 +2080,11 @@ try_this_zone:
2037 if (page) 2080 if (page)
2038 break; 2081 break;
2039this_zone_full: 2082this_zone_full:
2040 if (IS_ENABLED(CONFIG_NUMA)) 2083 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2041 zlc_mark_zone_full(zonelist, z); 2084 zlc_mark_zone_full(zonelist, z);
2042 } 2085 }
2043 2086
2044 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { 2087 if (page) {
2045 /* Disable zlc cache for second zonelist scan */
2046 zlc_active = 0;
2047 goto zonelist_scan;
2048 }
2049
2050 if (page)
2051 /* 2088 /*
2052 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was 2089 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2053 * necessary to allocate the page. The expectation is 2090 * necessary to allocate the page. The expectation is
@@ -2056,8 +2093,37 @@ this_zone_full:
2056 * for !PFMEMALLOC purposes. 2093 * for !PFMEMALLOC purposes.
2057 */ 2094 */
2058 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); 2095 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2096 return page;
2097 }
2059 2098
2060 return page; 2099 /*
2100 * The first pass makes sure allocations are spread fairly within the
2101 * local node. However, the local node might have free pages left
2102 * after the fairness batches are exhausted, and remote zones haven't
2103 * even been considered yet. Try once more without fairness, and
2104 * include remote zones now, before entering the slowpath and waking
2105 * kswapd: prefer spilling to a remote zone over swapping locally.
2106 */
2107 if (alloc_flags & ALLOC_FAIR) {
2108 alloc_flags &= ~ALLOC_FAIR;
2109 if (nr_fair_skipped) {
2110 zonelist_rescan = true;
2111 reset_alloc_batches(preferred_zone);
2112 }
2113 if (nr_online_nodes > 1)
2114 zonelist_rescan = true;
2115 }
2116
2117 if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
2118 /* Disable zlc cache for second zonelist scan */
2119 zlc_active = 0;
2120 zonelist_rescan = true;
2121 }
2122
2123 if (zonelist_rescan)
2124 goto zonelist_scan;
2125
2126 return NULL;
2061} 2127}
2062 2128
2063/* 2129/*
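
get_page_from_freelist() is restructured above so a failed fair pass is retried: when local zones were skipped for being fair-depleted (or remote nodes exist), ALLOC_FAIR is dropped, the batches are reset, and the zonelist is rescanned before entering the slowpath. A compact model of that two-pass control flow; zones are reduced to a pair of counters and "allocating" is just decrementing them, which is only meant to show the rescan shape.

#include <stdio.h>
#include <stdbool.h>

struct toy_zone { long batch; long free; };

/* One pass over the zonelist.  With 'fair' set, depleted zones are
 * skipped and counted so the caller knows a rescan may help.       */
static struct toy_zone *scan(struct toy_zone *zones, int n, bool fair,
                             int *nr_fair_skipped)
{
    for (int i = 0; i < n; i++) {
        if (fair && zones[i].batch <= 0) {
            (*nr_fair_skipped)++;
            continue;
        }
        if (zones[i].free > 0) {
            zones[i].free--;
            zones[i].batch--;
            return &zones[i];
        }
    }
    return NULL;
}

static struct toy_zone *get_page(struct toy_zone *zones, int n)
{
    bool fair = true;

    for (;;) {
        int nr_fair_skipped = 0;
        struct toy_zone *z = scan(zones, n, fair, &nr_fair_skipped);

        if (z || !fair)
            return z;                    /* success, or both passes done */
        fair = false;                    /* second pass: ignore fairness */
        if (nr_fair_skipped)
            for (int i = 0; i < n; i++)  /* reset_alloc_batches()        */
                zones[i].batch = 4;
    }
}

int main(void)
{
    struct toy_zone zones[2] = { { .batch = 0, .free = 8 },
                                 { .batch = 0, .free = 8 } };
    struct toy_zone *z = get_page(zones, 2);

    printf("allocated from zone %ld\n", z ? (long)(z - zones) : -1L);
    return 0;
}
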
@@ -2173,7 +2239,7 @@ static inline struct page *
2173__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2239__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2174 struct zonelist *zonelist, enum zone_type high_zoneidx, 2240 struct zonelist *zonelist, enum zone_type high_zoneidx,
2175 nodemask_t *nodemask, struct zone *preferred_zone, 2241 nodemask_t *nodemask, struct zone *preferred_zone,
2176 int migratetype) 2242 int classzone_idx, int migratetype)
2177{ 2243{
2178 struct page *page; 2244 struct page *page;
2179 2245
@@ -2191,7 +2257,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2191 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2257 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
2192 order, zonelist, high_zoneidx, 2258 order, zonelist, high_zoneidx,
2193 ALLOC_WMARK_HIGH|ALLOC_CPUSET, 2259 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2194 preferred_zone, migratetype); 2260 preferred_zone, classzone_idx, migratetype);
2195 if (page) 2261 if (page)
2196 goto out; 2262 goto out;
2197 2263
@@ -2226,7 +2292,7 @@ static struct page *
2226__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2292__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2227 struct zonelist *zonelist, enum zone_type high_zoneidx, 2293 struct zonelist *zonelist, enum zone_type high_zoneidx,
2228 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2294 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2229 int migratetype, bool sync_migration, 2295 int classzone_idx, int migratetype, enum migrate_mode mode,
2230 bool *contended_compaction, bool *deferred_compaction, 2296 bool *contended_compaction, bool *deferred_compaction,
2231 unsigned long *did_some_progress) 2297 unsigned long *did_some_progress)
2232{ 2298{
@@ -2240,7 +2306,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2240 2306
2241 current->flags |= PF_MEMALLOC; 2307 current->flags |= PF_MEMALLOC;
2242 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2308 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2243 nodemask, sync_migration, 2309 nodemask, mode,
2244 contended_compaction); 2310 contended_compaction);
2245 current->flags &= ~PF_MEMALLOC; 2311 current->flags &= ~PF_MEMALLOC;
2246 2312
@@ -2254,13 +2320,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2254 page = get_page_from_freelist(gfp_mask, nodemask, 2320 page = get_page_from_freelist(gfp_mask, nodemask,
2255 order, zonelist, high_zoneidx, 2321 order, zonelist, high_zoneidx,
2256 alloc_flags & ~ALLOC_NO_WATERMARKS, 2322 alloc_flags & ~ALLOC_NO_WATERMARKS,
2257 preferred_zone, migratetype); 2323 preferred_zone, classzone_idx, migratetype);
2258 if (page) { 2324 if (page) {
2259 preferred_zone->compact_blockskip_flush = false; 2325 preferred_zone->compact_blockskip_flush = false;
2260 preferred_zone->compact_considered = 0; 2326 compaction_defer_reset(preferred_zone, order, true);
2261 preferred_zone->compact_defer_shift = 0;
2262 if (order >= preferred_zone->compact_order_failed)
2263 preferred_zone->compact_order_failed = order + 1;
2264 count_vm_event(COMPACTSUCCESS); 2327 count_vm_event(COMPACTSUCCESS);
2265 return page; 2328 return page;
2266 } 2329 }
@@ -2276,7 +2339,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2276 * As async compaction considers a subset of pageblocks, only 2339 * As async compaction considers a subset of pageblocks, only
2277 * defer if the failure was a sync compaction failure. 2340 * defer if the failure was a sync compaction failure.
2278 */ 2341 */
2279 if (sync_migration) 2342 if (mode != MIGRATE_ASYNC)
2280 defer_compaction(preferred_zone, order); 2343 defer_compaction(preferred_zone, order);
2281 2344
2282 cond_resched(); 2345 cond_resched();
@@ -2289,9 +2352,9 @@ static inline struct page *
2289__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2352__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2290 struct zonelist *zonelist, enum zone_type high_zoneidx, 2353 struct zonelist *zonelist, enum zone_type high_zoneidx,
2291 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2354 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2292 int migratetype, bool sync_migration, 2355 int classzone_idx, int migratetype,
2293 bool *contended_compaction, bool *deferred_compaction, 2356 enum migrate_mode mode, bool *contended_compaction,
2294 unsigned long *did_some_progress) 2357 bool *deferred_compaction, unsigned long *did_some_progress)
2295{ 2358{
2296 return NULL; 2359 return NULL;
2297} 2360}
@@ -2330,7 +2393,7 @@ static inline struct page *
2330__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2393__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2331 struct zonelist *zonelist, enum zone_type high_zoneidx, 2394 struct zonelist *zonelist, enum zone_type high_zoneidx,
2332 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2395 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2333 int migratetype, unsigned long *did_some_progress) 2396 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2334{ 2397{
2335 struct page *page = NULL; 2398 struct page *page = NULL;
2336 bool drained = false; 2399 bool drained = false;
@@ -2348,7 +2411,8 @@ retry:
2348 page = get_page_from_freelist(gfp_mask, nodemask, order, 2411 page = get_page_from_freelist(gfp_mask, nodemask, order,
2349 zonelist, high_zoneidx, 2412 zonelist, high_zoneidx,
2350 alloc_flags & ~ALLOC_NO_WATERMARKS, 2413 alloc_flags & ~ALLOC_NO_WATERMARKS,
2351 preferred_zone, migratetype); 2414 preferred_zone, classzone_idx,
2415 migratetype);
2352 2416
2353 /* 2417 /*
2354 * If an allocation failed after direct reclaim, it could be because 2418 * If an allocation failed after direct reclaim, it could be because
@@ -2371,14 +2435,14 @@ static inline struct page *
2371__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2435__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2372 struct zonelist *zonelist, enum zone_type high_zoneidx, 2436 struct zonelist *zonelist, enum zone_type high_zoneidx,
2373 nodemask_t *nodemask, struct zone *preferred_zone, 2437 nodemask_t *nodemask, struct zone *preferred_zone,
2374 int migratetype) 2438 int classzone_idx, int migratetype)
2375{ 2439{
2376 struct page *page; 2440 struct page *page;
2377 2441
2378 do { 2442 do {
2379 page = get_page_from_freelist(gfp_mask, nodemask, order, 2443 page = get_page_from_freelist(gfp_mask, nodemask, order,
2380 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2444 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
2381 preferred_zone, migratetype); 2445 preferred_zone, classzone_idx, migratetype);
2382 2446
2383 if (!page && gfp_mask & __GFP_NOFAIL) 2447 if (!page && gfp_mask & __GFP_NOFAIL)
2384 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2448 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
@@ -2387,28 +2451,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2387 return page; 2451 return page;
2388} 2452}
2389 2453
2390static void reset_alloc_batches(struct zonelist *zonelist,
2391 enum zone_type high_zoneidx,
2392 struct zone *preferred_zone)
2393{
2394 struct zoneref *z;
2395 struct zone *zone;
2396
2397 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2398 /*
2399 * Only reset the batches of zones that were actually
2400 * considered in the fairness pass, we don't want to
2401 * trash fairness information for zones that are not
2402 * actually part of this zonelist's round-robin cycle.
2403 */
2404 if (!zone_local(preferred_zone, zone))
2405 continue;
2406 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2407 high_wmark_pages(zone) - low_wmark_pages(zone) -
2408 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
2409 }
2410}
2411
2412static void wake_all_kswapds(unsigned int order, 2454static void wake_all_kswapds(unsigned int order,
2413 struct zonelist *zonelist, 2455 struct zonelist *zonelist,
2414 enum zone_type high_zoneidx, 2456 enum zone_type high_zoneidx,
@@ -2479,14 +2521,14 @@ static inline struct page *
2479__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2521__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2480 struct zonelist *zonelist, enum zone_type high_zoneidx, 2522 struct zonelist *zonelist, enum zone_type high_zoneidx,
2481 nodemask_t *nodemask, struct zone *preferred_zone, 2523 nodemask_t *nodemask, struct zone *preferred_zone,
2482 int migratetype) 2524 int classzone_idx, int migratetype)
2483{ 2525{
2484 const gfp_t wait = gfp_mask & __GFP_WAIT; 2526 const gfp_t wait = gfp_mask & __GFP_WAIT;
2485 struct page *page = NULL; 2527 struct page *page = NULL;
2486 int alloc_flags; 2528 int alloc_flags;
2487 unsigned long pages_reclaimed = 0; 2529 unsigned long pages_reclaimed = 0;
2488 unsigned long did_some_progress; 2530 unsigned long did_some_progress;
2489 bool sync_migration = false; 2531 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2490 bool deferred_compaction = false; 2532 bool deferred_compaction = false;
2491 bool contended_compaction = false; 2533 bool contended_compaction = false;
2492 2534
@@ -2528,15 +2570,19 @@ restart:
2528 * Find the true preferred zone if the allocation is unconstrained by 2570 * Find the true preferred zone if the allocation is unconstrained by
2529 * cpusets. 2571 * cpusets.
2530 */ 2572 */
2531 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) 2573 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
2532 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2574 struct zoneref *preferred_zoneref;
2533 &preferred_zone); 2575 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2576 NULL,
2577 &preferred_zone);
2578 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2579 }
2534 2580
2535rebalance: 2581rebalance:
2536 /* This is the last chance, in general, before the goto nopage. */ 2582 /* This is the last chance, in general, before the goto nopage. */
2537 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2583 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2538 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2584 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
2539 preferred_zone, migratetype); 2585 preferred_zone, classzone_idx, migratetype);
2540 if (page) 2586 if (page)
2541 goto got_pg; 2587 goto got_pg;
2542 2588
@@ -2551,7 +2597,7 @@ rebalance:
2551 2597
2552 page = __alloc_pages_high_priority(gfp_mask, order, 2598 page = __alloc_pages_high_priority(gfp_mask, order,
2553 zonelist, high_zoneidx, nodemask, 2599 zonelist, high_zoneidx, nodemask,
2554 preferred_zone, migratetype); 2600 preferred_zone, classzone_idx, migratetype);
2555 if (page) { 2601 if (page) {
2556 goto got_pg; 2602 goto got_pg;
2557 } 2603 }
@@ -2573,17 +2619,16 @@ rebalance:
2573 * Try direct compaction. The first pass is asynchronous. Subsequent 2619 * Try direct compaction. The first pass is asynchronous. Subsequent
2574 * attempts after direct reclaim are synchronous 2620 * attempts after direct reclaim are synchronous
2575 */ 2621 */
2576 page = __alloc_pages_direct_compact(gfp_mask, order, 2622 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2577 zonelist, high_zoneidx, 2623 high_zoneidx, nodemask, alloc_flags,
2578 nodemask, 2624 preferred_zone,
2579 alloc_flags, preferred_zone, 2625 classzone_idx, migratetype,
2580 migratetype, sync_migration, 2626 migration_mode, &contended_compaction,
2581 &contended_compaction,
2582 &deferred_compaction, 2627 &deferred_compaction,
2583 &did_some_progress); 2628 &did_some_progress);
2584 if (page) 2629 if (page)
2585 goto got_pg; 2630 goto got_pg;
2586 sync_migration = true; 2631 migration_mode = MIGRATE_SYNC_LIGHT;
2587 2632
2588 /* 2633 /*
2589 * If compaction is deferred for high-order allocations, it is because 2634 * If compaction is deferred for high-order allocations, it is because
@@ -2600,7 +2645,8 @@ rebalance:
2600 zonelist, high_zoneidx, 2645 zonelist, high_zoneidx,
2601 nodemask, 2646 nodemask,
2602 alloc_flags, preferred_zone, 2647 alloc_flags, preferred_zone,
2603 migratetype, &did_some_progress); 2648 classzone_idx, migratetype,
2649 &did_some_progress);
2604 if (page) 2650 if (page)
2605 goto got_pg; 2651 goto got_pg;
2606 2652
@@ -2619,7 +2665,7 @@ rebalance:
2619 page = __alloc_pages_may_oom(gfp_mask, order, 2665 page = __alloc_pages_may_oom(gfp_mask, order,
2620 zonelist, high_zoneidx, 2666 zonelist, high_zoneidx,
2621 nodemask, preferred_zone, 2667 nodemask, preferred_zone,
2622 migratetype); 2668 classzone_idx, migratetype);
2623 if (page) 2669 if (page)
2624 goto got_pg; 2670 goto got_pg;
2625 2671
@@ -2658,12 +2704,11 @@ rebalance:
2658 * direct reclaim and reclaim/compaction depends on compaction 2704 * direct reclaim and reclaim/compaction depends on compaction
2659 * being called after reclaim so call directly if necessary 2705 * being called after reclaim so call directly if necessary
2660 */ 2706 */
2661 page = __alloc_pages_direct_compact(gfp_mask, order, 2707 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
2662 zonelist, high_zoneidx, 2708 high_zoneidx, nodemask, alloc_flags,
2663 nodemask, 2709 preferred_zone,
2664 alloc_flags, preferred_zone, 2710 classzone_idx, migratetype,
2665 migratetype, sync_migration, 2711 migration_mode, &contended_compaction,
2666 &contended_compaction,
2667 &deferred_compaction, 2712 &deferred_compaction,
2668 &did_some_progress); 2713 &did_some_progress);
2669 if (page) 2714 if (page)
@@ -2689,11 +2734,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2689{ 2734{
2690 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2735 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2691 struct zone *preferred_zone; 2736 struct zone *preferred_zone;
2737 struct zoneref *preferred_zoneref;
2692 struct page *page = NULL; 2738 struct page *page = NULL;
2693 int migratetype = allocflags_to_migratetype(gfp_mask); 2739 int migratetype = allocflags_to_migratetype(gfp_mask);
2694 unsigned int cpuset_mems_cookie; 2740 unsigned int cpuset_mems_cookie;
2695 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2741 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2696 struct mem_cgroup *memcg = NULL; 2742 struct mem_cgroup *memcg = NULL;
2743 int classzone_idx;
2697 2744
2698 gfp_mask &= gfp_allowed_mask; 2745 gfp_mask &= gfp_allowed_mask;
2699 2746
@@ -2720,42 +2767,26 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2720 return NULL; 2767 return NULL;
2721 2768
2722retry_cpuset: 2769retry_cpuset:
2723 cpuset_mems_cookie = get_mems_allowed(); 2770 cpuset_mems_cookie = read_mems_allowed_begin();
2724 2771
2725 /* The preferred zone is used for statistics later */ 2772 /* The preferred zone is used for statistics later */
2726 first_zones_zonelist(zonelist, high_zoneidx, 2773 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
2727 nodemask ? : &cpuset_current_mems_allowed, 2774 nodemask ? : &cpuset_current_mems_allowed,
2728 &preferred_zone); 2775 &preferred_zone);
2729 if (!preferred_zone) 2776 if (!preferred_zone)
2730 goto out; 2777 goto out;
2778 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2731 2779
2732#ifdef CONFIG_CMA 2780#ifdef CONFIG_CMA
2733 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2781 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2734 alloc_flags |= ALLOC_CMA; 2782 alloc_flags |= ALLOC_CMA;
2735#endif 2783#endif
2736retry:
2737 /* First allocation attempt */ 2784 /* First allocation attempt */
2738 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2785 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2739 zonelist, high_zoneidx, alloc_flags, 2786 zonelist, high_zoneidx, alloc_flags,
2740 preferred_zone, migratetype); 2787 preferred_zone, classzone_idx, migratetype);
2741 if (unlikely(!page)) { 2788 if (unlikely(!page)) {
2742 /* 2789 /*
2743 * The first pass makes sure allocations are spread
2744 * fairly within the local node. However, the local
2745 * node might have free pages left after the fairness
2746 * batches are exhausted, and remote zones haven't
2747 * even been considered yet. Try once more without
2748 * fairness, and include remote zones now, before
2749 * entering the slowpath and waking kswapd: prefer
2750 * spilling to a remote zone over swapping locally.
2751 */
2752 if (alloc_flags & ALLOC_FAIR) {
2753 reset_alloc_batches(zonelist, high_zoneidx,
2754 preferred_zone);
2755 alloc_flags &= ~ALLOC_FAIR;
2756 goto retry;
2757 }
2758 /*
2759 * Runtime PM, block IO and its error handling path 2790 * Runtime PM, block IO and its error handling path
2760 * can deadlock because I/O on the device might not 2791 * can deadlock because I/O on the device might not
2761 * complete. 2792 * complete.
@@ -2763,7 +2794,7 @@ retry:
2763 gfp_mask = memalloc_noio_flags(gfp_mask); 2794 gfp_mask = memalloc_noio_flags(gfp_mask);
2764 page = __alloc_pages_slowpath(gfp_mask, order, 2795 page = __alloc_pages_slowpath(gfp_mask, order,
2765 zonelist, high_zoneidx, nodemask, 2796 zonelist, high_zoneidx, nodemask,
2766 preferred_zone, migratetype); 2797 preferred_zone, classzone_idx, migratetype);
2767 } 2798 }
2768 2799
2769 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2800 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -2775,7 +2806,7 @@ out:
2775 * the mask is being updated. If a page allocation is about to fail, 2806 * the mask is being updated. If a page allocation is about to fail,
2776 * check if the cpuset changed during allocation and if so, retry. 2807 * check if the cpuset changed during allocation and if so, retry.
2777 */ 2808 */
2778 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 2809 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2779 goto retry_cpuset; 2810 goto retry_cpuset;
2780 2811
2781 memcg_kmem_commit_charge(page, memcg, order); 2812 memcg_kmem_commit_charge(page, memcg, order);
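
The cpuset interaction above moves from get_mems_allowed()/put_mems_allowed() to a read_mems_allowed_begin()/read_mems_allowed_retry() pair, which behaves like the read side of a sequence counter: redo the allocation only if it failed and the allowed-node mask changed underneath it. A single-threaded user-space sketch of that retry idiom; a plain unsigned counter stands in for the kernel seqcount and the "writer" is simulated inline.

#include <stdio.h>
#include <stdbool.h>

/* Even value = stable, odd value = writer in progress (seqcount style). */
static unsigned int mems_seq;
static unsigned long mems_allowed = 0x3;   /* toy nodemask */

static unsigned int read_begin(void) { return mems_seq; }
static bool read_retry(unsigned int cookie) { return mems_seq != cookie; }

static void writer_update_mask(unsigned long new_mask)
{
    mems_seq++;                /* begin write */
    mems_allowed = new_mask;
    mems_seq++;                /* end write   */
}

/* Returns 0 on success, -1 if the node is not currently allowed. */
static int try_alloc(int nid) { return (mems_allowed & (1UL << nid)) ? 0 : -1; }

int main(void)
{
    int nid = 2, ret;
    unsigned int cookie;

    do {
        cookie = read_begin();
        ret = try_alloc(nid);
        if (ret == 0)
            break;
        /* Simulate a concurrent cpuset update widening the mask. */
        writer_update_mask(0x7);
    } while (read_retry(cookie));   /* retry only if the mask changed */

    printf("allocation %s\n", ret == 0 ? "succeeded" : "failed");
    return 0;
}
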
@@ -2814,7 +2845,7 @@ void __free_pages(struct page *page, unsigned int order)
2814{ 2845{
2815 if (put_page_testzero(page)) { 2846 if (put_page_testzero(page)) {
2816 if (order == 0) 2847 if (order == 0)
2817 free_hot_cold_page(page, 0); 2848 free_hot_cold_page(page, false);
2818 else 2849 else
2819 __free_pages_ok(page, order); 2850 __free_pages_ok(page, order);
2820 } 2851 }
@@ -3043,9 +3074,9 @@ bool skip_free_areas_node(unsigned int flags, int nid)
3043 goto out; 3074 goto out;
3044 3075
3045 do { 3076 do {
3046 cpuset_mems_cookie = get_mems_allowed(); 3077 cpuset_mems_cookie = read_mems_allowed_begin();
3047 ret = !node_isset(nid, cpuset_current_mems_allowed); 3078 ret = !node_isset(nid, cpuset_current_mems_allowed);
3048 } while (!put_mems_allowed(cpuset_mems_cookie)); 3079 } while (read_mems_allowed_retry(cpuset_mems_cookie));
3049out: 3080out:
3050 return ret; 3081 return ret;
3051} 3082}
@@ -3198,12 +3229,12 @@ void show_free_areas(unsigned int filter)
3198 K(zone_page_state(zone, NR_BOUNCE)), 3229 K(zone_page_state(zone, NR_BOUNCE)),
3199 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3230 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3200 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3231 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3201 zone->pages_scanned, 3232 K(zone_page_state(zone, NR_PAGES_SCANNED)),
3202 (!zone_reclaimable(zone) ? "yes" : "no") 3233 (!zone_reclaimable(zone) ? "yes" : "no")
3203 ); 3234 );
3204 printk("lowmem_reserve[]:"); 3235 printk("lowmem_reserve[]:");
3205 for (i = 0; i < MAX_NR_ZONES; i++) 3236 for (i = 0; i < MAX_NR_ZONES; i++)
3206 printk(" %lu", zone->lowmem_reserve[i]); 3237 printk(" %ld", zone->lowmem_reserve[i]);
3207 printk("\n"); 3238 printk("\n");
3208 } 3239 }
3209 3240
@@ -3943,6 +3974,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3943 struct page *page; 3974 struct page *page;
3944 unsigned long block_migratetype; 3975 unsigned long block_migratetype;
3945 int reserve; 3976 int reserve;
3977 int old_reserve;
3946 3978
3947 /* 3979 /*
3948 * Get the start pfn, end pfn and the number of blocks to reserve 3980 * Get the start pfn, end pfn and the number of blocks to reserve
@@ -3964,6 +3996,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3964 * future allocation of hugepages at runtime. 3996 * future allocation of hugepages at runtime.
3965 */ 3997 */
3966 reserve = min(2, reserve); 3998 reserve = min(2, reserve);
3999 old_reserve = zone->nr_migrate_reserve_block;
4000
4001 /* When memory hot-add, we almost always need to do nothing */
4002 if (reserve == old_reserve)
4003 return;
4004 zone->nr_migrate_reserve_block = reserve;
3967 4005
3968 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 4006 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
3969 if (!pfn_valid(pfn)) 4007 if (!pfn_valid(pfn))
@@ -4001,6 +4039,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
4001 reserve--; 4039 reserve--;
4002 continue; 4040 continue;
4003 } 4041 }
4042 } else if (!old_reserve) {
4043 /*
4044 * At boot time we don't need to scan the whole zone
4045 * for turning off MIGRATE_RESERVE.
4046 */
4047 break;
4004 } 4048 }
4005 4049
4006 /* 4050 /*
@@ -4080,7 +4124,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4080 4124
4081static void __meminit zone_init_free_lists(struct zone *zone) 4125static void __meminit zone_init_free_lists(struct zone *zone)
4082{ 4126{
4083 int order, t; 4127 unsigned int order, t;
4084 for_each_migratetype_order(order, t) { 4128 for_each_migratetype_order(order, t) {
4085 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); 4129 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
4086 zone->free_area[order].nr_free = 0; 4130 zone->free_area[order].nr_free = 0;
@@ -4903,7 +4947,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4903 4947
4904 pgdat->node_id = nid; 4948 pgdat->node_id = nid;
4905 pgdat->node_start_pfn = node_start_pfn; 4949 pgdat->node_start_pfn = node_start_pfn;
4906 init_zone_allows_reclaim(nid); 4950 if (node_state(nid, N_MEMORY))
4951 init_zone_allows_reclaim(nid);
4907#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4952#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4908 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 4953 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4909#endif 4954#endif
@@ -5492,7 +5537,7 @@ static void calculate_totalreserve_pages(void)
5492 for_each_online_pgdat(pgdat) { 5537 for_each_online_pgdat(pgdat) {
5493 for (i = 0; i < MAX_NR_ZONES; i++) { 5538 for (i = 0; i < MAX_NR_ZONES; i++) {
5494 struct zone *zone = pgdat->node_zones + i; 5539 struct zone *zone = pgdat->node_zones + i;
5495 unsigned long max = 0; 5540 long max = 0;
5496 5541
5497 /* Find valid and maximum lowmem_reserve in the zone */ 5542 /* Find valid and maximum lowmem_reserve in the zone */
5498 for (j = i; j < MAX_NR_ZONES; j++) { 5543 for (j = i; j < MAX_NR_ZONES; j++) {
@@ -5734,7 +5779,12 @@ module_init(init_per_zone_wmark_min)
5734int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5779int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5735 void __user *buffer, size_t *length, loff_t *ppos) 5780 void __user *buffer, size_t *length, loff_t *ppos)
5736{ 5781{
5737 proc_dointvec(table, write, buffer, length, ppos); 5782 int rc;
5783
5784 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
5785 if (rc)
5786 return rc;
5787
5738 if (write) { 5788 if (write) {
5739 user_min_free_kbytes = min_free_kbytes; 5789 user_min_free_kbytes = min_free_kbytes;
5740 setup_per_zone_wmarks(); 5790 setup_per_zone_wmarks();
@@ -5976,17 +6026,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
5976 * @end_bitidx: The last bit of interest 6026 * @end_bitidx: The last bit of interest
5977 * returns pageblock_bits flags 6027 * returns pageblock_bits flags
5978 */ 6028 */
5979unsigned long get_pageblock_flags_mask(struct page *page, 6029unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
5980 unsigned long end_bitidx, 6030 unsigned long end_bitidx,
5981 unsigned long mask) 6031 unsigned long mask)
5982{ 6032{
5983 struct zone *zone; 6033 struct zone *zone;
5984 unsigned long *bitmap; 6034 unsigned long *bitmap;
5985 unsigned long pfn, bitidx, word_bitidx; 6035 unsigned long bitidx, word_bitidx;
5986 unsigned long word; 6036 unsigned long word;
5987 6037
5988 zone = page_zone(page); 6038 zone = page_zone(page);
5989 pfn = page_to_pfn(page);
5990 bitmap = get_pageblock_bitmap(zone, pfn); 6039 bitmap = get_pageblock_bitmap(zone, pfn);
5991 bitidx = pfn_to_bitidx(zone, pfn); 6040 bitidx = pfn_to_bitidx(zone, pfn);
5992 word_bitidx = bitidx / BITS_PER_LONG; 6041 word_bitidx = bitidx / BITS_PER_LONG;
@@ -5998,25 +6047,25 @@ unsigned long get_pageblock_flags_mask(struct page *page,
5998} 6047}
5999 6048
6000/** 6049/**
6001 * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages 6050 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
6002 * @page: The page within the block of interest 6051 * @page: The page within the block of interest
6003 * @start_bitidx: The first bit of interest 6052 * @start_bitidx: The first bit of interest
6004 * @end_bitidx: The last bit of interest 6053 * @end_bitidx: The last bit of interest
6005 * @flags: The flags to set 6054 * @flags: The flags to set
6006 */ 6055 */
6007void set_pageblock_flags_mask(struct page *page, unsigned long flags, 6056void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6057 unsigned long pfn,
6008 unsigned long end_bitidx, 6058 unsigned long end_bitidx,
6009 unsigned long mask) 6059 unsigned long mask)
6010{ 6060{
6011 struct zone *zone; 6061 struct zone *zone;
6012 unsigned long *bitmap; 6062 unsigned long *bitmap;
6013 unsigned long pfn, bitidx, word_bitidx; 6063 unsigned long bitidx, word_bitidx;
6014 unsigned long old_word, word; 6064 unsigned long old_word, word;
6015 6065
6016 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); 6066 BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
6017 6067
6018 zone = page_zone(page); 6068 zone = page_zone(page);
6019 pfn = page_to_pfn(page);
6020 bitmap = get_pageblock_bitmap(zone, pfn); 6069 bitmap = get_pageblock_bitmap(zone, pfn);
6021 bitidx = pfn_to_bitidx(zone, pfn); 6070 bitidx = pfn_to_bitidx(zone, pfn);
6022 word_bitidx = bitidx / BITS_PER_LONG; 6071 word_bitidx = bitidx / BITS_PER_LONG;
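
get_pfnblock_flags_mask()/set_pfnblock_flags_mask() above now take the pfn from the caller and operate on one word of the pageblock bitmap through a mask. The sketch below shows the bit-slicing arithmetic in isolation; it assumes four bits per pageblock, a flat bitmap, and a plain store where the kernel uses a cmpxchg loop, so it is an illustration rather than the real helpers.

#include <stdio.h>

#define BITS_PER_LONG      (8 * (int)sizeof(unsigned long))
#define NR_PAGEBLOCK_BITS  4            /* migratetype bits + skip bit */
#define PAGEBLOCK_ORDER    9            /* 2^9 pages per pageblock     */

static unsigned long bitmap[4];         /* flat toy pageblock bitmap   */

static unsigned long pfn_to_bitidx(unsigned long pfn)
{
    return (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
}

static unsigned long get_block_flags(unsigned long pfn, unsigned long mask)
{
    unsigned long bitidx = pfn_to_bitidx(pfn);
    unsigned long word = bitmap[bitidx / BITS_PER_LONG];

    bitidx &= (BITS_PER_LONG - 1);
    return (word >> bitidx) & mask;
}

static void set_block_flags(unsigned long pfn, unsigned long flags,
                            unsigned long mask)
{
    unsigned long bitidx = pfn_to_bitidx(pfn);
    unsigned long *word = &bitmap[bitidx / BITS_PER_LONG];

    bitidx &= (BITS_PER_LONG - 1);
    *word = (*word & ~(mask << bitidx)) | ((flags & mask) << bitidx);
}

int main(void)
{
    unsigned long pfn = 3 << PAGEBLOCK_ORDER;   /* any pfn in block 3 */

    set_block_flags(pfn, 0x2, 0x7);             /* migratetype = 2    */
    printf("block flags = %lu\n", get_block_flags(pfn + 17, 0x7));
    return 0;
}
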
@@ -6194,7 +6243,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
6194 cc->nr_migratepages -= nr_reclaimed; 6243 cc->nr_migratepages -= nr_reclaimed;
6195 6244
6196 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 6245 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
6197 0, MIGRATE_SYNC, MR_CMA); 6246 NULL, 0, cc->mode, MR_CMA);
6198 } 6247 }
6199 if (ret < 0) { 6248 if (ret < 0) {
6200 putback_movable_pages(&cc->migratepages); 6249 putback_movable_pages(&cc->migratepages);
@@ -6233,7 +6282,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6233 .nr_migratepages = 0, 6282 .nr_migratepages = 0,
6234 .order = -1, 6283 .order = -1,
6235 .zone = page_zone(pfn_to_page(start)), 6284 .zone = page_zone(pfn_to_page(start)),
6236 .sync = true, 6285 .mode = MIGRATE_SYNC,
6237 .ignore_skip_hint = true, 6286 .ignore_skip_hint = true,
6238 }; 6287 };
6239 INIT_LIST_HEAD(&cc.migratepages); 6288 INIT_LIST_HEAD(&cc.migratepages);
@@ -6388,7 +6437,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6388{ 6437{
6389 struct page *page; 6438 struct page *page;
6390 struct zone *zone; 6439 struct zone *zone;
6391 int order, i; 6440 unsigned int order, i;
6392 unsigned long pfn; 6441 unsigned long pfn;
6393 unsigned long flags; 6442 unsigned long flags;
6394 /* find the first valid pfn */ 6443 /* find the first valid pfn */
@@ -6440,7 +6489,7 @@ bool is_free_buddy_page(struct page *page)
6440 struct zone *zone = page_zone(page); 6489 struct zone *zone = page_zone(page);
6441 unsigned long pfn = page_to_pfn(page); 6490 unsigned long pfn = page_to_pfn(page);
6442 unsigned long flags; 6491 unsigned long flags;
6443 int order; 6492 unsigned int order;
6444 6493
6445 spin_lock_irqsave(&zone->lock, flags); 6494 spin_lock_irqsave(&zone->lock, flags);
6446 for (order = 0; order < MAX_ORDER; order++) { 6495 for (order = 0; order < MAX_ORDER; order++) {
diff --git a/mm/readahead.c b/mm/readahead.c
index e4ed04149785..0f35e983bffb 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,9 +8,7 @@
8 */ 8 */
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/fs.h>
12#include <linux/gfp.h> 11#include <linux/gfp.h>
13#include <linux/mm.h>
14#include <linux/export.h> 12#include <linux/export.h>
15#include <linux/blkdev.h> 13#include <linux/blkdev.h>
16#include <linux/backing-dev.h> 14#include <linux/backing-dev.h>
@@ -20,6 +18,8 @@
20#include <linux/syscalls.h> 18#include <linux/syscalls.h>
21#include <linux/file.h> 19#include <linux/file.h>
22 20
21#include "internal.h"
22
23/* 23/*
24 * Initialise a struct file's readahead state. Assumes that the caller has 24 * Initialise a struct file's readahead state. Assumes that the caller has
25 * memset *ra to zero. 25 * memset *ra to zero.
@@ -149,8 +149,7 @@ out:
149 * 149 *
150 * Returns the number of pages requested, or the maximum amount of I/O allowed. 150 * Returns the number of pages requested, or the maximum amount of I/O allowed.
151 */ 151 */
152static int 152int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
153__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
154 pgoff_t offset, unsigned long nr_to_read, 153 pgoff_t offset, unsigned long nr_to_read,
155 unsigned long lookahead_size) 154 unsigned long lookahead_size)
156{ 155{
@@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
179 rcu_read_lock(); 178 rcu_read_lock();
180 page = radix_tree_lookup(&mapping->page_tree, page_offset); 179 page = radix_tree_lookup(&mapping->page_tree, page_offset);
181 rcu_read_unlock(); 180 rcu_read_unlock();
182 if (page) 181 if (page && !radix_tree_exceptional_entry(page))
183 continue; 182 continue;
184 183
185 page = page_cache_alloc_readahead(mapping); 184 page = page_cache_alloc_readahead(mapping);
@@ -237,28 +236,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
237 return ret; 236 return ret;
238} 237}
239 238
239#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE)
240/* 240/*
241 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a 241 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
242 * sensible upper limit. 242 * sensible upper limit.
243 */ 243 */
244unsigned long max_sane_readahead(unsigned long nr) 244unsigned long max_sane_readahead(unsigned long nr)
245{ 245{
246 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) 246 return min(nr, MAX_READAHEAD);
247 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
248}
249
250/*
251 * Submit IO for the read-ahead request in file_ra_state.
252 */
253unsigned long ra_submit(struct file_ra_state *ra,
254 struct address_space *mapping, struct file *filp)
255{
256 int actual;
257
258 actual = __do_page_cache_readahead(mapping, filp,
259 ra->start, ra->size, ra->async_size);
260
261 return actual;
262} 247}
263 248
264/* 249/*
@@ -351,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping,
351 pgoff_t head; 336 pgoff_t head;
352 337
353 rcu_read_lock(); 338 rcu_read_lock();
354 head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); 339 head = page_cache_prev_hole(mapping, offset - 1, max);
355 rcu_read_unlock(); 340 rcu_read_unlock();
356 341
357 return offset - 1 - head; 342 return offset - 1 - head;
@@ -401,6 +386,7 @@ ondemand_readahead(struct address_space *mapping,
401 unsigned long req_size) 386 unsigned long req_size)
402{ 387{
403 unsigned long max = max_sane_readahead(ra->ra_pages); 388 unsigned long max = max_sane_readahead(ra->ra_pages);
389 pgoff_t prev_offset;
404 390
405 /* 391 /*
406 * start of file 392 * start of file
@@ -430,7 +416,7 @@ ondemand_readahead(struct address_space *mapping,
430 pgoff_t start; 416 pgoff_t start;
431 417
432 rcu_read_lock(); 418 rcu_read_lock();
433 start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); 419 start = page_cache_next_hole(mapping, offset + 1, max);
434 rcu_read_unlock(); 420 rcu_read_unlock();
435 421
436 if (!start || start - offset > max) 422 if (!start || start - offset > max)
@@ -452,8 +438,11 @@ ondemand_readahead(struct address_space *mapping,
452 438
453 /* 439 /*
454 * sequential cache miss 440 * sequential cache miss
441 * trivial case: (offset - prev_offset) == 1
442 * unaligned reads: (offset - prev_offset) == 0
455 */ 443 */
456 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) 444 prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
445 if (offset - prev_offset <= 1UL)
457 goto initial_readahead; 446 goto initial_readahead;
458 447
459 /* 448 /*
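
The readahead change above caches ra->prev_pos >> PAGE_CACHE_SHIFT in a pgoff_t before comparing, so the 64-bit previous position is reduced to a page index once, and the "sequential cache miss" test explicitly covers both the trivial next-page case and unaligned reads on the same page. A small sketch of that test, assuming 4 KiB pages and plain integers in place of file_ra_state:

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT 12   /* 4 KiB pages assumed */

/* prev_pos is the byte offset of the last read we saw; offset is the
 * page index that just missed the cache.  A miss on the same page
 * (unaligned reads) or the next page counts as sequential.          */
static bool sequential_miss(unsigned long long prev_pos, unsigned long offset)
{
    unsigned long prev_offset = (unsigned long)(prev_pos >> PAGE_SHIFT);

    return offset - prev_offset <= 1UL;
}

int main(void)
{
    printf("%d\n", sequential_miss(4096ULL * 10 + 100, 11)); /* next page     */
    printf("%d\n", sequential_miss(4096ULL * 10 + 100, 10)); /* same page     */
    printf("%d\n", sequential_miss(4096ULL * 10 + 100, 40)); /* random access */
    return 0;
}
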
diff --git a/mm/shmem.c b/mm/shmem.c
index 0da81aaeb4cc..ab05681f41cd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -243,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
243 pgoff_t index, void *expected, void *replacement) 243 pgoff_t index, void *expected, void *replacement)
244{ 244{
245 void **pslot; 245 void **pslot;
246 void *item = NULL; 246 void *item;
247 247
248 VM_BUG_ON(!expected); 248 VM_BUG_ON(!expected);
249 VM_BUG_ON(!replacement);
249 pslot = radix_tree_lookup_slot(&mapping->page_tree, index); 250 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
250 if (pslot) 251 if (!pslot)
251 item = radix_tree_deref_slot_protected(pslot, 252 return -ENOENT;
252 &mapping->tree_lock); 253 item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
253 if (item != expected) 254 if (item != expected)
254 return -ENOENT; 255 return -ENOENT;
255 if (replacement) 256 radix_tree_replace_slot(pslot, replacement);
256 radix_tree_replace_slot(pslot, replacement);
257 else
258 radix_tree_delete(&mapping->page_tree, index);
259 return 0; 257 return 0;
260} 258}
261 259
@@ -332,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
332} 330}
333 331
334/* 332/*
335 * Like find_get_pages, but collecting swap entries as well as pages.
336 */
337static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
338 pgoff_t start, unsigned int nr_pages,
339 struct page **pages, pgoff_t *indices)
340{
341 void **slot;
342 unsigned int ret = 0;
343 struct radix_tree_iter iter;
344
345 if (!nr_pages)
346 return 0;
347
348 rcu_read_lock();
349restart:
350 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
351 struct page *page;
352repeat:
353 page = radix_tree_deref_slot(slot);
354 if (unlikely(!page))
355 continue;
356 if (radix_tree_exception(page)) {
357 if (radix_tree_deref_retry(page))
358 goto restart;
359 /*
360 * Otherwise, we must be storing a swap entry
361 * here as an exceptional entry: so return it
362 * without attempting to raise page count.
363 */
364 goto export;
365 }
366 if (!page_cache_get_speculative(page))
367 goto repeat;
368
369 /* Has the page moved? */
370 if (unlikely(page != *slot)) {
371 page_cache_release(page);
372 goto repeat;
373 }
374export:
375 indices[ret] = iter.index;
376 pages[ret] = page;
377 if (++ret == nr_pages)
378 break;
379 }
380 rcu_read_unlock();
381 return ret;
382}
383
384/*
385 * Remove swap entry from radix tree, free the swap and its page cache. 333 * Remove swap entry from radix tree, free the swap and its page cache.
386 */ 334 */
387static int shmem_free_swap(struct address_space *mapping, 335static int shmem_free_swap(struct address_space *mapping,
388 pgoff_t index, void *radswap) 336 pgoff_t index, void *radswap)
389{ 337{
390 int error; 338 void *old;
391 339
392 spin_lock_irq(&mapping->tree_lock); 340 spin_lock_irq(&mapping->tree_lock);
393 error = shmem_radix_tree_replace(mapping, index, radswap, NULL); 341 old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
394 spin_unlock_irq(&mapping->tree_lock); 342 spin_unlock_irq(&mapping->tree_lock);
395 if (!error) 343 if (old != radswap)
396 free_swap_and_cache(radix_to_swp_entry(radswap)); 344 return -ENOENT;
397 return error; 345 free_swap_and_cache(radix_to_swp_entry(radswap));
398} 346 return 0;
399
400/*
401 * Pagevec may contain swap entries, so shuffle up pages before releasing.
402 */
403static void shmem_deswap_pagevec(struct pagevec *pvec)
404{
405 int i, j;
406
407 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
408 struct page *page = pvec->pages[i];
409 if (!radix_tree_exceptional_entry(page))
410 pvec->pages[j++] = page;
411 }
412 pvec->nr = j;
413} 347}
414 348
415/* 349/*
@@ -430,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
430 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it 364 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
431 * has finished, if it hits a row of PAGEVEC_SIZE swap entries. 365 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
432 */ 366 */
433 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 367 pvec.nr = find_get_entries(mapping, index,
434 PAGEVEC_SIZE, pvec.pages, indices); 368 PAGEVEC_SIZE, pvec.pages, indices);
435 if (!pvec.nr) 369 if (!pvec.nr)
436 break; 370 break;
437 index = indices[pvec.nr - 1] + 1; 371 index = indices[pvec.nr - 1] + 1;
438 shmem_deswap_pagevec(&pvec); 372 pagevec_remove_exceptionals(&pvec);
439 check_move_unevictable_pages(pvec.pages, pvec.nr); 373 check_move_unevictable_pages(pvec.pages, pvec.nr);
440 pagevec_release(&pvec); 374 pagevec_release(&pvec);
441 cond_resched(); 375 cond_resched();
@@ -467,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
467 pagevec_init(&pvec, 0); 401 pagevec_init(&pvec, 0);
468 index = start; 402 index = start;
469 while (index < end) { 403 while (index < end) {
470 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 404 pvec.nr = find_get_entries(mapping, index,
471 min(end - index, (pgoff_t)PAGEVEC_SIZE), 405 min(end - index, (pgoff_t)PAGEVEC_SIZE),
472 pvec.pages, indices); 406 pvec.pages, indices);
473 if (!pvec.nr) 407 if (!pvec.nr)
474 break; 408 break;
475 mem_cgroup_uncharge_start(); 409 mem_cgroup_uncharge_start();
@@ -498,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
498 } 432 }
499 unlock_page(page); 433 unlock_page(page);
500 } 434 }
501 shmem_deswap_pagevec(&pvec); 435 pagevec_remove_exceptionals(&pvec);
502 pagevec_release(&pvec); 436 pagevec_release(&pvec);
503 mem_cgroup_uncharge_end(); 437 mem_cgroup_uncharge_end();
504 cond_resched(); 438 cond_resched();
@@ -536,9 +470,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
536 index = start; 470 index = start;
537 while (index < end) { 471 while (index < end) {
538 cond_resched(); 472 cond_resched();
539 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 473
474 pvec.nr = find_get_entries(mapping, index,
540 min(end - index, (pgoff_t)PAGEVEC_SIZE), 475 min(end - index, (pgoff_t)PAGEVEC_SIZE),
541 pvec.pages, indices); 476 pvec.pages, indices);
542 if (!pvec.nr) { 477 if (!pvec.nr) {
543 /* If all gone or hole-punch or unfalloc, we're done */ 478 /* If all gone or hole-punch or unfalloc, we're done */
544 if (index == start || end != -1) 479 if (index == start || end != -1)
@@ -581,7 +516,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
581 } 516 }
582 unlock_page(page); 517 unlock_page(page);
583 } 518 }
584 shmem_deswap_pagevec(&pvec); 519 pagevec_remove_exceptionals(&pvec);
585 pagevec_release(&pvec); 520 pagevec_release(&pvec);
586 mem_cgroup_uncharge_end(); 521 mem_cgroup_uncharge_end();
587 index++; 522 index++;
@@ -1090,7 +1025,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1090 return -EFBIG; 1025 return -EFBIG;
1091repeat: 1026repeat:
1092 swap.val = 0; 1027 swap.val = 0;
1093 page = find_lock_page(mapping, index); 1028 page = find_lock_entry(mapping, index);
1094 if (radix_tree_exceptional_entry(page)) { 1029 if (radix_tree_exceptional_entry(page)) {
1095 swap = radix_to_swp_entry(page); 1030 swap = radix_to_swp_entry(page);
1096 page = NULL; 1031 page = NULL;
@@ -1102,6 +1037,9 @@ repeat:
1102 goto failed; 1037 goto failed;
1103 } 1038 }
1104 1039
1040 if (page && sgp == SGP_WRITE)
1041 mark_page_accessed(page);
1042
1105 /* fallocated page? */ 1043 /* fallocated page? */
1106 if (page && !PageUptodate(page)) { 1044 if (page && !PageUptodate(page)) {
1107 if (sgp != SGP_READ) 1045 if (sgp != SGP_READ)
@@ -1183,6 +1121,9 @@ repeat:
1183 shmem_recalc_inode(inode); 1121 shmem_recalc_inode(inode);
1184 spin_unlock(&info->lock); 1122 spin_unlock(&info->lock);
1185 1123
1124 if (sgp == SGP_WRITE)
1125 mark_page_accessed(page);
1126
1186 delete_from_swap_cache(page); 1127 delete_from_swap_cache(page);
1187 set_page_dirty(page); 1128 set_page_dirty(page);
1188 swap_free(swap); 1129 swap_free(swap);
@@ -1207,8 +1148,11 @@ repeat:
1207 goto decused; 1148 goto decused;
1208 } 1149 }
1209 1150
1210 SetPageSwapBacked(page); 1151 __SetPageSwapBacked(page);
1211 __set_page_locked(page); 1152 __set_page_locked(page);
1153 if (sgp == SGP_WRITE)
1154 init_page_accessed(page);
1155
1212 error = mem_cgroup_cache_charge(page, current->mm, 1156 error = mem_cgroup_cache_charge(page, current->mm,
1213 gfp & GFP_RECLAIM_MASK); 1157 gfp & GFP_RECLAIM_MASK);
1214 if (error) 1158 if (error)
@@ -1485,6 +1429,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1485 return inode; 1429 return inode;
1486} 1430}
1487 1431
1432bool shmem_mapping(struct address_space *mapping)
1433{
1434 return mapping->backing_dev_info == &shmem_backing_dev_info;
1435}
1436
1488#ifdef CONFIG_TMPFS 1437#ifdef CONFIG_TMPFS
1489static const struct inode_operations shmem_symlink_inode_operations; 1438static const struct inode_operations shmem_symlink_inode_operations;
1490static const struct inode_operations shmem_short_symlink_operations; 1439static const struct inode_operations shmem_short_symlink_operations;
@@ -1797,7 +1746,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1797 pagevec_init(&pvec, 0); 1746 pagevec_init(&pvec, 0);
1798 pvec.nr = 1; /* start small: we may be there already */ 1747 pvec.nr = 1; /* start small: we may be there already */
1799 while (!done) { 1748 while (!done) {
1800 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 1749 pvec.nr = find_get_entries(mapping, index,
1801 pvec.nr, pvec.pages, indices); 1750 pvec.nr, pvec.pages, indices);
1802 if (!pvec.nr) { 1751 if (!pvec.nr) {
1803 if (whence == SEEK_DATA) 1752 if (whence == SEEK_DATA)
@@ -1824,7 +1773,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1824 break; 1773 break;
1825 } 1774 }
1826 } 1775 }
1827 shmem_deswap_pagevec(&pvec); 1776 pagevec_remove_exceptionals(&pvec);
1828 pagevec_release(&pvec); 1777 pagevec_release(&pvec);
1829 pvec.nr = PAGEVEC_SIZE; 1778 pvec.nr = PAGEVEC_SIZE;
1830 cond_resched(); 1779 cond_resched();
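
The shmem_free_swap() rewrite at the top of this file replaces a lookup-then-replace sequence with radix_tree_delete_item(), which removes the slot only if it still holds the expected swap entry and hands back whatever was there, so a lost race shows up as -ENOENT. A minimal userspace sketch of that compare-and-delete idea, with a plain array standing in for the radix tree and every name invented for illustration:

#include <stdio.h>
#include <errno.h>

#define NSLOTS 8
static void *slots[NSLOTS];          /* stands in for mapping->page_tree */

/* delete the slot only if it still holds the value the caller expects,
 * and return the old contents so the caller can detect a lost race */
static void *delete_item(unsigned long index, void *expected)
{
        void *old = slots[index];

        if (old == expected)
                slots[index] = NULL;
        return old;
}

static int free_swap_slot(unsigned long index, void *radswap)
{
        void *old = delete_item(index, radswap);

        if (old != radswap)          /* someone else changed the slot */
                return -ENOENT;
        /* here the real code would call free_swap_and_cache() */
        return 0;
}

int main(void)
{
        int cookie = 42;

        slots[3] = &cookie;
        printf("first delete:  %d\n", free_swap_slot(3, &cookie));  /* 0 */
        printf("second delete: %d\n", free_swap_slot(3, &cookie));  /* -ENOENT */
        return 0;
}
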
diff --git a/mm/slab.c b/mm/slab.c
index 2580db062df9..eb4078c7d183 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
930{ 930{
931 if (unlikely(pfmemalloc_active)) { 931 if (unlikely(pfmemalloc_active)) {
932 /* Some pfmemalloc slabs exist, check if this is one */ 932 /* Some pfmemalloc slabs exist, check if this is one */
933 struct page *page = virt_to_head_page(objp); 933 struct slab *slabp = virt_to_slab(objp);
934 struct page *page = virt_to_head_page(slabp->s_mem);
934 if (PageSlabPfmemalloc(page)) 935 if (PageSlabPfmemalloc(page))
935 set_obj_pfmemalloc(&objp); 936 set_obj_pfmemalloc(&objp);
936 } 937 }
@@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1776 __SetPageSlab(page + i); 1777 __SetPageSlab(page + i);
1777 1778
1778 if (page->pfmemalloc) 1779 if (page->pfmemalloc)
1779 SetPageSlabPfmemalloc(page + i); 1780 SetPageSlabPfmemalloc(page);
1780 } 1781 }
1781 memcg_bind_pages(cachep, cachep->gfporder); 1782 memcg_bind_pages(cachep, cachep->gfporder);
1782 1783
@@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1809 else 1810 else
1810 sub_zone_page_state(page_zone(page), 1811 sub_zone_page_state(page_zone(page),
1811 NR_SLAB_UNRECLAIMABLE, nr_freed); 1812 NR_SLAB_UNRECLAIMABLE, nr_freed);
1813
1814 __ClearPageSlabPfmemalloc(page);
1812 while (i--) { 1815 while (i--) {
1813 BUG_ON(!PageSlab(page)); 1816 BUG_ON(!PageSlab(page));
1814 __ClearPageSlabPfmemalloc(page);
1815 __ClearPageSlab(page); 1817 __ClearPageSlab(page);
1816 page++; 1818 page++;
1817 } 1819 }
@@ -3220,7 +3222,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3220 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3222 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3221 3223
3222retry_cpuset: 3224retry_cpuset:
3223 cpuset_mems_cookie = get_mems_allowed(); 3225 cpuset_mems_cookie = read_mems_allowed_begin();
3224 zonelist = node_zonelist(slab_node(), flags); 3226 zonelist = node_zonelist(slab_node(), flags);
3225 3227
3226retry: 3228retry:
@@ -3276,7 +3278,7 @@ retry:
3276 } 3278 }
3277 } 3279 }
3278 3280
3279 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) 3281 if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie)))
3280 goto retry_cpuset; 3282 goto retry_cpuset;
3281 return obj; 3283 return obj;
3282} 3284}
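
The fallback_alloc() hunk above (and the slub.c change below) moves from get_mems_allowed()/put_mems_allowed() to a read_mems_allowed_begin()/read_mems_allowed_retry() cookie: take a snapshot of the cpuset sequence count, attempt the allocation, and retry only if the allocation failed and the mask changed in the meantime. A small userspace model of that retry shape, assuming nothing about the real kernel seqcount beyond what the diff shows; the helper names and the simulated concurrent update are illustrative:

#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

static atomic_uint mems_seq;             /* bumped by the "cpuset update" side */

static unsigned read_mems_allowed_begin(void)
{
        return atomic_load(&mems_seq);
}

static bool read_mems_allowed_retry(unsigned cookie)
{
        return atomic_load(&mems_seq) != cookie;
}

static void *try_alloc(void)             /* stand-in for the slab fast path */
{
        return NULL;                     /* pretend every pass fails */
}

int main(void)
{
        void *obj;
        unsigned cookie;
        int attempts = 0;

        do {
                cookie = read_mems_allowed_begin();
                obj = try_alloc();
                attempts++;
                /* simulate a concurrent cpuset update after the first pass */
                if (attempts == 1)
                        atomic_fetch_add(&mems_seq, 1);
                /* retry only on failure AND a changed mask, as in the hunk */
        } while (!obj && read_mems_allowed_retry(cookie) && attempts < 3);

        printf("attempts: %d, obj: %p\n", attempts, obj);
        return 0;
}
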
diff --git a/mm/slub.c b/mm/slub.c
index 5c1343a391d0..a88d94cfee20 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1635,7 +1635,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1635 return NULL; 1635 return NULL;
1636 1636
1637 do { 1637 do {
1638 cpuset_mems_cookie = get_mems_allowed(); 1638 cpuset_mems_cookie = read_mems_allowed_begin();
1639 zonelist = node_zonelist(slab_node(), flags); 1639 zonelist = node_zonelist(slab_node(), flags);
1640 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1640 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1641 struct kmem_cache_node *n; 1641 struct kmem_cache_node *n;
@@ -1647,19 +1647,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1647 object = get_partial_node(s, n, c, flags); 1647 object = get_partial_node(s, n, c, flags);
1648 if (object) { 1648 if (object) {
1649 /* 1649 /*
1650 * Return the object even if 1650 * Don't check read_mems_allowed_retry()
1651 * put_mems_allowed indicated that 1651 * here - if mems_allowed was updated in
1652 * the cpuset mems_allowed was 1652 * parallel, that was a harmless race
1653 * updated in parallel. It's a 1653 * between allocation and the cpuset
1654 * harmless race between the alloc 1654 * update
1655 * and the cpuset update.
1656 */ 1655 */
1657 put_mems_allowed(cpuset_mems_cookie);
1658 return object; 1656 return object;
1659 } 1657 }
1660 } 1658 }
1661 } 1659 }
1662 } while (!put_mems_allowed(cpuset_mems_cookie)); 1660 } while (read_mems_allowed_retry(cpuset_mems_cookie));
1663#endif 1661#endif
1664 return NULL; 1662 return NULL;
1665} 1663}
diff --git a/mm/swap.c b/mm/swap.c
index aa4da5d9401d..16e70ce1912a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -68,7 +68,7 @@ static void __page_cache_release(struct page *page)
68static void __put_single_page(struct page *page) 68static void __put_single_page(struct page *page)
69{ 69{
70 __page_cache_release(page); 70 __page_cache_release(page);
71 free_hot_cold_page(page, 0); 71 free_hot_cold_page(page, false);
72} 72}
73 73
74static void __put_compound_page(struct page *page) 74static void __put_compound_page(struct page *page)
@@ -437,7 +437,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
437 SetPageActive(page); 437 SetPageActive(page);
438 lru += LRU_ACTIVE; 438 lru += LRU_ACTIVE;
439 add_page_to_lru_list(page, lruvec, lru); 439 add_page_to_lru_list(page, lruvec, lru);
440 trace_mm_lru_activate(page, page_to_pfn(page)); 440 trace_mm_lru_activate(page);
441 441
442 __count_vm_event(PGACTIVATE); 442 __count_vm_event(PGACTIVATE);
443 update_page_reclaim_stat(lruvec, file, 1); 443 update_page_reclaim_stat(lruvec, file, 1);
@@ -549,12 +549,17 @@ void mark_page_accessed(struct page *page)
549EXPORT_SYMBOL(mark_page_accessed); 549EXPORT_SYMBOL(mark_page_accessed);
550 550
551/* 551/*
552 * Queue the page for addition to the LRU via pagevec. The decision on whether 552 * Used to mark_page_accessed(page) that is not visible yet and when it is
553 * to add the page to the [in]active [file|anon] list is deferred until the 553 * still safe to use non-atomic ops
554 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
555 * have the page added to the active list using mark_page_accessed().
556 */ 554 */
557void __lru_cache_add(struct page *page) 555void init_page_accessed(struct page *page)
556{
557 if (!PageReferenced(page))
558 __SetPageReferenced(page);
559}
560EXPORT_SYMBOL(init_page_accessed);
561
562static void __lru_cache_add(struct page *page)
558{ 563{
559 struct pagevec *pvec = &get_cpu_var(lru_add_pvec); 564 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
560 565
@@ -564,11 +569,34 @@ void __lru_cache_add(struct page *page)
564 pagevec_add(pvec, page); 569 pagevec_add(pvec, page);
565 put_cpu_var(lru_add_pvec); 570 put_cpu_var(lru_add_pvec);
566} 571}
567EXPORT_SYMBOL(__lru_cache_add); 572
573/**
574 * lru_cache_add: add a page to the page lists
575 * @page: the page to add
576 */
577void lru_cache_add_anon(struct page *page)
578{
579 if (PageActive(page))
580 ClearPageActive(page);
581 __lru_cache_add(page);
582}
583
584void lru_cache_add_file(struct page *page)
585{
586 if (PageActive(page))
587 ClearPageActive(page);
588 __lru_cache_add(page);
589}
590EXPORT_SYMBOL(lru_cache_add_file);
568 591
569/** 592/**
570 * lru_cache_add - add a page to a page list 593 * lru_cache_add - add a page to a page list
571 * @page: the page to be added to the LRU. 594 * @page: the page to be added to the LRU.
595 *
596 * Queue the page for addition to the LRU via pagevec. The decision on whether
597 * to add the page to the [in]active [file|anon] list is deferred until the
598 * pagevec is drained. This gives a chance for the caller of lru_cache_add()
599 * have the page added to the active list using mark_page_accessed().
572 */ 600 */
573void lru_cache_add(struct page *page) 601void lru_cache_add(struct page *page)
574{ 602{
@@ -779,7 +807,7 @@ void lru_add_drain_all(void)
779 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() 807 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
780 * will free it. 808 * will free it.
781 */ 809 */
782void release_pages(struct page **pages, int nr, int cold) 810void release_pages(struct page **pages, int nr, bool cold)
783{ 811{
784 int i; 812 int i;
785 LIST_HEAD(pages_to_free); 813 LIST_HEAD(pages_to_free);
@@ -820,7 +848,7 @@ void release_pages(struct page **pages, int nr, int cold)
820 } 848 }
821 849
822 /* Clear Active bit in case of parallel mark_page_accessed */ 850 /* Clear Active bit in case of parallel mark_page_accessed */
823 ClearPageActive(page); 851 __ClearPageActive(page);
824 852
825 list_add(&page->lru, &pages_to_free); 853 list_add(&page->lru, &pages_to_free);
826 } 854 }
@@ -902,7 +930,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
902 SetPageLRU(page); 930 SetPageLRU(page);
903 add_page_to_lru_list(page, lruvec, lru); 931 add_page_to_lru_list(page, lruvec, lru);
904 update_page_reclaim_stat(lruvec, file, active); 932 update_page_reclaim_stat(lruvec, file, active);
905 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); 933 trace_mm_lru_insertion(page, lru);
906} 934}
907 935
908/* 936/*
@@ -916,6 +944,57 @@ void __pagevec_lru_add(struct pagevec *pvec)
916EXPORT_SYMBOL(__pagevec_lru_add); 944EXPORT_SYMBOL(__pagevec_lru_add);
917 945
918/** 946/**
947 * pagevec_lookup_entries - gang pagecache lookup
948 * @pvec: Where the resulting entries are placed
949 * @mapping: The address_space to search
950 * @start: The starting entry index
951 * @nr_entries: The maximum number of entries
952 * @indices: The cache indices corresponding to the entries in @pvec
953 *
954 * pagevec_lookup_entries() will search for and return a group of up
955 * to @nr_entries pages and shadow entries in the mapping. All
956 * entries are placed in @pvec. pagevec_lookup_entries() takes a
957 * reference against actual pages in @pvec.
958 *
959 * The search returns a group of mapping-contiguous entries with
960 * ascending indexes. There may be holes in the indices due to
961 * not-present entries.
962 *
963 * pagevec_lookup_entries() returns the number of entries which were
964 * found.
965 */
966unsigned pagevec_lookup_entries(struct pagevec *pvec,
967 struct address_space *mapping,
968 pgoff_t start, unsigned nr_pages,
969 pgoff_t *indices)
970{
971 pvec->nr = find_get_entries(mapping, start, nr_pages,
972 pvec->pages, indices);
973 return pagevec_count(pvec);
974}
975
976/**
977 * pagevec_remove_exceptionals - pagevec exceptionals pruning
978 * @pvec: The pagevec to prune
979 *
980 * pagevec_lookup_entries() fills both pages and exceptional radix
981 * tree entries into the pagevec. This function prunes all
982 * exceptionals from @pvec without leaving holes, so that it can be
983 * passed on to page-only pagevec operations.
984 */
985void pagevec_remove_exceptionals(struct pagevec *pvec)
986{
987 int i, j;
988
989 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
990 struct page *page = pvec->pages[i];
991 if (!radix_tree_exceptional_entry(page))
992 pvec->pages[j++] = page;
993 }
994 pvec->nr = j;
995}
996
997/**
919 * pagevec_lookup - gang pagecache lookup 998 * pagevec_lookup - gang pagecache lookup
920 * @pvec: Where the resulting pages are placed 999 * @pvec: Where the resulting pages are placed
921 * @mapping: The address_space to search 1000 * @mapping: The address_space to search
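
init_page_accessed() above is documented as safe only while the page "is not visible yet", which is also why the shmem_getpage_gfp() hunk can switch to __SetPageSwapBacked() at the same spot. A hedged userspace sketch of that initialize-before-publish idea, where plain stores set flag bits on a still-private object and an atomic pointer store publishes it; the struct, flag names, and publish slot are made up:

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

#define F_REFERENCED  (1u << 0)
#define F_SWAPBACKED  (1u << 1)

struct fake_page {
        unsigned int flags;
};

static _Atomic(struct fake_page *) published;   /* the "page cache" slot */

int main(void)
{
        struct fake_page *page = calloc(1, sizeof(*page));

        /* page is not visible to anyone else yet: plain stores are fine */
        page->flags |= F_SWAPBACKED;    /* analogue of __SetPageSwapBacked() */
        page->flags |= F_REFERENCED;    /* analogue of init_page_accessed() */

        /* publish with release semantics; readers pair with acquire loads,
         * and from this point on flag changes would need atomic bit ops */
        atomic_store_explicit(&published, page, memory_order_release);

        struct fake_page *p = atomic_load_explicit(&published,
                                                   memory_order_acquire);
        printf("flags after publish: 0x%x\n", p->flags);
        free(p);
        return 0;
}
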
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e6f15f8ca2af..4079edfff2cc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
63 return ret; 63 return ret;
64} 64}
65 65
66static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
67
66void show_swap_cache_info(void) 68void show_swap_cache_info(void)
67{ 69{
68 printk("%lu pages in swap cache\n", total_swapcache_pages()); 70 printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
268 270
269 for (i = 0; i < todo; i++) 271 for (i = 0; i < todo; i++)
270 free_swap_cache(pagep[i]); 272 free_swap_cache(pagep[i]);
271 release_pages(pagep, todo, 0); 273 release_pages(pagep, todo, false);
272 pagep += todo; 274 pagep += todo;
273 nr -= todo; 275 nr -= todo;
274 } 276 }
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
286 288
287 page = find_get_page(swap_address_space(entry), entry.val); 289 page = find_get_page(swap_address_space(entry), entry.val);
288 290
289 if (page) 291 if (page) {
290 INC_CACHE_INFO(find_success); 292 INC_CACHE_INFO(find_success);
293 if (TestClearPageReadahead(page))
294 atomic_inc(&swapin_readahead_hits);
295 }
291 296
292 INC_CACHE_INFO(find_total); 297 INC_CACHE_INFO(find_total);
293 return page; 298 return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
389 return found_page; 394 return found_page;
390} 395}
391 396
397static unsigned long swapin_nr_pages(unsigned long offset)
398{
399 static unsigned long prev_offset;
400 unsigned int pages, max_pages, last_ra;
401 static atomic_t last_readahead_pages;
402
403 max_pages = 1 << ACCESS_ONCE(page_cluster);
404 if (max_pages <= 1)
405 return 1;
406
407 /*
408 * This heuristic has been found to work well on both sequential and
409 * random loads, swapping to hard disk or to SSD: please don't ask
410 * what the "+ 2" means, it just happens to work well, that's all.
411 */
412 pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
413 if (pages == 2) {
414 /*
415 * We can have no readahead hits to judge by: but must not get
416 * stuck here forever, so check for an adjacent offset instead
417 * (and don't even bother to check whether swap type is same).
418 */
419 if (offset != prev_offset + 1 && offset != prev_offset - 1)
420 pages = 1;
421 prev_offset = offset;
422 } else {
423 unsigned int roundup = 4;
424 while (roundup < pages)
425 roundup <<= 1;
426 pages = roundup;
427 }
428
429 if (pages > max_pages)
430 pages = max_pages;
431
432 /* Don't shrink readahead too fast */
433 last_ra = atomic_read(&last_readahead_pages) / 2;
434 if (pages < last_ra)
435 pages = last_ra;
436 atomic_set(&last_readahead_pages, pages);
437
438 return pages;
439}
440
392/** 441/**
393 * swapin_readahead - swap in pages in hope we need them soon 442 * swapin_readahead - swap in pages in hope we need them soon
394 * @entry: swap entry of this memory 443 * @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
412 struct vm_area_struct *vma, unsigned long addr) 461 struct vm_area_struct *vma, unsigned long addr)
413{ 462{
414 struct page *page; 463 struct page *page;
415 unsigned long offset = swp_offset(entry); 464 unsigned long entry_offset = swp_offset(entry);
465 unsigned long offset = entry_offset;
416 unsigned long start_offset, end_offset; 466 unsigned long start_offset, end_offset;
417 unsigned long mask = (1UL << page_cluster) - 1; 467 unsigned long mask;
418 struct blk_plug plug; 468 struct blk_plug plug;
419 469
470 mask = swapin_nr_pages(offset) - 1;
471 if (!mask)
472 goto skip;
473
420 /* Read a page_cluster sized and aligned cluster around offset. */ 474 /* Read a page_cluster sized and aligned cluster around offset. */
421 start_offset = offset & ~mask; 475 start_offset = offset & ~mask;
422 end_offset = offset | mask; 476 end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
430 gfp_mask, vma, addr); 484 gfp_mask, vma, addr);
431 if (!page) 485 if (!page)
432 continue; 486 continue;
487 if (offset != entry_offset)
488 SetPageReadahead(page);
433 page_cache_release(page); 489 page_cache_release(page);
434 } 490 }
435 blk_finish_plug(&plug); 491 blk_finish_plug(&plug);
436 492
437 lru_add_drain(); /* Push any new pages onto the LRU now */ 493 lru_add_drain(); /* Push any new pages onto the LRU now */
494skip:
438 return read_swap_cache_async(entry, gfp_mask, vma, addr); 495 return read_swap_cache_async(entry, gfp_mask, vma, addr);
439} 496}
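
swapin_nr_pages() above sizes the swap readahead window from recent readahead hits, rounds it up to a power of two, caps it at 2^page_cluster, and refuses to shrink by more than half per step. The following is a single-threaded userspace approximation of that heuristic, with the kernel's atomics and statics collapsed into plain globals, for experimenting with the numbers only:

#include <stdio.h>

static unsigned int page_cluster = 3;        /* kernel default: 8-page window */
static unsigned long prev_offset;
static unsigned int readahead_hits;          /* hits since the last fault */
static unsigned int last_readahead_pages;

static unsigned int swapin_nr_pages(unsigned long offset)
{
        unsigned int pages, max_pages, last_ra;

        max_pages = 1u << page_cluster;
        if (max_pages <= 1)
                return 1;

        pages = readahead_hits + 2;          /* "+ 2": empirical, see comment */
        readahead_hits = 0;
        if (pages == 2) {
                /* no hits: only read ahead for a roughly sequential fault */
                if (offset != prev_offset + 1 && offset != prev_offset - 1)
                        pages = 1;
                prev_offset = offset;
        } else {
                unsigned int roundup = 4;
                while (roundup < pages)
                        roundup <<= 1;
                pages = roundup;
        }

        if (pages > max_pages)
                pages = max_pages;

        last_ra = last_readahead_pages / 2;  /* don't shrink too fast */
        if (pages < last_ra)
                pages = last_ra;
        last_readahead_pages = pages;

        return pages;
}

int main(void)
{
        readahead_hits = 0;
        printf("random fault, no hits:    %u\n", swapin_nr_pages(1000));
        readahead_hits = 5;
        printf("5 hits since last fault:  %u\n", swapin_nr_pages(2000));
        readahead_hits = 0;
        printf("cold again (halved only): %u\n", swapin_nr_pages(5000));
        return 0;
}
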
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0ec2eaf3ccfd..660b9c0e2e40 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages;
51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ 51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
52long total_swap_pages; 52long total_swap_pages;
53static int least_priority; 53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
55 54
56static const char Bad_file[] = "Bad swap file entry "; 55static const char Bad_file[] = "Bad swap file entry ";
57static const char Unused_file[] = "Unused swap file entry "; 56static const char Unused_file[] = "Unused swap file entry ";
58static const char Bad_offset[] = "Bad swap offset entry "; 57static const char Bad_offset[] = "Bad swap offset entry ";
59static const char Unused_offset[] = "Unused swap offset entry "; 58static const char Unused_offset[] = "Unused swap offset entry ";
60 59
61struct swap_list_t swap_list = {-1, -1}; 60/*
61 * all active swap_info_structs
62 * protected with swap_lock, and ordered by priority.
63 */
64PLIST_HEAD(swap_active_head);
65
66/*
67 * all available (active, not full) swap_info_structs
68 * protected with swap_avail_lock, ordered by priority.
69 * This is used by get_swap_page() instead of swap_active_head
70 * because swap_active_head includes all swap_info_structs,
71 * but get_swap_page() doesn't need to look at full ones.
72 * This uses its own lock instead of swap_lock because when a
73 * swap_info_struct changes between not-full/full, it needs to
74 * add/remove itself to/from this list, but the swap_info_struct->lock
75 * is held and the locking order requires swap_lock to be taken
76 * before any swap_info_struct->lock.
77 */
78static PLIST_HEAD(swap_avail_head);
79static DEFINE_SPINLOCK(swap_avail_lock);
62 80
63struct swap_info_struct *swap_info[MAX_SWAPFILES]; 81struct swap_info_struct *swap_info[MAX_SWAPFILES];
64 82
@@ -591,6 +609,9 @@ checks:
591 if (si->inuse_pages == si->pages) { 609 if (si->inuse_pages == si->pages) {
592 si->lowest_bit = si->max; 610 si->lowest_bit = si->max;
593 si->highest_bit = 0; 611 si->highest_bit = 0;
612 spin_lock(&swap_avail_lock);
613 plist_del(&si->avail_list, &swap_avail_head);
614 spin_unlock(&swap_avail_lock);
594 } 615 }
595 si->swap_map[offset] = usage; 616 si->swap_map[offset] = usage;
596 inc_cluster_info_page(si, si->cluster_info, offset); 617 inc_cluster_info_page(si, si->cluster_info, offset);
@@ -639,71 +660,65 @@ no_page:
639 660
640swp_entry_t get_swap_page(void) 661swp_entry_t get_swap_page(void)
641{ 662{
642 struct swap_info_struct *si; 663 struct swap_info_struct *si, *next;
643 pgoff_t offset; 664 pgoff_t offset;
644 int type, next;
645 int wrapped = 0;
646 int hp_index;
647 665
648 spin_lock(&swap_lock);
649 if (atomic_long_read(&nr_swap_pages) <= 0) 666 if (atomic_long_read(&nr_swap_pages) <= 0)
650 goto noswap; 667 goto noswap;
651 atomic_long_dec(&nr_swap_pages); 668 atomic_long_dec(&nr_swap_pages);
652 669
653 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 670 spin_lock(&swap_avail_lock);
654 hp_index = atomic_xchg(&highest_priority_index, -1);
655 /*
656 * highest_priority_index records current highest priority swap
657 * type which just frees swap entries. If its priority is
658 * higher than that of swap_list.next swap type, we use it. It
659 * isn't protected by swap_lock, so it can be an invalid value
660 * if the corresponding swap type is swapoff. We double check
661 * the flags here. It's even possible the swap type is swapoff
662 * and swapon again and its priority is changed. In such rare
663 * case, low prority swap type might be used, but eventually
664 * high priority swap will be used after several rounds of
665 * swap.
666 */
667 if (hp_index != -1 && hp_index != type &&
668 swap_info[type]->prio < swap_info[hp_index]->prio &&
669 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
670 type = hp_index;
671 swap_list.next = type;
672 }
673
674 si = swap_info[type];
675 next = si->next;
676 if (next < 0 ||
677 (!wrapped && si->prio != swap_info[next]->prio)) {
678 next = swap_list.head;
679 wrapped++;
680 }
681 671
672start_over:
673 plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
674 /* requeue si to after same-priority siblings */
675 plist_requeue(&si->avail_list, &swap_avail_head);
676 spin_unlock(&swap_avail_lock);
682 spin_lock(&si->lock); 677 spin_lock(&si->lock);
683 if (!si->highest_bit) { 678 if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
679 spin_lock(&swap_avail_lock);
680 if (plist_node_empty(&si->avail_list)) {
681 spin_unlock(&si->lock);
682 goto nextsi;
683 }
684 WARN(!si->highest_bit,
685 "swap_info %d in list but !highest_bit\n",
686 si->type);
687 WARN(!(si->flags & SWP_WRITEOK),
688 "swap_info %d in list but !SWP_WRITEOK\n",
689 si->type);
690 plist_del(&si->avail_list, &swap_avail_head);
684 spin_unlock(&si->lock); 691 spin_unlock(&si->lock);
685 continue; 692 goto nextsi;
686 } 693 }
687 if (!(si->flags & SWP_WRITEOK)) {
688 spin_unlock(&si->lock);
689 continue;
690 }
691
692 swap_list.next = next;
693 694
694 spin_unlock(&swap_lock);
695 /* This is called for allocating swap entry for cache */ 695 /* This is called for allocating swap entry for cache */
696 offset = scan_swap_map(si, SWAP_HAS_CACHE); 696 offset = scan_swap_map(si, SWAP_HAS_CACHE);
697 spin_unlock(&si->lock); 697 spin_unlock(&si->lock);
698 if (offset) 698 if (offset)
699 return swp_entry(type, offset); 699 return swp_entry(si->type, offset);
700 spin_lock(&swap_lock); 700 pr_debug("scan_swap_map of si %d failed to find offset\n",
701 next = swap_list.next; 701 si->type);
702 spin_lock(&swap_avail_lock);
703nextsi:
704 /*
705 * if we got here, it's likely that si was almost full before,
706 * and since scan_swap_map() can drop the si->lock, multiple
707 * callers probably all tried to get a page from the same si
708 * and it filled up before we could get one; or, the si filled
709 * up between us dropping swap_avail_lock and taking si->lock.
710 * Since we dropped the swap_avail_lock, the swap_avail_head
711 * list may have been modified; so if next is still in the
712 * swap_avail_head list then try it, otherwise start over.
713 */
714 if (plist_node_empty(&next->avail_list))
715 goto start_over;
702 } 716 }
703 717
718 spin_unlock(&swap_avail_lock);
719
704 atomic_long_inc(&nr_swap_pages); 720 atomic_long_inc(&nr_swap_pages);
705noswap: 721noswap:
706 spin_unlock(&swap_lock);
707 return (swp_entry_t) {0}; 722 return (swp_entry_t) {0};
708} 723}
709 724
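
The new get_swap_page() walks swap_avail_head with plist_for_each_entry_safe() and requeues each chosen entry "after same-priority siblings", so swap devices of equal priority are used round-robin while lower-priority ones are only reached once the higher ones are full. A userspace sketch of that requeue behaviour, using a plain sorted array in place of the plist; the device names and priorities are invented:

#include <stdio.h>

struct dev { const char *name; int prio; };

/* kept sorted highest priority first, like swap_avail_head */
static struct dev devs[] = {
        { "sda2", 10 }, { "sdb2", 10 }, { "zram0", 5 },
};
#define NDEVS (sizeof(devs) / sizeof(devs[0]))

/* move the chosen head behind its same-priority peers (plist_requeue()) */
static void requeue_head(void)
{
        struct dev first = devs[0];
        unsigned int i = 1;

        while (i < NDEVS && devs[i].prio == first.prio) {
                devs[i - 1] = devs[i];
                i++;
        }
        devs[i - 1] = first;
}

int main(void)
{
        for (int alloc = 0; alloc < 4; alloc++) {
                printf("allocation %d -> %s (prio %d)\n",
                       alloc, devs[0].name, devs[0].prio);
                requeue_head();
        }
        /* output alternates sda2/sdb2; zram0 is untouched while they last */
        return 0;
}
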
@@ -765,27 +780,6 @@ out:
765 return NULL; 780 return NULL;
766} 781}
767 782
768/*
769 * This swap type frees swap entry, check if it is the highest priority swap
770 * type which just frees swap entry. get_swap_page() uses
771 * highest_priority_index to search highest priority swap type. The
772 * swap_info_struct.lock can't protect us if there are multiple swap types
773 * active, so we use atomic_cmpxchg.
774 */
775static void set_highest_priority_index(int type)
776{
777 int old_hp_index, new_hp_index;
778
779 do {
780 old_hp_index = atomic_read(&highest_priority_index);
781 if (old_hp_index != -1 &&
782 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
783 break;
784 new_hp_index = type;
785 } while (atomic_cmpxchg(&highest_priority_index,
786 old_hp_index, new_hp_index) != old_hp_index);
787}
788
789static unsigned char swap_entry_free(struct swap_info_struct *p, 783static unsigned char swap_entry_free(struct swap_info_struct *p,
790 swp_entry_t entry, unsigned char usage) 784 swp_entry_t entry, unsigned char usage)
791{ 785{
@@ -827,9 +821,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
827 dec_cluster_info_page(p, p->cluster_info, offset); 821 dec_cluster_info_page(p, p->cluster_info, offset);
828 if (offset < p->lowest_bit) 822 if (offset < p->lowest_bit)
829 p->lowest_bit = offset; 823 p->lowest_bit = offset;
830 if (offset > p->highest_bit) 824 if (offset > p->highest_bit) {
825 bool was_full = !p->highest_bit;
831 p->highest_bit = offset; 826 p->highest_bit = offset;
832 set_highest_priority_index(p->type); 827 if (was_full && (p->flags & SWP_WRITEOK)) {
828 spin_lock(&swap_avail_lock);
829 WARN_ON(!plist_node_empty(&p->avail_list));
830 if (plist_node_empty(&p->avail_list))
831 plist_add(&p->avail_list,
832 &swap_avail_head);
833 spin_unlock(&swap_avail_lock);
834 }
835 }
833 atomic_long_inc(&nr_swap_pages); 836 atomic_long_inc(&nr_swap_pages);
834 p->inuse_pages--; 837 p->inuse_pages--;
835 frontswap_invalidate_page(p->type, offset); 838 frontswap_invalidate_page(p->type, offset);
@@ -1764,30 +1767,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1764 unsigned char *swap_map, 1767 unsigned char *swap_map,
1765 struct swap_cluster_info *cluster_info) 1768 struct swap_cluster_info *cluster_info)
1766{ 1769{
1767 int i, prev;
1768
1769 if (prio >= 0) 1770 if (prio >= 0)
1770 p->prio = prio; 1771 p->prio = prio;
1771 else 1772 else
1772 p->prio = --least_priority; 1773 p->prio = --least_priority;
1774 /*
1775 * the plist prio is negated because plist ordering is
1776 * low-to-high, while swap ordering is high-to-low
1777 */
1778 p->list.prio = -p->prio;
1779 p->avail_list.prio = -p->prio;
1773 p->swap_map = swap_map; 1780 p->swap_map = swap_map;
1774 p->cluster_info = cluster_info; 1781 p->cluster_info = cluster_info;
1775 p->flags |= SWP_WRITEOK; 1782 p->flags |= SWP_WRITEOK;
1776 atomic_long_add(p->pages, &nr_swap_pages); 1783 atomic_long_add(p->pages, &nr_swap_pages);
1777 total_swap_pages += p->pages; 1784 total_swap_pages += p->pages;
1778 1785
1779 /* insert swap space into swap_list: */ 1786 assert_spin_locked(&swap_lock);
1780 prev = -1; 1787 /*
1781 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { 1788 * both lists are plists, and thus priority ordered.
1782 if (p->prio >= swap_info[i]->prio) 1789 * swap_active_head needs to be priority ordered for swapoff(),
1783 break; 1790 * which on removal of any swap_info_struct with an auto-assigned
1784 prev = i; 1791 * (i.e. negative) priority increments the auto-assigned priority
1785 } 1792 * of any lower-priority swap_info_structs.
1786 p->next = i; 1793 * swap_avail_head needs to be priority ordered for get_swap_page(),
1787 if (prev < 0) 1794 * which allocates swap pages from the highest available priority
1788 swap_list.head = swap_list.next = p->type; 1795 * swap_info_struct.
1789 else 1796 */
1790 swap_info[prev]->next = p->type; 1797 plist_add(&p->list, &swap_active_head);
1798 spin_lock(&swap_avail_lock);
1799 plist_add(&p->avail_list, &swap_avail_head);
1800 spin_unlock(&swap_avail_lock);
1791} 1801}
1792 1802
1793static void enable_swap_info(struct swap_info_struct *p, int prio, 1803static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1822,8 +1832,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1822 struct address_space *mapping; 1832 struct address_space *mapping;
1823 struct inode *inode; 1833 struct inode *inode;
1824 struct filename *pathname; 1834 struct filename *pathname;
1825 int i, type, prev; 1835 int err, found = 0;
1826 int err;
1827 unsigned int old_block_size; 1836 unsigned int old_block_size;
1828 1837
1829 if (!capable(CAP_SYS_ADMIN)) 1838 if (!capable(CAP_SYS_ADMIN))
@@ -1841,17 +1850,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1841 goto out; 1850 goto out;
1842 1851
1843 mapping = victim->f_mapping; 1852 mapping = victim->f_mapping;
1844 prev = -1;
1845 spin_lock(&swap_lock); 1853 spin_lock(&swap_lock);
1846 for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { 1854 plist_for_each_entry(p, &swap_active_head, list) {
1847 p = swap_info[type];
1848 if (p->flags & SWP_WRITEOK) { 1855 if (p->flags & SWP_WRITEOK) {
1849 if (p->swap_file->f_mapping == mapping) 1856 if (p->swap_file->f_mapping == mapping) {
1857 found = 1;
1850 break; 1858 break;
1859 }
1851 } 1860 }
1852 prev = type;
1853 } 1861 }
1854 if (type < 0) { 1862 if (!found) {
1855 err = -EINVAL; 1863 err = -EINVAL;
1856 spin_unlock(&swap_lock); 1864 spin_unlock(&swap_lock);
1857 goto out_dput; 1865 goto out_dput;
@@ -1863,20 +1871,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1863 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1864 goto out_dput; 1872 goto out_dput;
1865 } 1873 }
1866 if (prev < 0) 1874 spin_lock(&swap_avail_lock);
1867 swap_list.head = p->next; 1875 plist_del(&p->avail_list, &swap_avail_head);
1868 else 1876 spin_unlock(&swap_avail_lock);
1869 swap_info[prev]->next = p->next;
1870 if (type == swap_list.next) {
1871 /* just pick something that's safe... */
1872 swap_list.next = swap_list.head;
1873 }
1874 spin_lock(&p->lock); 1877 spin_lock(&p->lock);
1875 if (p->prio < 0) { 1878 if (p->prio < 0) {
1876 for (i = p->next; i >= 0; i = swap_info[i]->next) 1879 struct swap_info_struct *si = p;
1877 swap_info[i]->prio = p->prio--; 1880
1881 plist_for_each_entry_continue(si, &swap_active_head, list) {
1882 si->prio++;
1883 si->list.prio--;
1884 si->avail_list.prio--;
1885 }
1878 least_priority++; 1886 least_priority++;
1879 } 1887 }
1888 plist_del(&p->list, &swap_active_head);
1880 atomic_long_sub(p->pages, &nr_swap_pages); 1889 atomic_long_sub(p->pages, &nr_swap_pages);
1881 total_swap_pages -= p->pages; 1890 total_swap_pages -= p->pages;
1882 p->flags &= ~SWP_WRITEOK; 1891 p->flags &= ~SWP_WRITEOK;
@@ -1884,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1884 spin_unlock(&swap_lock); 1893 spin_unlock(&swap_lock);
1885 1894
1886 set_current_oom_origin(); 1895 set_current_oom_origin();
1887 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1896 err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
1888 clear_current_oom_origin(); 1897 clear_current_oom_origin();
1889 1898
1890 if (err) { 1899 if (err) {
@@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1926 frontswap_map_set(p, NULL); 1935 frontswap_map_set(p, NULL);
1927 spin_unlock(&p->lock); 1936 spin_unlock(&p->lock);
1928 spin_unlock(&swap_lock); 1937 spin_unlock(&swap_lock);
1929 frontswap_invalidate_area(type); 1938 frontswap_invalidate_area(p->type);
1930 mutex_unlock(&swapon_mutex); 1939 mutex_unlock(&swapon_mutex);
1931 free_percpu(p->percpu_cluster); 1940 free_percpu(p->percpu_cluster);
1932 p->percpu_cluster = NULL; 1941 p->percpu_cluster = NULL;
@@ -1934,7 +1943,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1934 vfree(cluster_info); 1943 vfree(cluster_info);
1935 vfree(frontswap_map); 1944 vfree(frontswap_map);
1936 /* Destroy swap account informatin */ 1945 /* Destroy swap account informatin */
1937 swap_cgroup_swapoff(type); 1946 swap_cgroup_swapoff(p->type);
1938 1947
1939 inode = mapping->host; 1948 inode = mapping->host;
1940 if (S_ISBLK(inode->i_mode)) { 1949 if (S_ISBLK(inode->i_mode)) {
@@ -2141,8 +2150,9 @@ static struct swap_info_struct *alloc_swap_info(void)
2141 */ 2150 */
2142 } 2151 }
2143 INIT_LIST_HEAD(&p->first_swap_extent.list); 2152 INIT_LIST_HEAD(&p->first_swap_extent.list);
2153 plist_node_init(&p->list, 0);
2154 plist_node_init(&p->avail_list, 0);
2144 p->flags = SWP_USED; 2155 p->flags = SWP_USED;
2145 p->next = -1;
2146 spin_unlock(&swap_lock); 2156 spin_unlock(&swap_lock);
2147 spin_lock_init(&p->lock); 2157 spin_lock_init(&p->lock);
2148 2158
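
_enable_swap_info() above notes that "the plist prio is negated because plist ordering is low-to-high, while swap ordering is high-to-low". A small standalone illustration of that sign flip, with an ordinary sorted singly-linked list standing in for the kernel plist; everything here is illustrative:

#include <stdio.h>
#include <stdlib.h>

struct node {
        int key;                 /* what the "plist" sorts by (= -prio) */
        int prio;                /* the swap priority the admin sees */
        struct node *next;
};

static void sorted_add(struct node **head, struct node *n)
{
        while (*head && (*head)->key <= n->key)   /* ascending by key */
                head = &(*head)->next;
        n->next = *head;
        *head = n;
}

int main(void)
{
        int prios[] = { 5, 10, 3 };              /* e.g. three swap devices */
        struct node *head = NULL;

        for (unsigned int i = 0; i < sizeof(prios) / sizeof(prios[0]); i++) {
                struct node *n = malloc(sizeof(*n));
                n->prio = prios[i];
                n->key = -prios[i];              /* the negation from the patch */
                sorted_add(&head, n);
        }

        /* iteration order: prio 10, then 5, then 3 -- highest first */
        for (struct node *n = head; n; n = n->next)
                printf("prio %d (key %d)\n", n->prio, n->key);

        while (head) {
                struct node *next = head->next;
                free(head);
                head = next;
        }
        return 0;
}
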
diff --git a/mm/truncate.c b/mm/truncate.c
index 353b683afd6e..2e84fe59190b 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -22,6 +22,22 @@
22#include <linux/cleancache.h> 22#include <linux/cleancache.h>
23#include "internal.h" 23#include "internal.h"
24 24
25static void clear_exceptional_entry(struct address_space *mapping,
26 pgoff_t index, void *entry)
27{
28 /* Handled by shmem itself */
29 if (shmem_mapping(mapping))
30 return;
31
32 spin_lock_irq(&mapping->tree_lock);
33 /*
34 * Regular page slots are stabilized by the page lock even
35 * without the tree itself locked. These unlocked entries
36 * need verification under the tree lock.
37 */
38 radix_tree_delete_item(&mapping->page_tree, index, entry);
39 spin_unlock_irq(&mapping->tree_lock);
40}
25 41
26/** 42/**
27 * do_invalidatepage - invalidate part or all of a page 43 * do_invalidatepage - invalidate part or all of a page
@@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
208 unsigned int partial_start; /* inclusive */ 224 unsigned int partial_start; /* inclusive */
209 unsigned int partial_end; /* exclusive */ 225 unsigned int partial_end; /* exclusive */
210 struct pagevec pvec; 226 struct pagevec pvec;
227 pgoff_t indices[PAGEVEC_SIZE];
211 pgoff_t index; 228 pgoff_t index;
212 int i; 229 int i;
213 230
@@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping,
238 255
239 pagevec_init(&pvec, 0); 256 pagevec_init(&pvec, 0);
240 index = start; 257 index = start;
241 while (index < end && pagevec_lookup(&pvec, mapping, index, 258 while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
242 min(end - index, (pgoff_t)PAGEVEC_SIZE))) { 259 min(end - index, (pgoff_t)PAGEVEC_SIZE),
260 indices)) {
243 mem_cgroup_uncharge_start(); 261 mem_cgroup_uncharge_start();
244 for (i = 0; i < pagevec_count(&pvec); i++) { 262 for (i = 0; i < pagevec_count(&pvec); i++) {
245 struct page *page = pvec.pages[i]; 263 struct page *page = pvec.pages[i];
246 264
247 /* We rely upon deletion not changing page->index */ 265 /* We rely upon deletion not changing page->index */
248 index = page->index; 266 index = indices[i];
249 if (index >= end) 267 if (index >= end)
250 break; 268 break;
251 269
270 if (radix_tree_exceptional_entry(page)) {
271 clear_exceptional_entry(mapping, index, page);
272 continue;
273 }
274
252 if (!trylock_page(page)) 275 if (!trylock_page(page))
253 continue; 276 continue;
254 WARN_ON(page->index != index); 277 WARN_ON(page->index != index);
@@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
259 truncate_inode_page(mapping, page); 282 truncate_inode_page(mapping, page);
260 unlock_page(page); 283 unlock_page(page);
261 } 284 }
285 pagevec_remove_exceptionals(&pvec);
262 pagevec_release(&pvec); 286 pagevec_release(&pvec);
263 mem_cgroup_uncharge_end(); 287 mem_cgroup_uncharge_end();
264 cond_resched(); 288 cond_resched();
@@ -307,14 +331,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
307 index = start; 331 index = start;
308 for ( ; ; ) { 332 for ( ; ; ) {
309 cond_resched(); 333 cond_resched();
310 if (!pagevec_lookup(&pvec, mapping, index, 334 if (!pagevec_lookup_entries(&pvec, mapping, index,
311 min(end - index, (pgoff_t)PAGEVEC_SIZE))) { 335 min(end - index, (pgoff_t)PAGEVEC_SIZE),
336 indices)) {
312 if (index == start) 337 if (index == start)
313 break; 338 break;
314 index = start; 339 index = start;
315 continue; 340 continue;
316 } 341 }
317 if (index == start && pvec.pages[0]->index >= end) { 342 if (index == start && indices[0] >= end) {
343 pagevec_remove_exceptionals(&pvec);
318 pagevec_release(&pvec); 344 pagevec_release(&pvec);
319 break; 345 break;
320 } 346 }
@@ -323,16 +349,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
323 struct page *page = pvec.pages[i]; 349 struct page *page = pvec.pages[i];
324 350
325 /* We rely upon deletion not changing page->index */ 351 /* We rely upon deletion not changing page->index */
326 index = page->index; 352 index = indices[i];
327 if (index >= end) 353 if (index >= end)
328 break; 354 break;
329 355
356 if (radix_tree_exceptional_entry(page)) {
357 clear_exceptional_entry(mapping, index, page);
358 continue;
359 }
360
330 lock_page(page); 361 lock_page(page);
331 WARN_ON(page->index != index); 362 WARN_ON(page->index != index);
332 wait_on_page_writeback(page); 363 wait_on_page_writeback(page);
333 truncate_inode_page(mapping, page); 364 truncate_inode_page(mapping, page);
334 unlock_page(page); 365 unlock_page(page);
335 } 366 }
367 pagevec_remove_exceptionals(&pvec);
336 pagevec_release(&pvec); 368 pagevec_release(&pvec);
337 mem_cgroup_uncharge_end(); 369 mem_cgroup_uncharge_end();
338 index++; 370 index++;
@@ -375,6 +407,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
375unsigned long invalidate_mapping_pages(struct address_space *mapping, 407unsigned long invalidate_mapping_pages(struct address_space *mapping,
376 pgoff_t start, pgoff_t end) 408 pgoff_t start, pgoff_t end)
377{ 409{
410 pgoff_t indices[PAGEVEC_SIZE];
378 struct pagevec pvec; 411 struct pagevec pvec;
379 pgoff_t index = start; 412 pgoff_t index = start;
380 unsigned long ret; 413 unsigned long ret;
@@ -390,17 +423,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
390 */ 423 */
391 424
392 pagevec_init(&pvec, 0); 425 pagevec_init(&pvec, 0);
393 while (index <= end && pagevec_lookup(&pvec, mapping, index, 426 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
394 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 427 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
428 indices)) {
395 mem_cgroup_uncharge_start(); 429 mem_cgroup_uncharge_start();
396 for (i = 0; i < pagevec_count(&pvec); i++) { 430 for (i = 0; i < pagevec_count(&pvec); i++) {
397 struct page *page = pvec.pages[i]; 431 struct page *page = pvec.pages[i];
398 432
399 /* We rely upon deletion not changing page->index */ 433 /* We rely upon deletion not changing page->index */
400 index = page->index; 434 index = indices[i];
401 if (index > end) 435 if (index > end)
402 break; 436 break;
403 437
438 if (radix_tree_exceptional_entry(page)) {
439 clear_exceptional_entry(mapping, index, page);
440 continue;
441 }
442
404 if (!trylock_page(page)) 443 if (!trylock_page(page))
405 continue; 444 continue;
406 WARN_ON(page->index != index); 445 WARN_ON(page->index != index);
@@ -414,6 +453,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
414 deactivate_page(page); 453 deactivate_page(page);
415 count += ret; 454 count += ret;
416 } 455 }
456 pagevec_remove_exceptionals(&pvec);
417 pagevec_release(&pvec); 457 pagevec_release(&pvec);
418 mem_cgroup_uncharge_end(); 458 mem_cgroup_uncharge_end();
419 cond_resched(); 459 cond_resched();
@@ -481,6 +521,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
481int invalidate_inode_pages2_range(struct address_space *mapping, 521int invalidate_inode_pages2_range(struct address_space *mapping,
482 pgoff_t start, pgoff_t end) 522 pgoff_t start, pgoff_t end)
483{ 523{
524 pgoff_t indices[PAGEVEC_SIZE];
484 struct pagevec pvec; 525 struct pagevec pvec;
485 pgoff_t index; 526 pgoff_t index;
486 int i; 527 int i;
@@ -491,17 +532,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
491 cleancache_invalidate_inode(mapping); 532 cleancache_invalidate_inode(mapping);
492 pagevec_init(&pvec, 0); 533 pagevec_init(&pvec, 0);
493 index = start; 534 index = start;
494 while (index <= end && pagevec_lookup(&pvec, mapping, index, 535 while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
495 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 536 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
537 indices)) {
496 mem_cgroup_uncharge_start(); 538 mem_cgroup_uncharge_start();
497 for (i = 0; i < pagevec_count(&pvec); i++) { 539 for (i = 0; i < pagevec_count(&pvec); i++) {
498 struct page *page = pvec.pages[i]; 540 struct page *page = pvec.pages[i];
499 541
500 /* We rely upon deletion not changing page->index */ 542 /* We rely upon deletion not changing page->index */
501 index = page->index; 543 index = indices[i];
502 if (index > end) 544 if (index > end)
503 break; 545 break;
504 546
547 if (radix_tree_exceptional_entry(page)) {
548 clear_exceptional_entry(mapping, index, page);
549 continue;
550 }
551
505 lock_page(page); 552 lock_page(page);
506 WARN_ON(page->index != index); 553 WARN_ON(page->index != index);
507 if (page->mapping != mapping) { 554 if (page->mapping != mapping) {
@@ -539,6 +586,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
539 ret = ret2; 586 ret = ret2;
540 unlock_page(page); 587 unlock_page(page);
541 } 588 }
589 pagevec_remove_exceptionals(&pvec);
542 pagevec_release(&pvec); 590 pagevec_release(&pvec);
543 mem_cgroup_uncharge_end(); 591 mem_cgroup_uncharge_end();
544 cond_resched(); 592 cond_resched();
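
With pagevec_lookup_entries(), the truncate and invalidate loops above can now receive shadow/swap entries alongside real pages, which is why they take the index from indices[i] and branch on radix_tree_exceptional_entry() before touching anything. A userspace model of that mixed walk, with a low pointer bit standing in for the kernel's exceptional-entry tag; the tag value and struct are invented:

#include <stdio.h>
#include <stdint.h>

#define EXCEPTIONAL_BIT 0x2UL     /* mimics the radix-tree exceptional tag */

static int is_exceptional(void *entry)
{
        return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

struct fake_page { int data; };

int main(void)
{
        struct fake_page a = { 1 }, b = { 2 };
        /* what a pagevec_lookup_entries()-style call might hand back */
        void *entries[] = { &a, (void *)(0x30UL | EXCEPTIONAL_BIT), &b };
        unsigned long indices[] = { 4, 5, 9 };

        for (unsigned int i = 0; i < 3; i++) {
                unsigned long index = indices[i];   /* never entry->index */

                if (is_exceptional(entries[i])) {
                        /* clear_exceptional_entry() territory: no page here */
                        printf("index %lu: shadow/swap entry, clear it\n",
                               index);
                        continue;
                }
                printf("index %lu: real page, data %d, truncate it\n",
                       index, ((struct fake_page *)entries[i])->data);
        }
        return 0;
}
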
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 000000000000..1037a3bab505
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright (C) 2014 Davidlohr Bueso.
3 */
4#include <linux/sched.h>
5#include <linux/mm.h>
6#include <linux/vmacache.h>
7
8/*
9 * Flush vma caches for threads that share a given mm.
10 *
11 * The operation is safe because the caller holds the mmap_sem
12 * exclusively and other threads accessing the vma cache will
13 * have mmap_sem held at least for read, so no extra locking
14 * is required to maintain the vma cache.
15 */
16void vmacache_flush_all(struct mm_struct *mm)
17{
18 struct task_struct *g, *p;
19
20 rcu_read_lock();
21 for_each_process_thread(g, p) {
22 /*
23 * Only flush the vmacache pointers as the
24 * mm seqnum is already set and curr's will
25 * be set upon invalidation when the next
26 * lookup is done.
27 */
28 if (mm == p->mm)
29 vmacache_flush(p);
30 }
31 rcu_read_unlock();
32}
33
34/*
35 * This task may be accessing a foreign mm via (for example)
36 * get_user_pages()->find_vma(). The vmacache is task-local and this
37 * task's vmacache pertains to a different mm (ie, its own). There is
38 * nothing we can do here.
39 *
40 * Also handle the case where a kernel thread has adopted this mm via use_mm().
41 * That kernel thread's vmacache is not applicable to this mm.
42 */
43static bool vmacache_valid_mm(struct mm_struct *mm)
44{
45 return current->mm == mm && !(current->flags & PF_KTHREAD);
46}
47
48void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
49{
50 if (vmacache_valid_mm(newvma->vm_mm))
51 current->vmacache[VMACACHE_HASH(addr)] = newvma;
52}
53
54static bool vmacache_valid(struct mm_struct *mm)
55{
56 struct task_struct *curr;
57
58 if (!vmacache_valid_mm(mm))
59 return false;
60
61 curr = current;
62 if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
63 /*
64 * First attempt will always be invalid, initialize
65 * the new cache for this task here.
66 */
67 curr->vmacache_seqnum = mm->vmacache_seqnum;
68 vmacache_flush(curr);
69 return false;
70 }
71 return true;
72}
73
74struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
75{
76 int i;
77
78 if (!vmacache_valid(mm))
79 return NULL;
80
81 for (i = 0; i < VMACACHE_SIZE; i++) {
82 struct vm_area_struct *vma = current->vmacache[i];
83
84 if (!vma)
85 continue;
86 if (WARN_ON_ONCE(vma->vm_mm != mm))
87 break;
88 if (vma->vm_start <= addr && vma->vm_end > addr)
89 return vma;
90 }
91
92 return NULL;
93}
94
95#ifndef CONFIG_MMU
96struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
97 unsigned long start,
98 unsigned long end)
99{
100 int i;
101
102 if (!vmacache_valid(mm))
103 return NULL;
104
105 for (i = 0; i < VMACACHE_SIZE; i++) {
106 struct vm_area_struct *vma = current->vmacache[i];
107
108 if (vma && vma->vm_start == start && vma->vm_end == end)
109 return vma;
110 }
111
112 return NULL;
113}
114#endif
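
mm/vmacache.c above caches a handful of recent VMA lookups per task and invalidates them in O(1) by letting the mm's sequence number run ahead of the task's copy. A compact userspace sketch of that direct-mapped, generation-invalidated cache; the hash, sizes, and names are illustrative rather than the kernel's:

#include <stdio.h>
#include <string.h>

#define CACHE_SIZE 4
#define HASH(addr) (((addr) >> 12) & (CACHE_SIZE - 1))   /* page-granular */

struct range { unsigned long start, end; };

static struct range *cache[CACHE_SIZE];
static unsigned long cache_gen, table_gen;   /* per-thread vs shared seqnum */

static void cache_invalidate_all(void) { table_gen++; }   /* O(1) flush */

static void cache_update(unsigned long addr, struct range *r)
{
        cache[HASH(addr)] = r;
}

static struct range *cache_find(unsigned long addr)
{
        if (cache_gen != table_gen) {            /* stale: lazily flush */
                memset(cache, 0, sizeof(cache));
                cache_gen = table_gen;
                return NULL;
        }
        struct range *r = cache[HASH(addr)];
        if (r && r->start <= addr && addr < r->end)
                return r;
        return NULL;
}

int main(void)
{
        struct range r = { 0x10000, 0x20000 };

        cache_update(0x10abc, &r);
        printf("hit:  %p\n", (void *)cache_find(0x10abc));
        cache_invalidate_all();                  /* e.g. after an munmap() */
        printf("miss: %p\n", (void *)cache_find(0x10abc));
        return 0;
}
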
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e2be0f802ccf..060dc366ac44 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2685,14 +2685,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
2685 2685
2686 prev_end = VMALLOC_START; 2686 prev_end = VMALLOC_START;
2687 2687
2688 spin_lock(&vmap_area_lock); 2688 rcu_read_lock();
2689 2689
2690 if (list_empty(&vmap_area_list)) { 2690 if (list_empty(&vmap_area_list)) {
2691 vmi->largest_chunk = VMALLOC_TOTAL; 2691 vmi->largest_chunk = VMALLOC_TOTAL;
2692 goto out; 2692 goto out;
2693 } 2693 }
2694 2694
2695 list_for_each_entry(va, &vmap_area_list, list) { 2695 list_for_each_entry_rcu(va, &vmap_area_list, list) {
2696 unsigned long addr = va->va_start; 2696 unsigned long addr = va->va_start;
2697 2697
2698 /* 2698 /*
@@ -2719,7 +2719,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi)
2719 vmi->largest_chunk = VMALLOC_END - prev_end; 2719 vmi->largest_chunk = VMALLOC_END - prev_end;
2720 2720
2721out: 2721out:
2722 spin_unlock(&vmap_area_lock); 2722 rcu_read_unlock();
2723} 2723}
2724#endif 2724#endif
2725 2725
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5ad29b2925a0..5461d02ea718 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
163 163
164bool zone_reclaimable(struct zone *zone) 164bool zone_reclaimable(struct zone *zone)
165{ 165{
166 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 166 return zone_page_state(zone, NR_PAGES_SCANNED) <
167 zone_reclaimable_pages(zone) * 6;
167} 168}
168 169
169static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 170static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
@@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
224 unsigned long freed = 0; 225 unsigned long freed = 0;
225 unsigned long long delta; 226 unsigned long long delta;
226 long total_scan; 227 long total_scan;
227 long max_pass; 228 long freeable;
228 long nr; 229 long nr;
229 long new_nr; 230 long new_nr;
230 int nid = shrinkctl->nid; 231 int nid = shrinkctl->nid;
231 long batch_size = shrinker->batch ? shrinker->batch 232 long batch_size = shrinker->batch ? shrinker->batch
232 : SHRINK_BATCH; 233 : SHRINK_BATCH;
233 234
234 max_pass = shrinker->count_objects(shrinker, shrinkctl); 235 freeable = shrinker->count_objects(shrinker, shrinkctl);
235 if (max_pass == 0) 236 if (freeable == 0)
236 return 0; 237 return 0;
237 238
238 /* 239 /*
@@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
244 245
245 total_scan = nr; 246 total_scan = nr;
246 delta = (4 * nr_pages_scanned) / shrinker->seeks; 247 delta = (4 * nr_pages_scanned) / shrinker->seeks;
247 delta *= max_pass; 248 delta *= freeable;
248 do_div(delta, lru_pages + 1); 249 do_div(delta, lru_pages + 1);
249 total_scan += delta; 250 total_scan += delta;
250 if (total_scan < 0) { 251 if (total_scan < 0) {
251 printk(KERN_ERR 252 printk(KERN_ERR
252 "shrink_slab: %pF negative objects to delete nr=%ld\n", 253 "shrink_slab: %pF negative objects to delete nr=%ld\n",
253 shrinker->scan_objects, total_scan); 254 shrinker->scan_objects, total_scan);
254 total_scan = max_pass; 255 total_scan = freeable;
255 } 256 }
256 257
257 /* 258 /*
@@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
260 * shrinkers to return -1 all the time. This results in a large 261 * shrinkers to return -1 all the time. This results in a large
261 * nr being built up so when a shrink that can do some work 262 * nr being built up so when a shrink that can do some work
262 * comes along it empties the entire cache due to nr >>> 263 * comes along it empties the entire cache due to nr >>>
263 * max_pass. This is bad for sustaining a working set in 264 * freeable. This is bad for sustaining a working set in
264 * memory. 265 * memory.
265 * 266 *
266 * Hence only allow the shrinker to scan the entire cache when 267 * Hence only allow the shrinker to scan the entire cache when
267 * a large delta change is calculated directly. 268 * a large delta change is calculated directly.
268 */ 269 */
269 if (delta < max_pass / 4) 270 if (delta < freeable / 4)
270 total_scan = min(total_scan, max_pass / 2); 271 total_scan = min(total_scan, freeable / 2);
271 272
272 /* 273 /*
273 * Avoid risking looping forever due to too large nr value: 274 * Avoid risking looping forever due to too large nr value:
274 * never try to free more than twice the estimate number of 275 * never try to free more than twice the estimate number of
275 * freeable entries. 276 * freeable entries.
276 */ 277 */
277 if (total_scan > max_pass * 2) 278 if (total_scan > freeable * 2)
278 total_scan = max_pass * 2; 279 total_scan = freeable * 2;
279 280
280 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, 281 trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
281 nr_pages_scanned, lru_pages, 282 nr_pages_scanned, lru_pages,
282 max_pass, delta, total_scan); 283 freeable, delta, total_scan);
283 284
284 while (total_scan >= batch_size) { 285 /*
286 * Normally, we should not scan less than batch_size objects in one
287 * pass to avoid too frequent shrinker calls, but if the slab has less
288 * than batch_size objects in total and we are really tight on memory,
289 * we will try to reclaim all available objects, otherwise we can end
290 * up failing allocations although there are plenty of reclaimable
291 * objects spread over several slabs with usage less than the
292 * batch_size.
293 *
294 * We detect the "tight on memory" situations by looking at the total
295 * number of objects we want to scan (total_scan). If it is greater
296 * than the total number of objects on slab (freeable), we must be
297 * scanning at high prio and therefore should try to reclaim as much as
298 * possible.
299 */
300 while (total_scan >= batch_size ||
301 total_scan >= freeable) {
285 unsigned long ret; 302 unsigned long ret;
303 unsigned long nr_to_scan = min(batch_size, total_scan);
286 304
287 shrinkctl->nr_to_scan = batch_size; 305 shrinkctl->nr_to_scan = nr_to_scan;
288 ret = shrinker->scan_objects(shrinker, shrinkctl); 306 ret = shrinker->scan_objects(shrinker, shrinkctl);
289 if (ret == SHRINK_STOP) 307 if (ret == SHRINK_STOP)
290 break; 308 break;
291 freed += ret; 309 freed += ret;
292 310
293 count_vm_events(SLABS_SCANNED, batch_size); 311 count_vm_events(SLABS_SCANNED, nr_to_scan);
294 total_scan -= batch_size; 312 total_scan -= nr_to_scan;
295 313
296 cond_resched(); 314 cond_resched();
297 } 315 }
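
The loop condition above now also fires when total_scan >= freeable, so a cache smaller than one batch can still be scanned when reclaim is "tight on memory", and each pass scans min(batch_size, total_scan) rather than a full batch. A toy userspace model of that loop with invented numbers, assuming every scanned object is freed:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

static unsigned long scan_cache(unsigned long total_scan,
                                unsigned long freeable,
                                unsigned long batch_size)
{
        unsigned long freed = 0;

        if (freeable == 0)            /* mirrors the kernel's early return */
                return 0;

        while (total_scan >= batch_size || total_scan >= freeable) {
                unsigned long nr_to_scan = min_ul(batch_size, total_scan);

                freed += nr_to_scan;  /* pretend every scanned object is freed */
                total_scan -= nr_to_scan;
        }
        return freed;
}

int main(void)
{
        /* old behaviour would skip this cache entirely: 60 < batch of 128 */
        printf("small cache under pressure: freed %lu of 60\n",
               scan_cache(60, 60, 128));
        /* big cache: plain batched scanning, remainder carried over */
        printf("large cache:                freed %lu of 1000\n",
               scan_cache(1000, 4096, 128));
        return 0;
}
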
@@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
352 } 370 }
353 371
354 list_for_each_entry(shrinker, &shrinker_list, list) { 372 list_for_each_entry(shrinker, &shrinker_list, list) {
355 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { 373 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) {
356 if (!node_online(shrinkctl->nid)) 374 shrinkctl->nid = 0;
357 continue;
358
359 if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
360 (shrinkctl->nid != 0))
361 break;
362
363 freed += shrink_slab_node(shrinkctl, shrinker, 375 freed += shrink_slab_node(shrinkctl, shrinker,
364 nr_pages_scanned, lru_pages); 376 nr_pages_scanned, lru_pages);
377 continue;
378 }
379
380 for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
381 if (node_online(shrinkctl->nid))
382 freed += shrink_slab_node(shrinkctl, shrinker,
383 nr_pages_scanned, lru_pages);
365 384
366 } 385 }
367 } 386 }
@@ -1089,7 +1108,7 @@ keep:
1089 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1108 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1090 } 1109 }
1091 1110
1092 free_hot_cold_page_list(&free_pages, 1); 1111 free_hot_cold_page_list(&free_pages, true);
1093 1112
1094 list_splice(&ret_pages, page_list); 1113 list_splice(&ret_pages, page_list);
1095 count_vm_events(PGACTIVATE, pgactivate); 1114 count_vm_events(PGACTIVATE, pgactivate);
@@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1126 TTU_UNMAP|TTU_IGNORE_ACCESS, 1145 TTU_UNMAP|TTU_IGNORE_ACCESS,
1127 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); 1146 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
1128 list_splice(&clean_pages, page_list); 1147 list_splice(&clean_pages, page_list);
1129 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1148 mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
1130 return ret; 1149 return ret;
1131} 1150}
1132 1151
@@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1452 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1471 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1453 1472
1454 if (global_reclaim(sc)) { 1473 if (global_reclaim(sc)) {
1455 zone->pages_scanned += nr_scanned; 1474 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
1456 if (current_is_kswapd()) 1475 if (current_is_kswapd())
1457 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); 1476 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1458 else 1477 else
@@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1487 1506
1488 spin_unlock_irq(&zone->lru_lock); 1507 spin_unlock_irq(&zone->lru_lock);
1489 1508
1490 free_hot_cold_page_list(&page_list, 1); 1509 free_hot_cold_page_list(&page_list, true);
1491 1510
1492 /* 1511 /*
1493 * If reclaim is isolating dirty pages under writeback, it implies 1512 * If reclaim is isolating dirty pages under writeback, it implies
@@ -1641,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
1641 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, 1660 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1642 &nr_scanned, sc, isolate_mode, lru); 1661 &nr_scanned, sc, isolate_mode, lru);
1643 if (global_reclaim(sc)) 1662 if (global_reclaim(sc))
1644 zone->pages_scanned += nr_scanned; 1663 __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
1645 1664
1646 reclaim_stat->recent_scanned[file] += nr_taken; 1665 reclaim_stat->recent_scanned[file] += nr_taken;
1647 1666
@@ -1707,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&zone->lru_lock);
 
-	free_hot_cold_page_list(&l_hold, 1);
+	free_hot_cold_page_list(&l_hold, true);
 }
 
 #ifdef CONFIG_SWAP
@@ -1829,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	struct zone *zone = lruvec_zone(lruvec);
 	unsigned long anon_prio, file_prio;
 	enum scan_balance scan_balance;
-	unsigned long anon, file, free;
+	unsigned long anon, file;
 	bool force_scan = false;
 	unsigned long ap, fp;
 	enum lru_list lru;
@@ -1877,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		goto out;
 	}
 
-	anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
-		get_lru_size(lruvec, LRU_INACTIVE_ANON);
-	file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
-		get_lru_size(lruvec, LRU_INACTIVE_FILE);
-
 	/*
 	 * If it's foreseeable that reclaiming the file cache won't be
 	 * enough to get the zone back into a desirable shape, we have
@@ -1889,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * thrashing - remaining file pages alone.
 	 */
 	if (global_reclaim(sc)) {
-		free = zone_page_state(zone, NR_FREE_PAGES);
-		if (unlikely(file + free <= high_wmark_pages(zone))) {
+		unsigned long zonefile;
+		unsigned long zonefree;
+
+		zonefree = zone_page_state(zone, NR_FREE_PAGES);
+		zonefile = zone_page_state(zone, NR_ACTIVE_FILE) +
+			   zone_page_state(zone, NR_INACTIVE_FILE);
+
+		if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) {
 			scan_balance = SCAN_ANON;
 			goto out;
 		}
@@ -1925,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 *
 	 * anon in [0], file in [1]
 	 */
+
+	anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
+		get_lru_size(lruvec, LRU_INACTIVE_ANON);
+	file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
+		get_lru_size(lruvec, LRU_INACTIVE_FILE);
+
 	spin_lock_irq(&zone->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
 		reclaim_stat->recent_scanned[0] /= 2;
@@ -2000,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 	struct blk_plug plug;
-	bool scan_adjusted = false;
+	bool scan_adjusted;
 
 	get_scan_count(lruvec, sc, nr);
 
 	/* Record the original scan target for proportional adjustments later */
 	memcpy(targets, nr, sizeof(nr));
 
+	/*
+	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+	 * event that can occur when there is little memory pressure e.g.
+	 * multiple streaming readers/writers. Hence, we do not abort scanning
+	 * when the requested number of pages are reclaimed when scanning at
+	 * DEF_PRIORITY on the assumption that the fact we are direct
+	 * reclaiming implies that kswapd is not keeping up and it is best to
+	 * do a batch of work at once. For memcg reclaim one check is made to
+	 * abort proportional reclaim if either the file or anon lru has already
+	 * dropped to zero at the first pass.
+	 */
+	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+			 sc->priority == DEF_PRIORITY);
+
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
@@ -2027,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 			continue;
 
 		/*
-		 * For global direct reclaim, reclaim only the number of pages
-		 * requested. Less care is taken to scan proportionally as it
-		 * is more important to minimise direct reclaim stall latency
-		 * than it is to properly age the LRU lists.
-		 */
-		if (global_reclaim(sc) && !current_is_kswapd())
-			break;
-
-		/*
 		 * For kswapd and memcg, reclaim at least the number of pages
-		 * requested. Ensure that the anon and file LRUs shrink
+		 * requested. Ensure that the anon and file LRUs are scanned
 		 * proportionally what was requested by get_scan_count(). We
 		 * stop reclaiming one LRU and reduce the amount scanning
 		 * proportional to the original scan target.
@@ -2045,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
 		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
 
+		/*
+		 * It's just vindictive to attack the larger once the smaller
+		 * has gone to zero. And given the way we stop scanning the
+		 * smaller below, this makes sure that we only make one nudge
+		 * towards proportionality once we've got nr_to_reclaim.
+		 */
+		if (!nr_file || !nr_anon)
+			break;
+
 		if (nr_file > nr_anon) {
 			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
 						targets[LRU_ACTIVE_ANON] + 1;
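Editorial note: the new break above stops the proportional adjustment as soon as either the anon or the file scan budget has dropped to zero; the rescaling arithmetic itself sits below the visible part of this hunk. The toy program below only illustrates the idea with made-up numbers: once enough pages have been reclaimed, the smaller LRU stops and the larger LRU's remaining budget is trimmed so both end up scanned to roughly the same fraction of the targets recorded by get_scan_count(). Variable names and values here are hypothetical, not the kernel's exact computation.

/* Toy illustration of proportional scan trimming; not the kernel's
 * exact arithmetic. */
#include <stdio.h>

int main(void)
{
	/* Hypothetical scan targets recorded from get_scan_count(). */
	unsigned long target_anon = 400, target_file = 1600;
	/* Hypothetical remaining budgets when nr_to_reclaim is satisfied. */
	unsigned long left_anon = 100, left_file = 1200;

	/* Percentage of the smaller (anon) target already scanned: 75%. */
	unsigned long pct_done = (target_anon - left_anon) * 100 / target_anon;

	/* Stop scanning the smaller LRU entirely ... */
	left_anon = 0;

	/* ... and trim the larger LRU's remaining budget so that, when it
	 * finishes, file pages will also have been scanned to ~75% of their
	 * original target (1200 of 1600 here). */
	unsigned long scanned_file = target_file - left_file;
	unsigned long goal_file = target_file * pct_done / 100;
	left_file = goal_file > scanned_file ? goal_file - scanned_file : 0;

	printf("remaining anon: %lu, remaining file: %lu\n",
	       left_anon, left_file);
	return 0;
}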
@@ -2406,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		unsigned long lru_pages = 0;
 
 		nodes_clear(shrink->nodes_to_scan);
-		for_each_zone_zonelist(zone, z, zonelist,
-				gfp_zone(sc->gfp_mask)) {
+		for_each_zone_zonelist_nodemask(zone, z, zonelist,
+				gfp_zone(sc->gfp_mask), sc->nodemask) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5a442a723d79..f7ca04482299 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 			continue;
 
 		threshold = (*calculate_pressure)(zone);
-		for_each_possible_cpu(cpu)
+		for_each_online_cpu(cpu)
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
 	}
@@ -761,6 +761,7 @@ const char * const vmstat_text[] = {
 	"nr_shmem",
 	"nr_dirtied",
 	"nr_written",
+	"nr_pages_scanned",
 
 #ifdef CONFIG_NUMA
 	"numa_hit",
@@ -851,12 +852,14 @@ const char * const vmstat_text[] = {
 	"thp_zero_page_alloc",
 	"thp_zero_page_alloc_failed",
 #endif
+#ifdef CONFIG_DEBUG_TLBFLUSH
 #ifdef CONFIG_SMP
 	"nr_tlb_remote_flush",
 	"nr_tlb_remote_flush_received",
-#endif
+#endif /* CONFIG_SMP */
 	"nr_tlb_local_flush_all",
 	"nr_tlb_local_flush_one",
+#endif /* CONFIG_DEBUG_TLBFLUSH */
 
 #endif /* CONFIG_VM_EVENTS_COUNTERS */
 };
@@ -1053,7 +1056,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   zone->pages_scanned,
+		   zone_page_state(zone, NR_PAGES_SCANNED),
 		   zone->spanned_pages,
 		   zone->present_pages,
 		   zone->managed_pages);
@@ -1063,10 +1066,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 			   zone_page_state(zone, i));
 
 	seq_printf(m,
-		   "\n protection: (%lu",
+		   "\n protection: (%ld",
 		   zone->lowmem_reserve[0]);
 	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
-		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
 	seq_printf(m,
 		   ")"
 		   "\n pagesets");