author		Dan Murphy	2014-10-09 08:46:37 -0500
committer	Dan Murphy	2014-10-09 08:46:37 -0500
commit		68f449afccf6d1fb6b38bddfc3a40d9e97b53bdf
tree		5dd817815fc283a5a5629b937e4c3c2cf9cf8b17
parent		595e0e568639ef203725532e9f4a767e8a7e3281
parent		b0807bc10a6ac95ab8bf3bbf57703a0f2edd9aa9
Merge tag 'v3.12.30' of http://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable into ti-linux-3.12.y
This is the 3.12.30 stable release
* tag 'v3.12.30' of http://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable: (97 commits)
Linux 3.12.30
mm: page_alloc: reduce cost of the fair zone allocation policy
mm: page_alloc: abort fair zone allocation policy when remotes nodes are encountered
mm: vmscan: only update per-cpu thresholds for online CPU
mm: move zone->pages_scanned into a vmstat counter
mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines
mm: pagemap: avoid unnecessary overhead when tracepoints are deactivated
memcg, vmscan: Fix forced scan of anonymous pages
vmalloc: use rcu list iterator to reduce vmap_area_lock contention
mm: make copy_pte_range static again
mm, thp: only collapse hugepages to nodes with affinity for zone_reclaim_mode
mm/memory.c: use entry = ACCESS_ONCE(*pte) in handle_pte_fault()
shmem: fix init_page_accessed use to stop !PageLRU bug
mm: avoid unnecessary atomic operations during end_page_writeback()
mm: non-atomically mark page accessed during page cache allocation where possible
fs: buffer: do not use unnecessary atomic operations when discarding buffers
mm: do not use unnecessary atomic operations when adding pages to the LRU
mm: do not use atomic operations when releasing pages
mm: shmem: avoid atomic operation during shmem_getpage_gfp
mm: page_alloc: lookup pageblock migratetype with IRQs enabled during free
...
Signed-off-by: Dan Murphy <DMurphy@ti.com>
88 files changed, 2367 insertions(+), 1358 deletions(-)
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 3
 PATCHLEVEL = 12
-SUBLEVEL = 29
+SUBLEVEL = 30
 EXTRAVERSION =
 NAME = One Giant Leap for Frogkind
 
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 004ba568d93f..33294fdc402e 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order)
 	if (put_page_testzero(page)) {
 		homecache_change_page_home(page, order, PAGE_HOME_HASH);
 		if (order == 0) {
-			free_hot_cold_page(page, 0);
+			free_hot_cold_page(page, false);
 		} else {
 			init_page_count(page);
 			__free_pages(page, order);
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
index fb5e4c658f7a..ef470a7a3d0f 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -14,6 +14,8 @@
 
 #include <linux/compiler.h>
 #include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/io.h>
 
 #include <asm/cacheflush.h>
@@ -73,7 +75,7 @@ do { \
 		else \
 			mm->mmap = NULL; \
 		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
+		vmacache_invalidate(mm); \
 		mm->map_count--; \
 		remove_vma(high_vma); \
 	} \
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index e6d90babc245..04905bfc508b 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -62,7 +62,7 @@ static inline void __flush_tlb_all(void)
 
 static inline void __flush_tlb_one(unsigned long addr)
 {
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 	__flush_tlb_single(addr);
 }
 
@@ -93,13 +93,13 @@ static inline void __flush_tlb_one(unsigned long addr)
  */
 static inline void __flush_tlb_up(void)
 {
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 }
 
 static inline void flush_tlb_all(void)
 {
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb_all();
 }
 
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index ce2d0a2c3e4f..0e25a1bc5ab5 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	}
 
 	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 
 	/* Save MTRR state */
@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 static void post_set(void) __releases(set_atomicity_lock)
 {
 	/* Flush TLBs (no need to flush caches - they are disabled) */
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	__flush_tlb();
 
 	/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index dfa537a03be1..5da29d04de2f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -386,13 +386,20 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 int ptep_clear_flush_young(struct vm_area_struct *vma,
 			   unsigned long address, pte_t *ptep)
 {
-	int young;
-
-	young = ptep_test_and_clear_young(vma, address, ptep);
-	if (young)
-		flush_tlb_page(vma, address);
-
-	return young;
+	/*
+	 * On x86 CPUs, clearing the accessed bit without a TLB flush
+	 * doesn't cause data corruption. [ It could cause incorrect
+	 * page aging and the (mistaken) reclaim of hot pages, but the
+	 * chance of that should be relatively low. ]
+	 *
+	 * So as a performance optimization don't flush the TLB when
+	 * clearing the accessed bit, it will eventually be flushed by
+	 * a context switch or a VM operation anyway. [ In the rare
+	 * event of it not getting flushed for a long time the delay
+	 * shouldn't really matter because there's no real memory
+	 * pressure for swapout to react to. ]
+	 */
+	return ptep_test_and_clear_young(vma, address, ptep);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
 	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
 		return;
 
-	count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
 		if (f->flush_end == TLB_FLUSH_ALL)
 			local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		info.flush_start = start;
 		info.flush_end = end;
 
-	count_vm_event(NR_TLB_REMOTE_FLUSH);
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (is_uv_system()) {
 		unsigned int cpu;
 
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
 
 	preempt_disable();
 
-	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 	local_flush_tlb();
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
 	preempt_enable();
 }
 
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
-				unsigned long start, unsigned long end)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	unsigned long addr = ALIGN(start, HPAGE_SIZE);
-	for (; addr < end; addr += HPAGE_SIZE) {
-		pgd = pgd_offset(mm, addr);
-		if (likely(!pgd_none(*pgd))) {
-			pud = pud_offset(pgd, addr);
-			if (likely(!pud_none(*pud))) {
-				pmd = pmd_offset(pud, addr);
-				if (likely(!pmd_none(*pmd)))
-					if (pmd_large(*pmd))
-						return addr;
-			}
-		}
-	}
-	return 0;
-}
-
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned long vmflag)
 {
 	unsigned long addr;
 	unsigned act_entries, tlb_entries = 0;
+	unsigned long nr_base_pages;
 
 	preempt_disable();
 	if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		tlb_entries = tlb_lli_4k[ENTRIES];
 	else
 		tlb_entries = tlb_lld_4k[ENTRIES];
+
 	/* Assume all of TLB entries was occupied by this task */
-	act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+	act_entries = tlb_entries >> tlb_flushall_shift;
+	act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+	nr_base_pages = (end - start) >> PAGE_SHIFT;
 
 	/* tlb_flushall_shift is on balance point, details in commit log */
-	if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
-		count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+	if (nr_base_pages > act_entries) {
+		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 		local_flush_tlb();
 	} else {
-		if (has_large_page(mm, start, end)) {
-			local_flush_tlb();
-			goto flush_all;
-		}
 		/* flush range by one by one 'invlpg' */
 		for (addr = start; addr < end; addr += PAGE_SIZE) {
-			count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 			__flush_tlb_single(addr);
 		}
 
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 
 static void do_flush_tlb_all(void *info)
 {
-	count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
 	__flush_tlb_all();
 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
 		leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
 
 void flush_tlb_all(void)
 {
-	count_vm_event(NR_TLB_REMOTE_FLUSH);
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
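For illustration of the reworked threshold above (values assumed, not taken from this diff): with tlb_entries = 512 and tlb_flushall_shift = 5, act_entries caps at 512 >> 5 = 16, so flushing any range wider than 16 base pages now falls back to one full TLB flush rather than a per-page invlpg loop.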
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6e9ff8fac75a..6357298932bf 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -474,7 +474,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		rcu_read_lock();
 		page = radix_tree_lookup(&mapping->page_tree, pg_index);
 		rcu_read_unlock();
-		if (page) {
+		if (page && !radix_tree_exceptional_entry(page)) {
 			misses++;
 			if (misses > 4)
 				break;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 594bbfd4996e..7015d9079bd1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4446,7 +4446,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	spin_unlock(&eb->refs_lock);
 }
 
-static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+					struct page *accessed)
 {
 	unsigned long num_pages, i;
 
@@ -4455,7 +4456,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
-		mark_page_accessed(p);
+		if (p != accessed)
+			mark_page_accessed(p);
 	}
 }
 
@@ -4476,7 +4478,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
-		mark_extent_buffer_accessed(eb);
+		mark_extent_buffer_accessed(eb, NULL);
 		return eb;
 	}
 	rcu_read_unlock();
@@ -4504,7 +4506,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 			spin_unlock(&mapping->private_lock);
 			unlock_page(p);
 			page_cache_release(p);
-			mark_extent_buffer_accessed(exists);
+			mark_extent_buffer_accessed(exists, p);
 			goto free_eb;
 		}
 
@@ -4519,7 +4521,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		attach_extent_buffer_page(eb, p);
 		spin_unlock(&mapping->private_lock);
 		WARN_ON(PageDirty(p));
-		mark_page_accessed(p);
 		eb->pages[i] = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
@@ -4549,7 +4550,7 @@ again:
 	}
 	spin_unlock(&tree->buffer_lock);
 	radix_tree_preload_end();
-	mark_extent_buffer_accessed(exists);
+	mark_extent_buffer_accessed(exists, NULL);
 	goto free_eb;
 	}
 	/* add one reference for the tree */
@@ -4595,7 +4596,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
-		mark_extent_buffer_accessed(eb);
+		mark_extent_buffer_accessed(eb, NULL);
 		return eb;
 	}
 	rcu_read_unlock();
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 72da4df53c9a..ad80dfa6cf91 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -426,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 		struct page *page = prepared_pages[pg];
 		/*
 		 * Copy data from userspace to the current page
-		 *
-		 * Disable pagefault to avoid recursive lock since
-		 * the pages are already locked
 		 */
-		pagefault_disable();
 		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
-		pagefault_enable();
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
@@ -476,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 	for (i = 0; i < num_pages; i++) {
 		/* page checked is some magic around finding pages that
 		 * have been modified without going through btrfs_set_page_dirty
-		 * clear it here
+		 * clear it here. There should be no need to mark the pages
+		 * accessed as prepare_pages should have marked them accessed
+		 * in prepare_pages via find_or_create_page()
 		 */
 		ClearPageChecked(pages[i]);
 		unlock_page(pages[i]);
-		mark_page_accessed(pages[i]);
 		page_cache_release(pages[i]);
 	}
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index aeeea6529bcd..b7888527f7c3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	int all_mapped = 1;
 
 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
-	page = find_get_page(bd_mapping, index);
+	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
 	if (!page)
 		goto out;
 
@@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
 
 	if (bh == NULL) {
+		/* __find_get_block_slow will mark the page accessed */
 		bh = __find_get_block_slow(bdev, block);
 		if (bh)
 			bh_lru_install(bh);
-	}
-	if (bh)
+	} else
 		touch_buffer(bh);
+
 	return bh;
 }
 EXPORT_SYMBOL(__find_get_block);
@@ -1483,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page);
 /*
  * Called when truncating a buffer on a page completely.
  */
+
+/* Bits that are cleared during an invalidate */
+#define BUFFER_FLAGS_DISCARD \
+	(1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
+	 1 << BH_Delay | 1 << BH_Unwritten)
+
 static void discard_buffer(struct buffer_head * bh)
 {
+	unsigned long b_state, b_state_old;
+
 	lock_buffer(bh);
 	clear_buffer_dirty(bh);
 	bh->b_bdev = NULL;
-	clear_buffer_mapped(bh);
-	clear_buffer_req(bh);
-	clear_buffer_new(bh);
-	clear_buffer_delay(bh);
-	clear_buffer_unwritten(bh);
+	b_state = bh->b_state;
+	for (;;) {
+		b_state_old = cmpxchg(&bh->b_state, b_state,
+				      (b_state & ~BUFFER_FLAGS_DISCARD));
+		if (b_state_old == b_state)
+			break;
+		b_state = b_state_old;
+	}
 	unlock_buffer(bh);
 }
 
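The discard_buffer() rewrite above folds five separate atomic clear_buffer_*() operations into a single cmpxchg() retry loop. A minimal sketch of that pattern in isolation (clear_bits_atomically is an illustrative name, not a kernel function):

	/* Clear 'mask' in *word without locking: retry the compare-and-
	 * exchange until no concurrent writer slips in between the read
	 * and the swap. */
	static void clear_bits_atomically(unsigned long *word, unsigned long mask)
	{
		unsigned long old = *word, prev;

		for (;;) {
			prev = cmpxchg(word, old, old & ~mask);
			if (prev == old)
				break;		/* swap took effect */
			old = prev;		/* lost a race; retry */
		}
	}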
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index e501ac3a49ff..2f6cfcaa55fd 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -179,8 +179,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len)
 		struct page *page = NULL;
 
 		if (blocknr + i < devsize) {
-			page = read_mapping_page_async(mapping, blocknr + i,
-						NULL);
+			page = read_mapping_page(mapping, blocknr + i, NULL);
 			/* synchronous error? */
 			if (IS_ERR(page))
 				page = NULL;
diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
@@ -818,7 +819,7 @@ EXPORT_SYMBOL(read_code);
 static int exec_mmap(struct mm_struct *mm)
 {
 	struct task_struct *tsk;
-	struct mm_struct * old_mm, *active_mm;
+	struct mm_struct *old_mm, *active_mm;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
@@ -844,6 +845,8 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
+	tsk->mm->vmacache_seqnum = 0;
+	vmacache_flush(tsk);
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
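The exec_mmap() hunk resets the new mm's vmacache generation and flushes the caller's per-thread cache, since any cached VMA pointers refer to the old mm. For context, a sketch of the lookup discipline in the vmacache series this merge backports (the wrapper shown is illustrative; vmacache_find()/vmacache_update() are the real primitives, and in-tree find_vma() already uses them internally):

	static struct vm_area_struct *find_vma_cached(struct mm_struct *mm,
						      unsigned long addr)
	{
		/* fast path: small per-thread cache of recent VMAs */
		struct vm_area_struct *vma = vmacache_find(mm, addr);

		if (!vma) {
			vma = find_vma(mm, addr);	/* rbtree walk */
			if (vma)
				vmacache_update(addr, vma);
		}
		return vma;
	}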
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 242226a87be7..7620133f78bf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 	 * allocating. If we are looking at the buddy cache we would
 	 * have taken a reference using ext4_mb_load_buddy and that
 	 * would have pinned buddy page to page cache.
+	 * The call to ext4_mb_get_buddy_page_lock will mark the
+	 * page accessed.
 	 */
 	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
 	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
@@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 		ret = -EIO;
 		goto err;
 	}
-	mark_page_accessed(page);
 
 	if (e4b.bd_buddy_page == NULL) {
 		/*
@@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 		ret = -EIO;
 		goto err;
 	}
-	mark_page_accessed(page);
 err:
 	ext4_mb_put_buddy_page_lock(&e4b);
 	return ret;
@@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 
 		/* we could use find_or_create_page(), but it locks page
 		 * what we'd like to avoid in fast path ... */
-		page = find_get_page(inode->i_mapping, pnum);
+		page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
 		if (page == NULL || !PageUptodate(page)) {
 			if (page)
 				/*
@@ -1172,15 +1172,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 			ret = -EIO;
 			goto err;
 		}
+
+		/* Pages marked accessed already */
 		e4b->bd_bitmap_page = page;
 		e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
-		mark_page_accessed(page);
 
 		block++;
 		pnum = block / blocks_per_page;
 		poff = block % blocks_per_page;
 
-		page = find_get_page(inode->i_mapping, pnum);
+		page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
 		if (page == NULL || !PageUptodate(page)) {
 			if (page)
 				page_cache_release(page);
@@ -1201,9 +1202,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 			ret = -EIO;
 			goto err;
 		}
+
+		/* Pages marked accessed already */
 		e4b->bd_buddy_page = page;
 		e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
-		mark_page_accessed(page);
 
 		BUG_ON(e4b->bd_bitmap_page == NULL);
 		BUG_ON(e4b->bd_buddy_page == NULL);
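fs/buffer.c, ext4 above and gfs2 below all make the same substitution; a minimal before/after sketch, with a hypothetical helper name:

	static struct page *lookup_and_touch(struct address_space *mapping,
					     pgoff_t index)
	{
		/* old pattern: lookup plus a separate LRU touch
		 *	page = find_get_page(mapping, index);
		 *	if (page)
		 *		mark_page_accessed(page);
		 */

		/* new pattern: the flagged lookup marks the page accessed */
		return find_get_page_flags(mapping, index, FGP_ACCESSED);
	}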
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index bb312201ca95..15a29af63e20 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -70,7 +70,6 @@ repeat:
 		goto repeat;
 	}
 out:
-	mark_page_accessed(page);
 	return page;
 }
 
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 51ef27894433..d0335bdb65b4 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -970,7 +970,6 @@ repeat:
 	}
 got_it:
 	BUG_ON(nid != nid_of_node(page));
-	mark_page_accessed(page);
 	return page;
 }
 
@@ -1026,7 +1025,6 @@ page_hit:
 		f2fs_put_page(page, 1);
 		return ERR_PTR(-EIO);
 	}
-	mark_page_accessed(page);
 	return page;
 }
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index fa8cb4b7b8fe..fc8e4991736a 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1613,7 +1613,7 @@ out_finish:
 
 static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	release_pages(req->pages, req->num_pages, 0);
+	release_pages(req->pages, req->num_pages, false);
 }
 
 static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4598345ab87d..d08c108065e1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -985,13 +985,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
-		pagefault_disable();
 		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
-		pagefault_enable();
 		flush_dcache_page(page);
 
-		mark_page_accessed(page);
-
 		if (!tmp) {
 			unlock_page(page);
 			page_cache_release(page);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1253c2006029..f3aee0bbe886 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -517,7 +517,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
 		p = kmap_atomic(page);
 		memcpy(buf + copied, p + offset, amt);
 		kunmap_atomic(p);
-		mark_page_accessed(page);
 		page_cache_release(page);
 		copied += amt;
 		index++;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 52f177be3bf8..89afe3a8f626 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -128,7 +128,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 			yield();
 		}
 	} else {
-		page = find_lock_page(mapping, index);
+		page = find_get_page_flags(mapping, index,
+					   FGP_LOCK|FGP_ACCESSED);
 		if (!page)
 			return NULL;
 	}
@@ -145,7 +146,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 	map_bh(bh, sdp->sd_vfs, blkno);
 
 	unlock_page(page);
-	mark_page_accessed(page);
 	page_cache_release(page);
 
 	return bh;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf1..a4a8ed56e438 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1017,6 +1017,11 @@ static int __init init_hugetlbfs_fs(void)
 	int error;
 	int i;
 
+	if (!hugepages_supported()) {
+		pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
+		return -ENOTSUPP;
+	}
+
 	error = bdi_init(&hugetlbfs_backing_dev_info);
 	if (error)
 		return error;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 02003f02dd92..5a2c26525cfc 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -682,7 +682,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
 	struct inode *inode = OFNI_EDONI_2SFFJ(f);
 	struct page *pg;
 
-	pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+	pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
 			     (void *)jffs2_do_readpage_unlock, inode);
 	if (IS_ERR(pg))
 		return (void *)pg;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e242bbf72972..fdb74cbb9e0c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -1220,7 +1220,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
 	if (end != NFS_I(inode)->npages) {
 		rcu_read_lock();
-		end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
 		rcu_read_unlock();
 	}
 
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index a27e3fecefaf..250ed5b20c8f 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 	if (page) {
 		set_page_dirty(page);
 		unlock_page(page);
-		mark_page_accessed(page);
 		page_cache_release(page);
 	}
 	ntfs_debug("Done.");
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ea4ba9daeb47..a0b2f345da2b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
 	}
 	do {
 		unlock_page(pages[--do_pages]);
-		mark_page_accessed(pages[do_pages]);
 		page_cache_release(pages[do_pages]);
 	} while (do_pages);
 	if (unlikely(status))
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ad4df869c907..7724fbdf443f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
 #include <linux/mm.h>
+#include <linux/vmacache.h>
 #include <linux/hugetlb.h>
 #include <linux/huge_mm.h>
 #include <linux/mount.h>
@@ -159,7 +160,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 
 	/*
 	 * We remember last_addr rather than next_addr to hit with
-	 * mmap_cache most of the time. We have zero last_addr at
+	 * vmacache most of the time. We have zero last_addr at
 	 * the beginning and also after lseek. We will have -1 last_addr
 	 * after the end of the vmas.
 	 */
diff --git a/fs/super.c b/fs/super.c
index d127de207376..fb68a4c90c98 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 
 	sb = container_of(shrink, struct super_block, s_shrink);
 
-	if (!grab_super_passive(sb))
-		return 0;
-
+	/*
+	 * Don't call grab_super_passive as it is a potential
+	 * scalability bottleneck. The counts could get updated
+	 * between super_cache_count and super_cache_scan anyway.
+	 * Call to super_cache_count with shrinker_rwsem held
+	 * ensures the safety of call to list_lru_count_node() and
+	 * s_op->nr_cached_objects().
+	 */
 	if (sb->s_op && sb->s_op->nr_cached_objects)
 		total_objects = sb->s_op->nr_cached_objects(sb,
 						 sc->nid);
@@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 						 sc->nid);
 
 	total_objects = vfs_pressure_ratio(total_objects);
-	drop_super(sb);
 	return total_objects;
 }
 
@@ -321,10 +325,8 @@ void deactivate_locked_super(struct super_block *s)
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
 		cleancache_invalidate_fs(s);
-		fs->kill_sb(s);
-
-		/* caches are now gone, we can safely kill the shrinker now */
 		unregister_shrinker(&s->s_shrink);
+		fs->kill_sb(s);
 
 		put_filesystem(fs);
 		put_super(s);
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 091d72e70d8a..01e3132820da 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask,
-			bool sync, bool *contended);
+			enum migrate_mode mode, bool *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order);
@@ -62,6 +62,22 @@ static inline bool compaction_deferred(struct zone *zone, int order)
 	return zone->compact_considered < defer_limit;
 }
 
+/*
+ * Update defer tracking counters after successful compaction of given order,
+ * which means an allocation either succeeded (alloc_success == true) or is
+ * expected to succeed.
+ */
+static inline void compaction_defer_reset(struct zone *zone, int order,
+		bool alloc_success)
+{
+	if (alloc_success) {
+		zone->compact_considered = 0;
+		zone->compact_defer_shift = 0;
+	}
+	if (order >= zone->compact_order_failed)
+		zone->compact_order_failed = order + 1;
+}
+
 /* Returns true if restarting compaction after many failures */
 static inline bool compaction_restarting(struct zone *zone, int order)
 {
@@ -75,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order)
 #else
 static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			enum migrate_mode mode, bool *contended)
 {
 	return COMPACT_CONTINUE;
 }
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index cc1b01cf2035..a7ebb89ae9fb 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -12,10 +12,31 @@
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/mm.h>
+#include <linux/jump_label.h>
 
 #ifdef CONFIG_CPUSETS
 
-extern int number_of_cpusets;	/* How many cpusets are defined in system? */
+extern struct static_key cpusets_enabled_key;
+static inline bool cpusets_enabled(void)
+{
+	return static_key_false(&cpusets_enabled_key);
+}
+
+static inline int nr_cpusets(void)
+{
+	/* jump label reference count + the top-level cpuset */
+	return static_key_count(&cpusets_enabled_key) + 1;
+}
+
+static inline void cpuset_inc(void)
+{
+	static_key_slow_inc(&cpusets_enabled_key);
+}
+
+static inline void cpuset_dec(void)
+{
+	static_key_slow_dec(&cpusets_enabled_key);
+}
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
@@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
 
 static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 {
-	return number_of_cpusets <= 1 ||
+	return nr_cpusets() <= 1 ||
 		__cpuset_node_allowed_softwall(node, gfp_mask);
 }
 
 static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 {
-	return number_of_cpusets <= 1 ||
+	return nr_cpusets() <= 1 ||
 		__cpuset_node_allowed_hardwall(node, gfp_mask);
 }
 
@@ -87,25 +108,26 @@ extern void rebuild_sched_domains(void);
 extern void cpuset_print_task_mems_allowed(struct task_struct *p);
 
 /*
- * get_mems_allowed is required when making decisions involving mems_allowed
- * such as during page allocation. mems_allowed can be updated in parallel
- * and depending on the new value an operation can fail potentially causing
- * process failure. A retry loop with get_mems_allowed and put_mems_allowed
- * prevents these artificial failures.
+ * read_mems_allowed_begin is required when making decisions involving
+ * mems_allowed such as during page allocation. mems_allowed can be updated in
+ * parallel and depending on the new value an operation can fail potentially
+ * causing process failure. A retry loop with read_mems_allowed_begin and
+ * read_mems_allowed_retry prevents these artificial failures.
  */
-static inline unsigned int get_mems_allowed(void)
+static inline unsigned int read_mems_allowed_begin(void)
 {
 	return read_seqcount_begin(&current->mems_allowed_seq);
 }
 
 /*
- * If this returns false, the operation that took place after get_mems_allowed
- * may have failed. It is up to the caller to retry the operation if
+ * If this returns true, the operation that took place after
+ * read_mems_allowed_begin may have failed artificially due to a concurrent
+ * update of mems_allowed. It is up to the caller to retry the operation if
 * appropriate.
 */
-static inline bool put_mems_allowed(unsigned int seq)
+static inline bool read_mems_allowed_retry(unsigned int seq)
 {
-	return !read_seqcount_retry(&current->mems_allowed_seq, seq);
+	return read_seqcount_retry(&current->mems_allowed_seq, seq);
 }
 
 static inline void set_mems_allowed(nodemask_t nodemask)
@@ -119,6 +141,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 
 #else /* !CONFIG_CPUSETS */
 
+static inline bool cpusets_enabled(void) { return false; }
+
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
@@ -221,14 +245,14 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 {
 }
 
-static inline unsigned int get_mems_allowed(void)
+static inline unsigned int read_mems_allowed_begin(void)
 {
 	return 0;
 }
 
-static inline bool put_mems_allowed(unsigned int seq)
+static inline bool read_mems_allowed_retry(unsigned int seq)
 {
-	return true;
+	return false;
 }
 
 #endif /* !CONFIG_CPUSETS */
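The renamed pair is used as a classic seqcount read/retry loop; note the flipped sense, since read_mems_allowed_retry() now returns true exactly when a retry is needed. A minimal sketch (try_alloc() is a placeholder, not a kernel API):

	static struct page *alloc_within_mems_allowed(gfp_t gfp, int order)
	{
		struct page *page;
		unsigned int seq;

		do {
			seq = read_mems_allowed_begin();
			page = try_alloc(gfp, order);	/* placeholder */
			/* retry only if the failure may be an artifact of
			 * a concurrent update to current->mems_allowed */
		} while (!page && read_mems_allowed_retry(seq));

		return page;
	}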
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 9b4dd491f7e8..fa7ac989ff56 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -364,8 +364,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
 
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
-extern void free_hot_cold_page(struct page *page, int cold);
-extern void free_hot_cold_page_list(struct list_head *list, int cold);
+extern void free_hot_cold_page(struct page *page, bool cold);
+extern void free_hot_cold_page_list(struct list_head *list, bool cold);
 
 extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
 extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a291552ab767..aac671be9581 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -92,10 +92,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
 #endif /* CONFIG_DEBUG_VM */
 
 extern unsigned long transparent_hugepage_flags;
-extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-			  pmd_t *dst_pmd, pmd_t *src_pmd,
-			  struct vm_area_struct *vma,
-			  unsigned long addr, unsigned long end);
 extern int split_huge_page_to_list(struct page *page, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5214ff63c351..511b1a0d6cc2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -396,6 +396,16 @@ static inline int hugepage_migration_support(struct hstate *h)
 #endif
 }
 
+static inline bool hugepages_supported(void)
+{
+	/*
+	 * Some platform decide whether they support huge pages at boot
+	 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
+	 * there is no such support
+	 */
+	return HPAGE_SHIFT != 0;
+}
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page_node(h, nid) NULL
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index a5079072da66..9216e465289a 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -62,6 +62,10 @@ struct static_key {
 
 # include <asm/jump_label.h>
 # define HAVE_JUMP_LABEL
+#else
+struct static_key {
+	atomic_t enabled;
+};
 #endif	/* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */
 
 enum jump_label_type {
@@ -72,6 +76,12 @@ enum jump_label_type {
 struct module;
 
 #include <linux/atomic.h>
+
+static inline int static_key_count(struct static_key *key)
+{
+	return atomic_read(&key->enabled);
+}
+
 #ifdef HAVE_JUMP_LABEL
 
 #define JUMP_LABEL_TRUE_BRANCH 1UL
@@ -122,24 +132,20 @@ extern void jump_label_apply_nops(struct module *mod);
 
 #else /* !HAVE_JUMP_LABEL */
 
-struct static_key {
-	atomic_t enabled;
-};
-
 static __always_inline void jump_label_init(void)
 {
 }
 
 static __always_inline bool static_key_false(struct static_key *key)
 {
-	if (unlikely(atomic_read(&key->enabled)) > 0)
+	if (unlikely(static_key_count(key) > 0))
 		return true;
 	return false;
 }
 
 static __always_inline bool static_key_true(struct static_key *key)
 {
-	if (likely(atomic_read(&key->enabled)) > 0)
+	if (likely(static_key_count(key) > 0))
 		return true;
 	return false;
 }
@@ -179,7 +185,7 @@ static inline int jump_label_apply_nops(struct module *mod)
 
 static inline bool static_key_enabled(struct static_key *key)
 {
-	return (atomic_read(&key->enabled) > 0);
+	return static_key_count(key) > 0;
 }
 
 #endif	/* _LINUX_JUMP_LABEL_H */
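For context, the pattern static_key_count() now backs in both branches; a minimal usage sketch (my_key and slow_hook() are illustrative names, not from this diff):

	static struct static_key my_key = STATIC_KEY_INIT_FALSE;

	static void fast_path(void)
	{
		/* with HAVE_JUMP_LABEL this is a patched no-op branch;
		 * without it, it falls back to the atomic_read() above */
		if (static_key_false(&my_key))
			slow_hook();	/* rarely-enabled extra work */
	}

	/* toggled elsewhere with static_key_slow_inc()/static_key_slow_dec() */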
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ee8b14ae4f3f..449905ebcab3 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -5,7 +5,9 @@ | |||
5 | #include <linux/mempolicy.h> | 5 | #include <linux/mempolicy.h> |
6 | #include <linux/migrate_mode.h> | 6 | #include <linux/migrate_mode.h> |
7 | 7 | ||
8 | typedef struct page *new_page_t(struct page *, unsigned long private, int **); | 8 | typedef struct page *new_page_t(struct page *page, unsigned long private, |
9 | int **reason); | ||
10 | typedef void free_page_t(struct page *page, unsigned long private); | ||
9 | 11 | ||
10 | /* | 12 | /* |
11 | * Return values from addresss_space_operations.migratepage(): | 13 | * Return values from addresss_space_operations.migratepage(): |
@@ -39,7 +41,7 @@ extern void putback_lru_pages(struct list_head *l); | |||
39 | extern void putback_movable_pages(struct list_head *l); | 41 | extern void putback_movable_pages(struct list_head *l); |
40 | extern int migrate_page(struct address_space *, | 42 | extern int migrate_page(struct address_space *, |
41 | struct page *, struct page *, enum migrate_mode); | 43 | struct page *, struct page *, enum migrate_mode); |
42 | extern int migrate_pages(struct list_head *l, new_page_t x, | 44 | extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, |
43 | unsigned long private, enum migrate_mode mode, int reason); | 45 | unsigned long private, enum migrate_mode mode, int reason); |
44 | 46 | ||
45 | extern int fail_migrate_page(struct address_space *, | 47 | extern int fail_migrate_page(struct address_space *, |
@@ -61,8 +63,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping, | |||
61 | 63 | ||
62 | static inline void putback_lru_pages(struct list_head *l) {} | 64 | static inline void putback_lru_pages(struct list_head *l) {} |
63 | static inline void putback_movable_pages(struct list_head *l) {} | 65 | static inline void putback_movable_pages(struct list_head *l) {} |
64 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 66 | static inline int migrate_pages(struct list_head *l, new_page_t new, |
65 | unsigned long private, enum migrate_mode mode, int reason) | 67 | free_page_t free, unsigned long private, enum migrate_mode mode, |
68 | int reason) | ||
66 | { return -ENOSYS; } | 69 | { return -ENOSYS; } |
67 | 70 | ||
68 | static inline int migrate_prep(void) { return -ENOSYS; } | 71 | static inline int migrate_prep(void) { return -ENOSYS; } |
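The migrate.h hunk adds a free_page_t callback next to new_page_t, so migrate_pages() callers that allocate target pages can take back the ones migration never consumed. A userspace sketch of that callback pairing; struct page, demo_alloc() and demo_free() are hypothetical stand-ins, not kernel code:

#include <stdio.h>
#include <stdlib.h>

struct page { int id; };

typedef struct page *new_page_t(struct page *old, unsigned long private);
typedef void free_page_t(struct page *newpage, unsigned long private);

static struct page *demo_alloc(struct page *old, unsigned long private)
{
        struct page *p = malloc(sizeof(*p));

        if (p)
                p->id = old->id + 1000;
        return p;
}

static void demo_free(struct page *newpage, unsigned long private)
{
        free(newpage);                  /* reclaim an unconsumed target */
}

/* toy migration: pages with even ids migrate, odd ids fail */
static int migrate_one(struct page *old, new_page_t *get_new,
                       free_page_t *put_new, unsigned long private)
{
        struct page *target = get_new(old, private);

        if (!target)
                return -1;
        if (old->id % 2 == 0) {
                printf("page %d migrated to %d\n", old->id, target->id);
                free(target);           /* toy cleanup; a real target lives on */
                return 0;
        }
        put_new(target, private);       /* failure path: hand the target back */
        return 1;
}

int main(void)
{
        struct page even = { 2 }, odd = { 3 };

        migrate_one(&even, demo_alloc, demo_free, 0);
        migrate_one(&odd, demo_alloc, demo_free, 0);
        return 0;
}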
diff --git a/include/linux/mm.h b/include/linux/mm.h index 073734339583..2b3a5330dcf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -919,6 +919,14 @@ extern void show_free_areas(unsigned int flags); | |||
919 | extern bool skip_free_areas_node(unsigned int flags, int nid); | 919 | extern bool skip_free_areas_node(unsigned int flags, int nid); |
920 | 920 | ||
921 | int shmem_zero_setup(struct vm_area_struct *); | 921 | int shmem_zero_setup(struct vm_area_struct *); |
922 | #ifdef CONFIG_SHMEM | ||
923 | bool shmem_mapping(struct address_space *mapping); | ||
924 | #else | ||
925 | static inline bool shmem_mapping(struct address_space *mapping) | ||
926 | { | ||
927 | return false; | ||
928 | } | ||
929 | #endif | ||
922 | 930 | ||
923 | extern int can_do_mlock(void); | 931 | extern int can_do_mlock(void); |
924 | extern int user_shm_lock(size_t, struct user_struct *); | 932 | extern int user_shm_lock(size_t, struct user_struct *); |
@@ -1623,9 +1631,6 @@ void page_cache_async_readahead(struct address_space *mapping, | |||
1623 | unsigned long size); | 1631 | unsigned long size); |
1624 | 1632 | ||
1625 | unsigned long max_sane_readahead(unsigned long nr); | 1633 | unsigned long max_sane_readahead(unsigned long nr); |
1626 | unsigned long ra_submit(struct file_ra_state *ra, | ||
1627 | struct address_space *mapping, | ||
1628 | struct file *filp); | ||
1629 | 1634 | ||
1630 | /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ | 1635 | /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ |
1631 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); | 1636 | extern int expand_stack(struct vm_area_struct *vma, unsigned long address); |
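The mm.h hunk gives shmem_mapping() a CONFIG_SHMEM=n stub that returns false, so call sites can ask "is this a shmem mapping?" without their own #ifdefs. A sketch of that config-stub idiom, using a made-up CONFIG_FOO feature flag:

#include <stdbool.h>
#include <stdio.h>

struct address_space { int is_foo; };

#ifdef CONFIG_FOO
bool foo_mapping(struct address_space *mapping)
{
        return mapping->is_foo;         /* real check lives with the feature */
}
#else
static inline bool foo_mapping(struct address_space *mapping)
{
        return false;                   /* feature compiled out: never matches */
}
#endif

int main(void)
{
        struct address_space m = { 1 };

        /* call sites stay ifdef-free either way */
        printf("%d\n", foo_mapping(&m));
        return 0;
}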
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8e082f18fb6a..b8131e7d6eda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -324,9 +324,9 @@ struct mm_rss_stat { | |||
324 | 324 | ||
325 | struct kioctx_table; | 325 | struct kioctx_table; |
326 | struct mm_struct { | 326 | struct mm_struct { |
327 | struct vm_area_struct * mmap; /* list of VMAs */ | 327 | struct vm_area_struct *mmap; /* list of VMAs */ |
328 | struct rb_root mm_rb; | 328 | struct rb_root mm_rb; |
329 | struct vm_area_struct * mmap_cache; /* last find_vma result */ | 329 | u32 vmacache_seqnum; /* per-thread vmacache */ |
330 | #ifdef CONFIG_MMU | 330 | #ifdef CONFIG_MMU |
331 | unsigned long (*get_unmapped_area) (struct file *filp, | 331 | unsigned long (*get_unmapped_area) (struct file *filp, |
332 | unsigned long addr, unsigned long len, | 332 | unsigned long addr, unsigned long len, |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 56482904a676..450f19c5c865 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled; | |||
78 | #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1) | 78 | #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1) |
79 | #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1) | 79 | #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1) |
80 | 80 | ||
81 | static inline int get_pageblock_migratetype(struct page *page) | 81 | #define get_pageblock_migratetype(page) \ |
82 | get_pfnblock_flags_mask(page, page_to_pfn(page), \ | ||
83 | PB_migrate_end, MIGRATETYPE_MASK) | ||
84 | |||
85 | static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) | ||
82 | { | 86 | { |
83 | BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); | 87 | BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); |
84 | return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK); | 88 | return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, |
89 | MIGRATETYPE_MASK); | ||
85 | } | 90 | } |
86 | 91 | ||
87 | struct free_area { | 92 | struct free_area { |
@@ -138,6 +143,7 @@ enum zone_stat_item { | |||
138 | NR_SHMEM, /* shmem pages (includes tmpfs/GEM pages) */ | 143 | NR_SHMEM, /* shmem pages (includes tmpfs/GEM pages) */ |
139 | NR_DIRTIED, /* page dirtyings since bootup */ | 144 | NR_DIRTIED, /* page dirtyings since bootup */ |
140 | NR_WRITTEN, /* page writings since bootup */ | 145 | NR_WRITTEN, /* page writings since bootup */ |
146 | NR_PAGES_SCANNED, /* pages scanned since last reclaim */ | ||
141 | #ifdef CONFIG_NUMA | 147 | #ifdef CONFIG_NUMA |
142 | NUMA_HIT, /* allocated in intended node */ | 148 | NUMA_HIT, /* allocated in intended node */ |
143 | NUMA_MISS, /* allocated in non intended node */ | 149 | NUMA_MISS, /* allocated in non intended node */ |
@@ -316,19 +322,12 @@ enum zone_type { | |||
316 | #ifndef __GENERATING_BOUNDS_H | 322 | #ifndef __GENERATING_BOUNDS_H |
317 | 323 | ||
318 | struct zone { | 324 | struct zone { |
319 | /* Fields commonly accessed by the page allocator */ | 325 | /* Read-mostly fields */ |
320 | 326 | ||
321 | /* zone watermarks, access with *_wmark_pages(zone) macros */ | 327 | /* zone watermarks, access with *_wmark_pages(zone) macros */ |
322 | unsigned long watermark[NR_WMARK]; | 328 | unsigned long watermark[NR_WMARK]; |
323 | 329 | ||
324 | /* | 330 | /* |
325 | * When free pages are below this point, additional steps are taken | ||
326 | * when reading the number of free pages to avoid per-cpu counter | ||
327 | * drift allowing watermarks to be breached | ||
328 | */ | ||
329 | unsigned long percpu_drift_mark; | ||
330 | |||
331 | /* | ||
332 | * We don't know if the memory that we're going to allocate will be freeable | 331 | * We don't know if the memory that we're going to allocate will be freeable |
333 | * or/and it will be released eventually, so to avoid totally wasting several | 332 | * or/and it will be released eventually, so to avoid totally wasting several |
334 | * GB of ram we must reserve some of the lower zone memory (otherwise we risk | 333 | * GB of ram we must reserve some of the lower zone memory (otherwise we risk |
@@ -336,40 +335,26 @@ struct zone { | |||
336 | * on the higher zones). This array is recalculated at runtime if the | 335 | * on the higher zones). This array is recalculated at runtime if the |
337 | * sysctl_lowmem_reserve_ratio sysctl changes. | 336 | * sysctl_lowmem_reserve_ratio sysctl changes. |
338 | */ | 337 | */ |
339 | unsigned long lowmem_reserve[MAX_NR_ZONES]; | 338 | long lowmem_reserve[MAX_NR_ZONES]; |
340 | |||
341 | /* | ||
342 | * This is a per-zone reserve of pages that should not be | ||
343 | * considered dirtyable memory. | ||
344 | */ | ||
345 | unsigned long dirty_balance_reserve; | ||
346 | 339 | ||
347 | #ifdef CONFIG_NUMA | 340 | #ifdef CONFIG_NUMA |
348 | int node; | 341 | int node; |
342 | #endif | ||
343 | |||
349 | /* | 344 | /* |
350 | * zone reclaim becomes active if more unmapped pages exist. | 345 | * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on |
346 | * this zone's LRU. Maintained by the pageout code. | ||
351 | */ | 347 | */ |
352 | unsigned long min_unmapped_pages; | 348 | unsigned int inactive_ratio; |
353 | unsigned long min_slab_pages; | 349 | |
354 | #endif | 350 | struct pglist_data *zone_pgdat; |
355 | struct per_cpu_pageset __percpu *pageset; | 351 | struct per_cpu_pageset __percpu *pageset; |
352 | |||
356 | /* | 353 | /* |
357 | * free areas of different sizes | 354 | * This is a per-zone reserve of pages that should not be |
355 | * considered dirtyable memory. | ||
358 | */ | 356 | */ |
359 | spinlock_t lock; | 357 | unsigned long dirty_balance_reserve; |
360 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
361 | /* Set to true when the PG_migrate_skip bits should be cleared */ | ||
362 | bool compact_blockskip_flush; | ||
363 | |||
364 | /* pfns where compaction scanners should start */ | ||
365 | unsigned long compact_cached_free_pfn; | ||
366 | unsigned long compact_cached_migrate_pfn; | ||
367 | #endif | ||
368 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
369 | /* see spanned/present_pages for more description */ | ||
370 | seqlock_t span_seqlock; | ||
371 | #endif | ||
372 | struct free_area free_area[MAX_ORDER]; | ||
373 | 358 | ||
374 | #ifndef CONFIG_SPARSEMEM | 359 | #ifndef CONFIG_SPARSEMEM |
375 | /* | 360 | /* |
@@ -379,71 +364,14 @@ struct zone { | |||
379 | unsigned long *pageblock_flags; | 364 | unsigned long *pageblock_flags; |
380 | #endif /* CONFIG_SPARSEMEM */ | 365 | #endif /* CONFIG_SPARSEMEM */ |
381 | 366 | ||
382 | #ifdef CONFIG_COMPACTION | 367 | #ifdef CONFIG_NUMA |
383 | /* | ||
384 | * On compaction failure, 1<<compact_defer_shift compactions | ||
385 | * are skipped before trying again. The number attempted since | ||
386 | * last failure is tracked with compact_considered. | ||
387 | */ | ||
388 | unsigned int compact_considered; | ||
389 | unsigned int compact_defer_shift; | ||
390 | int compact_order_failed; | ||
391 | #endif | ||
392 | |||
393 | ZONE_PADDING(_pad1_) | ||
394 | |||
395 | /* Fields commonly accessed by the page reclaim scanner */ | ||
396 | spinlock_t lru_lock; | ||
397 | struct lruvec lruvec; | ||
398 | |||
399 | unsigned long pages_scanned; /* since last reclaim */ | ||
400 | unsigned long flags; /* zone flags, see below */ | ||
401 | |||
402 | /* Zone statistics */ | ||
403 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | ||
404 | |||
405 | /* | ||
406 | * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on | ||
407 | * this zone's LRU. Maintained by the pageout code. | ||
408 | */ | ||
409 | unsigned int inactive_ratio; | ||
410 | |||
411 | |||
412 | ZONE_PADDING(_pad2_) | ||
413 | /* Rarely used or read-mostly fields */ | ||
414 | |||
415 | /* | 368 | /* |
416 | * wait_table -- the array holding the hash table | 369 | * zone reclaim becomes active if more unmapped pages exist. |
417 | * wait_table_hash_nr_entries -- the size of the hash table array | ||
418 | * wait_table_bits -- wait_table_size == (1 << wait_table_bits) | ||
419 | * | ||
420 | * The purpose of all these is to keep track of the people | ||
421 | * waiting for a page to become available and make them | ||
422 | * runnable again when possible. The trouble is that this | ||
423 | * consumes a lot of space, especially when so few things | ||
424 | * wait on pages at a given time. So instead of using | ||
425 | * per-page waitqueues, we use a waitqueue hash table. | ||
426 | * | ||
427 | * The bucket discipline is to sleep on the same queue when | ||
428 | * colliding and wake all in that wait queue when removing. | ||
429 | * When something wakes, it must check to be sure its page is | ||
430 | * truly available, a la thundering herd. The cost of a | ||
431 | * collision is great, but given the expected load of the | ||
432 | * table, they should be so rare as to be outweighed by the | ||
433 | * benefits from the saved space. | ||
434 | * | ||
435 | * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the | ||
436 | * primary users of these fields, and in mm/page_alloc.c | ||
437 | * free_area_init_core() performs the initialization of them. | ||
438 | */ | 370 | */ |
439 | wait_queue_head_t * wait_table; | 371 | unsigned long min_unmapped_pages; |
440 | unsigned long wait_table_hash_nr_entries; | 372 | unsigned long min_slab_pages; |
441 | unsigned long wait_table_bits; | 373 | #endif /* CONFIG_NUMA */ |
442 | 374 | ||
443 | /* | ||
444 | * Discontig memory support fields. | ||
445 | */ | ||
446 | struct pglist_data *zone_pgdat; | ||
447 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ | 375 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ |
448 | unsigned long zone_start_pfn; | 376 | unsigned long zone_start_pfn; |
449 | 377 | ||
@@ -489,14 +417,103 @@ struct zone { | |||
489 | * adjust_managed_page_count() should be used instead of directly | 417 | * adjust_managed_page_count() should be used instead of directly |
490 | * touching zone->managed_pages and totalram_pages. | 418 | * touching zone->managed_pages and totalram_pages. |
491 | */ | 419 | */ |
420 | unsigned long managed_pages; | ||
492 | unsigned long spanned_pages; | 421 | unsigned long spanned_pages; |
493 | unsigned long present_pages; | 422 | unsigned long present_pages; |
494 | unsigned long managed_pages; | 423 | |
424 | const char *name; | ||
495 | 425 | ||
496 | /* | 426 | /* |
497 | * rarely used fields: | 427 | * Number of MIGRATE_RESERVE page blocks, maintained only as
428 | * an optimization. Protected by zone->lock. |
498 | */ | 429 | */ |
499 | const char *name; | 430 | int nr_migrate_reserve_block; |
431 | |||
432 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
433 | /* see spanned/present_pages for more description */ | ||
434 | seqlock_t span_seqlock; | ||
435 | #endif | ||
436 | |||
437 | /* | ||
438 | * wait_table -- the array holding the hash table | ||
439 | * wait_table_hash_nr_entries -- the size of the hash table array | ||
440 | * wait_table_bits -- wait_table_size == (1 << wait_table_bits) | ||
441 | * | ||
442 | * The purpose of all these is to keep track of the people | ||
443 | * waiting for a page to become available and make them | ||
444 | * runnable again when possible. The trouble is that this | ||
445 | * consumes a lot of space, especially when so few things | ||
446 | * wait on pages at a given time. So instead of using | ||
447 | * per-page waitqueues, we use a waitqueue hash table. | ||
448 | * | ||
449 | * The bucket discipline is to sleep on the same queue when | ||
450 | * colliding and wake all in that wait queue when removing. | ||
451 | * When something wakes, it must check to be sure its page is | ||
452 | * truly available, a la thundering herd. The cost of a | ||
453 | * collision is great, but given the expected load of the | ||
454 | * table, they should be so rare as to be outweighed by the | ||
455 | * benefits from the saved space. | ||
456 | * | ||
457 | * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the | ||
458 | * primary users of these fields, and in mm/page_alloc.c | ||
459 | * free_area_init_core() performs the initialization of them. | ||
460 | */ | ||
461 | wait_queue_head_t *wait_table; | ||
462 | unsigned long wait_table_hash_nr_entries; | ||
463 | unsigned long wait_table_bits; | ||
464 | |||
465 | ZONE_PADDING(_pad1_) | ||
466 | |||
467 | /* Write-intensive fields used from the page allocator */ | ||
468 | spinlock_t lock; | ||
469 | |||
470 | /* free areas of different sizes */ | ||
471 | struct free_area free_area[MAX_ORDER]; | ||
472 | |||
473 | /* zone flags, see below */ | ||
474 | unsigned long flags; | ||
475 | |||
476 | ZONE_PADDING(_pad2_) | ||
477 | |||
478 | /* Write-intensive fields used by page reclaim */ | ||
479 | |||
480 | /* Fields commonly accessed by the page reclaim scanner */ | ||
481 | spinlock_t lru_lock; | ||
482 | struct lruvec lruvec; | ||
483 | |||
484 | /* | ||
485 | * When free pages are below this point, additional steps are taken | ||
486 | * when reading the number of free pages to avoid per-cpu counter | ||
487 | * drift allowing watermarks to be breached | ||
488 | */ | ||
489 | unsigned long percpu_drift_mark; | ||
490 | |||
491 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
492 | /* pfn where compaction free scanner should start */ | ||
493 | unsigned long compact_cached_free_pfn; | ||
494 | /* pfn where async and sync compaction migration scanner should start */ | ||
495 | unsigned long compact_cached_migrate_pfn[2]; | ||
496 | #endif | ||
497 | |||
498 | #ifdef CONFIG_COMPACTION | ||
499 | /* | ||
500 | * On compaction failure, 1<<compact_defer_shift compactions | ||
501 | * are skipped before trying again. The number attempted since | ||
502 | * last failure is tracked with compact_considered. | ||
503 | */ | ||
504 | unsigned int compact_considered; | ||
505 | unsigned int compact_defer_shift; | ||
506 | int compact_order_failed; | ||
507 | #endif | ||
508 | |||
509 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
510 | /* Set to true when the PG_migrate_skip bits should be cleared */ | ||
511 | bool compact_blockskip_flush; | ||
512 | #endif | ||
513 | |||
514 | ZONE_PADDING(_pad3_) | ||
515 | /* Zone statistics */ | ||
516 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | ||
500 | } ____cacheline_internodealigned_in_smp; | 517 | } ____cacheline_internodealigned_in_smp; |
501 | 518 | ||
502 | typedef enum { | 519 | typedef enum { |
@@ -512,6 +529,7 @@ typedef enum { | |||
512 | ZONE_WRITEBACK, /* reclaim scanning has recently found | 529 | ZONE_WRITEBACK, /* reclaim scanning has recently found |
513 | * many pages under writeback | 530 | * many pages under writeback |
514 | */ | 531 | */ |
532 | ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ | ||
515 | } zone_flags_t; | 533 | } zone_flags_t; |
516 | 534 | ||
517 | static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) | 535 | static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) |
@@ -549,6 +567,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone) | |||
549 | return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); | 567 | return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); |
550 | } | 568 | } |
551 | 569 | ||
570 | static inline int zone_is_fair_depleted(const struct zone *zone) | ||
571 | { | ||
572 | return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); | ||
573 | } | ||
574 | |||
552 | static inline int zone_is_oom_locked(const struct zone *zone) | 575 | static inline int zone_is_oom_locked(const struct zone *zone) |
553 | { | 576 | { |
554 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); | 577 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); |
@@ -803,10 +826,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) | |||
803 | extern struct mutex zonelists_mutex; | 826 | extern struct mutex zonelists_mutex; |
804 | void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); | 827 | void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); |
805 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); | 828 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); |
806 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 829 | bool zone_watermark_ok(struct zone *z, unsigned int order, |
807 | int classzone_idx, int alloc_flags); | 830 | unsigned long mark, int classzone_idx, int alloc_flags); |
808 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 831 | bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
809 | int classzone_idx, int alloc_flags); | 832 | unsigned long mark, int classzone_idx, int alloc_flags); |
810 | enum memmap_context { | 833 | enum memmap_context { |
811 | MEMMAP_EARLY, | 834 | MEMMAP_EARLY, |
812 | MEMMAP_HOTPLUG, | 835 | MEMMAP_HOTPLUG, |
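The struct zone rewrite above groups read-mostly fields first and pushes the write-intensive ones (zone->lock with the free lists, lru_lock with the LRU, and the vm_stat counters) behind ZONE_PADDING markers, so hot writers do not dirty the cache lines that allocator fast paths only read. A userspace sketch of the idea, assuming 64-byte cache lines and a gcc/clang alignment attribute; the kernel derives both per architecture:

#include <stdio.h>
#include <stddef.h>

#define CACHELINE 64

struct hot_cold {
        /* read-mostly: read on every allocation, written rarely */
        long watermark;
        long lowmem_reserve;

        char _pad1_[CACHELINE];         /* keep the hot writers off this line */

        /* write-intensive: bounced between CPUs under a lock */
        long lock_word;
        long free_count;
} __attribute__((aligned(CACHELINE)));

int main(void)
{
        printf("read side at %zu, write side at %zu\n",
               offsetof(struct hot_cold, watermark),
               offsetof(struct hot_cold, lock_word));
        return 0;
}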
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index dd7d45b5c496..2284ea62c6cc 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -198,6 +198,7 @@ struct page; /* forward declaration */ | |||
198 | TESTPAGEFLAG(Locked, locked) | 198 | TESTPAGEFLAG(Locked, locked) |
199 | PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) | 199 | PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) |
200 | PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) | 200 | PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) |
201 | __SETPAGEFLAG(Referenced, referenced) | ||
201 | PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) | 202 | PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) |
202 | PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) | 203 | PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) |
203 | PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) | 204 | PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) |
@@ -208,6 +209,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ | |||
208 | PAGEFLAG(SavePinned, savepinned); /* Xen */ | 209 | PAGEFLAG(SavePinned, savepinned); /* Xen */ |
209 | PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) | 210 | PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) |
210 | PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) | 211 | PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) |
212 | __SETPAGEFLAG(SwapBacked, swapbacked) | ||
211 | 213 | ||
212 | __PAGEFLAG(SlobFree, slob_free) | 214 | __PAGEFLAG(SlobFree, slob_free) |
213 | 215 | ||
@@ -228,9 +230,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) | |||
228 | TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) | 230 | TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) |
229 | PAGEFLAG(MappedToDisk, mappedtodisk) | 231 | PAGEFLAG(MappedToDisk, mappedtodisk) |
230 | 232 | ||
231 | /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ | 233 | /* PG_readahead is only used for reads; PG_reclaim is only for writes */ |
232 | PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) | 234 | PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) |
233 | PAGEFLAG(Readahead, reclaim) /* Reminder to do async read-ahead */ | 235 | PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) |
234 | 236 | ||
235 | #ifdef CONFIG_HIGHMEM | 237 | #ifdef CONFIG_HIGHMEM |
236 | /* | 238 | /* |
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index c08730c10c7a..2baeee12f48e 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h | |||
@@ -65,33 +65,26 @@ extern int pageblock_order; | |||
65 | /* Forward declaration */ | 65 | /* Forward declaration */ |
66 | struct page; | 66 | struct page; |
67 | 67 | ||
68 | unsigned long get_pageblock_flags_mask(struct page *page, | 68 | unsigned long get_pfnblock_flags_mask(struct page *page, |
69 | unsigned long pfn, | ||
69 | unsigned long end_bitidx, | 70 | unsigned long end_bitidx, |
70 | unsigned long mask); | 71 | unsigned long mask); |
71 | void set_pageblock_flags_mask(struct page *page, | 72 | |
73 | void set_pfnblock_flags_mask(struct page *page, | ||
72 | unsigned long flags, | 74 | unsigned long flags, |
75 | unsigned long pfn, | ||
73 | unsigned long end_bitidx, | 76 | unsigned long end_bitidx, |
74 | unsigned long mask); | 77 | unsigned long mask); |
75 | 78 | ||
76 | /* Declarations for getting and setting flags. See mm/page_alloc.c */ | 79 | /* Declarations for getting and setting flags. See mm/page_alloc.c */ |
77 | static inline unsigned long get_pageblock_flags_group(struct page *page, | 80 | #define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \ |
78 | int start_bitidx, int end_bitidx) | 81 | get_pfnblock_flags_mask(page, page_to_pfn(page), \ |
79 | { | 82 | end_bitidx, \ |
80 | unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; | 83 | (1 << (end_bitidx - start_bitidx + 1)) - 1) |
81 | unsigned long mask = (1 << nr_flag_bits) - 1; | 84 | #define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \ |
82 | | 85 | set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \ |
83 | return get_pageblock_flags_mask(page, end_bitidx, mask); | 86 | end_bitidx, \ |
84 | } | 87 | (1 << (end_bitidx - start_bitidx + 1)) - 1) |
85 | |||
86 | static inline void set_pageblock_flags_group(struct page *page, | ||
87 | unsigned long flags, | ||
88 | int start_bitidx, int end_bitidx) | ||
89 | { | ||
90 | unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; | ||
91 | unsigned long mask = (1 << nr_flag_bits) - 1; | ||
92 | |||
93 | set_pageblock_flags_mask(page, flags, end_bitidx, mask); | ||
94 | } | ||
95 | 88 | ||
96 | #ifdef CONFIG_COMPACTION | 89 | #ifdef CONFIG_COMPACTION |
97 | #define get_pageblock_skip(page) \ | 90 | #define get_pageblock_skip(page) \ |
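The pageblock-flags.h macros above fold the old inline helpers' mask computation into the call site: for a bit range start_bitidx..end_bitidx the mask is (1 << (end - start + 1)) - 1, which for the 3-bit migratetype range comes out to 0x7. A small worked example of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned long start_bitidx = 0, end_bitidx = 2;
        unsigned long mask = (1UL << (end_bitidx - start_bitidx + 1)) - 1;

        printf("mask = %#lx\n", mask);  /* 0x7: three flag bits */

        /* extracting a value stored at some bit offset in a bitmap word */
        unsigned long word = 0x5a;      /* arbitrary bitmap contents */
        unsigned long bitidx = 3;       /* where this block's bits start */
        printf("value = %lu\n", (word >> bitidx) & mask);  /* (0x5a>>3)&7 = 3 */
        return 0;
}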
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e3dea75a078b..d57a02a9747b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -99,7 +99,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) | |||
99 | 99 | ||
100 | #define page_cache_get(page) get_page(page) | 100 | #define page_cache_get(page) get_page(page) |
101 | #define page_cache_release(page) put_page(page) | 101 | #define page_cache_release(page) put_page(page) |
102 | void release_pages(struct page **pages, int nr, int cold); | 102 | void release_pages(struct page **pages, int nr, bool cold); |
103 | 103 | ||
104 | /* | 104 | /* |
105 | * speculatively take a reference to a page. | 105 | * speculatively take a reference to a page. |
@@ -243,12 +243,117 @@ static inline struct page *page_cache_alloc_readahead(struct address_space *x) | |||
243 | 243 | ||
244 | typedef int filler_t(void *, struct page *); | 244 | typedef int filler_t(void *, struct page *); |
245 | 245 | ||
246 | extern struct page * find_get_page(struct address_space *mapping, | 246 | pgoff_t page_cache_next_hole(struct address_space *mapping, |
247 | pgoff_t index); | 247 | pgoff_t index, unsigned long max_scan); |
248 | extern struct page * find_lock_page(struct address_space *mapping, | 248 | pgoff_t page_cache_prev_hole(struct address_space *mapping, |
249 | pgoff_t index); | 249 | pgoff_t index, unsigned long max_scan); |
250 | extern struct page * find_or_create_page(struct address_space *mapping, | 250 | |
251 | pgoff_t index, gfp_t gfp_mask); | 251 | #define FGP_ACCESSED 0x00000001 |
252 | #define FGP_LOCK 0x00000002 | ||
253 | #define FGP_CREAT 0x00000004 | ||
254 | #define FGP_WRITE 0x00000008 | ||
255 | #define FGP_NOFS 0x00000010 | ||
256 | #define FGP_NOWAIT 0x00000020 | ||
257 | |||
258 | struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, | ||
259 | int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask); | ||
260 | |||
261 | /** | ||
262 | * find_get_page - find and get a page reference | ||
263 | * @mapping: the address_space to search | ||
264 | * @offset: the page index | ||
265 | * | ||
266 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
267 | * page cache page, it is returned with an increased refcount. | ||
268 | * | ||
269 | * Otherwise, %NULL is returned. | ||
270 | */ | ||
271 | static inline struct page *find_get_page(struct address_space *mapping, | ||
272 | pgoff_t offset) | ||
273 | { | ||
274 | return pagecache_get_page(mapping, offset, 0, 0, 0); | ||
275 | } | ||
276 | |||
277 | static inline struct page *find_get_page_flags(struct address_space *mapping, | ||
278 | pgoff_t offset, int fgp_flags) | ||
279 | { | ||
280 | return pagecache_get_page(mapping, offset, fgp_flags, 0, 0); | ||
281 | } | ||
282 | |||
283 | /** | ||
284 | * find_lock_page - locate, pin and lock a pagecache page |
285 | * |
286 | * @mapping: the address_space to search | ||
287 | * @offset: the page index | ||
288 | * | ||
289 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
290 | * page cache page, it is returned locked and with an increased | ||
291 | * refcount. | ||
292 | * | ||
293 | * Otherwise, %NULL is returned. | ||
294 | * | ||
295 | * find_lock_page() may sleep. | ||
296 | */ | ||
297 | static inline struct page *find_lock_page(struct address_space *mapping, | ||
298 | pgoff_t offset) | ||
299 | { | ||
300 | return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0); | ||
301 | } | ||
302 | |||
303 | /** | ||
304 | * find_or_create_page - locate or add a pagecache page | ||
305 | * @mapping: the page's address_space | ||
306 | * @index: the page's index into the mapping | ||
307 | * @gfp_mask: page allocation mode | ||
308 | * | ||
309 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
310 | * page cache page, it is returned locked and with an increased | ||
311 | * refcount. | ||
312 | * | ||
313 | * If the page is not present, a new page is allocated using @gfp_mask | ||
314 | * and added to the page cache and the VM's LRU list. The page is | ||
315 | * returned locked and with an increased refcount. | ||
316 | * | ||
317 | * On memory exhaustion, %NULL is returned. | ||
318 | * | ||
319 | * find_or_create_page() may sleep, even if @gfp_mask specifies an |
320 | * atomic allocation! | ||
321 | */ | ||
322 | static inline struct page *find_or_create_page(struct address_space *mapping, | ||
323 | pgoff_t offset, gfp_t gfp_mask) | ||
324 | { | ||
325 | return pagecache_get_page(mapping, offset, | ||
326 | FGP_LOCK|FGP_ACCESSED|FGP_CREAT, | ||
327 | gfp_mask, gfp_mask & GFP_RECLAIM_MASK); | ||
328 | } | ||
329 | |||
330 | /** | ||
331 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
332 | * @mapping: target address_space | ||
333 | * @index: the page index | ||
334 | * | ||
335 | * Same as grab_cache_page(), but do not wait if the page is unavailable. | ||
336 | * This is intended for speculative data generators, where the data can | ||
337 | * be regenerated if the page couldn't be grabbed. This routine should | ||
338 | * be safe to call while holding the lock for another page. | ||
339 | * | ||
340 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs | ||
341 | * and deadlock against the caller's locked page. | ||
342 | */ | ||
343 | static inline struct page *grab_cache_page_nowait(struct address_space *mapping, | ||
344 | pgoff_t index) | ||
345 | { | ||
346 | return pagecache_get_page(mapping, index, | ||
347 | FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, | ||
348 | mapping_gfp_mask(mapping), | ||
349 | GFP_NOFS); | ||
350 | } | ||
351 | |||
352 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); | ||
353 | struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); | ||
354 | unsigned find_get_entries(struct address_space *mapping, pgoff_t start, | ||
355 | unsigned int nr_entries, struct page **entries, | ||
356 | pgoff_t *indices); | ||
252 | unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | 357 | unsigned find_get_pages(struct address_space *mapping, pgoff_t start, |
253 | unsigned int nr_pages, struct page **pages); | 358 | unsigned int nr_pages, struct page **pages); |
254 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, | 359 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, |
@@ -268,10 +373,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping, | |||
268 | return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); | 373 | return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); |
269 | } | 374 | } |
270 | 375 | ||
271 | extern struct page * grab_cache_page_nowait(struct address_space *mapping, | ||
272 | pgoff_t index); | ||
273 | extern struct page * read_cache_page_async(struct address_space *mapping, | ||
274 | pgoff_t index, filler_t *filler, void *data); | ||
275 | extern struct page * read_cache_page(struct address_space *mapping, | 376 | extern struct page * read_cache_page(struct address_space *mapping, |
276 | pgoff_t index, filler_t *filler, void *data); | 377 | pgoff_t index, filler_t *filler, void *data); |
277 | extern struct page * read_cache_page_gfp(struct address_space *mapping, | 378 | extern struct page * read_cache_page_gfp(struct address_space *mapping, |
@@ -279,14 +380,6 @@ extern struct page * read_cache_page_gfp(struct address_space *mapping, | |||
279 | extern int read_cache_pages(struct address_space *mapping, | 380 | extern int read_cache_pages(struct address_space *mapping, |
280 | struct list_head *pages, filler_t *filler, void *data); | 381 | struct list_head *pages, filler_t *filler, void *data); |
281 | 382 | ||
282 | static inline struct page *read_mapping_page_async( | ||
283 | struct address_space *mapping, | ||
284 | pgoff_t index, void *data) | ||
285 | { | ||
286 | filler_t *filler = (filler_t *)mapping->a_ops->readpage; | ||
287 | return read_cache_page_async(mapping, index, filler, data); | ||
288 | } | ||
289 | |||
290 | static inline struct page *read_mapping_page(struct address_space *mapping, | 383 | static inline struct page *read_mapping_page(struct address_space *mapping, |
291 | pgoff_t index, void *data) | 384 | pgoff_t index, void *data) |
292 | { | 385 | { |
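The pagemap.h hunk collapses find_get_page(), find_lock_page(), find_or_create_page() and grab_cache_page_nowait() into thin inline wrappers that pass different FGP_* flag combinations to one worker, pagecache_get_page(). A userspace sketch of that flag-dispatch shape; the worker body here is a hypothetical stand-in, not the page cache logic:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define FGP_ACCESSED    0x01
#define FGP_LOCK        0x02
#define FGP_CREAT       0x04

struct page { bool locked, accessed; };

static struct page *get_page_worker(int fgp_flags)
{
        struct page *page = NULL;       /* pretend the lookup missed */

        if (!page && (fgp_flags & FGP_CREAT))
                page = calloc(1, sizeof(*page));
        if (!page)
                return NULL;
        if (fgp_flags & FGP_LOCK)
                page->locked = true;
        if (fgp_flags & FGP_ACCESSED)
                page->accessed = true;
        return page;
}

/* wrappers just pick a flag combination, like the inlines above */
static struct page *find_or_create(void)
{
        return get_page_worker(FGP_LOCK | FGP_ACCESSED | FGP_CREAT);
}

int main(void)
{
        struct page *p = find_or_create();

        if (p)
                printf("locked=%d accessed=%d\n", p->locked, p->accessed);
        free(p);
        return 0;
}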
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index e4dbfab37729..b45d391b4540 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h | |||
@@ -22,6 +22,11 @@ struct pagevec { | |||
22 | 22 | ||
23 | void __pagevec_release(struct pagevec *pvec); | 23 | void __pagevec_release(struct pagevec *pvec); |
24 | void __pagevec_lru_add(struct pagevec *pvec); | 24 | void __pagevec_lru_add(struct pagevec *pvec); |
25 | unsigned pagevec_lookup_entries(struct pagevec *pvec, | ||
26 | struct address_space *mapping, | ||
27 | pgoff_t start, unsigned nr_entries, | ||
28 | pgoff_t *indices); | ||
29 | void pagevec_remove_exceptionals(struct pagevec *pvec); | ||
25 | unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, | 30 | unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, |
26 | pgoff_t start, unsigned nr_pages); | 31 | pgoff_t start, unsigned nr_pages); |
27 | unsigned pagevec_lookup_tag(struct pagevec *pvec, | 32 | unsigned pagevec_lookup_tag(struct pagevec *pvec, |
diff --git a/include/linux/plist.h b/include/linux/plist.h index aa0fb390bd29..8b6c970cff6c 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h | |||
@@ -98,6 +98,13 @@ struct plist_node { | |||
98 | } | 98 | } |
99 | 99 | ||
100 | /** | 100 | /** |
101 | * PLIST_HEAD - declare and init plist_head | ||
102 | * @head: name for struct plist_head variable | ||
103 | */ | ||
104 | #define PLIST_HEAD(head) \ | ||
105 | struct plist_head head = PLIST_HEAD_INIT(head) | ||
106 | |||
107 | /** | ||
101 | * PLIST_NODE_INIT - static struct plist_node initializer | 108 | * PLIST_NODE_INIT - static struct plist_node initializer |
102 | * @node: struct plist_node variable name | 109 | * @node: struct plist_node variable name |
103 | * @__prio: initial node priority | 110 | * @__prio: initial node priority |
@@ -134,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio) | |||
134 | extern void plist_add(struct plist_node *node, struct plist_head *head); | 141 | extern void plist_add(struct plist_node *node, struct plist_head *head); |
135 | extern void plist_del(struct plist_node *node, struct plist_head *head); | 142 | extern void plist_del(struct plist_node *node, struct plist_head *head); |
136 | 143 | ||
144 | extern void plist_requeue(struct plist_node *node, struct plist_head *head); | ||
145 | |||
137 | /** | 146 | /** |
138 | * plist_for_each - iterate over the plist | 147 | * plist_for_each - iterate over the plist |
139 | * @pos: the type * to use as a loop counter | 148 | * @pos: the type * to use as a loop counter |
@@ -143,6 +152,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); | |||
143 | list_for_each_entry(pos, &(head)->node_list, node_list) | 152 | list_for_each_entry(pos, &(head)->node_list, node_list) |
144 | 153 | ||
145 | /** | 154 | /** |
155 | * plist_for_each_continue - continue iteration over the plist | ||
156 | * @pos: the type * to use as a loop cursor | ||
157 | * @head: the head for your list | ||
158 | * | ||
159 | * Continue to iterate over plist, continuing after the current position. | ||
160 | */ | ||
161 | #define plist_for_each_continue(pos, head) \ | ||
162 | list_for_each_entry_continue(pos, &(head)->node_list, node_list) | ||
163 | |||
164 | /** | ||
146 | * plist_for_each_safe - iterate safely over a plist of given type | 165 | * plist_for_each_safe - iterate safely over a plist of given type |
147 | * @pos: the type * to use as a loop counter | 166 | * @pos: the type * to use as a loop counter |
148 | * @n: another type * to use as temporary storage | 167 | * @n: another type * to use as temporary storage |
@@ -163,6 +182,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); | |||
163 | list_for_each_entry(pos, &(head)->node_list, mem.node_list) | 182 | list_for_each_entry(pos, &(head)->node_list, mem.node_list) |
164 | 183 | ||
165 | /** | 184 | /** |
185 | * plist_for_each_entry_continue - continue iteration over list of given type | ||
186 | * @pos: the type * to use as a loop cursor | ||
187 | * @head: the head for your list | ||
188 | * @m: the name of the list_struct within the struct | ||
189 | * | ||
190 | * Continue to iterate over list of given type, continuing after | ||
191 | * the current position. | ||
192 | */ | ||
193 | #define plist_for_each_entry_continue(pos, head, m) \ | ||
194 | list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) | ||
195 | |||
196 | /** | ||
166 | * plist_for_each_entry_safe - iterate safely over list of given type | 197 | * plist_for_each_entry_safe - iterate safely over list of given type |
167 | * @pos: the type * to use as a loop counter | 198 | * @pos: the type * to use as a loop counter |
168 | * @n: another type * to use as temporary storage | 199 | * @n: another type * to use as temporary storage |
@@ -229,6 +260,20 @@ static inline int plist_node_empty(const struct plist_node *node) | |||
229 | #endif | 260 | #endif |
230 | 261 | ||
231 | /** | 262 | /** |
263 | * plist_next - get the next entry in list | ||
264 | * @pos: the type * to cursor | ||
265 | */ | ||
266 | #define plist_next(pos) \ | ||
267 | list_next_entry(pos, node_list) | ||
268 | |||
269 | /** | ||
270 | * plist_prev - get the prev entry in list | ||
271 | * @pos: the type * to cursor | ||
272 | */ | ||
273 | #define plist_prev(pos) \ | ||
274 | list_prev_entry(pos, node_list) | ||
275 | |||
276 | /** | ||
232 | * plist_first - return the first node (and thus, highest priority) | 277 | * plist_first - return the first node (and thus, highest priority) |
233 | * @head: the &struct plist_head pointer | 278 | * @head: the &struct plist_head pointer |
234 | * | 279 | * |
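The plist.h additions above are thin wrappers over the list.h continue/next helpers on the embedded node_list; plist_for_each_entry_continue() resumes iteration after the current cursor, which the swap changes later in this diff use to rotate among same-priority swap devices. A userspace sketch of resume-after-cursor iteration, with a minimal singly linked circular list and a negative-priority sentinel standing in for the real head detection:

#include <stdio.h>

struct node { int prio; struct node *next; };

#define for_each_continue(pos) \
        for ((pos) = (pos)->next; (pos)->prio >= 0; (pos) = (pos)->next)

int main(void)
{
        struct node head = { -1, NULL };                /* sentinel, like the list head */
        struct node c = { 30, &head }, b = { 20, &c }, a = { 10, &b };
        struct node *pos = &a;                          /* current position */

        head.next = &a;

        /* resume after 'a', visiting only the later, lower-priority entries */
        for_each_continue(pos)
                printf("prio %d\n", pos->prio);         /* 20, then 30 */
        return 0;
}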
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 403940787be1..e8be53ecfc45 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h | |||
@@ -219,6 +219,7 @@ static inline void radix_tree_replace_slot(void **pslot, void *item) | |||
219 | int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); | 219 | int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); |
220 | void *radix_tree_lookup(struct radix_tree_root *, unsigned long); | 220 | void *radix_tree_lookup(struct radix_tree_root *, unsigned long); |
221 | void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); | 221 | void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); |
222 | void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); | ||
222 | void *radix_tree_delete(struct radix_tree_root *, unsigned long); | 223 | void *radix_tree_delete(struct radix_tree_root *, unsigned long); |
223 | unsigned int | 224 | unsigned int |
224 | radix_tree_gang_lookup(struct radix_tree_root *root, void **results, | 225 | radix_tree_gang_lookup(struct radix_tree_root *root, void **results, |
@@ -226,10 +227,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, | |||
226 | unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, | 227 | unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, |
227 | void ***results, unsigned long *indices, | 228 | void ***results, unsigned long *indices, |
228 | unsigned long first_index, unsigned int max_items); | 229 | unsigned long first_index, unsigned int max_items); |
229 | unsigned long radix_tree_next_hole(struct radix_tree_root *root, | ||
230 | unsigned long index, unsigned long max_scan); | ||
231 | unsigned long radix_tree_prev_hole(struct radix_tree_root *root, | ||
232 | unsigned long index, unsigned long max_scan); | ||
233 | int radix_tree_preload(gfp_t gfp_mask); | 230 | int radix_tree_preload(gfp_t gfp_mask); |
234 | int radix_tree_maybe_preload(gfp_t gfp_mask); | 231 | int radix_tree_maybe_preload(gfp_t gfp_mask); |
235 | void radix_tree_init(void); | 232 | void radix_tree_init(void); |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 0827bec7d82f..cb67b4e2dba2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -63,6 +63,10 @@ struct fs_struct; | |||
63 | struct perf_event_context; | 63 | struct perf_event_context; |
64 | struct blk_plug; | 64 | struct blk_plug; |
65 | 65 | ||
66 | #define VMACACHE_BITS 2 | ||
67 | #define VMACACHE_SIZE (1U << VMACACHE_BITS) | ||
68 | #define VMACACHE_MASK (VMACACHE_SIZE - 1) | ||
69 | |||
66 | /* | 70 | /* |
67 | * List of flags we want to share for kernel threads, | 71 | * List of flags we want to share for kernel threads, |
68 | * if only because they are not used by them anyway. | 72 | * if only because they are not used by them anyway. |
@@ -1093,6 +1097,9 @@ struct task_struct { | |||
1093 | #ifdef CONFIG_COMPAT_BRK | 1097 | #ifdef CONFIG_COMPAT_BRK |
1094 | unsigned brk_randomized:1; | 1098 | unsigned brk_randomized:1; |
1095 | #endif | 1099 | #endif |
1100 | /* per-thread vma caching */ | ||
1101 | u32 vmacache_seqnum; | ||
1102 | struct vm_area_struct *vmacache[VMACACHE_SIZE]; | ||
1096 | #if defined(SPLIT_RSS_COUNTING) | 1103 | #if defined(SPLIT_RSS_COUNTING) |
1097 | struct task_rss_stat rss_stat; | 1104 | struct task_rss_stat rss_stat; |
1098 | #endif | 1105 | #endif |
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 30aa0dc60d75..deb49609cd36 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h | |||
@@ -49,6 +49,7 @@ extern struct file *shmem_file_setup(const char *name, | |||
49 | loff_t size, unsigned long flags); | 49 | loff_t size, unsigned long flags); |
50 | extern int shmem_zero_setup(struct vm_area_struct *); | 50 | extern int shmem_zero_setup(struct vm_area_struct *); |
51 | extern int shmem_lock(struct file *file, int lock, struct user_struct *user); | 51 | extern int shmem_lock(struct file *file, int lock, struct user_struct *user); |
52 | extern bool shmem_mapping(struct address_space *mapping); | ||
52 | extern void shmem_unlock_mapping(struct address_space *mapping); | 53 | extern void shmem_unlock_mapping(struct address_space *mapping); |
53 | extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | 54 | extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, |
54 | pgoff_t index, gfp_t gfp_mask); | 55 | pgoff_t index, gfp_t gfp_mask); |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 46ba0c6c219f..241bf0922770 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -214,8 +214,9 @@ struct percpu_cluster { | |||
214 | struct swap_info_struct { | 214 | struct swap_info_struct { |
215 | unsigned long flags; /* SWP_USED etc: see above */ | 215 | unsigned long flags; /* SWP_USED etc: see above */ |
216 | signed short prio; /* swap priority of this type */ | 216 | signed short prio; /* swap priority of this type */ |
217 | struct plist_node list; /* entry in swap_active_head */ | ||
218 | struct plist_node avail_list; /* entry in swap_avail_head */ | ||
217 | signed char type; /* strange name for an index */ | 219 | signed char type; /* strange name for an index */ |
218 | signed char next; /* next type on the swap list */ | ||
219 | unsigned int max; /* extent of the swap_map */ | 220 | unsigned int max; /* extent of the swap_map */ |
220 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ | 221 | unsigned char *swap_map; /* vmalloc'ed array of usage counts */ |
221 | struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ | 222 | struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ |
@@ -255,11 +256,6 @@ struct swap_info_struct { | |||
255 | struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ | 256 | struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ |
256 | }; | 257 | }; |
257 | 258 | ||
258 | struct swap_list_t { | ||
259 | int head; /* head of priority-ordered swapfile list */ | ||
260 | int next; /* swapfile to be used next */ | ||
261 | }; | ||
262 | |||
263 | /* linux/mm/page_alloc.c */ | 259 | /* linux/mm/page_alloc.c */ |
264 | extern unsigned long totalram_pages; | 260 | extern unsigned long totalram_pages; |
265 | extern unsigned long totalreserve_pages; | 261 | extern unsigned long totalreserve_pages; |
@@ -272,12 +268,14 @@ extern unsigned long nr_free_pagecache_pages(void); | |||
272 | 268 | ||
273 | 269 | ||
274 | /* linux/mm/swap.c */ | 270 | /* linux/mm/swap.c */ |
275 | extern void __lru_cache_add(struct page *); | ||
276 | extern void lru_cache_add(struct page *); | 271 | extern void lru_cache_add(struct page *); |
272 | extern void lru_cache_add_anon(struct page *page); | ||
273 | extern void lru_cache_add_file(struct page *page); | ||
277 | extern void lru_add_page_tail(struct page *page, struct page *page_tail, | 274 | extern void lru_add_page_tail(struct page *page, struct page *page_tail, |
278 | struct lruvec *lruvec, struct list_head *head); | 275 | struct lruvec *lruvec, struct list_head *head); |
279 | extern void activate_page(struct page *); | 276 | extern void activate_page(struct page *); |
280 | extern void mark_page_accessed(struct page *); | 277 | extern void mark_page_accessed(struct page *); |
278 | extern void init_page_accessed(struct page *page); | ||
281 | extern void lru_add_drain(void); | 279 | extern void lru_add_drain(void); |
282 | extern void lru_add_drain_cpu(int cpu); | 280 | extern void lru_add_drain_cpu(int cpu); |
283 | extern void lru_add_drain_all(void); | 281 | extern void lru_add_drain_all(void); |
@@ -287,22 +285,6 @@ extern void swap_setup(void); | |||
287 | 285 | ||
288 | extern void add_page_to_unevictable_list(struct page *page); | 286 | extern void add_page_to_unevictable_list(struct page *page); |
289 | 287 | ||
290 | /** | ||
291 | * lru_cache_add: add a page to the page lists | ||
292 | * @page: the page to add | ||
293 | */ | ||
294 | static inline void lru_cache_add_anon(struct page *page) | ||
295 | { | ||
296 | ClearPageActive(page); | ||
297 | __lru_cache_add(page); | ||
298 | } | ||
299 | |||
300 | static inline void lru_cache_add_file(struct page *page) | ||
301 | { | ||
302 | ClearPageActive(page); | ||
303 | __lru_cache_add(page); | ||
304 | } | ||
305 | |||
306 | /* linux/mm/vmscan.c */ | 288 | /* linux/mm/vmscan.c */ |
307 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 289 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
308 | gfp_t gfp_mask, nodemask_t *mask); | 290 | gfp_t gfp_mask, nodemask_t *mask); |
@@ -460,7 +442,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
460 | #define free_page_and_swap_cache(page) \ | 442 | #define free_page_and_swap_cache(page) \ |
461 | page_cache_release(page) | 443 | page_cache_release(page) |
462 | #define free_pages_and_swap_cache(pages, nr) \ | 444 | #define free_pages_and_swap_cache(pages, nr) \ |
463 | release_pages((pages), (nr), 0); | 445 | release_pages((pages), (nr), false); |
464 | 446 | ||
465 | static inline void show_swap_cache_info(void) | 447 | static inline void show_swap_cache_info(void) |
466 | { | 448 | { |
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e282624e8c10..388293a91e8c 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h | |||
@@ -6,7 +6,7 @@ | |||
6 | * want to expose them to the dozens of source files that include swap.h | 6 | * want to expose them to the dozens of source files that include swap.h |
7 | */ | 7 | */ |
8 | extern spinlock_t swap_lock; | 8 | extern spinlock_t swap_lock; |
9 | extern struct swap_list_t swap_list; | 9 | extern struct plist_head swap_active_head; |
10 | extern struct swap_info_struct *swap_info[]; | 10 | extern struct swap_info_struct *swap_info[]; |
11 | extern int try_to_unuse(unsigned int, bool, unsigned long); | 11 | extern int try_to_unuse(unsigned int, bool, unsigned long); |
12 | 12 | ||
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index c557c6d096de..3a712e2e7d76 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -71,12 +71,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
71 | THP_ZERO_PAGE_ALLOC, | 71 | THP_ZERO_PAGE_ALLOC, |
72 | THP_ZERO_PAGE_ALLOC_FAILED, | 72 | THP_ZERO_PAGE_ALLOC_FAILED, |
73 | #endif | 73 | #endif |
74 | #ifdef CONFIG_DEBUG_TLBFLUSH | ||
74 | #ifdef CONFIG_SMP | 75 | #ifdef CONFIG_SMP |
75 | NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ | 76 | NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ |
76 | NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ | 77 | NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ |
77 | #endif | 78 | #endif /* CONFIG_SMP */ |
78 | NR_TLB_LOCAL_FLUSH_ALL, | 79 | NR_TLB_LOCAL_FLUSH_ALL, |
79 | NR_TLB_LOCAL_FLUSH_ONE, | 80 | NR_TLB_LOCAL_FLUSH_ONE, |
81 | #endif /* CONFIG_DEBUG_TLBFLUSH */ | ||
80 | NR_VM_EVENT_ITEMS | 82 | NR_VM_EVENT_ITEMS |
81 | }; | 83 | }; |
82 | 84 | ||
diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h new file mode 100644 index 000000000000..c3fa0fd43949 --- /dev/null +++ b/include/linux/vmacache.h | |||
@@ -0,0 +1,38 @@ | |||
1 | #ifndef __LINUX_VMACACHE_H | ||
2 | #define __LINUX_VMACACHE_H | ||
3 | |||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mm.h> | ||
6 | |||
7 | /* | ||
8 | * Hash based on the page number. Provides a good hit rate for | ||
9 | * workloads with good locality and those with random accesses as well. | ||
10 | */ | ||
11 | #define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) | ||
12 | |||
13 | static inline void vmacache_flush(struct task_struct *tsk) | ||
14 | { | ||
15 | memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); | ||
16 | } | ||
17 | |||
18 | extern void vmacache_flush_all(struct mm_struct *mm); | ||
19 | extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); | ||
20 | extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, | ||
21 | unsigned long addr); | ||
22 | |||
23 | #ifndef CONFIG_MMU | ||
24 | extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | ||
25 | unsigned long start, | ||
26 | unsigned long end); | ||
27 | #endif | ||
28 | |||
29 | static inline void vmacache_invalidate(struct mm_struct *mm) | ||
30 | { | ||
31 | mm->vmacache_seqnum++; | ||
32 | |||
33 | /* deal with overflows */ | ||
34 | if (unlikely(mm->vmacache_seqnum == 0)) | ||
35 | vmacache_flush_all(mm); | ||
36 | } | ||
37 | |||
38 | #endif /* __LINUX_VMACACHE_H */ | ||
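The new vmacache.h replaces the single mm->mmap_cache pointer with a small per-thread cache, hashed by page number and validated against a per-mm sequence number that vmacache_invalidate() bumps on every change to the VMA set (with an explicit flush of all threads when the u32 wraps); the matching fields land in task_struct and mm_struct elsewhere in this diff. A userspace model of that lookup/invalidate protocol, with simplified stand-in types:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VMACACHE_BITS 2
#define VMACACHE_SIZE (1U << VMACACHE_BITS)
#define VMACACHE_MASK (VMACACHE_SIZE - 1)
#define PAGE_SHIFT 12
#define VMACACHE_HASH(addr) (((addr) >> PAGE_SHIFT) & VMACACHE_MASK)

struct vma { unsigned long start, end; };
struct mm { uint32_t seqnum; };
struct task {
        uint32_t seqnum;                /* snapshot of mm->seqnum */
        struct vma *cache[VMACACHE_SIZE];
};

static struct vma *cache_find(struct task *tsk, struct mm *mm, unsigned long addr)
{
        if (tsk->seqnum != mm->seqnum) {        /* mm changed since we cached */
                memset(tsk->cache, 0, sizeof(tsk->cache));
                tsk->seqnum = mm->seqnum;
                return NULL;
        }
        struct vma *v = tsk->cache[VMACACHE_HASH(addr)];
        return (v && v->start <= addr && addr < v->end) ? v : NULL;
}

static void cache_update(struct task *tsk, struct mm *mm,
                         unsigned long addr, struct vma *vma)
{
        tsk->seqnum = mm->seqnum;
        tsk->cache[VMACACHE_HASH(addr)] = vma;
}

int main(void)
{
        struct mm mm = { 0 };
        struct task tsk = { 0 };
        struct vma v = { 0x1000, 0x5000 };

        cache_update(&tsk, &mm, 0x1234, &v);
        printf("hit:  %p\n", (void *)cache_find(&tsk, &mm, 0x1234));
        mm.seqnum++;                            /* vmacache_invalidate() */
        printf("miss: %p\n", (void *)cache_find(&tsk, &mm, 0x1234));
        return 0;
}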
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a67b38415768..67ce70c8279b 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -83,6 +83,14 @@ static inline void vm_events_fold_cpu(int cpu) | |||
83 | #define count_vm_numa_events(x, y) do { (void)(y); } while (0) | 83 | #define count_vm_numa_events(x, y) do { (void)(y); } while (0) |
84 | #endif /* CONFIG_NUMA_BALANCING */ | 84 | #endif /* CONFIG_NUMA_BALANCING */ |
85 | 85 | ||
86 | #ifdef CONFIG_DEBUG_TLBFLUSH | ||
87 | #define count_vm_tlb_event(x) count_vm_event(x) | ||
88 | #define count_vm_tlb_events(x, y) count_vm_events(x, y) | ||
89 | #else | ||
90 | #define count_vm_tlb_event(x) do {} while (0) | ||
91 | #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) | ||
92 | #endif | ||
93 | |||
86 | #define __count_zone_vm_events(item, zone, delta) \ | 94 | #define __count_zone_vm_events(item, zone, delta) \ |
87 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ | 95 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ |
88 | zone_idx(zone), delta) | 96 | zone_idx(zone), delta) |
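The count_vm_tlb_event() macros above compile to nothing without CONFIG_DEBUG_TLBFLUSH, yet the two-argument form still evaluates (void)(y) so a variable used only as a counter delta does not trigger an unused-variable warning, and the do { } while (0) shell keeps the macro a single statement inside if/else. A sketch with made-up names (FOO_EVENT, count_foo_events, DEBUG_COUNTERS):

#include <stdio.h>

enum { FOO_EVENT };

#ifdef DEBUG_COUNTERS
static long foo_events;
#define count_foo_events(x, y) do { foo_events += (y); } while (0)
#else
/* (void)(y) marks y as used; do { } while (0) keeps it one statement */
#define count_foo_events(x, y) do { (void)(y); } while (0)
#endif

int main(void)
{
        int delta = 3;          /* no "unused variable" warning either way */

        if (delta)
                count_foo_events(FOO_EVENT, delta);     /* statement-safe in if/else */
        else
                puts("nothing to count");
        return 0;
}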
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index fde1b3e94c7d..c6814b917bdf 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #define _TRACE_COMPACTION_H | 5 | #define _TRACE_COMPACTION_H |
6 | 6 | ||
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | #include <linux/list.h> | ||
8 | #include <linux/tracepoint.h> | 9 | #include <linux/tracepoint.h> |
9 | #include <trace/events/gfpflags.h> | 10 | #include <trace/events/gfpflags.h> |
10 | 11 | ||
@@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, | |||
47 | 48 | ||
48 | TRACE_EVENT(mm_compaction_migratepages, | 49 | TRACE_EVENT(mm_compaction_migratepages, |
49 | 50 | ||
50 | TP_PROTO(unsigned long nr_migrated, | 51 | TP_PROTO(unsigned long nr_all, |
51 | unsigned long nr_failed), | 52 | int migrate_rc, |
53 | struct list_head *migratepages), | ||
52 | 54 | ||
53 | TP_ARGS(nr_migrated, nr_failed), | 55 | TP_ARGS(nr_all, migrate_rc, migratepages), |
54 | 56 | ||
55 | TP_STRUCT__entry( | 57 | TP_STRUCT__entry( |
56 | __field(unsigned long, nr_migrated) | 58 | __field(unsigned long, nr_migrated) |
@@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages, | |||
58 | ), | 60 | ), |
59 | 61 | ||
60 | TP_fast_assign( | 62 | TP_fast_assign( |
61 | __entry->nr_migrated = nr_migrated; | 63 | unsigned long nr_failed = 0; |
64 | struct list_head *page_lru; | ||
65 | |||
66 | /* | ||
67 | * migrate_pages() returns either a non-negative number | ||
68 | * with the number of pages that failed migration, or an | ||
69 | * error code, in which case we need to count the remaining | ||
70 | * pages manually | ||
71 | */ | ||
72 | if (migrate_rc >= 0) | ||
73 | nr_failed = migrate_rc; | ||
74 | else | ||
75 | list_for_each(page_lru, migratepages) | ||
76 | nr_failed++; | ||
77 | |||
78 | __entry->nr_migrated = nr_all - nr_failed; | ||
62 | __entry->nr_failed = nr_failed; | 79 | __entry->nr_failed = nr_failed; |
63 | ), | 80 | ), |
64 | 81 | ||
@@ -67,6 +84,48 @@ TRACE_EVENT(mm_compaction_migratepages, | |||
67 | __entry->nr_failed) | 84 | __entry->nr_failed) |
68 | ); | 85 | ); |
69 | 86 | ||
87 | TRACE_EVENT(mm_compaction_begin, | ||
88 | TP_PROTO(unsigned long zone_start, unsigned long migrate_start, | ||
89 | unsigned long free_start, unsigned long zone_end), | ||
90 | |||
91 | TP_ARGS(zone_start, migrate_start, free_start, zone_end), | ||
92 | |||
93 | TP_STRUCT__entry( | ||
94 | __field(unsigned long, zone_start) | ||
95 | __field(unsigned long, migrate_start) | ||
96 | __field(unsigned long, free_start) | ||
97 | __field(unsigned long, zone_end) | ||
98 | ), | ||
99 | |||
100 | TP_fast_assign( | ||
101 | __entry->zone_start = zone_start; | ||
102 | __entry->migrate_start = migrate_start; | ||
103 | __entry->free_start = free_start; | ||
104 | __entry->zone_end = zone_end; | ||
105 | ), | ||
106 | |||
107 | TP_printk("zone_start=%lu migrate_start=%lu free_start=%lu zone_end=%lu", | ||
108 | __entry->zone_start, | ||
109 | __entry->migrate_start, | ||
110 | __entry->free_start, | ||
111 | __entry->zone_end) | ||
112 | ); | ||
113 | |||
114 | TRACE_EVENT(mm_compaction_end, | ||
115 | TP_PROTO(int status), | ||
116 | |||
117 | TP_ARGS(status), | ||
118 | |||
119 | TP_STRUCT__entry( | ||
120 | __field(int, status) | ||
121 | ), | ||
122 | |||
123 | TP_fast_assign( | ||
124 | __entry->status = status; | ||
125 | ), | ||
126 | |||
127 | TP_printk("status=%d", __entry->status) | ||
128 | ); | ||
70 | 129 | ||
71 | #endif /* _TRACE_COMPACTION_H */ | 130 | #endif /* _TRACE_COMPACTION_H */ |
72 | 131 | ||
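The mm_compaction_migratepages tracepoint above now derives nr_failed at assignment time: a non-negative migrate_pages() return already is the failure count, while on error the pages still sitting on the migratepages list are walked and counted. A sketch of that accounting, with a minimal circular list standing in for list_head:

#include <stdio.h>

struct list_head { struct list_head *next; };

static unsigned long list_len(struct list_head *head)
{
        unsigned long n = 0;

        for (struct list_head *p = head->next; p != head; p = p->next)
                n++;
        return n;
}

static unsigned long nr_failed_of(long migrate_rc, struct list_head *migratepages)
{
        /* >= 0: migrate_pages() already reports how many pages failed */
        if (migrate_rc >= 0)
                return (unsigned long)migrate_rc;
        /* < 0: error; everything still on the list failed to migrate */
        return list_len(migratepages);
}

int main(void)
{
        struct list_head head, a, b;

        head.next = &a; a.next = &b; b.next = &head;    /* two leftover pages */

        printf("rc=1:  failed=%lu\n", nr_failed_of(1, &head));  /* 1 */
        printf("rc=-1: failed=%lu\n", nr_failed_of(-1, &head)); /* 2 */
        return 0;
}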
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index d0c613476620..aece1346ceb7 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h | |||
@@ -267,14 +267,12 @@ DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain, | |||
267 | TRACE_EVENT(mm_page_alloc_extfrag, | 267 | TRACE_EVENT(mm_page_alloc_extfrag, |
268 | 268 | ||
269 | TP_PROTO(struct page *page, | 269 | TP_PROTO(struct page *page, |
270 | int alloc_order, int fallback_order, | 270 | int alloc_order, int fallback_order, |
271 | int alloc_migratetype, int fallback_migratetype, | 271 | int alloc_migratetype, int fallback_migratetype, int new_migratetype), |
272 | int change_ownership), | ||
273 | 272 | ||
274 | TP_ARGS(page, | 273 | TP_ARGS(page, |
275 | alloc_order, fallback_order, | 274 | alloc_order, fallback_order, |
276 | alloc_migratetype, fallback_migratetype, | 275 | alloc_migratetype, fallback_migratetype, new_migratetype), |
277 | change_ownership), | ||
278 | 276 | ||
279 | TP_STRUCT__entry( | 277 | TP_STRUCT__entry( |
280 | __field( struct page *, page ) | 278 | __field( struct page *, page ) |
@@ -291,7 +289,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, | |||
291 | __entry->fallback_order = fallback_order; | 289 | __entry->fallback_order = fallback_order; |
292 | __entry->alloc_migratetype = alloc_migratetype; | 290 | __entry->alloc_migratetype = alloc_migratetype; |
293 | __entry->fallback_migratetype = fallback_migratetype; | 291 | __entry->fallback_migratetype = fallback_migratetype; |
294 | __entry->change_ownership = change_ownership; | 292 | __entry->change_ownership = (new_migratetype == alloc_migratetype); |
295 | ), | 293 | ), |
296 | 294 | ||
297 | TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", | 295 | TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", |
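
The extfrag event keeps its fields and print format; only the change_ownership derivation moves from every caller into the single TP_fast_assign. Reduced to a hypothetical helper, the new field is simply:

/* Did the fallback pageblock end up owned by the allocating type?
 * (Illustrative restatement of the assignment above.) */
static inline int extfrag_changed_ownership(int new_migratetype,
                                            int alloc_migratetype)
{
        return new_migratetype == alloc_migratetype;
}

Passing new_migratetype instead of a precomputed flag means the comparison is only evaluated when the event actually fires.
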
diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h index 1c9fabde69e4..ce0803b8d05f 100644 --- a/include/trace/events/pagemap.h +++ b/include/trace/events/pagemap.h | |||
@@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion, | |||
28 | 28 | ||
29 | TP_PROTO( | 29 | TP_PROTO( |
30 | struct page *page, | 30 | struct page *page, |
31 | unsigned long pfn, | 31 | int lru |
32 | int lru, | ||
33 | unsigned long flags | ||
34 | ), | 32 | ), |
35 | 33 | ||
36 | TP_ARGS(page, pfn, lru, flags), | 34 | TP_ARGS(page, lru), |
37 | 35 | ||
38 | TP_STRUCT__entry( | 36 | TP_STRUCT__entry( |
39 | __field(struct page *, page ) | 37 | __field(struct page *, page ) |
@@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion, | |||
44 | 42 | ||
45 | TP_fast_assign( | 43 | TP_fast_assign( |
46 | __entry->page = page; | 44 | __entry->page = page; |
47 | __entry->pfn = pfn; | 45 | __entry->pfn = page_to_pfn(page); |
48 | __entry->lru = lru; | 46 | __entry->lru = lru; |
49 | __entry->flags = flags; | 47 | __entry->flags = trace_pagemap_flags(page); |
50 | ), | 48 | ), |
51 | 49 | ||
52 | /* Flag format is based on page-types.c formatting for pagemap */ | 50 | /* Flag format is based on page-types.c formatting for pagemap */ |
@@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion, | |||
64 | 62 | ||
65 | TRACE_EVENT(mm_lru_activate, | 63 | TRACE_EVENT(mm_lru_activate, |
66 | 64 | ||
67 | TP_PROTO(struct page *page, unsigned long pfn), | 65 | TP_PROTO(struct page *page), |
68 | 66 | ||
69 | TP_ARGS(page, pfn), | 67 | TP_ARGS(page), |
70 | 68 | ||
71 | TP_STRUCT__entry( | 69 | TP_STRUCT__entry( |
72 | __field(struct page *, page ) | 70 | __field(struct page *, page ) |
@@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate, | |||
75 | 73 | ||
76 | TP_fast_assign( | 74 | TP_fast_assign( |
77 | __entry->page = page; | 75 | __entry->page = page; |
78 | __entry->pfn = pfn; | 76 | __entry->pfn = page_to_pfn(page); |
79 | ), | 77 | ), |
80 | 78 | ||
81 | /* Flag format is based on page-types.c formatting for pagemap */ | 79 | /* Flag format is based on page-types.c formatting for pagemap */ |
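
Both LRU events now take just the page; pfn and flags are recovered inside TP_fast_assign, so callers on the hot LRU paths pay nothing for them while the events are disabled. The deferral pattern in miniature, as a runnable userspace model (names are invented; the kernel uses page_to_pfn(), trace_pagemap_flags() and the tracepoint static key):

#include <stdint.h>
#include <stdio.h>

static int tracing_enabled;     /* stands in for the tracepoint key */

static unsigned long model_page_to_pfn(const void *page)
{
        return (uintptr_t)page >> 12;   /* pretend 4K pages */
}

static void model_trace_lru_insertion(const void *page, int lru)
{
        if (!tracing_enabled)
                return;         /* disabled: no derivation work at all */
        printf("pfn=%lu lru=%d\n", model_page_to_pfn(page), lru);
}
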
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 0b29c52479a6..c8289138cad4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -61,12 +61,7 @@ | |||
61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
62 | #include <linux/wait.h> | 62 | #include <linux/wait.h> |
63 | 63 | ||
64 | /* | 64 | struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; |
65 | * Tracks how many cpusets are currently defined in system. | ||
66 | * When there is only one cpuset (the root cpuset) we can | ||
67 | * short circuit some hooks. | ||
68 | */ | ||
69 | int number_of_cpusets __read_mostly; | ||
70 | 65 | ||
71 | /* See "Frequency meter" comments, below. */ | 66 | /* See "Frequency meter" comments, below. */ |
72 | 67 | ||
@@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
611 | goto done; | 606 | goto done; |
612 | } | 607 | } |
613 | 608 | ||
614 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 609 | csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); |
615 | if (!csa) | 610 | if (!csa) |
616 | goto done; | 611 | goto done; |
617 | csn = 0; | 612 | csn = 0; |
@@ -1022,7 +1017,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1022 | task_lock(tsk); | 1017 | task_lock(tsk); |
1023 | /* | 1018 | /* |
1024 | * Determine if a loop is necessary if another thread is doing | 1019 | * Determine if a loop is necessary if another thread is doing |
1025 | * get_mems_allowed(). If at least one node remains unchanged and | 1020 | * read_mems_allowed_begin(). If at least one node remains unchanged and |
1026 | * tsk does not have a mempolicy, then an empty nodemask will not be | 1021 | * tsk does not have a mempolicy, then an empty nodemask will not be |
1027 | * possible when mems_allowed is larger than a word. | 1022 | * possible when mems_allowed is larger than a word. |
1028 | */ | 1023 | */ |
@@ -1986,7 +1981,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) | |||
1986 | if (is_spread_slab(parent)) | 1981 | if (is_spread_slab(parent)) |
1987 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1982 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
1988 | 1983 | ||
1989 | number_of_cpusets++; | 1984 | cpuset_inc(); |
1990 | 1985 | ||
1991 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | 1986 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
1992 | goto out_unlock; | 1987 | goto out_unlock; |
@@ -2037,7 +2032,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) | |||
2037 | if (is_sched_load_balance(cs)) | 2032 | if (is_sched_load_balance(cs)) |
2038 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | 2033 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); |
2039 | 2034 | ||
2040 | number_of_cpusets--; | 2035 | cpuset_dec(); |
2041 | clear_bit(CS_ONLINE, &cs->flags); | 2036 | clear_bit(CS_ONLINE, &cs->flags); |
2042 | 2037 | ||
2043 | mutex_unlock(&cpuset_mutex); | 2038 | mutex_unlock(&cpuset_mutex); |
@@ -2092,7 +2087,6 @@ int __init cpuset_init(void) | |||
2092 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) | 2087 | if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) |
2093 | BUG(); | 2088 | BUG(); |
2094 | 2089 | ||
2095 | number_of_cpusets = 1; | ||
2096 | return 0; | 2090 | return 0; |
2097 | } | 2091 | } |
2098 | 2092 | ||
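
A global counter read on allocation paths becomes a static key: the enabled test compiles to a patched no-op branch while only the root cpuset exists, and creating or destroying a cpuset flips it. The helper side lives in include/linux/cpuset.h, which is not among the hunks above; a sketch of that half, following the upstream pattern:

#include <linux/jump_label.h>

extern struct static_key cpusets_enabled_key;

static inline bool cpusets_enabled(void)
{
        /* patched-out branch while no non-root cpuset exists */
        return static_key_false(&cpusets_enabled_key);
}

static inline void cpuset_inc(void)     /* from cpuset_css_online() */
{
        static_key_slow_inc(&cpusets_enabled_key);
}

static inline void cpuset_dec(void)     /* from cpuset_css_offline() */
{
        static_key_slow_dec(&cpusets_enabled_key);
}

nr_cpusets(), which generate_sched_domains() above now uses for its allocation size, presumably reports the key's reference count plus one for the root cpuset.
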
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0506d447aed2..e911ec662d03 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/pid.h> | 49 | #include <linux/pid.h> |
50 | #include <linux/smp.h> | 50 | #include <linux/smp.h> |
51 | #include <linux/mm.h> | 51 | #include <linux/mm.h> |
52 | #include <linux/vmacache.h> | ||
52 | #include <linux/rcupdate.h> | 53 | #include <linux/rcupdate.h> |
53 | 54 | ||
54 | #include <asm/cacheflush.h> | 55 | #include <asm/cacheflush.h> |
@@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) | |||
224 | if (!CACHE_FLUSH_IS_SAFE) | 225 | if (!CACHE_FLUSH_IS_SAFE) |
225 | return; | 226 | return; |
226 | 227 | ||
227 | if (current->mm && current->mm->mmap_cache) { | 228 | if (current->mm) { |
228 | flush_cache_range(current->mm->mmap_cache, | 229 | int i; |
229 | addr, addr + BREAK_INSTR_SIZE); | 230 | |
231 | for (i = 0; i < VMACACHE_SIZE; i++) { | ||
232 | if (!current->vmacache[i]) | ||
233 | continue; | ||
234 | flush_cache_range(current->vmacache[i], | ||
235 | addr, addr + BREAK_INSTR_SIZE); | ||
236 | } | ||
230 | } | 237 | } |
238 | |||
231 | /* Force flush instruction cache if it was outside the mm */ | 239 | /* Force flush instruction cache if it was outside the mm */ |
232 | flush_icache_range(addr, addr + BREAK_INSTR_SIZE); | 240 | flush_icache_range(addr, addr + BREAK_INSTR_SIZE); |
233 | } | 241 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index 143962949bed..29a1b0283d3b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -28,6 +28,8 @@ | |||
28 | #include <linux/mman.h> | 28 | #include <linux/mman.h> |
29 | #include <linux/mmu_notifier.h> | 29 | #include <linux/mmu_notifier.h> |
30 | #include <linux/fs.h> | 30 | #include <linux/fs.h> |
31 | #include <linux/mm.h> | ||
32 | #include <linux/vmacache.h> | ||
31 | #include <linux/nsproxy.h> | 33 | #include <linux/nsproxy.h> |
32 | #include <linux/capability.h> | 34 | #include <linux/capability.h> |
33 | #include <linux/cpu.h> | 35 | #include <linux/cpu.h> |
@@ -363,7 +365,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
363 | 365 | ||
364 | mm->locked_vm = 0; | 366 | mm->locked_vm = 0; |
365 | mm->mmap = NULL; | 367 | mm->mmap = NULL; |
366 | mm->mmap_cache = NULL; | 368 | mm->vmacache_seqnum = 0; |
367 | mm->map_count = 0; | 369 | mm->map_count = 0; |
368 | cpumask_clear(mm_cpumask(mm)); | 370 | cpumask_clear(mm_cpumask(mm)); |
369 | mm->mm_rb = RB_ROOT; | 371 | mm->mm_rb = RB_ROOT; |
@@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) | |||
882 | if (!oldmm) | 884 | if (!oldmm) |
883 | return 0; | 885 | return 0; |
884 | 886 | ||
887 | /* initialize the new vmacache entries */ | ||
888 | vmacache_flush(tsk); | ||
889 | |||
885 | if (clone_flags & CLONE_VM) { | 890 | if (clone_flags & CLONE_VM) { |
886 | atomic_inc(&oldmm->mm_users); | 891 | atomic_inc(&oldmm->mm_users); |
887 | mm = oldmm; | 892 | mm = oldmm; |
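
This hunk and the kgdb one above are both fallout from replacing mm->mmap_cache, a single cached VMA pointer, with a small per-task array: kgdb must flush the cache range for every cached VMA, and a newly forked task starts with the cache zeroed. What vmacache_flush() amounts to, sketched with an illustrative VMACACHE_SIZE (the real definitions live in include/linux/vmacache.h, added elsewhere in this series):

#include <string.h>

#define VMACACHE_SIZE 4         /* illustrative; see linux/vmacache.h */

struct task_model {
        struct vm_area_struct *vmacache[VMACACHE_SIZE];
};

static inline void model_vmacache_flush(struct task_model *tsk)
{
        memset(tsk->vmacache, 0, sizeof(tsk->vmacache));
}

Note dup_mmap() also resets mm->vmacache_seqnum, the per-mm generation number that lets stale per-task entries be detected lazily rather than flushed eagerly.
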
diff --git a/lib/plist.c b/lib/plist.c index 1ebc95f7a46f..0f2084d30798 100644 --- a/lib/plist.c +++ b/lib/plist.c | |||
@@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head) | |||
134 | plist_check_head(head); | 134 | plist_check_head(head); |
135 | } | 135 | } |
136 | 136 | ||
137 | /** | ||
138 | * plist_requeue - Requeue @node at end of same-prio entries. | ||
139 | * | ||
140 | * This is essentially an optimized plist_del() followed by | ||
141 | * plist_add(). It moves an entry already in the plist to | ||
142 | * after any other same-priority entries. | ||
143 | * | ||
144 | * @node: &struct plist_node pointer - entry to be moved | ||
145 | * @head: &struct plist_head pointer - list head | ||
146 | */ | ||
147 | void plist_requeue(struct plist_node *node, struct plist_head *head) | ||
148 | { | ||
149 | struct plist_node *iter; | ||
150 | struct list_head *node_next = &head->node_list; | ||
151 | |||
152 | plist_check_head(head); | ||
153 | BUG_ON(plist_head_empty(head)); | ||
154 | BUG_ON(plist_node_empty(node)); | ||
155 | |||
156 | if (node == plist_last(head)) | ||
157 | return; | ||
158 | |||
159 | iter = plist_next(node); | ||
160 | |||
161 | if (node->prio != iter->prio) | ||
162 | return; | ||
163 | |||
164 | plist_del(node, head); | ||
165 | |||
166 | plist_for_each_continue(iter, head) { | ||
167 | if (node->prio != iter->prio) { | ||
168 | node_next = &iter->node_list; | ||
169 | break; | ||
170 | } | ||
171 | } | ||
172 | list_add_tail(&node->node_list, node_next); | ||
173 | |||
174 | plist_check_head(head); | ||
175 | } | ||
176 | |||
137 | #ifdef CONFIG_DEBUG_PI_LIST | 177 | #ifdef CONFIG_DEBUG_PI_LIST |
138 | #include <linux/sched.h> | 178 | #include <linux/sched.h> |
139 | #include <linux/module.h> | 179 | #include <linux/module.h> |
@@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect) | |||
170 | BUG_ON(prio_pos->prio_list.next != &first->prio_list); | 210 | BUG_ON(prio_pos->prio_list.next != &first->prio_list); |
171 | } | 211 | } |
172 | 212 | ||
213 | static void __init plist_test_requeue(struct plist_node *node) | ||
214 | { | ||
215 | plist_requeue(node, &test_head); | ||
216 | |||
217 | if (node != plist_last(&test_head)) | ||
218 | BUG_ON(node->prio == plist_next(node)->prio); | ||
219 | } | ||
220 | |||
173 | static int __init plist_test(void) | 221 | static int __init plist_test(void) |
174 | { | 222 | { |
175 | int nr_expect = 0, i, loop; | 223 | int nr_expect = 0, i, loop; |
@@ -193,6 +241,10 @@ static int __init plist_test(void) | |||
193 | nr_expect--; | 241 | nr_expect--; |
194 | } | 242 | } |
195 | plist_test_check(nr_expect); | 243 | plist_test_check(nr_expect); |
244 | if (!plist_node_empty(test_node + i)) { | ||
245 | plist_test_requeue(test_node + i); | ||
246 | plist_test_check(nr_expect); | ||
247 | } | ||
196 | } | 248 | } |
197 | 249 | ||
198 | for (i = 0; i < ARRAY_SIZE(test_node); i++) { | 250 | for (i = 0; i < ARRAY_SIZE(test_node); i++) { |
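
plist_requeue() gives priority lists a round-robin primitive: the moved node ends up behind all of its same-priority peers, so repeatedly picking the head and requeueing it cycles fairly through one priority band (upstream this was added for round-robin selection among same-priority swap devices). A hedged usage sketch against the standard <linux/plist.h> API:

#include <linux/plist.h>

/* Pick the highest-priority node, then rotate it behind its
 * same-priority peers so the next pick services a different one. */
static struct plist_node *pick_and_rotate(struct plist_head *head)
{
        struct plist_node *node;

        if (plist_head_empty(head))
                return NULL;

        node = plist_first(head);
        plist_requeue(node, head);      /* no-op if it has no peers */
        return node;
}
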
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7811ed3b4e70..e8adb5d8a184 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
@@ -946,81 +946,6 @@ next: | |||
946 | } | 946 | } |
947 | EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); | 947 | EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); |
948 | 948 | ||
949 | |||
950 | /** | ||
951 | * radix_tree_next_hole - find the next hole (not-present entry) | ||
952 | * @root: tree root | ||
953 | * @index: index key | ||
954 | * @max_scan: maximum range to search | ||
955 | * | ||
956 | * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the lowest | ||
957 | * indexed hole. | ||
958 | * | ||
959 | * Returns: the index of the hole if found, otherwise returns an index | ||
960 | * outside of the set specified (in which case 'return - index >= max_scan' | ||
961 | * will be true). In rare cases of index wrap-around, 0 will be returned. | ||
962 | * | ||
963 | * radix_tree_next_hole may be called under rcu_read_lock. However, like | ||
964 | * radix_tree_gang_lookup, this will not atomically search a snapshot of | ||
965 | * the tree at a single point in time. For example, if a hole is created | ||
966 | * at index 5, then subsequently a hole is created at index 10, | ||
967 | * radix_tree_next_hole covering both indexes may return 10 if called | ||
968 | * under rcu_read_lock. | ||
969 | */ | ||
970 | unsigned long radix_tree_next_hole(struct radix_tree_root *root, | ||
971 | unsigned long index, unsigned long max_scan) | ||
972 | { | ||
973 | unsigned long i; | ||
974 | |||
975 | for (i = 0; i < max_scan; i++) { | ||
976 | if (!radix_tree_lookup(root, index)) | ||
977 | break; | ||
978 | index++; | ||
979 | if (index == 0) | ||
980 | break; | ||
981 | } | ||
982 | |||
983 | return index; | ||
984 | } | ||
985 | EXPORT_SYMBOL(radix_tree_next_hole); | ||
986 | |||
987 | /** | ||
988 | * radix_tree_prev_hole - find the prev hole (not-present entry) | ||
989 | * @root: tree root | ||
990 | * @index: index key | ||
991 | * @max_scan: maximum range to search | ||
992 | * | ||
993 | * Search backwards in the range [max(index-max_scan+1, 0), index] | ||
994 | * for the first hole. | ||
995 | * | ||
996 | * Returns: the index of the hole if found, otherwise returns an index | ||
997 | * outside of the set specified (in which case 'index - return >= max_scan' | ||
998 | * will be true). In rare cases of wrap-around, ULONG_MAX will be returned. | ||
999 | * | ||
1000 | * radix_tree_next_hole may be called under rcu_read_lock. However, like | ||
1001 | * radix_tree_gang_lookup, this will not atomically search a snapshot of | ||
1002 | * the tree at a single point in time. For example, if a hole is created | ||
1003 | * at index 10, then subsequently a hole is created at index 5, | ||
1004 | * radix_tree_prev_hole covering both indexes may return 5 if called under | ||
1005 | * rcu_read_lock. | ||
1006 | */ | ||
1007 | unsigned long radix_tree_prev_hole(struct radix_tree_root *root, | ||
1008 | unsigned long index, unsigned long max_scan) | ||
1009 | { | ||
1010 | unsigned long i; | ||
1011 | |||
1012 | for (i = 0; i < max_scan; i++) { | ||
1013 | if (!radix_tree_lookup(root, index)) | ||
1014 | break; | ||
1015 | index--; | ||
1016 | if (index == ULONG_MAX) | ||
1017 | break; | ||
1018 | } | ||
1019 | |||
1020 | return index; | ||
1021 | } | ||
1022 | EXPORT_SYMBOL(radix_tree_prev_hole); | ||
1023 | |||
1024 | /** | 949 | /** |
1025 | * radix_tree_gang_lookup - perform multiple lookup on a radix tree | 950 | * radix_tree_gang_lookup - perform multiple lookup on a radix tree |
1026 | * @root: radix tree root | 951 | * @root: radix tree root |
@@ -1335,15 +1260,18 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) | |||
1335 | } | 1260 | } |
1336 | 1261 | ||
1337 | /** | 1262 | /** |
1338 | * radix_tree_delete - delete an item from a radix tree | 1263 | * radix_tree_delete_item - delete an item from a radix tree |
1339 | * @root: radix tree root | 1264 | * @root: radix tree root |
1340 | * @index: index key | 1265 | * @index: index key |
1266 | * @item: expected item | ||
1341 | * | 1267 | * |
1342 | * Remove the item at @index from the radix tree rooted at @root. | 1268 | * Remove @item at @index from the radix tree rooted at @root. |
1343 | * | 1269 | * |
1344 | * Returns the address of the deleted item, or NULL if it was not present. | 1270 | * Returns the address of the deleted item, or NULL if it was not present |
1271 | * or the entry at the given @index was not @item. | ||
1345 | */ | 1272 | */ |
1346 | void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | 1273 | void *radix_tree_delete_item(struct radix_tree_root *root, |
1274 | unsigned long index, void *item) | ||
1347 | { | 1275 | { |
1348 | struct radix_tree_node *node = NULL; | 1276 | struct radix_tree_node *node = NULL; |
1349 | struct radix_tree_node *slot = NULL; | 1277 | struct radix_tree_node *slot = NULL; |
@@ -1378,6 +1306,11 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
1378 | if (slot == NULL) | 1306 | if (slot == NULL) |
1379 | goto out; | 1307 | goto out; |
1380 | 1308 | ||
1309 | if (item && slot != item) { | ||
1310 | slot = NULL; | ||
1311 | goto out; | ||
1312 | } | ||
1313 | |||
1381 | /* | 1314 | /* |
1382 | * Clear all tags associated with the item to be deleted. | 1315 | * Clear all tags associated with the item to be deleted. |
1383 | * This way of doing it would be inefficient, but seldom is any set. | 1316 | * This way of doing it would be inefficient, but seldom is any set. |
@@ -1422,6 +1355,21 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | |||
1422 | out: | 1355 | out: |
1423 | return slot; | 1356 | return slot; |
1424 | } | 1357 | } |
1358 | EXPORT_SYMBOL(radix_tree_delete_item); | ||
1359 | |||
1360 | /** | ||
1361 | * radix_tree_delete - delete an item from a radix tree | ||
1362 | * @root: radix tree root | ||
1363 | * @index: index key | ||
1364 | * | ||
1365 | * Remove the item at @index from the radix tree rooted at @root. | ||
1366 | * | ||
1367 | * Returns the address of the deleted item, or NULL if it was not present. | ||
1368 | */ | ||
1369 | void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) | ||
1370 | { | ||
1371 | return radix_tree_delete_item(root, index, NULL); | ||
1372 | } | ||
1425 | EXPORT_SYMBOL(radix_tree_delete); | 1373 | EXPORT_SYMBOL(radix_tree_delete); |
1426 | 1374 | ||
1427 | /** | 1375 | /** |
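
radix_tree_delete_item() turns plain deletion into a compare-and-delete: the entry at @index is removed only if it still equals the expected item, and radix_tree_delete() becomes the unconditional special case. A usage sketch (the caller must hold the tree lock exactly as for radix_tree_delete()):

#include <linux/radix-tree.h>

/* Remove @item at @index only if it is still there; true on success. */
static bool try_remove(struct radix_tree_root *tree,
                       unsigned long index, void *item)
{
        return radix_tree_delete_item(tree, index, item) == item;
}

This lets a caller that looked an entry up earlier delete it without racing against a concurrent replacement of the slot.
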
diff --git a/mm/Makefile b/mm/Makefile index 305d10acd081..fb51bc61d80a 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | |||
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o balloon_compaction.o \ | 19 | compaction.o balloon_compaction.o vmacache.o \ |
20 | interval_tree.o list_lru.o $(mmu-y) | 20 | interval_tree.o list_lru.o $(mmu-y) |
21 | 21 | ||
22 | obj-y += init-mm.o | 22 | obj-y += init-mm.o |
diff --git a/mm/compaction.c b/mm/compaction.c index 6441083e76d3..adb6d0560e96 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) | |||
89 | unsigned long end_pfn = zone_end_pfn(zone); | 89 | unsigned long end_pfn = zone_end_pfn(zone); |
90 | unsigned long pfn; | 90 | unsigned long pfn; |
91 | 91 | ||
92 | zone->compact_cached_migrate_pfn = start_pfn; | 92 | zone->compact_cached_migrate_pfn[0] = start_pfn; |
93 | zone->compact_cached_migrate_pfn[1] = start_pfn; | ||
93 | zone->compact_cached_free_pfn = end_pfn; | 94 | zone->compact_cached_free_pfn = end_pfn; |
94 | zone->compact_blockskip_flush = false; | 95 | zone->compact_blockskip_flush = false; |
95 | 96 | ||
@@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) | |||
131 | */ | 132 | */ |
132 | static void update_pageblock_skip(struct compact_control *cc, | 133 | static void update_pageblock_skip(struct compact_control *cc, |
133 | struct page *page, unsigned long nr_isolated, | 134 | struct page *page, unsigned long nr_isolated, |
134 | bool migrate_scanner) | 135 | bool set_unsuitable, bool migrate_scanner) |
135 | { | 136 | { |
136 | struct zone *zone = cc->zone; | 137 | struct zone *zone = cc->zone; |
138 | unsigned long pfn; | ||
137 | 139 | ||
138 | if (cc->ignore_skip_hint) | 140 | if (cc->ignore_skip_hint) |
139 | return; | 141 | return; |
@@ -141,20 +143,32 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
141 | if (!page) | 143 | if (!page) |
142 | return; | 144 | return; |
143 | 145 | ||
144 | if (!nr_isolated) { | 146 | if (nr_isolated) |
145 | unsigned long pfn = page_to_pfn(page); | 147 | return; |
148 | |||
149 | /* | ||
150 | * Only skip pageblocks when all forms of compaction will be known to | ||
151 | * fail in the near future. | ||
152 | */ | ||
153 | if (set_unsuitable) | ||
146 | set_pageblock_skip(page); | 154 | set_pageblock_skip(page); |
147 | 155 | ||
148 | /* Update where compaction should restart */ | 156 | pfn = page_to_pfn(page); |
149 | if (migrate_scanner) { | 157 | |
150 | if (!cc->finished_update_migrate && | 158 | /* Update where async and sync compaction should restart */ |
151 | pfn > zone->compact_cached_migrate_pfn) | 159 | if (migrate_scanner) { |
152 | zone->compact_cached_migrate_pfn = pfn; | 160 | if (cc->finished_update_migrate) |
153 | } else { | 161 | return; |
154 | if (!cc->finished_update_free && | 162 | if (pfn > zone->compact_cached_migrate_pfn[0]) |
155 | pfn < zone->compact_cached_free_pfn) | 163 | zone->compact_cached_migrate_pfn[0] = pfn; |
156 | zone->compact_cached_free_pfn = pfn; | 164 | if (cc->mode != MIGRATE_ASYNC && |
157 | } | 165 | pfn > zone->compact_cached_migrate_pfn[1]) |
166 | zone->compact_cached_migrate_pfn[1] = pfn; | ||
167 | } else { | ||
168 | if (cc->finished_update_free) | ||
169 | return; | ||
170 | if (pfn < zone->compact_cached_free_pfn) | ||
171 | zone->compact_cached_free_pfn = pfn; | ||
158 | } | 172 | } |
159 | } | 173 | } |
160 | #else | 174 | #else |
@@ -166,7 +180,7 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
166 | 180 | ||
167 | static void update_pageblock_skip(struct compact_control *cc, | 181 | static void update_pageblock_skip(struct compact_control *cc, |
168 | struct page *page, unsigned long nr_isolated, | 182 | struct page *page, unsigned long nr_isolated, |
169 | bool migrate_scanner) | 183 | bool set_unsuitable, bool migrate_scanner) |
170 | { | 184 | { |
171 | } | 185 | } |
172 | #endif /* CONFIG_COMPACTION */ | 186 | #endif /* CONFIG_COMPACTION */ |
@@ -195,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
195 | } | 209 | } |
196 | 210 | ||
197 | /* async aborts if taking too long or contended */ | 211 | /* async aborts if taking too long or contended */ |
198 | if (!cc->sync) { | 212 | if (cc->mode == MIGRATE_ASYNC) { |
199 | cc->contended = true; | 213 | cc->contended = true; |
200 | return false; | 214 | return false; |
201 | } | 215 | } |
@@ -208,30 +222,39 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
208 | return true; | 222 | return true; |
209 | } | 223 | } |
210 | 224 | ||
211 | static inline bool compact_trylock_irqsave(spinlock_t *lock, | 225 | /* |
212 | unsigned long *flags, struct compact_control *cc) | 226 | * Aside from avoiding lock contention, compaction also periodically checks |
227 | * need_resched() and either schedules in sync compaction or aborts async | ||
228 | * compaction. This is similar to what compact_checklock_irqsave() does, but | ||
229 | * is used where no lock is concerned. | ||
230 | * | ||
231 | * Returns false when no scheduling was needed, or sync compaction scheduled. | ||
232 | * Returns true when async compaction should abort. | ||
233 | */ | ||
234 | static inline bool compact_should_abort(struct compact_control *cc) | ||
213 | { | 235 | { |
214 | return compact_checklock_irqsave(lock, flags, false, cc); | 236 | /* async compaction aborts if contended */ |
237 | if (need_resched()) { | ||
238 | if (cc->mode == MIGRATE_ASYNC) { | ||
239 | cc->contended = true; | ||
240 | return true; | ||
241 | } | ||
242 | |||
243 | cond_resched(); | ||
244 | } | ||
245 | |||
246 | return false; | ||
215 | } | 247 | } |
216 | 248 | ||
217 | /* Returns true if the page is within a block suitable for migration to */ | 249 | /* Returns true if the page is within a block suitable for migration to */ |
218 | static bool suitable_migration_target(struct page *page) | 250 | static bool suitable_migration_target(struct page *page) |
219 | { | 251 | { |
220 | int migratetype = get_pageblock_migratetype(page); | 252 | /* If the page is a large free page, then disallow migration */ |
221 | |||
222 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
223 | if (migratetype == MIGRATE_RESERVE) | ||
224 | return false; | ||
225 | |||
226 | if (is_migrate_isolate(migratetype)) | ||
227 | return false; | ||
228 | |||
229 | /* If the page is a large free page, then allow migration */ | ||
230 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | 253 | if (PageBuddy(page) && page_order(page) >= pageblock_order) |
231 | return true; | 254 | return false; |
232 | 255 | ||
233 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | 256 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ |
234 | if (migrate_async_suitable(migratetype)) | 257 | if (migrate_async_suitable(get_pageblock_migratetype(page))) |
235 | return true; | 258 | return true; |
236 | 259 | ||
237 | /* Otherwise skip the block */ | 260 | /* Otherwise skip the block */ |
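
compact_should_abort() centralises what the scattered cond_resched() calls used to do, keyed on the new cc->mode: sync compaction simply yields, while async compaction records contention and unwinds. A runnable userspace model of the decision (struct and names invented for illustration):

#include <stdbool.h>

enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

struct cc_model {               /* minimal stand-in for compact_control */
        enum migrate_mode mode;
        bool contended;
};

/* Async compaction treats a pending reschedule as contention and
 * aborts; sync compaction would cond_resched() and keep scanning. */
static bool model_should_abort(struct cc_model *cc, bool resched_pending)
{
        if (!resched_pending)
                return false;

        if (cc->mode == MIGRATE_ASYNC) {
                cc->contended = true;   /* later seen by compact_finished() */
                return true;
        }

        return false;   /* kernel: cond_resched() happened here */
}

Callers either return 0 from the scanners or break out of the free-page loop when it reports true, which is why compact_finished() (in a later hunk) now returns COMPACT_PARTIAL when cc->contended is set.
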
@@ -254,6 +277,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
254 | struct page *cursor, *valid_page = NULL; | 277 | struct page *cursor, *valid_page = NULL; |
255 | unsigned long flags; | 278 | unsigned long flags; |
256 | bool locked = false; | 279 | bool locked = false; |
280 | bool checked_pageblock = false; | ||
257 | 281 | ||
258 | cursor = pfn_to_page(blockpfn); | 282 | cursor = pfn_to_page(blockpfn); |
259 | 283 | ||
@@ -285,8 +309,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
285 | break; | 309 | break; |
286 | 310 | ||
287 | /* Recheck this is a suitable migration target under lock */ | 311 | /* Recheck this is a suitable migration target under lock */ |
288 | if (!strict && !suitable_migration_target(page)) | 312 | if (!strict && !checked_pageblock) { |
289 | break; | 313 | /* |
314 | * We need to check suitability of pageblock only once | ||
315 | * and this isolate_freepages_block() is called with | ||
316 | * pageblock range, so just check once is sufficient. | ||
317 | */ | ||
318 | checked_pageblock = true; | ||
319 | if (!suitable_migration_target(page)) | ||
320 | break; | ||
321 | } | ||
290 | 322 | ||
291 | /* Recheck this is a buddy page under lock */ | 323 | /* Recheck this is a buddy page under lock */ |
292 | if (!PageBuddy(page)) | 324 | if (!PageBuddy(page)) |
@@ -330,7 +362,8 @@ isolate_fail: | |||
330 | 362 | ||
331 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 363 | /* Update the pageblock-skip if the whole pageblock was scanned */ |
332 | if (blockpfn == end_pfn) | 364 | if (blockpfn == end_pfn) |
333 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 365 | update_pageblock_skip(cc, valid_page, total_isolated, true, |
366 | false); | ||
334 | 367 | ||
335 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); | 368 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); |
336 | if (total_isolated) | 369 | if (total_isolated) |
@@ -461,11 +494,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
461 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 494 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
462 | unsigned long nr_scanned = 0, nr_isolated = 0; | 495 | unsigned long nr_scanned = 0, nr_isolated = 0; |
463 | struct list_head *migratelist = &cc->migratepages; | 496 | struct list_head *migratelist = &cc->migratepages; |
464 | isolate_mode_t mode = 0; | ||
465 | struct lruvec *lruvec; | 497 | struct lruvec *lruvec; |
466 | unsigned long flags; | 498 | unsigned long flags; |
467 | bool locked = false; | 499 | bool locked = false; |
468 | struct page *page = NULL, *valid_page = NULL; | 500 | struct page *page = NULL, *valid_page = NULL; |
501 | bool set_unsuitable = true; | ||
502 | const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? | ||
503 | ISOLATE_ASYNC_MIGRATE : 0) | | ||
504 | (unevictable ? ISOLATE_UNEVICTABLE : 0); | ||
469 | 505 | ||
470 | /* | 506 | /* |
471 | * Ensure that there are not too many pages isolated from the LRU | 507 | * Ensure that there are not too many pages isolated from the LRU |
@@ -474,7 +510,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
474 | */ | 510 | */ |
475 | while (unlikely(too_many_isolated(zone))) { | 511 | while (unlikely(too_many_isolated(zone))) { |
476 | /* async migration should just abort */ | 512 | /* async migration should just abort */ |
477 | if (!cc->sync) | 513 | if (cc->mode == MIGRATE_ASYNC) |
478 | return 0; | 514 | return 0; |
479 | 515 | ||
480 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 516 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -483,11 +519,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
483 | return 0; | 519 | return 0; |
484 | } | 520 | } |
485 | 521 | ||
522 | if (compact_should_abort(cc)) | ||
523 | return 0; | ||
524 | |||
486 | /* Time to isolate some pages for migration */ | 525 | /* Time to isolate some pages for migration */ |
487 | cond_resched(); | ||
488 | for (; low_pfn < end_pfn; low_pfn++) { | 526 | for (; low_pfn < end_pfn; low_pfn++) { |
489 | /* give a chance to irqs before checking need_resched() */ | 527 | /* give a chance to irqs before checking need_resched() */ |
490 | if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { | 528 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { |
491 | if (should_release_lock(&zone->lru_lock)) { | 529 | if (should_release_lock(&zone->lru_lock)) { |
492 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 530 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
493 | locked = false; | 531 | locked = false; |
@@ -526,25 +564,31 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
526 | 564 | ||
527 | /* If isolation recently failed, do not retry */ | 565 | /* If isolation recently failed, do not retry */ |
528 | pageblock_nr = low_pfn >> pageblock_order; | 566 | pageblock_nr = low_pfn >> pageblock_order; |
529 | if (!isolation_suitable(cc, page)) | 567 | if (last_pageblock_nr != pageblock_nr) { |
530 | goto next_pageblock; | 568 | int mt; |
569 | |||
570 | last_pageblock_nr = pageblock_nr; | ||
571 | if (!isolation_suitable(cc, page)) | ||
572 | goto next_pageblock; | ||
573 | |||
574 | /* | ||
575 | * For async migration, also only scan in MOVABLE | ||
576 | * blocks. Async migration is optimistic to see if | ||
577 | * the minimum amount of work satisfies the allocation | ||
578 | */ | ||
579 | mt = get_pageblock_migratetype(page); | ||
580 | if (cc->mode == MIGRATE_ASYNC && | ||
581 | !migrate_async_suitable(mt)) { | ||
582 | set_unsuitable = false; | ||
583 | goto next_pageblock; | ||
584 | } | ||
585 | } | ||
531 | 586 | ||
532 | /* Skip if free */ | 587 | /* Skip if free */ |
533 | if (PageBuddy(page)) | 588 | if (PageBuddy(page)) |
534 | continue; | 589 | continue; |
535 | 590 | ||
536 | /* | 591 | /* |
537 | * For async migration, also only scan in MOVABLE blocks. Async | ||
538 | * migration is optimistic to see if the minimum amount of work | ||
539 | * satisfies the allocation | ||
540 | */ | ||
541 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | ||
542 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | ||
543 | cc->finished_update_migrate = true; | ||
544 | goto next_pageblock; | ||
545 | } | ||
546 | |||
547 | /* | ||
548 | * Check may be lockless but that's ok as we recheck later. | 592 | * Check may be lockless but that's ok as we recheck later. |
549 | * It's possible to migrate LRU pages and balloon pages | 593 | * It's possible to migrate LRU pages and balloon pages |
550 | * Skip any other type of page | 594 | * Skip any other type of page |
@@ -553,11 +597,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
553 | if (unlikely(balloon_page_movable(page))) { | 597 | if (unlikely(balloon_page_movable(page))) { |
554 | if (locked && balloon_page_isolate(page)) { | 598 | if (locked && balloon_page_isolate(page)) { |
555 | /* Successfully isolated */ | 599 | /* Successfully isolated */ |
556 | cc->finished_update_migrate = true; | 600 | goto isolate_success; |
557 | list_add(&page->lru, migratelist); | ||
558 | cc->nr_migratepages++; | ||
559 | nr_isolated++; | ||
560 | goto check_compact_cluster; | ||
561 | } | 601 | } |
562 | } | 602 | } |
563 | continue; | 603 | continue; |
@@ -580,6 +620,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
580 | continue; | 620 | continue; |
581 | } | 621 | } |
582 | 622 | ||
623 | /* | ||
624 | * Migration will fail if an anonymous page is pinned in memory, | ||
625 | * so avoid taking lru_lock and isolating it unnecessarily in an | ||
626 | * admittedly racy check. | ||
627 | */ | ||
628 | if (!page_mapping(page) && | ||
629 | page_count(page) > page_mapcount(page)) | ||
630 | continue; | ||
631 | |||
583 | /* Check if it is ok to still hold the lock */ | 632 | /* Check if it is ok to still hold the lock */ |
584 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | 633 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, |
585 | locked, cc); | 634 | locked, cc); |
@@ -594,12 +643,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
594 | continue; | 643 | continue; |
595 | } | 644 | } |
596 | 645 | ||
597 | if (!cc->sync) | ||
598 | mode |= ISOLATE_ASYNC_MIGRATE; | ||
599 | |||
600 | if (unevictable) | ||
601 | mode |= ISOLATE_UNEVICTABLE; | ||
602 | |||
603 | lruvec = mem_cgroup_page_lruvec(page, zone); | 646 | lruvec = mem_cgroup_page_lruvec(page, zone); |
604 | 647 | ||
605 | /* Try isolate the page */ | 648 | /* Try isolate the page */ |
@@ -609,13 +652,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
609 | VM_BUG_ON(PageTransCompound(page)); | 652 | VM_BUG_ON(PageTransCompound(page)); |
610 | 653 | ||
611 | /* Successfully isolated */ | 654 | /* Successfully isolated */ |
612 | cc->finished_update_migrate = true; | ||
613 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 655 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
656 | |||
657 | isolate_success: | ||
658 | cc->finished_update_migrate = true; | ||
614 | list_add(&page->lru, migratelist); | 659 | list_add(&page->lru, migratelist); |
615 | cc->nr_migratepages++; | 660 | cc->nr_migratepages++; |
616 | nr_isolated++; | 661 | nr_isolated++; |
617 | 662 | ||
618 | check_compact_cluster: | ||
619 | /* Avoid isolating too much */ | 663 | /* Avoid isolating too much */ |
620 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { | 664 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
621 | ++low_pfn; | 665 | ++low_pfn; |
@@ -626,7 +670,6 @@ check_compact_cluster: | |||
626 | 670 | ||
627 | next_pageblock: | 671 | next_pageblock: |
628 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; | 672 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; |
629 | last_pageblock_nr = pageblock_nr; | ||
630 | } | 673 | } |
631 | 674 | ||
632 | acct_isolated(zone, locked, cc); | 675 | acct_isolated(zone, locked, cc); |
@@ -634,9 +677,13 @@ next_pageblock: | |||
634 | if (locked) | 677 | if (locked) |
635 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 678 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
636 | 679 | ||
637 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 680 | /* |
681 | * Update the pageblock-skip information and cached scanner pfn, | ||
682 | * if the whole pageblock was scanned without isolating any page. | ||
683 | */ | ||
638 | if (low_pfn == end_pfn) | 684 | if (low_pfn == end_pfn) |
639 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 685 | update_pageblock_skip(cc, valid_page, nr_isolated, |
686 | set_unsuitable, true); | ||
640 | 687 | ||
641 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 688 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
642 | 689 | ||
@@ -657,7 +704,9 @@ static void isolate_freepages(struct zone *zone, | |||
657 | struct compact_control *cc) | 704 | struct compact_control *cc) |
658 | { | 705 | { |
659 | struct page *page; | 706 | struct page *page; |
660 | unsigned long high_pfn, low_pfn, pfn, z_end_pfn; | 707 | unsigned long block_start_pfn; /* start of current pageblock */ |
708 | unsigned long block_end_pfn; /* end of current pageblock */ | ||
709 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | ||
661 | int nr_freepages = cc->nr_freepages; | 710 | int nr_freepages = cc->nr_freepages; |
662 | struct list_head *freelist = &cc->freepages; | 711 | struct list_head *freelist = &cc->freepages; |
663 | 712 | ||
@@ -665,41 +714,38 @@ static void isolate_freepages(struct zone *zone, | |||
665 | * Initialise the free scanner. The starting point is where we last | 714 | * Initialise the free scanner. The starting point is where we last |
666 | * successfully isolated from, zone-cached value, or the end of the | 715 | * successfully isolated from, zone-cached value, or the end of the |
667 | * zone when isolating for the first time. We need this aligned to | 716 | * zone when isolating for the first time. We need this aligned to |
668 | * the pageblock boundary, because we do pfn -= pageblock_nr_pages | 717 | * the pageblock boundary, because we do |
669 | * in the for loop. | 718 | * block_start_pfn -= pageblock_nr_pages in the for loop. |
719 | * For ending point, take care when isolating in last pageblock of a | ||
720 | * zone which ends in the middle of a pageblock. | ||
670 | * The low boundary is the end of the pageblock the migration scanner | 721 | * The low boundary is the end of the pageblock the migration scanner |
671 | * is using. | 722 | * is using. |
672 | */ | 723 | */ |
673 | pfn = cc->free_pfn & ~(pageblock_nr_pages-1); | 724 | block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); |
725 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, | ||
726 | zone_end_pfn(zone)); | ||
674 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); | 727 | low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); |
675 | 728 | ||
676 | /* | 729 | /* |
677 | * Take care that if the migration scanner is at the end of the zone | ||
678 | * that the free scanner does not accidentally move to the next zone | ||
679 | * in the next isolation cycle. | ||
680 | */ | ||
681 | high_pfn = min(low_pfn, pfn); | ||
682 | |||
683 | z_end_pfn = zone_end_pfn(zone); | ||
684 | |||
685 | /* | ||
686 | * Isolate free pages until enough are available to migrate the | 730 | * Isolate free pages until enough are available to migrate the |
687 | * pages on cc->migratepages. We stop searching if the migrate | 731 | * pages on cc->migratepages. We stop searching if the migrate |
688 | * and free page scanners meet or enough free pages are isolated. | 732 | * and free page scanners meet or enough free pages are isolated. |
689 | */ | 733 | */ |
690 | for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 734 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
691 | pfn -= pageblock_nr_pages) { | 735 | block_end_pfn = block_start_pfn, |
736 | block_start_pfn -= pageblock_nr_pages) { | ||
692 | unsigned long isolated; | 737 | unsigned long isolated; |
693 | unsigned long end_pfn; | ||
694 | 738 | ||
695 | /* | 739 | /* |
696 | * This can iterate a massively long zone without finding any | 740 | * This can iterate a massively long zone without finding any |
697 | * suitable migration targets, so periodically check if we need | 741 | * suitable migration targets, so periodically check if we need |
698 | * to schedule. | 742 | * to schedule, or even abort async compaction. |
699 | */ | 743 | */ |
700 | cond_resched(); | 744 | if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) |
745 | && compact_should_abort(cc)) | ||
746 | break; | ||
701 | 747 | ||
702 | if (!pfn_valid(pfn)) | 748 | if (!pfn_valid(block_start_pfn)) |
703 | continue; | 749 | continue; |
704 | 750 | ||
705 | /* | 751 | /* |
@@ -709,7 +755,7 @@ static void isolate_freepages(struct zone *zone, | |||
709 | * i.e. it's possible that all pages within a zones range of | 755 | * i.e. it's possible that all pages within a zones range of |
710 | * pages do not belong to a single zone. | 756 | * pages do not belong to a single zone. |
711 | */ | 757 | */ |
712 | page = pfn_to_page(pfn); | 758 | page = pfn_to_page(block_start_pfn); |
713 | if (page_zone(page) != zone) | 759 | if (page_zone(page) != zone) |
714 | continue; | 760 | continue; |
715 | 761 | ||
@@ -722,26 +768,26 @@ static void isolate_freepages(struct zone *zone, | |||
722 | continue; | 768 | continue; |
723 | 769 | ||
724 | /* Found a block suitable for isolating free pages from */ | 770 | /* Found a block suitable for isolating free pages from */ |
725 | isolated = 0; | 771 | cc->free_pfn = block_start_pfn; |
772 | isolated = isolate_freepages_block(cc, block_start_pfn, | ||
773 | block_end_pfn, freelist, false); | ||
774 | nr_freepages += isolated; | ||
726 | 775 | ||
727 | /* | 776 | /* |
728 | * Take care when isolating in last pageblock of a zone which | 777 | * Set a flag that we successfully isolated in this pageblock. |
729 | * ends in the middle of a pageblock. | 778 | * In the next loop iteration, zone->compact_cached_free_pfn |
779 | * will not be updated and thus it will effectively contain the | ||
780 | * highest pageblock we isolated pages from. | ||
730 | */ | 781 | */ |
731 | end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); | 782 | if (isolated) |
732 | isolated = isolate_freepages_block(cc, pfn, end_pfn, | 783 | cc->finished_update_free = true; |
733 | freelist, false); | ||
734 | nr_freepages += isolated; | ||
735 | 784 | ||
736 | /* | 785 | /* |
737 | * Record the highest PFN we isolated pages from. When next | 786 | * isolate_freepages_block() might have aborted due to async |
738 | * looking for free pages, the search will restart here as | 787 | * compaction being contended |
739 | * page migration may have returned some pages to the allocator | ||
740 | */ | 788 | */ |
741 | if (isolated) { | 789 | if (cc->contended) |
742 | cc->finished_update_free = true; | 790 | break; |
743 | high_pfn = max(high_pfn, pfn); | ||
744 | } | ||
745 | } | 791 | } |
746 | 792 | ||
747 | /* split_free_page does not map the pages */ | 793 | /* split_free_page does not map the pages */ |
@@ -751,10 +797,9 @@ static void isolate_freepages(struct zone *zone, | |||
751 | * If we crossed the migrate scanner, we want to keep it that way | 797 | * If we crossed the migrate scanner, we want to keep it that way |
752 | * so that compact_finished() may detect this | 798 | * so that compact_finished() may detect this |
753 | */ | 799 | */ |
754 | if (pfn < low_pfn) | 800 | if (block_start_pfn < low_pfn) |
755 | cc->free_pfn = max(pfn, zone->zone_start_pfn); | 801 | cc->free_pfn = cc->migrate_pfn; |
756 | else | 802 | |
757 | cc->free_pfn = high_pfn; | ||
758 | cc->nr_freepages = nr_freepages; | 803 | cc->nr_freepages = nr_freepages; |
759 | } | 804 | } |
760 | 805 | ||
@@ -769,9 +814,13 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
769 | struct compact_control *cc = (struct compact_control *)data; | 814 | struct compact_control *cc = (struct compact_control *)data; |
770 | struct page *freepage; | 815 | struct page *freepage; |
771 | 816 | ||
772 | /* Isolate free pages if necessary */ | 817 | /* |
818 | * Isolate free pages if necessary, and if we are not aborting due to | ||
819 | * contention. | ||
820 | */ | ||
773 | if (list_empty(&cc->freepages)) { | 821 | if (list_empty(&cc->freepages)) { |
774 | isolate_freepages(cc->zone, cc); | 822 | if (!cc->contended) |
823 | isolate_freepages(cc->zone, cc); | ||
775 | 824 | ||
776 | if (list_empty(&cc->freepages)) | 825 | if (list_empty(&cc->freepages)) |
777 | return NULL; | 826 | return NULL; |
@@ -785,23 +834,16 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
785 | } | 834 | } |
786 | 835 | ||
787 | /* | 836 | /* |
788 | * We cannot control nr_migratepages and nr_freepages fully when migration is | 837 | * This is a migrate-callback that "frees" freepages back to the isolated |
789 | * running as migrate_pages() has no knowledge of compact_control. When | 838 | * freelist. All pages on the freelist are from the same zone, so there is no |
790 | * migration is complete, we count the number of pages on the lists by hand. | 839 | * special handling needed for NUMA. |
791 | */ | 840 | */ |
792 | static void update_nr_listpages(struct compact_control *cc) | 841 | static void compaction_free(struct page *page, unsigned long data) |
793 | { | 842 | { |
794 | int nr_migratepages = 0; | 843 | struct compact_control *cc = (struct compact_control *)data; |
795 | int nr_freepages = 0; | ||
796 | struct page *page; | ||
797 | |||
798 | list_for_each_entry(page, &cc->migratepages, lru) | ||
799 | nr_migratepages++; | ||
800 | list_for_each_entry(page, &cc->freepages, lru) | ||
801 | nr_freepages++; | ||
802 | 844 | ||
803 | cc->nr_migratepages = nr_migratepages; | 845 | list_add(&page->lru, &cc->freepages); |
804 | cc->nr_freepages = nr_freepages; | 846 | cc->nr_freepages++; |
805 | } | 847 | } |
806 | 848 | ||
807 | /* possible outcome of isolate_migratepages */ | 849 | /* possible outcome of isolate_migratepages */ |
@@ -848,11 +890,16 @@ static int compact_finished(struct zone *zone, | |||
848 | unsigned int order; | 890 | unsigned int order; |
849 | unsigned long watermark; | 891 | unsigned long watermark; |
850 | 892 | ||
851 | if (fatal_signal_pending(current)) | 893 | if (cc->contended || fatal_signal_pending(current)) |
852 | return COMPACT_PARTIAL; | 894 | return COMPACT_PARTIAL; |
853 | 895 | ||
854 | /* Compaction run completes if the migrate and free scanner meet */ | 896 | /* Compaction run completes if the migrate and free scanner meet */ |
855 | if (cc->free_pfn <= cc->migrate_pfn) { | 897 | if (cc->free_pfn <= cc->migrate_pfn) { |
898 | /* Let the next compaction start anew. */ | ||
899 | zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; | ||
900 | zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; | ||
901 | zone->compact_cached_free_pfn = zone_end_pfn(zone); | ||
902 | |||
856 | /* | 903 | /* |
857 | * Mark that the PG_migrate_skip information should be cleared | 904 | * Mark that the PG_migrate_skip information should be cleared |
858 | * by kswapd when it goes to sleep. kswapd does not set the | 905 | * by kswapd when it goes to sleep. kswapd does not set the |
@@ -950,6 +997,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
950 | int ret; | 997 | int ret; |
951 | unsigned long start_pfn = zone->zone_start_pfn; | 998 | unsigned long start_pfn = zone->zone_start_pfn; |
952 | unsigned long end_pfn = zone_end_pfn(zone); | 999 | unsigned long end_pfn = zone_end_pfn(zone); |
1000 | const bool sync = cc->mode != MIGRATE_ASYNC; | ||
953 | 1001 | ||
954 | ret = compaction_suitable(zone, cc->order); | 1002 | ret = compaction_suitable(zone, cc->order); |
955 | switch (ret) { | 1003 | switch (ret) { |
@@ -975,7 +1023,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
975 | * information on where the scanners should start but check that it | 1023 | * information on where the scanners should start but check that it |
976 | * is initialised by ensuring the values are within zone boundaries. | 1024 | * is initialised by ensuring the values are within zone boundaries. |
977 | */ | 1025 | */ |
978 | cc->migrate_pfn = zone->compact_cached_migrate_pfn; | 1026 | cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; |
979 | cc->free_pfn = zone->compact_cached_free_pfn; | 1027 | cc->free_pfn = zone->compact_cached_free_pfn; |
980 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { | 1028 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { |
981 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); | 1029 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); |
@@ -983,13 +1031,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
983 | } | 1031 | } |
984 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | 1032 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { |
985 | cc->migrate_pfn = start_pfn; | 1033 | cc->migrate_pfn = start_pfn; |
986 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | 1034 | zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; |
1035 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | ||
987 | } | 1036 | } |
988 | 1037 | ||
1038 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); | ||
1039 | |||
989 | migrate_prep_local(); | 1040 | migrate_prep_local(); |
990 | 1041 | ||
991 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 1042 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
992 | unsigned long nr_migrate, nr_remaining; | ||
993 | int err; | 1043 | int err; |
994 | 1044 | ||
995 | switch (isolate_migratepages(zone, cc)) { | 1045 | switch (isolate_migratepages(zone, cc)) { |
@@ -1004,21 +1054,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1004 | ; | 1054 | ; |
1005 | } | 1055 | } |
1006 | 1056 | ||
1007 | nr_migrate = cc->nr_migratepages; | 1057 | if (!cc->nr_migratepages) |
1058 | continue; | ||
1059 | |||
1008 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1060 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1009 | (unsigned long)cc, | 1061 | compaction_free, (unsigned long)cc, cc->mode, |
1010 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, | ||
1011 | MR_COMPACTION); | 1062 | MR_COMPACTION); |
1012 | update_nr_listpages(cc); | ||
1013 | nr_remaining = cc->nr_migratepages; | ||
1014 | 1063 | ||
1015 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1064 | trace_mm_compaction_migratepages(cc->nr_migratepages, err, |
1016 | nr_remaining); | 1065 | &cc->migratepages); |
1017 | 1066 | ||
1018 | /* Release isolated pages not migrated */ | 1067 | /* All pages were either migrated or will be released */ |
1068 | cc->nr_migratepages = 0; | ||
1019 | if (err) { | 1069 | if (err) { |
1020 | putback_movable_pages(&cc->migratepages); | 1070 | putback_movable_pages(&cc->migratepages); |
1021 | cc->nr_migratepages = 0; | ||
1022 | /* | 1071 | /* |
1023 | * migrate_pages() may return -ENOMEM when scanners meet | 1072 | * migrate_pages() may return -ENOMEM when scanners meet |
1024 | * and we want compact_finished() to detect it | 1073 | * and we want compact_finished() to detect it |
@@ -1035,12 +1084,13 @@ out: | |||
1035 | cc->nr_freepages -= release_freepages(&cc->freepages); | 1084 | cc->nr_freepages -= release_freepages(&cc->freepages); |
1036 | VM_BUG_ON(cc->nr_freepages != 0); | 1085 | VM_BUG_ON(cc->nr_freepages != 0); |
1037 | 1086 | ||
1087 | trace_mm_compaction_end(ret); | ||
1088 | |||
1038 | return ret; | 1089 | return ret; |
1039 | } | 1090 | } |
1040 | 1091 | ||
1041 | static unsigned long compact_zone_order(struct zone *zone, | 1092 | static unsigned long compact_zone_order(struct zone *zone, int order, |
1042 | int order, gfp_t gfp_mask, | 1093 | gfp_t gfp_mask, enum migrate_mode mode, bool *contended) |
1043 | bool sync, bool *contended) | ||
1044 | { | 1094 | { |
1045 | unsigned long ret; | 1095 | unsigned long ret; |
1046 | struct compact_control cc = { | 1096 | struct compact_control cc = { |
@@ -1049,7 +1099,7 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
1049 | .order = order, | 1099 | .order = order, |
1050 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1100 | .migratetype = allocflags_to_migratetype(gfp_mask), |
1051 | .zone = zone, | 1101 | .zone = zone, |
1052 | .sync = sync, | 1102 | .mode = mode, |
1053 | }; | 1103 | }; |
1054 | INIT_LIST_HEAD(&cc.freepages); | 1104 | INIT_LIST_HEAD(&cc.freepages); |
1055 | INIT_LIST_HEAD(&cc.migratepages); | 1105 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -1071,7 +1121,7 @@ int sysctl_extfrag_threshold = 500; | |||
1071 | * @order: The order of the current allocation | 1121 | * @order: The order of the current allocation |
1072 | * @gfp_mask: The GFP mask of the current allocation | 1122 | * @gfp_mask: The GFP mask of the current allocation |
1073 | * @nodemask: The allowed nodes to allocate from | 1123 | * @nodemask: The allowed nodes to allocate from |
1074 | * @sync: Whether migration is synchronous or not | 1124 | * @mode: The migration mode for async, sync light, or sync migration |
1075 | * @contended: Return value that is true if compaction was aborted due to lock contention | 1125 | * @contended: Return value that is true if compaction was aborted due to lock contention |
1076 | * @page: Optionally capture a free page of the requested order during compaction | 1126 | * @page: Optionally capture a free page of the requested order during compaction |
1077 | * | 1127 | * |
@@ -1079,7 +1129,7 @@ int sysctl_extfrag_threshold = 500; | |||
1079 | */ | 1129 | */ |
1080 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1130 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1081 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1131 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1082 | bool sync, bool *contended) | 1132 | enum migrate_mode mode, bool *contended) |
1083 | { | 1133 | { |
1084 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1134 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1085 | int may_enter_fs = gfp_mask & __GFP_FS; | 1135 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -1104,7 +1154,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1104 | nodemask) { | 1154 | nodemask) { |
1105 | int status; | 1155 | int status; |
1106 | 1156 | ||
1107 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1157 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1108 | contended); | 1158 | contended); |
1109 | rc = max(status, rc); | 1159 | rc = max(status, rc); |
1110 | 1160 | ||
@@ -1140,13 +1190,9 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1140 | compact_zone(zone, cc); | 1190 | compact_zone(zone, cc); |
1141 | 1191 | ||
1142 | if (cc->order > 0) { | 1192 | if (cc->order > 0) { |
1143 | int ok = zone_watermark_ok(zone, cc->order, | 1193 | if (zone_watermark_ok(zone, cc->order, |
1144 | low_wmark_pages(zone), 0, 0); | 1194 | low_wmark_pages(zone), 0, 0)) |
1145 | if (ok && cc->order >= zone->compact_order_failed) | 1195 | compaction_defer_reset(zone, cc->order, false); |
1146 | zone->compact_order_failed = cc->order + 1; | ||
1147 | /* Currently async compaction is never deferred. */ | ||
1148 | else if (!ok && cc->sync) | ||
1149 | defer_compaction(zone, cc->order); | ||
1150 | } | 1196 | } |
1151 | 1197 | ||
1152 | VM_BUG_ON(!list_empty(&cc->freepages)); | 1198 | VM_BUG_ON(!list_empty(&cc->freepages)); |
@@ -1158,7 +1204,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) | |||
1158 | { | 1204 | { |
1159 | struct compact_control cc = { | 1205 | struct compact_control cc = { |
1160 | .order = order, | 1206 | .order = order, |
1161 | .sync = false, | 1207 | .mode = MIGRATE_ASYNC, |
1162 | }; | 1208 | }; |
1163 | 1209 | ||
1164 | if (!order) | 1210 | if (!order) |
@@ -1171,7 +1217,8 @@ static void compact_node(int nid) | |||
1171 | { | 1217 | { |
1172 | struct compact_control cc = { | 1218 | struct compact_control cc = { |
1173 | .order = -1, | 1219 | .order = -1, |
1174 | .sync = true, | 1220 | .mode = MIGRATE_SYNC, |
1221 | .ignore_skip_hint = true, | ||
1175 | }; | 1222 | }; |
1176 | 1223 | ||
1177 | __compact_pgdat(NODE_DATA(nid), &cc); | 1224 | __compact_pgdat(NODE_DATA(nid), &cc); |
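
Context for the sync -> mode conversion above: callers of the exported
compaction API now pass one of three migrate_mode values instead of a
bool. A minimal sketch of a call site under the new signature (the
surrounding variables are assumed for illustration, not taken from this
patch):

	bool contended = false;
	unsigned long rc;

	/* Try cheap async compaction first; a caller can escalate to
	 * MIGRATE_SYNC_LIGHT on a later attempt. */
	rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
				  MIGRATE_ASYNC, &contended);
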
diff --git a/mm/filemap.c b/mm/filemap.c index ae4846ff4849..b012daefc2d7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -192,9 +192,11 @@ static int filemap_check_errors(struct address_space *mapping) | |||
192 | { | 192 | { |
193 | int ret = 0; | 193 | int ret = 0; |
194 | /* Check for outstanding write errors */ | 194 | /* Check for outstanding write errors */ |
195 | if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | 195 | if (test_bit(AS_ENOSPC, &mapping->flags) && |
196 | test_and_clear_bit(AS_ENOSPC, &mapping->flags)) | ||
196 | ret = -ENOSPC; | 197 | ret = -ENOSPC; |
197 | if (test_and_clear_bit(AS_EIO, &mapping->flags)) | 198 | if (test_bit(AS_EIO, &mapping->flags) && |
199 | test_and_clear_bit(AS_EIO, &mapping->flags)) | ||
198 | ret = -EIO; | 200 | ret = -EIO; |
199 | return ret; | 201 | return ret; |
200 | } | 202 | } |
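
The rewrite above trades an unconditional locked read-modify-write for a
plain read in the common no-error case. The same pattern in isolation, as
a sketch (clear_bit_if_set is a name invented here, not a kernel helper):

	static inline int clear_bit_if_set(int nr, unsigned long *addr)
	{
		/* Plain test first: no bus-locked cycle when the bit is
		 * already clear. The atomic re-test keeps the clear itself
		 * race-free if two threads pass the first check. */
		return test_bit(nr, addr) && test_and_clear_bit(nr, addr);
	}
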
@@ -446,6 +448,29 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | |||
446 | } | 448 | } |
447 | EXPORT_SYMBOL_GPL(replace_page_cache_page); | 449 | EXPORT_SYMBOL_GPL(replace_page_cache_page); |
448 | 450 | ||
451 | static int page_cache_tree_insert(struct address_space *mapping, | ||
452 | struct page *page) | ||
453 | { | ||
454 | void **slot; | ||
455 | int error; | ||
456 | |||
457 | slot = radix_tree_lookup_slot(&mapping->page_tree, page->index); | ||
458 | if (slot) { | ||
459 | void *p; | ||
460 | |||
461 | p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); | ||
462 | if (!radix_tree_exceptional_entry(p)) | ||
463 | return -EEXIST; | ||
464 | radix_tree_replace_slot(slot, page); | ||
465 | mapping->nrpages++; | ||
466 | return 0; | ||
467 | } | ||
468 | error = radix_tree_insert(&mapping->page_tree, page->index, page); | ||
469 | if (!error) | ||
470 | mapping->nrpages++; | ||
471 | return error; | ||
472 | } | ||
473 | |||
449 | /** | 474 | /** |
450 | * add_to_page_cache_locked - add a locked page to the pagecache | 475 | * add_to_page_cache_locked - add a locked page to the pagecache |
451 | * @page: page to add | 476 | * @page: page to add |
@@ -480,11 +505,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
480 | page->index = offset; | 505 | page->index = offset; |
481 | 506 | ||
482 | spin_lock_irq(&mapping->tree_lock); | 507 | spin_lock_irq(&mapping->tree_lock); |
483 | error = radix_tree_insert(&mapping->page_tree, offset, page); | 508 | error = page_cache_tree_insert(mapping, page); |
484 | radix_tree_preload_end(); | 509 | radix_tree_preload_end(); |
485 | if (unlikely(error)) | 510 | if (unlikely(error)) |
486 | goto err_insert; | 511 | goto err_insert; |
487 | mapping->nrpages++; | ||
488 | __inc_zone_page_state(page, NR_FILE_PAGES); | 512 | __inc_zone_page_state(page, NR_FILE_PAGES); |
489 | spin_unlock_irq(&mapping->tree_lock); | 513 | spin_unlock_irq(&mapping->tree_lock); |
490 | trace_mm_filemap_add_to_page_cache(page); | 514 | trace_mm_filemap_add_to_page_cache(page); |
@@ -520,10 +544,10 @@ struct page *__page_cache_alloc(gfp_t gfp) | |||
520 | if (cpuset_do_page_mem_spread()) { | 544 | if (cpuset_do_page_mem_spread()) { |
521 | unsigned int cpuset_mems_cookie; | 545 | unsigned int cpuset_mems_cookie; |
522 | do { | 546 | do { |
523 | cpuset_mems_cookie = get_mems_allowed(); | 547 | cpuset_mems_cookie = read_mems_allowed_begin(); |
524 | n = cpuset_mem_spread_node(); | 548 | n = cpuset_mem_spread_node(); |
525 | page = alloc_pages_exact_node(n, gfp, 0); | 549 | page = alloc_pages_exact_node(n, gfp, 0); |
526 | } while (!put_mems_allowed(cpuset_mems_cookie) && !page); | 550 | } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); |
527 | 551 | ||
528 | return page; | 552 | return page; |
529 | } | 553 | } |
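
read_mems_allowed_begin()/read_mems_allowed_retry() behave like a
seqcount read section: the allocation is retried only when it failed and
the cpuset's allowed-node mask was rewritten concurrently. Every
converted call site in this patch follows the same shape; a sketch, with
a hypothetical allocation helper standing in for the real one:

	unsigned int cookie;
	struct page *page;

	do {
		cookie = read_mems_allowed_begin(); /* open read section */
		page = alloc_from_allowed_nodes(gfp); /* hypothetical */
	} while (!page && read_mems_allowed_retry(cookie));
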
@@ -620,8 +644,17 @@ EXPORT_SYMBOL(unlock_page); | |||
620 | */ | 644 | */ |
621 | void end_page_writeback(struct page *page) | 645 | void end_page_writeback(struct page *page) |
622 | { | 646 | { |
623 | if (TestClearPageReclaim(page)) | 647 | /* |
648 | * TestClearPageReclaim could be used here but it is an atomic | ||
649 | * operation and overkill in this particular case. Failing to | ||
650 | * shuffle a page marked for immediate reclaim is too mild to | ||
651 | * justify taking an atomic operation penalty at the end of | ||
652 | * every page writeback. | ||
653 | */ | ||
654 | if (PageReclaim(page)) { | ||
655 | ClearPageReclaim(page); | ||
624 | rotate_reclaimable_page(page); | 656 | rotate_reclaimable_page(page); |
657 | } | ||
625 | 658 | ||
626 | if (!test_clear_page_writeback(page)) | 659 | if (!test_clear_page_writeback(page)) |
627 | BUG(); | 660 | BUG(); |
@@ -686,14 +719,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | |||
686 | } | 719 | } |
687 | 720 | ||
688 | /** | 721 | /** |
689 | * find_get_page - find and get a page reference | 722 | * page_cache_next_hole - find the next hole (not-present entry) |
723 | * @mapping: mapping | ||
724 | * @index: index | ||
725 | * @max_scan: maximum range to search | ||
726 | * | ||
727 | * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the | ||
728 | * lowest indexed hole. | ||
729 | * | ||
730 | * Returns: the index of the hole if found, otherwise returns an index | ||
731 | * outside of the set specified (in which case 'return - index >= | ||
732 | * max_scan' will be true). In rare cases of index wrap-around, 0 will | ||
733 | * be returned. | ||
734 | * | ||
735 | * page_cache_next_hole may be called under rcu_read_lock. However, | ||
736 | * like radix_tree_gang_lookup, this will not atomically search a | ||
737 | * snapshot of the tree at a single point in time. For example, if a | ||
738 | * hole is created at index 5, then subsequently a hole is created at | ||
739 | * index 10, page_cache_next_hole covering both indexes may return 10 | ||
740 | * if called under rcu_read_lock. | ||
741 | */ | ||
742 | pgoff_t page_cache_next_hole(struct address_space *mapping, | ||
743 | pgoff_t index, unsigned long max_scan) | ||
744 | { | ||
745 | unsigned long i; | ||
746 | |||
747 | for (i = 0; i < max_scan; i++) { | ||
748 | struct page *page; | ||
749 | |||
750 | page = radix_tree_lookup(&mapping->page_tree, index); | ||
751 | if (!page || radix_tree_exceptional_entry(page)) | ||
752 | break; | ||
753 | index++; | ||
754 | if (index == 0) | ||
755 | break; | ||
756 | } | ||
757 | |||
758 | return index; | ||
759 | } | ||
760 | EXPORT_SYMBOL(page_cache_next_hole); | ||
761 | |||
762 | /** | ||
763 | * page_cache_prev_hole - find the prev hole (not-present entry) | ||
764 | * @mapping: mapping | ||
765 | * @index: index | ||
766 | * @max_scan: maximum range to search | ||
767 | * | ||
768 | * Search backwards in the range [max(index-max_scan+1, 0), index] for | ||
769 | * the first hole. | ||
770 | * | ||
771 | * Returns: the index of the hole if found, otherwise returns an index | ||
772 | * outside of the set specified (in which case 'index - return >= | ||
773 | * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX | ||
774 | * will be returned. | ||
775 | * | ||
776 | * page_cache_prev_hole may be called under rcu_read_lock. However, | ||
777 | * like radix_tree_gang_lookup, this will not atomically search a | ||
778 | * snapshot of the tree at a single point in time. For example, if a | ||
779 | * hole is created at index 10, then subsequently a hole is created at | ||
780 | * index 5, page_cache_prev_hole covering both indexes may return 5 if | ||
781 | * called under rcu_read_lock. | ||
782 | */ | ||
783 | pgoff_t page_cache_prev_hole(struct address_space *mapping, | ||
784 | pgoff_t index, unsigned long max_scan) | ||
785 | { | ||
786 | unsigned long i; | ||
787 | |||
788 | for (i = 0; i < max_scan; i++) { | ||
789 | struct page *page; | ||
790 | |||
791 | page = radix_tree_lookup(&mapping->page_tree, index); | ||
792 | if (!page || radix_tree_exceptional_entry(page)) | ||
793 | break; | ||
794 | index--; | ||
795 | if (index == ULONG_MAX) | ||
796 | break; | ||
797 | } | ||
798 | |||
799 | return index; | ||
800 | } | ||
801 | EXPORT_SYMBOL(page_cache_prev_hole); | ||
802 | |||
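
A usage sketch for the two hole finders; the caller shown is illustrative
(readahead code is the intended consumer, but this exact fragment is not
part of the patch):

	pgoff_t hole;

	rcu_read_lock();
	hole = page_cache_next_hole(mapping, index, max_scan);
	rcu_read_unlock();

	if (hole - index >= max_scan) {
		/* no hole within the scanned range */
	} else {
		/* slots [index, hole) are populated; 'hole' is empty */
	}
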
803 | /** | ||
804 | * find_get_entry - find and get a page cache entry | ||
690 | * @mapping: the address_space to search | 805 | * @mapping: the address_space to search |
691 | * @offset: the page index | 806 | * @offset: the page cache index |
807 | * | ||
808 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
809 | * page cache page, it is returned with an increased refcount. | ||
692 | * | 810 | * |
693 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 811 | * If the slot holds a shadow entry of a previously evicted page, it |
694 | * If yes, increment its refcount and return it; if no, return NULL. | 812 | * is returned. |
813 | * | ||
814 | * Otherwise, %NULL is returned. | ||
695 | */ | 815 | */ |
696 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) | 816 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) |
697 | { | 817 | { |
698 | void **pagep; | 818 | void **pagep; |
699 | struct page *page; | 819 | struct page *page; |
@@ -734,24 +854,30 @@ out: | |||
734 | 854 | ||
735 | return page; | 855 | return page; |
736 | } | 856 | } |
737 | EXPORT_SYMBOL(find_get_page); | 857 | EXPORT_SYMBOL(find_get_entry); |
738 | 858 | ||
739 | /** | 859 | /** |
740 | * find_lock_page - locate, pin and lock a pagecache page | 860 | * find_lock_entry - locate, pin and lock a page cache entry |
741 | * @mapping: the address_space to search | 861 | * @mapping: the address_space to search |
742 | * @offset: the page index | 862 | * @offset: the page cache index |
863 | * | ||
864 | * Looks up the page cache slot at @mapping & @offset. If there is a | ||
865 | * page cache page, it is returned locked and with an increased | ||
866 | * refcount. | ||
743 | * | 867 | * |
744 | * Locates the desired pagecache page, locks it, increments its reference | 868 | * If the slot holds a shadow entry of a previously evicted page, it |
745 | * count and returns its address. | 869 | * is returned. |
746 | * | 870 | * |
747 | * Returns zero if the page was not present. find_lock_page() may sleep. | 871 | * Otherwise, %NULL is returned. |
872 | * | ||
873 | * find_lock_entry() may sleep. | ||
748 | */ | 874 | */ |
749 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | 875 | struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) |
750 | { | 876 | { |
751 | struct page *page; | 877 | struct page *page; |
752 | 878 | ||
753 | repeat: | 879 | repeat: |
754 | page = find_get_page(mapping, offset); | 880 | page = find_get_entry(mapping, offset); |
755 | if (page && !radix_tree_exception(page)) { | 881 | if (page && !radix_tree_exception(page)) { |
756 | lock_page(page); | 882 | lock_page(page); |
757 | /* Has the page been truncated? */ | 883 | /* Has the page been truncated? */ |
@@ -764,44 +890,87 @@ repeat: | |||
764 | } | 890 | } |
765 | return page; | 891 | return page; |
766 | } | 892 | } |
767 | EXPORT_SYMBOL(find_lock_page); | 893 | EXPORT_SYMBOL(find_lock_entry); |
768 | 894 | ||
769 | /** | 895 | /** |
770 | * find_or_create_page - locate or add a pagecache page | 896 | * pagecache_get_page - find and get a page reference |
771 | * @mapping: the page's address_space | 897 | * @mapping: the address_space to search |
772 | * @index: the page's index into the mapping | 898 | * @offset: the page index |
773 | * @gfp_mask: page allocation mode | 899 | * @fgp_flags: FGP flags |
900 | * @gfp_mask: gfp mask to use if a page is to be allocated | ||
901 | * | ||
902 | * Looks up the page cache slot at @mapping & @offset. | ||
903 | * | ||
904 | * FGP flags modify how the page is returned | ||
774 | * | 905 | * |
775 | * Locates a page in the pagecache. If the page is not present, a new page | 906 | * FGP_ACCESSED: the page will be marked accessed |
776 | * is allocated using @gfp_mask and is added to the pagecache and to the VM's | 907 | * FGP_LOCK: Page is returned locked |
777 | * LRU list. The returned page is locked and has its reference count | 908 | * FGP_CREAT: If page is not present then a new page is allocated using |
778 | * incremented. | 909 | * @gfp_mask and added to the page cache and the VM's LRU |
910 | * list. The page is returned locked and with an increased | ||
911 | * refcount. Otherwise, %NULL is returned. | ||
779 | * | 912 | * |
780 | * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic | 913 | * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even |
781 | * allocation! | 914 | * if the GFP flags specified for FGP_CREAT are atomic. |
782 | * | 915 | * |
783 | * find_or_create_page() returns the desired page's address, or zero on | 916 | * If there is a page cache page, it is returned with an increased refcount. |
784 | * memory exhaustion. | ||
785 | */ | 917 | */ |
786 | struct page *find_or_create_page(struct address_space *mapping, | 918 | struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, |
787 | pgoff_t index, gfp_t gfp_mask) | 919 | int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) |
788 | { | 920 | { |
789 | struct page *page; | 921 | struct page *page; |
790 | int err; | 922 | |
791 | repeat: | 923 | repeat: |
792 | page = find_lock_page(mapping, index); | 924 | page = find_get_entry(mapping, offset); |
793 | if (!page) { | 925 | if (radix_tree_exceptional_entry(page)) |
794 | page = __page_cache_alloc(gfp_mask); | 926 | page = NULL; |
927 | if (!page) | ||
928 | goto no_page; | ||
929 | |||
930 | if (fgp_flags & FGP_LOCK) { | ||
931 | if (fgp_flags & FGP_NOWAIT) { | ||
932 | if (!trylock_page(page)) { | ||
933 | page_cache_release(page); | ||
934 | return NULL; | ||
935 | } | ||
936 | } else { | ||
937 | lock_page(page); | ||
938 | } | ||
939 | |||
940 | /* Has the page been truncated? */ | ||
941 | if (unlikely(page->mapping != mapping)) { | ||
942 | unlock_page(page); | ||
943 | page_cache_release(page); | ||
944 | goto repeat; | ||
945 | } | ||
946 | VM_BUG_ON(page->index != offset); | ||
947 | } | ||
948 | |||
949 | if (page && (fgp_flags & FGP_ACCESSED)) | ||
950 | mark_page_accessed(page); | ||
951 | |||
952 | no_page: | ||
953 | if (!page && (fgp_flags & FGP_CREAT)) { | ||
954 | int err; | ||
955 | if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) | ||
956 | cache_gfp_mask |= __GFP_WRITE; | ||
957 | if (fgp_flags & FGP_NOFS) { | ||
958 | cache_gfp_mask &= ~__GFP_FS; | ||
959 | radix_gfp_mask &= ~__GFP_FS; | ||
960 | } | ||
961 | |||
962 | page = __page_cache_alloc(cache_gfp_mask); | ||
795 | if (!page) | 963 | if (!page) |
796 | return NULL; | 964 | return NULL; |
797 | /* | 965 | |
798 | * We want a regular kernel memory (not highmem or DMA etc) | 966 | if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) |
799 | * allocation for the radix tree nodes, but we need to honour | 967 | fgp_flags |= FGP_LOCK; |
800 | * the context-specific requirements the caller has asked for. | 968 | |
801 | * GFP_RECLAIM_MASK collects those requirements. | 969 | /* Init accessed so we avoid an atomic mark_page_accessed later */
802 | */ | 970 | if (fgp_flags & FGP_ACCESSED) |
803 | err = add_to_page_cache_lru(page, mapping, index, | 971 | init_page_accessed(page); |
804 | (gfp_mask & GFP_RECLAIM_MASK)); | 972 | |
973 | err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); | ||
805 | if (unlikely(err)) { | 974 | if (unlikely(err)) { |
806 | page_cache_release(page); | 975 | page_cache_release(page); |
807 | page = NULL; | 976 | page = NULL; |
@@ -809,9 +978,80 @@ repeat: | |||
809 | goto repeat; | 978 | goto repeat; |
810 | } | 979 | } |
811 | } | 980 | } |
981 | |||
812 | return page; | 982 | return page; |
813 | } | 983 | } |
814 | EXPORT_SYMBOL(find_or_create_page); | 984 | EXPORT_SYMBOL(pagecache_get_page); |
985 | |||
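
A sketch of a caller of the new entry point; the flag combination mirrors
grab_cache_page_write_begin() later in this patch, and note that this
3.12 backport takes separate gfp masks for the page itself and for the
radix-tree insertion:

	struct page *page;

	page = pagecache_get_page(mapping, index,
				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				  mapping_gfp_mask(mapping), GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	/* page is locked, referenced, and already marked accessed, so no
	 * later atomic mark_page_accessed() is needed */
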
986 | /** | ||
987 | * find_get_entries - gang pagecache lookup | ||
988 | * @mapping: The address_space to search | ||
989 | * @start: The starting page cache index | ||
990 | * @nr_entries: The maximum number of entries | ||
991 | * @entries: Where the resulting entries are placed | ||
992 | * @indices: The cache indices corresponding to the entries in @entries | ||
993 | * | ||
994 | * find_get_entries() will search for and return a group of up to | ||
995 | * @nr_entries entries in the mapping. The entries are placed at | ||
996 | * @entries. find_get_entries() takes a reference against any actual | ||
997 | * pages it returns. | ||
998 | * | ||
999 | * The search returns a group of mapping-contiguous page cache entries | ||
1000 | * with ascending indexes. There may be holes in the indices due to | ||
1001 | * not-present pages. | ||
1002 | * | ||
1003 | * Any shadow entries of evicted pages are included in the returned | ||
1004 | * array. | ||
1005 | * | ||
1006 | * find_get_entries() returns the number of pages and shadow entries | ||
1007 | * which were found. | ||
1008 | */ | ||
1009 | unsigned find_get_entries(struct address_space *mapping, | ||
1010 | pgoff_t start, unsigned int nr_entries, | ||
1011 | struct page **entries, pgoff_t *indices) | ||
1012 | { | ||
1013 | void **slot; | ||
1014 | unsigned int ret = 0; | ||
1015 | struct radix_tree_iter iter; | ||
1016 | |||
1017 | if (!nr_entries) | ||
1018 | return 0; | ||
1019 | |||
1020 | rcu_read_lock(); | ||
1021 | restart: | ||
1022 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { | ||
1023 | struct page *page; | ||
1024 | repeat: | ||
1025 | page = radix_tree_deref_slot(slot); | ||
1026 | if (unlikely(!page)) | ||
1027 | continue; | ||
1028 | if (radix_tree_exception(page)) { | ||
1029 | if (radix_tree_deref_retry(page)) | ||
1030 | goto restart; | ||
1031 | /* | ||
1032 | * Otherwise, we must be storing a swap entry | ||
1033 | * here as an exceptional entry: so return it | ||
1034 | * without attempting to raise page count. | ||
1035 | */ | ||
1036 | goto export; | ||
1037 | } | ||
1038 | if (!page_cache_get_speculative(page)) | ||
1039 | goto repeat; | ||
1040 | |||
1041 | /* Has the page moved? */ | ||
1042 | if (unlikely(page != *slot)) { | ||
1043 | page_cache_release(page); | ||
1044 | goto repeat; | ||
1045 | } | ||
1046 | export: | ||
1047 | indices[ret] = iter.index; | ||
1048 | entries[ret] = page; | ||
1049 | if (++ret == nr_entries) | ||
1050 | break; | ||
1051 | } | ||
1052 | rcu_read_unlock(); | ||
1053 | return ret; | ||
1054 | } | ||
815 | 1055 | ||
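
Callers must expect shadow (exceptional) entries mixed into the returned
array; only real pages carry the extra reference. A minimal sketch of the
filtering a consumer performs, in the style of the truncate/invalidate
callers converted elsewhere in this series:

	struct page *entries[PAGEVEC_SIZE];
	pgoff_t indices[PAGEVEC_SIZE];
	unsigned int i, nr;

	nr = find_get_entries(mapping, start, PAGEVEC_SIZE,
			      entries, indices);
	for (i = 0; i < nr; i++) {
		struct page *page = entries[i];

		if (radix_tree_exceptional_entry(page))
			continue; /* shadow entry: no refcount held */
		/* ... operate on the real page at indices[i] ... */
		page_cache_release(page);
	}
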
816 | /** | 1056 | /** |
817 | * find_get_pages - gang pagecache lookup | 1057 | * find_get_pages - gang pagecache lookup |
@@ -1031,39 +1271,6 @@ repeat: | |||
1031 | } | 1271 | } |
1032 | EXPORT_SYMBOL(find_get_pages_tag); | 1272 | EXPORT_SYMBOL(find_get_pages_tag); |
1033 | 1273 | ||
1034 | /** | ||
1035 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
1036 | * @mapping: target address_space | ||
1037 | * @index: the page index | ||
1038 | * | ||
1039 | * Same as grab_cache_page(), but do not wait if the page is unavailable. | ||
1040 | * This is intended for speculative data generators, where the data can | ||
1041 | * be regenerated if the page couldn't be grabbed. This routine should | ||
1042 | * be safe to call while holding the lock for another page. | ||
1043 | * | ||
1044 | * Clear __GFP_FS when allocating the page to avoid recursion into the fs | ||
1045 | * and deadlock against the caller's locked page. | ||
1046 | */ | ||
1047 | struct page * | ||
1048 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | ||
1049 | { | ||
1050 | struct page *page = find_get_page(mapping, index); | ||
1051 | |||
1052 | if (page) { | ||
1053 | if (trylock_page(page)) | ||
1054 | return page; | ||
1055 | page_cache_release(page); | ||
1056 | return NULL; | ||
1057 | } | ||
1058 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); | ||
1059 | if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { | ||
1060 | page_cache_release(page); | ||
1061 | page = NULL; | ||
1062 | } | ||
1063 | return page; | ||
1064 | } | ||
1065 | EXPORT_SYMBOL(grab_cache_page_nowait); | ||
1066 | |||
1067 | /* | 1274 | /* |
1068 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail | 1275 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail |
1069 | * a _large_ part of the i/o request. Imagine the worst scenario: | 1276 | * a _large_ part of the i/o request. Imagine the worst scenario: |
@@ -1797,6 +2004,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) | |||
1797 | EXPORT_SYMBOL(generic_file_mmap); | 2004 | EXPORT_SYMBOL(generic_file_mmap); |
1798 | EXPORT_SYMBOL(generic_file_readonly_mmap); | 2005 | EXPORT_SYMBOL(generic_file_readonly_mmap); |
1799 | 2006 | ||
2007 | static struct page *wait_on_page_read(struct page *page) | ||
2008 | { | ||
2009 | if (!IS_ERR(page)) { | ||
2010 | wait_on_page_locked(page); | ||
2011 | if (!PageUptodate(page)) { | ||
2012 | page_cache_release(page); | ||
2013 | page = ERR_PTR(-EIO); | ||
2014 | } | ||
2015 | } | ||
2016 | return page; | ||
2017 | } | ||
2018 | |||
1800 | static struct page *__read_cache_page(struct address_space *mapping, | 2019 | static struct page *__read_cache_page(struct address_space *mapping, |
1801 | pgoff_t index, | 2020 | pgoff_t index, |
1802 | int (*filler)(void *, struct page *), | 2021 | int (*filler)(void *, struct page *), |
@@ -1823,6 +2042,8 @@ repeat: | |||
1823 | if (err < 0) { | 2042 | if (err < 0) { |
1824 | page_cache_release(page); | 2043 | page_cache_release(page); |
1825 | page = ERR_PTR(err); | 2044 | page = ERR_PTR(err); |
2045 | } else { | ||
2046 | page = wait_on_page_read(page); | ||
1826 | } | 2047 | } |
1827 | } | 2048 | } |
1828 | return page; | 2049 | return page; |
@@ -1859,6 +2080,10 @@ retry: | |||
1859 | if (err < 0) { | 2080 | if (err < 0) { |
1860 | page_cache_release(page); | 2081 | page_cache_release(page); |
1861 | return ERR_PTR(err); | 2082 | return ERR_PTR(err); |
2083 | } else { | ||
2084 | page = wait_on_page_read(page); | ||
2085 | if (IS_ERR(page)) | ||
2086 | return page; | ||
1862 | } | 2087 | } |
1863 | out: | 2088 | out: |
1864 | mark_page_accessed(page); | 2089 | mark_page_accessed(page); |
@@ -1866,40 +2091,25 @@ out: | |||
1866 | } | 2091 | } |
1867 | 2092 | ||
1868 | /** | 2093 | /** |
1869 | * read_cache_page_async - read into page cache, fill it if needed | 2094 | * read_cache_page - read into page cache, fill it if needed |
1870 | * @mapping: the page's address_space | 2095 | * @mapping: the page's address_space |
1871 | * @index: the page index | 2096 | * @index: the page index |
1872 | * @filler: function to perform the read | 2097 | * @filler: function to perform the read |
1873 | * @data: first arg to filler(data, page) function, often left as NULL | 2098 | * @data: first arg to filler(data, page) function, often left as NULL |
1874 | * | 2099 | * |
1875 | * Same as read_cache_page, but don't wait for page to become unlocked | ||
1876 | * after submitting it to the filler. | ||
1877 | * | ||
1878 | * Read into the page cache. If a page already exists, and PageUptodate() is | 2100 | * Read into the page cache. If a page already exists, and PageUptodate() is |
1879 | * not set, try to fill the page but don't wait for it to become unlocked. | 2101 | * not set, try to fill the page and wait for it to become unlocked. |
1880 | * | 2102 | * |
1881 | * If the page does not get brought uptodate, return -EIO. | 2103 | * If the page does not get brought uptodate, return -EIO. |
1882 | */ | 2104 | */ |
1883 | struct page *read_cache_page_async(struct address_space *mapping, | 2105 | struct page *read_cache_page(struct address_space *mapping, |
1884 | pgoff_t index, | 2106 | pgoff_t index, |
1885 | int (*filler)(void *, struct page *), | 2107 | int (*filler)(void *, struct page *), |
1886 | void *data) | 2108 | void *data) |
1887 | { | 2109 | { |
1888 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); | 2110 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); |
1889 | } | 2111 | } |
1890 | EXPORT_SYMBOL(read_cache_page_async); | 2112 | EXPORT_SYMBOL(read_cache_page); |
1891 | |||
1892 | static struct page *wait_on_page_read(struct page *page) | ||
1893 | { | ||
1894 | if (!IS_ERR(page)) { | ||
1895 | wait_on_page_locked(page); | ||
1896 | if (!PageUptodate(page)) { | ||
1897 | page_cache_release(page); | ||
1898 | page = ERR_PTR(-EIO); | ||
1899 | } | ||
1900 | } | ||
1901 | return page; | ||
1902 | } | ||
1903 | 2113 | ||
1904 | /** | 2114 | /** |
1905 | * read_cache_page_gfp - read into page cache, using specified page allocation flags. | 2115 | * read_cache_page_gfp - read into page cache, using specified page allocation flags. |
@@ -1918,31 +2128,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, | |||
1918 | { | 2128 | { |
1919 | filler_t *filler = (filler_t *)mapping->a_ops->readpage; | 2129 | filler_t *filler = (filler_t *)mapping->a_ops->readpage; |
1920 | 2130 | ||
1921 | return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); | 2131 | return do_read_cache_page(mapping, index, filler, NULL, gfp); |
1922 | } | 2132 | } |
1923 | EXPORT_SYMBOL(read_cache_page_gfp); | 2133 | EXPORT_SYMBOL(read_cache_page_gfp); |
1924 | 2134 | ||
1925 | /** | ||
1926 | * read_cache_page - read into page cache, fill it if needed | ||
1927 | * @mapping: the page's address_space | ||
1928 | * @index: the page index | ||
1929 | * @filler: function to perform the read | ||
1930 | * @data: first arg to filler(data, page) function, often left as NULL | ||
1931 | * | ||
1932 | * Read into the page cache. If a page already exists, and PageUptodate() is | ||
1933 | * not set, try to fill the page then wait for it to become unlocked. | ||
1934 | * | ||
1935 | * If the page does not get brought uptodate, return -EIO. | ||
1936 | */ | ||
1937 | struct page *read_cache_page(struct address_space *mapping, | ||
1938 | pgoff_t index, | ||
1939 | int (*filler)(void *, struct page *), | ||
1940 | void *data) | ||
1941 | { | ||
1942 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); | ||
1943 | } | ||
1944 | EXPORT_SYMBOL(read_cache_page); | ||
1945 | |||
1946 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, | 2135 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
1947 | const struct iovec *iov, size_t base, size_t bytes) | 2136 | const struct iovec *iov, size_t base, size_t bytes) |
1948 | { | 2137 | { |
@@ -1976,7 +2165,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, | |||
1976 | char *kaddr; | 2165 | char *kaddr; |
1977 | size_t copied; | 2166 | size_t copied; |
1978 | 2167 | ||
1979 | BUG_ON(!in_atomic()); | ||
1980 | kaddr = kmap_atomic(page); | 2168 | kaddr = kmap_atomic(page); |
1981 | if (likely(i->nr_segs == 1)) { | 2169 | if (likely(i->nr_segs == 1)) { |
1982 | int left; | 2170 | int left; |
@@ -2186,7 +2374,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, | |||
2186 | { | 2374 | { |
2187 | const struct address_space_operations *aops = mapping->a_ops; | 2375 | const struct address_space_operations *aops = mapping->a_ops; |
2188 | 2376 | ||
2189 | mark_page_accessed(page); | ||
2190 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); | 2377 | return aops->write_end(file, mapping, pos, len, copied, page, fsdata); |
2191 | } | 2378 | } |
2192 | EXPORT_SYMBOL(pagecache_write_end); | 2379 | EXPORT_SYMBOL(pagecache_write_end); |
@@ -2268,34 +2455,18 @@ EXPORT_SYMBOL(generic_file_direct_write); | |||
2268 | struct page *grab_cache_page_write_begin(struct address_space *mapping, | 2455 | struct page *grab_cache_page_write_begin(struct address_space *mapping, |
2269 | pgoff_t index, unsigned flags) | 2456 | pgoff_t index, unsigned flags) |
2270 | { | 2457 | { |
2271 | int status; | ||
2272 | gfp_t gfp_mask; | ||
2273 | struct page *page; | 2458 | struct page *page; |
2274 | gfp_t gfp_notmask = 0; | 2459 | int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; |
2275 | 2460 | ||
2276 | gfp_mask = mapping_gfp_mask(mapping); | ||
2277 | if (mapping_cap_account_dirty(mapping)) | ||
2278 | gfp_mask |= __GFP_WRITE; | ||
2279 | if (flags & AOP_FLAG_NOFS) | 2461 | if (flags & AOP_FLAG_NOFS) |
2280 | gfp_notmask = __GFP_FS; | 2462 | fgp_flags |= FGP_NOFS; |
2281 | repeat: | 2463 | |
2282 | page = find_lock_page(mapping, index); | 2464 | page = pagecache_get_page(mapping, index, fgp_flags, |
2465 | mapping_gfp_mask(mapping), | ||
2466 | GFP_KERNEL); | ||
2283 | if (page) | 2467 | if (page) |
2284 | goto found; | 2468 | wait_for_stable_page(page); |
2285 | 2469 | ||
2286 | page = __page_cache_alloc(gfp_mask & ~gfp_notmask); | ||
2287 | if (!page) | ||
2288 | return NULL; | ||
2289 | status = add_to_page_cache_lru(page, mapping, index, | ||
2290 | GFP_KERNEL & ~gfp_notmask); | ||
2291 | if (unlikely(status)) { | ||
2292 | page_cache_release(page); | ||
2293 | if (status == -EEXIST) | ||
2294 | goto repeat; | ||
2295 | return NULL; | ||
2296 | } | ||
2297 | found: | ||
2298 | wait_for_stable_page(page); | ||
2299 | return page; | 2470 | return page; |
2300 | } | 2471 | } |
2301 | EXPORT_SYMBOL(grab_cache_page_write_begin); | 2472 | EXPORT_SYMBOL(grab_cache_page_write_begin); |
@@ -2344,18 +2515,15 @@ again: | |||
2344 | 2515 | ||
2345 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | 2516 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, |
2346 | &page, &fsdata); | 2517 | &page, &fsdata); |
2347 | if (unlikely(status)) | 2518 | if (unlikely(status < 0)) |
2348 | break; | 2519 | break; |
2349 | 2520 | ||
2350 | if (mapping_writably_mapped(mapping)) | 2521 | if (mapping_writably_mapped(mapping)) |
2351 | flush_dcache_page(page); | 2522 | flush_dcache_page(page); |
2352 | 2523 | ||
2353 | pagefault_disable(); | ||
2354 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | 2524 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); |
2355 | pagefault_enable(); | ||
2356 | flush_dcache_page(page); | 2525 | flush_dcache_page(page); |
2357 | 2526 | ||
2358 | mark_page_accessed(page); | ||
2359 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | 2527 | status = a_ops->write_end(file, mapping, pos, bytes, copied, |
2360 | page, fsdata); | 2528 | page, fsdata); |
2361 | if (unlikely(status < 0)) | 2529 | if (unlikely(status < 0)) |
diff --git a/mm/fremap.c b/mm/fremap.c index bbc4d660221a..34feba60a17e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -23,28 +23,44 @@ | |||
23 | 23 | ||
24 | #include "internal.h" | 24 | #include "internal.h" |
25 | 25 | ||
26 | static int mm_counter(struct page *page) | ||
27 | { | ||
28 | return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; | ||
29 | } | ||
30 | |||
26 | static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | 31 | static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, |
27 | unsigned long addr, pte_t *ptep) | 32 | unsigned long addr, pte_t *ptep) |
28 | { | 33 | { |
29 | pte_t pte = *ptep; | 34 | pte_t pte = *ptep; |
35 | struct page *page; | ||
36 | swp_entry_t entry; | ||
30 | 37 | ||
31 | if (pte_present(pte)) { | 38 | if (pte_present(pte)) { |
32 | struct page *page; | ||
33 | |||
34 | flush_cache_page(vma, addr, pte_pfn(pte)); | 39 | flush_cache_page(vma, addr, pte_pfn(pte)); |
35 | pte = ptep_clear_flush(vma, addr, ptep); | 40 | pte = ptep_clear_flush(vma, addr, ptep); |
36 | page = vm_normal_page(vma, addr, pte); | 41 | page = vm_normal_page(vma, addr, pte); |
37 | if (page) { | 42 | if (page) { |
38 | if (pte_dirty(pte)) | 43 | if (pte_dirty(pte)) |
39 | set_page_dirty(page); | 44 | set_page_dirty(page); |
45 | update_hiwater_rss(mm); | ||
46 | dec_mm_counter(mm, mm_counter(page)); | ||
40 | page_remove_rmap(page); | 47 | page_remove_rmap(page); |
41 | page_cache_release(page); | 48 | page_cache_release(page); |
49 | } | ||
50 | } else { /* zap_pte() is not called when pte_none() */ | ||
51 | if (!pte_file(pte)) { | ||
42 | update_hiwater_rss(mm); | 52 | update_hiwater_rss(mm); |
43 | dec_mm_counter(mm, MM_FILEPAGES); | 53 | entry = pte_to_swp_entry(pte); |
54 | if (non_swap_entry(entry)) { | ||
55 | if (is_migration_entry(entry)) { | ||
56 | page = migration_entry_to_page(entry); | ||
57 | dec_mm_counter(mm, mm_counter(page)); | ||
58 | } | ||
59 | } else { | ||
60 | free_swap_and_cache(entry); | ||
61 | dec_mm_counter(mm, MM_SWAPENTS); | ||
62 | } | ||
44 | } | 63 | } |
45 | } else { | ||
46 | if (!pte_file(pte)) | ||
47 | free_swap_and_cache(pte_to_swp_entry(pte)); | ||
48 | pte_clear_not_present_full(mm, addr, ptep, 0); | 64 | pte_clear_not_present_full(mm, addr, ptep, 0); |
49 | } | 65 | } |
50 | } | 66 | } |
diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdcb3197..c30eec536f03 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); | |||
327 | 327 | ||
328 | static unsigned long __frontswap_curr_pages(void) | 328 | static unsigned long __frontswap_curr_pages(void) |
329 | { | 329 | { |
330 | int type; | ||
331 | unsigned long totalpages = 0; | 330 | unsigned long totalpages = 0; |
332 | struct swap_info_struct *si = NULL; | 331 | struct swap_info_struct *si = NULL; |
333 | 332 | ||
334 | assert_spin_locked(&swap_lock); | 333 | assert_spin_locked(&swap_lock); |
335 | for (type = swap_list.head; type >= 0; type = si->next) { | 334 | plist_for_each_entry(si, &swap_active_head, list) |
336 | si = swap_info[type]; | ||
337 | totalpages += atomic_read(&si->frontswap_pages); | 335 | totalpages += atomic_read(&si->frontswap_pages); |
338 | } | ||
339 | return totalpages; | 336 | return totalpages; |
340 | } | 337 | } |
341 | 338 | ||
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
347 | int si_frontswap_pages; | 344 | int si_frontswap_pages; |
348 | unsigned long total_pages_to_unuse = total; | 345 | unsigned long total_pages_to_unuse = total; |
349 | unsigned long pages = 0, pages_to_unuse = 0; | 346 | unsigned long pages = 0, pages_to_unuse = 0; |
350 | int type; | ||
351 | 347 | ||
352 | assert_spin_locked(&swap_lock); | 348 | assert_spin_locked(&swap_lock); |
353 | for (type = swap_list.head; type >= 0; type = si->next) { | 349 | plist_for_each_entry(si, &swap_active_head, list) { |
354 | si = swap_info[type]; | ||
355 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | 350 | si_frontswap_pages = atomic_read(&si->frontswap_pages); |
356 | if (total_pages_to_unuse < si_frontswap_pages) { | 351 | if (total_pages_to_unuse < si_frontswap_pages) { |
357 | pages = pages_to_unuse = total_pages_to_unuse; | 352 | pages = pages_to_unuse = total_pages_to_unuse; |
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
366 | } | 361 | } |
367 | vm_unacct_memory(pages); | 362 | vm_unacct_memory(pages); |
368 | *unused = pages_to_unuse; | 363 | *unused = pages_to_unuse; |
369 | *swapid = type; | 364 | *swapid = si->type; |
370 | ret = 0; | 365 | ret = 0; |
371 | break; | 366 | break; |
372 | } | 367 | } |
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) | |||
413 | /* | 408 | /* |
414 | * we don't want to hold swap_lock while doing a very | 409 | * we don't want to hold swap_lock while doing a very |
415 | * lengthy try_to_unuse, but swap_list may change | 410 | * lengthy try_to_unuse, but swap_list may change |
416 | * so restart scan from swap_list.head each time | 411 | * so restart scan from swap_active_head each time |
417 | */ | 412 | */ |
418 | spin_lock(&swap_lock); | 413 | spin_lock(&swap_lock); |
419 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | 414 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 389973fd6bb7..2ee53749eb48 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -758,14 +758,6 @@ static inline struct page *alloc_hugepage_vma(int defrag, | |||
758 | HPAGE_PMD_ORDER, vma, haddr, nd); | 758 | HPAGE_PMD_ORDER, vma, haddr, nd); |
759 | } | 759 | } |
760 | 760 | ||
761 | #ifndef CONFIG_NUMA | ||
762 | static inline struct page *alloc_hugepage(int defrag) | ||
763 | { | ||
764 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | ||
765 | HPAGE_PMD_ORDER); | ||
766 | } | ||
767 | #endif | ||
768 | |||
769 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 761 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
770 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 762 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
771 | struct page *zero_page) | 763 | struct page *zero_page) |
@@ -2197,7 +2189,58 @@ static void khugepaged_alloc_sleep(void) | |||
2197 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | 2189 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
2198 | } | 2190 | } |
2199 | 2191 | ||
2192 | static int khugepaged_node_load[MAX_NUMNODES]; | ||
2193 | |||
2194 | static bool khugepaged_scan_abort(int nid) | ||
2195 | { | ||
2196 | int i; | ||
2197 | |||
2198 | /* | ||
2199 | * If zone_reclaim_mode is disabled, then no extra effort is made to | ||
2200 | * allocate memory locally. | ||
2201 | */ | ||
2202 | if (!zone_reclaim_mode) | ||
2203 | return false; | ||
2204 | |||
2205 | /* If there is a count for this node already, it must be acceptable */ | ||
2206 | if (khugepaged_node_load[nid]) | ||
2207 | return false; | ||
2208 | |||
2209 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
2210 | if (!khugepaged_node_load[i]) | ||
2211 | continue; | ||
2212 | if (node_distance(nid, i) > RECLAIM_DISTANCE) | ||
2213 | return true; | ||
2214 | } | ||
2215 | return false; | ||
2216 | } | ||
2217 | |||
2200 | #ifdef CONFIG_NUMA | 2218 | #ifdef CONFIG_NUMA |
2219 | static int khugepaged_find_target_node(void) | ||
2220 | { | ||
2221 | static int last_khugepaged_target_node = NUMA_NO_NODE; | ||
2222 | int nid, target_node = 0, max_value = 0; | ||
2223 | |||
2224 | /* find first node with max normal pages hit */ | ||
2225 | for (nid = 0; nid < MAX_NUMNODES; nid++) | ||
2226 | if (khugepaged_node_load[nid] > max_value) { | ||
2227 | max_value = khugepaged_node_load[nid]; | ||
2228 | target_node = nid; | ||
2229 | } | ||
2230 | |||
2231 | /* do some balancing if several nodes have the same hit record */ |||
2232 | if (target_node <= last_khugepaged_target_node) | ||
2233 | for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; | ||
2234 | nid++) | ||
2235 | if (max_value == khugepaged_node_load[nid]) { | ||
2236 | target_node = nid; | ||
2237 | break; | ||
2238 | } | ||
2239 | |||
2240 | last_khugepaged_target_node = target_node; | ||
2241 | return target_node; | ||
2242 | } | ||
2243 | |||
2201 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | 2244 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
2202 | { | 2245 | { |
2203 | if (IS_ERR(*hpage)) { | 2246 | if (IS_ERR(*hpage)) { |
@@ -2231,9 +2274,8 @@ static struct page | |||
2231 | * mmap_sem in read mode is good idea also to allow greater | 2274 | * mmap_sem in read mode is good idea also to allow greater |
2232 | * scalability. | 2275 | * scalability. |
2233 | */ | 2276 | */ |
2234 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 2277 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( |
2235 | node, __GFP_OTHER_NODE); | 2278 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); |
2236 | |||
2237 | /* | 2279 | /* |
2238 | * After allocating the hugepage, release the mmap_sem read lock in | 2280 | * After allocating the hugepage, release the mmap_sem read lock in |
2239 | * preparation for taking it in write mode. | 2281 | * preparation for taking it in write mode. |
@@ -2249,6 +2291,17 @@ static struct page | |||
2249 | return *hpage; | 2291 | return *hpage; |
2250 | } | 2292 | } |
2251 | #else | 2293 | #else |
2294 | static int khugepaged_find_target_node(void) | ||
2295 | { | ||
2296 | return 0; | ||
2297 | } | ||
2298 | |||
2299 | static inline struct page *alloc_hugepage(int defrag) | ||
2300 | { | ||
2301 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | ||
2302 | HPAGE_PMD_ORDER); | ||
2303 | } | ||
2304 | |||
2252 | static struct page *khugepaged_alloc_hugepage(bool *wait) | 2305 | static struct page *khugepaged_alloc_hugepage(bool *wait) |
2253 | { | 2306 | { |
2254 | struct page *hpage; | 2307 | struct page *hpage; |
@@ -2455,6 +2508,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2455 | if (pmd_trans_huge(*pmd)) | 2508 | if (pmd_trans_huge(*pmd)) |
2456 | goto out; | 2509 | goto out; |
2457 | 2510 | ||
2511 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); | ||
2458 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2512 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2459 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | 2513 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; |
2460 | _pte++, _address += PAGE_SIZE) { | 2514 | _pte++, _address += PAGE_SIZE) { |
@@ -2471,12 +2525,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2471 | if (unlikely(!page)) | 2525 | if (unlikely(!page)) |
2472 | goto out_unmap; | 2526 | goto out_unmap; |
2473 | /* | 2527 | /* |
2474 | * Chose the node of the first page. This could | 2528 | * Record which node the original page is from and save this |
2475 | * be more sophisticated and look at more pages, | 2529 | * information to khugepaged_node_load[]. |
2476 | * but isn't for now. | 2530 | * Khugepaged will allocate the hugepage from the node that has
2531 | * the max hit record. | ||
2477 | */ | 2532 | */ |
2478 | if (node == NUMA_NO_NODE) | 2533 | node = page_to_nid(page); |
2479 | node = page_to_nid(page); | 2534 | if (khugepaged_scan_abort(node)) |
2535 | goto out_unmap; | ||
2536 | khugepaged_node_load[node]++; | ||
2480 | VM_BUG_ON(PageCompound(page)); | 2537 | VM_BUG_ON(PageCompound(page)); |
2481 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2538 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
2482 | goto out_unmap; | 2539 | goto out_unmap; |
@@ -2491,9 +2548,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2491 | ret = 1; | 2548 | ret = 1; |
2492 | out_unmap: | 2549 | out_unmap: |
2493 | pte_unmap_unlock(pte, ptl); | 2550 | pte_unmap_unlock(pte, ptl); |
2494 | if (ret) | 2551 | if (ret) { |
2552 | node = khugepaged_find_target_node(); | ||
2495 | /* collapse_huge_page will return with the mmap_sem released */ | 2553 | /* collapse_huge_page will return with the mmap_sem released */ |
2496 | collapse_huge_page(mm, address, hpage, vma, node); | 2554 | collapse_huge_page(mm, address, hpage, vma, node); |
2555 | } | ||
2497 | out: | 2556 | out: |
2498 | return ret; | 2557 | return ret; |
2499 | } | 2558 | } |
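
To make the new khugepaged placement policy concrete: each scan of a PMD
range zeroes khugepaged_node_load[], counts how many of the HPAGE_PMD_NR
base pages sit on each node, aborts early via khugepaged_scan_abort() if
pages span nodes farther apart than RECLAIM_DISTANCE while
zone_reclaim_mode is set, and finally collapses onto the node with the
highest count, round-robining on ties. A worked example with hypothetical
two-node counts:

	/* After scanning one 2MB range on a two-node machine:
	 *   khugepaged_node_load[0] = 300;
	 *   khugepaged_node_load[1] = 212;
	 * With node_distance(0, 1) <= RECLAIM_DISTANCE the scan completes,
	 * khugepaged_find_target_node() returns node 0, and the collapsed
	 * hugepage is allocated with alloc_pages_exact_node(0, ...). */
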
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f80b17106d24..c33d8a65298c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -574,7 +574,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
574 | goto err; | 574 | goto err; |
575 | 575 | ||
576 | retry_cpuset: | 576 | retry_cpuset: |
577 | cpuset_mems_cookie = get_mems_allowed(); | 577 | cpuset_mems_cookie = read_mems_allowed_begin(); |
578 | zonelist = huge_zonelist(vma, address, | 578 | zonelist = huge_zonelist(vma, address, |
579 | htlb_alloc_mask(h), &mpol, &nodemask); | 579 | htlb_alloc_mask(h), &mpol, &nodemask); |
580 | 580 | ||
@@ -596,7 +596,7 @@ retry_cpuset: | |||
596 | } | 596 | } |
597 | 597 | ||
598 | mpol_cond_put(mpol); | 598 | mpol_cond_put(mpol); |
599 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 599 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
600 | goto retry_cpuset; | 600 | goto retry_cpuset; |
601 | return page; | 601 | return page; |
602 | 602 | ||
@@ -2114,6 +2114,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
2114 | unsigned long tmp; | 2114 | unsigned long tmp; |
2115 | int ret; | 2115 | int ret; |
2116 | 2116 | ||
2117 | if (!hugepages_supported()) | ||
2118 | return -ENOTSUPP; | ||
2119 | |||
2117 | tmp = h->max_huge_pages; | 2120 | tmp = h->max_huge_pages; |
2118 | 2121 | ||
2119 | if (write && h->order >= MAX_ORDER) | 2122 | if (write && h->order >= MAX_ORDER) |
@@ -2167,6 +2170,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
2167 | unsigned long tmp; | 2170 | unsigned long tmp; |
2168 | int ret; | 2171 | int ret; |
2169 | 2172 | ||
2173 | if (!hugepages_supported()) | ||
2174 | return -ENOTSUPP; | ||
2175 | |||
2170 | tmp = h->nr_overcommit_huge_pages; | 2176 | tmp = h->nr_overcommit_huge_pages; |
2171 | 2177 | ||
2172 | if (write && h->order >= MAX_ORDER) | 2178 | if (write && h->order >= MAX_ORDER) |
@@ -2192,6 +2198,8 @@ out: | |||
2192 | void hugetlb_report_meminfo(struct seq_file *m) | 2198 | void hugetlb_report_meminfo(struct seq_file *m) |
2193 | { | 2199 | { |
2194 | struct hstate *h = &default_hstate; | 2200 | struct hstate *h = &default_hstate; |
2201 | if (!hugepages_supported()) | ||
2202 | return; | ||
2195 | seq_printf(m, | 2203 | seq_printf(m, |
2196 | "HugePages_Total: %5lu\n" | 2204 | "HugePages_Total: %5lu\n" |
2197 | "HugePages_Free: %5lu\n" | 2205 | "HugePages_Free: %5lu\n" |
@@ -2208,6 +2216,8 @@ void hugetlb_report_meminfo(struct seq_file *m) | |||
2208 | int hugetlb_report_node_meminfo(int nid, char *buf) | 2216 | int hugetlb_report_node_meminfo(int nid, char *buf) |
2209 | { | 2217 | { |
2210 | struct hstate *h = &default_hstate; | 2218 | struct hstate *h = &default_hstate; |
2219 | if (!hugepages_supported()) | ||
2220 | return 0; | ||
2211 | return sprintf(buf, | 2221 | return sprintf(buf, |
2212 | "Node %d HugePages_Total: %5u\n" | 2222 | "Node %d HugePages_Total: %5u\n" |
2213 | "Node %d HugePages_Free: %5u\n" | 2223 | "Node %d HugePages_Free: %5u\n" |
@@ -2222,6 +2232,9 @@ void hugetlb_show_meminfo(void) | |||
2222 | struct hstate *h; | 2232 | struct hstate *h; |
2223 | int nid; | 2233 | int nid; |
2224 | 2234 | ||
2235 | if (!hugepages_supported()) | ||
2236 | return; | ||
2237 | |||
2225 | for_each_node_state(nid, N_MEMORY) | 2238 | for_each_node_state(nid, N_MEMORY) |
2226 | for_each_hstate(h) | 2239 | for_each_hstate(h) |
2227 | pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", | 2240 | pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", |
diff --git a/mm/internal.h b/mm/internal.h index fdddbc83ac5f..d610f7ce4e9c 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -11,6 +11,7 @@ | |||
11 | #ifndef __MM_INTERNAL_H | 11 | #ifndef __MM_INTERNAL_H |
12 | #define __MM_INTERNAL_H | 12 | #define __MM_INTERNAL_H |
13 | 13 | ||
14 | #include <linux/fs.h> | ||
14 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
15 | 16 | ||
16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 17 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v) | |||
21 | atomic_set(&page->_count, v); | 22 | atomic_set(&page->_count, v); |
22 | } | 23 | } |
23 | 24 | ||
25 | extern int __do_page_cache_readahead(struct address_space *mapping, | ||
26 | struct file *filp, pgoff_t offset, unsigned long nr_to_read, | ||
27 | unsigned long lookahead_size); | ||
28 | |||
29 | /* | ||
30 | * Submit IO for the read-ahead request in file_ra_state. | ||
31 | */ | ||
32 | static inline unsigned long ra_submit(struct file_ra_state *ra, | ||
33 | struct address_space *mapping, struct file *filp) | ||
34 | { | ||
35 | return __do_page_cache_readahead(mapping, filp, | ||
36 | ra->start, ra->size, ra->async_size); | ||
37 | } | ||
38 | |||
24 | /* | 39 | /* |
25 | * Turn a non-refcounted page (->_count == 0) into refcounted with | 40 | * Turn a non-refcounted page (->_count == 0) into refcounted with |
26 | * a count of one. | 41 | * a count of one. |
@@ -120,7 +135,7 @@ struct compact_control { | |||
120 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 135 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
121 | unsigned long free_pfn; /* isolate_freepages search base */ | 136 | unsigned long free_pfn; /* isolate_freepages search base */ |
122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 137 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
123 | bool sync; /* Synchronous migration */ | 138 | enum migrate_mode mode; /* Async or sync migration mode */ |
124 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ | 139 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
125 | bool finished_update_free; /* True when the zone cached pfns are | 140 | bool finished_update_free; /* True when the zone cached pfns are |
126 | * no longer being updated | 141 | * no longer being updated |
@@ -130,7 +145,10 @@ struct compact_control { | |||
130 | int order; /* order a direct compactor needs */ | 145 | int order; /* order a direct compactor needs */ |
131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 146 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
132 | struct zone *zone; | 147 | struct zone *zone; |
133 | bool contended; /* True if a lock was contended */ | 148 | bool contended; /* True if a lock was contended, or |
149 | * need_resched() true during async | ||
150 | * compaction | ||
151 | */ | ||
134 | }; | 152 | }; |
135 | 153 | ||
136 | unsigned long | 154 | unsigned long |
diff --git a/mm/madvise.c b/mm/madvise.c index 539eeb96b323..a402f8fdc68e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -195,7 +195,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, | |||
195 | for (; start < end; start += PAGE_SIZE) { | 195 | for (; start < end; start += PAGE_SIZE) { |
196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
197 | 197 | ||
198 | page = find_get_page(mapping, index); | 198 | page = find_get_entry(mapping, index); |
199 | if (!radix_tree_exceptional_entry(page)) { | 199 | if (!radix_tree_exceptional_entry(page)) { |
200 | if (page) | 200 | if (page) |
201 | page_cache_release(page); | 201 | page_cache_release(page); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6e3f9c39bc22..4ab233d4714a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1554,7 +1554,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1554 | 1554 | ||
1555 | /* Keep page count to indicate a given hugepage is isolated. */ | 1555 | /* Keep page count to indicate a given hugepage is isolated. */ |
1556 | list_move(&hpage->lru, &pagelist); | 1556 | list_move(&hpage->lru, &pagelist); |
1557 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1557 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1558 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1558 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1559 | if (ret) { | 1559 | if (ret) { |
1560 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1560 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
@@ -1635,7 +1635,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1635 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1635 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
1636 | page_is_file_cache(page)); | 1636 | page_is_file_cache(page)); |
1637 | list_add(&page->lru, &pagelist); | 1637 | list_add(&page->lru, &pagelist); |
1638 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1638 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1639 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1639 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1640 | if (ret) { | 1640 | if (ret) { |
1641 | putback_lru_pages(&pagelist); | 1641 | putback_lru_pages(&pagelist); |
diff --git a/mm/memory.c b/mm/memory.c index 99fe3aa1035c..b5901068495f 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -878,7 +878,7 @@ out_set_pte: | |||
878 | return 0; | 878 | return 0; |
879 | } | 879 | } |
880 | 880 | ||
881 | int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 881 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
882 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 882 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
883 | unsigned long addr, unsigned long end) | 883 | unsigned long addr, unsigned long end) |
884 | { | 884 | { |
@@ -3698,7 +3698,7 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3698 | pte_t entry; | 3698 | pte_t entry; |
3699 | spinlock_t *ptl; | 3699 | spinlock_t *ptl; |
3700 | 3700 | ||
3701 | entry = *pte; | 3701 | entry = ACCESS_ONCE(*pte); |
3702 | if (!pte_present(entry)) { | 3702 | if (!pte_present(entry)) { |
3703 | if (pte_none(entry)) { | 3703 | if (pte_none(entry)) { |
3704 | if (vma->vm_ops) { | 3704 | if (vma->vm_ops) { |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ed85fe3870e2..d31730564617 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1321,7 +1321,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1321 | * alloc_migrate_target should be improooooved!! | 1321 | * alloc_migrate_target should be improooooved!! |
1322 | * migrate_pages returns # of failed pages. | 1322 | * migrate_pages returns # of failed pages. |
1323 | */ | 1323 | */ |
1324 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1324 | ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, |
1325 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); | 1325 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1326 | if (ret) | 1326 | if (ret) |
1327 | putback_movable_pages(&source); | 1327 | putback_movable_pages(&source); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0437f3595b32..cc61c7a7d6a1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1060,7 +1060,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
1060 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 1060 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
1061 | 1061 | ||
1062 | if (!list_empty(&pagelist)) { | 1062 | if (!list_empty(&pagelist)) { |
1063 | err = migrate_pages(&pagelist, new_node_page, dest, | 1063 | err = migrate_pages(&pagelist, new_node_page, NULL, dest, |
1064 | MIGRATE_SYNC, MR_SYSCALL); | 1064 | MIGRATE_SYNC, MR_SYSCALL); |
1065 | if (err) | 1065 | if (err) |
1066 | putback_movable_pages(&pagelist); | 1066 | putback_movable_pages(&pagelist); |
@@ -1306,7 +1306,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1306 | 1306 | ||
1307 | if (!list_empty(&pagelist)) { | 1307 | if (!list_empty(&pagelist)) { |
1308 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1308 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
1309 | nr_failed = migrate_pages(&pagelist, new_page, | 1309 | nr_failed = migrate_pages(&pagelist, new_page, NULL, |
1310 | start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); | 1310 | start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1311 | if (nr_failed) | 1311 | if (nr_failed) |
1312 | putback_movable_pages(&pagelist); | 1312 | putback_movable_pages(&pagelist); |
@@ -1873,7 +1873,7 @@ int node_random(const nodemask_t *maskp) | |||
1873 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's | 1873 | * If the effective policy is 'BIND, returns a pointer to the mempolicy's |
1874 | * @nodemask for filtering the zonelist. | 1874 | * @nodemask for filtering the zonelist. |
1875 | * | 1875 | * |
1876 | * Must be protected by get_mems_allowed() | 1876 | * Must be protected by read_mems_allowed_begin() |
1877 | */ | 1877 | */ |
1878 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | 1878 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, |
1879 | gfp_t gfp_flags, struct mempolicy **mpol, | 1879 | gfp_t gfp_flags, struct mempolicy **mpol, |
@@ -2037,7 +2037,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
2037 | 2037 | ||
2038 | retry_cpuset: | 2038 | retry_cpuset: |
2039 | pol = get_vma_policy(current, vma, addr); | 2039 | pol = get_vma_policy(current, vma, addr); |
2040 | cpuset_mems_cookie = get_mems_allowed(); | 2040 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2041 | 2041 | ||
2042 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 2042 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
2043 | unsigned nid; | 2043 | unsigned nid; |
@@ -2045,7 +2045,7 @@ retry_cpuset: | |||
2045 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 2045 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
2046 | mpol_cond_put(pol); | 2046 | mpol_cond_put(pol); |
2047 | page = alloc_page_interleave(gfp, order, nid); | 2047 | page = alloc_page_interleave(gfp, order, nid); |
2048 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2048 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2049 | goto retry_cpuset; | 2049 | goto retry_cpuset; |
2050 | 2050 | ||
2051 | return page; | 2051 | return page; |
@@ -2055,7 +2055,7 @@ retry_cpuset: | |||
2055 | policy_nodemask(gfp, pol)); | 2055 | policy_nodemask(gfp, pol)); |
2056 | if (unlikely(mpol_needs_cond_ref(pol))) | 2056 | if (unlikely(mpol_needs_cond_ref(pol))) |
2057 | __mpol_put(pol); | 2057 | __mpol_put(pol); |
2058 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2058 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2059 | goto retry_cpuset; | 2059 | goto retry_cpuset; |
2060 | return page; | 2060 | return page; |
2061 | } | 2061 | } |
@@ -2089,7 +2089,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
2089 | pol = &default_policy; | 2089 | pol = &default_policy; |
2090 | 2090 | ||
2091 | retry_cpuset: | 2091 | retry_cpuset: |
2092 | cpuset_mems_cookie = get_mems_allowed(); | 2092 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2093 | 2093 | ||
2094 | /* | 2094 | /* |
2095 | * No reference counting needed for current->mempolicy | 2095 | * No reference counting needed for current->mempolicy |
@@ -2102,7 +2102,7 @@ retry_cpuset: | |||
2102 | policy_zonelist(gfp, pol, numa_node_id()), | 2102 | policy_zonelist(gfp, pol, numa_node_id()), |
2103 | policy_nodemask(gfp, pol)); | 2103 | policy_nodemask(gfp, pol)); |
2104 | 2104 | ||
2105 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2105 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2106 | goto retry_cpuset; | 2106 | goto retry_cpuset; |
2107 | 2107 | ||
2108 | return page; | 2108 | return page; |
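
Throughout this file the get_mems_allowed()/put_mems_allowed() pair becomes read_mems_allowed_begin()/read_mems_allowed_retry(): a seqcount-style cookie is sampled before the allocation and, note the reordered test (!page first), is only rechecked when the allocation failed, so the common success path pays nothing. A hedged user-space sketch of the begin/retry shape; all names here are invented, not the kernel API:

#include <stdio.h>

/* Illustrative seqcount-style reader; a writer would bump 'seq'
 * around its update so racing readers notice and retry. */
static unsigned int seq;            /* changes whenever a writer runs */
static int protected_value = 42;

static unsigned int read_begin(void)  { return seq; }
static int read_retry(unsigned int c) { return seq != c; }

int main(void)
{
        unsigned int cookie;
        int val;

        do {
                cookie = read_begin();  /* sample before the work */
                val = protected_value;  /* the work: here, one read */
                /* retry only if a writer raced with us */
        } while (read_retry(cookie));

        printf("%d\n", val);
        return 0;
}
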
diff --git a/mm/migrate.c b/mm/migrate.c index e3cf71dd1288..96d4d814ae2f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -867,8 +867,9 @@ out: | |||
867 | * Obtain the lock on page, remove all ptes and migrate the page | 867 | * Obtain the lock on page, remove all ptes and migrate the page |
868 | * to the newly allocated page in newpage. | 868 | * to the newly allocated page in newpage. |
869 | */ | 869 | */ |
870 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 870 | static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, |
871 | struct page *page, int force, enum migrate_mode mode) | 871 | unsigned long private, struct page *page, int force, |
872 | enum migrate_mode mode) | ||
872 | { | 873 | { |
873 | int rc = 0; | 874 | int rc = 0; |
874 | int *result = NULL; | 875 | int *result = NULL; |
@@ -912,11 +913,18 @@ out: | |||
912 | page_is_file_cache(page)); | 913 | page_is_file_cache(page)); |
913 | putback_lru_page(page); | 914 | putback_lru_page(page); |
914 | } | 915 | } |
916 | |||
915 | /* | 917 | /* |
916 | * Move the new page to the LRU. If migration was not successful | 918 | * If migration was not successful and there's a freeing callback, use |
917 | * then this will free the page. | 919 | * it. Otherwise, putback_lru_page() will drop the reference grabbed |
920 | * during isolation. | ||
918 | */ | 921 | */ |
919 | putback_lru_page(newpage); | 922 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { |
923 | ClearPageSwapBacked(newpage); | ||
924 | put_new_page(newpage, private); | ||
925 | } else | ||
926 | putback_lru_page(newpage); | ||
927 | |||
920 | if (result) { | 928 | if (result) { |
921 | if (rc) | 929 | if (rc) |
922 | *result = rc; | 930 | *result = rc; |
@@ -945,8 +953,9 @@ out: | |||
945 | * will wait in the page fault for migration to complete. | 953 | * will wait in the page fault for migration to complete. |
946 | */ | 954 | */ |
947 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 955 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
948 | unsigned long private, struct page *hpage, | 956 | free_page_t put_new_page, unsigned long private, |
949 | int force, enum migrate_mode mode) | 957 | struct page *hpage, int force, |
958 | enum migrate_mode mode) | ||
950 | { | 959 | { |
951 | int rc = 0; | 960 | int rc = 0; |
952 | int *result = NULL; | 961 | int *result = NULL; |
@@ -982,20 +991,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
982 | if (!page_mapped(hpage)) | 991 | if (!page_mapped(hpage)) |
983 | rc = move_to_new_page(new_hpage, hpage, 1, mode); | 992 | rc = move_to_new_page(new_hpage, hpage, 1, mode); |
984 | 993 | ||
985 | if (rc) | 994 | if (rc != MIGRATEPAGE_SUCCESS) |
986 | remove_migration_ptes(hpage, hpage); | 995 | remove_migration_ptes(hpage, hpage); |
987 | 996 | ||
988 | if (anon_vma) | 997 | if (anon_vma) |
989 | put_anon_vma(anon_vma); | 998 | put_anon_vma(anon_vma); |
990 | 999 | ||
991 | if (!rc) | 1000 | if (rc == MIGRATEPAGE_SUCCESS) |
992 | hugetlb_cgroup_migrate(hpage, new_hpage); | 1001 | hugetlb_cgroup_migrate(hpage, new_hpage); |
993 | 1002 | ||
994 | unlock_page(hpage); | 1003 | unlock_page(hpage); |
995 | out: | 1004 | out: |
996 | if (rc != -EAGAIN) | 1005 | if (rc != -EAGAIN) |
997 | putback_active_hugepage(hpage); | 1006 | putback_active_hugepage(hpage); |
998 | put_page(new_hpage); | 1007 | |
1008 | /* | ||
1009 | * If migration was not successful and there's a freeing callback, use | ||
1010 | * it. Otherwise, put_page() will drop the reference grabbed during | ||
1011 | * isolation. | ||
1012 | */ | ||
1013 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) | ||
1014 | put_new_page(new_hpage, private); | ||
1015 | else | ||
1016 | put_page(new_hpage); | ||
1017 | |||
999 | if (result) { | 1018 | if (result) { |
1000 | if (rc) | 1019 | if (rc) |
1001 | *result = rc; | 1020 | *result = rc; |
@@ -1012,6 +1031,8 @@ out: | |||
1012 | * @from: The list of pages to be migrated. | 1031 | * @from: The list of pages to be migrated. |
1013 | * @get_new_page: The function used to allocate free pages to be used | 1032 | * @get_new_page: The function used to allocate free pages to be used |
1014 | * as the target of the page migration. | 1033 | * as the target of the page migration. |
1034 | * @put_new_page: The function used to free target pages if migration | ||
1035 | * fails, or NULL if no special handling is necessary. | ||
1015 | * @private: Private data to be passed on to get_new_page() | 1036 | * @private: Private data to be passed on to get_new_page() |
1016 | * @mode: The migration mode that specifies the constraints for | 1037 | * @mode: The migration mode that specifies the constraints for |
1017 | * page migration, if any. | 1038 | * page migration, if any. |
@@ -1025,7 +1046,8 @@ out: | |||
1025 | * Returns the number of pages that were not migrated, or an error code. | 1046 | * Returns the number of pages that were not migrated, or an error code. |
1026 | */ | 1047 | */ |
1027 | int migrate_pages(struct list_head *from, new_page_t get_new_page, | 1048 | int migrate_pages(struct list_head *from, new_page_t get_new_page, |
1028 | unsigned long private, enum migrate_mode mode, int reason) | 1049 | free_page_t put_new_page, unsigned long private, |
1050 | enum migrate_mode mode, int reason) | ||
1029 | { | 1051 | { |
1030 | int retry = 1; | 1052 | int retry = 1; |
1031 | int nr_failed = 0; | 1053 | int nr_failed = 0; |
@@ -1047,10 +1069,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1047 | 1069 | ||
1048 | if (PageHuge(page)) | 1070 | if (PageHuge(page)) |
1049 | rc = unmap_and_move_huge_page(get_new_page, | 1071 | rc = unmap_and_move_huge_page(get_new_page, |
1050 | private, page, pass > 2, mode); | 1072 | put_new_page, private, page, |
1073 | pass > 2, mode); | ||
1051 | else | 1074 | else |
1052 | rc = unmap_and_move(get_new_page, private, | 1075 | rc = unmap_and_move(get_new_page, put_new_page, |
1053 | page, pass > 2, mode); | 1076 | private, page, pass > 2, mode); |
1054 | 1077 | ||
1055 | switch(rc) { | 1078 | switch(rc) { |
1056 | case -ENOMEM: | 1079 | case -ENOMEM: |
@@ -1194,7 +1217,7 @@ set_status: | |||
1194 | 1217 | ||
1195 | err = 0; | 1218 | err = 0; |
1196 | if (!list_empty(&pagelist)) { | 1219 | if (!list_empty(&pagelist)) { |
1197 | err = migrate_pages(&pagelist, new_page_node, | 1220 | err = migrate_pages(&pagelist, new_page_node, NULL, |
1198 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); | 1221 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); |
1199 | if (err) | 1222 | if (err) |
1200 | putback_movable_pages(&pagelist); | 1223 | putback_movable_pages(&pagelist); |
@@ -1643,7 +1666,8 @@ int migrate_misplaced_page(struct page *page, int node) | |||
1643 | 1666 | ||
1644 | list_add(&page->lru, &migratepages); | 1667 | list_add(&page->lru, &migratepages); |
1645 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, | 1668 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1646 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); | 1669 | NULL, node, MIGRATE_ASYNC, |
1670 | MR_NUMA_MISPLACED); | ||
1647 | if (nr_remaining) { | 1671 | if (nr_remaining) { |
1648 | putback_lru_pages(&migratepages); | 1672 | putback_lru_pages(&migratepages); |
1649 | isolated = 0; | 1673 | isolated = 0; |
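
Every hunk in mm/migrate.c threads a new free_page_t put_new_page callback alongside get_new_page: when migration fails and the caller supplied a callback, the unused target page is handed back through it instead of being dropped with putback_lru_page()/put_page(). All call sites in this release pass NULL, keeping the old behaviour; the hook exists so callers that pre-allocate targets from a private pool can take them back. A user-space sketch of the paired-callback convention; types and names are loose analogues, not the kernel signatures:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative analogues of the kernel's new_page_t/free_page_t pair. */
typedef void *(*get_new_t)(unsigned long private);
typedef void (*put_new_t)(void *target, unsigned long private);

static void *pool_get(unsigned long private) { (void)private; return malloc(64); }
static void pool_put(void *t, unsigned long private) { (void)private; free(t); }

/* Migrate one item; on failure, return the unused target through the
 * caller's callback when one is given, else fall back to a default. */
static int migrate_one(get_new_t get_new, put_new_t put_new,
                       unsigned long private, int simulate_failure)
{
        void *target = get_new(private);

        if (!target)
                return -1;
        if (simulate_failure) {
                if (put_new)
                        put_new(target, private);  /* owner reclaims it */
                else
                        free(target);              /* default disposal */
                return -1;
        }
        free(target);   /* "success": target consumed (freed for the demo) */
        return 0;
}

int main(void)
{
        printf("%d\n", migrate_one(pool_get, pool_put, 0, 1));
        printf("%d\n", migrate_one(pool_get, NULL, 0, 1));
        return 0;
}
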
diff --git a/mm/mincore.c b/mm/mincore.c index da2be56a7b8f..06cb81005c77 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
70 | * any other file mapping (ie. marked !present and faulted in with | 70 | * any other file mapping (ie. marked !present and faulted in with |
71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. | 71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. |
72 | */ | 72 | */ |
73 | page = find_get_page(mapping, pgoff); | ||
74 | #ifdef CONFIG_SWAP | 73 | #ifdef CONFIG_SWAP |
75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ | 74 | if (shmem_mapping(mapping)) { |
76 | if (radix_tree_exceptional_entry(page)) { | 75 | page = find_get_entry(mapping, pgoff); |
77 | swp_entry_t swap = radix_to_swp_entry(page); | 76 | /* |
78 | page = find_get_page(swap_address_space(swap), swap.val); | 77 | * shmem/tmpfs may return swap: account for swapcache |
79 | } | 78 | * page too. |
79 | */ | ||
80 | if (radix_tree_exceptional_entry(page)) { | ||
81 | swp_entry_t swp = radix_to_swp_entry(page); | ||
82 | page = find_get_page(swap_address_space(swp), swp.val); | ||
83 | } | ||
84 | } else | ||
85 | page = find_get_page(mapping, pgoff); | ||
86 | #else | ||
87 | page = find_get_page(mapping, pgoff); | ||
80 | #endif | 88 | #endif |
81 | if (page) { | 89 | if (page) { |
82 | present = PageUptodate(page); | 90 | present = PageUptodate(page); |
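
mincore_page() now checks shmem_mapping() first and, for tmpfs files, uses find_get_entry(), which unlike find_get_page() can return an "exceptional" radix-tree entry encoding where a swapped-out page went; the entry is decoded with radix_to_swp_entry() and the swap cache probed, so residency is still reported correctly. Exceptional entries are plain pointer tagging; a toy sketch with a simplified bit layout and invented constants:

#include <stdio.h>
#include <stdint.h>

/* A set low bit marks "not a page pointer, but an encoded value". */
#define EXCEPTIONAL_BIT 0x2UL

static void *make_exceptional(unsigned long swp)
{
        return (void *)((swp << 2) | EXCEPTIONAL_BIT);
}

static int is_exceptional(const void *entry)    /* cf. radix_tree_exceptional_entry() */
{
        return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long to_swp(const void *entry)  /* cf. radix_to_swp_entry() */
{
        return (uintptr_t)entry >> 2;
}

int main(void)
{
        void *entry = make_exceptional(123);

        if (is_exceptional(entry))
                printf("swap slot %lu\n", to_swp(entry));
        return 0;
}
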
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/backing-dev.h> | 11 | #include <linux/backing-dev.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/vmacache.h> | ||
13 | #include <linux/shm.h> | 14 | #include <linux/shm.h> |
14 | #include <linux/mman.h> | 15 | #include <linux/mman.h> |
15 | #include <linux/pagemap.h> | 16 | #include <linux/pagemap.h> |
@@ -682,8 +683,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
682 | prev->vm_next = next = vma->vm_next; | 683 | prev->vm_next = next = vma->vm_next; |
683 | if (next) | 684 | if (next) |
684 | next->vm_prev = prev; | 685 | next->vm_prev = prev; |
685 | if (mm->mmap_cache == vma) | 686 | |
686 | mm->mmap_cache = prev; | 687 | /* Kill the cache */ |
688 | vmacache_invalidate(mm); | ||
687 | } | 689 | } |
688 | 690 | ||
689 | /* | 691 | /* |
@@ -1980,34 +1982,33 @@ EXPORT_SYMBOL(get_unmapped_area); | |||
1980 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ | 1982 | /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ |
1981 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | 1983 | struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) |
1982 | { | 1984 | { |
1983 | struct vm_area_struct *vma = NULL; | 1985 | struct rb_node *rb_node; |
1986 | struct vm_area_struct *vma; | ||
1984 | 1987 | ||
1985 | /* Check the cache first. */ | 1988 | /* Check the cache first. */ |
1986 | /* (Cache hit rate is typically around 35%.) */ | 1989 | vma = vmacache_find(mm, addr); |
1987 | vma = ACCESS_ONCE(mm->mmap_cache); | 1990 | if (likely(vma)) |
1988 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 1991 | return vma; |
1989 | struct rb_node *rb_node; | ||
1990 | 1992 | ||
1991 | rb_node = mm->mm_rb.rb_node; | 1993 | rb_node = mm->mm_rb.rb_node; |
1992 | vma = NULL; | 1994 | vma = NULL; |
1993 | 1995 | ||
1994 | while (rb_node) { | 1996 | while (rb_node) { |
1995 | struct vm_area_struct *vma_tmp; | 1997 | struct vm_area_struct *tmp; |
1996 | 1998 | ||
1997 | vma_tmp = rb_entry(rb_node, | 1999 | tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); |
1998 | struct vm_area_struct, vm_rb); | 2000 | |
1999 | 2001 | if (tmp->vm_end > addr) { | |
2000 | if (vma_tmp->vm_end > addr) { | 2002 | vma = tmp; |
2001 | vma = vma_tmp; | 2003 | if (tmp->vm_start <= addr) |
2002 | if (vma_tmp->vm_start <= addr) | 2004 | break; |
2003 | break; | 2005 | rb_node = rb_node->rb_left; |
2004 | rb_node = rb_node->rb_left; | 2006 | } else |
2005 | } else | 2007 | rb_node = rb_node->rb_right; |
2006 | rb_node = rb_node->rb_right; | ||
2007 | } | ||
2008 | if (vma) | ||
2009 | mm->mmap_cache = vma; | ||
2010 | } | 2008 | } |
2009 | |||
2010 | if (vma) | ||
2011 | vmacache_update(addr, vma); | ||
2011 | return vma; | 2012 | return vma; |
2012 | } | 2013 | } |
2013 | 2014 | ||
@@ -2379,7 +2380,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2379 | } else | 2380 | } else |
2380 | mm->highest_vm_end = prev ? prev->vm_end : 0; | 2381 | mm->highest_vm_end = prev ? prev->vm_end : 0; |
2381 | tail_vma->vm_next = NULL; | 2382 | tail_vma->vm_next = NULL; |
2382 | mm->mmap_cache = NULL; /* Kill the cache. */ | 2383 | |
2384 | /* Kill the cache */ | ||
2385 | vmacache_invalidate(mm); | ||
2383 | } | 2386 | } |
2384 | 2387 | ||
2385 | /* | 2388 | /* |
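
Both hunks here, and the mm/nommu.c section that follows, replace the single per-mm mmap_cache with the new per-thread vmacache: find_vma() first tries vmacache_find(), refills with vmacache_update() after a successful rbtree walk, and the unmap paths call vmacache_invalidate(), which bumps a per-mm sequence number rather than walking every thread's cache. A toy direct-mapped version of the idea with invented names; the kernel hashes on the lookup address and keeps VMACACHE_SIZE slots per task:

#include <stdio.h>

#define CACHE_SLOTS 4           /* kernel uses VMACACHE_SIZE (4) per task */
#define SHIFT 12                /* page-sized granularity for the hash */

struct range { unsigned long start, end; };     /* stand-in for a vma */

static struct range *slots[CACHE_SLOTS];

static unsigned int hash(unsigned long addr)
{
        return (addr >> SHIFT) & (CACHE_SLOTS - 1);
}

static struct range *cache_find(unsigned long addr)
{
        struct range *r = slots[hash(addr)];

        if (r && r->start <= addr && addr < r->end)
                return r;       /* hit: skip the rbtree walk entirely */
        return NULL;
}

static void cache_update(unsigned long addr, struct range *r)
{
        slots[hash(addr)] = r;  /* refill after a slow lookup */
}

int main(void)
{
        static struct range vma = { 0x1000, 0x9000 };

        cache_update(0x2345, &vma);
        printf("%s\n", cache_find(0x2345) ? "hit" : "miss");
        return 0;
}
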
diff --git a/mm/nommu.c b/mm/nommu.c index ecd1f158548e..1221d2b66e97 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -15,6 +15,7 @@ | |||
15 | 15 | ||
16 | #include <linux/export.h> | 16 | #include <linux/export.h> |
17 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
18 | #include <linux/vmacache.h> | ||
18 | #include <linux/mman.h> | 19 | #include <linux/mman.h> |
19 | #include <linux/swap.h> | 20 | #include <linux/swap.h> |
20 | #include <linux/file.h> | 21 | #include <linux/file.h> |
@@ -767,16 +768,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
767 | */ | 768 | */ |
768 | static void delete_vma_from_mm(struct vm_area_struct *vma) | 769 | static void delete_vma_from_mm(struct vm_area_struct *vma) |
769 | { | 770 | { |
771 | int i; | ||
770 | struct address_space *mapping; | 772 | struct address_space *mapping; |
771 | struct mm_struct *mm = vma->vm_mm; | 773 | struct mm_struct *mm = vma->vm_mm; |
774 | struct task_struct *curr = current; | ||
772 | 775 | ||
773 | kenter("%p", vma); | 776 | kenter("%p", vma); |
774 | 777 | ||
775 | protect_vma(vma, 0); | 778 | protect_vma(vma, 0); |
776 | 779 | ||
777 | mm->map_count--; | 780 | mm->map_count--; |
778 | if (mm->mmap_cache == vma) | 781 | for (i = 0; i < VMACACHE_SIZE; i++) { |
779 | mm->mmap_cache = NULL; | 782 | /* if the vma is cached, invalidate the entire cache */ |
783 | if (curr->vmacache[i] == vma) { | ||
784 | vmacache_invalidate(curr->mm); | ||
785 | break; | ||
786 | } | ||
787 | } | ||
780 | 788 | ||
781 | /* remove the VMA from the mapping */ | 789 | /* remove the VMA from the mapping */ |
782 | if (vma->vm_file) { | 790 | if (vma->vm_file) { |
@@ -824,8 +832,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
824 | struct vm_area_struct *vma; | 832 | struct vm_area_struct *vma; |
825 | 833 | ||
826 | /* check the cache first */ | 834 | /* check the cache first */ |
827 | vma = ACCESS_ONCE(mm->mmap_cache); | 835 | vma = vmacache_find(mm, addr); |
828 | if (vma && vma->vm_start <= addr && vma->vm_end > addr) | 836 | if (likely(vma)) |
829 | return vma; | 837 | return vma; |
830 | 838 | ||
831 | /* trawl the list (there may be multiple mappings in which addr | 839 | /* trawl the list (there may be multiple mappings in which addr |
@@ -834,7 +842,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
834 | if (vma->vm_start > addr) | 842 | if (vma->vm_start > addr) |
835 | return NULL; | 843 | return NULL; |
836 | if (vma->vm_end > addr) { | 844 | if (vma->vm_end > addr) { |
837 | mm->mmap_cache = vma; | 845 | vmacache_update(addr, vma); |
838 | return vma; | 846 | return vma; |
839 | } | 847 | } |
840 | } | 848 | } |
@@ -873,8 +881,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | |||
873 | unsigned long end = addr + len; | 881 | unsigned long end = addr + len; |
874 | 882 | ||
875 | /* check the cache first */ | 883 | /* check the cache first */ |
876 | vma = mm->mmap_cache; | 884 | vma = vmacache_find_exact(mm, addr, end); |
877 | if (vma && vma->vm_start == addr && vma->vm_end == end) | 885 | if (vma) |
878 | return vma; | 886 | return vma; |
879 | 887 | ||
880 | /* trawl the list (there may be multiple mappings in which addr | 888 | /* trawl the list (there may be multiple mappings in which addr |
@@ -885,7 +893,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, | |||
885 | if (vma->vm_start > addr) | 893 | if (vma->vm_start > addr) |
886 | return NULL; | 894 | return NULL; |
887 | if (vma->vm_end == end) { | 895 | if (vma->vm_end == end) { |
888 | mm->mmap_cache = vma; | 896 | vmacache_update(addr, vma); |
889 | return vma; | 897 | return vma; |
890 | } | 898 | } |
891 | } | 899 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a280f772bc66..2f91223dbe93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -405,7 +405,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
405 | return bad; | 405 | return bad; |
406 | } | 406 | } |
407 | 407 | ||
408 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 408 | static inline void prep_zero_page(struct page *page, unsigned int order, |
409 | gfp_t gfp_flags) | ||
409 | { | 410 | { |
410 | int i; | 411 | int i; |
411 | 412 | ||
@@ -449,7 +450,7 @@ static inline void set_page_guard_flag(struct page *page) { } | |||
449 | static inline void clear_page_guard_flag(struct page *page) { } | 450 | static inline void clear_page_guard_flag(struct page *page) { } |
450 | #endif | 451 | #endif |
451 | 452 | ||
452 | static inline void set_page_order(struct page *page, int order) | 453 | static inline void set_page_order(struct page *page, unsigned int order) |
453 | { | 454 | { |
454 | set_page_private(page, order); | 455 | set_page_private(page, order); |
455 | __SetPageBuddy(page); | 456 | __SetPageBuddy(page); |
@@ -500,21 +501,31 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) | |||
500 | * For recording page's order, we use page_private(page). | 501 | * For recording page's order, we use page_private(page). |
501 | */ | 502 | */ |
502 | static inline int page_is_buddy(struct page *page, struct page *buddy, | 503 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
503 | int order) | 504 | unsigned int order) |
504 | { | 505 | { |
505 | if (!pfn_valid_within(page_to_pfn(buddy))) | 506 | if (!pfn_valid_within(page_to_pfn(buddy))) |
506 | return 0; | 507 | return 0; |
507 | 508 | ||
508 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
509 | return 0; | ||
510 | |||
511 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 509 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
512 | VM_BUG_ON(page_count(buddy) != 0); | 510 | VM_BUG_ON(page_count(buddy) != 0); |
511 | |||
512 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
513 | return 0; | ||
514 | |||
513 | return 1; | 515 | return 1; |
514 | } | 516 | } |
515 | 517 | ||
516 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 518 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
517 | VM_BUG_ON(page_count(buddy) != 0); | 519 | VM_BUG_ON(page_count(buddy) != 0); |
520 | |||
521 | /* | ||
522 | * zone check is done late to avoid uselessly | ||
523 | * calculating zone/node ids for pages that could | ||
524 | * never merge. | ||
525 | */ | ||
526 | if (page_zone_id(page) != page_zone_id(buddy)) | ||
527 | return 0; | ||
528 | |||
518 | return 1; | 529 | return 1; |
519 | } | 530 | } |
520 | return 0; | 531 | return 0; |
@@ -546,6 +557,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
546 | */ | 557 | */ |
547 | 558 | ||
548 | static inline void __free_one_page(struct page *page, | 559 | static inline void __free_one_page(struct page *page, |
560 | unsigned long pfn, | ||
549 | struct zone *zone, unsigned int order, | 561 | struct zone *zone, unsigned int order, |
550 | int migratetype) | 562 | int migratetype) |
551 | { | 563 | { |
@@ -562,7 +574,7 @@ static inline void __free_one_page(struct page *page, | |||
562 | 574 | ||
563 | VM_BUG_ON(migratetype == -1); | 575 | VM_BUG_ON(migratetype == -1); |
564 | 576 | ||
565 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 577 | page_idx = pfn & ((1 << MAX_ORDER) - 1); |
566 | 578 | ||
567 | VM_BUG_ON(page_idx & ((1 << order) - 1)); | 579 | VM_BUG_ON(page_idx & ((1 << order) - 1)); |
568 | VM_BUG_ON(bad_range(zone, page)); | 580 | VM_BUG_ON(bad_range(zone, page)); |
@@ -652,9 +664,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
652 | int migratetype = 0; | 664 | int migratetype = 0; |
653 | int batch_free = 0; | 665 | int batch_free = 0; |
654 | int to_free = count; | 666 | int to_free = count; |
667 | unsigned long nr_scanned; | ||
655 | 668 | ||
656 | spin_lock(&zone->lock); | 669 | spin_lock(&zone->lock); |
657 | zone->pages_scanned = 0; | 670 | nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); |
671 | if (nr_scanned) | ||
672 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); | ||
658 | 673 | ||
659 | while (to_free) { | 674 | while (to_free) { |
660 | struct page *page; | 675 | struct page *page; |
@@ -686,7 +701,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
686 | list_del(&page->lru); | 701 | list_del(&page->lru); |
687 | mt = get_freepage_migratetype(page); | 702 | mt = get_freepage_migratetype(page); |
688 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 703 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
689 | __free_one_page(page, zone, 0, mt); | 704 | __free_one_page(page, page_to_pfn(page), zone, 0, mt); |
690 | trace_mm_page_pcpu_drain(page, 0, mt); | 705 | trace_mm_page_pcpu_drain(page, 0, mt); |
691 | if (likely(!is_migrate_isolate_page(page))) { | 706 | if (likely(!is_migrate_isolate_page(page))) { |
692 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); | 707 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
@@ -698,13 +713,18 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
698 | spin_unlock(&zone->lock); | 713 | spin_unlock(&zone->lock); |
699 | } | 714 | } |
700 | 715 | ||
701 | static void free_one_page(struct zone *zone, struct page *page, int order, | 716 | static void free_one_page(struct zone *zone, |
717 | struct page *page, unsigned long pfn, | ||
718 | unsigned int order, | ||
702 | int migratetype) | 719 | int migratetype) |
703 | { | 720 | { |
721 | unsigned long nr_scanned; | ||
704 | spin_lock(&zone->lock); | 722 | spin_lock(&zone->lock); |
705 | zone->pages_scanned = 0; | 723 | nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); |
724 | if (nr_scanned) | ||
725 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); | ||
706 | 726 | ||
707 | __free_one_page(page, zone, order, migratetype); | 727 | __free_one_page(page, pfn, zone, order, migratetype); |
708 | if (unlikely(!is_migrate_isolate(migratetype))) | 728 | if (unlikely(!is_migrate_isolate(migratetype))) |
709 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 729 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
710 | spin_unlock(&zone->lock); | 730 | spin_unlock(&zone->lock); |
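
In both free paths, zeroing zone->pages_scanned is replaced by read-then-subtract on a new NR_PAGES_SCANNED vmstat counter ("mm: move zone->pages_scanned into a vmstat counter" in the merge log), letting the value flow through the ordinary per-zone statistics machinery instead of a dedicated field written under the zone lock. The reset idiom, reduced to its core in a standalone sketch:

#include <stdio.h>

static long vm_stat_scanned;    /* stand-in for vm_stat[NR_PAGES_SCANNED] */

/* Clear by subtracting the observed value, so the counter stays an
 * ordinary accumulator and needs no special "reset" operation. */
static void clear_scanned(void)
{
        long nr = vm_stat_scanned;      /* zone_page_state() analogue */

        if (nr)
                vm_stat_scanned += -nr; /* __mod_zone_page_state() analogue */
}

int main(void)
{
        vm_stat_scanned = 128;
        clear_scanned();
        printf("%ld\n", vm_stat_scanned);       /* 0 */
        return 0;
}
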
@@ -741,15 +761,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
741 | { | 761 | { |
742 | unsigned long flags; | 762 | unsigned long flags; |
743 | int migratetype; | 763 | int migratetype; |
764 | unsigned long pfn = page_to_pfn(page); | ||
744 | 765 | ||
745 | if (!free_pages_prepare(page, order)) | 766 | if (!free_pages_prepare(page, order)) |
746 | return; | 767 | return; |
747 | 768 | ||
769 | migratetype = get_pfnblock_migratetype(page, pfn); | ||
748 | local_irq_save(flags); | 770 | local_irq_save(flags); |
749 | __count_vm_events(PGFREE, 1 << order); | 771 | __count_vm_events(PGFREE, 1 << order); |
750 | migratetype = get_pageblock_migratetype(page); | ||
751 | set_freepage_migratetype(page, migratetype); | 772 | set_freepage_migratetype(page, migratetype); |
752 | free_one_page(page_zone(page), page, order, migratetype); | 773 | free_one_page(page_zone(page), page, pfn, order, migratetype); |
753 | local_irq_restore(flags); | 774 | local_irq_restore(flags); |
754 | } | 775 | } |
755 | 776 | ||
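
__free_pages_ok() now computes the pfn once and derives the migratetype via get_pfnblock_migratetype() before local_irq_save(), rather than calling get_pageblock_migratetype() with interrupts already off; this is the "lookup pageblock migratetype with IRQs enabled during free" change. The principle is simply hoisting read-only work out of the critical section; a trivial standalone illustration:

#include <stdio.h>

static int irqs_off;    /* toy flag standing in for local_irq_save/restore */

/* Read-only classification that needs no protection at all. */
static int classify(unsigned long pfn)
{
        return (int)((pfn >> 9) & 3);
}

int main(void)
{
        unsigned long pfn = 0x1234;
        int migratetype = classify(pfn);        /* hoisted: done with IRQs on */

        irqs_off = 1;   /* critical section now covers only the update */
        printf("free pfn %#lx as type %d\n", pfn, migratetype);
        irqs_off = 0;
        return 0;
}
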
@@ -869,7 +890,7 @@ static inline int check_new_page(struct page *page) | |||
869 | return 0; | 890 | return 0; |
870 | } | 891 | } |
871 | 892 | ||
872 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 893 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) |
873 | { | 894 | { |
874 | int i; | 895 | int i; |
875 | 896 | ||
@@ -918,6 +939,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
918 | rmv_page_order(page); | 939 | rmv_page_order(page); |
919 | area->nr_free--; | 940 | area->nr_free--; |
920 | expand(zone, page, order, current_order, area, migratetype); | 941 | expand(zone, page, order, current_order, area, migratetype); |
942 | set_freepage_migratetype(page, migratetype); | ||
921 | return page; | 943 | return page; |
922 | } | 944 | } |
923 | 945 | ||
@@ -1042,6 +1064,12 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1042 | { | 1064 | { |
1043 | int current_order = page_order(page); | 1065 | int current_order = page_order(page); |
1044 | 1066 | ||
1067 | /* | ||
1068 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
1069 | * buddy pages to CMA itself. We also ensure the freepage_migratetype | ||
1070 | * is set to CMA so it is returned to the correct freelist in case | ||
1071 | * the page ends up being not actually allocated from the pcp lists. | ||
1072 | */ | ||
1045 | if (is_migrate_cma(fallback_type)) | 1073 | if (is_migrate_cma(fallback_type)) |
1046 | return fallback_type; | 1074 | return fallback_type; |
1047 | 1075 | ||
@@ -1073,16 +1101,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1073 | 1101 | ||
1074 | /* Remove an element from the buddy allocator from the fallback list */ | 1102 | /* Remove an element from the buddy allocator from the fallback list */ |
1075 | static inline struct page * | 1103 | static inline struct page * |
1076 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 1104 | __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) |
1077 | { | 1105 | { |
1078 | struct free_area *area; | 1106 | struct free_area *area; |
1079 | int current_order; | 1107 | unsigned int current_order; |
1080 | struct page *page; | 1108 | struct page *page; |
1081 | int migratetype, new_type, i; | 1109 | int migratetype, new_type, i; |
1082 | 1110 | ||
1083 | /* Find the largest possible block of pages in the other list */ | 1111 | /* Find the largest possible block of pages in the other list */ |
1084 | for (current_order = MAX_ORDER-1; current_order >= order; | 1112 | for (current_order = MAX_ORDER-1; |
1085 | --current_order) { | 1113 | current_order >= order && current_order <= MAX_ORDER-1; |
1114 | --current_order) { | ||
1086 | for (i = 0;; i++) { | 1115 | for (i = 0;; i++) { |
1087 | migratetype = fallbacks[start_migratetype][i]; | 1116 | migratetype = fallbacks[start_migratetype][i]; |
1088 | 1117 | ||
@@ -1106,21 +1135,17 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1106 | list_del(&page->lru); | 1135 | list_del(&page->lru); |
1107 | rmv_page_order(page); | 1136 | rmv_page_order(page); |
1108 | 1137 | ||
1109 | /* | ||
1110 | * Borrow the excess buddy pages as well, irrespective | ||
1111 | * of whether we stole freepages, or took ownership of | ||
1112 | * the pageblock or not. | ||
1113 | * | ||
1114 | * Exception: When borrowing from MIGRATE_CMA, release | ||
1115 | * the excess buddy pages to CMA itself. | ||
1116 | */ | ||
1117 | expand(zone, page, order, current_order, area, | 1138 | expand(zone, page, order, current_order, area, |
1118 | is_migrate_cma(migratetype) | 1139 | new_type); |
1119 | ? migratetype : start_migratetype); | 1140 | /* The freepage_migratetype may differ from pageblock's |
1141 | * migratetype depending on the decisions in | ||
1142 | * try_to_steal_freepages. This is OK as long as it does | ||
1143 | * not differ for MIGRATE_CMA type. | ||
1144 | */ | ||
1145 | set_freepage_migratetype(page, new_type); | ||
1120 | 1146 | ||
1121 | trace_mm_page_alloc_extfrag(page, order, | 1147 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1122 | current_order, start_migratetype, migratetype, | 1148 | start_migratetype, migratetype, new_type); |
1123 | new_type == start_migratetype); | ||
1124 | 1149 | ||
1125 | return page; | 1150 | return page; |
1126 | } | 1151 | } |
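
Note the reworked loop bound in __rmqueue_fallback(): with current_order now unsigned int, the old condition current_order >= order can never become false for order == 0, because decrementing past zero wraps to UINT_MAX. The added current_order <= MAX_ORDER-1 clause catches the wrap. A self-contained demonstration:

#include <stdio.h>

#define MAX_ORDER 11

int main(void)
{
        unsigned int order = 0, current_order, visited = 0;

        /* Without the second clause this would wrap past 0 to UINT_MAX
         * and keep running, since "current_order >= 0" is always true. */
        for (current_order = MAX_ORDER - 1;
             current_order >= order && current_order <= MAX_ORDER - 1;
             --current_order)
                visited++;

        printf("visited %u orders\n", visited); /* 11 */
        return 0;
}
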
@@ -1166,9 +1191,9 @@ retry_reserve: | |||
1166 | */ | 1191 | */ |
1167 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1192 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1168 | unsigned long count, struct list_head *list, | 1193 | unsigned long count, struct list_head *list, |
1169 | int migratetype, int cold) | 1194 | int migratetype, bool cold) |
1170 | { | 1195 | { |
1171 | int mt = migratetype, i; | 1196 | int i; |
1172 | 1197 | ||
1173 | spin_lock(&zone->lock); | 1198 | spin_lock(&zone->lock); |
1174 | for (i = 0; i < count; ++i) { | 1199 | for (i = 0; i < count; ++i) { |
@@ -1185,18 +1210,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1185 | * merge IO requests if the physical pages are ordered | 1210 | * merge IO requests if the physical pages are ordered |
1186 | * properly. | 1211 | * properly. |
1187 | */ | 1212 | */ |
1188 | if (likely(cold == 0)) | 1213 | if (likely(!cold)) |
1189 | list_add(&page->lru, list); | 1214 | list_add(&page->lru, list); |
1190 | else | 1215 | else |
1191 | list_add_tail(&page->lru, list); | 1216 | list_add_tail(&page->lru, list); |
1192 | if (IS_ENABLED(CONFIG_CMA)) { | ||
1193 | mt = get_pageblock_migratetype(page); | ||
1194 | if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) | ||
1195 | mt = migratetype; | ||
1196 | } | ||
1197 | set_freepage_migratetype(page, mt); | ||
1198 | list = &page->lru; | 1217 | list = &page->lru; |
1199 | if (is_migrate_cma(mt)) | 1218 | if (is_migrate_cma(get_freepage_migratetype(page))) |
1200 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | 1219 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
1201 | -(1 << order)); | 1220 | -(1 << order)); |
1202 | } | 1221 | } |
@@ -1320,7 +1339,7 @@ void mark_free_pages(struct zone *zone) | |||
1320 | { | 1339 | { |
1321 | unsigned long pfn, max_zone_pfn; | 1340 | unsigned long pfn, max_zone_pfn; |
1322 | unsigned long flags; | 1341 | unsigned long flags; |
1323 | int order, t; | 1342 | unsigned int order, t; |
1324 | struct list_head *curr; | 1343 | struct list_head *curr; |
1325 | 1344 | ||
1326 | if (zone_is_empty(zone)) | 1345 | if (zone_is_empty(zone)) |
@@ -1352,19 +1371,20 @@ void mark_free_pages(struct zone *zone) | |||
1352 | 1371 | ||
1353 | /* | 1372 | /* |
1354 | * Free a 0-order page | 1373 | * Free a 0-order page |
1355 | * cold == 1 ? free a cold page : free a hot page | 1374 | * cold == true ? free a cold page : free a hot page |
1356 | */ | 1375 | */ |
1357 | void free_hot_cold_page(struct page *page, int cold) | 1376 | void free_hot_cold_page(struct page *page, bool cold) |
1358 | { | 1377 | { |
1359 | struct zone *zone = page_zone(page); | 1378 | struct zone *zone = page_zone(page); |
1360 | struct per_cpu_pages *pcp; | 1379 | struct per_cpu_pages *pcp; |
1361 | unsigned long flags; | 1380 | unsigned long flags; |
1381 | unsigned long pfn = page_to_pfn(page); | ||
1362 | int migratetype; | 1382 | int migratetype; |
1363 | 1383 | ||
1364 | if (!free_pages_prepare(page, 0)) | 1384 | if (!free_pages_prepare(page, 0)) |
1365 | return; | 1385 | return; |
1366 | 1386 | ||
1367 | migratetype = get_pageblock_migratetype(page); | 1387 | migratetype = get_pfnblock_migratetype(page, pfn); |
1368 | set_freepage_migratetype(page, migratetype); | 1388 | set_freepage_migratetype(page, migratetype); |
1369 | local_irq_save(flags); | 1389 | local_irq_save(flags); |
1370 | __count_vm_event(PGFREE); | 1390 | __count_vm_event(PGFREE); |
@@ -1378,17 +1398,17 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1378 | */ | 1398 | */ |
1379 | if (migratetype >= MIGRATE_PCPTYPES) { | 1399 | if (migratetype >= MIGRATE_PCPTYPES) { |
1380 | if (unlikely(is_migrate_isolate(migratetype))) { | 1400 | if (unlikely(is_migrate_isolate(migratetype))) { |
1381 | free_one_page(zone, page, 0, migratetype); | 1401 | free_one_page(zone, page, pfn, 0, migratetype); |
1382 | goto out; | 1402 | goto out; |
1383 | } | 1403 | } |
1384 | migratetype = MIGRATE_MOVABLE; | 1404 | migratetype = MIGRATE_MOVABLE; |
1385 | } | 1405 | } |
1386 | 1406 | ||
1387 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 1407 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
1388 | if (cold) | 1408 | if (!cold) |
1389 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
1390 | else | ||
1391 | list_add(&page->lru, &pcp->lists[migratetype]); | 1409 | list_add(&page->lru, &pcp->lists[migratetype]); |
1410 | else | ||
1411 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | ||
1392 | pcp->count++; | 1412 | pcp->count++; |
1393 | if (pcp->count >= pcp->high) { | 1413 | if (pcp->count >= pcp->high) { |
1394 | unsigned long batch = ACCESS_ONCE(pcp->batch); | 1414 | unsigned long batch = ACCESS_ONCE(pcp->batch); |
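
free_hot_cold_page() now takes bool cold, and the branch is rewritten with the hot case first: hot pages are pushed on the head of the per-cpu list so the next allocation reuses a likely cache-warm page, cold pages go to the tail and are reused last. A toy model of the head/tail policy, array-backed with invented names:

#include <stdio.h>
#include <stdbool.h>

#define PCP_MAX 8

static int pcp[PCP_MAX];        /* toy per-cpu free list, index 0 = head */
static int pcp_count;

static void free_page_toy(int page, bool cold)
{
        int i;

        if (!cold) {            /* hot: front, so it is reused first */
                for (i = pcp_count; i > 0; i--)
                        pcp[i] = pcp[i - 1];
                pcp[0] = page;
        } else {                /* cold: back, reused last */
                pcp[pcp_count] = page;
        }
        pcp_count++;
}

int main(void)
{
        free_page_toy(1, true);         /* cold */
        free_page_toy(2, false);        /* hot */
        printf("next alloc gets page %d\n", pcp[0]);    /* 2 */
        return 0;
}
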
@@ -1403,7 +1423,7 @@ out: | |||
1403 | /* | 1423 | /* |
1404 | * Free a list of 0-order pages | 1424 | * Free a list of 0-order pages |
1405 | */ | 1425 | */ |
1406 | void free_hot_cold_page_list(struct list_head *list, int cold) | 1426 | void free_hot_cold_page_list(struct list_head *list, bool cold) |
1407 | { | 1427 | { |
1408 | struct page *page, *next; | 1428 | struct page *page, *next; |
1409 | 1429 | ||
@@ -1515,12 +1535,12 @@ int split_free_page(struct page *page) | |||
1515 | */ | 1535 | */ |
1516 | static inline | 1536 | static inline |
1517 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1537 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1518 | struct zone *zone, int order, gfp_t gfp_flags, | 1538 | struct zone *zone, unsigned int order, |
1519 | int migratetype) | 1539 | gfp_t gfp_flags, int migratetype) |
1520 | { | 1540 | { |
1521 | unsigned long flags; | 1541 | unsigned long flags; |
1522 | struct page *page; | 1542 | struct page *page; |
1523 | int cold = !!(gfp_flags & __GFP_COLD); | 1543 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
1524 | 1544 | ||
1525 | again: | 1545 | again: |
1526 | if (likely(order == 0)) { | 1546 | if (likely(order == 0)) { |
@@ -1565,10 +1585,13 @@ again: | |||
1565 | if (!page) | 1585 | if (!page) |
1566 | goto failed; | 1586 | goto failed; |
1567 | __mod_zone_freepage_state(zone, -(1 << order), | 1587 | __mod_zone_freepage_state(zone, -(1 << order), |
1568 | get_pageblock_migratetype(page)); | 1588 | get_freepage_migratetype(page)); |
1569 | } | 1589 | } |
1570 | 1590 | ||
1571 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 1591 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
1592 | if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && | ||
1593 | !zone_is_fair_depleted(zone)) | ||
1594 | zone_set_flag(zone, ZONE_FAIR_DEPLETED); | ||
1572 | 1595 | ||
1573 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1596 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1574 | zone_statistics(preferred_zone, zone, gfp_flags); | 1597 | zone_statistics(preferred_zone, zone, gfp_flags); |
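
buffered_rmqueue() now raises ZONE_FAIR_DEPLETED the moment the zone's NR_ALLOC_BATCH reaches zero, so the fair pass in get_page_from_freelist() below can skip exhausted zones with a single flag test instead of re-reading the counter each time; this is part of "reduce cost of the fair zone allocation policy". A toy model of the batch-plus-flag accounting:

#include <stdio.h>
#include <stdbool.h>

struct toy_zone {
        long alloc_batch;       /* NR_ALLOC_BATCH stand-in */
        bool fair_depleted;     /* ZONE_FAIR_DEPLETED stand-in */
};

static void account_alloc(struct toy_zone *z, int nr_pages)
{
        z->alloc_batch -= nr_pages;
        if (z->alloc_batch <= 0 && !z->fair_depleted)
                z->fair_depleted = true;  /* cheap flag for the fair pass */
}

int main(void)
{
        struct toy_zone z = { .alloc_batch = 4, .fair_depleted = false };
        int i;

        for (i = 0; i < 5; i++)
                account_alloc(&z, 1);
        printf("depleted=%d batch=%ld\n", z.fair_depleted, z.alloc_batch);
        return 0;
}
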
@@ -1665,12 +1688,12 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1665 | * Return true if free pages are above 'mark'. This takes into account the order | 1688 | * Return true if free pages are above 'mark'. This takes into account the order |
1666 | * of the allocation. | 1689 | * of the allocation. |
1667 | */ | 1690 | */ |
1668 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1691 | static bool __zone_watermark_ok(struct zone *z, unsigned int order, |
1669 | int classzone_idx, int alloc_flags, long free_pages) | 1692 | unsigned long mark, int classzone_idx, int alloc_flags, |
1693 | long free_pages) | ||
1670 | { | 1694 | { |
1671 | /* free_pages may go negative - that's OK */ | 1695 | /* free_pages may go negative - that's OK */
1672 | long min = mark; | 1696 | long min = mark; |
1673 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; | ||
1674 | int o; | 1697 | int o; |
1675 | long free_cma = 0; | 1698 | long free_cma = 0; |
1676 | 1699 | ||
@@ -1685,7 +1708,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1685 | free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); | 1708 | free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); |
1686 | #endif | 1709 | #endif |
1687 | 1710 | ||
1688 | if (free_pages - free_cma <= min + lowmem_reserve) | 1711 | if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) |
1689 | return false; | 1712 | return false; |
1690 | for (o = 0; o < order; o++) { | 1713 | for (o = 0; o < order; o++) { |
1691 | /* At the next order, this order's pages become unavailable */ | 1714 | /* At the next order, this order's pages become unavailable */ |
@@ -1700,15 +1723,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1700 | return true; | 1723 | return true; |
1701 | } | 1724 | } |
1702 | 1725 | ||
1703 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1726 | bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, |
1704 | int classzone_idx, int alloc_flags) | 1727 | int classzone_idx, int alloc_flags) |
1705 | { | 1728 | { |
1706 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1729 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1707 | zone_page_state(z, NR_FREE_PAGES)); | 1730 | zone_page_state(z, NR_FREE_PAGES)); |
1708 | } | 1731 | } |
1709 | 1732 | ||
1710 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 1733 | bool zone_watermark_ok_safe(struct zone *z, unsigned int order, |
1711 | int classzone_idx, int alloc_flags) | 1734 | unsigned long mark, int classzone_idx, int alloc_flags) |
1712 | { | 1735 | { |
1713 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | 1736 | long free_pages = zone_page_state(z, NR_FREE_PAGES); |
1714 | 1737 | ||
@@ -1850,7 +1873,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) | |||
1850 | { | 1873 | { |
1851 | int i; | 1874 | int i; |
1852 | 1875 | ||
1853 | for_each_online_node(i) | 1876 | for_each_node_state(i, N_MEMORY) |
1854 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) | 1877 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) |
1855 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | 1878 | node_set(i, NODE_DATA(nid)->reclaim_nodes); |
1856 | else | 1879 | else |
@@ -1893,6 +1916,18 @@ static inline void init_zone_allows_reclaim(int nid) | |||
1893 | } | 1916 | } |
1894 | #endif /* CONFIG_NUMA */ | 1917 | #endif /* CONFIG_NUMA */ |
1895 | 1918 | ||
1919 | static void reset_alloc_batches(struct zone *preferred_zone) | ||
1920 | { | ||
1921 | struct zone *zone = preferred_zone->zone_pgdat->node_zones; | ||
1922 | |||
1923 | do { | ||
1924 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | ||
1925 | high_wmark_pages(zone) - low_wmark_pages(zone) - | ||
1926 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | ||
1927 | zone_clear_flag(zone, ZONE_FAIR_DEPLETED); | ||
1928 | } while (zone++ != preferred_zone); | ||
1929 | } | ||
1930 | |||
1896 | /* | 1931 | /* |
1897 | * get_page_from_freelist goes through the zonelist trying to allocate | 1932 | * get_page_from_freelist goes through the zonelist trying to allocate |
1898 | * a page. | 1933 | * a page. |
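
The replacement reset_alloc_batches() walks the preferred zone's node by pointer arithmetic: node_zones is a contiguous array inside pg_data_t, so the do/while visits node_zones[0] up to and including preferred_zone, refilling each batch to high minus low watermark and clearing ZONE_FAIR_DEPLETED. The post-increment-in-the-condition shape, shown on a plain array:

#include <stdio.h>

struct toy_zone { long batch; };

int main(void)
{
        struct toy_zone node_zones[4] = { {0}, {0}, {0}, {0} };
        struct toy_zone *preferred = &node_zones[2];
        struct toy_zone *zone = &node_zones[0];

        /* Visits elements 0..2 inclusive: the increment happens after
         * the comparison, so the loop ends on the pass *following* the
         * preferred zone. */
        do {
                zone->batch = 32;       /* refill, cf. high - low watermark */
        } while (zone++ != preferred);

        printf("%ld %ld %ld %ld\n", node_zones[0].batch, node_zones[1].batch,
               node_zones[2].batch, node_zones[3].batch);       /* 32 32 32 0 */
        return 0;
}
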
@@ -1900,18 +1935,22 @@ static inline void init_zone_allows_reclaim(int nid) | |||
1900 | static struct page * | 1935 | static struct page * |
1901 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1936 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1902 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 1937 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1903 | struct zone *preferred_zone, int migratetype) | 1938 | struct zone *preferred_zone, int classzone_idx, int migratetype) |
1904 | { | 1939 | { |
1905 | struct zoneref *z; | 1940 | struct zoneref *z; |
1906 | struct page *page = NULL; | 1941 | struct page *page = NULL; |
1907 | int classzone_idx; | ||
1908 | struct zone *zone; | 1942 | struct zone *zone; |
1909 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1943 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1910 | int zlc_active = 0; /* set if using zonelist_cache */ | 1944 | int zlc_active = 0; /* set if using zonelist_cache */ |
1911 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1945 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1946 | bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && | ||
1947 | (gfp_mask & __GFP_WRITE); | ||
1948 | int nr_fair_skipped = 0; | ||
1949 | bool zonelist_rescan; | ||
1912 | 1950 | ||
1913 | classzone_idx = zone_idx(preferred_zone); | ||
1914 | zonelist_scan: | 1951 | zonelist_scan: |
1952 | zonelist_rescan = false; | ||
1953 | |||
1915 | /* | 1954 | /* |
1916 | * Scan zonelist, looking for a zone with enough free. | 1955 | * Scan zonelist, looking for a zone with enough free. |
1917 | * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. | 1956 | * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. |
@@ -1923,12 +1962,10 @@ zonelist_scan: | |||
1923 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 1962 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1924 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1963 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1925 | continue; | 1964 | continue; |
1926 | if ((alloc_flags & ALLOC_CPUSET) && | 1965 | if (cpusets_enabled() && |
1966 | (alloc_flags & ALLOC_CPUSET) && | ||
1927 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1967 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1928 | continue; | 1968 | continue; |
1929 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1930 | if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) | ||
1931 | goto try_this_zone; | ||
1932 | /* | 1969 | /* |
1933 | * Distribute pages in proportion to the individual | 1970 | * Distribute pages in proportion to the individual |
1934 | * zone size to ensure fair page aging. The zone a | 1971 | * zone size to ensure fair page aging. The zone a |
@@ -1937,9 +1974,11 @@ zonelist_scan: | |||
1937 | */ | 1974 | */ |
1938 | if (alloc_flags & ALLOC_FAIR) { | 1975 | if (alloc_flags & ALLOC_FAIR) { |
1939 | if (!zone_local(preferred_zone, zone)) | 1976 | if (!zone_local(preferred_zone, zone)) |
1977 | break; | ||
1978 | if (zone_is_fair_depleted(zone)) { | ||
1979 | nr_fair_skipped++; | ||
1940 | continue; | 1980 | continue; |
1941 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) | 1981 | } |
1942 | continue; | ||
1943 | } | 1982 | } |
1944 | /* | 1983 | /* |
1945 | * When allocating a page cache page for writing, we | 1984 | * When allocating a page cache page for writing, we |
@@ -1967,15 +2006,19 @@ zonelist_scan: | |||
1967 | * will require awareness of zones in the | 2006 | * will require awareness of zones in the |
1968 | * dirty-throttling and the flusher threads. | 2007 | * dirty-throttling and the flusher threads. |
1969 | */ | 2008 | */ |
1970 | if ((alloc_flags & ALLOC_WMARK_LOW) && | 2009 | if (consider_zone_dirty && !zone_dirty_ok(zone)) |
1971 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | 2010 | continue; |
1972 | goto this_zone_full; | ||
1973 | 2011 | ||
1974 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2012 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1975 | if (!zone_watermark_ok(zone, order, mark, | 2013 | if (!zone_watermark_ok(zone, order, mark, |
1976 | classzone_idx, alloc_flags)) { | 2014 | classzone_idx, alloc_flags)) { |
1977 | int ret; | 2015 | int ret; |
1978 | 2016 | ||
2017 | /* Checked here to keep the fast path fast */ | ||
2018 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
2019 | if (alloc_flags & ALLOC_NO_WATERMARKS) | ||
2020 | goto try_this_zone; | ||
2021 | |||
1979 | if (IS_ENABLED(CONFIG_NUMA) && | 2022 | if (IS_ENABLED(CONFIG_NUMA) && |
1980 | !did_zlc_setup && nr_online_nodes > 1) { | 2023 | !did_zlc_setup && nr_online_nodes > 1) { |
1981 | /* | 2024 | /* |
@@ -2037,17 +2080,11 @@ try_this_zone: | |||
2037 | if (page) | 2080 | if (page) |
2038 | break; | 2081 | break; |
2039 | this_zone_full: | 2082 | this_zone_full: |
2040 | if (IS_ENABLED(CONFIG_NUMA)) | 2083 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
2041 | zlc_mark_zone_full(zonelist, z); | 2084 | zlc_mark_zone_full(zonelist, z); |
2042 | } | 2085 | } |
2043 | 2086 | ||
2044 | if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { | 2087 | if (page) { |
2045 | /* Disable zlc cache for second zonelist scan */ | ||
2046 | zlc_active = 0; | ||
2047 | goto zonelist_scan; | ||
2048 | } | ||
2049 | |||
2050 | if (page) | ||
2051 | /* | 2088 | /* |
2052 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | 2089 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was |
2053 | * necessary to allocate the page. The expectation is | 2090 | * necessary to allocate the page. The expectation is |
@@ -2056,8 +2093,37 @@ this_zone_full: | |||
2056 | * for !PFMEMALLOC purposes. | 2093 | * for !PFMEMALLOC purposes. |
2057 | */ | 2094 | */ |
2058 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | 2095 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); |
2096 | return page; | ||
2097 | } | ||
2059 | 2098 | ||
2060 | return page; | 2099 | /* |
2100 | * The first pass makes sure allocations are spread fairly within the | ||
2101 | * local node. However, the local node might have free pages left | ||
2102 | * after the fairness batches are exhausted, and remote zones haven't | ||
2103 | * even been considered yet. Try once more without fairness, and | ||
2104 | * include remote zones now, before entering the slowpath and waking | ||
2105 | * kswapd: prefer spilling to a remote zone over swapping locally. | ||
2106 | */ | ||
2107 | if (alloc_flags & ALLOC_FAIR) { | ||
2108 | alloc_flags &= ~ALLOC_FAIR; | ||
2109 | if (nr_fair_skipped) { | ||
2110 | zonelist_rescan = true; | ||
2111 | reset_alloc_batches(preferred_zone); | ||
2112 | } | ||
2113 | if (nr_online_nodes > 1) | ||
2114 | zonelist_rescan = true; | ||
2115 | } | ||
2116 | |||
2117 | if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { | ||
2118 | /* Disable zlc cache for second zonelist scan */ | ||
2119 | zlc_active = 0; | ||
2120 | zonelist_rescan = true; | ||
2121 | } | ||
2122 | |||
2123 | if (zonelist_rescan) | ||
2124 | goto zonelist_scan; | ||
2125 | |||
2126 | return NULL; | ||
2061 | } | 2127 | } |
2062 | 2128 | ||
2063 | /* | 2129 | /* |
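
get_page_from_freelist() is restructured around a zonelist_rescan flag. The first pass keeps ALLOC_FAIR: it breaks out of the walk at the first non-local zone and counts the ZONE_FAIR_DEPLETED zones it skips. If no page was found, ALLOC_FAIR is dropped, batches are reset when anything was fair-skipped, remote nodes become eligible, and the walk repeats (with one further rescan possible to retry with the zlc cache disabled); the point, per the comment above, is to spill to a remote zone rather than wake kswapd locally. Also note the new cpusets_enabled() gate earlier in the function, which avoids the cpuset check entirely when no cpusets exist. A condensed, runnable skeleton of the two-pass control flow, with the zone walk faked:

#include <stdio.h>
#include <stdbool.h>

#define ALLOC_FAIR 0x1

/* Fake zone walk: the fair pass finds nothing and skips one depleted
 * local zone; the unrestricted second pass succeeds. */
static int try_zones(int flags, int *fair_skipped)
{
        if (flags & ALLOC_FAIR) {
                *fair_skipped = 1;
                return 0;
        }
        return 42;
}

int main(void)
{
        int flags = ALLOC_FAIR, page, fair_skipped;
        bool rescan, more_nodes = true;         /* cf. nr_online_nodes > 1 */

        do {
                rescan = false;
                fair_skipped = 0;
                page = try_zones(flags, &fair_skipped);
                if (page)
                        break;
                if (flags & ALLOC_FAIR) {       /* fairness is tried once */
                        flags &= ~ALLOC_FAIR;
                        if (fair_skipped)
                                rescan = true;  /* after resetting batches */
                        if (more_nodes)
                                rescan = true;  /* remote zones now in play */
                }
        } while (rescan);

        printf("page=%d\n", page);
        return 0;
}
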
@@ -2173,7 +2239,7 @@ static inline struct page * | |||
2173 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2239 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2174 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2240 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2175 | nodemask_t *nodemask, struct zone *preferred_zone, | 2241 | nodemask_t *nodemask, struct zone *preferred_zone, |
2176 | int migratetype) | 2242 | int classzone_idx, int migratetype) |
2177 | { | 2243 | { |
2178 | struct page *page; | 2244 | struct page *page; |
2179 | 2245 | ||
@@ -2191,7 +2257,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2191 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2257 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, |
2192 | order, zonelist, high_zoneidx, | 2258 | order, zonelist, high_zoneidx, |
2193 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | 2259 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, |
2194 | preferred_zone, migratetype); | 2260 | preferred_zone, classzone_idx, migratetype); |
2195 | if (page) | 2261 | if (page) |
2196 | goto out; | 2262 | goto out; |
2197 | 2263 | ||
@@ -2226,7 +2292,7 @@ static struct page * | |||
2226 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2292 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2227 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2293 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2228 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2294 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2229 | int migratetype, bool sync_migration, | 2295 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2230 | bool *contended_compaction, bool *deferred_compaction, | 2296 | bool *contended_compaction, bool *deferred_compaction, |
2231 | unsigned long *did_some_progress) | 2297 | unsigned long *did_some_progress) |
2232 | { | 2298 | { |
@@ -2240,7 +2306,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2240 | 2306 | ||
2241 | current->flags |= PF_MEMALLOC; | 2307 | current->flags |= PF_MEMALLOC; |
2242 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2308 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2243 | nodemask, sync_migration, | 2309 | nodemask, mode, |
2244 | contended_compaction); | 2310 | contended_compaction); |
2245 | current->flags &= ~PF_MEMALLOC; | 2311 | current->flags &= ~PF_MEMALLOC; |
2246 | 2312 | ||
@@ -2254,13 +2320,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2254 | page = get_page_from_freelist(gfp_mask, nodemask, | 2320 | page = get_page_from_freelist(gfp_mask, nodemask, |
2255 | order, zonelist, high_zoneidx, | 2321 | order, zonelist, high_zoneidx, |
2256 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2322 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2257 | preferred_zone, migratetype); | 2323 | preferred_zone, classzone_idx, migratetype); |
2258 | if (page) { | 2324 | if (page) { |
2259 | preferred_zone->compact_blockskip_flush = false; | 2325 | preferred_zone->compact_blockskip_flush = false; |
2260 | preferred_zone->compact_considered = 0; | 2326 | compaction_defer_reset(preferred_zone, order, true); |
2261 | preferred_zone->compact_defer_shift = 0; | ||
2262 | if (order >= preferred_zone->compact_order_failed) | ||
2263 | preferred_zone->compact_order_failed = order + 1; | ||
2264 | count_vm_event(COMPACTSUCCESS); | 2327 | count_vm_event(COMPACTSUCCESS); |
2265 | return page; | 2328 | return page; |
2266 | } | 2329 | } |
@@ -2276,7 +2339,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2276 | * As async compaction considers a subset of pageblocks, only | 2339 | * As async compaction considers a subset of pageblocks, only |
2277 | * defer if the failure was a sync compaction failure. | 2340 | * defer if the failure was a sync compaction failure. |
2278 | */ | 2341 | */ |
2279 | if (sync_migration) | 2342 | if (mode != MIGRATE_ASYNC) |
2280 | defer_compaction(preferred_zone, order); | 2343 | defer_compaction(preferred_zone, order); |
2281 | 2344 | ||
2282 | cond_resched(); | 2345 | cond_resched(); |
@@ -2289,9 +2352,9 @@ static inline struct page * | |||
2289 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2352 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2290 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2353 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2291 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2354 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2292 | int migratetype, bool sync_migration, | 2355 | int classzone_idx, int migratetype, |
2293 | bool *contended_compaction, bool *deferred_compaction, | 2356 | enum migrate_mode mode, bool *contended_compaction, |
2294 | unsigned long *did_some_progress) | 2357 | bool *deferred_compaction, unsigned long *did_some_progress) |
2295 | { | 2358 | { |
2296 | return NULL; | 2359 | return NULL; |
2297 | } | 2360 | } |
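
From here down, bool sync_migration becomes enum migrate_mode, which in this kernel has three values (MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC, from include/linux/migrate_mode.h), so compaction callers can express an intermediate "block on most but not all operations" intent instead of a yes/no flag; tests like if (sync_migration) become if (mode != MIGRATE_ASYNC). A sketch:

#include <stdio.h>

/* Shape of the kernel's enum migrate_mode in this release. */
enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC };

static const char *describe(enum migrate_mode mode)
{
        switch (mode) {
        case MIGRATE_ASYNC:      return "never block";
        case MIGRATE_SYNC_LIGHT: return "block on most operations";
        default:                 return "block, including on writeback";
        }
}

int main(void)
{
        enum migrate_mode mode = MIGRATE_ASYNC;

        /* The old "bool sync_migration" collapsed the last two cases. */
        if (mode != MIGRATE_ASYNC)      /* replaces if (sync_migration) */
                printf("deferring compaction\n");
        printf("%s\n", describe(mode));
        return 0;
}
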
@@ -2330,7 +2393,7 @@ static inline struct page * | |||
2330 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2393 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2331 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2394 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2332 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2395 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2333 | int migratetype, unsigned long *did_some_progress) | 2396 | int classzone_idx, int migratetype, unsigned long *did_some_progress) |
2334 | { | 2397 | { |
2335 | struct page *page = NULL; | 2398 | struct page *page = NULL; |
2336 | bool drained = false; | 2399 | bool drained = false; |
@@ -2348,7 +2411,8 @@ retry: | |||
2348 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2411 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2349 | zonelist, high_zoneidx, | 2412 | zonelist, high_zoneidx, |
2350 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2413 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2351 | preferred_zone, migratetype); | 2414 | preferred_zone, classzone_idx, |
2415 | migratetype); | ||
2352 | 2416 | ||
2353 | /* | 2417 | /* |
2354 | * If an allocation failed after direct reclaim, it could be because | 2418 | * If an allocation failed after direct reclaim, it could be because |
@@ -2371,14 +2435,14 @@ static inline struct page * | |||
2371 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2435 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2372 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2436 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2373 | nodemask_t *nodemask, struct zone *preferred_zone, | 2437 | nodemask_t *nodemask, struct zone *preferred_zone, |
2374 | int migratetype) | 2438 | int classzone_idx, int migratetype) |
2375 | { | 2439 | { |
2376 | struct page *page; | 2440 | struct page *page; |
2377 | 2441 | ||
2378 | do { | 2442 | do { |
2379 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2443 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2380 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2444 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, |
2381 | preferred_zone, migratetype); | 2445 | preferred_zone, classzone_idx, migratetype); |
2382 | 2446 | ||
2383 | if (!page && gfp_mask & __GFP_NOFAIL) | 2447 | if (!page && gfp_mask & __GFP_NOFAIL) |
2384 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2448 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
@@ -2387,28 +2451,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
2387 | return page; | 2451 | return page; |
2388 | } | 2452 | } |
2389 | 2453 | ||
2390 | static void reset_alloc_batches(struct zonelist *zonelist, | ||
2391 | enum zone_type high_zoneidx, | ||
2392 | struct zone *preferred_zone) | ||
2393 | { | ||
2394 | struct zoneref *z; | ||
2395 | struct zone *zone; | ||
2396 | |||
2397 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | ||
2398 | /* | ||
2399 | * Only reset the batches of zones that were actually | ||
2400 | * considered in the fairness pass, we don't want to | ||
2401 | * trash fairness information for zones that are not | ||
2402 | * actually part of this zonelist's round-robin cycle. | ||
2403 | */ | ||
2404 | if (!zone_local(preferred_zone, zone)) | ||
2405 | continue; | ||
2406 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | ||
2407 | high_wmark_pages(zone) - low_wmark_pages(zone) - | ||
2408 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | ||
2409 | } | ||
2410 | } | ||
2411 | |||
2412 | static void wake_all_kswapds(unsigned int order, | 2454 | static void wake_all_kswapds(unsigned int order, |
2413 | struct zonelist *zonelist, | 2455 | struct zonelist *zonelist, |
2414 | enum zone_type high_zoneidx, | 2456 | enum zone_type high_zoneidx, |
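reset_alloc_batches() disappears here; the fair-policy reset is restructured elsewhere in this series, but the arithmetic it performed is worth keeping in mind when reading the NR_ALLOC_BATCH hunks. With illustrative numbers only:

	/*
	 * target  = high_wmark_pages(zone) - low_wmark_pages(zone)
	 *         = 1200 - 1000               = 200 pages
	 * current = atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])
	 *         = -50                          (batch overdrawn)
	 * delta   = target - current           = 250
	 * mod_zone_page_state(zone, NR_ALLOC_BATCH, delta) therefore
	 * leaves the counter at exactly the 200-page target again.
	 */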
@@ -2479,14 +2521,14 @@ static inline struct page * | |||
2479 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2521 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2480 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2522 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2481 | nodemask_t *nodemask, struct zone *preferred_zone, | 2523 | nodemask_t *nodemask, struct zone *preferred_zone, |
2482 | int migratetype) | 2524 | int classzone_idx, int migratetype) |
2483 | { | 2525 | { |
2484 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2526 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2485 | struct page *page = NULL; | 2527 | struct page *page = NULL; |
2486 | int alloc_flags; | 2528 | int alloc_flags; |
2487 | unsigned long pages_reclaimed = 0; | 2529 | unsigned long pages_reclaimed = 0; |
2488 | unsigned long did_some_progress; | 2530 | unsigned long did_some_progress; |
2489 | bool sync_migration = false; | 2531 | enum migrate_mode migration_mode = MIGRATE_ASYNC; |
2490 | bool deferred_compaction = false; | 2532 | bool deferred_compaction = false; |
2491 | bool contended_compaction = false; | 2533 | bool contended_compaction = false; |
2492 | 2534 | ||
@@ -2528,15 +2570,19 @@ restart: | |||
2528 | * Find the true preferred zone if the allocation is unconstrained by | 2570 | * Find the true preferred zone if the allocation is unconstrained by |
2529 | * cpusets. | 2571 | * cpusets. |
2530 | */ | 2572 | */ |
2531 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) | 2573 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { |
2532 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | 2574 | struct zoneref *preferred_zoneref; |
2533 | &preferred_zone); | 2575 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, |
2576 | NULL, | ||
2577 | &preferred_zone); | ||
2578 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | ||
2579 | } | ||
2534 | 2580 | ||
2535 | rebalance: | 2581 | rebalance: |
2536 | /* This is the last chance, in general, before the goto nopage. */ | 2582 | /* This is the last chance, in general, before the goto nopage. */ |
2537 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2583 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2538 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2584 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
2539 | preferred_zone, migratetype); | 2585 | preferred_zone, classzone_idx, migratetype); |
2540 | if (page) | 2586 | if (page) |
2541 | goto got_pg; | 2587 | goto got_pg; |
2542 | 2588 | ||
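classzone_idx is now captured once from the preferred zoneref instead of being recomputed as zone_idx(preferred_zone) inside every get_page_from_freelist() call. A condensed sketch of how it feeds the watermark check (zone_watermark_ok() is the existing kernel helper; the surrounding zonelist loop is elided):

	struct zoneref *ref;
	int classzone_idx;

	ref = first_zones_zonelist(zonelist, high_zoneidx, NULL,
				   &preferred_zone);
	classzone_idx = zonelist_zone_idx(ref);

	/* for each candidate zone: free pages must cover the watermark
	 * plus the lowmem_reserve[] slot selected by classzone_idx */
	if (!zone_watermark_ok(zone, order, low_wmark_pages(zone),
			       classzone_idx, alloc_flags))
		continue;	/* zone too low for this request class */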
@@ -2551,7 +2597,7 @@ rebalance: | |||
2551 | 2597 | ||
2552 | page = __alloc_pages_high_priority(gfp_mask, order, | 2598 | page = __alloc_pages_high_priority(gfp_mask, order, |
2553 | zonelist, high_zoneidx, nodemask, | 2599 | zonelist, high_zoneidx, nodemask, |
2554 | preferred_zone, migratetype); | 2600 | preferred_zone, classzone_idx, migratetype); |
2555 | if (page) { | 2601 | if (page) { |
2556 | goto got_pg; | 2602 | goto got_pg; |
2557 | } | 2603 | } |
@@ -2573,17 +2619,16 @@ rebalance: | |||
2573 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2619 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2574 | * attempts after direct reclaim are synchronous | 2620 | * attempts after direct reclaim are synchronous |
2575 | */ | 2621 | */ |
2576 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2622 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, |
2577 | zonelist, high_zoneidx, | 2623 | high_zoneidx, nodemask, alloc_flags, |
2578 | nodemask, | 2624 | preferred_zone, |
2579 | alloc_flags, preferred_zone, | 2625 | classzone_idx, migratetype, |
2580 | migratetype, sync_migration, | 2626 | migration_mode, &contended_compaction, |
2581 | &contended_compaction, | ||
2582 | &deferred_compaction, | 2627 | &deferred_compaction, |
2583 | &did_some_progress); | 2628 | &did_some_progress); |
2584 | if (page) | 2629 | if (page) |
2585 | goto got_pg; | 2630 | goto got_pg; |
2586 | sync_migration = true; | 2631 | migration_mode = MIGRATE_SYNC_LIGHT; |
2587 | 2632 | ||
2588 | /* | 2633 | /* |
2589 | * If compaction is deferred for high-order allocations, it is because | 2634 | * If compaction is deferred for high-order allocations, it is because |
@@ -2600,7 +2645,8 @@ rebalance: | |||
2600 | zonelist, high_zoneidx, | 2645 | zonelist, high_zoneidx, |
2601 | nodemask, | 2646 | nodemask, |
2602 | alloc_flags, preferred_zone, | 2647 | alloc_flags, preferred_zone, |
2603 | migratetype, &did_some_progress); | 2648 | classzone_idx, migratetype, |
2649 | &did_some_progress); | ||
2604 | if (page) | 2650 | if (page) |
2605 | goto got_pg; | 2651 | goto got_pg; |
2606 | 2652 | ||
@@ -2619,7 +2665,7 @@ rebalance: | |||
2619 | page = __alloc_pages_may_oom(gfp_mask, order, | 2665 | page = __alloc_pages_may_oom(gfp_mask, order, |
2620 | zonelist, high_zoneidx, | 2666 | zonelist, high_zoneidx, |
2621 | nodemask, preferred_zone, | 2667 | nodemask, preferred_zone, |
2622 | migratetype); | 2668 | classzone_idx, migratetype); |
2623 | if (page) | 2669 | if (page) |
2624 | goto got_pg; | 2670 | goto got_pg; |
2625 | 2671 | ||
@@ -2658,12 +2704,11 @@ rebalance: | |||
2658 | * direct reclaim and reclaim/compaction depends on compaction | 2704 | * direct reclaim and reclaim/compaction depends on compaction |
2659 | * being called after reclaim so call directly if necessary | 2705 | * being called after reclaim so call directly if necessary |
2660 | */ | 2706 | */ |
2661 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2707 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, |
2662 | zonelist, high_zoneidx, | 2708 | high_zoneidx, nodemask, alloc_flags, |
2663 | nodemask, | 2709 | preferred_zone, |
2664 | alloc_flags, preferred_zone, | 2710 | classzone_idx, migratetype, |
2665 | migratetype, sync_migration, | 2711 | migration_mode, &contended_compaction, |
2666 | &contended_compaction, | ||
2667 | &deferred_compaction, | 2712 | &deferred_compaction, |
2668 | &did_some_progress); | 2713 | &did_some_progress); |
2669 | if (page) | 2714 | if (page) |
@@ -2689,11 +2734,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2689 | { | 2734 | { |
2690 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2735 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2691 | struct zone *preferred_zone; | 2736 | struct zone *preferred_zone; |
2737 | struct zoneref *preferred_zoneref; | ||
2692 | struct page *page = NULL; | 2738 | struct page *page = NULL; |
2693 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2739 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2694 | unsigned int cpuset_mems_cookie; | 2740 | unsigned int cpuset_mems_cookie; |
2695 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2741 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2696 | struct mem_cgroup *memcg = NULL; | 2742 | struct mem_cgroup *memcg = NULL; |
2743 | int classzone_idx; | ||
2697 | 2744 | ||
2698 | gfp_mask &= gfp_allowed_mask; | 2745 | gfp_mask &= gfp_allowed_mask; |
2699 | 2746 | ||
@@ -2720,42 +2767,26 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2720 | return NULL; | 2767 | return NULL; |
2721 | 2768 | ||
2722 | retry_cpuset: | 2769 | retry_cpuset: |
2723 | cpuset_mems_cookie = get_mems_allowed(); | 2770 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2724 | 2771 | ||
2725 | /* The preferred zone is used for statistics later */ | 2772 | /* The preferred zone is used for statistics later */ |
2726 | first_zones_zonelist(zonelist, high_zoneidx, | 2773 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, |
2727 | nodemask ? : &cpuset_current_mems_allowed, | 2774 | nodemask ? : &cpuset_current_mems_allowed, |
2728 | &preferred_zone); | 2775 | &preferred_zone); |
2729 | if (!preferred_zone) | 2776 | if (!preferred_zone) |
2730 | goto out; | 2777 | goto out; |
2778 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | ||
2731 | 2779 | ||
2732 | #ifdef CONFIG_CMA | 2780 | #ifdef CONFIG_CMA |
2733 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2781 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
2734 | alloc_flags |= ALLOC_CMA; | 2782 | alloc_flags |= ALLOC_CMA; |
2735 | #endif | 2783 | #endif |
2736 | retry: | ||
2737 | /* First allocation attempt */ | 2784 | /* First allocation attempt */ |
2738 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2785 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2739 | zonelist, high_zoneidx, alloc_flags, | 2786 | zonelist, high_zoneidx, alloc_flags, |
2740 | preferred_zone, migratetype); | 2787 | preferred_zone, classzone_idx, migratetype); |
2741 | if (unlikely(!page)) { | 2788 | if (unlikely(!page)) { |
2742 | /* | 2789 | /* |
2743 | * The first pass makes sure allocations are spread | ||
2744 | * fairly within the local node. However, the local | ||
2745 | * node might have free pages left after the fairness | ||
2746 | * batches are exhausted, and remote zones haven't | ||
2747 | * even been considered yet. Try once more without | ||
2748 | * fairness, and include remote zones now, before | ||
2749 | * entering the slowpath and waking kswapd: prefer | ||
2750 | * spilling to a remote zone over swapping locally. | ||
2751 | */ | ||
2752 | if (alloc_flags & ALLOC_FAIR) { | ||
2753 | reset_alloc_batches(zonelist, high_zoneidx, | ||
2754 | preferred_zone); | ||
2755 | alloc_flags &= ~ALLOC_FAIR; | ||
2756 | goto retry; | ||
2757 | } | ||
2758 | /* | ||
2759 | * Runtime PM, block IO and its error handling path | 2790 | * Runtime PM, block IO and its error handling path |
2760 | * can deadlock because I/O on the device might not | 2791 | * can deadlock because I/O on the device might not |
2761 | * complete. | 2792 | * complete. |
@@ -2763,7 +2794,7 @@ retry: | |||
2763 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2794 | gfp_mask = memalloc_noio_flags(gfp_mask); |
2764 | page = __alloc_pages_slowpath(gfp_mask, order, | 2795 | page = __alloc_pages_slowpath(gfp_mask, order, |
2765 | zonelist, high_zoneidx, nodemask, | 2796 | zonelist, high_zoneidx, nodemask, |
2766 | preferred_zone, migratetype); | 2797 | preferred_zone, classzone_idx, migratetype); |
2767 | } | 2798 | } |
2768 | 2799 | ||
2769 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2800 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
@@ -2775,7 +2806,7 @@ out: | |||
2775 | * the mask is being updated. If a page allocation is about to fail, | 2806 | * the mask is being updated. If a page allocation is about to fail, |
2776 | * check if the cpuset changed during allocation and if so, retry. | 2807 | * check if the cpuset changed during allocation and if so, retry. |
2777 | */ | 2808 | */ |
2778 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2809 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2779 | goto retry_cpuset; | 2810 | goto retry_cpuset; |
2780 | 2811 | ||
2781 | memcg_kmem_commit_charge(page, memcg, order); | 2812 | memcg_kmem_commit_charge(page, memcg, order); |
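get_mems_allowed()/put_mems_allowed() give way to read_mems_allowed_begin()/read_mems_allowed_retry(), which wrap a seqcount on current->mems_allowed_seq instead of taking and dropping a reference. The read-side pattern, reused in the slab and slub hunks further down, is the usual speculative loop (sketch):

	unsigned int seq;

	do {
		seq = read_mems_allowed_begin();  /* read_seqcount_begin() */
		/* speculative work consulting cpuset_current_mems_allowed;
		 * the result is discarded if the mask changed meanwhile */
	} while (read_mems_allowed_retry(seq));   /* read_seqcount_retry() */

Note the allocator only retries when the allocation actually failed, so a successful fast path costs just two reads of the sequence counter.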
@@ -2814,7 +2845,7 @@ void __free_pages(struct page *page, unsigned int order) | |||
2814 | { | 2845 | { |
2815 | if (put_page_testzero(page)) { | 2846 | if (put_page_testzero(page)) { |
2816 | if (order == 0) | 2847 | if (order == 0) |
2817 | free_hot_cold_page(page, 0); | 2848 | free_hot_cold_page(page, false); |
2818 | else | 2849 | else |
2819 | __free_pages_ok(page, order); | 2850 | __free_pages_ok(page, order); |
2820 | } | 2851 | } |
@@ -3043,9 +3074,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) | |||
3043 | goto out; | 3074 | goto out; |
3044 | 3075 | ||
3045 | do { | 3076 | do { |
3046 | cpuset_mems_cookie = get_mems_allowed(); | 3077 | cpuset_mems_cookie = read_mems_allowed_begin(); |
3047 | ret = !node_isset(nid, cpuset_current_mems_allowed); | 3078 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
3048 | } while (!put_mems_allowed(cpuset_mems_cookie)); | 3079 | } while (read_mems_allowed_retry(cpuset_mems_cookie)); |
3049 | out: | 3080 | out: |
3050 | return ret; | 3081 | return ret; |
3051 | } | 3082 | } |
@@ -3198,12 +3229,12 @@ void show_free_areas(unsigned int filter) | |||
3198 | K(zone_page_state(zone, NR_BOUNCE)), | 3229 | K(zone_page_state(zone, NR_BOUNCE)), |
3199 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | 3230 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), |
3200 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 3231 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
3201 | zone->pages_scanned, | 3232 | K(zone_page_state(zone, NR_PAGES_SCANNED)), |
3202 | (!zone_reclaimable(zone) ? "yes" : "no") | 3233 | (!zone_reclaimable(zone) ? "yes" : "no") |
3203 | ); | 3234 | ); |
3204 | printk("lowmem_reserve[]:"); | 3235 | printk("lowmem_reserve[]:"); |
3205 | for (i = 0; i < MAX_NR_ZONES; i++) | 3236 | for (i = 0; i < MAX_NR_ZONES; i++) |
3206 | printk(" %lu", zone->lowmem_reserve[i]); | 3237 | printk(" %ld", zone->lowmem_reserve[i]); |
3207 | printk("\n"); | 3238 | printk("\n"); |
3208 | } | 3239 | } |
3209 | 3240 | ||
@@ -3943,6 +3974,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3943 | struct page *page; | 3974 | struct page *page; |
3944 | unsigned long block_migratetype; | 3975 | unsigned long block_migratetype; |
3945 | int reserve; | 3976 | int reserve; |
3977 | int old_reserve; | ||
3946 | 3978 | ||
3947 | /* | 3979 | /* |
3948 | * Get the start pfn, end pfn and the number of blocks to reserve | 3980 | * Get the start pfn, end pfn and the number of blocks to reserve |
@@ -3964,6 +3996,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3964 | * future allocation of hugepages at runtime. | 3996 | * future allocation of hugepages at runtime. |
3965 | */ | 3997 | */ |
3966 | reserve = min(2, reserve); | 3998 | reserve = min(2, reserve); |
3999 | old_reserve = zone->nr_migrate_reserve_block; | ||
4000 | |||
4001 | /* On memory hot-add, there is usually nothing to do */ | ||
4002 | if (reserve == old_reserve) | ||
4003 | return; | ||
4004 | zone->nr_migrate_reserve_block = reserve; | ||
3967 | 4005 | ||
3968 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 4006 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
3969 | if (!pfn_valid(pfn)) | 4007 | if (!pfn_valid(pfn)) |
@@ -4001,6 +4039,12 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
4001 | reserve--; | 4039 | reserve--; |
4002 | continue; | 4040 | continue; |
4003 | } | 4041 | } |
4042 | } else if (!old_reserve) { | ||
4043 | /* | ||
4044 | * At boot time we don't need to scan the whole zone | ||
4045 | * for turning off MIGRATE_RESERVE. | ||
4046 | */ | ||
4047 | break; | ||
4004 | } | 4048 | } |
4005 | 4049 | ||
4006 | /* | 4050 | /* |
@@ -4080,7 +4124,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4080 | 4124 | ||
4081 | static void __meminit zone_init_free_lists(struct zone *zone) | 4125 | static void __meminit zone_init_free_lists(struct zone *zone) |
4082 | { | 4126 | { |
4083 | int order, t; | 4127 | unsigned int order, t; |
4084 | for_each_migratetype_order(order, t) { | 4128 | for_each_migratetype_order(order, t) { |
4085 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); | 4129 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
4086 | zone->free_area[order].nr_free = 0; | 4130 | zone->free_area[order].nr_free = 0; |
@@ -4903,7 +4947,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4903 | 4947 | ||
4904 | pgdat->node_id = nid; | 4948 | pgdat->node_id = nid; |
4905 | pgdat->node_start_pfn = node_start_pfn; | 4949 | pgdat->node_start_pfn = node_start_pfn; |
4906 | init_zone_allows_reclaim(nid); | 4950 | if (node_state(nid, N_MEMORY)) |
4951 | init_zone_allows_reclaim(nid); | ||
4907 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4952 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4908 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 4953 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
4909 | #endif | 4954 | #endif |
@@ -5492,7 +5537,7 @@ static void calculate_totalreserve_pages(void) | |||
5492 | for_each_online_pgdat(pgdat) { | 5537 | for_each_online_pgdat(pgdat) { |
5493 | for (i = 0; i < MAX_NR_ZONES; i++) { | 5538 | for (i = 0; i < MAX_NR_ZONES; i++) { |
5494 | struct zone *zone = pgdat->node_zones + i; | 5539 | struct zone *zone = pgdat->node_zones + i; |
5495 | unsigned long max = 0; | 5540 | long max = 0; |
5496 | 5541 | ||
5497 | /* Find valid and maximum lowmem_reserve in the zone */ | 5542 | /* Find valid and maximum lowmem_reserve in the zone */ |
5498 | for (j = i; j < MAX_NR_ZONES; j++) { | 5543 | for (j = i; j < MAX_NR_ZONES; j++) { |
@@ -5734,7 +5779,12 @@ module_init(init_per_zone_wmark_min) | |||
5734 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 5779 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
5735 | void __user *buffer, size_t *length, loff_t *ppos) | 5780 | void __user *buffer, size_t *length, loff_t *ppos) |
5736 | { | 5781 | { |
5737 | proc_dointvec(table, write, buffer, length, ppos); | 5782 | int rc; |
5783 | |||
5784 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
5785 | if (rc) | ||
5786 | return rc; | ||
5787 | |||
5738 | if (write) { | 5788 | if (write) { |
5739 | user_min_free_kbytes = min_free_kbytes; | 5789 | user_min_free_kbytes = min_free_kbytes; |
5740 | setup_per_zone_wmarks(); | 5790 | setup_per_zone_wmarks(); |
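The handler now respects the return code of proc_dointvec_minmax(), so a write rejected against the table's extra1/extra2 bounds no longer triggers a watermark recalculation on a stale value. The general shape of such a handler, with a hypothetical recompute step standing in for setup_per_zone_wmarks():

	static int example_sysctl_handler(ctl_table *table, int write,
					  void __user *buffer,
					  size_t *length, loff_t *ppos)
	{
		int rc;

		rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
		if (rc)
			return rc;	/* e.g. -EINVAL: out of bounds */

		if (write)
			recompute_derived_state();	/* hypothetical */
		return 0;
	}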
@@ -5976,17 +6026,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | |||
5976 | * @end_bitidx: The last bit of interest | 6026 | * @end_bitidx: The last bit of interest |
5977 | * returns pageblock_bits flags | 6027 | * returns pageblock_bits flags |
5978 | */ | 6028 | */ |
5979 | unsigned long get_pageblock_flags_mask(struct page *page, | 6029 | unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, |
5980 | unsigned long end_bitidx, | 6030 | unsigned long end_bitidx, |
5981 | unsigned long mask) | 6031 | unsigned long mask) |
5982 | { | 6032 | { |
5983 | struct zone *zone; | 6033 | struct zone *zone; |
5984 | unsigned long *bitmap; | 6034 | unsigned long *bitmap; |
5985 | unsigned long pfn, bitidx, word_bitidx; | 6035 | unsigned long bitidx, word_bitidx; |
5986 | unsigned long word; | 6036 | unsigned long word; |
5987 | 6037 | ||
5988 | zone = page_zone(page); | 6038 | zone = page_zone(page); |
5989 | pfn = page_to_pfn(page); | ||
5990 | bitmap = get_pageblock_bitmap(zone, pfn); | 6039 | bitmap = get_pageblock_bitmap(zone, pfn); |
5991 | bitidx = pfn_to_bitidx(zone, pfn); | 6040 | bitidx = pfn_to_bitidx(zone, pfn); |
5992 | word_bitidx = bitidx / BITS_PER_LONG; | 6041 | word_bitidx = bitidx / BITS_PER_LONG; |
@@ -5998,25 +6047,25 @@ unsigned long get_pageblock_flags_mask(struct page *page, | |||
5998 | } | 6047 | } |
5999 | 6048 | ||
6000 | /** | 6049 | /** |
6001 | * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages | 6050 | * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages |
6002 | * @page: The page within the block of interest | 6051 | * @page: The page within the block of interest |
6003 | * @start_bitidx: The first bit of interest | 6052 | * @start_bitidx: The first bit of interest |
6004 | * @end_bitidx: The last bit of interest | 6053 | * @end_bitidx: The last bit of interest |
6005 | * @flags: The flags to set | 6054 | * @flags: The flags to set |
6006 | */ | 6055 | */ |
6007 | void set_pageblock_flags_mask(struct page *page, unsigned long flags, | 6056 | void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
6057 | unsigned long pfn, | ||
6008 | unsigned long end_bitidx, | 6058 | unsigned long end_bitidx, |
6009 | unsigned long mask) | 6059 | unsigned long mask) |
6010 | { | 6060 | { |
6011 | struct zone *zone; | 6061 | struct zone *zone; |
6012 | unsigned long *bitmap; | 6062 | unsigned long *bitmap; |
6013 | unsigned long pfn, bitidx, word_bitidx; | 6063 | unsigned long bitidx, word_bitidx; |
6014 | unsigned long old_word, word; | 6064 | unsigned long old_word, word; |
6015 | 6065 | ||
6016 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); | 6066 | BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); |
6017 | 6067 | ||
6018 | zone = page_zone(page); | 6068 | zone = page_zone(page); |
6019 | pfn = page_to_pfn(page); | ||
6020 | bitmap = get_pageblock_bitmap(zone, pfn); | 6069 | bitmap = get_pageblock_bitmap(zone, pfn); |
6021 | bitidx = pfn_to_bitidx(zone, pfn); | 6070 | bitidx = pfn_to_bitidx(zone, pfn); |
6022 | word_bitidx = bitidx / BITS_PER_LONG; | 6071 | word_bitidx = bitidx / BITS_PER_LONG; |
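Both pageblock helpers now take the pfn from the caller, which already has it, instead of deriving it with page_to_pfn(); that keeps the lookup cheap on the free path, which now runs with IRQs enabled. The remainder of set_pfnblock_flags_mask(), below this hunk, updates the bitmap word lock-free; the shape of that loop is roughly:

	unsigned long word, old_word;

	word = ACCESS_ONCE(bitmap[word_bitidx]);
	for (;;) {
		/* install (word & ~mask) | flags unless someone raced us */
		old_word = cmpxchg(&bitmap[word_bitidx], word,
				   (word & ~mask) | flags);
		if (word == old_word)
			break;		/* our update won */
		word = old_word;	/* lost the race; retry on new value */
	}

(mask and flags are pre-shifted to the block's bit position before this point.)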
@@ -6194,7 +6243,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
6194 | cc->nr_migratepages -= nr_reclaimed; | 6243 | cc->nr_migratepages -= nr_reclaimed; |
6195 | 6244 | ||
6196 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, | 6245 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
6197 | 0, MIGRATE_SYNC, MR_CMA); | 6246 | NULL, 0, cc->mode, MR_CMA); |
6198 | } | 6247 | } |
6199 | if (ret < 0) { | 6248 | if (ret < 0) { |
6200 | putback_movable_pages(&cc->migratepages); | 6249 | putback_movable_pages(&cc->migratepages); |
@@ -6233,7 +6282,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
6233 | .nr_migratepages = 0, | 6282 | .nr_migratepages = 0, |
6234 | .order = -1, | 6283 | .order = -1, |
6235 | .zone = page_zone(pfn_to_page(start)), | 6284 | .zone = page_zone(pfn_to_page(start)), |
6236 | .sync = true, | 6285 | .mode = MIGRATE_SYNC, |
6237 | .ignore_skip_hint = true, | 6286 | .ignore_skip_hint = true, |
6238 | }; | 6287 | }; |
6239 | INIT_LIST_HEAD(&cc.migratepages); | 6288 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -6388,7 +6437,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6388 | { | 6437 | { |
6389 | struct page *page; | 6438 | struct page *page; |
6390 | struct zone *zone; | 6439 | struct zone *zone; |
6391 | int order, i; | 6440 | unsigned int order, i; |
6392 | unsigned long pfn; | 6441 | unsigned long pfn; |
6393 | unsigned long flags; | 6442 | unsigned long flags; |
6394 | /* find the first valid pfn */ | 6443 | /* find the first valid pfn */ |
@@ -6440,7 +6489,7 @@ bool is_free_buddy_page(struct page *page) | |||
6440 | struct zone *zone = page_zone(page); | 6489 | struct zone *zone = page_zone(page); |
6441 | unsigned long pfn = page_to_pfn(page); | 6490 | unsigned long pfn = page_to_pfn(page); |
6442 | unsigned long flags; | 6491 | unsigned long flags; |
6443 | int order; | 6492 | unsigned int order; |
6444 | 6493 | ||
6445 | spin_lock_irqsave(&zone->lock, flags); | 6494 | spin_lock_irqsave(&zone->lock, flags); |
6446 | for (order = 0; order < MAX_ORDER; order++) { | 6495 | for (order = 0; order < MAX_ORDER; order++) { |
diff --git a/mm/readahead.c b/mm/readahead.c index e4ed04149785..0f35e983bffb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -8,9 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/fs.h> | ||
12 | #include <linux/gfp.h> | 11 | #include <linux/gfp.h> |
13 | #include <linux/mm.h> | ||
14 | #include <linux/export.h> | 12 | #include <linux/export.h> |
15 | #include <linux/blkdev.h> | 13 | #include <linux/blkdev.h> |
16 | #include <linux/backing-dev.h> | 14 | #include <linux/backing-dev.h> |
@@ -20,6 +18,8 @@ | |||
20 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
21 | #include <linux/file.h> | 19 | #include <linux/file.h> |
22 | 20 | ||
21 | #include "internal.h" | ||
22 | |||
23 | /* | 23 | /* |
24 | * Initialise a struct file's readahead state. Assumes that the caller has | 24 | * Initialise a struct file's readahead state. Assumes that the caller has |
25 | * memset *ra to zero. | 25 | * memset *ra to zero. |
@@ -149,8 +149,7 @@ out: | |||
149 | * | 149 | * |
150 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | 150 | * Returns the number of pages requested, or the maximum amount of I/O allowed. |
151 | */ | 151 | */ |
152 | static int | 152 | int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
153 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
154 | pgoff_t offset, unsigned long nr_to_read, | 153 | pgoff_t offset, unsigned long nr_to_read, |
155 | unsigned long lookahead_size) | 154 | unsigned long lookahead_size) |
156 | { | 155 | { |
@@ -179,7 +178,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
179 | rcu_read_lock(); | 178 | rcu_read_lock(); |
180 | page = radix_tree_lookup(&mapping->page_tree, page_offset); | 179 | page = radix_tree_lookup(&mapping->page_tree, page_offset); |
181 | rcu_read_unlock(); | 180 | rcu_read_unlock(); |
182 | if (page) | 181 | if (page && !radix_tree_exceptional_entry(page)) |
183 | continue; | 182 | continue; |
184 | 183 | ||
185 | page = page_cache_alloc_readahead(mapping); | 184 | page = page_cache_alloc_readahead(mapping); |
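With shadow entries in the page cache tree, a slot can hold a non-NULL value that is not a page pointer, so the old `if (page)` test would wrongly treat a shadow-only slot as already cached and skip it. Exceptional entries are tagged pointers; the test is essentially this helper from linux/radix-tree.h:

	static inline int radix_tree_exceptional_entry(void *arg)
	{
		/* RADIX_TREE_EXCEPTIONAL_ENTRY is a low tag bit (value 2) */
		return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY;
	}

Readahead now treats such slots as holes and allocates a fresh page for them.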
@@ -237,28 +236,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
237 | return ret; | 236 | return ret; |
238 | } | 237 | } |
239 | 238 | ||
239 | #define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) | ||
240 | /* | 240 | /* |
241 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a | 241 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
242 | * sensible upper limit. | 242 | * sensible upper limit. |
243 | */ | 243 | */ |
244 | unsigned long max_sane_readahead(unsigned long nr) | 244 | unsigned long max_sane_readahead(unsigned long nr) |
245 | { | 245 | { |
246 | return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) | 246 | return min(nr, MAX_READAHEAD); |
247 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); | ||
248 | } | ||
249 | |||
250 | /* | ||
251 | * Submit IO for the read-ahead request in file_ra_state. | ||
252 | */ | ||
253 | unsigned long ra_submit(struct file_ra_state *ra, | ||
254 | struct address_space *mapping, struct file *filp) | ||
255 | { | ||
256 | int actual; | ||
257 | |||
258 | actual = __do_page_cache_readahead(mapping, filp, | ||
259 | ra->start, ra->size, ra->async_size); | ||
260 | |||
261 | return actual; | ||
262 | } | 247 | } |
263 | 248 | ||
264 | /* | 249 | /* |
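The old max_sane_readahead() capped readahead at half of the local node's free plus inactive-file pages, which evaluates to zero on a memoryless NUMA node and silently disabled readahead there. The replacement is a fixed ceiling; the arithmetic with common 4 KiB pages:

	/* MAX_READAHEAD = (512 * 4096) / PAGE_CACHE_SIZE
	 *               = (512 * 4096) / 4096
	 *               = 512 pages, i.e. a 2 MiB cap per request,
	 * independent of the current node's memory state. */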
@@ -351,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, | |||
351 | pgoff_t head; | 336 | pgoff_t head; |
352 | 337 | ||
353 | rcu_read_lock(); | 338 | rcu_read_lock(); |
354 | head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); | 339 | head = page_cache_prev_hole(mapping, offset - 1, max); |
355 | rcu_read_unlock(); | 340 | rcu_read_unlock(); |
356 | 341 | ||
357 | return offset - 1 - head; | 342 | return offset - 1 - head; |
@@ -401,6 +386,7 @@ ondemand_readahead(struct address_space *mapping, | |||
401 | unsigned long req_size) | 386 | unsigned long req_size) |
402 | { | 387 | { |
403 | unsigned long max = max_sane_readahead(ra->ra_pages); | 388 | unsigned long max = max_sane_readahead(ra->ra_pages); |
389 | pgoff_t prev_offset; | ||
404 | 390 | ||
405 | /* | 391 | /* |
406 | * start of file | 392 | * start of file |
@@ -430,7 +416,7 @@ ondemand_readahead(struct address_space *mapping, | |||
430 | pgoff_t start; | 416 | pgoff_t start; |
431 | 417 | ||
432 | rcu_read_lock(); | 418 | rcu_read_lock(); |
433 | start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); | 419 | start = page_cache_next_hole(mapping, offset + 1, max); |
434 | rcu_read_unlock(); | 420 | rcu_read_unlock(); |
435 | 421 | ||
436 | if (!start || start - offset > max) | 422 | if (!start || start - offset > max) |
@@ -452,8 +438,11 @@ ondemand_readahead(struct address_space *mapping, | |||
452 | 438 | ||
453 | /* | 439 | /* |
454 | * sequential cache miss | 440 | * sequential cache miss |
441 | * trivial case: (offset - prev_offset) == 1 | ||
442 | * unaligned reads: (offset - prev_offset) == 0 | ||
455 | */ | 443 | */ |
456 | if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) | 444 | prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; |
445 | if (offset - prev_offset <= 1UL) | ||
457 | goto initial_readahead; | 446 | goto initial_readahead; |
458 | 447 | ||
459 | /* | 448 | /* |
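The sequential-miss test now goes through an explicit prev_offset instead of shifting the 64-bit ra->prev_pos inline in the comparison. The two cases named in the new comment, with concrete numbers:

	/* prev_pos last ended inside page 41, so prev_offset == 41:
	 *   read starting in page 42     -> 42 - 41 == 1  (sequential)
	 *   unaligned re-read of page 41 -> 41 - 41 == 0  (sequential)
	 * both jump to initial_readahead; any larger delta is treated
	 * as a seek and falls through to context readahead below. */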
diff --git a/mm/shmem.c b/mm/shmem.c index 0da81aaeb4cc..ab05681f41cd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -243,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping, | |||
243 | pgoff_t index, void *expected, void *replacement) | 243 | pgoff_t index, void *expected, void *replacement) |
244 | { | 244 | { |
245 | void **pslot; | 245 | void **pslot; |
246 | void *item = NULL; | 246 | void *item; |
247 | 247 | ||
248 | VM_BUG_ON(!expected); | 248 | VM_BUG_ON(!expected); |
249 | VM_BUG_ON(!replacement); | ||
249 | pslot = radix_tree_lookup_slot(&mapping->page_tree, index); | 250 | pslot = radix_tree_lookup_slot(&mapping->page_tree, index); |
250 | if (pslot) | 251 | if (!pslot) |
251 | item = radix_tree_deref_slot_protected(pslot, | 252 | return -ENOENT; |
252 | &mapping->tree_lock); | 253 | item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); |
253 | if (item != expected) | 254 | if (item != expected) |
254 | return -ENOENT; | 255 | return -ENOENT; |
255 | if (replacement) | 256 | radix_tree_replace_slot(pslot, replacement); |
256 | radix_tree_replace_slot(pslot, replacement); | ||
257 | else | ||
258 | radix_tree_delete(&mapping->page_tree, index); | ||
259 | return 0; | 257 | return 0; |
260 | } | 258 | } |
261 | 259 | ||
@@ -332,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) | |||
332 | } | 330 | } |
333 | 331 | ||
334 | /* | 332 | /* |
335 | * Like find_get_pages, but collecting swap entries as well as pages. | ||
336 | */ | ||
337 | static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, | ||
338 | pgoff_t start, unsigned int nr_pages, | ||
339 | struct page **pages, pgoff_t *indices) | ||
340 | { | ||
341 | void **slot; | ||
342 | unsigned int ret = 0; | ||
343 | struct radix_tree_iter iter; | ||
344 | |||
345 | if (!nr_pages) | ||
346 | return 0; | ||
347 | |||
348 | rcu_read_lock(); | ||
349 | restart: | ||
350 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { | ||
351 | struct page *page; | ||
352 | repeat: | ||
353 | page = radix_tree_deref_slot(slot); | ||
354 | if (unlikely(!page)) | ||
355 | continue; | ||
356 | if (radix_tree_exception(page)) { | ||
357 | if (radix_tree_deref_retry(page)) | ||
358 | goto restart; | ||
359 | /* | ||
360 | * Otherwise, we must be storing a swap entry | ||
361 | * here as an exceptional entry: so return it | ||
362 | * without attempting to raise page count. | ||
363 | */ | ||
364 | goto export; | ||
365 | } | ||
366 | if (!page_cache_get_speculative(page)) | ||
367 | goto repeat; | ||
368 | |||
369 | /* Has the page moved? */ | ||
370 | if (unlikely(page != *slot)) { | ||
371 | page_cache_release(page); | ||
372 | goto repeat; | ||
373 | } | ||
374 | export: | ||
375 | indices[ret] = iter.index; | ||
376 | pages[ret] = page; | ||
377 | if (++ret == nr_pages) | ||
378 | break; | ||
379 | } | ||
380 | rcu_read_unlock(); | ||
381 | return ret; | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Remove swap entry from radix tree, free the swap and its page cache. | 333 | * Remove swap entry from radix tree, free the swap and its page cache. |
386 | */ | 334 | */ |
387 | static int shmem_free_swap(struct address_space *mapping, | 335 | static int shmem_free_swap(struct address_space *mapping, |
388 | pgoff_t index, void *radswap) | 336 | pgoff_t index, void *radswap) |
389 | { | 337 | { |
390 | int error; | 338 | void *old; |
391 | 339 | ||
392 | spin_lock_irq(&mapping->tree_lock); | 340 | spin_lock_irq(&mapping->tree_lock); |
393 | error = shmem_radix_tree_replace(mapping, index, radswap, NULL); | 341 | old = radix_tree_delete_item(&mapping->page_tree, index, radswap); |
394 | spin_unlock_irq(&mapping->tree_lock); | 342 | spin_unlock_irq(&mapping->tree_lock); |
395 | if (!error) | 343 | if (old != radswap) |
396 | free_swap_and_cache(radix_to_swp_entry(radswap)); | 344 | return -ENOENT; |
397 | return error; | 345 | free_swap_and_cache(radix_to_swp_entry(radswap)); |
398 | } | 346 | return 0; |
399 | |||
400 | /* | ||
401 | * Pagevec may contain swap entries, so shuffle up pages before releasing. | ||
402 | */ | ||
403 | static void shmem_deswap_pagevec(struct pagevec *pvec) | ||
404 | { | ||
405 | int i, j; | ||
406 | |||
407 | for (i = 0, j = 0; i < pagevec_count(pvec); i++) { | ||
408 | struct page *page = pvec->pages[i]; | ||
409 | if (!radix_tree_exceptional_entry(page)) | ||
410 | pvec->pages[j++] = page; | ||
411 | } | ||
412 | pvec->nr = j; | ||
413 | } | 347 | } |
414 | 348 | ||
415 | /* | 349 | /* |
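shmem_free_swap() used to do a lookup-compare-replace through shmem_radix_tree_replace(); it now leans on radix_tree_delete_item(), which performs the compare-and-delete in one tree walk. Its contract, paraphrased from the radix tree API:

	/* Delete the entry at @index only if it still equals @item.
	 * Returns the deleted entry, or NULL if the slot was empty.
	 * A return value != @item means we raced with someone else
	 * and must not free the swap entry ourselves. */
	void *radix_tree_delete_item(struct radix_tree_root *root,
				     unsigned long index, void *item);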
@@ -430,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping) | |||
430 | * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it | 364 | * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it |
431 | * has finished, if it hits a row of PAGEVEC_SIZE swap entries. | 365 | * has finished, if it hits a row of PAGEVEC_SIZE swap entries. |
432 | */ | 366 | */ |
433 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 367 | pvec.nr = find_get_entries(mapping, index, |
434 | PAGEVEC_SIZE, pvec.pages, indices); | 368 | PAGEVEC_SIZE, pvec.pages, indices); |
435 | if (!pvec.nr) | 369 | if (!pvec.nr) |
436 | break; | 370 | break; |
437 | index = indices[pvec.nr - 1] + 1; | 371 | index = indices[pvec.nr - 1] + 1; |
438 | shmem_deswap_pagevec(&pvec); | 372 | pagevec_remove_exceptionals(&pvec); |
439 | check_move_unevictable_pages(pvec.pages, pvec.nr); | 373 | check_move_unevictable_pages(pvec.pages, pvec.nr); |
440 | pagevec_release(&pvec); | 374 | pagevec_release(&pvec); |
441 | cond_resched(); | 375 | cond_resched(); |
@@ -467,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
467 | pagevec_init(&pvec, 0); | 401 | pagevec_init(&pvec, 0); |
468 | index = start; | 402 | index = start; |
469 | while (index < end) { | 403 | while (index < end) { |
470 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 404 | pvec.nr = find_get_entries(mapping, index, |
471 | min(end - index, (pgoff_t)PAGEVEC_SIZE), | 405 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
472 | pvec.pages, indices); | 406 | pvec.pages, indices); |
473 | if (!pvec.nr) | 407 | if (!pvec.nr) |
474 | break; | 408 | break; |
475 | mem_cgroup_uncharge_start(); | 409 | mem_cgroup_uncharge_start(); |
@@ -498,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
498 | } | 432 | } |
499 | unlock_page(page); | 433 | unlock_page(page); |
500 | } | 434 | } |
501 | shmem_deswap_pagevec(&pvec); | 435 | pagevec_remove_exceptionals(&pvec); |
502 | pagevec_release(&pvec); | 436 | pagevec_release(&pvec); |
503 | mem_cgroup_uncharge_end(); | 437 | mem_cgroup_uncharge_end(); |
504 | cond_resched(); | 438 | cond_resched(); |
@@ -536,9 +470,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
536 | index = start; | 470 | index = start; |
537 | while (index < end) { | 471 | while (index < end) { |
538 | cond_resched(); | 472 | cond_resched(); |
539 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 473 | |
474 | pvec.nr = find_get_entries(mapping, index, | ||
540 | min(end - index, (pgoff_t)PAGEVEC_SIZE), | 475 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
541 | pvec.pages, indices); | 476 | pvec.pages, indices); |
542 | if (!pvec.nr) { | 477 | if (!pvec.nr) { |
543 | /* If all gone or hole-punch or unfalloc, we're done */ | 478 | /* If all gone or hole-punch or unfalloc, we're done */ |
544 | if (index == start || end != -1) | 479 | if (index == start || end != -1) |
@@ -581,7 +516,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
581 | } | 516 | } |
582 | unlock_page(page); | 517 | unlock_page(page); |
583 | } | 518 | } |
584 | shmem_deswap_pagevec(&pvec); | 519 | pagevec_remove_exceptionals(&pvec); |
585 | pagevec_release(&pvec); | 520 | pagevec_release(&pvec); |
586 | mem_cgroup_uncharge_end(); | 521 | mem_cgroup_uncharge_end(); |
587 | index++; | 522 | index++; |
@@ -1090,7 +1025,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
1090 | return -EFBIG; | 1025 | return -EFBIG; |
1091 | repeat: | 1026 | repeat: |
1092 | swap.val = 0; | 1027 | swap.val = 0; |
1093 | page = find_lock_page(mapping, index); | 1028 | page = find_lock_entry(mapping, index); |
1094 | if (radix_tree_exceptional_entry(page)) { | 1029 | if (radix_tree_exceptional_entry(page)) { |
1095 | swap = radix_to_swp_entry(page); | 1030 | swap = radix_to_swp_entry(page); |
1096 | page = NULL; | 1031 | page = NULL; |
@@ -1102,6 +1037,9 @@ repeat: | |||
1102 | goto failed; | 1037 | goto failed; |
1103 | } | 1038 | } |
1104 | 1039 | ||
1040 | if (page && sgp == SGP_WRITE) | ||
1041 | mark_page_accessed(page); | ||
1042 | |||
1105 | /* fallocated page? */ | 1043 | /* fallocated page? */ |
1106 | if (page && !PageUptodate(page)) { | 1044 | if (page && !PageUptodate(page)) { |
1107 | if (sgp != SGP_READ) | 1045 | if (sgp != SGP_READ) |
@@ -1183,6 +1121,9 @@ repeat: | |||
1183 | shmem_recalc_inode(inode); | 1121 | shmem_recalc_inode(inode); |
1184 | spin_unlock(&info->lock); | 1122 | spin_unlock(&info->lock); |
1185 | 1123 | ||
1124 | if (sgp == SGP_WRITE) | ||
1125 | mark_page_accessed(page); | ||
1126 | |||
1186 | delete_from_swap_cache(page); | 1127 | delete_from_swap_cache(page); |
1187 | set_page_dirty(page); | 1128 | set_page_dirty(page); |
1188 | swap_free(swap); | 1129 | swap_free(swap); |
@@ -1207,8 +1148,11 @@ repeat: | |||
1207 | goto decused; | 1148 | goto decused; |
1208 | } | 1149 | } |
1209 | 1150 | ||
1210 | SetPageSwapBacked(page); | 1151 | __SetPageSwapBacked(page); |
1211 | __set_page_locked(page); | 1152 | __set_page_locked(page); |
1153 | if (sgp == SGP_WRITE) | ||
1154 | init_page_accessed(page); | ||
1155 | |||
1212 | error = mem_cgroup_cache_charge(page, current->mm, | 1156 | error = mem_cgroup_cache_charge(page, current->mm, |
1213 | gfp & GFP_RECLAIM_MASK); | 1157 | gfp & GFP_RECLAIM_MASK); |
1214 | if (error) | 1158 | if (error) |
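__SetPageSwapBacked() and the new init_page_accessed() call both rely on the same rule: non-atomic flag operations are legal only while the page is private to the allocating thread. A condensed sketch of the window in which they run (shmem_alloc_page() is the existing local helper):

	page = shmem_alloc_page(gfp, info, index);
	...
	__SetPageSwapBacked(page);	/* non-atomic: not yet visible */
	__set_page_locked(page);
	if (sgp == SGP_WRITE)
		init_page_accessed(page);	/* __SetPageReferenced() */
	/* only the page cache/LRU insertion that follows publishes the
	 * page; any later flag changes must use the atomic variants */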
@@ -1485,6 +1429,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1485 | return inode; | 1429 | return inode; |
1486 | } | 1430 | } |
1487 | 1431 | ||
1432 | bool shmem_mapping(struct address_space *mapping) | ||
1433 | { | ||
1434 | return mapping->backing_dev_info == &shmem_backing_dev_info; | ||
1435 | } | ||
1436 | |||
1488 | #ifdef CONFIG_TMPFS | 1437 | #ifdef CONFIG_TMPFS |
1489 | static const struct inode_operations shmem_symlink_inode_operations; | 1438 | static const struct inode_operations shmem_symlink_inode_operations; |
1490 | static const struct inode_operations shmem_short_symlink_operations; | 1439 | static const struct inode_operations shmem_short_symlink_operations; |
@@ -1797,7 +1746,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | |||
1797 | pagevec_init(&pvec, 0); | 1746 | pagevec_init(&pvec, 0); |
1798 | pvec.nr = 1; /* start small: we may be there already */ | 1747 | pvec.nr = 1; /* start small: we may be there already */ |
1799 | while (!done) { | 1748 | while (!done) { |
1800 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 1749 | pvec.nr = find_get_entries(mapping, index, |
1801 | pvec.nr, pvec.pages, indices); | 1750 | pvec.nr, pvec.pages, indices); |
1802 | if (!pvec.nr) { | 1751 | if (!pvec.nr) { |
1803 | if (whence == SEEK_DATA) | 1752 | if (whence == SEEK_DATA) |
@@ -1824,7 +1773,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | |||
1824 | break; | 1773 | break; |
1825 | } | 1774 | } |
1826 | } | 1775 | } |
1827 | shmem_deswap_pagevec(&pvec); | 1776 | pagevec_remove_exceptionals(&pvec); |
1828 | pagevec_release(&pvec); | 1777 | pagevec_release(&pvec); |
1829 | pvec.nr = PAGEVEC_SIZE; | 1778 | pvec.nr = PAGEVEC_SIZE; |
1830 | cond_resched(); | 1779 | cond_resched(); |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -930,7 +930,8 @@ static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | |||
930 | { | 930 | { |
931 | if (unlikely(pfmemalloc_active)) { | 931 | if (unlikely(pfmemalloc_active)) { |
932 | /* Some pfmemalloc slabs exist, check if this is one */ | 932 | /* Some pfmemalloc slabs exist, check if this is one */ |
933 | struct page *page = virt_to_head_page(objp); | 933 | struct slab *slabp = virt_to_slab(objp); |
934 | struct page *page = virt_to_head_page(slabp->s_mem); | ||
934 | if (PageSlabPfmemalloc(page)) | 935 | if (PageSlabPfmemalloc(page)) |
935 | set_obj_pfmemalloc(&objp); | 936 | set_obj_pfmemalloc(&objp); |
936 | } | 937 | } |
@@ -1776,7 +1777,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1776 | __SetPageSlab(page + i); | 1777 | __SetPageSlab(page + i); |
1777 | 1778 | ||
1778 | if (page->pfmemalloc) | 1779 | if (page->pfmemalloc) |
1779 | SetPageSlabPfmemalloc(page + i); | 1780 | SetPageSlabPfmemalloc(page); |
1780 | } | 1781 | } |
1781 | memcg_bind_pages(cachep, cachep->gfporder); | 1782 | memcg_bind_pages(cachep, cachep->gfporder); |
1782 | 1783 | ||
@@ -1809,9 +1810,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1809 | else | 1810 | else |
1810 | sub_zone_page_state(page_zone(page), | 1811 | sub_zone_page_state(page_zone(page), |
1811 | NR_SLAB_UNRECLAIMABLE, nr_freed); | 1812 | NR_SLAB_UNRECLAIMABLE, nr_freed); |
1813 | |||
1814 | __ClearPageSlabPfmemalloc(page); | ||
1812 | while (i--) { | 1815 | while (i--) { |
1813 | BUG_ON(!PageSlab(page)); | 1816 | BUG_ON(!PageSlab(page)); |
1814 | __ClearPageSlabPfmemalloc(page); | ||
1815 | __ClearPageSlab(page); | 1817 | __ClearPageSlab(page); |
1816 | page++; | 1818 | page++; |
1817 | } | 1819 | } |
@@ -3220,7 +3222,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3220 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3222 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3221 | 3223 | ||
3222 | retry_cpuset: | 3224 | retry_cpuset: |
3223 | cpuset_mems_cookie = get_mems_allowed(); | 3225 | cpuset_mems_cookie = read_mems_allowed_begin(); |
3224 | zonelist = node_zonelist(slab_node(), flags); | 3226 | zonelist = node_zonelist(slab_node(), flags); |
3225 | 3227 | ||
3226 | retry: | 3228 | retry: |
@@ -3276,7 +3278,7 @@ retry: | |||
3276 | } | 3278 | } |
3277 | } | 3279 | } |
3278 | 3280 | ||
3279 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) | 3281 | if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) |
3280 | goto retry_cpuset; | 3282 | goto retry_cpuset; |
3281 | return obj; | 3283 | return obj; |
3282 | } | 3284 | } |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -1635,7 +1635,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1635 | return NULL; | 1635 | return NULL; |
1636 | 1636 | ||
1637 | do { | 1637 | do { |
1638 | cpuset_mems_cookie = get_mems_allowed(); | 1638 | cpuset_mems_cookie = read_mems_allowed_begin(); |
1639 | zonelist = node_zonelist(slab_node(), flags); | 1639 | zonelist = node_zonelist(slab_node(), flags); |
1640 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1640 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1641 | struct kmem_cache_node *n; | 1641 | struct kmem_cache_node *n; |
@@ -1647,19 +1647,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1647 | object = get_partial_node(s, n, c, flags); | 1647 | object = get_partial_node(s, n, c, flags); |
1648 | if (object) { | 1648 | if (object) { |
1649 | /* | 1649 | /* |
1650 | * Return the object even if | 1650 | * Don't check read_mems_allowed_retry() |
1651 | * put_mems_allowed indicated that | 1651 | * here - if mems_allowed was updated in |
1652 | * the cpuset mems_allowed was | 1652 | * parallel, that was a harmless race |
1653 | * updated in parallel. It's a | 1653 | * between allocation and the cpuset |
1654 | * harmless race between the alloc | 1654 | * update |
1655 | * and the cpuset update. | ||
1656 | */ | 1655 | */ |
1657 | put_mems_allowed(cpuset_mems_cookie); | ||
1658 | return object; | 1656 | return object; |
1659 | } | 1657 | } |
1660 | } | 1658 | } |
1661 | } | 1659 | } |
1662 | } while (!put_mems_allowed(cpuset_mems_cookie)); | 1660 | } while (read_mems_allowed_retry(cpuset_mems_cookie)); |
1663 | #endif | 1661 | #endif |
1664 | return NULL; | 1662 | return NULL; |
1665 | } | 1663 | } |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -68,7 +68,7 @@ static void __page_cache_release(struct page *page) | |||
68 | static void __put_single_page(struct page *page) | 68 | static void __put_single_page(struct page *page) |
69 | { | 69 | { |
70 | __page_cache_release(page); | 70 | __page_cache_release(page); |
71 | free_hot_cold_page(page, 0); | 71 | free_hot_cold_page(page, false); |
72 | } | 72 | } |
73 | 73 | ||
74 | static void __put_compound_page(struct page *page) | 74 | static void __put_compound_page(struct page *page) |
@@ -437,7 +437,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, | |||
437 | SetPageActive(page); | 437 | SetPageActive(page); |
438 | lru += LRU_ACTIVE; | 438 | lru += LRU_ACTIVE; |
439 | add_page_to_lru_list(page, lruvec, lru); | 439 | add_page_to_lru_list(page, lruvec, lru); |
440 | trace_mm_lru_activate(page, page_to_pfn(page)); | 440 | trace_mm_lru_activate(page); |
441 | 441 | ||
442 | __count_vm_event(PGACTIVATE); | 442 | __count_vm_event(PGACTIVATE); |
443 | update_page_reclaim_stat(lruvec, file, 1); | 443 | update_page_reclaim_stat(lruvec, file, 1); |
@@ -549,12 +549,17 @@ void mark_page_accessed(struct page *page) | |||
549 | EXPORT_SYMBOL(mark_page_accessed); | 549 | EXPORT_SYMBOL(mark_page_accessed); |
550 | 550 | ||
551 | /* | 551 | /* |
552 | * Queue the page for addition to the LRU via pagevec. The decision on whether | 552 | * Used to mark_page_accessed() a page that is not yet visible, while it is
553 | * to add the page to the [in]active [file|anon] list is deferred until the | 553 | * still safe to use non-atomic ops |
554 | * pagevec is drained. This gives a chance for the caller of __lru_cache_add() | ||
555 | * have the page added to the active list using mark_page_accessed(). | ||
556 | */ | 554 | */ |
557 | void __lru_cache_add(struct page *page) | 555 | void init_page_accessed(struct page *page) |
556 | { | ||
557 | if (!PageReferenced(page)) | ||
558 | __SetPageReferenced(page); | ||
559 | } | ||
560 | EXPORT_SYMBOL(init_page_accessed); | ||
561 | |||
562 | static void __lru_cache_add(struct page *page) | ||
558 | { | 563 | { |
559 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); | 564 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); |
560 | 565 | ||
@@ -564,11 +569,34 @@ void __lru_cache_add(struct page *page) | |||
564 | pagevec_add(pvec, page); | 569 | pagevec_add(pvec, page); |
565 | put_cpu_var(lru_add_pvec); | 570 | put_cpu_var(lru_add_pvec); |
566 | } | 571 | } |
567 | EXPORT_SYMBOL(__lru_cache_add); | 572 | |
573 | /** | ||
574 | * lru_cache_add_anon: add a page to the page lists | ||
575 | * @page: the page to add | ||
576 | */ | ||
577 | void lru_cache_add_anon(struct page *page) | ||
578 | { | ||
579 | if (PageActive(page)) | ||
580 | ClearPageActive(page); | ||
581 | __lru_cache_add(page); | ||
582 | } | ||
583 | |||
584 | void lru_cache_add_file(struct page *page) | ||
585 | { | ||
586 | if (PageActive(page)) | ||
587 | ClearPageActive(page); | ||
588 | __lru_cache_add(page); | ||
589 | } | ||
590 | EXPORT_SYMBOL(lru_cache_add_file); | ||
568 | 591 | ||
569 | /** | 592 | /** |
570 | * lru_cache_add - add a page to a page list | 593 | * lru_cache_add - add a page to a page list |
571 | * @page: the page to be added to the LRU. | 594 | * @page: the page to be added to the LRU. |
595 | * | ||
596 | * Queue the page for addition to the LRU via pagevec. The decision on whether | ||
597 | * to add the page to the [in]active [file|anon] list is deferred until the | ||
598 | * pagevec is drained. This gives a chance for the caller of lru_cache_add() | ||
599 | * to have the page added to the active list using mark_page_accessed(). | ||
572 | */ | 600 | */ |
573 | void lru_cache_add(struct page *page) | 601 | void lru_cache_add(struct page *page) |
574 | { | 602 | { |
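init_page_accessed(), added just above, pre-loads the first step of the two-touch promotion implemented by mark_page_accessed(), so a freshly written shmem page is activated on its first touch after insertion rather than its second. The state machine, as a sketch:

	/* mark_page_accessed() on an inactive, evictable page:
	 *   PG_referenced clear -> set PG_referenced      (first touch)
	 *   PG_referenced set   -> activate + clear it    (second touch)
	 * init_page_accessed() sets PG_referenced non-atomically before
	 * the page is visible, standing in for the first touch. */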
@@ -779,7 +807,7 @@ void lru_add_drain_all(void) | |||
779 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() | 807 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() |
780 | * will free it. | 808 | * will free it. |
781 | */ | 809 | */ |
782 | void release_pages(struct page **pages, int nr, int cold) | 810 | void release_pages(struct page **pages, int nr, bool cold) |
783 | { | 811 | { |
784 | int i; | 812 | int i; |
785 | LIST_HEAD(pages_to_free); | 813 | LIST_HEAD(pages_to_free); |
@@ -820,7 +848,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
820 | } | 848 | } |
821 | 849 | ||
822 | /* Clear Active bit in case of parallel mark_page_accessed */ | 850 | /* Clear Active bit in case of parallel mark_page_accessed */ |
823 | ClearPageActive(page); | 851 | __ClearPageActive(page); |
824 | 852 | ||
825 | list_add(&page->lru, &pages_to_free); | 853 | list_add(&page->lru, &pages_to_free); |
826 | } | 854 | } |
@@ -902,7 +930,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, | |||
902 | SetPageLRU(page); | 930 | SetPageLRU(page); |
903 | add_page_to_lru_list(page, lruvec, lru); | 931 | add_page_to_lru_list(page, lruvec, lru); |
904 | update_page_reclaim_stat(lruvec, file, active); | 932 | update_page_reclaim_stat(lruvec, file, active); |
905 | trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); | 933 | trace_mm_lru_insertion(page, lru); |
906 | } | 934 | } |
907 | 935 | ||
908 | /* | 936 | /* |
@@ -916,6 +944,57 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
916 | EXPORT_SYMBOL(__pagevec_lru_add); | 944 | EXPORT_SYMBOL(__pagevec_lru_add); |
917 | 945 | ||
918 | /** | 946 | /** |
947 | * pagevec_lookup_entries - gang pagecache lookup | ||
948 | * @pvec: Where the resulting entries are placed | ||
949 | * @mapping: The address_space to search | ||
950 | * @start: The starting entry index | ||
951 | * @nr_entries: The maximum number of entries | ||
952 | * @indices: The cache indices corresponding to the entries in @pvec | ||
953 | * | ||
954 | * pagevec_lookup_entries() will search for and return a group of up | ||
955 | * to @nr_entries pages and shadow entries in the mapping. All | ||
956 | * entries are placed in @pvec. pagevec_lookup_entries() takes a | ||
957 | * reference against actual pages in @pvec. | ||
958 | * | ||
959 | * The search returns a group of mapping-contiguous entries with | ||
960 | * ascending indexes. There may be holes in the indices due to | ||
961 | * not-present entries. | ||
962 | * | ||
963 | * pagevec_lookup_entries() returns the number of entries which were | ||
964 | * found. | ||
965 | */ | ||
966 | unsigned pagevec_lookup_entries(struct pagevec *pvec, | ||
967 | struct address_space *mapping, | ||
968 | pgoff_t start, unsigned nr_entries, | ||
969 | pgoff_t *indices) | ||
970 | { | ||
971 | pvec->nr = find_get_entries(mapping, start, nr_entries, | ||
972 | pvec->pages, indices); | ||
973 | return pagevec_count(pvec); | ||
974 | } | ||
975 | |||
976 | /** | ||
977 | * pagevec_remove_exceptionals - pagevec exceptionals pruning | ||
978 | * @pvec: The pagevec to prune | ||
979 | * | ||
980 | * pagevec_lookup_entries() fills both pages and exceptional radix | ||
981 | * tree entries into the pagevec. This function prunes all | ||
982 | * exceptionals from @pvec without leaving holes, so that it can be | ||
983 | * passed on to page-only pagevec operations. | ||
984 | */ | ||
985 | void pagevec_remove_exceptionals(struct pagevec *pvec) | ||
986 | { | ||
987 | int i, j; | ||
988 | |||
989 | for (i = 0, j = 0; i < pagevec_count(pvec); i++) { | ||
990 | struct page *page = pvec->pages[i]; | ||
991 | if (!radix_tree_exceptional_entry(page)) | ||
992 | pvec->pages[j++] = page; | ||
993 | } | ||
994 | pvec->nr = j; | ||
995 | } | ||
996 | |||
997 | /** | ||
919 | * pagevec_lookup - gang pagecache lookup | 998 | * pagevec_lookup - gang pagecache lookup |
920 | * @pvec: Where the resulting pages are placed | 999 | * @pvec: Where the resulting pages are placed |
921 | * @mapping: The address_space to search | 1000 | * @mapping: The address_space to search |
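A typical consumer pairs the two new helpers in a scan loop; this hypothetical sketch mirrors the shmem conversions earlier in this diff:

	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = 0;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_entries(&pvec, mapping, index,
				      PAGEVEC_SIZE, indices)) {
		/* note where to resume before pruning the pagevec */
		index = indices[pagevec_count(&pvec) - 1] + 1;
		pagevec_remove_exceptionals(&pvec);  /* drop shadow entries */
		/* ... operate on the remaining real pages ... */
		pagevec_release(&pvec);
		cond_resched();
	}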
diff --git a/mm/swap_state.c b/mm/swap_state.c index e6f15f8ca2af..4079edfff2cc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void) | |||
63 | return ret; | 63 | return ret; |
64 | } | 64 | } |
65 | 65 | ||
66 | static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); | ||
67 | |||
66 | void show_swap_cache_info(void) | 68 | void show_swap_cache_info(void) |
67 | { | 69 | { |
68 | printk("%lu pages in swap cache\n", total_swapcache_pages()); | 70 | printk("%lu pages in swap cache\n", total_swapcache_pages()); |
@@ -268,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) | |||
268 | 270 | ||
269 | for (i = 0; i < todo; i++) | 271 | for (i = 0; i < todo; i++) |
270 | free_swap_cache(pagep[i]); | 272 | free_swap_cache(pagep[i]); |
271 | release_pages(pagep, todo, 0); | 273 | release_pages(pagep, todo, false); |
272 | pagep += todo; | 274 | pagep += todo; |
273 | nr -= todo; | 275 | nr -= todo; |
274 | } | 276 | } |
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry) | |||
286 | 288 | ||
287 | page = find_get_page(swap_address_space(entry), entry.val); | 289 | page = find_get_page(swap_address_space(entry), entry.val); |
288 | 290 | ||
289 | if (page) | 291 | if (page) { |
290 | INC_CACHE_INFO(find_success); | 292 | INC_CACHE_INFO(find_success); |
293 | if (TestClearPageReadahead(page)) | ||
294 | atomic_inc(&swapin_readahead_hits); | ||
295 | } | ||
291 | 296 | ||
292 | INC_CACHE_INFO(find_total); | 297 | INC_CACHE_INFO(find_total); |
293 | return page; | 298 | return page; |
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
389 | return found_page; | 394 | return found_page; |
390 | } | 395 | } |
391 | 396 | ||
397 | static unsigned long swapin_nr_pages(unsigned long offset) | ||
398 | { | ||
399 | static unsigned long prev_offset; | ||
400 | unsigned int pages, max_pages, last_ra; | ||
401 | static atomic_t last_readahead_pages; | ||
402 | |||
403 | max_pages = 1 << ACCESS_ONCE(page_cluster); | ||
404 | if (max_pages <= 1) | ||
405 | return 1; | ||
406 | |||
407 | /* | ||
408 | * This heuristic has been found to work well on both sequential and | ||
409 | * random loads, swapping to hard disk or to SSD: please don't ask | ||
410 | * what the "+ 2" means, it just happens to work well, that's all. | ||
411 | */ | ||
412 | pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; | ||
413 | if (pages == 2) { | ||
414 | /* | ||
415 | * We can have no readahead hits to judge by: but must not get | ||
416 | * stuck here forever, so check for an adjacent offset instead | ||
417 | * (and don't even bother to check whether swap type is same). | ||
418 | */ | ||
419 | if (offset != prev_offset + 1 && offset != prev_offset - 1) | ||
420 | pages = 1; | ||
421 | prev_offset = offset; | ||
422 | } else { | ||
423 | unsigned int roundup = 4; | ||
424 | while (roundup < pages) | ||
425 | roundup <<= 1; | ||
426 | pages = roundup; | ||
427 | } | ||
428 | |||
429 | if (pages > max_pages) | ||
430 | pages = max_pages; | ||
431 | |||
432 | /* Don't shrink readahead too fast */ | ||
433 | last_ra = atomic_read(&last_readahead_pages) / 2; | ||
434 | if (pages < last_ra) | ||
435 | pages = last_ra; | ||
436 | atomic_set(&last_readahead_pages, pages); | ||
437 | |||
438 | return pages; | ||
439 | } | ||
440 | |||
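To see what the heuristic does across a few faults, here is a single-threaded user-space model of swapin_nr_pages() (the atomics become plain variables, and page_cluster is assumed to be 3, the usual default; both simplifications are this sketch's, not the patch's):

    #include <stdio.h>

    static unsigned long prev_offset;
    static unsigned int last_readahead_pages;
    static unsigned int readahead_hits = 4;    /* initial optimism, as in the patch */
    static const unsigned int page_cluster = 3;

    static unsigned long swapin_nr_pages(unsigned long offset)
    {
        unsigned int pages, max_pages, last_ra;

        max_pages = 1 << page_cluster;    /* 8 */
        if (max_pages <= 1)
            return 1;

        /* consume the hit count; "+ 2" is the empirical fudge factor */
        pages = readahead_hits + 2;
        readahead_hits = 0;
        if (pages == 2) {
            /* no hits to judge by: only read ahead on adjacent faults */
            if (offset != prev_offset + 1 && offset != prev_offset - 1)
                pages = 1;
            prev_offset = offset;
        } else {
            unsigned int roundup = 4;
            while (roundup < pages)
                roundup <<= 1;
            pages = roundup;
        }

        if (pages > max_pages)
            pages = max_pages;

        /* don't shrink readahead too fast */
        last_ra = last_readahead_pages / 2;
        if (pages < last_ra)
            pages = last_ra;
        last_readahead_pages = pages;

        return pages;
    }

    int main(void)
    {
        /* 4 hits + 2 = 6, rounded up to 8, capped at max_pages = 8 */
        printf("%lu\n", swapin_nr_pages(100));    /* 8 */
        /* no hits, non-adjacent fault: held up by last_ra = 8/2 */
        printf("%lu\n", swapin_nr_pages(500));    /* 4 */
        /* still no hits: window keeps halving */
        printf("%lu\n", swapin_nr_pages(900));    /* 2 */
        return 0;
    }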
392 | /** | 441 | /** |
393 | * swapin_readahead - swap in pages in hope we need them soon | 442 | * swapin_readahead - swap in pages in hope we need them soon |
394 | * @entry: swap entry of this memory | 443 | * @entry: swap entry of this memory |
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
412 | struct vm_area_struct *vma, unsigned long addr) | 461 | struct vm_area_struct *vma, unsigned long addr) |
413 | { | 462 | { |
414 | struct page *page; | 463 | struct page *page; |
415 | unsigned long offset = swp_offset(entry); | 464 | unsigned long entry_offset = swp_offset(entry); |
465 | unsigned long offset = entry_offset; | ||
416 | unsigned long start_offset, end_offset; | 466 | unsigned long start_offset, end_offset; |
417 | unsigned long mask = (1UL << page_cluster) - 1; | 467 | unsigned long mask; |
418 | struct blk_plug plug; | 468 | struct blk_plug plug; |
419 | 469 | ||
470 | mask = swapin_nr_pages(offset) - 1; | ||
471 | if (!mask) | ||
472 | goto skip; | ||
473 | |||
420 | /* Read a page_cluster sized and aligned cluster around offset. */ | 474 | /* Read a page_cluster sized and aligned cluster around offset. */ |
421 | start_offset = offset & ~mask; | 475 | start_offset = offset & ~mask; |
422 | end_offset = offset | mask; | 476 | end_offset = offset | mask; |
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
430 | gfp_mask, vma, addr); | 484 | gfp_mask, vma, addr); |
431 | if (!page) | 485 | if (!page) |
432 | continue; | 486 | continue; |
487 | if (offset != entry_offset) | ||
488 | SetPageReadahead(page); | ||
433 | page_cache_release(page); | 489 | page_cache_release(page); |
434 | } | 490 | } |
435 | blk_finish_plug(&plug); | 491 | blk_finish_plug(&plug); |
436 | 492 | ||
437 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 493 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
494 | skip: | ||
438 | return read_swap_cache_async(entry, gfp_mask, vma, addr); | 495 | return read_swap_cache_async(entry, gfp_mask, vma, addr); |
439 | } | 496 | } |
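Note how the readahead window is placed: with the page count a power of two and mask = pages - 1, start_offset = offset & ~mask and end_offset = offset | mask bracket the faulting offset inside an aligned block. A quick check of the arithmetic, assuming an 8-page window and a fault at offset 21:

    #include <stdio.h>

    int main(void)
    {
        unsigned long offset = 21, mask = 8 - 1;    /* 8-page window */

        /* reads offsets 16..23; 21 is the synchronous fault, while
         * the other seven get SetPageReadahead() so that later hits
         * feed back into swapin_nr_pages() */
        printf("%lu..%lu\n", offset & ~mask, offset | mask);    /* 16..23 */
        return 0;
    }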
diff --git a/mm/swapfile.c b/mm/swapfile.c index 0ec2eaf3ccfd..660b9c0e2e40 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -51,14 +51,32 @@ atomic_long_t nr_swap_pages; | |||
51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | 51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ |
52 | long total_swap_pages; | 52 | long total_swap_pages; |
53 | static int least_priority; | 53 | static int least_priority; |
54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
55 | 54 | ||
56 | static const char Bad_file[] = "Bad swap file entry "; | 55 | static const char Bad_file[] = "Bad swap file entry "; |
57 | static const char Unused_file[] = "Unused swap file entry "; | 56 | static const char Unused_file[] = "Unused swap file entry "; |
58 | static const char Bad_offset[] = "Bad swap offset entry "; | 57 | static const char Bad_offset[] = "Bad swap offset entry "; |
59 | static const char Unused_offset[] = "Unused swap offset entry "; | 58 | static const char Unused_offset[] = "Unused swap offset entry "; |
60 | 59 | ||
61 | struct swap_list_t swap_list = {-1, -1}; | 60 | /* |
61 | * all active swap_info_structs | ||
62 | * protected with swap_lock, and ordered by priority. | ||
63 | */ | ||
64 | PLIST_HEAD(swap_active_head); | ||
65 | |||
66 | /* | ||
67 | * all available (active, not full) swap_info_structs | ||
68 | * protected with swap_avail_lock, ordered by priority. | ||
69 | * This is used by get_swap_page() instead of swap_active_head | ||
70 | * because swap_active_head includes all swap_info_structs, | ||
71 | * but get_swap_page() doesn't need to look at full ones. | ||
72 | * This uses its own lock instead of swap_lock because when a | ||
73 | * swap_info_struct changes between not-full/full, it needs to | ||
74 | * add/remove itself to/from this list, but the swap_info_struct->lock | ||
75 | * is held and the locking order requires swap_lock to be taken | ||
76 | * before any swap_info_struct->lock. | ||
77 | */ | ||
78 | static PLIST_HEAD(swap_avail_head); | ||
79 | static DEFINE_SPINLOCK(swap_avail_lock); | ||
62 | 80 | ||
63 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 81 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
64 | 82 | ||
@@ -591,6 +609,9 @@ checks: | |||
591 | if (si->inuse_pages == si->pages) { | 609 | if (si->inuse_pages == si->pages) { |
592 | si->lowest_bit = si->max; | 610 | si->lowest_bit = si->max; |
593 | si->highest_bit = 0; | 611 | si->highest_bit = 0; |
612 | spin_lock(&swap_avail_lock); | ||
613 | plist_del(&si->avail_list, &swap_avail_head); | ||
614 | spin_unlock(&swap_avail_lock); | ||
594 | } | 615 | } |
595 | si->swap_map[offset] = usage; | 616 | si->swap_map[offset] = usage; |
596 | inc_cluster_info_page(si, si->cluster_info, offset); | 617 | inc_cluster_info_page(si, si->cluster_info, offset); |
@@ -639,71 +660,65 @@ no_page: | |||
639 | 660 | ||
640 | swp_entry_t get_swap_page(void) | 661 | swp_entry_t get_swap_page(void) |
641 | { | 662 | { |
642 | struct swap_info_struct *si; | 663 | struct swap_info_struct *si, *next; |
643 | pgoff_t offset; | 664 | pgoff_t offset; |
644 | int type, next; | ||
645 | int wrapped = 0; | ||
646 | int hp_index; | ||
647 | 665 | ||
648 | spin_lock(&swap_lock); | ||
649 | if (atomic_long_read(&nr_swap_pages) <= 0) | 666 | if (atomic_long_read(&nr_swap_pages) <= 0) |
650 | goto noswap; | 667 | goto noswap; |
651 | atomic_long_dec(&nr_swap_pages); | 668 | atomic_long_dec(&nr_swap_pages); |
652 | 669 | ||
653 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 670 | spin_lock(&swap_avail_lock); |
654 | hp_index = atomic_xchg(&highest_priority_index, -1); | ||
655 | /* | ||
656 | * highest_priority_index records current highest priority swap | ||
657 | * type which just frees swap entries. If its priority is | ||
658 | * higher than that of swap_list.next swap type, we use it. It | ||
659 | * isn't protected by swap_lock, so it can be an invalid value | ||
660 | * if the corresponding swap type is swapoff. We double check | ||
661 | * the flags here. It's even possible the swap type is swapoff | ||
662 | * and swapon again and its priority is changed. In such rare | ||
663 | * case, low priority swap type might be used, but eventually | ||
663 | * case, low priority swap type might be used, but eventually | ||
664 | * high priority swap will be used after several rounds of | ||
665 | * swap. | ||
666 | */ | ||
667 | if (hp_index != -1 && hp_index != type && | ||
668 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
669 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
670 | type = hp_index; | ||
671 | swap_list.next = type; | ||
672 | } | ||
673 | |||
674 | si = swap_info[type]; | ||
675 | next = si->next; | ||
676 | if (next < 0 || | ||
677 | (!wrapped && si->prio != swap_info[next]->prio)) { | ||
678 | next = swap_list.head; | ||
679 | wrapped++; | ||
680 | } | ||
681 | 671 | ||
672 | start_over: | ||
673 | plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { | ||
674 | /* requeue si to after same-priority siblings */ | ||
675 | plist_requeue(&si->avail_list, &swap_avail_head); | ||
676 | spin_unlock(&swap_avail_lock); | ||
682 | spin_lock(&si->lock); | 677 | spin_lock(&si->lock); |
683 | if (!si->highest_bit) { | 678 | if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { |
679 | spin_lock(&swap_avail_lock); | ||
680 | if (plist_node_empty(&si->avail_list)) { | ||
681 | spin_unlock(&si->lock); | ||
682 | goto nextsi; | ||
683 | } | ||
684 | WARN(!si->highest_bit, | ||
685 | "swap_info %d in list but !highest_bit\n", | ||
686 | si->type); | ||
687 | WARN(!(si->flags & SWP_WRITEOK), | ||
688 | "swap_info %d in list but !SWP_WRITEOK\n", | ||
689 | si->type); | ||
690 | plist_del(&si->avail_list, &swap_avail_head); | ||
684 | spin_unlock(&si->lock); | 691 | spin_unlock(&si->lock); |
685 | continue; | 692 | goto nextsi; |
686 | } | 693 | } |
687 | if (!(si->flags & SWP_WRITEOK)) { | ||
688 | spin_unlock(&si->lock); | ||
689 | continue; | ||
690 | } | ||
691 | |||
692 | swap_list.next = next; | ||
693 | 694 | ||
694 | spin_unlock(&swap_lock); | ||
695 | /* This is called for allocating swap entry for cache */ | 695 | /* This is called for allocating swap entry for cache */ |
696 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 696 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
697 | spin_unlock(&si->lock); | 697 | spin_unlock(&si->lock); |
698 | if (offset) | 698 | if (offset) |
699 | return swp_entry(type, offset); | 699 | return swp_entry(si->type, offset); |
700 | spin_lock(&swap_lock); | 700 | pr_debug("scan_swap_map of si %d failed to find offset\n", |
701 | next = swap_list.next; | 701 | si->type); |
702 | spin_lock(&swap_avail_lock); | ||
703 | nextsi: | ||
704 | /* | ||
705 | * if we got here, it's likely that si was almost full before, | ||
706 | * and since scan_swap_map() can drop the si->lock, multiple | ||
707 | * callers probably all tried to get a page from the same si | ||
708 | * and it filled up before we could get one; or, the si filled | ||
709 | * up between us dropping swap_avail_lock and taking si->lock. | ||
710 | * Since we dropped the swap_avail_lock, the swap_avail_head | ||
711 | * list may have been modified; so if next is still in the | ||
712 | * swap_avail_head list then try it, otherwise start over. | ||
713 | */ | ||
714 | if (plist_node_empty(&next->avail_list)) | ||
715 | goto start_over; | ||
702 | } | 716 | } |
703 | 717 | ||
718 | spin_unlock(&swap_avail_lock); | ||
719 | |||
704 | atomic_long_inc(&nr_swap_pages); | 720 | atomic_long_inc(&nr_swap_pages); |
705 | noswap: | 721 | noswap: |
706 | spin_unlock(&swap_lock); | ||
707 | return (swp_entry_t) {0}; | 722 | return (swp_entry_t) {0}; |
708 | } | 723 | } |
709 | 724 | ||
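The plist_requeue() at the top of the loop is what rotates allocations across swap devices of equal priority: each pass moves the device just served behind its same-priority siblings, while higher-priority devices still always win. plist itself is kernel-internal, so the sketch below only mimics that ordering behaviour with a sorted array (device names and priorities are made up):

    #include <stdio.h>

    struct dev { int prio; char id; };

    /* Move v[i] behind the last entry of the same priority, the way
     * plist_requeue() reorders same-priority nodes. The array stays
     * sorted ascending by prio, like a plist. */
    static void requeue(struct dev *v, int n, int i)
    {
        struct dev tmp = v[i];
        int j = i;

        while (j + 1 < n && v[j + 1].prio == tmp.prio) {
            v[j] = v[j + 1];
            j++;
        }
        v[j] = tmp;
    }

    int main(void)
    {
        /* A and B at swap priority 5 (stored negated: -5),
         * C at swap priority 3 (stored as -3) */
        struct dev v[] = { { -5, 'A' }, { -5, 'B' }, { -3, 'C' } };
        int round;

        for (round = 0; round < 4; round++) {
            printf("allocate from %c\n", v[0].id);    /* A B A B */
            requeue(v, 3, 0);
        }
        return 0;
    }

C is only reached once both equal-priority devices fill up and drop off swap_avail_head.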
@@ -765,27 +780,6 @@ out: | |||
765 | return NULL; | 780 | return NULL; |
766 | } | 781 | } |
767 | 782 | ||
768 | /* | ||
769 | * This swap type frees swap entry, check if it is the highest priority swap | ||
770 | * type which just frees swap entry. get_swap_page() uses | ||
771 | * highest_priority_index to search highest priority swap type. The | ||
772 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
773 | * active, so we use atomic_cmpxchg. | ||
774 | */ | ||
775 | static void set_highest_priority_index(int type) | ||
776 | { | ||
777 | int old_hp_index, new_hp_index; | ||
778 | |||
779 | do { | ||
780 | old_hp_index = atomic_read(&highest_priority_index); | ||
781 | if (old_hp_index != -1 && | ||
782 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
783 | break; | ||
784 | new_hp_index = type; | ||
785 | } while (atomic_cmpxchg(&highest_priority_index, | ||
786 | old_hp_index, new_hp_index) != old_hp_index); | ||
787 | } | ||
788 | |||
789 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 783 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
790 | swp_entry_t entry, unsigned char usage) | 784 | swp_entry_t entry, unsigned char usage) |
791 | { | 785 | { |
@@ -827,9 +821,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
827 | dec_cluster_info_page(p, p->cluster_info, offset); | 821 | dec_cluster_info_page(p, p->cluster_info, offset); |
828 | if (offset < p->lowest_bit) | 822 | if (offset < p->lowest_bit) |
829 | p->lowest_bit = offset; | 823 | p->lowest_bit = offset; |
830 | if (offset > p->highest_bit) | 824 | if (offset > p->highest_bit) { |
825 | bool was_full = !p->highest_bit; | ||
831 | p->highest_bit = offset; | 826 | p->highest_bit = offset; |
832 | set_highest_priority_index(p->type); | 827 | if (was_full && (p->flags & SWP_WRITEOK)) { |
828 | spin_lock(&swap_avail_lock); | ||
829 | WARN_ON(!plist_node_empty(&p->avail_list)); | ||
830 | if (plist_node_empty(&p->avail_list)) | ||
831 | plist_add(&p->avail_list, | ||
832 | &swap_avail_head); | ||
833 | spin_unlock(&swap_avail_lock); | ||
834 | } | ||
835 | } | ||
833 | atomic_long_inc(&nr_swap_pages); | 836 | atomic_long_inc(&nr_swap_pages); |
834 | p->inuse_pages--; | 837 | p->inuse_pages--; |
835 | frontswap_invalidate_page(p->type, offset); | 838 | frontswap_invalidate_page(p->type, offset); |
@@ -1764,30 +1767,37 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1764 | unsigned char *swap_map, | 1767 | unsigned char *swap_map, |
1765 | struct swap_cluster_info *cluster_info) | 1768 | struct swap_cluster_info *cluster_info) |
1766 | { | 1769 | { |
1767 | int i, prev; | ||
1768 | |||
1769 | if (prio >= 0) | 1770 | if (prio >= 0) |
1770 | p->prio = prio; | 1771 | p->prio = prio; |
1771 | else | 1772 | else |
1772 | p->prio = --least_priority; | 1773 | p->prio = --least_priority; |
1774 | /* | ||
1775 | * the plist prio is negated because plist ordering is | ||
1776 | * low-to-high, while swap ordering is high-to-low | ||
1777 | */ | ||
1778 | p->list.prio = -p->prio; | ||
1779 | p->avail_list.prio = -p->prio; | ||
1773 | p->swap_map = swap_map; | 1780 | p->swap_map = swap_map; |
1774 | p->cluster_info = cluster_info; | 1781 | p->cluster_info = cluster_info; |
1775 | p->flags |= SWP_WRITEOK; | 1782 | p->flags |= SWP_WRITEOK; |
1776 | atomic_long_add(p->pages, &nr_swap_pages); | 1783 | atomic_long_add(p->pages, &nr_swap_pages); |
1777 | total_swap_pages += p->pages; | 1784 | total_swap_pages += p->pages; |
1778 | 1785 | ||
1779 | /* insert swap space into swap_list: */ | 1786 | assert_spin_locked(&swap_lock); |
1780 | prev = -1; | 1787 | /* |
1781 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { | 1788 | * both lists are plists, and thus priority ordered. |
1782 | if (p->prio >= swap_info[i]->prio) | 1789 | * swap_active_head needs to be priority ordered for swapoff(), |
1783 | break; | 1790 | * which on removal of any swap_info_struct with an auto-assigned |
1784 | prev = i; | 1791 | * (i.e. negative) priority increments the auto-assigned priority |
1785 | } | 1792 | * of any lower-priority swap_info_structs. |
1786 | p->next = i; | 1793 | * swap_avail_head needs to be priority ordered for get_swap_page(), |
1787 | if (prev < 0) | 1794 | * which allocates swap pages from the highest available priority |
1788 | swap_list.head = swap_list.next = p->type; | 1795 | * swap_info_struct. |
1789 | else | 1796 | */ |
1790 | swap_info[prev]->next = p->type; | 1797 | plist_add(&p->list, &swap_active_head); |
1798 | spin_lock(&swap_avail_lock); | ||
1799 | plist_add(&p->avail_list, &swap_avail_head); | ||
1800 | spin_unlock(&swap_avail_lock); | ||
1791 | } | 1801 | } |
1792 | 1802 | ||
1793 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1803 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
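The negation noted in the comment is easy to sanity-check: plists sort ascending, swap allocation wants descending priority, so storing -prio makes the plist head the highest-priority device. A small user-space check (qsort stands in for plist insertion order):

    #include <stdio.h>
    #include <stdlib.h>

    static int cmp(const void *a, const void *b)
    {
        return *(const int *)a - *(const int *)b;    /* ascending, like a plist */
    }

    int main(void)
    {
        int swap_prio[] = { 5, -1, 3 };    /* two explicit, one auto-assigned (-1) */
        int node_prio[3], i;

        for (i = 0; i < 3; i++)
            node_prio[i] = -swap_prio[i];    /* p->list.prio = -p->prio */
        qsort(node_prio, 3, sizeof(int), cmp);

        /* ascending node order -5, -3, 1 == descending swap order 5, 3, -1 */
        for (i = 0; i < 3; i++)
            printf("%d ", -node_prio[i]);    /* prints: 5 3 -1 */
        printf("\n");
        return 0;
    }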
@@ -1822,8 +1832,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1822 | struct address_space *mapping; | 1832 | struct address_space *mapping; |
1823 | struct inode *inode; | 1833 | struct inode *inode; |
1824 | struct filename *pathname; | 1834 | struct filename *pathname; |
1825 | int i, type, prev; | 1835 | int err, found = 0; |
1826 | int err; | ||
1827 | unsigned int old_block_size; | 1836 | unsigned int old_block_size; |
1828 | 1837 | ||
1829 | if (!capable(CAP_SYS_ADMIN)) | 1838 | if (!capable(CAP_SYS_ADMIN)) |
@@ -1841,17 +1850,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1841 | goto out; | 1850 | goto out; |
1842 | 1851 | ||
1843 | mapping = victim->f_mapping; | 1852 | mapping = victim->f_mapping; |
1844 | prev = -1; | ||
1845 | spin_lock(&swap_lock); | 1853 | spin_lock(&swap_lock); |
1846 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { | 1854 | plist_for_each_entry(p, &swap_active_head, list) { |
1847 | p = swap_info[type]; | ||
1848 | if (p->flags & SWP_WRITEOK) { | 1855 | if (p->flags & SWP_WRITEOK) { |
1849 | if (p->swap_file->f_mapping == mapping) | 1856 | if (p->swap_file->f_mapping == mapping) { |
1857 | found = 1; | ||
1850 | break; | 1858 | break; |
1859 | } | ||
1851 | } | 1860 | } |
1852 | prev = type; | ||
1853 | } | 1861 | } |
1854 | if (type < 0) { | 1862 | if (!found) { |
1855 | err = -EINVAL; | 1863 | err = -EINVAL; |
1856 | spin_unlock(&swap_lock); | 1864 | spin_unlock(&swap_lock); |
1857 | goto out_dput; | 1865 | goto out_dput; |
@@ -1863,20 +1871,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1863 | spin_unlock(&swap_lock); | 1871 | spin_unlock(&swap_lock); |
1864 | goto out_dput; | 1872 | goto out_dput; |
1865 | } | 1873 | } |
1866 | if (prev < 0) | 1874 | spin_lock(&swap_avail_lock); |
1867 | swap_list.head = p->next; | 1875 | plist_del(&p->avail_list, &swap_avail_head); |
1868 | else | 1876 | spin_unlock(&swap_avail_lock); |
1869 | swap_info[prev]->next = p->next; | ||
1870 | if (type == swap_list.next) { | ||
1871 | /* just pick something that's safe... */ | ||
1872 | swap_list.next = swap_list.head; | ||
1873 | } | ||
1874 | spin_lock(&p->lock); | 1877 | spin_lock(&p->lock); |
1875 | if (p->prio < 0) { | 1878 | if (p->prio < 0) { |
1876 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1879 | struct swap_info_struct *si = p; |
1877 | swap_info[i]->prio = p->prio--; | 1880 | |
1881 | plist_for_each_entry_continue(si, &swap_active_head, list) { | ||
1882 | si->prio++; | ||
1883 | si->list.prio--; | ||
1884 | si->avail_list.prio--; | ||
1885 | } | ||
1878 | least_priority++; | 1886 | least_priority++; |
1879 | } | 1887 | } |
1888 | plist_del(&p->list, &swap_active_head); | ||
1880 | atomic_long_sub(p->pages, &nr_swap_pages); | 1889 | atomic_long_sub(p->pages, &nr_swap_pages); |
1881 | total_swap_pages -= p->pages; | 1890 | total_swap_pages -= p->pages; |
1882 | p->flags &= ~SWP_WRITEOK; | 1891 | p->flags &= ~SWP_WRITEOK; |
@@ -1884,7 +1893,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1884 | spin_unlock(&swap_lock); | 1893 | spin_unlock(&swap_lock); |
1885 | 1894 | ||
1886 | set_current_oom_origin(); | 1895 | set_current_oom_origin(); |
1887 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ | 1896 | err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ |
1888 | clear_current_oom_origin(); | 1897 | clear_current_oom_origin(); |
1889 | 1898 | ||
1890 | if (err) { | 1899 | if (err) { |
@@ -1926,7 +1935,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1926 | frontswap_map_set(p, NULL); | 1935 | frontswap_map_set(p, NULL); |
1927 | spin_unlock(&p->lock); | 1936 | spin_unlock(&p->lock); |
1928 | spin_unlock(&swap_lock); | 1937 | spin_unlock(&swap_lock); |
1929 | frontswap_invalidate_area(type); | 1938 | frontswap_invalidate_area(p->type); |
1930 | mutex_unlock(&swapon_mutex); | 1939 | mutex_unlock(&swapon_mutex); |
1931 | free_percpu(p->percpu_cluster); | 1940 | free_percpu(p->percpu_cluster); |
1932 | p->percpu_cluster = NULL; | 1941 | p->percpu_cluster = NULL; |
@@ -1934,7 +1943,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1934 | vfree(cluster_info); | 1943 | vfree(cluster_info); |
1935 | vfree(frontswap_map); | 1944 | vfree(frontswap_map); |
1936 | /* Destroy swap account information */ | 1945 | /* Destroy swap account information */ |
1937 | swap_cgroup_swapoff(type); | 1946 | swap_cgroup_swapoff(p->type); |
1938 | 1947 | ||
1939 | inode = mapping->host; | 1948 | inode = mapping->host; |
1940 | if (S_ISBLK(inode->i_mode)) { | 1949 | if (S_ISBLK(inode->i_mode)) { |
@@ -2141,8 +2150,9 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2141 | */ | 2150 | */ |
2142 | } | 2151 | } |
2143 | INIT_LIST_HEAD(&p->first_swap_extent.list); | 2152 | INIT_LIST_HEAD(&p->first_swap_extent.list); |
2153 | plist_node_init(&p->list, 0); | ||
2154 | plist_node_init(&p->avail_list, 0); | ||
2144 | p->flags = SWP_USED; | 2155 | p->flags = SWP_USED; |
2145 | p->next = -1; | ||
2146 | spin_unlock(&swap_lock); | 2156 | spin_unlock(&swap_lock); |
2147 | spin_lock_init(&p->lock); | 2157 | spin_lock_init(&p->lock); |
2148 | 2158 | ||
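The plist_for_each_entry_continue() fixup in swapoff above keeps auto-assigned priorities dense: they always form the run -1, -2, -3, ... regardless of which device is removed. For example, removing the -2 device from four auto-prioritized devices (a standalone model; the real loop also adjusts the negated plist node priorities in step):

    #include <stdio.h>

    int main(void)
    {
        /* auto-assigned swap priorities, highest first */
        int prio[] = { -1, -2, -3, -4 };
        int removed = 1;    /* swapoff the -2 device */
        int i;

        /* every device *after* the removed one moves up one slot;
         * least_priority++ then frees the bottom slot */
        for (i = removed + 1; i < 4; i++)
            prio[i]++;

        for (i = 0; i < 4; i++)
            if (i != removed)
                printf("%d ", prio[i]);    /* prints: -1 -2 -3 */
        printf("\n");
        return 0;
    }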
diff --git a/mm/truncate.c b/mm/truncate.c index 353b683afd6e..2e84fe59190b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -22,6 +22,22 @@ | |||
22 | #include <linux/cleancache.h> | 22 | #include <linux/cleancache.h> |
23 | #include "internal.h" | 23 | #include "internal.h" |
24 | 24 | ||
25 | static void clear_exceptional_entry(struct address_space *mapping, | ||
26 | pgoff_t index, void *entry) | ||
27 | { | ||
28 | /* Handled by shmem itself */ | ||
29 | if (shmem_mapping(mapping)) | ||
30 | return; | ||
31 | |||
32 | spin_lock_irq(&mapping->tree_lock); | ||
33 | /* | ||
34 | * Regular page slots are stabilized by the page lock even | ||
35 | * without the tree itself locked. These unlocked entries | ||
36 | * need verification under the tree lock. | ||
37 | */ | ||
38 | radix_tree_delete_item(&mapping->page_tree, index, entry); | ||
39 | spin_unlock_irq(&mapping->tree_lock); | ||
40 | } | ||
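radix_tree_delete_item() behaves as a compare-and-delete: under the tree lock it removes the slot only if it still holds the entry the caller looked up without the lock, and does nothing if a real page has been instantiated there in the meantime. The same idea on a one-slot "tree" (illustrative only):

    #include <stdio.h>

    /* Delete *slot only if it still equals the entry we saw earlier,
     * mirroring the compare-and-delete in radix_tree_delete_item(). */
    static void *delete_item(void **slot, void *item)
    {
        if (*slot != item)
            return NULL;    /* lost the race: leave the new entry alone */
        *slot = NULL;
        return item;
    }

    int main(void)
    {
        int shadow = 42, page = 7;
        void *slot = &shadow;    /* the entry an unlocked lookup saw */

        slot = &page;            /* raced: a page replaced the shadow */
        printf("%s\n", delete_item(&slot, &shadow) ? "deleted" : "kept");
        return 0;
    }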
25 | 41 | ||
26 | /** | 42 | /** |
27 | * do_invalidatepage - invalidate part or all of a page | 43 | * do_invalidatepage - invalidate part or all of a page |
@@ -208,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
208 | unsigned int partial_start; /* inclusive */ | 224 | unsigned int partial_start; /* inclusive */ |
209 | unsigned int partial_end; /* exclusive */ | 225 | unsigned int partial_end; /* exclusive */ |
210 | struct pagevec pvec; | 226 | struct pagevec pvec; |
227 | pgoff_t indices[PAGEVEC_SIZE]; | ||
211 | pgoff_t index; | 228 | pgoff_t index; |
212 | int i; | 229 | int i; |
213 | 230 | ||
@@ -238,17 +255,23 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
238 | 255 | ||
239 | pagevec_init(&pvec, 0); | 256 | pagevec_init(&pvec, 0); |
240 | index = start; | 257 | index = start; |
241 | while (index < end && pagevec_lookup(&pvec, mapping, index, | 258 | while (index < end && pagevec_lookup_entries(&pvec, mapping, index, |
242 | min(end - index, (pgoff_t)PAGEVEC_SIZE))) { | 259 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
260 | indices)) { | ||
243 | mem_cgroup_uncharge_start(); | 261 | mem_cgroup_uncharge_start(); |
244 | for (i = 0; i < pagevec_count(&pvec); i++) { | 262 | for (i = 0; i < pagevec_count(&pvec); i++) { |
245 | struct page *page = pvec.pages[i]; | 263 | struct page *page = pvec.pages[i]; |
246 | 264 | ||
247 | /* We rely upon deletion not changing page->index */ | 265 | /* We rely upon deletion not changing page->index */ |
248 | index = page->index; | 266 | index = indices[i]; |
249 | if (index >= end) | 267 | if (index >= end) |
250 | break; | 268 | break; |
251 | 269 | ||
270 | if (radix_tree_exceptional_entry(page)) { | ||
271 | clear_exceptional_entry(mapping, index, page); | ||
272 | continue; | ||
273 | } | ||
274 | |||
252 | if (!trylock_page(page)) | 275 | if (!trylock_page(page)) |
253 | continue; | 276 | continue; |
254 | WARN_ON(page->index != index); | 277 | WARN_ON(page->index != index); |
@@ -259,6 +282,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
259 | truncate_inode_page(mapping, page); | 282 | truncate_inode_page(mapping, page); |
260 | unlock_page(page); | 283 | unlock_page(page); |
261 | } | 284 | } |
285 | pagevec_remove_exceptionals(&pvec); | ||
262 | pagevec_release(&pvec); | 286 | pagevec_release(&pvec); |
263 | mem_cgroup_uncharge_end(); | 287 | mem_cgroup_uncharge_end(); |
264 | cond_resched(); | 288 | cond_resched(); |
@@ -307,14 +331,16 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
307 | index = start; | 331 | index = start; |
308 | for ( ; ; ) { | 332 | for ( ; ; ) { |
309 | cond_resched(); | 333 | cond_resched(); |
310 | if (!pagevec_lookup(&pvec, mapping, index, | 334 | if (!pagevec_lookup_entries(&pvec, mapping, index, |
311 | min(end - index, (pgoff_t)PAGEVEC_SIZE))) { | 335 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
336 | indices)) { | ||
312 | if (index == start) | 337 | if (index == start) |
313 | break; | 338 | break; |
314 | index = start; | 339 | index = start; |
315 | continue; | 340 | continue; |
316 | } | 341 | } |
317 | if (index == start && pvec.pages[0]->index >= end) { | 342 | if (index == start && indices[0] >= end) { |
343 | pagevec_remove_exceptionals(&pvec); | ||
318 | pagevec_release(&pvec); | 344 | pagevec_release(&pvec); |
319 | break; | 345 | break; |
320 | } | 346 | } |
@@ -323,16 +349,22 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
323 | struct page *page = pvec.pages[i]; | 349 | struct page *page = pvec.pages[i]; |
324 | 350 | ||
325 | /* We rely upon deletion not changing page->index */ | 351 | /* We rely upon deletion not changing page->index */ |
326 | index = page->index; | 352 | index = indices[i]; |
327 | if (index >= end) | 353 | if (index >= end) |
328 | break; | 354 | break; |
329 | 355 | ||
356 | if (radix_tree_exceptional_entry(page)) { | ||
357 | clear_exceptional_entry(mapping, index, page); | ||
358 | continue; | ||
359 | } | ||
360 | |||
330 | lock_page(page); | 361 | lock_page(page); |
331 | WARN_ON(page->index != index); | 362 | WARN_ON(page->index != index); |
332 | wait_on_page_writeback(page); | 363 | wait_on_page_writeback(page); |
333 | truncate_inode_page(mapping, page); | 364 | truncate_inode_page(mapping, page); |
334 | unlock_page(page); | 365 | unlock_page(page); |
335 | } | 366 | } |
367 | pagevec_remove_exceptionals(&pvec); | ||
336 | pagevec_release(&pvec); | 368 | pagevec_release(&pvec); |
337 | mem_cgroup_uncharge_end(); | 369 | mem_cgroup_uncharge_end(); |
338 | index++; | 370 | index++; |
@@ -375,6 +407,7 @@ EXPORT_SYMBOL(truncate_inode_pages); | |||
375 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | 407 | unsigned long invalidate_mapping_pages(struct address_space *mapping, |
376 | pgoff_t start, pgoff_t end) | 408 | pgoff_t start, pgoff_t end) |
377 | { | 409 | { |
410 | pgoff_t indices[PAGEVEC_SIZE]; | ||
378 | struct pagevec pvec; | 411 | struct pagevec pvec; |
379 | pgoff_t index = start; | 412 | pgoff_t index = start; |
380 | unsigned long ret; | 413 | unsigned long ret; |
@@ -390,17 +423,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
390 | */ | 423 | */ |
391 | 424 | ||
392 | pagevec_init(&pvec, 0); | 425 | pagevec_init(&pvec, 0); |
393 | while (index <= end && pagevec_lookup(&pvec, mapping, index, | 426 | while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, |
394 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 427 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, |
428 | indices)) { | ||
395 | mem_cgroup_uncharge_start(); | 429 | mem_cgroup_uncharge_start(); |
396 | for (i = 0; i < pagevec_count(&pvec); i++) { | 430 | for (i = 0; i < pagevec_count(&pvec); i++) { |
397 | struct page *page = pvec.pages[i]; | 431 | struct page *page = pvec.pages[i]; |
398 | 432 | ||
399 | /* We rely upon deletion not changing page->index */ | 433 | /* We rely upon deletion not changing page->index */ |
400 | index = page->index; | 434 | index = indices[i]; |
401 | if (index > end) | 435 | if (index > end) |
402 | break; | 436 | break; |
403 | 437 | ||
438 | if (radix_tree_exceptional_entry(page)) { | ||
439 | clear_exceptional_entry(mapping, index, page); | ||
440 | continue; | ||
441 | } | ||
442 | |||
404 | if (!trylock_page(page)) | 443 | if (!trylock_page(page)) |
405 | continue; | 444 | continue; |
406 | WARN_ON(page->index != index); | 445 | WARN_ON(page->index != index); |
@@ -414,6 +453,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
414 | deactivate_page(page); | 453 | deactivate_page(page); |
415 | count += ret; | 454 | count += ret; |
416 | } | 455 | } |
456 | pagevec_remove_exceptionals(&pvec); | ||
417 | pagevec_release(&pvec); | 457 | pagevec_release(&pvec); |
418 | mem_cgroup_uncharge_end(); | 458 | mem_cgroup_uncharge_end(); |
419 | cond_resched(); | 459 | cond_resched(); |
@@ -481,6 +521,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) | |||
481 | int invalidate_inode_pages2_range(struct address_space *mapping, | 521 | int invalidate_inode_pages2_range(struct address_space *mapping, |
482 | pgoff_t start, pgoff_t end) | 522 | pgoff_t start, pgoff_t end) |
483 | { | 523 | { |
524 | pgoff_t indices[PAGEVEC_SIZE]; | ||
484 | struct pagevec pvec; | 525 | struct pagevec pvec; |
485 | pgoff_t index; | 526 | pgoff_t index; |
486 | int i; | 527 | int i; |
@@ -491,17 +532,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
491 | cleancache_invalidate_inode(mapping); | 532 | cleancache_invalidate_inode(mapping); |
492 | pagevec_init(&pvec, 0); | 533 | pagevec_init(&pvec, 0); |
493 | index = start; | 534 | index = start; |
494 | while (index <= end && pagevec_lookup(&pvec, mapping, index, | 535 | while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, |
495 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 536 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, |
537 | indices)) { | ||
496 | mem_cgroup_uncharge_start(); | 538 | mem_cgroup_uncharge_start(); |
497 | for (i = 0; i < pagevec_count(&pvec); i++) { | 539 | for (i = 0; i < pagevec_count(&pvec); i++) { |
498 | struct page *page = pvec.pages[i]; | 540 | struct page *page = pvec.pages[i]; |
499 | 541 | ||
500 | /* We rely upon deletion not changing page->index */ | 542 | /* We rely upon deletion not changing page->index */ |
501 | index = page->index; | 543 | index = indices[i]; |
502 | if (index > end) | 544 | if (index > end) |
503 | break; | 545 | break; |
504 | 546 | ||
547 | if (radix_tree_exceptional_entry(page)) { | ||
548 | clear_exceptional_entry(mapping, index, page); | ||
549 | continue; | ||
550 | } | ||
551 | |||
505 | lock_page(page); | 552 | lock_page(page); |
506 | WARN_ON(page->index != index); | 553 | WARN_ON(page->index != index); |
507 | if (page->mapping != mapping) { | 554 | if (page->mapping != mapping) { |
@@ -539,6 +586,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
539 | ret = ret2; | 586 | ret = ret2; |
540 | unlock_page(page); | 587 | unlock_page(page); |
541 | } | 588 | } |
589 | pagevec_remove_exceptionals(&pvec); | ||
542 | pagevec_release(&pvec); | 590 | pagevec_release(&pvec); |
543 | mem_cgroup_uncharge_end(); | 591 | mem_cgroup_uncharge_end(); |
544 | cond_resched(); | 592 | cond_resched(); |
diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 000000000000..1037a3bab505 --- /dev/null +++ b/mm/vmacache.c | |||
@@ -0,0 +1,114 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2014 Davidlohr Bueso. | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mm.h> | ||
6 | #include <linux/vmacache.h> | ||
7 | |||
8 | /* | ||
9 | * Flush vma caches for threads that share a given mm. | ||
10 | * | ||
11 | * The operation is safe because the caller holds the mmap_sem | ||
12 | * exclusively and other threads accessing the vma cache will | ||
13 | * have mmap_sem held at least for read, so no extra locking | ||
14 | * is required to maintain the vma cache. | ||
15 | */ | ||
16 | void vmacache_flush_all(struct mm_struct *mm) | ||
17 | { | ||
18 | struct task_struct *g, *p; | ||
19 | |||
20 | rcu_read_lock(); | ||
21 | for_each_process_thread(g, p) { | ||
22 | /* | ||
23 | * Only flush the vmacache pointers as the | ||
24 | * mm seqnum is already set and curr's will | ||
25 | * be set upon invalidation when the next | ||
26 | * lookup is done. | ||
27 | */ | ||
28 | if (mm == p->mm) | ||
29 | vmacache_flush(p); | ||
30 | } | ||
31 | rcu_read_unlock(); | ||
32 | } | ||
33 | |||
34 | /* | ||
35 | * This task may be accessing a foreign mm via (for example) | ||
36 | * get_user_pages()->find_vma(). The vmacache is task-local and this | ||
37 | * task's vmacache pertains to a different mm (ie, its own). There is | ||
38 | * nothing we can do here. | ||
39 | * | ||
40 | * Also handle the case where a kernel thread has adopted this mm via use_mm(). | ||
41 | * That kernel thread's vmacache is not applicable to this mm. | ||
42 | */ | ||
43 | static bool vmacache_valid_mm(struct mm_struct *mm) | ||
44 | { | ||
45 | return current->mm == mm && !(current->flags & PF_KTHREAD); | ||
46 | } | ||
47 | |||
48 | void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) | ||
49 | { | ||
50 | if (vmacache_valid_mm(newvma->vm_mm)) | ||
51 | current->vmacache[VMACACHE_HASH(addr)] = newvma; | ||
52 | } | ||
53 | |||
54 | static bool vmacache_valid(struct mm_struct *mm) | ||
55 | { | ||
56 | struct task_struct *curr; | ||
57 | |||
58 | if (!vmacache_valid_mm(mm)) | ||
59 | return false; | ||
60 | |||
61 | curr = current; | ||
62 | if (mm->vmacache_seqnum != curr->vmacache_seqnum) { | ||
63 | /* | ||
64 | * First attempt will always be invalid, initialize | ||
65 | * the new cache for this task here. | ||
66 | */ | ||
67 | curr->vmacache_seqnum = mm->vmacache_seqnum; | ||
68 | vmacache_flush(curr); | ||
69 | return false; | ||
70 | } | ||
71 | return true; | ||
72 | } | ||
73 | |||
74 | struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) | ||
75 | { | ||
76 | int i; | ||
77 | |||
78 | if (!vmacache_valid(mm)) | ||
79 | return NULL; | ||
80 | |||
81 | for (i = 0; i < VMACACHE_SIZE; i++) { | ||
82 | struct vm_area_struct *vma = current->vmacache[i]; | ||
83 | |||
84 | if (!vma) | ||
85 | continue; | ||
86 | if (WARN_ON_ONCE(vma->vm_mm != mm)) | ||
87 | break; | ||
88 | if (vma->vm_start <= addr && vma->vm_end > addr) | ||
89 | return vma; | ||
90 | } | ||
91 | |||
92 | return NULL; | ||
93 | } | ||
94 | |||
95 | #ifndef CONFIG_MMU | ||
96 | struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, | ||
97 | unsigned long start, | ||
98 | unsigned long end) | ||
99 | { | ||
100 | int i; | ||
101 | |||
102 | if (!vmacache_valid(mm)) | ||
103 | return NULL; | ||
104 | |||
105 | for (i = 0; i < VMACACHE_SIZE; i++) { | ||
106 | struct vm_area_struct *vma = current->vmacache[i]; | ||
107 | |||
108 | if (vma && vma->vm_start == start && vma->vm_end == end) | ||
109 | return vma; | ||
110 | } | ||
111 | |||
112 | return NULL; | ||
113 | } | ||
114 | #endif | ||
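The cache geometry lives in the header rather than this new file; in the kernels this landed in, VMACACHE_SIZE is 4 and VMACACHE_HASH(addr) is essentially (addr >> PAGE_SHIFT) & (VMACACHE_SIZE - 1), so treat those values below as assumptions. A user-space model of the update/lookup pair:

    #include <stdio.h>

    #define PAGE_SHIFT       12
    #define VMACACHE_SIZE    4    /* assumed, matching the upstream header */
    #define VMACACHE_HASH(a) (((a) >> PAGE_SHIFT) & (VMACACHE_SIZE - 1))

    struct vma { unsigned long start, end; };

    static struct vma *cache[VMACACHE_SIZE];

    static void update(unsigned long addr, struct vma *v)
    {
        cache[VMACACHE_HASH(addr)] = v;
    }

    static struct vma *find(unsigned long addr)
    {
        int i;

        /* the hash picks the slot on update, but lookup still scans
         * all four: one vma can span many hash buckets */
        for (i = 0; i < VMACACHE_SIZE; i++) {
            struct vma *v = cache[i];

            if (v && v->start <= addr && v->end > addr)
                return v;
        }
        return NULL;
    }

    int main(void)
    {
        struct vma v = { 0x400000, 0x600000 };

        update(0x401000, &v);
        printf("%d\n", find(0x405000) != NULL);    /* 1: cache hit */
        printf("%d\n", find(0x700000) != NULL);    /* 0: miss */
        return 0;
    }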
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e2be0f802ccf..060dc366ac44 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2685,14 +2685,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) | |||
2685 | 2685 | ||
2686 | prev_end = VMALLOC_START; | 2686 | prev_end = VMALLOC_START; |
2687 | 2687 | ||
2688 | spin_lock(&vmap_area_lock); | 2688 | rcu_read_lock(); |
2689 | 2689 | ||
2690 | if (list_empty(&vmap_area_list)) { | 2690 | if (list_empty(&vmap_area_list)) { |
2691 | vmi->largest_chunk = VMALLOC_TOTAL; | 2691 | vmi->largest_chunk = VMALLOC_TOTAL; |
2692 | goto out; | 2692 | goto out; |
2693 | } | 2693 | } |
2694 | 2694 | ||
2695 | list_for_each_entry(va, &vmap_area_list, list) { | 2695 | list_for_each_entry_rcu(va, &vmap_area_list, list) { |
2696 | unsigned long addr = va->va_start; | 2696 | unsigned long addr = va->va_start; |
2697 | 2697 | ||
2698 | /* | 2698 | /* |
@@ -2719,7 +2719,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) | |||
2719 | vmi->largest_chunk = VMALLOC_END - prev_end; | 2719 | vmi->largest_chunk = VMALLOC_END - prev_end; |
2720 | 2720 | ||
2721 | out: | 2721 | out: |
2722 | spin_unlock(&vmap_area_lock); | 2722 | rcu_read_unlock(); |
2723 | } | 2723 | } |
2724 | #endif | 2724 | #endif |
2725 | 2725 | ||
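Dropping vmap_area_lock here is only safe because vmap_area_list is already maintained with the RCU list primitives on the writer side; the reader then needs nothing beyond the pattern below. This is a kernel-style sketch of that pattern, not runnable user-space code, and the writer-side details (list_add_rcu()/list_del_rcu()/kfree_rcu() on struct vmap_area) are assumed from the upstream file rather than shown in this hunk:

    /* writer, still under vmap_area_lock */
    spin_lock(&vmap_area_lock);
    list_add_rcu(&va->list, &prev->list);    /* publish the new node */
    spin_unlock(&vmap_area_lock);
    /* ... and on free ... */
    list_del_rcu(&va->list);
    kfree_rcu(va, rcu_head);                 /* reclaim after a grace period */

    /* reader, as get_vmalloc_info() now does: lock-free, and a
     * slightly stale snapshot is acceptable for /proc statistics */
    rcu_read_lock();
    list_for_each_entry_rcu(va, &vmap_area_list, list)
        vmi->used += va->va_end - va->va_start;
    rcu_read_unlock();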
diff --git a/mm/vmscan.c b/mm/vmscan.c index 5ad29b2925a0..5461d02ea718 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -163,7 +163,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) | |||
163 | 163 | ||
164 | bool zone_reclaimable(struct zone *zone) | 164 | bool zone_reclaimable(struct zone *zone) |
165 | { | 165 | { |
166 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; | 166 | return zone_page_state(zone, NR_PAGES_SCANNED) < |
167 | zone_reclaimable_pages(zone) * 6; | ||
167 | } | 168 | } |
168 | 169 | ||
169 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) | 170 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
@@ -224,15 +225,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
224 | unsigned long freed = 0; | 225 | unsigned long freed = 0; |
225 | unsigned long long delta; | 226 | unsigned long long delta; |
226 | long total_scan; | 227 | long total_scan; |
227 | long max_pass; | 228 | long freeable; |
228 | long nr; | 229 | long nr; |
229 | long new_nr; | 230 | long new_nr; |
230 | int nid = shrinkctl->nid; | 231 | int nid = shrinkctl->nid; |
231 | long batch_size = shrinker->batch ? shrinker->batch | 232 | long batch_size = shrinker->batch ? shrinker->batch |
232 | : SHRINK_BATCH; | 233 | : SHRINK_BATCH; |
233 | 234 | ||
234 | max_pass = shrinker->count_objects(shrinker, shrinkctl); | 235 | freeable = shrinker->count_objects(shrinker, shrinkctl); |
235 | if (max_pass == 0) | 236 | if (freeable == 0) |
236 | return 0; | 237 | return 0; |
237 | 238 | ||
238 | /* | 239 | /* |
@@ -244,14 +245,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
244 | 245 | ||
245 | total_scan = nr; | 246 | total_scan = nr; |
246 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 247 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
247 | delta *= max_pass; | 248 | delta *= freeable; |
248 | do_div(delta, lru_pages + 1); | 249 | do_div(delta, lru_pages + 1); |
249 | total_scan += delta; | 250 | total_scan += delta; |
250 | if (total_scan < 0) { | 251 | if (total_scan < 0) { |
251 | printk(KERN_ERR | 252 | printk(KERN_ERR |
252 | "shrink_slab: %pF negative objects to delete nr=%ld\n", | 253 | "shrink_slab: %pF negative objects to delete nr=%ld\n", |
253 | shrinker->scan_objects, total_scan); | 254 | shrinker->scan_objects, total_scan); |
254 | total_scan = max_pass; | 255 | total_scan = freeable; |
255 | } | 256 | } |
256 | 257 | ||
257 | /* | 258 | /* |
@@ -260,38 +261,55 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
260 | * shrinkers to return -1 all the time. This results in a large | 261 | * shrinkers to return -1 all the time. This results in a large |
261 | * nr being built up so when a shrink that can do some work | 262 | * nr being built up so when a shrink that can do some work |
262 | * comes along it empties the entire cache due to nr >>> | 263 | * comes along it empties the entire cache due to nr >>> |
263 | * max_pass. This is bad for sustaining a working set in | 264 | * freeable. This is bad for sustaining a working set in |
264 | * memory. | 265 | * memory. |
265 | * | 266 | * |
266 | * Hence only allow the shrinker to scan the entire cache when | 267 | * Hence only allow the shrinker to scan the entire cache when |
267 | * a large delta change is calculated directly. | 268 | * a large delta change is calculated directly. |
268 | */ | 269 | */ |
269 | if (delta < max_pass / 4) | 270 | if (delta < freeable / 4) |
270 | total_scan = min(total_scan, max_pass / 2); | 271 | total_scan = min(total_scan, freeable / 2); |
271 | 272 | ||
272 | /* | 273 | /* |
273 | * Avoid risking looping forever due to too large nr value: | 274 | * Avoid risking looping forever due to too large nr value: |
274 | * never try to free more than twice the estimate number of | 275 | * never try to free more than twice the estimate number of |
275 | * freeable entries. | 276 | * freeable entries. |
276 | */ | 277 | */ |
277 | if (total_scan > max_pass * 2) | 278 | if (total_scan > freeable * 2) |
278 | total_scan = max_pass * 2; | 279 | total_scan = freeable * 2; |
279 | 280 | ||
280 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, | 281 | trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, |
281 | nr_pages_scanned, lru_pages, | 282 | nr_pages_scanned, lru_pages, |
282 | max_pass, delta, total_scan); | 283 | freeable, delta, total_scan); |
283 | 284 | ||
284 | while (total_scan >= batch_size) { | 285 | /* |
286 | * Normally, we should not scan less than batch_size objects in one | ||
287 | * pass to avoid too frequent shrinker calls, but if the slab has less | ||
288 | * than batch_size objects in total and we are really tight on memory, | ||
289 | * we will try to reclaim all available objects, otherwise we can end | ||
290 | * up failing allocations although there are plenty of reclaimable | ||
291 | * objects spread over several slabs with usage less than the | ||
292 | * batch_size. | ||
293 | * | ||
294 | * We detect the "tight on memory" situations by looking at the total | ||
295 | * number of objects we want to scan (total_scan). If it is greater | ||
296 | * than the total number of objects on slab (freeable), we must be | ||
297 | * scanning at high prio and therefore should try to reclaim as much as | ||
298 | * possible. | ||
299 | */ | ||
300 | while (total_scan >= batch_size || | ||
301 | total_scan >= freeable) { | ||
285 | unsigned long ret; | 302 | unsigned long ret; |
303 | unsigned long nr_to_scan = min(batch_size, total_scan); | ||
286 | 304 | ||
287 | shrinkctl->nr_to_scan = batch_size; | 305 | shrinkctl->nr_to_scan = nr_to_scan; |
288 | ret = shrinker->scan_objects(shrinker, shrinkctl); | 306 | ret = shrinker->scan_objects(shrinker, shrinkctl); |
289 | if (ret == SHRINK_STOP) | 307 | if (ret == SHRINK_STOP) |
290 | break; | 308 | break; |
291 | freed += ret; | 309 | freed += ret; |
292 | 310 | ||
293 | count_vm_events(SLABS_SCANNED, batch_size); | 311 | count_vm_events(SLABS_SCANNED, nr_to_scan); |
294 | total_scan -= batch_size; | 312 | total_scan -= nr_to_scan; |
295 | 313 | ||
296 | cond_resched(); | 314 | cond_resched(); |
297 | } | 315 | } |
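The reworked loop condition is what makes small caches reclaimable at all: with the old while (total_scan >= batch_size) test, a slab holding fewer than batch_size objects was never scanned. A standalone model of the arithmetic (assuming, as the kernel does, an early return when freeable is 0):

    #include <stdio.h>

    static unsigned long scan(long total_scan, long freeable, long batch_size)
    {
        unsigned long freed = 0;

        if (freeable == 0)    /* the kernel returns before the loop */
            return 0;

        /* old condition was just: while (total_scan >= batch_size) */
        while (total_scan >= batch_size || total_scan >= freeable) {
            long nr_to_scan = total_scan < batch_size ?
                        total_scan : batch_size;

            freed += nr_to_scan;    /* pretend every scanned object frees */
            total_scan -= nr_to_scan;
        }
        return freed;
    }

    int main(void)
    {
        /* 50 freeable objects and SHRINK_BATCH-style batches of 128:
         * the old loop freed 0; the new one drains the request */
        printf("%lu\n", scan(100, 50, 128));    /* prints: 100 */
        return 0;
    }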
@@ -352,16 +370,17 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl, | |||
352 | } | 370 | } |
353 | 371 | ||
354 | list_for_each_entry(shrinker, &shrinker_list, list) { | 372 | list_for_each_entry(shrinker, &shrinker_list, list) { |
355 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | 373 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) { |
356 | if (!node_online(shrinkctl->nid)) | 374 | shrinkctl->nid = 0; |
357 | continue; | ||
358 | |||
359 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE) && | ||
360 | (shrinkctl->nid != 0)) | ||
361 | break; | ||
362 | |||
363 | freed += shrink_slab_node(shrinkctl, shrinker, | 375 | freed += shrink_slab_node(shrinkctl, shrinker, |
364 | nr_pages_scanned, lru_pages); | 376 | nr_pages_scanned, lru_pages); |
377 | continue; | ||
378 | } | ||
379 | |||
380 | for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) { | ||
381 | if (node_online(shrinkctl->nid)) | ||
382 | freed += shrink_slab_node(shrinkctl, shrinker, | ||
383 | nr_pages_scanned, lru_pages); | ||
365 | 384 | ||
366 | } | 385 | } |
367 | } | 386 | } |
@@ -1089,7 +1108,7 @@ keep: | |||
1089 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 1108 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
1090 | } | 1109 | } |
1091 | 1110 | ||
1092 | free_hot_cold_page_list(&free_pages, 1); | 1111 | free_hot_cold_page_list(&free_pages, true); |
1093 | 1112 | ||
1094 | list_splice(&ret_pages, page_list); | 1113 | list_splice(&ret_pages, page_list); |
1095 | count_vm_events(PGACTIVATE, pgactivate); | 1114 | count_vm_events(PGACTIVATE, pgactivate); |
@@ -1126,7 +1145,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1126 | TTU_UNMAP|TTU_IGNORE_ACCESS, | 1145 | TTU_UNMAP|TTU_IGNORE_ACCESS, |
1127 | &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); | 1146 | &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); |
1128 | list_splice(&clean_pages, page_list); | 1147 | list_splice(&clean_pages, page_list); |
1129 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | 1148 | mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); |
1130 | return ret; | 1149 | return ret; |
1131 | } | 1150 | } |
1132 | 1151 | ||
@@ -1452,7 +1471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1452 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | 1471 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); |
1453 | 1472 | ||
1454 | if (global_reclaim(sc)) { | 1473 | if (global_reclaim(sc)) { |
1455 | zone->pages_scanned += nr_scanned; | 1474 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); |
1456 | if (current_is_kswapd()) | 1475 | if (current_is_kswapd()) |
1457 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); | 1476 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); |
1458 | else | 1477 | else |
@@ -1487,7 +1506,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1487 | 1506 | ||
1488 | spin_unlock_irq(&zone->lru_lock); | 1507 | spin_unlock_irq(&zone->lru_lock); |
1489 | 1508 | ||
1490 | free_hot_cold_page_list(&page_list, 1); | 1509 | free_hot_cold_page_list(&page_list, true); |
1491 | 1510 | ||
1492 | /* | 1511 | /* |
1493 | * If reclaim is isolating dirty pages under writeback, it implies | 1512 | * If reclaim is isolating dirty pages under writeback, it implies |
@@ -1641,7 +1660,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1641 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, | 1660 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, |
1642 | &nr_scanned, sc, isolate_mode, lru); | 1661 | &nr_scanned, sc, isolate_mode, lru); |
1643 | if (global_reclaim(sc)) | 1662 | if (global_reclaim(sc)) |
1644 | zone->pages_scanned += nr_scanned; | 1663 | __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); |
1645 | 1664 | ||
1646 | reclaim_stat->recent_scanned[file] += nr_taken; | 1665 | reclaim_stat->recent_scanned[file] += nr_taken; |
1647 | 1666 | ||
@@ -1707,7 +1726,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1707 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); | 1726 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
1708 | spin_unlock_irq(&zone->lru_lock); | 1727 | spin_unlock_irq(&zone->lru_lock); |
1709 | 1728 | ||
1710 | free_hot_cold_page_list(&l_hold, 1); | 1729 | free_hot_cold_page_list(&l_hold, true); |
1711 | } | 1730 | } |
1712 | 1731 | ||
1713 | #ifdef CONFIG_SWAP | 1732 | #ifdef CONFIG_SWAP |
@@ -1829,7 +1848,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1829 | struct zone *zone = lruvec_zone(lruvec); | 1848 | struct zone *zone = lruvec_zone(lruvec); |
1830 | unsigned long anon_prio, file_prio; | 1849 | unsigned long anon_prio, file_prio; |
1831 | enum scan_balance scan_balance; | 1850 | enum scan_balance scan_balance; |
1832 | unsigned long anon, file, free; | 1851 | unsigned long anon, file; |
1833 | bool force_scan = false; | 1852 | bool force_scan = false; |
1834 | unsigned long ap, fp; | 1853 | unsigned long ap, fp; |
1835 | enum lru_list lru; | 1854 | enum lru_list lru; |
@@ -1877,11 +1896,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1877 | goto out; | 1896 | goto out; |
1878 | } | 1897 | } |
1879 | 1898 | ||
1880 | anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + | ||
1881 | get_lru_size(lruvec, LRU_INACTIVE_ANON); | ||
1882 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + | ||
1883 | get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
1884 | |||
1885 | /* | 1899 | /* |
1886 | * If it's foreseeable that reclaiming the file cache won't be | 1900 | * If it's foreseeable that reclaiming the file cache won't be |
1887 | * enough to get the zone back into a desirable shape, we have | 1901 | * enough to get the zone back into a desirable shape, we have |
@@ -1889,8 +1903,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1889 | * thrashing - remaining file pages alone. | 1903 | * thrashing - remaining file pages alone. |
1890 | */ | 1904 | */ |
1891 | if (global_reclaim(sc)) { | 1905 | if (global_reclaim(sc)) { |
1892 | free = zone_page_state(zone, NR_FREE_PAGES); | 1906 | unsigned long zonefile; |
1893 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1907 | unsigned long zonefree; |
1908 | |||
1909 | zonefree = zone_page_state(zone, NR_FREE_PAGES); | ||
1910 | zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
1911 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
1912 | |||
1913 | if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { | ||
1894 | scan_balance = SCAN_ANON; | 1914 | scan_balance = SCAN_ANON; |
1895 | goto out; | 1915 | goto out; |
1896 | } | 1916 | } |
@@ -1925,6 +1945,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1925 | * | 1945 | * |
1926 | * anon in [0], file in [1] | 1946 | * anon in [0], file in [1] |
1927 | */ | 1947 | */ |
1948 | |||
1949 | anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + | ||
1950 | get_lru_size(lruvec, LRU_INACTIVE_ANON); | ||
1951 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + | ||
1952 | get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
1953 | |||
1928 | spin_lock_irq(&zone->lru_lock); | 1954 | spin_lock_irq(&zone->lru_lock); |
1929 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { | 1955 | if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { |
1930 | reclaim_stat->recent_scanned[0] /= 2; | 1956 | reclaim_stat->recent_scanned[0] /= 2; |
@@ -2000,13 +2026,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2000 | unsigned long nr_reclaimed = 0; | 2026 | unsigned long nr_reclaimed = 0; |
2001 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 2027 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
2002 | struct blk_plug plug; | 2028 | struct blk_plug plug; |
2003 | bool scan_adjusted = false; | 2029 | bool scan_adjusted; |
2004 | 2030 | ||
2005 | get_scan_count(lruvec, sc, nr); | 2031 | get_scan_count(lruvec, sc, nr); |
2006 | 2032 | ||
2007 | /* Record the original scan target for proportional adjustments later */ | 2033 | /* Record the original scan target for proportional adjustments later */ |
2008 | memcpy(targets, nr, sizeof(nr)); | 2034 | memcpy(targets, nr, sizeof(nr)); |
2009 | 2035 | ||
2036 | /* | ||
2037 | * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal | ||
2038 | * event that can occur when there is little memory pressure e.g. | ||
2039 | * multiple streaming readers/writers. Hence, we do not abort scanning | ||
2040 | * when the requested number of pages are reclaimed when scanning at | ||
2041 | * DEF_PRIORITY on the assumption that the fact we are direct | ||
2042 | * reclaiming implies that kswapd is not keeping up and it is best to | ||
2043 | * do a batch of work at once. For memcg reclaim one check is made to | ||
2044 | * abort proportional reclaim if either the file or anon lru has already | ||
2045 | * dropped to zero at the first pass. | ||
2046 | */ | ||
2047 | scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && | ||
2048 | sc->priority == DEF_PRIORITY); | ||
2049 | |||
2010 | blk_start_plug(&plug); | 2050 | blk_start_plug(&plug); |
2011 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 2051 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
2012 | nr[LRU_INACTIVE_FILE]) { | 2052 | nr[LRU_INACTIVE_FILE]) { |
@@ -2027,17 +2067,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2027 | continue; | 2067 | continue; |
2028 | 2068 | ||
2029 | /* | 2069 | /* |
2030 | * For global direct reclaim, reclaim only the number of pages | ||
2031 | * requested. Less care is taken to scan proportionally as it | ||
2032 | * is more important to minimise direct reclaim stall latency | ||
2033 | * than it is to properly age the LRU lists. | ||
2034 | */ | ||
2035 | if (global_reclaim(sc) && !current_is_kswapd()) | ||
2036 | break; | ||
2037 | |||
2038 | /* | ||
2039 | * For kswapd and memcg, reclaim at least the number of pages | 2070 | * For kswapd and memcg, reclaim at least the number of pages |
2040 | * requested. Ensure that the anon and file LRUs shrink | 2071 | * requested. Ensure that the anon and file LRUs are scanned |
2041 | * proportionally what was requested by get_scan_count(). We | 2072 | * proportionally what was requested by get_scan_count(). We |
2042 | * stop reclaiming one LRU and reduce the amount scanning | 2073 | * stop reclaiming one LRU and reduce the amount scanning |
2043 | * proportional to the original scan target. | 2074 | * proportional to the original scan target. |
@@ -2045,6 +2076,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
2045 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; | 2076 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; |
2046 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; | 2077 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; |
2047 | 2078 | ||
2079 | /* | ||
2080 | * It's just vindictive to attack the larger once the smaller | ||
2081 | * has gone to zero. And given the way we stop scanning the | ||
2082 | * smaller below, this makes sure that we only make one nudge | ||
2083 | * towards proportionality once we've got nr_to_reclaim. | ||
2084 | */ | ||
2085 | if (!nr_file || !nr_anon) | ||
2086 | break; | ||
2087 | |||
2048 | if (nr_file > nr_anon) { | 2088 | if (nr_file > nr_anon) { |
2049 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + | 2089 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + |
2050 | targets[LRU_ACTIVE_ANON] + 1; | 2090 | targets[LRU_ACTIVE_ANON] + 1; |
@@ -2406,8 +2446,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2406 | unsigned long lru_pages = 0; | 2446 | unsigned long lru_pages = 0; |
2407 | 2447 | ||
2408 | nodes_clear(shrink->nodes_to_scan); | 2448 | nodes_clear(shrink->nodes_to_scan); |
2409 | for_each_zone_zonelist(zone, z, zonelist, | 2449 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2410 | gfp_zone(sc->gfp_mask)) { | 2450 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
2411 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2451 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2412 | continue; | 2452 | continue; |
2413 | 2453 | ||
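The proportional-reclaim rework in the shrink_lruvec() hunks above scales the remaining scan targets once nr_to_reclaim is met. Roughly, the smaller LRU stops and the larger one is cut back to the fraction of its original target that the smaller had left, as in this worked example (active/inactive targets are collapsed into one number per LRU for brevity; the +1 mirrors the scan_target computation):

    #include <stdio.h>

    int main(void)
    {
        /* original targets from get_scan_count() */
        unsigned long target_anon = 40, target_file = 400;
        /* remaining work when nr_to_reclaim was met */
        unsigned long nr_anon = 30, nr_file = 300;

        /* anon is the smaller LRU: stop it, and trim the file target
         * by the fraction of the anon target that went unscanned */
        unsigned long percentage = nr_anon * 100 / (target_anon + 1);
        unsigned long nr_scanned = target_file - nr_file;
        unsigned long scan = (target_file * percentage / 100) - nr_scanned;

        printf("file pages still to scan: %lu\n", scan);    /* 192 */
        return 0;
    }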
diff --git a/mm/vmstat.c b/mm/vmstat.c index 5a442a723d79..f7ca04482299 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -200,7 +200,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, | |||
200 | continue; | 200 | continue; |
201 | 201 | ||
202 | threshold = (*calculate_pressure)(zone); | 202 | threshold = (*calculate_pressure)(zone); |
203 | for_each_possible_cpu(cpu) | 203 | for_each_online_cpu(cpu) |
204 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | 204 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
205 | = threshold; | 205 | = threshold; |
206 | } | 206 | } |
@@ -761,6 +761,7 @@ const char * const vmstat_text[] = { | |||
761 | "nr_shmem", | 761 | "nr_shmem", |
762 | "nr_dirtied", | 762 | "nr_dirtied", |
763 | "nr_written", | 763 | "nr_written", |
764 | "nr_pages_scanned", | ||
764 | 765 | ||
765 | #ifdef CONFIG_NUMA | 766 | #ifdef CONFIG_NUMA |
766 | "numa_hit", | 767 | "numa_hit", |
@@ -851,12 +852,14 @@ const char * const vmstat_text[] = { | |||
851 | "thp_zero_page_alloc", | 852 | "thp_zero_page_alloc", |
852 | "thp_zero_page_alloc_failed", | 853 | "thp_zero_page_alloc_failed", |
853 | #endif | 854 | #endif |
855 | #ifdef CONFIG_DEBUG_TLBFLUSH | ||
854 | #ifdef CONFIG_SMP | 856 | #ifdef CONFIG_SMP |
855 | "nr_tlb_remote_flush", | 857 | "nr_tlb_remote_flush", |
856 | "nr_tlb_remote_flush_received", | 858 | "nr_tlb_remote_flush_received", |
857 | #endif | 859 | #endif /* CONFIG_SMP */ |
858 | "nr_tlb_local_flush_all", | 860 | "nr_tlb_local_flush_all", |
859 | "nr_tlb_local_flush_one", | 861 | "nr_tlb_local_flush_one", |
862 | #endif /* CONFIG_DEBUG_TLBFLUSH */ | ||
860 | 863 | ||
861 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 864 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
862 | }; | 865 | }; |
@@ -1053,7 +1056,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1053 | min_wmark_pages(zone), | 1056 | min_wmark_pages(zone), |
1054 | low_wmark_pages(zone), | 1057 | low_wmark_pages(zone), |
1055 | high_wmark_pages(zone), | 1058 | high_wmark_pages(zone), |
1056 | zone->pages_scanned, | 1059 | zone_page_state(zone, NR_PAGES_SCANNED), |
1057 | zone->spanned_pages, | 1060 | zone->spanned_pages, |
1058 | zone->present_pages, | 1061 | zone->present_pages, |
1059 | zone->managed_pages); | 1062 | zone->managed_pages); |
@@ -1063,10 +1066,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1063 | zone_page_state(zone, i)); | 1066 | zone_page_state(zone, i)); |
1064 | 1067 | ||
1065 | seq_printf(m, | 1068 | seq_printf(m, |
1066 | "\n protection: (%lu", | 1069 | "\n protection: (%ld", |
1067 | zone->lowmem_reserve[0]); | 1070 | zone->lowmem_reserve[0]); |
1068 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | 1071 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) |
1069 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | 1072 | seq_printf(m, ", %ld", zone->lowmem_reserve[i]); |
1070 | seq_printf(m, | 1073 | seq_printf(m, |
1071 | ")" | 1074 | ")" |
1072 | "\n pagesets"); | 1075 | "\n pagesets"); |
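Finally, the %lu to %ld switch matters because this series handles lowmem_reserve values as signed longs (the struct change itself is outside this section, so take that as context rather than shown here). Printing a negative long with %lu yields garbage:

    #include <stdio.h>

    int main(void)
    {
        long reserve = -1;    /* a signed lowmem_reserve value */

        printf("%lu\n", (unsigned long)reserve);    /* 18446744073709551615 on 64-bit */
        printf("%ld\n", reserve);                   /* -1 */
        return 0;
    }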