author     Todd Poynor  2012-11-01 15:36:34 -0500
committer  Todd Poynor  2012-11-01 15:36:34 -0500
commit     925d49abc38dcc7ef1cbfe125c6f0b2202ae3df3 (patch)
tree       a56506710f0340db055191e3cf0a207699c1b849 /mm
parent     834029ac9d0ad8dea4e6a21bc34877dc3740b9f4 (diff)
parent     27d0858dbcf199838b8c50a3e94d397bf326d986 (diff)
Merge remote-tracking branch 'stable/linux-3.0.y' into android-3.0
Change-Id: I9685feb9277b450da10d78a455b3c0674d6cfe18
Signed-off-by: Todd Poynor <toddpoynor@google.com>
Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c      |  31
-rw-r--r--  mm/filemap.c         |  11
-rw-r--r--  mm/hugetlb.c         |  68
-rw-r--r--  mm/madvise.c         |  16
-rw-r--r--  mm/memcontrol.c      |   9
-rw-r--r--  mm/memory-failure.c  |   6
-rw-r--r--  mm/memory_hotplug.c  |  18
-rw-r--r--  mm/mempolicy.c       | 148
-rw-r--r--  mm/migrate.c         | 240
-rw-r--r--  mm/mmu_notifier.c    |  45
-rw-r--r--  mm/nobootmem.c       |   3
-rw-r--r--  mm/page_alloc.c      | 120
-rw-r--r--  mm/percpu.c          |  10
-rw-r--r--  mm/shmem.c           |   6
-rw-r--r--  mm/slab.c            |  13
-rw-r--r--  mm/slub.c            |  40
-rw-r--r--  mm/truncate.c        |   3
-rw-r--r--  mm/vmalloc.c         |  11
-rw-r--r--  mm/vmscan.c          | 305
-rw-r--r--  mm/vmstat.c          |   2
20 files changed, 771 insertions, 334 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index c4bc5acf865..8ea7308601b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
         unsigned long migrate_pfn;      /* isolate_migratepages search base */
         bool sync;                      /* Synchronous migration */
 
-        /* Account for isolated anon and file pages */
-        unsigned long nr_anon;
-        unsigned long nr_file;
-
         unsigned int order;             /* order a direct compactor needs */
         int migratetype;                /* MOVABLE, RECLAIMABLE etc */
         struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
 static void acct_isolated(struct zone *zone, struct compact_control *cc)
 {
         struct page *page;
-        unsigned int count[NR_LRU_LISTS] = { 0, };
+        unsigned int count[2] = { 0, };
 
-        list_for_each_entry(page, &cc->migratepages, lru) {
-                int lru = page_lru_base_type(page);
-                count[lru]++;
-        }
+        list_for_each_entry(page, &cc->migratepages, lru)
+                count[!!page_is_file_cache(page)]++;
 
-        cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-        cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-        __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
-        __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+        __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+        __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
         unsigned long last_pageblock_nr = 0, pageblock_nr;
         unsigned long nr_scanned = 0, nr_isolated = 0;
         struct list_head *migratelist = &cc->migratepages;
+        isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
 
         /* Do not scan outside zone boundaries */
         low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -378,8 +371,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                         continue;
                 }
 
+                if (!cc->sync)
+                        mode |= ISOLATE_ASYNC_MIGRATE;
+
                 /* Try isolate the page */
-                if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+                if (__isolate_lru_page(page, mode, 0) != 0)
                         continue;
 
                 VM_BUG_ON(PageTransCompound(page));
@@ -581,7 +577,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                 nr_migrate = cc->nr_migratepages;
                 err = migrate_pages(&cc->migratepages, compaction_alloc,
                                 (unsigned long)cc, false,
-                                cc->sync);
+                                cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
                 update_nr_listpages(cc);
                 nr_remaining = cc->nr_migratepages;
 
@@ -596,8 +592,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                 if (err) {
                         putback_lru_pages(&cc->migratepages);
                         cc->nr_migratepages = 0;
+                        if (err == -ENOMEM) {
+                                ret = COMPACT_PARTIAL;
+                                goto out;
+                        }
                 }
-
         }
 
 out:
diff --git a/mm/filemap.c b/mm/filemap.c
index b7d860390f3..10481ebd96c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -516,10 +516,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
         struct page *page;
 
         if (cpuset_do_page_mem_spread()) {
-                get_mems_allowed();
-                n = cpuset_mem_spread_node();
-                page = alloc_pages_exact_node(n, gfp, 0);
-                put_mems_allowed();
+                unsigned int cpuset_mems_cookie;
+                do {
+                        cpuset_mems_cookie = get_mems_allowed();
+                        n = cpuset_mem_spread_node();
+                        page = alloc_pages_exact_node(n, gfp, 0);
+                } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
+
                 return page;
         }
         return alloc_pages(gfp, 0);
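
The hunk above is the filemap piece of the cpuset mems_allowed cookie scheme: read a cookie, attempt the allocation, and loop only when the cookie went stale and the allocation also failed. A rough userspace sketch of that retry shape, with a plain C11 atomic counter standing in for the kernel's cpuset generation and invented helper names (this is not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_uint mems_generation;   /* bumped by a hypothetical concurrent updater */

static unsigned int read_mems_cookie(void)
{
        return atomic_load_explicit(&mems_generation, memory_order_acquire);
}

static bool mems_cookie_valid(unsigned int cookie)
{
        return atomic_load_explicit(&mems_generation, memory_order_acquire) == cookie;
}

static void *alloc_from_current_nodes(size_t size)
{
        return malloc(size);           /* stand-in for the real node-aware allocator */
}

int main(void)
{
        void *page;
        unsigned int cookie;

        do {
                cookie = read_mems_cookie();
                page = alloc_from_current_nodes(4096);
                /* retry only if the generation moved AND the attempt failed */
        } while (!mems_cookie_valid(cookie) && !page);

        printf("allocated: %p\n", page);
        free(page);
        return 0;
}

The "&& !page" test is the important part: a successful allocation is kept even if the allowed-nodes set changed underneath it, which is why the loop rarely runs twice.
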
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f7001ac53b3..037f077b986 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -460,8 +460,10 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
         struct zonelist *zonelist;
         struct zone *zone;
         struct zoneref *z;
+        unsigned int cpuset_mems_cookie;
 
-        get_mems_allowed();
+retry_cpuset:
+        cpuset_mems_cookie = get_mems_allowed();
         zonelist = huge_zonelist(vma, address,
                                         htlb_alloc_mask, &mpol, &nodemask);
         /*
@@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
                         }
                 }
         }
-err:
+
         mpol_cond_put(mpol);
-        put_mems_allowed();
+        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                goto retry_cpuset;
         return page;
+
+err:
+        mpol_cond_put(mpol);
+        return NULL;
 }
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -2060,6 +2067,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
         kref_get(&reservations->refs);
 }
 
+static void resv_map_put(struct vm_area_struct *vma)
+{
+        struct resv_map *reservations = vma_resv_map(vma);
+
+        if (!reservations)
+                return;
+        kref_put(&reservations->refs, resv_map_release);
+}
+
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
         struct hstate *h = hstate_vma(vma);
@@ -2075,7 +2091,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
                 reserve = (end - start) -
                         region_count(&reservations->regions, start, end);
 
-                kref_put(&reservations->refs, resv_map_release);
+                resv_map_put(vma);
 
                 if (reserve) {
                         hugetlb_acct_memory(h, -reserve);
@@ -2285,6 +2301,22 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 {
         mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
         __unmap_hugepage_range(vma, start, end, ref_page);
+        /*
+         * Clear this flag so that x86's huge_pmd_share page_table_shareable
+         * test will fail on a vma being torn down, and not grab a page table
+         * on its way out. We're lucky that the flag has such an appropriate
+         * name, and can in fact be safely cleared here. We could clear it
+         * before the __unmap_hugepage_range above, but all that's necessary
+         * is to clear it before releasing the i_mmap_mutex below.
+         *
+         * This works because in the contexts this is called, the VMA is
+         * going to be destroyed. It is not vunerable to madvise(DONTNEED)
+         * because madvise is not supported on hugetlbfs. The same applies
+         * for direct IO. unmap_hugepage_range() is only being called just
+         * before free_pgtables() so clearing VM_MAYSHARE will not cause
+         * surprises later.
+         */
+        vma->vm_flags &= ~VM_MAYSHARE;
         mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
@@ -2398,7 +2430,6 @@ retry_avoidcopy:
         if (outside_reserve) {
                 BUG_ON(huge_pte_none(pte));
                 if (unmap_ref_private(mm, vma, old_page, address)) {
-                        BUG_ON(page_count(old_page) != 1);
                         BUG_ON(huge_pte_none(pte));
                         spin_lock(&mm->page_table_lock);
                         goto retry_avoidcopy;
@@ -2838,9 +2869,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
                 }
         }
         spin_unlock(&mm->page_table_lock);
-        mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+        /*
+         * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+         * may have cleared our pud entry and done put_page on the page table:
+         * once we release i_mmap_mutex, another task can do the final put_page
+         * and that page table be reused and filled with junk.
+         */
         flush_tlb_range(vma, start, end);
+        mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
@@ -2878,12 +2914,16 @@ int hugetlb_reserve_pages(struct inode *inode,
                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
         }
 
-        if (chg < 0)
-                return chg;
+        if (chg < 0) {
+                ret = chg;
+                goto out_err;
+        }
 
         /* There must be enough filesystem quota for the mapping */
-        if (hugetlb_get_quota(inode->i_mapping, chg))
-                return -ENOSPC;
+        if (hugetlb_get_quota(inode->i_mapping, chg)) {
+                ret = -ENOSPC;
+                goto out_err;
+        }
 
         /*
          * Check enough hugepages are available for the reservation.
@@ -2892,7 +2932,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         ret = hugetlb_acct_memory(h, chg);
         if (ret < 0) {
                 hugetlb_put_quota(inode->i_mapping, chg);
-                return ret;
+                goto out_err;
         }
 
         /*
@@ -2909,6 +2949,10 @@ int hugetlb_reserve_pages(struct inode *inode,
         if (!vma || vma->vm_flags & VM_MAYSHARE)
                 region_add(&inode->i_mapping->private_list, from, to);
         return 0;
+out_err:
+        if (vma)
+                resv_map_put(vma);
+        return ret;
 }
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
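
The hugetlb error paths above all funnel through the new resv_map_put() helper, so the out_err label can drop the reservation-map reference without repeating the kref logic and without caring whether a map was ever attached. A minimal userspace sketch of that shape (simplified refcount and invented names, not the kernel data structure):

#include <stdio.h>
#include <stdlib.h>

struct resv_map_demo {
        int refs;
        /* ... region list would live here ... */
};

static void resv_map_release_demo(struct resv_map_demo *map)
{
        printf("releasing map %p\n", (void *)map);
        free(map);
}

static void resv_map_put_demo(struct resv_map_demo *map)
{
        if (!map)
                return;                 /* safe on paths that never attached a map */
        if (--map->refs == 0)
                resv_map_release_demo(map);
}

int main(void)
{
        struct resv_map_demo *map = malloc(sizeof(*map));
        if (!map)
                return 1;
        map->refs = 1;
        resv_map_put_demo(map);         /* normal teardown path */
        resv_map_put_demo(NULL);        /* error path with no map: still fine */
        return 0;
}
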
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed50..deabe5f603a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
+#include <linux/file.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -197,14 +198,16 @@ static long madvise_remove(struct vm_area_struct *vma,
         struct address_space *mapping;
         loff_t offset, endoff;
         int error;
+        struct file *f;
 
         *prev = NULL;   /* tell sys_madvise we drop mmap_sem */
 
         if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
                 return -EINVAL;
 
-        if (!vma->vm_file || !vma->vm_file->f_mapping
-                || !vma->vm_file->f_mapping->host) {
+        f = vma->vm_file;
+
+        if (!f || !f->f_mapping || !f->f_mapping->host) {
                 return -EINVAL;
         }
 
@@ -218,9 +221,16 @@ static long madvise_remove(struct vm_area_struct *vma,
         endoff = (loff_t)(end - vma->vm_start - 1)
                         + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-        /* vmtruncate_range needs to take i_mutex and i_alloc_sem */
+        /*
+         * vmtruncate_range may need to take i_mutex and i_alloc_sem.
+         * We need to explicitly grab a reference because the vma (and
+         * hence the vma's reference to the file) can go away as soon as
+         * we drop mmap_sem.
+         */
+        get_file(f);
         up_read(&current->mm->mmap_sem);
         error = vmtruncate_range(mapping->host, offset, endoff);
+        fput(f);
         down_read(&current->mm->mmap_sem);
         return error;
 }
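
The madvise_remove() change pins the struct file with get_file() before mmap_sem is dropped, because the vma, and with it the vma's reference to the file, may disappear as soon as the lock is released. A small pthread-based sketch of the same take-a-reference-before-unlocking idea, with made-up names and a plain atomic counter rather than the kernel's file refcounting:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct pinned_file {
        atomic_int refs;
        const char *name;
};

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static struct pinned_file demo_file = { 1, "demo" };

static void file_get(struct pinned_file *f)
{
        atomic_fetch_add(&f->refs, 1);
}

static void file_put(struct pinned_file *f)
{
        atomic_fetch_sub(&f->refs, 1);  /* a real implementation would free at zero */
}

static void truncate_range(struct pinned_file *f)
{
        printf("truncating %s\n", f->name);
}

int main(void)
{
        pthread_mutex_lock(&map_lock);          /* analogue of holding mmap_sem */
        struct pinned_file *f = &demo_file;
        file_get(f);                            /* pin before dropping the lock */
        pthread_mutex_unlock(&map_lock);

        truncate_range(f);                      /* safe: our reference keeps f alive */
        file_put(f);

        pthread_mutex_lock(&map_lock);          /* re-acquire, as madvise_remove re-takes mmap_sem */
        pthread_mutex_unlock(&map_lock);
        return 0;
}
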
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 283068f5af9..57cdf5ad692 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1251,7 +1251,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
                                         struct list_head *dst,
                                         unsigned long *scanned, int order,
-                                        int mode, struct zone *z,
+                                        isolate_mode_t mode,
+                                        struct zone *z,
                                         struct mem_cgroup *mem_cont,
                                         int active, int file)
 {
@@ -4605,6 +4606,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 swap_buffers:
         /* Swap primary and spare array */
         thresholds->spare = thresholds->primary;
+        /* If all events are unregistered, free the spare array */
+        if (!new) {
+                kfree(thresholds->spare);
+                thresholds->spare = NULL;
+        }
+
         rcu_assign_pointer(thresholds->primary, new);
 
         /* To be sure that nobody uses thresholds */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059..2f49dcf4f47 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1334,8 +1334,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
         /* Keep page count to indicate a given hugepage is isolated. */
 
         list_add(&hpage->lru, &pagelist);
-        ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
-                                true);
+        ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
+                                MIGRATE_SYNC);
         if (ret) {
                 struct page *page1, *page2;
                 list_for_each_entry_safe(page1, page2, &pagelist, lru)
@@ -1464,7 +1464,7 @@ int soft_offline_page(struct page *page, int flags)
                                 page_is_file_cache(page));
         list_add(&page->lru, &pagelist);
         ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
-                                                0, true);
+                                                false, MIGRATE_SYNC);
         if (ret) {
                 putback_lru_pages(&pagelist);
                 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11..e0a3e51d519 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -116,9 +116,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
         struct mem_section *ms;
         struct page *page, *memmap;
 
-        if (!pfn_valid(start_pfn))
-                return;
-
         section_nr = pfn_to_section_nr(start_pfn);
         ms = __nr_to_section(section_nr);
 
@@ -177,9 +174,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
         end_pfn = pfn + pgdat->node_spanned_pages;
 
         /* register_section info */
-        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
-                register_page_bootmem_info_section(pfn);
-
+        for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+                /*
+                 * Some platforms can assign the same pfn to multiple nodes - on
+                 * node0 as well as nodeN. To avoid registering a pfn against
+                 * multiple nodes we check that this pfn does not already
+                 * reside in some other node.
+                 */
+                if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+                        register_page_bootmem_info_section(pfn);
+        }
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
@@ -747,7 +751,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                 }
                 /* this function returns # of failed pages */
                 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
-                                                                true, true);
+                                                                true, MIGRATE_SYNC);
                 if (ret)
                         putback_lru_pages(&source);
         }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a85171de5d0..5dce7d46f79 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -606,24 +606,39 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         return first;
 }
 
-/* Apply policy to a single VMA */
-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+                                                struct mempolicy *pol)
 {
-        int err = 0;
-        struct mempolicy *old = vma->vm_policy;
+        int err;
+        struct mempolicy *old;
+        struct mempolicy *new;
 
         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
                  vma->vm_ops, vma->vm_file,
                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 
-        if (vma->vm_ops && vma->vm_ops->set_policy)
+        new = mpol_dup(pol);
+        if (IS_ERR(new))
+                return PTR_ERR(new);
+
+        if (vma->vm_ops && vma->vm_ops->set_policy) {
                 err = vma->vm_ops->set_policy(vma, new);
-        if (!err) {
-                mpol_get(new);
-                vma->vm_policy = new;
-                mpol_put(old);
+                if (err)
+                        goto err_out;
         }
+
+        old = vma->vm_policy;
+        vma->vm_policy = new; /* protected by mmap_sem */
+        mpol_put(old);
+
+        return 0;
+ err_out:
+        mpol_put(new);
         return err;
 }
 
@@ -666,7 +681,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
                         if (err)
                                 goto out;
                 }
-                err = policy_vma(vma, new_pol);
+                err = vma_replace_policy(vma, new_pol);
                 if (err)
                         goto out;
         }
@@ -933,7 +948,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
         if (!list_empty(&pagelist)) {
                 err = migrate_pages(&pagelist, new_node_page, dest,
-                                                        false, true);
+                                                        false, MIGRATE_SYNC);
                 if (err)
                         putback_lru_pages(&pagelist);
         }
@@ -1496,8 +1511,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
                                                                         addr);
                 if (vpol)
                         pol = vpol;
-        } else if (vma->vm_policy)
+        } else if (vma->vm_policy) {
                 pol = vma->vm_policy;
+
+                /*
+                 * shmem_alloc_page() passes MPOL_F_SHARED policy with
+                 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+                 * count on these policies which will be dropped by
+                 * mpol_cond_put() later
+                 */
+                if (mpol_needs_cond_ref(pol))
+                        mpol_get(pol);
+        }
         }
         if (!pol)
                 pol = &default_policy;
@@ -1817,18 +1842,24 @@ struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                 unsigned long addr, int node)
 {
-        struct mempolicy *pol = get_vma_policy(current, vma, addr);
+        struct mempolicy *pol;
         struct zonelist *zl;
         struct page *page;
+        unsigned int cpuset_mems_cookie;
+
+retry_cpuset:
+        pol = get_vma_policy(current, vma, addr);
+        cpuset_mems_cookie = get_mems_allowed();
 
-        get_mems_allowed();
         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
                 unsigned nid;
 
                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
                 mpol_cond_put(pol);
                 page = alloc_page_interleave(gfp, order, nid);
-                put_mems_allowed();
+                if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                        goto retry_cpuset;
+
                 return page;
         }
         zl = policy_zonelist(gfp, pol, node);
@@ -1839,7 +1870,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
                 struct page *page = __alloc_pages_nodemask(gfp, order,
                                                 zl, policy_nodemask(gfp, pol));
                 __mpol_put(pol);
-                put_mems_allowed();
+                if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                        goto retry_cpuset;
                 return page;
         }
         /*
@@ -1847,7 +1879,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
          */
         page = __alloc_pages_nodemask(gfp, order, zl,
                       policy_nodemask(gfp, pol));
-        put_mems_allowed();
+        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                goto retry_cpuset;
         return page;
 }
 
@@ -1874,11 +1907,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
         struct mempolicy *pol = current->mempolicy;
         struct page *page;
+        unsigned int cpuset_mems_cookie;
 
         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
                 pol = &default_policy;
 
-        get_mems_allowed();
+retry_cpuset:
+        cpuset_mems_cookie = get_mems_allowed();
+
         /*
          * No reference counting needed for current->mempolicy
          * nor system default_policy
@@ -1889,7 +1925,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
                 page = __alloc_pages_nodemask(gfp, order,
                                 policy_zonelist(gfp, pol, numa_node_id()),
                                 policy_nodemask(gfp, pol));
-        put_mems_allowed();
+
+        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+                goto retry_cpuset;
+
         return page;
 }
 EXPORT_SYMBOL(alloc_pages_current);
@@ -1992,7 +2031,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
  */
 
 /* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
 static struct sp_node *
 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
 {
@@ -2056,36 +2095,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
         if (!sp->root.rb_node)
                 return NULL;
-        spin_lock(&sp->lock);
+        mutex_lock(&sp->mutex);
         sn = sp_lookup(sp, idx, idx+1);
         if (sn) {
                 mpol_get(sn->policy);
                 pol = sn->policy;
         }
-        spin_unlock(&sp->lock);
+        mutex_unlock(&sp->mutex);
         return pol;
 }
 
+static void sp_free(struct sp_node *n)
+{
+        mpol_put(n->policy);
+        kmem_cache_free(sn_cache, n);
+}
+
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
         rb_erase(&n->nd, &sp->root);
-        mpol_put(n->policy);
-        kmem_cache_free(sn_cache, n);
+        sp_free(n);
 }
 
 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                 struct mempolicy *pol)
 {
-        struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+        struct sp_node *n;
+        struct mempolicy *newpol;
 
+        n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
         if (!n)
                 return NULL;
+
+        newpol = mpol_dup(pol);
+        if (IS_ERR(newpol)) {
+                kmem_cache_free(sn_cache, n);
+                return NULL;
+        }
+        newpol->flags |= MPOL_F_SHARED;
+
         n->start = start;
         n->end = end;
-        mpol_get(pol);
-        pol->flags |= MPOL_F_SHARED;    /* for unref */
-        n->policy = pol;
+        n->policy = newpol;
+
         return n;
 }
 
@@ -2093,10 +2146,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
                                  unsigned long end, struct sp_node *new)
 {
-        struct sp_node *n, *new2 = NULL;
+        struct sp_node *n;
+        int ret = 0;
 
-restart:
-        spin_lock(&sp->lock);
+        mutex_lock(&sp->mutex);
         n = sp_lookup(sp, start, end);
         /* Take care of old policies in the same range. */
         while (n && n->start < end) {
@@ -2109,16 +2162,14 @@ restart:
                 } else {
                         /* Old policy spanning whole new range. */
                         if (n->end > end) {
+                                struct sp_node *new2;
+                                new2 = sp_alloc(end, n->end, n->policy);
                                 if (!new2) {
-                                        spin_unlock(&sp->lock);
-                                        new2 = sp_alloc(end, n->end, n->policy);
-                                        if (!new2)
-                                                return -ENOMEM;
-                                        goto restart;
+                                        ret = -ENOMEM;
+                                        goto out;
                                 }
                                 n->end = start;
                                 sp_insert(sp, new2);
-                                new2 = NULL;
                                 break;
                         } else
                                 n->end = start;
@@ -2129,12 +2180,9 @@ restart:
         }
         if (new)
                 sp_insert(sp, new);
-        spin_unlock(&sp->lock);
-        if (new2) {
-                mpol_put(new2->policy);
-                kmem_cache_free(sn_cache, new2);
-        }
-        return 0;
+out:
+        mutex_unlock(&sp->mutex);
+        return ret;
 }
 
 /**
@@ -2152,7 +2200,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
         int ret;
 
         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
-        spin_lock_init(&sp->lock);
+        mutex_init(&sp->mutex);
 
         if (mpol) {
                 struct vm_area_struct pvma;
@@ -2206,7 +2254,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
         }
         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
         if (err && new)
-                kmem_cache_free(sn_cache, new);
+                sp_free(new);
         return err;
 }
 
@@ -2218,16 +2266,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
 
         if (!p->root.rb_node)
                 return;
-        spin_lock(&p->lock);
+        mutex_lock(&p->mutex);
         next = rb_first(&p->root);
         while (next) {
                 n = rb_entry(next, struct sp_node, nd);
                 next = rb_next(&n->nd);
-                rb_erase(&n->nd, &p->root);
-                mpol_put(n->policy);
-                kmem_cache_free(sn_cache, n);
+                sp_delete(p, n);
         }
-        spin_unlock(&p->lock);
+        mutex_unlock(&p->mutex);
 }
 
 /* assumes fs == KERNEL_DS */
@@ -2493,7 +2539,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
                 break;
 
         default:
-                BUG();
+                return -EINVAL;
         }
 
         l = strlen(policy_modes[mode]);
diff --git a/mm/migrate.c b/mm/migrate.c
index 14d0a6a632f..480714b6f3f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -220,6 +220,56 @@ out:
         pte_unmap_unlock(ptep, ptl);
 }
 
+#ifdef CONFIG_BLOCK
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+                                                        enum migrate_mode mode)
+{
+        struct buffer_head *bh = head;
+
+        /* Simple case, sync compaction */
+        if (mode != MIGRATE_ASYNC) {
+                do {
+                        get_bh(bh);
+                        lock_buffer(bh);
+                        bh = bh->b_this_page;
+
+                } while (bh != head);
+
+                return true;
+        }
+
+        /* async case, we cannot block on lock_buffer so use trylock_buffer */
+        do {
+                get_bh(bh);
+                if (!trylock_buffer(bh)) {
+                        /*
+                         * We failed to lock the buffer and cannot stall in
+                         * async migration. Release the taken locks
+                         */
+                        struct buffer_head *failed_bh = bh;
+                        put_bh(failed_bh);
+                        bh = head;
+                        while (bh != failed_bh) {
+                                unlock_buffer(bh);
+                                put_bh(bh);
+                                bh = bh->b_this_page;
+                        }
+                        return false;
+                }
+
+                bh = bh->b_this_page;
+        } while (bh != head);
+        return true;
+}
+#else
+static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
+                                                        enum migrate_mode mode)
+{
+        return true;
+}
+#endif /* CONFIG_BLOCK */
+
 /*
  * Replace the page in the mapping.
  *
@@ -229,7 +279,8 @@ out:
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
 static int migrate_page_move_mapping(struct address_space *mapping,
-                struct page *newpage, struct page *page)
+                struct page *newpage, struct page *page,
+                struct buffer_head *head, enum migrate_mode mode)
 {
         int expected_count;
         void **pslot;
@@ -259,6 +310,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
         }
 
         /*
+         * In the async migration case of moving a page with buffers, lock the
+         * buffers using trylock before the mapping is moved. If the mapping
+         * was moved, we later failed to lock the buffers and could not move
+         * the mapping back due to an elevated page count, we would have to
+         * block waiting on other references to be dropped.
+         */
+        if (mode == MIGRATE_ASYNC && head &&
+                        !buffer_migrate_lock_buffers(head, mode)) {
+                page_unfreeze_refs(page, expected_count);
+                spin_unlock_irq(&mapping->tree_lock);
+                return -EAGAIN;
+        }
+
+        /*
          * Now we know that no one else is looking at the page.
          */
         get_page(newpage);      /* add cache reference */
@@ -415,13 +480,14 @@ EXPORT_SYMBOL(fail_migrate_page);
  * Pages are locked upon entry and exit.
  */
 int migrate_page(struct address_space *mapping,
-                struct page *newpage, struct page *page)
+                struct page *newpage, struct page *page,
+                enum migrate_mode mode)
 {
         int rc;
 
         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 
-        rc = migrate_page_move_mapping(mapping, newpage, page);
+        rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
 
         if (rc)
                 return rc;
@@ -438,28 +504,28 @@ EXPORT_SYMBOL(migrate_page);
  * exist.
  */
 int buffer_migrate_page(struct address_space *mapping,
-                struct page *newpage, struct page *page)
+                struct page *newpage, struct page *page, enum migrate_mode mode)
 {
         struct buffer_head *bh, *head;
         int rc;
 
         if (!page_has_buffers(page))
-                return migrate_page(mapping, newpage, page);
+                return migrate_page(mapping, newpage, page, mode);
 
         head = page_buffers(page);
 
-        rc = migrate_page_move_mapping(mapping, newpage, page);
+        rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
 
         if (rc)
                 return rc;
 
-        bh = head;
-        do {
-                get_bh(bh);
-                lock_buffer(bh);
-                bh = bh->b_this_page;
-
-        } while (bh != head);
+        /*
+         * In the async case, migrate_page_move_mapping locked the buffers
+         * with an IRQ-safe spinlock held. In the sync case, the buffers
+         * need to be locked now
+         */
+        if (mode != MIGRATE_ASYNC)
+                BUG_ON(!buffer_migrate_lock_buffers(head, mode));
 
         ClearPagePrivate(page);
         set_page_private(newpage, page_private(page));
@@ -536,10 +602,14 @@ static int writeout(struct address_space *mapping, struct page *page)
  * Default handling if a filesystem does not provide a migration function.
  */
 static int fallback_migrate_page(struct address_space *mapping,
-        struct page *newpage, struct page *page)
+        struct page *newpage, struct page *page, enum migrate_mode mode)
 {
-        if (PageDirty(page))
+        if (PageDirty(page)) {
+                /* Only writeback pages in full synchronous migration */
+                if (mode != MIGRATE_SYNC)
+                        return -EBUSY;
                 return writeout(mapping, page);
+        }
 
         /*
          * Buffers may be managed in a filesystem specific way.
@@ -549,7 +619,7 @@ static int fallback_migrate_page(struct address_space *mapping,
             !try_to_release_page(page, GFP_KERNEL))
                 return -EAGAIN;
 
-        return migrate_page(mapping, newpage, page);
+        return migrate_page(mapping, newpage, page, mode);
 }
 
 /*
@@ -564,7 +634,7 @@ static int fallback_migrate_page(struct address_space *mapping,
  *   == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-                                int remap_swapcache, bool sync)
+                                int remap_swapcache, enum migrate_mode mode)
 {
         struct address_space *mapping;
         int rc;
@@ -585,29 +655,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 
         mapping = page_mapping(page);
         if (!mapping)
-                rc = migrate_page(mapping, newpage, page);
-        else {
+                rc = migrate_page(mapping, newpage, page, mode);
+        else if (mapping->a_ops->migratepage)
                 /*
-                 * Do not writeback pages if !sync and migratepage is
-                 * not pointing to migrate_page() which is nonblocking
-                 * (swapcache/tmpfs uses migratepage = migrate_page).
+                 * Most pages have a mapping and most filesystems provide a
+                 * migratepage callback. Anonymous pages are part of swap
+                 * space which also has its own migratepage callback. This
+                 * is the most common path for page migration.
                  */
-                if (PageDirty(page) && !sync &&
-                    mapping->a_ops->migratepage != migrate_page)
-                        rc = -EBUSY;
-                else if (mapping->a_ops->migratepage)
-                        /*
-                         * Most pages have a mapping and most filesystems
-                         * should provide a migration function. Anonymous
-                         * pages are part of swap space which also has its
-                         * own migration function. This is the most common
-                         * path for page migration.
-                         */
-                        rc = mapping->a_ops->migratepage(mapping,
-                                                        newpage, page);
-                else
-                        rc = fallback_migrate_page(mapping, newpage, page);
-        }
+                rc = mapping->a_ops->migratepage(mapping,
+                                                newpage, page, mode);
+        else
+                rc = fallback_migrate_page(mapping, newpage, page, mode);
 
         if (rc) {
                 newpage->mapping = NULL;
@@ -621,38 +680,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
         return rc;
 }
 
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-                        struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+                        int force, bool offlining, enum migrate_mode mode)
 {
-        int rc = 0;
-        int *result = NULL;
-        struct page *newpage = get_new_page(page, private, &result);
+        int rc = -EAGAIN;
         int remap_swapcache = 1;
         int charge = 0;
         struct mem_cgroup *mem;
         struct anon_vma *anon_vma = NULL;
 
-        if (!newpage)
-                return -ENOMEM;
-
-        if (page_count(page) == 1) {
-                /* page was freed from under us. So we are done. */
-                goto move_newpage;
-        }
-        if (unlikely(PageTransHuge(page)))
-                if (unlikely(split_huge_page(page)))
-                        goto move_newpage;
-
-        /* prepare cgroup just returns 0 or -ENOMEM */
-        rc = -EAGAIN;
-
         if (!trylock_page(page)) {
-                if (!force || !sync)
-                        goto move_newpage;
+                if (!force || mode == MIGRATE_ASYNC)
+                        goto out;
 
                 /*
                  * It's not safe for direct compaction to call lock_page.
@@ -668,7 +707,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                  * altogether.
                  */
                 if (current->flags & PF_MEMALLOC)
-                        goto move_newpage;
+                        goto out;
 
                 lock_page(page);
         }
@@ -697,10 +736,12 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 
         if (PageWriteback(page)) {
                 /*
-                 * For !sync, there is no point retrying as the retry loop
-                 * is expected to be too short for PageWriteback to be cleared
+                 * Only in the case of a full syncronous migration is it
+                 * necessary to wait for PageWriteback. In the async case,
+                 * the retry loop is too short and in the sync-light case,
+                 * the overhead of stalling is too much
                  */
-                if (!sync) {
+                if (mode != MIGRATE_SYNC) {
                         rc = -EBUSY;
                         goto uncharge;
                 }
@@ -771,7 +812,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 
 skip_unmap:
         if (!page_mapped(page))
-                rc = move_to_new_page(newpage, page, remap_swapcache, sync);
+                rc = move_to_new_page(newpage, page, remap_swapcache, mode);
 
         if (rc && remap_swapcache)
                 remove_migration_ptes(page, page);
@@ -785,27 +826,53 @@ uncharge:
         mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 unlock:
         unlock_page(page);
+out:
+        return rc;
+}
 
-move_newpage:
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+                        struct page *page, int force, bool offlining,
+                        enum migrate_mode mode)
+{
+        int rc = 0;
+        int *result = NULL;
+        struct page *newpage = get_new_page(page, private, &result);
+
+        if (!newpage)
+                return -ENOMEM;
+
+        if (page_count(page) == 1) {
+                /* page was freed from under us. So we are done. */
+                goto out;
+        }
+
+        if (unlikely(PageTransHuge(page)))
+                if (unlikely(split_huge_page(page)))
+                        goto out;
+
+        rc = __unmap_and_move(page, newpage, force, offlining, mode);
+out:
         if (rc != -EAGAIN) {
                 /*
                  * A page that has been migrated has all references
                  * removed and will be freed. A page that has not been
                  * migrated will have kepts its references and be
                  * restored.
                  */
                 list_del(&page->lru);
                 dec_zone_page_state(page, NR_ISOLATED_ANON +
                                 page_is_file_cache(page));
                 putback_lru_page(page);
         }
-
         /*
          * Move the new page to the LRU. If migration was not successful
          * then this will free the page.
          */
         putback_lru_page(newpage);
-
         if (result) {
                 if (rc)
                         *result = rc;
@@ -835,7 +902,8 @@ move_newpage:
  */
 static int unmap_and_move_huge_page(new_page_t get_new_page,
                                 unsigned long private, struct page *hpage,
-                                int force, bool offlining, bool sync)
+                                int force, bool offlining,
+                                enum migrate_mode mode)
 {
         int rc = 0;
         int *result = NULL;
@@ -848,7 +916,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
         rc = -EAGAIN;
 
         if (!trylock_page(hpage)) {
-                if (!force || !sync)
+                if (!force || mode != MIGRATE_SYNC)
                         goto out;
                 lock_page(hpage);
         }
@@ -859,7 +927,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
         try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
 
         if (!page_mapped(hpage))
-                rc = move_to_new_page(new_hpage, hpage, 1, sync);
+                rc = move_to_new_page(new_hpage, hpage, 1, mode);
 
         if (rc)
                 remove_migration_ptes(hpage, hpage);
@@ -902,7 +970,7 @@ out:
  */
 int migrate_pages(struct list_head *from,
                 new_page_t get_new_page, unsigned long private, bool offlining,
-                bool sync)
+                enum migrate_mode mode)
 {
         int retry = 1;
         int nr_failed = 0;
@@ -923,7 +991,7 @@ int migrate_pages(struct list_head *from,
 
                         rc = unmap_and_move(get_new_page, private,
                                                 page, pass > 2, offlining,
-                                                sync);
+                                                mode);
 
                         switch(rc) {
                         case -ENOMEM:
@@ -953,7 +1021,7 @@ out:
 
 int migrate_huge_pages(struct list_head *from,
                 new_page_t get_new_page, unsigned long private, bool offlining,
-                bool sync)
+                enum migrate_mode mode)
 {
         int retry = 1;
         int nr_failed = 0;
@@ -970,7 +1038,7 @@ int migrate_huge_pages(struct list_head *from,
 
                         rc = unmap_and_move_huge_page(get_new_page,
                                         private, page, pass > 2, offlining,
-                                        sync);
+                                        mode);
 
                         switch(rc) {
                         case -ENOMEM:
@@ -1099,7 +1167,7 @@ set_status:
         err = 0;
         if (!list_empty(&pagelist)) {
                 err = migrate_pages(&pagelist, new_page_node,
-                                (unsigned long)pm, 0, true);
+                                (unsigned long)pm, 0, MIGRATE_SYNC);
                 if (err)
                         putback_lru_pages(&pagelist);
         }
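
A recurring pattern in the migrate.c changes above is buffer_migrate_lock_buffers() in MIGRATE_ASYNC mode: try-lock every buffer, and on the first failure unlock the ones already taken and report failure instead of blocking. A self-contained userspace sketch of that all-or-nothing trylock, using pthread mutexes in place of buffer_head locks (illustrative only, not kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NLOCKS 4
static pthread_mutex_t locks[NLOCKS];

static bool trylock_all(void)
{
        for (int i = 0; i < NLOCKS; i++) {
                if (pthread_mutex_trylock(&locks[i]) != 0) {
                        /* Roll back: unlock everything taken so far. */
                        for (int j = 0; j < i; j++)
                                pthread_mutex_unlock(&locks[j]);
                        return false;
                }
        }
        return true;
}

static void unlock_all(void)
{
        for (int i = 0; i < NLOCKS; i++)
                pthread_mutex_unlock(&locks[i]);
}

int main(void)
{
        for (int i = 0; i < NLOCKS; i++)
                pthread_mutex_init(&locks[i], NULL);

        if (trylock_all()) {
                printf("got every lock without blocking\n");
                unlock_all();
        } else {
                printf("would return -EAGAIN, as async migration does\n");
        }
        return 0;
}
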
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088..71c78115c45 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
 void __mmu_notifier_release(struct mm_struct *mm)
 {
         struct mmu_notifier *mn;
+        struct hlist_node *n;
+
+        /*
+         * RCU here will block mmu_notifier_unregister until
+         * ->release returns.
+         */
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
+                /*
+                 * if ->release runs before mmu_notifier_unregister it
+                 * must be handled as it's the only way for the driver
+                 * to flush all existing sptes and stop the driver
+                 * from establishing any more sptes before all the
+                 * pages in the mm are freed.
+                 */
+                if (mn->ops->release)
+                        mn->ops->release(mn, mm);
+        rcu_read_unlock();
 
         spin_lock(&mm->mmu_notifier_mm->lock);
         while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
                  * mmu_notifier_unregister to return.
                  */
                 hlist_del_init_rcu(&mn->hlist);
-                /*
-                 * RCU here will block mmu_notifier_unregister until
-                 * ->release returns.
-                 */
-                rcu_read_lock();
-                spin_unlock(&mm->mmu_notifier_mm->lock);
-                /*
-                 * if ->release runs before mmu_notifier_unregister it
-                 * must be handled as it's the only way for the driver
-                 * to flush all existing sptes and stop the driver
-                 * from establishing any more sptes before all the
-                 * pages in the mm are freed.
-                 */
-                if (mn->ops->release)
-                        mn->ops->release(mn, mm);
-                rcu_read_unlock();
-                spin_lock(&mm->mmu_notifier_mm->lock);
         }
         spin_unlock(&mm->mmu_notifier_mm->lock);
 
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
 {
         BUG_ON(atomic_read(&mm->mm_count) <= 0);
 
-        spin_lock(&mm->mmu_notifier_mm->lock);
         if (!hlist_unhashed(&mn->hlist)) {
-                hlist_del_rcu(&mn->hlist);
-
                 /*
                  * RCU here will force exit_mmap to wait ->release to finish
                  * before freeing the pages.
                  */
                 rcu_read_lock();
-                spin_unlock(&mm->mmu_notifier_mm->lock);
+
                 /*
                  * exit_mmap will block in mmu_notifier_release to
                  * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
                 if (mn->ops->release)
                         mn->ops->release(mn, mm);
                 rcu_read_unlock();
-        } else
+
+                spin_lock(&mm->mmu_notifier_mm->lock);
+                hlist_del_rcu(&mn->hlist);
                 spin_unlock(&mm->mmu_notifier_mm->lock);
+        }
 
         /*
          * Wait any running method to finish, of course including
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7f258..e39e3efe4a4 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -83,8 +83,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
 
 static void __init __free_pages_memory(unsigned long start, unsigned long end)
 {
-        int i;
-        unsigned long start_aligned, end_aligned;
+        unsigned long i, start_aligned, end_aligned;
         int order = ilog2(BITS_PER_LONG);
 
         start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f474da7ee..bfe789472b4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -555,7 +555,7 @@ static inline void __free_one_page(struct page *page,
                 combined_idx = buddy_idx & page_idx;
                 higher_page = page + (combined_idx - page_idx);
                 buddy_idx = __find_buddy_index(combined_idx, order + 1);
-                higher_buddy = page + (buddy_idx - combined_idx);
+                higher_buddy = higher_page + (buddy_idx - combined_idx);
                 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
                         list_add_tail(&page->lru,
                                 &zone->free_area[order].free_list[migratetype]);
@@ -1912,14 +1912,20 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
         nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-        int migratetype, unsigned long *did_some_progress,
-        bool sync_migration)
+        int migratetype, bool sync_migration,
+        bool *deferred_compaction,
+        unsigned long *did_some_progress)
 {
         struct page *page;
 
-        if (!order || compaction_deferred(preferred_zone))
+        if (!order)
                 return NULL;
 
+        if (compaction_deferred(preferred_zone)) {
+                *deferred_compaction = true;
+                return NULL;
+        }
+
         current->flags |= PF_MEMALLOC;
         *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
                                                 nodemask, sync_migration);
@@ -1947,7 +1953,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                  * but not enough to satisfy watermarks.
                  */
                 count_vm_event(COMPACTFAIL);
-                defer_compaction(preferred_zone);
+
+                /*
+                 * As async compaction considers a subset of pageblocks, only
+                 * defer if the failure was a sync compaction failure.
+                 */
+                if (sync_migration)
+                        defer_compaction(preferred_zone);
 
                 cond_resched();
         }
@@ -1959,8 +1971,9 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
         nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-        int migratetype, unsigned long *did_some_progress,
-        bool sync_migration)
+        int migratetype, bool sync_migration,
+        bool *deferred_compaction,
+        unsigned long *did_some_progress)
 {
         return NULL;
 }
@@ -2110,6 +2123,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2110 unsigned long pages_reclaimed = 0; 2123 unsigned long pages_reclaimed = 0;
2111 unsigned long did_some_progress; 2124 unsigned long did_some_progress;
2112 bool sync_migration = false; 2125 bool sync_migration = false;
2126 bool deferred_compaction = false;
2113 2127
2114 /* 2128 /*
2115 * In the slowpath, we sanity check order to avoid ever trying to 2129 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2190,12 +2204,22 @@ rebalance:
2190 zonelist, high_zoneidx, 2204 zonelist, high_zoneidx,
2191 nodemask, 2205 nodemask,
2192 alloc_flags, preferred_zone, 2206 alloc_flags, preferred_zone,
2193 migratetype, &did_some_progress, 2207 migratetype, sync_migration,
2194 sync_migration); 2208 &deferred_compaction,
2209 &did_some_progress);
2195 if (page) 2210 if (page)
2196 goto got_pg; 2211 goto got_pg;
2197 sync_migration = true; 2212 sync_migration = true;
2198 2213
2214 /*
2215 * If compaction is deferred for high-order allocations, it is because
 2216 * sync compaction recently failed. If this is the case and the caller
2217 * has requested the system not be heavily disrupted, fail the
2218 * allocation now instead of entering direct reclaim
2219 */
2220 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2221 goto nopage;
2222
2199 /* Try direct reclaim and then allocating */ 2223 /* Try direct reclaim and then allocating */
2200 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2224 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2201 zonelist, high_zoneidx, 2225 zonelist, high_zoneidx,
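The hunks above thread a deferred_compaction flag out of __alloc_pages_direct_compact() into the slow path so it can fail fast for callers that asked for low disruption. A small sketch of that decision, with an illustrative flag bit standing in for the real __GFP_NO_KSWAPD definition in gfp.h:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the __GFP_NO_KSWAPD bit; the real value lives in
 * include/linux/gfp.h. */
#define DEMO_GFP_NO_KSWAPD (1u << 0)

/* Mirrors the check added above: when async compaction was skipped because a
 * recent sync compaction failed, and the caller does not want the system
 * heavily disrupted, give up instead of entering direct reclaim. */
static bool fail_before_reclaim(bool deferred_compaction, unsigned int gfp_mask)
{
        return deferred_compaction && (gfp_mask & DEMO_GFP_NO_KSWAPD);
}

int main(void)
{
        printf("%d\n", fail_before_reclaim(true, DEMO_GFP_NO_KSWAPD));  /* 1 */
        printf("%d\n", fail_before_reclaim(true, 0));                   /* 0 */
        printf("%d\n", fail_before_reclaim(false, DEMO_GFP_NO_KSWAPD)); /* 0 */
        return 0;
}
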
@@ -2266,8 +2290,9 @@ rebalance:
2266 zonelist, high_zoneidx, 2290 zonelist, high_zoneidx,
2267 nodemask, 2291 nodemask,
2268 alloc_flags, preferred_zone, 2292 alloc_flags, preferred_zone,
2269 migratetype, &did_some_progress, 2293 migratetype, sync_migration,
2270 sync_migration); 2294 &deferred_compaction,
2295 &did_some_progress);
2271 if (page) 2296 if (page)
2272 goto got_pg; 2297 goto got_pg;
2273 } 2298 }
@@ -2291,8 +2316,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2291{ 2316{
2292 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2317 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2293 struct zone *preferred_zone; 2318 struct zone *preferred_zone;
2294 struct page *page; 2319 struct page *page = NULL;
2295 int migratetype = allocflags_to_migratetype(gfp_mask); 2320 int migratetype = allocflags_to_migratetype(gfp_mask);
2321 unsigned int cpuset_mems_cookie;
2296 2322
2297 gfp_mask &= gfp_allowed_mask; 2323 gfp_mask &= gfp_allowed_mask;
2298 2324
@@ -2311,15 +2337,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2311 if (unlikely(!zonelist->_zonerefs->zone)) 2337 if (unlikely(!zonelist->_zonerefs->zone))
2312 return NULL; 2338 return NULL;
2313 2339
2314 get_mems_allowed(); 2340retry_cpuset:
2341 cpuset_mems_cookie = get_mems_allowed();
2342
2315 /* The preferred zone is used for statistics later */ 2343 /* The preferred zone is used for statistics later */
2316 first_zones_zonelist(zonelist, high_zoneidx, 2344 first_zones_zonelist(zonelist, high_zoneidx,
2317 nodemask ? : &cpuset_current_mems_allowed, 2345 nodemask ? : &cpuset_current_mems_allowed,
2318 &preferred_zone); 2346 &preferred_zone);
2319 if (!preferred_zone) { 2347 if (!preferred_zone)
2320 put_mems_allowed(); 2348 goto out;
2321 return NULL;
2322 }
2323 2349
2324 /* First allocation attempt */ 2350 /* First allocation attempt */
2325 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2351 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2329,9 +2355,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2329 page = __alloc_pages_slowpath(gfp_mask, order, 2355 page = __alloc_pages_slowpath(gfp_mask, order,
2330 zonelist, high_zoneidx, nodemask, 2356 zonelist, high_zoneidx, nodemask,
2331 preferred_zone, migratetype); 2357 preferred_zone, migratetype);
2332 put_mems_allowed();
2333 2358
2334 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2359 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2360
2361out:
2362 /*
2363 * When updating a task's mems_allowed, it is possible to race with
2364 * parallel threads in such a way that an allocation can fail while
2365 * the mask is being updated. If a page allocation is about to fail,
2366 * check if the cpuset changed during allocation and if so, retry.
2367 */
2368 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2369 goto retry_cpuset;
2370
2335 return page; 2371 return page;
2336} 2372}
2337EXPORT_SYMBOL(__alloc_pages_nodemask); 2373EXPORT_SYMBOL(__alloc_pages_nodemask);
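The retry_cpuset logic above replaces a bare get_mems_allowed()/put_mems_allowed() pair with a sequence-cookie protocol: sample a cookie, attempt the allocation, and retry only if the cookie went stale and the attempt failed. A toy single-threaded sketch of that read-retry shape; the names and the seqcount here are illustrative, not the kernel's cpuset implementation:

#include <stdbool.h>
#include <stdio.h>

static unsigned int mems_seq;       /* bumped by a (not shown) mask updater */
static int current_node = 1;        /* the data the cookie protects */

static unsigned int demo_get_mems_allowed(void)
{
        return mems_seq;            /* read side: sample the sequence */
}

static bool demo_put_mems_allowed(unsigned int cookie)
{
        return mems_seq == cookie;  /* true when no update raced with us */
}

/* Returns a node id, or -1 for a failed attempt. */
static int try_alloc(void)
{
        return current_node;
}

int main(void)
{
        unsigned int cookie;
        bool stale;
        int node;

        do {
                cookie = demo_get_mems_allowed();
                node = try_alloc();
                stale = !demo_put_mems_allowed(cookie);
                /* Retry only if the mask changed underneath us AND the
                 * attempt failed, matching the unlikely() test above. */
        } while (stale && node < 0);

        printf("allocated from node %d\n", node);
        return 0;
}
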
@@ -2555,13 +2591,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2555bool skip_free_areas_node(unsigned int flags, int nid) 2591bool skip_free_areas_node(unsigned int flags, int nid)
2556{ 2592{
2557 bool ret = false; 2593 bool ret = false;
2594 unsigned int cpuset_mems_cookie;
2558 2595
2559 if (!(flags & SHOW_MEM_FILTER_NODES)) 2596 if (!(flags & SHOW_MEM_FILTER_NODES))
2560 goto out; 2597 goto out;
2561 2598
2562 get_mems_allowed(); 2599 do {
2563 ret = !node_isset(nid, cpuset_current_mems_allowed); 2600 cpuset_mems_cookie = get_mems_allowed();
2564 put_mems_allowed(); 2601 ret = !node_isset(nid, cpuset_current_mems_allowed);
2602 } while (!put_mems_allowed(cpuset_mems_cookie));
2565out: 2603out:
2566 return ret; 2604 return ret;
2567} 2605}
@@ -3441,25 +3479,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3441 if (page_to_nid(page) != zone_to_nid(zone)) 3479 if (page_to_nid(page) != zone_to_nid(zone))
3442 continue; 3480 continue;
3443 3481
3444 /* Blocks with reserved pages will never free, skip them. */
3445 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3446 if (pageblock_is_reserved(pfn, block_end_pfn))
3447 continue;
3448
3449 block_migratetype = get_pageblock_migratetype(page); 3482 block_migratetype = get_pageblock_migratetype(page);
3450 3483
3451 /* If this block is reserved, account for it */ 3484 /* Only test what is necessary when the reserves are not met */
3452 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 3485 if (reserve > 0) {
3453 reserve--; 3486 /*
3454 continue; 3487 * Blocks with reserved pages will never free, skip
3455 } 3488 * them.
3489 */
3490 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3491 if (pageblock_is_reserved(pfn, block_end_pfn))
3492 continue;
3456 3493
3457 /* Suitable for reserving if this block is movable */ 3494 /* If this block is reserved, account for it */
3458 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 3495 if (block_migratetype == MIGRATE_RESERVE) {
3459 set_pageblock_migratetype(page, MIGRATE_RESERVE); 3496 reserve--;
3460 move_freepages_block(zone, page, MIGRATE_RESERVE); 3497 continue;
3461 reserve--; 3498 }
3462 continue; 3499
3500 /* Suitable for reserving if this block is movable */
3501 if (block_migratetype == MIGRATE_MOVABLE) {
3502 set_pageblock_migratetype(page,
3503 MIGRATE_RESERVE);
3504 move_freepages_block(zone, page,
3505 MIGRATE_RESERVE);
3506 reserve--;
3507 continue;
3508 }
3463 } 3509 }
3464 3510
3465 /* 3511 /*
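The setup_zone_migrate_reserve() hunk above moves the pageblock_is_reserved() walk under the reserve > 0 branch, so the per-pfn scan is only paid for while reserve blocks are still being sought. A schematic of that gating, with a hypothetical expensive_check() standing in for the walk; the accounting is simplified and is not the function's real migratetype logic:

#include <stdbool.h>
#include <stdio.h>

static int checks_run;

/* Hypothetical stand-in for pageblock_is_reserved(): pretend it walks pfns. */
static bool expensive_check(int block)
{
        checks_run++;
        return block % 7 == 0;
}

int main(void)
{
        int reserve = 2;

        for (int block = 0; block < 100; block++) {
                /* Only pay for the walk while reserves are still needed. */
                if (reserve > 0) {
                        if (expensive_check(block))
                                continue;
                        reserve--;      /* pretend this block was reserved */
                }
        }
        printf("expensive checks run: %d\n", checks_run);
        return 0;
}
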
diff --git a/mm/percpu.c b/mm/percpu.c
index 0ae7a09141e..af0cc7a58f9 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1630,6 +1630,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1630 areas[group] = ptr; 1630 areas[group] = ptr;
1631 1631
1632 base = min(ptr, base); 1632 base = min(ptr, base);
1633 }
1634
1635 /*
1636 * Copy data and free unused parts. This should happen after all
1637 * allocations are complete; otherwise, we may end up with
1638 * overlapping groups.
1639 */
1640 for (group = 0; group < ai->nr_groups; group++) {
1641 struct pcpu_group_info *gi = &ai->groups[group];
1642 void *ptr = areas[group];
1633 1643
1634 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { 1644 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1635 if (gi->cpu_map[i] == NR_CPUS) { 1645 if (gi->cpu_map[i] == NR_CPUS) {
diff --git a/mm/shmem.c b/mm/shmem.c
index 883e98f78ca..df31a443293 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2348,12 +2348,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2348{ 2348{
2349 struct inode *inode; 2349 struct inode *inode;
2350 struct dentry *dentry = NULL; 2350 struct dentry *dentry = NULL;
2351 u64 inum = fid->raw[2]; 2351 u64 inum;
2352 inum = (inum << 32) | fid->raw[1];
2353 2352
2354 if (fh_len < 3) 2353 if (fh_len < 3)
2355 return NULL; 2354 return NULL;
2356 2355
2356 inum = fid->raw[2];
2357 inum = (inum << 32) | fid->raw[1];
2358
2357 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), 2359 inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
2358 shmem_match, fid->raw); 2360 shmem_match, fid->raw);
2359 if (inode) { 2361 if (inode) {
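The shmem hunk above validates fh_len before reading fid->raw[1] and fid->raw[2]. A sketch of that bounds-check-before-read ordering; the fid layout here is a simplified stand-in, not the real export file handle:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for the file handle layout used above. */
struct demo_fid {
        uint32_t raw[3];
};

/* Reject short handles before touching raw[1]/raw[2], which is the
 * reordering the hunk above performs. Returns 0 on success. */
static int decode_inum(const struct demo_fid *fid, int fh_len, uint64_t *inum)
{
        if (fh_len < 3)
                return -1;

        *inum = fid->raw[2];
        *inum = (*inum << 32) | fid->raw[1];
        return 0;
}

int main(void)
{
        struct demo_fid fid = { { 7, 2, 3 } };
        uint64_t inum;

        if (decode_inum(&fid, 2, &inum))
                printf("short handle rejected before any field access\n");
        if (!decode_inum(&fid, 3, &inum))
                printf("inum = 0x%llx\n", (unsigned long long)inum);
        return 0;
}
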
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de77..a67f8121ce5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3218,12 +3218,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3218 if (in_interrupt() || (flags & __GFP_THISNODE)) 3218 if (in_interrupt() || (flags & __GFP_THISNODE))
3219 return NULL; 3219 return NULL;
3220 nid_alloc = nid_here = numa_mem_id(); 3220 nid_alloc = nid_here = numa_mem_id();
3221 get_mems_allowed();
3222 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3221 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3223 nid_alloc = cpuset_slab_spread_node(); 3222 nid_alloc = cpuset_slab_spread_node();
3224 else if (current->mempolicy) 3223 else if (current->mempolicy)
3225 nid_alloc = slab_node(current->mempolicy); 3224 nid_alloc = slab_node(current->mempolicy);
3226 put_mems_allowed();
3227 if (nid_alloc != nid_here) 3225 if (nid_alloc != nid_here)
3228 return ____cache_alloc_node(cachep, flags, nid_alloc); 3226 return ____cache_alloc_node(cachep, flags, nid_alloc);
3229 return NULL; 3227 return NULL;
@@ -3246,14 +3244,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3246 enum zone_type high_zoneidx = gfp_zone(flags); 3244 enum zone_type high_zoneidx = gfp_zone(flags);
3247 void *obj = NULL; 3245 void *obj = NULL;
3248 int nid; 3246 int nid;
3247 unsigned int cpuset_mems_cookie;
3249 3248
3250 if (flags & __GFP_THISNODE) 3249 if (flags & __GFP_THISNODE)
3251 return NULL; 3250 return NULL;
3252 3251
3253 get_mems_allowed();
3254 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3255 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3252 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3256 3253
3254retry_cpuset:
3255 cpuset_mems_cookie = get_mems_allowed();
3256 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3257
3257retry: 3258retry:
3258 /* 3259 /*
3259 * Look through allowed nodes for objects available 3260 * Look through allowed nodes for objects available
@@ -3306,7 +3307,9 @@ retry:
3306 } 3307 }
3307 } 3308 }
3308 } 3309 }
3309 put_mems_allowed(); 3310
3311 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3312 goto retry_cpuset;
3310 return obj; 3313 return obj;
3311} 3314}
3312 3315
diff --git a/mm/slub.c b/mm/slub.c
index 10ab2335e2e..ae6e80ed1e5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1457,6 +1457,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1457 struct zone *zone; 1457 struct zone *zone;
1458 enum zone_type high_zoneidx = gfp_zone(flags); 1458 enum zone_type high_zoneidx = gfp_zone(flags);
1459 struct page *page; 1459 struct page *page;
1460 unsigned int cpuset_mems_cookie;
1460 1461
1461 /* 1462 /*
1462 * The defrag ratio allows a configuration of the tradeoffs between 1463 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1480,23 +1481,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1480 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1481 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1481 return NULL; 1482 return NULL;
1482 1483
1483 get_mems_allowed(); 1484 do {
1484 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1485 cpuset_mems_cookie = get_mems_allowed();
1485 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1486 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1486 struct kmem_cache_node *n; 1487 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1487 1488 struct kmem_cache_node *n;
1488 n = get_node(s, zone_to_nid(zone)); 1489
1489 1490 n = get_node(s, zone_to_nid(zone));
1490 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1491
1491 n->nr_partial > s->min_partial) { 1492 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1492 page = get_partial_node(n); 1493 n->nr_partial > s->min_partial) {
1493 if (page) { 1494 page = get_partial_node(n);
1494 put_mems_allowed(); 1495 if (page) {
1495 return page; 1496 /*
1497 * Return the object even if
1498 * put_mems_allowed indicated that
1499 * the cpuset mems_allowed was
1500 * updated in parallel. It's a
1501 * harmless race between the alloc
1502 * and the cpuset update.
1503 */
1504 put_mems_allowed(cpuset_mems_cookie);
1505 return page;
1506 }
1496 } 1507 }
1497 } 1508 }
1498 } 1509 } while (!put_mems_allowed(cpuset_mems_cookie));
1499 put_mems_allowed();
1500#endif 1510#endif
1501 return NULL; 1511 return NULL;
1502} 1512}
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad..3e9829f3988 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -398,11 +398,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
398 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) 398 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
399 return 0; 399 return 0;
400 400
401 clear_page_mlock(page);
402
401 spin_lock_irq(&mapping->tree_lock); 403 spin_lock_irq(&mapping->tree_lock);
402 if (PageDirty(page)) 404 if (PageDirty(page))
403 goto failed; 405 goto failed;
404 406
405 clear_page_mlock(page);
406 BUG_ON(page_has_private(page)); 407 BUG_ON(page_has_private(page));
407 __delete_from_page_cache(page); 408 __delete_from_page_cache(page);
408 spin_unlock_irq(&mapping->tree_lock); 409 spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 43b44dbadda..bdb70042c12 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,7 +256,7 @@ struct vmap_area {
256 struct rb_node rb_node; /* address sorted rbtree */ 256 struct rb_node rb_node; /* address sorted rbtree */
257 struct list_head list; /* address sorted list */ 257 struct list_head list; /* address sorted list */
258 struct list_head purge_list; /* "lazy purge" list */ 258 struct list_head purge_list; /* "lazy purge" list */
259 void *private; 259 struct vm_struct *vm;
260 struct rcu_head rcu_head; 260 struct rcu_head rcu_head;
261}; 261};
262 262
@@ -1174,9 +1174,10 @@ void __init vmalloc_init(void)
1174 /* Import existing vmlist entries. */ 1174 /* Import existing vmlist entries. */
1175 for (tmp = vmlist; tmp; tmp = tmp->next) { 1175 for (tmp = vmlist; tmp; tmp = tmp->next) {
1176 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1176 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1177 va->flags = tmp->flags | VM_VM_AREA; 1177 va->flags = VM_VM_AREA;
1178 va->va_start = (unsigned long)tmp->addr; 1178 va->va_start = (unsigned long)tmp->addr;
1179 va->va_end = va->va_start + tmp->size; 1179 va->va_end = va->va_start + tmp->size;
1180 va->vm = tmp;
1180 __insert_vmap_area(va); 1181 __insert_vmap_area(va);
1181 } 1182 }
1182 1183
@@ -1274,7 +1275,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1274 vm->addr = (void *)va->va_start; 1275 vm->addr = (void *)va->va_start;
1275 vm->size = va->va_end - va->va_start; 1276 vm->size = va->va_end - va->va_start;
1276 vm->caller = caller; 1277 vm->caller = caller;
1277 va->private = vm; 1278 va->vm = vm;
1278 va->flags |= VM_VM_AREA; 1279 va->flags |= VM_VM_AREA;
1279} 1280}
1280 1281
@@ -1397,7 +1398,7 @@ static struct vm_struct *find_vm_area(const void *addr)
1397 1398
1398 va = find_vmap_area((unsigned long)addr); 1399 va = find_vmap_area((unsigned long)addr);
1399 if (va && va->flags & VM_VM_AREA) 1400 if (va && va->flags & VM_VM_AREA)
1400 return va->private; 1401 return va->vm;
1401 1402
1402 return NULL; 1403 return NULL;
1403} 1404}
@@ -1416,7 +1417,7 @@ struct vm_struct *remove_vm_area(const void *addr)
1416 1417
1417 va = find_vmap_area((unsigned long)addr); 1418 va = find_vmap_area((unsigned long)addr);
1418 if (va && va->flags & VM_VM_AREA) { 1419 if (va && va->flags & VM_VM_AREA) {
1419 struct vm_struct *vm = va->private; 1420 struct vm_struct *vm = va->vm;
1420 1421
1421 if (!(vm->flags & VM_UNLIST)) { 1422 if (!(vm->flags & VM_UNLIST)) {
1422 struct vm_struct *tmp, **p; 1423 struct vm_struct *tmp, **p;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6072d74a16f..5326f98f506 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -248,35 +248,66 @@ unsigned long shrink_slab(struct shrink_control *shrink,
248 248
249 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
250 unsigned long long delta; 250 unsigned long long delta;
251 unsigned long total_scan; 251 long total_scan;
252 unsigned long max_pass; 252 long max_pass;
253 int shrink_ret = 0;
254 long nr;
255 long new_nr;
253 256
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 257 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
258 if (max_pass <= 0)
259 continue;
260
261 /*
262 * copy the current shrinker scan count into a local variable
263 * and zero it so that other concurrent shrinker invocations
264 * don't also do this scanning work.
265 */
266 do {
267 nr = shrinker->nr;
268 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
269
270 total_scan = nr;
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 271 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 272 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 273 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 274 total_scan += delta;
259 if (shrinker->nr < 0) { 275 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 276 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 277 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 278 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 279 total_scan = max_pass;
264 } 280 }
265 281
266 /* 282 /*
283 * We need to avoid excessive windup on filesystem shrinkers
284 * due to large numbers of GFP_NOFS allocations causing the
285 * shrinkers to return -1 all the time. This results in a large
286 * nr being built up so when a shrink that can do some work
287 * comes along it empties the entire cache due to nr >>>
288 * max_pass. This is bad for sustaining a working set in
289 * memory.
290 *
291 * Hence only allow the shrinker to scan the entire cache when
292 * a large delta change is calculated directly.
293 */
294 if (delta < max_pass / 4)
295 total_scan = min(total_scan, max_pass / 2);
296
297 /*
267 * Avoid risking looping forever due to too large nr value: 298 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 299 * never try to free more than twice the estimate number of
269 * freeable entries. 300 * freeable entries.
270 */ 301 */
271 if (shrinker->nr > max_pass * 2) 302 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 303 total_scan = max_pass * 2;
273 304
274 total_scan = shrinker->nr; 305 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 306 nr_pages_scanned, lru_pages,
307 max_pass, delta, total_scan);
276 308
277 while (total_scan >= SHRINK_BATCH) { 309 while (total_scan >= SHRINK_BATCH) {
278 long this_scan = SHRINK_BATCH; 310 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 311 int nr_before;
281 312
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 313 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
@@ -292,7 +323,19 @@ unsigned long shrink_slab(struct shrink_control *shrink,
292 cond_resched(); 323 cond_resched();
293 } 324 }
294 325
295 shrinker->nr += total_scan; 326 /*
327 * move the unused scan count back into the shrinker in a
328 * manner that handles concurrent updates. If we exhausted the
329 * scan, there is no need to do an update.
330 */
331 do {
332 nr = shrinker->nr;
333 new_nr = total_scan + nr;
334 if (total_scan <= 0)
335 break;
336 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
337
338 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 339 }
297 up_read(&shrinker_rwsem); 340 up_read(&shrinker_rwsem);
298out: 341out:
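The shrink_slab() hunk above makes shrinker->nr private to the caller by atomically swapping it to zero, then atomically returns whatever was not scanned. A sketch of that claim-and-return pattern, using the GCC/Clang __sync_val_compare_and_swap builtin in place of the kernel's cmpxchg():

#include <stdio.h>

static long shrinker_nr;

/* Take the whole counter for ourselves so concurrent callers see zero. */
static long claim_all(long *counter)
{
        long nr;

        do {
                nr = *counter;
        } while (__sync_val_compare_and_swap(counter, nr, 0) != nr);
        return nr;
}

/* Put the unused portion back; skip the update entirely when nothing is
 * left, as the hunk above does when the scan was exhausted. */
static void give_back(long *counter, long unused)
{
        long nr, new_nr;

        do {
                if (unused <= 0)
                        return;
                nr = *counter;
                new_nr = nr + unused;
        } while (__sync_val_compare_and_swap(counter, nr, new_nr) != nr);
}

int main(void)
{
        shrinker_nr = 128;

        long total_scan = claim_all(&shrinker_nr);   /* now private to us */
        long scanned = 100;

        give_back(&shrinker_nr, total_scan - scanned);
        printf("leftover returned to shrinker_nr: %ld\n", shrinker_nr); /* 28 */
        return 0;
}
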
@@ -665,7 +708,7 @@ static enum page_references page_check_references(struct page *page,
665 return PAGEREF_RECLAIM; 708 return PAGEREF_RECLAIM;
666 709
667 if (referenced_ptes) { 710 if (referenced_ptes) {
668 if (PageAnon(page)) 711 if (PageSwapBacked(page))
669 return PAGEREF_ACTIVATE; 712 return PAGEREF_ACTIVATE;
670 /* 713 /*
671 * All mapped pages start out with page table 714 * All mapped pages start out with page table
@@ -683,7 +726,13 @@ static enum page_references page_check_references(struct page *page,
683 */ 726 */
684 SetPageReferenced(page); 727 SetPageReferenced(page);
685 728
686 if (referenced_page) 729 if (referenced_page || referenced_ptes > 1)
730 return PAGEREF_ACTIVATE;
731
732 /*
733 * Activate file-backed executable pages after first usage.
734 */
735 if (vm_flags & VM_EXEC)
687 return PAGEREF_ACTIVATE; 736 return PAGEREF_ACTIVATE;
688 737
689 return PAGEREF_KEEP; 738 return PAGEREF_KEEP;
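The page_check_references() hunk above tightens activation: swap-backed pages, pages referenced from more than one pte, and executable file pages are activated on reference; other referenced file pages stay on the inactive list. A condensed decision helper covering only the referenced-page path shown above, with DEMO_VM_EXEC standing in for the real VM_EXEC flag:

#include <stdbool.h>
#include <stdio.h>

#define DEMO_VM_EXEC 0x4   /* stand-in for VM_EXEC from mm.h */

static const char *classify_referenced(bool swap_backed, int referenced_ptes,
                                       bool referenced_page,
                                       unsigned long vm_flags)
{
        if (swap_backed)
                return "activate";
        if (referenced_page || referenced_ptes > 1)
                return "activate";
        if (vm_flags & DEMO_VM_EXEC)
                return "activate";
        return "keep";
}

int main(void)
{
        printf("%s\n", classify_referenced(false, 1, false, DEMO_VM_EXEC));
        printf("%s\n", classify_referenced(false, 2, false, 0));
        printf("%s\n", classify_referenced(false, 1, false, 0));
        return 0;
}
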
@@ -972,23 +1021,27 @@ keep_lumpy:
972 * 1021 *
973 * returns 0 on success, -ve errno on failure. 1022 * returns 0 on success, -ve errno on failure.
974 */ 1023 */
975int __isolate_lru_page(struct page *page, int mode, int file) 1024int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
976{ 1025{
1026 bool all_lru_mode;
977 int ret = -EINVAL; 1027 int ret = -EINVAL;
978 1028
979 /* Only take pages on the LRU. */ 1029 /* Only take pages on the LRU. */
980 if (!PageLRU(page)) 1030 if (!PageLRU(page))
981 return ret; 1031 return ret;
982 1032
1033 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1034 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1035
983 /* 1036 /*
984 * When checking the active state, we need to be sure we are 1037 * When checking the active state, we need to be sure we are
 985 * dealing with comparable boolean values. Take the logical not 1038 * dealing with comparable boolean values. Take the logical not
986 * of each. 1039 * of each.
987 */ 1040 */
988 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1041 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
989 return ret; 1042 return ret;
990 1043
991 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1044 if (!all_lru_mode && !!page_is_file_cache(page) != file)
992 return ret; 1045 return ret;
993 1046
994 /* 1047 /*
@@ -1001,6 +1054,43 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1001 1054
1002 ret = -EBUSY; 1055 ret = -EBUSY;
1003 1056
1057 /*
1058 * To minimise LRU disruption, the caller can indicate that it only
1059 * wants to isolate pages it will be able to operate on without
1060 * blocking - clean pages for the most part.
1061 *
1062 * ISOLATE_CLEAN means that only clean pages should be isolated. This
 1063 * is used by reclaim when it cannot write to backing storage
1064 *
 1065 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1066 * that it is possible to migrate without blocking
1067 */
1068 if (mode & (ISOLATE_CLEAN|ISOLATE_ASYNC_MIGRATE)) {
1069 /* All the caller can do on PageWriteback is block */
1070 if (PageWriteback(page))
1071 return ret;
1072
1073 if (PageDirty(page)) {
1074 struct address_space *mapping;
1075
1076 /* ISOLATE_CLEAN means only clean pages */
1077 if (mode & ISOLATE_CLEAN)
1078 return ret;
1079
1080 /*
1081 * Only pages without mappings or that have a
1082 * ->migratepage callback are possible to migrate
1083 * without blocking
1084 */
1085 mapping = page_mapping(page);
1086 if (mapping && !mapping->a_ops->migratepage)
1087 return ret;
1088 }
1089 }
1090
1091 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1092 return ret;
1093
1004 if (likely(get_page_unless_zero(page))) { 1094 if (likely(get_page_unless_zero(page))) {
1005 /* 1095 /*
1006 * Be careful not to clear PageLRU until after we're 1096 * Be careful not to clear PageLRU until after we're
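The __isolate_lru_page() hunk above replaces the old three-way mode integer with an isolate_mode_t bitmask so callers can combine LRU-list selection with CLEAN/UNMAPPED/ASYNC_MIGRATE constraints. An illustrative reconstruction of such a flag set; the real definitions and values live in include/linux/mmzone.h and may differ:

#include <stdio.h>

enum {
        DEMO_ISOLATE_INACTIVE      = 1 << 0,
        DEMO_ISOLATE_ACTIVE        = 1 << 1,
        DEMO_ISOLATE_CLEAN         = 1 << 2,
        DEMO_ISOLATE_UNMAPPED      = 1 << 3,
        DEMO_ISOLATE_ASYNC_MIGRATE = 1 << 4,
};

int main(void)
{
        /* A lumpy-reclaim style request: take both LRU lists, but only
         * clean pages, mirroring how shrink_inactive_list() builds its
         * reclaim_mode in the later hunks. */
        unsigned int mode = DEMO_ISOLATE_INACTIVE | DEMO_ISOLATE_ACTIVE |
                            DEMO_ISOLATE_CLEAN;

        int all_lru_mode = (mode & (DEMO_ISOLATE_ACTIVE | DEMO_ISOLATE_INACTIVE)) ==
                           (DEMO_ISOLATE_ACTIVE | DEMO_ISOLATE_INACTIVE);

        printf("all_lru_mode=%d clean_only=%d unmapped_only=%d\n",
               all_lru_mode, !!(mode & DEMO_ISOLATE_CLEAN),
               !!(mode & DEMO_ISOLATE_UNMAPPED));
        return 0;
}
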
@@ -1036,7 +1126,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1036 */ 1126 */
1037static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1127static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1038 struct list_head *src, struct list_head *dst, 1128 struct list_head *src, struct list_head *dst,
1039 unsigned long *scanned, int order, int mode, int file) 1129 unsigned long *scanned, int order, isolate_mode_t mode,
1130 int file)
1040{ 1131{
1041 unsigned long nr_taken = 0; 1132 unsigned long nr_taken = 0;
1042 unsigned long nr_lumpy_taken = 0; 1133 unsigned long nr_lumpy_taken = 0;
@@ -1111,7 +1202,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1111 * anon page which don't already have a swap slot is 1202 * anon page which don't already have a swap slot is
1112 * pointless. 1203 * pointless.
1113 */ 1204 */
1114 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1205 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1115 !PageSwapCache(cursor_page)) 1206 !PageSwapCache(cursor_page))
1116 break; 1207 break;
1117 1208
@@ -1161,8 +1252,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1161static unsigned long isolate_pages_global(unsigned long nr, 1252static unsigned long isolate_pages_global(unsigned long nr,
1162 struct list_head *dst, 1253 struct list_head *dst,
1163 unsigned long *scanned, int order, 1254 unsigned long *scanned, int order,
1164 int mode, struct zone *z, 1255 isolate_mode_t mode,
1165 int active, int file) 1256 struct zone *z, int active, int file)
1166{ 1257{
1167 int lru = LRU_BASE; 1258 int lru = LRU_BASE;
1168 if (active) 1259 if (active)
@@ -1408,6 +1499,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1408 unsigned long nr_taken; 1499 unsigned long nr_taken;
1409 unsigned long nr_anon; 1500 unsigned long nr_anon;
1410 unsigned long nr_file; 1501 unsigned long nr_file;
1502 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1411 1503
1412 while (unlikely(too_many_isolated(zone, file, sc))) { 1504 while (unlikely(too_many_isolated(zone, file, sc))) {
1413 congestion_wait(BLK_RW_ASYNC, HZ/10); 1505 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1418,15 +1510,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1418 } 1510 }
1419 1511
1420 set_reclaim_mode(priority, sc, false); 1512 set_reclaim_mode(priority, sc, false);
1513 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1514 reclaim_mode |= ISOLATE_ACTIVE;
1515
1421 lru_add_drain(); 1516 lru_add_drain();
1517
1518 if (!sc->may_unmap)
1519 reclaim_mode |= ISOLATE_UNMAPPED;
1520 if (!sc->may_writepage)
1521 reclaim_mode |= ISOLATE_CLEAN;
1522
1422 spin_lock_irq(&zone->lru_lock); 1523 spin_lock_irq(&zone->lru_lock);
1423 1524
1424 if (scanning_global_lru(sc)) { 1525 if (scanning_global_lru(sc)) {
1425 nr_taken = isolate_pages_global(nr_to_scan, 1526 nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1426 &page_list, &nr_scanned, sc->order, 1527 &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1429 zone, 0, file);
1430 zone->pages_scanned += nr_scanned; 1528 zone->pages_scanned += nr_scanned;
1431 if (current_is_kswapd()) 1529 if (current_is_kswapd())
1432 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1530 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1435,12 +1533,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1435 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1533 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1436 nr_scanned); 1534 nr_scanned);
1437 } else { 1535 } else {
1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1536 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1439 &page_list, &nr_scanned, sc->order, 1537 &nr_scanned, sc->order, reclaim_mode, zone,
1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1538 sc->mem_cgroup, 0, file);
1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1442 zone, sc->mem_cgroup,
1443 0, file);
1444 /* 1539 /*
1445 * mem_cgroup_isolate_pages() keeps track of 1540 * mem_cgroup_isolate_pages() keeps track of
1446 * scanned pages on its own. 1541 * scanned pages on its own.
@@ -1542,19 +1637,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1542 struct page *page; 1637 struct page *page;
1543 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1638 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1544 unsigned long nr_rotated = 0; 1639 unsigned long nr_rotated = 0;
1640 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1545 1641
1546 lru_add_drain(); 1642 lru_add_drain();
1643
1644 if (!sc->may_unmap)
1645 reclaim_mode |= ISOLATE_UNMAPPED;
1646 if (!sc->may_writepage)
1647 reclaim_mode |= ISOLATE_CLEAN;
1648
1547 spin_lock_irq(&zone->lru_lock); 1649 spin_lock_irq(&zone->lru_lock);
1548 if (scanning_global_lru(sc)) { 1650 if (scanning_global_lru(sc)) {
1549 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1651 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1550 &pgscanned, sc->order, 1652 &pgscanned, sc->order,
1551 ISOLATE_ACTIVE, zone, 1653 reclaim_mode, zone,
1552 1, file); 1654 1, file);
1553 zone->pages_scanned += pgscanned; 1655 zone->pages_scanned += pgscanned;
1554 } else { 1656 } else {
1555 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1657 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1556 &pgscanned, sc->order, 1658 &pgscanned, sc->order,
1557 ISOLATE_ACTIVE, zone, 1659 reclaim_mode, zone,
1558 sc->mem_cgroup, 1, file); 1660 sc->mem_cgroup, 1, file);
1559 /* 1661 /*
1560 * mem_cgroup_isolate_pages() keeps track of 1662 * mem_cgroup_isolate_pages() keeps track of
@@ -1747,23 +1849,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 u64 fraction[2], denominator; 1849 u64 fraction[2], denominator;
1748 enum lru_list l; 1850 enum lru_list l;
1749 int noswap = 0; 1851 int noswap = 0;
1750 int force_scan = 0; 1852 bool force_scan = false;
1751 unsigned long nr_force_scan[2]; 1853 unsigned long nr_force_scan[2];
1752 1854
1753 1855 /* kswapd does zone balancing and needs to scan this zone */
1754 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1856 if (scanning_global_lru(sc) && current_is_kswapd() &&
1755 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1857 zone->all_unreclaimable)
1756 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1858 force_scan = true;
1757 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1859 /* memcg may have small limit and need to avoid priority drop */
1758 1860 if (!scanning_global_lru(sc))
1759 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1861 force_scan = true;
1760 /* kswapd does zone balancing and need to scan this zone */
1761 if (scanning_global_lru(sc) && current_is_kswapd())
1762 force_scan = 1;
1763 /* memcg may have small limit and need to avoid priority drop */
1764 if (!scanning_global_lru(sc))
1765 force_scan = 1;
1766 }
1767 1862
1768 /* If we have no swap space, do not bother scanning anon pages. */ 1863 /* If we have no swap space, do not bother scanning anon pages. */
1769 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1864 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1776,6 +1871,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1776 goto out; 1871 goto out;
1777 } 1872 }
1778 1873
1874 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1875 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1876 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1877 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1878
1779 if (scanning_global_lru(sc)) { 1879 if (scanning_global_lru(sc)) {
1780 free = zone_page_state(zone, NR_FREE_PAGES); 1880 free = zone_page_state(zone, NR_FREE_PAGES);
1781 /* If we have very few page cache pages, 1881 /* If we have very few page cache pages,
@@ -1912,8 +2012,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
1912 * inactive lists are large enough, continue reclaiming 2012 * inactive lists are large enough, continue reclaiming
1913 */ 2013 */
1914 pages_for_compaction = (2UL << sc->order); 2014 pages_for_compaction = (2UL << sc->order);
1915 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + 2015 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1916 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 2016 if (nr_swap_pages > 0)
2017 inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1917 if (sc->nr_reclaimed < pages_for_compaction && 2018 if (sc->nr_reclaimed < pages_for_compaction &&
1918 inactive_lru_pages > pages_for_compaction) 2019 inactive_lru_pages > pages_for_compaction)
1919 return true; 2020 return true;
@@ -1985,6 +2086,42 @@ restart:
1985 throttle_vm_writeout(sc->gfp_mask); 2086 throttle_vm_writeout(sc->gfp_mask);
1986} 2087}
1987 2088
2089/* Returns true if compaction should go ahead for a high-order request */
2090static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2091{
2092 unsigned long balance_gap, watermark;
2093 bool watermark_ok;
2094
2095 /* Do not consider compaction for orders reclaim is meant to satisfy */
2096 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
2097 return false;
2098
2099 /*
2100 * Compaction takes time to run and there are potentially other
2101 * callers using the pages just freed. Continue reclaiming until
2102 * there is a buffer of free pages available to give compaction
2103 * a reasonable chance of completing and allocating the page
2104 */
2105 balance_gap = min(low_wmark_pages(zone),
2106 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2107 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2108 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
2109 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
2110
2111 /*
2112 * If compaction is deferred, reclaim up to a point where
2113 * compaction will have a chance of success when re-enabled
2114 */
2115 if (compaction_deferred(zone))
2116 return watermark_ok;
2117
2118 /* If compaction is not ready to start, keep reclaiming */
2119 if (!compaction_suitable(zone, sc->order))
2120 return false;
2121
2122 return watermark_ok;
2123}
2124
1988/* 2125/*
1989 * This is the direct reclaim path, for page-allocating processes. We only 2126 * This is the direct reclaim path, for page-allocating processes. We only
1990 * try to reclaim pages from zones which will satisfy the caller's allocation 2127 * try to reclaim pages from zones which will satisfy the caller's allocation
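compaction_ready(), added above, keeps reclaiming until a buffer of high_wmark plus a balance gap plus 2<<order pages is free, then hands over to compaction. A standalone sketch of that arithmetic with made-up zone numbers; the divisor of 100 mirrors KSWAPD_ZONE_BALANCE_GAP_RATIO but is hard-coded here purely for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long present_pages = 262144;  /* ~1 GiB zone of 4 KiB pages */
        unsigned long low_wmark = 2048, high_wmark = 3072;
        unsigned int order = 9;                /* a THP-sized request */
        unsigned long ratio = 100;             /* illustrative gap divisor */

        unsigned long balance_gap = (present_pages + ratio - 1) / ratio;
        if (balance_gap > low_wmark)
                balance_gap = low_wmark;       /* min(low_wmark, gap) */

        unsigned long watermark = high_wmark + balance_gap + (2UL << order);

        printf("keep reclaiming until %lu pages are free, then compact\n",
               watermark);
        return 0;
}
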
@@ -2000,14 +2137,20 @@ restart:
2000 * 2137 *
2001 * If a zone is deemed to be full of pinned pages then just give it a light 2138 * If a zone is deemed to be full of pinned pages then just give it a light
2002 * scan then give up on it. 2139 * scan then give up on it.
2140 *
2141 * This function returns true if a zone is being reclaimed for a costly
2142 * high-order allocation and compaction is ready to begin. This indicates to
2143 * the caller that it should consider retrying the allocation instead of
2144 * further reclaim.
2003 */ 2145 */
2004static void shrink_zones(int priority, struct zonelist *zonelist, 2146static bool shrink_zones(int priority, struct zonelist *zonelist,
2005 struct scan_control *sc) 2147 struct scan_control *sc)
2006{ 2148{
2007 struct zoneref *z; 2149 struct zoneref *z;
2008 struct zone *zone; 2150 struct zone *zone;
2009 unsigned long nr_soft_reclaimed; 2151 unsigned long nr_soft_reclaimed;
2010 unsigned long nr_soft_scanned; 2152 unsigned long nr_soft_scanned;
2153 bool aborted_reclaim = false;
2011 2154
2012 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2155 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2013 gfp_zone(sc->gfp_mask), sc->nodemask) { 2156 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2022,6 +2165,21 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2022 continue; 2165 continue;
2023 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2166 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2024 continue; /* Let kswapd poll it */ 2167 continue; /* Let kswapd poll it */
2168 if (COMPACTION_BUILD) {
2169 /*
2170 * If we already have plenty of memory free for
2171 * compaction in this zone, don't free any more.
2172 * Even though compaction is invoked for any
2173 * non-zero order, only frequent costly order
2174 * reclamation is disruptive enough to become a
 2175 * noticeable problem, like transparent huge page
2176 * allocations.
2177 */
2178 if (compaction_ready(zone, sc)) {
2179 aborted_reclaim = true;
2180 continue;
2181 }
2182 }
2025 /* 2183 /*
2026 * This steals pages from memory cgroups over softlimit 2184 * This steals pages from memory cgroups over softlimit
2027 * and returns the number of reclaimed pages and 2185 * and returns the number of reclaimed pages and
@@ -2039,6 +2197,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2039 2197
2040 shrink_zone(priority, zone, sc); 2198 shrink_zone(priority, zone, sc);
2041 } 2199 }
2200
2201 return aborted_reclaim;
2042} 2202}
2043 2203
2044static bool zone_reclaimable(struct zone *zone) 2204static bool zone_reclaimable(struct zone *zone)
@@ -2092,8 +2252,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2092 struct zoneref *z; 2252 struct zoneref *z;
2093 struct zone *zone; 2253 struct zone *zone;
2094 unsigned long writeback_threshold; 2254 unsigned long writeback_threshold;
2255 bool aborted_reclaim;
2095 2256
2096 get_mems_allowed();
2097 delayacct_freepages_start(); 2257 delayacct_freepages_start();
2098 2258
2099 if (scanning_global_lru(sc)) 2259 if (scanning_global_lru(sc))
@@ -2103,7 +2263,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2103 sc->nr_scanned = 0; 2263 sc->nr_scanned = 0;
2104 if (!priority) 2264 if (!priority)
2105 disable_swap_token(sc->mem_cgroup); 2265 disable_swap_token(sc->mem_cgroup);
2106 shrink_zones(priority, zonelist, sc); 2266 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2267
2107 /* 2268 /*
2108 * Don't shrink slabs when reclaiming memory from 2269 * Don't shrink slabs when reclaiming memory from
2109 * over limit cgroups 2270 * over limit cgroups
@@ -2155,7 +2316,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2155 2316
2156out: 2317out:
2157 delayacct_freepages_end(); 2318 delayacct_freepages_end();
2158 put_mems_allowed();
2159 2319
2160 if (sc->nr_reclaimed) 2320 if (sc->nr_reclaimed)
2161 return sc->nr_reclaimed; 2321 return sc->nr_reclaimed;
@@ -2168,6 +2328,10 @@ out:
2168 if (oom_killer_disabled) 2328 if (oom_killer_disabled)
2169 return 0; 2329 return 0;
2170 2330
2331 /* Aborted reclaim to try compaction? don't OOM, then */
2332 if (aborted_reclaim)
2333 return 1;
2334
2171 /* top priority shrink_zones still had more to do? don't OOM, then */ 2335 /* top priority shrink_zones still had more to do? don't OOM, then */
2172 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2336 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
2173 return 1; 2337 return 1;
@@ -2459,6 +2623,9 @@ loop_again:
2459 high_wmark_pages(zone), 0, 0)) { 2623 high_wmark_pages(zone), 0, 0)) {
2460 end_zone = i; 2624 end_zone = i;
2461 break; 2625 break;
2626 } else {
2627 /* If balanced, clear the congested flag */
2628 zone_clear_flag(zone, ZONE_CONGESTED);
2462 } 2629 }
2463 } 2630 }
2464 if (i < 0) 2631 if (i < 0)
@@ -2695,7 +2862,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2695 * them before going back to sleep. 2862 * them before going back to sleep.
2696 */ 2863 */
2697 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2864 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2698 schedule(); 2865
2866 if (!kthread_should_stop())
2867 schedule();
2868
2699 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2869 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2700 } else { 2870 } else {
2701 if (remaining) 2871 if (remaining)
@@ -2722,7 +2892,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2722static int kswapd(void *p) 2892static int kswapd(void *p)
2723{ 2893{
2724 unsigned long order, new_order; 2894 unsigned long order, new_order;
2895 unsigned balanced_order;
2725 int classzone_idx, new_classzone_idx; 2896 int classzone_idx, new_classzone_idx;
2897 int balanced_classzone_idx;
2726 pg_data_t *pgdat = (pg_data_t*)p; 2898 pg_data_t *pgdat = (pg_data_t*)p;
2727 struct task_struct *tsk = current; 2899 struct task_struct *tsk = current;
2728 2900
@@ -2753,7 +2925,9 @@ static int kswapd(void *p)
2753 set_freezable(); 2925 set_freezable();
2754 2926
2755 order = new_order = 0; 2927 order = new_order = 0;
2928 balanced_order = 0;
2756 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2929 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2930 balanced_classzone_idx = classzone_idx;
2757 for ( ; ; ) { 2931 for ( ; ; ) {
2758 int ret; 2932 int ret;
2759 2933
@@ -2762,7 +2936,8 @@ static int kswapd(void *p)
2762 * new request of a similar or harder type will succeed soon 2936 * new request of a similar or harder type will succeed soon
2763 * so consider going to sleep on the basis we reclaimed at 2937 * so consider going to sleep on the basis we reclaimed at
2764 */ 2938 */
2765 if (classzone_idx >= new_classzone_idx && order == new_order) { 2939 if (balanced_classzone_idx >= new_classzone_idx &&
2940 balanced_order == new_order) {
2766 new_order = pgdat->kswapd_max_order; 2941 new_order = pgdat->kswapd_max_order;
2767 new_classzone_idx = pgdat->classzone_idx; 2942 new_classzone_idx = pgdat->classzone_idx;
2768 pgdat->kswapd_max_order = 0; 2943 pgdat->kswapd_max_order = 0;
@@ -2777,9 +2952,12 @@ static int kswapd(void *p)
2777 order = new_order; 2952 order = new_order;
2778 classzone_idx = new_classzone_idx; 2953 classzone_idx = new_classzone_idx;
2779 } else { 2954 } else {
2780 kswapd_try_to_sleep(pgdat, order, classzone_idx); 2955 kswapd_try_to_sleep(pgdat, balanced_order,
2956 balanced_classzone_idx);
2781 order = pgdat->kswapd_max_order; 2957 order = pgdat->kswapd_max_order;
2782 classzone_idx = pgdat->classzone_idx; 2958 classzone_idx = pgdat->classzone_idx;
2959 new_order = order;
2960 new_classzone_idx = classzone_idx;
2783 pgdat->kswapd_max_order = 0; 2961 pgdat->kswapd_max_order = 0;
2784 pgdat->classzone_idx = pgdat->nr_zones - 1; 2962 pgdat->classzone_idx = pgdat->nr_zones - 1;
2785 } 2963 }
@@ -2794,7 +2972,9 @@ static int kswapd(void *p)
2794 */ 2972 */
2795 if (!ret) { 2973 if (!ret) {
2796 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2974 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2797 order = balance_pgdat(pgdat, order, &classzone_idx); 2975 balanced_classzone_idx = classzone_idx;
2976 balanced_order = balance_pgdat(pgdat, order,
2977 &balanced_classzone_idx);
2798 } 2978 }
2799 } 2979 }
2800 return 0; 2980 return 0;
@@ -2952,14 +3132,17 @@ int kswapd_run(int nid)
2952} 3132}
2953 3133
2954/* 3134/*
2955 * Called by memory hotplug when all memory in a node is offlined. 3135 * Called by memory hotplug when all memory in a node is offlined. Caller must
3136 * hold lock_memory_hotplug().
2956 */ 3137 */
2957void kswapd_stop(int nid) 3138void kswapd_stop(int nid)
2958{ 3139{
2959 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3140 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
2960 3141
2961 if (kswapd) 3142 if (kswapd) {
2962 kthread_stop(kswapd); 3143 kthread_stop(kswapd);
3144 NODE_DATA(nid)->kswapd = NULL;
3145 }
2963} 3146}
2964 3147
2965static int __init kswapd_init(void) 3148static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b..6559013c5a1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
78 * 78 *
79 * vm_stat contains the global counters 79 * vm_stat contains the global counters
80 */ 80 */
81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
82EXPORT_SYMBOL(vm_stat); 82EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP