about summary refs log tree commit diff stats
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  96
1 file changed, 62 insertions(+), 34 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6c6f5ccfcda1..0127b788272f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1304,17 +1304,11 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 	if (flags & FOLL_TOUCH) {
 		pmd_t _pmd;
-		/*
-		 * We should set the dirty bit only for FOLL_WRITE but
-		 * for now the dirty bit in the pmd is meaningless.
-		 * And if the dirty bit will become meaningful and
-		 * we'll only set it with FOLL_WRITE, an atomic
-		 * set_bit will be required on the pmd to set the
-		 * young bit, instead of the current set_pmd_at.
-		 */
-		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+		_pmd = pmd_mkyoung(*pmd);
+		if (flags & FOLL_WRITE)
+			_pmd = pmd_mkdirty(_pmd);
 		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-					  pmd, _pmd, 1))
+					  pmd, _pmd, flags & FOLL_WRITE))
			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
@@ -1572,35 +1566,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *ptl;
+	pmd_t entry;
+	bool preserve_write;
+
 	int ret = 0;
 
-	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-		pmd_t entry;
-		bool preserve_write = prot_numa && pmd_write(*pmd);
-		ret = 1;
+	if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+		return 0;
 
-		/*
-		 * Avoid trapping faults against the zero page. The read-only
-		 * data is likely to be read-cached on the local CPU and
-		 * local/remote hits to the zero page are not interesting.
-		 */
-		if (prot_numa && is_huge_zero_pmd(*pmd)) {
-			spin_unlock(ptl);
-			return ret;
-		}
+	preserve_write = prot_numa && pmd_write(*pmd);
+	ret = 1;
 
-		if (!prot_numa || !pmd_protnone(*pmd)) {
-			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
-			entry = pmd_modify(entry, newprot);
-			if (preserve_write)
-				entry = pmd_mkwrite(entry);
-			ret = HPAGE_PMD_NR;
-			set_pmd_at(mm, addr, pmd, entry);
-			BUG_ON(!preserve_write && pmd_write(entry));
-		}
-		spin_unlock(ptl);
-	}
+	/*
+	 * Avoid trapping faults against the zero page. The read-only
+	 * data is likely to be read-cached on the local CPU and
+	 * local/remote hits to the zero page are not interesting.
+	 */
+	if (prot_numa && is_huge_zero_pmd(*pmd))
+		goto unlock;
+
+	if (prot_numa && pmd_protnone(*pmd))
+		goto unlock;
 
+	/*
+	 * In case prot_numa, we are under down_read(mmap_sem). It's critical
+	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+	 * which is also under down_read(mmap_sem):
+	 *
+	 *	CPU0:				CPU1:
+	 *				change_huge_pmd(prot_numa=1)
+	 *				 pmdp_huge_get_and_clear_notify()
+	 * madvise_dontneed()
+	 *  zap_pmd_range()
+	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
+	 *   // skip the pmd
+	 *				 set_pmd_at();
+	 *				 // pmd is re-established
+	 *
+	 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
+	 * which may break userspace.
+	 *
+	 * pmdp_invalidate() is required to make sure we don't miss
+	 * dirty/young flags set by hardware.
+	 */
+	entry = *pmd;
+	pmdp_invalidate(vma, addr, pmd);
+
+	/*
+	 * Recover dirty/young flags. It relies on pmdp_invalidate to not
+	 * corrupt them.
+	 */
+	if (pmd_dirty(*pmd))
+		entry = pmd_mkdirty(entry);
+	if (pmd_young(*pmd))
+		entry = pmd_mkyoung(entry);
+
+	entry = pmd_modify(entry, newprot);
+	if (preserve_write)
+		entry = pmd_mkwrite(entry);
+	ret = HPAGE_PMD_NR;
+	set_pmd_at(mm, addr, pmd, entry);
+	BUG_ON(!preserve_write && pmd_write(entry));
+unlock:
+	spin_unlock(ptl);
 	return ret;
 }
 