Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c | 96
1 file changed, 62 insertions, 34 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6c6f5ccfcda1..0127b788272f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1304,17 +1304,11 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	VM_BUG_ON_PAGE(!PageHead(page), page);
 	if (flags & FOLL_TOUCH) {
 		pmd_t _pmd;
-		/*
-		 * We should set the dirty bit only for FOLL_WRITE but
-		 * for now the dirty bit in the pmd is meaningless.
-		 * And if the dirty bit will become meaningful and
-		 * we'll only set it with FOLL_WRITE, an atomic
-		 * set_bit will be required on the pmd to set the
-		 * young bit, instead of the current set_pmd_at.
-		 */
-		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+		_pmd = pmd_mkyoung(*pmd);
+		if (flags & FOLL_WRITE)
+			_pmd = pmd_mkdirty(_pmd);
 		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
-					  pmd, _pmd, 1))
+					  pmd, _pmd, flags & FOLL_WRITE))
 			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
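
Note: this first hunk makes FOLL_TOUCH mark the pmd dirty only when the caller actually asked for write access: the old code dirtied the pmd on every touch, while the new code sets young unconditionally and dirty only under FOLL_WRITE, and the dirty argument to pmdp_set_access_flags() now follows the same condition. One plausible userspace way to exercise the read-only side of this path is a get_user_pages()-backed syscall such as process_vm_readv() on the caller's own address space, which touches the page without FOLL_WRITE. The sketch below is only an illustration under that assumption, not a test shipped with the patch:

#define _GNU_SOURCE
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <unistd.h>

#define HPAGE_SIZE (2UL << 20)	/* assumes 2M PMD-sized huge pages */

int main(void)
{
	char out[4096];
	char *buf = aligned_alloc(HPAGE_SIZE, HPAGE_SIZE);
	struct iovec local = { .iov_base = out, .iov_len = sizeof(out) };
	struct iovec remote = { .iov_base = buf, .iov_len = sizeof(out) };

	if (!buf)
		return 1;
	madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE);	/* request a THP-backed range */
	memset(buf, 0x5a, HPAGE_SIZE);			/* populate it; this write legitimately dirties the pmd */

	/*
	 * Reading our own mapping via process_vm_readv() goes through a
	 * read-only get_user_pages() path (FOLL_TOUCH set, FOLL_WRITE
	 * clear), so on a patched kernel the pmd is marked young here
	 * but no longer unconditionally dirty.
	 */
	process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
	return 0;
}

The resulting pmd state is not directly visible from userspace; the sketch only demonstrates reaching the touched code path, which could be confirmed with a kernel tracer.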
@@ -1572,35 +1566,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *ptl;
+	pmd_t entry;
+	bool preserve_write;
+
 	int ret = 0;
 
-	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
-		pmd_t entry;
-		bool preserve_write = prot_numa && pmd_write(*pmd);
-		ret = 1;
-
-		/*
-		 * Avoid trapping faults against the zero page. The read-only
-		 * data is likely to be read-cached on the local CPU and
-		 * local/remote hits to the zero page are not interesting.
-		 */
-		if (prot_numa && is_huge_zero_pmd(*pmd)) {
-			spin_unlock(ptl);
-			return ret;
-		}
-
-		if (!prot_numa || !pmd_protnone(*pmd)) {
-			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
-			entry = pmd_modify(entry, newprot);
-			if (preserve_write)
-				entry = pmd_mkwrite(entry);
-			ret = HPAGE_PMD_NR;
-			set_pmd_at(mm, addr, pmd, entry);
-			BUG_ON(!preserve_write && pmd_write(entry));
-		}
-		spin_unlock(ptl);
-	}
+	if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+		return 0;
+
+	preserve_write = prot_numa && pmd_write(*pmd);
+	ret = 1;
+
+	/*
+	 * Avoid trapping faults against the zero page. The read-only
+	 * data is likely to be read-cached on the local CPU and
+	 * local/remote hits to the zero page are not interesting.
+	 */
+	if (prot_numa && is_huge_zero_pmd(*pmd))
+		goto unlock;
+
+	if (prot_numa && pmd_protnone(*pmd))
+		goto unlock;
 
+	/*
+	 * In case prot_numa, we are under down_read(mmap_sem). It's critical
+	 * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+	 * which is also under down_read(mmap_sem):
+	 *
+	 *	CPU0:				CPU1:
+	 *				change_huge_pmd(prot_numa=1)
+	 *				 pmdp_huge_get_and_clear_notify()
+	 * madvise_dontneed()
+	 *  zap_pmd_range()
+	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
+	 *   // skip the pmd
+	 *				 set_pmd_at();
+	 *				 // pmd is re-established
+	 *
+	 * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
+	 * which may break userspace.
+	 *
+	 * pmdp_invalidate() is required to make sure we don't miss
+	 * dirty/young flags set by hardware.
+	 */
+	entry = *pmd;
+	pmdp_invalidate(vma, addr, pmd);
+
+	/*
+	 * Recover dirty/young flags. It relies on pmdp_invalidate to not
+	 * corrupt them.
+	 */
+	if (pmd_dirty(*pmd))
+		entry = pmd_mkdirty(entry);
+	if (pmd_young(*pmd))
+		entry = pmd_mkyoung(entry);
+
+	entry = pmd_modify(entry, newprot);
+	if (preserve_write)
+		entry = pmd_mkwrite(entry);
+	ret = HPAGE_PMD_NR;
+	set_pmd_at(mm, addr, pmd, entry);
+	BUG_ON(!preserve_write && pmd_write(entry));
+unlock:
+	spin_unlock(ptl);
 	return ret;
 }
 
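
Note: the comment added in this hunk documents the race the patch fixes. With prot_numa, change_huge_pmd() used to clear the pmd via pmdp_huge_get_and_clear_notify() before writing it back, and a concurrent MADV_DONTNEED, running under the same down_read(mmap_sem), could observe pmd_trans_huge(*pmd) == 0 without taking ptl, skip the pmd entirely in zap_pmd_range(), and leave stale data behind once the pmd was re-established. The sketch below is a hypothetical, timing-dependent illustration of the kind of two-threaded test that could expose this on a pre-fix kernel with CONFIG_NUMA_BALANCING active; it is not the patch's own reproducer and may never trigger on a given machine:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20)	/* assumes 2M PMD-sized huge pages */

static char *map;
static pthread_barrier_t wrote, checked;

static void *writer(void *arg)
{
	for (;;) {
		memset(map, 0x5a, HPAGE_SIZE);	/* dirty the huge pmd */
		pthread_barrier_wait(&wrote);
		pthread_barrier_wait(&checked);	/* stay quiet while main checks */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long i;

	map = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	madvise(map, HPAGE_SIZE, MADV_HUGEPAGE);
	pthread_barrier_init(&wrote, NULL, 2);
	pthread_barrier_init(&checked, NULL, 2);
	pthread_create(&t, NULL, writer, NULL);

	for (;;) {
		pthread_barrier_wait(&wrote);
		/*
		 * If NUMA balancing transiently cleared the pmd at this
		 * point on a pre-fix kernel, the DONTNEED below could skip
		 * it and the zero check would catch the stale 0x5a bytes.
		 */
		madvise(map, HPAGE_SIZE, MADV_DONTNEED);
		for (i = 0; i < HPAGE_SIZE; i += 4096)
			if (map[i])
				fprintf(stderr, "stale data survived MADV_DONTNEED\n");
		pthread_barrier_wait(&checked);
	}
}

On a fixed kernel the check should never fire: the pmd is now invalidated in place with pmdp_invalidate() instead of being cleared, so a racing zap_pmd_range() still recognizes it as a huge pmd, and the dirty/young bits the hardware may set in the meantime are folded back into the new entry before set_pmd_at().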