8f860591ff
2.6.16-rc3 uses hugetlb on-demand paging, but it doesn't support hugetlb mprotect. From: David Gibson <david@gibson.dropbear.id.au> Remove a test from the mprotect() path which checks that the mprotect()ed range on a hugepage VMA is hugepage aligned (yes, really, the sense of is_aligned_hugepage_range() is the opposite of what you'd guess :-/). In fact, we don't need this test. If the given addresses match the beginning/end of a hugepage VMA they must already be suitably aligned. If they don't, then mprotect_fixup() will attempt to split the VMA. The very first test in split_vma() will check for a badly aligned address on a hugepage VMA and return -EINVAL if necessary. From: "Chen, Kenneth W" <kenneth.w.chen@intel.com> On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE. The identity of the hugetlb pte is lost when changing page protection via mprotect. A page fault that occurs later will trigger a bug check in huge_pte_alloc(). The fix is to always make the new pte a hugetlb pte and also to clean up legacy code where _PAGE_PRESENT is forced on in the pre-faulting days. Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com> Cc: David Gibson <david@gibson.dropbear.id.au> Cc: "David S. Miller" <davem@davemloft.net> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mackerras <paulus@samba.org> Cc: William Lee Irwin III <wli@holomorphy.com> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com> Cc: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
280 lines
6.5 KiB
C
280 lines
6.5 KiB
C
/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@redhat.com>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/shm.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/security.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/personality.h>
|
|
#include <linux/syscalls.h>
|
|
|
|
#include <asm/uaccess.h>
|
|
#include <asm/pgtable.h>
|
|
#include <asm/cacheflush.h>
|
|
#include <asm/tlbflush.h>
|
|
|
|
/*
 * Rewrite the protection bits of every present pte under @pmd that maps
 * an address in [addr, end).
 *
 * The walk happens between pte_offset_map_lock() and pte_unmap_unlock().
 * Entries for which pte_present() is false are left untouched.
 *
 * The loop body always runs at least once and only terminates when addr
 * lands exactly on @end after stepping by PAGE_SIZE, so the caller must
 * pass a non-empty range whose length is a multiple of PAGE_SIZE.
 */
static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot)
{
	spinlock_t *ptl;
	pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);

	for (;;) {
		if (pte_present(*pte)) {
			/*
			 * Clear the entry first and only then install the
			 * re-protected value: this avoids an SMP race with
			 * hardware updating the dirty/clean bits while we
			 * are modifying the pte.
			 */
			pte_t old = ptep_get_and_clear(mm, addr, pte);
			pte_t updated = pte_modify(old, newprot);

			set_pte_at(mm, addr, pte, updated);
			lazy_mmu_prot_update(updated);
		}

		pte++;
		addr += PAGE_SIZE;
		if (addr == end)
			break;
	}

	/* pte is one past the last entry visited; unmap using that entry. */
	pte_unmap_unlock(pte - 1, ptl);
}
|
|
|
|
/*
 * Walk the pmd entries below @pud covering [addr, end) and pass each one
 * that pmd_none_or_clear_bad() does not reject down to change_pte_range().
 *
 * pmd_addr_end() yields the end of the sub-range handled in each step;
 * the walk stops once that boundary equals @end.
 */
static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
		unsigned long addr, unsigned long end, pgprot_t newprot)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long next;

	do {
		next = pmd_addr_end(addr, end);
		if (!pmd_none_or_clear_bad(pmd))
			change_pte_range(mm, pmd, addr, next, newprot);
	} while (pmd++, addr = next, addr != end);
}
|
|
|
|
/*
 * Walk the pud entries below @pgd covering [addr, end) and pass each one
 * that pud_none_or_clear_bad() does not reject down to change_pmd_range().
 *
 * pud_addr_end() yields the end of the sub-range handled in each step;
 * the walk stops once that boundary equals @end.
 */
static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
		unsigned long addr, unsigned long end, pgprot_t newprot)
{
	pud_t *pud = pud_offset(pgd, addr);
	unsigned long next;

	do {
		next = pud_addr_end(addr, end);
		if (!pud_none_or_clear_bad(pud))
			change_pmd_range(mm, pud, addr, next, newprot);
	} while (pud++, addr = next, addr != end);
}
|
|
|
|
/*
 * Apply @newprot to the page table entries of @vma's mm that map
 * [addr, end).
 *
 * An empty or inverted range (addr >= end) trips BUG_ON().  The cache is
 * flushed for the range before the page tables are walked, and the TLB is
 * flushed for the same range afterwards.
 */
static void change_protection(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot)
{
	struct mm_struct *mm = vma->vm_mm;
	/* addr is advanced by the walk; remember where the range began. */
	const unsigned long range_start = addr;
	unsigned long next;
	pgd_t *pgd;

	BUG_ON(addr >= end);

	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);

	do {
		next = pgd_addr_end(addr, end);
		if (!pgd_none_or_clear_bad(pgd))
			change_pud_range(mm, pgd, addr, next, newprot);
	} while (pgd++, addr = next, addr != end);

	flush_tlb_range(vma, range_start, end);
}
|
|
|
|
static int
|
|
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
|
|
unsigned long start, unsigned long end, unsigned long newflags)
|
|
{
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
unsigned long oldflags = vma->vm_flags;
|
|
long nrpages = (end - start) >> PAGE_SHIFT;
|
|
unsigned long charged = 0;
|
|
pgprot_t newprot;
|
|
pgoff_t pgoff;
|
|
int error;
|
|
|
|
if (newflags == oldflags) {
|
|
*pprev = vma;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* If we make a private mapping writable we increase our commit;
|
|
* but (without finer accounting) cannot reduce our commit if we
|
|
* make it unwritable again.
|
|
*
|
|
* FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
|
|
* a MAP_NORESERVE private mapping to writable will now reserve.
|
|
*/
|
|
if (newflags & VM_WRITE) {
|
|
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
|
|
charged = nrpages;
|
|
if (security_vm_enough_memory(charged))
|
|
return -ENOMEM;
|
|
newflags |= VM_ACCOUNT;
|
|
}
|
|
}
|
|
|
|
newprot = protection_map[newflags & 0xf];
|
|
|
|
/*
|
|
* First try to merge with previous and/or next vma.
|
|
*/
|
|
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
|
|
*pprev = vma_merge(mm, *pprev, start, end, newflags,
|
|
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
|
|
if (*pprev) {
|
|
vma = *pprev;
|
|
goto success;
|
|
}
|
|
|
|
*pprev = vma;
|
|
|
|
if (start != vma->vm_start) {
|
|
error = split_vma(mm, vma, start, 1);
|
|
if (error)
|
|
goto fail;
|
|
}
|
|
|
|
if (end != vma->vm_end) {
|
|
error = split_vma(mm, vma, end, 0);
|
|
if (error)
|
|
goto fail;
|
|
}
|
|
|
|
success:
|
|
/*
|
|
* vm_flags and vm_page_prot are protected by the mmap_sem
|
|
* held in write mode.
|
|
*/
|
|
vma->vm_flags = newflags;
|
|
vma->vm_page_prot = newprot;
|
|
if (is_vm_hugetlb_page(vma))
|
|
hugetlb_change_protection(vma, start, end, newprot);
|
|
else
|
|
change_protection(vma, start, end, newprot);
|
|
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
|
|
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
|
|
return 0;
|
|
|
|
fail:
|
|
vm_unacct_memory(charged);
|
|
return error;
|
|
}
|
|
|
|
asmlinkage long
|
|
sys_mprotect(unsigned long start, size_t len, unsigned long prot)
|
|
{
|
|
unsigned long vm_flags, nstart, end, tmp, reqprot;
|
|
struct vm_area_struct *vma, *prev;
|
|
int error = -EINVAL;
|
|
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
|
|
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
|
|
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
|
|
return -EINVAL;
|
|
|
|
if (start & ~PAGE_MASK)
|
|
return -EINVAL;
|
|
if (!len)
|
|
return 0;
|
|
len = PAGE_ALIGN(len);
|
|
end = start + len;
|
|
if (end <= start)
|
|
return -ENOMEM;
|
|
if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM))
|
|
return -EINVAL;
|
|
|
|
reqprot = prot;
|
|
/*
|
|
* Does the application expect PROT_READ to imply PROT_EXEC:
|
|
*/
|
|
if (unlikely((prot & PROT_READ) &&
|
|
(current->personality & READ_IMPLIES_EXEC)))
|
|
prot |= PROT_EXEC;
|
|
|
|
vm_flags = calc_vm_prot_bits(prot);
|
|
|
|
down_write(¤t->mm->mmap_sem);
|
|
|
|
vma = find_vma_prev(current->mm, start, &prev);
|
|
error = -ENOMEM;
|
|
if (!vma)
|
|
goto out;
|
|
if (unlikely(grows & PROT_GROWSDOWN)) {
|
|
if (vma->vm_start >= end)
|
|
goto out;
|
|
start = vma->vm_start;
|
|
error = -EINVAL;
|
|
if (!(vma->vm_flags & VM_GROWSDOWN))
|
|
goto out;
|
|
}
|
|
else {
|
|
if (vma->vm_start > start)
|
|
goto out;
|
|
if (unlikely(grows & PROT_GROWSUP)) {
|
|
end = vma->vm_end;
|
|
error = -EINVAL;
|
|
if (!(vma->vm_flags & VM_GROWSUP))
|
|
goto out;
|
|
}
|
|
}
|
|
if (start > vma->vm_start)
|
|
prev = vma;
|
|
|
|
for (nstart = start ; ; ) {
|
|
unsigned long newflags;
|
|
|
|
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
|
|
|
|
newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
|
|
|
|
/* newflags >> 4 shift VM_MAY% in place of VM_% */
|
|
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
|
|
error = -EACCES;
|
|
goto out;
|
|
}
|
|
|
|
error = security_file_mprotect(vma, reqprot, prot);
|
|
if (error)
|
|
goto out;
|
|
|
|
tmp = vma->vm_end;
|
|
if (tmp > end)
|
|
tmp = end;
|
|
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
|
|
if (error)
|
|
goto out;
|
|
nstart = tmp;
|
|
|
|
if (nstart < prev->vm_end)
|
|
nstart = prev->vm_end;
|
|
if (nstart >= end)
|
|
goto out;
|
|
|
|
vma = prev->vm_next;
|
|
if (!vma || vma->vm_start != nstart) {
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
}
|
|
out:
|
|
up_write(¤t->mm->mmap_sem);
|
|
return error;
|
|
}
|