linux/arch/s390/mm/pgtable.c
Martin Schwidefsky 043d07084b [S390] Remove data execution protection
The noexec support on s390 does not rely on a bit in the page table
entry but utilizes the secondary space mode to distinguish between
memory accesses for instructions vs. data. The noexec code relies
on the assumption that the cpu will always use the secondary space
page table for data accesses while it is running in the secondary
space mode. Up to the z9-109 class machines this has been the case.
Unfortunately this is not true anymore with z10 and later machines.
The load-relative-long instructions lrl, lgrl and lgfrl access the
memory operand using the same addressing-space mode that has been
used to fetch the instruction.
This breaks the noexec mode for all user space binaries compiled
with march=z10 or later. The only option is to remove the current
noexec support.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2011-05-23 10:24:28 +02:00

427 lines
11 KiB
C

/*
* Copyright IBM Corp. 2007,2009
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
struct rcu_table_freelist {
struct rcu_head rcu;
struct mm_struct *mm;
unsigned int pgt_index;
unsigned int crst_index;
unsigned long *table[0];
};
#define RCU_FREELIST_SIZE \
((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
/ sizeof(unsigned long))
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);
static void __page_table_free(struct mm_struct *mm, unsigned long *table);
static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
{
struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
struct rcu_table_freelist *batch = *batchp;
if (batch)
return batch;
batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
if (batch) {
batch->mm = mm;
batch->pgt_index = 0;
batch->crst_index = RCU_FREELIST_SIZE;
*batchp = batch;
}
return batch;
}
static void rcu_table_freelist_callback(struct rcu_head *head)
{
struct rcu_table_freelist *batch =
container_of(head, struct rcu_table_freelist, rcu);
while (batch->pgt_index > 0)
__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
while (batch->crst_index < RCU_FREELIST_SIZE)
crst_table_free(batch->mm, batch->table[batch->crst_index++]);
free_page((unsigned long) batch);
}
void rcu_table_freelist_finish(void)
{
struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);
if (!batch)
return;
call_rcu(&batch->rcu, rcu_table_freelist_callback);
__get_cpu_var(rcu_table_freelist) = NULL;
}
static void smp_sync(void *arg)
{
}
#ifndef CONFIG_64BIT
#define ALLOC_ORDER 1
#define TABLES_PER_PAGE 4
#define FRAG_MASK 15UL
#define SECOND_HALVES 10UL
void clear_table_pgstes(unsigned long *table)
{
clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
memset(table + 256, 0, PAGE_SIZE/4);
clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
memset(table + 768, 0, PAGE_SIZE/4);
}
#else
#define ALLOC_ORDER 2
#define TABLES_PER_PAGE 2
#define FRAG_MASK 3UL
#define SECOND_HALVES 2UL
void clear_table_pgstes(unsigned long *table)
{
clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
memset(table + 256, 0, PAGE_SIZE/2);
}
#endif
unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);
static int __init parse_vmalloc(char *arg)
{
if (!arg)
return -EINVAL;
VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
return 0;
}
early_param("vmalloc", parse_vmalloc);
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
if (!page)
return NULL;
return (unsigned long *) page_to_phys(page);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
free_pages((unsigned long) table, ALLOC_ORDER);
}
void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
struct rcu_table_freelist *batch;
if (atomic_read(&mm->mm_users) < 2 &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
crst_table_free(mm, table);
return;
}
batch = rcu_table_freelist_get(mm);
if (!batch) {
smp_call_function(smp_sync, NULL, 1);
crst_table_free(mm, table);
return;
}
batch->table[--batch->crst_index] = table;
if (batch->pgt_index >= batch->crst_index)
rcu_table_freelist_finish();
}
#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
unsigned long *table, *pgd;
unsigned long entry;
BUG_ON(limit > (1UL << 53));
repeat:
table = crst_table_alloc(mm);
if (!table)
return -ENOMEM;
spin_lock_bh(&mm->page_table_lock);
if (mm->context.asce_limit < limit) {
pgd = (unsigned long *) mm->pgd;
if (mm->context.asce_limit <= (1UL << 31)) {
entry = _REGION3_ENTRY_EMPTY;
mm->context.asce_limit = 1UL << 42;
mm->context.asce_bits = _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS |
_ASCE_TYPE_REGION3;
} else {
entry = _REGION2_ENTRY_EMPTY;
mm->context.asce_limit = 1UL << 53;
mm->context.asce_bits = _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS |
_ASCE_TYPE_REGION2;
}
crst_table_init(table, entry);
pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
mm->pgd = (pgd_t *) table;
mm->task_size = mm->context.asce_limit;
table = NULL;
}
spin_unlock_bh(&mm->page_table_lock);
if (table)
crst_table_free(mm, table);
if (mm->context.asce_limit < limit)
goto repeat;
update_mm(mm, current);
return 0;
}
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
pgd_t *pgd;
if (mm->context.asce_limit <= limit)
return;
__tlb_flush_mm(mm);
while (mm->context.asce_limit > limit) {
pgd = mm->pgd;
switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
case _REGION_ENTRY_TYPE_R2:
mm->context.asce_limit = 1UL << 42;
mm->context.asce_bits = _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS |
_ASCE_TYPE_REGION3;
break;
case _REGION_ENTRY_TYPE_R3:
mm->context.asce_limit = 1UL << 31;
mm->context.asce_bits = _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS |
_ASCE_TYPE_SEGMENT;
break;
default:
BUG();
}
mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
mm->task_size = mm->context.asce_limit;
crst_table_free(mm, (unsigned long *) pgd);
}
update_mm(mm, current);
}
#endif
/*
* page table entry allocation/free routines.
*/
unsigned long *page_table_alloc(struct mm_struct *mm)
{
struct page *page;
unsigned long *table;
unsigned long bits;
bits = (mm->context.has_pgste) ? 3UL : 1UL;
spin_lock_bh(&mm->context.list_lock);
page = NULL;
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
page = NULL;
}
if (!page) {
spin_unlock_bh(&mm->context.list_lock);
page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
if (!page)
return NULL;
pgtable_page_ctor(page);
page->flags &= ~FRAG_MASK;
table = (unsigned long *) page_to_phys(page);
if (mm->context.has_pgste)
clear_table_pgstes(table);
else
clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
spin_lock_bh(&mm->context.list_lock);
list_add(&page->lru, &mm->context.pgtable_list);
}
table = (unsigned long *) page_to_phys(page);
while (page->flags & bits) {
table += 256;
bits <<= 1;
}
page->flags |= bits;
if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
list_move_tail(&page->lru, &mm->context.pgtable_list);
spin_unlock_bh(&mm->context.list_lock);
return table;
}
static void __page_table_free(struct mm_struct *mm, unsigned long *table)
{
struct page *page;
unsigned long bits;
bits = ((unsigned long) table) & 15;
table = (unsigned long *)(((unsigned long) table) ^ bits);
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
page->flags ^= bits;
if (!(page->flags & FRAG_MASK)) {
pgtable_page_dtor(page);
__free_page(page);
}
}
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
struct page *page;
unsigned long bits;
bits = (mm->context.has_pgste) ? 3UL : 1UL;
bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
spin_lock_bh(&mm->context.list_lock);
page->flags ^= bits;
if (page->flags & FRAG_MASK) {
/* Page now has some free pgtable fragments. */
if (!list_empty(&page->lru))
list_move(&page->lru, &mm->context.pgtable_list);
page = NULL;
} else
/* All fragments of the 4K page have been freed. */
list_del(&page->lru);
spin_unlock_bh(&mm->context.list_lock);
if (page) {
pgtable_page_dtor(page);
__free_page(page);
}
}
void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
struct rcu_table_freelist *batch;
struct page *page;
unsigned long bits;
if (atomic_read(&mm->mm_users) < 2 &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
page_table_free(mm, table);
return;
}
batch = rcu_table_freelist_get(mm);
if (!batch) {
smp_call_function(smp_sync, NULL, 1);
page_table_free(mm, table);
return;
}
bits = (mm->context.has_pgste) ? 3UL : 1UL;
bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
spin_lock_bh(&mm->context.list_lock);
/* Delayed freeing with rcu prevents reuse of pgtable fragments */
list_del_init(&page->lru);
spin_unlock_bh(&mm->context.list_lock);
table = (unsigned long *)(((unsigned long) table) | bits);
batch->table[batch->pgt_index++] = table;
if (batch->pgt_index >= batch->crst_index)
rcu_table_freelist_finish();
}
/*
* switch on pgstes for its userspace process (for kvm)
*/
int s390_enable_sie(void)
{
struct task_struct *tsk = current;
struct mm_struct *mm, *old_mm;
/* Do we have switched amode? If no, we cannot do sie */
if (user_mode == HOME_SPACE_MODE)
return -EINVAL;
/* Do we have pgstes? if yes, we are done */
if (tsk->mm->context.has_pgste)
return 0;
/* lets check if we are allowed to replace the mm */
task_lock(tsk);
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
!hlist_empty(&tsk->mm->ioctx_list) ||
#endif
tsk->mm != tsk->active_mm) {
task_unlock(tsk);
return -EINVAL;
}
task_unlock(tsk);
/* we copy the mm and let dup_mm create the page tables with_pgstes */
tsk->mm->context.alloc_pgste = 1;
mm = dup_mm(tsk);
tsk->mm->context.alloc_pgste = 0;
if (!mm)
return -ENOMEM;
/* Now lets check again if something happened */
task_lock(tsk);
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
!hlist_empty(&tsk->mm->ioctx_list) ||
#endif
tsk->mm != tsk->active_mm) {
mmput(mm);
task_unlock(tsk);
return -EINVAL;
}
/* ok, we are alone. No ptrace, no threads, etc. */
old_mm = tsk->mm;
tsk->mm = tsk->active_mm = mm;
preempt_disable();
update_mm(mm, tsk);
atomic_inc(&mm->context.attach_count);
atomic_dec(&old_mm->context.attach_count);
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
preempt_enable();
task_unlock(tsk);
mmput(old_mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
bool kernel_page_present(struct page *page)
{
unsigned long addr;
int cc;
addr = page_to_phys(page);
asm volatile(
" lra %1,0(%1)\n"
" ipm %0\n"
" srl %0,28"
: "=d" (cc), "+a" (addr) : : "cc");
return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */