linux/arch/m68k/mm/fault.c

219 lines
5.2 KiB
C
Raw Normal View History

/*
* linux/arch/m68k/mm/fault.c
*
* Copyright (C) 1995 Hamish Macdonald
*/
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/ptrace.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <asm/setup.h>
#include <asm/traps.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
extern void die_if_kernel(char *, struct pt_regs *, long);
extern const int frame_extra_sizes[]; /* in m68k/kernel/signal.c */
int send_fault_sig(struct pt_regs *regs)
{
siginfo_t siginfo = { 0, 0, 0, };
siginfo.si_signo = current->thread.signo;
siginfo.si_code = current->thread.code;
siginfo.si_addr = (void *)current->thread.faddr;
#ifdef DEBUG
printk("send_fault_sig: %p,%d,%d\n", siginfo.si_addr, siginfo.si_signo, siginfo.si_code);
#endif
if (user_mode(regs)) {
force_sig_info(siginfo.si_signo,
&siginfo, current);
} else {
const struct exception_table_entry *fixup;
/* Are we prepared to handle this kernel fault? */
if ((fixup = search_exception_tables(regs->pc))) {
struct pt_regs *tregs;
/* Create a new four word stack frame, discarding the old
one. */
regs->stkadj = frame_extra_sizes[regs->format];
tregs = (struct pt_regs *)((ulong)regs + regs->stkadj);
tregs->vector = regs->vector;
tregs->format = 0;
tregs->pc = fixup->fixup;
tregs->sr = regs->sr;
return -1;
}
//if (siginfo.si_signo == SIGBUS)
// force_sig_info(siginfo.si_signo,
// &siginfo, current);
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
*/
if ((unsigned long)siginfo.si_addr < PAGE_SIZE)
printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
else
printk(KERN_ALERT "Unable to handle kernel access");
printk(" at virtual address %p\n", siginfo.si_addr);
die_if_kernel("Oops", regs, 0 /*error_code*/);
do_exit(SIGKILL);
}
return 1;
}
/*
* This routine handles page faults. It determines the problem, and
* then passes it off to one of the appropriate routines.
*
* error_code:
* bit 0 == 0 means no page found, 1 means protection fault
* bit 1 == 0 means read, 1 means write
*
* If this routine detects a bad access, it returns 1, otherwise it
* returns 0.
*/
int do_page_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct * vma;
int write, fault;
#ifdef DEBUG
printk ("do page fault:\nregs->sr=%#x, regs->pc=%#lx, address=%#lx, %ld, %p\n",
regs->sr, regs->pc, address, error_code,
current->mm->pgd);
#endif
/*
* If we're in an interrupt or have no user
* context, we must not take the fault..
*/
if (in_atomic() || !mm)
goto no_context;
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
if (!vma)
goto map_err;
if (vma->vm_flags & VM_IO)
goto acc_err;
if (vma->vm_start <= address)
goto good_area;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto map_err;
if (user_mode(regs)) {
/* Accessing the stack below usp is always a bug. The
"+ 256" is there due to some instructions doing
pre-decrement on the stack and that doesn't show up
until later. */
if (address + 256 < rdusp())
goto map_err;
}
if (expand_stack(vma, address))
goto map_err;
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area:
#ifdef DEBUG
printk("do_page_fault: good_area\n");
#endif
write = 0;
switch (error_code & 3) {
default: /* 3: write, present */
/* fall through */
case 2: /* write, not present */
if (!(vma->vm_flags & VM_WRITE))
goto acc_err;
write++;
break;
case 1: /* read, present */
goto acc_err;
case 0: /* read, not present */
[PATCH] make PROT_WRITE imply PROT_READ Make PROT_WRITE imply PROT_READ for a number of architectures which don't support write only in hardware. While looking at this, I noticed that some architectures which do not support write only mappings already take the exact same approach. For example, in arch/alpha/mm/fault.c: " if (cause < 0) { if (!(vma->vm_flags & VM_EXEC)) goto bad_area; } else if (!cause) { /* Allow reads even for write-only mappings */ if (!(vma->vm_flags & (VM_READ | VM_WRITE))) goto bad_area; } else { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; } " Thus, this patch brings other architectures which do not support write only mappings in-line and consistent with the rest. I've verified the patch on ia64, x86_64 and x86. Additional discussion: Several architectures, including x86, can not support write-only mappings. The pte for x86 reserves a single bit for protection and its two states are read only or read/write. Thus, write only is not supported in h/w. Currently, if i 'mmap' a page write-only, the first read attempt on that page creates a page fault and will SEGV. That check is enforced in arch/blah/mm/fault.c. However, if i first write that page it will fault in and the pte will be set to read/write. Thus, any subsequent reads to the page will succeed. It is this inconsistency in behavior that this patch is attempting to address. Furthermore, if the page is swapped out, and then brought back the first read will also cause a SEGV. Thus, any arbitrary read on a page can potentially result in a SEGV. According to the SuSv3 spec, "if the application requests only PROT_WRITE, the implementation may also allow read access." Also as mentioned, some archtectures, such as alpha, shown above already take the approach that i am suggesting. The counter-argument to this raised by Arjan, is that the kernel is enforcing the write only mapping the best it can given the h/w limitations. This is true, however Alan Cox, and myself would argue that the inconsitency in behavior, that is applications can sometimes work/sometimes fails is highly undesireable. If you read through the thread, i think people, came to an agreement on the last patch i posted, as nobody has objected to it... Signed-off-by: Jason Baron <jbaron@redhat.com> Cc: Russell King <rmk@arm.linux.org.uk> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Hugh Dickins <hugh@veritas.com> Cc: Roman Zippel <zippel@linux-m68k.org> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Acked-by: Andi Kleen <ak@muc.de> Acked-by: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: Arjan van de Ven <arjan@linux.intel.com> Acked-by: Paul Mundt <lethal@linux-sh.org> Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp> Cc: Ian Molton <spyro@f2s.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-09-29 08:58:58 +00:00
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto acc_err;
}
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
survive:
fault = handle_mm_fault(mm, vma, address, write);
#ifdef DEBUG
printk("handle_mm_fault returns %d\n",fault);
#endif
mm: fault feedback #2 This patch completes Linus's wish that the fault return codes be made into bit flags, which I agree makes everything nicer. This requires requires all handle_mm_fault callers to be modified (possibly the modifications should go further and do things like fault accounting in handle_mm_fault -- however that would be for another patch). [akpm@linux-foundation.org: fix alpha build] [akpm@linux-foundation.org: fix s390 build] [akpm@linux-foundation.org: fix sparc build] [akpm@linux-foundation.org: fix sparc64 build] [akpm@linux-foundation.org: fix ia64 build] Signed-off-by: Nick Piggin <npiggin@suse.de> Cc: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Ian Molton <spyro@f2s.com> Cc: Bryan Wu <bryan.wu@analog.com> Cc: Mikael Starvik <starvik@axis.com> Cc: David Howells <dhowells@redhat.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Roman Zippel <zippel@linux-m68k.org> Cc: Greg Ungerer <gerg@uclinux.org> Cc: Matthew Wilcox <willy@debian.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp> Cc: Richard Curnow <rc@rc0.org.uk> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Jeff Dike <jdike@addtoit.com> Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp> Cc: Chris Zankel <chris@zankel.net> Acked-by: Kyle McMartin <kyle@mcmartin.ca> Acked-by: Haavard Skinnemoen <hskinnemoen@atmel.com> Acked-by: Ralf Baechle <ralf@linux-mips.org> Acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> [ Still apparently needs some ARM and PPC loving - Linus ] Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 08:47:05 +00:00
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
else if (fault & VM_FAULT_SIGBUS)
goto bus_err;
BUG();
}
mm: fault feedback #2 This patch completes Linus's wish that the fault return codes be made into bit flags, which I agree makes everything nicer. This requires requires all handle_mm_fault callers to be modified (possibly the modifications should go further and do things like fault accounting in handle_mm_fault -- however that would be for another patch). [akpm@linux-foundation.org: fix alpha build] [akpm@linux-foundation.org: fix s390 build] [akpm@linux-foundation.org: fix sparc build] [akpm@linux-foundation.org: fix sparc64 build] [akpm@linux-foundation.org: fix ia64 build] Signed-off-by: Nick Piggin <npiggin@suse.de> Cc: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Ian Molton <spyro@f2s.com> Cc: Bryan Wu <bryan.wu@analog.com> Cc: Mikael Starvik <starvik@axis.com> Cc: David Howells <dhowells@redhat.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Roman Zippel <zippel@linux-m68k.org> Cc: Greg Ungerer <gerg@uclinux.org> Cc: Matthew Wilcox <willy@debian.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp> Cc: Richard Curnow <rc@rc0.org.uk> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Jeff Dike <jdike@addtoit.com> Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp> Cc: Chris Zankel <chris@zankel.net> Acked-by: Kyle McMartin <kyle@mcmartin.ca> Acked-by: Haavard Skinnemoen <hskinnemoen@atmel.com> Acked-by: Ralf Baechle <ralf@linux-mips.org> Acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> [ Still apparently needs some ARM and PPC loving - Linus ] Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-19 08:47:05 +00:00
if (fault & VM_FAULT_MAJOR)
current->maj_flt++;
else
current->min_flt++;
up_read(&mm->mmap_sem);
return 0;
/*
* We ran out of memory, or some other thing happened to us that made
* us unable to handle the page fault gracefully.
*/
out_of_memory:
up_read(&mm->mmap_sem);
pid namespaces: define is_global_init() and is_container_init() is_init() is an ambiguous name for the pid==1 check. Split it into is_global_init() and is_container_init(). A cgroup init has it's tsk->pid == 1. A global init also has it's tsk->pid == 1 and it's active pid namespace is the init_pid_ns. But rather than check the active pid namespace, compare the task structure with 'init_pid_ns.child_reaper', which is initialized during boot to the /sbin/init process and never changes. Changelog: 2.6.22-rc4-mm2-pidns1: - Use 'init_pid_ns.child_reaper' to determine if a given task is the global init (/sbin/init) process. This would improve performance and remove dependence on the task_pid(). 2.6.21-mm2-pidns2: - [Sukadev Bhattiprolu] Changed is_container_init() calls in {powerpc, ppc,avr32}/traps.c for the _exception() call to is_global_init(). This way, we kill only the cgroup if the cgroup's init has a bug rather than force a kernel panic. [akpm@linux-foundation.org: fix comment] [sukadev@us.ibm.com: Use is_global_init() in arch/m32r/mm/fault.c] [bunk@stusta.de: kernel/pid.c: remove unused exports] [sukadev@us.ibm.com: Fix capability.c to work with threaded init] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com> Acked-by: Pavel Emelianov <xemul@openvz.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Herbert Poetzel <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 06:39:52 +00:00
if (is_global_init(current)) {
yield();
down_read(&mm->mmap_sem);
goto survive;
}
printk("VM: killing process %s\n", current->comm);
if (user_mode(regs))
During VM oom condition, kill all threads in process group We have had complaints where a threaded application is left in a bad state after one of it's threads is killed when we hit a VM: out_of_memory condition. Killing just one of the process threads can leave the application in a bad state, whereas killing the entire process group would allow for the application to restart, or be otherwise handled, and makes it very obvious that something has gone wrong. This change allows the entire process group to be taken down, rather than just the one thread. Signed-off-by: Will Schmidt <will_schmidt@vnet.ibm.com> Cc: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Ian Molton <spyro@f2s.com> Cc: Haavard Skinnemoen <hskinnemoen@atmel.com> Cc: Mikael Starvik <starvik@axis.com> Cc: David Howells <dhowells@redhat.com> Cc: Andi Kleen <ak@suse.de> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Roman Zippel <zippel@linux-m68k.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: Matthew Wilcox <willy@debian.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp> Cc: Richard Curnow <rc@rc0.org.uk> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Chris Zankel <chris@zankel.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-16 08:24:18 +00:00
do_group_exit(SIGKILL);
no_context:
current->thread.signo = SIGBUS;
current->thread.faddr = address;
return send_fault_sig(regs);
bus_err:
current->thread.signo = SIGBUS;
current->thread.code = BUS_ADRERR;
current->thread.faddr = address;
goto send_sig;
map_err:
current->thread.signo = SIGSEGV;
current->thread.code = SEGV_MAPERR;
current->thread.faddr = address;
goto send_sig;
acc_err:
current->thread.signo = SIGSEGV;
current->thread.code = SEGV_ACCERR;
current->thread.faddr = address;
send_sig:
up_read(&mm->mmap_sem);
return send_fault_sig(regs);
}