linux/kernel/events/ring_buffer.c

/*
 * Performance events ring-buffer code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

#include "internal.h"

/*
 * Decide whether the region [offset, head) may be handed out to a writer,
 * given the consumer position @tail.
 *
 * When the buffer is not marked writable the consumer position is ignored
 * and space is always granted. Otherwise both ends of the request are
 * rebased against @tail (modulo the data size) and the request is refused
 * if the rebased end lands before the rebased start.
 */
static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
			      unsigned long offset, unsigned long head)
{
	unsigned long wrap_mask, start, end;

	if (!rb->writable)
		return true;

	wrap_mask = perf_data_size(rb) - 1;

	start = (offset - tail) & wrap_mask;
	end   = (head   - tail) & wrap_mask;

	/* A negative distance means the request wrapped past the tail. */
	return (int)(end - start) >= 0;
}

static void perf_output_wakeup(struct perf_output_handle *handle)
{
	atomic_set(&handle->rb->poll, POLL_IN);

	handle->event->pending_wakeup = 1;
	irq_work_queue(&handle->event->pending);
}

/*
 * A later event must not publish a head while an earlier event is still
 * writing its record. Full serialization is not possible because writers
 * can nest (NMIs), so nesting is counted instead: only the outer-most
 * writer publishes the head and generates a wakeup when it completes.
 *
 * Entering a write section disables preemption, bumps the nesting count
 * and snapshots rb->wakeup so that perf_output_put_handle() can tell
 * whether the wakeup position moved in the meantime.
 */
static void perf_output_get_handle(struct perf_output_handle *handle)
{
	preempt_disable();
	local_inc(&handle->rb->nest);
	handle->wakeup = local_read(&handle->rb->wakeup);
}

/*
 * Leave a write section opened by perf_output_get_handle().
 *
 * Drops one nesting level. If this was the outer-most writer, the current
 * rb->head is published to the user page, and a wakeup is raised when
 * rb->wakeup differs from the value snapshotted in handle->wakeup.
 * Nested writers only drop their level and leave publishing to the
 * outer-most one.
 *
 * Every path ends in preempt_enable(), pairing with the preempt_disable()
 * done in perf_output_get_handle().
 */
static void perf_output_put_handle(struct perf_output_handle *handle)
{
	struct ring_buffer *rb = handle->rb;
	unsigned long head;

again:
	/* Snapshot the head we intend to publish. */
	head = local_read(&rb->head);

	/*
	 * IRQ/NMI can happen here, which means we can miss a head update.
	 */

	/* Not the outer-most writer: leave publishing to whoever is. */
	if (!local_dec_and_test(&rb->nest))
		goto out;

	/*
	 * Publish the known good head. Rely on the full barrier implied
	 * by local_dec_and_test() to order the rb->head read and this
	 * write.
	 *
	 * NOTE(review): this comment used to name atomic_dec_and_test(),
	 * but the code uses the local_t variant -- confirm that the barrier
	 * guarantee relied upon here actually holds for local_t operations.
	 */
	rb->user_page->data_head = head;

	/*
	 * Now check if we missed an update, rely on the (compiler)
	 * barrier in local_dec_and_test() to re-read rb->head.
	 *
	 * If the head moved after the snapshot above, re-take the nesting
	 * level and start over so the newer head gets published too.
	 */
	if (unlikely(head != local_read(&rb->head))) {
		local_inc(&rb->nest);
		goto again;
	}

	/* The wakeup position moved while this handle was held. */
	if (handle->wakeup != local_read(&rb->wakeup))
		perf_output_wakeup(handle);

out:
	preempt_enable();
}

/*
 * Reserve @size bytes in @event's ring buffer and set up @handle for
 * writing the record.
 *
 * Returns 0 on success. In that case the RCU read lock taken here and the
 * write section opened by perf_output_get_handle() are both still held;
 * perf_output_end() releases them.
 *
 * Returns -ENOSPC on every failure path (no buffer attached, a buffer
 * without data pages, or not enough room); nothing is left held then.
 * When the reservation itself fails, rb->lost is incremented so a later
 * successful call can report the loss.
 */
int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size)
{
	struct ring_buffer *rb;
	unsigned long tail, offset, head;
	int have_lost;
	struct perf_sample_data sample_data;
	/* Layout of the PERF_RECORD_LOST record emitted below. */
	struct {
		struct perf_event_header header;
		u64			 id;
		u64			 lost;
	} lost_event;

	rcu_read_lock();
	/*
	 * For inherited events we send all the output towards the parent.
	 */
	if (event->parent)
		event = event->parent;

	rb = rcu_dereference(event->rb);
	if (!rb)
		goto out;

	handle->rb	= rb;
	handle->event	= event;

	/* A buffer without data pages cannot take any records. */
	if (!rb->nr_pages)
		goto out;

	/*
	 * If earlier records were dropped, grow the reservation so a
	 * PERF_RECORD_LOST record can be written in front of the caller's
	 * data. header.size is read back after perf_event_header__init_id()
	 * -- presumably that helper adds the room needed for the sample-id
	 * fields; verify against its definition.
	 */
	have_lost = local_read(&rb->lost);
	if (have_lost) {
		lost_event.header.size = sizeof(lost_event);
		perf_event_header__init_id(&lost_event.header, &sample_data,
					   event);
		size += lost_event.header.size;
	}

	perf_output_get_handle(handle);

	/*
	 * Claim [offset, head) by advancing rb->head with a cmpxchg; retry
	 * if another writer moved rb->head in between.
	 */
	do {
		/*
		 * Userspace could choose to issue a mb() before updating the
		 * tail pointer. So that all reads will be completed before the
		 * write is issued.
		 */
		tail = ACCESS_ONCE(rb->user_page->data_tail);
		smp_rmb();
		offset = head = local_read(&rb->head);
		head += size;
		if (unlikely(!perf_output_space(rb, tail, offset, head)))
			goto fail;
	} while (local_cmpxchg(&rb->head, offset, head) != offset);

	/*
	 * Advance the wakeup position by one watermark once the new head is
	 * more than a watermark past it; perf_output_put_handle() notices
	 * the change and raises the wakeup.
	 */
	if (head - local_read(&rb->wakeup) > rb->watermark)
		local_add(rb->watermark, &rb->wakeup);

	/*
	 * Translate the byte offset into a data page index plus an address
	 * inside that page. handle->size first holds the offset within the
	 * page and is then turned into the bytes remaining in that page.
	 */
	handle->page = offset >> (PAGE_SHIFT + page_order(rb));
	handle->page &= rb->nr_pages - 1;
	handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
	handle->addr = rb->data_pages[handle->page];
	handle->addr += handle->size;
	handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;

	if (have_lost) {
		lost_event.header.type = PERF_RECORD_LOST;
		lost_event.header.misc = 0;
		lost_event.id          = event->id;
		/* Fetch and reset the drop counter in one step. */
		lost_event.lost        = local_xchg(&rb->lost, 0);

		perf_output_put(handle, lost_event);
		perf_event__output_id_sample(event, handle, &sample_data);
	}

	return 0;

fail:
	/* No room: account the dropped record and close the write section. */
	local_inc(&rb->lost);
	perf_output_put_handle(handle);
out:
	rcu_read_unlock();

	return -ENOSPC;
}

/*
 * Out-of-line wrapper around __output_copy(): hands @len bytes starting
 * at @buf to the output described by @handle.
 */
void perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
{
	__output_copy(handle, buf, len);
}

/*
 * Finish a record started by a successful perf_output_begin(): close the
 * write section (which may publish the head and raise a wakeup) and drop
 * the RCU read lock that perf_output_begin() left held.
 */
void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}

/*
 * Common initialisation shared by both rb_alloc() variants.
 *
 * @watermark: requested wakeup watermark in bytes; it is capped at the
 *             data size. If the resulting watermark is still zero, half
 *             of the data size is used.
 * @flags:     RING_BUFFER_WRITABLE marks the buffer as writable.
 *
 * The buffer starts with a reference count of one and an empty event list.
 */
static void
ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
{
	long limit = perf_data_size(rb);

	if (watermark != 0)
		rb->watermark = (watermark < limit) ? watermark : limit;

	if (rb->watermark == 0)
		rb->watermark = limit / 2;

	if (flags & RING_BUFFER_WRITABLE)
		rb->writable = 1;

	atomic_set(&rb->refcount, 1);

	INIT_LIST_HEAD(&rb->event_list);
	spin_lock_init(&rb->event_lock);
}

#ifndef CONFIG_PERF_USE_VMALLOC

/*
 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
 */

/*
 * Map a page offset within the mmap()ed area to its struct page.
 *
 * Offset 0 is the user (control) page; offsets 1..nr_pages are the data
 * pages. Anything beyond that yields NULL.
 */
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	void *kaddr;

	if (pgoff > rb->nr_pages)
		return NULL;

	if (pgoff)
		kaddr = rb->data_pages[pgoff - 1];
	else
		kaddr = rb->user_page;

	return virt_to_page(kaddr);
}

/*
 * Allocate one zeroed order-0 page and return its kernel address, or NULL
 * on failure. A @cpu of -1 is passed through as the node id; any other
 * value is translated with cpu_to_node().
 */
static void *perf_mmap_alloc_page(int cpu)
{
	struct page *pg;
	int nid = -1;

	if (cpu != -1)
		nid = cpu_to_node(cpu);

	pg = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO, 0);

	return pg ? page_address(pg) : NULL;
}

/*
 * Allocate a ring buffer backed by individually allocated pages: one user
 * page plus @nr_pages data pages, with the data page pointer array placed
 * directly behind the struct.
 *
 * Returns the initialised buffer, or NULL if any allocation fails; in
 * that case everything allocated so far is released again.
 */
struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long bytes;
	int idx;

	bytes = sizeof(struct ring_buffer) + nr_pages * sizeof(void *);

	rb = kzalloc(bytes, GFP_KERNEL);
	if (!rb)
		return NULL;

	rb->user_page = perf_mmap_alloc_page(cpu);
	if (!rb->user_page)
		goto free_rb;

	for (idx = 0; idx < nr_pages; idx++) {
		rb->data_pages[idx] = perf_mmap_alloc_page(cpu);
		if (!rb->data_pages[idx])
			goto free_pages;
	}

	rb->nr_pages = nr_pages;

	ring_buffer_init(rb, watermark, flags);

	return rb;

free_pages:
	/* idx is the slot that failed; unwind the ones before it. */
	while (idx-- > 0)
		free_page((unsigned long)rb->data_pages[idx]);

	free_page((unsigned long)rb->user_page);

free_rb:
	kfree(rb);

	return NULL;
}

/*
 * Release one page of the buffer given its kernel address, clearing the
 * page's ->mapping before handing it back to the page allocator.
 */
static void perf_mmap_free_page(unsigned long addr)
{
	struct page *pg = virt_to_page((void *)addr);

	pg->mapping = NULL;
	__free_page(pg);
}

/*
 * Tear down a page-backed ring buffer: the user page, then every data
 * page, then the ring_buffer structure itself.
 */
void rb_free(struct ring_buffer *rb)
{
	int idx = 0;

	perf_mmap_free_page((unsigned long)rb->user_page);

	while (idx < rb->nr_pages) {
		perf_mmap_free_page((unsigned long)rb->data_pages[idx]);
		idx++;
	}

	kfree(rb);
}

#else

/*
 * Number of data pages in the vmalloc'ed area, not counting the user page.
 *
 * The vmalloc backend presents all data pages as a single high-order
 * "page", so the count is nr_pages (0 or 1) scaled by the page order. A
 * buffer that consists of the user page only has no data pages at all.
 */
static int rb_data_page_nr(struct ring_buffer *rb)
{
	return rb->nr_pages << page_order(rb);
}

/*
 * Map a page offset within the mmap()ed area to its struct page.
 *
 * Offset 0 is the user page; offsets 1..rb_data_page_nr() are the data
 * pages that follow it in the same vmalloc area. Anything beyond that
 * yields NULL.
 */
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
	if (pgoff > rb_data_page_nr(rb))
		return NULL;

	return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}

/* Clear ->mapping on the page backing the vmalloc address @addr. */
static void perf_mmap_unmark_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	page->mapping = NULL;
}

/*
 * Deferred teardown of a vmalloc-backed ring buffer, run from the work
 * item that rb_free() schedules: unmark every page of the area, release
 * the area and then the ring_buffer structure.
 */
static void rb_free_work(struct work_struct *work)
{
	struct ring_buffer *rb;
	void *base;
	int i, nr;

	rb = container_of(work, struct ring_buffer, work);
	nr = rb_data_page_nr(rb);

	base = rb->user_page;
	/* The '<=' accounts for the user page in front of the data pages. */
	for (i = 0; i <= nr; i++)
		perf_mmap_unmark_page(base + (i * PAGE_SIZE));

	vfree(base);
	kfree(rb);
}

/* Free the buffer asynchronously; the real work is done by rb_free_work(). */
void rb_free(struct ring_buffer *rb)
{
	schedule_work(&rb->work);
}

/*
 * Allocate a ring buffer backed by one vmalloc_user() area holding the
 * user page followed by @nr_pages data pages.
 *
 * The data pages are exposed as a single logical page of order
 * ilog2(@nr_pages). When @nr_pages is zero (user page only), nr_pages,
 * page_order and data_pages[0] are left zeroed from kzalloc(), so the
 * buffer reports no data pages and no data pointer aims past the
 * one-page allocation.
 *
 * @cpu is not used by this backend.
 *
 * Returns the initialised buffer, or NULL if an allocation fails.
 */
struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
{
	struct ring_buffer *rb;
	unsigned long size;
	void *all_buf;

	size = sizeof(struct ring_buffer);
	size += sizeof(void *);

	rb = kzalloc(size, GFP_KERNEL);
	if (!rb)
		goto fail;

	INIT_WORK(&rb->work, rb_free_work);

	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
	if (!all_buf)
		goto fail_all_buf;

	rb->user_page = all_buf;
	if (nr_pages) {
		rb->data_pages[0] = all_buf + PAGE_SIZE;
		rb->page_order = ilog2(nr_pages);
		rb->nr_pages = 1;
	}

	ring_buffer_init(rb, watermark, flags);

	return rb;

fail_all_buf:
	kfree(rb);

fail:
	return NULL;
}

#endif
perf: Split up buffer handling from core code And create the internal perf events header. v2: Keep an internal inlined perf_output_copy() Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Borislav Petkov <bp@alien8.de> Cc: Stephane Eranian <eranian@google.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/1305827704-5607-1-git-send-email-fweisbec@gmail.com [ v3: use clearer 'ring_buffer' and 'rb' naming ] Signed-off-by: Ingo Molnar <mingo@elte.hu> 2011-05-19 17:55:04 +00:00			`/*`
			`* Performance events ring-buffer code:`
			`*`
			`* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>`
			`* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar`
			`* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>`
misc latin1 to utf8 conversions Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Jiri Kosina <jkosina@suse.cz> 2011-12-29 22:09:01 +00:00			`* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>`
perf: Split up buffer handling from core code And create the internal perf events header. v2: Keep an internal inlined perf_output_copy() Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Borislav Petkov <bp@alien8.de> Cc: Stephane Eranian <eranian@google.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/1305827704-5607-1-git-send-email-fweisbec@gmail.com [ v3: use clearer 'ring_buffer' and 'rb' naming ] Signed-off-by: Ingo Molnar <mingo@elte.hu> 2011-05-19 17:55:04 +00:00			`*`
			`* For licensing details see kernel-base/COPYING`
			`*/`

			`#include <linux/perf_event.h>`
			`#include <linux/vmalloc.h>`
			`#include <linux/slab.h>`

			`#include "internal.h"`

			`static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,`
			`unsigned long offset, unsigned long head)`
			`{`
			`unsigned long mask;`

			`if (!rb->writable)`
			`return true;`

			`mask = perf_data_size(rb) - 1;`

			`offset = (offset - tail) & mask;`
			`head = (head - tail) & mask;`

			`if ((int)(head - offset) < 0)`
			`return false;`

			`return true;`
			`}`

			`static void perf_output_wakeup(struct perf_output_handle *handle)`
			`{`
			`atomic_set(&handle->rb->poll, POLL_IN);`

perf: Remove the nmi parameter from the swevent and overflow interface The nmi parameter indicated if we could do wakeups from the current context, if not, we would set some state and self-IPI and let the resulting interrupt do the wakeup. For the various event classes: - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from the PMI-tail (ARM etc.) - tracepoint: nmi=0; since tracepoint could be from NMI context. - software: nmi=[0,1]; some, like the schedule thing cannot perform wakeups, and hence need 0. As one can see, there is very little nmi=1 usage, and the down-side of not using it is that on some platforms some software events can have a jiffy delay in wakeup (when arch_irq_work_raise isn't implemented). The up-side however is that we can remove the nmi parameter and save a bunch of conditionals in fast paths. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Michael Cree <mcree@orcon.net.nz> Cc: Will Deacon <will.deacon@arm.com> Cc: Deng-Cheng Zhu <dengcheng.zhu@gmail.com> Cc: Anton Blanchard <anton@samba.org> Cc: Eric B Munson <emunson@mgebm.net> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: David S. Miller <davem@davemloft.net> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Jason Wessel <jason.wessel@windriver.com> Cc: Don Zickus <dzickus@redhat.com> Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org Signed-off-by: Ingo Molnar <mingo@elte.hu> 2011-06-27 12:41:57 +00:00			`handle->event->pending_wakeup = 1;`
			`irq_work_queue(&handle->event->pending);`
perf: Split up buffer handling from core code And create the internal perf events header. v2: Keep an internal inlined perf_output_copy() Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Borislav Petkov <bp@alien8.de> Cc: Stephane Eranian <eranian@google.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/1305827704-5607-1-git-send-email-fweisbec@gmail.com [ v3: use clearer 'ring_buffer' and 'rb' naming ] Signed-off-by: Ingo Molnar <mingo@elte.hu> 2011-05-19 17:55:04 +00:00			`}`

			`/*`
			`* We need to ensure a later event_id doesn't publish a head when a former`
			`* event isn't done writing. However since we need to deal with NMIs we`
			`* cannot fully serialize things.`
			`*`
			`* We only publish the head (and generate a wakeup) when the outer-most`
			`* event completes.`
			`*/`
			`static void perf_output_get_handle(struct perf_output_handle *handle)`
			`{`
			`struct ring_buffer *rb = handle->rb;`

			`preempt_disable();`
			`local_inc(&rb->nest);`
			`handle->wakeup = local_read(&rb->wakeup);`
			`}`

			`static void perf_output_put_handle(struct perf_output_handle *handle)`
			`{`
			`struct ring_buffer *rb = handle->rb;`
			`unsigned long head;`

			`again:`
			`head = local_read(&rb->head);`

			`/*`
			`* IRQ/NMI can happen here, which means we can miss a head update.`
			`*/`

			`if (!local_dec_and_test(&rb->nest))`
			`goto out;`

			`/*`
			`* Publish the known good head. Rely on the full barrier implied`
			`* by atomic_dec_and_test() order the rb->head read and this`
			`* write.`
			`*/`
			`rb->user_page->data_head = head;`

			`/*`
			`* Now check if we missed an update, rely on the (compiler)`
			`* barrier in atomic_dec_and_test() to re-read rb->head.`
			`*/`
			`if (unlikely(head != local_read(&rb->head))) {`
			`local_inc(&rb->nest);`
			`goto again;`
			`}`

			`if (handle->wakeup != local_read(&rb->wakeup))`
			`perf_output_wakeup(handle);`

			`out:`
			`preempt_enable();`
			`}`

			`int perf_output_begin(struct perf_output_handle *handle,`
perf: Remove the perf_output_begin(.sample) argument Since only samples call perf_output_sample() its much saner (and more correct) to put the sample logic in there than in the perf_output_begin()/perf_output_end() pair. Saves a useless argument, reduces conditionals and shrinks struct perf_output_handle, win! Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/n/tip-2crpvsx3cqu67q3zqjbnlpsc@git.kernel.org Signed-off-by: Ingo Molnar <mingo@elte.hu> 2011-06-27 14:47:16 +00:00			`struct perf_event *event, unsigned int size)`
perf: Split up buffer handling from core code And create the internal perf events header. v2: Keep an internal inlined perf_output_copy() Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Borislav Petkov <bp@alien8.de> Cc: Stephane Eranian <eranian@google.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/1305827704-5607-1-git-send-email-fweisbec@gmail.com [ v3: use clearer 'ring_buffer' and 'rb' naming ] Signed-off-by: Ingo Molnar <mingo@elte.hu> 2011-05-19 17:55:04 +00:00			`{`
			`struct ring_buffer *rb;`
			`unsigned long tail, offset, head;`
			`int have_lost;`
			`struct perf_sample_data sample_data;`
			`struct {`
			`struct perf_event_header header;`
			`u64 id;`
			`u64 lost;`
			`} lost_event;`

			`rcu_read_lock();`
			`/*`
			`* For inherited events we send all the output towards the parent.`
			`*/`
			`if (event->parent)`
			`event = event->parent;`

			`rb = rcu_dereference(event->rb);`
			`if (!rb)`
			`goto out;`

			`handle->rb = rb;`
			`handle->event = event;`

			`if (!rb->nr_pages)`
			`goto out;`

			`have_lost = local_read(&rb->lost);`
			`if (have_lost) {`
			`lost_event.header.size = sizeof(lost_event);`
			`perf_event_header__init_id(&lost_event.header, &sample_data,`
			`event);`
			`size += lost_event.header.size;`
			`}`

			`perf_output_get_handle(handle);`

			`do {`
			`/*`
			`* Userspace could choose to issue a mb() before updating the`
			`* tail pointer. So that all reads will be completed before the`
			`* write is issued.`
			`*/`
			`tail = ACCESS_ONCE(rb->user_page->data_tail);`
			`smp_rmb();`
			`offset = head = local_read(&rb->head);`
			`head += size;`
			`if (unlikely(!perf_output_space(rb, tail, offset, head)))`
			`goto fail;`
			`} while (local_cmpxchg(&rb->head, offset, head) != offset);`

			`if (head - local_read(&rb->wakeup) > rb->watermark)`
			`local_add(rb->watermark, &rb->wakeup);`

			`handle->page = offset >> (PAGE_SHIFT + page_order(rb));`
			`handle->page &= rb->nr_pages - 1;`
			`handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);`
			`handle->addr = rb->data_pages[handle->page];`
			`handle->addr += handle->size;`
			`handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;`

			`if (have_lost) {`
			`lost_event.header.type = PERF_RECORD_LOST;`
			`lost_event.header.misc = 0;`
			`lost_event.id = event->id;`
			`lost_event.lost = local_xchg(&rb->lost, 0);`

			`perf_output_put(handle, lost_event);`
			`perf_event__output_id_sample(event, handle, &sample_data);`
			`}`

			`return 0;`

			`fail:`
			`local_inc(&rb->lost);`
			`perf_output_put_handle(handle);`
			`out:`
			`rcu_read_unlock();`

			`return -ENOSPC;`
			`}`

			`void perf_output_copy(struct perf_output_handle *handle,`
			`const void *buf, unsigned int len)`
			`{`
			`__output_copy(handle, buf, len);`
			`}`

			`void perf_output_end(struct perf_output_handle *handle)`
			`{`
			`perf_output_put_handle(handle);`
			`rcu_read_unlock();`
			`}`

			`static void`
			`ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)`
			`{`
			`long max_size = perf_data_size(rb);`

			`if (watermark)`
			`rb->watermark = min(max_size, watermark);`

			`if (!rb->watermark)`
			`rb->watermark = max_size / 2;`

			`if (flags & RING_BUFFER_WRITABLE)`
			`rb->writable = 1;`

			`atomic_set(&rb->refcount, 1);`
perf: Fix loss of notification with multi-event When you do: $ perf record -e cycles,cycles,cycles noploop 10 You expect about 10,000 samples for each event, i.e., 10s at 1000samples/sec. However, this is not what's happening. You get much fewer samples, maybe 3700 samples/event: $ perf report -D \| tail -15 Aggregated stats: TOTAL events: 10998 MMAP events: 66 COMM events: 2 SAMPLE events: 10930 cycles stats: TOTAL events: 3644 SAMPLE events: 3644 cycles stats: TOTAL events: 3642 SAMPLE events: 3642 cycles stats: TOTAL events: 3644 SAMPLE events: 3644 On a Intel Nehalem or even AMD64, there are 4 counters capable of measuring cycles, so there is plenty of space to measure those events without multiplexing (even with the NMI watchdog active). And even with multiplexing, we'd expect roughly the same number of samples per event. The root of the problem was that when the event that caused the buffer to become full was not the first event passed on the cmdline, the user notification would get lost. The notification was sent to the file descriptor of the overflowed event but the perf tool was not polling on it. The perf tool aggregates all samples into a single buffer, i.e., the buffer of the first event. Consequently, it assumes notifications for any event will come via that descriptor. The seemingly straight forward solution of moving the waitq into the ringbuffer object doesn't work because of life-time issues. One could perf_event_set_output() on a fd that you're also blocking on and cause the old rb object to be freed while its waitq would still be referenced by the blocked thread -> FAIL. Therefore link all events to the ringbuffer and broadcast the wakeup from the ringbuffer object to all possible events that could be waited upon. This is rather ugly, and we're open to better solutions but it works for now. 
Reported-by: Stephane Eranian <eranian@google.com> Finished-by: Stephane Eranian <eranian@google.com> Reviewed-by: Stephane Eranian <eranian@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20111126014731.GA7030@quad Signed-off-by: Ingo Molnar <mingo@elte.hu> 2011-11-26 01:47:31 +00:00
			`INIT_LIST_HEAD(&rb->event_list);`
			`spin_lock_init(&rb->event_lock);`
perf: Split up buffer handling from core code And create the internal perf events header. v2: Keep an internal inlined perf_output_copy() Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Borislav Petkov <bp@alien8.de> Cc: Stephane Eranian <eranian@google.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/1305827704-5607-1-git-send-email-fweisbec@gmail.com [ v3: use clearer 'ring_buffer' and 'rb' naming ] Signed-off-by: Ingo Molnar <mingo@elte.hu> 2011-05-19 17:55:04 +00:00			`}`

			`#ifndef CONFIG_PERF_USE_VMALLOC`

			`/*`
			`* Back perf_mmap() with regular GFP_KERNEL-0 pages.`
			`*/`

			`struct page *`
			`perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)`
			`{`
			`if (pgoff > rb->nr_pages)`
			`return NULL;`

			`if (pgoff == 0)`
			`return virt_to_page(rb->user_page);`

			`return virt_to_page(rb->data_pages[pgoff - 1]);`
			`}`

			`static void *perf_mmap_alloc_page(int cpu)`
			`{`
			`struct page *page;`
			`int node;`

			`node = (cpu == -1) ? cpu : cpu_to_node(cpu);`
			`page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);`
			`if (!page)`
			`return NULL;`

			`return page_address(page);`
			`}`

			`struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)`
			`{`
			`struct ring_buffer *rb;`
			`unsigned long size;`
			`int i;`

			`size = sizeof(struct ring_buffer);`
			`size += nr_pages * sizeof(void *);`

			`rb = kzalloc(size, GFP_KERNEL);`
			`if (!rb)`
			`goto fail;`

			`rb->user_page = perf_mmap_alloc_page(cpu);`
			`if (!rb->user_page)`
			`goto fail_user_page;`

			`for (i = 0; i < nr_pages; i++) {`
			`rb->data_pages[i] = perf_mmap_alloc_page(cpu);`
			`if (!rb->data_pages[i])`
			`goto fail_data_pages;`
			`}`

			`rb->nr_pages = nr_pages;`

			`ring_buffer_init(rb, watermark, flags);`

			`return rb;`

			`fail_data_pages:`
			`for (i--; i >= 0; i--)`
			`free_page((unsigned long)rb->data_pages[i]);`

			`free_page((unsigned long)rb->user_page);`

			`fail_user_page:`
			`kfree(rb);`

			`fail:`
			`return NULL;`
			`}`

			`static void perf_mmap_free_page(unsigned long addr)`
			`{`
			`struct page *page = virt_to_page((void *)addr);`

			`page->mapping = NULL;`
			`__free_page(page);`
			`}`

			`void rb_free(struct ring_buffer *rb)`
			`{`
			`int i;`

			`perf_mmap_free_page((unsigned long)rb->user_page);`
			`for (i = 0; i < rb->nr_pages; i++)`
			`perf_mmap_free_page((unsigned long)rb->data_pages[i]);`
			`kfree(rb);`
			`}`

			`#else`

			`struct page *`
			`perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)`
			`{`
			`if (pgoff > (1UL << page_order(rb)))`
			`return NULL;`

			`return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);`
			`}`

			`static void perf_mmap_unmark_page(void *addr)`
			`{`
			`struct page *page = vmalloc_to_page(addr);`

			`page->mapping = NULL;`
			`}`

			`static void rb_free_work(struct work_struct *work)`
			`{`
			`struct ring_buffer *rb;`
			`void *base;`
			`int i, nr;`

			`rb = container_of(work, struct ring_buffer, work);`
			`nr = 1 << page_order(rb);`

			`base = rb->user_page;`
			`for (i = 0; i < nr + 1; i++)`
			`perf_mmap_unmark_page(base + (i * PAGE_SIZE));`

			`vfree(base);`
			`kfree(rb);`
			`}`

			`void rb_free(struct ring_buffer *rb)`
			`{`
			`schedule_work(&rb->work);`
			`}`

			`struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)`
			`{`
			`struct ring_buffer *rb;`
			`unsigned long size;`
			`void *all_buf;`

			`size = sizeof(struct ring_buffer);`
			`size += sizeof(void *);`

			`rb = kzalloc(size, GFP_KERNEL);`
			`if (!rb)`
			`goto fail;`

			`INIT_WORK(&rb->work, rb_free_work);`

			`all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);`
			`if (!all_buf)`
			`goto fail_all_buf;`

			`rb->user_page = all_buf;`
			`rb->data_pages[0] = all_buf + PAGE_SIZE;`
			`rb->page_order = ilog2(nr_pages);`
			`rb->nr_pages = 1;`

			`ring_buffer_init(rb, watermark, flags);`

			`return rb;`

			`fail_all_buf:`
			`kfree(rb);`

			`fail:`
			`return NULL;`
			`}`

			`#endif`