linux/tools/perf/util/thread.c

214 lines
4.2 KiB
C
Raw Normal View History

#include "../perf.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "thread.h"
#include "util.h"
#include "debug.h"
static struct rb_root threads;
static struct thread *last_match;
static struct thread *thread__new(pid_t pid)
{
perf sched: Add 'perf sched map' scheduling event map printout This prints a textual context-switching outline of workload captured via perf sched record. For example, on a 16 CPU box it outputs: N1 O1 . . . S1 . . . B0 . *I0 C1 . M1 . 23002.773423 secs N1 O1 . *Q0 . S1 . . . B0 . I0 C1 . M1 . 23002.773423 secs N1 O1 . Q0 . S1 . . . B0 . *R1 C1 . M1 . 23002.773485 secs N1 O1 . Q0 . S1 . *S0 . B0 . R1 C1 . M1 . 23002.773478 secs *L0 O1 . Q0 . S1 . S0 . B0 . R1 C1 . M1 . 23002.773523 secs L0 O1 . *. . S1 . S0 . B0 . R1 C1 . M1 . 23002.773531 secs L0 O1 . . . S1 . S0 . B0 . R1 C1 *T1 M1 . 23002.773547 secs T1 => irqbalance:2089 L0 O1 . . . S1 . S0 . *P0 . R1 C1 T1 M1 . 23002.773549 secs *N1 O1 . . . S1 . S0 . P0 . R1 C1 T1 M1 . 23002.773566 secs N1 O1 . . . *J0 . S0 . P0 . R1 C1 T1 M1 . 23002.773571 secs N1 O1 . . . J0 . S0 *B0 P0 . R1 C1 T1 M1 . 23002.773592 secs N1 O1 . . . J0 . *U0 B0 P0 . R1 C1 T1 M1 . 23002.773582 secs N1 O1 . . . *S1 . U0 B0 P0 . R1 C1 T1 M1 . 23002.773604 secs N1 O1 . . . S1 . U0 B0 *. . R1 C1 T1 M1 . 23002.773615 secs N1 O1 . . . S1 . U0 B0 . . *K0 C1 T1 M1 . 23002.773631 secs N1 O1 . *M0 . S1 . U0 B0 . . K0 C1 T1 M1 . 23002.773624 secs N1 O1 . M0 . S1 . U0 *. . . K0 C1 T1 M1 . 23002.773644 secs N1 O1 . M0 . S1 . U0 . . . *R1 C1 T1 M1 . 23002.773662 secs N1 O1 . M0 . S1 . *. . . . R1 C1 T1 M1 . 23002.773648 secs N1 O1 . *. . S1 . . . . . R1 C1 T1 M1 . 23002.773680 secs N1 O1 . . . *L0 . . . . . R1 C1 T1 M1 . 23002.773717 secs *N0 O1 . . . L0 . . . . . R1 C1 T1 M1 . 23002.773709 secs *N1 O1 . . . L0 . . . . . R1 C1 T1 M1 . 23002.773747 secs Columns stand for individual CPUs, from CPU0 to CPU15, and the two-letter shortcuts stand for tasks that are running on a CPU. '*' denotes the CPU that had the event. A dot signals an idle CPU. New tasks are assigned new two-letter shortcuts - when they occur first they are printed. In the above example 'T1' stood for irqbalance: T1 => irqbalance:2089 Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Galbraith <efault@gmx.de> Cc: Paul Mackerras <paulus@samba.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-16 15:40:48 +00:00
struct thread *self = calloc(1, sizeof(*self));
if (self != NULL) {
self->pid = pid;
self->comm = malloc(32);
if (self->comm)
snprintf(self->comm, 32, ":%d", self->pid);
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
self->maps = RB_ROOT;
perf tools: Rewrite and improve support for kernel modules Representing modules as struct map entries, backed by a DSO, etc, using /proc/modules to find where the module is loaded. DSOs now can have a short and long name, so that in verbose mode we can show exactly which .ko or vmlinux image was used. As kernel modules now are a DSO separate from the kernel, we can ask for just the hits for a particular set of kernel modules, just like we can do with shared libraries: [root@doppio linux-2.6-tip]# perf report -n --vmlinux /home/acme/git/build/tip-recvmmsg/vmlinux --modules --dsos \[drm\] | head -15 84.58% 13266 Xorg [k] drm_clflush_pages 4.02% 630 Xorg [k] trace_kmalloc.clone.0 3.95% 619 Xorg [k] drm_ioctl 2.07% 324 Xorg [k] drm_addbufs 1.68% 263 Xorg [k] drm_gem_close_ioctl 0.77% 120 Xorg [k] drm_setmaster_ioctl 0.70% 110 Xorg [k] drm_lastclose 0.68% 106 Xorg [k] drm_open 0.54% 85 Xorg [k] drm_mm_search_free [root@doppio linux-2.6-tip]# Specifying --dsos /lib/modules/2.6.31-tip/kernel/drivers/gpu/drm/drm.ko would have the same effect. Allowing specifying just 'drm.ko' is left for another patch. Processing kallsyms so that per kernel module struct map are instantiated was also left for another patch. That will allow removing the module name from each of its symbols. struct symbol was reduced by removing the ->module backpointer and moving it (well now the map) to struct symbol_entry in perf top, that is its only user right now. The total linecount went down by ~500 lines. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Avi Kivity <avi@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-02 06:29:58 +00:00
INIT_LIST_HEAD(&self->removed_maps);
}
return self;
}
int thread__set_comm(struct thread *self, const char *comm)
{
if (self->comm)
free(self->comm);
self->comm = strdup(comm);
return self->comm ? 0 : -ENOMEM;
}
static size_t thread__fprintf(struct thread *self, FILE *fp)
{
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
struct rb_node *nd;
perf tools: Rewrite and improve support for kernel modules Representing modules as struct map entries, backed by a DSO, etc, using /proc/modules to find where the module is loaded. DSOs now can have a short and long name, so that in verbose mode we can show exactly which .ko or vmlinux image was used. As kernel modules now are a DSO separate from the kernel, we can ask for just the hits for a particular set of kernel modules, just like we can do with shared libraries: [root@doppio linux-2.6-tip]# perf report -n --vmlinux /home/acme/git/build/tip-recvmmsg/vmlinux --modules --dsos \[drm\] | head -15 84.58% 13266 Xorg [k] drm_clflush_pages 4.02% 630 Xorg [k] trace_kmalloc.clone.0 3.95% 619 Xorg [k] drm_ioctl 2.07% 324 Xorg [k] drm_addbufs 1.68% 263 Xorg [k] drm_gem_close_ioctl 0.77% 120 Xorg [k] drm_setmaster_ioctl 0.70% 110 Xorg [k] drm_lastclose 0.68% 106 Xorg [k] drm_open 0.54% 85 Xorg [k] drm_mm_search_free [root@doppio linux-2.6-tip]# Specifying --dsos /lib/modules/2.6.31-tip/kernel/drivers/gpu/drm/drm.ko would have the same effect. Allowing specifying just 'drm.ko' is left for another patch. Processing kallsyms so that per kernel module struct map are instantiated was also left for another patch. That will allow removing the module name from each of its symbols. struct symbol was reduced by removing the ->module backpointer and moving it (well now the map) to struct symbol_entry in perf top, that is its only user right now. The total linecount went down by ~500 lines. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Avi Kivity <avi@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-02 06:29:58 +00:00
struct map *pos;
size_t ret = fprintf(fp, "Thread %d %s\nCurrent maps:\n",
self->pid, self->comm);
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
for (nd = rb_first(&self->maps); nd; nd = rb_next(nd)) {
perf tools: Rewrite and improve support for kernel modules Representing modules as struct map entries, backed by a DSO, etc, using /proc/modules to find where the module is loaded. DSOs now can have a short and long name, so that in verbose mode we can show exactly which .ko or vmlinux image was used. As kernel modules now are a DSO separate from the kernel, we can ask for just the hits for a particular set of kernel modules, just like we can do with shared libraries: [root@doppio linux-2.6-tip]# perf report -n --vmlinux /home/acme/git/build/tip-recvmmsg/vmlinux --modules --dsos \[drm\] | head -15 84.58% 13266 Xorg [k] drm_clflush_pages 4.02% 630 Xorg [k] trace_kmalloc.clone.0 3.95% 619 Xorg [k] drm_ioctl 2.07% 324 Xorg [k] drm_addbufs 1.68% 263 Xorg [k] drm_gem_close_ioctl 0.77% 120 Xorg [k] drm_setmaster_ioctl 0.70% 110 Xorg [k] drm_lastclose 0.68% 106 Xorg [k] drm_open 0.54% 85 Xorg [k] drm_mm_search_free [root@doppio linux-2.6-tip]# Specifying --dsos /lib/modules/2.6.31-tip/kernel/drivers/gpu/drm/drm.ko would have the same effect. Allowing specifying just 'drm.ko' is left for another patch. Processing kallsyms so that per kernel module struct map are instantiated was also left for another patch. That will allow removing the module name from each of its symbols. struct symbol was reduced by removing the ->module backpointer and moving it (well now the map) to struct symbol_entry in perf top, that is its only user right now. The total linecount went down by ~500 lines. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Avi Kivity <avi@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-02 06:29:58 +00:00
pos = rb_entry(nd, struct map, rb_node);
ret += map__fprintf(pos, fp);
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
}
perf tools: Rewrite and improve support for kernel modules Representing modules as struct map entries, backed by a DSO, etc, using /proc/modules to find where the module is loaded. DSOs now can have a short and long name, so that in verbose mode we can show exactly which .ko or vmlinux image was used. As kernel modules now are a DSO separate from the kernel, we can ask for just the hits for a particular set of kernel modules, just like we can do with shared libraries: [root@doppio linux-2.6-tip]# perf report -n --vmlinux /home/acme/git/build/tip-recvmmsg/vmlinux --modules --dsos \[drm\] | head -15 84.58% 13266 Xorg [k] drm_clflush_pages 4.02% 630 Xorg [k] trace_kmalloc.clone.0 3.95% 619 Xorg [k] drm_ioctl 2.07% 324 Xorg [k] drm_addbufs 1.68% 263 Xorg [k] drm_gem_close_ioctl 0.77% 120 Xorg [k] drm_setmaster_ioctl 0.70% 110 Xorg [k] drm_lastclose 0.68% 106 Xorg [k] drm_open 0.54% 85 Xorg [k] drm_mm_search_free [root@doppio linux-2.6-tip]# Specifying --dsos /lib/modules/2.6.31-tip/kernel/drivers/gpu/drm/drm.ko would have the same effect. Allowing specifying just 'drm.ko' is left for another patch. Processing kallsyms so that per kernel module struct map are instantiated was also left for another patch. That will allow removing the module name from each of its symbols. struct symbol was reduced by removing the ->module backpointer and moving it (well now the map) to struct symbol_entry in perf top, that is its only user right now. The total linecount went down by ~500 lines. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Avi Kivity <avi@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-02 06:29:58 +00:00
ret = fprintf(fp, "Removed maps:\n");
list_for_each_entry(pos, &self->removed_maps, node)
ret += map__fprintf(pos, fp);
return ret;
}
struct thread *threads__findnew(pid_t pid)
{
struct rb_node **p = &threads.rb_node;
struct rb_node *parent = NULL;
struct thread *th;
/*
* Font-end cache - PID lookups come in blocks,
* so most of the time we dont have to look up
* the full rbtree:
*/
if (last_match && last_match->pid == pid)
return last_match;
while (*p != NULL) {
parent = *p;
th = rb_entry(parent, struct thread, rb_node);
if (th->pid == pid) {
last_match = th;
return th;
}
if (pid < th->pid)
p = &(*p)->rb_left;
else
p = &(*p)->rb_right;
}
th = thread__new(pid);
if (th != NULL) {
rb_link_node(&th->rb_node, parent, p);
rb_insert_color(&th->rb_node, &threads);
last_match = th;
}
return th;
}
struct thread *register_idle_thread(void)
{
struct thread *thread = threads__findnew(0);
if (!thread || thread__set_comm(thread, "swapper")) {
fprintf(stderr, "problem inserting idle task.\n");
exit(-1);
}
return thread;
}
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
static void thread__remove_overlappings(struct thread *self, struct map *map)
{
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
struct rb_node *next = rb_first(&self->maps);
while (next) {
struct map *pos = rb_entry(next, struct map, rb_node);
next = rb_next(&pos->rb_node);
if (!map__overlap(pos, map))
continue;
if (verbose >= 2) {
printf("overlapping maps:\n");
map__fprintf(map, stdout);
map__fprintf(pos, stdout);
}
perf tools: Rewrite and improve support for kernel modules Representing modules as struct map entries, backed by a DSO, etc, using /proc/modules to find where the module is loaded. DSOs now can have a short and long name, so that in verbose mode we can show exactly which .ko or vmlinux image was used. As kernel modules now are a DSO separate from the kernel, we can ask for just the hits for a particular set of kernel modules, just like we can do with shared libraries: [root@doppio linux-2.6-tip]# perf report -n --vmlinux /home/acme/git/build/tip-recvmmsg/vmlinux --modules --dsos \[drm\] | head -15 84.58% 13266 Xorg [k] drm_clflush_pages 4.02% 630 Xorg [k] trace_kmalloc.clone.0 3.95% 619 Xorg [k] drm_ioctl 2.07% 324 Xorg [k] drm_addbufs 1.68% 263 Xorg [k] drm_gem_close_ioctl 0.77% 120 Xorg [k] drm_setmaster_ioctl 0.70% 110 Xorg [k] drm_lastclose 0.68% 106 Xorg [k] drm_open 0.54% 85 Xorg [k] drm_mm_search_free [root@doppio linux-2.6-tip]# Specifying --dsos /lib/modules/2.6.31-tip/kernel/drivers/gpu/drm/drm.ko would have the same effect. Allowing specifying just 'drm.ko' is left for another patch. Processing kallsyms so that per kernel module struct map are instantiated was also left for another patch. That will allow removing the module name from each of its symbols. struct symbol was reduced by removing the ->module backpointer and moving it (well now the map) to struct symbol_entry in perf top, that is its only user right now. The total linecount went down by ~500 lines. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Avi Kivity <avi@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-10-02 06:29:58 +00:00
rb_erase(&pos->rb_node, &self->maps);
/*
* We may have references to this map, for instance in some
* hist_entry instances, so just move them to a separate
* list.
*/
list_add_tail(&pos->node, &self->removed_maps);
}
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
}
void maps__insert(struct rb_root *maps, struct map *map)
{
struct rb_node **p = &maps->rb_node;
struct rb_node *parent = NULL;
const u64 ip = map->start;
struct map *m;
while (*p != NULL) {
parent = *p;
m = rb_entry(parent, struct map, rb_node);
if (ip < m->start)
p = &(*p)->rb_left;
else
p = &(*p)->rb_right;
}
rb_link_node(&map->rb_node, parent, p);
rb_insert_color(&map->rb_node, maps);
}
struct map *maps__find(struct rb_root *maps, u64 ip)
{
struct rb_node **p = &maps->rb_node;
struct rb_node *parent = NULL;
struct map *m;
while (*p != NULL) {
parent = *p;
m = rb_entry(parent, struct map, rb_node);
if (ip < m->start)
p = &(*p)->rb_left;
else if (ip > m->end)
p = &(*p)->rb_right;
else
return m;
}
return NULL;
}
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
void thread__insert_map(struct thread *self, struct map *map)
{
thread__remove_overlappings(self, map);
maps__insert(&self->maps, map);
}
int thread__fork(struct thread *self, struct thread *parent)
{
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
struct rb_node *nd;
if (self->comm)
free(self->comm);
self->comm = strdup(parent->comm);
if (!self->comm)
return -ENOMEM;
perf tools: Use rb_tree for maps Threads can have many and kernel modules will be represented as a tree of maps as well. Ah, and for a perf.data with 146607 samples: Before: [root@doppio ~]# perf stat -r 5 perf report > /dev/null Performance counter stats for 'perf report' (5 runs): 699.823680 task-clock-msecs # 0.991 CPUs ( +- 0.454% ) 74 context-switches # 0.000 M/sec ( +- 1.709% ) 2 CPU-migrations # 0.000 M/sec ( +- 17.008% ) 23114 page-faults # 0.033 M/sec ( +- 0.000% ) 1381257019 cycles # 1973.721 M/sec ( +- 0.290% ) 1456894438 instructions # 1.055 IPC ( +- 0.007% ) 18779818 cache-references # 26.835 M/sec ( +- 0.380% ) 641799 cache-misses # 0.917 M/sec ( +- 1.200% ) 0.705972729 seconds time elapsed ( +- 0.501% ) [root@doppio ~]# After Performance counter stats for 'perf report' (5 runs): 691.261451 task-clock-msecs # 0.993 CPUs ( +- 0.307% ) 72 context-switches # 0.000 M/sec ( +- 0.829% ) 6 CPU-migrations # 0.000 M/sec ( +- 18.409% ) 23127 page-faults # 0.033 M/sec ( +- 0.000% ) 1366395876 cycles # 1976.670 M/sec ( +- 0.153% ) 1443136016 instructions # 1.056 IPC ( +- 0.012% ) 17956402 cache-references # 25.976 M/sec ( +- 0.325% ) 661924 cache-misses # 0.958 M/sec ( +- 1.335% ) 0.696127275 seconds time elapsed ( +- 0.377% ) I.e. we see some speedup too. Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frédéric Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mike Galbraith <efault@gmx.de> Cc: "H. Peter Anvin" <hpa@zytor.com> LKML-Reference: <20090928174846.GA3361@ghostprotocols.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-28 17:48:46 +00:00
for (nd = rb_first(&parent->maps); nd; nd = rb_next(nd)) {
struct map *map = rb_entry(nd, struct map, rb_node);
struct map *new = map__clone(map);
if (!new)
return -ENOMEM;
thread__insert_map(self, new);
}
return 0;
}
size_t threads__fprintf(FILE *fp)
{
size_t ret = 0;
struct rb_node *nd;
for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
struct thread *pos = rb_entry(nd, struct thread, rb_node);
ret += thread__fprintf(pos, fp);
}
return ret;
}