From cca491388d969f0bd9f6602afd7d79c1315e9856 Mon Sep 17 00:00:00 2001 From: Johannes Stoelp Date: Sat, 27 Jan 2024 01:40:58 +0100 Subject: perf: add pmu event doc + callee vs caller graph example --- src/trace_profile/perf.md | 192 +++++++++++++++++++++++++++++++++++++--- src/trace_profile/src/Makefile | 8 ++ src/trace_profile/src/noploop.c | 10 +++ 3 files changed, 200 insertions(+), 10 deletions(-) create mode 100644 src/trace_profile/src/Makefile create mode 100644 src/trace_profile/src/noploop.c diff --git a/src/trace_profile/perf.md b/src/trace_profile/perf.md index 57118ed..351ae89 100644 --- a/src/trace_profile/perf.md +++ b/src/trace_profile/perf.md @@ -1,12 +1,18 @@ # perf(1) -```markdown -perf list show supported hw/sw events +``` +perf list show supported hw/sw events & metrics + -v ........ print longer event descriptions + --details . print information on the perf event names + and expressions used internally by events perf stat - -p .. show stats for running process - -I ... show stats periodically over interval - -e ... filter for events + -p ..... show stats for running process + -I ...... show stats periodically over interval + -e ...... select event(s) + -M ..... print metric(s), this adds the metric events + --all-user ... configure all selected events for user space + --all-kernel . configure all selected events for kernel space perf top -p .. show stats for running process @@ -22,15 +28,17 @@ perf record dwarf: use .cfi debug information lbr : use hardware last branch record facility -g ..................... short-hand for --call-graph fp - -e ................ filter for events + -e ................ select event(s) + --all-user ... configure all selected events for user space + --all-kernel . configure all selected events for kernel space perf report -n .................... annotate symbols with nr of samples --stdio ............... report to stdio, if not present tui mode - -g graph,0.5,caller ... 
show caller based call chains with value >0.5 + -g graph,0.5,callee ... show callee based call chains with value >0.5 ``` -```markdown +``` Useful : page-faults minor-faults @@ -39,18 +47,182 @@ Useful : task-clock ``` +## Select specific events + +Events to sample are specified with the `-e` option, either pass a comma +separated list or pass `-e` multiple times. + +Events are specified in the following form `name[:modifier]`. The list and +description of the `modifier` can be found in the +[`perf-list(1)`][man-perf-list] manpage under `EVENT MODIFIERS`. +```sh +# L1 i$ misses in user space +# L2 i$ stats in user/kernel space mixed +# Sample specified events. +perf stat -e L1-icache-load-misses:u \ + -e l2_rqsts.all_code_rd:uk,l2_rqsts.code_rd_hit:k,l2_rqsts.code_rd_miss:k \ + -- stress -c 2 +``` + +The `--all-user` and `--all-kernel` options append a `:u` and `:k` modifier to +all specified events. Therefore the following two command lines are equivalent. +```sh +# 1) +perf stat -e cycles:u,instructions:u -- ls + +# 2) +perf stat --all-user -e cycles,instructions -- ls +``` + +### Raw events + +In case perf does not provide a _symbolic_ name for an event, the event can be +specified in a _raw_ form as `r + UMask + EventCode`. + +The following is an example for the [L2_RQSTS.CODE_RD_HIT][l2i-req-ev] event +with `EventCode=0x24` and `UMask=0x10` on my laptop with a `sandybridge` uarch. +```sh +perf stat -e l2_rqsts.code_rd_hit -e r1024 -- ls +# Performance counter stats for 'ls': +# +# 33.942 l2_rqsts.code_rd_hit +# 33.942 r1024 +``` + +### Find raw performance counter events (intel) + +The [`intel/perfmon`][perfmon] repository provides performance event +databases for the different intel uarchs. + +The table in [`mapfile.csv`][perfmon-map] can be used to look up the +corresponding uarch, just grab the family model from the procfs. 
+```sh + cat /proc/cpuinfo | awk '/^vendor_id/ { V=$3 } + /^cpu family/ { F=$4 } + /^model\s*:/ { printf "%s-%d-%x\n",V,F,$3 }' +``` +> The table in [performance monitoring events][perfmon-kinds] describes how +> events are sorted into the different files. + +### Raw events for perf's own symbolic names + +Perf also defines its own _symbolic_ names for some events. An example is the +`cache-references` event. The [`perf_event_open(2)`][man-perf-ev-open] manpage +gives the following description. +```man +perf_event_open(2) + +PERF_COUNT_HW_CACHE_REFERENCES + Cache accesses. Usually this indicates Last Level Cache accesses but this + may vary depending on your CPU. This may include prefetches and coherency + messages; again this depends on the design of your CPU. +``` + +The `sysfs` can be consulted to get the concrete performance counter on the +given system. +```sh +cat /sys/devices/cpu/events/cache-references +# event=0x2e,umask=0x4f +``` + ## [`Flamegraph`](https://github.com/brendangregg/FlameGraph) ### Flamegraph with single event trace -```markdown +``` perf record -g -e cpu-cycles -p perf script | FlameGraph/stackcollapse-perf.pl | FlameGraph/flamegraph.pl > cycles-flamegraph.svg ``` ### Flamegraph with multiple event traces -```markdown +```sh perf record -g -e cpu-cycles,page-faults -p perf script --per-event-dump # fold & generate as above ``` +## Examples +### Determine theoretical max instructions per cycle + +```c +{{#include src/noploop.c }} +``` + +```sh +perf stat -e cycles,instructions ./noploop +# Performance counter stats for './noploop': +# +# 1.031.075.940 cycles +# 4.103.534.341 instructions # 3,98 insn per cycle +``` + +### Caller vs callee callstacks + +The following gives an example for a scenario where we have the following calls +- `main -> do_foo() -> do_work()` +- `main -> do_bar() -> do_work()` + +```sh +perf report --stdio -g graph,caller + +# Children Self Command Shared Object Symbols +# ........ ........ ....... .................... 
................. +# +# 49.71% 49.66% bench bench [.] do_work +# | +# --49.66%--_start <- callstack bottom +# __libc_start_main +# 0x7ff366c62ccf +# main +# | +# |--25.13%--do_bar +# | do_work <- callstack top +# | +# --24.53%--do_foo +# do_work + +perf report --stdio -g graph,callee + +# Children Self Command Shared Object Symbols +# ........ ........ ....... .................... ................. +# +# 49.71% 49.66% bench bench [.] do_work +# | +# ---do_work <- callstack top +# | +# |--25.15%--do_bar +# | main +# | 0x7ff366c62ccf +# | __libc_start_main +# | _start <- callstack bottom +# | +# --24.55%--do_foo +# main +# 0x7ff366c62ccf +# __libc_start_main +# _start <- callstack bottom +``` + +## References +- [intel/perfmon][perfmon] - intel PMU event database per uarch +- [intel/perfmon-html][perfmon-html] - a html rendered version of the PMU events with search +- [intel/perfmon/mapfile.csv][perfmon-map] - processor family to uarch mapping +- [linux/perf/events][perf-pmu-ev] - x86 PMU events known to perf tools +- [linux/arch/events][x86-core-ev] - x86 PMU events linux kernel +- [wikichip] - computer architecture wiki +- [perf-list(1)][man-perf-list] - manpage +- [perf_event_open(2)][man-perf-ev-open] - manpage + + +[perfmon-html]: https://perfmon-events.intel.com/ +[perfmon]: https://github.com/intel/perfmon +[perfmon-map]: https://github.com/intel/perfmon/blob/main/mapfile.csv +[perfmon-kinds]: https://github.com/intel/perfmon/tree/main#performance-monitoring-events + +[perf-pmu-ev]: https://github.com/torvalds/linux/tree/master/tools/perf/pmu-events/arch/x86 +[x86-core-ev]: https://github.com/torvalds/linux/blob/master/arch/x86/events/intel/core.c +[l2i-req-ev]: https://github.com/intel/perfmon/blob/09c155f72e1b8f14b09aea346a35467a03a7d62b/SNB/events/sandybridge_core.json#L808 + +[man-perf-ev-open]: https://man7.org/linux/man-pages/man2/perf_event_open.2.html +[man-perf-list]: https://man7.org/linux/man-pages/man1/perf-list.1.html + +[wikichip]: 
https://en.wikichip.org/wiki/WikiChip diff --git a/src/trace_profile/src/Makefile b/src/trace_profile/src/Makefile new file mode 100644 index 0000000..7e887b8 --- /dev/null +++ b/src/trace_profile/src/Makefile @@ -0,0 +1,8 @@ +ipc: noploop + perf stat -e cycles,instructions -- ./noploop + +noploop: + $(CC) -o noploop noploop.c + +clean: + $(RM) noploop diff --git a/src/trace_profile/src/noploop.c b/src/trace_profile/src/noploop.c new file mode 100644 index 0000000..76e664f --- /dev/null +++ b/src/trace_profile/src/noploop.c @@ -0,0 +1,10 @@ +#define NOP4 "nop\nnop\nnop\nnop\n" +#define NOP32 NOP4 NOP4 NOP4 NOP4 NOP4 NOP4 NOP4 NOP4 +#define NOP256 NOP32 NOP32 NOP32 NOP32 NOP32 NOP32 NOP32 NOP32 +#define NOP2048 NOP256 NOP256 NOP256 NOP256 NOP256 NOP256 NOP256 NOP256 + +int main() { + for (unsigned i = 0; i < 2000000; ++i) { + asm volatile(NOP2048); + } +} -- cgit v1.2.3