Debugging mechanism
kprobes
- KProbes is a debugging mechanism for the Linux kernel which can also be used for monitoring events inside a production system.
- Good article at https://lwn.net/Articles/132196/ - An introduction to KProbes
-
You can monitor a specific function or operation in a function with kprobe
- Before registering kprobe handler, you need to fill ‘struct kprobe’ with the functions wanted to be called
struct kprobe {
struct hlist_node hlist;
/* list of kprobes for multi-handler support */
struct list_head list;
/*count the number of times this probe was temporarily disarmed */
unsigned long nmissed;
/* location of the probe point */
kprobe_opcode_t *addr;
/* Allow user to indicate symbol name of the probe point */
const char *symbol_name;
/* Offset into the symbol */
unsigned int offset;
/* Called before addr is executed. */
kprobe_pre_handler_t pre_handler;
/* Called after addr is executed, unless... */
kprobe_post_handler_t post_handler;
/*
* ... called if executing addr causes a fault (eg. page fault).
* Return 1 if it handled fault, otherwise kernel will see it.
*/
kprobe_fault_handler_t fault_handler;
/*
* ... called if breakpoint trap occurs in probe handler.
* Return 1 if it handled break, otherwise kernel will see it.
*/
kprobe_break_handler_t break_handler;
/* Saved opcode (which has been replaced with breakpoint) */
kprobe_opcode_t opcode;
/* copy of the original instruction */
struct arch_specific_insn ainsn;
/*
* Indicates various status flags.
* Protected by kprobe_mutex after this kprobe is registered.
*/
u32 flags;
};
- There are four handler functions based on when you want it to be called
- pre_handler : before call the target function
- post_handler : after call the target function
- fault_handler : if the target function is causing of fault such as page fault
- break_handler : if the target has breakpoint and trap occured
- Target function can be addressed with one of the below two method
- kprobe_opcode_t *addr : Use the address of the target function
- const char *symbol_name : Using the symbol name. Only works if the function is exported
- Registering/Unregistering can be done with the below functions
int register_kprobe(struct kprobe *p);
void unregister_kprobe(struct kprobe *p);
- Example
- Call handlers before and after a specific function
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/version.h>
static unsigned long address = 0x0;
static char *name = 0;
module_param(address, ulong, S_IRUGO);
module_param(name, charp, S_IRUGO);
static struct kprobe kp;
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
printk("pre_handler called before p->addr=0x%p\n", p->addr);
dump_stack();
return 0;
}
static void handler_post(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
printk("post_handler called after p->addr=0x%p\n", p->addr);
dump_stack();
}
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
printk("fault_handler called during calling p->addr=0x%p\n", p->addr);
dump_stack();
return 0;
}
static int __init my_init(void)
{
if (address == 0 && (name == 0 || strlen(name) == 0)) {
printk
("Target function is not specified. Please use address or name to monitoring it\n");
return -1;
}
kp.pre_handler = handler_pre;
kp.post_handler = handler_post;
kp.fault_handler = handler_fault;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)
if (name != NULL)
address = kallsyms_lookup_name(name);
#endif
if (!address) {
printk("Can't find the target address for %s\n", name);
return -1;
}
kp.addr = (kprobe_opcode_t *) address;
if (register_kprobe(&kp)) {
printk("Can't register kprobe on %s\n", name);
return -1;
}
printk("Hello, kprobe is registered\n");
return 0;
}
static void __exit my_exit(void)
{
unregister_kprobe(&kp);
printk("Bye bye\n");
}
module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
- Running
$ make
$ insmod kprobe_test.ko name="do_fork"
$ ls
kprobe_test.c kprobe_test.mod.c kprobe_test.o modules.order
kprobe_test.ko kprobe_test.mod.o Makefile Module.symvers
$ tail /var/log/messages
Jun 5 15:52:33 devel kernel: 0000000000000282 00000000392dcf20 ffff8801cfe17df0 ffffffff81686e13
Jun 5 15:52:33 devel kernel: ffff8801cfe17e00 ffffffffa0797080 ffff8801cfe17e40 ffffffff81692368
Jun 5 15:52:33 devel kernel: ffffffff81084de5 ffffffff81ae79e0 0000000000008000 ffffffff81084de0
Jun 5 15:52:33 devel kernel: Call Trace:
Jun 5 15:52:33 devel kernel: [<ffffffff81686e13>] dump_stack+0x19/0x1b
Jun 5 15:52:33 devel kernel: [<ffffffffa0797080>] handler_pre+0x20/0x24 [kprobe_test]
Jun 5 15:52:33 devel kernel: [<ffffffff81692368>] kprobe_ftrace_handler+0xb8/0x120
Jun 5 15:52:33 devel kernel: [<ffffffff81084de5>] ? do_fork+0x5/0x2c0
Jun 5 15:52:33 devel kernel: [<ffffffff81084de0>] ? fork_idle+0xd0/0xd0
$ rmmod kprobe_test
- Checking out if kprobe was installed in a specific function from vmcore
- If it has ‘callq ftrace_regs_caller’, it has kprobe on this function
crash> dis -lr do_fork+0x5
/usr/src/debug/kernel-3.10.0-514.21.1.el7/linux-3.10.0-514.21.1.el7.x86_64/kernel/fork.c: 1690
0xffffffff81084de0 <do_fork>: callq 0xffffffff81699230 <ftrace_regs_caller>
0xffffffff81084de5 <do_fork+5>: push %rbp
- If kprobe_ftrace_handler is in one of ftrace_ops structures, it has kprobe installed on the system
crash> ftrace_ops_list
ftrace_ops_list = $5 = (struct ftrace_ops *) 0xffffffff81ae79e0 <kprobe_ftrace_ops>
crash> list -o ftrace_ops.next -s ftrace_ops.func 0xffffffff81ae79e0
ffffffff81ae79e0
func = 0xffffffff816922b0 <kprobe_ftrace_handler>
ffffffff81ae8380
func = 0xffffffff81699220 <ftrace_stub>
- kprobe structures are saved in ‘kprobe_table[]’
crash> kprobe_table
kprobe_table = $6 =
{ {
first = 0x0
}, {
first = 0x0
}, {
first = 0x0
}, {
...
first = 0x0
}, {
first = 0xffffffffa0799240
}, {
first = 0x0
...
crash> kprobe 0xffffffffa0799240
struct kprobe {
hlist = {
next = 0x0,
pprev = 0xffffffff81e6ff30 <kprobe_table+144>
},
list = {
next = 0xffffffffa0799250,
prev = 0xffffffffa0799250
},
nmissed = 0,
addr = 0xffffffff81084de0 <do_fork> "\350KDa",
symbol_name = 0x0,
offset = 0,
pre_handler = 0xffffffffa0797060,
post_handler = 0xffffffffa0797030,
fault_handler = 0xffffffffa0797000,
break_handler = 0x0,
opcode = 0 '\000',
ainsn = {
insn = 0x0,
boostable = -1,
if_modifier = false
},
flags = 8
}
jprobes
- jprobes is using similar techniques as kprobes, but instead of specifying multiples function for pre, post, fault, and trap events, it’s providing a way to wrap the original function
- Registering/Unregistering happen via the below functions with ‘struct jprobe’
int __kprobes register_jprobe(struct jprobe *jp);
void __kprobes unregister_jprobe(struct jprobe *jp);
- ‘struct jprobe’ is based on kprobe and has one more entry which is pointing a new function (wrapper)
struct jprobe {
struct kprobe kp;
void *entry; /* probe handling code to jump to */
};
- Example
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
static long my_fork(unsigned long clone_flags, unsigned long stack_start,
struct pt_regs *regs, unsigned long stack_size,
int __user *parent_tidptr, int __user *child_tidptr)
{
printk("jprobe my_fork: clone_flags = 0x%lx, stack_size = 0x%lx\n",
clone_flags, stack_size);
jprobe_return();
printk("After actual do_fork()? No, not going to be called\n");
return 0;
}
static struct jprobe my_jprobe = {
.entry = my_fork,
.kp = {
.symbol_name = "do_fork",
},
};
static int __init my_init(void)
{
int ret;
ret = register_jprobe(&my_jprobe);
if (ret < 0) {
printk("register_jprobe failed with %d\n", ret);
return -1;
}
printk("register_jprobe installed\n");
return 0;
}
static void __exit my_exit(void)
{
unregister_jprobe(&my_jprobe);
printk("jprobe unregistered\n");
}
module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");
- Running
$ insmod jprobe_test.ko
$ rmmod jprobe_test
root@devel:kprobes$ tail /var/log/messages
...
Jun 6 11:17:45 devel kernel: register_jprobe installed
Jun 6 11:17:50 devel kernel: jprobe my_fork: clone_flags = 0x1200011, stack_size = 0x0
Jun 6 11:17:50 devel kernel: jprobe my_fork: clone_flags = 0x1200011, stack_size = 0x0
Jun 6 11:17:50 devel kernel: jprobe my_fork: clone_flags = 0x1200011, stack_size = 0x0
Jun 6 11:18:04 devel systemd: Time has been changed
Jun 6 11:18:23 devel kernel: jprobe my_fork: clone_flags = 0x1200011, stack_size = 0x0
Jun 6 11:18:23 devel kernel: jprobe my_fork: clone_flags = 0x1200011, stack_size = 0x0
Jun 6 11:18:24 devel kernel: jprobe my_fork: clone_flags = 0x1200011, stack_size = 0x0
Jun 6 11:18:24 devel kernel: jprobe unregistered
- Checking jprobes
- It is using the same method just like kprobe
crash> dis -l do_fork | head -n 3
/usr/src/debug/kernel-3.10.0-514.21.1.el7/linux-3.10.0-514.21.1.el7.x86_64/kernel/fork.c: 1690
0xffffffff81084de0 <do_fork>: callq 0xffffffff81699230 <ftrace_regs_caller>
0xffffffff81084de5 <do_fork+5>: push %rbp
crash> kprobe_table
kprobe_table = $1 =
{ {
...
first = 0x0
}, {
first = 0xffffffffa0794000
}, {
first = 0x0
...
crash> sym 0xffffffffa0794000
ffffffffa0794000 (d) my_jprobe [jprobe_test]
crash> jprobe ffffffffa0794000
struct jprobe {
kp = {
hlist = {
next = 0x0,
pprev = 0xffffffff81e6ff30 <kprobe_table+144>
},
list = {
next = 0xffffffffa0794010,
prev = 0xffffffffa0794010
},
nmissed = 0,
addr = 0xffffffff81084de0 <do_fork> "\350KDa",
symbol_name = 0xffffffffa07930f0 "do_fork",
offset = 0,
pre_handler = 0xffffffff816918e0 <setjmp_pre_handler>,
post_handler = 0x0,
fault_handler = 0x0,
break_handler = 0xffffffff81691980 <longjmp_break_handler>,
opcode = 0 '\000',
ainsn = {
insn = 0x0,
boostable = -1,
if_modifier = false
},
flags = 8
},
entry = 0xffffffffa0792000
}
crash> sym 0xffffffffa0792000
ffffffffa0792000 (t) my_fork [jprobe_test]
SystemTap
-
SystemTap ( stap ) is a scripting language and tool for dynamically instrumenting running production Linux kernel-based operating systems. System administrators can use SystemTap to extract, filter and summarize data in order to enable diagnosis of complex performance or functional problems - https://en.wikipedia.org/wiki/SystemTap
-
To use SystemTap, you need to install SystemTap as well as kernel-debuginfo, kernel-devl to compile and run it
-
Installing the environment
$ yum install -y systemtap systemtap-runtime
$ subscription-manager repos --enable=rhel-7-variant-debug-rpms
$ yum install -y kernel-devel-$(uname -r) kernel-debuginfo-$(uname -r) kernel-debuginfo-common-$(uname -m)-$(uname -r)
- Testing
- SystemTap script can be passed via command line argument or via a file
$ stap -v -e 'probe vfs.read {printf("read performed\n"); exit()}'
Pass 1: parsed user script and 122 library scripts using 227708virt/40776res/3264shr/37692data kb, in 280usr/10sys/300real ms.
Pass 2: analyzed script: 1 probe, 1 function, 4 embeds, 0 globals using 359100virt/173276res/4464shr/169084data kb, in 1330usr/180sys/1506real ms.
Pass 3: using cached /root/.systemtap/cache/e3/stap_e32b328321382df05d3041933d344e8c_1682.c
Pass 4: using cached /root/.systemtap/cache/e3/stap_e32b328321382df05d3041933d344e8c_1682.ko
Pass 5: starting run.
read performed
Pass 5: run completed in 10usr/60sys/387real ms.
- Internal implementation
$ stap -v -e 'probe vfs.read {printf("read performed\n"); exit()}' -p3
Pass 1: parsed user script and 122 library scripts using 227576virt/40768res/3264shr/37560data kb, in 290usr/10sys/285real ms.
Pass 2: analyzed script: 1 probe, 1 function, 4 embeds, 0 globals using 359088virt/173260res/4464shr/169072data kb, in 1340usr/190sys/1439real ms.
...
static int systemtap_module_init (void) {
...
}
static void systemtap_module_refresh (const char *modname) {
...
}
static void systemtap_module_exit (void) {
...
}
...
static int
stapkp_register_probe(struct stap_kprobe_probe *skp)
{
if (skp->registered_p)
return 0;
return skp->return_p ? stapkp_register_kretprobe(skp)
: stapkp_register_kprobe(skp);
}
static int
stapkp_register_kprobe(struct stap_kprobe_probe *skp)
{
int ret = stapkp_prepare_kprobe(skp);
if (ret == 0)
ret = stapkp_arch_register_kprobe(skp);
return ret;
}
static int
stapkp_arch_register_kprobe(struct stap_kprobe_probe *skp)
{
int ret = 0;
struct kprobe *kp = &skp->kprobe->u.kp;
...
ret = register_kprobe(&skp->kprobe->dummy);
...
return ret;
}
- Tracing it in crash
crash> sym sys_futex
ffffffff810f8540 (T) sys_futex /usr/src/debug/kernel-3.10.0-514.16.1.el7/linux-3.10.0-514.16.1.el7.x86_64/kernel/futex.c: 2978
crash> dis -l ffffffff810f8540 | head -n 2
/usr/src/debug/kernel-3.10.0-514.16.1.el7/linux-3.10.0-514.16.1.el7.x86_64/kernel/futex.c: 2978
0xffffffff810f8540 <sys_futex>: callq 0xffffffff81698df0 <ftrace_regs_caller>
ENTRY(ftrace_regs_caller)
/* Save the current flags before compare (in SS location)*/
pushfq
...
callq 0xffffffff81141330 <ftrace_ops_list_func>
...
jmp ftrace_return
popfq
jmp ftrace_stub
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, regs);
}
static inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
...
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip, regs))
op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
...
}
crash> ftrace_ops_list
ftrace_ops_list = $2 = (struct ftrace_ops *) 0xffffffff81ae38a0 <kprobe_ftrace_ops>
crash> struct ftrace_ops.func,next 0xffffffff81ae38a0
func = 0xffffffff81691e70 <kprobe_ftrace_handler>
next = 0xffffffff81ae4240 <ftrace_list_end>
crash> kprobe_table
kprobe_table = $3 =
...
}, {
first = 0xffffffffa07b5f40
}, {
...
crash> kprobe.addr,pre_handler,post_handler 0xffffffffa07b5f40
addr = 0xffffffff810f8540 <SyS_futex> "\350\253\bZ"
pre_handler = 0xffffffff81692eb0 <pre_handler_kretprobe>
post_handler = 0x0
crash> kretprobe.free_instances 0xffffffffa07b5f40
free_instances = {
first = 0xffff8801d8f91b80
}
crash> kretprobe.entry_handler,handler,free_instances 0xffffffffa07b5f40
entry_handler = 0xffffffffa07af170
handler = 0xffffffffa07af190
free_instances = {
first = 0xffff8801d8f91c40
}
crash> dis -l 0xffffffffa07af170 7
dis: WARNING: ffffffffa07af170: no associated kernel symbol found
0xffffffffa07af170: nopl 0x0(%rax,%rax,1)
0xffffffffa07af175: push %rbp
0xffffffffa07af176: mov $0x1,%edx
0xffffffffa07af17b: mov %rsp,%rbp
0xffffffffa07af17e: callq 0xffffffffa07aee50
0xffffffffa07af183: pop %rbp
0xffffffffa07af184: retq
crash> dis -l 0xffffffffa07af190 7
WARNING: ffffffffa07af190: no associated kernel symbol found
0xffffffffa07af190: nopl 0x0(%rax,%rax,1)
0xffffffffa07af195: push %rbp
0xffffffffa07af196: xor %edx,%edx
0xffffffffa07af198: mov %rsp,%rbp
0xffffffffa07af19b: callq 0xffffffffa07aee50
0xffffffffa07af1a0: pop %rbp
0xffffffffa07af1a1: retq
crash> dis -l 0xffffffffa07aee50 15
WARNING: ffffffffa07aee50: no associated kernel symbol found
0xffffffffa07aee50: nopl 0x0(%rax,%rax,1)
0xffffffffa07aee55: push %rbp
0xffffffffa07aee56: mov %rsp,%rbp
0xffffffffa07aee59: push %r15
0xffffffffa07aee5b: push %r14
0xffffffffa07aee5d: push %r13
0xffffffffa07aee5f: push %r12
0xffffffffa07aee61: mov %edx,%r12d
0xffffffffa07aee64: push %rbx
0xffffffffa07aee65: sub $0x20,%rsp
0xffffffffa07aee69: mov 0x6740(%rip),%rbx # 0xffffffffa07b55b0
0xffffffffa07aee70: mov %gs:0x28,%rax
0xffffffffa07aee79: mov %rax,-0x30(%rbp)
0xffffffffa07aee7d: xor %eax,%eax
0xffffffffa07aee7f: test %edx,%edx
Ftrace
- A tracing utility built directly into the Linux Kernel
- It provides the ability to see what is happening inside the kernel
- Debugging the kernel using Ftrace - part 1
- Debugging the kernel using Ftrace - part 2
Setting up Ftrace
- Ftrace is located in debugfs file system
$ cd /sys/kernel/debug/tracing
$ ls
available_events max_graph_depth stack_trace
available_filter_functions options/ stack_trace_filter
available_tracers per_cpu/ trace
buffer_size_kb printk_formats trace_clock
buffer_total_size_kb README trace_marker
current_tracer saved_cmdlines trace_options
dyn_ftrace_total_info saved_cmdlines_size trace_pipe
enabled_functions set_event trace_stat/
events/ set_ftrace_filter tracing_cpumask
free_buffer set_ftrace_notrace tracing_max_latency
function_profile_enabled set_ftrace_pid tracing_on
instances/ set_graph_function tracing_thresh
kprobe_events snapshot uprobe_events
kprobe_profile stack_max_size uprobe_profile
- To use ftrace, the below should be enabled which are enabled in RHEL
$ grep CONFIG_DYNAMIC_FTRACE= config-3.10.0-514.21.1.el7.x86_64
CONFIG_DYNAMIC_FTRACE=y
$ grep TRACER=y config-3.10.0-514.21.1.el7.x86_64
CONFIG_NOP_TRACER=y
CONFIG_HAVE_FUNCTION_TRACER=y
CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
CONFIG_CONTEXT_SWITCH_TRACER=y
CONFIG_GENERIC_TRACER=y
CONFIG_FUNCTION_TRACER=y
CONFIG_FUNCTION_GRAPH_TRACER=y
CONFIG_SCHED_TRACER=y
CONFIG_STACK_TRACER=y
- When CONFIG_DYNAMIC_FTRACE is configured the call is converted to a NOP at boot time to keep the system running at 100% performance
- When tracer is not working
crash> dis -l do_fork | head -n 3
/usr/src/debug/kernel-3.10.0-514.16.1.el7/linux-3.10.0-514.16.1.el7.x86_64/kernel/fork.c: 1690
0xffffffff81084de0 <do_fork>: nopl 0x0(%rax,%rax,1) [FTRACE NOP]
0xffffffff81084de5 <do_fork+5>: push %rbp
- When tracer is ON
/usr/src/debug/kernel-3.10.0-514.16.1.el7/linux-3.10.0-514.16.1.el7.x86_64/kernel/fork.c: 1690
0xffffffff81084de0 <do_fork>: callq 0xffffffff81698d50 <ftrace_caller>
0xffffffff81084de5 <do_fork+5>: push %rbp
- Check available tracers
$ cat available_tracers
blk function_graph wakeup_dl wakeup_rt wakeup function nop
- Checking available tracers from vmcore
crash> trace_types
trace_types = $4 = (struct tracer *) 0xffffffff81ae4b00 <blk_tracer>
crash> list 0xffffffff81ae4b00 -o tracer.next -s tracer.name,init
ffffffff81ae4b00
name = 0xffffffff818da5c3 "blk"
init = 0xffffffff81159800 <blk_tracer_init>
ffffffff81ae4a60
name = 0xffffffff818da38a "function_graph"
init = 0xffffffff81157c30 <graph_trace_init>
ffffffff81ae4720
name = 0xffffffff818da1e3 "wakeup_dl"
init = 0xffffffff81156e90 <wakeup_dl_tracer_init>
ffffffff81ae47c0
name = 0xffffffff818da1ed "wakeup_rt"
init = 0xffffffff81156eb0 <wakeup_rt_tracer_init>
ffffffff81ae4860
name = 0xffffffff81937866 "wakeup"
init = 0xffffffff81156ed0 <wakeup_tracer_init>
ffffffff81ae45c0
name = 0xffffffff818d99b4 "function"
init = 0xffffffff81155ff0 <function_trace_init>
ffffffff81ae4960
name = 0xffffffff818f3ad6 "nop"
init = 0xffffffff81157130 <nop_trace_init>
Function tracer
- every function in the kernel call a special function “mcount()”
$ echo function > /sys/kernel/debug/tracing/current_tracer
$ cat /sys/kernel/debug/tracing/current_tracer
function
- Checking the function calls
- The first two items are the traced task name and PID
- The CPU that the trace was executed on is within the brackets
- The timestamp is the time since boot, followed by the function name
- The function in this case is the function being traced with its parent following the “<-“ symbol.
$ cat /sys/kernel/debug/tracing/trace | head -n 20
# tracer: function
#
# entries-in-buffer/entries-written: 102499/11907819 #P:2
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
irqbalance-810 [001] .... 89262.130446: irq_to_desc <-show_interrupts
irqbalance-810 [001] .... 89262.130447: _raw_spin_lock_irqsave <-show_interrupts
irqbalance-810 [001] d... 89262.130447: kstat_irqs_cpu <-show_interrupts
irqbalance-810 [001] d... 89262.130447: kstat_irqs_cpu <-show_interrupts
irqbalance-810 [001] d... 89262.130447: seq_printf <-show_interrupts
irqbalance-810 [001] d... 89262.130447: seq_vprintf <-seq_printf
irqbalance-810 [001] d... 89262.130448: kstat_irqs_cpu <-show_interrupts
irqbalance-810 [001] d... 89262.130448: seq_printf <-show_interrupts
irqbalance-810 [001] d... 89262.130448: seq_vprintf <-seq_printf
function_graph tracer
- A bit hard to understand ‘function’ output
- function_graph traces both the entry and exit of a function, which gives the tracer the ability to know the depth of functions that are called
- This gives the start and end of a function denoted with the C like annotation of “{“ to start a function and “}” at the end
- Leaf functions, which do not call other functions, simply end with a “;”
- The DURATION column shows the time spent in the corresponding function
- The function graph tracer records the time the function was entered and exited and reports the difference as the duration. These numbers only appear with the leaf functions and the “}” symbol. Note that this time also includes the overhead of all functions within a nested function as well as the overhead of the function graph tracer itself.
$ echo function_graph > /sys/kernel/debug/tracing/current_tracer
root@devel:boot$ cat /sys/kernel/debug/tracing/trace | head -n 20
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
0) 0.087 us | } /* page_remove_rmap */
0) 0.042 us | put_page();
0) 0.073 us | _raw_spin_unlock();
0) 0.045 us | put_page();
0) + 17.010 us | } /* wp_page_copy.isra.56 */
0) + 20.751 us | } /* do_wp_page */
0) + 22.275 us | } /* handle_mm_fault */
0) 0.042 us | up_read();
0) + 24.347 us | } /* __do_page_fault */
0) + 24.754 us | } /* do_page_fault */
0) | do_page_fault() {
0) | __do_page_fault() {
0) 0.043 us | down_read_trylock();
0) 0.042 us | _cond_resched();
0) 0.068 us | find_vma();
0) | handle_mm_fault() {
- When a function spent more than 10 microseconds, it prints ‘+’ in front of the time
$ cat /sys/kernel/debug/tracing/trace | grep '+' | head
0) + 10.394 us | }
0) + 12.787 us | } /* do_wp_page */
0) + 13.661 us | } /* handle_mm_fault */
0) + 14.992 us | } /* __do_page_fault */
0) + 15.220 us | } /* do_page_fault */
0) + 10.413 us | }
0) + 12.719 us | }
0) + 13.565 us | }
0) + 14.815 us | }
0) + 15.083 us | }
- When a function spent more than 100 microseconds, it prints ‘!’ in front of the time
$ cat /sys/kernel/debug/tracing/trace | grep '!' | head
1) ! 106.320 us | } /* free_pages_and_swap_cache */
1) ! 141.339 us | } /* tlb_flush_mmu.part.61 */
1) ! 145.374 us | } /* tlb_finish_mmu */
1) ! 1087.709 us | } /* exit_mmap */
1) ! 1094.262 us | } /* mmput */
1) ! 1106.230 us | } /* flush_old_exec */
1) ! 140.495 us | }
1) ! 1527.120 us | } /* load_elf_binary */
1) ! 1533.974 us | } /* search_binary_handler */
1) ! 1832.817 us | } /* do_execve_common.isra.25 */
On/Off Tracing
- Turning off tracing
$ echo 0 > /sys/kernel/debug/tracing/tracing_on
$ cat /sys/kernel/debug/tracing/trace | head -n 10
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
1) 0.509 us | } /* tty_write_room */
1) 5.245 us | } /* n_tty_poll */
1) | tty_ldisc_deref() {
1) 0.045 us | ldsem_up_read();
1) 0.481 us | }
1) 8.481 us | } /* tty_poll */
$ sleep 5
$ cat /sys/kernel/debug/tracing/trace | head -n 10
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
1) 0.509 us | } /* tty_write_room */
1) 5.245 us | } /* n_tty_poll */
1) | tty_ldisc_deref() {
1) 0.045 us | ldsem_up_read();
1) 0.481 us | }
1) 8.481 us | } /* tty_poll */
- Turning on tracing
$ echo 1 > /sys/kernel/debug/tracing/tracing_on
$ cat /sys/kernel/debug/tracing/trace | head -n 10
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
1) 0.030 us | } /* _raw_spin_lock */
1) | do_set_pte() {
1) 0.031 us | add_mm_counter_fast();
1) 0.033 us | page_add_file_rmap();
1) 0.537 us | }
1) 0.030 us | _raw_spin_unlock();
$ sleep 5
$ cat /sys/kernel/debug/tracing/trace | head -n 10
# tracer: function_graph
#
# CPU DURATION FUNCTION CALLS
# | | | | | | |
1) 0.044 us | } /* mutex_unlock */
1) | __vma_link_rb() {
1) 0.045 us | vma_compute_subtree_gap();
1) | vma_gap_callbacks_rotate() {
1) 0.055 us | vma_compute_subtree_gap();
1) 0.455 us | }
Tracing stack
- Examine the size of the kernel stack and how much stack space each function is using
- Enabling the stack tracer (CONFIG_STACK_TRACER) will show where the biggest use of the stack takes place.
$ grep CONFIG_STACK_TRACER /boot/config-3.10.0-514.el7.x86_64
CONFIG_STACK_TRACER=y
- Stack trace can be enabled by put ‘1’ on /proc/sys/kernel/stack_tracer_enabled
- The biggest stack usage so far can be checked with /sys/kernel/debug/tracing/stack_max_size
- /sys/kernel/debug/tracing/stack_trace shows the biggest stack usage trace
- It also shows the size each function was using
$ echo 1 > /proc/sys/kernel/stack_tracer_enabled
$ cat /sys/kernel/debug/tracing/stack_max_size
3608
$ cat /sys/kernel/debug/tracing/stack_trace
Depth Size Location (34 entries)
----- ---- --------
0) 3608 32 down_trylock+0x14/0x40
1) 3576 48 xfs_buf_trylock+0x1f/0x80 [xfs]
2) 3528 80 _xfs_buf_find+0x15d/0x340 [xfs]
3) 3448 64 xfs_buf_get_map+0x2a/0x240 [xfs]
4) 3384 72 xfs_buf_read_map+0x30/0x160 [xfs]
5) 3312 64 xfs_trans_read_buf_map+0x211/0x400 [xfs]
6) 3248 96 xfs_btree_read_buf_block.constprop.28+0x78/0xc0 [xfs]
7) 3152 80 xfs_btree_lookup_get_block+0x80/0x100 [xfs]
8) 3072 112 xfs_btree_lookup+0xcf/0x5c0 [xfs]
9) 2960 16 xfs_alloc_lookup_eq+0x1b/0x20 [xfs]
10) 2944 88 xfs_alloc_fixup_trees+0x252/0x370 [xfs]
11) 2856 160 xfs_alloc_ag_vextent_near+0x570/0xab0 [xfs]
12) 2696 24 xfs_alloc_ag_vextent+0x10d/0x150 [xfs]
13) 2672 80 xfs_alloc_vextent+0x446/0x5f0 [xfs]
14) 2592 232 xfs_bmap_btalloc+0x3f3/0x780 [xfs]
15) 2360 16 xfs_bmap_alloc+0xe/0x10 [xfs]
16) 2344 344 xfs_bmapi_write+0x499/0xab0 [xfs]
17) 2000 184 xfs_iomap_write_allocate+0x177/0x390 [xfs]
18) 1816 88 xfs_map_blocks+0x186/0x240 [xfs]
19) 1728 184 xfs_vm_writepage+0x193/0x5d0 [xfs]
20) 1544 24 __writepage+0x13/0x50
21) 1520 280 write_cache_pages+0x251/0x4d0
22) 1240 96 generic_writepages+0x4d/0x80
23) 1144 48 xfs_vm_writepages+0x53/0x90 [xfs]
24) 1096 16 do_writepages+0x1e/0x40
25) 1080 64 __writeback_single_inode+0x40/0x210
26) 1016 168 writeback_sb_inodes+0x25e/0x420
27) 848 72 __writeback_inodes_wb+0x9f/0xd0
28) 776 120 wb_writeback+0x263/0x2f0
29) 656 176 bdi_writeback_workfn+0x2cb/0x460
30) 480 72 process_one_work+0x17b/0x470
31) 408 96 worker_thread+0x126/0x410
32) 312 136 kthread+0xcf/0xe0
33) 176 176 ret_from_fork+0x58/0x90