Time Management
jiffies
- Heartbeat of Linux kernel
- RHEL6 and later versions use a tickless timer, so the calculation is a bit more complicated
- Time is managed in ‘jiffies_64’
/*
* The 64-bit value is not atomic - you MUST NOT read it
* without sampling the sequence number in xtime_lock.
* get_jiffies_64() will do this for you as appropriate.
*/
extern u64 __jiffy_data jiffies_64;
extern unsigned long volatile __jiffy_data jiffies;
- It’s increased during timer interrupt handling
- The difference between RHEL5 and RHEL6/7 is that RHEL6/7 uses tickless scheduling
- It doesn’t trigger the timer interrupt on a regular basis; instead, jiffies is updated when other/scheduled interrupts come in
- It then needs to find out how much time has passed, which is saved in ‘ticks’
/*
 * do_timer - advance the global jiffies_64 counter by 'ticks' elapsed
 * tick periods, then refresh wall-clock time and the load average.
 * On tickless kernels 'ticks' may be greater than 1 (missed periods).
 */
void do_timer(unsigned long ticks)
{
    jiffies_64 += ticks;
    update_wall_time();
    calc_global_load(ticks);
}
/*
 * tick_do_update_jiffies64 - catch jiffies_64 up to 'now' on a tickless
 * (NOHZ) kernel.  Works out how many whole tick periods have elapsed
 * since last_jiffies_update and hands that count to do_timer().
 */
static void tick_do_update_jiffies64(ktime_t now)
{
    unsigned long ticks = 0;
    ktime_t delta;

    /*
     * Do a quick check without holding xtime_lock:
     */
    delta = ktime_sub(now, last_jiffies_update);
    if (delta.tv64 < tick_period.tv64)
        return;

    /* Reevaluate with xtime_lock held */
    write_seqlock(&xtime_lock);

    delta = ktime_sub(now, last_jiffies_update); <---- Counting missed ticks
    if (delta.tv64 >= tick_period.tv64) {
        /* Account one tick and advance the bookkeeping by one period. */
        delta = ktime_sub(delta, tick_period);
        last_jiffies_update = ktime_add(last_jiffies_update,
                                        tick_period);

        /* Slow path for long timeouts */
        if (unlikely(delta.tv64 >= tick_period.tv64)) {
            s64 incr = ktime_to_ns(tick_period);

            /* Whole extra periods missed while the tick was stopped. */
            ticks = ktime_divns(delta, incr);

            last_jiffies_update = ktime_add_ns(last_jiffies_update,
                                               incr * ticks);
        }
        do_timer(++ticks); /* ++ adds back the single tick accounted above */

        /* Keep the tick_next_period variable up to date */
        tick_next_period = ktime_add(last_jiffies_update, tick_period);
    }
    write_sequnlock(&xtime_lock);
}
- ‘ticks’ value is calculated based on the difference between current time (now) and last update time (last_jiffies_update).
- The current time (now) is from the below
/* NOHZ entry point: forwards 'now' to the jiffies catch-up logic above. */
static void tick_nohz_update_jiffies(ktime_t now)
{
    ...
    tick_do_update_jiffies64(now);
}
/*
 * tick_check_nohz - on interrupt entry, if this CPU was idle or had its
 * tick stopped, bring jiffies and idle accounting up to date.
 */
static inline void tick_check_nohz(int cpu)
{
    struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
    ktime_t now;

    if (!ts->idle_active && !ts->tick_stopped)
        return; /* tick is running normally - nothing to catch up */

    now = ktime_get(); <----- Get the time
    if (ts->idle_active)
        tick_nohz_stop_idle(cpu, now);
    if (ts->tick_stopped) {
        tick_nohz_update_jiffies(now);
        tick_nohz_kick_tick(cpu, now);
    }
}
/*
 * ktime_get - current monotonic time: xtime plus the wall_to_monotonic
 * offset plus nanoseconds read from the clocksource.  The seqlock loop
 * retries if the timekeeper was updated concurrently.
 */
ktime_t ktime_get(void)
{
    ..
    do {
        ...
        secs = timekeeper.xtime.tv_sec +
               timekeeper.wall_to_monotonic.tv_sec;
        nsecs = timekeeper.xtime.tv_nsec +
                timekeeper.wall_to_monotonic.tv_nsec;
        nsecs += timekeeping_get_ns(); <---- get the time in ns

        /* If arch requires, add in gettimeoffset() */
        nsecs += arch_gettimeoffset();
    } while (read_seqretry(&timekeeper.lock, seq));
    ..
    return ktime_add_ns(ktime_set(secs, 0), nsecs);
}
/* Timekeeper helper functions. */
static inline s64 timekeeping_get_ns(void)
{
cycle_t cycle_now, cycle_delta;
struct clocksource *clock;
/* read clocksource: */
clock = timekeeper.clock;
cycle_now = clock->read(clock);
/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
/* return delta convert to nanoseconds using ntp adjusted mult. */
return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
timekeeper.shift);
}
- The actual time is gathered from clock source via ‘clock->read(clock)’.
- This ‘tick_check_nohz()’ is called whenever there’s an interrupt
/*
 * Called from irq_enter to notify about the possible interruption of idle()
 */
void tick_check_idle(int cpu)
{
    tick_check_oneshot_broadcast(cpu);
    tick_check_nohz(cpu); /* updates jiffies if the tick was stopped */
}
/* Interrupt entry: among other things, kicks the NOHZ time update. */
void irq_enter(void)
{
    ...
    tick_check_idle(cpu);
    ...
}
/* Arch IRQ dispatcher: every hardware interrupt passes through here. */
unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
    ...
    irq_enter();
    ...
}
- In RHEL5 and earlier versions, it’s much easier: it’s called from ‘timer_interrupt()’
/* RHEL5 path: the periodic timer interrupt handler (fires every tick). */
static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
    ...
    main_timer_handler(regs);
    ...
}
/* RHEL5 path: per-tick bookkeeping, including timekeeping updates. */
void main_timer_handler(struct pt_regs *regs)
{
    ...
    do_timer_tsc_timekeeping(regs);
    ...
}
/* RHEL5 path: TSC-based timekeeping step of the tick handler. */
static void do_timer_tsc_timekeeping(struct pt_regs *regs)
{
    ...
    do_timer_jiffy(regs);
    ...
}
/* RHEL5 path: advance jiffies for this tick. */
static void do_timer_jiffy(struct pt_regs *regs)
{
    do_timer(regs);
    ...
}
/*
 * RHEL5 do_timer: with a regular periodic tick, jiffies_64 simply
 * advances by exactly one per timer interrupt.
 */
void do_timer(struct pt_regs *regs)
{
    jiffies_64++;
    ...
}
- How to use jiffies in kernel to check time progress
- Below is a wrong way to check it, because it can’t handle the jiffies overflow (wraparound) situation
if (jiffies > expected_time) {
do_work();
}
- The proper way to check is to use the kernel-provided macros shown below
time_after(a, b);
time_before(a, b);
time_after_eq(a, b);
time_before_eq(a, b);
- Example
if (time_after(jiffies, expected_time)) {
do_work();
}
How to make a delay in Kernel
- Using jiffies is the simpler way, but it can’t make a delay shorter than one tick (1/HZ seconds)
- jiffies is increased in every 1 ms when HZ = 1000, in every 10 ms when HZ=100
int delay = 5;
work_time = jiffies + delay * HZ; // 5 secs later
/* Busy-wait (burns the CPU) until work_time is reached. */
while (time_before(jiffies, work_time)) {
    ;
}
- Functions for smaller delay than HZ
- ndelay: nanosecond delay
- udelay: microsecond delay
- mdelay: millisecond delay
- msleep, msleep_interruptible: millisecond delay without looping on the CPU, as the task goes to sleep. The _interruptible version sleeps in TASK_INTERRUPTIBLE.
#include <linux/delay.h>
void ndelay(unsigned long nanoseconds);
void udelay(unsigned long microseconds);
void mdelay(unsigned long milliseconds);
void msleep (unsigned int milliseconds);
unsigned long msleep_interruptible (unsigned int milliseconds);
Kernel Timer
- Looping in CPU is the worst way to wait until the specified time comes
- Linux Kernel provides a mechanism that helps to execute functions scheduled to run after some specific time
/* Kernel timer descriptor (simplified). */
struct timer_list {
    struct list_head entry;          /* link into the pending-timer list */
    unsigned long expires;           /* absolute expiry time, in jiffies */
    void (*function)(unsigned long); /* handler to run on expiry */
    unsigned long data;              /* opaque argument passed to handler */
    ...
};
/* Timer API: initialize, arm, re-arm, and cancel. */
void init_timer (struct timer_list *timer);
void add_timer (struct timer_list *timer);
void mod_timer (struct timer_list *timer, unsigned long expires);
int del_timer (struct timer_list *timer);
int del_timer_sync (struct timer_list *timer);
- Simple example to show how to use timer
/* timer_drv.c */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/init.h>
MODULE_LICENSE("GPL");
#define MYDEV_NAME "mycdrv"
int mycdrv_ma_no;
struct timer_list my_timer;
/*
 * Timer callback: fires once the timer expires.  Runs in timer/softirq
 * context, so 'current' is whatever task happens to be interrupted
 * (typically pid 0, the idle task) rather than the writer.
 */
void my_timer_function(unsigned long data)
{
    printk("my_timer_function(), jiffies=%ld\n", jiffies);
    printk("my data = %d, my pid=%d\n", (int)data, (int)current->pid);
}
/*
 * my_write - (re)arm the one-shot timer to fire HZ jiffies (~1 s) from now.
 *
 * Bug fix vs. the original: init_timer() + add_timer() must not be called
 * on a timer that may still be pending (a second write within one second
 * would corrupt the timer list / trigger a double-add BUG).  Initialize
 * the timer exactly once and use mod_timer(), which is safe for both an
 * inactive and a still-pending timer.
 *
 * Returns lbuf (pretends the whole buffer was consumed).
 */
ssize_t my_write(struct file *file, const char *buf, size_t lbuf, loff_t * ppos)
{
    static int len = 100;
    static int timer_ready; /* one-time init guard */

    printk("my_write(),current->pid=%d\n", (int)current->pid);
    if (!timer_ready) {
        init_timer(&my_timer);
        my_timer.function = my_timer_function;
        timer_ready = 1;
    }
    my_timer.data = len;
    printk("Adding timer at jiffies = %ld\n", jiffies);
    mod_timer(&my_timer, jiffies + HZ);
    len += 10;
    return lbuf;
}
/* Only write() is implemented; every write arms the timer. */
struct file_operations fops = {
    .owner = THIS_MODULE,
    .write = my_write,
};
/*
 * my_init - register a dynamically numbered character device.
 * Returns 0 on success or the negative errno from register_chrdev()
 * (the original ignored the error and let the module load anyway).
 */
int my_init(void)
{
    mycdrv_ma_no = register_chrdev(0, MYDEV_NAME, &fops);
    if (mycdrv_ma_no < 0) {
        printk("register_chrdev() failed, err=%d\n", mycdrv_ma_no);
        return mycdrv_ma_no;
    }
    return 0;
}
void my_exit(void)
{
unregister_chrdev(mycdrv_ma_no, MYDEV_NAME);
}
module_init(my_init);
module_exit(my_exit);
- It’ll trigger a timer whenever a write happens on this device
$ insmod timer_drv.ko
$ grep mycdrv /proc/devices
248 mycdrv
$ mknod mydrv c 248 0
$ echo hello > ./mydrv
$ tail -n 4 /var/log/messages
Apr 26 12:32:32 devel kernel: my_write(),current->pid=8809
Apr 26 12:32:32 devel kernel: Adding timer at jiffies = 4295896395
Apr 26 12:32:33 devel kernel: my_timer_function(), jiffies=4295897397
Apr 26 12:32:33 devel kernel: my data = 100, my pid=0
$ rmmod timer_drv
- You can see the difference between the two jiffies values - 4295896395 and 4295897397. This difference is not exactly 1 second (1000 jiffies with HZ=1000).
- Periodic timer. If the function needs to run periodically, you can use ‘mod_timer’
/* periodic_timer.c */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/init.h>
MODULE_LICENSE("GPL");
struct timer_list timer;
/* Payload handed to the timer handler through timer.data. */
struct kt_data {
    unsigned long period;     /* re-arm interval, in jiffies */
    unsigned long start_time; /* jiffies when the timer was first armed */
} data;
/* Timer callback: report elapsed time, then re-arm for the next period. */
void ktfun(unsigned long var)
{
    struct kt_data *d = (struct kt_data *)var;

    printk("ktimer:period=%ld elapsed =%ld\n",
           d->period, jiffies - d->start_time);
    /* Re-arming from inside the handler is what makes it periodic. */
    mod_timer(&timer, jiffies + d->period);
}
/* Module entry: arm a timer that fires (and re-arms) every 2 seconds. */
int init_module(void)
{
    init_timer(&timer);
    data.period = 2 * HZ;
    timer.function = ktfun;
    timer.data = (unsigned long)&data;
    timer.expires = jiffies + data.period;
    data.start_time = jiffies;
    add_timer(&timer);
    return 0;
}
/*
 * Module exit: cancel the timer, waiting for a handler that may be
 * running on another CPU (hence the _sync variant).
 */
void cleanup_module(void)
{
    int rc = del_timer_sync(&timer);

    printk("Delete timer,rc=%d\n", rc);
}
- In cleanup_module(), it uses del_timer_sync() in case the timer function is still running when the timer is being deleted.
$ insmod period_timer.ko
$ tail /var/log/messages
Apr 26 12:47:20 devel kernel: ktimer:period=2000 elapsed =12029
Apr 26 12:47:22 devel kernel: ktimer:period=2000 elapsed =14033
Apr 26 12:47:24 devel kernel: ktimer:period=2000 elapsed =16037
Apr 26 12:47:26 devel kernel: ktimer:period=2000 elapsed =18041
Apr 26 12:47:28 devel kernel: ktimer:period=2000 elapsed =20045
Apr 26 12:47:30 devel kernel: ktimer:period=2000 elapsed =22049
Apr 26 12:47:32 devel kernel: ktimer:period=2000 elapsed =24053
Apr 26 12:47:34 devel kernel: ktimer:period=2000 elapsed =26057
Apr 26 12:47:36 devel kernel: ktimer:period=2000 elapsed =28061
Apr 26 12:47:38 devel kernel: ktimer:period=2000 elapsed =30065
$ rmmod period_timer
clock source
- The reason for the message below
2013-07-16T05:00:05.181538-04:00 xxxxxx kernel: Clocksource tsc unstable (delta = -95170507948 ns). Enable clocksource failover by adding clocksource_failover kernel parameter.
- This message is generated from the watchdog timer function
- clocksource_watchdog is handled with watchdog_timer and started in clocksource_start_watchdog()
static struct timer_list watchdog_timer;

/*
 * clocksource_start_watchdog - arm the periodic watchdog timer (on the
 * first online CPU) that cross-checks registered clocksources.
 */
static inline void clocksource_start_watchdog(void)
{
    /* Nothing to do if already running, or no watchdog/candidates. */
    if (watchdog_running || !watchdog || list_empty(&watchdog_list))
        return;
    init_timer(&watchdog_timer);
    watchdog_timer.function = clocksource_watchdog;
    watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
    add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
    watchdog_running = 1;
}
- clocksource_watchdog() checks each clock source to see if there is any big difference in the time values.
/*
 * clocksource_watchdog - periodic cross-check of every registered
 * clocksource against the watchdog clocksource.  A clocksource whose
 * elapsed time disagrees with the watchdog by more than
 * WATCHDOG_THRESHOLD is either marked unstable (when failover is
 * enabled) or merely reported - producing the "Clocksource tsc
 * unstable" message quoted above.
 */
static void clocksource_watchdog(unsigned long data)
{
    struct clocksource *cs;
    cycle_t csnow, wdnow;
    int64_t wd_nsec, cs_nsec;
    int next_cpu;

    spin_lock(&watchdog_lock);
    if (!watchdog_running)
        goto out;

    list_for_each_entry(cs, &watchdog_list, wd_list) {

        /* Clocksource already marked unstable? */
        if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
            if (finished_booting)
                schedule_work(&watchdog_work);
            continue;
        }

        /* Snapshot both counters as close together as possible. */
        local_irq_disable();
        csnow = cs->read(cs);
        wdnow = watchdog->read(watchdog);
        local_irq_enable();

        /* Clocksource initialized ? */
        if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
            cs->flags |= CLOCK_SOURCE_WATCHDOG;
            cs->wd_last = wdnow;
            cs->cs_last = csnow;
            continue;
        }

        /* Convert both deltas since the last check to nanoseconds. */
        wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
                                     watchdog->mult, watchdog->shift);
        cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
                                     cs->mask, cs->mult, cs->shift);
        cs->cs_last = csnow;
        cs->wd_last = wdnow;

        /* Check the deviation from the watchdog clocksource. */
        if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
            if (clocksource_failover)
                clocksource_unstable(cs, cs_nsec - wd_nsec);
            else
                printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns). Enable clocksource failover by adding clocksource_failover kernel parameter.\n",
                       cs->name, cs_nsec - wd_nsec);
            continue;
        }

        if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
            (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
            (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
            cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
            /*
             * We just marked the clocksource as highres-capable,
             * notify the rest of the system as well so that we
             * transition into high-res mode:
             */
            tick_clock_notify();
        }
    }

    /*
     * Cycle through CPUs to check if the CPUs stay synchronized
     * to each other.
     */
    next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
    if (next_cpu >= nr_cpu_ids)
        next_cpu = cpumask_first(cpu_online_mask);

    /* Re-arm ourselves on the next CPU. */
    watchdog_timer.expires += WATCHDOG_INTERVAL;
    add_timer_on(&watchdog_timer, next_cpu);
out:
    spin_unlock(&watchdog_lock);
}
- Why this checking is happening?
- clocksource is used to get smaller time differences than the timer interrupt can handle (10 ms or 1 ms depending on the kernel HZ).
- By checking the clock source value, we can check if the clock source is providing reliable value.
/* Timekeeper helper functions. */
/*
 * timekeeping_get_ns (repeated here for reference): nanoseconds elapsed
 * since the last update_wall_time(), read from the current clocksource.
 * The raw cycle delta from clock->read() is exactly the value the
 * watchdog guards - an unstable clocksource makes it unreliable.
 */
static inline s64 timekeeping_get_ns(void)
{
    cycle_t cycle_now, cycle_delta;
    struct clocksource *clock;

    /* read clocksource: */
    clock = timekeeper.clock;
    cycle_now = clock->read(clock);

    /* calculate the delta since the last update_wall_time: */
    cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;

    /* return delta convert to nanoseconds using ntp adjusted mult. */
    return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
                              timekeeper.shift);
}