linux/arch/i386/kernel/timers/timer_tsc.c

/*
 * This code largely moved from arch/i386/kernel/time.c.
 * See comments there for proper credits.
 *
 * 2004-06-25    Jesper Juhl
 *      moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4
 *      failing to inline.
 */

#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/timex.h>
#include <linux/errno.h>
#include <linux/cpufreq.h>
#include <linux/string.h>
#include <linux/jiffies.h>

#include <asm/timer.h>
#include <asm/io.h>
/* processor.h for distable_tsc flag */
#include <asm/processor.h>

#include "io_ports.h"
#include "mach_timer.h"

#include <asm/hpet.h>

#ifdef CONFIG_HPET_TIMER
static unsigned long hpet_usec_quotient;
static unsigned long hpet_last;
static struct timer_opts timer_tsc;
#endif

static inline void cpufreq_delayed_get(void);

int tsc_disable __devinitdata = 0;

extern spinlock_t i8253_lock;

static int use_tsc;
/* Number of usecs that the last interrupt was delayed */
static int delay_at_last_interrupt;

static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
static unsigned long long monotonic_base;
static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;

/* convert from cycles(64bits) => nanoseconds (64bits)
 *  basic equation:
 *		ns = cycles / (freq / ns_per_sec)
 *		ns = cycles * (ns_per_sec / freq)
 *		ns = cycles * (10^9 / (cpu_mhz * 10^6))
 *		ns = cycles * (10^3 / cpu_mhz)
 *
 *	Then we use scaling math (suggested by george@mvista.com) to get:
 *		ns = cycles * (10^3 * SC / cpu_mhz) / SC
 *		ns = cycles * cyc2ns_scale / SC
 *
 *	And since SC is a constant power of two, we can convert the div
 *  into a shift.   
 *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
 */
static unsigned long cyc2ns_scale; 
#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */

static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
{
	cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
}

static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}

static int count2; /* counter for mark_offset_tsc() */

/* Cached *multiplier* to convert TSC counts to microseconds.
 * (see the equation below).
 * Equal to 2^32 * (1 / (clocks per usec) ).
 * Initialized in time_init.
 */
static unsigned long fast_gettimeoffset_quotient;

static unsigned long get_offset_tsc(void)
{
	register unsigned long eax, edx;

	/* Read the Time Stamp Counter */

	rdtsc(eax,edx);

	/* .. relative to previous jiffy (32 bits is enough) */
	eax -= last_tsc_low;	/* tsc_low delta */

	/*
         * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
         *             = (tsc_low delta) * (usecs_per_clock)
         *             = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
	 *
	 * Using a mull instead of a divl saves up to 31 clock cycles
	 * in the critical path.
         */

	__asm__("mull %2"
		:"=a" (eax), "=d" (edx)
		:"rm" (fast_gettimeoffset_quotient),
		 "0" (eax));

	/* our adjusted time offset in microseconds */
	return delay_at_last_interrupt + edx;
}

static unsigned long long monotonic_clock_tsc(void)
{
	unsigned long long last_offset, this_offset, base;
	unsigned seq;
	
	/* atomically read monotonic base & last_offset */
	do {
		seq = read_seqbegin(&monotonic_lock);
		last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
		base = monotonic_base;
	} while (read_seqretry(&monotonic_lock, seq));

	/* Read the Time Stamp Counter */
	rdtscll(this_offset);

	/* return the value in ns */
	return base + cycles_2_ns(this_offset - last_offset);
}

/*
 * Scheduler clock - returns current time in nanosec units.
 */
unsigned long long sched_clock(void)
{
	unsigned long long this_offset;

	/*
	 * In the NUMA case we dont use the TSC as they are not
	 * synchronized across all CPUs.
	 */
#ifndef CONFIG_NUMA
	if (!use_tsc)
#endif
		/* no locking but a rare wrong value is not a big deal */
		return jiffies_64 * (1000000000 / HZ);

	/* Read the Time Stamp Counter */
	rdtscll(this_offset);

	/* return the value in ns */
	return cycles_2_ns(this_offset);
}

static void delay_tsc(unsigned long loops)
{
	unsigned long bclock, now;
	
	rdtscl(bclock);
	do
	{
		rep_nop();
		rdtscl(now);
	} while ((now-bclock) < loops);
}

#ifdef CONFIG_HPET_TIMER
static void mark_offset_tsc_hpet(void)
{
	unsigned long long this_offset, last_offset;
 	unsigned long offset, temp, hpet_current;

	write_seqlock(&monotonic_lock);
	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
	/*
	 * It is important that these two operations happen almost at
	 * the same time. We do the RDTSC stuff first, since it's
	 * faster. To avoid any inconsistencies, we need interrupts
	 * disabled locally.
	 */
	/*
	 * Interrupts are just disabled locally since the timer irq
	 * has the SA_INTERRUPT flag set. -arca
	 */
	/* read Pentium cycle counter */

	hpet_current = hpet_readl(HPET_COUNTER);
	rdtsc(last_tsc_low, last_tsc_high);

	/* lost tick compensation */
	offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
	if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) {
		int lost_ticks = (offset - hpet_last) / hpet_tick;
		jiffies_64 += lost_ticks;
	}
	hpet_last = hpet_current;

	/* update the monotonic base value */
	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
	monotonic_base += cycles_2_ns(this_offset - last_offset);
	write_sequnlock(&monotonic_lock);

	/* calculate delay_at_last_interrupt */
	/*
	 * Time offset = (hpet delta) * ( usecs per HPET clock )
	 *             = (hpet delta) * ( usecs per tick / HPET clocks per tick)
	 *             = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
	 * Where,
	 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
	 */
	delay_at_last_interrupt = hpet_current - offset;
	ASM_MUL64_REG(temp, delay_at_last_interrupt,
			hpet_usec_quotient, delay_at_last_interrupt);
}
#endif


#ifdef CONFIG_CPU_FREQ
#include <linux/workqueue.h>

static unsigned int cpufreq_delayed_issched = 0;
static unsigned int cpufreq_init = 0;
static struct work_struct cpufreq_delayed_get_work;

static void handle_cpufreq_delayed_get(void *v)
{
	unsigned int cpu;
	for_each_online_cpu(cpu) {
		cpufreq_get(cpu);
	}
	cpufreq_delayed_issched = 0;
}

/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries
 * to verify the CPU frequency the timing core thinks the CPU is running
 * at is still correct.
 */
static inline void cpufreq_delayed_get(void) 
{
	if (cpufreq_init && !cpufreq_delayed_issched) {
		cpufreq_delayed_issched = 1;
		printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
		schedule_work(&cpufreq_delayed_get_work);
	}
}

/* If the CPU frequency is scaled, TSC-based delays will need a different
 * loops_per_jiffy value to function properly.
 */

static unsigned int  ref_freq = 0;
static unsigned long loops_per_jiffy_ref = 0;

#ifndef CONFIG_SMP
static unsigned long fast_gettimeoffset_ref = 0;
static unsigned int cpu_khz_ref = 0;
#endif

static int
time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
		       void *data)
{
	struct cpufreq_freqs *freq = data;

	if (val != CPUFREQ_RESUMECHANGE)
		write_seqlock_irq(&xtime_lock);
	if (!ref_freq) {
		ref_freq = freq->old;
		loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
#ifndef CONFIG_SMP
		fast_gettimeoffset_ref = fast_gettimeoffset_quotient;
		cpu_khz_ref = cpu_khz;
#endif
	}

	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
	    (val == CPUFREQ_RESUMECHANGE)) {
		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
			cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
#ifndef CONFIG_SMP
		if (cpu_khz)
			cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
		if (use_tsc) {
			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
				fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq);
				set_cyc2ns_scale(cpu_khz/1000);
			}
		}
#endif
	}

	if (val != CPUFREQ_RESUMECHANGE)
		write_sequnlock_irq(&xtime_lock);

	return 0;
}

static struct notifier_block time_cpufreq_notifier_block = {
	.notifier_call	= time_cpufreq_notifier
};


static int __init cpufreq_tsc(void)
{
	int ret;
	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
	ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
					CPUFREQ_TRANSITION_NOTIFIER);
	if (!ret)
		cpufreq_init = 1;
	return ret;
}
core_initcall(cpufreq_tsc);

#else /* CONFIG_CPU_FREQ */
static inline void cpufreq_delayed_get(void) { return; }
#endif 

int recalibrate_cpu_khz(void)
{
#ifndef CONFIG_SMP
	unsigned int cpu_khz_old = cpu_khz;

	if (cpu_has_tsc) {
		init_cpu_khz();
		cpu_data[0].loops_per_jiffy =
		    cpufreq_scale(cpu_data[0].loops_per_jiffy,
			          cpu_khz_old,
				  cpu_khz);
		return 0;
	} else
		return -ENODEV;
#else
	return -ENODEV;
#endif
}
EXPORT_SYMBOL(recalibrate_cpu_khz);

static void mark_offset_tsc(void)
{
	unsigned long lost,delay;
	unsigned long delta = last_tsc_low;
	int count;
	int countmp;
	static int count1 = 0;
	unsigned long long this_offset, last_offset;
	static int lost_count = 0;

	write_seqlock(&monotonic_lock);
	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
	/*
	 * It is important that these two operations happen almost at
	 * the same time. We do the RDTSC stuff first, since it's
	 * faster. To avoid any inconsistencies, we need interrupts
	 * disabled locally.
	 */

	/*
	 * Interrupts are just disabled locally since the timer irq
	 * has the SA_INTERRUPT flag set. -arca
	 */

	/* read Pentium cycle counter */

	rdtsc(last_tsc_low, last_tsc_high);

	spin_lock(&i8253_lock);
	outb_p(0x00, PIT_MODE);     /* latch the count ASAP */

	count = inb_p(PIT_CH0);    /* read the latched count */
	count |= inb(PIT_CH0) << 8;

	/*
	 * VIA686a test code... reset the latch if count > max + 1
	 * from timer_pit.c - cjb
	 */
	if (count > LATCH) {
		outb_p(0x34, PIT_MODE);
		outb_p(LATCH & 0xff, PIT_CH0);
		outb(LATCH >> 8, PIT_CH0);
		count = LATCH - 1;
	}

	spin_unlock(&i8253_lock);

	if (pit_latch_buggy) {
		/* get center value of last 3 time lutch */
		if ((count2 >= count && count >= count1)
		    || (count1 >= count && count >= count2)) {
			count2 = count1; count1 = count;
		} else if ((count1 >= count2 && count2 >= count)
			   || (count >= count2 && count2 >= count1)) {
			countmp = count;count = count2;
			count2 = count1;count1 = countmp;
		} else {
			count2 = count1; count1 = count; count = count1;
		}
	}

	/* lost tick compensation */
	delta = last_tsc_low - delta;
	{
		register unsigned long eax, edx;
		eax = delta;
		__asm__("mull %2"
		:"=a" (eax), "=d" (edx)
		:"rm" (fast_gettimeoffset_quotient),
		 "0" (eax));
		delta = edx;
	}
	delta += delay_at_last_interrupt;
	lost = delta/(1000000/HZ);
	delay = delta%(1000000/HZ);
	if (lost >= 2) {
		jiffies_64 += lost-1;

		/* sanity check to ensure we're not always losing ticks */
		if (lost_count++ > 100) {
			printk(KERN_WARNING "Losing too many ticks!\n");
			printk(KERN_WARNING "TSC cannot be used as a timesource.  \n");
			printk(KERN_WARNING "Possible reasons for this are:\n");
			printk(KERN_WARNING "  You're running with Speedstep,\n");
			printk(KERN_WARNING "  You don't have DMA enabled for your hard disk (see hdparm),\n");
			printk(KERN_WARNING "  Incorrect TSC synchronization on an SMP system (see dmesg).\n");
			printk(KERN_WARNING "Falling back to a sane timesource now.\n");

			clock_fallback();
		}
		/* ... but give the TSC a fair chance */
		if (lost_count > 25)
			cpufreq_delayed_get();
	} else
		lost_count = 0;
	/* update the monotonic base value */
	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
	monotonic_base += cycles_2_ns(this_offset - last_offset);
	write_sequnlock(&monotonic_lock);

	/* calculate delay_at_last_interrupt */
	count = ((LATCH-1) - count) * TICK_SIZE;
	delay_at_last_interrupt = (count + LATCH/2) / LATCH;

	/* catch corner case where tick rollover occured
	 * between tsc and pit reads (as noted when
	 * usec delta is > 90% # of usecs/tick)
	 */
	if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
		jiffies_64++;
}

static int __init init_tsc(char* override)
{

	/* check clock override */
	if (override[0] && strncmp(override,"tsc",3)) {
#ifdef CONFIG_HPET_TIMER
		if (is_hpet_enabled()) {
			printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n");
		} else
#endif
		{
			return -ENODEV;
		}
	}

	/*
	 * If we have APM enabled or the CPU clock speed is variable
	 * (CPU stops clock on HLT or slows clock to save power)
	 * then the TSC timestamps may diverge by up to 1 jiffy from
	 * 'real time' but nothing will break.
	 * The most frequent case is that the CPU is "woken" from a halt
	 * state by the timer interrupt itself, so we get 0 error. In the
	 * rare cases where a driver would "wake" the CPU and request a
	 * timestamp, the maximum error is < 1 jiffy. But timestamps are
	 * still perfectly ordered.
	 * Note that the TSC counter will be reset if APM suspends
	 * to disk; this won't break the kernel, though, 'cuz we're
	 * smart.  See arch/i386/kernel/apm.c.
	 */
 	/*
 	 *	Firstly we have to do a CPU check for chips with
 	 * 	a potentially buggy TSC. At this point we haven't run
 	 *	the ident/bugs checks so we must run this hook as it
 	 *	may turn off the TSC flag.
 	 *
 	 *	NOTE: this doesn't yet handle SMP 486 machines where only
 	 *	some CPU's have a TSC. Thats never worked and nobody has
 	 *	moaned if you have the only one in the world - you fix it!
 	 */

	count2 = LATCH; /* initialize counter for mark_offset_tsc() */

	if (cpu_has_tsc) {
		unsigned long tsc_quotient;
#ifdef CONFIG_HPET_TIMER
		if (is_hpet_enabled() && hpet_use_timer) {
			unsigned long result, remain;
			printk("Using TSC for gettimeofday\n");
			tsc_quotient = calibrate_tsc_hpet(NULL);
			timer_tsc.mark_offset = &mark_offset_tsc_hpet;
			/*
			 * Math to calculate hpet to usec multiplier
			 * Look for the comments at get_offset_tsc_hpet()
			 */
			ASM_DIV64_REG(result, remain, hpet_tick,
					0, KERNEL_TICK_USEC);
			if (remain > (hpet_tick >> 1))
				result++; /* rounding the result */

			hpet_usec_quotient = result;
		} else
#endif
		{
			tsc_quotient = calibrate_tsc();
		}

		if (tsc_quotient) {
			fast_gettimeoffset_quotient = tsc_quotient;
			use_tsc = 1;
			/*
			 *	We could be more selective here I suspect
			 *	and just enable this for the next intel chips ?
			 */
			/* report CPU clock rate in Hz.
			 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
			 * clock/second. Our precision is about 100 ppm.
			 */
			{	unsigned long eax=0, edx=1000;
				__asm__("divl %2"
		       		:"=a" (cpu_khz), "=d" (edx)
        	       		:"r" (tsc_quotient),
	                	"0" (eax), "1" (edx));
				printk("Detected %u.%03u MHz processor.\n",
					cpu_khz / 1000, cpu_khz % 1000);
			}
			set_cyc2ns_scale(cpu_khz/1000);
			return 0;
		}
	}
	return -ENODEV;
}

#ifndef CONFIG_X86_TSC
/* disable flag for tsc.  Takes effect by clearing the TSC cpu flag
 * in cpu/common.c */
static int __init tsc_setup(char *str)
{
	tsc_disable = 1;
	return 1;
}
#else
static int __init tsc_setup(char *str)
{
	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
				"cannot disable TSC.\n");
	return 1;
}
#endif
__setup("notsc", tsc_setup);


/************************************************************/

/* tsc timer_opts struct */
static struct timer_opts timer_tsc = {
	.name = "tsc",
	.mark_offset = mark_offset_tsc, 
	.get_offset = get_offset_tsc,
	.monotonic_clock = monotonic_clock_tsc,
	.delay = delay_tsc,
	.read_timer = read_timer_tsc,
};

struct init_timer_opts __initdata timer_tsc_init = {
	.init = init_tsc,
	.opts = &timer_tsc,
};