1 files changed, 53 insertions, 46 deletions
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index ca94fa649251..5dd363d54348 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
 }
 #endif
+#ifdef CONFIG_PARAVIRT_CLOCK
+extern u8 pvclock_page
+        __attribute__((visibility("hidden")));
+#endif
 #ifndef BUILD_VDSO32
 #include <linux/kernel.h>
@@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 #ifdef CONFIG_PARAVIRT_CLOCK
-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
 {
-        const struct pvclock_vsyscall_time_info *pvti_base;
+        return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
-        int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
-        int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
-        BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
-        pvti_base = (struct pvclock_vsyscall_time_info *)
-                    __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
-        return &pvti_base[offset];
 }
 static notrace cycle_t vread_pvclock(int *mode)
 {
-        const struct pvclock_vsyscall_time_info *pvti;
+        const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
        cycle_t ret;
-        u64 last;
+        u64 tsc, pvti_tsc;
-        u32 version;
+        u64 last, delta, pvti_system_time;
-        u8 flags;
+        u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
-        unsigned cpu, cpu1;
        /*
-         * Note: hypervisor must guarantee that:
+         * Note: The kernel and hypervisor must guarantee that cpu ID
-         * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+         * number maps 1:1 to per-CPU pvclock time info.
-         * 2. that per-CPU pvclock time info is updated if the
+         *
-         *    underlying CPU changes.
+         * Because the hypervisor is entirely unaware of guest userspace
-         * 3. that version is increased whenever underlying CPU
+         * preemption, it cannot guarantee that per-CPU pvclock time
-         *    changes.
+         * info is updated if the underlying CPU changes or that the
+         * version is increased whenever the underlying CPU changes.
         *
+         * On KVM, we are guaranteed that pvti updates for any vCPU are
+         * atomic as seen by *all* vCPUs.  This is an even stronger
+         * guarantee than we get with a normal seqlock.
+         *
+         * On Xen, we don't appear to have that guarantee, but Xen still
+         * supplies a valid seqlock using the version field.
+         * We only do pvclock vdso timing at all if
+         * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+         * mean that all vCPUs have matching pvti and that the TSC is
+         * synced, so we can just look at vCPU 0's pvti.
         */
-        do {
-                cpu = __getcpu() & VGETCPU_CPU_MASK;
+        if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
-                /* TODO: We can put vcpu id into higher bits of pvti.version.
-                 * This will save a couple of cycles by getting rid of
-                 * __getcpu() calls (Gleb).
-                 */
-                pvti = get_pvti(cpu);
-                version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
-                /*
-                 * Test we're still on the cpu as well as the version.
-                 * We could have been migrated just after the first
-                 * vgetcpu but before fetching the version, so we
-                 * wouldn't notice a version change.
-                 */
-                cpu1 = __getcpu() & VGETCPU_CPU_MASK;
-        } while (unlikely(cpu != cpu1 ||
-                          (pvti->pvti.version & 1) ||
-                          pvti->pvti.version != version));
-        if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
                *mode = VCLOCK_NONE;
+                return 0;
+        }
+        do {
+                version = pvti->version;
+                /* This is also a read barrier, so we'll read version first. */
+                tsc = rdtsc_ordered();
+                pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
+                pvti_tsc_shift = pvti->tsc_shift;
+                pvti_system_time = pvti->system_time;
+                pvti_tsc = pvti->tsc_timestamp;
+                /* Make sure that the version double-check is last. */
+                smp_rmb();
+        } while (unlikely((version & 1) || version != pvti->version));
+        delta = tsc - pvti_tsc;
+        ret = pvti_system_time +
+                pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
+                                    pvti_tsc_shift);
        /* refer to tsc.c read_tsc() comment for rationale */
        last = gtod->cycle_last;

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index ca94fa649251..5dd363d54348 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void)
36	}	36	}
37	#endif	37	#endif
38		38
		39	#ifdef CONFIG_PARAVIRT_CLOCK
		40	extern u8 pvclock_page
		41	__attribute__((visibility("hidden")));
		42	#endif
		43
39	#ifndef BUILD_VDSO32	44	#ifndef BUILD_VDSO32
40		45
41	#include <linux/kernel.h>	46	#include <linux/kernel.h>
@@ -62,63 +67,65 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
62		67
63	#ifdef CONFIG_PARAVIRT_CLOCK	68	#ifdef CONFIG_PARAVIRT_CLOCK
64		69
65	static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)	70	static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
66	{	71	{
67	const struct pvclock_vsyscall_time_info *pvti_base;	72	return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
68	int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
69	int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
70
71	BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
72
73	pvti_base = (struct pvclock_vsyscall_time_info *)
74	__fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
75
76	return &pvti_base[offset];
77	}	73	}
78		74
79	static notrace cycle_t vread_pvclock(int *mode)	75	static notrace cycle_t vread_pvclock(int *mode)
80	{	76	{
81	const struct pvclock_vsyscall_time_info *pvti;	77	const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
82	cycle_t ret;	78	cycle_t ret;
83	u64 last;	79	u64 tsc, pvti_tsc;
84	u32 version;	80	u64 last, delta, pvti_system_time;
85	u8 flags;	81	u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
86	unsigned cpu, cpu1;
87
88		82
89	/*	83	/*
90	* Note: hypervisor must guarantee that:	84	* Note: The kernel and hypervisor must guarantee that cpu ID
91	* 1. cpu ID number maps 1:1 to per-CPU pvclock time info.	85	* number maps 1:1 to per-CPU pvclock time info.
92	* 2. that per-CPU pvclock time info is updated if the	86	*
93	* underlying CPU changes.	87	* Because the hypervisor is entirely unaware of guest userspace
94	* 3. that version is increased whenever underlying CPU	88	* preemption, it cannot guarantee that per-CPU pvclock time
95	* changes.	89	* info is updated if the underlying CPU changes or that the
		90	* version is increased whenever the underlying CPU changes.
96	*	91	*
		92	* On KVM, we are guaranteed that pvti updates for any vCPU are
		93	* atomic as seen by *all* vCPUs.  This is an even stronger
		94	* guarantee than we get with a normal seqlock.
		95	*
		96	* On Xen, we don't appear to have that guarantee, but Xen still
		97	* supplies a valid seqlock using the version field.
		98
		99	* We only do pvclock vdso timing at all if
		100	* PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
		101	* mean that all vCPUs have matching pvti and that the TSC is
		102	* synced, so we can just look at vCPU 0's pvti.
97	*/	103	*/
98	do {	104
99	cpu = __getcpu() & VGETCPU_CPU_MASK;	105	if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
100	/* TODO: We can put vcpu id into higher bits of pvti.version.
101	* This will save a couple of cycles by getting rid of
102	* __getcpu() calls (Gleb).
103	*/
104
105	pvti = get_pvti(cpu);
106
107	version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
108
109	/*
110	* Test we're still on the cpu as well as the version.
111	* We could have been migrated just after the first
112	* vgetcpu but before fetching the version, so we
113	* wouldn't notice a version change.
114	*/
115	cpu1 = __getcpu() & VGETCPU_CPU_MASK;
116	} while (unlikely(cpu != cpu1 ||
117	(pvti->pvti.version & 1) ||
118	pvti->pvti.version != version));
119
120	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
121	*mode = VCLOCK_NONE;	106	*mode = VCLOCK_NONE;
		107	return 0;
		108	}
		109
		110	do {
		111	version = pvti->version;
		112
		113	/* This is also a read barrier, so we'll read version first. */
		114	tsc = rdtsc_ordered();
		115
		116	pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
		117	pvti_tsc_shift = pvti->tsc_shift;
		118	pvti_system_time = pvti->system_time;
		119	pvti_tsc = pvti->tsc_timestamp;
		120
		121	/* Make sure that the version double-check is last. */
		122	smp_rmb();
		123	} while (unlikely((version & 1) || version != pvti->version));
		124
		125	delta = tsc - pvti_tsc;
		126	ret = pvti_system_time +
		127	pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
		128	pvti_tsc_shift);
122		129
123	/* refer to tsc.c read_tsc() comment for rationale */	130	/* refer to tsc.c read_tsc() comment for rationale */
124	last = gtod->cycle_last;	131	last = gtod->cycle_last;