diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml
index a5eb3aa25e02..7e0266cf58b7 100644
--- a/doc/src/sgml/ref/pgtesttiming.sgml
+++ b/doc/src/sgml/ref/pgtesttiming.sgml
@@ -93,28 +93,34 @@ PostgreSQL documentation
Good results will show most (>90%) individual timing calls take less than
- one microsecond. Average per loop overhead will be even lower, below 100
- nanoseconds. This example from an Intel i7-860 system using a TSC clock
- source shows excellent performance:
+ one microsecond (1000 nanoseconds). Average per loop overhead will be even
+ lower, below 100 nanoseconds. This example from an Intel i9-9880H system
+ using a TSC clock source shows excellent performance:
- Note that different units are used for the per loop time than the
- histogram. The loop can have resolution within a few nanoseconds (ns),
- while the individual timing calls can only resolve down to one microsecond
- (us).
+ Note that the accuracy of the histogram entries may be lower than the
+ per loop time.
@@ -125,24 +131,25 @@ Histogram of timing durations:
When the query executor is running a statement using
EXPLAIN ANALYZE, individual operations are timed as well
as showing a summary. The overhead of your system can be checked by
- counting rows with the psql program:
+ disabling the per-row timing, using the TIMING OFF
+ option:
-CREATE TABLE t AS SELECT * FROM generate_series(1,100000);
-\timing
-SELECT COUNT(*) FROM t;
-EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
+CREATE TABLE t AS SELECT * FROM generate_series(1, 100000);
+EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t;
+EXPLAIN (ANALYZE, TIMING ON) SELECT COUNT(*) FROM t;
- The i7-860 system measured runs the count query in 9.8 ms while
- the EXPLAIN ANALYZE version takes 16.6 ms, each
- processing just over 100,000 rows. That 6.8 ms difference means the timing
- overhead per row is 68 ns, about twice what pg_test_timing estimated it
- would be. Even that relatively small amount of overhead is making the fully
- timed count statement take almost 70% longer. On more substantial queries,
- the timing overhead would be less problematic.
+ The i9-9880H system measured shows an execution time of 4.116 ms for the
+ TIMING OFF query, and 6.965 ms for the
+ TIMING ON, each processing 100,000 rows.
+
+ That 2.849 ms difference means the timing overhead per row is 28 ns. As
+ TIMING ON measures timestamps twice per row returned by
+ an executor node, the overhead is very close to what pg_test_timing
+ estimated it would be.
@@ -157,28 +164,31 @@ EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
/sys/devices/system/clocksource/clocksource0/current_clocksource
# pg_test_timing
-Per loop time including overhead: 722.92 ns
+Testing timing overhead for 3 seconds.
+Per loop time including overhead: 708.58 ns
Histogram of timing durations:
- < us % of total count
- 1 27.84870 1155682
- 2 72.05956 2990371
- 4 0.07810 3241
- 8 0.01357 563
- 16 0.00007 3
+ < ns % of total count
+ 1024 99.79796 4225270
+ 2048 0.15560 6588
+ 4096 0.00035 15
+ 8192 0.01738 736
+ 16384 0.01679 711
+ 32768 0.01190 504
]]>
In this configuration, the sample EXPLAIN ANALYZE above
- takes 115.9 ms. That's 1061 ns of timing overhead, again a small multiple
- of what's measured directly by this utility. That much timing overhead
- means the actual query itself is only taking a tiny fraction of the
- accounted for time, most of it is being consumed in overhead instead. In
- this configuration, any EXPLAIN ANALYZE totals involving
- many timed operations would be inflated significantly by timing overhead.
+ shows an execution time of 148.7 ms. That's 1392 ns of per-row timing
+ overhead. Taking the two timestamps per row into account, that's again
+ close to what pg_test_timing estimated. That much timing overhead means
+ the actual query itself is only taking a tiny fraction of the accounted for
+ time, most of it is being consumed in overhead instead. In this
+ configuration, any EXPLAIN ANALYZE totals involving many
+ timed operations would be inflated significantly by timing overhead.
@@ -196,33 +206,6 @@ kern.timecounter.hardware: ACPI-fast -> TSC
-
- Other systems may only allow setting the time source on boot. On older
- Linux systems the "clock" kernel setting is the only way to make this sort
- of change. And even on some more recent ones, the only option you'll see
- for a clock source is "jiffies". Jiffies are the older Linux software clock
- implementation, which can have good resolution when it's backed by fast
- enough timing hardware, as in this example:
-
-
-
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 3b91d02605a0..a33308119519 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3363,8 +3363,7 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
INSTR_TIME_SET_CURRENT(currenttime);
elapsed = currenttime;
INSTR_TIME_SUBTRACT(elapsed, starttime);
- if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
- >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
+ if (INSTR_TIME_GET_MILLISEC(elapsed) >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
{
if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
{
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 56e635f47000..01f67c5d9725 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options)
void
InstrStartNode(Instrumentation *instr)
{
- if (instr->need_timer &&
- !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
- elog(ERROR, "InstrStartNode called twice in a row");
+ if (instr->need_timer)
+ {
+ if (!INSTR_TIME_IS_ZERO(instr->starttime))
+ elog(ERROR, "InstrStartNode called twice in a row");
+ else
+ INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
+ }
/* save buffer usage totals at node entry, if needed */
if (instr->need_bufusage)
@@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
if (INSTR_TIME_IS_ZERO(instr->starttime))
elog(ERROR, "InstrStopNode called without start");
- INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_SET_CURRENT_FAST(endtime);
INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
INSTR_TIME_SET_ZERO(instr->starttime);
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 318600d6d02e..2e429b30a277 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -786,6 +786,9 @@ InitPostgres(const char *in_dbname, Oid dboid,
/* Initialize portal manager */
EnablePortalManager();
+ /* initialize high-precision interval timing */
+ INSTR_TIME_INITIALIZE();
+
/* Initialize status reporting */
pgstat_beinit();
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index ce7aad4b25a4..62a15465f7c0 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -19,8 +19,8 @@ static void handle_args(int argc, char *argv[]);
static uint64 test_timing(unsigned int duration);
static void output(uint64 loop_count);
-/* record duration in powers of 2 microseconds */
-static long long int histogram[32];
+/* record duration in powers of 2 nanoseconds */
+static uint64 histogram[64];
int
main(int argc, char *argv[])
@@ -121,35 +121,49 @@ handle_args(int argc, char *argv[])
static uint64
test_timing(unsigned int duration)
{
- uint64 total_time;
- int64 time_elapsed = 0;
uint64 loop_count = 0;
- uint64 prev,
- cur;
+ instr_time until_time,
+ total_time;
instr_time start_time,
- end_time,
- temp;
+ end_time;
+ instr_time cur;
- total_time = duration > 0 ? duration * INT64CONST(1000000) : 0;
+ INSTR_TIME_INITIALIZE();
+ INSTR_TIME_SET_CURRENT_FAST(start_time);
- INSTR_TIME_SET_CURRENT(start_time);
- cur = INSTR_TIME_GET_MICROSEC(start_time);
+ /*
+ * To reduce loop overhead, check loop condition in instr_time domain.
+ */
+ INSTR_TIME_SET_SECONDS(total_time, duration);
+ until_time = start_time;
+ INSTR_TIME_ADD(until_time, total_time);
- while (time_elapsed < total_time)
+ cur = start_time;
+
+ while (INSTR_TIME_IS_LT(cur, until_time))
{
- int32 diff,
- bits = 0;
+ instr_time temp;
+ instr_time prev;
+ int64 diff;
+ int32 bits = 0;
prev = cur;
- INSTR_TIME_SET_CURRENT(temp);
- cur = INSTR_TIME_GET_MICROSEC(temp);
- diff = cur - prev;
+ INSTR_TIME_SET_CURRENT_FAST(cur);
+ temp = cur;
+ INSTR_TIME_SUBTRACT(temp, prev);
+ diff = INSTR_TIME_GET_NANOSEC(temp);
/* Did time go backwards? */
- if (diff < 0)
+ if (unlikely(diff <= 0))
{
+ /* can't do anything with that measurement */
+ if (diff == 0)
+ {
+ loop_count++;
+ continue;
+ }
fprintf(stderr, _("Detected clock going backwards in time.\n"));
- fprintf(stderr, _("Time warp: %d ms\n"), diff);
+ fprintf(stderr, _("Time warp: %lld ns\n"), (long long) diff);
exit(1);
}
@@ -164,16 +178,14 @@ test_timing(unsigned int duration)
histogram[bits]++;
loop_count++;
- INSTR_TIME_SUBTRACT(temp, start_time);
- time_elapsed = INSTR_TIME_GET_MICROSEC(temp);
}
- INSTR_TIME_SET_CURRENT(end_time);
+ INSTR_TIME_SET_CURRENT_FAST(end_time);
INSTR_TIME_SUBTRACT(end_time, start_time);
printf(_("Per loop time including overhead: %0.2f ns\n"),
- INSTR_TIME_GET_DOUBLE(end_time) * 1e9 / loop_count);
+ (INSTR_TIME_GET_DOUBLE(end_time) * NS_PER_S) / loop_count);
return loop_count;
}
@@ -181,9 +193,10 @@ test_timing(unsigned int duration)
static void
output(uint64 loop_count)
{
- int64 max_bit = 31,
+ int64 low_bit = 0,
+ max_bit = 63,
i;
- char *header1 = _("< us");
+ char *header1 = _("< ns");
char *header2 = /* xgettext:no-c-format */ _("% of total");
char *header3 = _("count");
int len1 = strlen(header1);
@@ -194,15 +207,19 @@ output(uint64 loop_count)
while (max_bit > 0 && histogram[max_bit] == 0)
max_bit--;
+ /* find lowest bit value */
+ while (low_bit < max_bit && histogram[low_bit] == 0)
+ low_bit++;
+
printf(_("Histogram of timing durations:\n"));
printf("%*s %*s %*s\n",
- Max(6, len1), header1,
+ Max(9, len1), header1,
Max(10, len2), header2,
Max(10, len3), header3);
- for (i = 0; i <= max_bit; i++)
- printf("%*ld %*.5f %*lld\n",
- Max(6, len1), 1l << i,
+ for (i = low_bit; i <= max_bit; i++)
+		printf("%*llu %*.5f %*llu\n",
+			   Max(9, len1), (long long unsigned) (UINT64CONST(1) << i),
Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count,
- Max(10, len3), histogram[i]);
+ Max(10, len3), (long long unsigned) histogram[i]);
}
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index bf099aab2787..716f90578f0e 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -7265,6 +7265,9 @@ main(int argc, char **argv)
initRandomState(&state[i].cs_func_rs);
}
+ /* initialize high-precision interval timing */
+ INSTR_TIME_INITIALIZE();
+
/* opening connection... */
con = doConnect();
if (con == NULL)
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 5018eedf1e57..789a2aa67635 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
#include "help.h"
#include "input.h"
#include "mainloop.h"
+#include "portability/instr_time.h"
#include "settings.h"
/*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
+ /* initialize high-precision interval timing */
+ INSTR_TIME_INITIALIZE();
+
SyncVariables();
if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index 1e2b91c83c4c..194e45e7ae8d 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
file_perm.o \
file_utils.o \
hashfn.o \
+ instr_time.o \
ip.o \
jsonapi.o \
keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 000000000000..9b18e6e3588d
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,170 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ * Non-inline parts of the portable high-precision interval timing
+ * implementation
+ *
+ * Portions Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "portability/instr_time.h"
+
+#ifndef WIN32
+/*
+ * Stores what the number of cycles needs to be multiplied with to end up
+ * with nanoseconds using integer math. See comment in pg_initialize_rdtsc()
+ * for more details.
+ *
+ * By default assume we are using clock_gettime() as a fallback which uses
+ * nanoseconds as ticks. Hence, we set the multiplier to the precision scalar
+ * so that the division in INSTR_TIME_GET_NANOSEC() won't change the nanoseconds.
+ *
+ * When using the RDTSC instruction directly this is filled in during initialization
+ * based on the relevant CPUID fields.
+ */
+int64 ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;
+int64 ticks_per_sec = NS_PER_S;
+int64 max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
+
+#if defined(__x86_64__) && defined(__linux__)
+/*
+ * Indicates if RDTSC can be used (Linux/x86 only, when OS uses TSC clocksource)
+ */
+bool has_rdtsc = false;
+
+/*
+ * Indicates if RDTSCP can be used. True if RDTSC can be used and RDTSCP is available.
+ */
+bool has_rdtscp = false;
+
+#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */
+#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d) /* KVMKVMKVM */
+
+static bool
+get_tsc_frequency_khz(uint32 *tsc_freq)
+{
+ uint32 r[4];
+
+ if (__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]) && r[2] > 0)
+ {
+ if (r[0] == 0 || r[1] == 0)
+ return false;
+
+ *tsc_freq = r[2] / 1000 * r[1] / r[0];
+ return true;
+ }
+
+ /* Some CPUs only report frequency in 16H */
+ if (__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3]))
+ {
+ *tsc_freq = r[0] * 1000;
+ return true;
+ }
+
+ /*
+ * Check if we have a KVM or VMware Hypervisor passing down TSC frequency
+ * to us in a guest VM
+ *
+ * Note that accessing the 0x40000000 leaf for Hypervisor info requires
+ * use of __cpuidex to set ECX to 0.
+ *
+ * TODO: We need to check whether our compiler is new enough
+ * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95973)
+ */
+ __cpuidex((int32 *) r, 0x40000000, 0);
+ if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r)))
+ {
+ __cpuidex((int32 *) r, 0x40000010, 0);
+ if (r[0] > 0)
+ {
+ *tsc_freq = r[0];
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool
+is_rdtscp_available(void)
+{
+ uint32 r[4];
+
+ return __get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3]) > 0 && (r[3] & (1 << 27)) != 0;
+}
+
+/*
+ * Decide whether we use the RDTSC instruction at runtime, for Linux/x86,
+ * instead of incurring the overhead of a full clock_gettime() call.
+ *
+ * This can't be reliably determined at compile time, since the
+ * availability of an "invariant" TSC (that is not affected by CPU
+ * frequency changes) is dependent on the CPU architecture. Additionally,
+ * there are cases where TSC availability is impacted by virtualization,
+ * where a simple cpuid feature check would not be enough.
+ *
+ * Since Linux already does a significant amount of work to determine
+ * whether TSC is a viable clock source, decide based on that.
+ */
+void
+pg_initialize_rdtsc(void)
+{
+ FILE *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
+
+ if (fp)
+ {
+ char buf[128];
+
+ if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0)
+ {
+ /*
+			 * Compute baseline CPU performance, determines speed at which
+ * RDTSC advances.
+ */
+ uint32 tsc_freq;
+
+ if (get_tsc_frequency_khz(&tsc_freq))
+ {
+ /*
+ * Ticks to nanoseconds conversion requires floating point
+				 * math because:
+ *
+ * sec = ticks / frequency_hz ns = ticks / frequency_hz *
+ * 1,000,000,000 ns = ticks * (1,000,000,000 / frequency_hz)
+ * ns = ticks * (1,000,000 / frequency_khz) <-- now in
+ * kilohertz
+ *
+ * Here, 'ns' is usually a floating number. For example for a
+ * 2.5 GHz CPU the scaling factor becomes 1,000,000 /
+ * 2,500,000 = 1.2.
+ *
+ * To be able to use integer math we work around the lack of
+ * precision. We first scale the integer up and after the
+ * multiplication by the number of ticks in
+ * INSTR_TIME_GET_NANOSEC() we divide again by the same value.
+ * We picked the scaler such that it provides enough precision
+ * and is a power-of-two which allows for shifting instead of
+ * doing an integer division.
+ */
+ ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq;
+ ticks_per_sec = tsc_freq * 1000; /* KHz->Hz */
+ max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+
+ has_rdtsc = true;
+ has_rdtscp = is_rdtscp_available();
+ }
+ }
+
+ fclose(fp);
+ }
+}
+#endif /* defined(__x86_64__) && defined(__linux__) */
+
+#endif /* WIN32 */
diff --git a/src/common/meson.build b/src/common/meson.build
index 1540ba67cca4..62b90b3e609c 100644
--- a/src/common/meson.build
+++ b/src/common/meson.build
@@ -13,6 +13,7 @@ common_sources = files(
'file_perm.c',
'file_utils.c',
'hashfn.c',
+ 'instr_time.c',
'ip.c',
'jsonapi.c',
'keywords.c',
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index f71a851b18d8..fd8ca1220012 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -4,9 +4,11 @@
* portable high-precision interval timing
*
* This file provides an abstraction layer to hide portability issues in
- * interval timing. On Unix we use clock_gettime(), and on Windows we use
- * QueryPerformanceCounter(). These macros also give some breathing room to
- * use other high-precision-timing APIs.
+ * interval timing. On Linux/x86 we use the rdtsc instruction when a TSC
+ * clocksource is also used on the host OS. Otherwise, and on other
+ * Unix-like systems we use clock_gettime() and on Windows we use
+ * QueryPerformanceCounter(). These macros also give some breathing
+ * room to use other high-precision-timing APIs.
*
* The basic data type is instr_time, which all callers should treat as an
* opaque typedef. instr_time can store either an absolute time (of
@@ -15,12 +17,17 @@
*
* INSTR_TIME_IS_ZERO(t) is t equal to zero?
*
+ * INSTR_TIME_IS_LT(x, y) x < y
+ *
* INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too)
*
- * INSTR_TIME_SET_CURRENT(t) set t to current time
+ * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting
+ * for instructions in out-of-order window
+ *
+ * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for
+ * instructions in OOO to retire
*
- * INSTR_TIME_SET_CURRENT_LAZY(t) set t to current time if t is zero,
- * evaluates to whether t changed
+ * INSTR_TIME_SET_SECONDS(t, s) set t to s seconds
*
* INSTR_TIME_ADD(x, y) x += y
*
@@ -80,7 +87,15 @@ typedef struct instr_time
#ifndef WIN32
+/*
+ * Make sure this is a power-of-two, so that the compiler can turn the
+ * multiplications and divisions into shifts.
+ */
+#define TICKS_TO_NS_PRECISION (1<<14)
+extern int64 ticks_per_ns_scaled;
+extern int64 ticks_per_sec;
+extern int64 max_ticks_no_overflow;
/* Use clock_gettime() */
@@ -106,32 +121,124 @@ typedef struct instr_time
#define PG_INSTR_CLOCK CLOCK_REALTIME
#endif
-/* helper for INSTR_TIME_SET_CURRENT */
+#if defined(__x86_64__) && defined(__linux__)
+#include <cpuid.h>
+#include <x86intrin.h>
+
+extern bool has_rdtsc;
+extern bool has_rdtscp;
+
+extern void pg_initialize_rdtsc(void);
+#endif
+
static inline instr_time
-pg_clock_gettime_ns(void)
+pg_clock_gettime(void)
{
instr_time now;
struct timespec tmp;
clock_gettime(PG_INSTR_CLOCK, &tmp);
now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec;
-
return now;
}
+static inline instr_time
+pg_get_ticks_fast(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+ if (has_rdtsc)
+ {
+ instr_time now;
+
+ now.ticks = __rdtsc();
+ return now;
+ }
+#endif
+
+ return pg_clock_gettime();
+}
+
+static inline instr_time
+pg_get_ticks(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+ if (has_rdtscp)
+ {
+ instr_time now;
+ uint32 unused;
+
+ now.ticks = __rdtscp(&unused);
+ return now;
+ }
+#endif
+
+ return pg_clock_gettime();
+}
+
+static inline int64
+pg_ticks_to_ns(instr_time t)
+{
+ /*
+ * Would multiplication overflow? If so perform computation in two parts.
+ * Check overflow without actually overflowing via: a * b > max <=> a >
+ * max / b
+ */
+ int64 ns = 0;
+
+ if (unlikely(t.ticks > max_ticks_no_overflow))
+ {
+ /*
+ * Compute how often the maximum number of ticks fits completely into
+ * the number of elapsed ticks and convert that number into
+ * nanoseconds. Then multiply by the count to arrive at the final
+ * value. In a 2nd step we adjust the number of elapsed ticks and
+ * convert the remaining ticks.
+ */
+ int64 count = t.ticks / max_ticks_no_overflow;
+ int64 max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+
+ ns = max_ns * count;
+
+ /*
+ * Subtract the ticks that we now already accounted for, so that they
+ * don't get counted twice.
+ */
+ t.ticks -= count * max_ticks_no_overflow;
+ Assert(t.ticks >= 0);
+ }
+
+ ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+ return ns;
+}
+
+static inline void
+pg_initialize_get_ticks(void)
+{
+#if defined(__x86_64__) && defined(__linux__)
+ pg_initialize_rdtsc();
+#endif
+}
+
+#define INSTR_TIME_INITIALIZE() \
+ pg_initialize_get_ticks()
+
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+ ((t) = pg_get_ticks_fast())
+
#define INSTR_TIME_SET_CURRENT(t) \
- ((t) = pg_clock_gettime_ns())
+ ((t) = pg_get_ticks())
-#define INSTR_TIME_GET_NANOSEC(t) \
- ((int64) (t).ticks)
+#define INSTR_TIME_SET_SECONDS(t, s) \
+ ((t).ticks = (s) * ticks_per_sec)
+#define INSTR_TIME_GET_NANOSEC(t) \
+ pg_ticks_to_ns(t)
#else /* WIN32 */
/* Use QueryPerformanceCounter() */
-/* helper for INSTR_TIME_SET_CURRENT */
static inline instr_time
pg_query_performance_counter(void)
{
@@ -153,9 +260,17 @@ GetTimerFrequency(void)
return (double) f.QuadPart;
}
+#define INSTR_TIME_INITIALIZE()
+
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+ ((t) = pg_query_performance_counter())
+
#define INSTR_TIME_SET_CURRENT(t) \
((t) = pg_query_performance_counter())
+#define INSTR_TIME_SET_SECONDS(t, s) \
+	((t).ticks = (s) * GetTimerFrequency())
+
#define INSTR_TIME_GET_NANOSEC(t) \
((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
@@ -168,13 +283,10 @@ GetTimerFrequency(void)
#define INSTR_TIME_IS_ZERO(t) ((t).ticks == 0)
+#define INSTR_TIME_IS_LT(x, y) ((x).ticks < (y).ticks)
#define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0)
-#define INSTR_TIME_SET_CURRENT_LAZY(t) \
- (INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
-
-
#define INSTR_TIME_ADD(x,y) \
((x).ticks += (y).ticks)
@@ -184,7 +296,6 @@ GetTimerFrequency(void)
#define INSTR_TIME_ACCUM_DIFF(x,y,z) \
((x).ticks += (y).ticks - (z).ticks)
-
#define INSTR_TIME_GET_DOUBLE(t) \
((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)