From c85a5afb367a683f6c2c566a3f00ac6bf722192f Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Fri, 20 Jan 2023 15:31:54 -0800
Subject: [PATCH 1/3] instr_time: Add INSTR_TIME_SET_SECONDS(),
 INSTR_TIME_IS_LT()

INSTR_TIME_SET_SECONDS() is useful to calculate the end of a time-bound
loop without having to convert into time units (which is costly).
INSTR_TIME_IS_LT() can be used to check the loop condition.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/portability/instr_time.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index f71a851b18d8..6ad411c68667 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -15,6 +15,8 @@
  *
  * INSTR_TIME_IS_ZERO(t)            is t equal to zero?
  *
+ * INSTR_TIME_IS_LT(x, y)           x < y
+ *
  * INSTR_TIME_SET_ZERO(t)           set t to zero (memset is acceptable too)
  *
  * INSTR_TIME_SET_CURRENT(t)        set t to current time
@@ -22,6 +24,8 @@
  * INSTR_TIME_SET_CURRENT_LAZY(t)   set t to current time if t is zero,
  *                                  evaluates to whether t changed
  *
+ * INSTR_TIME_SET_SECONDS(t, s)     set t to s seconds
+ *
  * INSTR_TIME_ADD(x, y)             x += y
  *
  * INSTR_TIME_SUBTRACT(x, y)        x -= y
@@ -122,6 +126,9 @@ pg_clock_gettime_ns(void)
 #define INSTR_TIME_SET_CURRENT(t) \
     ((t) = pg_clock_gettime_ns())
 
+#define INSTR_TIME_SET_SECONDS(t, s) \
+    ((t).ticks = NS_PER_S * (s))
+
 #define INSTR_TIME_GET_NANOSEC(t) \
     ((int64) (t).ticks)
 
@@ -156,6 +163,9 @@ GetTimerFrequency(void)
 #define INSTR_TIME_SET_CURRENT(t) \
     ((t) = pg_query_performance_counter())
 
+#define INSTR_TIME_SET_SECONDS(t, s) \
+    ((t).ticks = (s) * GetTimerFrequency())
+
 #define INSTR_TIME_GET_NANOSEC(t) \
     ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
 
@@ -168,6 +178,8 @@ GetTimerFrequency(void)
 
 #define INSTR_TIME_IS_ZERO(t)   ((t).ticks == 0)
 
+#define INSTR_TIME_IS_LT(x, y)  ((x).ticks < (y).ticks)
+
 #define INSTR_TIME_SET_ZERO(t)  ((t).ticks = 0)

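For readers, a minimal usage sketch of the two new macros (not part of the
patch itself; "duration" stands for any caller-supplied number of seconds).
It mirrors the time-bound loop that patch 2 below builds on them:

    instr_time  start_time,
                total_time,
                until_time,
                cur;

    INSTR_TIME_SET_CURRENT(start_time);

    /* compute the loop's end point once, staying in the tick domain */
    INSTR_TIME_SET_SECONDS(total_time, duration);
    until_time = start_time;
    INSTR_TIME_ADD(until_time, total_time);

    cur = start_time;
    while (INSTR_TIME_IS_LT(cur, until_time))
    {
        INSTR_TIME_SET_CURRENT(cur);
        /* ... work being timed ... */
    }

No conversion to microseconds or nanoseconds happens inside the loop; the
comparison is done directly on ticks, which is the point of the two macros.
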
From 4eb5f44613ed86c7a088b3affa2569ca36fd5fa4 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Mon, 16 Jan 2023 11:19:11 -0800
Subject: [PATCH 2/3] wip: report nanoseconds in pg_test_timing

This commit also updates pg_test_timing's documentation:

- compare EXPLAIN (ANALYZE, TIMING ON/OFF) instead of comparing the
  performance of a statement with/without EXPLAIN ANALYZE
- explain the 2x overhead (due to two timestamp acquisitions per row)
- remove old section about old versions of Linux - I couldn't update the
  numbers, and it's old enough nobody would care
---
 doc/src/sgml/ref/pgtesttiming.sgml      | 117 ++++++++++-------------
 src/bin/pg_test_timing/pg_test_timing.c |  74 +++++++++------
 2 files changed, 95 insertions(+), 96 deletions(-)

diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml
index a5eb3aa25e02..7e0266cf58b7 100644
--- a/doc/src/sgml/ref/pgtesttiming.sgml
+++ b/doc/src/sgml/ref/pgtesttiming.sgml
@@ -93,28 +93,34 @@ PostgreSQL documentation

   Good results will show most (>90%) individual timing calls take less than
-  one microsecond. Average per loop overhead will be even lower, below 100
-  nanoseconds. This example from an Intel i7-860 system using a TSC clock
-  source shows excellent performance:
+  one microsecond (1000 nanoseconds). Average per loop overhead will be even
+  lower, below 100 nanoseconds. This example from an Intel i9-9880H system
+  using a TSC clock source shows excellent performance:

-  Note that different units are used for the per loop time than the
-  histogram. The loop can have resolution within a few nanoseconds (ns),
-  while the individual timing calls can only resolve down to one microsecond
-  (us).
+  Note that the accuracy of the histogram entries may be lower than the
+  per loop time.

@@ -125,24 +131,25 @@ Histogram of timing durations:
   When the query executor is running a statement using
   EXPLAIN ANALYZE, individual operations are timed as well
   as showing a summary. The overhead of your system can be checked by
-  counting rows with the psql program:
+  disabling the per-row timing, using the TIMING OFF
+  option:

-CREATE TABLE t AS SELECT * FROM generate_series(1,100000);
-\timing
-SELECT COUNT(*) FROM t;
-EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
+CREATE TABLE t AS SELECT * FROM generate_series(1, 100000);
+EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t;
+EXPLAIN (ANALYZE, TIMING ON) SELECT COUNT(*) FROM t;

-  The i7-860 system measured runs the count query in 9.8 ms while
-  the EXPLAIN ANALYZE version takes 16.6 ms, each
-  processing just over 100,000 rows. That 6.8 ms difference means the timing
-  overhead per row is 68 ns, about twice what pg_test_timing estimated it
-  would be. Even that relatively small amount of overhead is making the fully
-  timed count statement take almost 70% longer. On more substantial queries,
-  the timing overhead would be less problematic.
+  The i9-9880H system measured above shows an execution time of 4.116 ms for
+  the TIMING OFF query, and 6.965 ms for the
+  TIMING ON query, each processing 100,000 rows.
+
+  That 2.849 ms difference means the timing overhead per row is 28 ns. As
+  TIMING ON measures timestamps twice per row returned by
+  an executor node, the overhead is very close to what pg_test_timing
+  estimated it would be.

@@ -157,28 +164,31 @@ EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
 /sys/devices/system/clocksource/clocksource0/current_clocksource
 # pg_test_timing
-Per loop time including overhead: 722.92 ns
+Testing timing overhead for 3 seconds.
+Per loop time including overhead: 708.58 ns
 Histogram of timing durations:
-   < us   % of total      count
-      1     27.84870    1155682
-      2     72.05956    2990371
-      4      0.07810       3241
-      8      0.01357        563
-     16      0.00007          3
+     < ns   % of total      count
+     1024     99.79796    4225270
+     2048      0.15560       6588
+     4096      0.00035         15
+     8192      0.01738        736
+    16384      0.01679        711
+    32768      0.01190        504
]]>

   In this configuration, the sample EXPLAIN ANALYZE above
-  takes 115.9 ms. That's 1061 ns of timing overhead, again a small multiple
-  of what's measured directly by this utility. That much timing overhead
-  means the actual query itself is only taking a tiny fraction of the
-  accounted for time, most of it is being consumed in overhead instead. In
-  this configuration, any EXPLAIN ANALYZE totals involving
-  many timed operations would be inflated significantly by timing overhead.
+  shows an execution time of 148.7 ms. That's 1392 ns of per-row timing
+  overhead. Taking the two timestamps per row into account, that's again
+  close to what pg_test_timing estimated. That much timing overhead means
+  the actual query itself is only taking a tiny fraction of the accounted-for
+  time; most of it is being consumed by overhead instead. In this
+  configuration, any EXPLAIN ANALYZE totals involving many
+  timed operations would be inflated significantly by timing overhead.

@@ -196,33 +206,6 @@ kern.timecounter.hardware: ACPI-fast -> TSC

-
-  Other systems may only allow setting the time source on boot. On older
-  Linux systems the "clock" kernel setting is the only way to make this sort
-  of change.
-  And even on some more recent ones, the only option you'll see
-  for a clock source is "jiffies". Jiffies are the older Linux software clock
-  implementation, which can have good resolution when it's backed by fast
-  enough timing hardware, as in this example:
-

diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index ce7aad4b25a4..29b0db6d6190 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -19,8 +19,8 @@ static void handle_args(int argc, char *argv[]);
 static uint64 test_timing(unsigned int duration);
 static void output(uint64 loop_count);
 
-/* record duration in powers of 2 microseconds */
-static long long int histogram[32];
+/* record duration in powers of 2 nanoseconds */
+static uint64 histogram[64];
 
 int
 main(int argc, char *argv[])
@@ -121,35 +121,48 @@ handle_args(int argc, char *argv[])
 static uint64
 test_timing(unsigned int duration)
 {
-    uint64      total_time;
-    int64       time_elapsed = 0;
     uint64      loop_count = 0;
-    uint64      prev,
-                cur;
+    instr_time  until_time,
+                total_time;
     instr_time  start_time,
-                end_time,
-                temp;
-
-    total_time = duration > 0 ? duration * INT64CONST(1000000) : 0;
+                end_time;
+    instr_time  cur;
 
     INSTR_TIME_SET_CURRENT(start_time);
-    cur = INSTR_TIME_GET_MICROSEC(start_time);
 
-    while (time_elapsed < total_time)
+    /*
+     * To reduce loop overhead, check loop condition in instr_time domain.
+     */
+    INSTR_TIME_SET_SECONDS(total_time, duration);
+    until_time = start_time;
+    INSTR_TIME_ADD(until_time, total_time);
+
+    cur = start_time;
+
+    while (INSTR_TIME_IS_LT(cur, until_time))
     {
-        int32       diff,
-                    bits = 0;
+        instr_time  temp;
+        instr_time  prev;
+        int64       diff;
+        int32       bits = 0;
 
         prev = cur;
-        INSTR_TIME_SET_CURRENT(temp);
-        cur = INSTR_TIME_GET_MICROSEC(temp);
-        diff = cur - prev;
+        INSTR_TIME_SET_CURRENT(cur);
+        temp = cur;
+        INSTR_TIME_SUBTRACT(temp, prev);
+        diff = INSTR_TIME_GET_NANOSEC(temp);
 
         /* Did time go backwards? */
-        if (diff < 0)
+        if (unlikely(diff <= 0))
         {
+            /* can't do anything with that measurement */
+            if (diff == 0)
+            {
+                loop_count++;
+                continue;
+            }
             fprintf(stderr, _("Detected clock going backwards in time.\n"));
-            fprintf(stderr, _("Time warp: %d ms\n"), diff);
+            fprintf(stderr, _("Time warp: %lld ns\n"), (long long) diff);
             exit(1);
         }
@@ -164,8 +177,6 @@ test_timing(unsigned int duration)
         histogram[bits]++;
         loop_count++;
-
-        INSTR_TIME_SUBTRACT(temp, start_time);
-        time_elapsed = INSTR_TIME_GET_MICROSEC(temp);
     }
 
     INSTR_TIME_SET_CURRENT(end_time);
@@ -173,7 +184,7 @@ test_timing(unsigned int duration)
     INSTR_TIME_SUBTRACT(end_time, start_time);
 
     printf(_("Per loop time including overhead: %0.2f ns\n"),
-           INSTR_TIME_GET_DOUBLE(end_time) * 1e9 / loop_count);
+           (INSTR_TIME_GET_DOUBLE(end_time) * NS_PER_S) / loop_count);
 
     return loop_count;
 }
@@ -181,9 +192,10 @@ test_timing(unsigned int duration)
 static void
 output(uint64 loop_count)
 {
-    int64       max_bit = 31,
+    int64       low_bit = 0,
+                max_bit = 63,
                 i;
-    char       *header1 = _("< us");
+    char       *header1 = _("< ns");
     char       *header2 = /* xgettext:no-c-format */ _("% of total");
     char       *header3 = _("count");
     int         len1 = strlen(header1);
@@ -194,15 +206,19 @@ output(uint64 loop_count)
     while (max_bit > 0 && histogram[max_bit] == 0)
         max_bit--;
 
+    /* find lowest bit value */
+    while (low_bit < max_bit && histogram[low_bit] == 0)
+        low_bit++;
+
     printf(_("Histogram of timing durations:\n"));
     printf("%*s   %*s %*s\n",
-           Max(6, len1), header1,
+           Max(9, len1), header1,
            Max(10, len2), header2,
            Max(10, len3), header3);
 
-    for (i = 0; i <= max_bit; i++)
-        printf("%*ld    %*.5f %*lld\n",
-               Max(6, len1), 1l << i,
+    for (i = low_bit; i <= max_bit; i++)
+        printf("%*llu    %*.5f %*llu\n",
+               Max(9, len1), (long long unsigned) (UINT64CONST(1) << i),
                Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count,
-               Max(10, len3), histogram[i]);
+               Max(10, len3), (long long unsigned) histogram[i]);
 }

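To make the overhead arithmetic in the documentation change above explicit,
here is the computation the new text relies on (all numbers taken from the
i9-9880H example):

    overhead per row       = (6.965 ms - 4.116 ms) / 100,000 rows
                           = 2,849,000 ns / 100,000 = ~28.5 ns
    clock readings per row = 2 (one in InstrStartNode(), one in InstrStopNode())
    overhead per reading   = ~28.5 ns / 2 = ~14.2 ns

which is why the text says the measured per-row overhead is close to twice
what pg_test_timing estimates per individual timing call.
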
From a1557c049a6541ec64bcb004e44dd49d1646ff36 Mon Sep 17 00:00:00 2001
From: Lukas Fittl
Date: Sun, 23 Feb 2025 20:44:10 -0800
Subject: [PATCH 3/3] Use time stamp counter to measure time on Linux/x86

We switch to using the time stamp counter (TSC) instead of clock_gettime()
to reduce the overhead of EXPLAIN (ANALYZE, TIMING ON). Tests showed that
runtime is reduced by around 10% for queries moving lots of rows through
the plan.

For now this is only enabled on Linux/x86, when the system clocksource is
reported as TSC. Relying on the Linux kernel simplifies the logic to detect
if the present TSC is usable (frequency invariant, synchronized between
sockets, etc.). In all other cases we fall back to clock_gettime().

Note that we intentionally use RDTSC in the fast paths, rather than RDTSCP.
RDTSCP waits for outstanding instructions to retire on out-of-order CPUs.
This adds noticeable overhead for little benefit in the typical
InstrStartNode() / InstrStopNode() use case. The macro to be used in such
cases is called INSTR_TIME_SET_CURRENT_FAST(). The original macro
INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be used when
precision is more important than performance.

Author: David Geier
Author: Andres Freund
Author: Lukas Fittl
Reviewed-by:
Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
---
 src/backend/access/heap/vacuumlazy.c    |   3 +-
 src/backend/executor/instrument.c       |  12 +-
 src/backend/utils/init/postinit.c       |   3 +
 src/bin/pg_test_timing/pg_test_timing.c |   7 +-
 src/bin/pgbench/pgbench.c               |   3 +
 src/bin/psql/startup.c                  |   4 +
 src/common/Makefile                     |   1 +
 src/common/instr_time.c                 | 170 ++++++++++++++++++++++++
 src/common/meson.build                  |   1 +
 src/include/portability/instr_time.h    | 139 ++++++++++++++++---
 10 files changed, 314 insertions(+), 29 deletions(-)
 create mode 100644 src/common/instr_time.c

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 3b91d02605a0..a33308119519 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3363,8 +3363,7 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
             INSTR_TIME_SET_CURRENT(currenttime);
             elapsed = currenttime;
             INSTR_TIME_SUBTRACT(elapsed, starttime);
-            if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
-                >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
+            if (INSTR_TIME_GET_MILLISEC(elapsed) >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
             {
                 if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
                 {
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 56e635f47000..01f67c5d9725 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options)
 void
 InstrStartNode(Instrumentation *instr)
 {
-    if (instr->need_timer &&
-        !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
-        elog(ERROR, "InstrStartNode called twice in a row");
+    if (instr->need_timer)
+    {
+        if (!INSTR_TIME_IS_ZERO(instr->starttime))
+            elog(ERROR, "InstrStartNode called twice in a row");
+        else
+            INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
+    }
 
     /* save buffer usage totals at node entry, if needed */
     if (instr->need_bufusage)
@@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
         if (INSTR_TIME_IS_ZERO(instr->starttime))
             elog(ERROR, "InstrStopNode called without start");
 
-        INSTR_TIME_SET_CURRENT(endtime);
+        INSTR_TIME_SET_CURRENT_FAST(endtime);
         INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
 
         INSTR_TIME_SET_ZERO(instr->starttime);
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 318600d6d02e..2e429b30a277 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -786,6 +786,9 @@ InitPostgres(const char *in_dbname, Oid dboid,
     /* Initialize portal manager */
     EnablePortalManager();
 
+    /* initialize high-precision interval timing */
+    INSTR_TIME_INITIALIZE();
+
     /* Initialize status reporting */
     pgstat_beinit();
 
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index 29b0db6d6190..62a15465f7c0 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -128,7 +128,8 @@ test_timing(unsigned int duration)
                 end_time;
     instr_time  cur;
 
-    INSTR_TIME_SET_CURRENT(start_time);
+    INSTR_TIME_INITIALIZE();
+    INSTR_TIME_SET_CURRENT_FAST(start_time);
 
     /*
      * To reduce loop overhead, check loop condition in instr_time domain.
@@ -147,7 +148,7 @@ test_timing(unsigned int duration)
         int32       bits = 0;
 
         prev = cur;
-        INSTR_TIME_SET_CURRENT(cur);
+        INSTR_TIME_SET_CURRENT_FAST(cur);
         temp = cur;
         INSTR_TIME_SUBTRACT(temp, prev);
         diff = INSTR_TIME_GET_NANOSEC(temp);
@@ -179,7 +180,7 @@ test_timing(unsigned int duration)
         loop_count++;
     }
 
-    INSTR_TIME_SET_CURRENT(end_time);
+    INSTR_TIME_SET_CURRENT_FAST(end_time);
 
     INSTR_TIME_SUBTRACT(end_time, start_time);
 
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index bf099aab2787..716f90578f0e 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -7265,6 +7265,9 @@ main(int argc, char **argv)
         initRandomState(&state[i].cs_func_rs);
     }
 
+    /* initialize high-precision interval timing */
+    INSTR_TIME_INITIALIZE();
+
     /* opening connection... */
     con = doConnect();
     if (con == NULL)
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 5018eedf1e57..789a2aa67635 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
 #include "help.h"
 #include "input.h"
 #include "mainloop.h"
+#include "portability/instr_time.h"
 #include "settings.h"
 
 /*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
 
     PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
 
+    /* initialize high-precision interval timing */
+    INSTR_TIME_INITIALIZE();
+
     SyncVariables();
 
     if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index 1e2b91c83c4c..194e45e7ae8d 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
     file_perm.o \
     file_utils.o \
     hashfn.o \
+    instr_time.o \
    ip.o \
    jsonapi.o \
    keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 000000000000..9b18e6e3588d
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,170 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ *    Non-inline parts of the portable high-precision interval timing
+ *    implementation
+ *
+ * Portions Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *    src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "portability/instr_time.h"
+
+#ifndef WIN32
+/*
+ * Stores what the number of cycles needs to be multiplied with to end up
+ * with nanoseconds using integer math. See comment in pg_initialize_rdtsc()
+ * for more details.
+ *
+ * By default assume we are using clock_gettime() as a fallback which uses
+ * nanoseconds as ticks. Hence, we set the multiplier to the precision scalar
+ * so that the division in INSTR_TIME_GET_NANOSEC() won't change the
+ * nanoseconds.
+ *
+ * When using the RDTSC instruction directly this is filled in during
+ * initialization based on the relevant CPUID fields.
+ */
+int64       ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;
+int64       ticks_per_sec = NS_PER_S;
+int64       max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
+
+#if defined(__x86_64__) && defined(__linux__)
+/*
+ * Indicates if RDTSC can be used (Linux/x86 only, when OS uses TSC
+ * clocksource)
+ */
+bool        has_rdtsc = false;
+
+/*
+ * Indicates if RDTSCP can be used. True if RDTSC can be used and RDTSCP is
+ * available.
+ */
+ */ +bool has_rdtscp = false; + +#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d) /* KVMKVMKVM */ + +static bool +get_tsc_frequency_khz(uint32 *tsc_freq) +{ + uint32 r[4]; + + if (__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]) && r[2] > 0) + { + if (r[0] == 0 || r[1] == 0) + return false; + + *tsc_freq = r[2] / 1000 * r[1] / r[0]; + return true; + } + + /* Some CPUs only report frequency in 16H */ + if (__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3])) + { + *tsc_freq = r[0] * 1000; + return true; + } + + /* + * Check if we have a KVM or VMware Hypervisor passing down TSC frequency + * to us in a guest VM + * + * Note that accessing the 0x40000000 leaf for Hypervisor info requires + * use of __cpuidex to set ECX to 0. + * + * TODO: We need to check whether our compiler is new enough + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95973) + */ + __cpuidex((int32 *) r, 0x40000000, 0); + if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r))) + { + __cpuidex((int32 *) r, 0x40000010, 0); + if (r[0] > 0) + { + *tsc_freq = r[0]; + return true; + } + } + + return false; +} + +static bool +is_rdtscp_available() +{ + uint32 r[4]; + + return __get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3]) > 0 && (r[3] & (1 << 27)) != 0; +} + +/* + * Decide whether we use the RDTSC instruction at runtime, for Linux/x86, + * instead of incurring the overhead of a full clock_gettime() call. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + * + * Since Linux already does a significant amount of work to determine + * whether TSC is a viable clock source, decide based on that. + */ +void +pg_initialize_rdtsc(void) +{ + FILE *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + + if (fp) + { + char buf[128]; + + if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0) + { + /* + * Compute baseline CPU peformance, determines speed at which + * RDTSC advances. + */ + uint32 tsc_freq; + + if (get_tsc_frequency_khz(&tsc_freq)) + { + /* + * Ticks to nanoseconds conversion requires floating point + * math because because: + * + * sec = ticks / frequency_hz ns = ticks / frequency_hz * + * 1,000,000,000 ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in + * kilohertz + * + * Here, 'ns' is usually a floating number. For example for a + * 2.5 GHz CPU the scaling factor becomes 1,000,000 / + * 2,500,000 = 1.2. + * + * To be able to use integer math we work around the lack of + * precision. We first scale the integer up and after the + * multiplication by the number of ticks in + * INSTR_TIME_GET_NANOSEC() we divide again by the same value. + * We picked the scaler such that it provides enough precision + * and is a power-of-two which allows for shifting instead of + * doing an integer division. 
+ */ + ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq; + ticks_per_sec = tsc_freq * 1000; /* KHz->Hz */ + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; + + has_rdtsc = true; + has_rdtscp = is_rdtscp_available(); + } + } + + fclose(fp); + } +} +#endif /* defined(__x86_64__) && defined(__linux__) */ + +#endif /* WIN32 */ diff --git a/src/common/meson.build b/src/common/meson.build index 1540ba67cca4..62b90b3e609c 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -13,6 +13,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 6ad411c68667..fd8ca1220012 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,11 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On Linux/x86 we use the rdtsc instruction when a TSC + * clocksource is also used on the host OS. Otherwise, and on other + * Unix-like systems we use clock_gettime() and on Windows we use + * QueryPerformanceCounter(). These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. instr_time can store either an absolute time (of @@ -19,10 +21,11 @@ * * INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window * - * INSTR_TIME_SET_CURRENT_LAZY(t) set t to current time if t is zero, - * evaluates to whether t changed + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * INSTR_TIME_SET_SECONDS(t, s) set t to s seconds * @@ -84,7 +87,15 @@ typedef struct instr_time #ifndef WIN32 +/* + * Make sure this is a power-of-two, so that the compiler can turn the + * multiplications and divisions into shifts. 
+ */ +#define TICKS_TO_NS_PRECISION (1<<14) +extern int64 ticks_per_ns_scaled; +extern int64 ticks_per_sec; +extern int64 max_ticks_no_overflow; /* Use clock_gettime() */ @@ -110,35 +121,124 @@ typedef struct instr_time #define PG_INSTR_CLOCK CLOCK_REALTIME #endif -/* helper for INSTR_TIME_SET_CURRENT */ +#if defined(__x86_64__) && defined(__linux__) +#include +#include + +extern bool has_rdtsc; +extern bool has_rdtscp; + +extern void pg_initialize_rdtsc(void); +#endif + static inline instr_time -pg_clock_gettime_ns(void) +pg_clock_gettime() { instr_time now; struct timespec tmp; clock_gettime(PG_INSTR_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; - return now; } +static inline instr_time +pg_get_ticks_fast(void) +{ +#if defined(__x86_64__) && defined(__linux__) + if (has_rdtsc) + { + instr_time now; + + now.ticks = __rdtsc(); + return now; + } +#endif + + return pg_clock_gettime(); +} + +static inline instr_time +pg_get_ticks(void) +{ +#if defined(__x86_64__) && defined(__linux__) + if (has_rdtscp) + { + instr_time now; + uint32 unused; + + now.ticks = __rdtscp(&unused); + return now; + } +#endif + + return pg_clock_gettime(); +} + +static inline int64_t +pg_ticks_to_ns(instr_time t) +{ + /* + * Would multiplication overflow? If so perform computation in two parts. + * Check overflow without actually overflowing via: a * b > max <=> a > + * max / b + */ + int64 ns = 0; + + if (unlikely(t.ticks > max_ticks_no_overflow)) + { + /* + * Compute how often the maximum number of ticks fits completely into + * the number of elapsed ticks and convert that number into + * nanoseconds. Then multiply by the count to arrive at the final + * value. In a 2nd step we adjust the number of elapsed ticks and + * convert the remaining ticks. + */ + int64 count = t.ticks / max_ticks_no_overflow; + int64 max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION; + + ns = max_ns * count; + + /* + * Subtract the ticks that we now already accounted for, so that they + * don't get counted twice. + */ + t.ticks -= count * max_ticks_no_overflow; + Assert(t.ticks >= 0); + } + + ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION; + return ns; +} + +static inline void +pg_initialize_get_ticks() +{ +#if defined(__x86_64__) && defined(__linux__) + pg_initialize_rdtsc(); +#endif +} + +#define INSTR_TIME_INITIALIZE() \ + pg_initialize_get_ticks() + +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + #define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) + ((t) = pg_get_ticks()) #define INSTR_TIME_SET_SECONDS(t, s) \ - ((t).ticks = NS_PER_S * (s)) + ((t).ticks = (s) * ticks_per_sec) #define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - + pg_ticks_to_ns(t) #else /* WIN32 */ /* Use QueryPerformanceCounter() */ -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time pg_query_performance_counter(void) { @@ -160,6 +260,11 @@ GetTimerFrequency(void) return (double) f.QuadPart; } +#define INSTR_TIME_INITIALIZE() + +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_query_performance_counter()) + #define INSTR_TIME_SET_CURRENT(t) \ ((t) = pg_query_performance_counter()) @@ -180,13 +285,8 @@ GetTimerFrequency(void) #define INSTR_TIME_IS_LT(x, y) ((x).ticks < (y).ticks) - #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) -#define INSTR_TIME_SET_CURRENT_LAZY(t) \ - (INSTR_TIME_IS_ZERO(t) ? 
-#define INSTR_TIME_SET_CURRENT_LAZY(t) \
-    (INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
-
-
 #define INSTR_TIME_ADD(x,y) \
     ((x).ticks += (y).ticks)
 
@@ -196,7 +296,6 @@ GetTimerFrequency(void)
 
 #define INSTR_TIME_ACCUM_DIFF(x,y,z) \
     ((x).ticks += (y).ticks - (z).ticks)
 
-
 #define INSTR_TIME_GET_DOUBLE(t) \
     ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
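To illustrate the fixed-point tick-to-nanosecond conversion the last patch
relies on, here is a small standalone sketch (not part of the patch series;
the 2.5 GHz frequency and the tick count are made-up example values):

    #include <stdint.h>
    #include <stdio.h>

    /* same power-of-two scale as TICKS_TO_NS_PRECISION in the patch */
    #define TICKS_TO_NS_PRECISION (1 << 14)

    int
    main(void)
    {
        int64_t     tsc_freq_khz = 2500000; /* assumed 2.5 GHz TSC */

        /* scale 1,000,000 / frequency_khz up so integer math keeps precision */
        int64_t     ticks_per_ns_scaled =
            INT64_C(1000000) * TICKS_TO_NS_PRECISION / tsc_freq_khz;

        int64_t     ticks = 12345678;   /* example RDTSC delta */

        /* multiply, then divide the power-of-two scale back out (a shift) */
        int64_t     ns = ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;

        /*
         * The exact value is ticks / 2.5 = 4,938,271 ns; the truncated scale
         * (6553 instead of 6553.6) yields 4,937,819 ns, i.e. ~0.01% error.
         */
        printf("%lld ns\n", (long long) ns);
        return 0;
    }

This is the same multiply-then-shift that INSTR_TIME_GET_NANOSEC() performs
via pg_ticks_to_ns(), minus the overflow handling that the real code needs
for very large tick counts.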