From c85a5afb367a683f6c2c566a3f00ac6bf722192f Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Fri, 20 Jan 2023 15:31:54 -0800
Subject: [PATCH 1/3] instr_time: Add INSTR_TIME_SET_SECONDS(),
 INSTR_TIME_IS_LT()

INSTR_TIME_SET_SECONDS() is useful to calculate the end of a time-bound
loop without having to convert into time units (which is costly).
INSTR_TIME_IS_LT() can be used to check the loop condition.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/portability/instr_time.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index f71a851b18d8..6ad411c68667 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -15,6 +15,8 @@
  *
  * INSTR_TIME_IS_ZERO(t)            is t equal to zero?
  *
+ * INSTR_TIME_IS_LT(x, y)           x < y
+ *
  * INSTR_TIME_SET_ZERO(t)           set t to zero (memset is acceptable too)
  *
  * INSTR_TIME_SET_CURRENT(t)        set t to current time
@@ -22,6 +24,8 @@
  * INSTR_TIME_SET_CURRENT_LAZY(t)   set t to current time if t is zero,
  *                                  evaluates to whether t changed
  *
+ * INSTR_TIME_SET_SECONDS(t, s)     set t to s seconds
+ *
  * INSTR_TIME_ADD(x, y)             x += y
  *
  * INSTR_TIME_SUBTRACT(x, y)        x -= y
@@ -122,6 +126,9 @@ pg_clock_gettime_ns(void)
 #define INSTR_TIME_SET_CURRENT(t) \
     ((t) = pg_clock_gettime_ns())
 
+#define INSTR_TIME_SET_SECONDS(t, s) \
+    ((t).ticks = NS_PER_S * (s))
+
 #define INSTR_TIME_GET_NANOSEC(t) \
     ((int64) (t).ticks)
 
@@ -156,6 +163,9 @@ GetTimerFrequency(void)
 #define INSTR_TIME_SET_CURRENT(t) \
     ((t) = pg_query_performance_counter())
 
+#define INSTR_TIME_SET_SECONDS(t, s) \
+    ((t).ticks = (s) * GetTimerFrequency())
+
 #define INSTR_TIME_GET_NANOSEC(t) \
     ((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
 
@@ -168,6 +178,8 @@ GetTimerFrequency(void)
 
 #define INSTR_TIME_IS_ZERO(t)   ((t).ticks == 0)
 
+#define INSTR_TIME_IS_LT(x, y)  ((x).ticks < (y).ticks)
+
 #define INSTR_TIME_SET_ZERO(t)  ((t).ticks = 0)

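For readers, a minimal usage sketch of the two new macros (not part of the
patch itself; "duration" stands for any caller-supplied number of seconds).
It mirrors the time-bound loop that patch 2 below builds on them:

    instr_time  start_time,
                total_time,
                until_time,
                cur;

    INSTR_TIME_SET_CURRENT(start_time);

    /* compute the loop's end point once, staying in the tick domain */
    INSTR_TIME_SET_SECONDS(total_time, duration);
    until_time = start_time;
    INSTR_TIME_ADD(until_time, total_time);

    cur = start_time;
    while (INSTR_TIME_IS_LT(cur, until_time))
    {
        INSTR_TIME_SET_CURRENT(cur);
        /* ... work being timed ... */
    }

No conversion to microseconds or nanoseconds happens inside the loop; the
comparison is done directly on ticks, which is the point of the two macros.
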
From 4eb5f44613ed86c7a088b3affa2569ca36fd5fa4 Mon Sep 17 00:00:00 2001
From: Andres Freund
Date: Mon, 16 Jan 2023 11:19:11 -0800
Subject: [PATCH 2/3] wip: report nanoseconds in pg_test_timing

This commit also updates pg_test_timing's documentation:

- compare EXPLAIN (ANALYZE, TIMING ON/OFF) instead of comparing the
  performance of a statement with/without EXPLAIN ANALYZE
- explain the 2x overhead (due to two timestamp acquisitions per row)
- remove old section about old versions of Linux - I couldn't update the
  numbers, and it's old enough nobody would care
---
 doc/src/sgml/ref/pgtesttiming.sgml      | 117 ++++++++++-------------
 src/bin/pg_test_timing/pg_test_timing.c |  74 +++++++++------
 2 files changed, 95 insertions(+), 96 deletions(-)

diff --git a/doc/src/sgml/ref/pgtesttiming.sgml b/doc/src/sgml/ref/pgtesttiming.sgml
index a5eb3aa25e02..7e0266cf58b7 100644
--- a/doc/src/sgml/ref/pgtesttiming.sgml
+++ b/doc/src/sgml/ref/pgtesttiming.sgml
@@ -93,28 +93,34 @@ PostgreSQL documentation

   Good results will show most (>90%) individual timing calls take less than
-  one microsecond. Average per loop overhead will be even lower, below 100
-  nanoseconds. This example from an Intel i7-860 system using a TSC clock
-  source shows excellent performance:
+  one microsecond (1000 nanoseconds). Average per loop overhead will be even
+  lower, below 100 nanoseconds. This example from an Intel i9-9880H system
+  using a TSC clock source shows excellent performance:

-  Note that different units are used for the per loop time than the
-  histogram. The loop can have resolution within a few nanoseconds (ns),
-  while the individual timing calls can only resolve down to one microsecond
-  (us).
+  Note that the accuracy of the histogram entries may be lower than the
+  per loop time.

@@ -125,24 +131,25 @@ Histogram of timing durations:
   When the query executor is running a statement using
   EXPLAIN ANALYZE, individual operations are timed as well
   as showing a summary. The overhead of your system can be checked by
-  counting rows with the psql program:
+  disabling the per-row timing, using the TIMING OFF
+  option:

-CREATE TABLE t AS SELECT * FROM generate_series(1,100000);
-\timing
-SELECT COUNT(*) FROM t;
-EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
+CREATE TABLE t AS SELECT * FROM generate_series(1, 100000);
+EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t;
+EXPLAIN (ANALYZE, TIMING ON) SELECT COUNT(*) FROM t;

-  The i7-860 system measured runs the count query in 9.8 ms while
-  the EXPLAIN ANALYZE version takes 16.6 ms, each
-  processing just over 100,000 rows. That 6.8 ms difference means the timing
-  overhead per row is 68 ns, about twice what pg_test_timing estimated it
-  would be. Even that relatively small amount of overhead is making the fully
-  timed count statement take almost 70% longer. On more substantial queries,
-  the timing overhead would be less problematic.
+  The i9-9880H system measured above shows an execution time of 4.116 ms for
+  the TIMING OFF query, and 6.965 ms for the
+  TIMING ON query, each processing 100,000 rows.
+
+  That 2.849 ms difference means the timing overhead per row is 28 ns. As
+  TIMING ON measures timestamps twice per row returned by
+  an executor node, the overhead is very close to what pg_test_timing
+  estimated it would be.

@@ -157,28 +164,31 @@ EXPLAIN ANALYZE SELECT COUNT(*) FROM t;
 /sys/devices/system/clocksource/clocksource0/current_clocksource
 # pg_test_timing
-Per loop time including overhead: 722.92 ns
+Testing timing overhead for 3 seconds.
+Per loop time including overhead: 708.58 ns
 Histogram of timing durations:
-   < us   % of total      count
-      1     27.84870    1155682
-      2     72.05956    2990371
-      4      0.07810       3241
-      8      0.01357        563
-     16      0.00007          3
+     < ns   % of total      count
+     1024     99.79796    4225270
+     2048      0.15560       6588
+     4096      0.00035         15
+     8192      0.01738        736
+    16384      0.01679        711
+    32768      0.01190        504
]]>

   In this configuration, the sample EXPLAIN ANALYZE above
-  takes 115.9 ms. That's 1061 ns of timing overhead, again a small multiple
-  of what's measured directly by this utility. That much timing overhead
-  means the actual query itself is only taking a tiny fraction of the
-  accounted for time, most of it is being consumed in overhead instead. In
-  this configuration, any EXPLAIN ANALYZE totals involving
-  many timed operations would be inflated significantly by timing overhead.
+  shows an execution time of 148.7 ms. That's 1392 ns of per-row timing
+  overhead. Taking the two timestamps per row into account, that's again
+  close to what pg_test_timing estimated. That much timing overhead means
+  the actual query itself is only taking a tiny fraction of the accounted-for
+  time; most of it is being consumed by overhead instead. In this
+  configuration, any EXPLAIN ANALYZE totals involving many
+  timed operations would be inflated significantly by timing overhead.

@@ -196,33 +206,6 @@ kern.timecounter.hardware: ACPI-fast -> TSC

-
-  Other systems may only allow setting the time source on boot. On older
-  Linux systems the "clock" kernel setting is the only way to make this sort
-  of change.
-  And even on some more recent ones, the only option you'll see
-  for a clock source is "jiffies". Jiffies are the older Linux software clock
-  implementation, which can have good resolution when it's backed by fast
-  enough timing hardware, as in this example:
-

diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index ce7aad4b25a4..29b0db6d6190 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -19,8 +19,8 @@ static void handle_args(int argc, char *argv[]);
 static uint64 test_timing(unsigned int duration);
 static void output(uint64 loop_count);
 
-/* record duration in powers of 2 microseconds */
-static long long int histogram[32];
+/* record duration in powers of 2 nanoseconds */
+static uint64 histogram[64];
 
 int
 main(int argc, char *argv[])
@@ -121,35 +121,48 @@ handle_args(int argc, char *argv[])
 static uint64
 test_timing(unsigned int duration)
 {
-    uint64      total_time;
-    int64       time_elapsed = 0;
     uint64      loop_count = 0;
-    uint64      prev,
-                cur;
+    instr_time  until_time,
+                total_time;
     instr_time  start_time,
-                end_time,
-                temp;
-
-    total_time = duration > 0 ? duration * INT64CONST(1000000) : 0;
+                end_time;
+    instr_time  cur;
 
     INSTR_TIME_SET_CURRENT(start_time);
-    cur = INSTR_TIME_GET_MICROSEC(start_time);
 
-    while (time_elapsed < total_time)
+    /*
+     * To reduce loop overhead, check loop condition in instr_time domain.
+     */
+    INSTR_TIME_SET_SECONDS(total_time, duration);
+    until_time = start_time;
+    INSTR_TIME_ADD(until_time, total_time);
+
+    cur = start_time;
+
+    while (INSTR_TIME_IS_LT(cur, until_time))
     {
-        int32       diff,
-                    bits = 0;
+        instr_time  temp;
+        instr_time  prev;
+        int64       diff;
+        int32       bits = 0;
 
         prev = cur;
-        INSTR_TIME_SET_CURRENT(temp);
-        cur = INSTR_TIME_GET_MICROSEC(temp);
-        diff = cur - prev;
+        INSTR_TIME_SET_CURRENT(cur);
+        temp = cur;
+        INSTR_TIME_SUBTRACT(temp, prev);
+        diff = INSTR_TIME_GET_NANOSEC(temp);
 
         /* Did time go backwards? */
-        if (diff < 0)
+        if (unlikely(diff <= 0))
         {
+            /* can't do anything with that measurement */
+            if (diff == 0)
+            {
+                loop_count++;
+                continue;
+            }
             fprintf(stderr, _("Detected clock going backwards in time.\n"));
-            fprintf(stderr, _("Time warp: %d ms\n"), diff);
+            fprintf(stderr, _("Time warp: %lld ns\n"), (long long) diff);
             exit(1);
         }
@@ -164,8 +177,6 @@ test_timing(unsigned int duration)
         histogram[bits]++;
         loop_count++;
-
-        INSTR_TIME_SUBTRACT(temp, start_time);
-        time_elapsed = INSTR_TIME_GET_MICROSEC(temp);
     }
 
     INSTR_TIME_SET_CURRENT(end_time);
@@ -173,7 +184,7 @@ test_timing(unsigned int duration)
     INSTR_TIME_SUBTRACT(end_time, start_time);
 
     printf(_("Per loop time including overhead: %0.2f ns\n"),
-           INSTR_TIME_GET_DOUBLE(end_time) * 1e9 / loop_count);
+           (INSTR_TIME_GET_DOUBLE(end_time) * NS_PER_S) / loop_count);
 
     return loop_count;
 }
@@ -181,9 +192,10 @@ test_timing(unsigned int duration)
 static void
 output(uint64 loop_count)
 {
-    int64       max_bit = 31,
+    int64       low_bit = 0,
+                max_bit = 63,
                 i;
-    char       *header1 = _("< us");
+    char       *header1 = _("< ns");
     char       *header2 = /* xgettext:no-c-format */ _("% of total");
     char       *header3 = _("count");
     int         len1 = strlen(header1);
@@ -194,15 +206,19 @@ output(uint64 loop_count)
     while (max_bit > 0 && histogram[max_bit] == 0)
         max_bit--;
 
+    /* find lowest bit value */
+    while (low_bit < max_bit && histogram[low_bit] == 0)
+        low_bit++;
+
     printf(_("Histogram of timing durations:\n"));
     printf("%*s   %*s %*s\n",
-           Max(6, len1), header1,
+           Max(9, len1), header1,
            Max(10, len2), header2,
            Max(10, len3), header3);
 
-    for (i = 0; i <= max_bit; i++)
-        printf("%*ld    %*.5f %*lld\n",
-               Max(6, len1), 1l << i,
+    for (i = low_bit; i <= max_bit; i++)
+        printf("%*llu    %*.5f %*llu\n",
+               Max(9, len1), (long long unsigned) (UINT64CONST(1) << i),
                Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count,
-               Max(10, len3), histogram[i]);
+               Max(10, len3), (long long unsigned) histogram[i]);
 }

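To make the overhead arithmetic in the documentation change above explicit,
here is the computation the new text relies on (all numbers taken from the
i9-9880H example):

    overhead per row       = (6.965 ms - 4.116 ms) / 100,000 rows
                           = 2,849,000 ns / 100,000 = ~28.5 ns
    clock readings per row = 2 (one in InstrStartNode(), one in InstrStopNode())
    overhead per reading   = ~28.5 ns / 2 = ~14.2 ns

which is why the text says the measured per-row overhead is close to twice
what pg_test_timing estimates per individual timing call.
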
From a1557c049a6541ec64bcb004e44dd49d1646ff36 Mon Sep 17 00:00:00 2001
From: Lukas Fittl
Date: Sun, 23 Feb 2025 20:44:10 -0800
Subject: [PATCH 3/3] Use time stamp counter to measure time on Linux/x86

We switch to using the time stamp counter (TSC) instead of clock_gettime()
to reduce the overhead of EXPLAIN (ANALYZE, TIMING ON). Tests showed that
runtime is reduced by around 10% for queries moving lots of rows through
the plan.

For now this is only enabled on Linux/x86, when the system clocksource is
reported as TSC. Relying on the Linux kernel simplifies the logic to detect
if the present TSC is usable (frequency invariant, synchronized between
sockets, etc.). In all other cases we fall back to clock_gettime().

Note that we intentionally use RDTSC in the fast paths, rather than RDTSCP.
RDTSCP waits for outstanding instructions to retire on out-of-order CPUs.
This adds noticeable overhead for little benefit in the typical
InstrStartNode() / InstrStopNode() use case. The macro to be used in such
cases is called INSTR_TIME_SET_CURRENT_FAST(). The original macro
INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be used when
precision is more important than performance.

Author: David Geier
Author: Andres Freund
Author: Lukas Fittl
Reviewed-by:
Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
---
 src/backend/access/heap/vacuumlazy.c    |   3 +-
 src/backend/executor/instrument.c       |  12 +-
 src/backend/utils/init/postinit.c       |   3 +
 src/bin/pg_test_timing/pg_test_timing.c |   7 +-
 src/bin/pgbench/pgbench.c               |   3 +
 src/bin/psql/startup.c                  |   4 +
 src/common/Makefile                     |   1 +
 src/common/instr_time.c                 | 170 ++++++++++++++++++++++++
 src/common/meson.build                  |   1 +
 src/include/portability/instr_time.h    | 139 ++++++++++++++++---
 10 files changed, 314 insertions(+), 29 deletions(-)
 create mode 100644 src/common/instr_time.c

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 3b91d02605a0..a33308119519 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3363,8 +3363,7 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
             INSTR_TIME_SET_CURRENT(currenttime);
             elapsed = currenttime;
             INSTR_TIME_SUBTRACT(elapsed, starttime);
-            if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
-                >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
+            if (INSTR_TIME_GET_MILLISEC(elapsed) >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
             {
                 if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
                 {
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 56e635f47000..01f67c5d9725 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options)
 void
 InstrStartNode(Instrumentation *instr)
 {
-    if (instr->need_timer &&
-        !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
-        elog(ERROR, "InstrStartNode called twice in a row");
+    if (instr->need_timer)
+    {
+        if (!INSTR_TIME_IS_ZERO(instr->starttime))
+            elog(ERROR, "InstrStartNode called twice in a row");
+        else
+            INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
+    }
 
     /* save buffer usage totals at node entry, if needed */
     if (instr->need_bufusage)
@@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
         if (INSTR_TIME_IS_ZERO(instr->starttime))
             elog(ERROR, "InstrStopNode called without start");
 
-        INSTR_TIME_SET_CURRENT(endtime);
+        INSTR_TIME_SET_CURRENT_FAST(endtime);
         INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
 
         INSTR_TIME_SET_ZERO(instr->starttime);
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 318600d6d02e..2e429b30a277 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -786,6 +786,9 @@ InitPostgres(const char *in_dbname, Oid dboid,
     /* Initialize portal manager */
     EnablePortalManager();
 
+    /* initialize high-precision interval timing */
+    INSTR_TIME_INITIALIZE();
+
     /* Initialize status reporting */
     pgstat_beinit();
 
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index 29b0db6d6190..62a15465f7c0 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -128,7 +128,8 @@ test_timing(unsigned int duration)
                 end_time;
     instr_time  cur;
 
-    INSTR_TIME_SET_CURRENT(start_time);
+    INSTR_TIME_INITIALIZE();
+    INSTR_TIME_SET_CURRENT_FAST(start_time);
 
     /*
      * To reduce loop overhead, check loop condition in instr_time domain.
@@ -147,7 +148,7 @@ test_timing(unsigned int duration)
         int32       bits = 0;
 
         prev = cur;
-        INSTR_TIME_SET_CURRENT(cur);
+        INSTR_TIME_SET_CURRENT_FAST(cur);
         temp = cur;
         INSTR_TIME_SUBTRACT(temp, prev);
         diff = INSTR_TIME_GET_NANOSEC(temp);
@@ -179,7 +180,7 @@ test_timing(unsigned int duration)
         loop_count++;
     }
 
-    INSTR_TIME_SET_CURRENT(end_time);
+    INSTR_TIME_SET_CURRENT_FAST(end_time);
 
     INSTR_TIME_SUBTRACT(end_time, start_time);
 
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index bf099aab2787..716f90578f0e 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -7265,6 +7265,9 @@ main(int argc, char **argv)
         initRandomState(&state[i].cs_func_rs);
     }
 
+    /* initialize high-precision interval timing */
+    INSTR_TIME_INITIALIZE();
+
     /* opening connection... */
     con = doConnect();
     if (con == NULL)
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 5018eedf1e57..789a2aa67635 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
 #include "help.h"
 #include "input.h"
 #include "mainloop.h"
+#include "portability/instr_time.h"
 #include "settings.h"
 
 /*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
 
     PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
 
+    /* initialize high-precision interval timing */
+    INSTR_TIME_INITIALIZE();
+
     SyncVariables();
 
     if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index 1e2b91c83c4c..194e45e7ae8d 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
     file_perm.o \
     file_utils.o \
     hashfn.o \
+    instr_time.o \
    ip.o \
    jsonapi.o \
    keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 000000000000..9b18e6e3588d
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,170 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ *    Non-inline parts of the portable high-precision interval timing
+ *    implementation
+ *
+ * Portions Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *    src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "portability/instr_time.h"
+
+#ifndef WIN32
+/*
+ * Stores what the number of cycles needs to be multiplied with to end up
+ * with nanoseconds using integer math. See comment in pg_initialize_rdtsc()
+ * for more details.
+ *
+ * By default assume we are using clock_gettime() as a fallback which uses
+ * nanoseconds as ticks. Hence, we set the multiplier to the precision scalar
+ * so that the division in INSTR_TIME_GET_NANOSEC() won't change the
+ * nanoseconds.
+ *
+ * When using the RDTSC instruction directly this is filled in during
+ * initialization based on the relevant CPUID fields.
+ */
+int64       ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;
+int64       ticks_per_sec = NS_PER_S;
+int64       max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
+
+#if defined(__x86_64__) && defined(__linux__)
+/*
+ * Indicates if RDTSC can be used (Linux/x86 only, when OS uses TSC
+ * clocksource)
+ */
+bool        has_rdtsc = false;
+
+/*
+ * Indicates if RDTSCP can be used. True if RDTSC can be used and RDTSCP is
+ * available.
+ */
+ */ +bool has_rdtscp = false; + +#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */ +#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d) /* KVMKVMKVM */ + +static bool +get_tsc_frequency_khz(uint32 *tsc_freq) +{ + uint32 r[4]; + + if (__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]) && r[2] > 0) + { + if (r[0] == 0 || r[1] == 0) + return false; + + *tsc_freq = r[2] / 1000 * r[1] / r[0]; + return true; + } + + /* Some CPUs only report frequency in 16H */ + if (__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3])) + { + *tsc_freq = r[0] * 1000; + return true; + } + + /* + * Check if we have a KVM or VMware Hypervisor passing down TSC frequency + * to us in a guest VM + * + * Note that accessing the 0x40000000 leaf for Hypervisor info requires + * use of __cpuidex to set ECX to 0. + * + * TODO: We need to check whether our compiler is new enough + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95973) + */ + __cpuidex((int32 *) r, 0x40000000, 0); + if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r))) + { + __cpuidex((int32 *) r, 0x40000010, 0); + if (r[0] > 0) + { + *tsc_freq = r[0]; + return true; + } + } + + return false; +} + +static bool +is_rdtscp_available() +{ + uint32 r[4]; + + return __get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3]) > 0 && (r[3] & (1 << 27)) != 0; +} + +/* + * Decide whether we use the RDTSC instruction at runtime, for Linux/x86, + * instead of incurring the overhead of a full clock_gettime() call. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + * + * Since Linux already does a significant amount of work to determine + * whether TSC is a viable clock source, decide based on that. + */ +void +pg_initialize_rdtsc(void) +{ + FILE *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + + if (fp) + { + char buf[128]; + + if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0) + { + /* + * Compute baseline CPU peformance, determines speed at which + * RDTSC advances. + */ + uint32 tsc_freq; + + if (get_tsc_frequency_khz(&tsc_freq)) + { + /* + * Ticks to nanoseconds conversion requires floating point + * math because because: + * + * sec = ticks / frequency_hz ns = ticks / frequency_hz * + * 1,000,000,000 ns = ticks * (1,000,000,000 / frequency_hz) + * ns = ticks * (1,000,000 / frequency_khz) <-- now in + * kilohertz + * + * Here, 'ns' is usually a floating number. For example for a + * 2.5 GHz CPU the scaling factor becomes 1,000,000 / + * 2,500,000 = 1.2. + * + * To be able to use integer math we work around the lack of + * precision. We first scale the integer up and after the + * multiplication by the number of ticks in + * INSTR_TIME_GET_NANOSEC() we divide again by the same value. + * We picked the scaler such that it provides enough precision + * and is a power-of-two which allows for shifting instead of + * doing an integer division. 
+ */ + ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq; + ticks_per_sec = tsc_freq * 1000; /* KHz->Hz */ + max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; + + has_rdtsc = true; + has_rdtscp = is_rdtscp_available(); + } + } + + fclose(fp); + } +} +#endif /* defined(__x86_64__) && defined(__linux__) */ + +#endif /* WIN32 */ diff --git a/src/common/meson.build b/src/common/meson.build index 1540ba67cca4..62b90b3e609c 100644 --- a/src/common/meson.build +++ b/src/common/meson.build @@ -13,6 +13,7 @@ common_sources = files( 'file_perm.c', 'file_utils.c', 'hashfn.c', + 'instr_time.c', 'ip.c', 'jsonapi.c', 'keywords.c', diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 6ad411c68667..fd8ca1220012 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,11 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On Linux/x86 we use the rdtsc instruction when a TSC + * clocksource is also used on the host OS. Otherwise, and on other + * Unix-like systems we use clock_gettime() and on Windows we use + * QueryPerformanceCounter(). These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. instr_time can store either an absolute time (of @@ -19,10 +21,11 @@ * * INSTR_TIME_SET_ZERO(t) set t to zero (memset is acceptable too) * - * INSTR_TIME_SET_CURRENT(t) set t to current time + * INSTR_TIME_SET_CURRENT_FAST(t) set t to current time without waiting + * for instructions in out-of-order window * - * INSTR_TIME_SET_CURRENT_LAZY(t) set t to current time if t is zero, - * evaluates to whether t changed + * INSTR_TIME_SET_CURRENT(t) set t to current time while waiting for + * instructions in OOO to retire * * INSTR_TIME_SET_SECONDS(t, s) set t to s seconds * @@ -84,7 +87,15 @@ typedef struct instr_time #ifndef WIN32 +/* + * Make sure this is a power-of-two, so that the compiler can turn the + * multiplications and divisions into shifts. 
+ */ +#define TICKS_TO_NS_PRECISION (1<<14) +extern int64 ticks_per_ns_scaled; +extern int64 ticks_per_sec; +extern int64 max_ticks_no_overflow; /* Use clock_gettime() */ @@ -110,35 +121,124 @@ typedef struct instr_time #define PG_INSTR_CLOCK CLOCK_REALTIME #endif -/* helper for INSTR_TIME_SET_CURRENT */ +#if defined(__x86_64__) && defined(__linux__) +#include +#include + +extern bool has_rdtsc; +extern bool has_rdtscp; + +extern void pg_initialize_rdtsc(void); +#endif + static inline instr_time -pg_clock_gettime_ns(void) +pg_clock_gettime() { instr_time now; struct timespec tmp; clock_gettime(PG_INSTR_CLOCK, &tmp); now.ticks = tmp.tv_sec * NS_PER_S + tmp.tv_nsec; - return now; } +static inline instr_time +pg_get_ticks_fast(void) +{ +#if defined(__x86_64__) && defined(__linux__) + if (has_rdtsc) + { + instr_time now; + + now.ticks = __rdtsc(); + return now; + } +#endif + + return pg_clock_gettime(); +} + +static inline instr_time +pg_get_ticks(void) +{ +#if defined(__x86_64__) && defined(__linux__) + if (has_rdtscp) + { + instr_time now; + uint32 unused; + + now.ticks = __rdtscp(&unused); + return now; + } +#endif + + return pg_clock_gettime(); +} + +static inline int64_t +pg_ticks_to_ns(instr_time t) +{ + /* + * Would multiplication overflow? If so perform computation in two parts. + * Check overflow without actually overflowing via: a * b > max <=> a > + * max / b + */ + int64 ns = 0; + + if (unlikely(t.ticks > max_ticks_no_overflow)) + { + /* + * Compute how often the maximum number of ticks fits completely into + * the number of elapsed ticks and convert that number into + * nanoseconds. Then multiply by the count to arrive at the final + * value. In a 2nd step we adjust the number of elapsed ticks and + * convert the remaining ticks. + */ + int64 count = t.ticks / max_ticks_no_overflow; + int64 max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION; + + ns = max_ns * count; + + /* + * Subtract the ticks that we now already accounted for, so that they + * don't get counted twice. + */ + t.ticks -= count * max_ticks_no_overflow; + Assert(t.ticks >= 0); + } + + ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION; + return ns; +} + +static inline void +pg_initialize_get_ticks() +{ +#if defined(__x86_64__) && defined(__linux__) + pg_initialize_rdtsc(); +#endif +} + +#define INSTR_TIME_INITIALIZE() \ + pg_initialize_get_ticks() + +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_get_ticks_fast()) + #define INSTR_TIME_SET_CURRENT(t) \ - ((t) = pg_clock_gettime_ns()) + ((t) = pg_get_ticks()) #define INSTR_TIME_SET_SECONDS(t, s) \ - ((t).ticks = NS_PER_S * (s)) + ((t).ticks = (s) * ticks_per_sec) #define INSTR_TIME_GET_NANOSEC(t) \ - ((int64) (t).ticks) - + pg_ticks_to_ns(t) #else /* WIN32 */ /* Use QueryPerformanceCounter() */ -/* helper for INSTR_TIME_SET_CURRENT */ static inline instr_time pg_query_performance_counter(void) { @@ -160,6 +260,11 @@ GetTimerFrequency(void) return (double) f.QuadPart; } +#define INSTR_TIME_INITIALIZE() + +#define INSTR_TIME_SET_CURRENT_FAST(t) \ + ((t) = pg_query_performance_counter()) + #define INSTR_TIME_SET_CURRENT(t) \ ((t) = pg_query_performance_counter()) @@ -180,13 +285,8 @@ GetTimerFrequency(void) #define INSTR_TIME_IS_LT(x, y) ((x).ticks < (y).ticks) - #define INSTR_TIME_SET_ZERO(t) ((t).ticks = 0) -#define INSTR_TIME_SET_CURRENT_LAZY(t) \ - (INSTR_TIME_IS_ZERO(t) ? 
-#define INSTR_TIME_SET_CURRENT_LAZY(t) \
-    (INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
-
-
 #define INSTR_TIME_ADD(x,y) \
     ((x).ticks += (y).ticks)
 
@@ -196,7 +296,6 @@ GetTimerFrequency(void)
 
 #define INSTR_TIME_ACCUM_DIFF(x,y,z) \
     ((x).ticks += (y).ticks - (z).ticks)
 
-
 #define INSTR_TIME_GET_DOUBLE(t) \
     ((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
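To illustrate the fixed-point tick-to-nanosecond conversion the last patch
relies on, here is a small standalone sketch (not part of the patch series;
the 2.5 GHz frequency and the tick count are made-up example values):

    #include <stdint.h>
    #include <stdio.h>

    /* same power-of-two scale as TICKS_TO_NS_PRECISION in the patch */
    #define TICKS_TO_NS_PRECISION (1 << 14)

    int
    main(void)
    {
        int64_t     tsc_freq_khz = 2500000; /* assumed 2.5 GHz TSC */

        /* scale 1,000,000 / frequency_khz up so integer math keeps precision */
        int64_t     ticks_per_ns_scaled =
            INT64_C(1000000) * TICKS_TO_NS_PRECISION / tsc_freq_khz;

        int64_t     ticks = 12345678;   /* example RDTSC delta */

        /* multiply, then divide the power-of-two scale back out (a shift) */
        int64_t     ns = ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;

        /*
         * The exact value is ticks / 2.5 = 4,938,271 ns; the truncated scale
         * (6553 instead of 6553.6) yields 4,937,819 ns, i.e. ~0.01% error.
         */
        printf("%lld ns\n", (long long) ns);
        return 0;
    }

This is the same multiply-then-shift that INSTR_TIME_GET_NANOSEC() performs
via pg_ticks_to_ns(), minus the overflow handling that the real code needs
for very large tick counts.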