Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit a1557c0

Browse files
lfittlCommitfest Bot
authored and
Commitfest Bot
committed
Use time stamp counter to measure time on Linux/x86
We switch to using the time stamp counter (TSC) instead of clock_gettime() to reduce the overhead of EXPLAIN (ANALYZE, TIME ON). Tests showed that runtime is reduced by around 10% for queries moving lots of rows through the plan. For now this is only enabled on Linux/x86, when the system clocksource is reported as TSC. Relying on the Linux kernel simplifies the logic to detect if the present TSC is usable (frequency invariant, synchronized between sockets, etc.). In all other cases we fall back to clock_gettime(). Note that we intentionally use RDTSC in the fast paths, rather than RDTSCP. RDTSCP waits for outstanding instructions to retire on out-of-order CPUs. This adds noticeable overhead for little benefit in the typical InstrStartNode() / InstrStopNode() use case. The macro to be used in such cases is called INSTR_TIME_SET_CURRENT_FAST(). The original macro INSTR_TIME_SET_CURRENT() uses RDTSCP and is supposed to be used when precision is more important than performance. Author: David Geier <geidav.pg@gmail.com> Author: Andres Freund <andres@anarazel.de> Author: Lukas Fittl <lukas@fittl.com> Reviewed-by: Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
1 parent 4eb5f44 commit a1557c0

File tree

10 files changed

+314
-29
lines changed

10 files changed

+314
-29
lines changed

src/backend/access/heap/vacuumlazy.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3363,8 +3363,7 @@ count_nondeletable_pages(LVRelState *vacrel, bool *lock_waiter_detected)
33633363
INSTR_TIME_SET_CURRENT(currenttime);
33643364
elapsed = currenttime;
33653365
INSTR_TIME_SUBTRACT(elapsed, starttime);
3366-
if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
3367-
>= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
3366+
if (INSTR_TIME_GET_MILLISEC(elapsed) >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
33683367
{
33693368
if (LockHasWaitersRelation(vacrel->rel, AccessExclusiveLock))
33703369
{

src/backend/executor/instrument.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options)
6767
void
6868
InstrStartNode(Instrumentation *instr)
6969
{
70-
if (instr->need_timer &&
71-
!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
72-
elog(ERROR, "InstrStartNode called twice in a row");
70+
if (instr->need_timer)
71+
{
72+
if (!INSTR_TIME_IS_ZERO(instr->starttime))
73+
elog(ERROR, "InstrStartNode called twice in a row");
74+
else
75+
INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
76+
}
7377

7478
/* save buffer usage totals at node entry, if needed */
7579
if (instr->need_bufusage)
@@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
9599
if (INSTR_TIME_IS_ZERO(instr->starttime))
96100
elog(ERROR, "InstrStopNode called without start");
97101

98-
INSTR_TIME_SET_CURRENT(endtime);
102+
INSTR_TIME_SET_CURRENT_FAST(endtime);
99103
INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
100104

101105
INSTR_TIME_SET_ZERO(instr->starttime);

src/backend/utils/init/postinit.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,9 @@ InitPostgres(const char *in_dbname, Oid dboid,
786786
/* Initialize portal manager */
787787
EnablePortalManager();
788788

789+
/* initialize high-precision interval timing */
790+
INSTR_TIME_INITIALIZE();
791+
789792
/* Initialize status reporting */
790793
pgstat_beinit();
791794

src/bin/pg_test_timing/pg_test_timing.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ test_timing(unsigned int duration)
128128
end_time;
129129
instr_time cur;
130130

131-
INSTR_TIME_SET_CURRENT(start_time);
131+
INSTR_TIME_INITIALIZE();
132+
INSTR_TIME_SET_CURRENT_FAST(start_time);
132133

133134
/*
134135
* To reduce loop overhead, check loop condition in instr_time domain.
@@ -147,7 +148,7 @@ test_timing(unsigned int duration)
147148
int32 bits = 0;
148149

149150
prev = cur;
150-
INSTR_TIME_SET_CURRENT(cur);
151+
INSTR_TIME_SET_CURRENT_FAST(cur);
151152
temp = cur;
152153
INSTR_TIME_SUBTRACT(temp, prev);
153154
diff = INSTR_TIME_GET_NANOSEC(temp);
@@ -179,7 +180,7 @@ test_timing(unsigned int duration)
179180
loop_count++;
180181
}
181182

182-
INSTR_TIME_SET_CURRENT(end_time);
183+
INSTR_TIME_SET_CURRENT_FAST(end_time);
183184

184185
INSTR_TIME_SUBTRACT(end_time, start_time);
185186

src/bin/pgbench/pgbench.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7265,6 +7265,9 @@ main(int argc, char **argv)
72657265
initRandomState(&state[i].cs_func_rs);
72667266
}
72677267

7268+
/* initialize high-precision interval timing */
7269+
INSTR_TIME_INITIALIZE();
7270+
72687271
/* opening connection... */
72697272
con = doConnect();
72707273
if (con == NULL)

src/bin/psql/startup.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "help.h"
2525
#include "input.h"
2626
#include "mainloop.h"
27+
#include "portability/instr_time.h"
2728
#include "settings.h"
2829

2930
/*
@@ -327,6 +328,9 @@ main(int argc, char *argv[])
327328

328329
PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
329330

331+
/* initialize high-precision interval timing */
332+
INSTR_TIME_INITIALIZE();
333+
330334
SyncVariables();
331335

332336
if (options.list_dbs)

src/common/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ OBJS_COMMON = \
5959
file_perm.o \
6060
file_utils.o \
6161
hashfn.o \
62+
instr_time.o \
6263
ip.o \
6364
jsonapi.o \
6465
keywords.o \

src/common/instr_time.c

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* instr_time.c
4+
* Non-inline parts of the portable high-precision interval timing
5+
* implementation
6+
*
7+
* Portions Copyright (c) 2022, PostgreSQL Global Development Group
8+
*
9+
*
10+
* IDENTIFICATION
11+
* src/common/instr_time.c
12+
*
13+
*-------------------------------------------------------------------------
14+
*/
15+
#include "postgres.h"
16+
17+
#include "portability/instr_time.h"
18+
19+
#ifndef WIN32
20+
/*
21+
* Stores what the number of cycles needs to be multiplied with to end up
22+
* with nanoseconds using integer math. See comment in pg_initialize_rdtsc()
23+
* for more details.
24+
*
25+
* By default assume we are using clock_gettime() as a fallback which uses
26+
* nanoseconds as ticks. Hence, we set the multiplier to the precision scalar
27+
* so that the division in INSTR_TIME_GET_NANOSEC() won't change the nanoseconds.
28+
*
29+
* When using the RDTSC instruction directly this is filled in during initialization
30+
* based on the relevant CPUID fields.
31+
*/
32+
int64 ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;
33+
int64 ticks_per_sec = NS_PER_S;
34+
int64 max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
35+
36+
#if defined(__x86_64__) && defined(__linux__)
37+
/*
38+
* Indicates if RDTSC can be used (Linux/x86 only, when OS uses TSC clocksource)
39+
*/
40+
bool has_rdtsc = false;
41+
42+
/*
43+
* Indicates if RDTSCP can be used. True if RDTSC can be used and RDTSCP is available.
44+
*/
45+
bool has_rdtscp = false;
46+
47+
#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */
48+
#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d) /* KVMKVMKVM */
49+
50+
static bool
51+
get_tsc_frequency_khz(uint32 *tsc_freq)
52+
{
53+
uint32 r[4];
54+
55+
if (__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]) && r[2] > 0)
56+
{
57+
if (r[0] == 0 || r[1] == 0)
58+
return false;
59+
60+
*tsc_freq = r[2] / 1000 * r[1] / r[0];
61+
return true;
62+
}
63+
64+
/* Some CPUs only report frequency in 16H */
65+
if (__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3]))
66+
{
67+
*tsc_freq = r[0] * 1000;
68+
return true;
69+
}
70+
71+
/*
72+
* Check if we have a KVM or VMware Hypervisor passing down TSC frequency
73+
* to us in a guest VM
74+
*
75+
* Note that accessing the 0x40000000 leaf for Hypervisor info requires
76+
* use of __cpuidex to set ECX to 0.
77+
*
78+
* TODO: We need to check whether our compiler is new enough
79+
* (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95973)
80+
*/
81+
__cpuidex((int32 *) r, 0x40000000, 0);
82+
if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r)))
83+
{
84+
__cpuidex((int32 *) r, 0x40000010, 0);
85+
if (r[0] > 0)
86+
{
87+
*tsc_freq = r[0];
88+
return true;
89+
}
90+
}
91+
92+
return false;
93+
}
94+
95+
static bool
96+
is_rdtscp_available()
97+
{
98+
uint32 r[4];
99+
100+
return __get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3]) > 0 && (r[3] & (1 << 27)) != 0;
101+
}
102+
103+
/*
104+
* Decide whether we use the RDTSC instruction at runtime, for Linux/x86,
105+
* instead of incurring the overhead of a full clock_gettime() call.
106+
*
107+
* This can't be reliably determined at compile time, since the
108+
* availability of an "invariant" TSC (that is not affected by CPU
109+
* frequency changes) is dependent on the CPU architecture. Additionally,
110+
* there are cases where TSC availability is impacted by virtualization,
111+
* where a simple cpuid feature check would not be enough.
112+
*
113+
* Since Linux already does a significant amount of work to determine
114+
* whether TSC is a viable clock source, decide based on that.
115+
*/
116+
void
117+
pg_initialize_rdtsc(void)
118+
{
119+
FILE *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
120+
121+
if (fp)
122+
{
123+
char buf[128];
124+
125+
if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0)
126+
{
127+
/*
128+
* Compute baseline CPU peformance, determines speed at which
129+
* RDTSC advances.
130+
*/
131+
uint32 tsc_freq;
132+
133+
if (get_tsc_frequency_khz(&tsc_freq))
134+
{
135+
/*
136+
* Ticks to nanoseconds conversion requires floating point
137+
* math because because:
138+
*
139+
* sec = ticks / frequency_hz ns = ticks / frequency_hz *
140+
* 1,000,000,000 ns = ticks * (1,000,000,000 / frequency_hz)
141+
* ns = ticks * (1,000,000 / frequency_khz) <-- now in
142+
* kilohertz
143+
*
144+
* Here, 'ns' is usually a floating number. For example for a
145+
* 2.5 GHz CPU the scaling factor becomes 1,000,000 /
146+
* 2,500,000 = 1.2.
147+
*
148+
* To be able to use integer math we work around the lack of
149+
* precision. We first scale the integer up and after the
150+
* multiplication by the number of ticks in
151+
* INSTR_TIME_GET_NANOSEC() we divide again by the same value.
152+
* We picked the scaler such that it provides enough precision
153+
* and is a power-of-two which allows for shifting instead of
154+
* doing an integer division.
155+
*/
156+
ticks_per_ns_scaled = INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq;
157+
ticks_per_sec = tsc_freq * 1000; /* KHz->Hz */
158+
max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
159+
160+
has_rdtsc = true;
161+
has_rdtscp = is_rdtscp_available();
162+
}
163+
}
164+
165+
fclose(fp);
166+
}
167+
}
168+
#endif /* defined(__x86_64__) && defined(__linux__) */
169+
170+
#endif /* WIN32 */

src/common/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ common_sources = files(
1313
'file_perm.c',
1414
'file_utils.c',
1515
'hashfn.c',
16+
'instr_time.c',
1617
'ip.c',
1718
'jsonapi.c',
1819
'keywords.c',

0 commit comments

Comments
 (0)