  * Copyright (c) 2001-2006, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <unistd.h>
 #include <math.h>
+#include <unistd.h>
 
 #include "executor/instrument.h"
 
-/* This is the function that is used to determine the sampling intervals. In
- * general, if the function is f(x), then for N tuples we will take on the
- * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
- * number of samples that would be collected over 1,000,000 tuples.
-
- f(x) = x        => log2(N)          20
- f(x) = x^(1/2)  => 2 * N^(1/2)    2000
- f(x) = x^(1/3)  => 1.5 * N^(2/3) 15000
-
+/*
+ * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
+ * by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
+ * (Because that requires a kernel call on most systems, it's expensive.)
+ *
+ * This macro determines the sampling interval: that is, after how many more
+ * iterations will we take the next time sample, given that niters iterations
+ * have occurred already.  In general, if the function is f(x), then for N
+ * iterations we will take on the order of integral(1/f(x), x=0..N)
+ * samples.  Some examples follow, with the number of samples that would be
+ * collected over 1,000,000 iterations:
+ *
+ *    f(x) = x        => log2(N)          20
+ *    f(x) = x^(1/2)  => 2 * N^(1/2)    2000
+ *    f(x) = x^(1/3)  => 1.5 * N^(2/3) 15000
+ *
  * I've chosen the last one as it seems to provide a good compromise between
  * low overhead but still getting a meaningful number of samples. However,
  * not all machines have the cbrt() function so on those we substitute
  * sqrt(). The difference is not very significant in the tests I made.
- */
+ *
+ * The actual sampling interval is randomized with the SampleFunc() value
+ * as the mean; this hopefully will reduce any measurement bias due to
+ * cyclic variation in the node execution time.
+ */
 #ifdef HAVE_CBRT
-#define SampleFunc cbrt
+#define SampleFunc(niters) cbrt(niters)
 #else
-#define SampleFunc sqrt
+#define SampleFunc(niters) sqrt(niters)
 #endif
 
+#define SampleInterval(niters) \
+	(SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))
+
+/*
+ * We sample at every node iteration until we've reached this threshold,
+ * so that nodes not called a large number of times are completely accurate.
+ * (Perhaps this should be a GUC variable?  But beware of messing up
+ * CalculateSampleOverhead if value is too small.)
+ */
 #define SAMPLE_THRESHOLD 50
 
+/*
+ * Determine the sampling overhead.  This only needs to be done once per
+ * backend (actually, probably once per postmaster would do ...)
+ */
 static double SampleOverhead;
-static bool SampleOverheadCalculated;
+static bool SampleOverheadCalculated = false;
 
 static void
-CalculateSampleOverhead()
+CalculateSampleOverhead(void)
 {
 	Instrumentation instr;
 	int			i;
-
-	/* We want to determine the sampling overhead, to correct
-	 * calculations later. This only needs to be done once per backend.
-	 * Is this the place? A wrong value here (due to a mistimed
-	 * task-switch) will cause bad calculations later.
-	 *
+
+	/*
+	 * We could get a wrong result due to being interrupted by task switch.
 	 * To minimize the risk we do it a few times and take the lowest.
 	 */
-
 	SampleOverhead = 1.0e6;
-
-	for( i = 0; i < 5; i++ )
+
+	for (i = 0; i < 5; i++)
 	{
 		int			j;
 		double		overhead;
-
-		memset( &instr, 0, sizeof(instr) );
-
-		/* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
-		for( j = 0; j < SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE(instr.counter) < 100e-6; i++ )
+
+		memset(&instr, 0, sizeof(instr));
+		/*
+		 * We know that samples will actually be taken up to SAMPLE_THRESHOLD,
+		 * so that's as far as we can test.
+		 */
+		for (j = 0; j < SAMPLE_THRESHOLD; j++)
 		{
-			InstrStartNode( &instr );
-			InstrStopNode( &instr, 1 );
+			InstrStartNode(&instr);
+			InstrStopNode(&instr, 1);
 		}
 		overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
-		if( overhead < SampleOverhead )
+		if (overhead < SampleOverhead)
 			SampleOverhead = overhead;
 	}
-
+
 	SampleOverheadCalculated = true;
 }
 
+
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
 InstrAlloc(int n)
 {
-	Instrumentation *instr = palloc0( n * sizeof(Instrumentation) );
+	Instrumentation *instr;
 
-	/* we don't need to do any initialization except zero 'em */
-
-	/* Calculate overhead, if not done yet */
-	if( !SampleOverheadCalculated )
+	/* Calculate sampling overhead, if not done yet in this backend */
+	if (!SampleOverheadCalculated)
 		CalculateSampleOverhead();
+
+	instr = palloc0(n * sizeof(Instrumentation));
+
+	/* we don't need to do any initialization except zero 'em */
+
 	return instr;
 }
 
@@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
 {
 	if (INSTR_TIME_IS_ZERO(instr->starttime))
 	{
-		/* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
-		if (instr->tuplecount < SAMPLE_THRESHOLD)
+		/*
+		 * Always sample if not yet up to threshold, else check whether
+		 * next threshold has been reached
+		 */
+		if (instr->itercount < SAMPLE_THRESHOLD)
 			instr->sampling = true;
-		else
+		else if (instr->itercount >= instr->nextsample)
 		{
-			/* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
-			if( instr->tuplecount > instr->nextsample )
-			{
-				instr->sampling = true;
-				/* The doubling is so the random will average 1 over time */
-				instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double) rand() / (double) RAND_MAX;
-			}
+			instr->sampling = true;
+			instr->nextsample =
+				instr->itercount + SampleInterval(instr->itercount);
 		}
 		if (instr->sampling)
 			INSTR_TIME_SET_CURRENT(instr->starttime);
@@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
 void
 InstrStopNode(Instrumentation *instr, double nTuples)
 {
-	instr_time	endtime;
-
-	/* count the returned tuples */
+	/* count the returned tuples and iterations */
 	instr->tuplecount += nTuples;
+	instr->itercount += 1;
 
+	/* measure runtime if appropriate */
 	if (instr->sampling)
 	{
+		instr_time	endtime;
+
 		if (INSTR_TIME_IS_ZERO(instr->starttime))
 		{
 			elog(DEBUG2, "InstrStopNode called without start");
@@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 #endif
 
 		INSTR_TIME_SET_ZERO(instr->starttime);
-		instr->samplecount += nTuples;
+
+		instr->samplecount += 1;
 		instr->sampling = false;
 	}
 
@@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
 	if (!INSTR_TIME_IS_ZERO(instr->starttime))
 		elog(DEBUG2, "InstrEndLoop called on running node");
 
-	/* Accumulate per-cycle statistics into totals */
+	/* Compute time spent in node */
 	totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
 
-	instr->startup += instr->firsttuple;
-
-	/* Here we take into account sampling effects. Doing it naively ends
-	 * up assuming the sampling overhead applies to all tuples, even the
-	 * ones we didn't measure. We've calculated an overhead, so we
-	 * subtract that for all samples we didn't measure. The first tuple
-	 * is also special cased, because it usually takes longer. */
-
-	if( instr->samplecount < instr->tuplecount )
+	/*
+	 * If we didn't measure runtime on every iteration, then we must increase
+	 * the measured total to account for the other iterations.  Naively
+	 * multiplying totaltime by itercount/samplecount would be wrong because
+	 * it effectively assumes the sampling overhead applies to all iterations,
+	 * even the ones we didn't measure.  (Note that what we are trying to
+	 * estimate here is the actual time spent in the node, including the
+	 * actual measurement overhead; not the time exclusive of measurement
+	 * overhead.)  We exclude the first iteration from the correction basis,
+	 * because it often takes longer than others.
+	 */
+	if (instr->itercount > instr->samplecount)
 	{
-		double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1);
-		instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
-			+ ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
+		double		per_iter;
+
+		per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
+			- SampleOverhead;
+		if (per_iter > 0)		/* sanity check */
+			totaltime += per_iter * (instr->itercount - instr->samplecount);
 	}
-	else
-		instr->total += totaltime;
-
+
+	/* Accumulate per-cycle statistics into totals */
+	instr->startup += instr->firsttuple;
+	instr->total += totaltime;
 	instr->ntuples += instr->tuplecount;
-	instr->nsamples += instr->samplecount;
 	instr->nloops += 1;
 
 	/* Reset for next cycle (if any) */
 	instr->running = false;
+	instr->sampling = false;
 	INSTR_TIME_SET_ZERO(instr->starttime);
 	INSTR_TIME_SET_ZERO(instr->counter);
 	instr->firsttuple = 0;
 	instr->tuplecount = 0;
+	instr->itercount = 0;
+	instr->samplecount = 0;
+	instr->nextsample = 0;
 }
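
To sanity-check the sample-count estimates quoted in the new comment block (integral(1/f(x), x=0..N) with f(x) = x^(1/3) gives roughly 1.5 * N^(2/3), i.e. about 15,000 samples per 1,000,000 iterations), here is a minimal standalone sketch that is not part of the commit. It mimics the patched InstrStartNode()/SampleInterval() logic, but substitutes the portable rand()/RAND_MAX and a fixed seed for PostgreSQL's random()/MAX_RANDOM_VALUE; the factor of 2.0 makes the randomized interval average out to cbrt(niters).

/* standalone sketch: estimate how many iterations get timed; compile with -lm */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define SAMPLE_THRESHOLD 50

int
main(void)
{
	double		nextsample = 0.0;
	long		samples = 0;
	long		iter;

	srand(12345);
	for (iter = 0; iter < 1000000L; iter++)
	{
		/* sample every iteration below the threshold, else on schedule */
		if (iter < SAMPLE_THRESHOLD || (double) iter >= nextsample)
		{
			samples++;
			/* mean interval is cbrt(iter); the 2.0 * U(0,1) factor averages 1 */
			nextsample = (double) iter +
				cbrt((double) iter) * 2.0 * rand() / (double) RAND_MAX;
		}
	}
	/* prints a count in the vicinity of 15,000 */
	printf("timed %ld of 1000000 iterations\n", samples);
	return 0;
}

The exact count varies with the seed, but it stays close to the 15,000 figure from the comment, which is why the cube-root schedule was judged a good trade-off between overhead and sample size.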
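
The extrapolation in the revised InstrEndLoop() is easier to see with concrete numbers. The sketch below, also not part of the commit and using made-up figures, charges each untimed iteration the per-iteration time observed on the timed ones minus SampleOverhead, instead of naively scaling totaltime by itercount/samplecount.

/* hypothetical worked example of the InstrEndLoop correction */
#include <stdio.h>

int
main(void)
{
	/* made-up figures for one node */
	double		itercount = 1000000.0;	/* total iterations */
	double		samplecount = 15050.0;	/* iterations actually timed */
	double		firsttuple = 0.001;		/* seconds spent on first iteration */
	double		totaltime = 0.30;		/* seconds measured over timed iterations */
	double		SampleOverhead = 2.0e-6;	/* seconds per timing call */
	double		per_iter;

	/* same correction as the patched InstrEndLoop() */
	per_iter = (totaltime - firsttuple) / (samplecount - 1) - SampleOverhead;
	if (per_iter > 0)
		totaltime += per_iter * (itercount - samplecount);

	printf("estimated total node time: %.3f s\n", totaltime);	/* about 17.9 s */
	return 0;
}

With these figures a naive scale-up (totaltime * itercount / samplecount, about 19.9 s) would charge the roughly 985,000 untimed iterations for sampling overhead they never paid, which is exactly the bias the new comment warns against.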