Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 011b51c

Browse files
committed
Marginal hacking to improve the speed of COPY OUT. A bit of profiling showed that CopyAttributeOutText was taking an unreasonable fraction of the backend run time (like 66%!) on the following trivial test case:

$ time psql -c "copy (select repeat('xyzzy',50) from generate_series(1,10000000)) to stdout" regression >/dev/null

The time is all being spent on scanning the string for characters to be escaped, and most of the time there aren't any. Some tweaking to take as many tests as possible out of the inner loop reduced the runtime of this example by more than 10%. In a real-world case the speedup wouldn't be as large, but it still seems worth adding a few lines here.
1 parent 6775c01 commit 011b51c

File tree

1 file changed

+74
-54
lines changed

1 file changed

+74
-54
lines changed

src/backend/commands/copy.c

Lines changed: 74 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.283 2007/04/27 22:05:46 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.284 2007/06/17 23:39:28 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -3075,68 +3075,88 @@ CopyAttributeOutText(CopyState cstate, char *string)
30753075
* We have to grovel through the string searching for control characters
30763076
* and instances of the delimiter character. In most cases, though, these
30773077
* are infrequent. To avoid overhead from calling CopySendData once per
3078-
* character, we dump out all characters between replaceable characters in
3078+
* character, we dump out all characters between escaped characters in
30793079
* a single call. The loop invariant is that the data from "start" to
30803080
* "ptr" can be sent literally, but hasn't yet been.
3081+
*
3082+
* We can skip pg_encoding_mblen() overhead when encoding is safe, because
3083+
* in valid backend encodings, extra bytes of a multibyte character never
3084+
* look like ASCII. This loop is sufficiently performance-critical that
3085+
* it's worth making two copies of it to get the IS_HIGHBIT_SET() test
3086+
* out of the normal safe-encoding path.
30813087
*/
3082-
start = ptr;
3083-
while ((c = *ptr) != '\0')
3088+
if (cstate->encoding_embeds_ascii)
30843089
{
3085-
switch (c)
3090+
start = ptr;
3091+
while ((c = *ptr) != '\0')
30863092
{
3087-
case '\b':
3088-
DUMPSOFAR();
3089-
CopySendString(cstate, "\\b");
3090-
start = ++ptr;
3091-
break;
3092-
case '\f':
3093-
DUMPSOFAR();
3094-
CopySendString(cstate, "\\f");
3095-
start = ++ptr;
3096-
break;
3097-
case '\n':
3098-
DUMPSOFAR();
3099-
CopySendString(cstate, "\\n");
3100-
start = ++ptr;
3101-
break;
3102-
case '\r':
3103-
DUMPSOFAR();
3104-
CopySendString(cstate, "\\r");
3105-
start = ++ptr;
3106-
break;
3107-
case '\t':
3108-
DUMPSOFAR();
3109-
CopySendString(cstate, "\\t");
3110-
start = ++ptr;
3111-
break;
3112-
case '\v':
3093+
if (c == '\\' || c == delimc)
3094+
{
31133095
DUMPSOFAR();
3114-
CopySendString(cstate, "\\v");
3115-
start = ++ptr;
3116-
break;
3117-
case '\\':
3096+
CopySendChar(cstate, '\\');
3097+
start = ptr++; /* we include char in next run */
3098+
}
3099+
else if ((unsigned char) c < (unsigned char) 0x20)
3100+
{
3101+
switch (c)
3102+
{
3103+
/* \r and \n must be escaped, the others are traditional */
3104+
case '\b':
3105+
case '\f':
3106+
case '\n':
3107+
case '\r':
3108+
case '\t':
3109+
case '\v':
3110+
DUMPSOFAR();
3111+
CopySendChar(cstate, '\\');
3112+
start = ptr++; /* we include char in next run */
3113+
break;
3114+
default:
3115+
/* All ASCII control chars are length 1 */
3116+
ptr++;
3117+
break;
3118+
}
3119+
}
3120+
else if (IS_HIGHBIT_SET(c))
3121+
ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
3122+
else
3123+
ptr++;
3124+
}
3125+
}
3126+
else
3127+
{
3128+
start = ptr;
3129+
while ((c = *ptr) != '\0')
3130+
{
3131+
if (c == '\\' || c == delimc)
3132+
{
31183133
DUMPSOFAR();
3119-
CopySendString(cstate, "\\\\");
3120-
start = ++ptr;
3121-
break;
3122-
default:
3123-
if (c == delimc)
3134+
CopySendChar(cstate, '\\');
3135+
start = ptr++; /* we include char in next run */
3136+
}
3137+
else if ((unsigned char) c < (unsigned char) 0x20)
3138+
{
3139+
switch (c)
31243140
{
3125-
DUMPSOFAR();
3126-
CopySendChar(cstate, '\\');
3127-
start = ptr; /* we include char in next run */
3141+
/* \r and \n must be escaped, the others are traditional */
3142+
case '\b':
3143+
case '\f':
3144+
case '\n':
3145+
case '\r':
3146+
case '\t':
3147+
case '\v':
3148+
DUMPSOFAR();
3149+
CopySendChar(cstate, '\\');
3150+
start = ptr++; /* we include char in next run */
3151+
break;
3152+
default:
3153+
/* All ASCII control chars are length 1 */
3154+
ptr++;
3155+
break;
31283156
}
3129-
3130-
/*
3131-
* We can skip pg_encoding_mblen() overhead when encoding is
3132-
* safe, because in valid backend encodings, extra bytes of a
3133-
* multibyte character never look like ASCII.
3134-
*/
3135-
if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
3136-
ptr += pg_encoding_mblen(cstate->client_encoding, ptr);
3137-
else
3138-
ptr++;
3139-
break;
3157+
}
3158+
else
3159+
ptr++;
31403160
}
31413161
}
31423162

0 commit comments

Comments
 (0)