|
8 | 8 | *
|
9 | 9 | *
|
10 | 10 | * IDENTIFICATION
|
11 |
| - * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.167 2008/05/27 00:13:09 tgl Exp $ |
| 11 | + * $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.168 2008/09/07 04:20:00 tgl Exp $ |
12 | 12 | *
|
13 | 13 | *-------------------------------------------------------------------------
|
14 | 14 | */
|
@@ -39,6 +39,9 @@ typedef struct
|
39 | 39 | pg_wchar *wstr2; /* note: these are palloc'd */
|
40 | 40 | int len1; /* string lengths in logical characters */
|
41 | 41 | int len2;
|
| 42 | + /* Skip table for Boyer-Moore-Horspool search algorithm: */ |
| 43 | + int skiptablemask; /* mask for ANDing with skiptable subscripts */ |
| 44 | + int skiptable[256]; /* skip distance for given mismatched char */ |
42 | 45 | } TextPositionState;
|
43 | 46 |
|
44 | 47 | #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
|
@@ -753,7 +756,7 @@ text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
|
753 | 756 | * If we're working with an untoasted source, no need to do an extra
|
754 | 757 | * copying step.
|
755 | 758 | */
|
756 |
| - if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) || |
| 759 | + if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) || |
757 | 760 | VARATT_IS_EXTERNAL(DatumGetPointer(str)))
|
758 | 761 | slice = DatumGetTextPSlice(str, slice_start, slice_size);
|
759 | 762 | else
|
@@ -866,6 +869,7 @@ text_position(text *t1, text *t2)
|
866 | 869 | return result;
|
867 | 870 | }
|
868 | 871 |
|
| 872 | + |
869 | 873 | /*
|
870 | 874 | * text_position_setup, text_position_next, text_position_cleanup -
|
871 | 875 | * Component steps of text_position()
|
@@ -909,64 +913,215 @@ text_position_setup(text *t1, text *t2, TextPositionState *state)
|
909 | 913 | state->len1 = len1;
|
910 | 914 | state->len2 = len2;
|
911 | 915 | }
|
| 916 | + |
| 917 | + /* |
| 918 | + * Prepare the skip table for Boyer-Moore-Horspool searching. In these |
| 919 | + * notes we use the terminology that the "haystack" is the string to be |
| 920 | + * searched (t1) and the "needle" is the pattern being sought (t2). |
| 921 | + * |
| 922 | + * If the needle is empty or bigger than the haystack then there is no |
| 923 | + * point in wasting cycles initializing the table. We also choose not |
| 924 | + * to use B-M-H for needles of length 1, since the skip table can't |
| 925 | + * possibly save anything in that case. |
| 926 | + */ |
| 927 | + if (len1 >= len2 && len2 > 1) |
| 928 | + { |
| 929 | + int searchlength = len1 - len2; |
| 930 | + int skiptablemask; |
| 931 | + int last; |
| 932 | + int i; |
| 933 | + |
| 934 | + /* |
| 935 | + * First we must determine how much of the skip table to use. The |
| 936 | + * declaration of TextPositionState allows up to 256 elements, but for |
| 937 | + * short search problems we don't really want to have to initialize so |
| 938 | + * many elements --- it would take too long in comparison to the |
| 939 | + * actual search time. So we choose a useful skip table size based on |
| 940 | + * the haystack length minus the needle length. The closer the needle |
| 941 | + * length is to the haystack length the less useful skipping becomes. |
| 942 | + * |
| 943 | + * Note: since we use bit-masking to select table elements, the skip |
| 944 | + * table size MUST be a power of 2, and so the mask must be 2^N-1. |
| 945 | + */ |
| 946 | + if (searchlength < 16) |
| 947 | + skiptablemask = 3; |
| 948 | + else if (searchlength < 64) |
| 949 | + skiptablemask = 7; |
| 950 | + else if (searchlength < 128) |
| 951 | + skiptablemask = 15; |
| 952 | + else if (searchlength < 512) |
| 953 | + skiptablemask = 31; |
| 954 | + else if (searchlength < 2048) |
| 955 | + skiptablemask = 63; |
| 956 | + else if (searchlength < 4096) |
| 957 | + skiptablemask = 127; |
| 958 | + else |
| 959 | + skiptablemask = 255; |
| 960 | + state->skiptablemask = skiptablemask; |
| 961 | + |
| 962 | + /* |
| 963 | + * Initialize the skip table. We set all elements to the needle |
| 964 | + * length, since this is the correct skip distance for any character |
| 965 | + * not found in the needle. |
| 966 | + */ |
| 967 | + for (i = 0; i <= skiptablemask; i++) |
| 968 | + state->skiptable[i] = len2; |
| 969 | + |
| 970 | + /* |
| 971 | + * Now examine the needle. For each character except the last one, |
| 972 | + * set the corresponding table element to the appropriate skip |
| 973 | + * distance. Note that when two characters share the same skip table |
| 974 | + * entry, the one later in the needle must determine the skip distance. |
| 975 | + */ |
| 976 | + last = len2 - 1; |
| 977 | + |
| 978 | + if (!state->use_wchar) |
| 979 | + { |
| 980 | + const char *str2 = state->str2; |
| 981 | + |
| 982 | + for (i = 0; i < last; i++) |
| 983 | + state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i; |
| 984 | + } |
| 985 | + else |
| 986 | + { |
| 987 | + const pg_wchar *wstr2 = state->wstr2; |
| 988 | + |
| 989 | + for (i = 0; i < last; i++) |
| 990 | + state->skiptable[wstr2[i] & skiptablemask] = last - i; |
| 991 | + } |
| 992 | + } |
912 | 993 | }
|
913 | 994 |
|
914 | 995 | static int
|
915 | 996 | text_position_next(int start_pos, TextPositionState *state)
|
916 | 997 | {
|
917 |
| - int pos = 0, |
918 |
| - p, |
919 |
| - px; |
| 998 | + int haystack_len = state->len1; |
| 999 | + int needle_len = state->len2; |
| 1000 | + int skiptablemask = state->skiptablemask; |
920 | 1001 |
|
921 | 1002 | Assert(start_pos > 0); /* else caller error */
|
922 | 1003 |
|
923 |
| - if (state->len2 <= 0) |
| 1004 | + if (needle_len <= 0) |
924 | 1005 | return start_pos; /* result for empty pattern */
|
925 | 1006 |
|
| 1007 | + start_pos--; /* adjust for zero based arrays */ |
| 1008 | + |
| 1009 | + /* Done if the needle can't possibly fit */ |
| 1010 | + if (haystack_len < start_pos + needle_len) |
| 1011 | + return 0; |
| 1012 | + |
926 | 1013 | if (!state->use_wchar)
|
927 | 1014 | {
|
928 | 1015 | /* simple case - single byte encoding */
|
929 |
| - char *p1 = state->str1; |
930 |
| - char *p2 = state->str2; |
| 1016 | + const char *haystack = state->str1; |
| 1017 | + const char *needle = state->str2; |
| 1018 | + const char *haystack_end = &haystack[haystack_len]; |
| 1019 | + const char *hptr; |
931 | 1020 |
|
932 |
| - /* no use in searching str past point where search_str will fit */ |
933 |
| - px = (state->len1 - state->len2); |
934 |
| - |
935 |
| - p1 += start_pos - 1; |
| 1021 | + if (needle_len == 1) |
| 1022 | + { |
| 1023 | + /* No point in using B-M-H for a one-character needle */ |
| 1024 | + char nchar = *needle; |
936 | 1025 |
|
937 |
| - for (p = start_pos - 1; p <= px; p++) |
| 1026 | + hptr = &haystack[start_pos]; |
| 1027 | + while (hptr < haystack_end) |
| 1028 | + { |
| 1029 | + if (*hptr == nchar) |
| 1030 | + return hptr - haystack + 1; |
| 1031 | + hptr++; |
| 1032 | + } |
| 1033 | + } |
| 1034 | + else |
938 | 1035 | {
|
939 |
| - if ((*p1 == *p2) && (strncmp(p1, p2, state->len2) == 0)) |
| 1036 | + const char *needle_last = &needle[needle_len - 1]; |
| 1037 | + |
| 1038 | + /* Start at startpos plus the length of the needle */ |
| 1039 | + hptr = &haystack[start_pos + needle_len - 1]; |
| 1040 | + while (hptr < haystack_end) |
940 | 1041 | {
|
941 |
| - pos = p + 1; |
942 |
| - break; |
| 1042 | + /* Match the needle scanning *backward* */ |
| 1043 | + const char *nptr; |
| 1044 | + const char *p; |
| 1045 | + |
| 1046 | + nptr = needle_last; |
| 1047 | + p = hptr; |
| 1048 | + while (*nptr == *p) |
| 1049 | + { |
| 1050 | + /* Matched it all? If so, return 1-based position */ |
| 1051 | + if (nptr == needle) |
| 1052 | + return p - haystack + 1; |
| 1053 | + nptr--, p--; |
| 1054 | + } |
| 1055 | + /* |
| 1056 | + * No match, so use the haystack char at hptr to decide how |
| 1057 | + * far to advance. If the needle had any occurrence of that |
| 1058 | + * character (or more precisely, one sharing the same |
| 1059 | + * skiptable entry) before its last character, then we advance |
| 1060 | + * far enough to align the last such needle character with |
| 1061 | + * that haystack position. Otherwise we can advance by the |
| 1062 | + * whole needle length. |
| 1063 | + */ |
| 1064 | + hptr += state->skiptable[(unsigned char) *hptr & skiptablemask]; |
943 | 1065 | }
|
944 |
| - p1++; |
945 | 1066 | }
|
946 | 1067 | }
|
947 | 1068 | else
|
948 | 1069 | {
|
949 |
| - /* not as simple - multibyte encoding */ |
950 |
| - pg_wchar *p1 = state->wstr1; |
951 |
| - pg_wchar *p2 = state->wstr2; |
| 1070 | + /* The multibyte char version. This works exactly the same way. */ |
| 1071 | + const pg_wchar *haystack = state->wstr1; |
| 1072 | + const pg_wchar *needle = state->wstr2; |
| 1073 | + const pg_wchar *haystack_end = &haystack[haystack_len]; |
| 1074 | + const pg_wchar *hptr; |
952 | 1075 |
|
953 |
| - /* no use in searching str past point where search_str will fit */ |
954 |
| - px = (state->len1 - state->len2); |
955 |
| - |
956 |
| - p1 += start_pos - 1; |
| 1076 | + if (needle_len == 1) |
| 1077 | + { |
| 1078 | + /* No point in using B-M-H for a one-character needle */ |
| 1079 | + pg_wchar nchar = *needle; |
957 | 1080 |
|
958 |
| - for (p = start_pos - 1; p <= px; p++) |
| 1081 | + hptr = &haystack[start_pos]; |
| 1082 | + while (hptr < haystack_end) |
| 1083 | + { |
| 1084 | + if (*hptr == nchar) |
| 1085 | + return hptr - haystack + 1; |
| 1086 | + hptr++; |
| 1087 | + } |
| 1088 | + } |
| 1089 | + else |
959 | 1090 | {
|
960 |
| - if ((*p1 == *p2) && (pg_wchar_strncmp(p1, p2, state->len2) == 0)) |
| 1091 | + const pg_wchar *needle_last = &needle[needle_len - 1]; |
| 1092 | + |
| 1093 | + /* Start at startpos plus the length of the needle */ |
| 1094 | + hptr = &haystack[start_pos + needle_len - 1]; |
| 1095 | + while (hptr < haystack_end) |
961 | 1096 | {
|
962 |
| - pos = p + 1; |
963 |
| - break; |
| 1097 | + /* Match the needle scanning *backward* */ |
| 1098 | + const pg_wchar *nptr; |
| 1099 | + const pg_wchar *p; |
| 1100 | + |
| 1101 | + nptr = needle_last; |
| 1102 | + p = hptr; |
| 1103 | + while (*nptr == *p) |
| 1104 | + { |
| 1105 | + /* Matched it all? If so, return 1-based position */ |
| 1106 | + if (nptr == needle) |
| 1107 | + return p - haystack + 1; |
| 1108 | + nptr--, p--; |
| 1109 | + } |
| 1110 | + /* |
| 1111 | + * No match, so use the haystack char at hptr to decide how |
| 1112 | + * far to advance. If the needle had any occurrence of that |
| 1113 | + * character (or more precisely, one sharing the same |
| 1114 | + * skiptable entry) before its last character, then we advance |
| 1115 | + * far enough to align the last such needle character with |
| 1116 | + * that haystack position. Otherwise we can advance by the |
| 1117 | + * whole needle length. |
| 1118 | + */ |
| 1119 | + hptr += state->skiptable[*hptr & skiptablemask]; |
964 | 1120 | }
|
965 |
| - p1++; |
966 | 1121 | }
|
967 | 1122 | }
|
968 | 1123 |
|
969 |
| - return pos; |
| 1124 | + return 0; /* not found */ |
970 | 1125 | }
|
971 | 1126 |
|
972 | 1127 | static void
|
|
0 commit comments