Fix bugs in contrib/pg_trgm's LIKE pattern analysis code.

tglsfdc · tglsfdc · commit b2a01b9ad1c6 · 2012-08-20T13:25:42.000-04:00
Extraction of trigrams did not process LIKE escape sequences properly, so
trigrams adjacent to an escape could be misidentified, causing index
searches to return incorrect results.

Fujii Masao
diff --git a/contrib/pg_trgm/expected/pg_trgm.out b/contrib/pg_trgm/expected/pg_trgm.out
@@ -3497,6 +3497,12 @@ select * from test2 where t like '%bcd%';
  abcdef
 (1 row)
 
+select * from test2 where t like E'%\\bcd%';
+   t    
+--------
+ abcdef
+(1 row)
+
 select * from test2 where t ilike '%BCD%';
    t    
 --------
@@ -3539,6 +3545,12 @@ select * from test2 where t like '%bcd%';
  abcdef
 (1 row)
 
+select * from test2 where t like E'%\\bcd%';
+   t    
+--------
+ abcdef
+(1 row)
+
 select * from test2 where t ilike '%BCD%';
    t    
 --------
diff --git a/contrib/pg_trgm/sql/pg_trgm.sql b/contrib/pg_trgm/sql/pg_trgm.sql
@@ -49,6 +49,7 @@ explain (costs off)
   select * from test2 where t ilike '%BCD%';
 select * from test2 where t like '%BCD%';
 select * from test2 where t like '%bcd%';
+select * from test2 where t like E'%\\bcd%';
 select * from test2 where t ilike '%BCD%';
 select * from test2 where t ilike 'qua%';
 drop index test2_idx_gin;
@@ -60,5 +61,6 @@ explain (costs off)
   select * from test2 where t ilike '%BCD%';
 select * from test2 where t like '%BCD%';
 select * from test2 where t like '%bcd%';
+select * from test2 where t like E'%\\bcd%';
 select * from test2 where t ilike '%BCD%';
 select * from test2 where t ilike 'qua%';
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
@@ -272,33 +272,36 @@ get_wildcard_part(const char *str, int lenstr,
 	const char *beginword = str;
 	const char *endword;
 	char	   *s = buf;
-	bool		in_wildcard_meta = false;
+	bool		in_leading_wildcard_meta = false;
+	bool		in_trailing_wildcard_meta = false;
 	bool		in_escape = false;
 	int			clen;
 
 	/*
-	 * Find the first word character remembering whether last character was
-	 * wildcard meta-character.
+	 * Find the first word character, remembering whether preceding character
+	 * was wildcard meta-character.  Note that the in_escape state persists
+	 * from this loop to the next one, since we may exit at a word character
+	 * that is in_escape.
 	 */
 	while (beginword - str < lenstr)
 	{
 		if (in_escape)
 		{
-			in_escape = false;
-			in_wildcard_meta = false;
 			if (iswordchr(beginword))
 				break;
+			in_escape = false;
+			in_leading_wildcard_meta = false;
 		}
 		else
 		{
 			if (ISESCAPECHAR(beginword))
 				in_escape = true;
 			else if (ISWILDCARDCHAR(beginword))
-				in_wildcard_meta = true;
+				in_leading_wildcard_meta = true;
 			else if (iswordchr(beginword))
 				break;
 			else
-				in_wildcard_meta = false;
+				in_leading_wildcard_meta = false;
 		}
 		beginword += pg_mblen(beginword);
 	}
@@ -310,11 +313,11 @@ get_wildcard_part(const char *str, int lenstr,
 		return NULL;
 
 	/*
-	 * Add left padding spaces if last character wasn't wildcard
+	 * Add left padding spaces if preceding character wasn't wildcard
 	 * meta-character.
 	 */
 	*charlen = 0;
-	if (!in_wildcard_meta)
+	if (!in_leading_wildcard_meta)
 	{
 		if (LPADDING > 0)
 		{
@@ -333,31 +336,37 @@ get_wildcard_part(const char *str, int lenstr,
 	 * string boundary.  Strip escapes during copy.
 	 */
 	endword = beginword;
-	in_wildcard_meta = false;
-	in_escape = false;
 	while (endword - str < lenstr)
 	{
 		clen = pg_mblen(endword);
 		if (in_escape)
 		{
-			in_escape = false;
-			in_wildcard_meta = false;
 			if (iswordchr(endword))
 			{
 				memcpy(s, endword, clen);
 				(*charlen)++;
 				s += clen;
 			}
 			else
+			{
+				/*
+				 * Back up endword to the escape character when stopping at
+				 * an escaped char, so that subsequent get_wildcard_part will
+				 * restart from the escape character.  We assume here that
+				 * escape chars are single-byte.
+				 */
+				endword--;
 				break;
+			}
+			in_escape = false;
 		}
 		else
 		{
 			if (ISESCAPECHAR(endword))
 				in_escape = true;
 			else if (ISWILDCARDCHAR(endword))
 			{
-				in_wildcard_meta = true;
+				in_trailing_wildcard_meta = true;
 				break;
 			}
 			else if (iswordchr(endword))
@@ -367,19 +376,16 @@ get_wildcard_part(const char *str, int lenstr,
 				s += clen;
 			}
 			else
-			{
-				in_wildcard_meta = false;
 				break;
-			}
 		}
 		endword += clen;
 	}
 
 	/*
-	 * Add right padding spaces if last character wasn't wildcard
+	 * Add right padding spaces if next character isn't wildcard
 	 * meta-character.
 	 */
-	if (!in_wildcard_meta)
+	if (!in_trailing_wildcard_meta)
 	{
 		if (RPADDING > 0)
 		{