From 9eb78beeae01f2f0ccafc5d66a2003ea7e3952f9 Mon Sep 17 00:00:00 2001
From: Neil Conway
Date: Tue, 20 Mar 2007 05:45:00 +0000
Subject: Add three new regexp functions: regexp_matches,
 regexp_split_to_array, and regexp_split_to_table. These functions provide
 access to the capture groups resulting from a POSIX regular expression match,
 and provide the ability to split a string on a POSIX regular expression,
 respectively. Patch from Jeremy Drake; code review by Neil Conway, additional
 comments and suggestions from Tom and Peter E.

This patch bumps the catversion, adds some regression tests,
and updates the docs.
---
 doc/src/sgml/func.sgml | 184 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 176 insertions(+), 8 deletions(-)

(limited to 'doc/src')
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b8be507f2df..084db0d40e5 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.369 2007/02/20 19:59:04 momjian Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.370 2007/03/20 05:44:59 neilc Exp $ -->
 
  <chapter id="functions">
   <title>Functions and Operators</title>
@@ -1468,17 +1468,52 @@
       </row>
 
       <row>
-       <entry><literal><function>regexp_replace</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type>, <parameter>replacement</parameter> <type>text</type> [,<parameter>flags</parameter> <type>text</type>])</literal></entry>
+       <entry><literal><function>regexp_matches</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
+       <entry><type>setof text[]</type></entry>
+       <entry>
+        Return all capture groups resulting from matching a POSIX regular
+        expression against the <parameter>string</parameter>. See
+        <xref linkend="functions-posix-regexp"> for more information.
+       </entry>
+       <entry><literal>regexp_matches('foobarbequebaz', '(bar)(beque)')</literal></entry>
+       <entry><literal>{bar,beque}</literal></entry>
+      </row>
+
+      <row>
+       <entry><literal><function>regexp_replace</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type>, <parameter>replacement</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
        <entry><type>text</type></entry>
        <entry>
         Replace substring matching POSIX regular expression. See
-        <xref linkend="functions-matching"> for more information on pattern
-        matching.
+        <xref linkend="functions-posix-regexp"> for more information.
        </entry>
        <entry><literal>regexp_replace('Thomas', '.[mN]a.', 'M')</literal></entry>
        <entry><literal>ThM</literal></entry>
       </row>
 
+      <row>
+       <entry><literal><function>regexp_split_to_array</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
+       <entry><type>text[]</type></entry>
+       <entry>
+        Split <parameter>string</parameter> using a POSIX regular expression as
+        the delimiter.  See <xref linkend="functions-posix-regexp"> for more
+        information.
+       </entry>
+       <entry><literal>regexp_split_to_array('hello world', E'\\s+')</literal></entry>
+       <entry><literal>{hello,world}</literal></entry>
+      </row>
+
+      <row>
+       <entry><literal><function>regexp_split_to_table</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry>
+       <entry><type>setof text</type></entry>
+       <entry>
+        Split <parameter>string</parameter> using a POSIX regular expression as
+        the delimiter.  See <xref linkend="functions-posix-regexp"> for more
+        information.
+       </entry>
+       <entry><literal>regexp_split_to_table('hello world', E'\\s+')</literal></entry>
+       <entry><literal>hello</literal><para><literal>world</literal></para> (2 rows)</entry>
+      </row>
+
       <row>
        <entry><literal><function>repeat</function>(<parameter>string</parameter> <type>text</type>, <parameter>number</parameter> <type>int</type>)</literal></entry>
        <entry><type>text</type></entry>
@@ -2883,9 +2918,6 @@ cast(-44 as bit(12))           <lineannotation>111111010100</lineannotation>
    <indexterm>
     <primary>substring</primary>
    </indexterm>
-   <indexterm>
-    <primary>regexp_replace</primary>
-   </indexterm>
 
 <synopsis>
 <replaceable>string</replaceable> SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional>
@@ -3004,6 +3036,21 @@ substring('foobar' from '#"o_b#"%' for '#')    <lineannotation>NULL</lineannotat
     <primary>regular expression</primary>
     <seealso>pattern matching</seealso>
    </indexterm>
+   <indexterm>
+    <primary>substring</primary>
+   </indexterm>
+   <indexterm>
+    <primary>regexp_replace</primary>
+   </indexterm>
+   <indexterm>
+    <primary>regexp_matches</primary>
+   </indexterm>
+   <indexterm>
+    <primary>regexp_split_to_table</primary>
+   </indexterm>
+   <indexterm>
+    <primary>regexp_split_to_array</primary>
+   </indexterm>
 
    <para>
     <xref linkend="functions-posix-table"> lists the available
@@ -3134,7 +3181,10 @@ substring('foobar' from 'o(.)b')   <lineannotation>o</lineannotation>
      string containing zero or more single-letter flags that change the
      function's behavior.  Flag <literal>i</> specifies case-insensitive
      matching, while flag <literal>g</> specifies replacement of each matching
-     substring rather than only the first one.
+     substring rather than only the first one.  Other supported flags are
+     <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
+     <literal>x</>, whose meanings correspond to those shown in
+     <xref linkend="posix-embedded-options-table">.
     </para>
 
    <para>
@@ -3149,6 +3199,124 @@ regexp_replace('foobarbaz', 'b(..)', E'X\\1Y', 'g')
 </programlisting>
    </para>
 
+    <para>
+     The <function>regexp_matches</> function returns all of the capture
+     groups resulting from matching a POSIX regular expression pattern.
+     It has the syntax
+     <function>regexp_matches</function>(<replaceable>string</>, <replaceable>pattern</>
+     <optional>, <replaceable>flags</> </optional>).
+     If there is no match to the <replaceable>pattern</>, the function returns no rows.
+     If there is a match, the function returns the contents of all of the capture groups
+     in a text array, or if there were no capture groups in the pattern, it returns the
+     contents of the entire match as a single-element text array.
+     The <replaceable>flags</> parameter is an optional text
+     string containing zero or more single-letter flags that change the
+     function's behavior.  Flag <literal>i</> specifies case-insensitive
+     matching, while flag <literal>g</> causes the function to return a row
+     for each match rather than only for the first one.  Other supported
+     flags are <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
+     <literal>x</>, whose meanings are described in
+     <xref linkend="posix-embedded-options-table">.
+    </para>
+
+   <para>
+    Some examples:
+<programlisting>
+SELECT regexp_matches('foobarbequebaz', '(bar)(beque)');
+ regexp_matches 
+----------------
+ {bar,beque}
+(1 row)
+
+SELECT regexp_matches('foobarbequebazilbarfbonk', '(b[^b]+)(b[^b]+)', 'g');
+ regexp_matches 
+----------------
+ {bar,beque}
+ {bazil,barf}
+(2 rows)
+
+SELECT regexp_matches('foobarbequebaz', 'barbeque');
+ regexp_matches 
+----------------
+ {barbeque}
+(1 row)
+</programlisting>
+   </para>
+
+    <para>
+     The <function>regexp_split_to_table</> function splits a string using a POSIX
+     regular expression pattern as a delimiter.  It has the syntax
+     <function>regexp_split_to_table</function>(<replaceable>string</>, <replaceable>pattern</>
+     <optional>, <replaceable>flags</> </optional>).
+     If there is no match to the <replaceable>pattern</>, the function returns the
+     <replaceable>string</>.  If there is at least one match, for each match it returns
+     the text from the end of the last match (or the beginning of the string)
+     to the beginning of the match.  When there are no more matches, it
+     returns the text from the end of the last match to the end of the string.
+     The <replaceable>flags</> parameter is an optional text string containing
+     zero or more single-letter flags that change the function's behavior.
+     <function>regexp_split_to_table</function> supports the flags <literal>i</>,
+     <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and
+     <literal>x</>, whose meanings are described in
+     <xref linkend="posix-embedded-options-table">.
+    </para>
+
+    <para>
+     The <function>regexp_split_to_array</> function behaves the same as
+     <function>regexp_split_to_table</>, except that <function>regexp_split_to_array</>
+     returns its results as a <type>text[]</>.  It has the syntax
+     <function>regexp_split_to_array</function>(<replaceable>string</>, <replaceable>pattern</>
+     <optional>, <replaceable>flags</> </optional>).
+     The parameters are the same as for <function>regexp_split_to_table</>.
+    </para>
+
+   <para>
+    Some examples:
+<programlisting>
+
+SELECT foo FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', E'\\s+') AS foo;
+  foo   
+--------
+ the    
+ quick  
+ brown  
+ fox    
+ jumped 
+ over   
+ the    
+ lazy   
+ dog    
+(9 rows)
+
+SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', E'\\s+');
+              regexp_split_to_array             
+------------------------------------------------
+ {the,quick,brown,fox,jumped,over,the,lazy,dog}
+(1 row)
+
+SELECT foo FROM regexp_split_to_table('the quick brown fox', E'\\s*') AS foo;
+ foo 
+-----
+ t         
+ h         
+ e         
+ q         
+ u         
+ i         
+ c         
+ k         
+ b         
+ r         
+ o         
+ w         
+ n         
+ f         
+ o         
+ x         
+(16 rows)
+</programlisting>
+   </para>
+
    <para>
     <productname>PostgreSQL</productname>'s regular expressions are implemented
     using a package written by Henry Spencer.  Much of
-- 
cgit v1.2.3