Source Home >> Java Source 1.6.0 >> java.util.regex.Pattern V 0.09
  • 0001/*
  • 0002 * @(#)Pattern.java 1.124 07/03/15
  • 0003 *
  • 0004 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
  • 0005 * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  • 0006 */
  • 0007
  • 0008package java.util.regex;
  • 0009
  • 0010import java.security.AccessController;
  • 0011import java.security.PrivilegedAction;
  • 0012import java.text.CharacterIterator;
  • 0013import java.text.Normalizer;
  • 0014import java.util.ArrayList;
  • 0015import java.util.HashMap;
  • 0016import java.util.Arrays;
  • 0017
  • 0018
  • 0019/**
  • 0020 * A compiled representation of a regular expression.
  • 0021 *
  • 0022 * <p> A regular expression, specified as a string, must first be compiled into
  • 0023 * an instance of this class. The resulting pattern can then be used to create
  • 0024 * a {@link Matcher} object that can match arbitrary {@link
  • 0025 * java.lang.CharSequence </code>character sequences<code>} against the regular
  • 0026 * expression. All of the state involved in performing a match resides in the
  • 0027 * matcher, so many matchers can share the same pattern.
  • 0028 *
  • 0029 * <p> A typical invocation sequence is thus
  • 0030 *
  • 0031 * <blockquote><pre>
  • 0032 * Pattern p = Pattern.{@link #compile compile}("a*b");
  • 0033 * Matcher m = p.{@link #matcher matcher}("aaaaab");
  • 0034 * boolean b = m.{@link Matcher#matches matches}();</pre></blockquote>
  • 0035 *
  • 0036 * <p> A {@link #matches matches} method is defined by this class as a
  • 0037 * convenience for when a regular expression is used just once. This method
  • 0038 * compiles an expression and matches an input sequence against it in a single
  • 0039 * invocation. The statement
  • 0040 *
  • 0041 * <blockquote><pre>
  • 0042 * boolean b = Pattern.matches("a*b", "aaaaab");</pre></blockquote>
  • 0043 *
  • 0044 * is equivalent to the three statements above, though for repeated matches it
  • 0045 * is less efficient since it does not allow the compiled pattern to be reused.
  • 0046 *
  • 0047 * <p> Instances of this class are immutable and are safe for use by multiple
  • 0048 * concurrent threads. Instances of the {@link Matcher} class are not safe for
  • 0049 * such use.
  • 0050 *
  • 0051 *
  • 0052 * <a name="sum">
  • 0053 * <h4> Summary of regular-expression constructs </h4>
  • 0054 *
  • 0055 * <table border="0" cellpadding="1" cellspacing="0"
  • 0056 * summary="Regular expression constructs, and what they match">
  • 0057 *
  • 0058 * <tr align="left">
  • 0059 * <th bgcolor="#CCCCFF" align="left" id="construct">Construct</th>
  • 0060 * <th bgcolor="#CCCCFF" align="left" id="matches">Matches</th>
  • 0061 * </tr>
  • 0062 *
  • 0063 * <tr><th> </th></tr>
  • 0064 * <tr align="left"><th colspan="2" id="characters">Characters</th></tr>
  • 0065 *
  • 0066 * <tr><td valign="top" headers="construct characters"><i>x</i></td>
  • 0067 * <td headers="matches">The character <i>x</i></td></tr>
  • 0068 * <tr><td valign="top" headers="construct characters"><tt>\\</tt></td>
  • 0069 * <td headers="matches">The backslash character</td></tr>
  • 0070 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>n</i></td>
  • 0071 * <td headers="matches">The character with octal value <tt>0</tt><i>n</i>
  • 0072 * (0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
  • 0073 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>nn</i></td>
  • 0074 * <td headers="matches">The character with octal value <tt>0</tt><i>nn</i>
  • 0075 * (0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
  • 0076 * <tr><td valign="top" headers="construct characters"><tt>\0</tt><i>mnn</i></td>
  • 0077 * <td headers="matches">The character with octal value <tt>0</tt><i>mnn</i>
  • 0078 * (0 <tt><=</tt> <i>m</i> <tt><=</tt> 3,
  • 0079 * 0 <tt><=</tt> <i>n</i> <tt><=</tt> 7)</td></tr>
  • 0080 * <tr><td valign="top" headers="construct characters"><tt>\x</tt><i>hh</i></td>
  • 0081 * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hh</i></td></tr>
  • 0082 * <tr><td valign="top" headers="construct characters"><tt>\u</tt><i>hhhh</i></td>
  • 0083 * <td headers="matches">The character with hexadecimal value <tt>0x</tt><i>hhhh</i></td></tr>
  • 0084 * <tr><td valign="top" headers="matches"><tt>\t</tt></td>
  • 0085 * <td headers="matches">The tab character (<tt>'\u0009'</tt>)</td></tr>
  • 0086 * <tr><td valign="top" headers="construct characters"><tt>\n</tt></td>
  • 0087 * <td headers="matches">The newline (line feed) character (<tt>'\u000A'</tt>)</td></tr>
  • 0088 * <tr><td valign="top" headers="construct characters"><tt>\r</tt></td>
  • 0089 * <td headers="matches">The carriage-return character (<tt>'\u000D'</tt>)</td></tr>
  • 0090 * <tr><td valign="top" headers="construct characters"><tt>\f</tt></td>
  • 0091 * <td headers="matches">The form-feed character (<tt>'\u000C'</tt>)</td></tr>
  • 0092 * <tr><td valign="top" headers="construct characters"><tt>\a</tt></td>
  • 0093 * <td headers="matches">The alert (bell) character (<tt>'\u0007'</tt>)</td></tr>
  • 0094 * <tr><td valign="top" headers="construct characters"><tt>\e</tt></td>
  • 0095 * <td headers="matches">The escape character (<tt>'\u001B'</tt>)</td></tr>
  • 0096 * <tr><td valign="top" headers="construct characters"><tt>\c</tt><i>x</i></td>
  • 0097 * <td headers="matches">The control character corresponding to <i>x</i></td></tr>
  • 0098 *
  • 0099 * <tr><th> </th></tr>
  • 0100 * <tr align="left"><th colspan="2" id="classes">Character classes</th></tr>
  • 0101 *
  • 0102 * <tr><td valign="top" headers="construct classes"><tt>[abc]</tt></td>
  • 0103 * <td headers="matches"><tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (simple class)</td></tr>
  • 0104 * <tr><td valign="top" headers="construct classes"><tt>[^abc]</tt></td>
  • 0105 * <td headers="matches">Any character except <tt>a</tt>, <tt>b</tt>, or <tt>c</tt> (negation)</td></tr>
  • 0106 * <tr><td valign="top" headers="construct classes"><tt>[a-zA-Z]</tt></td>
  • 0107 * <td headers="matches"><tt>a</tt> through <tt>z</tt>
  • 0108 * or <tt>A</tt> through <tt>Z</tt>, inclusive (range)</td></tr>
  • 0109 * <tr><td valign="top" headers="construct classes"><tt>[a-d[m-p]]</tt></td>
  • 0110 * <td headers="matches"><tt>a</tt> through <tt>d</tt>,
  • 0111 * or <tt>m</tt> through <tt>p</tt>: <tt>[a-dm-p]</tt> (union)</td></tr>
  • 0112 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[def]]</tt></td>
  • 0113 * <td headers="matches"><tt>d</tt>, <tt>e</tt>, or <tt>f</tt> (intersection)</tr>
  • 0114 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^bc]]</tt></td>
  • 0115 * <td headers="matches"><tt>a</tt> through <tt>z</tt>,
  • 0116 * except for <tt>b</tt> and <tt>c</tt>: <tt>[ad-z]</tt> (subtraction)</td></tr>
  • 0117 * <tr><td valign="top" headers="construct classes"><tt>[a-z&&[^m-p]]</tt></td>
  • 0118 * <td headers="matches"><tt>a</tt> through <tt>z</tt>,
  • 0119 * and not <tt>m</tt> through <tt>p</tt>: <tt>[a-lq-z]</tt>(subtraction)</td></tr>
  • 0120 * <tr><th> </th></tr>
  • 0121 *
  • 0122 * <tr align="left"><th colspan="2" id="predef">Predefined character classes</th></tr>
  • 0123 *
  • 0124 * <tr><td valign="top" headers="construct predef"><tt>.</tt></td>
  • 0125 * <td headers="matches">Any character (may or may not match <a href="#lt">line terminators</a>)</td></tr>
  • 0126 * <tr><td valign="top" headers="construct predef"><tt>\d</tt></td>
  • 0127 * <td headers="matches">A digit: <tt>[0-9]</tt></td></tr>
  • 0128 * <tr><td valign="top" headers="construct predef"><tt>\D</tt></td>
  • 0129 * <td headers="matches">A non-digit: <tt>[^0-9]</tt></td></tr>
  • 0130 * <tr><td valign="top" headers="construct predef"><tt>\s</tt></td>
  • 0131 * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
  • 0132 * <tr><td valign="top" headers="construct predef"><tt>\S</tt></td>
  • 0133 * <td headers="matches">A non-whitespace character: <tt>[^\s]</tt></td></tr>
  • 0134 * <tr><td valign="top" headers="construct predef"><tt>\w</tt></td>
  • 0135 * <td headers="matches">A word character: <tt>[a-zA-Z_0-9]</tt></td></tr>
  • 0136 * <tr><td valign="top" headers="construct predef"><tt>\W</tt></td>
  • 0137 * <td headers="matches">A non-word character: <tt>[^\w]</tt></td></tr>
  • 0138 *
  • 0139 * <tr><th> </th></tr>
  • 0140 * <tr align="left"><th colspan="2" id="posix">POSIX character classes</b> (US-ASCII only)<b></th></tr>
  • 0141 *
  • 0142 * <tr><td valign="top" headers="construct posix"><tt>\p{Lower}</tt></td>
  • 0143 * <td headers="matches">A lower-case alphabetic character: <tt>[a-z]</tt></td></tr>
  • 0144 * <tr><td valign="top" headers="construct posix"><tt>\p{Upper}</tt></td>
  • 0145 * <td headers="matches">An upper-case alphabetic character:<tt>[A-Z]</tt></td></tr>
  • 0146 * <tr><td valign="top" headers="construct posix"><tt>\p{ASCII}</tt></td>
  • 0147 * <td headers="matches">All ASCII:<tt>[\x00-\x7F]</tt></td></tr>
  • 0148 * <tr><td valign="top" headers="construct posix"><tt>\p{Alpha}</tt></td>
  • 0149 * <td headers="matches">An alphabetic character:<tt>[\p{Lower}\p{Upper}]</tt></td></tr>
  • 0150 * <tr><td valign="top" headers="construct posix"><tt>\p{Digit}</tt></td>
  • 0151 * <td headers="matches">A decimal digit: <tt>[0-9]</tt></td></tr>
  • 0152 * <tr><td valign="top" headers="construct posix"><tt>\p{Alnum}</tt></td>
  • 0153 * <td headers="matches">An alphanumeric character:<tt>[\p{Alpha}\p{Digit}]</tt></td></tr>
  • 0154 * <tr><td valign="top" headers="construct posix"><tt>\p{Punct}</tt></td>
  • 0155 * <td headers="matches">Punctuation: One of <tt>!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~</tt></td></tr>
  • 0156 * <!-- <tt>[\!"#\$%&'\(\)\*\+,\-\./:;\<=\>\?@\[\\\]\^_`\{\|\}~]</tt>
  • 0157 * <tt>[\X21-\X2F\X31-\X40\X5B-\X60\X7B-\X7E]</tt> -->
  • 0158 * <tr><td valign="top" headers="construct posix"><tt>\p{Graph}</tt></td>
  • 0159 * <td headers="matches">A visible character: <tt>[\p{Alnum}\p{Punct}]</tt></td></tr>
  • 0160 * <tr><td valign="top" headers="construct posix"><tt>\p{Print}</tt></td>
  • 0161 * <td headers="matches">A printable character: <tt>[\p{Graph}\x20]</tt></td></tr>
  • 0162 * <tr><td valign="top" headers="construct posix"><tt>\p{Blank}</tt></td>
  • 0163 * <td headers="matches">A space or a tab: <tt>[ \t]</tt></td></tr>
  • 0164 * <tr><td valign="top" headers="construct posix"><tt>\p{Cntrl}</tt></td>
  • 0165 * <td headers="matches">A control character: <tt>[\x00-\x1F\x7F]</tt></td></tr>
  • 0166 * <tr><td valign="top" headers="construct posix"><tt>\p{XDigit}</tt></td>
  • 0167 * <td headers="matches">A hexadecimal digit: <tt>[0-9a-fA-F]</tt></td></tr>
  • 0168 * <tr><td valign="top" headers="construct posix"><tt>\p{Space}</tt></td>
  • 0169 * <td headers="matches">A whitespace character: <tt>[ \t\n\x0B\f\r]</tt></td></tr>
  • 0170 *
  • 0171 * <tr><th> </th></tr>
  • 0172 * <tr align="left"><th colspan="2">java.lang.Character classes (simple <a href="#jcc">java character type</a>)</th></tr>
  • 0173 *
  • 0174 * <tr><td valign="top"><tt>\p{javaLowerCase}</tt></td>
  • 0175 * <td>Equivalent to java.lang.Character.isLowerCase()</td></tr>
  • 0176 * <tr><td valign="top"><tt>\p{javaUpperCase}</tt></td>
  • 0177 * <td>Equivalent to java.lang.Character.isUpperCase()</td></tr>
  • 0178 * <tr><td valign="top"><tt>\p{javaWhitespace}</tt></td>
  • 0179 * <td>Equivalent to java.lang.Character.isWhitespace()</td></tr>
  • 0180 * <tr><td valign="top"><tt>\p{javaMirrored}</tt></td>
  • 0181 * <td>Equivalent to java.lang.Character.isMirrored()</td></tr>
  • 0182 *
  • 0183 * <tr><th> </th></tr>
  • 0184 * <tr align="left"><th colspan="2" id="unicode">Classes for Unicode blocks and categories</th></tr>
  • 0185 *
  • 0186 * <tr><td valign="top" headers="construct unicode"><tt>\p{InGreek}</tt></td>
  • 0187 * <td headers="matches">A character in the Greek block (simple <a href="#ubc">block</a>)</td></tr>
  • 0188 * <tr><td valign="top" headers="construct unicode"><tt>\p{Lu}</tt></td>
  • 0189 * <td headers="matches">An uppercase letter (simple <a href="#ubc">category</a>)</td></tr>
  • 0190 * <tr><td valign="top" headers="construct unicode"><tt>\p{Sc}</tt></td>
  • 0191 * <td headers="matches">A currency symbol</td></tr>
  • 0192 * <tr><td valign="top" headers="construct unicode"><tt>\P{InGreek}</tt></td>
  • 0193 * <td headers="matches">Any character except one in the Greek block (negation)</td></tr>
  • 0194 * <tr><td valign="top" headers="construct unicode"><tt>[\p{L}&&[^\p{Lu}]] </tt></td>
  • 0195 * <td headers="matches">Any letter except an uppercase letter (subtraction)</td></tr>
  • 0196 *
  • 0197 * <tr><th> </th></tr>
  • 0198 * <tr align="left"><th colspan="2" id="bounds">Boundary matchers</th></tr>
  • 0199 *
  • 0200 * <tr><td valign="top" headers="construct bounds"><tt>^</tt></td>
  • 0201 * <td headers="matches">The beginning of a line</td></tr>
  • 0202 * <tr><td valign="top" headers="construct bounds"><tt>$</tt></td>
  • 0203 * <td headers="matches">The end of a line</td></tr>
  • 0204 * <tr><td valign="top" headers="construct bounds"><tt>\b</tt></td>
  • 0205 * <td headers="matches">A word boundary</td></tr>
  • 0206 * <tr><td valign="top" headers="construct bounds"><tt>\B</tt></td>
  • 0207 * <td headers="matches">A non-word boundary</td></tr>
  • 0208 * <tr><td valign="top" headers="construct bounds"><tt>\A</tt></td>
  • 0209 * <td headers="matches">The beginning of the input</td></tr>
  • 0210 * <tr><td valign="top" headers="construct bounds"><tt>\G</tt></td>
  • 0211 * <td headers="matches">The end of the previous match</td></tr>
  • 0212 * <tr><td valign="top" headers="construct bounds"><tt>\Z</tt></td>
  • 0213 * <td headers="matches">The end of the input but for the final
  • 0214 * <a href="#lt">terminator</a>, if any</td></tr>
  • 0215 * <tr><td valign="top" headers="construct bounds"><tt>\z</tt></td>
  • 0216 * <td headers="matches">The end of the input</td></tr>
  • 0217 *
  • 0218 * <tr><th> </th></tr>
  • 0219 * <tr align="left"><th colspan="2" id="greedy">Greedy quantifiers</th></tr>
  • 0220 *
  • 0221 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>?</tt></td>
  • 0222 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
  • 0223 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>*</tt></td>
  • 0224 * <td headers="matches"><i>X</i>, zero or more times</td></tr>
  • 0225 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>+</tt></td>
  • 0226 * <td headers="matches"><i>X</i>, one or more times</td></tr>
  • 0227 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>}</tt></td>
  • 0228 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
  • 0229 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,}</tt></td>
  • 0230 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
  • 0231 * <tr><td valign="top" headers="construct greedy"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}</tt></td>
  • 0232 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
  • 0233 *
  • 0234 * <tr><th> </th></tr>
  • 0235 * <tr align="left"><th colspan="2" id="reluc">Reluctant quantifiers</th></tr>
  • 0236 *
  • 0237 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>??</tt></td>
  • 0238 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
  • 0239 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>*?</tt></td>
  • 0240 * <td headers="matches"><i>X</i>, zero or more times</td></tr>
  • 0241 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>+?</tt></td>
  • 0242 * <td headers="matches"><i>X</i>, one or more times</td></tr>
  • 0243 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>}?</tt></td>
  • 0244 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
  • 0245 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,}?</tt></td>
  • 0246 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
  • 0247 * <tr><td valign="top" headers="construct reluc"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}?</tt></td>
  • 0248 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
  • 0249 *
  • 0250 * <tr><th> </th></tr>
  • 0251 * <tr align="left"><th colspan="2" id="poss">Possessive quantifiers</th></tr>
  • 0252 *
  • 0253 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>?+</tt></td>
  • 0254 * <td headers="matches"><i>X</i>, once or not at all</td></tr>
  • 0255 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>*+</tt></td>
  • 0256 * <td headers="matches"><i>X</i>, zero or more times</td></tr>
  • 0257 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>++</tt></td>
  • 0258 * <td headers="matches"><i>X</i>, one or more times</td></tr>
  • 0259 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>}+</tt></td>
  • 0260 * <td headers="matches"><i>X</i>, exactly <i>n</i> times</td></tr>
  • 0261 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,}+</tt></td>
  • 0262 * <td headers="matches"><i>X</i>, at least <i>n</i> times</td></tr>
  • 0263 * <tr><td valign="top" headers="construct poss"><i>X</i><tt>{</tt><i>n</i><tt>,</tt><i>m</i><tt>}+</tt></td>
  • 0264 * <td headers="matches"><i>X</i>, at least <i>n</i> but not more than <i>m</i> times</td></tr>
  • 0265 *
  • 0266 * <tr><th> </th></tr>
  • 0267 * <tr align="left"><th colspan="2" id="logical">Logical operators</th></tr>
  • 0268 *
  • 0269 * <tr><td valign="top" headers="construct logical"><i>XY</i></td>
  • 0270 * <td headers="matches"><i>X</i> followed by <i>Y</i></td></tr>
  • 0271 * <tr><td valign="top" headers="construct logical"><i>X</i><tt>|</tt><i>Y</i></td>
  • 0272 * <td headers="matches">Either <i>X</i> or <i>Y</i></td></tr>
  • 0273 * <tr><td valign="top" headers="construct logical"><tt>(</tt><i>X</i><tt>)</tt></td>
  • 0274 * <td headers="matches">X, as a <a href="#cg">capturing group</a></td></tr>
  • 0275 *
  • 0276 * <tr><th> </th></tr>
  • 0277 * <tr align="left"><th colspan="2" id="backref">Back references</th></tr>
  • 0278 *
  • 0279 * <tr><td valign="bottom" headers="construct backref"><tt>\</tt><i>n</i></td>
  • 0280 * <td valign="bottom" headers="matches">Whatever the <i>n</i><sup>th</sup>
  • 0281 * <a href="#cg">capturing group</a> matched</td></tr>
  • 0282 *
  • 0283 * <tr><th> </th></tr>
  • 0284 * <tr align="left"><th colspan="2" id="quot">Quotation</th></tr>
  • 0285 *
  • 0286 * <tr><td valign="top" headers="construct quot"><tt>\</tt></td>
  • 0287 * <td headers="matches">Nothing, but quotes the following character</td></tr>
  • 0288 * <tr><td valign="top" headers="construct quot"><tt>\Q</tt></td>
  • 0289 * <td headers="matches">Nothing, but quotes all characters until <tt>\E</tt></td></tr>
  • 0290 * <tr><td valign="top" headers="construct quot"><tt>\E</tt></td>
  • 0291 * <td headers="matches">Nothing, but ends quoting started by <tt>\Q</tt></td></tr>
  • 0292 * <!-- Metachars: !$()*+.<>?[\]^{|} -->
  • 0293 *
  • 0294 * <tr><th> </th></tr>
  • 0295 * <tr align="left"><th colspan="2" id="special">Special constructs (non-capturing)</th></tr>
  • 0296 *
  • 0297 * <tr><td valign="top" headers="construct special"><tt>(?:</tt><i>X</i><tt>)</tt></td>
  • 0298 * <td headers="matches"><i>X</i>, as a non-capturing group</td></tr>
  • 0299 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux) </tt></td>
  • 0300 * <td headers="matches">Nothing, but turns match flags <a href="#CASE_INSENSITIVE">i</a>
  • 0301 * <a href="#UNIX_LINES">d</a> <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a>
  • 0302 * <a href="#UNICODE_CASE">u</a> <a href="#COMMENTS">x</a> on - off</td></tr>
  • 0303 * <tr><td valign="top" headers="construct special"><tt>(?idmsux-idmsux:</tt><i>X</i><tt>)</tt>  </td>
  • 0304 * <td headers="matches"><i>X</i>, as a <a href="#cg">non-capturing group</a> with the
  • 0305 * given flags <a href="#CASE_INSENSITIVE">i</a> <a href="#UNIX_LINES">d</a>
  • 0306 * <a href="#MULTILINE">m</a> <a href="#DOTALL">s</a> <a href="#UNICODE_CASE">u</a >
  • 0307 * <a href="#COMMENTS">x</a> on - off</td></tr>
  • 0308 * <tr><td valign="top" headers="construct special"><tt>(?=</tt><i>X</i><tt>)</tt></td>
  • 0309 * <td headers="matches"><i>X</i>, via zero-width positive lookahead</td></tr>
  • 0310 * <tr><td valign="top" headers="construct special"><tt>(?!</tt><i>X</i><tt>)</tt></td>
  • 0311 * <td headers="matches"><i>X</i>, via zero-width negative lookahead</td></tr>
  • 0312 * <tr><td valign="top" headers="construct special"><tt>(?<=</tt><i>X</i><tt>)</tt></td>
  • 0313 * <td headers="matches"><i>X</i>, via zero-width positive lookbehind</td></tr>
  • 0314 * <tr><td valign="top" headers="construct special"><tt>(?<!</tt><i>X</i><tt>)</tt></td>
  • 0315 * <td headers="matches"><i>X</i>, via zero-width negative lookbehind</td></tr>
  • 0316 * <tr><td valign="top" headers="construct special"><tt>(?></tt><i>X</i><tt>)</tt></td>
  • 0317 * <td headers="matches"><i>X</i>, as an independent, non-capturing group</td></tr>
  • 0318 *
  • 0319 * </table>
  • 0320 *
  • 0321 * <hr>
  • 0322 *
  • 0323 *
  • 0324 * <a name="bs">
  • 0325 * <h4> Backslashes, escapes, and quoting </h4>
  • 0326 *
  • 0327 * <p> The backslash character (<tt>'\'</tt>) serves to introduce escaped
  • 0328 * constructs, as defined in the table above, as well as to quote characters
  • 0329 * that otherwise would be interpreted as unescaped constructs. Thus the
  • 0330 * expression <tt>\\</tt> matches a single backslash and <tt>\{</tt> matches a
  • 0331 * left brace.
  • 0332 *
  • 0333 * <p> It is an error to use a backslash prior to any alphabetic character that
  • 0334 * does not denote an escaped construct; these are reserved for future
  • 0335 * extensions to the regular-expression language. A backslash may be used
  • 0336 * prior to a non-alphabetic character regardless of whether that character is
  • 0337 * part of an unescaped construct.
  • 0338 *
  • 0339 * <p> Backslashes within string literals in Java source code are interpreted
  • 0340 * as required by the <a
  • 0341 * href="http://java.sun.com/docs/books/jls">Java Language
  • 0342 * Specification</a> as either <a
  • 0343 * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">Unicode
  • 0344 * escapes</a> or other <a
  • 0345 * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#101089">character
  • 0346 * escapes</a>. It is therefore necessary to double backslashes in string
  • 0347 * literals that represent regular expressions to protect them from
  • 0348 * interpretation by the Java bytecode compiler. The string literal
  • 0349 * <tt>"\b"</tt>, for example, matches a single backspace character when
  • 0350 * interpreted as a regular expression, while <tt>"\\b"</tt> matches a
  • 0351 * word boundary. The string literal <tt>"\(hello\)"</tt> is illegal
  • 0352 * and leads to a compile-time error; in order to match the string
  • 0353 * <tt>(hello)</tt> the string literal <tt>"\\(hello\\)"</tt>
  • 0354 * must be used.
  • 0355 *
  • 0356 * <a name="cc">
  • 0357 * <h4> Character Classes </h4>
  • 0358 *
  • 0359 * <p> Character classes may appear within other character classes, and
  • 0360 * may be composed by the union operator (implicit) and the intersection
  • 0361 * operator (<tt>&&</tt>).
  • 0362 * The union operator denotes a class that contains every character that is
  • 0363 * in at least one of its operand classes. The intersection operator
  • 0364 * denotes a class that contains every character that is in both of its
  • 0365 * operand classes.
  • 0366 *
  • 0367 * <p> The precedence of character-class operators is as follows, from
  • 0368 * highest to lowest:
  • 0369 *
  • 0370 * <blockquote><table border="0" cellpadding="1" cellspacing="0"
  • 0371 * summary="Precedence of character class operators.">
  • 0372 * <tr><th>1    </th>
  • 0373 * <td>Literal escape    </td>
  • 0374 * <td><tt>\x</tt></td></tr>
  • 0375 * <tr><th>2    </th>
  • 0376 * <td>Grouping</td>
  • 0377 * <td><tt>[...]</tt></td></tr>
  • 0378 * <tr><th>3    </th>
  • 0379 * <td>Range</td>
  • 0380 * <td><tt>a-z</tt></td></tr>
  • 0381 * <tr><th>4    </th>
  • 0382 * <td>Union</td>
  • 0383 * <td><tt>[a-e][i-u]</tt></td></tr>
  • 0384 * <tr><th>5    </th>
  • 0385 * <td>Intersection</td>
  • 0386 * <td><tt>[a-z&&[aeiou]]</tt></td></tr>
  • 0387 * </table></blockquote>
  • 0388 *
  • 0389 * <p> Note that a different set of metacharacters are in effect inside
  • 0390 * a character class than outside a character class. For instance, the
  • 0391 * regular expression <tt>.</tt> loses its special meaning inside a
  • 0392 * character class, while the expression <tt>-</tt> becomes a range
  • 0393 * forming metacharacter.
  • 0394 *
  • 0395 * <a name="lt">
  • 0396 * <h4> Line terminators </h4>
  • 0397 *
  • 0398 * <p> A <i>line terminator</i> is a one- or two-character sequence that marks
  • 0399 * the end of a line of the input character sequence. The following are
  • 0400 * recognized as line terminators:
  • 0401 *
  • 0402 * <ul>
  • 0403 *
  • 0404 * <li> A newline (line feed) character (<tt>'\n'</tt>),
  • 0405 *
  • 0406 * <li> A carriage-return character followed immediately by a newline
  • 0407 * character (<tt>"\r\n"</tt>),
  • 0408 *
  • 0409 * <li> A standalone carriage-return character (<tt>'\r'</tt>),
  • 0410 *
  • 0411 * <li> A next-line character (<tt>'\u0085'</tt>),
  • 0412 *
  • 0413 * <li> A line-separator character (<tt>'\u2028'</tt>), or
  • 0414 *
  • 0415 * <li> A paragraph-separator character (<tt>'\u2029</tt>).
  • 0416 *
  • 0417 * </ul>
  • 0418 * <p>If {@link #UNIX_LINES} mode is activated, then the only line terminators
  • 0419 * recognized are newline characters.
  • 0420 *
  • 0421 * <p> The regular expression <tt>.</tt> matches any character except a line
  • 0422 * terminator unless the {@link #DOTALL} flag is specified.
  • 0423 *
  • 0424 * <p> By default, the regular expressions <tt>^</tt> and <tt>$</tt> ignore
  • 0425 * line terminators and only match at the beginning and the end, respectively,
  • 0426 * of the entire input sequence. If {@link #MULTILINE} mode is activated then
  • 0427 * <tt>^</tt> matches at the beginning of input and after any line terminator
  • 0428 * except at the end of input. When in {@link #MULTILINE} mode <tt>$</tt>
  • 0429 * matches just before a line terminator or the end of the input sequence.
  • 0430 *
  • 0431 * <a name="cg">
  • 0432 * <h4> Groups and capturing </h4>
  • 0433 *
  • 0434 * <p> Capturing groups are numbered by counting their opening parentheses from
  • 0435 * left to right. In the expression <tt>((A)(B(C)))</tt>, for example, there
  • 0436 * are four such groups: </p>
  • 0437 *
  • 0438 * <blockquote><table cellpadding=1 cellspacing=0 summary="Capturing group numberings">
  • 0439 * <tr><th>1    </th>
  • 0440 * <td><tt>((A)(B(C)))</tt></td></tr>
  • 0441 * <tr><th>2    </th>
  • 0442 * <td><tt>(A)</tt></td></tr>
  • 0443 * <tr><th>3    </th>
  • 0444 * <td><tt>(B(C))</tt></td></tr>
  • 0445 * <tr><th>4    </th>
  • 0446 * <td><tt>(C)</tt></td></tr>
  • 0447 * </table></blockquote>
  • 0448 *
  • 0449 * <p> Group zero always stands for the entire expression.
  • 0450 *
  • 0451 * <p> Capturing groups are so named because, during a match, each subsequence
  • 0452 * of the input sequence that matches such a group is saved. The captured
  • 0453 * subsequence may be used later in the expression, via a back reference, and
  • 0454 * may also be retrieved from the matcher once the match operation is complete.
  • 0455 *
  • 0456 * <p> The captured input associated with a group is always the subsequence
  • 0457 * that the group most recently matched. If a group is evaluated a second time
  • 0458 * because of quantification then its previously-captured value, if any, will
  • 0459 * be retained if the second evaluation fails. Matching the string
  • 0460 * <tt>"aba"</tt> against the expression <tt>(a(b)?)+</tt>, for example, leaves
  • 0461 * group two set to <tt>"b"</tt>. All captured input is discarded at the
  • 0462 * beginning of each match.
  • 0463 *
  • 0464 * <p> Groups beginning with <tt>(?</tt> are pure, <i>non-capturing</i> groups
  • 0465 * that do not capture text and do not count towards the group total.
  • 0466 *
  • 0467 *
  • 0468 * <h4> Unicode support </h4>
  • 0469 *
  • 0470 * <p> This class is in conformance with Level 1 of <a
  • 0471 * href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
  • 0472 * Standard #18: Unicode Regular Expression Guidelines</i></a>, plus RL2.1
  • 0473 * Canonical Equivalents.
  • 0474 *
  • 0475 * <p> Unicode escape sequences such as <tt>\u2014</tt> in Java source code
  • 0476 * are processed as described in <a
  • 0477 * href="http://java.sun.com/docs/books/jls/third_edition/html/lexical.html#100850">\u00A73.3</a>
  • 0478 * of the Java Language Specification. Such escape sequences are also
  • 0479 * implemented directly by the regular-expression parser so that Unicode
  • 0480 * escapes can be used in expressions that are read from files or from the
  • 0481 * keyboard. Thus the strings <tt>"\u2014"</tt> and <tt>"\\u2014"</tt>,
  • 0482 * while not equal, compile into the same pattern, which matches the character
  • 0483 * with hexadecimal value <tt>0x2014</tt>.
  • 0484 *
  • 0485 * <a name="ubc"> <p>Unicode blocks and categories are written with the
  • 0486 * <tt>\p</tt> and <tt>\P</tt> constructs as in
  • 0487 * Perl. <tt>\p{</tt><i>prop</i><tt>}</tt> matches if the input has the
  • 0488 * property <i>prop</i>, while <tt>\P{</tt><i>prop</i><tt>}</tt> does not match if
  • 0489 * the input has that property. Blocks are specified with the prefix
  • 0490 * <tt>In</tt>, as in <tt>InMongolian</tt>. Categories may be specified with
  • 0491 * the optional prefix <tt>Is</tt>: Both <tt>\p{L}</tt> and <tt>\p{IsL}</tt>
  • 0492 * denote the category of Unicode letters. Blocks and categories can be used
  • 0493 * both inside and outside of a character class.
  • 0494 *
  • 0495 * <p> The supported categories are those of
  • 0496 * <a href="http://www.unicode.org/unicode/standard/standard.html">
  • 0497 * <i>The Unicode Standard</i></a> in the version specified by the
  • 0498 * {@link java.lang.Character Character} class. The category names are those
  • 0499 * defined in the Standard, both normative and informative.
  • 0500 * The block names supported by <code>Pattern</code> are the valid block names
  • 0501 * accepted and defined by
  • 0502 * {@link java.lang.Character.UnicodeBlock#forName(String) UnicodeBlock.forName}.
  • 0503 *
  • 0504 * <a name="jcc"> <p>Categories that behave like the java.lang.Character
  • 0505 * boolean is<i>methodname</i> methods (except for the deprecated ones) are
  • 0506 * available through the same <tt>\p{</tt><i>prop</i><tt>}</tt> syntax where
  • 0507 * the specified property has the name <tt>java<i>methodname</i></tt>.
  • 0508 *
  • 0509 * <h4> Comparison to Perl 5 </h4>
  • 0510 *
  • 0511 * <p>The <code>Pattern</code> engine performs traditional NFA-based matching
  • 0512 * with ordered alternation as occurs in Perl 5.
  • 0513 *
  • 0514 * <p> Perl constructs not supported by this class: </p>
  • 0515 *
  • 0516 * <ul>
  • 0517 *
  • 0518 * <li><p> The conditional constructs <tt>(?{</tt><i>X</i><tt>})</tt> and
  • 0519 * <tt>(?(</tt><i>condition</i><tt>)</tt><i>X</i><tt>|</tt><i>Y</i><tt>)</tt>,
  • 0520 * </p></li>
  • 0521 *
  • 0522 * <li><p> The embedded code constructs <tt>(?{</tt><i>code</i><tt>})</tt>
  • 0523 * and <tt>(??{</tt><i>code</i><tt>})</tt>,</p></li>
  • 0524 *
  • 0525 * <li><p> The embedded comment syntax <tt>(?#comment)</tt>, and </p></li>
  • 0526 *
  • 0527 * <li><p> The preprocessing operations <tt>\l</tt> <tt>\u</tt>,
  • 0528 * <tt>\L</tt>, and <tt>\U</tt>. </p></li>
  • 0529 *
  • 0530 * </ul>
  • 0531 *
  • 0532 * <p> Constructs supported by this class but not by Perl: </p>
  • 0533 *
  • 0534 * <ul>
  • 0535 *
  • 0536 * <li><p> Possessive quantifiers, which greedily match as much as they can
  • 0537 * and do not back off, even when doing so would allow the overall match to
  • 0538 * succeed. </p></li>
  • 0539 *
  • 0540 * <li><p> Character-class union and intersection as described
  • 0541 * <a href="#cc">above</a>.</p></li>
  • 0542 *
  • 0543 * </ul>
  • 0544 *
  • 0545 * <p> Notable differences from Perl: </p>
  • 0546 *
  • 0547 * <ul>
  • 0548 *
  • 0549 * <li><p> In Perl, <tt>\1</tt> through <tt>\9</tt> are always interpreted
  • 0550 * as back references; a backslash-escaped number greater than <tt>9</tt> is
  • 0551 * treated as a back reference if at least that many subexpressions exist,
  • 0552 * otherwise it is interpreted, if possible, as an octal escape. In this
  • 0553 * class octal escapes must always begin with a zero. In this class,
  • 0554 * <tt>\1</tt> through <tt>\9</tt> are always interpreted as back
  • 0555 * references, and a larger number is accepted as a back reference if at
  • 0556 * least that many subexpressions exist at that point in the regular
  • 0557 * expression, otherwise the parser will drop digits until the number is
  • 0558 * smaller or equal to the existing number of groups or it is one digit.
  • 0559 * </p></li>
  • 0560 *
  • 0561 * <li><p> Perl uses the <tt>g</tt> flag to request a match that resumes
  • 0562 * where the last match left off. This functionality is provided implicitly
  • 0563 * by the {@link Matcher} class: Repeated invocations of the {@link
  • 0564 * Matcher#find find} method will resume where the last match left off,
  • 0565 * unless the matcher is reset. </p></li>
  • 0566 *
  • 0567 * <li><p> In Perl, embedded flags at the top level of an expression affect
  • 0568 * the whole expression. In this class, embedded flags always take effect
  • 0569 * at the point at which they appear, whether they are at the top level or
  • 0570 * within a group; in the latter case, flags are restored at the end of the
  • 0571 * group just as in Perl. </p></li>
  • 0572 *
  • 0573 * <li><p> Perl is forgiving about malformed matching constructs, as in the
  • 0574 * expression <tt>*a</tt>, as well as dangling brackets, as in the
  • 0575 * expression <tt>abc]</tt>, and treats them as literals. This
  • 0576 * class also accepts dangling brackets but is strict about dangling
  • 0577 * metacharacters like +, ? and *, and will throw a
  • 0578 * {@link PatternSyntaxException} if it encounters them. </p></li>
  • 0579 *
  • 0580 * </ul>
  • 0581 *
  • 0582 *
  • 0583 * <p> For a more precise description of the behavior of regular expression
  • 0584 * constructs, please see <a href="http://www.oreilly.com/catalog/regex3/">
  • 0585 * <i>Mastering Regular Expressions, 3nd Edition</i>, Jeffrey E. F. Friedl,
  • 0586 * O'Reilly and Associates, 2006.</a>
  • 0587 * </p>
  • 0588 *
  • 0589 * @see java.lang.String#split(String, int)
  • 0590 * @see java.lang.String#split(String)
  • 0591 *
  • 0592 * @author Mike McCloskey
  • 0593 * @author Mark Reinhold
  • 0594 * @author JSR-51 Expert Group
  • 0595 * @version 1.124, 07/03/15
  • 0596 * @since 1.4
  • 0597 * @spec JSR-51
  • 0598 */
  • 0599
  • 0600public final class Pattern
  • 0601 implements java.io.Serializable
  • 0602{
  • 0603
  • 0604 /**
  • 0605 * Regular expression modifier values. Instead of being passed as
  • 0606 * arguments, they can also be passed as inline modifiers.
  • 0607 * For example, the following statements have the same effect.
  • 0608 * <pre>
  • 0609 * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
  • 0610 * RegExp r2 = RegExp.compile("(?im)abc", 0);
  • 0611 * </pre>
  • 0612 *
  • 0613 * The flags are duplicated so that the familiar Perl match flag
  • 0614 * names are available.
  • 0615 */
  • 0616
  • 0617 /**
  • 0618 * Enables Unix lines mode.
  • 0619 *
  • 0620 * <p> In this mode, only the <tt>'\n'</tt> line terminator is recognized
  • 0621 * in the behavior of <tt>.</tt>, <tt>^</tt>, and <tt>$</tt>.
  • 0622 *
  • 0623 * <p> Unix lines mode can also be enabled via the embedded flag
  • 0624 * expression <tt>(?d)</tt>.
  • 0625 */
  • 0626 public static final int UNIX_LINES = 0x01;
  • 0627
  • 0628 /**
  • 0629 * Enables case-insensitive matching.
  • 0630 *
  • 0631 * <p> By default, case-insensitive matching assumes that only characters
  • 0632 * in the US-ASCII charset are being matched. Unicode-aware
  • 0633 * case-insensitive matching can be enabled by specifying the {@link
  • 0634 * #UNICODE_CASE} flag in conjunction with this flag.
  • 0635 *
  • 0636 * <p> Case-insensitive matching can also be enabled via the embedded flag
  • 0637 * expression <tt>(?i)</tt>.
  • 0638 *
  • 0639 * <p> Specifying this flag may impose a slight performance penalty. </p>
  • 0640 */
  • 0641 public static final int CASE_INSENSITIVE = 0x02;
  • 0642
  • 0643 /**
  • 0644 * Permits whitespace and comments in pattern.
  • 0645 *
  • 0646 * <p> In this mode, whitespace is ignored, and embedded comments starting
  • 0647 * with <tt>#</tt> are ignored until the end of a line.
  • 0648 *
  • 0649 * <p> Comments mode can also be enabled via the embedded flag
  • 0650 * expression <tt>(?x)</tt>.
  • 0651 */
  • 0652 public static final int COMMENTS = 0x04;
  • 0653
  • 0654 /**
  • 0655 * Enables multiline mode.
  • 0656 *
  • 0657 * <p> In multiline mode the expressions <tt>^</tt> and <tt>$</tt> match
  • 0658 * just after or just before, respectively, a line terminator or the end of
  • 0659 * the input sequence. By default these expressions only match at the
  • 0660 * beginning and the end of the entire input sequence.
  • 0661 *
  • 0662 * <p> Multiline mode can also be enabled via the embedded flag
  • 0663 * expression <tt>(?m)</tt>. </p>
  • 0664 */
  • 0665 public static final int MULTILINE = 0x08;
  • 0666
  • 0667 /**
  • 0668 * Enables literal parsing of the pattern.
  • 0669 *
  • 0670 * <p> When this flag is specified then the input string that specifies
  • 0671 * the pattern is treated as a sequence of literal characters.
  • 0672 * Metacharacters or escape sequences in the input sequence will be
  • 0673 * given no special meaning.
  • 0674 *
  • 0675 * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
  • 0676 * matching when used in conjunction with this flag. The other flags
  • 0677 * become superfluous.
  • 0678 *
  • 0679 * <p> There is no embedded flag character for enabling literal parsing.
  • 0680 * @since 1.5
  • 0681 */
  • 0682 public static final int LITERAL = 0x10;
  • 0683
  • 0684 /**
  • 0685 * Enables dotall mode.
  • 0686 *
  • 0687 * <p> In dotall mode, the expression <tt>.</tt> matches any character,
  • 0688 * including a line terminator. By default this expression does not match
  • 0689 * line terminators.
  • 0690 *
  • 0691 * <p> Dotall mode can also be enabled via the embedded flag
  • 0692 * expression <tt>(?s)</tt>. (The <tt>s</tt> is a mnemonic for
  • 0693 * "single-line" mode, which is what this is called in Perl.) </p>
  • 0694 */
  • 0695 public static final int DOTALL = 0x20;
  • 0696
  • 0697 /**
  • 0698 * Enables Unicode-aware case folding.
  • 0699 *
  • 0700 * <p> When this flag is specified then case-insensitive matching, when
  • 0701 * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
  • 0702 * consistent with the Unicode Standard. By default, case-insensitive
  • 0703 * matching assumes that only characters in the US-ASCII charset are being
  • 0704 * matched.
  • 0705 *
  • 0706 * <p> Unicode-aware case folding can also be enabled via the embedded flag
  • 0707 * expression <tt>(?u)</tt>.
  • 0708 *
  • 0709 * <p> Specifying this flag may impose a performance penalty. </p>
  • 0710 */
  • 0711 public static final int UNICODE_CASE = 0x40;
  • 0712
  • 0713 /**
  • 0714 * Enables canonical equivalence.
  • 0715 *
  • 0716 * <p> When this flag is specified then two characters will be considered
  • 0717 * to match if, and only if, their full canonical decompositions match.
  • 0718 * The expression <tt>"a\u030A"</tt>, for example, will match the
  • 0719 * string <tt>"\u00E5"</tt> when this flag is specified. By default,
  • 0720 * matching does not take canonical equivalence into account.
  • 0721 *
  • 0722 * <p> There is no embedded flag character for enabling canonical
  • 0723 * equivalence.
  • 0724 *
  • 0725 * <p> Specifying this flag may impose a performance penalty. </p>
  • 0726 */
  • 0727 public static final int CANON_EQ = 0x80;
  • 0728
  • 0729 /* Pattern has only two serialized components: The pattern string
  • 0730 * and the flags, which are all that is needed to recompile the pattern
  • 0731 * when it is deserialized.
  • 0732 */
  • 0733
  • 0734 /** use serialVersionUID from Merlin b59 for interoperability */
  • 0735 private static final long serialVersionUID = 5073258162644648461L;
  • 0736
  • 0737 /**
  • 0738 * The original regular-expression pattern string.
  • 0739 *
  • 0740 * @serial
  • 0741 */
  • 0742 private String pattern;
  • 0743
  • 0744 /**
  • 0745 * The original pattern flags.
  • 0746 *
  • 0747 * @serial
  • 0748 */
  • 0749 private int flags;
  • 0750
  • 0751 /**
  • 0752 * Boolean indicating this Pattern is compiled; this is necessary in order
  • 0753 * to lazily compile deserialized Patterns.
  • 0754 */
  • 0755 private transient volatile boolean compiled = false;
  • 0756
  • 0757 /**
  • 0758 * The normalized pattern string.
  • 0759 */
  • 0760 private transient String normalizedPattern;
  • 0761
  • 0762 /**
  • 0763 * The starting point of state machine for the find operation. This allows
  • 0764 * a match to start anywhere in the input.
  • 0765 */
  • 0766 transient Node root;
  • 0767
  • 0768 /**
  • 0769 * The root of object tree for a match operation. The pattern is matched
  • 0770 * at the beginning. This may include a find that uses BnM or a First
  • 0771 * node.
  • 0772 */
  • 0773 transient Node matchRoot;
  • 0774
  • 0775 /**
  • 0776 * Temporary storage used by parsing pattern slice.
  • 0777 */
  • 0778 transient int[] buffer;
  • 0779
  • 0780 /**
  • 0781 * Temporary storage used while parsing group references.
  • 0782 */
  • 0783 transient GroupHead[] groupNodes;
  • 0784
  • 0785 /**
  • 0786 * Temporary null terminated code point array used by pattern compiling.
  • 0787 */
  • 0788 private transient int[] temp;
  • 0789
  • 0790 /**
  • 0791 * The number of capturing groups in this Pattern. Used by matchers to
  • 0792 * allocate storage needed to perform a match.
  • 0793 */
  • 0794 transient int capturingGroupCount;
  • 0795
  • 0796 /**
  • 0797 * The local variable count used by parsing tree. Used by matchers to
  • 0798 * allocate storage needed to perform a match.
  • 0799 */
  • 0800 transient int localCount;
  • 0801
  • 0802 /**
  • 0803 * Index into the pattern string that keeps track of how much has been
  • 0804 * parsed.
  • 0805 */
  • 0806 private transient int cursor;
  • 0807
  • 0808 /**
  • 0809 * Holds the length of the pattern string.
  • 0810 */
  • 0811 private transient int patternLength;
  • 0812
  • 0813 /**
  • 0814 * Compiles the given regular expression into a pattern. </p>
  • 0815 *
  • 0816 * @param regex
  • 0817 * The expression to be compiled
  • 0818 *
  • 0819 * @throws PatternSyntaxException
  • 0820 * If the expression's syntax is invalid
  • 0821 */
  • 0822 public static Pattern compile(String regex) {
  • 0823 return new Pattern(regex, 0);
  • 0824 }
  • 0825
  • 0826 /**
  • 0827 * Compiles the given regular expression into a pattern with the given
  • 0828 * flags. </p>
  • 0829 *
  • 0830 * @param regex
  • 0831 * The expression to be compiled
  • 0832 *
  • 0833 * @param flags
  • 0834 * Match flags, a bit mask that may include
  • 0835 * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
  • 0836 * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
  • 0837 * {@link #LITERAL} and {@link #COMMENTS}
  • 0838 *
  • 0839 * @throws IllegalArgumentException
  • 0840 * If bit values other than those corresponding to the defined
  • 0841 * match flags are set in <tt>flags</tt>
  • 0842 *
  • 0843 * @throws PatternSyntaxException
  • 0844 * If the expression's syntax is invalid
  • 0845 */
  • 0846 public static Pattern compile(String regex, int flags) {
  • 0847 return new Pattern(regex, flags);
  • 0848 }
  • 0849
  • 0850 /**
  • 0851 * Returns the regular expression from which this pattern was compiled.
  • 0852 * </p>
  • 0853 *
  • 0854 * @return The source of this pattern
  • 0855 */
  • 0856 public String pattern() {
  • 0857 return pattern;
  • 0858 }
  • 0859
  • 0860 /**
  • 0861 * <p>Returns the string representation of this pattern. This
  • 0862 * is the regular expression from which this pattern was
  • 0863 * compiled.</p>
  • 0864 *
  • 0865 * @return The string representation of this pattern
  • 0866 * @since 1.5
  • 0867 */
  • 0868 public String toString() {
  • 0869 return pattern;
  • 0870 }
  • 0871
  • 0872 /**
  • 0873 * Creates a matcher that will match the given input against this pattern.
  • 0874 * </p>
  • 0875 *
  • 0876 * @param input
  • 0877 * The character sequence to be matched
  • 0878 *
  • 0879 * @return A new matcher for this pattern
  • 0880 */
  • 0881 public Matcher matcher(CharSequence input) {
  • 0882 if (!compiled) {
  • 0883 synchronized(this) {
  • 0884 if (!compiled)
  • 0885 compile();
  • 0886 }
  • 0887 }
  • 0888 Matcher m = new Matcher(this, input);
  • 0889 return m;
  • 0890 }
  • 0891
  • 0892 /**
  • 0893 * Returns this pattern's match flags. </p>
  • 0894 *
  • 0895 * @return The match flags specified when this pattern was compiled
  • 0896 */
  • 0897 public int flags() {
  • 0898 return flags;
  • 0899 }
  • 0900
  • 0901 /**
  • 0902 * Compiles the given regular expression and attempts to match the given
  • 0903 * input against it.
  • 0904 *
  • 0905 * <p> An invocation of this convenience method of the form
  • 0906 *
  • 0907 * <blockquote><pre>
  • 0908 * Pattern.matches(regex, input);</pre></blockquote>
  • 0909 *
  • 0910 * behaves in exactly the same way as the expression
  • 0911 *
  • 0912 * <blockquote><pre>
  • 0913 * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>
  • 0914 *
  • 0915 * <p> If a pattern is to be used multiple times, compiling it once and reusing
  • 0916 * it will be more efficient than invoking this method each time. </p>
  • 0917 *
  • 0918 * @param regex
  • 0919 * The expression to be compiled
  • 0920 *
  • 0921 * @param input
  • 0922 * The character sequence to be matched
  • 0923 *
  • 0924 * @throws PatternSyntaxException
  • 0925 * If the expression's syntax is invalid
  • 0926 */
  • 0927 public static boolean matches(String regex, CharSequence input) {
  • 0928 Pattern p = Pattern.compile(regex);
  • 0929 Matcher m = p.matcher(input);
  • 0930 return m.matches();
  • 0931 }
  • 0932
  • 0933 /**
  • 0934 * Splits the given input sequence around matches of this pattern.
  • 0935 *
  • 0936 * <p> The array returned by this method contains each substring of the
  • 0937 * input sequence that is terminated by another subsequence that matches
  • 0938 * this pattern or is terminated by the end of the input sequence. The
  • 0939 * substrings in the array are in the order in which they occur in the
  • 0940 * input. If this pattern does not match any subsequence of the input then
  • 0941 * the resulting array has just one element, namely the input sequence in
  • 0942 * string form.
  • 0943 *
  • 0944 * <p> The <tt>limit</tt> parameter controls the number of times the
  • 0945 * pattern is applied and therefore affects the length of the resulting
  • 0946 * array. If the limit <i>n</i> is greater than zero then the pattern
  • 0947 * will be applied at most <i>n</i> - 1 times, the array's
  • 0948 * length will be no greater than <i>n</i>, and the array's last entry
  • 0949 * will contain all input beyond the last matched delimiter. If <i>n</i>
  • 0950 * is non-positive then the pattern will be applied as many times as
  • 0951 * possible and the array can have any length. If <i>n</i> is zero then
  • 0952 * the pattern will be applied as many times as possible, the array can
  • 0953 * have any length, and trailing empty strings will be discarded.
  • 0954 *
  • 0955 * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
  • 0956 * results with these parameters:
  • 0957 *
  • 0958 * <blockquote><table cellpadding=1 cellspacing=0
  • 0959 * summary="Split examples showing regex, limit, and result">
  • 0960 * <tr><th><P align="left"><i>Regex    </i></th>
  • 0961 * <th><P align="left"><i>Limit    </i></th>
  • 0962 * <th><P align="left"><i>Result    </i></th></tr>
  • 0963 * <tr><td align=center>:</td>
  • 0964 * <td align=center>2</td>
  • 0965 * <td><tt>{ "boo", "and:foo" }</tt></td></tr>
  • 0966 * <tr><td align=center>:</td>
  • 0967 * <td align=center>5</td>
  • 0968 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  • 0969 * <tr><td align=center>:</td>
  • 0970 * <td align=center>-2</td>
  • 0971 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  • 0972 * <tr><td align=center>o</td>
  • 0973 * <td align=center>5</td>
  • 0974 * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
  • 0975 * <tr><td align=center>o</td>
  • 0976 * <td align=center>-2</td>
  • 0977 * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
  • 0978 * <tr><td align=center>o</td>
  • 0979 * <td align=center>0</td>
  • 0980 * <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
  • 0981 * </table></blockquote>
  • 0982 *
  • 0983 *
  • 0984 * @param input
  • 0985 * The character sequence to be split
  • 0986 *
  • 0987 * @param limit
  • 0988 * The result threshold, as described above
  • 0989 *
  • 0990 * @return The array of strings computed by splitting the input
  • 0991 * around matches of this pattern
  • 0992 */
  • 0993 public String[] split(CharSequence input, int limit) {
  • 0994 int index = 0;
  • 0995 boolean matchLimited = limit > 0;
  • 0996 ArrayList<String> matchList = new ArrayList<String>();
  • 0997 Matcher m = matcher(input);
  • 0998
  • 0999 // Add segments before each match found
  • 1000 while(m.find()) {
  • 1001 if (!matchLimited || matchList.size() < limit - 1) {
  • 1002 String match = input.subSequence(index, m.start()).toString();
  • 1003 matchList.add(match);
  • 1004 index = m.end();
  • 1005 } else if (matchList.size() == limit - 1) { // last one
  • 1006 String match = input.subSequence(index,
  • 1007 input.length()).toString();
  • 1008 matchList.add(match);
  • 1009 index = m.end();
  • 1010 }
  • 1011 }
  • 1012
  • 1013 // If no match was found, return this
  • 1014 if (index == 0)
  • 1015 return new String[] {input.toString()};
  • 1016
  • 1017 // Add remaining segment
  • 1018 if (!matchLimited || matchList.size() < limit)
  • 1019 matchList.add(input.subSequence(index, input.length()).toString());
  • 1020
  • 1021 // Construct result
  • 1022 int resultSize = matchList.size();
  • 1023 if (limit == 0)
  • 1024 while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
  • 1025 resultSize--;
  • 1026 String[] result = new String[resultSize];
  • 1027 return matchList.subList(0, resultSize).toArray(result);
  • 1028 }
  • 1029
  • 1030 /**
  • 1031 * Splits the given input sequence around matches of this pattern.
  • 1032 *
  • 1033 * <p> This method works as if by invoking the two-argument {@link
  • 1034 * #split(java.lang.CharSequence, int) split} method with the given input
  • 1035 * sequence and a limit argument of zero. Trailing empty strings are
  • 1036 * therefore not included in the resulting array. </p>
  • 1037 *
  • 1038 * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
  • 1039 * results with these expressions:
  • 1040 *
  • 1041 * <blockquote><table cellpadding=1 cellspacing=0
  • 1042 * summary="Split examples showing regex and result">
  • 1043 * <tr><th><P align="left"><i>Regex    </i></th>
  • 1044 * <th><P align="left"><i>Result</i></th></tr>
  • 1045 * <tr><td align=center>:</td>
  • 1046 * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
  • 1047 * <tr><td align=center>o</td>
  • 1048 * <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
  • 1049 * </table></blockquote>
  • 1050 *
  • 1051 *
  • 1052 * @param input
  • 1053 * The character sequence to be split
  • 1054 *
  • 1055 * @return The array of strings computed by splitting the input
  • 1056 * around matches of this pattern
  • 1057 */
  • 1058 public String[] split(CharSequence input) {
  • 1059 return split(input, 0);
  • 1060 }
  • 1061
  • 1062 /**
  • 1063 * Returns a literal pattern <code>String</code> for the specified
  • 1064 * <code>String</code>.
  • 1065 *
  • 1066 * <p>This method produces a <code>String</code> that can be used to
  • 1067 * create a <code>Pattern</code> that would match the string
  • 1068 * <code>s</code> as if it were a literal pattern.</p> Metacharacters
  • 1069 * or escape sequences in the input sequence will be given no special
  • 1070 * meaning.
  • 1071 *
  • 1072 * @param s The string to be literalized
  • 1073 * @return A literal string replacement
  • 1074 * @since 1.5
  • 1075 */
  • 1076 public static String quote(String s) {
  • 1077 int slashEIndex = s.indexOf("\\E");
  • 1078 if (slashEIndex == -1)
  • 1079 return "\\Q" + s + "\\E";
  • 1080
  • 1081 StringBuilder sb = new StringBuilder(s.length() * 2);
  • 1082 sb.append("\\Q");
  • 1083 slashEIndex = 0;
  • 1084 int current = 0;
  • 1085 while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
  • 1086 sb.append(s.substring(current, slashEIndex));
  • 1087 current = slashEIndex + 2;
  • 1088 sb.append("\\E\\\\E\\Q");
  • 1089 }
  • 1090 sb.append(s.substring(current, s.length()));
  • 1091 sb.append("\\E");
  • 1092 return sb.toString();
  • 1093 }
  • 1094
  • 1095 /**
  • 1096 * Recompile the Pattern instance from a stream. The original pattern
  • 1097 * string is read in and the object tree is recompiled from it.
  • 1098 */
  • 1099 private void readObject(java.io.ObjectInputStream s)
  • 1100 throws java.io.IOException, ClassNotFoundException {
  • 1101
  • 1102 // Read in all fields
  • 1103 s.defaultReadObject();
  • 1104
  • 1105 // Initialize counts
  • 1106 capturingGroupCount = 1;
  • 1107 localCount = 0;
  • 1108
  • 1109 // if length > 0, the Pattern is lazily compiled
  • 1110 compiled = false;
  • 1111 if (pattern.length() == 0) {
  • 1112 root = new Start(lastAccept);
  • 1113 matchRoot = lastAccept;
  • 1114 compiled = true;
  • 1115 }
  • 1116 }
  • 1117
  • 1118 /**
  • 1119 * This private constructor is used to create all Patterns. The pattern
  • 1120 * string and match flags are all that is needed to completely describe
  • 1121 * a Pattern. An empty pattern string results in an object tree with
  • 1122 * only a Start node and a LastNode node.
  • 1123 */
  • 1124 private Pattern(String p, int f) {
  • 1125 pattern = p;
  • 1126 flags = f;
  • 1127
  • 1128 // Reset group index count
  • 1129 capturingGroupCount = 1;
  • 1130 localCount = 0;
  • 1131
  • 1132 if (pattern.length() > 0) {
  • 1133 compile();
  • 1134 } else {
  • 1135 root = new Start(lastAccept);
  • 1136 matchRoot = lastAccept;
  • 1137 }
  • 1138 }
  • 1139
  • 1140 /**
  • 1141 * The pattern is converted to normalizedD form and then a pure group
  • 1142 * is constructed to match canonical equivalences of the characters.
  • 1143 */
  • 1144 private void normalize() {
  • 1145 boolean inCharClass = false;
  • 1146 int lastCodePoint = -1;
  • 1147
  • 1148 // Convert pattern into normalizedD form
  • 1149 normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
  • 1150 patternLength = normalizedPattern.length();
  • 1151
  • 1152 // Modify pattern to match canonical equivalences
  • 1153 StringBuilder newPattern = new StringBuilder(patternLength);
  • 1154 for(int i=0; i<patternLength; ) {
  • 1155 int c = normalizedPattern.codePointAt(i);
  • 1156 StringBuilder sequenceBuffer;
  • 1157 if ((Character.getType(c) == Character.NON_SPACING_MARK)
  • 1158 && (lastCodePoint != -1)) {
  • 1159 sequenceBuffer = new StringBuilder();
  • 1160 sequenceBuffer.appendCodePoint(lastCodePoint);
  • 1161 sequenceBuffer.appendCodePoint(c);
  • 1162 while(Character.getType(c) == Character.NON_SPACING_MARK) {
  • 1163 i += Character.charCount(c);
  • 1164 if (i >= patternLength)
  • 1165 break;
  • 1166 c = normalizedPattern.codePointAt(i);
  • 1167 sequenceBuffer.appendCodePoint(c);
  • 1168 }
  • 1169 String ea = produceEquivalentAlternation(
  • 1170 sequenceBuffer.toString());
  • 1171 newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
  • 1172 newPattern.append("(?:").append(ea).append(")");
  • 1173 } else if (c == '[' && lastCodePoint != '\\') {
  • 1174 i = normalizeCharClass(newPattern, i);
  • 1175 } else {
  • 1176 newPattern.appendCodePoint(c);
  • 1177 }
  • 1178 lastCodePoint = c;
  • 1179 i += Character.charCount(c);
  • 1180 }
  • 1181 normalizedPattern = newPattern.toString();
  • 1182 }
  • 1183
  • 1184 /**
  • 1185 * Complete the character class being parsed and add a set
  • 1186 * of alternations to it that will match the canonical equivalences
  • 1187 * of the characters within the class.
  • 1188 */
  • 1189 private int normalizeCharClass(StringBuilder newPattern, int i) {
  • 1190 StringBuilder charClass = new StringBuilder();
  • 1191 StringBuilder eq = null;
  • 1192 int lastCodePoint = -1;
  • 1193 String result;
  • 1194
  • 1195 i++;
  • 1196 charClass.append("[");
  • 1197 while(true) {
  • 1198 int c = normalizedPattern.codePointAt(i);
  • 1199 StringBuilder sequenceBuffer;
  • 1200
  • 1201 if (c == ']' && lastCodePoint != '\\') {
  • 1202 charClass.append((char)c);
  • 1203 break;
  • 1204 } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
  • 1205 sequenceBuffer = new StringBuilder();
  • 1206 sequenceBuffer.appendCodePoint(lastCodePoint);
  • 1207 while(Character.getType(c) == Character.NON_SPACING_MARK) {
  • 1208 sequenceBuffer.appendCodePoint(c);
  • 1209 i += Character.charCount(c);
  • 1210 if (i >= normalizedPattern.length())
  • 1211 break;
  • 1212 c = normalizedPattern.codePointAt(i);
  • 1213 }
  • 1214 String ea = produceEquivalentAlternation(
  • 1215 sequenceBuffer.toString());
  • 1216
  • 1217 charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
  • 1218 if (eq == null)
  • 1219 eq = new StringBuilder();
  • 1220 eq.append('|');
  • 1221 eq.append(ea);
  • 1222 } else {
  • 1223 charClass.appendCodePoint(c);
  • 1224 i++;
  • 1225 }
  • 1226 if (i == normalizedPattern.length())
  • 1227 throw error("Unclosed character class");
  • 1228 lastCodePoint = c;
  • 1229 }
  • 1230
  • 1231 if (eq != null) {
  • 1232 result = "(?:"+charClass.toString()+eq.toString()+")";
  • 1233 } else {
  • 1234 result = charClass.toString();
  • 1235 }
  • 1236
  • 1237 newPattern.append(result);
  • 1238 return i;
  • 1239 }
  • 1240
  • 1241 /**
  • 1242 * Given a specific sequence composed of a regular character and
  • 1243 * combining marks that follow it, produce the alternation that will
  • 1244 * match all canonical equivalences of that sequence.
  • 1245 */
  • 1246 private String produceEquivalentAlternation(String source) {
  • 1247 int len = countChars(source, 0, 1);
  • 1248 if (source.length() == len)
  • 1249 // source has one character.
  • 1250 return source;
  • 1251
  • 1252 String base = source.substring(0,len);
  • 1253 String combiningMarks = source.substring(len);
  • 1254
  • 1255 String[] perms = producePermutations(combiningMarks);
  • 1256 StringBuilder result = new StringBuilder(source);
  • 1257
  • 1258 // Add combined permutations
  • 1259 for(int x=0; x<perms.length; x++) {
  • 1260 String next = base + perms[x];
  • 1261 if (x>0)
  • 1262 result.append("|"+next);
  • 1263 next = composeOneStep(next);
  • 1264 if (next != null)
  • 1265 result.append("|"+produceEquivalentAlternation(next));
  • 1266 }
  • 1267 return result.toString();
  • 1268 }
  • 1269
  • 1270 /**
  • 1271 * Returns an array of strings that have all the possible
  • 1272 * permutations of the characters in the input string.
  • 1273 * This is used to get a list of all possible orderings
  • 1274 * of a set of combining marks. Note that some of the permutations
  • 1275 * are invalid because of combining class collisions, and these
  • 1276 * possibilities must be removed because they are not canonically
  • 1277 * equivalent.
  • 1278 */
  • 1279 private String[] producePermutations(String input) {
  • 1280 if (input.length() == countChars(input, 0, 1))
  • 1281 return new String[] {input};
  • 1282
  • 1283 if (input.length() == countChars(input, 0, 2)) {
  • 1284 int c0 = Character.codePointAt(input, 0);
  • 1285 int c1 = Character.codePointAt(input, Character.charCount(c0));
  • 1286 if (getClass(c1) == getClass(c0)) {
  • 1287 return new String[] {input};
  • 1288 }
  • 1289 String[] result = new String[2];
  • 1290 result[0] = input;
  • 1291 StringBuilder sb = new StringBuilder(2);
  • 1292 sb.appendCodePoint(c1);
  • 1293 sb.appendCodePoint(c0);
  • 1294 result[1] = sb.toString();
  • 1295 return result;
  • 1296 }
  • 1297
  • 1298 int length = 1;
  • 1299 int nCodePoints = countCodePoints(input);
  • 1300 for(int x=1; x<nCodePoints; x++)
  • 1301 length = length * (x+1);
  • 1302
  • 1303 String[] temp = new String[length];
  • 1304
  • 1305 int combClass[] = new int[nCodePoints];
  • 1306 for(int x=0, i=0; x<nCodePoints; x++) {
  • 1307 int c = Character.codePointAt(input, i);
  • 1308 combClass[x] = getClass(c);
  • 1309 i += Character.charCount(c);
  • 1310 }
  • 1311
  • 1312 // For each char, take it out and add the permutations
  • 1313 // of the remaining chars
  • 1314 int index = 0;
  • 1315 int len;
  • 1316 // offset maintains the index in code units.
  • 1317loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
  • 1318 len = countChars(input, offset, 1);
  • 1319 boolean skip = false;
  • 1320 for(int y=x-1; y>=0; y--) {
  • 1321 if (combClass[y] == combClass[x]) {
  • 1322 continue loop;
  • 1323 }
  • 1324 }
  • 1325 StringBuilder sb = new StringBuilder(input);
  • 1326 String otherChars = sb.delete(offset, offset+len).toString();
  • 1327 String[] subResult = producePermutations(otherChars);
  • 1328
  • 1329 String prefix = input.substring(offset, offset+len);
  • 1330 for(int y=0; y<subResult.length; y++)
  • 1331 temp[index++] = prefix + subResult[y];
  • 1332 }
  • 1333 String[] result = new String[index];
  • 1334 for (int x=0; x<index; x++)
  • 1335 result[x] = temp[x];
  • 1336 return result;
  • 1337 }
  • 1338
  • 1339 private int getClass(int c) {
  • 1340 return sun.text.Normalizer.getCombiningClass(c);
  • 1341 }
  • 1342
  • 1343 /**
  • 1344 * Attempts to compose input by combining the first character
  • 1345 * with the first combining mark following it. Returns a String
  • 1346 * that is the composition of the leading character with its first
  • 1347 * combining mark followed by the remaining combining marks. Returns
  • 1348 * null if the first two characters cannot be further composed.
  • 1349 */
  • 1350 private String composeOneStep(String input) {
  • 1351 int len = countChars(input, 0, 2);
  • 1352 String firstTwoCharacters = input.substring(0, len);
  • 1353 String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
  • 1354
  • 1355 if (result.equals(firstTwoCharacters))
  • 1356 return null;
  • 1357 else {
  • 1358 String remainder = input.substring(len);
  • 1359 return result + remainder;
  • 1360 }
  • 1361 }
  • 1362
  • 1363 /**
  • 1364 * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
  • 1365 * See the description of `quotemeta' in perlfunc(1).
  • 1366 */
  • 1367 private void RemoveQEQuoting() {
  • 1368 final int pLen = patternLength;
  • 1369 int i = 0;
  • 1370 while (i < pLen-1) {
  • 1371 if (temp[i] != '\\')
  • 1372 i += 1;
  • 1373 else if (temp[i + 1] != 'Q')
  • 1374 i += 2;
  • 1375 else
  • 1376 break;
  • 1377 }
  • 1378 if (i >= pLen - 1) // No \Q sequence found
  • 1379 return;
  • 1380 int j = i;
  • 1381 i += 2;
  • 1382 int[] newtemp = new int[j + 2*(pLen-i) + 2];
  • 1383 System.arraycopy(temp, 0, newtemp, 0, j);
  • 1384
  • 1385 boolean inQuote = true;
  • 1386 while (i < pLen) {
  • 1387 int c = temp[i++];
  • 1388 if (! ASCII.isAscii(c) || ASCII.isAlnum(c)) {
  • 1389 newtemp[j++] = c;
  • 1390 } else if (c != '\\') {
  • 1391 if (inQuote) newtemp[j++] = '\\';
  • 1392 newtemp[j++] = c;
  • 1393 } else if (inQuote) {
  • 1394 if (temp[i] == 'E') {
  • 1395 i++;
  • 1396 inQuote = false;
  • 1397 } else {
  • 1398 newtemp[j++] = '\\';
  • 1399 newtemp[j++] = '\\';
  • 1400 }
  • 1401 } else {
  • 1402 if (temp[i] == 'Q') {
  • 1403 i++;
  • 1404 inQuote = true;
  • 1405 } else {
  • 1406 newtemp[j++] = c;
  • 1407 if (i != pLen)
  • 1408 newtemp[j++] = temp[i++];
  • 1409 }
  • 1410 }
  • 1411 }
  • 1412
  • 1413 patternLength = j;
  • 1414 temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
  • 1415 }
  • 1416
  • 1417 /**
  • 1418 * Copies regular expression to an int array and invokes the parsing
  • 1419 * of the expression which will create the object tree.
  • 1420 */
  • 1421 private void compile() {
  • 1422 // Handle canonical equivalences
  • 1423 if (has(CANON_EQ) && !has(LITERAL)) {
  • 1424 normalize();
  • 1425 } else {
  • 1426 normalizedPattern = pattern;
  • 1427 }
  • 1428 patternLength = normalizedPattern.length();
  • 1429
  • 1430 // Copy pattern to int array for convenience
  • 1431 // Use double zero to terminate pattern
  • 1432 temp = new int[patternLength + 2];
  • 1433
  • 1434 boolean hasSupplementary = false;
  • 1435 int c, count = 0;
  • 1436 // Convert all chars into code points
  • 1437 for (int x = 0; x < patternLength; x += Character.charCount(c)) {
  • 1438 c = normalizedPattern.codePointAt(x);
  • 1439 if (isSupplementary(c)) {
  • 1440 hasSupplementary = true;
  • 1441 }
  • 1442 temp[count++] = c;
  • 1443 }
  • 1444
  • 1445 patternLength = count; // patternLength now in code points
  • 1446
  • 1447 if (! has(LITERAL))
  • 1448 RemoveQEQuoting();
  • 1449
  • 1450 // Allocate all temporary objects here.
  • 1451 buffer = new int[32];
  • 1452 groupNodes = new GroupHead[10];
  • 1453
  • 1454 if (has(LITERAL)) {
  • 1455 // Literal pattern handling
  • 1456 matchRoot = newSlice(temp, patternLength, hasSupplementary);
  • 1457 matchRoot.next = lastAccept;
  • 1458 } else {
  • 1459 // Start recursive descent parsing
  • 1460 matchRoot = expr(lastAccept);
  • 1461 // Check extra pattern characters
  • 1462 if (patternLength != cursor) {
  • 1463 if (peek() == ')') {
  • 1464 throw error("Unmatched closing ')'");
  • 1465 } else {
  • 1466 throw error("Unexpected internal error");
  • 1467 }
  • 1468 }
  • 1469 }
  • 1470
  • 1471 // Peephole optimization
  • 1472 if (matchRoot instanceof Slice) {
  • 1473 root = BnM.optimize(matchRoot);
  • 1474 if (root == matchRoot) {
  • 1475 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
  • 1476 }
  • 1477 } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
  • 1478 root = matchRoot;
  • 1479 } else {
  • 1480 root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
  • 1481 }
  • 1482
  • 1483 // Release temporary storage
  • 1484 temp = null;
  • 1485 buffer = null;
  • 1486 groupNodes = null;
  • 1487 patternLength = 0;
  • 1488 compiled = true;
  • 1489 }
  • 1490
  • 1491 /**
  • 1492 * Used to print out a subtree of the Pattern to help with debugging.
  • 1493 */
  • 1494 private static void printObjectTree(Node node) {
  • 1495 while(node != null) {
  • 1496 if (node instanceof Prolog) {
  • 1497 System.out.println(node);
  • 1498 printObjectTree(((Prolog)node).loop);
  • 1499 System.out.println("**** end contents prolog loop");
  • 1500 } else if (node instanceof Loop) {
  • 1501 System.out.println(node);
  • 1502 printObjectTree(((Loop)node).body);
  • 1503 System.out.println("**** end contents Loop body");
  • 1504 } else if (node instanceof Curly) {
  • 1505 System.out.println(node);
  • 1506 printObjectTree(((Curly)node).atom);
  • 1507 System.out.println("**** end contents Curly body");
  • 1508 } else if (node instanceof GroupCurly) {
  • 1509 System.out.println(node);
  • 1510 printObjectTree(((GroupCurly)node).atom);
  • 1511 System.out.println("**** end contents GroupCurly body");
  • 1512 } else if (node instanceof GroupTail) {
  • 1513 System.out.println(node);
  • 1514 System.out.println("Tail next is "+node.next);
  • 1515 return;
  • 1516 } else {
  • 1517 System.out.println(node);
  • 1518 }
  • 1519 node = node.next;
  • 1520 if (node != null)
  • 1521 System.out.println("->next:");
  • 1522 if (node == Pattern.accept) {
  • 1523 System.out.println("Accept Node");
  • 1524 node = null;
  • 1525 }
  • 1526 }
  • 1527 }
  • 1528
  • 1529 /**
  • 1530 * Used to accumulate information about a subtree of the object graph
  • 1531 * so that optimizations can be applied to the subtree.
  • 1532 */
  • 1533 static final class TreeInfo {
  • 1534 int minLength;
  • 1535 int maxLength;
  • 1536 boolean maxValid;
  • 1537 boolean deterministic;
  • 1538
  • 1539 TreeInfo() {
  • 1540 reset();
  • 1541 }
  • 1542 void reset() {
  • 1543 minLength = 0;
  • 1544 maxLength = 0;
  • 1545 maxValid = true;
  • 1546 deterministic = true;
  • 1547 }
  • 1548 }
  • 1549
  • 1550 /*
  • 1551 * The following private methods are mainly used to improve the
  • 1552 * readability of the code. In order to let the Java compiler easily
  • 1553 * inline them, we should not put many assertions or error checks in them.
  • 1554 */
  • 1555
  • 1556 /**
  • 1557 * Indicates whether a particular flag is set or not.
  • 1558 */
  • 1559 private boolean has(int f) {
  • 1560 return (flags & f) != 0;
  • 1561 }
  • 1562
  • 1563 /**
  • 1564 * Match next character, signal error if failed.
  • 1565 */
  • 1566 private void accept(int ch, String s) {
  • 1567 int testChar = temp[cursor++];
  • 1568 if (has(COMMENTS))
  • 1569 testChar = parsePastWhitespace(testChar);
  • 1570 if (ch != testChar) {
  • 1571 throw error(s);
  • 1572 }
  • 1573 }
  • 1574
  • 1575 /**
  • 1576 * Mark the end of pattern with a specific character.
  • 1577 */
  • 1578 private void mark(int c) {
  • 1579 temp[patternLength] = c;
  • 1580 }
  • 1581
  • 1582 /**
  • 1583 * Peek the next character, and do not advance the cursor.
  • 1584 */
  • 1585 private int peek() {
  • 1586 int ch = temp[cursor];
  • 1587 if (has(COMMENTS))
  • 1588 ch = peekPastWhitespace(ch);
  • 1589 return ch;
  • 1590 }
  • 1591
  • 1592 /**
  • 1593 * Read the next character, and advance the cursor by one.
  • 1594 */
  • 1595 private int read() {
  • 1596 int ch = temp[cursor++];
  • 1597 if (has(COMMENTS))
  • 1598 ch = parsePastWhitespace(ch);
  • 1599 return ch;
  • 1600 }
  • 1601
  • 1602 /**
  • 1603 * Read the next character, and advance the cursor by one,
  • 1604 * ignoring the COMMENTS setting
  • 1605 */
  • 1606 private int readEscaped() {
  • 1607 int ch = temp[cursor++];
  • 1608 return ch;
  • 1609 }
  • 1610
  • 1611 /**
  • 1612 * Advance the cursor by one, and peek the next character.
  • 1613 */
  • 1614 private int next() {
  • 1615 int ch = temp[++cursor];
  • 1616 if (has(COMMENTS))
  • 1617 ch = peekPastWhitespace(ch);
  • 1618 return ch;
  • 1619 }
  • 1620
  • 1621 /**
  • 1622 * Advance the cursor by one, and peek the next character,
  • 1623 * ignoring the COMMENTS setting
  • 1624 */
  • 1625 private int nextEscaped() {
  • 1626 int ch = temp[++cursor];
  • 1627 return ch;
  • 1628 }
  • 1629
  • 1630 /**
  • 1631 * If in xmode peek past whitespace and comments.
  • 1632 */
  • 1633 private int peekPastWhitespace(int ch) {
  • 1634 while (ASCII.isSpace(ch) || ch == '#') {
  • 1635 while (ASCII.isSpace(ch))
  • 1636 ch = temp[++cursor];
  • 1637 if (ch == '#') {
  • 1638 ch = peekPastLine();
  • 1639 }
  • 1640 }
  • 1641 return ch;
  • 1642 }
  • 1643
  • 1644 /**
  • 1645 * If in xmode parse past whitespace and comments.
  • 1646 */
  • 1647 private int parsePastWhitespace(int ch) {
  • 1648 while (ASCII.isSpace(ch) || ch == '#') {
  • 1649 while (ASCII.isSpace(ch))
  • 1650 ch = temp[cursor++];
  • 1651 if (ch == '#')
  • 1652 ch = parsePastLine();
  • 1653 }
  • 1654 return ch;
  • 1655 }
  • 1656
  • 1657 /**
  • 1658 * xmode parse past comment to end of line.
  • 1659 */
  • 1660 private int parsePastLine() {
  • 1661 int ch = temp[cursor++];
  • 1662 while (ch != 0 && !isLineSeparator(ch))
  • 1663 ch = temp[cursor++];
  • 1664 return ch;
  • 1665 }
  • 1666
  • 1667 /**
  • 1668 * xmode peek past comment to end of line.
  • 1669 */
  • 1670 private int peekPastLine() {
  • 1671 int ch = temp[++cursor];
  • 1672 while (ch != 0 && !isLineSeparator(ch))
  • 1673 ch = temp[++cursor];
  • 1674 return ch;
  • 1675 }
  • 1676
  • 1677 /**
  • 1678 * Determines if character is a line separator in the current mode
  • 1679 */
  • 1680 private boolean isLineSeparator(int ch) {
  • 1681 if (has(UNIX_LINES)) {
  • 1682 return ch == '\n';
  • 1683 } else {
  • 1684 return (ch == '\n' ||
  • 1685 ch == '\r' ||
  • 1686 (ch|1) == '\u2029' ||
  • 1687 ch == '\u0085');
  • 1688 }
  • 1689 }
  • 1690
  • 1691 /**
  • 1692 * Read the character after the next one, and advance the cursor by two.
  • 1693 */
  • 1694 private int skip() {
  • 1695 int i = cursor;
  • 1696 int ch = temp[i+1];
  • 1697 cursor = i + 2;
  • 1698 return ch;
  • 1699 }
  • 1700
  • 1701 /**
  • 1702 * Unread one next character, and retreat cursor by one.
  • 1703 */
  • 1704 private void unread() {
  • 1705 cursor--;
  • 1706 }
  • 1707
  • 1708 /**
  • 1709 * Internal method used for handling all syntax errors. The pattern is
  • 1710 * displayed with a pointer to aid in locating the syntax error.
  • 1711 */
  • 1712 private PatternSyntaxException error(String s) {
  • 1713 return new PatternSyntaxException(s, normalizedPattern, cursor - 1);
  • 1714 }
  • 1715
  • 1716 /**
  • 1717 * Determines if there is any supplementary character or unpaired
  • 1718 * surrogate in the specified range.
  • 1719 */
  • 1720 private boolean findSupplementary(int start, int end) {
  • 1721 for (int i = start; i < end; i++) {
  • 1722 if (isSupplementary(temp[i]))
  • 1723 return true;
  • 1724 }
  • 1725 return false;
  • 1726 }
  • 1727
  • 1728 /**
  • 1729 * Determines if the specified code point is a supplementary
  • 1730 * character or unpaired surrogate.
  • 1731 */
  • 1732 private static final boolean isSupplementary(int ch) {
  • 1733 return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT || isSurrogate(ch);
  • 1734 }
  • 1735
  • 1736 /**
  • 1737 * The following methods handle the main parsing. They are sorted
  • 1738 * according to their precedence order, the lowest one first.
  • 1739 */
  • 1740
  • 1741 /**
  • 1742 * The expression is parsed with branch nodes added for alternations.
  • 1743 * This may be called recursively to parse sub expressions that may
  • 1744 * contain alternations.
  • 1745 */
  • 1746 private Node expr(Node end) {
  • 1747 Node prev = null;
  • 1748 Node firstTail = null;
  • 1749 Node branchConn = null;
  • 1750
  • 1751 for (;;) {
  • 1752 Node node = sequence(end);
  • 1753 Node nodeTail = root; //double return
  • 1754 if (prev == null) {
  • 1755 prev = node;
  • 1756 firstTail = nodeTail;
  • 1757 } else {
  • 1758 // Branch
  • 1759 if (branchConn == null) {
  • 1760 branchConn = new BranchConn();
  • 1761 branchConn.next = end;
  • 1762 }
  • 1763 if (node == end) {
  • 1764 // if the node returned from sequence() is "end"
  • 1765 // we have an empty expr, set a null atom into
  • 1766 // the branch to indicate to go "next" directly.
  • 1767 node = null;
  • 1768 } else {
  • 1769 // the "tail.next" of each atom goes to branchConn
  • 1770 nodeTail.next = branchConn;
  • 1771 }
  • 1772 if (prev instanceof Branch) {
  • 1773 ((Branch)prev).add(node);
  • 1774 } else {
  • 1775 if (prev == end) {
  • 1776 prev = null;
  • 1777 } else {
  • 1778 // replace the "end" with "branchConn" at its tail.next
  • 1779 // when put the "prev" into the branch as the first atom.
  • 1780 firstTail.next = branchConn;
  • 1781 }
  • 1782 prev = new Branch(prev, node, branchConn);
  • 1783 }
  • 1784 }
  • 1785 if (peek() != '|') {
  • 1786 return prev;
  • 1787 }
  • 1788 next();
  • 1789 }
  • 1790 }
  • 1791
  • 1792 /**
  • 1793 * Parsing of sequences between alternations.
  • 1794 */
  • 1795 private Node sequence(Node end) {
  • 1796 Node head = null;
  • 1797 Node tail = null;
  • 1798 Node node = null;
  • 1799 LOOP:
  • 1800 for (;;) {
  • 1801 int ch = peek();
  • 1802 switch (ch) {
  • 1803 case '(':
  • 1804 // Because group handles its own closure,
  • 1805 // we need to treat it differently
  • 1806 node = group0();
  • 1807 // Check for comment or flag group
  • 1808 if (node == null)
  • 1809 continue;
  • 1810 if (head == null)
  • 1811 head = node;
  • 1812 else
  • 1813 tail.next = node;
  • 1814 // Double return: Tail was returned in root
  • 1815 tail = root;
  • 1816 continue;
  • 1817 case '[':
  • 1818 node = clazz(true);
  • 1819 break;
  • 1820 case '\\':
  • 1821 ch = nextEscaped();
  • 1822 if (ch == 'p' || ch == 'P') {
  • 1823 boolean oneLetter = true;
  • 1824 boolean comp = (ch == 'P');
  • 1825 ch = next(); // Consume { if present
  • 1826 if (ch != '{') {
  • 1827 unread();
  • 1828 } else {
  • 1829 oneLetter = false;
  • 1830 }
  • 1831 node = family(oneLetter).maybeComplement(comp);
  • 1832 } else {
  • 1833 unread();
  • 1834 node = atom();
  • 1835 }
  • 1836 break;
  • 1837 case '^':
  • 1838 next();
  • 1839 if (has(MULTILINE)) {
  • 1840 if (has(UNIX_LINES))
  • 1841 node = new UnixCaret();
  • 1842 else
  • 1843 node = new Caret();
  • 1844 } else {
  • 1845 node = new Begin();
  • 1846 }
  • 1847 break;
  • 1848 case '$':
  • 1849 next();
  • 1850 if (has(UNIX_LINES))
  • 1851 node = new UnixDollar(has(MULTILINE));
  • 1852 else
  • 1853 node = new Dollar(has(MULTILINE));
  • 1854 break;
  • 1855 case '.':
  • 1856 next();
  • 1857 if (has(DOTALL)) {
  • 1858 node = new All();
  • 1859 } else {
  • 1860 if (has(UNIX_LINES))
  • 1861 node = new UnixDot();
  • 1862 else {
  • 1863 node = new Dot();
  • 1864 }
  • 1865 }
  • 1866 break;
  • 1867 case '|':
  • 1868 case ')':
  • 1869 break LOOP;
  • 1870 case ']': // Now interpreting dangling ] and } as literals
  • 1871 case '}':
  • 1872 node = atom();
  • 1873 break;
  • 1874 case '?':
  • 1875 case '*':
  • 1876 case '+':
  • 1877 next();
  • 1878 throw error("Dangling meta character '" + ((char)ch) + "'");
  • 1879 case 0:
  • 1880 if (cursor >= patternLength) {
  • 1881 break LOOP;
  • 1882 }
  • 1883 // Fall through
  • 1884 default:
  • 1885 node = atom();
  • 1886 break;
  • 1887 }
  • 1888
  • 1889 node = closure(node);
  • 1890
  • 1891 if (head == null) {
  • 1892 head = tail = node;
  • 1893 } else {
  • 1894 tail.next = node;
  • 1895 tail = node;
  • 1896 }
  • 1897 }
  • 1898 if (head == null) {
  • 1899 return end;
  • 1900 }
  • 1901 tail.next = end;
  • 1902 root = tail; //double return
  • 1903 return head;
  • 1904 }
  • 1905
  • 1906 /**
  • 1907 * Parse and add a new Single or Slice.
  • 1908 */
  • 1909 private Node atom() {
  • 1910 int first = 0;
  • 1911 int prev = -1;
  • 1912 boolean hasSupplementary = false;
  • 1913 int ch = peek();
  • 1914 for (;;) {
  • 1915 switch (ch) {
  • 1916 case '*':
  • 1917 case '+':
  • 1918 case '?':
  • 1919 case '{':
  • 1920 if (first > 1) {
  • 1921 cursor = prev; // Unwind one character
  • 1922 first--;
  • 1923 }
  • 1924 break;
  • 1925 case '$':
  • 1926 case '.':
  • 1927 case '^':
  • 1928 case '(':
  • 1929 case '[':
  • 1930 case '|':
  • 1931 case ')':
  • 1932 break;
  • 1933 case '\\':
  • 1934 ch = nextEscaped();
  • 1935 if (ch == 'p' || ch == 'P') { // Property
  • 1936 if (first > 0) { // Slice is waiting; handle it first
  • 1937 unread();
  • 1938 break;
  • 1939 } else { // No slice; just return the family node
  • 1940 boolean comp = (ch == 'P');
  • 1941 boolean oneLetter = true;
  • 1942 ch = next(); // Consume { if present
  • 1943 if (ch != '{')
  • 1944 unread();
  • 1945 else
  • 1946 oneLetter = false;
  • 1947 return family(oneLetter).maybeComplement(comp);
  • 1948 }
  • 1949 }
  • 1950 unread();
  • 1951 prev = cursor;
  • 1952 ch = escape(false, first == 0);
  • 1953 if (ch >= 0) {
  • 1954 append(ch, first);
  • 1955 first++;
  • 1956 if (isSupplementary(ch)) {
  • 1957 hasSupplementary = true;
  • 1958 }
  • 1959 ch = peek();
  • 1960 continue;
  • 1961 } else if (first == 0) {
  • 1962 return root;
  • 1963 }
  • 1964 // Unwind meta escape sequence
  • 1965 cursor = prev;
  • 1966 break;
  • 1967 case 0:
  • 1968 if (cursor >= patternLength) {
  • 1969 break;
  • 1970 }
  • 1971 // Fall through
  • 1972 default:
  • 1973 prev = cursor;
  • 1974 append(ch, first);
  • 1975 first++;
  • 1976 if (isSupplementary(ch)) {
  • 1977 hasSupplementary = true;
  • 1978 }
  • 1979 ch = next();
  • 1980 continue;
  • 1981 }
  • 1982 break;
  • 1983 }
  • 1984 if (first == 1) {
  • 1985 return newSingle(buffer[0]);
  • 1986 } else {
  • 1987 return newSlice(buffer, first, hasSupplementary);
  • 1988 }
  • 1989 }
  • 1990
  • 1991 private void append(int ch, int len) {
  • 1992 if (len >= buffer.length) {
  • 1993 int[] tmp = new int[len+len];
  • 1994 System.arraycopy(buffer, 0, tmp, 0, len);
  • 1995 buffer = tmp;
  • 1996 }
  • 1997 buffer[len] = ch;
  • 1998 }
  • 1999
  • 2000 /**
  • 2001 * Parses a backref greedily, taking as many numbers as it
  • 2002 * can. The first digit is always treated as a backref, but
  • 2003 * multi digit numbers are only treated as a backref if at
  • 2004 * least that many backrefs exist at this point in the regex.
  • 2005 */
  • 2006 private Node ref(int refNum) {
  • 2007 boolean done = false;
  • 2008 while(!done) {
  • 2009 int ch = peek();
  • 2010 switch(ch) {
  • 2011 case '0':
  • 2012 case '1':
  • 2013 case '2':
  • 2014 case '3':
  • 2015 case '4':
  • 2016 case '5':
  • 2017 case '6':
  • 2018 case '7':
  • 2019 case '8':
  • 2020 case '9':
  • 2021 int newRefNum = (refNum * 10) + (ch - '0');
  • 2022 // Add another number if it doesn't make a group
  • 2023 // that doesn't exist
  • 2024 if (capturingGroupCount - 1 < newRefNum) {
  • 2025 done = true;
  • 2026 break;
  • 2027 }
  • 2028 refNum = newRefNum;
  • 2029 read();
  • 2030 break;
  • 2031 default:
  • 2032 done = true;
  • 2033 break;
  • 2034 }
  • 2035 }
  • 2036 if (has(CASE_INSENSITIVE))
  • 2037 return new CIBackRef(refNum, has(UNICODE_CASE));
  • 2038 else
  • 2039 return new BackRef(refNum);
  • 2040 }
  • 2041
  • 2042 /**
  • 2043 * Parses an escape sequence to determine the actual value that needs
  • 2044 * to be matched.
  • 2045 * If -1 is returned and create was true a new object was added to the tree
  • 2046 * to handle the escape sequence.
  • 2047 * If the returned value is greater than zero, it is the value that
  • 2048 * matches the escape sequence.
  • 2049 */
  • 2050 private int escape(boolean inclass, boolean create) {
  • 2051 int ch = skip();
  • 2052 switch (ch) {
  • 2053 case '0':
  • 2054 return o();
  • 2055 case '1':
  • 2056 case '2':
  • 2057 case '3':
  • 2058 case '4':
  • 2059 case '5':
  • 2060 case '6':
  • 2061 case '7':
  • 2062 case '8':
  • 2063 case '9':
  • 2064 if (inclass) break;
  • 2065 if (create) {
  • 2066 root = ref((ch - '0'));
  • 2067 }
  • 2068 return -1;
  • 2069 case 'A':
  • 2070 if (inclass) break;
  • 2071 if (create) root = new Begin();
  • 2072 return -1;
  • 2073 case 'B':
  • 2074 if (inclass) break;
  • 2075 if (create) root = new Bound(Bound.NONE);
  • 2076 return -1;
  • 2077 case 'C':
  • 2078 break;
  • 2079 case 'D':
  • 2080 if (create) root = new Ctype(ASCII.DIGIT).complement();
  • 2081 return -1;
  • 2082 case 'E':
  • 2083 case 'F':
  • 2084 break;
  • 2085 case 'G':
  • 2086 if (inclass) break;
  • 2087 if (create) root = new LastMatch();
  • 2088 return -1;
  • 2089 case 'H':
  • 2090 case 'I':
  • 2091 case 'J':
  • 2092 case 'K':
  • 2093 case 'L':
  • 2094 case 'M':
  • 2095 case 'N':
  • 2096 case 'O':
  • 2097 case 'P':
  • 2098 case 'Q':
  • 2099 case 'R':
  • 2100 break;
  • 2101 case 'S':
  • 2102 if (create) root = new Ctype(ASCII.SPACE).complement();
  • 2103 return -1;
  • 2104 case 'T':
  • 2105 case 'U':
  • 2106 case 'V':
  • 2107 break;
  • 2108 case 'W':
  • 2109 if (create) root = new Ctype(ASCII.WORD).complement();
  • 2110 return -1;
  • 2111 case 'X':
  • 2112 case 'Y':
  • 2113 break;
  • 2114 case 'Z':
  • 2115 if (inclass) break;
  • 2116 if (create) {
  • 2117 if (has(UNIX_LINES))
  • 2118 root = new UnixDollar(false);
  • 2119 else
  • 2120 root = new Dollar(false);
  • 2121 }
  • 2122 return -1;
  • 2123 case 'a':
  • 2124 return '\007';
  • 2125 case 'b':
  • 2126 if (inclass) break;
  • 2127 if (create) root = new Bound(Bound.BOTH);
  • 2128 return -1;
  • 2129 case 'c':
  • 2130 return c();
  • 2131 case 'd':
  • 2132 if (create) root = new Ctype(ASCII.DIGIT);
  • 2133 return -1;
  • 2134 case 'e':
  • 2135 return '\033';
  • 2136 case 'f':
  • 2137 return '\f';
  • 2138 case 'g':
  • 2139 case 'h':
  • 2140 case 'i':
  • 2141 case 'j':
  • 2142 case 'k':
  • 2143 case 'l':
  • 2144 case 'm':
  • 2145 break;
  • 2146 case 'n':
  • 2147 return '\n';
  • 2148 case 'o':
  • 2149 case 'p':
  • 2150 case 'q':
  • 2151 break;
  • 2152 case 'r':
  • 2153 return '\r';
  • 2154 case 's':
  • 2155 if (create) root = new Ctype(ASCII.SPACE);
  • 2156 return -1;
  • 2157 case 't':
  • 2158 return '\t';
  • 2159 case 'u':
  • 2160 return u();
  • 2161 case 'v':
  • 2162 return '\013';
  • 2163 case 'w':
  • 2164 if (create) root = new Ctype(ASCII.WORD);
  • 2165 return -1;
  • 2166 case 'x':
  • 2167 return x();
  • 2168 case 'y':
  • 2169 break;
  • 2170 case 'z':
  • 2171 if (inclass) break;
  • 2172 if (create) root = new End();
  • 2173 return -1;
  • 2174 default:
  • 2175 return ch;
  • 2176 }
  • 2177 throw error("Illegal/unsupported escape sequence");
  • 2178 }
  • 2179
  • 2180 /**
  • 2181 * Parse a character class, and return the node that matches it.
  • 2182 *
  • 2183 * Consumes a ] on the way out if consume is true. Usually consume
  • 2184 * is true except for the case of [abc&&def] where def is a separate
  • 2185 * right hand node with "understood" brackets.
  • 2186 */
  • 2187 private CharProperty clazz(boolean consume) {
  • 2188 CharProperty prev = null;
  • 2189 CharProperty node = null;
  • 2190 BitClass bits = new BitClass();
  • 2191 boolean include = true;
  • 2192 boolean firstInClass = true;
  • 2193 int ch = next();
  • 2194 for (;;) {
  • 2195 switch (ch) {
  • 2196 case '^':
  • 2197 // Negates if first char in a class, otherwise literal
  • 2198 if (firstInClass) {
  • 2199 if (temp[cursor-1] != '[')
  • 2200 break;
  • 2201 ch = next();
  • 2202 include = !include;
  • 2203 continue;
  • 2204 } else {
  • 2205 // ^ not first in class, treat as literal
  • 2206 break;
  • 2207 }
  • 2208 case '[':
  • 2209 firstInClass = false;
  • 2210 node = clazz(true);
  • 2211 if (prev == null)
  • 2212 prev = node;
  • 2213 else
  • 2214 prev = union(prev, node);
  • 2215 ch = peek();
  • 2216 continue;
  • 2217 case '&':
  • 2218 firstInClass = false;
  • 2219 ch = next();
  • 2220 if (ch == '&') {
  • 2221 ch = next();
  • 2222 CharProperty rightNode = null;
  • 2223 while (ch != ']' && ch != '&') {
  • 2224 if (ch == '[') {
  • 2225 if (rightNode == null)
  • 2226 rightNode = clazz(true);
  • 2227 else
  • 2228 rightNode = union(rightNode, clazz(true));
  • 2229 } else { // abc&&def
  • 2230 unread();
  • 2231 rightNode = clazz(false);
  • 2232 }
  • 2233 ch = peek();
  • 2234 }
  • 2235 if (rightNode != null)
  • 2236 node = rightNode;
  • 2237 if (prev == null) {
  • 2238 if (rightNode == null)
  • 2239 throw error("Bad class syntax");
  • 2240 else
  • 2241 prev = rightNode;
  • 2242 } else {
  • 2243 prev = intersection(prev, node);
  • 2244 }
  • 2245 } else {
  • 2246 // treat as a literal &
  • 2247 unread();
  • 2248 break;
  • 2249 }
  • 2250 continue;
  • 2251 case 0:
  • 2252 firstInClass = false;
  • 2253 if (cursor >= patternLength)
  • 2254 throw error("Unclosed character class");
  • 2255 break;
  • 2256 case ']':
  • 2257 firstInClass = false;
  • 2258 if (prev != null) {
  • 2259 if (consume)
  • 2260 next();
  • 2261 return prev;
  • 2262 }
  • 2263 break;
  • 2264 default:
  • 2265 firstInClass = false;
  • 2266 break;
  • 2267 }
  • 2268 node = range(bits);
  • 2269 if (include) {
  • 2270 if (prev == null) {
  • 2271 prev = node;
  • 2272 } else {
  • 2273 if (prev != node)
  • 2274 prev = union(prev, node);
  • 2275 }
  • 2276 } else {
  • 2277 if (prev == null) {
  • 2278 prev = node.complement();
  • 2279 } else {
  • 2280 if (prev != node)
  • 2281 prev = setDifference(prev, node);
  • 2282 }
  • 2283 }
  • 2284 ch = peek();
  • 2285 }
  • 2286 }
  • 2287
  • 2288 private CharProperty bitsOrSingle(BitClass bits, int ch) {
  • 2289 /* Bits can only handle codepoints in [u+0000-u+00ff] range.
  • 2290 Use "single" node instead of bits when dealing with unicode
  • 2291 case folding for codepoints listed below.
  • 2292 (1)Uppercase out of range: u+00ff, u+00b5
  • 2293 toUpperCase(u+00ff) -> u+0178
  • 2294 toUpperCase(u+00b5) -> u+039c
  • 2295 (2)LatinSmallLetterLongS u+17f
  • 2296 toUpperCase(u+017f) -> u+0053
  • 2297 (3)LatinSmallLetterDotlessI u+131
  • 2298 toUpperCase(u+0131) -> u+0049
  • 2299 (4)LatinCapitalLetterIWithDotAbove u+0130
  • 2300 toLowerCase(u+0130) -> u+0069
  • 2301 (5)KelvinSign u+212a
  • 2302 toLowerCase(u+212a) ==> u+006B
  • 2303 (6)AngstromSign u+212b
  • 2304 toLowerCase(u+212b) ==> u+00e5
  • 2305 */
  • 2306 int d;
  • 2307 if (ch < 256 &&
  • 2308 !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
  • 2309 (ch == 0xff || ch == 0xb5 ||
  • 2310 ch == 0x49 || ch == 0x69 || //I and i
  • 2311 ch == 0x53 || ch == 0x73 || //S and s
  • 2312 ch == 0x4b || ch == 0x6b || //K and k
  • 2313 ch == 0xc5 || ch == 0xe5))) //A+ring
  • 2314 return bits.add(ch, flags());
  • 2315 return newSingle(ch);
  • 2316 }
  • 2317
  • 2318 /**
  • 2319 * Parse a single character or a character range in a character class
  • 2320 * and return its representative node.
  • 2321 */
  • 2322 private CharProperty range(BitClass bits) {
  • 2323 int ch = peek();
  • 2324 if (ch == '\\') {
  • 2325 ch = nextEscaped();
  • 2326 if (ch == 'p' || ch == 'P') { // A property
  • 2327 boolean comp = (ch == 'P');
  • 2328 boolean oneLetter = true;
  • 2329 // Consume { if present
  • 2330 ch = next();
  • 2331 if (ch != '{')
  • 2332 unread();
  • 2333 else
  • 2334 oneLetter = false;
  • 2335 return family(oneLetter).maybeComplement(comp);
  • 2336 } else { // ordinary escape
  • 2337 unread();
  • 2338 ch = escape(true, true);
  • 2339 if (ch == -1)
  • 2340 return (CharProperty) root;
  • 2341 }
  • 2342 } else {
  • 2343 ch = single();
  • 2344 }
  • 2345 if (ch >= 0) {
  • 2346 if (peek() == '-') {
  • 2347 int endRange = temp[cursor+1];
  • 2348 if (endRange == '[') {
  • 2349 return bitsOrSingle(bits, ch);
  • 2350 }
  • 2351 if (endRange != ']') {
  • 2352 next();
  • 2353 int m = single();
  • 2354 if (m < ch)
  • 2355 throw error("Illegal character range");
  • 2356 if (has(CASE_INSENSITIVE))
  • 2357 return caseInsensitiveRangeFor(ch, m);
  • 2358 else
  • 2359 return rangeFor(ch, m);
  • 2360 }
  • 2361 }
  • 2362 return bitsOrSingle(bits, ch);
  • 2363 }
  • 2364 throw error("Unexpected character '"+((char)ch)+"'");
  • 2365 }
  • 2366
  • 2367 private int single() {
  • 2368 int ch = peek();
  • 2369 switch (ch) {
  • 2370 case '\\':
  • 2371 return escape(true, false);
  • 2372 default:
  • 2373 next();
  • 2374 return ch;
  • 2375 }
  • 2376 }
  • 2377
  • 2378 /**
  • 2379 * Parses a Unicode character family and returns its representative node.
  • 2380 */
  • 2381 private CharProperty family(boolean singleLetter) {
  • 2382 next();
  • 2383 String name;
  • 2384
  • 2385 if (singleLetter) {
  • 2386 int c = temp[cursor];
  • 2387 if (!Character.isSupplementaryCodePoint(c)) {
  • 2388 name = String.valueOf((char)c);
  • 2389 } else {
  • 2390 name = new String(temp, cursor, 1);
  • 2391 }
  • 2392 read();
  • 2393 } else {
  • 2394 int i = cursor;
  • 2395 mark('}');
  • 2396 while(read() != '}') {
  • 2397 }
  • 2398 mark('\000');
  • 2399 int j = cursor;
  • 2400 if (j > patternLength)
  • 2401 throw error("Unclosed character family");
  • 2402 if (i + 1 >= j)
  • 2403 throw error("Empty character family");
  • 2404 name = new String(temp, i, j-i-1);
  • 2405 }
  • 2406
  • 2407 if (name.startsWith("In")) {
  • 2408 return unicodeBlockPropertyFor(name.substring(2));
  • 2409 } else {
  • 2410 if (name.startsWith("Is"))
  • 2411 name = name.substring(2);
  • 2412 return charPropertyNodeFor(name);
  • 2413 }
  • 2414 }
  • 2415
  • 2416 /**
  • 2417 * Returns a CharProperty matching all characters in a UnicodeBlock.
  • 2418 */
  • 2419 private CharProperty unicodeBlockPropertyFor(String name) {
  • 2420 final Character.UnicodeBlock block;
  • 2421 try {
  • 2422 block = Character.UnicodeBlock.forName(name);
  • 2423 } catch (IllegalArgumentException iae) {
  • 2424 throw error("Unknown character block name {" + name + "}");
  • 2425 }
  • 2426 return new CharProperty() {
  • 2427 boolean isSatisfiedBy(int ch) {
  • 2428 return block == Character.UnicodeBlock.of(ch);}};
  • 2429 }
  • 2430
  • 2431 /**
  • 2432 * Returns a CharProperty matching all characters in a named property.
  • 2433 */
  • 2434 private CharProperty charPropertyNodeFor(String name) {
  • 2435 CharProperty p = CharPropertyNames.charPropertyFor(name);
  • 2436 if (p == null)
  • 2437 throw error("Unknown character property name {" + name + "}");
  • 2438 return p;
  • 2439 }
  • 2440
  • 2441 /**
  • 2442 * Parses a group and returns the head node of a set of nodes that process
  • 2443 * the group. Sometimes a double return system is used where the tail is
  • 2444 * returned in root.
  • 2445 */
  • 2446 private Node group0() {
  • 2447 boolean capturingGroup = false;
  • 2448 Node head = null;
  • 2449 Node tail = null;
  • 2450 int save = flags;
  • 2451 root = null;
  • 2452 int ch = next();
  • 2453 if (ch == '?') {
  • 2454 ch = skip();
  • 2455 switch (ch) {
  • 2456 case ':': // (?:xxx) pure group
  • 2457 head = createGroup(true);
  • 2458 tail = root;
  • 2459 head.next = expr(tail);
  • 2460 break;
  • 2461 case '=': // (?=xxx) and (?!xxx) lookahead
  • 2462 case '!':
  • 2463 head = createGroup(true);
  • 2464 tail = root;
  • 2465 head.next = expr(tail);
  • 2466 if (ch == '=') {
  • 2467 head = tail = new Pos(head);
  • 2468 } else {
  • 2469 head = tail = new Neg(head);
  • 2470 }
  • 2471 break;
  • 2472 case '>': // (?>xxx) independent group
  • 2473 head = createGroup(true);
  • 2474 tail = root;
  • 2475 head.next = expr(tail);
  • 2476 head = tail = new Ques(head, INDEPENDENT);
  • 2477 break;
  • 2478 case '<': // (?<xxx) look behind
  • 2479 ch = read();
  • 2480 int start = cursor;
  • 2481 head = createGroup(true);
  • 2482 tail = root;
  • 2483 head.next = expr(tail);
  • 2484 tail.next = lookbehindEnd;
  • 2485 TreeInfo info = new TreeInfo();
  • 2486 head.study(info);
  • 2487 if (info.maxValid == false) {
  • 2488 throw error("Look-behind group does not have "
  • 2489 + "an obvious maximum length");
  • 2490 }
  • 2491 boolean hasSupplementary = findSupplementary(start, patternLength);
  • 2492 if (ch == '=') {
  • 2493 head = tail = (hasSupplementary ?
  • 2494 new BehindS(head, info.maxLength,
  • 2495 info.minLength) :
  • 2496 new Behind(head, info.maxLength,
  • 2497 info.minLength));
  • 2498 } else if (ch == '!') {
  • 2499 head = tail = (hasSupplementary ?
  • 2500 new NotBehindS(head, info.maxLength,
  • 2501 info.minLength) :
  • 2502 new NotBehind(head, info.maxLength,
  • 2503 info.minLength));
  • 2504 } else {
  • 2505 throw error("Unknown look-behind group");
  • 2506 }
  • 2507 break;
  • 2508 case '$':
  • 2509 case '@':
  • 2510 throw error("Unknown group type");
  • 2511 default: // (?xxx:) inlined match flags
  • 2512 unread();
  • 2513 addFlag();
  • 2514 ch = read();
  • 2515 if (ch == ')') {
  • 2516 return null; // Inline modifier only
  • 2517 }
  • 2518 if (ch != ':') {
  • 2519 throw error("Unknown inline modifier");
  • 2520 }
  • 2521 head = createGroup(true);
  • 2522 tail = root;
  • 2523 head.next = expr(tail);
  • 2524 break;
  • 2525 }
  • 2526 } else { // (xxx) a regular group
  • 2527 capturingGroup = true;
  • 2528 head = createGroup(false);
  • 2529 tail = root;
  • 2530 head.next = expr(tail);
  • 2531 }
  • 2532
  • 2533 accept(')', "Unclosed group");
  • 2534 flags = save;
  • 2535
  • 2536 // Check for quantifiers
  • 2537 Node node = closure(head);
  • 2538 if (node == head) { // No closure
  • 2539 root = tail;
  • 2540 return node; // Dual return
  • 2541 }
  • 2542 if (head == tail) { // Zero length assertion
  • 2543 root = node;
  • 2544 return node; // Dual return
  • 2545 }
  • 2546
  • 2547 if (node instanceof Ques) {
  • 2548 Ques ques = (Ques) node;
  • 2549 if (ques.type == POSSESSIVE) {
  • 2550 root = node;
  • 2551 return node;
  • 2552 }
  • 2553 tail.next = new BranchConn();
  • 2554 tail = tail.next;
  • 2555 if (ques.type == GREEDY) {
  • 2556 head = new Branch(head, null, tail);
  • 2557 } else { // Reluctant quantifier
  • 2558 head = new Branch(null, head, tail);
  • 2559 }
  • 2560 root = tail;
  • 2561 return head;
  • 2562 } else if (node instanceof Curly) {
  • 2563 Curly curly = (Curly) node;
  • 2564 if (curly.type == POSSESSIVE) {
  • 2565 root = node;
  • 2566 return node;
  • 2567 }
  • 2568 // Discover if the group is deterministic
  • 2569 TreeInfo info = new TreeInfo();
  • 2570 if (head.study(info)) { // Deterministic
  • 2571 GroupTail temp = (GroupTail) tail;
  • 2572 head = root = new GroupCurly(head.next, curly.cmin,
  • 2573 curly.cmax, curly.type,
  • 2574 ((GroupTail)tail).localIndex,
  • 2575 ((GroupTail)tail).groupIndex,
  • 2576 capturingGroup);
  • 2577 return head;
  • 2578 } else { // Non-deterministic
  • 2579 int temp = ((GroupHead) head).localIndex;
  • 2580 Loop loop;
  • 2581 if (curly.type == GREEDY)
  • 2582 loop = new Loop(this.localCount, temp);
  • 2583 else // Reluctant Curly
  • 2584 loop = new LazyLoop(this.localCount, temp);
  • 2585 Prolog prolog = new Prolog(loop);
  • 2586 this.localCount += 1;
  • 2587 loop.cmin = curly.cmin;
  • 2588 loop.cmax = curly.cmax;
  • 2589 loop.body = head;
  • 2590 tail.next = loop;
  • 2591 root = loop;
  • 2592 return prolog; // Dual return
  • 2593 }
  • 2594 }
  • 2595 throw error("Internal logic error");
  • 2596 }
  • 2597
  • 2598 /**
  • 2599 * Create group head and tail nodes using double return. If the group is
  • 2600 * created with anonymous true then it is a pure group and should not
  • 2601 * affect group counting.
  • 2602 */
  • 2603 private Node createGroup(boolean anonymous) {
  • 2604 int localIndex = localCount++;
  • 2605 int groupIndex = 0;
  • 2606 if (!anonymous)
  • 2607 groupIndex = capturingGroupCount++;
  • 2608 GroupHead head = new GroupHead(localIndex);
  • 2609 root = new GroupTail(localIndex, groupIndex);
  • 2610 if (!anonymous && groupIndex < 10)
  • 2611 groupNodes[groupIndex] = head;
  • 2612 return head;
  • 2613 }
  • 2614
  • 2615 /**
  • 2616 * Parses inlined match flags and set them appropriately.
  • 2617 */
  • 2618 private void addFlag() {
  • 2619 int ch = peek();
  • 2620 for (;;) {
  • 2621 switch (ch) {
  • 2622 case 'i':
  • 2623 flags |= CASE_INSENSITIVE;
  • 2624 break;
  • 2625 case 'm':
  • 2626 flags |= MULTILINE;
  • 2627 break;
  • 2628 case 's':
  • 2629 flags |= DOTALL;
  • 2630 break;
  • 2631 case 'd':
  • 2632 flags |= UNIX_LINES;
  • 2633 break;
  • 2634 case 'u':
  • 2635 flags |= UNICODE_CASE;
  • 2636 break;
  • 2637 case 'c':
  • 2638 flags |= CANON_EQ;
  • 2639 break;
  • 2640 case 'x':
  • 2641 flags |= COMMENTS;
  • 2642 break;
  • 2643 case '-': // subFlag then fall through
  • 2644 ch = next();
  • 2645 subFlag();
  • 2646 default:
  • 2647 return;
  • 2648 }
  • 2649 ch = next();
  • 2650 }
  • 2651 }
  • 2652
  • 2653 /**
  • 2654 * Parses the second part of inlined match flags and turns off
  • 2655 * flags appropriately.
  • 2656 */
  • 2657 private void subFlag() {
  • 2658 int ch = peek();
  • 2659 for (;;) {
  • 2660 switch (ch) {
  • 2661 case 'i':
  • 2662 flags &= ~CASE_INSENSITIVE;
  • 2663 break;
  • 2664 case 'm':
  • 2665 flags &= ~MULTILINE;
  • 2666 break;
  • 2667 case 's':
  • 2668 flags &= ~DOTALL;
  • 2669 break;
  • 2670 case 'd':
  • 2671 flags &= ~UNIX_LINES;
  • 2672 break;
  • 2673 case 'u':
  • 2674 flags &= ~UNICODE_CASE;
  • 2675 break;
  • 2676 case 'c':
  • 2677 flags &= ~CANON_EQ;
  • 2678 break;
  • 2679 case 'x':
  • 2680 flags &= ~COMMENTS;
  • 2681 break;
  • 2682 default:
  • 2683 return;
  • 2684 }
  • 2685 ch = next();
  • 2686 }
  • 2687 }
  • 2688
  • 2689 static final int MAX_REPS = 0x7FFFFFFF;
  • 2690
  • 2691 static final int GREEDY = 0;
  • 2692
  • 2693 static final int LAZY = 1;
  • 2694
  • 2695 static final int POSSESSIVE = 2;
  • 2696
  • 2697 static final int INDEPENDENT = 3;
  • 2698
  • 2699 /**
  • 2700 * Processes repetition. If the next character peeked is a quantifier
  • 2701 * then new nodes must be appended to handle the repetition.
  • 2702 * Prev could be a single or a group, so it could be a chain of nodes.
  • 2703 */
  • 2704 private Node closure(Node prev) {
  • 2705 Node atom;
  • 2706 int ch = peek();
  • 2707 switch (ch) {
  • 2708 case '?':
  • 2709 ch = next();
  • 2710 if (ch == '?') {
  • 2711 next();
  • 2712 return new Ques(prev, LAZY);
  • 2713 } else if (ch == '+') {
  • 2714 next();
  • 2715 return new Ques(prev, POSSESSIVE);
  • 2716 }
  • 2717 return new Ques(prev, GREEDY);
  • 2718 case '*':
  • 2719 ch = next();
  • 2720 if (ch == '?') {
  • 2721 next();
  • 2722 return new Curly(prev, 0, MAX_REPS, LAZY);
  • 2723 } else if (ch == '+') {
  • 2724 next();
  • 2725 return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
  • 2726 }
  • 2727 return new Curly(prev, 0, MAX_REPS, GREEDY);
  • 2728 case '+':
  • 2729 ch = next();
  • 2730 if (ch == '?') {
  • 2731 next();
  • 2732 return new Curly(prev, 1, MAX_REPS, LAZY);
  • 2733 } else if (ch == '+') {
  • 2734 next();
  • 2735 return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
  • 2736 }
  • 2737 return new Curly(prev, 1, MAX_REPS, GREEDY);
  • 2738 case '{':
  • 2739 ch = temp[cursor+1];
  • 2740 if (ASCII.isDigit(ch)) {
  • 2741 skip();
  • 2742 int cmin = 0;
  • 2743 do {
  • 2744 cmin = cmin * 10 + (ch - '0');
  • 2745 } while (ASCII.isDigit(ch = read()));
  • 2746 int cmax = cmin;
  • 2747 if (ch == ',') {
  • 2748 ch = read();
  • 2749 cmax = MAX_REPS;
  • 2750 if (ch != '}') {
  • 2751 cmax = 0;
  • 2752 while (ASCII.isDigit(ch)) {
  • 2753 cmax = cmax * 10 + (ch - '0');
  • 2754 ch = read();
  • 2755 }
  • 2756 }
  • 2757 }
  • 2758 if (ch != '}')
  • 2759 throw error("Unclosed counted closure");
  • 2760 if (((cmin) | (cmax) | (cmax - cmin)) < 0)
  • 2761 throw error("Illegal repetition range");
  • 2762 Curly curly;
  • 2763 ch = peek();
  • 2764 if (ch == '?') {
  • 2765 next();
  • 2766 curly = new Curly(prev, cmin, cmax, LAZY);
  • 2767 } else if (ch == '+') {
  • 2768 next();
  • 2769 curly = new Curly(prev, cmin, cmax, POSSESSIVE);
  • 2770 } else {
  • 2771 curly = new Curly(prev, cmin, cmax, GREEDY);
  • 2772 }
  • 2773 return curly;
  • 2774 } else {
  • 2775 throw error("Illegal repetition");
  • 2776 }
  • 2777 default:
  • 2778 return prev;
  • 2779 }
  • 2780 }
  • 2781
  • 2782 /**
  • 2783 * Utility method for parsing control escape sequences.
  • 2784 */
  • 2785 private int c() {
  • 2786 if (cursor < patternLength) {
  • 2787 return read() ^ 64;
  • 2788 }
  • 2789 throw error("Illegal control escape sequence");
  • 2790 }
  • 2791
  • 2792 /**
  • 2793 * Utility method for parsing octal escape sequences.
  • 2794 */
  • 2795 private int o() {
  • 2796 int n = read();
  • 2797 if (((n-'0')|('7'-n)) >= 0) {
  • 2798 int m = read();
  • 2799 if (((m-'0')|('7'-m)) >= 0) {
  • 2800 int o = read();
  • 2801 if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
  • 2802 return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
  • 2803 }
  • 2804 unread();
  • 2805 return (n - '0') * 8 + (m - '0');
  • 2806 }
  • 2807 unread();
  • 2808 return (n - '0');
  • 2809 }
  • 2810 throw error("Illegal octal escape sequence");
  • 2811 }
  • 2812
  • 2813 /**
  • 2814 * Utility method for parsing hexadecimal escape sequences.
  • 2815 */
  • 2816 private int x() {
  • 2817 int n = read();
  • 2818 if (ASCII.isHexDigit(n)) {
  • 2819 int m = read();
  • 2820 if (ASCII.isHexDigit(m)) {
  • 2821 return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
  • 2822 }
  • 2823 }
  • 2824 throw error("Illegal hexadecimal escape sequence");
  • 2825 }
  • 2826
  • 2827 /**
  • 2828 * Utility method for parsing unicode escape sequences.
  • 2829 */
  • 2830 private int u() {
  • 2831 int n = 0;
  • 2832 for (int i = 0; i < 4; i++) {
  • 2833 int ch = read();
  • 2834 if (!ASCII.isHexDigit(ch)) {
  • 2835 throw error("Illegal Unicode escape sequence");
  • 2836 }
  • 2837 n = n * 16 + ASCII.toDigit(ch);
  • 2838 }
  • 2839 return n;
  • 2840 }
  • 2841
  • 2842 //
  • 2843 // Utility methods for code point support
  • 2844 //
  • 2845
  • 2846 /**
  • 2847 * Tests a surrogate value.
  • 2848 */
  • 2849 private static final boolean isSurrogate(int c) {
  • 2850 return c >= Character.MIN_HIGH_SURROGATE && c <= Character.MAX_LOW_SURROGATE;
  • 2851 }
  • 2852
  • 2853 private static final int countChars(CharSequence seq, int index,
  • 2854 int lengthInCodePoints) {
  • 2855 // optimization
  • 2856 if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
  • 2857 assert (index >= 0 && index < seq.length());
  • 2858 return 1;
  • 2859 }
  • 2860 int length = seq.length();
  • 2861 int x = index;
  • 2862 if (lengthInCodePoints >= 0) {
  • 2863 assert (index >= 0 && index < length);
  • 2864 for (int i = 0; x < length && i < lengthInCodePoints; i++) {
  • 2865 if (Character.isHighSurrogate(seq.charAt(x++))) {
  • 2866 if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
  • 2867 x++;
  • 2868 }
  • 2869 }
  • 2870 }
  • 2871 return x - index;
  • 2872 }
  • 2873
  • 2874 assert (index >= 0 && index <= length);
  • 2875 if (index == 0) {
  • 2876 return 0;
  • 2877 }
  • 2878 int len = -lengthInCodePoints;
  • 2879 for (int i = 0; x > 0 && i < len; i++) {
  • 2880 if (Character.isLowSurrogate(seq.charAt(--x))) {
  • 2881 if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
  • 2882 x--;
  • 2883 }
  • 2884 }
  • 2885 }
  • 2886 return index - x;
  • 2887 }
  • 2888
  • 2889 private static final int countCodePoints(CharSequence seq) {
  • 2890 int length = seq.length();
  • 2891 int n = 0;
  • 2892 for (int i = 0; i < length; ) {
  • 2893 n++;
  • 2894 if (Character.isHighSurrogate(seq.charAt(i++))) {
  • 2895 if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
  • 2896 i++;
  • 2897 }
  • 2898 }
  • 2899 }
  • 2900 return n;
  • 2901 }
  • 2902
  • 2903 /**
  • 2904 * Creates a bit vector for matching Latin-1 values. A normal BitClass
  • 2905 * never matches values above Latin-1, and a complemented BitClass always
  • 2906 * matches values above Latin-1.
  • 2907 */
  • 2908 private static final class BitClass extends BmpCharProperty {
  • 2909 final boolean[] bits;
  • 2910 BitClass() { bits = new boolean[256]; }
  • 2911 private BitClass(boolean[] bits) { this.bits = bits; }
  • 2912 BitClass add(int c, int flags) {
  • 2913 assert c >= 0 && c <= 255;
  • 2914 if ((flags & CASE_INSENSITIVE) != 0) {
  • 2915 if (ASCII.isAscii(c)) {
  • 2916 bits[ASCII.toUpper(c)] = true;
  • 2917 bits[ASCII.toLower(c)] = true;
  • 2918 } else if ((flags & UNICODE_CASE) != 0) {
  • 2919 bits[Character.toLowerCase(c)] = true;
  • 2920 bits[Character.toUpperCase(c)] = true;
  • 2921 }
  • 2922 }
  • 2923 bits[c] = true;
  • 2924 return this;
  • 2925 }
  • 2926 boolean isSatisfiedBy(int ch) {
  • 2927 return ch < 256 && bits[ch];
  • 2928 }
  • 2929 }
  • 2930
  • 2931 /**
  • 2932 * Returns a suitably optimized, single character matcher.
  • 2933 */
  • 2934 private CharProperty newSingle(final int ch) {
  • 2935 if (has(CASE_INSENSITIVE)) {
  • 2936 int lower, upper;
  • 2937 if (has(UNICODE_CASE)) {
  • 2938 upper = Character.toUpperCase(ch);
  • 2939 lower = Character.toLowerCase(upper);
  • 2940 if (upper != lower)
  • 2941 return new SingleU(lower);
  • 2942 } else if (ASCII.isAscii(ch)) {
  • 2943 lower = ASCII.toLower(ch);
  • 2944 upper = ASCII.toUpper(ch);
  • 2945 if (lower != upper)
  • 2946 return new SingleI(lower, upper);
  • 2947 }
  • 2948 }
  • 2949 if (isSupplementary(ch))
  • 2950 return new SingleS(ch); // Match a given Unicode character
  • 2951 return new Single(ch); // Match a given BMP character
  • 2952 }
  • 2953
  • 2954 /**
  • 2955 * Utility method for creating a string slice matcher.
  • 2956 */
  • 2957 private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
  • 2958 int[] tmp = new int[count];
  • 2959 if (has(CASE_INSENSITIVE)) {
  • 2960 if (has(UNICODE_CASE)) {
  • 2961 for (int i = 0; i < count; i++) {
  • 2962 tmp[i] = Character.toLowerCase(
  • 2963 Character.toUpperCase(buf[i]));
  • 2964 }
  • 2965 return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
  • 2966 }
  • 2967 for (int i = 0; i < count; i++) {
  • 2968 tmp[i] = ASCII.toLower(buf[i]);
  • 2969 }
  • 2970 return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
  • 2971 }
  • 2972 for (int i = 0; i < count; i++) {
  • 2973 tmp[i] = buf[i];
  • 2974 }
  • 2975 return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
  • 2976 }
  • 2977
  • 2978 /**
  • 2979 * The following classes are the building components of the object
  • 2980 * tree that represents a compiled regular expression. The object tree
  • 2981 * is made of individual elements that handle constructs in the Pattern.
  • 2982 * Each type of object knows how to match its equivalent construct with
  • 2983 * the match() method.
  • 2984 */
  • 2985
  • 2986 /**
  • 2987 * Base class for all node classes. Subclasses should override the match()
  • 2988 * method as appropriate. This class is an accepting node, so its match()
  • 2989 * always returns true.
  • 2990 */
  • 2991 static class Node extends Object {
  • 2992 Node next;
  • 2993 Node() {
  • 2994 next = Pattern.accept;
  • 2995 }
  • 2996 /**
  • 2997 * This method implements the classic accept node.
  • 2998 */
  • 2999 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3000 matcher.last = i;
  • 3001 matcher.groups[0] = matcher.first;
  • 3002 matcher.groups[1] = matcher.last;
  • 3003 return true;
  • 3004 }
  • 3005 /**
  • 3006 * This method is good for all zero length assertions.
  • 3007 */
  • 3008 boolean study(TreeInfo info) {
  • 3009 if (next != null) {
  • 3010 return next.study(info);
  • 3011 } else {
  • 3012 return info.deterministic;
  • 3013 }
  • 3014 }
  • 3015 }
  • 3016
  • 3017 static class LastNode extends Node {
  • 3018 /**
  • 3019 * This method implements the classic accept node with
  • 3020 * the addition of a check to see if the match occurred
  • 3021 * using all of the input.
  • 3022 */
  • 3023 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3024 if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)
  • 3025 return false;
  • 3026 matcher.last = i;
  • 3027 matcher.groups[0] = matcher.first;
  • 3028 matcher.groups[1] = matcher.last;
  • 3029 return true;
  • 3030 }
  • 3031 }
  • 3032
  • 3033 /**
  • 3034 * Used for REs that can start anywhere within the input string.
  • 3035 * This basically tries to match repeatedly at each spot in the
  • 3036 * input string, moving forward after each try. An anchored search
  • 3037 * or a BnM will bypass this node completely.
  • 3038 */
  • 3039 static class Start extends Node {
  • 3040 int minLength;
  • 3041 Start(Node node) {
  • 3042 this.next = node;
  • 3043 TreeInfo info = new TreeInfo();
  • 3044 next.study(info);
  • 3045 minLength = info.minLength;
  • 3046 }
  • 3047 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3048 if (i > matcher.to - minLength) {
  • 3049 matcher.hitEnd = true;
  • 3050 return false;
  • 3051 }
  • 3052 boolean ret = false;
  • 3053 int guard = matcher.to - minLength;
  • 3054 for (; i <= guard; i++) {
  • 3055 if (ret = next.match(matcher, i, seq))
  • 3056 break;
  • 3057 if (i == guard)
  • 3058 matcher.hitEnd = true;
  • 3059 }
  • 3060 if (ret) {
  • 3061 matcher.first = i;
  • 3062 matcher.groups[0] = matcher.first;
  • 3063 matcher.groups[1] = matcher.last;
  • 3064 }
  • 3065 return ret;
  • 3066 }
  • 3067 boolean study(TreeInfo info) {
  • 3068 next.study(info);
  • 3069 info.maxValid = false;
  • 3070 info.deterministic = false;
  • 3071 return false;
  • 3072 }
  • 3073 }
  • 3074
  • 3075 /*
  • 3076 * StartS supports supplementary characters, including unpaired surrogates.
  • 3077 */
  • 3078 static final class StartS extends Start {
  • 3079 StartS(Node node) {
  • 3080 super(node);
  • 3081 }
  • 3082 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3083 if (i > matcher.to - minLength) {
  • 3084 matcher.hitEnd = true;
  • 3085 return false;
  • 3086 }
  • 3087 boolean ret = false;
  • 3088 int guard = matcher.to - minLength;
  • 3089 while (i <= guard) {
  • 3090 if ((ret = next.match(matcher, i, seq)) || i == guard)
  • 3091 break;
  • 3092 // Optimization to move to the next character. This is
  • 3093 // faster than countChars(seq, i, 1).
  • 3094 if (Character.isHighSurrogate(seq.charAt(i++))) {
  • 3095 if (i < seq.length() && Character.isLowSurrogate(seq.charAt(i))) {
  • 3096 i++;
  • 3097 }
  • 3098 }
  • 3099 if (i == guard)
  • 3100 matcher.hitEnd = true;
  • 3101 }
  • 3102 if (ret) {
  • 3103 matcher.first = i;
  • 3104 matcher.groups[0] = matcher.first;
  • 3105 matcher.groups[1] = matcher.last;
  • 3106 }
  • 3107 return ret;
  • 3108 }
  • 3109 }
  • 3110
  • 3111 /**
  • 3112 * Node to anchor at the beginning of input. This object implements the
  • 3113 * match for a \A sequence, and the caret anchor will use this if not in
  • 3114 * multiline mode.
  • 3115 */
  • 3116 static final class Begin extends Node {
  • 3117 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3118 int fromIndex = (matcher.anchoringBounds) ?
  • 3119 matcher.from : 0;
  • 3120 if (i == fromIndex && next.match(matcher, i, seq)) {
  • 3121 matcher.first = i;
  • 3122 matcher.groups[0] = i;
  • 3123 matcher.groups[1] = matcher.last;
  • 3124 return true;
  • 3125 } else {
  • 3126 return false;
  • 3127 }
  • 3128 }
  • 3129 }
  • 3130
  • 3131 /**
  • 3132 * Node to anchor at the end of input. This is the absolute end, so this
  • 3133 * should not match at the last newline before the end as $ will.
  • 3134 */
  • 3135 static final class End extends Node {
  • 3136 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3137 int endIndex = (matcher.anchoringBounds) ?
  • 3138 matcher.to : matcher.getTextLength();
  • 3139 if (i == endIndex) {
  • 3140 matcher.hitEnd = true;
  • 3141 return next.match(matcher, i, seq);
  • 3142 }
  • 3143 return false;
  • 3144 }
  • 3145 }
  • 3146
  • 3147 /**
  • 3148 * Node to anchor at the beginning of a line. This is essentially the
  • 3149 * object to match for the multiline ^.
  • 3150 */
  • 3151 static final class Caret extends Node {
  • 3152 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3153 int startIndex = matcher.from;
  • 3154 int endIndex = matcher.to;
  • 3155 if (!matcher.anchoringBounds) {
  • 3156 startIndex = 0;
  • 3157 endIndex = matcher.getTextLength();
  • 3158 }
  • 3159 // Perl does not match ^ at end of input even after newline
  • 3160 if (i == endIndex) {
  • 3161 matcher.hitEnd = true;
  • 3162 return false;
  • 3163 }
  • 3164 if (i > startIndex) {
  • 3165 char ch = seq.charAt(i-1);
  • 3166 if (ch != '\n' && ch != '\r'
  • 3167 && (ch|1) != '\u2029'
  • 3168 && ch != '\u0085' ) {
  • 3169 return false;
  • 3170 }
  • 3171 // Should treat /r/n as one newline
  • 3172 if (ch == '\r' && seq.charAt(i) == '\n')
  • 3173 return false;
  • 3174 }
  • 3175 return next.match(matcher, i, seq);
  • 3176 }
  • 3177 }
  • 3178
  • 3179 /**
  • 3180 * Node to anchor at the beginning of a line when in unixdot mode.
  • 3181 */
  • 3182 static final class UnixCaret extends Node {
  • 3183 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3184 int startIndex = matcher.from;
  • 3185 int endIndex = matcher.to;
  • 3186 if (!matcher.anchoringBounds) {
  • 3187 startIndex = 0;
  • 3188 endIndex = matcher.getTextLength();
  • 3189 }
  • 3190 // Perl does not match ^ at end of input even after newline
  • 3191 if (i == endIndex) {
  • 3192 matcher.hitEnd = true;
  • 3193 return false;
  • 3194 }
  • 3195 if (i > startIndex) {
  • 3196 char ch = seq.charAt(i-1);
  • 3197 if (ch != '\n') {
  • 3198 return false;
  • 3199 }
  • 3200 }
  • 3201 return next.match(matcher, i, seq);
  • 3202 }
  • 3203 }
  • 3204
  • 3205 /**
  • 3206 * Node to match the location where the last match ended.
  • 3207 * This is used for the \G construct.
  • 3208 */
  • 3209 static final class LastMatch extends Node {
  • 3210 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3211 if (i != matcher.oldLast)
  • 3212 return false;
  • 3213 return next.match(matcher, i, seq);
  • 3214 }
  • 3215 }
  • 3216
  • 3217 /**
  • 3218 * Node to anchor at the end of a line or the end of input based on the
  • 3219 * multiline mode.
  • 3220 *
  • 3221 * When not in multiline mode, the $ can only match at the very end
  • 3222 * of the input, unless the input ends in a line terminator in which
  • 3223 * it matches right before the last line terminator.
  • 3224 *
  • 3225 * Note that \r\n is considered an atomic line terminator.
  • 3226 *
  • 3227 * Like ^ the $ operator matches at a position, it does not match the
  • 3228 * line terminators themselves.
  • 3229 */
  • 3230 static final class Dollar extends Node {
  • 3231 boolean multiline;
  • 3232 Dollar(boolean mul) {
  • 3233 multiline = mul;
  • 3234 }
  • 3235 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3236 int endIndex = (matcher.anchoringBounds) ?
  • 3237 matcher.to : matcher.getTextLength();
  • 3238 if (!multiline) {
  • 3239 if (i < endIndex - 2)
  • 3240 return false;
  • 3241 if (i == endIndex - 2) {
  • 3242 char ch = seq.charAt(i);
  • 3243 if (ch != '\r')
  • 3244 return false;
  • 3245 ch = seq.charAt(i + 1);
  • 3246 if (ch != '\n')
  • 3247 return false;
  • 3248 }
  • 3249 }
  • 3250 // Matches before any line terminator; also matches at the
  • 3251 // end of input
  • 3252 // Before line terminator:
  • 3253 // If multiline, we match here no matter what
  • 3254 // If not multiline, fall through so that the end
  • 3255 // is marked as hit; this must be a /r/n or a /n
  • 3256 // at the very end so the end was hit; more input
  • 3257 // could make this not match here
  • 3258 if (i < endIndex) {
  • 3259 char ch = seq.charAt(i);
  • 3260 if (ch == '\n') {
  • 3261 // No match between \r\n
  • 3262 if (i > 0 && seq.charAt(i-1) == '\r')
  • 3263 return false;
  • 3264 if (multiline)
  • 3265 return next.match(matcher, i, seq);
  • 3266 } else if (ch == '\r' || ch == '\u0085' ||
  • 3267 (ch|1) == '\u2029') {
  • 3268 if (multiline)
  • 3269 return next.match(matcher, i, seq);
  • 3270 } else { // No line terminator, no match
  • 3271 return false;
  • 3272 }
  • 3273 }
  • 3274 // Matched at current end so hit end
  • 3275 matcher.hitEnd = true;
  • 3276 // If a $ matches because of end of input, then more input
  • 3277 // could cause it to fail!
  • 3278 matcher.requireEnd = true;
  • 3279 return next.match(matcher, i, seq);
  • 3280 }
  • 3281 boolean study(TreeInfo info) {
  • 3282 next.study(info);
  • 3283 return info.deterministic;
  • 3284 }
  • 3285 }
  • 3286
  • 3287 /**
  • 3288 * Node to anchor at the end of a line or the end of input based on the
  • 3289 * multiline mode when in unix lines mode.
  • 3290 */
  • 3291 static final class UnixDollar extends Node {
  • 3292 boolean multiline;
  • 3293 UnixDollar(boolean mul) {
  • 3294 multiline = mul;
  • 3295 }
  • 3296 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3297 int endIndex = (matcher.anchoringBounds) ?
  • 3298 matcher.to : matcher.getTextLength();
  • 3299 if (i < endIndex) {
  • 3300 char ch = seq.charAt(i);
  • 3301 if (ch == '\n') {
  • 3302 // If not multiline, then only possible to
  • 3303 // match at very end or one before end
  • 3304 if (multiline == false && i != endIndex - 1)
  • 3305 return false;
  • 3306 // If multiline return next.match without setting
  • 3307 // matcher.hitEnd
  • 3308 if (multiline)
  • 3309 return next.match(matcher, i, seq);
  • 3310 } else {
  • 3311 return false;
  • 3312 }
  • 3313 }
  • 3314 // Matching because at the end or 1 before the end;
  • 3315 // more input could change this so set hitEnd
  • 3316 matcher.hitEnd = true;
  • 3317 // If a $ matches because of end of input, then more input
  • 3318 // could cause it to fail!
  • 3319 matcher.requireEnd = true;
  • 3320 return next.match(matcher, i, seq);
  • 3321 }
  • 3322 boolean study(TreeInfo info) {
  • 3323 next.study(info);
  • 3324 return info.deterministic;
  • 3325 }
  • 3326 }
  • 3327
  • 3328 /**
  • 3329 * Abstract node class to match one character satisfying some
  • 3330 * boolean property.
  • 3331 */
  • 3332 private static abstract class CharProperty extends Node {
  • 3333 abstract boolean isSatisfiedBy(int ch);
  • 3334 CharProperty complement() {
  • 3335 return new CharProperty() {
  • 3336 boolean isSatisfiedBy(int ch) {
  • 3337 return ! CharProperty.this.isSatisfiedBy(ch);}};
  • 3338 }
  • 3339 CharProperty maybeComplement(boolean complement) {
  • 3340 return complement ? complement() : this;
  • 3341 }
  • 3342 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3343 if (i < matcher.to) {
  • 3344 int ch = Character.codePointAt(seq, i);
  • 3345 return isSatisfiedBy(ch)
  • 3346 && next.match(matcher, i+Character.charCount(ch), seq);
  • 3347 } else {
  • 3348 matcher.hitEnd = true;
  • 3349 return false;
  • 3350 }
  • 3351 }
  • 3352 boolean study(TreeInfo info) {
  • 3353 info.minLength++;
  • 3354 info.maxLength++;
  • 3355 return next.study(info);
  • 3356 }
  • 3357 }
  • 3358
  • 3359 /**
  • 3360 * Optimized version of CharProperty that works only for
  • 3361 * properties never satisfied by Supplementary characters.
  • 3362 */
  • 3363 private static abstract class BmpCharProperty extends CharProperty {
  • 3364 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3365 if (i < matcher.to) {
  • 3366 return isSatisfiedBy(seq.charAt(i))
  • 3367 && next.match(matcher, i+1, seq);
  • 3368 } else {
  • 3369 matcher.hitEnd = true;
  • 3370 return false;
  • 3371 }
  • 3372 }
  • 3373 }
  • 3374
  • 3375 /**
  • 3376 * Node class that matches a Supplementary Unicode character
  • 3377 */
  • 3378 static final class SingleS extends CharProperty {
  • 3379 final int c;
  • 3380 SingleS(int c) { this.c = c; }
  • 3381 boolean isSatisfiedBy(int ch) {
  • 3382 return ch == c;
  • 3383 }
  • 3384 }
  • 3385
  • 3386 /**
  • 3387 * Optimization -- matches a given BMP character
  • 3388 */
  • 3389 static final class Single extends BmpCharProperty {
  • 3390 final int c;
  • 3391 Single(int c) { this.c = c; }
  • 3392 boolean isSatisfiedBy(int ch) {
  • 3393 return ch == c;
  • 3394 }
  • 3395 }
  • 3396
  • 3397 /**
  • 3398 * Case insensitive matches a given BMP character
  • 3399 */
  • 3400 static final class SingleI extends BmpCharProperty {
  • 3401 final int lower;
  • 3402 final int upper;
  • 3403 SingleI(int lower, int upper) {
  • 3404 this.lower = lower;
  • 3405 this.upper = upper;
  • 3406 }
  • 3407 boolean isSatisfiedBy(int ch) {
  • 3408 return ch == lower || ch == upper;
  • 3409 }
  • 3410 }
  • 3411
  • 3412 /**
  • 3413 * Unicode case insensitive matches a given Unicode character
  • 3414 */
  • 3415 static final class SingleU extends CharProperty {
  • 3416 final int lower;
  • 3417 SingleU(int lower) {
  • 3418 this.lower = lower;
  • 3419 }
  • 3420 boolean isSatisfiedBy(int ch) {
  • 3421 return lower == ch ||
  • 3422 lower == Character.toLowerCase(Character.toUpperCase(ch));
  • 3423 }
  • 3424 }
  • 3425
  • 3426 /**
  • 3427 * Node class that matches a Unicode category.
  • 3428 */
  • 3429 static final class Category extends CharProperty {
  • 3430 final int typeMask;
  • 3431 Category(int typeMask) { this.typeMask = typeMask; }
  • 3432 boolean isSatisfiedBy(int ch) {
  • 3433 return (typeMask & (1 << Character.getType(ch))) != 0;
  • 3434 }
  • 3435 }
  • 3436
  • 3437 /**
  • 3438 * Node class that matches a POSIX type.
  • 3439 */
  • 3440 static final class Ctype extends BmpCharProperty {
  • 3441 final int ctype;
  • 3442 Ctype(int ctype) { this.ctype = ctype; }
  • 3443 boolean isSatisfiedBy(int ch) {
  • 3444 return ch < 128 && ASCII.isType(ch, ctype);
  • 3445 }
  • 3446 }
  • 3447
  • 3448 /**
  • 3449 * Base class for all Slice nodes
  • 3450 */
  • 3451 static class SliceNode extends Node {
  • 3452 int[] buffer;
  • 3453 SliceNode(int[] buf) {
  • 3454 buffer = buf;
  • 3455 }
  • 3456 boolean study(TreeInfo info) {
  • 3457 info.minLength += buffer.length;
  • 3458 info.maxLength += buffer.length;
  • 3459 return next.study(info);
  • 3460 }
  • 3461 }
  • 3462
  • 3463 /**
  • 3464 * Node class for a case sensitive/BMP-only sequence of literal
  • 3465 * characters.
  • 3466 */
  • 3467 static final class Slice extends SliceNode {
  • 3468 Slice(int[] buf) {
  • 3469 super(buf);
  • 3470 }
  • 3471 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3472 int[] buf = buffer;
  • 3473 int len = buf.length;
  • 3474 for (int j=0; j<len; j++) {
  • 3475 if ((i+j) >= matcher.to) {
  • 3476 matcher.hitEnd = true;
  • 3477 return false;
  • 3478 }
  • 3479 if (buf[j] != seq.charAt(i+j))
  • 3480 return false;
  • 3481 }
  • 3482 return next.match(matcher, i+len, seq);
  • 3483 }
  • 3484 }
  • 3485
  • 3486 /**
  • 3487 * Node class for a case_insensitive/BMP-only sequence of literal
  • 3488 * characters.
  • 3489 */
  • 3490 static class SliceI extends SliceNode {
  • 3491 SliceI(int[] buf) {
  • 3492 super(buf);
  • 3493 }
  • 3494 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3495 int[] buf = buffer;
  • 3496 int len = buf.length;
  • 3497 for (int j=0; j<len; j++) {
  • 3498 if ((i+j) >= matcher.to) {
  • 3499 matcher.hitEnd = true;
  • 3500 return false;
  • 3501 }
  • 3502 int c = seq.charAt(i+j);
  • 3503 if (buf[j] != c &&
  • 3504 buf[j] != ASCII.toLower(c))
  • 3505 return false;
  • 3506 }
  • 3507 return next.match(matcher, i+len, seq);
  • 3508 }
  • 3509 }
  • 3510
  • 3511 /**
  • 3512 * Node class for a unicode_case_insensitive/BMP-only sequence of
  • 3513 * literal characters. Uses unicode case folding.
  • 3514 */
  • 3515 static final class SliceU extends SliceNode {
  • 3516 SliceU(int[] buf) {
  • 3517 super(buf);
  • 3518 }
  • 3519 boolean match(Matcher matcher, int i, CharSequence seq) {
  • 3520 int[] buf = buffer;
  • 3521 int len = buf.length;
  • 3522