1  /*
     2   * Copyright the original author or authors.
     3   * 
     4   * Licensed under the MOZILLA PUBLIC LICENSE, Version 1.1 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   * 
     8   *      http://www.mozilla.org/MPL/MPL-1.1.html
     9   * 
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  import org.as2lib.regexp.AsciiUtil;
    18  import org.as2lib.regexp.Pattern;
    19  import org.as2lib.regexp.node.*;
    20  
    21  import org.as2lib.util.StringUtil;
    22  import org.as2lib.data.holder.map.HashMap;
    23  /**
    24   * {@code PosixPattern} provides implementations of the parsing engine for 
    25   * POSIX character classes and Unicode blocks and categories.
    26   * 
    27   * @author Igor Sadovskiy
    28   * @see org.as2lib.regexp.Matcher
    29   * @see org.as2lib.regexp.Pattern
    30   */
    31  
    32  
    33  class org.as2lib.regexp.PosixPattern extends Pattern {
    34  		
    35      private static var families:HashMap = null;
    36  
    37      private static var categories:HashMap = null;
    38  	
    39      private static var familyNames:Array = [ 
    40          "BasicLatin",
    41          "Latin-1Supplement",
    42          "LatinExtended-A",
    43          "LatinExtended-Bound",
    44          "IPAExtensions",
    45          "SpacingModifierLetters",
    46          "CombiningDiacriticalMarks",
    47          "Greek",
    48          "Cyrillic",
    49          "Armenian",
    50          "Hebrew",
    51          "Arabic",
    52          "Syriac",
    53          "Thaana",
    54          "Devanagari",
    55          "Bengali",
    56          "Gurmukhi",
    57          "Gujarati",
    58          "Oriya",
    59          "Tamil",
    60          "Telugu",
    61          "Kannada",
    62          "Malayalam",
    63          "Sinhala",
    64          "Thai",
    65          "Lao",
    66          "Tibetan",
    67          "Myanmar",
    68          "Georgian",
    69          "HangulJamo",
    70          "Ethiopic",
    71          "Cherokee",
    72          "UnifiedCanadianAboriginalSyllabics",
    73          "Ogham",
    74          "Runic",
    75          "Khmer",
    76          "Mongolian",
    77          "LatinExtendedAdditional",
    78          "GreekExtended",
    79          "GeneralPunctuation",
    80          "SuperscriptsandSubscripts",
    81          "CurrencySymbols",
    82          "CombiningMarksforSymbols",
    83          "LetterlikeSymbols",
    84          "NumberForms",
    85          "Arrows",
    86          "MathematicalOperators",
    87          "MiscellaneousTechnical",
    88          "ControlPictures",
    89          "OpticalCharacterRecognition",
    90          "EnclosedAlphanumerics",
    91          "BoxDrawing",
    92          "BlockElements",
    93          "GeometricShapes",
    94          "MiscellaneousSymbols",
    95          "Dingbats",
    96          "BraillePatterns",
    97          "CJKRadicalsSupplement",
    98          "KangxiRadicals",
    99          "IdeographicDescriptionCharacters",
   100          "CJKSymbolsandPunctuation",
   101          "Hiragana",
   102          "Katakana",
   103          "Bopomofo",
   104          "HangulCompatibilityJamo",
   105          "Kanbun",
   106          "BopomofoExtended",
   107          "EnclosedCJKLettersandMonths",
   108          "CJKCompatibility",
   109          "CJKUnifiedIdeographsExtensionA",
   110          "CJKUnifiedIdeographs",
   111          "YiSyllables",
   112          "YiRadicals",
   113          "HangulSyllables",
   114          "HighSurrogates",
   115          "HighPrivateUseSurrogates",
   116          "LowSurrogates",
   117          "PrivateUse",
   118          "CJKCompatibilityIdeographs",
   119          "AlphabeticPresentationForms",
   120          "ArabicPresentationForms-A",
   121          "CombiningHalfMarks",
   122          "CJKCompatibilityForms",
   123          "SmallFormVariants",
   124          "ArabicPresentationForms-Bound",
   125          "Specials",
   126          "HalfwidthandFullwidthForms"
   127      ];
   128  
   129      private static var categoryNames:Array = [ 
   130  		"Cn",                   // UNASSIGNED		    	= 0,
   131  		"Lu",                   // UPPERCASE_LETTER	    	= 1,
   132  		"Ll",                   // LOWERCASE_LETTER	    	= 2,
   133  		"Lt",                   // TITLECASE_LETTER	    	= 3,
   134  		"Lm",                   // MODIFIER_LETTER	    	= 4,
   135  		"Lo",                   // OTHER_LETTER		    	= 5,
   136  		"Mn",                   // NON_SPACING_MARK	    	= 6,
   137  		"Me",                   // ENCLOSING_MARK	    	= 7,
   138  		"Mc",                   // COMBINING_SPACING_MARK   = 8,
   139  		"Nd",                   // DECIMAL_DIGIT_NUMBER	    = 9,
   140  		"Nl",                   // LETTER_NUMBER	    	= 10,
   141  		"No",                   // OTHER_NUMBER		    	= 11,
   142  		"Zs",                   // SPACE_SEPARATOR	    	= 12,
   143  		"Zl",                   // LINE_SEPARATOR	    	= 13,
   144  		"Zp",                   // PARAGRAPH_SEPARATOR		= 14,
   145  		"Cc",                   // CNTRL		    		= 15,
   146  		"Cf",                   // FORMAT		    		= 16,
   147  		"Co",                   // PRIVATE_USE		    	= 18,
   148  		"Cs",                   // SURROGATE		    	= 19,
   149  		"Pd",                   // DASH_PUNCTUATION	    	= 20,
   150  		"Ps",                   // START_PUNCTUATION		= 21,
   151  		"Pe",                   // END_PUNCTUATION	    	= 22,
   152  		"Pc",                   // CONNECTOR_PUNCTUATION    = 23,
   153  		"Po",                   // OTHER_PUNCTUATION	    = 24,
   154  		"Sm",                   // MATH_SYMBOL		    	= 25,
   155  		"Sc",                   // CURRENCY_SYMBOL	    	= 26,
   156  		"Sk",                   // MODIFIER_SYMBOL	    	= 27,
   157  		"So",                   // OTHER_SYMBOL		    	= 28;
   158  
   159          "L",                    // LETTER
   160          "M",                    // MARK
   161          "N",                    // NUMBER
   162          "Z",                    // SEPARATOR
   163          "C",                    // CONTROL
   164          "P",                    // PUNCTUATION
   165          "S",                    // SYMBOL
   166  
   167          "LD",                   // LETTER_OR_DIGIT
   168          "L1",                   // Latin-1
   169  
   170          "all",                  // ALL
   171          "ASCII",                // ASCII
   172  
   173          "Alnum",                // Alphanumeric characters.
   174          "Alpha",                // Alphabetic characters.
   175          "Blank",                // Space and tab characters.
   176          "Cntrl",                // Control characters.
   177          "Digit",                // Numeric characters.
   178          "Graph",                // Characters that are printable and are also visible.
   179                                  // (A space is printable, but "not visible, while an `a' is both.)
   180          "Lower",                // Lower-case alphabetic characters.
   181          "Print",                // Printable characters (characters that are not control characters.)
   182          "Punct",                // Punctuation characters (characters that are not letter,
   183                                  // digits, control charact ers, or space characters).
   184          "Space",                // Space characters (such as space, tab, and formfeed, to name a few).
   185          "Upper",                // Upper-case alphabetic characters.
   186          "XDigit"                // Characters that are hexadecimal digits.
   187      ];
   188  
   189      private static var familyNodes:Array = [ 
   190          new Range(0x0000007F),      // Basic Latin
   191          new Range(0x008000FF),      // Latin-1 Supplement
   192          new Range(0x0100017F),      // Latin Extended-A
   193          new Range(0x0180024F),      // Latin Extended-Bound
   194          new Range(0x025002AF),      // IPA Extensions
   195          new Range(0x02B002FF),      // Spacing Modifier Letters
   196          new Range(0x0300036F),      // Combining Diacritical Marks
   197          new Range(0x037003FF),      // Greek
   198          new Range(0x040004FF),      // Cyrillic
   199          new Range(0x0530058F),      // Armenian
   200          new Range(0x059005FF),      // Hebrew
   201          new Range(0x060006FF),      // Arabic
   202          new Range(0x0700074F),      // Syriac
   203          new Range(0x078007BF),      // Thaana
   204          new Range(0x0900097F),      // Devanagari
   205          new Range(0x098009FF),      // Bengali
   206          new Range(0x0A000A7F),      // Gurmukhi
   207          new Range(0x0A800AFF),      // Gujarati
   208          new Range(0x0B000B7F),      // Oriya
   209          new Range(0x0B800BFF),      // Tamil
   210          new Range(0x0C000C7F),      // Telugu
   211          new Range(0x0C800CFF),      // Kannada
   212          new Range(0x0D000D7F),      // Malayalam
   213          new Range(0x0D800DFF),      // Sinhala
   214          new Range(0x0E000E7F),      // Thai
   215          new Range(0x0E800EFF),      // Lao
   216          new Range(0x0F000FFF),      // Tibetan
   217          new Range(0x1000109F),      // Myanmar
   218          new Range(0x10A010FF),      // Georgian
   219          new Range(0x110011FF),      // Hangul Jamo
   220          new Range(0x1200137F),      // Ethiopic
   221          new Range(0x13A013FF),      // Cherokee
   222          new Range(0x1400167F),      // Unified Canadian Aboriginal Syllabics
   223          new Range(0x1680169F),      // Ogham
   224          new Range(0x16A016FF),      // Runic
   225          new Range(0x178017FF),      // Khmer
   226          new Range(0x180018AF),      // Mongolian
   227          new Range(0x1E001EFF),      // Latin Extended Additional
   228          new Range(0x1F001FFF),      // Greek Extended
   229          new Range(0x2000206F),      // General Punctuation
   230          new Range(0x2070209F),      // Superscripts and Subscripts
   231          new Range(0x20A020CF),      // Currency Symbols
   232          new Range(0x20D020FF),      // Combining Marks for Symbols
   233          new Range(0x2100214F),      // Letterlike Symbols
   234          new Range(0x2150218F),      // Number Forms
   235          new Range(0x219021FF),      // Arrows
   236          new Range(0x220022FF),      // Mathematical Operators
   237          new Range(0x230023FF),      // Miscellaneous Technical
   238          new Range(0x2400243F),      // Control Pictures
   239          new Range(0x2440245F),      // Optical Character Recognition
   240          new Range(0x246024FF),      // Enclosed Alphanumerics
   241          new Range(0x2500257F),      // Box Drawing
   242          new Range(0x2580259F),      // Block Elements
   243          new Range(0x25A025FF),      // Geometric Shapes
   244          new Range(0x260026FF),      // Miscellaneous Symbols
   245          new Range(0x270027BF),      // Dingbats
   246          new Range(0x280028FF),      // Braille Patterns
   247          new Range(0x2E802EFF),      // CJK Radicals Supplement
   248          new Range(0x2F002FDF),      // Kangxi Radicals
   249          new Range(0x2FF02FFF),      // Ideographic Description Characters
   250          new Range(0x3000303F),      // CJK Symbols and Punctuation
   251          new Range(0x3040309F),      // Hiragana
   252          new Range(0x30A030FF),      // Katakana
   253          new Range(0x3100312F),      // Bopomofo
   254          new Range(0x3130318F),      // Hangul Compatibility Jamo
   255          new Range(0x3190319F),      // Kanbun
   256          new Range(0x31A031BF),      // Bopomofo Extended
   257          new Range(0x320032FF),      // Enclosed CJK Letters and Months
   258          new Range(0x330033FF),      // CJK Compatibility
   259          new Range(0x34004DB5),      // CJK Unified Ideographs Extension A
   260          new Range(0x4E009FFF),      // CJK Unified Ideographs
   261          new Range(0xA000A48F),      // Yi Syllables
   262          new Range(0xA490A4CF),      // Yi Radicals
   263          new Range(0xAC00D7A3),      // Hangul Syllables
   264          new Range(0xD800DB7F),      // High Surrogates
   265          new Range(0xDB80DBFF),      // High Private Use Surrogates
   266          new Range(0xDC00DFFF),      // Low Surrogates
   267          new Range(0xE000F8FF),      // Private Use
   268          new Range(0xF900FAFF),      // CJK Compatibility Ideographs
   269          new Range(0xFB00FB4F),      // Alphabetic Presentation Forms
   270          new Range(0xFB50FDFF),      // Arabic Presentation Forms-A
   271          new Range(0xFE20FE2F),      // Combining Half Marks
   272          new Range(0xFE30FE4F),      // CJK Compatibility Forms
   273          new Range(0xFE50FE6F),      // Small Form Variants
   274          new Range(0xFE70FEFE),      // Arabic Presentation Forms-Bound
   275          new Specials(),             // Specials
   276          new Range(0xFF00FFEF)       // Halfwidth and Fullwidth Forms
   277      ];
   278  
   279      private static var categoryNodes:Array = [ 
   280  		new Category(1<<0),         // UNASSIGNED           	= 0,
   281  		new Category(1<<1),         // UPPERCASE_LETTER	    	= 1,
   282  		new Category(1<<2),         // LOWERCASE_LETTER	    	= 2,
   283  		new Category(1<<3),         // TITLECASE_LETTER	    	= 3,
   284  		new Category(1<<4),         // MODIFIER_LETTER      	= 4,
   285  		new Category(1<<5),         // OTHER_LETTER         	= 5,
   286  		new Category(1<<6),         // NON_SPACING_MARK	    	= 6,
   287  		new Category(1<<7),         // ENCLOSING_MARK	    	= 7,
   288  		new Category(1<<8),         // COMBINING_SPACING_MARK	= 8,
   289  		new Category(1<<9),         // DECIMAL_DIGIT_NUMBER 	= 9,
   290  		new Category(1<<10),        // LETTER_NUMBER	    	= 10,
   291  		new Category(1<<11),        // OTHER_NUMBER         	= 11,
   292  		new Category(1<<12),        // SPACE_SEPARATOR	    	= 12,
   293  		new Category(1<<13),        // LINE_SEPARATOR	    	= 13,
   294  		new Category(1<<14),        // PARAGRAPH_SEPARATOR  	= 14,
   295  		new Category(1<<15),        // CNTRL		    		= 15,
   296  		new Category(1<<16),        // FORMAT		    		= 16,
   297  		new Category(1<<18),        // PRIVATE_USE          	= 18,
   298  		new Category(1<<19),        // SURROGATE            	= 19,
   299  		new Category(1<<20),        // DASH_PUNCTUATION	    	= 20,
   300  		new Category(1<<21),        // START_PUNCTUATION    	= 21,
   301  		new Category(1<<22),        // END_PUNCTUATION	    	= 22,
   302  		new Category(1<<23),        // CONNECTOR_PUNCTUATION	= 23,
   303  		new Category(1<<24),        // OTHER_PUNCTUATION    	= 24,
   304  		new Category(1<<25),        // MATH_SYMBOL          	= 25,
   305  		new Category(1<<26),        // CURRENCY_SYMBOL	    	= 26,
   306  		new Category(1<<27),        // MODIFIER_SYMBOL	    	= 27,
   307  		new Category(1<<28),        // OTHER_SYMBOL         	= 28;
   308  
   309          new Category(0x0000003E),   // LETTER
   310          new Category(0x000001C0),   // MARK
   311          new Category(0x00000E00),   // NUMBER
   312          new Category(0x00007000),   // SEPARATOR
   313          new Category(0x000D8000),   // CONTROL
   314          new Category(0x01F00000),   // PUNCTUATION
   315          new Category(0x1E000000),   // SYMBOL
   316  
   317          new Category(0x0000023E),   // LETTER_OR_DIGIT
   318          new Range(0x000000FF),      // Latin-1
   319  
   320          new All(),                  // ALL
   321          new Range(0x0000007F),      // ASCII
   322  
   323          new Posix(AsciiUtil.ALNUM),     	// Alphanumeric characters.
   324          new Posix(AsciiUtil.ALPHA),     	// Alphabetic characters.
   325          new Posix(AsciiUtil.BLANK),     	// Space and tab characters.
   326          new Posix(AsciiUtil.CNTRL),     	// Control characters.
   327          new Range((0x30<<16)|0x39),			// Numeric characters.
   328          new Posix(AsciiUtil.GRAPH),     	// Characters that are printable and are also visible.
   329                                      		// (A space is printable, but "not visible, while an `a' is both.)
   330          new Range((0x61<<16)|0x7A), 		// Lower-case alphabetic characters.
   331          new Range(0x0020007E),      		// Printable characters (characters that are not control characters.)
   332          new Posix(AsciiUtil.PUNCT),     	// Punctuation characters (characters that are not letter,
   333                                      		// digits, control charact ers, or space characters).
   334          new Posix(AsciiUtil.SPACE),     	// Space characters (such as space, tab, and formfeed, to name a few).
   335          new Range((0x41<<16)|0x5A),			// Upper-case alphabetic characters.
   336          new Posix(AsciiUtil.XDIGIT)     	// Characters that are hexadecimal digits.
   337      ];
   338  	
   339  	
   340      private function parseFamily(flag:Boolean, singleLetter:Boolean):Node {
   341          nextChar();
   342          var name: String;
   343  
   344          if (singleLetter) {
   345              name = chr(temp[cursor]);
   346              readChar();
   347          } else {
   348              var i:Number = cursor;
   349              markChar(0x7D);
   350              while(readChar() != 0x7D) {
   351              	// stuff
   352              }
   353              markChar(0);
   354              var j:Number = cursor;
   355              if (j > patternLength) {
   356                  throwError("Unclosed character family", arguments);
   357              }
   358              if (i + 1 >= j) {
   359                  throwError("Empty character family", arguments);
   360              }
   361              name = fromCharCodeArray(temp.slice(i, j-1));
   362          }
   363  
   364          if (StringUtil.startsWith(name, "In")) {
   365              name = name.substring(2, name.length);
   366              return getFamilyNode(name).dup(flag);
   367          }
   368          if (StringUtil.startsWith(name, "Is")) {
   369              name = name.substring(2, name.length);
   370          }
   371          return getCategoryNode(name).dup(flag);
   372      }
   373  	
   374      private function getFamilyNode(name:String):Node {
   375          if (families == null) {
   376              var fns:Number = familyNodes.length;
   377              families = new HashMap();
   378              for (var x=0; x<fns; x++) {
   379                  families.put(familyNames[x], familyNodes[x]);
   380              }
   381          }
   382          var n:Node = Node(families.get(name));
   383          if (n != null) return n;
   384  
   385          throwFamilyError(name, "Unknown character family", arguments);
   386      }	
   387  	
   388      private function getCategoryNode(name:String):Node {
   389          if (categories == null) {
   390              var cns:Number = categoryNodes.length;
   391              categories = new HashMap();
   392              for (var x=0; x<cns; x++) {
   393                  categories.put(categoryNames[x], categoryNodes[x]);
   394              }
   395          }
   396          var n:Node = Node(categories.get(name));
   397          if (n != null) return n;
   398  
   399          throwFamilyError(name, "Unknown character category", arguments);
   400      }
   401  
   402      private function throwFamilyError(name:String, type:String, args:FunctionArguments):Void {
   403          throwError(type + " " + chr(0x7B) + name + chr(0x7D), args);
   404      }
   405  	
   406  	public function PosixPattern(newPattern:String, newFlags:Number) {
   407  		super(newPattern, newFlags);
   408  	}
   409  	
   410  }