as2lib 0.9

     1  /*
     2   * Copyright the original author or authors.
     3   * 
     4   * Licensed under the MOZILLA PUBLIC LICENSE, Version 1.1 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   * 
     8   *      http://www.mozilla.org/MPL/MPL-1.1.html
     9   * 
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16   
    17  import org.as2lib.regexp.node.Node;
    18  import org.as2lib.regexp.node.Slice;
    19  import org.as2lib.regexp.node.TreeInfo;
    20   
    21  /**
    22   * {@code BnM} attempts to match a slice in the input using the Boyer-Moore 
    23   * string matching algorithm. The algorithm is based on the idea that the
    24   * pattern can be shifted farther ahead in the search text if it is
    25   * matched right to left.
    26   * 
    27   * The pattern is compared to the input one character at a time, from
    28   * the rightmost character in the pattern to the left. If the characters
    29   * all match the pattern has been found. If a character does not match,
    30   * the pattern is shifted right a distance that is the maximum of two
    31   * functions, the bad character shift and the good suffix shift. This
    32   * shift moves the attempted match position through the input more
    33   * quickly than a naive one postion at a time check.
    34   * 
    35   * The bad character shift is based on the character from the text that
    36   * did not match. If the character does not appear in the pattern, the
    37   * pattern can be shifted completely beyond the bad character. If the
    38   * character does occur in the pattern, the pattern can be shifted to
    39   * line the pattern up with the next occurrence of that character.
    40   * 
    41   * The good suffix shift is based on the idea that some subset on the right
    42   * side of the pattern has matched. When a bad character is found, the
    43   * pattern can be shifted right by the pattern length if the subset does
    44   * not occur again in pattern, or by the amount of distance to the
    45   * next occurrence of the subset in the pattern.
    46   *
    47   * @author Igor Sadovskiy
    48   */
    49   
    50  class org.as2lib.regexp.node.BnM extends Node {
    51  	
    52      private var buffer:Array; 
    53      private var lastOcc:Array;
    54      private var optoSft:Array;
    55  
    56      public static function optimize(node:Node):Node {
    57          if (!(node instanceof Slice)) return node;
    58          
    59          var src:Array = Slice(node).getBuffer(); 
    60          var patternLength:Number = src.length;
    61          
    62          // The BM algorithm requires a bit of overhead;
    63          // If the pattern is short don't use it, since
    64          // a shift larger than the pattern length cannot
    65          // be used anyway.
    66          if (patternLength < 4) return node;
    67          
    68          var i, j:Number;
    69          var lastOcc:Array = Array(128); 
    70          var optoSft:Array = Array(patternLength);
    71          
    72          // Precalculate part of the bad character shift
    73          // It is a table for where in the pattern each
    74          // lower 7-bit value occurs
    75          for (i = 0; i < patternLength; i++) {
    76              lastOcc[src[i] & 0x7F] = i + 1;
    77          }
    78          
    79          // Precalculate the good suffix shift
    80          // i is the shift amount being considered
    81  		for (i = patternLength; i > 0; i--) {
    82              // j is the beginning index of suffix being considered
    83              var f:Boolean = false;
    84  	        for (j = patternLength - 1; j >= i; j--) {
    85  	            // Testing for good suffix
    86  	            if (src[j] == src[j-i]) {
    87  	                // src[j..len] is a good suffix
    88  	                optoSft[j-1] = i;
    89  	            } else {
    90  	                // No match. The array has already been
    91  	                // filled up with correct values before.
    92  	                f = true;
    93  	                break;
    94  	            }
    95  	        }
    96  	        if (f) continue;
    97  	        // This fills up the remaining of optoSft
    98  	        // any suffix can not have larger shift amount
    99  	        // then its sub-suffix. Why???
   100  	        while (j > 0) {
   101  	            optoSft[--j] = i;
   102  	        }
   103          }
   104          // Set the guard value because of unicode compression
   105          optoSft[patternLength-1] = 1;
   106          return new BnM(src, lastOcc, optoSft, node.next);
   107      }
   108      
   109      public function BnM(src:Array, lastOcc:Array, optoSft:Array, next:Node) {
   110          this.buffer = src;
   111          this.lastOcc = lastOcc;
   112          this.optoSft = optoSft;
   113          this.next = next;
   114      }
   115      
   116      public function match(matcher:Object, i:Number, seq:String):Boolean {
   117          var src:Array = buffer; // of char
   118          var patternLength:Number = src.length;
   119          var last:Number = matcher.to - patternLength;
   120  
   121          // Loop over all possible match positions in text
   122  		while (i <= last) {
   123              // Loop over pattern from right to left
   124              var f:Boolean = false;
   125              for (var j = patternLength - 1; j >= 0; j--) {
   126                  var ch:Number = seq.charCodeAt(i+j);
   127                  if (src[j] != ch) {
   128                      // Shift search to the right by the maximum of the
   129                      // bad character shift and the good suffix shift
   130                      i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
   131                      f = true;
   132                      break;
   133                  }
   134              }
   135              if (f) continue;
   136              // Entire pattern matched starting at i
   137              matcher.first = i;
   138              var ret:Boolean = next.match(matcher, i + patternLength, seq);
   139              if (ret) {
   140                  matcher.first = i;
   141                  matcher.groups[0] = matcher.first;
   142                  matcher.groups[1] = matcher.last;
   143                  return true;
   144              }
   145              i++;
   146          }
   147          return false;
   148      }
   149      
   150      public function study(info:TreeInfo):Boolean {
   151          info.minLength += buffer.length;
   152          info.maxValid = false;
   153          return next.study(info);
   154      }
   155      
   156  }
   157