1 /* 2 * Copyright the original author or authors. 3 * 4 * Licensed under the MOZILLA PUBLIC LICENSE, Version 1.1 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.mozilla.org/MPL/MPL-1.1.html 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 import org.as2lib.regexp.node.Node; 18 import org.as2lib.regexp.node.Slice; 19 import org.as2lib.regexp.node.TreeInfo; 20 21 /** 22 * {@code BnM} attempts to match a slice in the input using the Boyer-Moore 23 * string matching algorithm. The algorithm is based on the idea that the 24 * pattern can be shifted farther ahead in the search text if it is 25 * matched right to left. 26 * 27 * The pattern is compared to the input one character at a time, from 28 * the rightmost character in the pattern to the left. If the characters 29 * all match the pattern has been found. If a character does not match, 30 * the pattern is shifted right a distance that is the maximum of two 31 * functions, the bad character shift and the good suffix shift. This 32 * shift moves the attempted match position through the input more 33 * quickly than a naive one postion at a time check. 34 * 35 * The bad character shift is based on the character from the text that 36 * did not match. If the character does not appear in the pattern, the 37 * pattern can be shifted completely beyond the bad character. If the 38 * character does occur in the pattern, the pattern can be shifted to 39 * line the pattern up with the next occurrence of that character. 40 * 41 * The good suffix shift is based on the idea that some subset on the right 42 * side of the pattern has matched. When a bad character is found, the 43 * pattern can be shifted right by the pattern length if the subset does 44 * not occur again in pattern, or by the amount of distance to the 45 * next occurrence of the subset in the pattern. 46 * 47 * @author Igor Sadovskiy 48 */ 49 50 class org.as2lib.regexp.node.BnM extends Node { 51 52 private var buffer:Array; 53 private var lastOcc:Array; 54 private var optoSft:Array; 55 56 public static function optimize(node:Node):Node { 57 if (!(node instanceof Slice)) return node; 58 59 var src:Array = Slice(node).getBuffer(); 60 var patternLength:Number = src.length; 61 62 // The BM algorithm requires a bit of overhead; 63 // If the pattern is short don't use it, since 64 // a shift larger than the pattern length cannot 65 // be used anyway. 66 if (patternLength < 4) return node; 67 68 var i, j:Number; 69 var lastOcc:Array = Array(128); 70 var optoSft:Array = Array(patternLength); 71 72 // Precalculate part of the bad character shift 73 // It is a table for where in the pattern each 74 // lower 7-bit value occurs 75 for (i = 0; i < patternLength; i++) { 76 lastOcc[src[i] & 0x7F] = i + 1; 77 } 78 79 // Precalculate the good suffix shift 80 // i is the shift amount being considered 81 for (i = patternLength; i > 0; i--) { 82 // j is the beginning index of suffix being considered 83 var f:Boolean = false; 84 for (j = patternLength - 1; j >= i; j--) { 85 // Testing for good suffix 86 if (src[j] == src[j-i]) { 87 // src[j..len] is a good suffix 88 optoSft[j-1] = i; 89 } else { 90 // No match. The array has already been 91 // filled up with correct values before. 92 f = true; 93 break; 94 } 95 } 96 if (f) continue; 97 // This fills up the remaining of optoSft 98 // any suffix can not have larger shift amount 99 // then its sub-suffix. Why??? 100 while (j > 0) { 101 optoSft[--j] = i; 102 } 103 } 104 // Set the guard value because of unicode compression 105 optoSft[patternLength-1] = 1; 106 return new BnM(src, lastOcc, optoSft, node.next); 107 } 108 109 public function BnM(src:Array, lastOcc:Array, optoSft:Array, next:Node) { 110 this.buffer = src; 111 this.lastOcc = lastOcc; 112 this.optoSft = optoSft; 113 this.next = next; 114 } 115 116 public function match(matcher:Object, i:Number, seq:String):Boolean { 117 var src:Array = buffer; // of char 118 var patternLength:Number = src.length; 119 var last:Number = matcher.to - patternLength; 120 121 // Loop over all possible match positions in text 122 while (i <= last) { 123 // Loop over pattern from right to left 124 var f:Boolean = false; 125 for (var j = patternLength - 1; j >= 0; j--) { 126 var ch:Number = seq.charCodeAt(i+j); 127 if (src[j] != ch) { 128 // Shift search to the right by the maximum of the 129 // bad character shift and the good suffix shift 130 i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]); 131 f = true; 132 break; 133 } 134 } 135 if (f) continue; 136 // Entire pattern matched starting at i 137 matcher.first = i; 138 var ret:Boolean = next.match(matcher, i + patternLength, seq); 139 if (ret) { 140 matcher.first = i; 141 matcher.groups[0] = matcher.first; 142 matcher.groups[1] = matcher.last; 143 return true; 144 } 145 i++; 146 } 147 return false; 148 } 149 150 public function study(info:TreeInfo):Boolean { 151 info.minLength += buffer.length; 152 info.maxValid = false; 153 return next.study(info); 154 } 155 156 } 157