1 |
|
|
// Copyright 2011 the V8 project authors. All rights reserved. |
2 |
|
|
// Use of this source code is governed by a BSD-style license that can be |
3 |
|
|
// found in the LICENSE file. |
4 |
|
|
|
5 |
|
|
#ifndef SRC_STRING_SEARCH_H_ |
6 |
|
|
#define SRC_STRING_SEARCH_H_ |
7 |
|
|
|
8 |
|
|
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS |
9 |
|
|
|
10 |
|
|
#include "util.h" |
11 |
|
|
|
12 |
|
|
#include <cstring> |
13 |
|
|
#include <algorithm> |
14 |
|
|
|
15 |
|
|
namespace node { |
16 |
|
|
namespace stringsearch { |
17 |
|
|
|
18 |
|
|
template <typename T> |
19 |
|
|
class Vector { |
20 |
|
|
public: |
21 |
|
4696 |
Vector(T* data, size_t length, bool isForward) |
22 |
|
4696 |
: start_(data), length_(length), is_forward_(isForward) { |
23 |
✓✗✗✓
|
4696 |
CHECK(length > 0 && data != nullptr); |
24 |
|
4696 |
} |
25 |
|
|
|
26 |
|
|
// Returns the start of the memory range. |
27 |
|
|
// For vector v this is NOT necessarily &v[0], see forward(). |
28 |
|
1858686 |
const T* start() const { return start_; } |
29 |
|
|
|
30 |
|
|
// Returns the length of the vector, in characters. |
31 |
|
4624492 |
size_t length() const { return length_; } |
32 |
|
|
|
33 |
|
|
// Returns true if the Vector is front-to-back, false if back-to-front. |
34 |
|
|
// In the latter case, v[0] corresponds to the *end* of the memory range. |
35 |
|
1858686 |
bool forward() const { return is_forward_; } |
36 |
|
|
|
37 |
|
|
// Access individual vector elements - checks bounds in debug mode. |
38 |
|
84728114 |
T& operator[](size_t index) const { |
39 |
|
|
DCHECK_LT(index, length_); |
40 |
✓✓ |
84728114 |
return start_[is_forward_ ? index : (length_ - index - 1)]; |
41 |
|
|
} |
42 |
|
|
|
43 |
|
|
private: |
44 |
|
|
T* start_; |
45 |
|
|
size_t length_; |
46 |
|
|
bool is_forward_; |
47 |
|
|
}; |
48 |
|
|
|
49 |
|
|
|
50 |
|
|
//--------------------------------------------------------------------- |
51 |
|
|
// String Search object. |
52 |
|
|
//--------------------------------------------------------------------- |
53 |
|
|
|
54 |
|
|
// Class holding constants and methods that apply to all string search variants, |
55 |
|
|
// independently of subject and pattern char size. |
56 |
|
|
class StringSearchBase { |
57 |
|
|
protected: |
58 |
|
|
// Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
59 |
|
|
// limit, we can fix the size of tables. For a needle longer than this limit, |
60 |
|
|
// search will not be optimal, since we only build tables for a suffix |
61 |
|
|
// of the string, but it is a safe approximation. |
62 |
|
|
static const int kBMMaxShift = 250; |
63 |
|
|
|
64 |
|
|
// Reduce alphabet to this size. |
65 |
|
|
// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
66 |
|
|
// proportional to the input alphabet. We reduce the alphabet size by |
67 |
|
|
// equating input characters modulo a smaller alphabet size. This gives |
68 |
|
|
// a potentially less efficient searching, but is a safe approximation. |
69 |
|
|
// For needles using only characters in the same Unicode 256-code point page, |
70 |
|
|
// there is no search speed degradation. |
71 |
|
|
static const int kLatin1AlphabetSize = 256; |
72 |
|
|
static const int kUC16AlphabetSize = 256; |
73 |
|
|
|
74 |
|
|
// Bad-char shift table stored in the state. It's length is the alphabet size. |
75 |
|
|
// For patterns below this length, the skip length of Boyer-Moore is too short |
76 |
|
|
// to compensate for the algorithmic overhead compared to simple brute force. |
77 |
|
|
static const int kBMMinPatternLength = 8; |
78 |
|
|
|
79 |
|
|
// Store for the BoyerMoore(Horspool) bad char shift table. |
80 |
|
|
int bad_char_shift_table_[kUC16AlphabetSize]; |
81 |
|
|
// Store for the BoyerMoore good suffix shift table. |
82 |
|
|
int good_suffix_shift_table_[kBMMaxShift + 1]; |
83 |
|
|
// Table used temporarily while building the BoyerMoore good suffix |
84 |
|
|
// shift table. |
85 |
|
|
int suffix_table_[kBMMaxShift + 1]; |
86 |
|
|
}; |
87 |
|
|
|
88 |
|
|
template <typename Char> |
89 |
|
|
class StringSearch : private StringSearchBase { |
90 |
|
|
public: |
91 |
|
|
typedef stringsearch::Vector<const Char> Vector; |
92 |
|
|
|
93 |
|
2348 |
explicit StringSearch(Vector pattern) |
94 |
|
2348 |
: pattern_(pattern), start_(0) { |
95 |
✓✓ |
2348 |
if (pattern.length() >= kBMMaxShift) { |
96 |
|
22 |
start_ = pattern.length() - kBMMaxShift; |
97 |
|
|
} |
98 |
|
|
|
99 |
|
2348 |
size_t pattern_length = pattern_.length(); |
100 |
✗✓ |
2348 |
CHECK_GT(pattern_length, 0); |
101 |
✓✓ |
2348 |
if (pattern_length < kBMMinPatternLength) { |
102 |
✓✓ |
990 |
if (pattern_length == 1) { |
103 |
|
394 |
strategy_ = SearchStrategy::kSingleChar; |
104 |
|
990 |
return; |
105 |
|
|
} |
106 |
|
596 |
strategy_ = SearchStrategy::kLinear; |
107 |
|
596 |
return; |
108 |
|
|
} |
109 |
|
1358 |
strategy_ = SearchStrategy::kInitial; |
110 |
|
|
} |
111 |
|
|
|
112 |
|
2348 |
size_t Search(Vector subject, size_t index) { |
113 |
✗✗✓✓ ✓✗ |
2348 |
switch (strategy_) { |
114 |
|
|
case kBoyerMooreHorspool: |
115 |
|
|
return BoyerMooreHorspoolSearch(subject, index); |
116 |
|
|
case kBoyerMoore: |
117 |
|
|
return BoyerMooreSearch(subject, index); |
118 |
|
1358 |
case kInitial: |
119 |
|
1358 |
return InitialSearch(subject, index); |
120 |
|
596 |
case kLinear: |
121 |
|
596 |
return LinearSearch(subject, index); |
122 |
|
394 |
case kSingleChar: |
123 |
|
394 |
return SingleCharSearch(subject, index); |
124 |
|
|
} |
125 |
|
|
UNREACHABLE(); |
126 |
|
|
} |
127 |
|
|
|
128 |
|
54 |
static inline int AlphabetSize() { |
129 |
|
|
if (sizeof(Char) == 1) { |
130 |
|
|
// Latin1 needle. |
131 |
|
54 |
return kLatin1AlphabetSize; |
132 |
|
|
} else { |
133 |
|
|
// UC16 needle. |
134 |
|
|
return kUC16AlphabetSize; |
135 |
|
|
} |
136 |
|
|
|
137 |
|
|
static_assert(sizeof(Char) == sizeof(uint8_t) || |
138 |
|
|
sizeof(Char) == sizeof(uint16_t), |
139 |
|
|
"sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); |
140 |
|
|
} |
141 |
|
|
|
142 |
|
|
private: |
143 |
|
|
typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); |
144 |
|
|
size_t SingleCharSearch(Vector subject, size_t start_index); |
145 |
|
|
size_t LinearSearch(Vector subject, size_t start_index); |
146 |
|
|
size_t InitialSearch(Vector subject, size_t start_index); |
147 |
|
|
size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); |
148 |
|
|
size_t BoyerMooreSearch(Vector subject, size_t start_index); |
149 |
|
|
|
150 |
|
|
void PopulateBoyerMooreHorspoolTable(); |
151 |
|
|
|
152 |
|
|
void PopulateBoyerMooreTable(); |
153 |
|
|
|
154 |
|
127966 |
static inline int CharOccurrence(int* bad_char_occurrence, |
155 |
|
|
Char char_code) { |
156 |
|
|
if (sizeof(Char) == 1) { |
157 |
|
127966 |
return bad_char_occurrence[static_cast<int>(char_code)]; |
158 |
|
|
} |
159 |
|
|
// Both pattern and subject are UC16. Reduce character to equivalence class. |
160 |
|
|
int equiv_class = char_code % kUC16AlphabetSize; |
161 |
|
|
return bad_char_occurrence[equiv_class]; |
162 |
|
|
} |
163 |
|
|
|
164 |
|
|
enum SearchStrategy { |
165 |
|
|
kBoyerMooreHorspool, |
166 |
|
|
kBoyerMoore, |
167 |
|
|
kInitial, |
168 |
|
|
kLinear, |
169 |
|
|
kSingleChar, |
170 |
|
|
}; |
171 |
|
|
|
172 |
|
|
// The pattern to search for. |
173 |
|
|
Vector pattern_; |
174 |
|
|
SearchStrategy strategy_; |
175 |
|
|
// Cache value of Max(0, pattern_length() - kBMMaxShift) |
176 |
|
|
size_t start_; |
177 |
|
|
}; |
178 |
|
|
|
179 |
|
|
|
180 |
|
|
template <typename T, typename U> |
181 |
|
6964 |
inline T AlignDown(T value, U alignment) { |
182 |
|
|
return reinterpret_cast<T>( |
183 |
|
6964 |
(reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); |
184 |
|
|
} |
185 |
|
|
|
186 |
|
|
|
187 |
|
148 |
inline uint8_t GetHighestValueByte(uint16_t character) { |
188 |
|
296 |
return std::max(static_cast<uint8_t>(character & 0xFF), |
189 |
|
148 |
static_cast<uint8_t>(character >> 8)); |
190 |
|
|
} |
191 |
|
|
|
192 |
|
|
|
193 |
|
|
inline uint8_t GetHighestValueByte(uint8_t character) { return character; } |
194 |
|
|
|
195 |
|
|
|
196 |
|
|
// Searches for a byte value in a memory buffer, back to front. |
197 |
|
|
// Uses memrchr(3) on systems which support it, for speed. |
198 |
|
|
// Falls back to a vanilla for loop on non-GNU systems such as Windows. |
199 |
|
366239 |
inline const void* MemrchrFill(const void* haystack, uint8_t needle, |
200 |
|
|
size_t haystack_len) { |
201 |
|
|
#ifdef _GNU_SOURCE |
202 |
|
366239 |
return memrchr(haystack, needle, haystack_len); |
203 |
|
|
#else |
204 |
|
|
const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack); |
205 |
|
|
for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) { |
206 |
|
|
if (haystack8[i] == needle) { |
207 |
|
|
return haystack8 + i; |
208 |
|
|
} |
209 |
|
|
} |
210 |
|
|
return nullptr; |
211 |
|
|
#endif |
212 |
|
|
} |
213 |
|
|
|
214 |
|
|
|
215 |
|
|
// Finds the first occurrence of *two-byte* character pattern[0] in the string |
216 |
|
|
// `subject`. Does not check that the whole pattern matches. |
217 |
|
|
template <typename Char> |
218 |
|
148 |
inline size_t FindFirstCharacter(Vector<const Char> pattern, |
219 |
|
|
Vector<const Char> subject, size_t index) { |
220 |
|
148 |
const Char pattern_first_char = pattern[0]; |
221 |
|
148 |
const size_t max_n = (subject.length() - pattern.length() + 1); |
222 |
|
|
|
223 |
|
|
// For speed, search for the more `rare` of the two bytes in pattern[0] |
224 |
|
|
// using memchr / memrchr (which are much faster than a simple for loop). |
225 |
|
148 |
const uint8_t search_byte = GetHighestValueByte(pattern_first_char); |
226 |
|
148 |
size_t pos = index; |
227 |
|
6828 |
do { |
228 |
|
6976 |
const size_t bytes_to_search = (max_n - pos) * sizeof(Char); |
229 |
|
|
const void* void_pos; |
230 |
✓✓ |
6976 |
if (subject.forward()) { |
231 |
|
|
// Assert that bytes_to_search won't overflow |
232 |
✗✓ |
6975 |
CHECK_LE(pos, max_n); |
233 |
✗✓ |
6975 |
CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); |
234 |
|
6975 |
void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); |
235 |
|
|
} else { |
236 |
✗✓ |
1 |
CHECK_LE(pos, subject.length()); |
237 |
✗✓ |
1 |
CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); |
238 |
|
1 |
void_pos = MemrchrFill(subject.start() + pattern.length() - 1, |
239 |
|
|
search_byte, |
240 |
|
|
bytes_to_search); |
241 |
|
|
} |
242 |
|
6976 |
const Char* char_pos = static_cast<const Char*>(void_pos); |
243 |
✓✓ |
6976 |
if (char_pos == nullptr) |
244 |
|
12 |
return subject.length(); |
245 |
|
|
|
246 |
|
|
// Then, for each match, verify that the full two bytes match pattern[0]. |
247 |
|
6964 |
char_pos = AlignDown(char_pos, sizeof(Char)); |
248 |
|
6964 |
size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
249 |
✓✗ |
6964 |
pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); |
250 |
✓✓ |
6964 |
if (subject[pos] == pattern_first_char) { |
251 |
|
|
// Match found, hooray. |
252 |
|
136 |
return pos; |
253 |
|
|
} |
254 |
|
|
// Search byte matched, but the other byte of pattern[0] didn't. Keep going. |
255 |
✓✗ |
6828 |
} while (++pos < max_n); |
256 |
|
|
|
257 |
|
|
return subject.length(); |
258 |
|
|
} |
259 |
|
|
|
260 |
|
|
|
261 |
|
|
// Finds the first occurrence of the byte pattern[0] in string `subject`. |
262 |
|
|
// Does not verify that the whole pattern matches. |
263 |
|
|
template <> |
264 |
|
457788 |
inline size_t FindFirstCharacter(Vector<const uint8_t> pattern, |
265 |
|
|
Vector<const uint8_t> subject, |
266 |
|
|
size_t index) { |
267 |
|
457788 |
const uint8_t pattern_first_char = pattern[0]; |
268 |
|
457788 |
const size_t subj_len = subject.length(); |
269 |
|
457788 |
const size_t max_n = (subject.length() - pattern.length() + 1); |
270 |
|
|
|
271 |
|
|
const void* pos; |
272 |
✓✓ |
457788 |
if (subject.forward()) { |
273 |
|
91558 |
pos = memchr(subject.start() + index, pattern_first_char, max_n - index); |
274 |
|
|
} else { |
275 |
|
366230 |
pos = MemrchrFill(subject.start() + pattern.length() - 1, |
276 |
|
|
pattern_first_char, |
277 |
|
|
max_n - index); |
278 |
|
|
} |
279 |
|
457788 |
const uint8_t* char_pos = static_cast<const uint8_t*>(pos); |
280 |
✓✓ |
457788 |
if (char_pos == nullptr) { |
281 |
|
173 |
return subj_len; |
282 |
|
|
} |
283 |
|
|
|
284 |
|
457615 |
size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
285 |
✓✓ |
457615 |
return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); |
286 |
|
|
} |
287 |
|
|
|
288 |
|
|
//--------------------------------------------------------------------- |
289 |
|
|
// Single Character Pattern Search Strategy |
290 |
|
|
//--------------------------------------------------------------------- |
291 |
|
|
|
292 |
|
|
template <typename Char> |
293 |
|
394 |
size_t StringSearch<Char>::SingleCharSearch( |
294 |
|
|
Vector subject, |
295 |
|
|
size_t index) { |
296 |
✗✓ |
394 |
CHECK_EQ(1, pattern_.length()); |
297 |
|
394 |
return FindFirstCharacter(pattern_, subject, index); |
298 |
|
|
} |
299 |
|
|
|
300 |
|
|
//--------------------------------------------------------------------- |
301 |
|
|
// Linear Search Strategy |
302 |
|
|
//--------------------------------------------------------------------- |
303 |
|
|
|
304 |
|
|
// Simple linear search for short patterns. Never bails out. |
305 |
|
|
template <typename Char> |
306 |
|
596 |
size_t StringSearch<Char>::LinearSearch( |
307 |
|
|
Vector subject, |
308 |
|
|
size_t index) { |
309 |
✗✓ |
596 |
CHECK_GT(pattern_.length(), 1); |
310 |
|
596 |
const size_t n = subject.length() - pattern_.length(); |
311 |
✓✓ |
168038 |
for (size_t i = index; i <= n; i++) { |
312 |
|
168032 |
i = FindFirstCharacter(pattern_, subject, i); |
313 |
✓✓ |
168032 |
if (i == subject.length()) |
314 |
|
590 |
return subject.length(); |
315 |
✗✓ |
167740 |
CHECK_LE(i, n); |
316 |
|
|
|
317 |
|
167740 |
bool matches = true; |
318 |
✓✓ |
219104 |
for (size_t j = 1; j < pattern_.length(); j++) { |
319 |
✓✓ |
218806 |
if (pattern_[j] != subject[i + j]) { |
320 |
|
167442 |
matches = false; |
321 |
|
167442 |
break; |
322 |
|
|
} |
323 |
|
|
} |
324 |
✓✓ |
167740 |
if (matches) { |
325 |
|
298 |
return i; |
326 |
|
|
} |
327 |
|
|
} |
328 |
|
6 |
return subject.length(); |
329 |
|
|
} |
330 |
|
|
|
331 |
|
|
//--------------------------------------------------------------------- |
332 |
|
|
// Boyer-Moore string search |
333 |
|
|
//--------------------------------------------------------------------- |
334 |
|
|
|
335 |
|
|
template <typename Char> |
336 |
|
6 |
size_t StringSearch<Char>::BoyerMooreSearch( |
337 |
|
|
Vector subject, |
338 |
|
|
size_t start_index) { |
339 |
|
6 |
const size_t subject_length = subject.length(); |
340 |
|
6 |
const size_t pattern_length = pattern_.length(); |
341 |
|
|
// Only preprocess at most kBMMaxShift last characters of pattern. |
342 |
|
6 |
size_t start = start_; |
343 |
|
|
|
344 |
|
6 |
int* bad_char_occurrence = bad_char_shift_table_; |
345 |
|
6 |
int* good_suffix_shift = good_suffix_shift_table_ - start_; |
346 |
|
|
|
347 |
|
6 |
Char last_char = pattern_[pattern_length - 1]; |
348 |
|
6 |
size_t index = start_index; |
349 |
|
|
// Continue search from i. |
350 |
✓✗ |
28726 |
while (index <= subject_length - pattern_length) { |
351 |
|
28726 |
size_t j = pattern_length - 1; |
352 |
|
|
int c; |
353 |
✓✓ |
34862 |
while (last_char != (c = subject[index + j])) { |
354 |
|
6136 |
int shift = j - CharOccurrence(bad_char_occurrence, c); |
355 |
|
6136 |
index += shift; |
356 |
✗✓ |
6136 |
if (index > subject_length - pattern_length) { |
357 |
|
|
return subject.length(); |
358 |
|
|
} |
359 |
|
|
} |
360 |
✓✓ |
15070210 |
while (pattern_[j] == (c = subject[index + j])) { |
361 |
✓✓ |
15041490 |
if (j == 0) { |
362 |
|
6 |
return index; |
363 |
|
|
} |
364 |
|
15041484 |
j--; |
365 |
|
|
} |
366 |
✓✓ |
28720 |
if (j < start) { |
367 |
|
|
// we have matched more than our tables allow us to be smart about. |
368 |
|
|
// Fall back on BMH shift. |
369 |
|
4648 |
index += pattern_length - 1 - |
370 |
|
4648 |
CharOccurrence(bad_char_occurrence, last_char); |
371 |
|
|
} else { |
372 |
|
24072 |
int gs_shift = good_suffix_shift[j + 1]; |
373 |
|
24072 |
int bc_occ = CharOccurrence(bad_char_occurrence, c); |
374 |
|
24072 |
int shift = j - bc_occ; |
375 |
✓✗ |
24072 |
if (gs_shift > shift) { |
376 |
|
24072 |
shift = gs_shift; |
377 |
|
|
} |
378 |
|
24072 |
index += shift; |
379 |
|
|
} |
380 |
|
|
} |
381 |
|
|
|
382 |
|
|
return subject.length(); |
383 |
|
|
} |
384 |
|
|
|
385 |
|
|
template <typename Char> |
386 |
|
6 |
void StringSearch<Char>::PopulateBoyerMooreTable() { |
387 |
|
6 |
const size_t pattern_length = pattern_.length(); |
388 |
|
|
// Only look at the last kBMMaxShift characters of pattern (from start_ |
389 |
|
|
// to pattern_length). |
390 |
|
6 |
const size_t start = start_; |
391 |
|
6 |
const size_t length = pattern_length - start; |
392 |
|
|
|
393 |
|
|
// Biased tables so that we can use pattern indices as table indices, |
394 |
|
|
// even if we only cover the part of the pattern from offset start. |
395 |
|
6 |
int* shift_table = good_suffix_shift_table_ - start_; |
396 |
|
6 |
int* suffix_table = suffix_table_ - start_; |
397 |
|
|
|
398 |
|
|
// Initialize table. |
399 |
✓✓ |
1506 |
for (size_t i = start; i < pattern_length; i++) { |
400 |
|
1500 |
shift_table[i] = length; |
401 |
|
|
} |
402 |
|
6 |
shift_table[pattern_length] = 1; |
403 |
|
6 |
suffix_table[pattern_length] = pattern_length + 1; |
404 |
|
|
|
405 |
✗✓ |
6 |
if (pattern_length <= start) { |
406 |
|
|
return; |
407 |
|
|
} |
408 |
|
|
|
409 |
|
|
// Find suffixes. |
410 |
|
6 |
Char last_char = pattern_[pattern_length - 1]; |
411 |
|
6 |
size_t suffix = pattern_length + 1; |
412 |
|
|
{ |
413 |
|
6 |
size_t i = pattern_length; |
414 |
✓✓ |
1416 |
while (i > start) { |
415 |
|
1410 |
Char c = pattern_[i - 1]; |
416 |
✓✓✓✓ ✓✓ |
1488 |
while (suffix <= pattern_length && c != pattern_[suffix - 1]) { |
417 |
✓✓ |
78 |
if (static_cast<size_t>(shift_table[suffix]) == length) { |
418 |
|
28 |
shift_table[suffix] = suffix - i; |
419 |
|
|
} |
420 |
|
78 |
suffix = suffix_table[suffix]; |
421 |
|
|
} |
422 |
|
1410 |
suffix_table[--i] = --suffix; |
423 |
✓✓ |
1410 |
if (suffix == pattern_length) { |
424 |
|
|
// No suffix to extend, so we check against last_char only. |
425 |
✓✗✓✓ ✓✓ |
90 |
while ((i > start) && (pattern_[i - 1] != last_char)) { |
426 |
✗✓ |
84 |
if (static_cast<size_t>(shift_table[pattern_length]) == length) { |
427 |
|
|
shift_table[pattern_length] = pattern_length - i; |
428 |
|
|
} |
429 |
|
84 |
suffix_table[--i] = pattern_length; |
430 |
|
|
} |
431 |
✓✗ |
6 |
if (i > start) { |
432 |
|
6 |
suffix_table[--i] = --suffix; |
433 |
|
|
} |
434 |
|
|
} |
435 |
|
|
} |
436 |
|
|
} |
437 |
|
|
// Build shift table using suffixes. |
438 |
✓✗ |
6 |
if (suffix < pattern_length) { |
439 |
✓✓ |
1512 |
for (size_t i = start; i <= pattern_length; i++) { |
440 |
✓✓ |
1506 |
if (static_cast<size_t>(shift_table[i]) == length) { |
441 |
|
1472 |
shift_table[i] = suffix - start; |
442 |
|
|
} |
443 |
✓✓ |
1506 |
if (i == suffix) { |
444 |
|
16 |
suffix = suffix_table[suffix]; |
445 |
|
|
} |
446 |
|
|
} |
447 |
|
|
} |
448 |
|
|
} |
449 |
|
|
|
450 |
|
|
//--------------------------------------------------------------------- |
451 |
|
|
// Boyer-Moore-Horspool string search. |
452 |
|
|
//--------------------------------------------------------------------- |
453 |
|
|
|
454 |
|
|
template <typename Char> |
455 |
|
54 |
size_t StringSearch<Char>::BoyerMooreHorspoolSearch( |
456 |
|
|
Vector subject, |
457 |
|
|
size_t start_index) { |
458 |
|
54 |
const size_t subject_length = subject.length(); |
459 |
|
54 |
const size_t pattern_length = pattern_.length(); |
460 |
|
54 |
int* char_occurrences = bad_char_shift_table_; |
461 |
|
54 |
int64_t badness = -static_cast<int64_t>(pattern_length); |
462 |
|
|
|
463 |
|
|
// How bad we are doing without a good-suffix table. |
464 |
|
54 |
Char last_char = pattern_[pattern_length - 1]; |
465 |
|
54 |
int last_char_shift = |
466 |
|
108 |
pattern_length - 1 - |
467 |
|
54 |
CharOccurrence(char_occurrences, last_char); |
468 |
|
|
|
469 |
|
|
// Perform search |
470 |
|
54 |
size_t index = start_index; // No matches found prior to this index. |
471 |
✓✗ |
62954 |
while (index <= subject_length - pattern_length) { |
472 |
|
62954 |
size_t j = pattern_length - 1; |
473 |
|
|
int subject_char; |
474 |
✓✓ |
156006 |
while (last_char != (subject_char = subject[index + j])) { |
475 |
|
93056 |
int bc_occ = CharOccurrence(char_occurrences, subject_char); |
476 |
|
93056 |
int shift = j - bc_occ; |
477 |
|
93056 |
index += shift; |
478 |
|
93056 |
badness += 1 - shift; // at most zero, so badness cannot increase. |
479 |
✓✓ |
93056 |
if (index > subject_length - pattern_length) { |
480 |
|
4 |
return subject_length; |
481 |
|
|
} |
482 |
|
|
} |
483 |
|
62950 |
j--; |
484 |
✓✓ |
5445278 |
while (pattern_[j] == (subject[index + j])) { |
485 |
✓✓ |
5382372 |
if (j == 0) { |
486 |
|
44 |
return index; |
487 |
|
|
} |
488 |
|
5382328 |
j--; |
489 |
|
|
} |
490 |
|
62906 |
index += last_char_shift; |
491 |
|
|
// Badness increases by the number of characters we have |
492 |
|
|
// checked, and decreases by the number of characters we |
493 |
|
|
// can skip by shifting. It's a measure of how we are doing |
494 |
|
|
// compared to reading each character exactly once. |
495 |
|
62906 |
badness += (pattern_length - j) - last_char_shift; |
496 |
✓✓ |
62906 |
if (badness > 0) { |
497 |
|
6 |
PopulateBoyerMooreTable(); |
498 |
|
6 |
strategy_ = SearchStrategy::kBoyerMoore; |
499 |
|
6 |
return BoyerMooreSearch(subject, index); |
500 |
|
|
} |
501 |
|
|
} |
502 |
|
|
return subject.length(); |
503 |
|
|
} |
504 |
|
|
|
505 |
|
|
template <typename Char> |
506 |
|
54 |
void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() { |
507 |
|
54 |
const size_t pattern_length = pattern_.length(); |
508 |
|
|
|
509 |
|
54 |
int* bad_char_occurrence = bad_char_shift_table_; |
510 |
|
|
|
511 |
|
|
// Only preprocess at most kBMMaxShift last characters of pattern. |
512 |
|
54 |
const size_t start = start_; |
513 |
|
|
// Run forwards to populate bad_char_table, so that *last* instance |
514 |
|
|
// of character equivalence class is the one registered. |
515 |
|
|
// Notice: Doesn't include the last character. |
516 |
|
54 |
const size_t table_size = AlphabetSize(); |
517 |
✓✓ |
54 |
if (start == 0) { |
518 |
|
|
// All patterns less than kBMMaxShift in length. |
519 |
|
40 |
memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); |
520 |
|
|
} else { |
521 |
✓✓ |
3598 |
for (size_t i = 0; i < table_size; i++) { |
522 |
|
3584 |
bad_char_occurrence[i] = start - 1; |
523 |
|
|
} |
524 |
|
|
} |
525 |
✓✓ |
5228 |
for (size_t i = start; i < pattern_length - 1; i++) { |
526 |
|
5174 |
Char c = pattern_[i]; |
527 |
|
5174 |
int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); |
528 |
|
5174 |
bad_char_occurrence[bucket] = i; |
529 |
|
|
} |
530 |
|
|
} |
531 |
|
|
|
532 |
|
|
//--------------------------------------------------------------------- |
533 |
|
|
// Linear string search with bailout to BMH. |
534 |
|
|
//--------------------------------------------------------------------- |
535 |
|
|
|
536 |
|
|
// Simple linear search for short patterns, which bails out if the string |
537 |
|
|
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
538 |
|
|
template <typename Char> |
539 |
|
1358 |
size_t StringSearch<Char>::InitialSearch( |
540 |
|
|
Vector subject, |
541 |
|
|
size_t index) { |
542 |
|
1358 |
const size_t pattern_length = pattern_.length(); |
543 |
|
|
// Badness is a count of how much work we have done. When we have |
544 |
|
|
// done enough work we decide it's probably worth switching to a better |
545 |
|
|
// algorithm. |
546 |
|
1358 |
int64_t badness = -10 - (pattern_length << 2); |
547 |
|
|
|
548 |
|
|
// We know our pattern is at least 2 characters, we cache the first so |
549 |
|
|
// the common case of the first character not matching is faster. |
550 |
✓✗ |
747500 |
for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { |
551 |
|
747500 |
badness++; |
552 |
✓✓ |
747500 |
if (badness <= 0) { |
553 |
|
747446 |
i = FindFirstCharacter(pattern_, subject, i); |
554 |
✓✓ |
747446 |
if (i == subject.length()) |
555 |
|
1358 |
return subject.length(); |
556 |
✗✓ |
747440 |
CHECK_LE(i, n); |
557 |
|
747440 |
size_t j = 1; |
558 |
|
20317878 |
do { |
559 |
✓✓ |
21065318 |
if (pattern_[j] != subject[i + j]) { |
560 |
|
746142 |
break; |
561 |
|
|
} |
562 |
|
20319176 |
j++; |
563 |
✓✓ |
20319176 |
} while (j < pattern_length); |
564 |
✓✓ |
747440 |
if (j == pattern_length) { |
565 |
|
1298 |
return i; |
566 |
|
|
} |
567 |
|
746142 |
badness += j; |
568 |
|
|
} else { |
569 |
|
54 |
PopulateBoyerMooreHorspoolTable(); |
570 |
|
54 |
strategy_ = SearchStrategy::kBoyerMooreHorspool; |
571 |
|
54 |
return BoyerMooreHorspoolSearch(subject, i); |
572 |
|
|
} |
573 |
|
|
} |
574 |
|
|
return subject.length(); |
575 |
|
|
} |
576 |
|
|
|
577 |
|
|
// Perform a single stand-alone search. |
578 |
|
|
// If searching multiple times for the same pattern, a search |
579 |
|
|
// object should be constructed once and the Search function then called |
580 |
|
|
// for each search. |
581 |
|
|
template <typename Char> |
582 |
|
2348 |
size_t SearchString(Vector<const Char> subject, |
583 |
|
|
Vector<const Char> pattern, |
584 |
|
|
size_t start_index) { |
585 |
|
2348 |
StringSearch<Char> search(pattern); |
586 |
|
2348 |
return search.Search(subject, start_index); |
587 |
|
|
} |
588 |
|
|
} // namespace stringsearch |
589 |
|
|
} // namespace node |
590 |
|
|
|
591 |
|
|
namespace node { |
592 |
|
|
|
593 |
|
|
template <typename Char> |
594 |
|
1174 |
size_t SearchString(const Char* haystack, |
595 |
|
|
size_t haystack_length, |
596 |
|
|
const Char* needle, |
597 |
|
|
size_t needle_length, |
598 |
|
|
size_t start_index, |
599 |
|
|
bool is_forward) { |
600 |
|
1174 |
if (haystack_length < needle_length) return haystack_length; |
601 |
|
|
// To do a reverse search (lastIndexOf instead of indexOf) without redundant |
602 |
|
|
// code, create two vectors that are reversed views into the input strings. |
603 |
|
|
// For example, v_needle[0] would return the *last* character of the needle. |
604 |
|
|
// So we're searching for the first instance of rev(needle) in rev(haystack) |
605 |
|
1174 |
stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward); |
606 |
|
1174 |
stringsearch::Vector<const Char> v_haystack( |
607 |
|
|
haystack, haystack_length, is_forward); |
608 |
|
1174 |
size_t diff = haystack_length - needle_length; |
609 |
|
|
size_t relative_start_index; |
610 |
|
1174 |
if (is_forward) { |
611 |
|
1099 |
relative_start_index = start_index; |
612 |
|
75 |
} else if (diff < start_index) { |
613 |
|
31 |
relative_start_index = 0; |
614 |
|
|
} else { |
615 |
|
44 |
relative_start_index = diff - start_index; |
616 |
|
|
} |
617 |
|
1174 |
size_t pos = node::stringsearch::SearchString( |
618 |
|
|
v_haystack, v_needle, relative_start_index); |
619 |
|
1174 |
if (pos == haystack_length) { |
620 |
|
|
// not found |
621 |
|
190 |
return pos; |
622 |
|
|
} |
623 |
|
984 |
return is_forward ? pos : (haystack_length - needle_length - pos); |
624 |
|
|
} |
625 |
|
|
|
626 |
|
|
template <size_t N> |
627 |
|
|
size_t SearchString(const char* haystack, size_t haystack_length, |
628 |
|
|
const char (&needle)[N]) { |
629 |
|
|
return SearchString( |
630 |
|
|
reinterpret_cast<const uint8_t*>(haystack), haystack_length, |
631 |
|
|
reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true); |
632 |
|
|
} |
633 |
|
|
|
634 |
|
|
} // namespace node |
635 |
|
|
|
636 |
|
|
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS |
637 |
|
|
|
638 |
|
|
#endif // SRC_STRING_SEARCH_H_ |