GCC Code Coverage Report | |||||||||||||||||||||
|
|||||||||||||||||||||
Line | Branch | Exec | Source |
1 |
// Copyright 2011 the V8 project authors. All rights reserved. |
||
2 |
// Use of this source code is governed by a BSD-style license that can be |
||
3 |
// found in the LICENSE file. |
||
4 |
|||
5 |
#ifndef SRC_STRING_SEARCH_H_ |
||
6 |
#define SRC_STRING_SEARCH_H_ |
||
7 |
|||
8 |
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS |
||
9 |
|||
10 |
#include "util.h" |
||
11 |
|||
12 |
#include <cstring> |
||
13 |
#include <algorithm> |
||
14 |
|||
15 |
namespace node { |
||
16 |
namespace stringsearch { |
||
17 |
|||
18 |
template <typename T> |
||
19 |
class Vector { |
||
20 |
public: |
||
21 |
2402 |
Vector(T* data, size_t length, bool isForward) |
|
22 |
2402 |
: start_(data), length_(length), is_forward_(isForward) { |
|
23 |
✓✗✗✓ ✓✗✗✓ |
2402 |
CHECK(length > 0 && data != nullptr); |
24 |
2402 |
} |
|
25 |
|||
26 |
// Returns the start of the memory range. |
||
27 |
// For vector v this is NOT necessarily &v[0], see forward(). |
||
28 |
929392 |
const T* start() const { return start_; } |
|
29 |
|||
30 |
// Returns the length of the vector, in characters. |
||
31 |
2312472 |
size_t length() const { return length_; } |
|
32 |
|||
33 |
// Returns true if the Vector is front-to-back, false if back-to-front. |
||
34 |
// In the latter case, v[0] corresponds to the *end* of the memory range. |
||
35 |
929392 |
bool forward() const { return is_forward_; } |
|
36 |
|||
37 |
// Access individual vector elements - checks bounds in debug mode. |
||
38 |
42364436 |
T& operator[](size_t index) const { |
|
39 |
DCHECK_LT(index, length_); |
||
40 |
✓✓✓✓ |
42364436 |
return start_[is_forward_ ? index : (length_ - index - 1)]; |
41 |
} |
||
42 |
|||
43 |
private: |
||
44 |
T* start_; |
||
45 |
size_t length_; |
||
46 |
bool is_forward_; |
||
47 |
}; |
||
48 |
|||
49 |
|||
50 |
//--------------------------------------------------------------------- |
||
51 |
// String Search object. |
||
52 |
//--------------------------------------------------------------------- |
||
53 |
|||
54 |
// Class holding constants and methods that apply to all string search variants, |
||
55 |
// independently of subject and pattern char size. |
||
56 |
class StringSearchBase { |
||
57 |
protected: |
||
58 |
// Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
||
59 |
// limit, we can fix the size of tables. For a needle longer than this limit, |
||
60 |
// search will not be optimal, since we only build tables for a suffix |
||
61 |
// of the string, but it is a safe approximation. |
||
62 |
static const int kBMMaxShift = 250; |
||
63 |
|||
64 |
// Reduce alphabet to this size. |
||
65 |
// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
||
66 |
// proportional to the input alphabet. We reduce the alphabet size by |
||
67 |
// equating input characters modulo a smaller alphabet size. This gives |
||
68 |
// a potentially less efficient searching, but is a safe approximation. |
||
69 |
// For needles using only characters in the same Unicode 256-code point page, |
||
70 |
// there is no search speed degradation. |
||
71 |
static const int kLatin1AlphabetSize = 256; |
||
72 |
static const int kUC16AlphabetSize = 256; |
||
73 |
|||
74 |
// Bad-char shift table stored in the state. It's length is the alphabet size. |
||
75 |
// For patterns below this length, the skip length of Boyer-Moore is too short |
||
76 |
// to compensate for the algorithmic overhead compared to simple brute force. |
||
77 |
static const int kBMMinPatternLength = 8; |
||
78 |
|||
79 |
// Store for the BoyerMoore(Horspool) bad char shift table. |
||
80 |
int bad_char_shift_table_[kUC16AlphabetSize]; |
||
81 |
// Store for the BoyerMoore good suffix shift table. |
||
82 |
int good_suffix_shift_table_[kBMMaxShift + 1]; |
||
83 |
// Table used temporarily while building the BoyerMoore good suffix |
||
84 |
// shift table. |
||
85 |
int suffix_table_[kBMMaxShift + 1]; |
||
86 |
}; |
||
87 |
|||
88 |
template <typename Char> |
||
89 |
class StringSearch : private StringSearchBase { |
||
90 |
public: |
||
91 |
typedef stringsearch::Vector<const Char> Vector; |
||
92 |
|||
93 |
1201 |
explicit StringSearch(Vector pattern) |
|
94 |
1201 |
: pattern_(pattern), start_(0) { |
|
95 |
✓✓✗✓ |
1201 |
if (pattern.length() >= kBMMaxShift) { |
96 |
11 |
start_ = pattern.length() - kBMMaxShift; |
|
97 |
} |
||
98 |
|||
99 |
1201 |
size_t pattern_length = pattern_.length(); |
|
100 |
✗✓✗✓ |
1201 |
CHECK_GT(pattern_length, 0); |
101 |
✓✓✓✓ |
1201 |
if (pattern_length < kBMMinPatternLength) { |
102 |
✓✓✓✓ |
500 |
if (pattern_length == 1) { |
103 |
197 |
strategy_ = SearchStrategy::kSingleChar; |
|
104 |
697 |
return; |
|
105 |
} |
||
106 |
303 |
strategy_ = SearchStrategy::kLinear; |
|
107 |
303 |
return; |
|
108 |
} |
||
109 |
701 |
strategy_ = SearchStrategy::kInitial; |
|
110 |
} |
||
111 |
|||
112 |
1201 |
size_t Search(Vector subject, size_t index) { |
|
113 |
✗✗✓✓ ✓✗✗✗ ✓✓✓✗ |
1201 |
switch (strategy_) { |
114 |
case kBoyerMooreHorspool: |
||
115 |
return BoyerMooreHorspoolSearch(subject, index); |
||
116 |
case kBoyerMoore: |
||
117 |
return BoyerMooreSearch(subject, index); |
||
118 |
701 |
case kInitial: |
|
119 |
701 |
return InitialSearch(subject, index); |
|
120 |
303 |
case kLinear: |
|
121 |
303 |
return LinearSearch(subject, index); |
|
122 |
197 |
case kSingleChar: |
|
123 |
197 |
return SingleCharSearch(subject, index); |
|
124 |
} |
||
125 |
UNREACHABLE(); |
||
126 |
} |
||
127 |
|||
128 |
27 |
static inline int AlphabetSize() { |
|
129 |
if (sizeof(Char) == 1) { |
||
130 |
// Latin1 needle. |
||
131 |
27 |
return kLatin1AlphabetSize; |
|
132 |
} else { |
||
133 |
// UC16 needle. |
||
134 |
return kUC16AlphabetSize; |
||
135 |
} |
||
136 |
|||
137 |
static_assert(sizeof(Char) == sizeof(uint8_t) || |
||
138 |
sizeof(Char) == sizeof(uint16_t), |
||
139 |
"sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); |
||
140 |
} |
||
141 |
|||
142 |
private: |
||
143 |
typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); |
||
144 |
size_t SingleCharSearch(Vector subject, size_t start_index); |
||
145 |
size_t LinearSearch(Vector subject, size_t start_index); |
||
146 |
size_t InitialSearch(Vector subject, size_t start_index); |
||
147 |
size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); |
||
148 |
size_t BoyerMooreSearch(Vector subject, size_t start_index); |
||
149 |
|||
150 |
void PopulateBoyerMooreHorspoolTable(); |
||
151 |
|||
152 |
void PopulateBoyerMooreTable(); |
||
153 |
|||
154 |
63983 |
static inline int CharOccurrence(int* bad_char_occurrence, |
|
155 |
Char char_code) { |
||
156 |
if (sizeof(Char) == 1) { |
||
157 |
63983 |
return bad_char_occurrence[static_cast<int>(char_code)]; |
|
158 |
} |
||
159 |
// Both pattern and subject are UC16. Reduce character to equivalence class. |
||
160 |
int equiv_class = char_code % kUC16AlphabetSize; |
||
161 |
return bad_char_occurrence[equiv_class]; |
||
162 |
} |
||
163 |
|||
164 |
enum SearchStrategy { |
||
165 |
kBoyerMooreHorspool, |
||
166 |
kBoyerMoore, |
||
167 |
kInitial, |
||
168 |
kLinear, |
||
169 |
kSingleChar, |
||
170 |
}; |
||
171 |
|||
172 |
// The pattern to search for. |
||
173 |
Vector pattern_; |
||
174 |
SearchStrategy strategy_; |
||
175 |
// Cache value of Max(0, pattern_length() - kBMMaxShift) |
||
176 |
size_t start_; |
||
177 |
}; |
||
178 |
|||
179 |
|||
180 |
template <typename T, typename U> |
||
181 |
6964 |
inline T AlignDown(T value, U alignment) { |
|
182 |
return reinterpret_cast<T>( |
||
183 |
6964 |
(reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); |
|
184 |
} |
||
185 |
|||
186 |
|||
187 |
148 |
inline uint8_t GetHighestValueByte(uint16_t character) { |
|
188 |
296 |
return std::max(static_cast<uint8_t>(character & 0xFF), |
|
189 |
444 |
static_cast<uint8_t>(character >> 8)); |
|
190 |
} |
||
191 |
|||
192 |
|||
193 |
inline uint8_t GetHighestValueByte(uint8_t character) { return character; } |
||
194 |
|||
195 |
|||
196 |
// Searches for a byte value in a memory buffer, back to front. |
||
197 |
// Uses memrchr(3) on systems which support it, for speed. |
||
198 |
// Falls back to a vanilla for loop on non-GNU systems such as Windows. |
||
199 |
366239 |
inline const void* MemrchrFill(const void* haystack, uint8_t needle, |
|
200 |
size_t haystack_len) { |
||
201 |
#ifdef _GNU_SOURCE |
||
202 |
366239 |
return memrchr(haystack, needle, haystack_len); |
|
203 |
#else |
||
204 |
const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack); |
||
205 |
for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) { |
||
206 |
if (haystack8[i] == needle) { |
||
207 |
return haystack8 + i; |
||
208 |
} |
||
209 |
} |
||
210 |
return nullptr; |
||
211 |
#endif |
||
212 |
} |
||
213 |
|||
214 |
|||
215 |
// Finds the first occurrence of *two-byte* character pattern[0] in the string |
||
216 |
// `subject`. Does not check that the whole pattern matches. |
||
217 |
template <typename Char> |
||
218 |
148 |
inline size_t FindFirstCharacter(Vector<const Char> pattern, |
|
219 |
Vector<const Char> subject, size_t index) { |
||
220 |
148 |
const Char pattern_first_char = pattern[0]; |
|
221 |
148 |
const size_t max_n = (subject.length() - pattern.length() + 1); |
|
222 |
|||
223 |
// For speed, search for the more `rare` of the two bytes in pattern[0] |
||
224 |
// using memchr / memrchr (which are much faster than a simple for loop). |
||
225 |
148 |
const uint8_t search_byte = GetHighestValueByte(pattern_first_char); |
|
226 |
148 |
size_t pos = index; |
|
227 |
6828 |
do { |
|
228 |
6976 |
const size_t bytes_to_search = (max_n - pos) * sizeof(Char); |
|
229 |
const void* void_pos; |
||
230 |
✓✓ | 6976 |
if (subject.forward()) { |
231 |
// Assert that bytes_to_search won't overflow |
||
232 |
✗✓ | 6975 |
CHECK_LE(pos, max_n); |
233 |
✗✓ | 6975 |
CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); |
234 |
6975 |
void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); |
|
235 |
} else { |
||
236 |
✗✓ | 1 |
CHECK_LE(pos, subject.length()); |
237 |
✗✓ | 1 |
CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); |
238 |
1 |
void_pos = MemrchrFill(subject.start() + pattern.length() - 1, |
|
239 |
search_byte, |
||
240 |
bytes_to_search); |
||
241 |
} |
||
242 |
6976 |
const Char* char_pos = static_cast<const Char*>(void_pos); |
|
243 |
✓✓ | 6976 |
if (char_pos == nullptr) |
244 |
12 |
return subject.length(); |
|
245 |
|||
246 |
// Then, for each match, verify that the full two bytes match pattern[0]. |
||
247 |
6964 |
char_pos = AlignDown(char_pos, sizeof(Char)); |
|
248 |
6964 |
size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
|
249 |
✓✗ | 6964 |
pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); |
250 |
✓✓ | 6964 |
if (subject[pos] == pattern_first_char) { |
251 |
// Match found, hooray. |
||
252 |
136 |
return pos; |
|
253 |
} |
||
254 |
// Search byte matched, but the other byte of pattern[0] didn't. Keep going. |
||
255 |
✓✗ | 6828 |
} while (++pos < max_n); |
256 |
|||
257 |
return subject.length(); |
||
258 |
} |
||
259 |
|||
260 |
|||
261 |
// Finds the first occurrence of the byte pattern[0] in string `subject`. |
||
262 |
// Does not verify that the whole pattern matches. |
||
263 |
template <> |
||
264 |
457815 |
inline size_t FindFirstCharacter(Vector<const uint8_t> pattern, |
|
265 |
Vector<const uint8_t> subject, |
||
266 |
size_t index) { |
||
267 |
457815 |
const uint8_t pattern_first_char = pattern[0]; |
|
268 |
457815 |
const size_t subj_len = subject.length(); |
|
269 |
457815 |
const size_t max_n = (subject.length() - pattern.length() + 1); |
|
270 |
|||
271 |
const void* pos; |
||
272 |
✓✓ | 457815 |
if (subject.forward()) { |
273 |
91585 |
pos = memchr(subject.start() + index, pattern_first_char, max_n - index); |
|
274 |
} else { |
||
275 |
366230 |
pos = MemrchrFill(subject.start() + pattern.length() - 1, |
|
276 |
pattern_first_char, |
||
277 |
max_n - index); |
||
278 |
} |
||
279 |
457815 |
const uint8_t* char_pos = static_cast<const uint8_t*>(pos); |
|
280 |
✓✓ | 457815 |
if (char_pos == nullptr) { |
281 |
178 |
return subj_len; |
|
282 |
} |
||
283 |
|||
284 |
457637 |
size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
|
285 |
✓✓ | 457637 |
return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); |
286 |
} |
||
287 |
|||
288 |
//--------------------------------------------------------------------- |
||
289 |
// Single Character Pattern Search Strategy |
||
290 |
//--------------------------------------------------------------------- |
||
291 |
|||
292 |
template <typename Char> |
||
293 |
197 |
size_t StringSearch<Char>::SingleCharSearch( |
|
294 |
Vector subject, |
||
295 |
size_t index) { |
||
296 |
✗✓✗✓ |
197 |
CHECK_EQ(1, pattern_.length()); |
297 |
197 |
return FindFirstCharacter(pattern_, subject, index); |
|
298 |
} |
||
299 |
|||
300 |
//--------------------------------------------------------------------- |
||
301 |
// Linear Search Strategy |
||
302 |
//--------------------------------------------------------------------- |
||
303 |
|||
304 |
// Simple linear search for short patterns. Never bails out. |
||
305 |
template <typename Char> |
||
306 |
303 |
size_t StringSearch<Char>::LinearSearch( |
|
307 |
Vector subject, |
||
308 |
size_t index) { |
||
309 |
✗✓✗✓ |
303 |
CHECK_GT(pattern_.length(), 1); |
310 |
303 |
const size_t n = subject.length() - pattern_.length(); |
|
311 |
✓✓✓✗ |
84024 |
for (size_t i = index; i <= n; i++) { |
312 |
84021 |
i = FindFirstCharacter(pattern_, subject, i); |
|
313 |
✓✓✓✓ |
84021 |
if (i == subject.length()) |
314 |
451 |
return subject.length(); |
|
315 |
✗✓✗✓ |
83870 |
CHECK_LE(i, n); |
316 |
|||
317 |
83870 |
bool matches = true; |
|
318 |
✓✓✓✓ |
109552 |
for (size_t j = 1; j < pattern_.length(); j++) { |
319 |
✓✓✓✓ |
109403 |
if (pattern_[j] != subject[i + j]) { |
320 |
83721 |
matches = false; |
|
321 |
83721 |
break; |
|
322 |
} |
||
323 |
} |
||
324 |
✓✓✓✓ |
83870 |
if (matches) { |
325 |
149 |
return i; |
|
326 |
} |
||
327 |
} |
||
328 |
3 |
return subject.length(); |
|
329 |
} |
||
330 |
|||
331 |
//--------------------------------------------------------------------- |
||
332 |
// Boyer-Moore string search |
||
333 |
//--------------------------------------------------------------------- |
||
334 |
|||
335 |
template <typename Char> |
||
336 |
3 |
size_t StringSearch<Char>::BoyerMooreSearch( |
|
337 |
Vector subject, |
||
338 |
size_t start_index) { |
||
339 |
3 |
const size_t subject_length = subject.length(); |
|
340 |
3 |
const size_t pattern_length = pattern_.length(); |
|
341 |
// Only preprocess at most kBMMaxShift last characters of pattern. |
||
342 |
3 |
size_t start = start_; |
|
343 |
|||
344 |
3 |
int* bad_char_occurrence = bad_char_shift_table_; |
|
345 |
3 |
int* good_suffix_shift = good_suffix_shift_table_ - start_; |
|
346 |
|||
347 |
3 |
Char last_char = pattern_[pattern_length - 1]; |
|
348 |
3 |
size_t index = start_index; |
|
349 |
// Continue search from i. |
||
350 |
✓✗✗✗ |
28723 |
while (index <= subject_length - pattern_length) { |
351 |
14363 |
size_t j = pattern_length - 1; |
|
352 |
int c; |
||
353 |
✓✓✗✗ |
20499 |
while (last_char != (c = subject[index + j])) { |
354 |
3068 |
int shift = j - CharOccurrence(bad_char_occurrence, c); |
|
355 |
3068 |
index += shift; |
|
356 |
✗✓✗✗ |
3068 |
if (index > subject_length - pattern_length) { |
357 |
return subject.length(); |
||
358 |
} |
||
359 |
} |
||
360 |
✓✓✗✗ |
15055847 |
while (pattern_[j] == (c = subject[index + j])) { |
361 |
✓✓✗✗ |
7520745 |
if (j == 0) { |
362 |
3 |
return index; |
|
363 |
} |
||
364 |
7520742 |
j--; |
|
365 |
} |
||
366 |
✓✓✗✗ |
14360 |
if (j < start) { |
367 |
// we have matched more than our tables allow us to be smart about. |
||
368 |
// Fall back on BMH shift. |
||
369 |
2324 |
index += pattern_length - 1 - |
|
370 |
2324 |
CharOccurrence(bad_char_occurrence, last_char); |
|
371 |
} else { |
||
372 |
12036 |
int gs_shift = good_suffix_shift[j + 1]; |
|
373 |
12036 |
int bc_occ = CharOccurrence(bad_char_occurrence, c); |
|
374 |
12036 |
int shift = j - bc_occ; |
|
375 |
✓✗✗✗ |
12036 |
if (gs_shift > shift) { |
376 |
12036 |
shift = gs_shift; |
|
377 |
} |
||
378 |
12036 |
index += shift; |
|
379 |
} |
||
380 |
} |
||
381 |
|||
382 |
return subject.length(); |
||
383 |
} |
||
384 |
|||
385 |
template <typename Char> |
||
386 |
3 |
void StringSearch<Char>::PopulateBoyerMooreTable() { |
|
387 |
3 |
const size_t pattern_length = pattern_.length(); |
|
388 |
// Only look at the last kBMMaxShift characters of pattern (from start_ |
||
389 |
// to pattern_length). |
||
390 |
3 |
const size_t start = start_; |
|
391 |
3 |
const size_t length = pattern_length - start; |
|
392 |
|||
393 |
// Biased tables so that we can use pattern indices as table indices, |
||
394 |
// even if we only cover the part of the pattern from offset start. |
||
395 |
3 |
int* shift_table = good_suffix_shift_table_ - start_; |
|
396 |
3 |
int* suffix_table = suffix_table_ - start_; |
|
397 |
|||
398 |
// Initialize table. |
||
399 |
✓✓✗✗ |
753 |
for (size_t i = start; i < pattern_length; i++) { |
400 |
750 |
shift_table[i] = length; |
|
401 |
} |
||
402 |
3 |
shift_table[pattern_length] = 1; |
|
403 |
3 |
suffix_table[pattern_length] = pattern_length + 1; |
|
404 |
|||
405 |
✗✓✗✗ |
3 |
if (pattern_length <= start) { |
406 |
return; |
||
407 |
} |
||
408 |
|||
409 |
// Find suffixes. |
||
410 |
3 |
Char last_char = pattern_[pattern_length - 1]; |
|
411 |
3 |
size_t suffix = pattern_length + 1; |
|
412 |
{ |
||
413 |
3 |
size_t i = pattern_length; |
|
414 |
✓✓✗✗ |
1413 |
while (i > start) { |
415 |
705 |
Char c = pattern_[i - 1]; |
|
416 |
✓✓✓✓ ✓✓✗✗ ✗✗✗✗ |
783 |
while (suffix <= pattern_length && c != pattern_[suffix - 1]) { |
417 |
✓✓✗✗ |
39 |
if (static_cast<size_t>(shift_table[suffix]) == length) { |
418 |
14 |
shift_table[suffix] = suffix - i; |
|
419 |
} |
||
420 |
39 |
suffix = suffix_table[suffix]; |
|
421 |
} |
||
422 |
705 |
suffix_table[--i] = --suffix; |
|
423 |
✓✓✗✗ |
705 |
if (suffix == pattern_length) { |
424 |
// No suffix to extend, so we check against last_char only. |
||
425 |
✓✗✓✓ ✓✓✗✗ ✗✗✗✗ |
87 |
while ((i > start) && (pattern_[i - 1] != last_char)) { |
426 |
✗✓✗✗ |
42 |
if (static_cast<size_t>(shift_table[pattern_length]) == length) { |
427 |
shift_table[pattern_length] = pattern_length - i; |
||
428 |
} |
||
429 |
42 |
suffix_table[--i] = pattern_length; |
|
430 |
} |
||
431 |
✓✗✗✗ |
3 |
if (i > start) { |
432 |
3 |
suffix_table[--i] = --suffix; |
|
433 |
} |
||
434 |
} |
||
435 |
} |
||
436 |
} |
||
437 |
// Build shift table using suffixes. |
||
438 |
✓✗✗✗ |
3 |
if (suffix < pattern_length) { |
439 |
✓✓✗✗ |
756 |
for (size_t i = start; i <= pattern_length; i++) { |
440 |
✓✓✗✗ |
753 |
if (static_cast<size_t>(shift_table[i]) == length) { |
441 |
736 |
shift_table[i] = suffix - start; |
|
442 |
} |
||
443 |
✓✓✗✗ |
753 |
if (i == suffix) { |
444 |
8 |
suffix = suffix_table[suffix]; |
|
445 |
} |
||
446 |
} |
||
447 |
} |
||
448 |
} |
||
449 |
|||
450 |
//--------------------------------------------------------------------- |
||
451 |
// Boyer-Moore-Horspool string search. |
||
452 |
//--------------------------------------------------------------------- |
||
453 |
|||
454 |
template <typename Char> |
||
455 |
27 |
size_t StringSearch<Char>::BoyerMooreHorspoolSearch( |
|
456 |
Vector subject, |
||
457 |
size_t start_index) { |
||
458 |
27 |
const size_t subject_length = subject.length(); |
|
459 |
27 |
const size_t pattern_length = pattern_.length(); |
|
460 |
27 |
int* char_occurrences = bad_char_shift_table_; |
|
461 |
27 |
int64_t badness = -static_cast<int64_t>(pattern_length); |
|
462 |
|||
463 |
// How bad we are doing without a good-suffix table. |
||
464 |
27 |
Char last_char = pattern_[pattern_length - 1]; |
|
465 |
27 |
int last_char_shift = |
|
466 |
54 |
pattern_length - 1 - |
|
467 |
27 |
CharOccurrence(char_occurrences, last_char); |
|
468 |
|||
469 |
// Perform search |
||
470 |
27 |
size_t index = start_index; // No matches found prior to this index. |
|
471 |
✓✗✗✗ |
62927 |
while (index <= subject_length - pattern_length) { |
472 |
31477 |
size_t j = pattern_length - 1; |
|
473 |
int subject_char; |
||
474 |
✓✓✗✗ |
124529 |
while (last_char != (subject_char = subject[index + j])) { |
475 |
46528 |
int bc_occ = CharOccurrence(char_occurrences, subject_char); |
|
476 |
46528 |
int shift = j - bc_occ; |
|
477 |
46528 |
index += shift; |
|
478 |
46528 |
badness += 1 - shift; // at most zero, so badness cannot increase. |
|
479 |
✓✓✗✗ |
46528 |
if (index > subject_length - pattern_length) { |
480 |
2 |
return subject_length; |
|
481 |
} |
||
482 |
} |
||
483 |
31475 |
j--; |
|
484 |
✓✓✗✗ |
5413803 |
while (pattern_[j] == (subject[index + j])) { |
485 |
✓✓✗✗ |
2691186 |
if (j == 0) { |
486 |
22 |
return index; |
|
487 |
} |
||
488 |
2691164 |
j--; |
|
489 |
} |
||
490 |
31453 |
index += last_char_shift; |
|
491 |
// Badness increases by the number of characters we have |
||
492 |
// checked, and decreases by the number of characters we |
||
493 |
// can skip by shifting. It's a measure of how we are doing |
||
494 |
// compared to reading each character exactly once. |
||
495 |
31453 |
badness += (pattern_length - j) - last_char_shift; |
|
496 |
✓✓✗✗ |
31453 |
if (badness > 0) { |
497 |
3 |
PopulateBoyerMooreTable(); |
|
498 |
3 |
strategy_ = SearchStrategy::kBoyerMoore; |
|
499 |
3 |
return BoyerMooreSearch(subject, index); |
|
500 |
} |
||
501 |
} |
||
502 |
return subject.length(); |
||
503 |
} |
||
504 |
|||
505 |
template <typename Char> |
||
506 |
27 |
void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() { |
|
507 |
27 |
const size_t pattern_length = pattern_.length(); |
|
508 |
|||
509 |
27 |
int* bad_char_occurrence = bad_char_shift_table_; |
|
510 |
|||
511 |
// Only preprocess at most kBMMaxShift last characters of pattern. |
||
512 |
27 |
const size_t start = start_; |
|
513 |
// Run forwards to populate bad_char_table, so that *last* instance |
||
514 |
// of character equivalence class is the one registered. |
||
515 |
// Notice: Doesn't include the last character. |
||
516 |
27 |
const size_t table_size = AlphabetSize(); |
|
517 |
✓✓✗✗ |
27 |
if (start == 0) { |
518 |
// All patterns less than kBMMaxShift in length. |
||
519 |
20 |
memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); |
|
520 |
} else { |
||
521 |
✓✓✗✗ |
1799 |
for (size_t i = 0; i < table_size; i++) { |
522 |
1792 |
bad_char_occurrence[i] = start - 1; |
|
523 |
} |
||
524 |
} |
||
525 |
✓✓✗✗ |
2614 |
for (size_t i = start; i < pattern_length - 1; i++) { |
526 |
2587 |
Char c = pattern_[i]; |
|
527 |
2587 |
int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); |
|
528 |
2587 |
bad_char_occurrence[bucket] = i; |
|
529 |
} |
||
530 |
27 |
} |
|
531 |
|||
532 |
//--------------------------------------------------------------------- |
||
533 |
// Linear string search with bailout to BMH. |
||
534 |
//--------------------------------------------------------------------- |
||
535 |
|||
536 |
// Simple linear search for short patterns, which bails out if the string |
||
537 |
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
||
538 |
template <typename Char> |
||
539 |
701 |
size_t StringSearch<Char>::InitialSearch( |
|
540 |
Vector subject, |
||
541 |
size_t index) { |
||
542 |
701 |
const size_t pattern_length = pattern_.length(); |
|
543 |
// Badness is a count of how much work we have done. When we have |
||
544 |
// done enough work we decide it's probably worth switching to a better |
||
545 |
// algorithm. |
||
546 |
701 |
int64_t badness = -10 - (pattern_length << 2); |
|
547 |
|||
548 |
// We know our pattern is at least 2 characters, we cache the first so |
||
549 |
// the common case of the first character not matching is faster. |
||
550 |
✓✗✓✗ |
373772 |
for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { |
551 |
373772 |
badness++; |
|
552 |
✓✓✓✗ |
373772 |
if (badness <= 0) { |
553 |
373745 |
i = FindFirstCharacter(pattern_, subject, i); |
|
554 |
✓✓✗✓ |
373745 |
if (i == subject.length()) |
555 |
704 |
return subject.length(); |
|
556 |
✗✓✗✓ |
373742 |
CHECK_LE(i, n); |
557 |
373742 |
size_t j = 1; |
|
558 |
10159093 |
do { |
|
559 |
✓✓✗✓ |
10532835 |
if (pattern_[j] != subject[i + j]) { |
560 |
373071 |
break; |
|
561 |
} |
||
562 |
10159764 |
j++; |
|
563 |
✓✓✓✓ |
10159764 |
} while (j < pattern_length); |
564 |
✓✓✓✗ |
373742 |
if (j == pattern_length) { |
565 |
671 |
return i; |
|
566 |
} |
||
567 |
373071 |
badness += j; |
|
568 |
} else { |
||
569 |
27 |
PopulateBoyerMooreHorspoolTable(); |
|
570 |
27 |
strategy_ = SearchStrategy::kBoyerMooreHorspool; |
|
571 |
27 |
return BoyerMooreHorspoolSearch(subject, i); |
|
572 |
} |
||
573 |
} |
||
574 |
return subject.length(); |
||
575 |
} |
||
576 |
|||
577 |
// Perform a single stand-alone search. |
||
578 |
// If searching multiple times for the same pattern, a search |
||
579 |
// object should be constructed once and the Search function then called |
||
580 |
// for each search. |
||
581 |
template <typename Char> |
||
582 |
1201 |
size_t SearchString(Vector<const Char> subject, |
|
583 |
Vector<const Char> pattern, |
||
584 |
size_t start_index) { |
||
585 |
1201 |
StringSearch<Char> search(pattern); |
|
586 |
1201 |
return search.Search(subject, start_index); |
|
587 |
} |
||
588 |
} // namespace stringsearch |
||
589 |
} // namespace node |
||
590 |
|||
591 |
namespace node { |
||
592 |
|||
593 |
template <typename Char> |
||
594 |
1201 |
size_t SearchString(const Char* haystack, |
|
595 |
size_t haystack_length, |
||
596 |
const Char* needle, |
||
597 |
size_t needle_length, |
||
598 |
size_t start_index, |
||
599 |
bool is_forward) { |
||
600 |
✗✓✗✓ |
1201 |
if (haystack_length < needle_length) return haystack_length; |
601 |
// To do a reverse search (lastIndexOf instead of indexOf) without redundant |
||
602 |
// code, create two vectors that are reversed views into the input strings. |
||
603 |
// For example, v_needle[0] would return the *last* character of the needle. |
||
604 |
// So we're searching for the first instance of rev(needle) in rev(haystack) |
||
605 |
1201 |
stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward); |
|
606 |
1201 |
stringsearch::Vector<const Char> v_haystack( |
|
607 |
haystack, haystack_length, is_forward); |
||
608 |
1201 |
size_t diff = haystack_length - needle_length; |
|
609 |
size_t relative_start_index; |
||
610 |
✓✓✓✓ |
1201 |
if (is_forward) { |
611 |
1126 |
relative_start_index = start_index; |
|
612 |
✓✓✓✗ |
75 |
} else if (diff < start_index) { |
613 |
31 |
relative_start_index = 0; |
|
614 |
} else { |
||
615 |
44 |
relative_start_index = diff - start_index; |
|
616 |
} |
||
617 |
1201 |
size_t pos = node::stringsearch::SearchString( |
|
618 |
v_haystack, v_needle, relative_start_index); |
||
619 |
✓✓✓✓ |
1201 |
if (pos == haystack_length) { |
620 |
// not found |
||
621 |
195 |
return pos; |
|
622 |
} |
||
623 |
✓✓✓✗ |
1006 |
return is_forward ? pos : (haystack_length - needle_length - pos); |
624 |
} |
||
625 |
|||
626 |
template <size_t N> |
||
627 |
size_t SearchString(const char* haystack, size_t haystack_length, |
||
628 |
const char (&needle)[N]) { |
||
629 |
return SearchString( |
||
630 |
reinterpret_cast<const uint8_t*>(haystack), haystack_length, |
||
631 |
reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true); |
||
632 |
} |
||
633 |
|||
634 |
} // namespace node |
||
635 |
|||
636 |
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS |
||
637 |
|||
638 |
#endif // SRC_STRING_SEARCH_H_ |
Generated by: GCOVR (Version 4.2) |