GCC Code Coverage Report | |||||||||||||||||||||
|
|||||||||||||||||||||
Line | Branch | Exec | Source |
1 |
// Copyright 2011 the V8 project authors. All rights reserved. |
||
2 |
// Use of this source code is governed by a BSD-style license that can be |
||
3 |
// found in the LICENSE file. |
||
4 |
|||
5 |
#ifndef SRC_STRING_SEARCH_H_ |
||
6 |
#define SRC_STRING_SEARCH_H_ |
||
7 |
|||
8 |
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS |
||
9 |
|||
10 |
#include "util.h" |
||
11 |
|||
12 |
#include <cstring> |
||
13 |
#include <algorithm> |
||
14 |
|||
15 |
namespace node { |
||
16 |
namespace stringsearch { |
||
17 |
|||
18 |
template <typename T> |
||
19 |
class Vector { |
||
20 |
public: |
||
21 |
42416 |
Vector(T* data, size_t length, bool isForward) |
|
22 |
42416 |
: start_(data), length_(length), is_forward_(isForward) { |
|
23 |
✓✗✗✓ ✗✓✓✗ ✗✓✗✓ |
42416 |
CHECK(length > 0 && data != nullptr); |
24 |
42416 |
} |
|
25 |
|||
26 |
// Returns the start of the memory range. |
||
27 |
// For vector v this is NOT necessarily &v[0], see forward(). |
||
28 |
5102245 |
const T* start() const { return start_; } |
|
29 |
|||
30 |
// Returns the length of the vector, in characters. |
||
31 |
12843520 |
size_t length() const { return length_; } |
|
32 |
|||
33 |
// Returns true if the Vector is front-to-back, false if back-to-front. |
||
34 |
// In the latter case, v[0] corresponds to the *end* of the memory range. |
||
35 |
5102245 |
size_t forward() const { return is_forward_; } |
|
36 |
|||
37 |
// Access individual vector elements - checks bounds in debug mode. |
||
38 |
49986184 |
T& operator[](size_t index) const { |
|
39 |
DCHECK_LT(index, length_); |
||
40 |
✓✓✓✓ |
49986184 |
return start_[is_forward_ ? index : (length_ - index - 1)]; |
41 |
} |
||
42 |
|||
43 |
private: |
||
44 |
T* start_; |
||
45 |
size_t length_; |
||
46 |
bool is_forward_; |
||
47 |
}; |
||
48 |
|||
49 |
|||
50 |
//--------------------------------------------------------------------- |
||
51 |
// String Search object. |
||
52 |
//--------------------------------------------------------------------- |
||
53 |
|||
54 |
// Class holding constants and methods that apply to all string search variants, |
||
55 |
// independently of subject and pattern char size. |
||
56 |
21208 |
class StringSearchBase { |
|
57 |
protected: |
||
58 |
// Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
||
59 |
// limit, we can fix the size of tables. For a needle longer than this limit, |
||
60 |
// search will not be optimal, since we only build tables for a suffix |
||
61 |
// of the string, but it is a safe approximation. |
||
62 |
static const int kBMMaxShift = 250; |
||
63 |
|||
64 |
// Reduce alphabet to this size. |
||
65 |
// One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
||
66 |
// proportional to the input alphabet. We reduce the alphabet size by |
||
67 |
// equating input characters modulo a smaller alphabet size. This gives |
||
68 |
// a potentially less efficient searching, but is a safe approximation. |
||
69 |
// For needles using only characters in the same Unicode 256-code point page, |
||
70 |
// there is no search speed degradation. |
||
71 |
static const int kLatin1AlphabetSize = 256; |
||
72 |
static const int kUC16AlphabetSize = 256; |
||
73 |
|||
74 |
// Bad-char shift table stored in the state. It's length is the alphabet size. |
||
75 |
// For patterns below this length, the skip length of Boyer-Moore is too short |
||
76 |
// to compensate for the algorithmic overhead compared to simple brute force. |
||
77 |
static const int kBMMinPatternLength = 8; |
||
78 |
|||
79 |
// Store for the BoyerMoore(Horspool) bad char shift table. |
||
80 |
int bad_char_shift_table_[kUC16AlphabetSize]; |
||
81 |
// Store for the BoyerMoore good suffix shift table. |
||
82 |
int good_suffix_shift_table_[kBMMaxShift + 1]; |
||
83 |
// Table used temporarily while building the BoyerMoore good suffix |
||
84 |
// shift table. |
||
85 |
int suffix_table_[kBMMaxShift + 1]; |
||
86 |
}; |
||
87 |
|||
88 |
template <typename Char> |
||
89 |
class StringSearch : private StringSearchBase { |
||
90 |
public: |
||
91 |
typedef stringsearch::Vector<const Char> Vector; |
||
92 |
|||
93 |
21208 |
explicit StringSearch(Vector pattern) |
|
94 |
21208 |
: pattern_(pattern), start_(0) { |
|
95 |
✓✓✗✓ |
21208 |
if (pattern.length() >= kBMMaxShift) { |
96 |
11 |
start_ = pattern.length() - kBMMaxShift; |
|
97 |
} |
||
98 |
|||
99 |
21208 |
size_t pattern_length = pattern_.length(); |
|
100 |
✗✓✗✓ |
21208 |
CHECK_GT(pattern_length, 0); |
101 |
✓✓✓✓ |
21208 |
if (pattern_length < kBMMinPatternLength) { |
102 |
✓✓✓✓ |
16111 |
if (pattern_length == 1) { |
103 |
192 |
strategy_ = &StringSearch::SingleCharSearch; |
|
104 |
16303 |
return; |
|
105 |
} |
||
106 |
15919 |
strategy_ = &StringSearch::LinearSearch; |
|
107 |
15919 |
return; |
|
108 |
} |
||
109 |
5097 |
strategy_ = &StringSearch::InitialSearch; |
|
110 |
} |
||
111 |
|||
112 |
21208 |
size_t Search(Vector subject, size_t index) { |
|
113 |
✓✗✓✗ |
21208 |
return (this->*strategy_)(subject, index); |
114 |
} |
||
115 |
|||
116 |
4442 |
static inline int AlphabetSize() { |
|
117 |
if (sizeof(Char) == 1) { |
||
118 |
// Latin1 needle. |
||
119 |
4442 |
return kLatin1AlphabetSize; |
|
120 |
} else { |
||
121 |
// UC16 needle. |
||
122 |
return kUC16AlphabetSize; |
||
123 |
} |
||
124 |
|||
125 |
static_assert(sizeof(Char) == sizeof(uint8_t) || |
||
126 |
sizeof(Char) == sizeof(uint16_t), |
||
127 |
"sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); |
||
128 |
} |
||
129 |
|||
130 |
private: |
||
131 |
typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); |
||
132 |
size_t SingleCharSearch(Vector subject, size_t start_index); |
||
133 |
size_t LinearSearch(Vector subject, size_t start_index); |
||
134 |
size_t InitialSearch(Vector subject, size_t start_index); |
||
135 |
size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); |
||
136 |
size_t BoyerMooreSearch(Vector subject, size_t start_index); |
||
137 |
|||
138 |
void PopulateBoyerMooreHorspoolTable(); |
||
139 |
|||
140 |
void PopulateBoyerMooreTable(); |
||
141 |
|||
142 |
975015 |
static inline int CharOccurrence(int* bad_char_occurrence, |
|
143 |
Char char_code) { |
||
144 |
if (sizeof(Char) == 1) { |
||
145 |
975015 |
return bad_char_occurrence[static_cast<int>(char_code)]; |
|
146 |
} |
||
147 |
// Both pattern and subject are UC16. Reduce character to equivalence class. |
||
148 |
int equiv_class = char_code % kUC16AlphabetSize; |
||
149 |
return bad_char_occurrence[equiv_class]; |
||
150 |
} |
||
151 |
|||
152 |
// The pattern to search for. |
||
153 |
Vector pattern_; |
||
154 |
// Pointer to implementation of the search. |
||
155 |
SearchFunction strategy_; |
||
156 |
// Cache value of Max(0, pattern_length() - kBMMaxShift) |
||
157 |
size_t start_; |
||
158 |
}; |
||
159 |
|||
160 |
|||
161 |
template <typename T, typename U> |
||
162 |
6960 |
inline T AlignDown(T value, U alignment) { |
|
163 |
return reinterpret_cast<T>( |
||
164 |
6960 |
(reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); |
|
165 |
} |
||
166 |
|||
167 |
|||
168 |
143 |
inline uint8_t GetHighestValueByte(uint16_t character) { |
|
169 |
return std::max(static_cast<uint8_t>(character & 0xFF), |
||
170 |
143 |
static_cast<uint8_t>(character >> 8)); |
|
171 |
} |
||
172 |
|||
173 |
|||
174 |
inline uint8_t GetHighestValueByte(uint8_t character) { return character; } |
||
175 |
|||
176 |
|||
177 |
// Searches for a byte value in a memory buffer, back to front. |
||
178 |
// Uses memrchr(3) on systems which support it, for speed. |
||
179 |
// Falls back to a vanilla for loop on non-GNU systems such as Windows. |
||
180 |
366239 |
inline const void* MemrchrFill(const void* haystack, uint8_t needle, |
|
181 |
size_t haystack_len) { |
||
182 |
#ifdef _GNU_SOURCE |
||
183 |
366239 |
return memrchr(haystack, needle, haystack_len); |
|
184 |
#else |
||
185 |
const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack); |
||
186 |
for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) { |
||
187 |
if (haystack8[i] == needle) { |
||
188 |
return haystack8 + i; |
||
189 |
} |
||
190 |
} |
||
191 |
return nullptr; |
||
192 |
#endif |
||
193 |
} |
||
194 |
|||
195 |
|||
196 |
// Finds the first occurrence of *two-byte* character pattern[0] in the string |
||
197 |
// `subject`. Does not check that the whole pattern matches. |
||
198 |
template <typename Char> |
||
199 |
143 |
inline size_t FindFirstCharacter(Vector<const Char> pattern, |
|
200 |
Vector<const Char> subject, size_t index) { |
||
201 |
143 |
const Char pattern_first_char = pattern[0]; |
|
202 |
143 |
const size_t max_n = (subject.length() - pattern.length() + 1); |
|
203 |
|||
204 |
// For speed, search for the more `rare` of the two bytes in pattern[0] |
||
205 |
// using memchr / memrchr (which are much faster than a simple for loop). |
||
206 |
143 |
const uint8_t search_byte = GetHighestValueByte(pattern_first_char); |
|
207 |
143 |
size_t pos = index; |
|
208 |
✓✗ | 6828 |
do { |
209 |
6971 |
const size_t bytes_to_search = (max_n - pos) * sizeof(Char); |
|
210 |
const void* void_pos; |
||
211 |
✓✓ | 6971 |
if (subject.forward()) { |
212 |
// Assert that bytes_to_search won't overflow |
||
213 |
✗✓ | 6970 |
CHECK_LE(pos, max_n); |
214 |
✗✓ | 6970 |
CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); |
215 |
6970 |
void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); |
|
216 |
} else { |
||
217 |
✗✓ | 1 |
CHECK_LE(pos, subject.length()); |
218 |
✗✓ | 1 |
CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); |
219 |
1 |
void_pos = MemrchrFill(subject.start() + pattern.length() - 1, |
|
220 |
search_byte, |
||
221 |
1 |
bytes_to_search); |
|
222 |
} |
||
223 |
6971 |
const Char* char_pos = static_cast<const Char*>(void_pos); |
|
224 |
✓✓ | 6971 |
if (char_pos == nullptr) |
225 |
11 |
return subject.length(); |
|
226 |
|||
227 |
// Then, for each match, verify that the full two bytes match pattern[0]. |
||
228 |
6960 |
char_pos = AlignDown(char_pos, sizeof(Char)); |
|
229 |
6960 |
size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
|
230 |
✗✓ | 6960 |
pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); |
231 |
✓✓ | 6960 |
if (subject[pos] == pattern_first_char) { |
232 |
// Match found, hooray. |
||
233 |
132 |
return pos; |
|
234 |
} |
||
235 |
// Search byte matched, but the other byte of pattern[0] didn't. Keep going. |
||
236 |
} while (++pos < max_n); |
||
237 |
|||
238 |
return subject.length(); |
||
239 |
} |
||
240 |
|||
241 |
|||
242 |
// Finds the first occurrence of the byte pattern[0] in string `subject`. |
||
243 |
// Does not verify that the whole pattern matches. |
||
244 |
template <> |
||
245 |
2547826 |
inline size_t FindFirstCharacter(Vector<const uint8_t> pattern, |
|
246 |
Vector<const uint8_t> subject, |
||
247 |
size_t index) { |
||
248 |
2547826 |
const uint8_t pattern_first_char = pattern[0]; |
|
249 |
2547826 |
const size_t subj_len = subject.length(); |
|
250 |
2547826 |
const size_t max_n = (subject.length() - pattern.length() + 1); |
|
251 |
|||
252 |
const void* pos; |
||
253 |
✓✓ | 2547826 |
if (subject.forward()) { |
254 |
2181596 |
pos = memchr(subject.start() + index, pattern_first_char, max_n - index); |
|
255 |
} else { |
||
256 |
366230 |
pos = MemrchrFill(subject.start() + pattern.length() - 1, |
|
257 |
pattern_first_char, |
||
258 |
732460 |
max_n - index); |
|
259 |
} |
||
260 |
2547826 |
const uint8_t* char_pos = static_cast<const uint8_t*>(pos); |
|
261 |
✓✓ | 2547826 |
if (char_pos == nullptr) { |
262 |
7338 |
return subj_len; |
|
263 |
} |
||
264 |
|||
265 |
2540488 |
size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
|
266 |
✓✓ | 2540488 |
return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); |
267 |
} |
||
268 |
|||
269 |
//--------------------------------------------------------------------- |
||
270 |
// Single Character Pattern Search Strategy |
||
271 |
//--------------------------------------------------------------------- |
||
272 |
|||
273 |
template <typename Char> |
||
274 |
192 |
size_t StringSearch<Char>::SingleCharSearch( |
|
275 |
Vector subject, |
||
276 |
size_t index) { |
||
277 |
✗✓✗✓ |
192 |
CHECK_EQ(1, pattern_.length()); |
278 |
192 |
return FindFirstCharacter(pattern_, subject, index); |
|
279 |
} |
||
280 |
|||
281 |
//--------------------------------------------------------------------- |
||
282 |
// Linear Search Strategy |
||
283 |
//--------------------------------------------------------------------- |
||
284 |
|||
285 |
// Simple linear search for short patterns. Never bails out. |
||
286 |
template <typename Char> |
||
287 |
15919 |
size_t StringSearch<Char>::LinearSearch( |
|
288 |
Vector subject, |
||
289 |
size_t index) { |
||
290 |
✗✓✗✓ |
15919 |
CHECK_GT(pattern_.length(), 1); |
291 |
15919 |
const size_t n = subject.length() - pattern_.length(); |
|
292 |
✓✓✓✗ |
2072650 |
for (size_t i = index; i <= n; i++) { |
293 |
2072647 |
i = FindFirstCharacter(pattern_, subject, i); |
|
294 |
✓✓✓✓ |
2072647 |
if (i == subject.length()) |
295 |
23226 |
return subject.length(); |
|
296 |
✗✓✗✓ |
2065337 |
CHECK_LE(i, n); |
297 |
|||
298 |
2065337 |
bool matches = true; |
|
299 |
✓✓✓✓ |
2163970 |
for (size_t j = 1; j < pattern_.length(); j++) { |
300 |
✓✓✓✓ |
2155364 |
if (pattern_[j] != subject[i + j]) { |
301 |
2056731 |
matches = false; |
|
302 |
2056731 |
break; |
|
303 |
} |
||
304 |
} |
||
305 |
✓✓✓✓ |
2065337 |
if (matches) { |
306 |
8606 |
return i; |
|
307 |
} |
||
308 |
} |
||
309 |
3 |
return subject.length(); |
|
310 |
} |
||
311 |
|||
312 |
//--------------------------------------------------------------------- |
||
313 |
// Boyer-Moore string search |
||
314 |
//--------------------------------------------------------------------- |
||
315 |
|||
316 |
template <typename Char> |
||
317 |
3 |
size_t StringSearch<Char>::BoyerMooreSearch( |
|
318 |
Vector subject, |
||
319 |
size_t start_index) { |
||
320 |
3 |
const size_t subject_length = subject.length(); |
|
321 |
3 |
const size_t pattern_length = pattern_.length(); |
|
322 |
// Only preprocess at most kBMMaxShift last characters of pattern. |
||
323 |
3 |
size_t start = start_; |
|
324 |
|||
325 |
3 |
int* bad_char_occurrence = bad_char_shift_table_; |
|
326 |
3 |
int* good_suffix_shift = good_suffix_shift_table_ - start_; |
|
327 |
|||
328 |
3 |
Char last_char = pattern_[pattern_length - 1]; |
|
329 |
3 |
size_t index = start_index; |
|
330 |
// Continue search from i. |
||
331 |
✓✗✗✗ |
14366 |
while (index <= subject_length - pattern_length) { |
332 |
14363 |
size_t j = pattern_length - 1; |
|
333 |
int c; |
||
334 |
✓✓✗✗ |
31794 |
while (last_char != (c = subject[index + j])) { |
335 |
3068 |
int shift = j - CharOccurrence(bad_char_occurrence, c); |
|
336 |
3068 |
index += shift; |
|
337 |
✗✓✗✗ |
3068 |
if (index > subject_length - pattern_length) { |
338 |
return subject.length(); |
||
339 |
} |
||
340 |
} |
||
341 |
✓✓✗✗ |
7549468 |
while (pattern_[j] == (c = subject[index + j])) { |
342 |
✓✓✗✗ |
7520745 |
if (j == 0) { |
343 |
3 |
return index; |
|
344 |
} |
||
345 |
7520742 |
j--; |
|
346 |
} |
||
347 |
✓✓✗✗ |
14360 |
if (j < start) { |
348 |
// we have matched more than our tables allow us to be smart about. |
||
349 |
// Fall back on BMH shift. |
||
350 |
2324 |
index += pattern_length - 1 - |
|
351 |
2324 |
CharOccurrence(bad_char_occurrence, last_char); |
|
352 |
} else { |
||
353 |
12036 |
int gs_shift = good_suffix_shift[j + 1]; |
|
354 |
12036 |
int bc_occ = CharOccurrence(bad_char_occurrence, c); |
|
355 |
12036 |
int shift = j - bc_occ; |
|
356 |
✓✗✗✗ |
12036 |
if (gs_shift > shift) { |
357 |
12036 |
shift = gs_shift; |
|
358 |
} |
||
359 |
12036 |
index += shift; |
|
360 |
} |
||
361 |
} |
||
362 |
|||
363 |
return subject.length(); |
||
364 |
} |
||
365 |
|||
366 |
template <typename Char> |
||
367 |
3 |
void StringSearch<Char>::PopulateBoyerMooreTable() { |
|
368 |
3 |
const size_t pattern_length = pattern_.length(); |
|
369 |
// Only look at the last kBMMaxShift characters of pattern (from start_ |
||
370 |
// to pattern_length). |
||
371 |
3 |
const size_t start = start_; |
|
372 |
3 |
const size_t length = pattern_length - start; |
|
373 |
|||
374 |
// Biased tables so that we can use pattern indices as table indices, |
||
375 |
// even if we only cover the part of the pattern from offset start. |
||
376 |
3 |
int* shift_table = good_suffix_shift_table_ - start_; |
|
377 |
3 |
int* suffix_table = suffix_table_ - start_; |
|
378 |
|||
379 |
// Initialize table. |
||
380 |
✓✓✗✗ |
753 |
for (size_t i = start; i < pattern_length; i++) { |
381 |
750 |
shift_table[i] = length; |
|
382 |
} |
||
383 |
3 |
shift_table[pattern_length] = 1; |
|
384 |
3 |
suffix_table[pattern_length] = pattern_length + 1; |
|
385 |
|||
386 |
✗✓✗✗ |
3 |
if (pattern_length <= start) { |
387 |
3 |
return; |
|
388 |
} |
||
389 |
|||
390 |
// Find suffixes. |
||
391 |
3 |
Char last_char = pattern_[pattern_length - 1]; |
|
392 |
3 |
size_t suffix = pattern_length + 1; |
|
393 |
{ |
||
394 |
3 |
size_t i = pattern_length; |
|
395 |
✓✓✗✗ |
711 |
while (i > start) { |
396 |
705 |
Char c = pattern_[i - 1]; |
|
397 |
✓✓✓✓ ✓✓✗✗ ✗✗✗✗ |
1449 |
while (suffix <= pattern_length && c != pattern_[suffix - 1]) { |
398 |
✓✓✗✗ |
39 |
if (static_cast<size_t>(shift_table[suffix]) == length) { |
399 |
14 |
shift_table[suffix] = suffix - i; |
|
400 |
} |
||
401 |
39 |
suffix = suffix_table[suffix]; |
|
402 |
} |
||
403 |
705 |
suffix_table[--i] = --suffix; |
|
404 |
✓✓✗✗ |
705 |
if (suffix == pattern_length) { |
405 |
// No suffix to extend, so we check against last_char only. |
||
406 |
✓✗✓✓ ✓✓✗✗ ✗✗✗✗ |
48 |
while ((i > start) && (pattern_[i - 1] != last_char)) { |
407 |
✗✓✗✗ |
42 |
if (static_cast<size_t>(shift_table[pattern_length]) == length) { |
408 |
shift_table[pattern_length] = pattern_length - i; |
||
409 |
} |
||
410 |
42 |
suffix_table[--i] = pattern_length; |
|
411 |
} |
||
412 |
✓✗✗✗ |
3 |
if (i > start) { |
413 |
3 |
suffix_table[--i] = --suffix; |
|
414 |
} |
||
415 |
} |
||
416 |
} |
||
417 |
} |
||
418 |
// Build shift table using suffixes. |
||
419 |
✓✗✗✗ |
3 |
if (suffix < pattern_length) { |
420 |
✓✓✗✗ |
756 |
for (size_t i = start; i <= pattern_length; i++) { |
421 |
✓✓✗✗ |
753 |
if (static_cast<size_t>(shift_table[i]) == length) { |
422 |
736 |
shift_table[i] = suffix - start; |
|
423 |
} |
||
424 |
✓✓✗✗ |
753 |
if (i == suffix) { |
425 |
8 |
suffix = suffix_table[suffix]; |
|
426 |
} |
||
427 |
} |
||
428 |
} |
||
429 |
} |
||
430 |
|||
431 |
//--------------------------------------------------------------------- |
||
432 |
// Boyer-Moore-Horspool string search. |
||
433 |
//--------------------------------------------------------------------- |
||
434 |
|||
435 |
template <typename Char> |
||
436 |
4442 |
size_t StringSearch<Char>::BoyerMooreHorspoolSearch( |
|
437 |
Vector subject, |
||
438 |
size_t start_index) { |
||
439 |
4442 |
const size_t subject_length = subject.length(); |
|
440 |
4442 |
const size_t pattern_length = pattern_.length(); |
|
441 |
4442 |
int* char_occurrences = bad_char_shift_table_; |
|
442 |
4442 |
int64_t badness = -pattern_length; |
|
443 |
|||
444 |
// How bad we are doing without a good-suffix table. |
||
445 |
4442 |
Char last_char = pattern_[pattern_length - 1]; |
|
446 |
int last_char_shift = |
||
447 |
pattern_length - 1 - |
||
448 |
4442 |
CharOccurrence(char_occurrences, last_char); |
|
449 |
|||
450 |
// Perform search |
||
451 |
4442 |
size_t index = start_index; // No matches found prior to this index. |
|
452 |
✓✓✗✗ |
124662 |
while (index <= subject_length - pattern_length) { |
453 |
119677 |
size_t j = pattern_length - 1; |
|
454 |
int subject_char; |
||
455 |
✓✓✗✗ |
1188625 |
while (last_char != (subject_char = subject[index + j])) { |
456 |
953145 |
int bc_occ = CharOccurrence(char_occurrences, subject_char); |
|
457 |
953145 |
int shift = j - bc_occ; |
|
458 |
953145 |
index += shift; |
|
459 |
953145 |
badness += 1 - shift; // at most zero, so badness cannot increase. |
|
460 |
✓✓✗✗ |
953145 |
if (index > subject_length - pattern_length) { |
461 |
3874 |
return subject_length; |
|
462 |
} |
||
463 |
} |
||
464 |
115803 |
j--; |
|
465 |
✓✓✗✗ |
2941082 |
while (pattern_[j] == (subject[index + j])) { |
466 |
✓✓✗✗ |
2709498 |
if (j == 0) { |
467 |
22 |
return index; |
|
468 |
} |
||
469 |
2709476 |
j--; |
|
470 |
} |
||
471 |
115781 |
index += last_char_shift; |
|
472 |
// Badness increases by the number of characters we have |
||
473 |
// checked, and decreases by the number of characters we |
||
474 |
// can skip by shifting. It's a measure of how we are doing |
||
475 |
// compared to reading each character exactly once. |
||
476 |
115781 |
badness += (pattern_length - j) - last_char_shift; |
|
477 |
✓✓✗✗ |
115781 |
if (badness > 0) { |
478 |
3 |
PopulateBoyerMooreTable(); |
|
479 |
3 |
strategy_ = &StringSearch::BoyerMooreSearch; |
|
480 |
3 |
return BoyerMooreSearch(subject, index); |
|
481 |
} |
||
482 |
} |
||
483 |
543 |
return subject.length(); |
|
484 |
} |
||
485 |
|||
486 |
template <typename Char> |
||
487 |
4442 |
void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() { |
|
488 |
4442 |
const size_t pattern_length = pattern_.length(); |
|
489 |
|||
490 |
4442 |
int* bad_char_occurrence = bad_char_shift_table_; |
|
491 |
|||
492 |
// Only preprocess at most kBMMaxShift last characters of pattern. |
||
493 |
4442 |
const size_t start = start_; |
|
494 |
// Run forwards to populate bad_char_table, so that *last* instance |
||
495 |
// of character equivalence class is the one registered. |
||
496 |
// Notice: Doesn't include the last character. |
||
497 |
4442 |
const size_t table_size = AlphabetSize(); |
|
498 |
✓✓✗✗ |
4442 |
if (start == 0) { |
499 |
// All patterns less than kBMMaxShift in length. |
||
500 |
4435 |
memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); |
|
501 |
} else { |
||
502 |
✓✓✗✗ |
1799 |
for (size_t i = 0; i < table_size; i++) { |
503 |
1792 |
bad_char_occurrence[i] = start - 1; |
|
504 |
} |
||
505 |
} |
||
506 |
✓✓✗✗ |
42349 |
for (size_t i = start; i < pattern_length - 1; i++) { |
507 |
37907 |
Char c = pattern_[i]; |
|
508 |
37907 |
int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); |
|
509 |
37907 |
bad_char_occurrence[bucket] = i; |
|
510 |
} |
||
511 |
4442 |
} |
|
512 |
|||
513 |
//--------------------------------------------------------------------- |
||
514 |
// Linear string search with bailout to BMH. |
||
515 |
//--------------------------------------------------------------------- |
||
516 |
|||
517 |
// Simple linear search for short patterns, which bails out if the string |
||
518 |
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
||
519 |
template <typename Char> |
||
520 |
5097 |
size_t StringSearch<Char>::InitialSearch( |
|
521 |
Vector subject, |
||
522 |
size_t index) { |
||
523 |
5097 |
const size_t pattern_length = pattern_.length(); |
|
524 |
// Badness is a count of how much work we have done. When we have |
||
525 |
// done enough work we decide it's probably worth switching to a better |
||
526 |
// algorithm. |
||
527 |
5097 |
int64_t badness = -10 - (pattern_length << 2); |
|
528 |
|||
529 |
// We know our pattern is at least 2 characters, we cache the first so |
||
530 |
// the common case of the first character not matching is faster. |
||
531 |
✓✗✓✗ |
479572 |
for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { |
532 |
479572 |
badness++; |
|
533 |
✓✓✓✗ |
479572 |
if (badness <= 0) { |
534 |
475130 |
i = FindFirstCharacter(pattern_, subject, i); |
|
535 |
✓✓✗✓ |
475130 |
if (i == subject.length()) |
536 |
5101 |
return subject.length(); |
|
537 |
✗✓✗✓ |
475126 |
CHECK_LE(i, n); |
538 |
475126 |
size_t j = 1; |
|
539 |
✓✓✓✓ |
10160292 |
do { |
540 |
✓✓✗✓ |
10634767 |
if (pattern_[j] != subject[i + j]) { |
541 |
474475 |
break; |
|
542 |
} |
||
543 |
10160292 |
j++; |
|
544 |
} while (j < pattern_length); |
||
545 |
✓✓✓✗ |
475126 |
if (j == pattern_length) { |
546 |
651 |
return i; |
|
547 |
} |
||
548 |
474475 |
badness += j; |
|
549 |
} else { |
||
550 |
4442 |
PopulateBoyerMooreHorspoolTable(); |
|
551 |
4442 |
strategy_ = &StringSearch::BoyerMooreHorspoolSearch; |
|
552 |
4442 |
return BoyerMooreHorspoolSearch(subject, i); |
|
553 |
} |
||
554 |
} |
||
555 |
return subject.length(); |
||
556 |
} |
||
557 |
|||
558 |
// Perform a single stand-alone search. |
||
559 |
// If searching multiple times for the same pattern, a search |
||
560 |
// object should be constructed once and the Search function then called |
||
561 |
// for each search. |
||
562 |
template <typename Char> |
||
563 |
21208 |
size_t SearchString(Vector<const Char> subject, |
|
564 |
Vector<const Char> pattern, |
||
565 |
size_t start_index) { |
||
566 |
21208 |
StringSearch<Char> search(pattern); |
|
567 |
21208 |
return search.Search(subject, start_index); |
|
568 |
} |
||
569 |
} // namespace stringsearch |
||
570 |
} // namespace node |
||
571 |
|||
572 |
namespace node { |
||
573 |
|||
574 |
template <typename Char> |
||
575 |
21211 |
size_t SearchString(const Char* haystack, |
|
576 |
size_t haystack_length, |
||
577 |
const Char* needle, |
||
578 |
size_t needle_length, |
||
579 |
size_t start_index, |
||
580 |
bool is_forward) { |
||
581 |
✓✓✗✓ |
21211 |
if (haystack_length < needle_length) return haystack_length; |
582 |
// To do a reverse search (lastIndexOf instead of indexOf) without redundant |
||
583 |
// code, create two vectors that are reversed views into the input strings. |
||
584 |
// For example, v_needle[0] would return the *last* character of the needle. |
||
585 |
// So we're searching for the first instance of rev(needle) in rev(haystack) |
||
586 |
21208 |
stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward); |
|
587 |
stringsearch::Vector<const Char> v_haystack( |
||
588 |
21208 |
haystack, haystack_length, is_forward); |
|
589 |
21208 |
size_t diff = haystack_length - needle_length; |
|
590 |
size_t relative_start_index; |
||
591 |
✓✓✓✓ |
21208 |
if (is_forward) { |
592 |
21133 |
relative_start_index = start_index; |
|
593 |
✓✓✓✗ |
75 |
} else if (diff < start_index) { |
594 |
31 |
relative_start_index = 0; |
|
595 |
} else { |
||
596 |
44 |
relative_start_index = diff - start_index; |
|
597 |
} |
||
598 |
size_t pos = node::stringsearch::SearchString( |
||
599 |
21208 |
v_haystack, v_needle, relative_start_index); |
|
600 |
✓✓✓✓ |
21208 |
if (pos == haystack_length) { |
601 |
// not found |
||
602 |
11769 |
return pos; |
|
603 |
} |
||
604 |
✓✓✓✗ |
9439 |
return is_forward ? pos : (haystack_length - needle_length - pos); |
605 |
} |
||
606 |
|||
607 |
template <size_t N> |
||
608 |
20165 |
size_t SearchString(const char* haystack, size_t haystack_length, |
|
609 |
const char (&needle)[N]) { |
||
610 |
return SearchString( |
||
611 |
reinterpret_cast<const uint8_t*>(haystack), haystack_length, |
||
612 |
20165 |
reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true); |
|
613 |
} |
||
614 |
|||
615 |
} // namespace node |
||
616 |
|||
617 |
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS |
||
618 |
|||
619 |
#endif // SRC_STRING_SEARCH_H_ |
Generated by: GCOVR (Version 3.4) |