GCC Code Coverage Report
Directory: ../ Exec Total Coverage
File: /home/iojs/build/workspace/node-test-commit-linux-coverage-daily/nodes/benchmark/out/../src/string_search.h Lines: 64 241 26.6 %
Date: 2019-02-01 22:03:38 Branches: 29 272 10.7 %

Line Branch Exec Source
1
// Copyright 2011 the V8 project authors. All rights reserved.
2
// Use of this source code is governed by a BSD-style license that can be
3
// found in the LICENSE file.
4
5
#ifndef SRC_STRING_SEARCH_H_
6
#define SRC_STRING_SEARCH_H_
7
8
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
9
10
#include "util.h"
11
12
#include <string.h>
13
#include <algorithm>
14
15
namespace node {
16
namespace stringsearch {
17
18
template <typename T>
19
class Vector {
20
 public:
21
10702
  Vector(T* data, size_t length, bool isForward)
22
10702
      : start_(data), length_(length), is_forward_(isForward) {
23



10702
    CHECK(length > 0 && data != nullptr);
24
10702
  }
25
26
  // Returns the start of the memory range.
27
  // For vector v this is NOT necessarily &v[0], see forward().
28
1763981
  const T* start() const { return start_; }
29
30
  // Returns the length of the vector, in characters.
31
4467530
  size_t length() const { return length_; }
32
33
  // Returns true if the Vector is front-to-back, false if back-to-front.
34
  // In the latter case, v[0] corresponds to the *end* of the memory range.
35
1763981
  size_t forward() const { return is_forward_; }
36
37
  // Access individual vector elements - checks bounds in debug mode.
38
2690996
  T& operator[](size_t index) const {
39
    DCHECK_LT(index, length_);
40

2690996
    return start_[is_forward_ ? index : (length_ - index - 1)];
41
  }
42
43
 private:
44
  T* start_;
45
  size_t length_;
46
  bool is_forward_;
47
};
48
49
50
//---------------------------------------------------------------------
51
// String Search object.
52
//---------------------------------------------------------------------
53
54
// Class holding constants and methods that apply to all string search variants,
55
// independently of subject and pattern char size.
56
5351
class StringSearchBase {
57
 protected:
58
  // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
59
  // limit, we can fix the size of tables. For a needle longer than this limit,
60
  // search will not be optimal, since we only build tables for a suffix
61
  // of the string, but it is a safe approximation.
62
  static const int kBMMaxShift = 250;
63
64
  // Reduce alphabet to this size.
65
  // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
66
  // proportional to the input alphabet. We reduce the alphabet size by
67
  // equating input characters modulo a smaller alphabet size. This gives
68
  // a potentially less efficient searching, but is a safe approximation.
69
  // For needles using only characters in the same Unicode 256-code point page,
70
  // there is no search speed degradation.
71
  static const int kLatin1AlphabetSize = 256;
72
  static const int kUC16AlphabetSize = 256;
73
74
  // Bad-char shift table stored in the state. It's length is the alphabet size.
75
  // For patterns below this length, the skip length of Boyer-Moore is too short
76
  // to compensate for the algorithmic overhead compared to simple brute force.
77
  static const int kBMMinPatternLength = 8;
78
79
  // Store for the BoyerMoore(Horspool) bad char shift table.
80
  int bad_char_shift_table_[kUC16AlphabetSize];
81
  // Store for the BoyerMoore good suffix shift table.
82
  int good_suffix_shift_table_[kBMMaxShift + 1];
83
  // Table used temporarily while building the BoyerMoore good suffix
84
  // shift table.
85
  int suffix_table_[kBMMaxShift + 1];
86
};
87
88
template <typename Char>
89
class StringSearch : private StringSearchBase {
90
 public:
91
  typedef stringsearch::Vector<const Char> Vector;
92
93
5351
  explicit StringSearch(Vector pattern)
94
5351
      : pattern_(pattern), start_(0) {
95

5351
    if (pattern.length() >= kBMMaxShift) {
96
      start_ = pattern.length() - kBMMaxShift;
97
    }
98
99
5351
    size_t pattern_length = pattern_.length();
100

5351
    CHECK_GT(pattern_length, 0);
101

5351
    if (pattern_length < kBMMinPatternLength) {
102

5351
      if (pattern_length == 1) {
103
        strategy_ = &StringSearch::SingleCharSearch;
104
5351
        return;
105
      }
106
5351
      strategy_ = &StringSearch::LinearSearch;
107
5351
      return;
108
    }
109
    strategy_ = &StringSearch::InitialSearch;
110
  }
111
112
5351
  size_t Search(Vector subject, size_t index) {
113

5351
    return (this->*strategy_)(subject, index);
114
  }
115
116
  static inline int AlphabetSize() {
117
    if (sizeof(Char) == 1) {
118
      // Latin1 needle.
119
      return kLatin1AlphabetSize;
120
    } else {
121
      // UC16 needle.
122
      return kUC16AlphabetSize;
123
    }
124
125
    static_assert(sizeof(Char) == sizeof(uint8_t) ||
126
                  sizeof(Char) == sizeof(uint16_t),
127
                  "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)");
128
  }
129
130
 private:
131
  typedef size_t (StringSearch::*SearchFunction)(Vector, size_t);
132
  size_t SingleCharSearch(Vector subject, size_t start_index);
133
  size_t LinearSearch(Vector subject, size_t start_index);
134
  size_t InitialSearch(Vector subject, size_t start_index);
135
  size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index);
136
  size_t BoyerMooreSearch(Vector subject, size_t start_index);
137
138
  void PopulateBoyerMooreHorspoolTable();
139
140
  void PopulateBoyerMooreTable();
141
142
  static inline int CharOccurrence(int* bad_char_occurrence,
143
                                   Char char_code) {
144
    if (sizeof(Char) == 1) {
145
      return bad_char_occurrence[static_cast<int>(char_code)];
146
    }
147
    // Both pattern and subject are UC16. Reduce character to equivalence class.
148
    int equiv_class = char_code % kUC16AlphabetSize;
149
    return bad_char_occurrence[equiv_class];
150
  }
151
152
  // The pattern to search for.
153
  Vector pattern_;
154
  // Pointer to implementation of the search.
155
  SearchFunction strategy_;
156
  // Cache value of Max(0, pattern_length() - kBMMaxShift)
157
  size_t start_;
158
};
159
160
161
template <typename T, typename U>
162
inline T AlignDown(T value, U alignment) {
163
  return reinterpret_cast<T>(
164
      (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
165
}
166
167
168
inline uint8_t GetHighestValueByte(uint16_t character) {
169
  return std::max(static_cast<uint8_t>(character & 0xFF),
170
                  static_cast<uint8_t>(character >> 8));
171
}
172
173
174
inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
175
176
177
// Searches for a byte value in a memory buffer, back to front.
178
// Uses memrchr(3) on systems which support it, for speed.
179
// Falls back to a vanilla for loop on non-GNU systems such as Windows.
180
inline const void* MemrchrFill(const void* haystack, uint8_t needle,
181
                               size_t haystack_len) {
182
#ifdef _GNU_SOURCE
183
  return memrchr(haystack, needle, haystack_len);
184
#else
185
  const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack);
186
  for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) {
187
    if (haystack8[i] == needle) {
188
      return haystack8 + i;
189
    }
190
  }
191
  return nullptr;
192
#endif
193
}
194
195
196
// Finds the first occurrence of *two-byte* character pattern[0] in the string
197
// `subject`. Does not check that the whole pattern matches.
198
template <typename Char>
199
inline size_t FindFirstCharacter(Vector<const Char> pattern,
200
                                 Vector<const Char> subject, size_t index) {
201
  const Char pattern_first_char = pattern[0];
202
  const size_t max_n = (subject.length() - pattern.length() + 1);
203
204
  // For speed, search for the more `rare` of the two bytes in pattern[0]
205
  // using memchr / memrchr (which are much faster than a simple for loop).
206
  const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
207
  size_t pos = index;
208
  do {
209
    const size_t bytes_to_search = (max_n - pos) * sizeof(Char);
210
    const void* void_pos;
211
    if (subject.forward()) {
212
      // Assert that bytes_to_search won't overflow
213
      CHECK_LE(pos, max_n);
214
      CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char));
215
      void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search);
216
    } else {
217
      CHECK_LE(pos, subject.length());
218
      CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char));
219
      void_pos = MemrchrFill(subject.start() + pattern.length() - 1,
220
                             search_byte,
221
                             bytes_to_search);
222
    }
223
    const Char* char_pos = static_cast<const Char*>(void_pos);
224
    if (char_pos == nullptr)
225
      return subject.length();
226
227
    // Then, for each match, verify that the full two bytes match pattern[0].
228
    char_pos = AlignDown(char_pos, sizeof(Char));
229
    size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
230
    pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1);
231
    if (subject[pos] == pattern_first_char) {
232
      // Match found, hooray.
233
      return pos;
234
    }
235
    // Search byte matched, but the other byte of pattern[0] didn't. Keep going.
236
  } while (++pos < max_n);
237
238
  return subject.length();
239
}
240
241
242
// Finds the first occurrence of the byte pattern[0] in string `subject`.
243
// Does not verify that the whole pattern matches.
244
template <>
245
882836
inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
246
                                 Vector<const uint8_t> subject,
247
                                 size_t index) {
248
882836
  const uint8_t pattern_first_char = pattern[0];
249
882836
  const size_t subj_len = subject.length();
250
882836
  const size_t max_n = (subject.length() - pattern.length() + 1);
251
252
  const void* pos;
253
882836
  if (subject.forward()) {
254
882836
    pos = memchr(subject.start() + index, pattern_first_char, max_n - index);
255
  } else {
256
    pos = MemrchrFill(subject.start() + pattern.length() - 1,
257
                      pattern_first_char,
258
                      max_n - index);
259
  }
260
882836
  const uint8_t* char_pos = static_cast<const uint8_t*>(pos);
261
882836
  if (char_pos == nullptr) {
262
1691
    return subj_len;
263
  }
264
265
881145
  size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
266
881145
  return subject.forward() ? raw_pos : (subj_len - raw_pos - 1);
267
}
268
269
//---------------------------------------------------------------------
270
// Single Character Pattern Search Strategy
271
//---------------------------------------------------------------------
272
273
template <typename Char>
274
size_t StringSearch<Char>::SingleCharSearch(
275
    Vector subject,
276
    size_t index) {
277
  CHECK_EQ(1, pattern_.length());
278
  return FindFirstCharacter(pattern_, subject, index);
279
}
280
281
//---------------------------------------------------------------------
282
// Linear Search Strategy
283
//---------------------------------------------------------------------
284
285
// Simple linear search for short patterns. Never bails out.
286
template <typename Char>
287
5351
size_t StringSearch<Char>::LinearSearch(
288
    Vector subject,
289
    size_t index) {
290

5351
  CHECK_GT(pattern_.length(), 1);
291
5351
  const size_t n = subject.length() - pattern_.length();
292

882836
  for (size_t i = index; i <= n; i++) {
293
882836
    i = FindFirstCharacter(pattern_, subject, i);
294

882836
    if (i == subject.length())
295
7042
      return subject.length();
296

881145
    CHECK_LE(i, n);
297
298
881145
    bool matches = true;
299

907740
    for (size_t j = 1; j < pattern_.length(); j++) {
300

904080
      if (pattern_[j] != subject[i + j]) {
301
877485
        matches = false;
302
877485
        break;
303
      }
304
    }
305

881145
    if (matches) {
306
3660
      return i;
307
    }
308
  }
309
  return subject.length();
310
}
311
312
//---------------------------------------------------------------------
313
// Boyer-Moore string search
314
//---------------------------------------------------------------------
315
316
template <typename Char>
317
size_t StringSearch<Char>::BoyerMooreSearch(
318
    Vector subject,
319
    size_t start_index) {
320
  const size_t subject_length = subject.length();
321
  const size_t pattern_length = pattern_.length();
322
  // Only preprocess at most kBMMaxShift last characters of pattern.
323
  size_t start = start_;
324
325
  int* bad_char_occurrence = bad_char_shift_table_;
326
  int* good_suffix_shift = good_suffix_shift_table_ - start_;
327
328
  Char last_char = pattern_[pattern_length - 1];
329
  size_t index = start_index;
330
  // Continue search from i.
331
  while (index <= subject_length - pattern_length) {
332
    size_t j = pattern_length - 1;
333
    int c;
334
    while (last_char != (c = subject[index + j])) {
335
      int shift = j - CharOccurrence(bad_char_occurrence, c);
336
      index += shift;
337
      if (index > subject_length - pattern_length) {
338
        return subject.length();
339
      }
340
    }
341
    while (pattern_[j] == (c = subject[index + j])) {
342
      if (j == 0) {
343
        return index;
344
      }
345
      j--;
346
    }
347
    if (j < start) {
348
      // we have matched more than our tables allow us to be smart about.
349
      // Fall back on BMH shift.
350
      index += pattern_length - 1 -
351
               CharOccurrence(bad_char_occurrence,
352
                              static_cast<Char>(last_char));
353
    } else {
354
      int gs_shift = good_suffix_shift[j + 1];
355
      int bc_occ = CharOccurrence(bad_char_occurrence, c);
356
      int shift = j - bc_occ;
357
      if (gs_shift > shift) {
358
        shift = gs_shift;
359
      }
360
      index += shift;
361
    }
362
  }
363
364
  return subject.length();
365
}
366
367
template <typename Char>
368
void StringSearch<Char>::PopulateBoyerMooreTable() {
369
  const size_t pattern_length = pattern_.length();
370
  // Only look at the last kBMMaxShift characters of pattern (from start_
371
  // to pattern_length).
372
  const size_t start = start_;
373
  const size_t length = pattern_length - start;
374
375
  // Biased tables so that we can use pattern indices as table indices,
376
  // even if we only cover the part of the pattern from offset start.
377
  int* shift_table = good_suffix_shift_table_ - start_;
378
  int* suffix_table = suffix_table_ - start_;
379
380
  // Initialize table.
381
  for (size_t i = start; i < pattern_length; i++) {
382
    shift_table[i] = length;
383
  }
384
  shift_table[pattern_length] = 1;
385
  suffix_table[pattern_length] = pattern_length + 1;
386
387
  if (pattern_length <= start) {
388
    return;
389
  }
390
391
  // Find suffixes.
392
  Char last_char = pattern_[pattern_length - 1];
393
  size_t suffix = pattern_length + 1;
394
  {
395
    size_t i = pattern_length;
396
    while (i > start) {
397
      Char c = pattern_[i - 1];
398
      while (suffix <= pattern_length && c != pattern_[suffix - 1]) {
399
        if (static_cast<size_t>(shift_table[suffix]) == length) {
400
          shift_table[suffix] = suffix - i;
401
        }
402
        suffix = suffix_table[suffix];
403
      }
404
      suffix_table[--i] = --suffix;
405
      if (suffix == pattern_length) {
406
        // No suffix to extend, so we check against last_char only.
407
        while ((i > start) && (pattern_[i - 1] != last_char)) {
408
          if (static_cast<size_t>(shift_table[pattern_length]) == length) {
409
            shift_table[pattern_length] = pattern_length - i;
410
          }
411
          suffix_table[--i] = pattern_length;
412
        }
413
        if (i > start) {
414
          suffix_table[--i] = --suffix;
415
        }
416
      }
417
    }
418
  }
419
  // Build shift table using suffixes.
420
  if (suffix < pattern_length) {
421
    for (size_t i = start; i <= pattern_length; i++) {
422
      if (static_cast<size_t>(shift_table[i]) == length) {
423
        shift_table[i] = suffix - start;
424
      }
425
      if (i == suffix) {
426
        suffix = suffix_table[suffix];
427
      }
428
    }
429
  }
430
}
431
432
//---------------------------------------------------------------------
433
// Boyer-Moore-Horspool string search.
434
//---------------------------------------------------------------------
435
436
template <typename Char>
437
size_t StringSearch<Char>::BoyerMooreHorspoolSearch(
438
    Vector subject,
439
    size_t start_index) {
440
  const size_t subject_length = subject.length();
441
  const size_t pattern_length = pattern_.length();
442
  int* char_occurrences = bad_char_shift_table_;
443
  int64_t badness = -pattern_length;
444
445
  // How bad we are doing without a good-suffix table.
446
  Char last_char = pattern_[pattern_length - 1];
447
  int last_char_shift =
448
      pattern_length - 1 -
449
      CharOccurrence(char_occurrences, static_cast<Char>(last_char));
450
451
  // Perform search
452
  size_t index = start_index;  // No matches found prior to this index.
453
  while (index <= subject_length - pattern_length) {
454
    size_t j = pattern_length - 1;
455
    int subject_char;
456
    while (last_char != (subject_char = subject[index + j])) {
457
      int bc_occ = CharOccurrence(char_occurrences, subject_char);
458
      int shift = j - bc_occ;
459
      index += shift;
460
      badness += 1 - shift;  // at most zero, so badness cannot increase.
461
      if (index > subject_length - pattern_length) {
462
        return subject_length;
463
      }
464
    }
465
    j--;
466
    while (pattern_[j] == (subject[index + j])) {
467
      if (j == 0) {
468
        return index;
469
      }
470
      j--;
471
    }
472
    index += last_char_shift;
473
    // Badness increases by the number of characters we have
474
    // checked, and decreases by the number of characters we
475
    // can skip by shifting. It's a measure of how we are doing
476
    // compared to reading each character exactly once.
477
    badness += (pattern_length - j) - last_char_shift;
478
    if (badness > 0) {
479
      PopulateBoyerMooreTable();
480
      strategy_ = &StringSearch::BoyerMooreSearch;
481
      return BoyerMooreSearch(subject, index);
482
    }
483
  }
484
  return subject.length();
485
}
486
487
template <typename Char>
488
void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() {
489
  const size_t pattern_length = pattern_.length();
490
491
  int* bad_char_occurrence = bad_char_shift_table_;
492
493
  // Only preprocess at most kBMMaxShift last characters of pattern.
494
  const size_t start = start_;
495
  // Run forwards to populate bad_char_table, so that *last* instance
496
  // of character equivalence class is the one registered.
497
  // Notice: Doesn't include the last character.
498
  const size_t table_size = AlphabetSize();
499
  if (start == 0) {
500
    // All patterns less than kBMMaxShift in length.
501
    memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
502
  } else {
503
    for (size_t i = 0; i < table_size; i++) {
504
      bad_char_occurrence[i] = start - 1;
505
    }
506
  }
507
  for (size_t i = start; i < pattern_length - 1; i++) {
508
    Char c = pattern_[i];
509
    int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize();
510
    bad_char_occurrence[bucket] = i;
511
  }
512
}
513
514
//---------------------------------------------------------------------
515
// Linear string search with bailout to BMH.
516
//---------------------------------------------------------------------
517
518
// Simple linear search for short patterns, which bails out if the string
519
// isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
520
template <typename Char>
521
size_t StringSearch<Char>::InitialSearch(
522
    Vector subject,
523
    size_t index) {
524
  const size_t pattern_length = pattern_.length();
525
  // Badness is a count of how much work we have done.  When we have
526
  // done enough work we decide it's probably worth switching to a better
527
  // algorithm.
528
  int64_t badness = -10 - (pattern_length << 2);
529
530
  // We know our pattern is at least 2 characters, we cache the first so
531
  // the common case of the first character not matching is faster.
532
  for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
533
    badness++;
534
    if (badness <= 0) {
535
      i = FindFirstCharacter(pattern_, subject, i);
536
      if (i == subject.length())
537
        return subject.length();
538
      CHECK_LE(i, n);
539
      size_t j = 1;
540
      do {
541
        if (pattern_[j] != subject[i + j]) {
542
          break;
543
        }
544
        j++;
545
      } while (j < pattern_length);
546
      if (j == pattern_length) {
547
        return i;
548
      }
549
      badness += j;
550
    } else {
551
      PopulateBoyerMooreHorspoolTable();
552
      strategy_ = &StringSearch::BoyerMooreHorspoolSearch;
553
      return BoyerMooreHorspoolSearch(subject, i);
554
    }
555
  }
556
  return subject.length();
557
}
558
559
// Perform a single stand-alone search.
560
// If searching multiple times for the same pattern, a search
561
// object should be constructed once and the Search function then called
562
// for each search.
563
template <typename Char>
564
5351
size_t SearchString(Vector<const Char> subject,
565
                    Vector<const Char> pattern,
566
                    size_t start_index) {
567
5351
  StringSearch<Char> search(pattern);
568
5351
  return search.Search(subject, start_index);
569
}
570
}  // namespace stringsearch
571
}  // namespace node
572
573
namespace node {
574
575
template <typename Char>
576
5351
size_t SearchString(const Char* haystack,
577
                    size_t haystack_length,
578
                    const Char* needle,
579
                    size_t needle_length,
580
                    size_t start_index,
581
                    bool is_forward) {
582

5351
  if (haystack_length < needle_length) return haystack_length;
583
  // To do a reverse search (lastIndexOf instead of indexOf) without redundant
584
  // code, create two vectors that are reversed views into the input strings.
585
  // For example, v_needle[0] would return the *last* character of the needle.
586
  // So we're searching for the first instance of rev(needle) in rev(haystack)
587
5351
  stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward);
588
  stringsearch::Vector<const Char> v_haystack(
589
5351
      haystack, haystack_length, is_forward);
590
5351
  size_t diff = haystack_length - needle_length;
591
  size_t relative_start_index;
592

5351
  if (is_forward) {
593
5351
    relative_start_index = start_index;
594
  } else if (diff < start_index) {
595
    relative_start_index = 0;
596
  } else {
597
    relative_start_index = diff - start_index;
598
  }
599
  size_t pos = node::stringsearch::SearchString(
600
5351
      v_haystack, v_needle, relative_start_index);
601

5351
  if (pos == haystack_length) {
602
    // not found
603
1691
    return pos;
604
  }
605

3660
  return is_forward ? pos : (haystack_length - needle_length - pos);
606
}
607
608
template <size_t N>
609
5351
size_t SearchString(const char* haystack, size_t haystack_length,
610
                    const char (&needle)[N]) {
611
  return SearchString(
612
      reinterpret_cast<const uint8_t*>(haystack), haystack_length,
613
5351
      reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true);
614
}
615
616
}  // namespace node
617
618
#endif  // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
619
620
#endif  // SRC_STRING_SEARCH_H_