summaryrefslogtreecommitdiff
path: root/js/src/new-regexp/regexp-compiler.h
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/new-regexp/regexp-compiler.h')
-rw-r--r--js/src/new-regexp/regexp-compiler.h621
1 files changed, 0 insertions, 621 deletions
diff --git a/js/src/new-regexp/regexp-compiler.h b/js/src/new-regexp/regexp-compiler.h
deleted file mode 100644
index 186d5e838..000000000
--- a/js/src/new-regexp/regexp-compiler.h
+++ /dev/null
@@ -1,621 +0,0 @@
-// Copyright 2019 the V8 project authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef V8_REGEXP_REGEXP_COMPILER_H_
-#define V8_REGEXP_REGEXP_COMPILER_H_
-
-#include <bitset>
-
-#include "new-regexp/regexp-nodes.h"
-
-namespace v8 {
-namespace internal {
-
-class DynamicBitSet;
-class Isolate;
-
-namespace regexp_compiler_constants {
-
-// The '2' variant is has inclusive from and exclusive to.
-// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
-// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
-constexpr uc32 kRangeEndMarker = 0x110000;
-constexpr int kSpaceRanges[] = {
- '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
- 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
- 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
-constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);
-
-constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_',
- '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
-constexpr int kWordRangeCount = arraysize(kWordRanges);
-constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
-constexpr int kDigitRangeCount = arraysize(kDigitRanges);
-constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
- kLeadSurrogateStart + 1, kRangeEndMarker};
-constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
-constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E,
- 0x2028, 0x202A, kRangeEndMarker};
-constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
-
-// More makes code generation slower, less makes V8 benchmark score lower.
-constexpr int kMaxLookaheadForBoyerMoore = 8;
-// In a 3-character pattern you can maximally step forwards 3 characters
-// at a time, which is not always enough to pay for the extra logic.
-constexpr int kPatternTooShortForBoyerMoore = 2;
-
-} // namespace regexp_compiler_constants
-
-inline bool IgnoreCase(JSRegExp::Flags flags) {
- return (flags & JSRegExp::kIgnoreCase) != 0;
-}
-
-inline bool IsUnicode(JSRegExp::Flags flags) {
- return (flags & JSRegExp::kUnicode) != 0;
-}
-
-inline bool IsSticky(JSRegExp::Flags flags) {
- return (flags & JSRegExp::kSticky) != 0;
-}
-
-inline bool IsGlobal(JSRegExp::Flags flags) {
- return (flags & JSRegExp::kGlobal) != 0;
-}
-
-inline bool DotAll(JSRegExp::Flags flags) {
- return (flags & JSRegExp::kDotAll) != 0;
-}
-
-inline bool Multiline(JSRegExp::Flags flags) {
- return (flags & JSRegExp::kMultiline) != 0;
-}
-
-inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
- // Both unicode and ignore_case flags are set. We need to use ICU to find
- // the closure over case equivalents.
- return IsUnicode(flags) && IgnoreCase(flags);
-}
-
-// Details of a quick mask-compare check that can look ahead in the
-// input stream.
-class QuickCheckDetails {
- public:
- QuickCheckDetails()
- : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
- explicit QuickCheckDetails(int characters)
- : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
- bool Rationalize(bool one_byte);
- // Merge in the information from another branch of an alternation.
- void Merge(QuickCheckDetails* other, int from_index);
- // Advance the current position by some amount.
- void Advance(int by, bool one_byte);
- void Clear();
- bool cannot_match() { return cannot_match_; }
- void set_cannot_match() { cannot_match_ = true; }
- struct Position {
- Position() : mask(0), value(0), determines_perfectly(false) {}
- uc16 mask;
- uc16 value;
- bool determines_perfectly;
- };
- int characters() { return characters_; }
- void set_characters(int characters) { characters_ = characters; }
- Position* positions(int index) {
- DCHECK_LE(0, index);
- DCHECK_GT(characters_, index);
- return positions_ + index;
- }
- uint32_t mask() { return mask_; }
- uint32_t value() { return value_; }
-
- private:
- // How many characters do we have quick check information from. This is
- // the same for all branches of a choice node.
- int characters_;
- Position positions_[4];
- // These values are the condensate of the above array after Rationalize().
- uint32_t mask_;
- uint32_t value_;
- // If set to true, there is no way this quick check can match at all.
- // E.g., if it requires to be at the start of the input, and isn't.
- bool cannot_match_;
-};
-
-// Improve the speed that we scan for an initial point where a non-anchored
-// regexp can match by using a Boyer-Moore-like table. This is done by
-// identifying non-greedy non-capturing loops in the nodes that eat any
-// character one at a time. For example in the middle of the regexp
-// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
-// inserted at the start of any non-anchored regexp.
-//
-// When we have found such a loop we look ahead in the nodes to find the set of
-// characters that can come at given distances. For example for the regexp
-// /.?foo/ we know that there are at least 3 characters ahead of us, and the
-// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
-// the lookahead info where the set of characters is reasonably constrained. In
-// our example this is from index 1 to 2 (0 is not constrained). We can now
-// look 3 characters ahead and if we don't find one of [f, o] (the union of
-// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
-//
-// For Unicode input strings we do the same, but modulo 128.
-//
-// We also look at the first string fed to the regexp and use that to get a hint
-// of the character frequencies in the inputs. This affects the assessment of
-// whether the set of characters is 'reasonably constrained'.
-//
-// We also have another lookahead mechanism (called quick check in the code),
-// which uses a wide load of multiple characters followed by a mask and compare
-// to determine whether a match is possible at this point.
-enum ContainedInLattice {
- kNotYet = 0,
- kLatticeIn = 1,
- kLatticeOut = 2,
- kLatticeUnknown = 3 // Can also mean both in and out.
-};
-
-inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
- return static_cast<ContainedInLattice>(a | b);
-}
-
-class BoyerMoorePositionInfo : public ZoneObject {
- public:
- bool at(int i) const { return map_[i]; }
-
- static constexpr int kMapSize = 128;
- static constexpr int kMask = kMapSize - 1;
-
- int map_count() const { return map_count_; }
-
- void Set(int character);
- void SetInterval(const Interval& interval);
- void SetAll();
-
- bool is_non_word() { return w_ == kLatticeOut; }
- bool is_word() { return w_ == kLatticeIn; }
-
- using Bitset = std::bitset<kMapSize>;
- Bitset raw_bitset() const { return map_; }
-
- private:
- Bitset map_;
- int map_count_ = 0; // Number of set bits in the map.
- ContainedInLattice w_ = kNotYet; // The \w character class.
-};
-
-class BoyerMooreLookahead : public ZoneObject {
- public:
- BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
-
- int length() { return length_; }
- int max_char() { return max_char_; }
- RegExpCompiler* compiler() { return compiler_; }
-
- int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
-
- BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
-
- void Set(int map_number, int character) {
- if (character > max_char_) return;
- BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
- info->Set(character);
- }
-
- void SetInterval(int map_number, const Interval& interval) {
- if (interval.from() > max_char_) return;
- BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
- if (interval.to() > max_char_) {
- info->SetInterval(Interval(interval.from(), max_char_));
- } else {
- info->SetInterval(interval);
- }
- }
-
- void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
-
- void SetRest(int from_map) {
- for (int i = from_map; i < length_; i++) SetAll(i);
- }
- void EmitSkipInstructions(RegExpMacroAssembler* masm);
-
- private:
- // This is the value obtained by EatsAtLeast. If we do not have at least this
- // many characters left in the sample string then the match is bound to fail.
- // Therefore it is OK to read a character this far ahead of the current match
- // point.
- int length_;
- RegExpCompiler* compiler_;
- // 0xff for Latin1, 0xffff for UTF-16.
- int max_char_;
- ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
-
- int GetSkipTable(int min_lookahead, int max_lookahead,
- Handle<ByteArray> boolean_skip_table);
- bool FindWorthwhileInterval(int* from, int* to);
- int FindBestInterval(int max_number_of_chars, int old_biggest_points,
- int* from, int* to);
-};
-
-// There are many ways to generate code for a node. This class encapsulates
-// the current way we should be generating. In other words it encapsulates
-// the current state of the code generator. The effect of this is that we
-// generate code for paths that the matcher can take through the regular
-// expression. A given node in the regexp can be code-generated several times
-// as it can be part of several traces. For example for the regexp:
-// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
-// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
-// to match foo is generated only once (the traces have a common prefix). The
-// code to store the capture is deferred and generated (twice) after the places
-// where baz has been matched.
-class Trace {
- public:
- // A value for a property that is either known to be true, know to be false,
- // or not known.
- enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
-
- class DeferredAction {
- public:
- DeferredAction(ActionNode::ActionType action_type, int reg)
- : action_type_(action_type), reg_(reg), next_(nullptr) {}
- DeferredAction* next() { return next_; }
- bool Mentions(int reg);
- int reg() { return reg_; }
- ActionNode::ActionType action_type() { return action_type_; }
-
- private:
- ActionNode::ActionType action_type_;
- int reg_;
- DeferredAction* next_;
- friend class Trace;
- };
-
- class DeferredCapture : public DeferredAction {
- public:
- DeferredCapture(int reg, bool is_capture, Trace* trace)
- : DeferredAction(ActionNode::STORE_POSITION, reg),
- cp_offset_(trace->cp_offset()),
- is_capture_(is_capture) {}
- int cp_offset() { return cp_offset_; }
- bool is_capture() { return is_capture_; }
-
- private:
- int cp_offset_;
- bool is_capture_;
- void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
- };
-
- class DeferredSetRegisterForLoop : public DeferredAction {
- public:
- DeferredSetRegisterForLoop(int reg, int value)
- : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
- value_(value) {}
- int value() { return value_; }
-
- private:
- int value_;
- };
-
- class DeferredClearCaptures : public DeferredAction {
- public:
- explicit DeferredClearCaptures(Interval range)
- : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
- Interval range() { return range_; }
-
- private:
- Interval range_;
- };
-
- class DeferredIncrementRegister : public DeferredAction {
- public:
- explicit DeferredIncrementRegister(int reg)
- : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
- };
-
- Trace()
- : cp_offset_(0),
- actions_(nullptr),
- backtrack_(nullptr),
- stop_node_(nullptr),
- loop_label_(nullptr),
- characters_preloaded_(0),
- bound_checked_up_to_(0),
- flush_budget_(100),
- at_start_(UNKNOWN) {}
-
- // End the trace. This involves flushing the deferred actions in the trace
- // and pushing a backtrack location onto the backtrack stack. Once this is
- // done we can start a new trace or go to one that has already been
- // generated.
- void Flush(RegExpCompiler* compiler, RegExpNode* successor);
- int cp_offset() { return cp_offset_; }
- DeferredAction* actions() { return actions_; }
- // A trivial trace is one that has no deferred actions or other state that
- // affects the assumptions used when generating code. There is no recorded
- // backtrack location in a trivial trace, so with a trivial trace we will
- // generate code that, on a failure to match, gets the backtrack location
- // from the backtrack stack rather than using a direct jump instruction. We
- // always start code generation with a trivial trace and non-trivial traces
- // are created as we emit code for nodes or add to the list of deferred
- // actions in the trace. The location of the code generated for a node using
- // a trivial trace is recorded in a label in the node so that gotos can be
- // generated to that code.
- bool is_trivial() {
- return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
- characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
- quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
- }
- TriBool at_start() { return at_start_; }
- void set_at_start(TriBool at_start) { at_start_ = at_start; }
- Label* backtrack() { return backtrack_; }
- Label* loop_label() { return loop_label_; }
- RegExpNode* stop_node() { return stop_node_; }
- int characters_preloaded() { return characters_preloaded_; }
- int bound_checked_up_to() { return bound_checked_up_to_; }
- int flush_budget() { return flush_budget_; }
- QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
- bool mentions_reg(int reg);
- // Returns true if a deferred position store exists to the specified
- // register and stores the offset in the out-parameter. Otherwise
- // returns false.
- bool GetStoredPosition(int reg, int* cp_offset);
- // These set methods and AdvanceCurrentPositionInTrace should be used only on
- // new traces - the intention is that traces are immutable after creation.
- void add_action(DeferredAction* new_action) {
- DCHECK(new_action->next_ == nullptr);
- new_action->next_ = actions_;
- actions_ = new_action;
- }
- void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
- void set_stop_node(RegExpNode* node) { stop_node_ = node; }
- void set_loop_label(Label* label) { loop_label_ = label; }
- void set_characters_preloaded(int count) { characters_preloaded_ = count; }
- void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
- void set_flush_budget(int to) { flush_budget_ = to; }
- void set_quick_check_performed(QuickCheckDetails* d) {
- quick_check_performed_ = *d;
- }
- void InvalidateCurrentCharacter();
- void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
-
- private:
- int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
- void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
- const DynamicBitSet& affected_registers,
- DynamicBitSet* registers_to_pop,
- DynamicBitSet* registers_to_clear, Zone* zone);
- void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
- const DynamicBitSet& registers_to_pop,
- const DynamicBitSet& registers_to_clear);
- int cp_offset_;
- DeferredAction* actions_;
- Label* backtrack_;
- RegExpNode* stop_node_;
- Label* loop_label_;
- int characters_preloaded_;
- int bound_checked_up_to_;
- QuickCheckDetails quick_check_performed_;
- int flush_budget_;
- TriBool at_start_;
-};
-
-class GreedyLoopState {
- public:
- explicit GreedyLoopState(bool not_at_start);
-
- Label* label() { return &label_; }
- Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
-
- private:
- Label label_;
- Trace counter_backtrack_trace_;
-};
-
-struct PreloadState {
- static const int kEatsAtLeastNotYetInitialized = -1;
- bool preload_is_current_;
- bool preload_has_checked_bounds_;
- int preload_characters_;
- int eats_at_least_;
- void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
-};
-
-// Analysis performs assertion propagation and computes eats_at_least_ values.
-// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
-// details.
-RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);
-
-class FrequencyCollator {
- public:
- FrequencyCollator() : total_samples_(0) {
- for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
- frequencies_[i] = CharacterFrequency(i);
- }
- }
-
- void CountCharacter(int character) {
- int index = (character & RegExpMacroAssembler::kTableMask);
- frequencies_[index].Increment();
- total_samples_++;
- }
-
- // Does not measure in percent, but rather per-128 (the table size from the
- // regexp macro assembler).
- int Frequency(int in_character) {
- DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
- if (total_samples_ < 1) return 1; // Division by zero.
- int freq_in_per128 =
- (frequencies_[in_character].counter() * 128) / total_samples_;
- return freq_in_per128;
- }
-
- private:
- class CharacterFrequency {
- public:
- CharacterFrequency() : counter_(0), character_(-1) {}
- explicit CharacterFrequency(int character)
- : counter_(0), character_(character) {}
-
- void Increment() { counter_++; }
- int counter() { return counter_; }
- int character() { return character_; }
-
- private:
- int counter_;
- int character_;
- };
-
- private:
- CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
- int total_samples_;
-};
-
-class RegExpCompiler {
- public:
- RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
- bool is_one_byte);
-
- int AllocateRegister() {
- if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
- reg_exp_too_big_ = true;
- return next_register_;
- }
- return next_register_++;
- }
-
- // Lookarounds to match lone surrogates for unicode character class matches
- // are never nested. We can therefore reuse registers.
- int UnicodeLookaroundStackRegister() {
- if (unicode_lookaround_stack_register_ == kNoRegister) {
- unicode_lookaround_stack_register_ = AllocateRegister();
- }
- return unicode_lookaround_stack_register_;
- }
-
- int UnicodeLookaroundPositionRegister() {
- if (unicode_lookaround_position_register_ == kNoRegister) {
- unicode_lookaround_position_register_ = AllocateRegister();
- }
- return unicode_lookaround_position_register_;
- }
-
- struct CompilationResult final {
- explicit CompilationResult(RegExpError err) : error(err) {}
- CompilationResult(Object code, int registers)
- : code(code), num_registers(registers) {}
-
- static CompilationResult RegExpTooBig() {
- return CompilationResult(RegExpError::kTooLarge);
- }
-
- bool Succeeded() const { return error == RegExpError::kNone; }
-
- const RegExpError error = RegExpError::kNone;
- Object code;
- int num_registers = 0;
- };
-
- CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
- RegExpNode* start, int capture_count,
- Handle<String> pattern);
-
- // If the regexp matching starts within a surrogate pair, step back to the
- // lead surrogate and start matching from there.
- static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
- RegExpNode* on_success,
- JSRegExp::Flags flags);
-
- inline void AddWork(RegExpNode* node) {
- if (!node->on_work_list() && !node->label()->is_bound()) {
- node->set_on_work_list(true);
- work_list_->push_back(node);
- }
- }
-
- static const int kImplementationOffset = 0;
- static const int kNumberOfRegistersOffset = 0;
- static const int kCodeOffset = 1;
-
- RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
- EndNode* accept() { return accept_; }
-
- static const int kMaxRecursion = 100;
- inline int recursion_depth() { return recursion_depth_; }
- inline void IncrementRecursionDepth() { recursion_depth_++; }
- inline void DecrementRecursionDepth() { recursion_depth_--; }
-
- void SetRegExpTooBig() { reg_exp_too_big_ = true; }
-
- inline bool one_byte() { return one_byte_; }
- inline bool optimize() { return optimize_; }
- inline void set_optimize(bool value) { optimize_ = value; }
- inline bool limiting_recursion() { return limiting_recursion_; }
- inline void set_limiting_recursion(bool value) {
- limiting_recursion_ = value;
- }
- bool read_backward() { return read_backward_; }
- void set_read_backward(bool value) { read_backward_ = value; }
- FrequencyCollator* frequency_collator() { return &frequency_collator_; }
-
- int current_expansion_factor() { return current_expansion_factor_; }
- void set_current_expansion_factor(int value) {
- current_expansion_factor_ = value;
- }
-
- Isolate* isolate() const { return isolate_; }
- Zone* zone() const { return zone_; }
-
- static const int kNoRegister = -1;
-
- private:
- EndNode* accept_;
- int next_register_;
- int unicode_lookaround_stack_register_;
- int unicode_lookaround_position_register_;
- ZoneVector<RegExpNode*>* work_list_;
- int recursion_depth_;
- RegExpMacroAssembler* macro_assembler_;
- bool one_byte_;
- bool reg_exp_too_big_;
- bool limiting_recursion_;
- bool optimize_;
- bool read_backward_;
- int current_expansion_factor_;
- FrequencyCollator frequency_collator_;
- Isolate* isolate_;
- Zone* zone_;
-};
-
-// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
-class UnicodeRangeSplitter {
- public:
- V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);
-
- static constexpr int kInitialSize = 8;
- using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;
-
- const CharacterRangeVector* bmp() const { return &bmp_; }
- const CharacterRangeVector* lead_surrogates() const {
- return &lead_surrogates_;
- }
- const CharacterRangeVector* trail_surrogates() const {
- return &trail_surrogates_;
- }
- const CharacterRangeVector* non_bmp() const { return &non_bmp_; }
-
- private:
- void AddRange(CharacterRange range);
-
- CharacterRangeVector bmp_;
- CharacterRangeVector lead_surrogates_;
- CharacterRangeVector trail_surrogates_;
- CharacterRangeVector non_bmp_;
-};
-
-// We need to check for the following characters: 0x39C 0x3BC 0x178.
-// TODO(jgruber): Move to CharacterRange.
-bool RangeContainsLatin1Equivalents(CharacterRange range);
-
-} // namespace internal
-} // namespace v8
-
-#endif // V8_REGEXP_REGEXP_COMPILER_H_