diff options
Diffstat (limited to 'js/src/new-regexp/regexp-compiler.h')
-rw-r--r-- | js/src/new-regexp/regexp-compiler.h | 621 |
1 files changed, 0 insertions, 621 deletions
diff --git a/js/src/new-regexp/regexp-compiler.h b/js/src/new-regexp/regexp-compiler.h deleted file mode 100644 index 186d5e838..000000000 --- a/js/src/new-regexp/regexp-compiler.h +++ /dev/null @@ -1,621 +0,0 @@ -// Copyright 2019 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef V8_REGEXP_REGEXP_COMPILER_H_ -#define V8_REGEXP_REGEXP_COMPILER_H_ - -#include <bitset> - -#include "new-regexp/regexp-nodes.h" - -namespace v8 { -namespace internal { - -class DynamicBitSet; -class Isolate; - -namespace regexp_compiler_constants { - -// The '2' variant is has inclusive from and exclusive to. -// This covers \s as defined in ECMA-262 5.1, 15.10.2.12, -// which include WhiteSpace (7.2) or LineTerminator (7.3) values. -constexpr uc32 kRangeEndMarker = 0x110000; -constexpr int kSpaceRanges[] = { - '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, - 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, - 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; -constexpr int kSpaceRangeCount = arraysize(kSpaceRanges); - -constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', - '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; -constexpr int kWordRangeCount = arraysize(kWordRanges); -constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; -constexpr int kDigitRangeCount = arraysize(kDigitRanges); -constexpr int kSurrogateRanges[] = {kLeadSurrogateStart, - kLeadSurrogateStart + 1, kRangeEndMarker}; -constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges); -constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, - 0x2028, 0x202A, kRangeEndMarker}; -constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges); - -// More makes code generation slower, less makes V8 benchmark score lower. -constexpr int kMaxLookaheadForBoyerMoore = 8; -// In a 3-character pattern you can maximally step forwards 3 characters -// at a time, which is not always enough to pay for the extra logic. -constexpr int kPatternTooShortForBoyerMoore = 2; - -} // namespace regexp_compiler_constants - -inline bool IgnoreCase(JSRegExp::Flags flags) { - return (flags & JSRegExp::kIgnoreCase) != 0; -} - -inline bool IsUnicode(JSRegExp::Flags flags) { - return (flags & JSRegExp::kUnicode) != 0; -} - -inline bool IsSticky(JSRegExp::Flags flags) { - return (flags & JSRegExp::kSticky) != 0; -} - -inline bool IsGlobal(JSRegExp::Flags flags) { - return (flags & JSRegExp::kGlobal) != 0; -} - -inline bool DotAll(JSRegExp::Flags flags) { - return (flags & JSRegExp::kDotAll) != 0; -} - -inline bool Multiline(JSRegExp::Flags flags) { - return (flags & JSRegExp::kMultiline) != 0; -} - -inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) { - // Both unicode and ignore_case flags are set. We need to use ICU to find - // the closure over case equivalents. - return IsUnicode(flags) && IgnoreCase(flags); -} - -// Details of a quick mask-compare check that can look ahead in the -// input stream. -class QuickCheckDetails { - public: - QuickCheckDetails() - : characters_(0), mask_(0), value_(0), cannot_match_(false) {} - explicit QuickCheckDetails(int characters) - : characters_(characters), mask_(0), value_(0), cannot_match_(false) {} - bool Rationalize(bool one_byte); - // Merge in the information from another branch of an alternation. - void Merge(QuickCheckDetails* other, int from_index); - // Advance the current position by some amount. - void Advance(int by, bool one_byte); - void Clear(); - bool cannot_match() { return cannot_match_; } - void set_cannot_match() { cannot_match_ = true; } - struct Position { - Position() : mask(0), value(0), determines_perfectly(false) {} - uc16 mask; - uc16 value; - bool determines_perfectly; - }; - int characters() { return characters_; } - void set_characters(int characters) { characters_ = characters; } - Position* positions(int index) { - DCHECK_LE(0, index); - DCHECK_GT(characters_, index); - return positions_ + index; - } - uint32_t mask() { return mask_; } - uint32_t value() { return value_; } - - private: - // How many characters do we have quick check information from. This is - // the same for all branches of a choice node. - int characters_; - Position positions_[4]; - // These values are the condensate of the above array after Rationalize(). - uint32_t mask_; - uint32_t value_; - // If set to true, there is no way this quick check can match at all. - // E.g., if it requires to be at the start of the input, and isn't. - bool cannot_match_; -}; - -// Improve the speed that we scan for an initial point where a non-anchored -// regexp can match by using a Boyer-Moore-like table. This is done by -// identifying non-greedy non-capturing loops in the nodes that eat any -// character one at a time. For example in the middle of the regexp -// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly -// inserted at the start of any non-anchored regexp. -// -// When we have found such a loop we look ahead in the nodes to find the set of -// characters that can come at given distances. For example for the regexp -// /.?foo/ we know that there are at least 3 characters ahead of us, and the -// sets of characters that can occur are [any, [f, o], [o]]. We find a range in -// the lookahead info where the set of characters is reasonably constrained. In -// our example this is from index 1 to 2 (0 is not constrained). We can now -// look 3 characters ahead and if we don't find one of [f, o] (the union of -// [f, o] and [o]) then we can skip forwards by the range size (in this case 2). -// -// For Unicode input strings we do the same, but modulo 128. -// -// We also look at the first string fed to the regexp and use that to get a hint -// of the character frequencies in the inputs. This affects the assessment of -// whether the set of characters is 'reasonably constrained'. -// -// We also have another lookahead mechanism (called quick check in the code), -// which uses a wide load of multiple characters followed by a mask and compare -// to determine whether a match is possible at this point. -enum ContainedInLattice { - kNotYet = 0, - kLatticeIn = 1, - kLatticeOut = 2, - kLatticeUnknown = 3 // Can also mean both in and out. -}; - -inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) { - return static_cast<ContainedInLattice>(a | b); -} - -class BoyerMoorePositionInfo : public ZoneObject { - public: - bool at(int i) const { return map_[i]; } - - static constexpr int kMapSize = 128; - static constexpr int kMask = kMapSize - 1; - - int map_count() const { return map_count_; } - - void Set(int character); - void SetInterval(const Interval& interval); - void SetAll(); - - bool is_non_word() { return w_ == kLatticeOut; } - bool is_word() { return w_ == kLatticeIn; } - - using Bitset = std::bitset<kMapSize>; - Bitset raw_bitset() const { return map_; } - - private: - Bitset map_; - int map_count_ = 0; // Number of set bits in the map. - ContainedInLattice w_ = kNotYet; // The \w character class. -}; - -class BoyerMooreLookahead : public ZoneObject { - public: - BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone); - - int length() { return length_; } - int max_char() { return max_char_; } - RegExpCompiler* compiler() { return compiler_; } - - int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); } - - BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); } - - void Set(int map_number, int character) { - if (character > max_char_) return; - BoyerMoorePositionInfo* info = bitmaps_->at(map_number); - info->Set(character); - } - - void SetInterval(int map_number, const Interval& interval) { - if (interval.from() > max_char_) return; - BoyerMoorePositionInfo* info = bitmaps_->at(map_number); - if (interval.to() > max_char_) { - info->SetInterval(Interval(interval.from(), max_char_)); - } else { - info->SetInterval(interval); - } - } - - void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); } - - void SetRest(int from_map) { - for (int i = from_map; i < length_; i++) SetAll(i); - } - void EmitSkipInstructions(RegExpMacroAssembler* masm); - - private: - // This is the value obtained by EatsAtLeast. If we do not have at least this - // many characters left in the sample string then the match is bound to fail. - // Therefore it is OK to read a character this far ahead of the current match - // point. - int length_; - RegExpCompiler* compiler_; - // 0xff for Latin1, 0xffff for UTF-16. - int max_char_; - ZoneList<BoyerMoorePositionInfo*>* bitmaps_; - - int GetSkipTable(int min_lookahead, int max_lookahead, - Handle<ByteArray> boolean_skip_table); - bool FindWorthwhileInterval(int* from, int* to); - int FindBestInterval(int max_number_of_chars, int old_biggest_points, - int* from, int* to); -}; - -// There are many ways to generate code for a node. This class encapsulates -// the current way we should be generating. In other words it encapsulates -// the current state of the code generator. The effect of this is that we -// generate code for paths that the matcher can take through the regular -// expression. A given node in the regexp can be code-generated several times -// as it can be part of several traces. For example for the regexp: -// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part -// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code -// to match foo is generated only once (the traces have a common prefix). The -// code to store the capture is deferred and generated (twice) after the places -// where baz has been matched. -class Trace { - public: - // A value for a property that is either known to be true, know to be false, - // or not known. - enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 }; - - class DeferredAction { - public: - DeferredAction(ActionNode::ActionType action_type, int reg) - : action_type_(action_type), reg_(reg), next_(nullptr) {} - DeferredAction* next() { return next_; } - bool Mentions(int reg); - int reg() { return reg_; } - ActionNode::ActionType action_type() { return action_type_; } - - private: - ActionNode::ActionType action_type_; - int reg_; - DeferredAction* next_; - friend class Trace; - }; - - class DeferredCapture : public DeferredAction { - public: - DeferredCapture(int reg, bool is_capture, Trace* trace) - : DeferredAction(ActionNode::STORE_POSITION, reg), - cp_offset_(trace->cp_offset()), - is_capture_(is_capture) {} - int cp_offset() { return cp_offset_; } - bool is_capture() { return is_capture_; } - - private: - int cp_offset_; - bool is_capture_; - void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } - }; - - class DeferredSetRegisterForLoop : public DeferredAction { - public: - DeferredSetRegisterForLoop(int reg, int value) - : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg), - value_(value) {} - int value() { return value_; } - - private: - int value_; - }; - - class DeferredClearCaptures : public DeferredAction { - public: - explicit DeferredClearCaptures(Interval range) - : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {} - Interval range() { return range_; } - - private: - Interval range_; - }; - - class DeferredIncrementRegister : public DeferredAction { - public: - explicit DeferredIncrementRegister(int reg) - : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {} - }; - - Trace() - : cp_offset_(0), - actions_(nullptr), - backtrack_(nullptr), - stop_node_(nullptr), - loop_label_(nullptr), - characters_preloaded_(0), - bound_checked_up_to_(0), - flush_budget_(100), - at_start_(UNKNOWN) {} - - // End the trace. This involves flushing the deferred actions in the trace - // and pushing a backtrack location onto the backtrack stack. Once this is - // done we can start a new trace or go to one that has already been - // generated. - void Flush(RegExpCompiler* compiler, RegExpNode* successor); - int cp_offset() { return cp_offset_; } - DeferredAction* actions() { return actions_; } - // A trivial trace is one that has no deferred actions or other state that - // affects the assumptions used when generating code. There is no recorded - // backtrack location in a trivial trace, so with a trivial trace we will - // generate code that, on a failure to match, gets the backtrack location - // from the backtrack stack rather than using a direct jump instruction. We - // always start code generation with a trivial trace and non-trivial traces - // are created as we emit code for nodes or add to the list of deferred - // actions in the trace. The location of the code generated for a node using - // a trivial trace is recorded in a label in the node so that gotos can be - // generated to that code. - bool is_trivial() { - return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 && - characters_preloaded_ == 0 && bound_checked_up_to_ == 0 && - quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN; - } - TriBool at_start() { return at_start_; } - void set_at_start(TriBool at_start) { at_start_ = at_start; } - Label* backtrack() { return backtrack_; } - Label* loop_label() { return loop_label_; } - RegExpNode* stop_node() { return stop_node_; } - int characters_preloaded() { return characters_preloaded_; } - int bound_checked_up_to() { return bound_checked_up_to_; } - int flush_budget() { return flush_budget_; } - QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; } - bool mentions_reg(int reg); - // Returns true if a deferred position store exists to the specified - // register and stores the offset in the out-parameter. Otherwise - // returns false. - bool GetStoredPosition(int reg, int* cp_offset); - // These set methods and AdvanceCurrentPositionInTrace should be used only on - // new traces - the intention is that traces are immutable after creation. - void add_action(DeferredAction* new_action) { - DCHECK(new_action->next_ == nullptr); - new_action->next_ = actions_; - actions_ = new_action; - } - void set_backtrack(Label* backtrack) { backtrack_ = backtrack; } - void set_stop_node(RegExpNode* node) { stop_node_ = node; } - void set_loop_label(Label* label) { loop_label_ = label; } - void set_characters_preloaded(int count) { characters_preloaded_ = count; } - void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; } - void set_flush_budget(int to) { flush_budget_ = to; } - void set_quick_check_performed(QuickCheckDetails* d) { - quick_check_performed_ = *d; - } - void InvalidateCurrentCharacter(); - void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler); - - private: - int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone); - void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register, - const DynamicBitSet& affected_registers, - DynamicBitSet* registers_to_pop, - DynamicBitSet* registers_to_clear, Zone* zone); - void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register, - const DynamicBitSet& registers_to_pop, - const DynamicBitSet& registers_to_clear); - int cp_offset_; - DeferredAction* actions_; - Label* backtrack_; - RegExpNode* stop_node_; - Label* loop_label_; - int characters_preloaded_; - int bound_checked_up_to_; - QuickCheckDetails quick_check_performed_; - int flush_budget_; - TriBool at_start_; -}; - -class GreedyLoopState { - public: - explicit GreedyLoopState(bool not_at_start); - - Label* label() { return &label_; } - Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; } - - private: - Label label_; - Trace counter_backtrack_trace_; -}; - -struct PreloadState { - static const int kEatsAtLeastNotYetInitialized = -1; - bool preload_is_current_; - bool preload_has_checked_bounds_; - int preload_characters_; - int eats_at_least_; - void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; } -}; - -// Analysis performs assertion propagation and computes eats_at_least_ values. -// See the comments on AssertionPropagator and EatsAtLeastPropagator for more -// details. -RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node); - -class FrequencyCollator { - public: - FrequencyCollator() : total_samples_(0) { - for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) { - frequencies_[i] = CharacterFrequency(i); - } - } - - void CountCharacter(int character) { - int index = (character & RegExpMacroAssembler::kTableMask); - frequencies_[index].Increment(); - total_samples_++; - } - - // Does not measure in percent, but rather per-128 (the table size from the - // regexp macro assembler). - int Frequency(int in_character) { - DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character); - if (total_samples_ < 1) return 1; // Division by zero. - int freq_in_per128 = - (frequencies_[in_character].counter() * 128) / total_samples_; - return freq_in_per128; - } - - private: - class CharacterFrequency { - public: - CharacterFrequency() : counter_(0), character_(-1) {} - explicit CharacterFrequency(int character) - : counter_(0), character_(character) {} - - void Increment() { counter_++; } - int counter() { return counter_; } - int character() { return character_; } - - private: - int counter_; - int character_; - }; - - private: - CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize]; - int total_samples_; -}; - -class RegExpCompiler { - public: - RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count, - bool is_one_byte); - - int AllocateRegister() { - if (next_register_ >= RegExpMacroAssembler::kMaxRegister) { - reg_exp_too_big_ = true; - return next_register_; - } - return next_register_++; - } - - // Lookarounds to match lone surrogates for unicode character class matches - // are never nested. We can therefore reuse registers. - int UnicodeLookaroundStackRegister() { - if (unicode_lookaround_stack_register_ == kNoRegister) { - unicode_lookaround_stack_register_ = AllocateRegister(); - } - return unicode_lookaround_stack_register_; - } - - int UnicodeLookaroundPositionRegister() { - if (unicode_lookaround_position_register_ == kNoRegister) { - unicode_lookaround_position_register_ = AllocateRegister(); - } - return unicode_lookaround_position_register_; - } - - struct CompilationResult final { - explicit CompilationResult(RegExpError err) : error(err) {} - CompilationResult(Object code, int registers) - : code(code), num_registers(registers) {} - - static CompilationResult RegExpTooBig() { - return CompilationResult(RegExpError::kTooLarge); - } - - bool Succeeded() const { return error == RegExpError::kNone; } - - const RegExpError error = RegExpError::kNone; - Object code; - int num_registers = 0; - }; - - CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler, - RegExpNode* start, int capture_count, - Handle<String> pattern); - - // If the regexp matching starts within a surrogate pair, step back to the - // lead surrogate and start matching from there. - static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, - RegExpNode* on_success, - JSRegExp::Flags flags); - - inline void AddWork(RegExpNode* node) { - if (!node->on_work_list() && !node->label()->is_bound()) { - node->set_on_work_list(true); - work_list_->push_back(node); - } - } - - static const int kImplementationOffset = 0; - static const int kNumberOfRegistersOffset = 0; - static const int kCodeOffset = 1; - - RegExpMacroAssembler* macro_assembler() { return macro_assembler_; } - EndNode* accept() { return accept_; } - - static const int kMaxRecursion = 100; - inline int recursion_depth() { return recursion_depth_; } - inline void IncrementRecursionDepth() { recursion_depth_++; } - inline void DecrementRecursionDepth() { recursion_depth_--; } - - void SetRegExpTooBig() { reg_exp_too_big_ = true; } - - inline bool one_byte() { return one_byte_; } - inline bool optimize() { return optimize_; } - inline void set_optimize(bool value) { optimize_ = value; } - inline bool limiting_recursion() { return limiting_recursion_; } - inline void set_limiting_recursion(bool value) { - limiting_recursion_ = value; - } - bool read_backward() { return read_backward_; } - void set_read_backward(bool value) { read_backward_ = value; } - FrequencyCollator* frequency_collator() { return &frequency_collator_; } - - int current_expansion_factor() { return current_expansion_factor_; } - void set_current_expansion_factor(int value) { - current_expansion_factor_ = value; - } - - Isolate* isolate() const { return isolate_; } - Zone* zone() const { return zone_; } - - static const int kNoRegister = -1; - - private: - EndNode* accept_; - int next_register_; - int unicode_lookaround_stack_register_; - int unicode_lookaround_position_register_; - ZoneVector<RegExpNode*>* work_list_; - int recursion_depth_; - RegExpMacroAssembler* macro_assembler_; - bool one_byte_; - bool reg_exp_too_big_; - bool limiting_recursion_; - bool optimize_; - bool read_backward_; - int current_expansion_factor_; - FrequencyCollator frequency_collator_; - Isolate* isolate_; - Zone* zone_; -}; - -// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. -class UnicodeRangeSplitter { - public: - V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base); - - static constexpr int kInitialSize = 8; - using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>; - - const CharacterRangeVector* bmp() const { return &bmp_; } - const CharacterRangeVector* lead_surrogates() const { - return &lead_surrogates_; - } - const CharacterRangeVector* trail_surrogates() const { - return &trail_surrogates_; - } - const CharacterRangeVector* non_bmp() const { return &non_bmp_; } - - private: - void AddRange(CharacterRange range); - - CharacterRangeVector bmp_; - CharacterRangeVector lead_surrogates_; - CharacterRangeVector trail_surrogates_; - CharacterRangeVector non_bmp_; -}; - -// We need to check for the following characters: 0x39C 0x3BC 0x178. -// TODO(jgruber): Move to CharacterRange. -bool RangeContainsLatin1Equivalents(CharacterRange range); - -} // namespace internal -} // namespace v8 - -#endif // V8_REGEXP_REGEXP_COMPILER_H_ |