From 70b3e56de2e67cce36eae97d907ddf0360198faf Mon Sep 17 00:00:00 2001
From: Josh Holtrop
Date: Fri, 14 May 2021 13:52:03 -0400
Subject: [PATCH] Store all characters as ranges; add CharacterClassUnit#to_nfa

---
Review notes (amended patch; everything between the "---" line and the first
"diff --git" line is ignored when the patch is applied):

* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of the patch. Leading whitespace is a best guess -- TODO confirm
  against the tree, or apply with `git apply --ignore-whitespace`.
* CharacterClassUnit#to_nfa tested `unit.negate`; no `unit` is defined in the
  visible part of the class, while the parser and the spec both use the
  class's own `negate` accessor. It also fell off the end without returning
  the NFA, unlike the other #to_nfa methods in this file.
* CharacterClassUnit#negate_ranges only advanced `last_cp` when a gap had been
  emitted, so a range starting at 0, or abutting/overlapping the previous
  range, ended up inside the negated set (e.g. [0..5, 10..20] produced 0..9).
* The character-class parsing code still called `end_cu.code_point`, an
  accessor that disappears together with CharacterUnit; it now reads
  `min_code_point`, matching the `begin_cu` change a few lines below.
* The two neighbouring hunks around that line were merged; the single `else`
  context line joining them is inferred from the surrounding code -- confirm.
* Hunk offsets/counts and the diffstat were recomputed for the added lines.
  The `index` blob ids below still describe the original commit.

 lib/imbecile/regex/parser.rb       | 74 ++++++++++++++++-------
 spec/imbecile/regex/parser_spec.rb | 94 +++++++++++++++---------------
 2 files changed, 98 insertions(+), 70 deletions(-)

diff --git a/lib/imbecile/regex/parser.rb b/lib/imbecile/regex/parser.rb
index 03b0d24..d730a56 100644
--- a/lib/imbecile/regex/parser.rb
+++ b/lib/imbecile/regex/parser.rb
@@ -65,28 +65,19 @@ module Imbecile
         end
       end

-      class CharacterUnit < Unit
-        attr_accessor :code_point
-        def initialize(c)
-          @code_point = c.ord
-        end
-        def to_nfa
-          nfa = NFA.new
-          nfa.start_state.add_transition(@code_point, nfa.end_state)
-          nfa
-        end
-      end
-
       class CharacterRangeUnit < Unit
         attr_accessor :min_code_point
         attr_accessor :max_code_point
-        def initialize(c1, c2)
+        def initialize(c1, c2 = nil)
           @min_code_point = c1.ord
-          @max_code_point = c2.ord
+          @max_code_point = c2 ? c2.ord : @min_code_point
+        end
+        def range
+          @min_code_point..@max_code_point
         end
         def to_nfa
           nfa = NFA.new
-          nfa.start_state.add_transition((@min_code_point..@max_code_point), nfa.end_state)
+          nfa.start_state.add_transition(range, nfa.end_state)
           nfa
         end
       end
@@ -110,6 +101,43 @@ module Imbecile
         def replace_last!(new_unit)
           @units[-1] = new_unit
         end
+        # Build an NFA accepting any single code point matched by this class.
+        # With no units, a single nil-labelled transition joins start to end.
+        def to_nfa
+          nfa = NFA.new
+          if @units.empty?
+            nfa.start_state.add_transition(nil, nfa.end_state)
+          else
+            ranges = @units.map(&:range)
+            if negate
+              ranges = negate_ranges(ranges)
+            end
+            ranges.each do |range|
+              nfa.start_state.add_transition(range, nfa.end_state)
+            end
+          end
+          nfa
+        end
+        private
+        # Return the ranges covering every code point in 0..0xFFFFFFFF that
+        # is not covered by any of the given ranges.
+        def negate_ranges(ranges)
+          ranges = ranges.sort_by(&:first)
+          new_ranges = []
+          last_cp = -1
+          ranges.each do |range|
+            if range.first > (last_cp + 1)
+              new_ranges << ((last_cp + 1)..(range.first - 1))
+            end
+            # Advance past every range, even one that abuts or overlaps the
+            # previous range, so covered code points are never emitted.
+            last_cp = [last_cp, range.last].max
+          end
+          if last_cp < 0xFFFFFFFF
+            new_ranges << ((last_cp + 1)..0xFFFFFFFF)
+          end
+          new_ranges
+        end
       end

       class MultiplicityUnit < Unit
@@ -194,7 +222,7 @@ module Imbecile
           when "\\"
             au << parse_backslash
           else
-            au << CharacterUnit.new(c)
+            au << CharacterRangeUnit.new(c)
           end
         end
         au
@@ -222,29 +250,29 @@ module Imbecile
           elsif c == "^" && index == 0
             ccu.negate = true
           elsif c == "-" && (ccu.size == 0 || @pattern[0] == "]")
-            ccu << CharacterUnit.new(c)
+            ccu << CharacterRangeUnit.new(c)
           elsif c == "\\"
             ccu << parse_backslash
           elsif c == "-" && @pattern[0] != "]"
             begin_cu = ccu.last_unit
-            unless begin_cu.is_a?(CharacterUnit)
+            unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.range.size == 1
               raise Error.new("Character range must be between single characters")
             end
             if @pattern[0] == "\\"
               @pattern.slice!(0)
               end_cu = parse_backslash
-              unless end_cu.is_a?(CharacterUnit)
+              unless end_cu.is_a?(CharacterRangeUnit) && end_cu.range.size == 1
                 raise Error.new("Character range must be between single characters")
               end
-              max_code_point = end_cu.code_point
+              max_code_point = end_cu.min_code_point
             else
               max_code_point = @pattern[0].ord
               @pattern.slice!(0)
             end
-            cru = CharacterRangeUnit.new(begin_cu.code_point, max_code_point)
+            cru = CharacterRangeUnit.new(begin_cu.min_code_point, max_code_point)
             ccu.replace_last!(cru)
           else
-            ccu << CharacterUnit.new(c)
+            ccu << CharacterRangeUnit.new(c)
           end
           index += 1
         end
@@ -281,7 +309,7 @@ module Imbecile
           when "d"
             CharacterRangeUnit.new("0", "9")
           else
-            CharacterUnit.new(c)
+            CharacterRangeUnit.new(c)
           end
         end
       end
diff --git a/spec/imbecile/regex/parser_spec.rb b/spec/imbecile/regex/parser_spec.rb
index cc5febb..3976250 100644
--- a/spec/imbecile/regex/parser_spec.rb
+++ b/spec/imbecile/regex/parser_spec.rb
@@ -16,7 +16,7 @@ module Imbecile
         expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit
         seq_unit = parser.unit.alternates[0]
         expect(seq_unit.size).to eq 1
-        expect(seq_unit[0]).to be_a Parser::CharacterUnit
+        expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
       end

       it "parses a group with a single character unit expression" do
@@ -30,7 +30,7 @@ module Imbecile
         alt_unit = seq_unit[0]
         expect(alt_unit.alternates.size).to eq 1
         expect(alt_unit.alternates[0]).to be_a Parser::SequenceUnit
-        expect(alt_unit.alternates[0][0]).to be_a Parser::CharacterUnit
+        expect(alt_unit.alternates[0][0]).to be_a Parser::CharacterRangeUnit
       end

       it "parses a *" do
@@ -44,7 +44,7 @@ module Imbecile
         m_unit = seq_unit[0]
         expect(m_unit.min_count).to eq 0
         expect(m_unit.max_count).to be_nil
-        expect(m_unit.unit).to be_a Parser::CharacterUnit
+        expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
       end

       it "parses a +" do
@@ -58,7 +58,7 @@ module Imbecile
         m_unit = seq_unit[0]
         expect(m_unit.min_count).to eq 1
         expect(m_unit.max_count).to be_nil
-        expect(m_unit.unit).to be_a Parser::CharacterUnit
+        expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
       end

       it "parses a ?" do
@@ -72,7 +72,7 @@ module Imbecile
         m_unit = seq_unit[0]
         expect(m_unit.min_count).to eq 0
         expect(m_unit.max_count).to eq 1
-        expect(m_unit.unit).to be_a Parser::CharacterUnit
+        expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
       end

       it "parses a multiplicity count" do
@@ -86,7 +86,7 @@ module Imbecile
         m_unit = seq_unit[0]
         expect(m_unit.min_count).to eq 5
         expect(m_unit.max_count).to eq 5
-        expect(m_unit.unit).to be_a Parser::CharacterUnit
+        expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
       end

       it "parses a minimum-only multiplicity count" do
@@ -100,7 +100,7 @@ module Imbecile
         m_unit = seq_unit[0]
         expect(m_unit.min_count).to eq 5
         expect(m_unit.max_count).to be_nil
-        expect(m_unit.unit).to be_a Parser::CharacterUnit
+        expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
       end

       it "parses a minimum and maximum multiplicity count" do
@@ -114,8 +114,8 @@ module Imbecile
         m_unit = seq_unit[0]
         expect(m_unit.min_count).to eq 5
         expect(m_unit.max_count).to eq 8
-        expect(m_unit.unit).to be_a Parser::CharacterUnit
-        expect(m_unit.unit.code_point).to eq "a".ord
+        expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
+        expect(m_unit.unit.range.first).to eq "a".ord
       end

       it "parses an escaped *" do
@@ -125,10 +125,10 @@ module Imbecile
         expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit
         seq_unit = parser.unit.alternates[0]
         expect(seq_unit.size).to eq 2
-        expect(seq_unit[0]).to be_a Parser::CharacterUnit
-        expect(seq_unit[0].code_point).to eq "a".ord
-        expect(seq_unit[1]).to be_a Parser::CharacterUnit
-        expect(seq_unit[1].code_point).to eq "*".ord
+        expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
+        expect(seq_unit[0].min_code_point).to eq "a".ord
+        expect(seq_unit[1]).to be_a Parser::CharacterRangeUnit
+        expect(seq_unit[1].min_code_point).to eq "*".ord
       end

       it "parses an escaped +" do
@@ -138,10 +138,10 @@ module Imbecile
         expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit
         seq_unit = parser.unit.alternates[0]
         expect(seq_unit.size).to eq 2
-        expect(seq_unit[0]).to be_a Parser::CharacterUnit
-        expect(seq_unit[0].code_point).to eq "a".ord
-        expect(seq_unit[1]).to be_a Parser::CharacterUnit
-        expect(seq_unit[1].code_point).to eq "+".ord
+        expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
+        expect(seq_unit[0].min_code_point).to eq "a".ord
+        expect(seq_unit[1]).to be_a Parser::CharacterRangeUnit
+        expect(seq_unit[1].min_code_point).to eq "+".ord
       end

       it "parses an escaped \\" do
@@ -151,10 +151,10 @@ module Imbecile
         expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit
         seq_unit = parser.unit.alternates[0]
         expect(seq_unit.size).to eq 2
-        expect(seq_unit[0]).to be_a Parser::CharacterUnit
-        expect(seq_unit[0].code_point).to eq "\\".ord
-        expect(seq_unit[1]).to be_a Parser::CharacterUnit
-        expect(seq_unit[1].code_point).to eq "d".ord
+        expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
+        expect(seq_unit[0].min_code_point).to eq "\\".ord
+        expect(seq_unit[1]).to be_a Parser::CharacterRangeUnit
+        expect(seq_unit[1].min_code_point).to eq "d".ord
       end

       it "parses a character class" do
@@ -171,8 +171,8 @@ module Imbecile
         expect(ccu[0]).to be_a Parser::CharacterRangeUnit
         expect(ccu[0].min_code_point).to eq "a".ord
         expect(ccu[0].max_code_point).to eq "z".ord
-        expect(ccu[1]).to be_a Parser::CharacterUnit
-        expect(ccu[1].code_point).to eq "_".ord
+        expect(ccu[1]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[1].min_code_point).to eq "_".ord
       end

       it "parses a negated character class" do
@@ -186,8 +186,8 @@ module Imbecile
         ccu = seq_unit[0]
         expect(ccu.negate).to be_truthy
         expect(ccu.size).to eq 3
-        expect(ccu[0]).to be_a Parser::CharacterUnit
-        expect(ccu[0].code_point).to eq "x".ord
+        expect(ccu[0]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[0].min_code_point).to eq "x".ord
       end

       it "parses - as a plain character at beginning of a character class" do
@@ -200,8 +200,8 @@ module Imbecile
         expect(seq_unit[0]).to be_a Parser::CharacterClassUnit
         ccu = seq_unit[0]
         expect(ccu.size).to eq 2
-        expect(ccu[0]).to be_a Parser::CharacterUnit
-        expect(ccu[0].code_point).to eq "-".ord
+        expect(ccu[0]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[0].min_code_point).to eq "-".ord
       end

       it "parses - as a plain character at end of a character class" do
@@ -214,10 +214,10 @@ module Imbecile
         expect(seq_unit[0]).to be_a Parser::CharacterClassUnit
         ccu = seq_unit[0]
         expect(ccu.size).to eq 2
-        expect(ccu[0]).to be_a Parser::CharacterUnit
-        expect(ccu[0].code_point).to eq "0".ord
-        expect(ccu[1]).to be_a Parser::CharacterUnit
-        expect(ccu[1].code_point).to eq "-".ord
+        expect(ccu[0]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[0].min_code_point).to eq "0".ord
+        expect(ccu[1]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[1].min_code_point).to eq "-".ord
       end

       it "parses - as a plain character at beginning of a negated character class" do
@@ -231,8 +231,8 @@ module Imbecile
         ccu = seq_unit[0]
         expect(ccu.negate).to be_truthy
         expect(ccu.size).to eq 2
-        expect(ccu[0]).to be_a Parser::CharacterUnit
-        expect(ccu[0].code_point).to eq "-".ord
+        expect(ccu[0]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[0].min_code_point).to eq "-".ord
       end

       it "parses . as a plain character in a negated character class" do
@@ -246,8 +246,8 @@ module Imbecile
         ccu = seq_unit[0]
         expect(ccu.negate).to be_falsey
         expect(ccu.size).to eq 1
-        expect(ccu[0]).to be_a Parser::CharacterUnit
-        expect(ccu[0].code_point).to eq ".".ord
+        expect(ccu[0]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[0].min_code_point).to eq ".".ord
       end

       it "parses - as a plain character when escaped in middle of character class" do
@@ -261,12 +261,12 @@ module Imbecile
         ccu = seq_unit[0]
         expect(ccu.negate).to be_falsey
         expect(ccu.size).to eq 3
-        expect(ccu[0]).to be_a Parser::CharacterUnit
-        expect(ccu[0].code_point).to eq "0".ord
-        expect(ccu[1]).to be_a Parser::CharacterUnit
-        expect(ccu[1].code_point).to eq "-".ord
-        expect(ccu[2]).to be_a Parser::CharacterUnit
-        expect(ccu[2].code_point).to eq "9".ord
+        expect(ccu[0]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[0].min_code_point).to eq "0".ord
+        expect(ccu[1]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[1].min_code_point).to eq "-".ord
+        expect(ccu[2]).to be_a Parser::CharacterRangeUnit
+        expect(ccu[2].min_code_point).to eq "9".ord
       end

       it "parses alternates" do
@@ -292,7 +292,7 @@ module Imbecile
         expect(parser.unit.alternates[0][0].unit.alternates.size).to eq 2
         expect(parser.unit.alternates[0][0].unit.alternates[0]).to be_a Parser::SequenceUnit
         expect(parser.unit.alternates[0][0].unit.alternates[0].size).to eq 1
-        expect(parser.unit.alternates[0][0].unit.alternates[0][0]).to be_a Parser::CharacterUnit
+        expect(parser.unit.alternates[0][0].unit.alternates[0][0]).to be_a Parser::CharacterRangeUnit
         expect(parser.unit.alternates[0][0].unit.alternates[1]).to be_a Parser::SequenceUnit
         expect(parser.unit.alternates[0][0].unit.alternates[1].size).to eq 0
         expect(parser.unit.alternates[1]).to be_a Parser::SequenceUnit
@@ -300,13 +300,13 @@ module Imbecile
         expect(parser.unit.alternates[1][0]).to be_a Parser::CharacterClassUnit
         expect(parser.unit.alternates[1][0].negate).to be_truthy
         expect(parser.unit.alternates[1][0].size).to eq 1
-        expect(parser.unit.alternates[1][0][0]).to be_a Parser::CharacterUnit
+        expect(parser.unit.alternates[1][0][0]).to be_a Parser::CharacterRangeUnit
         expect(parser.unit.alternates[2]).to be_a Parser::SequenceUnit
         expect(parser.unit.alternates[2].size).to eq 2
-        expect(parser.unit.alternates[2][0]).to be_a Parser::CharacterUnit
-        expect(parser.unit.alternates[2][0].code_point).to eq "|".ord
-        expect(parser.unit.alternates[2][1]).to be_a Parser::CharacterUnit
-        expect(parser.unit.alternates[2][1].code_point).to eq "v".ord
+        expect(parser.unit.alternates[2][0]).to be_a Parser::CharacterRangeUnit
+        expect(parser.unit.alternates[2][0].min_code_point).to eq "|".ord
+        expect(parser.unit.alternates[2][1]).to be_a Parser::CharacterRangeUnit
+        expect(parser.unit.alternates[2][1].min_code_point).to eq "v".ord
         expect(parser.unit.alternates[3]).to be_a Parser::SequenceUnit
         expect(parser.unit.alternates[3].size).to eq 1
         expect(parser.unit.alternates[3][0]).to be_a Parser::MultiplicityUnit