diff --git a/lib/imbecile.rb b/lib/imbecile.rb index 87fb486..9524a48 100644 --- a/lib/imbecile.rb +++ b/lib/imbecile.rb @@ -1,6 +1,7 @@ require "erb" require "set" require_relative "imbecile/cli" +require_relative "imbecile/code_point_range" require_relative "imbecile/grammar" require_relative "imbecile/regex" require_relative "imbecile/regex/dfa" diff --git a/lib/imbecile/code_point_range.rb b/lib/imbecile/code_point_range.rb new file mode 100644 index 0000000..513a6d0 --- /dev/null +++ b/lib/imbecile/code_point_range.rb @@ -0,0 +1,82 @@ +module Imbecile + class CodePointRange + + MAX_CODE_POINT = 0xFFFFFFFF + + attr_reader :first + attr_reader :last + + include Comparable + + # Build a CodePointRange + def initialize(first, last = nil) + @first = first.ord + if last + @last = last.ord + if @last < @first + raise "Invalid CodePointRange: last code point must be > first code point" + end + else + @last = @first + end + end + + def <=>(other) + if self.first != other.first + @first <=> other.first + else + @last <=> other.last + end + end + + def include?(v) + if v.is_a?(CodePointRange) + @first <= v.first && v.last <= @last + else + @first <= v && v <= @last + end + end + + def size + @last - @first + 1 + end + + class << self + + def invert_ranges(code_point_ranges) + new_ranges = [] + last_cp = -1 + code_point_ranges.sort.each do |code_point_range| + if code_point_range.first > (last_cp + 1) + new_ranges << CodePointRange.new(last_cp + 1, code_point_range.first - 1) + last_cp = code_point_range.last + end + end + if last_cp < MAX_CODE_POINT + new_ranges << CodePointRange.new(last_cp + 1, MAX_CODE_POINT) + end + new_ranges + end + + def first_subrange(code_point_ranges) + code_point_ranges.sort.reduce do |result, code_point_range| + if code_point_range.include?(result.first) + if code_point_range.last < result.last + code_point_range + else + result + end + else + if code_point_range.first <= result.last + CodePointRange.new(result.first, code_point_range.first - 1) + else + result + end + end + end + end + + end + + end +end diff --git a/lib/imbecile/regex.rb b/lib/imbecile/regex.rb index f526b3a..999c194 100644 --- a/lib/imbecile/regex.rb +++ b/lib/imbecile/regex.rb @@ -83,13 +83,13 @@ module Imbecile ccu << parse_backslash elsif c == "-" && @pattern[0] != "]" begin_cu = ccu.last_unit - unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.range.size == 1 + unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.code_point_range.size == 1 raise Error.new("Character range must be between single characters") end if @pattern[0] == "\\" @pattern.slice!(0) end_cu = parse_backslash - unless end_cu.is_a?(CharacterRangeUnit) && end_cu.range.size == 1 + unless end_cu.is_a?(CharacterRangeUnit) && end_cu.code_point_range.size == 1 raise Error.new("Character range must be between single characters") end max_code_point = end_cu.code_point @@ -97,7 +97,7 @@ module Imbecile max_code_point = @pattern[0].ord @pattern.slice!(0) end - cru = CharacterRangeUnit.new(begin_cu.min_code_point, max_code_point) + cru = CharacterRangeUnit.new(begin_cu.first, max_code_point) ccu.replace_last!(cru) else ccu << CharacterRangeUnit.new(c) diff --git a/lib/imbecile/regex/nfa.rb b/lib/imbecile/regex/nfa.rb index 9abf5ec..b7a13ff 100644 --- a/lib/imbecile/regex/nfa.rb +++ b/lib/imbecile/regex/nfa.rb @@ -24,12 +24,10 @@ module Imbecile def nil_transition_states states = Set[self] analyze_state = lambda do |state| - state.transitions.each do |range, dest_state| - if range.nil? - unless states.include?(dest_state) - states << dest_state - analyze_state[dest_state] - end + state.nil_transitions.each do |range, dest_state| + unless states.include?(dest_state) + states << dest_state + analyze_state[dest_state] end end end @@ -37,6 +35,18 @@ module Imbecile states end + def nil_transitions + @transitions.select do |code_point, dest_state| + code_point.nil? + end + end + + def cp_transitions + @transitions.select do |code_point, dest_state| + code_point + end + end + end attr_accessor :start_state @@ -69,13 +79,13 @@ module Imbecile visit = lambda do |state| accepts_s = state.accepts ? " *" : "" rv += "#{state_id[state]}#{accepts_s}:\n" - state.transitions.each do |range, dest_state| - if range.nil? + state.transitions.each do |code_point_range, dest_state| + if code_point_range.nil? range_s = "nil" else - range_s = chr[range.first] - if range.size > 1 - range_s += "-" + chr[range.last] + range_s = chr[code_point_range.first] + if code_point_range.size > 1 + range_s += "-" + chr[code_point_range.last] end end accepts_s = dest_state.accepts ? " *" : "" diff --git a/lib/imbecile/regex/unit.rb b/lib/imbecile/regex/unit.rb index c793234..28c5c70 100644 --- a/lib/imbecile/regex/unit.rb +++ b/lib/imbecile/regex/unit.rb @@ -68,18 +68,19 @@ module Imbecile end class CharacterRangeUnit < Unit - attr_accessor :min_code_point - attr_accessor :max_code_point + attr_reader :code_point_range def initialize(c1, c2 = nil) - @min_code_point = c1.ord - @max_code_point = c2 ? c2.ord : @min_code_point + @code_point_range = CodePointRange.new(c1, c2) end - def range - @min_code_point..@max_code_point + def first + @code_point_range.first + end + def last + @code_point_range.last end def to_nfa nfa = NFA.new - nfa.start_state.add_transition(range, nfa.end_state) + nfa.start_state.add_transition(@code_point_range, nfa.end_state) nfa end end @@ -108,32 +109,16 @@ module Imbecile if @units.empty? nfa.start_state.add_transition(nil, nfa.end_state) else - ranges = @units.map(&:range) + code_point_ranges = @units.map(&:code_point_range) if @negate - ranges = negate_ranges(ranges) + code_point_ranges = CodePointRange.invert_ranges(code_point_ranges) end - ranges.each do |range| - nfa.start_state.add_transition(range, nfa.end_state) + code_point_ranges.each do |code_point_range| + nfa.start_state.add_transition(code_point_range, nfa.end_state) end end nfa end - private - def negate_ranges(ranges) - ranges = ranges.sort_by(&:first) - new_ranges = [] - last_cp = -1 - ranges.each do |range| - if range.first > (last_cp + 1) - new_ranges << ((last_cp + 1)..(range.first - 1)) - last_cp = range.last - end - end - if last_cp < 0xFFFFFFFF - new_ranges << ((last_cp + 1)..0xFFFFFFFF) - end - new_ranges - end end class MultiplicityUnit < Unit diff --git a/spec/imbecile/regex_spec.rb b/spec/imbecile/regex_spec.rb index 3be355e..caeb533 100644 --- a/spec/imbecile/regex_spec.rb +++ b/spec/imbecile/regex_spec.rb @@ -114,7 +114,7 @@ module Imbecile expect(m_unit.min_count).to eq 5 expect(m_unit.max_count).to eq 8 expect(m_unit.unit).to be_a Regex::CharacterRangeUnit - expect(m_unit.unit.range.first).to eq "a".ord + expect(m_unit.unit.first).to eq "a".ord end it "parses an escaped *" do @@ -125,9 +125,9 @@ module Imbecile seq_unit = regex.unit.alternates[0] expect(seq_unit.size).to eq 2 expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit - expect(seq_unit[0].min_code_point).to eq "a".ord + expect(seq_unit[0].first).to eq "a".ord expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit - expect(seq_unit[1].min_code_point).to eq "*".ord + expect(seq_unit[1].first).to eq "*".ord end it "parses an escaped +" do @@ -138,9 +138,9 @@ module Imbecile seq_unit = regex.unit.alternates[0] expect(seq_unit.size).to eq 2 expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit - expect(seq_unit[0].min_code_point).to eq "a".ord + expect(seq_unit[0].first).to eq "a".ord expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit - expect(seq_unit[1].min_code_point).to eq "+".ord + expect(seq_unit[1].first).to eq "+".ord end it "parses an escaped \\" do @@ -151,9 +151,9 @@ module Imbecile seq_unit = regex.unit.alternates[0] expect(seq_unit.size).to eq 2 expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit - expect(seq_unit[0].min_code_point).to eq "\\".ord + expect(seq_unit[0].first).to eq "\\".ord expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit - expect(seq_unit[1].min_code_point).to eq "d".ord + expect(seq_unit[1].first).to eq "d".ord end it "parses a character class" do @@ -168,10 +168,10 @@ module Imbecile expect(ccu.negate).to be_falsey expect(ccu.size).to eq 2 expect(ccu[0]).to be_a Regex::CharacterRangeUnit - expect(ccu[0].min_code_point).to eq "a".ord - expect(ccu[0].max_code_point).to eq "z".ord + expect(ccu[0].first).to eq "a".ord + expect(ccu[0].last).to eq "z".ord expect(ccu[1]).to be_a Regex::CharacterRangeUnit - expect(ccu[1].min_code_point).to eq "_".ord + expect(ccu[1].first).to eq "_".ord end it "parses a negated character class" do @@ -186,7 +186,7 @@ module Imbecile expect(ccu.negate).to be_truthy expect(ccu.size).to eq 3 expect(ccu[0]).to be_a Regex::CharacterRangeUnit - expect(ccu[0].min_code_point).to eq "x".ord + expect(ccu[0].first).to eq "x".ord end it "parses - as a plain character at beginning of a character class" do @@ -200,7 +200,7 @@ module Imbecile ccu = seq_unit[0] expect(ccu.size).to eq 2 expect(ccu[0]).to be_a Regex::CharacterRangeUnit - expect(ccu[0].min_code_point).to eq "-".ord + expect(ccu[0].first).to eq "-".ord end it "parses - as a plain character at end of a character class" do @@ -214,9 +214,9 @@ module Imbecile ccu = seq_unit[0] expect(ccu.size).to eq 2 expect(ccu[0]).to be_a Regex::CharacterRangeUnit - expect(ccu[0].min_code_point).to eq "0".ord + expect(ccu[0].first).to eq "0".ord expect(ccu[1]).to be_a Regex::CharacterRangeUnit - expect(ccu[1].min_code_point).to eq "-".ord + expect(ccu[1].first).to eq "-".ord end it "parses - as a plain character at beginning of a negated character class" do @@ -231,7 +231,7 @@ module Imbecile expect(ccu.negate).to be_truthy expect(ccu.size).to eq 2 expect(ccu[0]).to be_a Regex::CharacterRangeUnit - expect(ccu[0].min_code_point).to eq "-".ord + expect(ccu[0].first).to eq "-".ord end it "parses . as a plain character in a character class" do @@ -246,7 +246,7 @@ module Imbecile expect(ccu.negate).to be_falsey expect(ccu.size).to eq 1 expect(ccu[0]).to be_a Regex::CharacterRangeUnit - expect(ccu[0].min_code_point).to eq ".".ord + expect(ccu[0].first).to eq ".".ord end it "parses - as a plain character when escaped in middle of character class" do @@ -261,11 +261,11 @@ module Imbecile expect(ccu.negate).to be_falsey expect(ccu.size).to eq 3 expect(ccu[0]).to be_a Regex::CharacterRangeUnit - expect(ccu[0].min_code_point).to eq "0".ord + expect(ccu[0].first).to eq "0".ord expect(ccu[1]).to be_a Regex::CharacterRangeUnit - expect(ccu[1].min_code_point).to eq "-".ord + expect(ccu[1].first).to eq "-".ord expect(ccu[2]).to be_a Regex::CharacterRangeUnit - expect(ccu[2].min_code_point).to eq "9".ord + expect(ccu[2].first).to eq "9".ord end it "parses alternates" do @@ -314,9 +314,9 @@ module Imbecile expect(regex.unit.alternates[2]).to be_a Regex::SequenceUnit expect(regex.unit.alternates[2].size).to eq 2 expect(regex.unit.alternates[2][0]).to be_a Regex::CharacterRangeUnit - expect(regex.unit.alternates[2][0].min_code_point).to eq "|".ord + expect(regex.unit.alternates[2][0].first).to eq "|".ord expect(regex.unit.alternates[2][1]).to be_a Regex::CharacterRangeUnit - expect(regex.unit.alternates[2][1].min_code_point).to eq "v".ord + expect(regex.unit.alternates[2][1].first).to eq "v".ord expect(regex.unit.alternates[3]).to be_a Regex::SequenceUnit expect(regex.unit.alternates[3].size).to eq 1 expect(regex.unit.alternates[3][0]).to be_a Regex::MultiplicityUnit @@ -325,8 +325,8 @@ module Imbecile expect(regex.unit.alternates[3][0].unit).to be_a Regex::CharacterClassUnit expect(regex.unit.alternates[3][0].unit.size).to eq 1 expect(regex.unit.alternates[3][0].unit[0]).to be_a Regex::CharacterRangeUnit - expect(regex.unit.alternates[3][0].unit[0].min_code_point).to eq "x".ord - expect(regex.unit.alternates[3][0].unit[0].max_code_point).to eq "y".ord + expect(regex.unit.alternates[3][0].unit[0].first).to eq "x".ord + expect(regex.unit.alternates[3][0].unit[0].last).to eq "y".ord end end