Add CodePointRange class
This commit is contained in:
parent
3a1650906e
commit
3987f08cd7
@ -1,6 +1,7 @@
|
||||
require "erb"
|
||||
require "set"
|
||||
require_relative "imbecile/cli"
|
||||
require_relative "imbecile/code_point_range"
|
||||
require_relative "imbecile/grammar"
|
||||
require_relative "imbecile/regex"
|
||||
require_relative "imbecile/regex/dfa"
|
||||
|
82
lib/imbecile/code_point_range.rb
Normal file
82
lib/imbecile/code_point_range.rb
Normal file
@ -0,0 +1,82 @@
|
||||
module Imbecile

  # An inclusive range of code points, stored as two Integers.
  #
  # Instances are ordered by their first code point and then by their last
  # code point (see #<=>), which is the order the class-level helpers rely on
  # when they sort their input.
  class CodePointRange

    # Upper bound used by .invert_ranges for the last range it produces.
    MAX_CODE_POINT = 0xFFFFFFFF

    # @return [Integer] lowest code point in the range (inclusive)
    attr_reader :first

    # @return [Integer] highest code point in the range (inclusive)
    attr_reader :last

    include Comparable

    # Build a CodePointRange.
    #
    # Both arguments are converted with #ord, so either Integers or
    # single-character Strings may be given.
    #
    # @param first [#ord] lowest code point of the range
    # @param last [#ord, nil] highest code point; defaults to +first+, giving
    #   a range of size 1
    # @raise [RuntimeError] if +last+ is lower than +first+
    def initialize(first, last = nil)
      @first = first.ord
      if last
        @last = last.ord
        if @last < @first
          raise "Invalid CodePointRange: last code point must be >= first code point"
        end
      else
        @last = @first
      end
    end

    # Order ranges by first code point, breaking ties on last code point.
    #
    # @param other [CodePointRange]
    # @return [Integer, nil] -1, 0 or 1; nil when +other+ is not a
    #   CodePointRange, so Comparable can report the failed comparison itself
    def <=>(other)
      return nil unless other.is_a?(CodePointRange)
      if @first != other.first
        @first <=> other.first
      else
        @last <=> other.last
      end
    end

    # Test whether a code point, or a whole range, lies inside this range.
    #
    # @param v [Integer, CodePointRange]
    # @return [Boolean]
    def include?(v)
      if v.is_a?(CodePointRange)
        @first <= v.first && v.last <= @last
      else
        @first <= v && v <= @last
      end
    end

    # @return [Integer] number of code points covered (always >= 1)
    def size
      @last - @first + 1
    end

    class << self

      # Compute the complement of a set of ranges within 0..MAX_CODE_POINT.
      #
      # The input may be unsorted and may contain adjacent, overlapping or
      # nested ranges.
      #
      # @param code_point_ranges [Array<CodePointRange>]
      # @return [Array<CodePointRange>] ascending, non-overlapping ranges
      #   covering every code point not covered by the input
      def invert_ranges(code_point_ranges)
        new_ranges = []
        # Highest code point known to be covered so far.
        last_cp = -1
        code_point_ranges.sort.each do |code_point_range|
          if code_point_range.first > (last_cp + 1)
            new_ranges << CodePointRange.new(last_cp + 1, code_point_range.first - 1)
          end
          # Always advance, even when no gap preceded this range (it starts at
          # 0, or touches/overlaps the previous one); never move backwards for
          # a range nested inside an earlier, longer one.
          last_cp = code_point_range.last if code_point_range.last > last_cp
        end
        if last_cp < MAX_CODE_POINT
          new_ranges << CodePointRange.new(last_cp + 1, MAX_CODE_POINT)
        end
        new_ranges
      end

      # Find the leading sub-range of a collection of ranges.
      #
      # The result begins at the smallest first code point. It is cut short so
      # that it neither extends past the end of a range sharing that first
      # code point nor reaches the start of any range that begins later.
      #
      # @param code_point_ranges [Array<CodePointRange>]
      # @return [CodePointRange, nil] nil when the collection is empty
      def first_subrange(code_point_ranges)
        code_point_ranges.sort.reduce do |result, code_point_range|
          if code_point_range.include?(result.first)
            if code_point_range.last < result.last
              code_point_range
            else
              result
            end
          else
            if code_point_range.first <= result.last
              # A later range starts inside the current result: stop just
              # before it.
              CodePointRange.new(result.first, code_point_range.first - 1)
            else
              result
            end
          end
        end
      end

    end

  end

end
|
@ -83,13 +83,13 @@ module Imbecile
|
||||
ccu << parse_backslash
|
||||
elsif c == "-" && @pattern[0] != "]"
|
||||
begin_cu = ccu.last_unit
|
||||
unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.range.size == 1
|
||||
unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.code_point_range.size == 1
|
||||
raise Error.new("Character range must be between single characters")
|
||||
end
|
||||
if @pattern[0] == "\\"
|
||||
@pattern.slice!(0)
|
||||
end_cu = parse_backslash
|
||||
unless end_cu.is_a?(CharacterRangeUnit) && end_cu.range.size == 1
|
||||
unless end_cu.is_a?(CharacterRangeUnit) && end_cu.code_point_range.size == 1
|
||||
raise Error.new("Character range must be between single characters")
|
||||
end
|
||||
max_code_point = end_cu.code_point
|
||||
@ -97,7 +97,7 @@ module Imbecile
|
||||
max_code_point = @pattern[0].ord
|
||||
@pattern.slice!(0)
|
||||
end
|
||||
cru = CharacterRangeUnit.new(begin_cu.min_code_point, max_code_point)
|
||||
cru = CharacterRangeUnit.new(begin_cu.first, max_code_point)
|
||||
ccu.replace_last!(cru)
|
||||
else
|
||||
ccu << CharacterRangeUnit.new(c)
|
||||
|
@ -24,19 +24,29 @@ module Imbecile
|
||||
# Collect this state plus every state reachable from it by following only
# nil transitions, transitively.
#
# States are added in depth-first order of discovery, starting with self.
#
# @return [Set] self and all states reachable through nil transitions
def nil_transition_states
  states = Set[self]
  analyze_state = lambda do |state|
    state.nil_transitions.each do |_range, dest_state|
      # Set#add? returns nil for a state that was already collected, which
      # keeps cycles of nil transitions from recursing forever.
      analyze_state[dest_state] if states.add?(dest_state)
    end
  end
  analyze_state[self]
  states
end
|
||||
|
||||
# Transitions whose code point key is nil.
#
# @return the entries of @transitions, as returned by #select, whose first
#   element is nil
def nil_transitions
  @transitions.select { |code_point, _dest_state| code_point.nil? }
end
|
||||
|
||||
# Transitions whose code point key is present (truthy), i.e. everything that
# nil_transitions leaves out apart from a literal false key.
#
# @return the entries of @transitions, as returned by #select, whose first
#   element is truthy
def cp_transitions
  @transitions.select { |code_point, _dest_state| code_point }
end
|
||||
|
||||
end
|
||||
|
||||
attr_accessor :start_state
|
||||
@ -69,13 +79,13 @@ module Imbecile
|
||||
visit = lambda do |state|
|
||||
accepts_s = state.accepts ? " *" : ""
|
||||
rv += "#{state_id[state]}#{accepts_s}:\n"
|
||||
state.transitions.each do |range, dest_state|
|
||||
if range.nil?
|
||||
state.transitions.each do |code_point_range, dest_state|
|
||||
if code_point_range.nil?
|
||||
range_s = "nil"
|
||||
else
|
||||
range_s = chr[range.first]
|
||||
if range.size > 1
|
||||
range_s += "-" + chr[range.last]
|
||||
range_s = chr[code_point_range.first]
|
||||
if code_point_range.size > 1
|
||||
range_s += "-" + chr[code_point_range.last]
|
||||
end
|
||||
end
|
||||
accepts_s = dest_state.accepts ? " *" : ""
|
||||
|
@ -68,18 +68,19 @@ module Imbecile
|
||||
end
|
||||
|
||||
class CharacterRangeUnit < Unit
|
||||
attr_accessor :min_code_point
|
||||
attr_accessor :max_code_point
|
||||
attr_reader :code_point_range
|
||||
def initialize(c1, c2 = nil)
|
||||
@min_code_point = c1.ord
|
||||
@max_code_point = c2 ? c2.ord : @min_code_point
|
||||
@code_point_range = CodePointRange.new(c1, c2)
|
||||
end
|
||||
def range
|
||||
@min_code_point..@max_code_point
|
||||
def first
|
||||
@code_point_range.first
|
||||
end
|
||||
def last
|
||||
@code_point_range.last
|
||||
end
|
||||
def to_nfa
|
||||
nfa = NFA.new
|
||||
nfa.start_state.add_transition(range, nfa.end_state)
|
||||
nfa.start_state.add_transition(@code_point_range, nfa.end_state)
|
||||
nfa
|
||||
end
|
||||
end
|
||||
@ -108,32 +109,16 @@ module Imbecile
|
||||
if @units.empty?
|
||||
nfa.start_state.add_transition(nil, nfa.end_state)
|
||||
else
|
||||
ranges = @units.map(&:range)
|
||||
code_point_ranges = @units.map(&:code_point_range)
|
||||
if @negate
|
||||
ranges = negate_ranges(ranges)
|
||||
code_point_ranges = CodePointRange.invert_ranges(code_point_ranges)
|
||||
end
|
||||
ranges.each do |range|
|
||||
nfa.start_state.add_transition(range, nfa.end_state)
|
||||
code_point_ranges.each do |code_point_range|
|
||||
nfa.start_state.add_transition(code_point_range, nfa.end_state)
|
||||
end
|
||||
end
|
||||
nfa
|
||||
end
|
||||
private
|
||||
def negate_ranges(ranges)
|
||||
ranges = ranges.sort_by(&:first)
|
||||
new_ranges = []
|
||||
last_cp = -1
|
||||
ranges.each do |range|
|
||||
if range.first > (last_cp + 1)
|
||||
new_ranges << ((last_cp + 1)..(range.first - 1))
|
||||
last_cp = range.last
|
||||
end
|
||||
end
|
||||
if last_cp < 0xFFFFFFFF
|
||||
new_ranges << ((last_cp + 1)..0xFFFFFFFF)
|
||||
end
|
||||
new_ranges
|
||||
end
|
||||
end
|
||||
|
||||
class MultiplicityUnit < Unit
|
||||
|
@ -114,7 +114,7 @@ module Imbecile
|
||||
expect(m_unit.min_count).to eq 5
|
||||
expect(m_unit.max_count).to eq 8
|
||||
expect(m_unit.unit).to be_a Regex::CharacterRangeUnit
|
||||
expect(m_unit.unit.range.first).to eq "a".ord
|
||||
expect(m_unit.unit.first).to eq "a".ord
|
||||
end
|
||||
|
||||
it "parses an escaped *" do
|
||||
@ -125,9 +125,9 @@ module Imbecile
|
||||
seq_unit = regex.unit.alternates[0]
|
||||
expect(seq_unit.size).to eq 2
|
||||
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(seq_unit[0].min_code_point).to eq "a".ord
|
||||
expect(seq_unit[0].first).to eq "a".ord
|
||||
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
|
||||
expect(seq_unit[1].min_code_point).to eq "*".ord
|
||||
expect(seq_unit[1].first).to eq "*".ord
|
||||
end
|
||||
|
||||
it "parses an escaped +" do
|
||||
@ -138,9 +138,9 @@ module Imbecile
|
||||
seq_unit = regex.unit.alternates[0]
|
||||
expect(seq_unit.size).to eq 2
|
||||
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(seq_unit[0].min_code_point).to eq "a".ord
|
||||
expect(seq_unit[0].first).to eq "a".ord
|
||||
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
|
||||
expect(seq_unit[1].min_code_point).to eq "+".ord
|
||||
expect(seq_unit[1].first).to eq "+".ord
|
||||
end
|
||||
|
||||
it "parses an escaped \\" do
|
||||
@ -151,9 +151,9 @@ module Imbecile
|
||||
seq_unit = regex.unit.alternates[0]
|
||||
expect(seq_unit.size).to eq 2
|
||||
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(seq_unit[0].min_code_point).to eq "\\".ord
|
||||
expect(seq_unit[0].first).to eq "\\".ord
|
||||
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
|
||||
expect(seq_unit[1].min_code_point).to eq "d".ord
|
||||
expect(seq_unit[1].first).to eq "d".ord
|
||||
end
|
||||
|
||||
it "parses a character class" do
|
||||
@ -168,10 +168,10 @@ module Imbecile
|
||||
expect(ccu.negate).to be_falsey
|
||||
expect(ccu.size).to eq 2
|
||||
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[0].min_code_point).to eq "a".ord
|
||||
expect(ccu[0].max_code_point).to eq "z".ord
|
||||
expect(ccu[0].first).to eq "a".ord
|
||||
expect(ccu[0].last).to eq "z".ord
|
||||
expect(ccu[1]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[1].min_code_point).to eq "_".ord
|
||||
expect(ccu[1].first).to eq "_".ord
|
||||
end
|
||||
|
||||
it "parses a negated character class" do
|
||||
@ -186,7 +186,7 @@ module Imbecile
|
||||
expect(ccu.negate).to be_truthy
|
||||
expect(ccu.size).to eq 3
|
||||
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[0].min_code_point).to eq "x".ord
|
||||
expect(ccu[0].first).to eq "x".ord
|
||||
end
|
||||
|
||||
it "parses - as a plain character at beginning of a character class" do
|
||||
@ -200,7 +200,7 @@ module Imbecile
|
||||
ccu = seq_unit[0]
|
||||
expect(ccu.size).to eq 2
|
||||
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[0].min_code_point).to eq "-".ord
|
||||
expect(ccu[0].first).to eq "-".ord
|
||||
end
|
||||
|
||||
it "parses - as a plain character at end of a character class" do
|
||||
@ -214,9 +214,9 @@ module Imbecile
|
||||
ccu = seq_unit[0]
|
||||
expect(ccu.size).to eq 2
|
||||
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[0].min_code_point).to eq "0".ord
|
||||
expect(ccu[0].first).to eq "0".ord
|
||||
expect(ccu[1]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[1].min_code_point).to eq "-".ord
|
||||
expect(ccu[1].first).to eq "-".ord
|
||||
end
|
||||
|
||||
it "parses - as a plain character at beginning of a negated character class" do
|
||||
@ -231,7 +231,7 @@ module Imbecile
|
||||
expect(ccu.negate).to be_truthy
|
||||
expect(ccu.size).to eq 2
|
||||
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[0].min_code_point).to eq "-".ord
|
||||
expect(ccu[0].first).to eq "-".ord
|
||||
end
|
||||
|
||||
it "parses . as a plain character in a character class" do
|
||||
@ -246,7 +246,7 @@ module Imbecile
|
||||
expect(ccu.negate).to be_falsey
|
||||
expect(ccu.size).to eq 1
|
||||
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[0].min_code_point).to eq ".".ord
|
||||
expect(ccu[0].first).to eq ".".ord
|
||||
end
|
||||
|
||||
it "parses - as a plain character when escaped in middle of character class" do
|
||||
@ -261,11 +261,11 @@ module Imbecile
|
||||
expect(ccu.negate).to be_falsey
|
||||
expect(ccu.size).to eq 3
|
||||
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[0].min_code_point).to eq "0".ord
|
||||
expect(ccu[0].first).to eq "0".ord
|
||||
expect(ccu[1]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[1].min_code_point).to eq "-".ord
|
||||
expect(ccu[1].first).to eq "-".ord
|
||||
expect(ccu[2]).to be_a Regex::CharacterRangeUnit
|
||||
expect(ccu[2].min_code_point).to eq "9".ord
|
||||
expect(ccu[2].first).to eq "9".ord
|
||||
end
|
||||
|
||||
it "parses alternates" do
|
||||
@ -314,9 +314,9 @@ module Imbecile
|
||||
expect(regex.unit.alternates[2]).to be_a Regex::SequenceUnit
|
||||
expect(regex.unit.alternates[2].size).to eq 2
|
||||
expect(regex.unit.alternates[2][0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(regex.unit.alternates[2][0].min_code_point).to eq "|".ord
|
||||
expect(regex.unit.alternates[2][0].first).to eq "|".ord
|
||||
expect(regex.unit.alternates[2][1]).to be_a Regex::CharacterRangeUnit
|
||||
expect(regex.unit.alternates[2][1].min_code_point).to eq "v".ord
|
||||
expect(regex.unit.alternates[2][1].first).to eq "v".ord
|
||||
expect(regex.unit.alternates[3]).to be_a Regex::SequenceUnit
|
||||
expect(regex.unit.alternates[3].size).to eq 1
|
||||
expect(regex.unit.alternates[3][0]).to be_a Regex::MultiplicityUnit
|
||||
@ -325,8 +325,8 @@ module Imbecile
|
||||
expect(regex.unit.alternates[3][0].unit).to be_a Regex::CharacterClassUnit
|
||||
expect(regex.unit.alternates[3][0].unit.size).to eq 1
|
||||
expect(regex.unit.alternates[3][0].unit[0]).to be_a Regex::CharacterRangeUnit
|
||||
expect(regex.unit.alternates[3][0].unit[0].min_code_point).to eq "x".ord
|
||||
expect(regex.unit.alternates[3][0].unit[0].max_code_point).to eq "y".ord
|
||||
expect(regex.unit.alternates[3][0].unit[0].first).to eq "x".ord
|
||||
expect(regex.unit.alternates[3][0].unit[0].last).to eq "y".ord
|
||||
end
|
||||
|
||||
end
|
||||
|
Loading…
x
Reference in New Issue
Block a user