Add CodePointRange class

This commit is contained in:
Josh Holtrop 2021-05-23 17:52:20 -04:00
parent 3a1650906e
commit 3987f08cd7
6 changed files with 142 additions and 64 deletions

View File

@ -1,6 +1,7 @@
require "erb" require "erb"
require "set" require "set"
require_relative "imbecile/cli" require_relative "imbecile/cli"
require_relative "imbecile/code_point_range"
require_relative "imbecile/grammar" require_relative "imbecile/grammar"
require_relative "imbecile/regex" require_relative "imbecile/regex"
require_relative "imbecile/regex/dfa" require_relative "imbecile/regex/dfa"

View File

@ -0,0 +1,82 @@
module Imbecile

  # An inclusive range of integer code points, from +first+ through +last+.
  #
  # Ranges are ordered by their first code point and then by their last code
  # point (see #<=>), which is the order the class-level helpers rely on
  # when they sort their input.
  class CodePointRange

    include Comparable

    # Upper bound used when inverting a set of ranges.
    MAX_CODE_POINT = 0xFFFFFFFF

    # First (lowest) code point in the range.
    attr_reader :first

    # Last (highest) code point in the range; equal to +first+ for a
    # single-code-point range.
    attr_reader :last

    # Build a CodePointRange.
    #
    # @param first [#ord]
    #   First code point. Anything responding to +ord+ is accepted (an
    #   Integer, or a String whose first character is used).
    # @param last [#ord, nil]
    #   Last code point. When omitted the range covers only +first+.
    #
    # @raise [RuntimeError] if +last+ is lower than +first+.
    def initialize(first, last = nil)
      @first = first.ord
      @last = last ? last.ord : @first
      if @last < @first
        raise "Invalid CodePointRange: last code point must be >= first code point"
      end
    end

    # Compare by first code point, then by last code point.
    #
    # @param other [Object]
    #
    # @return [Integer, nil]
    #   -1, 0 or 1; nil when +other+ is not a CodePointRange, so that the
    #   operators supplied by Comparable (notably ==) do not blow up with a
    #   NoMethodError when handed an unrelated object.
    def <=>(other)
      return nil unless other.is_a?(CodePointRange)
      if @first != other.first
        @first <=> other.first
      else
        @last <=> other.last
      end
    end

    # Check whether a code point or a whole range lies inside this range.
    #
    # @param v [CodePointRange, Integer]
    #
    # @return [Boolean]
    #   For a CodePointRange, true when it is entirely contained in this
    #   range; otherwise true when +v+ is between +first+ and +last+
    #   inclusive.
    def include?(v)
      if v.is_a?(CodePointRange)
        @first <= v.first && v.last <= @last
      else
        @first <= v && v <= @last
      end
    end

    # @return [Integer] Number of code points in the range (always >= 1).
    def size
      @last - @first + 1
    end

    class << self

      # Compute the complement of a set of ranges over 0..MAX_CODE_POINT.
      #
      # The input may be unsorted and may contain adjacent or overlapping
      # ranges.
      #
      # @param code_point_ranges [Array<CodePointRange>]
      #
      # @return [Array<CodePointRange>]
      #   Ascending, non-overlapping ranges covering every code point from 0
      #   through MAX_CODE_POINT that no input range includes.
      def invert_ranges(code_point_ranges)
        new_ranges = []
        # Highest code point covered by the ranges visited so far.
        last_cp = -1
        code_point_ranges.sort.each do |code_point_range|
          if code_point_range.first > (last_cp + 1)
            new_ranges << CodePointRange.new(last_cp + 1, code_point_range.first - 1)
          end
          # Always advance the cursor, even when no gap was emitted, and
          # never move it backwards: a range nested inside an earlier,
          # longer one must not re-expose code points already covered.
          last_cp = code_point_range.last if code_point_range.last > last_cp
        end
        if last_cp < MAX_CODE_POINT
          new_ranges << CodePointRange.new(last_cp + 1, MAX_CODE_POINT)
        end
        new_ranges
      end

      # Find the leading piece of a set of possibly overlapping ranges.
      #
      # The result starts at the lowest first code point present in the
      # input and stops before the earliest boundary (a range ending, or
      # another range beginning) found among the input ranges.
      #
      # @param code_point_ranges [Array<CodePointRange>]
      #
      # @return [CodePointRange, nil]
      #   The leading sub-range, or nil when the input is empty.
      def first_subrange(code_point_ranges)
        code_point_ranges.sort.reduce do |result, code_point_range|
          if code_point_range.include?(result.first)
            # Both candidates contain the starting code point: keep
            # whichever ends first.
            if code_point_range.last < result.last
              code_point_range
            else
              result
            end
          elsif code_point_range.first <= result.last
            # A later range begins inside the candidate: cut the candidate
            # off just before that range starts.
            CodePointRange.new(result.first, code_point_range.first - 1)
          else
            result
          end
        end
      end

    end

  end

end

View File

@ -83,13 +83,13 @@ module Imbecile
ccu << parse_backslash ccu << parse_backslash
elsif c == "-" && @pattern[0] != "]" elsif c == "-" && @pattern[0] != "]"
begin_cu = ccu.last_unit begin_cu = ccu.last_unit
unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.range.size == 1 unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.code_point_range.size == 1
raise Error.new("Character range must be between single characters") raise Error.new("Character range must be between single characters")
end end
if @pattern[0] == "\\" if @pattern[0] == "\\"
@pattern.slice!(0) @pattern.slice!(0)
end_cu = parse_backslash end_cu = parse_backslash
unless end_cu.is_a?(CharacterRangeUnit) && end_cu.range.size == 1 unless end_cu.is_a?(CharacterRangeUnit) && end_cu.code_point_range.size == 1
raise Error.new("Character range must be between single characters") raise Error.new("Character range must be between single characters")
end end
max_code_point = end_cu.code_point max_code_point = end_cu.code_point
@ -97,7 +97,7 @@ module Imbecile
max_code_point = @pattern[0].ord max_code_point = @pattern[0].ord
@pattern.slice!(0) @pattern.slice!(0)
end end
cru = CharacterRangeUnit.new(begin_cu.min_code_point, max_code_point) cru = CharacterRangeUnit.new(begin_cu.first, max_code_point)
ccu.replace_last!(cru) ccu.replace_last!(cru)
else else
ccu << CharacterRangeUnit.new(c) ccu << CharacterRangeUnit.new(c)

View File

@ -24,12 +24,10 @@ module Imbecile
def nil_transition_states def nil_transition_states
states = Set[self] states = Set[self]
analyze_state = lambda do |state| analyze_state = lambda do |state|
state.transitions.each do |range, dest_state| state.nil_transitions.each do |range, dest_state|
if range.nil? unless states.include?(dest_state)
unless states.include?(dest_state) states << dest_state
states << dest_state analyze_state[dest_state]
analyze_state[dest_state]
end
end end
end end
end end
@ -37,6 +35,18 @@ module Imbecile
states states
end end
# Select the entries of @transitions whose key (the first element of each
# entry) is nil.
def nil_transitions
  @transitions.select { |code_point, _dest_state| code_point.nil? }
end
# Select the entries of @transitions whose key (the first element of each
# entry) is truthy, i.e. everything that is not keyed by nil.
def cp_transitions
  @transitions.select { |code_point, _dest_state| code_point }
end
end end
attr_accessor :start_state attr_accessor :start_state
@ -69,13 +79,13 @@ module Imbecile
visit = lambda do |state| visit = lambda do |state|
accepts_s = state.accepts ? " *" : "" accepts_s = state.accepts ? " *" : ""
rv += "#{state_id[state]}#{accepts_s}:\n" rv += "#{state_id[state]}#{accepts_s}:\n"
state.transitions.each do |range, dest_state| state.transitions.each do |code_point_range, dest_state|
if range.nil? if code_point_range.nil?
range_s = "nil" range_s = "nil"
else else
range_s = chr[range.first] range_s = chr[code_point_range.first]
if range.size > 1 if code_point_range.size > 1
range_s += "-" + chr[range.last] range_s += "-" + chr[code_point_range.last]
end end
end end
accepts_s = dest_state.accepts ? " *" : "" accepts_s = dest_state.accepts ? " *" : ""

View File

@ -68,18 +68,19 @@ module Imbecile
end end
class CharacterRangeUnit < Unit class CharacterRangeUnit < Unit
attr_accessor :min_code_point attr_reader :code_point_range
attr_accessor :max_code_point
def initialize(c1, c2 = nil) def initialize(c1, c2 = nil)
@min_code_point = c1.ord @code_point_range = CodePointRange.new(c1, c2)
@max_code_point = c2 ? c2.ord : @min_code_point
end end
def range def first
@min_code_point..@max_code_point @code_point_range.first
end
def last
@code_point_range.last
end end
def to_nfa def to_nfa
nfa = NFA.new nfa = NFA.new
nfa.start_state.add_transition(range, nfa.end_state) nfa.start_state.add_transition(@code_point_range, nfa.end_state)
nfa nfa
end end
end end
@ -108,32 +109,16 @@ module Imbecile
if @units.empty? if @units.empty?
nfa.start_state.add_transition(nil, nfa.end_state) nfa.start_state.add_transition(nil, nfa.end_state)
else else
ranges = @units.map(&:range) code_point_ranges = @units.map(&:code_point_range)
if @negate if @negate
ranges = negate_ranges(ranges) code_point_ranges = CodePointRange.invert_ranges(code_point_ranges)
end end
ranges.each do |range| code_point_ranges.each do |code_point_range|
nfa.start_state.add_transition(range, nfa.end_state) nfa.start_state.add_transition(code_point_range, nfa.end_state)
end end
end end
nfa nfa
end end
private
def negate_ranges(ranges)
ranges = ranges.sort_by(&:first)
new_ranges = []
last_cp = -1
ranges.each do |range|
if range.first > (last_cp + 1)
new_ranges << ((last_cp + 1)..(range.first - 1))
last_cp = range.last
end
end
if last_cp < 0xFFFFFFFF
new_ranges << ((last_cp + 1)..0xFFFFFFFF)
end
new_ranges
end
end end
class MultiplicityUnit < Unit class MultiplicityUnit < Unit

View File

@ -114,7 +114,7 @@ module Imbecile
expect(m_unit.min_count).to eq 5 expect(m_unit.min_count).to eq 5
expect(m_unit.max_count).to eq 8 expect(m_unit.max_count).to eq 8
expect(m_unit.unit).to be_a Regex::CharacterRangeUnit expect(m_unit.unit).to be_a Regex::CharacterRangeUnit
expect(m_unit.unit.range.first).to eq "a".ord expect(m_unit.unit.first).to eq "a".ord
end end
it "parses an escaped *" do it "parses an escaped *" do
@ -125,9 +125,9 @@ module Imbecile
seq_unit = regex.unit.alternates[0] seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 2 expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[0].min_code_point).to eq "a".ord expect(seq_unit[0].first).to eq "a".ord
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[1].min_code_point).to eq "*".ord expect(seq_unit[1].first).to eq "*".ord
end end
it "parses an escaped +" do it "parses an escaped +" do
@ -138,9 +138,9 @@ module Imbecile
seq_unit = regex.unit.alternates[0] seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 2 expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[0].min_code_point).to eq "a".ord expect(seq_unit[0].first).to eq "a".ord
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[1].min_code_point).to eq "+".ord expect(seq_unit[1].first).to eq "+".ord
end end
it "parses an escaped \\" do it "parses an escaped \\" do
@ -151,9 +151,9 @@ module Imbecile
seq_unit = regex.unit.alternates[0] seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 2 expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[0].min_code_point).to eq "\\".ord expect(seq_unit[0].first).to eq "\\".ord
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[1].min_code_point).to eq "d".ord expect(seq_unit[1].first).to eq "d".ord
end end
it "parses a character class" do it "parses a character class" do
@ -168,10 +168,10 @@ module Imbecile
expect(ccu.negate).to be_falsey expect(ccu.negate).to be_falsey
expect(ccu.size).to eq 2 expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Regex::CharacterRangeUnit expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].min_code_point).to eq "a".ord expect(ccu[0].first).to eq "a".ord
expect(ccu[0].max_code_point).to eq "z".ord expect(ccu[0].last).to eq "z".ord
expect(ccu[1]).to be_a Regex::CharacterRangeUnit expect(ccu[1]).to be_a Regex::CharacterRangeUnit
expect(ccu[1].min_code_point).to eq "_".ord expect(ccu[1].first).to eq "_".ord
end end
it "parses a negated character class" do it "parses a negated character class" do
@ -186,7 +186,7 @@ module Imbecile
expect(ccu.negate).to be_truthy expect(ccu.negate).to be_truthy
expect(ccu.size).to eq 3 expect(ccu.size).to eq 3
expect(ccu[0]).to be_a Regex::CharacterRangeUnit expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].min_code_point).to eq "x".ord expect(ccu[0].first).to eq "x".ord
end end
it "parses - as a plain character at beginning of a character class" do it "parses - as a plain character at beginning of a character class" do
@ -200,7 +200,7 @@ module Imbecile
ccu = seq_unit[0] ccu = seq_unit[0]
expect(ccu.size).to eq 2 expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Regex::CharacterRangeUnit expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].min_code_point).to eq "-".ord expect(ccu[0].first).to eq "-".ord
end end
it "parses - as a plain character at end of a character class" do it "parses - as a plain character at end of a character class" do
@ -214,9 +214,9 @@ module Imbecile
ccu = seq_unit[0] ccu = seq_unit[0]
expect(ccu.size).to eq 2 expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Regex::CharacterRangeUnit expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].min_code_point).to eq "0".ord expect(ccu[0].first).to eq "0".ord
expect(ccu[1]).to be_a Regex::CharacterRangeUnit expect(ccu[1]).to be_a Regex::CharacterRangeUnit
expect(ccu[1].min_code_point).to eq "-".ord expect(ccu[1].first).to eq "-".ord
end end
it "parses - as a plain character at beginning of a negated character class" do it "parses - as a plain character at beginning of a negated character class" do
@ -231,7 +231,7 @@ module Imbecile
expect(ccu.negate).to be_truthy expect(ccu.negate).to be_truthy
expect(ccu.size).to eq 2 expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Regex::CharacterRangeUnit expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].min_code_point).to eq "-".ord expect(ccu[0].first).to eq "-".ord
end end
it "parses . as a plain character in a character class" do it "parses . as a plain character in a character class" do
@ -246,7 +246,7 @@ module Imbecile
expect(ccu.negate).to be_falsey expect(ccu.negate).to be_falsey
expect(ccu.size).to eq 1 expect(ccu.size).to eq 1
expect(ccu[0]).to be_a Regex::CharacterRangeUnit expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].min_code_point).to eq ".".ord expect(ccu[0].first).to eq ".".ord
end end
it "parses - as a plain character when escaped in middle of character class" do it "parses - as a plain character when escaped in middle of character class" do
@ -261,11 +261,11 @@ module Imbecile
expect(ccu.negate).to be_falsey expect(ccu.negate).to be_falsey
expect(ccu.size).to eq 3 expect(ccu.size).to eq 3
expect(ccu[0]).to be_a Regex::CharacterRangeUnit expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].min_code_point).to eq "0".ord expect(ccu[0].first).to eq "0".ord
expect(ccu[1]).to be_a Regex::CharacterRangeUnit expect(ccu[1]).to be_a Regex::CharacterRangeUnit
expect(ccu[1].min_code_point).to eq "-".ord expect(ccu[1].first).to eq "-".ord
expect(ccu[2]).to be_a Regex::CharacterRangeUnit expect(ccu[2]).to be_a Regex::CharacterRangeUnit
expect(ccu[2].min_code_point).to eq "9".ord expect(ccu[2].first).to eq "9".ord
end end
it "parses alternates" do it "parses alternates" do
@ -314,9 +314,9 @@ module Imbecile
expect(regex.unit.alternates[2]).to be_a Regex::SequenceUnit expect(regex.unit.alternates[2]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[2].size).to eq 2 expect(regex.unit.alternates[2].size).to eq 2
expect(regex.unit.alternates[2][0]).to be_a Regex::CharacterRangeUnit expect(regex.unit.alternates[2][0]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[2][0].min_code_point).to eq "|".ord expect(regex.unit.alternates[2][0].first).to eq "|".ord
expect(regex.unit.alternates[2][1]).to be_a Regex::CharacterRangeUnit expect(regex.unit.alternates[2][1]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[2][1].min_code_point).to eq "v".ord expect(regex.unit.alternates[2][1].first).to eq "v".ord
expect(regex.unit.alternates[3]).to be_a Regex::SequenceUnit expect(regex.unit.alternates[3]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[3].size).to eq 1 expect(regex.unit.alternates[3].size).to eq 1
expect(regex.unit.alternates[3][0]).to be_a Regex::MultiplicityUnit expect(regex.unit.alternates[3][0]).to be_a Regex::MultiplicityUnit
@ -325,8 +325,8 @@ module Imbecile
expect(regex.unit.alternates[3][0].unit).to be_a Regex::CharacterClassUnit expect(regex.unit.alternates[3][0].unit).to be_a Regex::CharacterClassUnit
expect(regex.unit.alternates[3][0].unit.size).to eq 1 expect(regex.unit.alternates[3][0].unit.size).to eq 1
expect(regex.unit.alternates[3][0].unit[0]).to be_a Regex::CharacterRangeUnit expect(regex.unit.alternates[3][0].unit[0]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[3][0].unit[0].min_code_point).to eq "x".ord expect(regex.unit.alternates[3][0].unit[0].first).to eq "x".ord
expect(regex.unit.alternates[3][0].unit[0].max_code_point).to eq "y".ord expect(regex.unit.alternates[3][0].unit[0].last).to eq "y".ord
end end
end end