Store all characters as ranges; add CharacterClassUnit#to_nfa

This commit is contained in:
Josh Holtrop 2021-05-14 13:52:03 -04:00
parent 2e8e72a1e8
commit 70b3e56de2
2 changed files with 90 additions and 69 deletions

View File

@ -65,28 +65,19 @@ module Imbecile
end end
end end
class CharacterUnit < Unit
attr_accessor :code_point
def initialize(c)
@code_point = c.ord
end
def to_nfa
nfa = NFA.new
nfa.start_state.add_transition(@code_point, nfa.end_state)
nfa
end
end
class CharacterRangeUnit < Unit class CharacterRangeUnit < Unit
attr_accessor :min_code_point attr_accessor :min_code_point
attr_accessor :max_code_point attr_accessor :max_code_point
def initialize(c1, c2) def initialize(c1, c2 = nil)
@min_code_point = c1.ord @min_code_point = c1.ord
@max_code_point = c2.ord @max_code_point = c2 ? c2.ord : @min_code_point
end
def range
@min_code_point..@max_code_point
end end
def to_nfa def to_nfa
nfa = NFA.new nfa = NFA.new
nfa.start_state.add_transition((@min_code_point..@max_code_point), nfa.end_state) nfa.start_state.add_transition(range, nfa.end_state)
nfa nfa
end end
end end
@ -110,6 +101,36 @@ module Imbecile
def replace_last!(new_unit) def replace_last!(new_unit)
@units[-1] = new_unit @units[-1] = new_unit
end end
# Build an NFA fragment for this character class.
#
# With no units, a single transition on +nil+ is added from the start
# state to the end state (presumably the epsilon transition -- confirm
# against NFA). Otherwise one transition is added per code point range,
# taken from each unit's +range+; when the class is negated the ranges are
# first complemented with #negate_ranges.
#
# @return [NFA] the constructed NFA, like the other units' #to_nfa methods.
def to_nfa
  nfa = NFA.new
  if @units.empty?
    nfa.start_state.add_transition(nil, nfa.end_state)
  else
    ranges = @units.map(&:range)
    # The negation flag lives on the character class itself (set by the
    # parser via `ccu.negate = true`), not on some other `unit` object.
    ranges = negate_ranges(ranges) if negate
    ranges.each do |range|
      nfa.start_state.add_transition(range, nfa.end_state)
    end
  end
  # Return the NFA explicitly; otherwise the method would yield the value
  # of the last evaluated branch rather than the NFA.
  nfa
end
private
# Compute the complement of a set of code point ranges over
# 0..0xFFFFFFFF.
#
# The input may be unsorted and may contain adjacent, overlapping or
# nested ranges; it is not modified.
#
# @param ranges [Array<Range>] inclusive integer code point ranges.
# @return [Array<Range>] ascending, non-overlapping ranges covering every
#   code point in 0..0xFFFFFFFF that is not in any input range.
def negate_ranges(ranges)
  max_code_point = 0xFFFFFFFF
  negated = []
  # Highest code point covered so far; -1 means nothing is covered yet.
  last_cp = -1
  ranges.sort_by(&:first).each do |range|
    # Emit the gap (if any) between the covered prefix and this range.
    if range.first > last_cp + 1
      negated << ((last_cp + 1)..(range.first - 1))
    end
    # Always advance the covered prefix, even when there was no gap
    # (range starts at 0, or is adjacent to / overlaps the previous one),
    # and never move it backwards for a range nested in an earlier one.
    last_cp = range.last if range.last > last_cp
  end
  negated << ((last_cp + 1)..max_code_point) if last_cp < max_code_point
  negated
end
end end
class MultiplicityUnit < Unit class MultiplicityUnit < Unit
@ -194,7 +215,7 @@ module Imbecile
when "\\" when "\\"
au << parse_backslash au << parse_backslash
else else
au << CharacterUnit.new(c) au << CharacterRangeUnit.new(c)
end end
end end
au au
@ -222,18 +243,18 @@ module Imbecile
elsif c == "^" && index == 0 elsif c == "^" && index == 0
ccu.negate = true ccu.negate = true
elsif c == "-" && (ccu.size == 0 || @pattern[0] == "]") elsif c == "-" && (ccu.size == 0 || @pattern[0] == "]")
ccu << CharacterUnit.new(c) ccu << CharacterRangeUnit.new(c)
elsif c == "\\" elsif c == "\\"
ccu << parse_backslash ccu << parse_backslash
elsif c == "-" && @pattern[0] != "]" elsif c == "-" && @pattern[0] != "]"
begin_cu = ccu.last_unit begin_cu = ccu.last_unit
unless begin_cu.is_a?(CharacterUnit) unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.range.size == 1
raise Error.new("Character range must be between single characters") raise Error.new("Character range must be between single characters")
end end
if @pattern[0] == "\\" if @pattern[0] == "\\"
@pattern.slice!(0) @pattern.slice!(0)
end_cu = parse_backslash end_cu = parse_backslash
unless end_cu.is_a?(CharacterUnit) unless end_cu.is_a?(CharacterRangeUnit) && end_cu.range.size == 1
raise Error.new("Character range must be between single characters") raise Error.new("Character range must be between single characters")
end end
max_code_point = end_cu.code_point max_code_point = end_cu.code_point
@ -241,10 +262,10 @@ module Imbecile
max_code_point = @pattern[0].ord max_code_point = @pattern[0].ord
@pattern.slice!(0) @pattern.slice!(0)
end end
cru = CharacterRangeUnit.new(begin_cu.code_point, max_code_point) cru = CharacterRangeUnit.new(begin_cu.min_code_point, max_code_point)
ccu.replace_last!(cru) ccu.replace_last!(cru)
else else
ccu << CharacterUnit.new(c) ccu << CharacterRangeUnit.new(c)
end end
index += 1 index += 1
end end
@ -281,7 +302,7 @@ module Imbecile
when "d" when "d"
CharacterRangeUnit.new("0", "9") CharacterRangeUnit.new("0", "9")
else else
CharacterUnit.new(c) CharacterRangeUnit.new(c)
end end
end end
end end

View File

@ -16,7 +16,7 @@ module Imbecile
expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit
seq_unit = parser.unit.alternates[0] seq_unit = parser.unit.alternates[0]
expect(seq_unit.size).to eq 1 expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Parser::CharacterUnit expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
end end
it "parses a group with a single character unit expression" do it "parses a group with a single character unit expression" do
@ -30,7 +30,7 @@ module Imbecile
alt_unit = seq_unit[0] alt_unit = seq_unit[0]
expect(alt_unit.alternates.size).to eq 1 expect(alt_unit.alternates.size).to eq 1
expect(alt_unit.alternates[0]).to be_a Parser::SequenceUnit expect(alt_unit.alternates[0]).to be_a Parser::SequenceUnit
expect(alt_unit.alternates[0][0]).to be_a Parser::CharacterUnit expect(alt_unit.alternates[0][0]).to be_a Parser::CharacterRangeUnit
end end
it "parses a *" do it "parses a *" do
@ -44,7 +44,7 @@ module Imbecile
m_unit = seq_unit[0] m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 0 expect(m_unit.min_count).to eq 0
expect(m_unit.max_count).to be_nil expect(m_unit.max_count).to be_nil
expect(m_unit.unit).to be_a Parser::CharacterUnit expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
end end
it "parses a +" do it "parses a +" do
@ -58,7 +58,7 @@ module Imbecile
m_unit = seq_unit[0] m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 1 expect(m_unit.min_count).to eq 1
expect(m_unit.max_count).to be_nil expect(m_unit.max_count).to be_nil
expect(m_unit.unit).to be_a Parser::CharacterUnit expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
end end
it "parses a ?" do it "parses a ?" do
@ -72,7 +72,7 @@ module Imbecile
m_unit = seq_unit[0] m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 0 expect(m_unit.min_count).to eq 0
expect(m_unit.max_count).to eq 1 expect(m_unit.max_count).to eq 1
expect(m_unit.unit).to be_a Parser::CharacterUnit expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
end end
it "parses a multiplicity count" do it "parses a multiplicity count" do
@ -86,7 +86,7 @@ module Imbecile
m_unit = seq_unit[0] m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 5 expect(m_unit.min_count).to eq 5
expect(m_unit.max_count).to eq 5 expect(m_unit.max_count).to eq 5
expect(m_unit.unit).to be_a Parser::CharacterUnit expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
end end
it "parses a minimum-only multiplicity count" do it "parses a minimum-only multiplicity count" do
@ -100,7 +100,7 @@ module Imbecile
m_unit = seq_unit[0] m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 5 expect(m_unit.min_count).to eq 5
expect(m_unit.max_count).to be_nil expect(m_unit.max_count).to be_nil
expect(m_unit.unit).to be_a Parser::CharacterUnit expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
end end
it "parses a minimum and maximum multiplicity count" do it "parses a minimum and maximum multiplicity count" do
@ -114,8 +114,8 @@ module Imbecile
m_unit = seq_unit[0] m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 5 expect(m_unit.min_count).to eq 5
expect(m_unit.max_count).to eq 8 expect(m_unit.max_count).to eq 8
expect(m_unit.unit).to be_a Parser::CharacterUnit expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
expect(m_unit.unit.code_point).to eq "a".ord expect(m_unit.unit.range.first).to eq "a".ord
end end
it "parses an escaped *" do it "parses an escaped *" do
@ -125,10 +125,10 @@ module Imbecile
expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit
seq_unit = parser.unit.alternates[0] seq_unit = parser.unit.alternates[0]
expect(seq_unit.size).to eq 2 expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Parser::CharacterUnit expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
expect(seq_unit[0].code_point).to eq "a".ord expect(seq_unit[0].min_code_point).to eq "a".ord
expect(seq_unit[1]).to be_a Parser::CharacterUnit expect(seq_unit[1]).to be_a Parser::CharacterRangeUnit
expect(seq_unit[1].code_point).to eq "*".ord expect(seq_unit[1].min_code_point).to eq "*".ord
end end
it "parses an escaped +" do it "parses an escaped +" do
@ -138,10 +138,10 @@ module Imbecile
expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit
seq_unit = parser.unit.alternates[0] seq_unit = parser.unit.alternates[0]
expect(seq_unit.size).to eq 2 expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Parser::CharacterUnit expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
expect(seq_unit[0].code_point).to eq "a".ord expect(seq_unit[0].min_code_point).to eq "a".ord
expect(seq_unit[1]).to be_a Parser::CharacterUnit expect(seq_unit[1]).to be_a Parser::CharacterRangeUnit
expect(seq_unit[1].code_point).to eq "+".ord expect(seq_unit[1].min_code_point).to eq "+".ord
end end
it "parses an escaped \\" do it "parses an escaped \\" do
@ -151,10 +151,10 @@ module Imbecile
expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[0]).to be_a Parser::SequenceUnit
seq_unit = parser.unit.alternates[0] seq_unit = parser.unit.alternates[0]
expect(seq_unit.size).to eq 2 expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Parser::CharacterUnit expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
expect(seq_unit[0].code_point).to eq "\\".ord expect(seq_unit[0].min_code_point).to eq "\\".ord
expect(seq_unit[1]).to be_a Parser::CharacterUnit expect(seq_unit[1]).to be_a Parser::CharacterRangeUnit
expect(seq_unit[1].code_point).to eq "d".ord expect(seq_unit[1].min_code_point).to eq "d".ord
end end
it "parses a character class" do it "parses a character class" do
@ -171,8 +171,8 @@ module Imbecile
expect(ccu[0]).to be_a Parser::CharacterRangeUnit expect(ccu[0]).to be_a Parser::CharacterRangeUnit
expect(ccu[0].min_code_point).to eq "a".ord expect(ccu[0].min_code_point).to eq "a".ord
expect(ccu[0].max_code_point).to eq "z".ord expect(ccu[0].max_code_point).to eq "z".ord
expect(ccu[1]).to be_a Parser::CharacterUnit expect(ccu[1]).to be_a Parser::CharacterRangeUnit
expect(ccu[1].code_point).to eq "_".ord expect(ccu[1].min_code_point).to eq "_".ord
end end
it "parses a negated character class" do it "parses a negated character class" do
@ -186,8 +186,8 @@ module Imbecile
ccu = seq_unit[0] ccu = seq_unit[0]
expect(ccu.negate).to be_truthy expect(ccu.negate).to be_truthy
expect(ccu.size).to eq 3 expect(ccu.size).to eq 3
expect(ccu[0]).to be_a Parser::CharacterUnit expect(ccu[0]).to be_a Parser::CharacterRangeUnit
expect(ccu[0].code_point).to eq "x".ord expect(ccu[0].min_code_point).to eq "x".ord
end end
it "parses - as a plain character at beginning of a character class" do it "parses - as a plain character at beginning of a character class" do
@ -200,8 +200,8 @@ module Imbecile
expect(seq_unit[0]).to be_a Parser::CharacterClassUnit expect(seq_unit[0]).to be_a Parser::CharacterClassUnit
ccu = seq_unit[0] ccu = seq_unit[0]
expect(ccu.size).to eq 2 expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Parser::CharacterUnit expect(ccu[0]).to be_a Parser::CharacterRangeUnit
expect(ccu[0].code_point).to eq "-".ord expect(ccu[0].min_code_point).to eq "-".ord
end end
it "parses - as a plain character at end of a character class" do it "parses - as a plain character at end of a character class" do
@ -214,10 +214,10 @@ module Imbecile
expect(seq_unit[0]).to be_a Parser::CharacterClassUnit expect(seq_unit[0]).to be_a Parser::CharacterClassUnit
ccu = seq_unit[0] ccu = seq_unit[0]
expect(ccu.size).to eq 2 expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Parser::CharacterUnit expect(ccu[0]).to be_a Parser::CharacterRangeUnit
expect(ccu[0].code_point).to eq "0".ord expect(ccu[0].min_code_point).to eq "0".ord
expect(ccu[1]).to be_a Parser::CharacterUnit expect(ccu[1]).to be_a Parser::CharacterRangeUnit
expect(ccu[1].code_point).to eq "-".ord expect(ccu[1].min_code_point).to eq "-".ord
end end
it "parses - as a plain character at beginning of a negated character class" do it "parses - as a plain character at beginning of a negated character class" do
@ -231,8 +231,8 @@ module Imbecile
ccu = seq_unit[0] ccu = seq_unit[0]
expect(ccu.negate).to be_truthy expect(ccu.negate).to be_truthy
expect(ccu.size).to eq 2 expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Parser::CharacterUnit expect(ccu[0]).to be_a Parser::CharacterRangeUnit
expect(ccu[0].code_point).to eq "-".ord expect(ccu[0].min_code_point).to eq "-".ord
end end
it "parses . as a plain character in a negated character class" do it "parses . as a plain character in a negated character class" do
@ -246,8 +246,8 @@ module Imbecile
ccu = seq_unit[0] ccu = seq_unit[0]
expect(ccu.negate).to be_falsey expect(ccu.negate).to be_falsey
expect(ccu.size).to eq 1 expect(ccu.size).to eq 1
expect(ccu[0]).to be_a Parser::CharacterUnit expect(ccu[0]).to be_a Parser::CharacterRangeUnit
expect(ccu[0].code_point).to eq ".".ord expect(ccu[0].min_code_point).to eq ".".ord
end end
it "parses - as a plain character when escaped in middle of character class" do it "parses - as a plain character when escaped in middle of character class" do
@ -261,12 +261,12 @@ module Imbecile
ccu = seq_unit[0] ccu = seq_unit[0]
expect(ccu.negate).to be_falsey expect(ccu.negate).to be_falsey
expect(ccu.size).to eq 3 expect(ccu.size).to eq 3
expect(ccu[0]).to be_a Parser::CharacterUnit expect(ccu[0]).to be_a Parser::CharacterRangeUnit
expect(ccu[0].code_point).to eq "0".ord expect(ccu[0].min_code_point).to eq "0".ord
expect(ccu[1]).to be_a Parser::CharacterUnit expect(ccu[1]).to be_a Parser::CharacterRangeUnit
expect(ccu[1].code_point).to eq "-".ord expect(ccu[1].min_code_point).to eq "-".ord
expect(ccu[2]).to be_a Parser::CharacterUnit expect(ccu[2]).to be_a Parser::CharacterRangeUnit
expect(ccu[2].code_point).to eq "9".ord expect(ccu[2].min_code_point).to eq "9".ord
end end
it "parses alternates" do it "parses alternates" do
@ -292,7 +292,7 @@ module Imbecile
expect(parser.unit.alternates[0][0].unit.alternates.size).to eq 2 expect(parser.unit.alternates[0][0].unit.alternates.size).to eq 2
expect(parser.unit.alternates[0][0].unit.alternates[0]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[0][0].unit.alternates[0]).to be_a Parser::SequenceUnit
expect(parser.unit.alternates[0][0].unit.alternates[0].size).to eq 1 expect(parser.unit.alternates[0][0].unit.alternates[0].size).to eq 1
expect(parser.unit.alternates[0][0].unit.alternates[0][0]).to be_a Parser::CharacterUnit expect(parser.unit.alternates[0][0].unit.alternates[0][0]).to be_a Parser::CharacterRangeUnit
expect(parser.unit.alternates[0][0].unit.alternates[1]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[0][0].unit.alternates[1]).to be_a Parser::SequenceUnit
expect(parser.unit.alternates[0][0].unit.alternates[1].size).to eq 0 expect(parser.unit.alternates[0][0].unit.alternates[1].size).to eq 0
expect(parser.unit.alternates[1]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[1]).to be_a Parser::SequenceUnit
@ -300,13 +300,13 @@ module Imbecile
expect(parser.unit.alternates[1][0]).to be_a Parser::CharacterClassUnit expect(parser.unit.alternates[1][0]).to be_a Parser::CharacterClassUnit
expect(parser.unit.alternates[1][0].negate).to be_truthy expect(parser.unit.alternates[1][0].negate).to be_truthy
expect(parser.unit.alternates[1][0].size).to eq 1 expect(parser.unit.alternates[1][0].size).to eq 1
expect(parser.unit.alternates[1][0][0]).to be_a Parser::CharacterUnit expect(parser.unit.alternates[1][0][0]).to be_a Parser::CharacterRangeUnit
expect(parser.unit.alternates[2]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[2]).to be_a Parser::SequenceUnit
expect(parser.unit.alternates[2].size).to eq 2 expect(parser.unit.alternates[2].size).to eq 2
expect(parser.unit.alternates[2][0]).to be_a Parser::CharacterUnit expect(parser.unit.alternates[2][0]).to be_a Parser::CharacterRangeUnit
expect(parser.unit.alternates[2][0].code_point).to eq "|".ord expect(parser.unit.alternates[2][0].min_code_point).to eq "|".ord
expect(parser.unit.alternates[2][1]).to be_a Parser::CharacterUnit expect(parser.unit.alternates[2][1]).to be_a Parser::CharacterRangeUnit
expect(parser.unit.alternates[2][1].code_point).to eq "v".ord expect(parser.unit.alternates[2][1].min_code_point).to eq "v".ord
expect(parser.unit.alternates[3]).to be_a Parser::SequenceUnit expect(parser.unit.alternates[3]).to be_a Parser::SequenceUnit
expect(parser.unit.alternates[3].size).to eq 1 expect(parser.unit.alternates[3].size).to eq 1
expect(parser.unit.alternates[3][0]).to be_a Parser::MultiplicityUnit expect(parser.unit.alternates[3][0]).to be_a Parser::MultiplicityUnit