Merge Regex::Parser into Regex, move Unit to its own file

This commit is contained in:
Josh Holtrop 2021-05-18 16:14:42 -04:00
parent 89a5976064
commit 24054461a2
7 changed files with 651 additions and 664 deletions

View File

@ -2,7 +2,7 @@ require_relative "imbecile/cli"
require_relative "imbecile/grammar" require_relative "imbecile/grammar"
require_relative "imbecile/regex" require_relative "imbecile/regex"
require_relative "imbecile/regex/nfa" require_relative "imbecile/regex/nfa"
require_relative "imbecile/regex/parser" require_relative "imbecile/regex/unit"
require_relative "imbecile/version" require_relative "imbecile/version"
require "erb" require "erb"

View File

@ -41,7 +41,6 @@ module Imbecile
# Build NFA from each token expression. # Build NFA from each token expression.
@tokens.each do |token_name, token_def| @tokens.each do |token_name, token_def|
token_def[:regex] = Regex.new(token_def[:pattern]) token_def[:regex] = Regex.new(token_def[:pattern])
token_def[:nfa] = token_def[:regex].parser.unit.to_nfa
end end
end end

View File

@ -1,10 +1,152 @@
module Imbecile module Imbecile
class Regex class Regex
attr_accessor :parser attr_reader :unit
attr_reader :nfa
def initialize(pattern) def initialize(pattern)
@parser = Parser.new(pattern) @pattern = pattern.dup
@unit = parse_alternates
@nfa = @unit.to_nfa
if @pattern != ""
raise Error.new(%[Unexpected "#{@pattern}" in pattern])
end
end
private
# Parses "|"-separated alternatives from the front of @pattern.
#
# Characters are consumed from @pattern until it is empty or until a ")"
# is reached; that ")" is left unconsumed in @pattern.
#
# @return [AlternatesUnit] the unit holding everything parsed
# @raise [Error] when "*", "+", "?" or "{" has no preceding unit to repeat
def parse_alternates
  alternates = AlternatesUnit.new
  until @pattern == ""
    char = @pattern[0]
    # Stop at ")" without consuming it.
    return alternates if char == ")"
    @pattern.slice!(0)
    if char == "["
      alternates << parse_character_class
    elsif char == "("
      alternates << parse_group
    elsif ["*", "+", "?", "{"].include?(char)
      repeated = alternates.last_unit
      raise Error.new("#{char} follows nothing") unless repeated
      # Repetition bounds; a nil maximum means "unbounded".
      bounds =
        case char
        when "*" then [0, nil]
        when "+" then [1, nil]
        when "?" then [0, 1]
        else parse_curly_count
        end
      min_count, max_count = bounds
      alternates.replace_last!(MultiplicityUnit.new(repeated, min_count, max_count))
    elsif char == "|"
      alternates.new_alternate!
    elsif char == "\\"
      alternates << parse_backslash
    elsif char == "."
      alternates << period_character_class
    else
      alternates << CharacterRangeUnit.new(char)
    end
  end
  alternates
end
# Parses the inside of a group and consumes the ")" that follows it.
#
# @return the unit returned by parse_alternates for the group contents
# @raise [Error] when the next character of @pattern is not ")"
def parse_group
  group = parse_alternates
  raise Error.new("Unterminated group in pattern") unless @pattern[0] == ")"
  @pattern.slice!(0)
  group
end
# Parses the body of a character class; the opening "[" has already been
# consumed. Consumes @pattern up to and including the closing "]".
#
# Rules implemented here:
# * "^" as the very first character negates the class.
# * "-" is a literal when it is the first member or directly precedes "]";
#   otherwise it forms a range between the previous member and the next
#   character (which may be backslash-escaped).
# * "\" introduces an escape handled by parse_backslash.
#
# @return [CharacterClassUnit]
# @raise [Error] when the class is unterminated, or when either end of a
#   range is not a single character
def parse_character_class
  ccu = CharacterClassUnit.new
  index = 0
  loop do
    raise Error.new("Unterminated character class") if @pattern == ""
    c = @pattern.slice!(0)
    if c == "]"
      break
    elsif c == "^" && index == 0
      ccu.negate = true
    elsif c == "\\"
      ccu << parse_backslash
    elsif c == "-" && ccu.size > 0 && @pattern[0] != "]"
      # A range needs an end character; without this guard a pattern ending
      # in "-" would call nil.ord below.
      raise Error.new("Unterminated character class") if @pattern == ""
      begin_cu = ccu.last_unit
      unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.range.size == 1
        raise Error.new("Character range must be between single characters")
      end
      if @pattern[0] == "\\"
        @pattern.slice!(0)
        end_cu = parse_backslash
        unless end_cu.is_a?(CharacterRangeUnit) && end_cu.range.size == 1
          raise Error.new("Character range must be between single characters")
        end
        # The range is a single code point, so min_code_point identifies it.
        max_code_point = end_cu.min_code_point
      else
        max_code_point = @pattern.slice!(0).ord
      end
      ccu.replace_last!(CharacterRangeUnit.new(begin_cu.min_code_point, max_code_point))
    else
      # Ordinary member, including a literal "-" or a non-leading "^".
      ccu << CharacterRangeUnit.new(c)
    end
    index += 1
  end
  ccu
end
# Parses a repetition count; the opening "{" has already been consumed.
#
# Accepted forms at the front of @pattern are "N}", "N,}" and "N,M}". On
# success the count (through the closing "}") is removed from @pattern.
#
# The match is anchored with \A and the remainder is taken from the match's
# post_match, so text after the count is preserved verbatim even when it
# contains newlines, and a count further along the pattern is never picked
# up by mistake.
#
# @return [Array(Integer, Integer)] or [Array(Integer, nil)] the minimum and
#   maximum counts; the maximum is nil for the open-ended "N,}" form
# @raise [Error] when @pattern does not start with a valid count, or when
#   the maximum is smaller than the minimum
def parse_curly_count
  match = /\A(\d+)(?:(,)(\d*))?\}/.match(@pattern)
  raise Error.new("Unexpected match count at #{@pattern}") unless match

  min_count = match[1].to_i
  max_count =
    if match[2].nil?
      min_count          # "N}"   -> exactly N
    elsif match[3] == ""
      nil                # "N,}"  -> N or more
    else
      match[3].to_i      # "N,M}" -> between N and M
    end
  if max_count && max_count < min_count
    raise Error.new("Maximum repetition count cannot be less than minimum repetition count")
  end
  @pattern = match.post_match
  [min_count, max_count]
end
# Parses the character following a backslash (the backslash itself has
# already been consumed) and removes it from @pattern.
#
# "d" yields the range "0".."9"; every other character stands for itself.
#
# @return [CharacterRangeUnit]
# @raise [Error] when nothing follows the backslash
def parse_backslash
  raise Error.new("Error: unfollowed \\") if @pattern == ""
  escaped = @pattern.slice!(0)
  return CharacterRangeUnit.new("0", "9") if escaped == "d"
  CharacterRangeUnit.new(escaped)
end
def period_character_class
ccu = CharacterClassUnit.new
ccu << CharacterRangeUnit.new(0, "\n".ord - 1)
ccu << CharacterRangeUnit.new("\n".ord + 1, 0xFFFFFFFF)
ccu
end end
end end

View File

@ -1,325 +0,0 @@
module Imbecile
  class Regex
    # Parses a regular expression pattern into a tree of Unit objects and
    # builds the NFA for that tree.
    class Parser

      # Common base class of every parsed regular expression unit.
      class Unit
      end

      # A run of units that must match one after another.
      class SequenceUnit < Unit
        attr_accessor :units

        def initialize
          @units = []
        end

        # Forwards any message the units array understands (<<, [], size, ...)
        # to it, including the caller's block.
        def method_missing(name, *args, &block)
          return super unless @units.respond_to?(name, true)
          @units.__send__(name, *args, &block)
        end

        def respond_to_missing?(name, include_private = false)
          @units.respond_to?(name, include_private) || super
        end

        # Builds an NFA that runs each unit's NFA in order.
        #
        # A fresh NFA wraps the chain so that its end state is only reached
        # after the last unit; returning the first unit's NFA would leave the
        # end state right after the first unit.
        #
        # @return [NFA]
        def to_nfa
          return NFA.empty if @units.empty?

          nfa = NFA.new
          tail = @units.reduce(nfa.start_state) do |prev_end, unit|
            unit_nfa = unit.to_nfa
            prev_end.add_transition(nil, unit_nfa.start_state)
            unit_nfa.end_state
          end
          tail.add_transition(nil, nfa.end_state)
          nfa
        end
      end

      # A set of alternative sequences ("a|b|c"); also used for groups.
      class AlternatesUnit < Unit
        attr_accessor :alternates

        def initialize
          @alternates = []
          new_alternate!
        end

        # Starts a new, empty alternative.
        def new_alternate!
          @alternates << SequenceUnit.new
        end

        # Appends a unit to the current (last) alternative.
        def <<(unit)
          @alternates[-1] << unit
        end

        # @return the last unit of the current alternative, or nil if empty
        def last_unit
          @alternates[-1][-1]
        end

        # Replaces the last unit of the current alternative.
        def replace_last!(new_unit)
          @alternates[-1][-1] = new_unit
        end

        # Builds an NFA that branches into each alternative.
        #
        # @return [NFA]
        def to_nfa
          if @alternates.size == 0
            NFA.empty
          elsif @alternates.size == 1
            @alternates[0].to_nfa
          else
            nfa = NFA.new
            @alternates.map(&:to_nfa).each do |alternate_nfa|
              nfa.start_state.add_transition(nil, alternate_nfa.start_state)
              alternate_nfa.end_state.add_transition(nil, nfa.end_state)
            end
            nfa
          end
        end
      end

      # An inclusive range of code points; a single character is a range of
      # size one.
      class CharacterRangeUnit < Unit
        attr_accessor :min_code_point
        attr_accessor :max_code_point

        # @param c1 first character or code point (anything responding to #ord)
        # @param c2 last character or code point; defaults to c1
        def initialize(c1, c2 = nil)
          @min_code_point = c1.ord
          @max_code_point = c2 ? c2.ord : @min_code_point
        end

        # @return [Range] min_code_point..max_code_point
        def range
          @min_code_point..@max_code_point
        end

        # @return [NFA] a single transition on #range
        def to_nfa
          nfa = NFA.new
          nfa.start_state.add_transition(range, nfa.end_state)
          nfa
        end
      end

      # A bracketed character class, optionally negated.
      class CharacterClassUnit < Unit
        attr_accessor :units
        attr_accessor :negate

        def initialize
          @units = []
          @negate = false
        end

        # Forwards any message the units array understands (<<, [], size, ...)
        # to it, including the caller's block.
        def method_missing(name, *args, &block)
          return super unless @units.respond_to?(name, true)
          @units.__send__(name, *args, &block)
        end

        def respond_to_missing?(name, include_private = false)
          @units.respond_to?(name, include_private) || super
        end

        def last_unit
          @units[-1]
        end

        def replace_last!(new_unit)
          @units[-1] = new_unit
        end

        # Builds an NFA with one transition per member range (or per gap
        # between member ranges when the class is negated). A class with no
        # members produces a single nil (empty) transition.
        #
        # @return [NFA]
        def to_nfa
          nfa = NFA.new
          if @units.empty?
            nfa.start_state.add_transition(nil, nfa.end_state)
          else
            ranges = @units.map(&:range)
            ranges = negate_ranges(ranges) if @negate
            ranges.each do |range|
              nfa.start_state.add_transition(range, nfa.end_state)
            end
          end
          nfa
        end

        private

        # Returns the gaps between the given ranges over 0..0xFFFFFFFF.
        #
        # The covered position advances for every range, not only for ranges
        # that open a gap, so adjacent or overlapping members (for example
        # "x", "y", "z") do not leak back into the negated result.
        def negate_ranges(ranges)
          gaps = []
          next_cp = 0
          ranges.reject { |range| range.last < range.first }.sort_by(&:first).each do |range|
            gaps << (next_cp..(range.first - 1)) if range.first > next_cp
            next_cp = [next_cp, range.last + 1].max
          end
          gaps << (next_cp..0xFFFFFFFF) if next_cp <= 0xFFFFFFFF
          gaps
        end
      end

      # A unit repeated between min_count and max_count times; a nil
      # max_count means there is no upper bound.
      class MultiplicityUnit < Unit
        attr_accessor :unit
        attr_accessor :min_count
        attr_accessor :max_count

        def initialize(unit, min_count, max_count)
          @unit = unit
          @min_count = min_count
          @max_count = max_count
        end

        # Builds an NFA made of min_count mandatory copies of the unit followed
        # by either a loop on the last copy (unbounded) or
        # (max_count - min_count) optional copies.
        #
        # @return [NFA]
        def to_nfa
          nfa = NFA.new
          tail = nfa.start_state
          last_copy = nil
          # Mandatory copies, chained one after another.
          @min_count.times do
            last_copy = @unit.to_nfa
            tail.add_transition(nil, last_copy.start_state)
            tail = last_copy.end_state
          end
          tail.add_transition(nil, nfa.end_state)
          if @max_count.nil?
            if last_copy.nil?
              # Zero mandatory copies: add one optional copy to loop on.
              last_copy = @unit.to_nfa
              nfa.start_state.add_transition(nil, last_copy.start_state)
              last_copy.end_state.add_transition(nil, nfa.end_state)
            end
            # Loop on the last copy only, so every extra repetition adds
            # exactly one occurrence.
            last_copy.end_state.add_transition(nil, last_copy.start_state)
          else
            # Optional copies; each may finish the match.
            (@max_count - @min_count).times do
              copy = @unit.to_nfa
              tail.add_transition(nil, copy.start_state)
              copy.end_state.add_transition(nil, nfa.end_state)
              tail = copy.end_state
            end
          end
          nfa
        end
      end

      attr_reader :unit
      attr_reader :nfa

      # Parses the pattern and builds its NFA.
      #
      # @param pattern [String] regular expression source; it is duplicated,
      #   so the caller's string is not modified
      # @raise [Error] when the pattern is malformed
      def initialize(pattern)
        @pattern = pattern.dup
        @unit = parse_alternates
        # Anything left over (an unmatched ")") is an error; report it before
        # spending time on NFA construction.
        if @pattern != ""
          raise Error.new(%[Unexpected "#{@pattern}" in pattern])
        end
        @nfa = @unit.to_nfa
      end

      private

      # Parses "|"-separated alternatives until @pattern is empty or an
      # unconsumed ")" is reached.
      #
      # @return [AlternatesUnit]
      # @raise [Error] when a quantifier has no preceding unit
      def parse_alternates
        au = AlternatesUnit.new
        while @pattern != ""
          c = @pattern[0]
          return au if c == ")"
          @pattern.slice!(0)
          case c
          when "["
            au << parse_character_class
          when "("
            au << parse_group
          when "*", "+", "?", "{"
            last_unit = au.last_unit
            raise Error.new("#{c} follows nothing") unless last_unit
            min_count, max_count =
              case c
              when "*" then [0, nil]
              when "+" then [1, nil]
              when "?" then [0, 1]
              else parse_curly_count
              end
            au.replace_last!(MultiplicityUnit.new(last_unit, min_count, max_count))
          when "|"
            au.new_alternate!
          when "\\"
            au << parse_backslash
          when "."
            au << period_character_class
          else
            au << CharacterRangeUnit.new(c)
          end
        end
        au
      end

      # Parses a group body and consumes its closing ")".
      #
      # @raise [Error] when the ")" is missing
      def parse_group
        au = parse_alternates
        if @pattern[0] != ")"
          raise Error.new("Unterminated group in pattern")
        end
        @pattern.slice!(0)
        au
      end

      # Parses a character class body (the "[" is already consumed) up to and
      # including the closing "]".
      #
      # @return [CharacterClassUnit]
      # @raise [Error] when the class is unterminated or a range end is not a
      #   single character
      def parse_character_class
        ccu = CharacterClassUnit.new
        index = 0
        loop do
          raise Error.new("Unterminated character class") if @pattern == ""
          c = @pattern.slice!(0)
          if c == "]"
            break
          elsif c == "^" && index == 0
            ccu.negate = true
          elsif c == "\\"
            ccu << parse_backslash
          elsif c == "-" && ccu.size > 0 && @pattern[0] != "]"
            # A range needs an end character; guard against nil.ord below.
            raise Error.new("Unterminated character class") if @pattern == ""
            begin_cu = ccu.last_unit
            unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.range.size == 1
              raise Error.new("Character range must be between single characters")
            end
            if @pattern[0] == "\\"
              @pattern.slice!(0)
              end_cu = parse_backslash
              unless end_cu.is_a?(CharacterRangeUnit) && end_cu.range.size == 1
                raise Error.new("Character range must be between single characters")
              end
              max_code_point = end_cu.min_code_point
            else
              max_code_point = @pattern.slice!(0).ord
            end
            ccu.replace_last!(CharacterRangeUnit.new(begin_cu.min_code_point, max_code_point))
          else
            # Ordinary member, including a literal "-" or a non-leading "^".
            ccu << CharacterRangeUnit.new(c)
          end
          index += 1
        end
        ccu
      end

      # Parses "N}", "N,}" or "N,M}" (the "{" is already consumed) and removes
      # it from @pattern. Anchored with \A so the rest of the pattern is kept
      # verbatim even when it contains newlines.
      #
      # @return [Array] minimum and maximum counts; maximum is nil for "N,}"
      # @raise [Error] on a malformed count or when maximum < minimum
      def parse_curly_count
        match = /\A(\d+)(?:(,)(\d*))?\}/.match(@pattern)
        raise Error.new("Unexpected match count at #{@pattern}") unless match

        min_count = match[1].to_i
        max_count =
          if match[2].nil?
            min_count
          elsif match[3] == ""
            nil
          else
            match[3].to_i
          end
        if max_count && max_count < min_count
          raise Error.new("Maximum repetition count cannot be less than minimum repetition count")
        end
        @pattern = match.post_match
        [min_count, max_count]
      end

      # Parses the character after a backslash: "d" is "0".."9", anything else
      # is taken literally.
      #
      # @raise [Error] when nothing follows the backslash
      def parse_backslash
        if @pattern == ""
          raise Error.new("Error: unfollowed \\")
        else
          c = @pattern.slice!(0)
          case c
          when "d"
            CharacterRangeUnit.new("0", "9")
          else
            CharacterRangeUnit.new(c)
          end
        end
      end

      # Builds the class used for ".": every code point except "\n".
      def period_character_class
        ccu = CharacterClassUnit.new
        ccu << CharacterRangeUnit.new(0, "\n".ord - 1)
        ccu << CharacterRangeUnit.new("\n".ord + 1, 0xFFFFFFFF)
        ccu
      end
    end
  end
end

173
lib/imbecile/regex/unit.rb Normal file
View File

@ -0,0 +1,173 @@
module Imbecile
  class Regex

    # Common base class of every parsed regular expression unit.
    class Unit
    end

    # A run of units that must match one after another.
    class SequenceUnit < Unit
      attr_accessor :units

      def initialize
        @units = []
      end

      # Forwards any message the units array understands (<<, [], size, ...)
      # to it, including the caller's block.
      def method_missing(name, *args, &block)
        return super unless @units.respond_to?(name, true)
        @units.__send__(name, *args, &block)
      end

      def respond_to_missing?(name, include_private = false)
        @units.respond_to?(name, include_private) || super
      end

      # Builds an NFA that runs each unit's NFA in order.
      #
      # A fresh NFA wraps the chain so that its end state is only reached
      # after the last unit; returning the first unit's NFA would leave the
      # end state right after the first unit.
      #
      # @return [NFA]
      def to_nfa
        return NFA.empty if @units.empty?

        nfa = NFA.new
        tail = @units.reduce(nfa.start_state) do |prev_end, unit|
          unit_nfa = unit.to_nfa
          prev_end.add_transition(nil, unit_nfa.start_state)
          unit_nfa.end_state
        end
        tail.add_transition(nil, nfa.end_state)
        nfa
      end
    end

    # A set of alternative sequences ("a|b|c"); also used for groups.
    class AlternatesUnit < Unit
      attr_accessor :alternates

      def initialize
        @alternates = []
        new_alternate!
      end

      # Starts a new, empty alternative.
      def new_alternate!
        @alternates << SequenceUnit.new
      end

      # Appends a unit to the current (last) alternative.
      def <<(unit)
        @alternates[-1] << unit
      end

      # @return the last unit of the current alternative, or nil if empty
      def last_unit
        @alternates[-1][-1]
      end

      # Replaces the last unit of the current alternative.
      def replace_last!(new_unit)
        @alternates[-1][-1] = new_unit
      end

      # Builds an NFA that branches into each alternative.
      #
      # @return [NFA]
      def to_nfa
        if @alternates.size == 0
          NFA.empty
        elsif @alternates.size == 1
          @alternates[0].to_nfa
        else
          nfa = NFA.new
          @alternates.map(&:to_nfa).each do |alternate_nfa|
            nfa.start_state.add_transition(nil, alternate_nfa.start_state)
            alternate_nfa.end_state.add_transition(nil, nfa.end_state)
          end
          nfa
        end
      end
    end

    # An inclusive range of code points; a single character is a range of
    # size one.
    class CharacterRangeUnit < Unit
      attr_accessor :min_code_point
      attr_accessor :max_code_point

      # @param c1 first character or code point (anything responding to #ord)
      # @param c2 last character or code point; defaults to c1
      def initialize(c1, c2 = nil)
        @min_code_point = c1.ord
        @max_code_point = c2 ? c2.ord : @min_code_point
      end

      # @return [Range] min_code_point..max_code_point
      def range
        @min_code_point..@max_code_point
      end

      # @return [NFA] a single transition on #range
      def to_nfa
        nfa = NFA.new
        nfa.start_state.add_transition(range, nfa.end_state)
        nfa
      end
    end

    # A bracketed character class, optionally negated.
    class CharacterClassUnit < Unit
      attr_accessor :units
      attr_accessor :negate

      def initialize
        @units = []
        @negate = false
      end

      # Forwards any message the units array understands (<<, [], size, ...)
      # to it, including the caller's block.
      def method_missing(name, *args, &block)
        return super unless @units.respond_to?(name, true)
        @units.__send__(name, *args, &block)
      end

      def respond_to_missing?(name, include_private = false)
        @units.respond_to?(name, include_private) || super
      end

      def last_unit
        @units[-1]
      end

      def replace_last!(new_unit)
        @units[-1] = new_unit
      end

      # Builds an NFA with one transition per member range (or per gap
      # between member ranges when the class is negated). A class with no
      # members produces a single nil (empty) transition.
      #
      # @return [NFA]
      def to_nfa
        nfa = NFA.new
        if @units.empty?
          nfa.start_state.add_transition(nil, nfa.end_state)
        else
          ranges = @units.map(&:range)
          ranges = negate_ranges(ranges) if @negate
          ranges.each do |range|
            nfa.start_state.add_transition(range, nfa.end_state)
          end
        end
        nfa
      end

      private

      # Returns the gaps between the given ranges over 0..0xFFFFFFFF.
      #
      # The covered position advances for every range, not only for ranges
      # that open a gap, so adjacent or overlapping members (for example
      # "x", "y", "z") do not leak back into the negated result.
      def negate_ranges(ranges)
        gaps = []
        next_cp = 0
        ranges.reject { |range| range.last < range.first }.sort_by(&:first).each do |range|
          gaps << (next_cp..(range.first - 1)) if range.first > next_cp
          next_cp = [next_cp, range.last + 1].max
        end
        gaps << (next_cp..0xFFFFFFFF) if next_cp <= 0xFFFFFFFF
        gaps
      end
    end

    # A unit repeated between min_count and max_count times; a nil max_count
    # means there is no upper bound.
    class MultiplicityUnit < Unit
      attr_accessor :unit
      attr_accessor :min_count
      attr_accessor :max_count

      def initialize(unit, min_count, max_count)
        @unit = unit
        @min_count = min_count
        @max_count = max_count
      end

      # Builds an NFA made of min_count mandatory copies of the unit followed
      # by either a loop on the last copy (unbounded) or
      # (max_count - min_count) optional copies.
      #
      # @return [NFA]
      def to_nfa
        nfa = NFA.new
        tail = nfa.start_state
        last_copy = nil
        # Mandatory copies, chained one after another.
        @min_count.times do
          last_copy = @unit.to_nfa
          tail.add_transition(nil, last_copy.start_state)
          tail = last_copy.end_state
        end
        tail.add_transition(nil, nfa.end_state)
        if @max_count.nil?
          if last_copy.nil?
            # Zero mandatory copies: add one optional copy to loop on.
            last_copy = @unit.to_nfa
            nfa.start_state.add_transition(nil, last_copy.start_state)
            last_copy.end_state.add_transition(nil, nfa.end_state)
          end
          # Loop on the last copy only, so every extra repetition adds exactly
          # one occurrence.
          last_copy.end_state.add_transition(nil, last_copy.start_state)
        else
          # Optional copies; each may finish the match.
          (@max_count - @min_count).times do
            copy = @unit.to_nfa
            tail.add_transition(nil, copy.start_state)
            copy.end_state.add_transition(nil, nfa.end_state)
            tail = copy.end_state
          end
        end
        nfa
      end
    end

  end
end

View File

@ -1,335 +0,0 @@
module Imbecile
  class Regex
    # Specs for the tree of units the parser builds for each pattern.
    RSpec.describe Parser do
      # Checks that +unit+ is an AlternatesUnit holding exactly one
      # SequenceUnit and returns that sequence.
      def sole_sequence_of(unit)
        expect(unit).to be_a Parser::AlternatesUnit
        expect(unit.alternates.size).to eq 1
        expect(unit.alternates[0]).to be_a Parser::SequenceUnit
        unit.alternates[0]
      end

      # Checks that +seq_unit+ holds exactly one MultiplicityUnit with the
      # given bounds wrapping a CharacterRangeUnit, and returns it.
      def sole_multiplicity_of(seq_unit, min:, max:)
        expect(seq_unit.size).to eq 1
        expect(seq_unit[0]).to be_a Parser::MultiplicityUnit
        m_unit = seq_unit[0]
        expect(m_unit.min_count).to eq min
        if max.nil?
          expect(m_unit.max_count).to be_nil
        else
          expect(m_unit.max_count).to eq max
        end
        expect(m_unit.unit).to be_a Parser::CharacterRangeUnit
        m_unit
      end

      # Checks that +seq_unit+ holds exactly one CharacterClassUnit and
      # returns it.
      def sole_character_class_of(seq_unit)
        expect(seq_unit.size).to eq 1
        expect(seq_unit[0]).to be_a Parser::CharacterClassUnit
        seq_unit[0]
      end

      # Checks that +unit+ is a CharacterRangeUnit starting at +char+.
      def expect_character(unit, char)
        expect(unit).to be_a Parser::CharacterRangeUnit
        expect(unit.min_code_point).to eq char.ord
      end

      it "parses an empty expression" do
        root = Parser.new("").unit
        expect(root).to be_a Parser::AlternatesUnit
        expect(root.alternates.size).to eq 1
        expect(root.alternates[0].size).to eq 0
      end

      it "parses a single character unit expression" do
        seq_unit = sole_sequence_of(Parser.new("a").unit)
        expect(seq_unit.size).to eq 1
        expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
      end

      it "parses a group with a single character unit expression" do
        seq_unit = sole_sequence_of(Parser.new("(a)").unit)
        expect(seq_unit.size).to eq 1
        expect(seq_unit[0]).to be_a Parser::AlternatesUnit
        group = seq_unit[0]
        expect(group.alternates.size).to eq 1
        expect(group.alternates[0]).to be_a Parser::SequenceUnit
        expect(group.alternates[0][0]).to be_a Parser::CharacterRangeUnit
      end

      it "parses a *" do
        seq_unit = sole_sequence_of(Parser.new("a*").unit)
        sole_multiplicity_of(seq_unit, min: 0, max: nil)
      end

      it "parses a +" do
        seq_unit = sole_sequence_of(Parser.new("a+").unit)
        sole_multiplicity_of(seq_unit, min: 1, max: nil)
      end

      it "parses a ?" do
        seq_unit = sole_sequence_of(Parser.new("a?").unit)
        sole_multiplicity_of(seq_unit, min: 0, max: 1)
      end

      it "parses a multiplicity count" do
        seq_unit = sole_sequence_of(Parser.new("a{5}").unit)
        sole_multiplicity_of(seq_unit, min: 5, max: 5)
      end

      it "parses a minimum-only multiplicity count" do
        seq_unit = sole_sequence_of(Parser.new("a{5,}").unit)
        sole_multiplicity_of(seq_unit, min: 5, max: nil)
      end

      it "parses a minimum and maximum multiplicity count" do
        seq_unit = sole_sequence_of(Parser.new("a{5,8}").unit)
        m_unit = sole_multiplicity_of(seq_unit, min: 5, max: 8)
        expect(m_unit.unit.range.first).to eq "a".ord
      end

      it "parses an escaped *" do
        seq_unit = sole_sequence_of(Parser.new("a\\*").unit)
        expect(seq_unit.size).to eq 2
        expect_character(seq_unit[0], "a")
        expect_character(seq_unit[1], "*")
      end

      it "parses an escaped +" do
        seq_unit = sole_sequence_of(Parser.new("a\\+").unit)
        expect(seq_unit.size).to eq 2
        expect_character(seq_unit[0], "a")
        expect_character(seq_unit[1], "+")
      end

      it "parses an escaped \\" do
        seq_unit = sole_sequence_of(Parser.new("\\\\d").unit)
        expect(seq_unit.size).to eq 2
        expect_character(seq_unit[0], "\\")
        expect_character(seq_unit[1], "d")
      end

      it "parses a character class" do
        seq_unit = sole_sequence_of(Parser.new("[a-z_]").unit)
        ccu = sole_character_class_of(seq_unit)
        expect(ccu.negate).to be_falsey
        expect(ccu.size).to eq 2
        expect_character(ccu[0], "a")
        expect(ccu[0].max_code_point).to eq "z".ord
        expect_character(ccu[1], "_")
      end

      it "parses a negated character class" do
        seq_unit = sole_sequence_of(Parser.new("[^xyz]").unit)
        ccu = sole_character_class_of(seq_unit)
        expect(ccu.negate).to be_truthy
        expect(ccu.size).to eq 3
        expect_character(ccu[0], "x")
      end

      it "parses - as a plain character at beginning of a character class" do
        seq_unit = sole_sequence_of(Parser.new("[-9]").unit)
        ccu = sole_character_class_of(seq_unit)
        expect(ccu.size).to eq 2
        expect_character(ccu[0], "-")
      end

      it "parses - as a plain character at end of a character class" do
        seq_unit = sole_sequence_of(Parser.new("[0-]").unit)
        ccu = sole_character_class_of(seq_unit)
        expect(ccu.size).to eq 2
        expect_character(ccu[0], "0")
        expect_character(ccu[1], "-")
      end

      it "parses - as a plain character at beginning of a negated character class" do
        seq_unit = sole_sequence_of(Parser.new("[^-9]").unit)
        ccu = sole_character_class_of(seq_unit)
        expect(ccu.negate).to be_truthy
        expect(ccu.size).to eq 2
        expect_character(ccu[0], "-")
      end

      it "parses . as a plain character in a character class" do
        seq_unit = sole_sequence_of(Parser.new("[.]").unit)
        ccu = sole_character_class_of(seq_unit)
        expect(ccu.negate).to be_falsey
        expect(ccu.size).to eq 1
        expect_character(ccu[0], ".")
      end

      it "parses - as a plain character when escaped in middle of character class" do
        seq_unit = sole_sequence_of(Parser.new("[0\\-9]").unit)
        ccu = sole_character_class_of(seq_unit)
        expect(ccu.negate).to be_falsey
        expect(ccu.size).to eq 3
        expect_character(ccu[0], "0")
        expect_character(ccu[1], "-")
        expect_character(ccu[2], "9")
      end

      it "parses alternates" do
        root = Parser.new("ab|c").unit
        expect(root).to be_a Parser::AlternatesUnit
        expect(root.alternates.size).to eq 2
        expect(root.alternates[0]).to be_a Parser::SequenceUnit
        expect(root.alternates[1]).to be_a Parser::SequenceUnit
        expect(root.alternates[0].size).to eq 2
        expect(root.alternates[1].size).to eq 1
      end

      it "parses a ." do
        seq_unit = sole_sequence_of(Parser.new("a.b").unit)
        expect(seq_unit[0]).to be_a Parser::CharacterRangeUnit
        expect(seq_unit[1]).to be_a Parser::CharacterClassUnit
        expect(seq_unit[1].units.size).to eq 2
        expect(seq_unit[2]).to be_a Parser::CharacterRangeUnit
      end

      it "parses something complex" do
        root = Parser.new("(a|)*|[^^]|\\|v|[x-y]+").unit
        expect(root).to be_a Parser::AlternatesUnit
        expect(root.alternates.size).to eq 4

        # First alternative: (a|)*
        first = root.alternates[0]
        expect(first).to be_a Parser::SequenceUnit
        expect(first.size).to eq 1
        expect(first[0]).to be_a Parser::MultiplicityUnit
        expect(first[0].min_count).to eq 0
        expect(first[0].max_count).to be_nil
        expect(first[0].unit).to be_a Parser::AlternatesUnit
        expect(first[0].unit.alternates.size).to eq 2
        expect(first[0].unit.alternates[0]).to be_a Parser::SequenceUnit
        expect(first[0].unit.alternates[0].size).to eq 1
        expect(first[0].unit.alternates[0][0]).to be_a Parser::CharacterRangeUnit
        expect(first[0].unit.alternates[1]).to be_a Parser::SequenceUnit
        expect(first[0].unit.alternates[1].size).to eq 0

        # Second alternative: [^^]
        second = root.alternates[1]
        expect(second).to be_a Parser::SequenceUnit
        expect(second.size).to eq 1
        expect(second[0]).to be_a Parser::CharacterClassUnit
        expect(second[0].negate).to be_truthy
        expect(second[0].size).to eq 1
        expect(second[0][0]).to be_a Parser::CharacterRangeUnit

        # Third alternative: \|v
        third = root.alternates[2]
        expect(third).to be_a Parser::SequenceUnit
        expect(third.size).to eq 2
        expect_character(third[0], "|")
        expect_character(third[1], "v")

        # Fourth alternative: [x-y]+
        fourth = root.alternates[3]
        expect(fourth).to be_a Parser::SequenceUnit
        expect(fourth.size).to eq 1
        expect(fourth[0]).to be_a Parser::MultiplicityUnit
        expect(fourth[0].min_count).to eq 1
        expect(fourth[0].max_count).to be_nil
        expect(fourth[0].unit).to be_a Parser::CharacterClassUnit
        expect(fourth[0].unit.size).to eq 1
        expect_character(fourth[0].unit[0], "x")
        expect(fourth[0].unit[0].max_code_point).to eq "y".ord
      end
    end
  end
end

333
spec/imbecile/regex_spec.rb Normal file
View File

@ -0,0 +1,333 @@
module Imbecile
  # Specs for the unit tree that Regex.new builds from a pattern string.
  #
  # Most examples parse a pattern without a top-level "|" and therefore share
  # the same outer shape: an AlternatesUnit holding exactly one SequenceUnit.
  # The helper methods below assert that shared shape and hand back the inner
  # unit, so each example only spells out what is specific to its pattern.
  RSpec.describe Regex do
    # Asserts that +regex+ parsed to an AlternatesUnit with exactly one
    # alternate and that this alternate is a SequenceUnit.
    #
    # @param regex [Regex] the parsed expression under test
    # @return [Regex::SequenceUnit] the only alternate
    def sole_sequence(regex)
      top = regex.unit
      expect(top).to be_a(Regex::AlternatesUnit)
      expect(top.alternates.size).to eq(1)
      sequence = top.alternates[0]
      expect(sequence).to be_a(Regex::SequenceUnit)
      sequence
    end

    # Asserts the single-sequence shape (see #sole_sequence) and that the
    # sequence holds exactly one unit, which is a MultiplicityUnit.
    #
    # @param regex [Regex] the parsed expression under test
    # @return [Regex::MultiplicityUnit] the only unit in the sequence
    def sole_multiplicity(regex)
      sequence = sole_sequence(regex)
      expect(sequence.size).to eq(1)
      repeated = sequence[0]
      expect(repeated).to be_a(Regex::MultiplicityUnit)
      repeated
    end

    # Asserts the single-sequence shape (see #sole_sequence) and that the
    # sequence holds exactly one unit, which is a CharacterClassUnit.
    #
    # @param regex [Regex] the parsed expression under test
    # @return [Regex::CharacterClassUnit] the only unit in the sequence
    def sole_character_class(regex)
      sequence = sole_sequence(regex)
      expect(sequence.size).to eq(1)
      char_class = sequence[0]
      expect(char_class).to be_a(Regex::CharacterClassUnit)
      char_class
    end

    # Asserts that +unit+ is a CharacterRangeUnit whose minimum code point is
    # the code point of +char+.
    #
    # @param unit [Object] the unit to inspect
    # @param char [String] single character giving the expected minimum
    def expect_range_starting_at(unit, char)
      expect(unit).to be_a(Regex::CharacterRangeUnit)
      expect(unit.min_code_point).to eq(char.ord)
    end

    it "parses an empty expression" do
      top = Regex.new("").unit
      expect(top).to be_a(Regex::AlternatesUnit)
      expect(top.alternates.size).to eq(1)
      # The single alternate exists but contains no units.
      expect(top.alternates[0].size).to eq(0)
    end

    it "parses a single character unit expression" do
      sequence = sole_sequence(Regex.new("a"))
      expect(sequence.size).to eq(1)
      expect(sequence[0]).to be_a(Regex::CharacterRangeUnit)
    end

    it "parses a group with a single character unit expression" do
      sequence = sole_sequence(Regex.new("(a)"))
      expect(sequence.size).to eq(1)
      # The parenthesised group shows up as a nested AlternatesUnit.
      group = sequence[0]
      expect(group).to be_a(Regex::AlternatesUnit)
      expect(group.alternates.size).to eq(1)
      inner = group.alternates[0]
      expect(inner).to be_a(Regex::SequenceUnit)
      expect(inner[0]).to be_a(Regex::CharacterRangeUnit)
    end

    it "parses a *" do
      repeated = sole_multiplicity(Regex.new("a*"))
      expect(repeated.min_count).to eq(0)
      expect(repeated.max_count).to be_nil
      expect(repeated.unit).to be_a(Regex::CharacterRangeUnit)
    end

    it "parses a +" do
      repeated = sole_multiplicity(Regex.new("a+"))
      expect(repeated.min_count).to eq(1)
      expect(repeated.max_count).to be_nil
      expect(repeated.unit).to be_a(Regex::CharacterRangeUnit)
    end

    it "parses a ?" do
      repeated = sole_multiplicity(Regex.new("a?"))
      expect(repeated.min_count).to eq(0)
      expect(repeated.max_count).to eq(1)
      expect(repeated.unit).to be_a(Regex::CharacterRangeUnit)
    end

    it "parses a multiplicity count" do
      repeated = sole_multiplicity(Regex.new("a{5}"))
      # A bare {n} sets both bounds to n.
      expect(repeated.min_count).to eq(5)
      expect(repeated.max_count).to eq(5)
      expect(repeated.unit).to be_a(Regex::CharacterRangeUnit)
    end

    it "parses a minimum-only multiplicity count" do
      repeated = sole_multiplicity(Regex.new("a{5,}"))
      expect(repeated.min_count).to eq(5)
      expect(repeated.max_count).to be_nil
      expect(repeated.unit).to be_a(Regex::CharacterRangeUnit)
    end

    it "parses a minimum and maximum multiplicity count" do
      repeated = sole_multiplicity(Regex.new("a{5,8}"))
      expect(repeated.min_count).to eq(5)
      expect(repeated.max_count).to eq(8)
      expect(repeated.unit).to be_a(Regex::CharacterRangeUnit)
      expect(repeated.unit.range.first).to eq("a".ord)
    end

    it "parses an escaped *" do
      sequence = sole_sequence(Regex.new("a\\*"))
      # The escaped "*" is a second literal unit, not a repetition of "a".
      expect(sequence.size).to eq(2)
      expect_range_starting_at(sequence[0], "a")
      expect_range_starting_at(sequence[1], "*")
    end

    it "parses an escaped +" do
      sequence = sole_sequence(Regex.new("a\\+"))
      expect(sequence.size).to eq(2)
      expect_range_starting_at(sequence[0], "a")
      expect_range_starting_at(sequence[1], "+")
    end

    it "parses an escaped \\" do
      sequence = sole_sequence(Regex.new("\\\\d"))
      # Two literal units: the backslash itself, then a plain "d".
      expect(sequence.size).to eq(2)
      expect_range_starting_at(sequence[0], "\\")
      expect_range_starting_at(sequence[1], "d")
    end

    it "parses a character class" do
      char_class = sole_character_class(Regex.new("[a-z_]"))
      expect(char_class.negate).to be_falsey
      expect(char_class.size).to eq(2)
      expect_range_starting_at(char_class[0], "a")
      expect(char_class[0].max_code_point).to eq("z".ord)
      expect_range_starting_at(char_class[1], "_")
    end

    it "parses a negated character class" do
      char_class = sole_character_class(Regex.new("[^xyz]"))
      expect(char_class.negate).to be_truthy
      expect(char_class.size).to eq(3)
      expect_range_starting_at(char_class[0], "x")
    end

    it "parses - as a plain character at beginning of a character class" do
      char_class = sole_character_class(Regex.new("[-9]"))
      expect(char_class.size).to eq(2)
      expect_range_starting_at(char_class[0], "-")
    end

    it "parses - as a plain character at end of a character class" do
      char_class = sole_character_class(Regex.new("[0-]"))
      expect(char_class.size).to eq(2)
      expect_range_starting_at(char_class[0], "0")
      expect_range_starting_at(char_class[1], "-")
    end

    it "parses - as a plain character at beginning of a negated character class" do
      char_class = sole_character_class(Regex.new("[^-9]"))
      expect(char_class.negate).to be_truthy
      expect(char_class.size).to eq(2)
      expect_range_starting_at(char_class[0], "-")
    end

    it "parses . as a plain character in a character class" do
      char_class = sole_character_class(Regex.new("[.]"))
      expect(char_class.negate).to be_falsey
      expect(char_class.size).to eq(1)
      expect_range_starting_at(char_class[0], ".")
    end

    it "parses - as a plain character when escaped in middle of character class" do
      char_class = sole_character_class(Regex.new("[0\\-9]"))
      expect(char_class.negate).to be_falsey
      # Three separate members rather than the single range 0-9.
      expect(char_class.size).to eq(3)
      expect_range_starting_at(char_class[0], "0")
      expect_range_starting_at(char_class[1], "-")
      expect_range_starting_at(char_class[2], "9")
    end

    it "parses alternates" do
      top = Regex.new("ab|c").unit
      expect(top).to be_a(Regex::AlternatesUnit)
      branches = top.alternates
      expect(branches.size).to eq(2)
      expect(branches[0]).to be_a(Regex::SequenceUnit)
      expect(branches[1]).to be_a(Regex::SequenceUnit)
      expect(branches[0].size).to eq(2)
      expect(branches[1].size).to eq(1)
    end

    it "parses a ." do
      sequence = sole_sequence(Regex.new("a.b"))
      expect(sequence[0]).to be_a(Regex::CharacterRangeUnit)
      # The "." is represented as a character class made of two units.
      expect(sequence[1]).to be_a(Regex::CharacterClassUnit)
      expect(sequence[1].units.size).to eq(2)
      expect(sequence[2]).to be_a(Regex::CharacterRangeUnit)
    end

    it "parses something complex" do
      top = Regex.new("(a|)*|[^^]|\\|v|[x-y]+").unit
      expect(top).to be_a(Regex::AlternatesUnit)
      branches = top.alternates
      expect(branches.size).to eq(4)

      # Branch 0, "(a|)*": a starred group with one non-empty and one empty
      # alternate.
      starred_seq = branches[0]
      expect(starred_seq).to be_a(Regex::SequenceUnit)
      expect(starred_seq.size).to eq(1)
      star = starred_seq[0]
      expect(star).to be_a(Regex::MultiplicityUnit)
      expect(star.min_count).to eq(0)
      expect(star.max_count).to be_nil
      group = star.unit
      expect(group).to be_a(Regex::AlternatesUnit)
      expect(group.alternates.size).to eq(2)
      expect(group.alternates[0]).to be_a(Regex::SequenceUnit)
      expect(group.alternates[0].size).to eq(1)
      expect(group.alternates[0][0]).to be_a(Regex::CharacterRangeUnit)
      expect(group.alternates[1]).to be_a(Regex::SequenceUnit)
      expect(group.alternates[1].size).to eq(0)

      # Branch 1, "[^^]": a negated class with a single member.
      negated_seq = branches[1]
      expect(negated_seq).to be_a(Regex::SequenceUnit)
      expect(negated_seq.size).to eq(1)
      negated_class = negated_seq[0]
      expect(negated_class).to be_a(Regex::CharacterClassUnit)
      expect(negated_class.negate).to be_truthy
      expect(negated_class.size).to eq(1)
      expect(negated_class[0]).to be_a(Regex::CharacterRangeUnit)

      # Branch 2, "\|v": an escaped "|" literal followed by "v".
      literal_seq = branches[2]
      expect(literal_seq).to be_a(Regex::SequenceUnit)
      expect(literal_seq.size).to eq(2)
      expect_range_starting_at(literal_seq[0], "|")
      expect_range_starting_at(literal_seq[1], "v")

      # Branch 3, "[x-y]+": a one-or-more repetition of a one-range class.
      plus_seq = branches[3]
      expect(plus_seq).to be_a(Regex::SequenceUnit)
      expect(plus_seq.size).to eq(1)
      plus = plus_seq[0]
      expect(plus).to be_a(Regex::MultiplicityUnit)
      expect(plus.min_count).to eq(1)
      expect(plus.max_count).to be_nil
      ranged_class = plus.unit
      expect(ranged_class).to be_a(Regex::CharacterClassUnit)
      expect(ranged_class.size).to eq(1)
      expect_range_starting_at(ranged_class[0], "x")
      expect(ranged_class[0].max_code_point).to eq("y".ord)
    end
  end
end