From 9459883e7423114c80882162e6ae072b0815f997 Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Wed, 18 Aug 2021 17:09:45 -0400 Subject: [PATCH] Add Lexer class; Move LexerDFA to Lexer::DFA --- lib/imbecile.rb | 3 +- lib/imbecile/generator.rb | 2 +- lib/imbecile/lexer.rb | 5 ++ lib/imbecile/lexer/dfa.rb | 89 +++++++++++++++++++ lib/imbecile/lexer_dfa.rb | 87 ------------------ .../{lexer_dfa_spec.rb => lexer/dfa_spec.rb} | 4 +- 6 files changed, 99 insertions(+), 91 deletions(-) create mode 100644 lib/imbecile/lexer.rb create mode 100644 lib/imbecile/lexer/dfa.rb delete mode 100644 lib/imbecile/lexer_dfa.rb rename spec/imbecile/{lexer_dfa_spec.rb => lexer/dfa_spec.rb} (96%) diff --git a/lib/imbecile.rb b/lib/imbecile.rb index 737e779..9790f2e 100644 --- a/lib/imbecile.rb +++ b/lib/imbecile.rb @@ -9,7 +9,8 @@ require_relative "imbecile/generator" require_relative "imbecile/grammar" require_relative "imbecile/grammar/rule" require_relative "imbecile/grammar/token" -require_relative "imbecile/lexer_dfa" +require_relative "imbecile/lexer" +require_relative "imbecile/lexer/dfa" require_relative "imbecile/regex" require_relative "imbecile/regex/nfa" require_relative "imbecile/regex/unit" diff --git a/lib/imbecile/generator.rb b/lib/imbecile/generator.rb index 04062d7..22aab63 100644 --- a/lib/imbecile/generator.rb +++ b/lib/imbecile/generator.rb @@ -25,7 +25,7 @@ module Imbecile unless rule_names["Start"] raise Error.new("Start rule not found") end - lexer_dfa = LexerDFA.new(@grammar.tokens) + lexer_dfa = Lexer::DFA.new(@grammar.tokens) classname = @grammar.classname || File.basename(output_file).sub(%r{[^a-zA-Z0-9].*}, "").capitalize erb = ERB.new(File.read(File.join(File.dirname(File.expand_path(__FILE__)), "../../assets/parser.d.erb")), nil, "<>") result = erb.result(binding.clone) diff --git a/lib/imbecile/lexer.rb b/lib/imbecile/lexer.rb new file mode 100644 index 0000000..7177761 --- /dev/null +++ b/lib/imbecile/lexer.rb @@ -0,0 +1,5 @@ +module Imbecile + class Lexer + + end +end diff --git a/lib/imbecile/lexer/dfa.rb b/lib/imbecile/lexer/dfa.rb new file mode 100644 index 0000000..ab5d3d2 --- /dev/null +++ b/lib/imbecile/lexer/dfa.rb @@ -0,0 +1,89 @@ +module Imbecile + class Lexer + + class DFA < FA + + def initialize(tokens) + super() + start_nfa = Regex::NFA.new + tokens.each do |token| + start_nfa.start_state.add_transition(nil, token.nfa.start_state) + end + @nfa_state_sets = {} + @states = [] + @to_process = Set.new + nil_transition_states = start_nfa.start_state.nil_transition_states + register_nfa_state_set(nil_transition_states) + while @to_process.size > 0 + state_set = @to_process.first + @to_process.delete(state_set) + process_nfa_state_set(state_set) + end + @start_state = @states[0] + end + + private + + def register_nfa_state_set(nfa_state_set) + unless @nfa_state_sets.include?(nfa_state_set) + state_id = @states.size + @nfa_state_sets[nfa_state_set] = state_id + @states << State.new + @to_process << nfa_state_set + end + end + + def process_nfa_state_set(nfa_state_set) + state_id = @nfa_state_sets[nfa_state_set] + state = @states[state_id] + if state_id > 0 + nfa_state_set.each do |nfa_state| + if nfa_state.accepts + if state.accepts + if nfa_state.accepts.id < state.accepts.id + state.accepts = nfa_state.accepts + end + else + state.accepts = nfa_state.accepts + end + end + end + end + transitions = transitions_for(nfa_state_set) + while transitions.size > 0 + subrange = CodePointRange.first_subrange(transitions.map(&:code_point_range)) + dest_nfa_states = transitions.reduce(Set.new) do |result, transition| + if transition.code_point_range.include?(subrange) + result << transition.destination + end + result + end + dest_nfa_states = dest_nfa_states.reduce(Set.new) do |result, dest_nfa_state| + result + dest_nfa_state.nil_transition_states + end + register_nfa_state_set(dest_nfa_states) + dest_state = @states[@nfa_state_sets[dest_nfa_states]] + state.add_transition(subrange, dest_state) + transitions.delete_if do |transition| + transition.code_point_range.last <= subrange.last + end + transitions.map! do |transition| + if transition.code_point_range.first <= subrange.last + Regex::NFA::State::Transition.new(CodePointRange.new(subrange.last + 1, transition.code_point_range.last), transition.destination) + else + transition + end + end + end + end + + def transitions_for(nfa_state_set) + nfa_state_set.reduce([]) do |result, state| + result + state.cp_transitions + end + end + + end + + end +end diff --git a/lib/imbecile/lexer_dfa.rb b/lib/imbecile/lexer_dfa.rb deleted file mode 100644 index 63570d8..0000000 --- a/lib/imbecile/lexer_dfa.rb +++ /dev/null @@ -1,87 +0,0 @@ -module Imbecile - - class LexerDFA < FA - - def initialize(tokens) - super() - start_nfa = Regex::NFA.new - tokens.each do |token| - start_nfa.start_state.add_transition(nil, token.nfa.start_state) - end - @nfa_state_sets = {} - @states = [] - @to_process = Set.new - nil_transition_states = start_nfa.start_state.nil_transition_states - register_nfa_state_set(nil_transition_states) - while @to_process.size > 0 - state_set = @to_process.first - @to_process.delete(state_set) - process_nfa_state_set(state_set) - end - @start_state = @states[0] - end - - private - - def register_nfa_state_set(nfa_state_set) - unless @nfa_state_sets.include?(nfa_state_set) - state_id = @states.size - @nfa_state_sets[nfa_state_set] = state_id - @states << State.new - @to_process << nfa_state_set - end - end - - def process_nfa_state_set(nfa_state_set) - state_id = @nfa_state_sets[nfa_state_set] - state = @states[state_id] - if state_id > 0 - nfa_state_set.each do |nfa_state| - if nfa_state.accepts - if state.accepts - if nfa_state.accepts.id < state.accepts.id - state.accepts = nfa_state.accepts - end - else - state.accepts = nfa_state.accepts - end - end - end - end - transitions = transitions_for(nfa_state_set) - while transitions.size > 0 - subrange = CodePointRange.first_subrange(transitions.map(&:code_point_range)) - dest_nfa_states = transitions.reduce(Set.new) do |result, transition| - if transition.code_point_range.include?(subrange) - result << transition.destination - end - result - end - dest_nfa_states = dest_nfa_states.reduce(Set.new) do |result, dest_nfa_state| - result + dest_nfa_state.nil_transition_states - end - register_nfa_state_set(dest_nfa_states) - dest_state = @states[@nfa_state_sets[dest_nfa_states]] - state.add_transition(subrange, dest_state) - transitions.delete_if do |transition| - transition.code_point_range.last <= subrange.last - end - transitions.map! do |transition| - if transition.code_point_range.first <= subrange.last - Regex::NFA::State::Transition.new(CodePointRange.new(subrange.last + 1, transition.code_point_range.last), transition.destination) - else - transition - end - end - end - end - - def transitions_for(nfa_state_set) - nfa_state_set.reduce([]) do |result, state| - result + state.cp_transitions - end - end - - end - -end diff --git a/spec/imbecile/lexer_dfa_spec.rb b/spec/imbecile/lexer/dfa_spec.rb similarity index 96% rename from spec/imbecile/lexer_dfa_spec.rb rename to spec/imbecile/lexer/dfa_spec.rb index d182dd0..e8b2557 100644 --- a/spec/imbecile/lexer_dfa_spec.rb +++ b/spec/imbecile/lexer/dfa_spec.rb @@ -51,12 +51,12 @@ end def run(grammar, input) g = Imbecile::Grammar.new(grammar) - token_dfa = Imbecile::LexerDFA.new(g.tokens) + token_dfa = Imbecile::Lexer::DFA.new(g.tokens) test_lexer = TestLexer.new(token_dfa) test_lexer.lex(input) end -describe Imbecile::LexerDFA do +describe Imbecile::Lexer::DFA do it "lexes a simple token" do expect(run(<