diff --git a/assets/parser.d.erb b/assets/parser.d.erb index ff25d2c..41ad6ef 100644 --- a/assets/parser.d.erb +++ b/assets/parser.d.erb @@ -124,14 +124,19 @@ class <%= @classname %> uint code_id; } -<% transition_table, state_table = @lexer.build_tables %> + private struct Mode + { + uint state_table_offset; + } + +<% transition_table, state_table, mode_table = @lexer.build_tables %> private static immutable Transition transitions[] = [ <% transition_table.each do |transition_table_entry| %> Transition(<%= transition_table_entry[:first] %>u, <%= transition_table_entry[:last] %>u, <%= transition_table_entry[:destination] %>u), <% end %> ]; - private static const State states[] = [ + private static immutable State states[] = [ <% state_table.each do |state_table_entry| %> State(<%= state_table_entry[:transition_table_index] %>u, <%= state_table_entry[:n_transitions] %>u, @@ -140,6 +145,12 @@ class <%= @classname %> <% end %> ]; + private static immutable Mode modes[] = [ +<% mode_table.each do |mode_table_entry| %> + Mode(<%= mode_table_entry[:state_table_offset] %>), +<% end %> + ]; + struct LexedToken { size_t row; @@ -153,11 +164,13 @@ class <%= @classname %> private size_t m_input_position; private size_t m_input_row; private size_t m_input_col; + private size_t m_mode; this(const(ubyte) * input, size_t input_length) { m_input = input; m_input_length = input_length; + m_mode = <%= @lexer.mode_id("default") %>; } LexedToken lex_token() @@ -211,7 +224,7 @@ class <%= @classname %> MatchInfo longest_match_info; longest_match_info.token = _TOKEN_COUNT; MatchInfo attempt_match_info; - uint current_state; + uint current_state = modes[m_mode].state_table_offset; for (;;) { auto decoded = Decoder.decode_code_point(&m_input[m_input_position + attempt_match_info.length], m_input_length - m_input_position - attempt_match_info.length); diff --git a/lib/propane/generator.rb b/lib/propane/generator.rb index 8f37416..5645999 100644 --- a/lib/propane/generator.rb +++ b/lib/propane/generator.rb @@ -26,6 +26,17 @@ class Propane private def process_grammar! + # Assign default pattern mode to patterns without a mode assigned. + found_default = false + @grammar.patterns.each do |pattern| + if pattern.mode.nil? + pattern.mode = "default" + found_default = true + end + end + unless found_default + raise Error.new("No patterns found for default mode") + end # Add EOF token. @grammar.tokens << Token.new("$EOF", nil) tokens_by_name = {} @@ -152,6 +163,13 @@ class Propane def expand_code(code) code.gsub(/\$token\(([$\w]+)\)/) do |match| "TOKEN_#{Token.code_name($1)}" + end.gsub(/\$mode\(([a-zA-Z_][a-zA-Z_0-9]*)\)/) do |match| + mode_name = $1 + mode_id = @lexer.mode_id(mode_name) + unless mode_id + raise Error.new("Lexer mode '#{mode_name}' not found") + end + "m_mode = #{mode_id}u" end end diff --git a/lib/propane/lexer.rb b/lib/propane/lexer.rb index 59bbd17..96c2a6f 100644 --- a/lib/propane/lexer.rb +++ b/lib/propane/lexer.rb @@ -1,51 +1,74 @@ class Propane class Lexer - # @return [DFA] - # Lexer DFA. - attr_accessor :dfa - def initialize(grammar) @grammar = grammar - @dfa = DFA.new(grammar.patterns) end def build_tables + @modes = @grammar.patterns.group_by do |pattern| + pattern.mode + end.transform_values do |patterns| + {dfa: DFA.new(patterns)} + end + @modes.each_with_index do |(mode_name, mode_info), index| + mode_info[:id] = index + end transition_table = [] state_table = [] - states = @dfa.enumerate - states.each do |state, id| - token = - if state.accepts.nil? - @grammar.tokens.size - elsif state.accepts.drop? - TOKEN_DROP - elsif state.accepts.token - state.accepts.token.id - else - @grammar.tokens.size - end - code_id = - if state.accepts && state.accepts.code_id - state.accepts.code_id - else - 0xFFFF_FFFF - end - state_table << { - transition_table_index: transition_table.size, - n_transitions: state.transitions.size, - token: token, - code_id: code_id, + mode_table = [] + @modes.each do |mode_name, mode_info| + state_table_offset = state_table.size + mode_table << { + state_table_offset: state_table_offset, } - state.transitions.each do |transition| - transition_table << { - first: transition.code_point_range.first, - last: transition.code_point_range.last, - destination: states[transition.destination], + states = mode_info[:dfa].enumerate + states.each do |state, id| + token = + if state.accepts.nil? + @grammar.tokens.size + elsif state.accepts.drop? + TOKEN_DROP + elsif state.accepts.token + state.accepts.token.id + else + @grammar.tokens.size + end + code_id = + if state.accepts && state.accepts.code_id + state.accepts.code_id + else + 0xFFFF_FFFF + end + state_table << { + transition_table_index: transition_table.size, + n_transitions: state.transitions.size, + token: token, + code_id: code_id, } + state.transitions.each do |transition| + transition_table << { + first: transition.code_point_range.first, + last: transition.code_point_range.last, + destination: states[transition.destination] + state_table_offset, + } + end end end - [transition_table, state_table] + [transition_table, state_table, mode_table] + end + + # Get ID for a mode. + # + # @param mode_name [String] + # Mode name. + # + # @return [Integer, nil] + # Mode ID. + def mode_id(mode_name) + if mode_info = @modes[mode_name] + mode_info[:id] + end end end diff --git a/lib/propane/pattern.rb b/lib/propane/pattern.rb index 80b8048..a111eac 100644 --- a/lib/propane/pattern.rb +++ b/lib/propane/pattern.rb @@ -28,7 +28,7 @@ class Propane # @return [String, nil] # Lexer mode for this pattern. - attr_reader :mode + attr_accessor :mode # Construct a Pattern. # diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb index 64e4779..2e75d96 100644 --- a/spec/propane_spec.rb +++ b/spec/propane_spec.rb @@ -192,4 +192,37 @@ EOF "def!", ]) end + + it "supports lexer modes" do + write_grammar <> +string: /[^"]+/ << + writeln("captured string"); +>> +string: /"/ << + $mode(default); + return $token(string); +>> +Start -> abc string def; +EOF + build_parser + compile("spec/test_lexer_modes.d") + results = run + expect(results.status).to eq 0 + verify_lines(results.stdout, [ + "begin string mode", + "captured string", + "pass1", + "begin string mode", + "captured string", + "pass2", + ]) + end end diff --git a/spec/test_lexer_modes.d b/spec/test_lexer_modes.d new file mode 100644 index 0000000..c1f7b27 --- /dev/null +++ b/spec/test_lexer_modes.d @@ -0,0 +1,20 @@ +import testparser; +import std.stdio; + +int main() +{ + return 0; +} + +unittest +{ + string input = `abc "a string" def`; + auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); + assert(parser.parse() == true); + writeln("pass1"); + + input = `abc "abc def" def`; + parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); + assert(parser.parse() == true); + writeln("pass2"); +}