From 672098ad3222d79d1f7bde4d828199cf74635374 Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Sat, 24 Sep 2022 17:31:40 -0400 Subject: [PATCH] Execute user code blocks assigned to tokens --- assets/parser.d.erb | 61 ++++++++++++++++++++++++++++++++++++++---- lib/propane/grammar.rb | 13 ++++++--- lib/propane/lexer.rb | 7 +++++ lib/propane/pattern.rb | 14 ++++++++++ spec/propane_spec.rb | 15 +++++++++++ spec/test_user_code.d | 20 ++++++++++++++ 6 files changed, 122 insertions(+), 8 deletions(-) create mode 100644 spec/test_user_code.d diff --git a/assets/parser.d.erb b/assets/parser.d.erb index 96e8e68..09ea368 100644 --- a/assets/parser.d.erb +++ b/assets/parser.d.erb @@ -119,6 +119,7 @@ class <%= @classname %> uint transition_table_index; uint n_transitions; uint accepts; + uint code_id; } <% transition_table, state_table = @lexer.build_tables %> @@ -130,7 +131,10 @@ class <%= @classname %> private static const State states[] = [ <% state_table.each do |state_table_entry| %> - State(<%= state_table_entry[:transition_table_index] %>u, <%= state_table_entry[:n_transitions] %>u, <%= state_table_entry[:accepts] %>u), + State(<%= state_table_entry[:transition_table_index] %>u, + <%= state_table_entry[:n_transitions] %>u, + <%= state_table_entry[:accepts] %>u, + <%= state_table_entry[:code_id] %>u), <% end %> ]; @@ -166,6 +170,31 @@ class <%= @classname %> } } + /** + * Execute user code associated with a lexer pattern. + * + * @param code_id The ID of the user code block to execute. + * + * @return Token ID to accept, or _TOKEN_NONE if the user code does + * not explicitly return a token. 
+ */ + private uint user_code(uint code_id) + { + switch (code_id) + { +<% @grammar.patterns.each do |pattern| %> +<% if pattern.code_id %> + case <%= pattern.code_id %>u: { +<%= pattern.code %> + } break; +<% end %> +<% end %> + default: break; + } + + return _TOKEN_NONE; + } + private LexedToken attempt_lex_token() { LexedToken lt = LexedToken(m_input_row, m_input_col, 0, _TOKEN_NONE); @@ -175,6 +204,7 @@ class <%= @classname %> size_t delta_row; size_t delta_col; uint token; + uint code_id = 0xFFFF_FFFFu; } LexedTokenState last_accepts_info; last_accepts_info.token = _TOKEN_NONE; @@ -209,6 +239,7 @@ class <%= @classname %> if (states[current_state].accepts != _TOKEN_NONE) { attempt_info.token = states[current_state].accepts; + attempt_info.code_id = states[current_state].code_id; last_accepts_info = attempt_info; } } @@ -220,10 +251,24 @@ class <%= @classname %> } if (!lex_continue) { - if (last_accepts_info.token != _TOKEN_NONE) + bool pattern_accepted = false; + uint token_to_accept = last_accepts_info.token; + if (last_accepts_info.code_id != 0xFFFF_FFFFu) { - lt.token = last_accepts_info.token; - lt.length = last_accepts_info.length; + uint user_code_token = user_code(last_accepts_info.code_id); + /* A return of _TOKEN_NONE from user_code() means + * that the user code did not explicitly return a + * token. So only override the token to return if the + * user code does explicitly return a token. */ + if (user_code_token != _TOKEN_NONE) + { + token_to_accept = user_code_token; + } + pattern_accepted = true; + } + if (pattern_accepted || (token_to_accept != _TOKEN_NONE)) + { + /* Update the input position tracking. */ m_input_position += last_accepts_info.length; m_input_row += last_accepts_info.delta_row; if (last_accepts_info.delta_row != 0u) @@ -235,7 +280,13 @@ class <%= @classname %> m_input_col += last_accepts_info.delta_col; } } - break; + if (token_to_accept != _TOKEN_NONE) + { + /* We have a token to accept. 
*/ + lt.token = token_to_accept; + lt.length = last_accepts_info.length; + break; + } } } return lt; diff --git a/lib/propane/grammar.rb b/lib/propane/grammar.rb index 2b93d1e..008f439 100644 --- a/lib/propane/grammar.rb +++ b/lib/propane/grammar.rb @@ -12,6 +12,7 @@ class Propane @patterns = [] @tokens = [] @rules = [] + @code_id = 0 input = input.gsub("\r\n", "\n") parse_grammar(input) end @@ -29,7 +30,7 @@ class Propane @modulename = $1 elsif sliced = input.slice!(/\Aclass\s+(\S+)\s*;/) @classname = $1 - elsif sliced = input.slice!(/\Atoken\s+(\S+?)(?:\s+(.+?))?\s*(?:;|<<\n(.*?)^>>\n)/m) + elsif sliced = input.slice!(/\Atoken\s+(\S+?)(?:\s+([^\n]+?))?\s*(?:;|<<\n(.*?)^>>\n)/m) name, pattern, code = $1, $2, $3 if pattern.nil? pattern = name @@ -39,7 +40,13 @@ class Propane end token = Token.new(name: name, id: @tokens.size, line_number: line_number) @tokens << token - pattern = Pattern.new(pattern: pattern, token: token, line_number: line_number) + if code + code_id = @code_id + @code_id += 1 + else + code_id = nil + end + pattern = Pattern.new(pattern: pattern, token: token, line_number: line_number, code: code, code_id: code_id) @patterns << pattern elsif sliced = input.slice!(/\Atokenid\s+(\S+?)\s*;/m) name = $1 @@ -51,7 +58,7 @@ class Propane elsif sliced = input.slice!(/\Adrop\s+(\S+)\s*;/) pattern = $1 @patterns << Pattern.new(pattern: pattern, line_number: line_number, drop: true) - elsif sliced = input.slice!(/\A(\S+)\s*->\s*(.*?)(?:;|<<\n(.*?)^>>\n)/m) + elsif sliced = input.slice!(/\A(\S+)\s*->\s*([^\n]*?)(?:;|<<\n(.*?)^>>\n)/m) rule_name, components, code = $1, $2, $3 unless rule_name =~ /^[a-zA-Z_][a-zA-Z_0-9]*$/ raise Error.new("Invalid rule name #{name.inspect}") diff --git a/lib/propane/lexer.rb b/lib/propane/lexer.rb index 0968308..86a9caf 100644 --- a/lib/propane/lexer.rb +++ b/lib/propane/lexer.rb @@ -22,10 +22,17 @@ class Propane else state.accepts.token.id end + code_id = + if state.accepts && state.accepts.code_id 
state.accepts.code_id + else + 0xFFFF_FFFF + end state_table << { transition_table_index: transition_table.size, n_transitions: state.transitions.size, accepts: accepts, + code_id: code_id, } state.transitions.each do |transition| transition_table << { diff --git a/lib/propane/pattern.rb b/lib/propane/pattern.rb index 2e9ebae..a8f678b 100644 --- a/lib/propane/pattern.rb +++ b/lib/propane/pattern.rb @@ -2,6 +2,14 @@ class Propane class Pattern + # @return [String, nil] + # Code block to execute when the pattern is matched. + attr_reader :code + + # @option options [Integer, nil] :code_id + # Code block ID. + attr_reader :code_id + # @return [String, nil] # Pattern. attr_reader :pattern @@ -22,6 +30,10 @@ class Propane # # @param options [Hash] # Optional parameters. + # @option options [String, nil] :code + # Code block to execute when the pattern is matched. + # @option options [Integer, nil] :code_id + # Code block ID. # @option options [Boolean] :drop # Whether this is a drop pattern. # @option options [String, nil] :pattern @@ -31,6 +43,8 @@ class Propane # @option options [Integer, nil] :line_number # Line number where the token was defined in the input grammar. 
def initialize(options) + @code = options[:code] + @code_id = options[:code_id] @drop = options[:drop] @pattern = options[:pattern] @token = options[:token] diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb index c9548f7..f481f23 100644 --- a/spec/propane_spec.rb +++ b/spec/propane_spec.rb @@ -95,4 +95,19 @@ EOF compile("spec/test_d_lexer2.d") run end + + it "executes user code when matching lexer token" do + write_grammar <> +token def; +Start -> Abcs def; +Abcs -> ; +Abcs -> abc Abcs; +EOF + build_parser + compile("spec/test_user_code.d") + run + end end diff --git a/spec/test_user_code.d b/spec/test_user_code.d new file mode 100644 index 0000000..ad3fd25 --- /dev/null +++ b/spec/test_user_code.d @@ -0,0 +1,20 @@ +import testparser; +import std.stdio; + +int main() +{ + return 0; +} + +unittest +{ + string input = "abcdef"; + auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); + assert(parser.parse() == true); + writeln("pass1"); + + input = "abcabcdef"; + parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); + assert(parser.parse() == true); + writeln("pass2"); +}