Execute user code blocks assigned to tokens
parent 92ce30f354
commit 672098ad32
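This change lets a token definition in the grammar carry a user code block. The generator assigns each block a numeric code_id, records it in the lexer state table, and the generated lexer runs the block through a new user_code() helper whenever the matching pattern is accepted; the value the block returns (a token ID, or _TOKEN_NONE when it does not explicitly return one) feeds into which token, if any, is accepted. A minimal grammar using the new syntax, lifted from the spec added at the bottom of this diff:

    token abc <<
      writeln("abc!");
    >>
    token def;
    Start -> Abcs def;
    Abcs -> ;
    Abcs -> abc Abcs;

The first group of hunks edits the ERB template that generates the D lexer class: the State table entry gains a code_id field, the per-attempt lexer state tracks the code_id of the last accepting state, and attempt_lex_token() calls user_code() when the accepted pattern has an attached block (code_id != 0xFFFF_FFFF).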
@@ -119,6 +119,7 @@ class <%= @classname %>
         uint transition_table_index;
         uint n_transitions;
         uint accepts;
+        uint code_id;
     }

 <% transition_table, state_table = @lexer.build_tables %>
@@ -130,7 +131,10 @@ class <%= @classname %>

     private static const State states[] = [
 <% state_table.each do |state_table_entry| %>
-        State(<%= state_table_entry[:transition_table_index] %>u, <%= state_table_entry[:n_transitions] %>u, <%= state_table_entry[:accepts] %>u),
+        State(<%= state_table_entry[:transition_table_index] %>u,
+              <%= state_table_entry[:n_transitions] %>u,
+              <%= state_table_entry[:accepts] %>u,
+              <%= state_table_entry[:code_id] %>u),
 <% end %>
     ];

@@ -166,6 +170,31 @@ class <%= @classname %>
         }
     }

+    /**
+     * Execute user code associated with a lexer pattern.
+     *
+     * @param code_id The ID of the user code block to execute.
+     *
+     * @return Token ID to accept, or _TOKEN_NONE if the user code does
+     *   not explicitly return a token.
+     */
+    private uint user_code(uint code_id)
+    {
+        switch (code_id)
+        {
+<% @grammar.patterns.each do |pattern| %>
+<% if pattern.code_id %>
+            case <%= pattern.code_id %>u: {
+                <%= pattern.code %>
+            } break;
+<% end %>
+<% end %>
+            default: break;
+        }
+
+        return _TOKEN_NONE;
+    }
+
     private LexedToken attempt_lex_token()
     {
         LexedToken lt = LexedToken(m_input_row, m_input_col, 0, _TOKEN_NONE);
@@ -175,6 +204,7 @@ class <%= @classname %>
             size_t delta_row;
             size_t delta_col;
             uint token;
+            uint code_id;
         }
         LexedTokenState last_accepts_info;
         last_accepts_info.token = _TOKEN_NONE;
@@ -209,6 +239,7 @@ class <%= @classname %>
                 if (states[current_state].accepts != _TOKEN_NONE)
                 {
                     attempt_info.token = states[current_state].accepts;
+                    attempt_info.code_id = states[current_state].code_id;
                     last_accepts_info = attempt_info;
                 }
             }
@@ -220,10 +251,24 @@ class <%= @classname %>
             }
             if (!lex_continue)
             {
-                if (last_accepts_info.token != _TOKEN_NONE)
+                bool pattern_accepted = false;
+                uint token_to_accept = last_accepts_info.token;
+                if (last_accepts_info.code_id != 0xFFFF_FFFFu)
                 {
-                    lt.token = last_accepts_info.token;
-                    lt.length = last_accepts_info.length;
+                    uint user_code_token = user_code(last_accepts_info.code_id);
+                    /* A return of _TOKEN_NONE from user_code() means
+                     * that the user code did not explicitly return a
+                     * token. So only override the token to return if the
+                     * user code does explicitly return a token. */
+                    if (user_code_token != _TOKEN_NONE)
+                    {
+                        token_to_accept = user_code_token;
+                    }
+                    pattern_accepted = true;
+                }
+                if (pattern_accepted || (token_to_accept != _TOKEN_NONE))
+                {
+                    /* Update the input position tracking. */
                     m_input_position += last_accepts_info.length;
                     m_input_row += last_accepts_info.delta_row;
                     if (last_accepts_info.delta_row != 0u)
@@ -235,9 +280,15 @@ class <%= @classname %>
                         m_input_col += last_accepts_info.delta_col;
                     }
                 }
+                if (token_to_accept != _TOKEN_NONE)
+                {
+                    /* We have a token to accept. */
+                    lt.token = last_accepts_info.token;
+                    lt.length = last_accepts_info.length;
                     break;
                 }
             }
+        }
         return lt;
     }

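The remaining hunks move to the Ruby side. In the grammar parser, a running @code_id counter is introduced, the token and rule regexes use [^\n] in place of . so that an inline pattern or rule body stops at the end of its own line (with the /m flag, . would otherwise be free to run across newlines), and when a token definition includes a code block its Pattern is built with the block text and a freshly assigned code_id.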
@@ -12,6 +12,7 @@ class Propane
       @patterns = []
       @tokens = []
       @rules = []
+      @code_id = 0
       input = input.gsub("\r\n", "\n")
       parse_grammar(input)
     end
@@ -29,7 +30,7 @@ class Propane
           @modulename = $1
         elsif sliced = input.slice!(/\Aclass\s+(\S+)\s*;/)
           @classname = $1
-        elsif sliced = input.slice!(/\Atoken\s+(\S+?)(?:\s+(.+?))?\s*(?:;|<<\n(.*?)^>>\n)/m)
+        elsif sliced = input.slice!(/\Atoken\s+(\S+?)(?:\s+([^\n]+?))?\s*(?:;|<<\n(.*?)^>>\n)/m)
           name, pattern, code = $1, $2, $3
           if pattern.nil?
             pattern = name
@@ -39,7 +40,13 @@ class Propane
           end
           token = Token.new(name: name, id: @tokens.size, line_number: line_number)
           @tokens << token
-          pattern = Pattern.new(pattern: pattern, token: token, line_number: line_number)
+          if code
+            code_id = @code_id
+            @code_id += 1
+          else
+            code_id = nil
+          end
+          pattern = Pattern.new(pattern: pattern, token: token, line_number: line_number, code: code, code_id: code_id)
           @patterns << pattern
         elsif sliced = input.slice!(/\Atokenid\s+(\S+?)\s*;/m)
           name = $1
@@ -51,7 +58,7 @@ class Propane
         elsif sliced = input.slice!(/\Adrop\s+(\S+)\s*;/)
           pattern = $1
           @patterns << Pattern.new(pattern: pattern, line_number: line_number, drop: true)
-        elsif sliced = input.slice!(/\A(\S+)\s*->\s*(.*?)(?:;|<<\n(.*?)^>>\n)/m)
+        elsif sliced = input.slice!(/\A(\S+)\s*->\s*([^\n]*?)(?:;|<<\n(.*?)^>>\n)/m)
           rule_name, components, code = $1, $2, $3
           unless rule_name =~ /^[a-zA-Z_][a-zA-Z_0-9]*$/
             raise Error.new("Invalid rule name #{name.inspect}")
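In the lexer table builder, each state table entry gains a code_id column: the code_id of the accepting pattern, or the sentinel 0xFFFF_FFFF when that pattern has no code block. This is the same sentinel the generated D lexer checks before calling user_code().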
@@ -22,10 +22,17 @@ class Propane
           else
             state.accepts.token.id
           end
+        code_id =
+          if state.accepts && state.accepts.code_id
+            state.accepts.code_id
+          else
+            0xFFFF_FFFF
+          end
         state_table << {
           transition_table_index: transition_table.size,
           n_transitions: state.transitions.size,
           accepts: accepts,
+          code_id: code_id,
         }
         state.transitions.each do |transition|
           transition_table << {
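The Pattern class grows matching code and code_id attributes, along with the corresponding constructor options and documentation.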
@@ -2,6 +2,14 @@ class Propane

   class Pattern

+    # @return [String, nil]
+    #   Code block to execute when the pattern is matched.
+    attr_reader :code
+
+    # @return [Integer, nil]
+    #   Code block ID.
+    attr_reader :code_id
+
     # @return [String, nil]
     #   Pattern.
     attr_reader :pattern
@@ -22,6 +30,10 @@ class Propane
     #
     # @param options [Hash]
     #   Optional parameters.
+    # @option options [String, nil] :code
+    #   Code block to execute when the pattern is matched.
+    # @option options [Integer, nil] :code_id
+    #   Code block ID.
     # @option options [Boolean] :drop
     #   Whether this is a drop pattern.
     # @option options [String, nil] :pattern
@@ -31,6 +43,8 @@ class Propane
     # @option options [Integer, nil] :line_number
     #   Line number where the token was defined in the input grammar.
     def initialize(options)
+      @code = options[:code]
+      @code_id = options[:code_id]
       @drop = options[:drop]
       @pattern = options[:pattern]
       @token = options[:token]
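A new spec drives the feature end to end: it writes a grammar whose abc token carries a writeln("abc!") code block, builds the parser, and compiles and runs the D test program added below.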
@@ -95,4 +95,19 @@ EOF
     compile("spec/test_d_lexer2.d")
     run
   end
+
+  it "executes user code when matching lexer token" do
+    write_grammar <<EOF
+token abc <<
+  writeln("abc!");
+>>
+token def;
+Start -> Abcs def;
+Abcs -> ;
+Abcs -> abc Abcs;
+EOF
+    build_parser
+    compile("spec/test_user_code.d")
+    run
+  end
 end
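The new D test program parses two inputs, "abcdef" and "abcabcdef", and asserts that both succeed; each match of the abc token should also print "abc!" via the token's code block.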
spec/test_user_code.d (new file, 20 lines)
@@ -0,0 +1,20 @@
+import testparser;
+import std.stdio;
+
+int main()
+{
+    return 0;
+}
+
+unittest
+{
+    string input = "abcdef";
+    auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
+    assert(parser.parse() == true);
+    writeln("pass1");
+
+    input = "abcabcdef";
+    parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length);
+    assert(parser.parse() == true);
+    writeln("pass2");
+}