Allow storing a result value for a token from a lexer code block

2022-10-16 21:40:25 -04:00 · 2022-10-16 21:40:25 -04:00 · bca0a14371
commit bca0a14371
parent ca8a360c0e
4 changed files with 74 additions and 19 deletions
--- a/assets/parser.d.erb
+++ b/assets/parser.d.erb
@@ -156,6 +156,7 @@ class <%= @classname %>
            size_t col;
            size_t length;
            uint token;
+            <%= @grammar.result_type %> result;
        }
        private string m_input;
@@ -187,18 +188,19 @@ class <%= @classname %>
         *
         * @param code_id The ID of the user code block to execute.
         * @param match Matched text for this pattern.
+         * @param lt LexedToken lexer result in progress.
         *
         * @return Token ID to accept, or _TOKEN_COUNT if the user code does
         *   not explicitly return a token.
         */
-        private uint user_code(uint code_id, string match)
+        private uint user_code(uint code_id, string match, LexedToken * lt)
        {
            switch (code_id)
            {
 <% @grammar.patterns.each do |pattern| %>
 <%   if pattern.code_id %>
            case <%= pattern.code_id %>u: {
-<%= expand_code(pattern.code) %>
+<%= expand_code(pattern.code, false) %>
            } break;
 <%   end %>
 <% end %>
@@ -210,7 +212,10 @@ class <%= @classname %>
        private LexedToken attempt_lex_token()
        {
-            LexedToken lt = LexedToken(m_input_row, m_input_col, 0, _TOKEN_COUNT);
+            LexedToken lt;
+            lt.row = m_input_row;
+            lt.col = m_input_col;
+            lt.token = _TOKEN_COUNT;
            struct MatchInfo
            {
                size_t length;
@@ -269,7 +274,7 @@ class <%= @classname %>
                    uint token_to_accept = longest_match_info.token;
                    if (longest_match_info.code_id != 0xFFFF_FFFFu)
                    {
-                        uint user_code_token = user_code(longest_match_info.code_id, m_input[m_input_position..(m_input_position + longest_match_info.length)]);
+                        uint user_code_token = user_code(longest_match_info.code_id, m_input[m_input_position..(m_input_position + longest_match_info.length)], &lt);
                        /* A return of _TOKEN_COUNT from user_code() means
                         * that the user code did not explicitly return a
                         * token. So only override the token to return if the
@@ -417,6 +422,7 @@ class <%= @classname %>
                    {
                        /* We shifted a token, mark it consumed. */
                        token = _TOKEN_COUNT;
+                        stateresults[$-1].result = lexed_token.result;
                    }
                    else
                    {
@@ -520,7 +526,7 @@ class <%= @classname %>
 <% @grammar.rules.each do |rule| %>
 <%   if rule.code %>
            case <%= rule.id %>u: {
-<%= expand_code(rule.code) %>
+<%= expand_code(rule.code, true) %>
            } break;
 <%   end %>
 <% end %>
--- a/lib/propane/generator.rb
+++ b/lib/propane/generator.rb
@@ -157,25 +157,37 @@ class Propane
    #
    # @param code [String]
    #   User code block.
+    # @param parser [Boolean]
+    #   Whether the user code is for the parser or lexer.
    #
    # @return [String]
    #   Expanded user code block.
-    def expand_code(code)
+    def expand_code(code, parser)
-      code.gsub(/\$token\(([$\w]+)\)/) do |match|
+      code = code.gsub(/\$token\(([$\w]+)\)/) do |match|
        "TOKEN_#{Token.code_name($1)}"
-      end.gsub(/\$mode\(([a-zA-Z_][a-zA-Z_0-9]*)\)/) do |match|
-        mode_name = $1
-        mode_id = @lexer.mode_id(mode_name)
-        unless mode_id
-          raise Error.new("Lexer mode '#{mode_name}' not found")
-        end
-        "m_mode = #{mode_id}u"
-      end.gsub(/\$\$/) do |match|
-        "_result"
-      end.gsub(/\$(\d+)/) do |match|
-        index = $1.to_i
-        "stateresults[$-1-n_states+#{index}].result"
      end
+      if parser
+        code = code.gsub(/\$\$/) do |match|
+          "_result"
+        end
+        code = code.gsub(/\$(\d+)/) do |match|
+          index = $1.to_i
+          "stateresults[$-1-n_states+#{index}].result"
+        end
+      else
+        code = code.gsub(/\$\$/) do |match|
+          "lt.result"
+        end
+        code = code.gsub(/\$mode\(([a-zA-Z_][a-zA-Z_0-9]*)\)/) do |match|
+          mode_name = $1
+          mode_id = @lexer.mode_id(mode_name)
+          unless mode_id
+            raise Error.new("Lexer mode '#{mode_name}' not found")
+          end
+          "m_mode = #{mode_id}u"
+        end
+      end
+      code
    end
  end
--- a/spec/propane_spec.rb
+++ b/spec/propane_spec.rb
@@ -317,4 +317,21 @@ EOF
      "pass1",
    ])
  end
+  it "allows storing a result value for the lexer" do
+    write_grammar <<EOF
+result_type ulong;
+token word /[a-z]+/ <<
+  $$ = match.length;
+>>
+Start -> word <<
+  $$ = $1;
+>>
+EOF
+    build_parser
+    compile("spec/test_lexer_result_value.d")
+    results = run
+    expect(results.stderr).to eq ""
+    expect(results.status).to eq 0
+  end
 end
--- a/spec/test_lexer_result_value.d
+++ b/spec/test_lexer_result_value.d
@@ -0,0 +1,20 @@
+import testparser;
+import std.stdio;
+int main()
+{
+    return 0;
+}
+unittest
+{
+    string input = `x`;
+    auto parser = new Testparser.Parser(input);
+    assert(parser.parse() == true);
+    assert(parser.result == 1u);
+    input = `fabulous`;
+    parser = new Testparser.Parser(input);
+    assert(parser.parse() == true);
+    assert(parser.result == 8u);
+}