Allow user to specify custom token node fields

2026-02-21 21:06:40 -05:00 · 2026-02-21 21:06:40 -05:00 · d4ad67c23d
commit d4ad67c23d
parent ff61dd05d9
9 changed files with 295 additions and 13 deletions
--- a/assets/parser.c.erb
+++ b/assets/parser.c.erb
@ -75,6 +75,10 @@ const char * <%= @grammar.prefix %>token_names[] = {
 /**
 * Deinitialize and deallocate lexer/parser context structure.
 *
 * For C++, destructors will be called for any context user fields. However, if
 * pointers are used to store allocated resources, the user should free them
 * before calling this function.
 *
 * @param context
 *   Lexer/parser context structure allocated with <%= @grammar.prefix %>context_new().
 */
@ -1014,13 +1018,18 @@ static size_t parse_from(<%= @grammar.prefix %>context_t * context, size_t start
            {
                /* We shifted a token, mark it consumed. */
 <% if @grammar.tree %>
 <%   if @cpp %>
                <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = new <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>();
 <%   else %>
                <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = (<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *)malloc(sizeof(<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>));
 <%   end %>
                token_tree_node->position = token_info.position;
                token_tree_node->end_position = token_info.end_position;
                token_tree_node->n_fields = 0u;
                token_tree_node->is_token = 1u;
                token_tree_node->token = token;
                token_tree_node->pvalue = token_info.pvalue;
 <%= expand_code(@grammar.on_token_node, false, nil, nil) %>
                state_values_stack_index(&statevalues, -1)->tree_node = token_tree_node;
 <% else %>
                state_values_stack_index(&statevalues, -1)->pvalue = token_info.pvalue;
@ -1210,10 +1219,16 @@ static void tree_delete(TreeNode * node)
 {
    if (node->is_token)
    {
        <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = (<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *) node;
 <%   if @grammar.free_token_node %>
-        <%= @grammar.free_token_node %>((<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *) node);
+        <%= @grammar.free_token_node %>(token_tree_node);
 <%   end %>
 <%= expand_code(@grammar.free_token_user_fields, false, nil, nil) %>
 <%   if @cpp %>
        delete token_tree_node;
 <%   else %>
        free(token_tree_node);
 <%   end %>
        /* TODO: free value_t */
    }
    else if (node->n_fields > 0u)
    {
@ -1224,8 +1239,8 @@ static void tree_delete(TreeNode * node)
                tree_delete(node->fields[i]);
            }
        }
    }
        free(node);
    }
 }
 /**
--- a/assets/parser.d.erb
+++ b/assets/parser.d.erb
@ -103,9 +103,9 @@ public struct <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>
    /* TreeNode fields must be present in the same order here. */
    <%= @grammar.prefix %>position_t position;
    <%= @grammar.prefix %>position_t end_position;
-<%= @grammar.token_user_fields %>
    <%= @grammar.prefix %>token_t token;
    <%= @grammar.prefix %>value_t pvalue;
+<%= @grammar.token_user_fields %>
 }
 <%   @parser.rule_sets.each do |name, rule_set| %>
@ -1055,6 +1055,7 @@ private size_t parse_from(<%= @grammar.prefix %>context_t * context, size_t star
                /* We shifted a token, mark it consumed. */
 <% if @grammar.tree %>
                <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = new <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>(token_info.position, token_info.end_position, token, token_info.pvalue);
 <%= expand_code(@grammar.on_token_node, false, nil, nil) %>
                statevalues[$-1].tree_node = token_tree_node;
 <% else %>
                statevalues[$-1].pvalue = token_info.pvalue;
--- a/doc/user_guide.md
+++ b/doc/user_guide.md
@ -256,6 +256,95 @@ If a pointer to any allocated memory is stored in a user-defined context field,
 it is up to the user to free any memory when the program is finished using the
 context structure.
 ### Custom token fields code blocks: the `token_user_fields` statement
 When tree generation mode is active, Propane generates a tree node structure
 and a token node structure for each matching rule and token instance in the
 input string.
 The user may add custom fields to token tree nodes using the `token_user_fields`
 statement.
 The code block supplied to the `token_user_fields` statement is inserted in the `struct`
 generated by the parser to hold a token tree node.
 Example (D/C++):
 ```
 token_user_fields <<
    string mytokenval;
 >>
 ```
 The `on_token_node` statement can be used to provide code that initializes
 any token user fields when a token tree node instance is created.
 For example (C++):
 ```
 context_user_fields <<
    std::string comments;
 >>
 token_user_fields <<
    std::string comments;
 >>
 on_token_node <<
    ${token.comments} = ${context.comments};
    ${context.comments} = "";
 >>
 drop /#(.*)\n/ <<
    /* Accumulate comments before the next parser tree node. */
    ${context.comments} += std::string((const char *)match, match_length);
 >>
 ```
 If a pointer to any allocated memory is stored in a user-defined token field,
 the `free_token_user_fields` statement can be used to supply a code block which
 will be executed immediately before the token node is freed.
 For C++, the `delete` statement is used to free the token tree node, so the
 destructor for any custom token user fields will be called.
 ### Custom initialization of a token tree node - the `on_token_node` statement
 The `on_token_node` statement can be used to provide code that initializes
 any token user fields when a token tree node instance is created.
 For example (C++):
 ```
 context_user_fields <<
    std::string comments;
 >>
 token_user_fields <<
    std::string comments;
 >>
 on_token_node <<
    ${token.comments} = ${context.comments};
    ${context.comments} = "";
 >>
 drop /#(.*)\n/ <<
    /* Accumulate comments before the next parser tree node. */
    ${context.comments} += std::string((const char *)match, match_length);
 >>
 ```
 ### Freeing allocated memory in a custom token user field - the `free_token_user_fields` statement
 The `free_token_user_fields` statement allows the user to provide a code block
 which will be executed immediately prior to freeing the token tree node.
 For example (C):
 ```
 token_user_fields <<
    char * comments;
 >>
 on_token_node <<
    ${token.comments} = (char *)malloc(some_len);
 >>
 free_token_user_fields <<
    free(${token.comments});
 >>
 ```
 ##> Tree generation mode - the `tree` statement
 To activate tree generation mode, place the `tree` statement in your grammar file:
--- a/extra/vim/syntax/propane.vim
+++ b/extra/vim/syntax/propane.vim
@ -20,7 +20,7 @@ syn match propaneOperator "->"
 syn match propaneFieldAlias ":[a-zA-Z0-9_]\+" contains=propaneFieldOperator
 syn match propaneFieldOperator ":" contained
 syn match propaneOperator "?"
-syn keyword propaneKeyword drop free_token_node module prefix ptype start token tokenid tree tree_prefix tree_suffix
+" Statement keywords of the Propane grammar language, including the user-field
+" statements (context_user_fields, token_user_fields, on_token_node,
+" free_token_user_fields).
+syn keyword propaneKeyword context_user_fields drop free_token_node free_token_user_fields module on_token_node prefix ptype start token token_user_fields tokenid tree tree_prefix tree_suffix
 syn region propaneRegex start="/" end="/" skip="\v\\\\|\\/"
--- a/lib/propane/generator.rb
+++ b/lib/propane/generator.rb
@ -13,8 +13,13 @@ class Propane
      @language =
        if output_file.end_with?(".d")
          "d"
-        else
+        elsif output_file.end_with?(".c")
          "c"
        elsif output_file =~ %r{\.(cc|cpp|cxx)$}
          @cpp = true
          "c"
        else
          raise Error.new("Could not determine target language from output file name (#{output_file})")
        end
      @options = options
      process_grammar!
@ -274,6 +279,15 @@ class Propane
          "context.#{fieldname}"
        end
      end
      code = code.gsub(/\$\{token\.(\w+)\}/) do |match|
        fieldname = $1
        case @language
        when "c"
          "token_tree_node->#{fieldname}"
        when "d"
          "token_tree_node.#{fieldname}"
        end
      end
      if parser
        code = code.gsub(/\$\$/) do |match|
          case @language
--- a/lib/propane/grammar.rb
+++ b/lib/propane/grammar.rb
@ -18,8 +18,9 @@ class Propane
    attr_reader :code_blocks
    attr_reader :ptypes
    attr_reader :prefix
-    attr_reader :token_node
+    attr_reader :on_token_node
    attr_reader :token_user_fields
    attr_reader :free_token_user_fields
    def initialize(input)
      @patterns = []
@ -38,8 +39,9 @@ class Propane
      @tree_suffix = ""
      @free_token_node = nil
      @context_user_fields = nil
-      @token_node = nil
+      @on_token_node = ""
      @token_user_fields = nil
      @free_token_user_fields = ""
      parse_grammar!
      @start_rules << "Start" if @start_rules.empty?
    end
@ -74,8 +76,9 @@ class Propane
      elsif parse_tree_suffix_statement!
      elsif parse_free_token_node_statement!
      elsif parse_module_statement!
-      elsif parse_token_node_statement!
+      elsif parse_on_token_node_statement!
      elsif parse_token_user_fields_statement!
      elsif parse_free_token_user_fields_statement!
      elsif parse_ptype_statement!
      elsif parse_pattern_statement!
      elsif parse_start_statement!
@ -151,13 +154,12 @@ class Propane
      end
    end
-    def parse_token_node_statement!
+    # Parse an `on_token_node` statement: when `consume!` matches the keyword,
+    # the code block that follows is appended to @on_token_node, so repeated
+    # statements accumulate. Raises Error if no code block follows the keyword.
+    def parse_on_token_node_statement!
-      if md = consume!(/token_node\b\s*/)
+      if md = consume!(/on_token_node\b\s*/)
         unless code = parse_code_block!
           raise Error.new("Line #{@line_number}: expected code block")
         end
-        @token_node ||= ""
-        @token_node += code
+        @on_token_node += code
       end
     end
@ -171,6 +173,15 @@ class Propane
      end
    end
    # Parse a `free_token_user_fields` statement.
    #
    # When `consume!` matches the keyword, the code block that follows is
    # appended to @free_token_user_fields, so repeated statements accumulate.
    #
    # Raises Error when the keyword is not followed by a code block.
    # Returns nil when the keyword is not matched, otherwise the accumulated code.
    def parse_free_token_user_fields_statement!
      return nil unless consume!(/free_token_user_fields\b\s*/)
      block = parse_code_block!
      raise Error.new("Line #{@line_number}: expected code block") unless block
      @free_token_user_fields += block
    end
    def parse_ptype_statement!
      if consume!(/ptype\s+/)
        name = "default"
--- a/spec/propane_spec.rb
+++ b/spec/propane_spec.rb
@ -14,6 +14,7 @@ describe Propane do
  end
  def run_propane(options = {})
    options[:language] ||= "d"
    @statics[:build_test_id] ||= 0
    @statics[:build_test_id] += 1
    if ENV["dist_specs"]
@ -1604,6 +1605,103 @@ EOF
        expect(results.stderr).to include %r{acount: 11\n}
        expect(results.status).to eq 0
      end
      # Generates a parser from a grammar that declares a `comments` field on
      # both the context and the token tree node, then builds and runs the
      # matching spec/test_token_user_fields.<language> program and expects it
      # to exit with status 0. One grammar is written per target language.
      it "allows custom token user fields" do
        if language == "d"
          # D: comments are accumulated in a `string` and handed to each token
          # node by on_token_node, which then resets the context field.
          write_grammar <<EOF
 context_user_fields <<
    string comments;
 >>
 token_user_fields <<
    string comments;
 >>
 on_token_node <<
    ${token.comments} = ${context.comments};
    ${context.comments} = "";
 >>
 tree;
 drop /\\s+/;
 drop /#(.*)\\n/ <<
    ${context.comments} ~= match;
 >>
 token id /\\w+/;
 Start -> IDs;
 IDs -> ;
 IDs -> id IDs;
 EOF
        elsif language == "c"
          # C: comments live in malloc'd buffers; the token node takes over the
          # context's buffer and free_token_user_fields releases it.
          write_grammar <<EOF
 <<
 #include <string.h>
 #include <stdlib.h>
 >>
 context_user_fields <<
    char * comments;
 >>
 token_user_fields <<
    char * comments;
 >>
 free_token_user_fields <<
    free(${token.comments});
 >>
 on_token_node <<
    ${token.comments} = ${context.comments};
    ${context.comments} = (char *)malloc(1);
    ${context.comments}[0] = '\\0';
 >>
 tree;
 drop /\\s+/;
 drop /#(.*)\\n/ <<
    size_t cur_len = 0u;
    if (${context.comments} != NULL)
        cur_len = strlen(${context.comments});
    char * commentsnew = (char *)malloc(cur_len + match_length + 1);
    if (${context.comments} != NULL)
        memcpy(commentsnew, ${context.comments}, cur_len);
    memcpy(&commentsnew[cur_len], match, match_length);
    commentsnew[cur_len + match_length] = '\\0';
    if (${context.comments} != NULL)
    {
        free(${context.comments});
    }
    ${context.comments} = commentsnew;
 >>
 token id /\\w+/;
 Start -> IDs;
 IDs -> ;
 IDs -> id IDs;
 EOF
        else # C++
          # C++: comments are std::string values, so no free_token_user_fields
          # block is supplied in this grammar.
          write_grammar <<EOF
 <<header
 #include <string>
 >>
 context_user_fields <<
    std::string comments;
 >>
 token_user_fields <<
    std::string comments;
 >>
 on_token_node <<
    ${token.comments} = ${context.comments};
    ${context.comments} = "";
 >>
 tree;
 drop /\\s+/;
 drop /#(.*)\\n/ <<
    ${context.comments} += std::string((const char *)match, match_length);
 >>
 token id /\\w+/;
 Start -> IDs;
 IDs -> ;
 IDs -> id IDs;
 EOF
        end
        # Generate the parser, build the language-specific test program and run it.
        run_propane(language: language)
        compile("spec/test_token_user_fields.#{language}", language: language)
        results = run_test(language: language)
        expect(results.status).to eq 0
      end
    end
  end
 end
--- a/spec/test_token_user_fields.c
+++ b/spec/test_token_user_fields.c
@ -0,0 +1,30 @@
 #include "testparser.h"
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 /*
  * Parse an input that mixes comment lines with two identifier tokens using
  * the generated test parser, then release the context and the resulting tree.
  * Returns 0 when the parse assertion holds.
  */
 int main()
 {
    /* Two comment lines precede each of the identifiers "first" and "second". */
    char const * text =
        "# c1\n"
        "#  c2\n"
        "\n"
        "first\n"
        "\n   \n  \n"
        "  # s1\n"
        "   #   s2\n"
        "second\n";
    size_t const text_length = strlen(text);
    p_context_t * ctx = p_context_new((uint8_t const *)text, text_length);
    assert(p_parse(ctx) == P_SUCCESS);
    Start * tree = p_result(ctx);
 #ifndef __cplusplus
    /* Only the C build releases the context's comments field by hand. */
    free(ctx->comments);
 #endif
    p_context_delete(ctx);
    p_tree_delete(tree);
    return 0;
 }
--- a/spec/test_token_user_fields.d
+++ b/spec/test_token_user_fields.d
@ -0,0 +1,24 @@
 import testparser;
 import std.stdio;
 /* Program entry point: does no work itself; the parser check is in the unittest block below. */
 int main()
 {
    return 0;
 }
 /*
  * Parse an input that mixes comment lines with two identifier tokens and
  * check that the generated parser reports success.
  */
 unittest
 {
    /* Two comment lines precede each of the identifiers "first" and "second". */
    string text =
        "# c1\n" ~
        "#  c2\n" ~
        "\n" ~
        "first\n" ~
        "\n   \n  \n" ~
        "  # s1\n" ~
        "   #   s2\n" ~
        "second\n";
    p_context_t * ctx = p_context_new(text);
    assert(p_parse(ctx) == P_SUCCESS);
    Start * tree = p_result(ctx);
 }