Allow user to specify custom token node fields
This commit is contained in:
parent
ff61dd05d9
commit
d4ad67c23d
@ -75,6 +75,10 @@ const char * <%= @grammar.prefix %>token_names[] = {
|
||||
/**
|
||||
* Deinitialize and deallocate lexer/parser context structure.
|
||||
*
|
||||
* For C++, destructors will be called for any context user fields. However, if
|
||||
* pointers are used to store allocated resources, the user should free them
|
||||
* before calling this function.
|
||||
*
|
||||
* @param context
|
||||
* Lexer/parser context structure allocated with <%= @grammar.prefix %>context_new().
|
||||
*/
|
||||
@ -1014,13 +1018,18 @@ static size_t parse_from(<%= @grammar.prefix %>context_t * context, size_t start
|
||||
{
|
||||
/* We shifted a token, mark it consumed. */
|
||||
<% if @grammar.tree %>
|
||||
<% if @cpp %>
|
||||
<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = new <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>();
|
||||
<% else %>
|
||||
<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = (<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *)malloc(sizeof(<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>));
|
||||
<% end %>
|
||||
token_tree_node->position = token_info.position;
|
||||
token_tree_node->end_position = token_info.end_position;
|
||||
token_tree_node->n_fields = 0u;
|
||||
token_tree_node->is_token = 1u;
|
||||
token_tree_node->token = token;
|
||||
token_tree_node->pvalue = token_info.pvalue;
|
||||
<%= expand_code(@grammar.on_token_node, false, nil, nil) %>
|
||||
state_values_stack_index(&statevalues, -1)->tree_node = token_tree_node;
|
||||
<% else %>
|
||||
state_values_stack_index(&statevalues, -1)->pvalue = token_info.pvalue;
|
||||
@ -1210,10 +1219,16 @@ static void tree_delete(TreeNode * node)
|
||||
{
|
||||
if (node->is_token)
|
||||
{
|
||||
<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = (<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *) node;
|
||||
<% if @grammar.free_token_node %>
|
||||
<%= @grammar.free_token_node %>((<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *) node);
|
||||
<%= @grammar.free_token_node %>(token_tree_node);
|
||||
<% end %>
|
||||
<%= expand_code(@grammar.free_token_user_fields, false, nil, nil) %>
|
||||
<% if @cpp %>
|
||||
delete token_tree_node;
|
||||
<% else %>
|
||||
free(token_tree_node);
|
||||
<% end %>
|
||||
/* TODO: free value_t */
|
||||
}
|
||||
else if (node->n_fields > 0u)
|
||||
{
|
||||
@ -1224,8 +1239,8 @@ static void tree_delete(TreeNode * node)
|
||||
tree_delete(node->fields[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
free(node);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -103,9 +103,9 @@ public struct <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>
|
||||
/* TreeNode fields must be present in the same order here. */
|
||||
<%= @grammar.prefix %>position_t position;
|
||||
<%= @grammar.prefix %>position_t end_position;
|
||||
<%= @grammar.token_user_fields %>
|
||||
<%= @grammar.prefix %>token_t token;
|
||||
<%= @grammar.prefix %>value_t pvalue;
|
||||
<%= @grammar.token_user_fields %>
|
||||
}
|
||||
|
||||
<% @parser.rule_sets.each do |name, rule_set| %>
|
||||
@ -1055,6 +1055,7 @@ private size_t parse_from(<%= @grammar.prefix %>context_t * context, size_t star
|
||||
/* We shifted a token, mark it consumed. */
|
||||
<% if @grammar.tree %>
|
||||
<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = new <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>(token_info.position, token_info.end_position, token, token_info.pvalue);
|
||||
<%= expand_code(@grammar.on_token_node, false, nil, nil) %>
|
||||
statevalues[$-1].tree_node = token_tree_node;
|
||||
<% else %>
|
||||
statevalues[$-1].pvalue = token_info.pvalue;
|
||||
|
||||
@ -256,6 +256,95 @@ If a pointer to any allocated memory is stored in a user-defined context field,
|
||||
it is up to the user to free any memory when the program is finished using the
|
||||
context structure.
|
||||
|
||||
### Custom token fields - the `token_user_fields` statement
|
||||
|
||||
When tree generation mode is active, Propane generates a tree node structure
|
||||
and a token node structure for each matching rule and token instance in the
|
||||
input string.
|
||||
The user may add custom fields to token tree nodes using the `token_user_fields`
|
||||
statement.
|
||||
The code block supplied to the `token_user_fields` statement is inserted in the `struct`
|
||||
generated by the parser to hold a token tree node.
|
||||
|
||||
Example (D/C++):
|
||||
|
||||
```
|
||||
token_user_fields <<
|
||||
string mytokenval;
|
||||
>>
|
||||
```
|
||||
|
||||
The `on_token_node` statement can be used to provide code that initializes
|
||||
any token user fields when a token tree node instance is created.
|
||||
|
||||
For example (C++):
|
||||
|
||||
```
|
||||
context_user_fields <<
|
||||
std::string comments;
|
||||
>>
|
||||
token_user_fields <<
|
||||
std::string comments;
|
||||
>>
|
||||
on_token_node <<
|
||||
${token.comments} = ${context.comments};
|
||||
${context.comments} = "";
|
||||
>>
|
||||
drop /#(.*)\n/ <<
|
||||
/* Accumulate comments before the next parser tree node. */
|
||||
${context.comments} += std::string((const char *)match, match_length);
|
||||
>>
|
||||
```
|
||||
|
||||
If a pointer to any allocated memory is stored in a user-defined token field,
|
||||
the `free_token_user_fields` statement can be used to supply a code block which
|
||||
will be executed immediately before the token node is freed.
|
||||
For C++, the `delete` statement is used to free the token tree node, so the
|
||||
destructor for any custom token user fields will be called.
|
||||
|
||||
### Custom initialization of a token tree node - the `on_token_node` statement
|
||||
|
||||
The `on_token_node` statement can be used to provide code that initializes
|
||||
any token user fields when a token tree node instance is created.
|
||||
|
||||
For example (C++):
|
||||
|
||||
```
|
||||
context_user_fields <<
|
||||
std::string comments;
|
||||
>>
|
||||
token_user_fields <<
|
||||
std::string comments;
|
||||
>>
|
||||
on_token_node <<
|
||||
${token.comments} = ${context.comments};
|
||||
${context.comments} = "";
|
||||
>>
|
||||
drop /#(.*)\n/ <<
|
||||
/* Accumulate comments before the next parser tree node. */
|
||||
${context.comments} += std::string((const char *)match, match_length);
|
||||
>>
|
||||
```
|
||||
|
||||
### Freeing allocated memory in a custom token user field - the `free_token_user_fields` statement
|
||||
|
||||
The `free_token_user_fields` statement allows the user to provide a code block
|
||||
which will be executed immediately prior to freeing the token tree node.
|
||||
|
||||
For example (C):
|
||||
|
||||
```
|
||||
token_user_fields <<
|
||||
char * comments;
|
||||
>>
|
||||
on_token_node <<
|
||||
${token.comments} = (char *)malloc(some_len);
|
||||
>>
|
||||
free_token_user_fields <<
|
||||
free(${token.comments});
|
||||
>>
|
||||
```
|
||||
|
||||
##> Tree generation mode - the `tree` statement
|
||||
|
||||
To activate tree generation mode, place the `tree` statement in your grammar file:
|
||||
|
||||
@ -20,7 +20,7 @@ syn match propaneOperator "->"
|
||||
syn match propaneFieldAlias ":[a-zA-Z0-9_]\+" contains=propaneFieldOperator
|
||||
syn match propaneFieldOperator ":" contained
|
||||
syn match propaneOperator "?"
|
||||
syn keyword propaneKeyword drop free_token_node module prefix ptype start token tokenid tree tree_prefix tree_suffix
|
||||
syn keyword propaneKeyword drop free_token_node free_token_user_fields module prefix ptype start token token_user_fields tokenid tree tree_prefix tree_suffix
|
||||
|
||||
syn region propaneRegex start="/" end="/" skip="\v\\\\|\\/"
|
||||
|
||||
|
||||
@ -13,8 +13,13 @@ class Propane
|
||||
@language =
|
||||
if output_file.end_with?(".d")
|
||||
"d"
|
||||
else
|
||||
elsif output_file.end_with?(".c")
|
||||
"c"
|
||||
elsif output_file =~ %r{\.(cc|cpp|cxx)$}
|
||||
@cpp = true
|
||||
"c"
|
||||
else
|
||||
raise Error.new("Could not determine target language from output file name (#{output_file})")
|
||||
end
|
||||
@options = options
|
||||
process_grammar!
|
||||
@ -274,6 +279,15 @@ class Propane
|
||||
"context.#{fieldname}"
|
||||
end
|
||||
end
|
||||
code = code.gsub(/\$\{token\.(\w+)\}/) do |match|
|
||||
fieldname = $1
|
||||
case @language
|
||||
when "c"
|
||||
"token_tree_node->#{fieldname}"
|
||||
when "d"
|
||||
"token_tree_node.#{fieldname}"
|
||||
end
|
||||
end
|
||||
if parser
|
||||
code = code.gsub(/\$\$/) do |match|
|
||||
case @language
|
||||
|
||||
@ -18,8 +18,9 @@ class Propane
|
||||
attr_reader :code_blocks
|
||||
attr_reader :ptypes
|
||||
attr_reader :prefix
|
||||
attr_reader :token_node
|
||||
attr_reader :on_token_node
|
||||
attr_reader :token_user_fields
|
||||
attr_reader :free_token_user_fields
|
||||
|
||||
def initialize(input)
|
||||
@patterns = []
|
||||
@ -38,8 +39,9 @@ class Propane
|
||||
@tree_suffix = ""
|
||||
@free_token_node = nil
|
||||
@context_user_fields = nil
|
||||
@token_node = nil
|
||||
@on_token_node = ""
|
||||
@token_user_fields = nil
|
||||
@free_token_user_fields = ""
|
||||
parse_grammar!
|
||||
@start_rules << "Start" if @start_rules.empty?
|
||||
end
|
||||
@ -74,8 +76,9 @@ class Propane
|
||||
elsif parse_tree_suffix_statement!
|
||||
elsif parse_free_token_node_statement!
|
||||
elsif parse_module_statement!
|
||||
elsif parse_token_node_statement!
|
||||
elsif parse_on_token_node_statement!
|
||||
elsif parse_token_user_fields_statement!
|
||||
elsif parse_free_token_user_fields_statement!
|
||||
elsif parse_ptype_statement!
|
||||
elsif parse_pattern_statement!
|
||||
elsif parse_start_statement!
|
||||
@ -151,13 +154,12 @@ class Propane
|
||||
end
|
||||
end
|
||||
|
||||
def parse_token_node_statement!
|
||||
if md = consume!(/token_node\b\s*/)
|
||||
def parse_on_token_node_statement!
|
||||
if md = consume!(/on_token_node\b\s*/)
|
||||
unless code = parse_code_block!
|
||||
raise Error.new("Line #{@line_number}: expected code block")
|
||||
end
|
||||
@token_node ||= ""
|
||||
@token_node += code
|
||||
@on_token_node += code
|
||||
end
|
||||
end
|
||||
|
||||
@ -171,6 +173,15 @@ class Propane
|
||||
end
|
||||
end
|
||||
|
||||
def parse_free_token_user_fields_statement!
|
||||
if md = consume!(/free_token_user_fields\b\s*/)
|
||||
unless code = parse_code_block!
|
||||
raise Error.new("Line #{@line_number}: expected code block")
|
||||
end
|
||||
@free_token_user_fields += code
|
||||
end
|
||||
end
|
||||
|
||||
def parse_ptype_statement!
|
||||
if consume!(/ptype\s+/)
|
||||
name = "default"
|
||||
|
||||
@ -14,6 +14,7 @@ describe Propane do
|
||||
end
|
||||
|
||||
def run_propane(options = {})
|
||||
options[:language] ||= "d"
|
||||
@statics[:build_test_id] ||= 0
|
||||
@statics[:build_test_id] += 1
|
||||
if ENV["dist_specs"]
|
||||
@ -1604,6 +1605,103 @@ EOF
|
||||
expect(results.stderr).to include %r{acount: 11\n}
|
||||
expect(results.status).to eq 0
|
||||
end
|
||||
|
||||
it "allows custom token user fields" do
|
||||
if language == "d"
|
||||
write_grammar <<EOF
|
||||
context_user_fields <<
|
||||
string comments;
|
||||
>>
|
||||
token_user_fields <<
|
||||
string comments;
|
||||
>>
|
||||
on_token_node <<
|
||||
${token.comments} = ${context.comments};
|
||||
${context.comments} = "";
|
||||
>>
|
||||
tree;
|
||||
drop /\\s+/;
|
||||
drop /#(.*)\\n/ <<
|
||||
${context.comments} ~= match;
|
||||
>>
|
||||
token id /\\w+/;
|
||||
Start -> IDs;
|
||||
IDs -> ;
|
||||
IDs -> id IDs;
|
||||
EOF
|
||||
elsif language == "c"
|
||||
write_grammar <<EOF
|
||||
<<
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
>>
|
||||
context_user_fields <<
|
||||
char * comments;
|
||||
>>
|
||||
token_user_fields <<
|
||||
char * comments;
|
||||
>>
|
||||
free_token_user_fields <<
|
||||
free(${token.comments});
|
||||
>>
|
||||
on_token_node <<
|
||||
${token.comments} = ${context.comments};
|
||||
${context.comments} = (char *)malloc(1);
|
||||
${context.comments}[0] = '\\0';
|
||||
>>
|
||||
tree;
|
||||
drop /\\s+/;
|
||||
drop /#(.*)\\n/ <<
|
||||
size_t cur_len = 0u;
|
||||
if (${context.comments} != NULL)
|
||||
cur_len = strlen(${context.comments});
|
||||
char * commentsnew = (char *)malloc(cur_len + match_length + 1);
|
||||
if (${context.comments} != NULL)
|
||||
memcpy(commentsnew, ${context.comments}, cur_len);
|
||||
memcpy(&commentsnew[cur_len], match, match_length);
|
||||
commentsnew[cur_len + match_length] = '\\0';
|
||||
if (${context.comments} != NULL)
|
||||
{
|
||||
free(${context.comments});
|
||||
}
|
||||
${context.comments} = commentsnew;
|
||||
>>
|
||||
token id /\\w+/;
|
||||
Start -> IDs;
|
||||
IDs -> ;
|
||||
IDs -> id IDs;
|
||||
EOF
|
||||
else # C++
|
||||
write_grammar <<EOF
|
||||
<<header
|
||||
#include <string>
|
||||
>>
|
||||
context_user_fields <<
|
||||
std::string comments;
|
||||
>>
|
||||
token_user_fields <<
|
||||
std::string comments;
|
||||
>>
|
||||
on_token_node <<
|
||||
${token.comments} = ${context.comments};
|
||||
${context.comments} = "";
|
||||
>>
|
||||
tree;
|
||||
drop /\\s+/;
|
||||
drop /#(.*)\\n/ <<
|
||||
${context.comments} += std::string((const char *)match, match_length);
|
||||
>>
|
||||
token id /\\w+/;
|
||||
Start -> IDs;
|
||||
IDs -> ;
|
||||
IDs -> id IDs;
|
||||
EOF
|
||||
end
|
||||
run_propane(language: language)
|
||||
compile("spec/test_token_user_fields.#{language}", language: language)
|
||||
results = run_test(language: language)
|
||||
expect(results.status).to eq 0
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
30
spec/test_token_user_fields.c
Normal file
30
spec/test_token_user_fields.c
Normal file
@ -0,0 +1,30 @@
|
||||
#include "testparser.h"
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
char const * input =
|
||||
"# c1\n"
|
||||
"# c2\n"
|
||||
"\n"
|
||||
"first\n"
|
||||
"\n \n \n"
|
||||
" # s1\n"
|
||||
" # s2\n"
|
||||
"second\n";
|
||||
p_context_t * context;
|
||||
context = p_context_new((uint8_t const *)input, strlen(input));
|
||||
assert(p_parse(context) == P_SUCCESS);
|
||||
Start * start = p_result(context);
|
||||
|
||||
#ifndef __cplusplus
|
||||
free(context->comments);
|
||||
#endif
|
||||
p_context_delete(context);
|
||||
p_tree_delete(start);
|
||||
|
||||
return 0;
|
||||
}
|
||||
24
spec/test_token_user_fields.d
Normal file
24
spec/test_token_user_fields.d
Normal file
@ -0,0 +1,24 @@
|
||||
import testparser;
|
||||
import std.stdio;
|
||||
|
||||
int main()
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
unittest
|
||||
{
|
||||
string input =
|
||||
"# c1\n" ~
|
||||
"# c2\n" ~
|
||||
"\n" ~
|
||||
"first\n" ~
|
||||
"\n \n \n" ~
|
||||
" # s1\n" ~
|
||||
" # s2\n" ~
|
||||
"second\n";
|
||||
p_context_t * context;
|
||||
context = p_context_new(input);
|
||||
assert(p_parse(context) == P_SUCCESS);
|
||||
Start * start = p_result(context);
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user