From d4ad67c23d57733017a7c3b51634eadc58a43b88 Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Sat, 21 Feb 2026 21:06:40 -0500 Subject: [PATCH] Allow user to specify custom token node fields --- assets/parser.c.erb | 21 ++++++-- assets/parser.d.erb | 3 +- doc/user_guide.md | 89 +++++++++++++++++++++++++++++++ extra/vim/syntax/propane.vim | 2 +- lib/propane/generator.rb | 16 +++++- lib/propane/grammar.rb | 25 ++++++--- spec/propane_spec.rb | 98 +++++++++++++++++++++++++++++++++++ spec/test_token_user_fields.c | 30 +++++++++++ spec/test_token_user_fields.d | 24 +++++++++ 9 files changed, 295 insertions(+), 13 deletions(-) create mode 100644 spec/test_token_user_fields.c create mode 100644 spec/test_token_user_fields.d diff --git a/assets/parser.c.erb b/assets/parser.c.erb index 87d7776..5da3ceb 100644 --- a/assets/parser.c.erb +++ b/assets/parser.c.erb @@ -75,6 +75,10 @@ const char * <%= @grammar.prefix %>token_names[] = { /** * Deinitialize and deallocate lexer/parser context structure. * + * For C++, destructors will be called for any context user fields. However, if + * pointers are used to store allocated resources, the user should free them + * before calling this function. + * * @param context * Lexer/parser context structure allocated with <%= @grammar.prefix %>context_new(). */ @@ -1014,13 +1018,18 @@ static size_t parse_from(<%= @grammar.prefix %>context_t * context, size_t start { /* We shifted a token, mark it consumed. 
*/ <% if @grammar.tree %> +<% if @cpp %> + <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = new <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>(); +<% else %> <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = (<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *)malloc(sizeof(<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>)); +<% end %> token_tree_node->position = token_info.position; token_tree_node->end_position = token_info.end_position; token_tree_node->n_fields = 0u; token_tree_node->is_token = 1u; token_tree_node->token = token; token_tree_node->pvalue = token_info.pvalue; +<%= expand_code(@grammar.on_token_node, false, nil, nil) %> state_values_stack_index(&statevalues, -1)->tree_node = token_tree_node; <% else %> state_values_stack_index(&statevalues, -1)->pvalue = token_info.pvalue; @@ -1210,10 +1219,16 @@ static void tree_delete(TreeNode * node) { if (node->is_token) { + <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = (<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *) node; <% if @grammar.free_token_node %> - <%= @grammar.free_token_node %>((<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *) node); + <%= @grammar.free_token_node %>(token_tree_node); +<% end %> +<%= expand_code(@grammar.free_token_user_fields, false, nil, nil) %> +<% if @cpp %> + delete token_tree_node; +<% else %> + free(token_tree_node); <% end %> - /* TODO: free value_t */ } else if (node->n_fields > 0u) { @@ -1224,8 +1239,8 @@ static void tree_delete(TreeNode * node) tree_delete(node->fields[i]); } } + free(node); } - free(node); } /** diff --git a/assets/parser.d.erb b/assets/parser.d.erb index e048cda..7dbfea5 100644 --- a/assets/parser.d.erb +++ b/assets/parser.d.erb @@ -103,9 +103,9 @@ public struct <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> /* TreeNode fields must be present in the same order here. 
*/ <%= @grammar.prefix %>position_t position; <%= @grammar.prefix %>position_t end_position; -<%= @grammar.token_user_fields %> <%= @grammar.prefix %>token_t token; <%= @grammar.prefix %>value_t pvalue; +<%= @grammar.token_user_fields %> } <% @parser.rule_sets.each do |name, rule_set| %> @@ -1055,6 +1055,7 @@ private size_t parse_from(<%= @grammar.prefix %>context_t * context, size_t star /* We shifted a token, mark it consumed. */ <% if @grammar.tree %> <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = new <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>(token_info.position, token_info.end_position, token, token_info.pvalue); +<%= expand_code(@grammar.on_token_node, false, nil, nil) %> statevalues[$-1].tree_node = token_tree_node; <% else %> statevalues[$-1].pvalue = token_info.pvalue; diff --git a/doc/user_guide.md b/doc/user_guide.md index 78ab874..c8b5de9 100644 --- a/doc/user_guide.md +++ b/doc/user_guide.md @@ -256,6 +256,95 @@ If a pointer to any allocated memory is stored in a user-defined context field, it is up to the user to free any memory when the program is finished using the context structure. +### Custom token fields code blocks: the `token_user_fields` statement + +When tree generation mode is active, Propane generates a tree node structure +and a token node structure for each matching rule and token instance in the +input string. +The user may add custom fields to token tree nodes using the `token_user_fields` +statement. +The code block supplied to the `token_user_fields` is inserted in the `struct` +generated by the parser to hold a token tree node. + +Example (D/C++): + +``` +token_user_fields << + string mytokenval; +>> +``` + +The `on_token_node` statement can be used to provide code that initializes +any token user fields when a token tree node instance is created. 
+ +For example (C++): + +``` +context_user_fields << +  std::string comments; +>> +token_user_fields << +  std::string comments; +>> +on_token_node << +  ${token.comments} = ${context.comments}; +  ${context.comments} = ""; +>> +drop /#(.*)\n/ << +  /* Accumulate comments before the next parser tree node. */ +  ${context.comments} += std::string((const char *)match, match_length); +>> +``` + +If a pointer to any allocated memory is stored in a user-defined token field, +the `free_token_user_fields` statement can be used to supply a code block which +will be executed immediately before the token node is freed. +For C++, the `delete` statement is used to free the token tree node, so the +destructor for any custom token user fields will be called. + +### Custom initialization of a token tree node - the `on_token_node` statement + +The `on_token_node` statement can be used to provide code that initializes +any token user fields when a token tree node instance is created. + +For example (C++): + +``` +context_user_fields << +  std::string comments; +>> +token_user_fields << +  std::string comments; +>> +on_token_node << +  ${token.comments} = ${context.comments}; +  ${context.comments} = ""; +>> +drop /#(.*)\n/ << +  /* Accumulate comments before the next parser tree node. */ +  ${context.comments} += std::string((const char *)match, match_length); +>> +``` + +### Freeing allocated memory in a custom token user field - the `free_token_user_fields` statement + +The `free_token_user_fields` statement allows the user to provide a code block +which will be executed immediately prior to freeing the token tree node. 
+ +For example (C): + +``` +token_user_fields << + char * comments; +>> +on_token_node << + ${token.comments} = (char *)malloc(some_len); +>> +free_token_user_fields << + free(${token.comments}); +>> +``` + ##> Tree generation mode - the `tree` statement To activate tree generation mode, place the `tree` statement in your grammar file: diff --git a/extra/vim/syntax/propane.vim b/extra/vim/syntax/propane.vim index 50ce5f1..64c4f54 100644 --- a/extra/vim/syntax/propane.vim +++ b/extra/vim/syntax/propane.vim @@ -20,7 +20,7 @@ syn match propaneOperator "->" syn match propaneFieldAlias ":[a-zA-Z0-9_]\+" contains=propaneFieldOperator syn match propaneFieldOperator ":" contained syn match propaneOperator "?" -syn keyword propaneKeyword drop free_token_node module prefix ptype start token tokenid tree tree_prefix tree_suffix +syn keyword propaneKeyword drop free_token_node free_token_user_fields module prefix ptype start token token_user_fields tokenid tree tree_prefix tree_suffix syn region propaneRegex start="/" end="/" skip="\v\\\\|\\/" diff --git a/lib/propane/generator.rb b/lib/propane/generator.rb index e91990d..087df31 100644 --- a/lib/propane/generator.rb +++ b/lib/propane/generator.rb @@ -13,8 +13,13 @@ class Propane @language = if output_file.end_with?(".d") "d" - else + elsif output_file.end_with?(".c") "c" + elsif output_file =~ %r{\.(cc|cpp|cxx)$} + @cpp = true + "c" + else + raise Error.new("Could not determine target language from output file name (#{output_file})") end @options = options process_grammar! 
@@ -274,6 +279,15 @@ class Propane "context.#{fieldname}" end end + code = code.gsub(/\$\{token\.(\w+)\}/) do |match| + fieldname = $1 + case @language + when "c" + "token_tree_node->#{fieldname}" + when "d" + "token_tree_node.#{fieldname}" + end + end if parser code = code.gsub(/\$\$/) do |match| case @language diff --git a/lib/propane/grammar.rb b/lib/propane/grammar.rb index 5f64449..dd318cb 100644 --- a/lib/propane/grammar.rb +++ b/lib/propane/grammar.rb @@ -18,8 +18,9 @@ class Propane attr_reader :code_blocks attr_reader :ptypes attr_reader :prefix - attr_reader :token_node + attr_reader :on_token_node attr_reader :token_user_fields + attr_reader :free_token_user_fields def initialize(input) @patterns = [] @@ -38,8 +39,9 @@ class Propane @tree_suffix = "" @free_token_node = nil @context_user_fields = nil - @token_node = nil + @on_token_node = "" @token_user_fields = nil + @free_token_user_fields = "" parse_grammar! @start_rules << "Start" if @start_rules.empty? end @@ -74,8 +76,9 @@ class Propane elsif parse_tree_suffix_statement! elsif parse_free_token_node_statement! elsif parse_module_statement! - elsif parse_token_node_statement! + elsif parse_on_token_node_statement! elsif parse_token_user_fields_statement! + elsif parse_free_token_user_fields_statement! elsif parse_ptype_statement! elsif parse_pattern_statement! elsif parse_start_statement! @@ -151,13 +154,12 @@ class Propane end end - def parse_token_node_statement! - if md = consume!(/token_node\b\s*/) + def parse_on_token_node_statement! + if md = consume!(/on_token_node\b\s*/) unless code = parse_code_block! raise Error.new("Line #{@line_number}: expected code block") end - @token_node ||= "" - @token_node += code + @on_token_node += code end end @@ -171,6 +173,15 @@ class Propane end end + def parse_free_token_user_fields_statement! + if md = consume!(/free_token_user_fields\b\s*/) + unless code = parse_code_block! 
+ raise Error.new("Line #{@line_number}: expected code block") + end + @free_token_user_fields += code + end + end + def parse_ptype_statement! if consume!(/ptype\s+/) name = "default" diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb index e0cb6df..409cf20 100644 --- a/spec/propane_spec.rb +++ b/spec/propane_spec.rb @@ -14,6 +14,7 @@ describe Propane do end def run_propane(options = {}) + options[:language] ||= "d" @statics[:build_test_id] ||= 0 @statics[:build_test_id] += 1 if ENV["dist_specs"] @@ -1604,6 +1605,103 @@ EOF expect(results.stderr).to include %r{acount: 11\n} expect(results.status).to eq 0 end + + it "allows custom token user fields" do + if language == "d" + write_grammar <> +token_user_fields << + string comments; +>> +on_token_node << + ${token.comments} = ${context.comments}; + ${context.comments} = ""; +>> +tree; +drop /\\s+/; +drop /#(.*)\\n/ << + ${context.comments} ~= match; +>> +token id /\\w+/; +Start -> IDs; +IDs -> ; +IDs -> id IDs; +EOF + elsif language == "c" + write_grammar < +#include +>> +context_user_fields << + char * comments; +>> +token_user_fields << + char * comments; +>> +free_token_user_fields << + free(${token.comments}); +>> +on_token_node << + ${token.comments} = ${context.comments}; + ${context.comments} = (char *)malloc(1); + ${context.comments}[0] = '\\0'; +>> +tree; +drop /\\s+/; +drop /#(.*)\\n/ << + size_t cur_len = 0u; + if (${context.comments} != NULL) + cur_len = strlen(${context.comments}); + char * commentsnew = (char *)malloc(cur_len + match_length + 1); + if (${context.comments} != NULL) + memcpy(commentsnew, ${context.comments}, cur_len); + memcpy(&commentsnew[cur_len], match, match_length); + commentsnew[cur_len + match_length] = '\\0'; + if (${context.comments} != NULL) + { + free(${context.comments}); + } + ${context.comments} = commentsnew; +>> +token id /\\w+/; +Start -> IDs; +IDs -> ; +IDs -> id IDs; +EOF + else # C++ + write_grammar < +>> +context_user_fields << + std::string comments; +>> 
+token_user_fields << + std::string comments; +>> +on_token_node << + ${token.comments} = ${context.comments}; + ${context.comments} = ""; +>> +tree; +drop /\\s+/; +drop /#(.*)\\n/ << + ${context.comments} += std::string((const char *)match, match_length); +>> +token id /\\w+/; +Start -> IDs; +IDs -> ; +IDs -> id IDs; +EOF + end + run_propane(language: language) + compile("spec/test_token_user_fields.#{language}", language: language) + results = run_test(language: language) + expect(results.status).to eq 0 + end end end end diff --git a/spec/test_token_user_fields.c b/spec/test_token_user_fields.c new file mode 100644 index 0000000..e27e490 --- /dev/null +++ b/spec/test_token_user_fields.c @@ -0,0 +1,30 @@ +#include "testparser.h" +#include +#include +#include +#include + +int main() +{ + char const * input = + "# c1\n" + "# c2\n" + "\n" + "first\n" + "\n \n \n" + " # s1\n" + " # s2\n" + "second\n"; + p_context_t * context; + context = p_context_new((uint8_t const *)input, strlen(input)); + assert(p_parse(context) == P_SUCCESS); + Start * start = p_result(context); + +#ifndef __cplusplus + free(context->comments); +#endif + p_context_delete(context); + p_tree_delete(start); + + return 0; +} diff --git a/spec/test_token_user_fields.d b/spec/test_token_user_fields.d new file mode 100644 index 0000000..e8b8d63 --- /dev/null +++ b/spec/test_token_user_fields.d @@ -0,0 +1,24 @@ +import testparser; +import std.stdio; + +int main() +{ + return 0; +} + +unittest +{ + string input = + "# c1\n" ~ + "# c2\n" ~ + "\n" ~ + "first\n" ~ + "\n \n \n" ~ + " # s1\n" ~ + " # s2\n" ~ + "second\n"; + p_context_t * context; + context = p_context_new(input); + assert(p_parse(context) == P_SUCCESS); + Start * start = p_result(context); +}