From d4ad67c23d57733017a7c3b51634eadc58a43b88 Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Sat, 21 Feb 2026 21:06:40 -0500 Subject: [PATCH] Allow user to specify custom token node fields --- assets/parser.c.erb | 21 ++++++-- assets/parser.d.erb | 3 +- doc/user_guide.md | 89 +++++++++++++++++++++++++++++++ extra/vim/syntax/propane.vim | 2 +- lib/propane/generator.rb | 16 +++++- lib/propane/grammar.rb | 25 ++++++--- spec/propane_spec.rb | 98 +++++++++++++++++++++++++++++++++++ spec/test_token_user_fields.c | 30 +++++++++++ spec/test_token_user_fields.d | 24 +++++++++ 9 files changed, 295 insertions(+), 13 deletions(-) create mode 100644 spec/test_token_user_fields.c create mode 100644 spec/test_token_user_fields.d diff --git a/assets/parser.c.erb b/assets/parser.c.erb index 87d7776..5da3ceb 100644 --- a/assets/parser.c.erb +++ b/assets/parser.c.erb @@ -75,6 +75,10 @@ const char * <%= @grammar.prefix %>token_names[] = { /** * Deinitialize and deallocate lexer/parser context structure. * + * For C++, destructors will be called for any context user fields. However, if + * pointers are used to store allocated resources, the user should free them + * before calling this function. + * * @param context * Lexer/parser context structure allocated with <%= @grammar.prefix %>context_new(). */ @@ -1014,13 +1018,18 @@ static size_t parse_from(<%= @grammar.prefix %>context_t * context, size_t start { /* We shifted a token, mark it consumed. 
*/ <% if @grammar.tree %> +<% if @cpp %> + <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = new <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>(); +<% else %> <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = (<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *)malloc(sizeof(<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>)); +<% end %> token_tree_node->position = token_info.position; token_tree_node->end_position = token_info.end_position; token_tree_node->n_fields = 0u; token_tree_node->is_token = 1u; token_tree_node->token = token; token_tree_node->pvalue = token_info.pvalue; +<%= expand_code(@grammar.on_token_node, false, nil, nil) %> state_values_stack_index(&statevalues, -1)->tree_node = token_tree_node; <% else %> state_values_stack_index(&statevalues, -1)->pvalue = token_info.pvalue; @@ -1210,10 +1219,16 @@ static void tree_delete(TreeNode * node) { if (node->is_token) { + <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = (<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *) node; <% if @grammar.free_token_node %> - <%= @grammar.free_token_node %>((<%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> *) node); + <%= @grammar.free_token_node %>(token_tree_node); +<% end %> +<%= expand_code(@grammar.free_token_user_fields, false, nil, nil) %> +<% if @cpp %> + delete token_tree_node; +<% else %> + free(token_tree_node); <% end %> - /* TODO: free value_t */ } else if (node->n_fields > 0u) { @@ -1224,8 +1239,8 @@ static void tree_delete(TreeNode * node) tree_delete(node->fields[i]); } } + free(node); } - free(node); } /** diff --git a/assets/parser.d.erb b/assets/parser.d.erb index e048cda..7dbfea5 100644 --- a/assets/parser.d.erb +++ b/assets/parser.d.erb @@ -103,9 +103,9 @@ public struct <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> /* TreeNode fields must be present in the same order here. 
*/ <%= @grammar.prefix %>position_t position; <%= @grammar.prefix %>position_t end_position; -<%= @grammar.token_user_fields %> <%= @grammar.prefix %>token_t token; <%= @grammar.prefix %>value_t pvalue; +<%= @grammar.token_user_fields %> } <% @parser.rule_sets.each do |name, rule_set| %> @@ -1055,6 +1055,7 @@ private size_t parse_from(<%= @grammar.prefix %>context_t * context, size_t star /* We shifted a token, mark it consumed. */ <% if @grammar.tree %> <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %> * token_tree_node = new <%= @grammar.tree_prefix %>Token<%= @grammar.tree_suffix %>(token_info.position, token_info.end_position, token, token_info.pvalue); +<%= expand_code(@grammar.on_token_node, false, nil, nil) %> statevalues[$-1].tree_node = token_tree_node; <% else %> statevalues[$-1].pvalue = token_info.pvalue; diff --git a/doc/user_guide.md b/doc/user_guide.md index 78ab874..c8b5de9 100644 --- a/doc/user_guide.md +++ b/doc/user_guide.md @@ -256,6 +256,95 @@ If a pointer to any allocated memory is stored in a user-defined context field, it is up to the user to free any memory when the program is finished using the context structure. +### Custom token fields code blocks: the `token_user_fields` statement + +When tree generation mode is active, Propane generates a tree node structure +and a token node structure for each matching rule and token instance in the +input string. +The user may add custom fields to token tree nodes using the `token_user_fields` +statement. +The code block supplied to the `token_user_fields` is inserted in the `struct` +generated by the parser to hold a token tree node. + +Example (D/C++): + +``` +token_user_fields << + string mytokenval; +>> +``` + +The `on_token_node` statement can be used to provide code that initializes +any token user fields when a token tree node instance is created. 
+ +For example (C++): + +``` +context_user_fields << +  std::string comments; +>> +token_user_fields << +  std::string comments; +>> +on_token_node << +  ${token.comments} = ${context.comments}; +  ${context.comments} = ""; +>> +drop /#(.*)\n/ << +  /* Accumulate comments before the next parser tree node. */ +  ${context.comments} += std::string((const char *)match, match_length); +>> +``` + +If a pointer to any allocated memory is stored in a user-defined token field, +the `free_token_user_fields` statement can be used to supply a code block which +will be executed immediately before the token node is freed. +For C++, the `delete` statement is used to free the token tree node, so the +destructor for any custom token user fields will be called. + +### Custom initialization of a token tree node - the `on_token_node` statement + +The `on_token_node` statement can be used to provide code that initializes +any token user fields when a token tree node instance is created. + +For example (C++): + +``` +context_user_fields << +  std::string comments; +>> +token_user_fields << +  std::string comments; +>> +on_token_node << +  ${token.comments} = ${context.comments}; +  ${context.comments} = ""; +>> +drop /#(.*)\n/ << +  /* Accumulate comments before the next parser tree node. */ +  ${context.comments} += std::string((const char *)match, match_length); +>> +``` + +### Freeing allocated memory in a custom token user field - the `free_token_user_fields` statement + +The `free_token_user_fields` statement allows the user to provide a code block +which will be executed immediately prior to freeing the token tree node. 
+ +For example (C): + +``` +token_user_fields << + char * comments; +>> +on_token_node << + ${token.comments} = (char *)malloc(some_len); +>> +free_token_user_fields << + free(${token.comments}); +>> +``` + ##> Tree generation mode - the `tree` statement To activate tree generation mode, place the `tree` statement in your grammar file: diff --git a/extra/vim/syntax/propane.vim b/extra/vim/syntax/propane.vim index 50ce5f1..64c4f54 100644 --- a/extra/vim/syntax/propane.vim +++ b/extra/vim/syntax/propane.vim @@ -20,7 +20,7 @@ syn match propaneOperator "->" syn match propaneFieldAlias ":[a-zA-Z0-9_]\+" contains=propaneFieldOperator syn match propaneFieldOperator ":" contained syn match propaneOperator "?" -syn keyword propaneKeyword drop free_token_node module prefix ptype start token tokenid tree tree_prefix tree_suffix +syn keyword propaneKeyword drop free_token_node free_token_user_fields module prefix ptype start token token_user_fields tokenid tree tree_prefix tree_suffix syn region propaneRegex start="/" end="/" skip="\v\\\\|\\/" diff --git a/lib/propane/generator.rb b/lib/propane/generator.rb index e91990d..087df31 100644 --- a/lib/propane/generator.rb +++ b/lib/propane/generator.rb @@ -13,8 +13,13 @@ class Propane @language = if output_file.end_with?(".d") "d" - else + elsif output_file.end_with?(".c") "c" + elsif output_file =~ %r{\.(cc|cpp|cxx)$} + @cpp = true + "c" + else + raise Error.new("Could not determine target language from output file name (#{output_file})") end @options = options process_grammar! 
@@ -274,6 +279,15 @@ class Propane "context.#{fieldname}" end end + code = code.gsub(/\$\{token\.(\w+)\}/) do |match| + fieldname = $1 + case @language + when "c" + "token_tree_node->#{fieldname}" + when "d" + "token_tree_node.#{fieldname}" + end + end if parser code = code.gsub(/\$\$/) do |match| case @language diff --git a/lib/propane/grammar.rb b/lib/propane/grammar.rb index 5f64449..dd318cb 100644 --- a/lib/propane/grammar.rb +++ b/lib/propane/grammar.rb @@ -18,8 +18,9 @@ class Propane attr_reader :code_blocks attr_reader :ptypes attr_reader :prefix - attr_reader :token_node + attr_reader :on_token_node attr_reader :token_user_fields + attr_reader :free_token_user_fields def initialize(input) @patterns = [] @@ -38,8 +39,9 @@ class Propane @tree_suffix = "" @free_token_node = nil @context_user_fields = nil - @token_node = nil + @on_token_node = "" @token_user_fields = nil + @free_token_user_fields = "" parse_grammar! @start_rules << "Start" if @start_rules.empty? end @@ -74,8 +76,9 @@ class Propane elsif parse_tree_suffix_statement! elsif parse_free_token_node_statement! elsif parse_module_statement! - elsif parse_token_node_statement! + elsif parse_on_token_node_statement! elsif parse_token_user_fields_statement! + elsif parse_free_token_user_fields_statement! elsif parse_ptype_statement! elsif parse_pattern_statement! elsif parse_start_statement! @@ -151,13 +154,12 @@ class Propane end end - def parse_token_node_statement! - if md = consume!(/token_node\b\s*/) + def parse_on_token_node_statement! + if md = consume!(/on_token_node\b\s*/) unless code = parse_code_block! raise Error.new("Line #{@line_number}: expected code block") end - @token_node ||= "" - @token_node += code + @on_token_node += code end end @@ -171,6 +173,15 @@ class Propane end end + def parse_free_token_user_fields_statement! + if md = consume!(/free_token_user_fields\b\s*/) + unless code = parse_code_block! 
+ raise Error.new("Line #{@line_number}: expected code block") + end + @free_token_user_fields += code + end + end + def parse_ptype_statement! if consume!(/ptype\s+/) name = "default" diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb index e0cb6df..409cf20 100644 --- a/spec/propane_spec.rb +++ b/spec/propane_spec.rb @@ -14,6 +14,7 @@ describe Propane do end def run_propane(options = {}) + options[:language] ||= "d" @statics[:build_test_id] ||= 0 @statics[:build_test_id] += 1 if ENV["dist_specs"] @@ -1604,6 +1605,103 @@ EOF expect(results.stderr).to include %r{acount: 11\n} expect(results.status).to eq 0 end + + it "allows custom token user fields" do + if language == "d" + write_grammar <> +token_user_fields << + string comments; +>> +on_token_node << + ${token.comments} = ${context.comments}; + ${context.comments} = ""; +>> +tree; +drop /\\s+/; +drop /#(.*)\\n/ << + ${context.comments} ~= match; +>> +token id /\\w+/; +Start -> IDs; +IDs -> ; +IDs -> id IDs; +EOF + elsif language == "c" + write_grammar < +#include +>> +context_user_fields << + char * comments; +>> +token_user_fields << + char * comments; +>> +free_token_user_fields << + free(${token.comments}); +>> +on_token_node << + ${token.comments} = ${context.comments}; + ${context.comments} = (char *)malloc(1); + ${context.comments}[0] = '\\0'; +>> +tree; +drop /\\s+/; +drop /#(.*)\\n/ << + size_t cur_len = 0u; + if (${context.comments} != NULL) + cur_len = strlen(${context.comments}); + char * commentsnew = (char *)malloc(cur_len + match_length + 1); + if (${context.comments} != NULL) + memcpy(commentsnew, ${context.comments}, cur_len); + memcpy(&commentsnew[cur_len], match, match_length); + commentsnew[cur_len + match_length] = '\\0'; + if (${context.comments} != NULL) + { + free(${context.comments}); + } + ${context.comments} = commentsnew; +>> +token id /\\w+/; +Start -> IDs; +IDs -> ; +IDs -> id IDs; +EOF + else # C++ + write_grammar < +>> +context_user_fields << + std::string comments; +>> 
+token_user_fields << + std::string comments; +>> +on_token_node << + ${token.comments} = ${context.comments}; + ${context.comments} = ""; +>> +tree; +drop /\\s+/; +drop /#(.*)\\n/ << + ${context.comments} += std::string((const char *)match, match_length); +>> +token id /\\w+/; +Start -> IDs; +IDs -> ; +IDs -> id IDs; +EOF + end + run_propane(language: language) + compile("spec/test_token_user_fields.#{language}", language: language) + results = run_test(language: language) + expect(results.status).to eq 0 + end end end end diff --git a/spec/test_token_user_fields.c b/spec/test_token_user_fields.c new file mode 100644 index 0000000..e27e490 --- /dev/null +++ b/spec/test_token_user_fields.c @@ -0,0 +1,30 @@ +#include "testparser.h" +#include +#include +#include +#include + +int main() +{ + char const * input = + "# c1\n" + "# c2\n" + "\n" + "first\n" + "\n \n \n" + " # s1\n" + " # s2\n" + "second\n"; + p_context_t * context; + context = p_context_new((uint8_t const *)input, strlen(input)); + assert(p_parse(context) == P_SUCCESS); + Start * start = p_result(context); + +#ifndef __cplusplus + free(context->comments); +#endif + p_context_delete(context); + p_tree_delete(start); + + return 0; +} diff --git a/spec/test_token_user_fields.d b/spec/test_token_user_fields.d new file mode 100644 index 0000000..e8b8d63 --- /dev/null +++ b/spec/test_token_user_fields.d @@ -0,0 +1,24 @@ +import testparser; +import std.stdio; + +int main() +{ + return 0; +} + +unittest +{ + string input = + "# c1\n" ~ + "# c2\n" ~ + "\n" ~ + "first\n" ~ + "\n \n \n" ~ + " # s1\n" ~ + " # s2\n" ~ + "second\n"; + p_context_t * context; + context = p_context_new(input); + assert(p_parse(context) == P_SUCCESS); + Start * start = p_result(context); +}