Allow rule terms to be marked as optional

This commit is contained in:
Josh Holtrop 2024-05-09 11:56:13 -04:00
parent 494afb7307
commit f3e4941ad8
14 changed files with 346 additions and 5 deletions

View File

@ -622,6 +622,13 @@ typedef struct
* Number of rule set AST node fields.
*/
uint16_t rule_set_node_field_array_size;
/**
* Whether this rule was a generated optional rule that matched the
* optional target. In this case, propagate the matched target node up
* instead of making a new node for this rule.
*/
bool propagate_optional_target;
<% end %>
} reduce_t;
@ -686,6 +693,7 @@ static const reduce_t parser_reduce_table[] = {
, &r_<%= reduce[:rule].name.gsub("$", "_") %><%= reduce[:rule].id %>_node_field_index_map[0]
<% end %>
, <%= reduce[:rule].rule_set.ast_fields.size %>
, <%= reduce[:propagate_optional_target] %>
<% end %>
},
<% end %>
@ -970,7 +978,11 @@ size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * context)
{
/* We have something to reduce. */
<% if @grammar.ast %>
if (parser_reduce_table[reduce_index].n_states > 0)
if (parser_reduce_table[reduce_index].propagate_optional_target)
{
reduced_parser_node = state_values_stack_index(&statevalues, -1)->ast_node;
}
else if (parser_reduce_table[reduce_index].n_states > 0)
{
void ** node_fields = calloc(parser_reduce_table[reduce_index].rule_set_node_field_array_size, sizeof(void *));
if (parser_reduce_table[reduce_index].rule_set_node_field_index_map == NULL)

View File

@ -72,6 +72,7 @@ public struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>
<% @parser.rule_sets.each do |name, rule_set| %>
<% next if name.start_with?("$") %>
<% next if rule_set.optional? %>
public struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %>
{
<% rule_set.ast_fields.each do |fields| %>
@ -763,6 +764,13 @@ private struct reduce_t
* Number of rule set AST node fields.
*/
ushort rule_set_node_field_array_size;
/**
* Whether this rule was a generated optional rule that matched the
* optional target. In this case, propagate the matched target node up
* instead of making a new node for this rule.
*/
bool propagate_optional_target;
<% end %>
}
@ -832,6 +840,7 @@ private immutable reduce_t[] parser_reduce_table = [
, &r_<%= reduce[:rule].name.gsub("$", "_") %><%= reduce[:rule].id %>_node_field_index_map[0]
<% end %>
, <%= reduce[:rule].rule_set.ast_fields.size %>
, <%= reduce[:propagate_optional_target] %>
<% end %>
),
<% end %>
@ -1015,7 +1024,11 @@ public size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * cont
{
/* We have something to reduce. */
<% if @grammar.ast %>
if (parser_reduce_table[reduce_index].n_states > 0)
if (parser_reduce_table[reduce_index].propagate_optional_target)
{
reduced_parser_node = statevalues[$ - 1].ast_node;
}
else if (parser_reduce_table[reduce_index].n_states > 0)
{
void *[] node_fields = new void *[parser_reduce_table[reduce_index].rule_set_node_field_array_size];
foreach (i; 0..parser_reduce_table[reduce_index].rule_set_node_field_array_size)

View File

@ -64,11 +64,13 @@ typedef struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>
<% @parser.rule_sets.each do |name, rule_set| %>
<% next if name.start_with?("$") %>
<% next if rule_set.optional? %>
struct <%= name %>;
<% end %>
<% @parser.rule_sets.each do |name, rule_set| %>
<% next if name.start_with?("$") %>
<% next if rule_set.optional? %>
typedef struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %>
{
<% rule_set.ast_fields.each do |fields| %>

View File

@ -631,6 +631,20 @@ This example uses the default start rule name of `Start`.
A parser rule has zero or more terms on the right side of its definition.
Each of these terms is either a token name or a rule name.
A term can be immediately followed by a `?` character to signify that it is
optional.
Another example:
```
token public;
token private;
token int;
token ident /[a-zA-Z_][a-zA-Z_0-9]*/;
token semicolon /;/;
IntegerDeclaration -> Visibility? int ident semicolon;
Visibility -> public;
Visibility -> private;
```
In a parser rule code block, parser values for the right side terms are
accessible as `$1` for the first term's parser value, `$2` for the second

View File

@ -71,6 +71,9 @@ class Propane
end
# Add "real" start rule.
@grammar.rules.unshift(Rule.new("$Start", [@grammar.start_rule, "$EOF"], nil, nil, nil))
# Generate and add rules for optional components.
generate_optional_component_rules!(tokens_by_name)
# Build rule sets.
rule_sets = {}
rule_set_id = @grammar.tokens.size
@grammar.rules.each_with_index do |rule, rule_id|
@ -128,6 +131,37 @@ class Propane
@parser = Parser.new(@grammar, rule_sets, @log)
end
# Generate and add rules for any optional components.
#
# For each rule component marked optional with a trailing `?`, add one
# pair of generated rules named after the optional component: an empty
# rule (the component was omitted) and a rule matching the bare
# component that passes its parse value through (`$$ = $1`).
#
# @param tokens_by_name [Hash]
#   Mapping of token name to Token, used to look up a token ptypename.
#
# @return [void]
def generate_optional_component_rules!(tokens_by_name)
  optional_rules_added = Set.new
  generated_rules = []
  @grammar.rules.each do |rule|
    rule.components.each do |component|
      # Optional components are written as "<name>?".
      next unless component =~ /^(.*)\?$/
      c = $1
      next if optional_rules_added.include?(component)
      # Create two rules for the optional component: one empty and
      # one just matching the component.
      # We need to find the ptypename for the optional component in
      # order to copy it to the generated rules.
      if tokens_by_name[c]
        # The optional component is a token.
        ptypename = tokens_by_name[c].ptypename
      else
        # The optional component must be a rule, so find the last
        # instance of that rule that specifies a ptypename.
        # (Inner variable named `r` to avoid shadowing `rule` above.)
        ptypename = @grammar.rules.reduce(nil) do |result, r|
          r.name == c && r.ptypename ? r.ptypename : result
        end
      end
      generated_rules << Rule.new(component, [], nil, ptypename, rule.line_number)
      generated_rules << Rule.new(component, [c], "$$ = $1;\n", ptypename, rule.line_number)
      optional_rules_added << component
    end
  end
  # Append after the walk so we never mutate the Array being iterated.
  # Generated rules contain no "?" components, so a second pass over
  # them could never have produced additional rules anyway.
  @grammar.rules.concat(generated_rules)
end
# Determine which grammar rules could expand to empty sequences.
#
# @param rule_sets [Hash]

View File

@ -198,7 +198,7 @@ class Propane
if @ast && ptypename
raise Error.new("Multiple ptypes are unsupported in AST mode")
end
md = consume!(/((?:#{IDENTIFIER_REGEX}\s*)*)\s*/, "expected rule component list")
md = consume!(/((?:#{IDENTIFIER_REGEX}\??\s*)*)\s*/, "expected rule component list")
components = md[1].strip.split(/\s+/)
if @ast
consume!(/;/, "expected `;'")

View File

@ -64,11 +64,13 @@ class Propane
case ra = item_set.reduce_actions
when Rule
[{token_id: @grammar.invalid_token_id, rule_id: ra.id, rule: ra,
rule_set_id: ra.rule_set.id, n_states: ra.components.size}]
rule_set_id: ra.rule_set.id, n_states: ra.components.size,
propagate_optional_target: ra.optional? && ra.components.size == 1}]
when Hash
ra.map do |token, rule|
{token_id: token.id, rule_id: rule.id, rule: rule,
rule_set_id: rule.rule_set.id, n_states: rule.components.size}
rule_set_id: rule.rule_set.id, n_states: rule.components.size,
propagate_optional_target: rule.optional? && rule.components.size == 1}
end
else
[]

View File

@ -66,6 +66,14 @@ class Propane
@components.empty?
end
# Return whether this is an optional Rule.
#
# Generated optional rules carry a trailing "?" in their name.
#
# @return [Boolean]
#   Whether this is an optional Rule.
def optional?
  @name.match?(/\?\z/)
end
# Represent the Rule as a String.
#
# @return [String]

View File

@ -56,6 +56,24 @@ class Propane
@could_be_empty
end
# Return whether this is an optional RuleSet.
#
# Generated optional rule sets carry a trailing "?" in their name.
#
# @return [Boolean]
#   Whether this is an optional RuleSet.
def optional?
  @name[-1] == "?"
end
# For optional rule sets, return the underlying component that is optional.
#
# An optional rule set has exactly one non-empty rule whose single
# component is the optional target; locate that rule and return its
# first component.
def option_target
  target_rule = @rules.find { |r| !r.components.empty? }
  raise "Optional rule target not found" unless target_rule
  target_rule.components.first
end
# Build the start token set for the RuleSet.
#
# @return [Set<Token>]
@ -102,6 +120,9 @@ class Propane
@ast_fields = []
@rules.each do |rule|
rule.components.each_with_index do |component, i|
if component.is_a?(RuleSet) && component.optional?
component = component.option_target
end
if component.is_a?(Token)
node_name = "Token"
else

View File

@ -910,6 +910,111 @@ EOF
run_propane(language: language)
compile("spec/test_start_rule_ast.#{language}", language: language)
end
# Exercise the `?` optional-rule-component marker in non-AST mode for
# both target languages, with the optional components absent and present.
it "allows marking a rule component as optional" do
  if language == "d"
    write_grammar <<EOF
<<
import std.stdio;
>>
ptype int;
ptype float = float;
ptype string = string;
token a (float) << $$ = 1.5; >>
token b << $$ = 2; >>
token c << $$ = 3; >>
token d << $$ = 4; >>
Start -> a? b R? <<
writeln("a: ", $1);
writeln("b: ", $2);
writeln("R: ", $3);
>>
R -> c d << $$ = "cd"; >>
R (string) -> d c << $$ = "dc"; >>
EOF
  else
    write_grammar <<EOF
<<
#include <stdio.h>
>>
ptype int;
ptype float = float;
ptype string = char *;
token a (float) << $$ = 1.5; >>
token b << $$ = 2; >>
token c << $$ = 3; >>
token d << $$ = 4; >>
Start -> a? b R? <<
printf("a: %.1f\\n", $1);
printf("b: %d\\n", $2);
printf("R: %s\\n", $3 == NULL ? "" : $3);
>>
R -> c d << $$ = "cd"; >>
R (string) -> d c << $$ = "dc"; >>
EOF
  end
  # Generate the parser, compile the matching test driver, and run it.
  run_propane(language: language)
  compile("spec/test_optional_rule_component.#{language}", language: language)
  results = run_test
  expect(results.stderr).to eq ""
  expect(results.status).to eq 0
  # Three parses are expected: "b" (both optionals absent, so $1/$3 hold
  # default values), then "abcd" and "abdc" exercising both R alternatives.
  # NOTE(review): "a: 0" vs "a: 0.0" reflects D/C float formatting of the
  # default value — confirm against the test drivers.
  verify_lines(results.stdout, [
    "a: 0#{language == "d" ? "" : ".0"}",
    "b: 2",
    "R: ",
    "a: 1.5",
    "b: 2",
    "R: cd",
    "a: 1.5",
    "b: 2",
    "R: dc",
  ])
end
# Exercise the `?` optional-rule-component marker in AST generation mode:
# the grammars are action-free and the per-language test drivers assert
# on the generated AST node fields.
it "allows marking a rule component as optional in AST generation mode" do
  if language == "d"
    write_grammar <<EOF
ast;
<<
import std.stdio;
>>
token a;
token b;
token c;
token d;
Start -> a? b R?;
R -> c d;
R -> d c;
EOF
  else
    write_grammar <<EOF
ast;
<<
#include <stdio.h>
>>
token a;
token b;
token c;
token d;
Start -> a? b R?;
R -> c d;
R -> d c;
EOF
  end
  # Generate, compile, and run; the driver's asserts are the real check,
  # so only the exit status and empty stderr are verified here.
  run_propane(language: language)
  compile("spec/test_optional_rule_component_ast.#{language}", language: language)
  results = run_test
  expect(results.stderr).to eq ""
  expect(results.status).to eq 0
end
end
end
end

View File

@ -0,0 +1,22 @@
#include "testparser.h"
#include <assert.h>
#include <string.h>
int main()
{
    /* Each input must parse successfully: "b" leaves both optional
     * components ("a?" and "R?") absent; "abcd" and "abdc" exercise
     * both alternatives of the optional rule R. */
    static char const * const inputs[] = {"b", "abcd", "abdc"};
    size_t i;
    for (i = 0u; i < sizeof(inputs) / sizeof(inputs[0]); i++)
    {
        p_context_t context;
        p_context_init(&context, (uint8_t const *)inputs[i], strlen(inputs[i]));
        assert(p_parse(&context) == P_SUCCESS);
    }
    return 0;
}

View File

@ -0,0 +1,23 @@
import testparser;
import std.stdio;
int main()
{
    return 0;
}

unittest
{
    // Each input must parse successfully: "b" leaves both optional
    // components ("a?" and "R?") absent; "abcd" and "abdc" exercise
    // both alternatives of the optional rule R.
    foreach (input; ["b", "abcd", "abdc"])
    {
        p_context_t context;
        p_context_init(&context, input);
        assert(p_parse(&context) == P_SUCCESS);
    }
}

View File

@ -0,0 +1,42 @@
#include "testparser.h"
#include <assert.h>
#include <string.h>
#include "testutils.h"
int main()
{
    /* "b": both optional components absent -> their AST fields are NULL. */
    char const * input = "b";
    p_context_t context;
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    assert(p_parse(&context) == P_SUCCESS);
    Start * root = p_result(&context);
    assert(root->pToken1 == NULL);
    assert(root->pToken2 != NULL);
    assert_eq(TOKEN_b, root->pToken2->token);
    assert(root->pR3 == NULL);
    assert(root->pR == NULL);

    /* "abcd": both optional components present; R matched its "c d"
     * alternative, and the positional field aliases the named one. */
    input = "abcd";
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    assert(p_parse(&context) == P_SUCCESS);
    root = p_result(&context);
    assert(root->pToken1 != NULL);
    assert_eq(TOKEN_a, root->pToken1->token);
    assert(root->pToken2 != NULL);
    assert(root->pR3 != NULL);
    assert(root->pR != NULL);
    assert(root->pR == root->pR3);
    assert_eq(TOKEN_c, root->pR->pToken1->token);

    /* "bdc": optional "a?" absent; R matched its "d c" alternative. */
    input = "bdc";
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    assert(p_parse(&context) == P_SUCCESS);
    root = p_result(&context);
    assert(root->pToken1 == NULL);
    assert(root->pToken2 != NULL);
    assert(root->pR != NULL);
    assert_eq(TOKEN_d, root->pR->pToken1->token);
    return 0;
}

View File

@ -0,0 +1,43 @@
import testparser;
import std.stdio;
import testutils;
int main()
{
    return 0;
}

unittest
{
    // "b": both optional components absent -> their AST fields are null.
    string input = "b";
    p_context_t context;
    p_context_init(&context, input);
    assert(p_parse(&context) == P_SUCCESS);
    Start * start = p_result(&context);
    assert(start.pToken1 is null);
    assert(start.pToken2 !is null);
    assert_eq(TOKEN_b, start.pToken2.token);
    assert(start.pR3 is null);
    assert(start.pR is null);

    // "abcd": both optional components present; R matched its "c d"
    // alternative, and the positional field aliases the named one.
    // Null checks normalized to idiomatic `is`/`!is` (the original block
    // mixed `!= null` and `!is null`); identical behavior for pointers.
    input = "abcd";
    p_context_init(&context, input);
    assert(p_parse(&context) == P_SUCCESS);
    start = p_result(&context);
    assert(start.pToken1 !is null);
    assert_eq(TOKEN_a, start.pToken1.token);
    assert(start.pToken2 !is null);
    assert(start.pR3 !is null);
    assert(start.pR !is null);
    assert(start.pR is start.pR3);
    assert_eq(TOKEN_c, start.pR.pToken1.token);

    // "bdc": optional "a?" absent; R matched its "d c" alternative.
    input = "bdc";
    p_context_init(&context, input);
    assert(p_parse(&context) == P_SUCCESS);
    start = p_result(&context);
    assert(start.pToken1 is null);
    assert(start.pToken2 !is null);
    assert(start.pR !is null);
    assert_eq(TOKEN_d, start.pR.pToken1.token);
}