Allow user termination from lexer code blocks - close #15

2024-03-29 13:45:08 -04:00 · 2024-03-29 13:45:08 -04:00 · fad7f4fb36
commit fad7f4fb36
parent d55c5e0080
9 changed files with 106 additions and 20 deletions
--- a/assets/parser.c.erb
+++ b/assets/parser.c.erb
@ -422,6 +422,8 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
 *   Input text does not match any lexer pattern.
 * @retval P_DROP
 *   A drop pattern was matched so the lexer should continue.
+ * @retval P_USER_TERMINATED
+ *   User code has requested to terminate the lexer.
 */
 static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
 {
@ -441,6 +443,12 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
            uint8_t const * match = &context->input[context->input_index];
            <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
                match_info.accepting_state->code_id, match, match_info.length, &token_info);
+            /* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
+             * that the user code is requesting to terminate the lexer. */
+            if (user_code_token == TERMINATE_TOKEN_ID)
+            {
+                return P_USER_TERMINATED;
+            }
            /* An invalid token returned from lexer_user_code() means that the
             * user code did not explicitly return a token. So only override
             * the token to return if the user code does explicitly return a
@ -511,6 +519,8 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
+ * @retval P_USER_TERMINATED
+ *   User code has requested to terminate the lexer.
 */
 size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
 {
--- a/assets/parser.d.erb
+++ b/assets/parser.d.erb
@ -31,7 +31,7 @@ public enum : size_t
 }

 /** Token type. */
-public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.invalid_token_id) %>;
+public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.terminate_token_id) %>;

 /** Token IDs. */
 public enum : <%= @grammar.prefix %>token_t
@ -43,6 +43,7 @@ public enum : <%= @grammar.prefix %>token_t
 <%   end %>
 <% end %>
    INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>,
+    TERMINATE_TOKEN_ID = <%= @grammar.terminate_token_id %>,
 }

 /** Code point type. */
@ -538,6 +539,8 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
 *   Input text does not match any lexer pattern.
 * @retval P_DROP
 *   A drop pattern was matched so the lexer should continue.
+ * @retval P_USER_TERMINATED
+ *   User code has requested to terminate the lexer.
 */
 private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
 {
@ -557,6 +560,12 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
            string match = context.input[context.input_index..(context.input_index + match_info.length)];
            <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
                match_info.accepting_state.code_id, match, &token_info);
+            /* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
+             * that the user code is requesting to terminate the lexer. */
+            if (user_code_token == TERMINATE_TOKEN_ID)
+            {
+                return P_USER_TERMINATED;
+            }
            /* An invalid token returned from lexer_user_code() means that the
             * user code did not explicitly return a token. So only override
             * the token to return if the user code does explicitly return a
@ -627,6 +636,8 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
+ * @retval P_USER_TERMINATED
+ *   User code has requested to terminate the lexer.
 */
 public size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
 {
--- a/assets/parser.h.erb
+++ b/assets/parser.h.erb
@ -23,7 +23,7 @@
 #define <%= @grammar.prefix.upcase %>USER_TERMINATED 6u

 /** Token type. */
-typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>token_t;
+typedef <%= get_type_for(@grammar.terminate_token_id) %> <%= @grammar.prefix %>token_t;

 /** Token IDs. */
 <% @grammar.tokens.each_with_index do |token, index| %>
@ -33,6 +33,7 @@ typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>tok
 <%   end %>
 <% end %>
 #define INVALID_TOKEN_ID <%= @grammar.invalid_token_id %>u
+#define TERMINATE_TOKEN_ID <%= @grammar.terminate_token_id %>u

 /** Code point type. */
 typedef uint32_t <%= @grammar.prefix %>code_point_t;
--- a/doc/user_guide.md
+++ b/doc/user_guide.md
@ -574,17 +574,18 @@ default.
 It can also be used when generating multiple lexers/parsers to be used in the
 same program to avoid symbol collisions.

-##> User termination of the parser
+##> User termination of the lexer or parser

-Propane supports allowing parser user code blocks to terminate execution of the
-parser.
-One example use of this functionality is to detect and report an error before
-the parser continues parsing the remainder of the input.
-Another use of this features is to begin parsing input and determine whether a
-different parser should be used instead.
+Propane supports allowing lexer or parser user code blocks to terminate
+execution of the parser.
+Some example uses of this functionality could be to:

-To terminate parsing from a parser user code block, use the `$terminate(code)`
-function, passing an integer expression argument.
+  * Detect integer overflow when lexing an integer literal constant.
+  * Detect and report an error as soon as possible during parsing before continuing to parse any more of the input.
+  * Determine whether parsing should stop and instead be performed using a different parser version.
+
+To terminate parsing from a lexer or parser user code block, use the
+`$terminate(code)` function, passing an integer expression argument.
 For example:

 ```
--- a/lib/propane/generator.rb
+++ b/lib/propane/generator.rb
@ -198,6 +198,16 @@ class Propane
      code = code.gsub(/\$token\(([$\w]+)\)/) do |match|
        "TOKEN_#{Token.code_name($1)}"
      end
+      code = code.gsub(/\$terminate\((.*)\);/) do |match|
+        user_terminate_code = $1
+        retval = rule ? "P_USER_TERMINATED" : "TERMINATE_TOKEN_ID"
+        case @language
+        when "c"
+          "context->user_terminate_code = (#{user_terminate_code}); return #{retval};"
+        when "d"
+          "context.user_terminate_code = (#{user_terminate_code}); return #{retval};"
+        end
+      end
      if parser
        code = code.gsub(/\$\$/) do |match|
          case @language
@ -216,15 +226,6 @@ class Propane
            "statevalues[$-1-n_states+#{index}].pvalue.v_#{rule.components[index - 1].ptypename}"
          end
        end
-        code = code.gsub(/\$terminate\((.*)\);/) do |match|
-          user_terminate_code = $1
-          case @language
-          when "c"
-            "context->user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
-          when "d"
-            "context.user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
-          end
-        end
      else
        code = code.gsub(/\$\$/) do |match|
          case @language
--- a/lib/propane/grammar.rb
+++ b/lib/propane/grammar.rb
@ -35,6 +35,10 @@ class Propane
      @tokens.size
    end

+    def terminate_token_id
+      @tokens.size + 1
+    end
+
    private

    def parse_grammar!
--- a/spec/propane_spec.rb
+++ b/spec/propane_spec.rb
@ -730,6 +730,25 @@ EOF
        expect(results.status).to eq 0
      end

+      it "allows the user to terminate the lexer" do
+        write_grammar <<EOF
+token a;
+token b <<
+  $terminate(8675309);
+>>
+token c;
+Start -> Any;
+Any -> a;
+Any -> b;
+Any -> c;
+EOF
+        build_parser(language: language)
+        compile("spec/test_user_terminate_lexer.#{language}", language: language)
+        results = run
+        expect(results.stderr).to eq ""
+        expect(results.status).to eq 0
+      end
+
      it "allows the user to terminate the parser" do
        write_grammar <<EOF
 token a;
--- a/spec/test_user_terminate_lexer.c
+++ b/spec/test_user_terminate_lexer.c
@ -0,0 +1,19 @@
+#include "testparser.h"
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+int main()
+{
+    char const * input = "a";
+    p_context_t context;
+    p_context_init(&context, (uint8_t const *)input, strlen(input));
+    assert(p_parse(&context) == P_SUCCESS);
+
+    input = "b";
+    p_context_init(&context, (uint8_t const *)input, strlen(input));
+    assert(p_parse(&context) == P_USER_TERMINATED);
+    assert(p_user_terminate_code(&context) == 8675309);
+
+    return 0;
+}
--- a/spec/test_user_terminate_lexer.d
+++ b/spec/test_user_terminate_lexer.d
@ -0,0 +1,20 @@
+import testparser;
+import std.stdio;
+
+int main()
+{
+    return 0;
+}
+
+unittest
+{
+    string input = "a";
+    p_context_t context;
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_SUCCESS);
+
+    input = "b";
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_USER_TERMINATED);
+    assert(p_user_terminate_code(&context) == 8675309);
+}