Allow user termination from lexer code blocks - close #15

2024-03-29 13:45:08 -04:00 · 2024-03-29 13:45:08 -04:00 · fad7f4fb36
commit fad7f4fb36
parent d55c5e0080
9 changed files with 106 additions and 20 deletions
--- a/assets/parser.c.erb
+++ b/assets/parser.c.erb
@ -422,6 +422,8 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
 *   Input text does not match any lexer pattern.
 * @retval P_DROP
 *   A drop pattern was matched so the lexer should continue.
 * @retval P_USER_TERMINATED
 *   User code has requested to terminate the lexer.
 */
 static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
 {
@ -441,6 +443,12 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
            uint8_t const * match = &context->input[context->input_index];
            <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
                match_info.accepting_state->code_id, match, match_info.length, &token_info);
            /* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
             * that the user code is requesting to terminate the lexer. */
            if (user_code_token == TERMINATE_TOKEN_ID)
            {
                return P_USER_TERMINATED;
            }
            /* An invalid token returned from lexer_user_code() means that the
             * user code did not explicitly return a token. So only override
             * the token to return if the user code does explicitly return a
@ -511,6 +519,8 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 * @retval P_USER_TERMINATED
 *   User code has requested to terminate the lexer.
 */
 size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
 {
--- a/assets/parser.d.erb
+++ b/assets/parser.d.erb
@ -31,7 +31,7 @@ public enum : size_t
 }
 /** Token type. */
-public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.invalid_token_id) %>;
+public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.terminate_token_id) %>;
 /** Token IDs. */
 public enum : <%= @grammar.prefix %>token_t
@ -43,6 +43,7 @@ public enum : <%= @grammar.prefix %>token_t
 <%   end %>
 <% end %>
    INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>,
    TERMINATE_TOKEN_ID = <%= @grammar.terminate_token_id %>,
 }
 /** Code point type. */
@ -538,6 +539,8 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
 *   Input text does not match any lexer pattern.
 * @retval P_DROP
 *   A drop pattern was matched so the lexer should continue.
 * @retval P_USER_TERMINATED
 *   User code has requested to terminate the lexer.
 */
 private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
 {
@ -557,6 +560,12 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
            string match = context.input[context.input_index..(context.input_index + match_info.length)];
            <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
                match_info.accepting_state.code_id, match, &token_info);
            /* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
             * that the user code is requesting to terminate the lexer. */
            if (user_code_token == TERMINATE_TOKEN_ID)
            {
                return P_USER_TERMINATED;
            }
            /* An invalid token returned from lexer_user_code() means that the
             * user code did not explicitly return a token. So only override
             * the token to return if the user code does explicitly return a
@ -627,6 +636,8 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
 *   The decoder encountered invalid text encoding.
 * @retval P_UNEXPECTED_INPUT
 *   Input text does not match any lexer pattern.
 * @retval P_USER_TERMINATED
 *   User code has requested to terminate the lexer.
 */
 public size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
 {
--- a/assets/parser.h.erb
+++ b/assets/parser.h.erb
@ -23,7 +23,7 @@
 #define <%= @grammar.prefix.upcase %>USER_TERMINATED 6u
 /** Token type. */
-typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>token_t;
+typedef <%= get_type_for(@grammar.terminate_token_id) %> <%= @grammar.prefix %>token_t;
 /** Token IDs. */
 <% @grammar.tokens.each_with_index do |token, index| %>
@ -33,6 +33,7 @@ typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>tok
 <%   end %>
 <% end %>
 #define INVALID_TOKEN_ID <%= @grammar.invalid_token_id %>u
 #define TERMINATE_TOKEN_ID <%= @grammar.terminate_token_id %>u
 /** Code point type. */
 typedef uint32_t <%= @grammar.prefix %>code_point_t;
--- a/doc/user_guide.md
+++ b/doc/user_guide.md
@ -574,17 +574,18 @@ default.
 It can also be used when generating multiple lexers/parsers to be used in the
 same program to avoid symbol collisions.
-##> User termination of the parser
+##> User termination of the lexer or parser
-Propane supports allowing parser user code blocks to terminate execution of the
+Propane supports allowing lexer or parser user code blocks to terminate
-parser.
+execution of the parser.
-One example use of this functionality is to detect and report an error before
+Some example uses of this functionality could be to:
 the parser continues parsing the remainder of the input.
 Another use of this feature is to begin parsing input and determine whether a
 different parser should be used instead.
-To terminate parsing from a parser user code block, use the `$terminate(code)`
+  * Detect integer overflow when lexing an integer literal constant.
-function, passing an integer expression argument.
+  * Detect and report an error as soon as possible during parsing before continuing to parse any more of the input.
  * Determine whether parsing should stop and instead be performed using a different parser version.
 To terminate parsing from a lexer or parser user code block, use the
 `$terminate(code)` function, passing an integer expression argument.
 For example:
 ```
--- a/lib/propane/generator.rb
+++ b/lib/propane/generator.rb
@ -198,6 +198,16 @@ class Propane
      code = code.gsub(/\$token\(([$\w]+)\)/) do |match|
        "TOKEN_#{Token.code_name($1)}"
      end
      code = code.gsub(/\$terminate\((.*)\);/) do |match|
        user_terminate_code = $1
        retval = rule ? "P_USER_TERMINATED" : "TERMINATE_TOKEN_ID"
        case @language
        when "c"
          "context->user_terminate_code = (#{user_terminate_code}); return #{retval};"
        when "d"
          "context.user_terminate_code = (#{user_terminate_code}); return #{retval};"
        end
      end
      if parser
        code = code.gsub(/\$\$/) do |match|
          case @language
@ -216,15 +226,6 @@ class Propane
            "statevalues[$-1-n_states+#{index}].pvalue.v_#{rule.components[index - 1].ptypename}"
          end
        end
        code = code.gsub(/\$terminate\((.*)\);/) do |match|
          user_terminate_code = $1
          case @language
          when "c"
            "context->user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
          when "d"
            "context.user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
          end
        end
      else
        code = code.gsub(/\$\$/) do |match|
          case @language
--- a/lib/propane/grammar.rb
+++ b/lib/propane/grammar.rb
@ -35,6 +35,10 @@ class Propane
      @tokens.size
    end
    # Token ID value that lexer user code returns to request termination.
    #
    # @return [Integer]
    #   The number of tokens plus one.
    def terminate_token_id
      1 + @tokens.size
    end
    private
    def parse_grammar!
--- a/spec/propane_spec.rb
+++ b/spec/propane_spec.rb
@ -730,6 +730,25 @@ EOF
        expect(results.status).to eq 0
      end
      # Verifies that a $terminate() call inside a lexer code block works.
      # The grammar below attaches a code block to token `b` that calls
      # $terminate(8675309); tokens `a` and `c` have no code block.
      it "allows the user to terminate the lexer" do
        write_grammar <<EOF
 token a;
 token b <<
  $terminate(8675309);
 >>
 token c;
 Start -> Any;
 Any -> a;
 Any -> b;
 Any -> c;
 EOF
        # Generate the parser, then build the language-specific test program
        # that drives it.
        build_parser(language: language)
        compile("spec/test_user_terminate_lexer.#{language}", language: language)
        # The test program performs its own checks; it must finish with
        # nothing written to stderr and a zero exit status.
        results = run
        expect(results.stderr).to eq ""
        expect(results.status).to eq 0
      end
      it "allows the user to terminate the parser" do
        write_grammar <<EOF
 token a;
--- a/spec/test_user_terminate_lexer.c
+++ b/spec/test_user_terminate_lexer.c
@ -0,0 +1,19 @@
 #include "testparser.h"
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
 /**
  * Exercise user termination from a lexer code block.
  *
  * Parses "a" and expects P_SUCCESS, then parses "b" and expects
  * P_USER_TERMINATED with a user terminate code of 8675309.
  *
  * The p_parse() calls are made outside of assert() so that the parser still
  * runs if assertions are compiled out with NDEBUG.
  *
  * @return 0 when every check passes; a failed assert aborts the program.
  */
 int main(void)
 {
    char const * input = "a";
    p_context_t context;
    size_t result; /* assumes p_parse() returns size_t like p_lex() - confirm */

    p_context_init(&context, (uint8_t const *)input, strlen(input));
    result = p_parse(&context);
    assert(result == P_SUCCESS);

    input = "b";
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    result = p_parse(&context);
    assert(result == P_USER_TERMINATED);
    assert(p_user_terminate_code(&context) == 8675309);

    /* Keep `result` referenced when NDEBUG removes the asserts above. */
    (void)result;
    return 0;
 }
--- a/spec/test_user_terminate_lexer.d
+++ b/spec/test_user_terminate_lexer.d
@ -0,0 +1,20 @@
 import testparser;
 import std.stdio;
 /**
  * Program entry point.
  *
  * Performs no work and returns 0; the checks for this test are written in
  * the unittest block of this file.
  */
 int main()
 {
    return 0;
 }
 /* Check that the parser reports success for "a" and user termination with
  * code 8675309 for "b". */
 unittest
 {
    p_context_t context;

    /* Re-initialize the shared context with the given text and parse it. */
    auto parse_text(string text)
    {
        p_context_init(&context, text);
        return p_parse(&context);
    }

    assert(parse_text("a") == P_SUCCESS);

    assert(parse_text("b") == P_USER_TERMINATED);
    assert(p_user_terminate_code(&context) == 8675309);
 }