diff --git a/assets/parser.c.erb b/assets/parser.c.erb index dbdab6b..ded8ba7 100644 --- a/assets/parser.c.erb +++ b/assets/parser.c.erb @@ -422,6 +422,8 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context, * Input text does not match any lexer pattern. * @retval P_DROP * A drop pattern was matched so the lexer should continue. + * @retval P_USER_TERMINATED + * User code has requested to terminate the lexer. */ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info) { @@ -441,6 +443,12 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @ uint8_t const * match = &context->input[context->input_index]; <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context, match_info.accepting_state->code_id, match, match_info.length, &token_info); + /* A TERMINATE_TOKEN_ID return code from lexer_user_code() means + * that the user code is requesting to terminate the lexer. */ + if (user_code_token == TERMINATE_TOKEN_ID) + { + return P_USER_TERMINATED; + } /* An invalid token returned from lexer_user_code() means that the * user code did not explicitly return a token. So only override * the token to return if the user code does explicitly return a @@ -511,6 +519,8 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @ * The decoder encountered invalid text encoding. * @reval P_UNEXPECTED_INPUT * Input text does not match any lexer pattern. + * @retval P_USER_TERMINATED + * User code has requested to terminate the lexer. */ size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info) { diff --git a/assets/parser.d.erb b/assets/parser.d.erb index 374ce12..7135a82 100644 --- a/assets/parser.d.erb +++ b/assets/parser.d.erb @@ -31,7 +31,7 @@ public enum : size_t } /** Token type. */ -public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.invalid_token_id) %>; +public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.terminate_token_id) %>; /** Token IDs. */ public enum : <%= @grammar.prefix %>token_t @@ -43,6 +43,7 @@ public enum : <%= @grammar.prefix %>token_t <% end %> <% end %> INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>, + TERMINATE_TOKEN_ID = <%= @grammar.terminate_token_id %>, } /** Code point type. */ @@ -538,6 +539,8 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context, * Input text does not match any lexer pattern. * @retval P_DROP * A drop pattern was matched so the lexer should continue. + * @retval P_USER_TERMINATED + * User code has requested to terminate the lexer. */ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info) { @@ -557,6 +560,12 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= string match = context.input[context.input_index..(context.input_index + match_info.length)]; <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context, match_info.accepting_state.code_id, match, &token_info); + /* A TERMINATE_TOKEN_ID return code from lexer_user_code() means + * that the user code is requesting to terminate the lexer. */ + if (user_code_token == TERMINATE_TOKEN_ID) + { + return P_USER_TERMINATED; + } /* An invalid token returned from lexer_user_code() means that the * user code did not explicitly return a token. So only override * the token to return if the user code does explicitly return a @@ -627,6 +636,8 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= * The decoder encountered invalid text encoding. * @reval P_UNEXPECTED_INPUT * Input text does not match any lexer pattern. + * @retval P_USER_TERMINATED + * User code has requested to terminate the lexer. */ public size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info) { diff --git a/assets/parser.h.erb b/assets/parser.h.erb index fa2f66b..3b32836 100644 --- a/assets/parser.h.erb +++ b/assets/parser.h.erb @@ -23,7 +23,7 @@ #define <%= @grammar.prefix.upcase %>USER_TERMINATED 6u /** Token type. */ -typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>token_t; +typedef <%= get_type_for(@grammar.terminate_token_id) %> <%= @grammar.prefix %>token_t; /** Token IDs. */ <% @grammar.tokens.each_with_index do |token, index| %> @@ -33,6 +33,7 @@ typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>tok <% end %> <% end %> #define INVALID_TOKEN_ID <%= @grammar.invalid_token_id %>u +#define TERMINATE_TOKEN_ID <%= @grammar.terminate_token_id %>u /** Code point type. */ typedef uint32_t <%= @grammar.prefix %>code_point_t; diff --git a/doc/user_guide.md b/doc/user_guide.md index 685ca55..970090a 100644 --- a/doc/user_guide.md +++ b/doc/user_guide.md @@ -574,17 +574,18 @@ default. It can also be used when generating multiple lexers/parsers to be used in the same program to avoid symbol collisions. -##> User termination of the parser +##> User termination of the lexer or parser -Propane supports allowing parser user code blocks to terminate execution of the -parser. -One example use of this functionality is to detect and report an error before -the parser continues parsing the remainder of the input. -Another use of this features is to begin parsing input and determine whether a -different parser should be used instead. +Propane supports allowing lexer or parser user code blocks to terminate +execution of the parser. +Some example uses of this functionality could be to: -To terminate parsing from a parser user code block, use the `$terminate(code)` -function, passing an integer expression argument. + * Detect integer overflow when lexing an integer literal constant. + * Detect and report an error as soon as possible during parsing before continuing to parse any more of the input. + * Determine whether parsing should stop and instead be performed using a different parser version. + +To terminate parsing from a lexer or parser user code block, use the +`$terminate(code)` function, passing an integer expression argument. For example: ``` diff --git a/lib/propane/generator.rb b/lib/propane/generator.rb index 40d7d9a..548a79c 100644 --- a/lib/propane/generator.rb +++ b/lib/propane/generator.rb @@ -198,6 +198,16 @@ class Propane code = code.gsub(/\$token\(([$\w]+)\)/) do |match| "TOKEN_#{Token.code_name($1)}" end + code = code.gsub(/\$terminate\((.*)\);/) do |match| + user_terminate_code = $1 + retval = rule ? "P_USER_TERMINATED" : "TERMINATE_TOKEN_ID" + case @language + when "c" + "context->user_terminate_code = (#{user_terminate_code}); return #{retval};" + when "d" + "context.user_terminate_code = (#{user_terminate_code}); return #{retval};" + end + end if parser code = code.gsub(/\$\$/) do |match| case @language @@ -216,15 +226,6 @@ class Propane "statevalues[$-1-n_states+#{index}].pvalue.v_#{rule.components[index - 1].ptypename}" end end - code = code.gsub(/\$terminate\((.*)\);/) do |match| - user_terminate_code = $1 - case @language - when "c" - "context->user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;" - when "d" - "context.user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;" - end - end else code = code.gsub(/\$\$/) do |match| case @language diff --git a/lib/propane/grammar.rb b/lib/propane/grammar.rb index e6d2161..f517d2f 100644 --- a/lib/propane/grammar.rb +++ b/lib/propane/grammar.rb @@ -35,6 +35,10 @@ class Propane @tokens.size end + def terminate_token_id + @tokens.size + 1 + end + private def parse_grammar! diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb index 9fd19e8..bd5f298 100644 --- a/spec/propane_spec.rb +++ b/spec/propane_spec.rb @@ -730,6 +730,25 @@ EOF expect(results.status).to eq 0 end + it "allows the user to terminate the lexer" do + write_grammar <> +token c; +Start -> Any; +Any -> a; +Any -> b; +Any -> c; +EOF + build_parser(language: language) + compile("spec/test_user_terminate_lexer.#{language}", language: language) + results = run + expect(results.stderr).to eq "" + expect(results.status).to eq 0 + end + it "allows the user to terminate the parser" do write_grammar < +#include +#include + +int main() +{ + char const * input = "a"; + p_context_t context; + p_context_init(&context, (uint8_t const *)input, strlen(input)); + assert(p_parse(&context) == P_SUCCESS); + + input = "b"; + p_context_init(&context, (uint8_t const *)input, strlen(input)); + assert(p_parse(&context) == P_USER_TERMINATED); + assert(p_user_terminate_code(&context) == 8675309); + + return 0; +} diff --git a/spec/test_user_terminate_lexer.d b/spec/test_user_terminate_lexer.d new file mode 100644 index 0000000..5de5280 --- /dev/null +++ b/spec/test_user_terminate_lexer.d @@ -0,0 +1,20 @@ +import testparser; +import std.stdio; + +int main() +{ + return 0; +} + +unittest +{ + string input = "a"; + p_context_t context; + p_context_init(&context, input); + assert(p_parse(&context) == P_SUCCESS); + + input = "b"; + p_context_init(&context, input); + assert(p_parse(&context) == P_USER_TERMINATED); + assert(p_user_terminate_code(&context) == 8675309); +}