Allow user termination from lexer code blocks - close #15

This commit is contained in:
Josh Holtrop 2024-03-29 13:45:08 -04:00
parent d55c5e0080
commit fad7f4fb36
9 changed files with 106 additions and 20 deletions

View File

@ -422,6 +422,8 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
* Input text does not match any lexer pattern. * Input text does not match any lexer pattern.
* @retval P_DROP * @retval P_DROP
* A drop pattern was matched so the lexer should continue. * A drop pattern was matched so the lexer should continue.
* @retval P_USER_TERMINATED
* User code has requested to terminate the lexer.
*/ */
static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info) static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{ {
@ -441,6 +443,12 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
uint8_t const * match = &context->input[context->input_index]; uint8_t const * match = &context->input[context->input_index];
<%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context, <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
match_info.accepting_state->code_id, match, match_info.length, &token_info); match_info.accepting_state->code_id, match, match_info.length, &token_info);
/* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
* that the user code is requesting to terminate the lexer. */
if (user_code_token == TERMINATE_TOKEN_ID)
{
return P_USER_TERMINATED;
}
/* An invalid token returned from lexer_user_code() means that the /* An invalid token returned from lexer_user_code() means that the
* user code did not explicitly return a token. So only override * user code did not explicitly return a token. So only override
* the token to return if the user code does explicitly return a * the token to return if the user code does explicitly return a
@ -511,6 +519,8 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
* The decoder encountered invalid text encoding. * The decoder encountered invalid text encoding.
* @retval P_UNEXPECTED_INPUT * @retval P_UNEXPECTED_INPUT
* Input text does not match any lexer pattern. * Input text does not match any lexer pattern.
* @retval P_USER_TERMINATED
* User code has requested to terminate the lexer.
*/ */
size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info) size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{ {

View File

@ -31,7 +31,7 @@ public enum : size_t
} }
/** Token type. */ /** Token type. */
public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.invalid_token_id) %>; public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.terminate_token_id) %>;
/** Token IDs. */ /** Token IDs. */
public enum : <%= @grammar.prefix %>token_t public enum : <%= @grammar.prefix %>token_t
@ -43,6 +43,7 @@ public enum : <%= @grammar.prefix %>token_t
<% end %> <% end %>
<% end %> <% end %>
INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>, INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>,
TERMINATE_TOKEN_ID = <%= @grammar.terminate_token_id %>,
} }
/** Code point type. */ /** Code point type. */
@ -538,6 +539,8 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
* Input text does not match any lexer pattern. * Input text does not match any lexer pattern.
* @retval P_DROP * @retval P_DROP
* A drop pattern was matched so the lexer should continue. * A drop pattern was matched so the lexer should continue.
* @retval P_USER_TERMINATED
* User code has requested to terminate the lexer.
*/ */
private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info) private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{ {
@ -557,6 +560,12 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
string match = context.input[context.input_index..(context.input_index + match_info.length)]; string match = context.input[context.input_index..(context.input_index + match_info.length)];
<%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context, <%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
match_info.accepting_state.code_id, match, &token_info); match_info.accepting_state.code_id, match, &token_info);
/* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
* that the user code is requesting to terminate the lexer. */
if (user_code_token == TERMINATE_TOKEN_ID)
{
return P_USER_TERMINATED;
}
/* An invalid token returned from lexer_user_code() means that the /* An invalid token returned from lexer_user_code() means that the
* user code did not explicitly return a token. So only override * user code did not explicitly return a token. So only override
* the token to return if the user code does explicitly return a * the token to return if the user code does explicitly return a
@ -627,6 +636,8 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
* The decoder encountered invalid text encoding. * The decoder encountered invalid text encoding.
* @retval P_UNEXPECTED_INPUT * @retval P_UNEXPECTED_INPUT
* Input text does not match any lexer pattern. * Input text does not match any lexer pattern.
* @retval P_USER_TERMINATED
* User code has requested to terminate the lexer.
*/ */
public size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info) public size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{ {

View File

@ -23,7 +23,7 @@
#define <%= @grammar.prefix.upcase %>USER_TERMINATED 6u #define <%= @grammar.prefix.upcase %>USER_TERMINATED 6u
/** Token type. */ /** Token type. */
typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>token_t; typedef <%= get_type_for(@grammar.terminate_token_id) %> <%= @grammar.prefix %>token_t;
/** Token IDs. */ /** Token IDs. */
<% @grammar.tokens.each_with_index do |token, index| %> <% @grammar.tokens.each_with_index do |token, index| %>
@ -33,6 +33,7 @@ typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>tok
<% end %> <% end %>
<% end %> <% end %>
#define INVALID_TOKEN_ID <%= @grammar.invalid_token_id %>u #define INVALID_TOKEN_ID <%= @grammar.invalid_token_id %>u
#define TERMINATE_TOKEN_ID <%= @grammar.terminate_token_id %>u
/** Code point type. */ /** Code point type. */
typedef uint32_t <%= @grammar.prefix %>code_point_t; typedef uint32_t <%= @grammar.prefix %>code_point_t;

View File

@ -574,17 +574,18 @@ default.
It can also be used when generating multiple lexers/parsers to be used in the It can also be used when generating multiple lexers/parsers to be used in the
same program to avoid symbol collisions. same program to avoid symbol collisions.
##> User termination of the parser ##> User termination of the lexer or parser
Propane supports allowing parser user code blocks to terminate execution of the Propane supports allowing lexer or parser user code blocks to terminate
parser. execution of the parser.
One example use of this functionality is to detect and report an error before Some example uses of this functionality could be to:
the parser continues parsing the remainder of the input.
Another use of this features is to begin parsing input and determine whether a
different parser should be used instead.
To terminate parsing from a parser user code block, use the `$terminate(code)` * Detect integer overflow when lexing an integer literal constant.
function, passing an integer expression argument. * Detect and report an error as soon as possible during parsing before continuing to parse any more of the input.
* Determine whether parsing should stop and instead be performed using a different parser version.
To terminate parsing from a lexer or parser user code block, use the
`$terminate(code)` function, passing an integer expression argument.
For example: For example:
``` ```

View File

@ -198,6 +198,16 @@ class Propane
code = code.gsub(/\$token\(([$\w]+)\)/) do |match| code = code.gsub(/\$token\(([$\w]+)\)/) do |match|
"TOKEN_#{Token.code_name($1)}" "TOKEN_#{Token.code_name($1)}"
end end
code = code.gsub(/\$terminate\((.*)\);/) do |match|
user_terminate_code = $1
retval = rule ? "P_USER_TERMINATED" : "TERMINATE_TOKEN_ID"
case @language
when "c"
"context->user_terminate_code = (#{user_terminate_code}); return #{retval};"
when "d"
"context.user_terminate_code = (#{user_terminate_code}); return #{retval};"
end
end
if parser if parser
code = code.gsub(/\$\$/) do |match| code = code.gsub(/\$\$/) do |match|
case @language case @language
@ -216,15 +226,6 @@ class Propane
"statevalues[$-1-n_states+#{index}].pvalue.v_#{rule.components[index - 1].ptypename}" "statevalues[$-1-n_states+#{index}].pvalue.v_#{rule.components[index - 1].ptypename}"
end end
end end
code = code.gsub(/\$terminate\((.*)\);/) do |match|
user_terminate_code = $1
case @language
when "c"
"context->user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
when "d"
"context.user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
end
end
else else
code = code.gsub(/\$\$/) do |match| code = code.gsub(/\$\$/) do |match|
case @language case @language

View File

@ -35,6 +35,10 @@ class Propane
@tokens.size @tokens.size
end end
# Token ID value emitted into the generated code as TERMINATE_TOKEN_ID.
#
# Evaluates to one more than the number of entries in @tokens.
def terminate_token_id
  @tokens.size.succ
end
private private
def parse_grammar! def parse_grammar!

View File

@ -730,6 +730,25 @@ EOF
expect(results.status).to eq 0 expect(results.status).to eq 0
end end
it "allows the user to terminate the lexer" do
write_grammar <<EOF
token a;
token b <<
$terminate(8675309);
>>
token c;
Start -> Any;
Any -> a;
Any -> b;
Any -> c;
EOF
build_parser(language: language)
compile("spec/test_user_terminate_lexer.#{language}", language: language)
results = run
expect(results.stderr).to eq ""
expect(results.status).to eq 0
end
it "allows the user to terminate the parser" do it "allows the user to terminate the parser" do
write_grammar <<EOF write_grammar <<EOF
token a; token a;

View File

@ -0,0 +1,19 @@
#include "testparser.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
/**
 * Exercise termination requested from a lexer user code block.
 *
 * Parses "a" and expects P_SUCCESS, then parses "b" and expects
 * P_USER_TERMINATED with a user terminate code of 8675309.
 *
 * The parser calls are made outside of assert() so that they are still
 * executed and checked when the program is built with NDEBUG defined;
 * an assert() argument is not evaluated at all in that configuration.
 *
 * @return 0 when every check passes, 1 (with a message on stderr) otherwise.
 */
int main(void)
{
    p_context_t context;
    char const * input;

    /* Input "a": the parse is expected to run to completion. */
    input = "a";
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    if (p_parse(&context) != P_SUCCESS)
    {
        fprintf(stderr, "input \"a\": expected P_SUCCESS\n");
        return 1;
    }

    /* Input "b": the parse is expected to stop with P_USER_TERMINATED. */
    input = "b";
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    if (p_parse(&context) != P_USER_TERMINATED)
    {
        fprintf(stderr, "input \"b\": expected P_USER_TERMINATED\n");
        return 1;
    }

    /* The code passed to $terminate() must be retrievable afterwards. */
    if (p_user_terminate_code(&context) != 8675309)
    {
        fprintf(stderr, "input \"b\": expected user terminate code 8675309\n");
        return 1;
    }

    return 0;
}

View File

@ -0,0 +1,20 @@
import testparser;
import std.stdio;
/* Program entry point: does no work of its own and returns 0.
 * The checks for this test are in the unittest block in this file. */
int main()
{
return 0;
}
/* Checks lexer-initiated termination: input "a" must parse successfully,
 * while input "b" must stop the parse with P_USER_TERMINATED and leave
 * 8675309 as the user terminate code. */
unittest
{
    p_context_t ctx;
    string text;

    /* First input: expected to parse to completion. */
    text = "a";
    p_context_init(&ctx, text);
    auto first_result = p_parse(&ctx);
    assert(first_result == P_SUCCESS);

    /* Second input: expected to be terminated by user code. */
    text = "b";
    p_context_init(&ctx, text);
    auto second_result = p_parse(&ctx);
    assert(second_result == P_USER_TERMINATED);

    /* The terminate code must be readable from the context afterwards. */
    auto terminate_code = p_user_terminate_code(&ctx);
    assert(terminate_code == 8675309);
}