Allow user termination from lexer code blocks - close #15

This commit is contained in:
Josh Holtrop 2024-03-29 13:45:08 -04:00
parent d55c5e0080
commit fad7f4fb36
9 changed files with 106 additions and 20 deletions

View File

@ -422,6 +422,8 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
* Input text does not match any lexer pattern.
* @retval P_DROP
* A drop pattern was matched so the lexer should continue.
* @retval P_USER_TERMINATED
* User code has requested to terminate the lexer.
*/
static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{
@ -441,6 +443,12 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
uint8_t const * match = &context->input[context->input_index];
<%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
match_info.accepting_state->code_id, match, match_info.length, &token_info);
/* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
* that the user code is requesting to terminate the lexer. */
if (user_code_token == TERMINATE_TOKEN_ID)
{
return P_USER_TERMINATED;
}
/* An invalid token returned from lexer_user_code() means that the
* user code did not explicitly return a token. So only override
* the token to return if the user code does explicitly return a
@ -511,6 +519,8 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
* The decoder encountered invalid text encoding.
* @retval P_UNEXPECTED_INPUT
* Input text does not match any lexer pattern.
* @retval P_USER_TERMINATED
* User code has requested to terminate the lexer.
*/
size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{

View File

@ -31,7 +31,7 @@ public enum : size_t
}
/** Token type. */
public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.invalid_token_id) %>;
public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.terminate_token_id) %>;
/** Token IDs. */
public enum : <%= @grammar.prefix %>token_t
@ -43,6 +43,7 @@ public enum : <%= @grammar.prefix %>token_t
<% end %>
<% end %>
INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>,
TERMINATE_TOKEN_ID = <%= @grammar.terminate_token_id %>,
}
/** Code point type. */
@ -538,6 +539,8 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
* Input text does not match any lexer pattern.
* @retval P_DROP
* A drop pattern was matched so the lexer should continue.
* @retval P_USER_TERMINATED
* User code has requested to terminate the lexer.
*/
private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{
@ -557,6 +560,12 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
string match = context.input[context.input_index..(context.input_index + match_info.length)];
<%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
match_info.accepting_state.code_id, match, &token_info);
/* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
* that the user code is requesting to terminate the lexer. */
if (user_code_token == TERMINATE_TOKEN_ID)
{
return P_USER_TERMINATED;
}
/* An invalid token returned from lexer_user_code() means that the
* user code did not explicitly return a token. So only override
* the token to return if the user code does explicitly return a
@ -627,6 +636,8 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
* The decoder encountered invalid text encoding.
* @retval P_UNEXPECTED_INPUT
* Input text does not match any lexer pattern.
* @retval P_USER_TERMINATED
* User code has requested to terminate the lexer.
*/
public size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
{

View File

@ -23,7 +23,7 @@
#define <%= @grammar.prefix.upcase %>USER_TERMINATED 6u
/** Token type. */
typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>token_t;
typedef <%= get_type_for(@grammar.terminate_token_id) %> <%= @grammar.prefix %>token_t;
/** Token IDs. */
<% @grammar.tokens.each_with_index do |token, index| %>
@ -33,6 +33,7 @@ typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>tok
<% end %>
<% end %>
#define INVALID_TOKEN_ID <%= @grammar.invalid_token_id %>u
#define TERMINATE_TOKEN_ID <%= @grammar.terminate_token_id %>u
/** Code point type. */
typedef uint32_t <%= @grammar.prefix %>code_point_t;

View File

@ -574,17 +574,18 @@ default.
It can also be used when generating multiple lexers/parsers to be used in the
same program to avoid symbol collisions.
##> User termination of the parser
##> User termination of the lexer or parser
Propane supports allowing parser user code blocks to terminate execution of the
parser.
One example use of this functionality is to detect and report an error before
the parser continues parsing the remainder of the input.
Another use of this feature is to begin parsing input and determine whether a
different parser should be used instead.
Propane supports allowing lexer or parser user code blocks to terminate
execution of the parser.
Some example uses of this functionality could be to:
To terminate parsing from a parser user code block, use the `$terminate(code)`
function, passing an integer expression argument.
* Detect integer overflow when lexing an integer literal constant.
* Detect and report an error as soon as possible during parsing before continuing to parse any more of the input.
* Determine whether parsing should stop and instead be performed using a different parser version.
To terminate parsing from a lexer or parser user code block, use the
`$terminate(code)` function, passing an integer expression argument.
For example:
```

View File

@ -198,6 +198,16 @@ class Propane
code = code.gsub(/\$token\(([$\w]+)\)/) do |match|
"TOKEN_#{Token.code_name($1)}"
end
code = code.gsub(/\$terminate\((.*)\);/) do |match|
user_terminate_code = $1
retval = rule ? "P_USER_TERMINATED" : "TERMINATE_TOKEN_ID"
case @language
when "c"
"context->user_terminate_code = (#{user_terminate_code}); return #{retval};"
when "d"
"context.user_terminate_code = (#{user_terminate_code}); return #{retval};"
end
end
if parser
code = code.gsub(/\$\$/) do |match|
case @language
@ -216,15 +226,6 @@ class Propane
"statevalues[$-1-n_states+#{index}].pvalue.v_#{rule.components[index - 1].ptypename}"
end
end
code = code.gsub(/\$terminate\((.*)\);/) do |match|
user_terminate_code = $1
case @language
when "c"
"context->user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
when "d"
"context.user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
end
end
else
code = code.gsub(/\$\$/) do |match|
case @language

View File

@ -35,6 +35,10 @@ class Propane
@tokens.size
end
# Token ID used to signal that lexer user code asked to terminate.
#
# The generated C and D sources emit this value as TERMINATE_TOKEN_ID and
# size the token type so that it can hold it.
#
# @return [Integer] One more than the number of tokens in the grammar.
def terminate_token_id
  token_count = @tokens.size
  token_count.succ
end
private
def parse_grammar!

View File

@ -730,6 +730,25 @@ EOF
expect(results.status).to eq 0
end
# Verifies that a lexer user code block can stop processing via $terminate().
# In the grammar below only token `b` has a code block, and that block calls
# $terminate(8675309); tokens `a` and `c` have no user code.
it "allows the user to terminate the lexer" do
write_grammar <<EOF
token a;
token b <<
$terminate(8675309);
>>
token c;
Start -> Any;
Any -> a;
Any -> b;
Any -> c;
EOF
# build_parser and compile are spec helpers defined elsewhere; presumably they
# generate the parser and build it with the per-language test driver
# (spec/test_user_terminate_lexer.c or .d).
build_parser(language: language)
compile("spec/test_user_terminate_lexer.#{language}", language: language)
# The driver must write nothing to stderr and exit with status 0.
results = run
expect(results.stderr).to eq ""
expect(results.status).to eq 0
end
it "allows the user to terminate the parser" do
write_grammar <<EOF
token a;

View File

@ -0,0 +1,19 @@
#include "testparser.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
/**
 * Test driver for user termination from a lexer code block.
 *
 * Parses the input "a" and expects P_SUCCESS, then parses the input "b" and
 * expects P_USER_TERMINATED with a user terminate code of 8675309.
 *
 * The parser calls are made outside of assert() so that they still execute
 * (and are still checked) when the program is built with NDEBUG defined.
 *
 * @return 0 when every check passes, 1 otherwise (a message is written to
 *         stderr describing the failed check).
 */
int main(void)
{
    char const * input = "a";
    p_context_t context;

    /* First input: parsing is expected to run to completion. */
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    if (p_parse(&context) != P_SUCCESS)
    {
        fprintf(stderr, "input \"a\": expected P_SUCCESS\n");
        return 1;
    }

    /* Second input: parsing is expected to be terminated by user code. */
    input = "b";
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    if (p_parse(&context) != P_USER_TERMINATED)
    {
        fprintf(stderr, "input \"b\": expected P_USER_TERMINATED\n");
        return 1;
    }

    /* The code passed to $terminate() must be retrievable afterwards. */
    if (p_user_terminate_code(&context) != 8675309)
    {
        fprintf(stderr, "input \"b\": expected user terminate code 8675309\n");
        return 1;
    }

    return 0;
}

View File

@ -0,0 +1,20 @@
import testparser;
import std.stdio;
/// Program entry point; performs no work itself and always returns 0.
/// NOTE(review): the actual checks are in this module's unittest block, which
/// presumably runs because the spec builds with unit tests enabled - confirm.
int main()
{
return 0;
}
/// Checks user termination from a lexer code block: input "a" must parse
/// successfully, while input "b" must stop with P_USER_TERMINATED and report
/// the user terminate code 8675309.
unittest
{
    p_context_t context;

    // Input "a" is expected to parse to completion.
    string plain_input = "a";
    p_context_init(&context, plain_input);
    auto plain_result = p_parse(&context);
    assert(plain_result == P_SUCCESS);

    // Input "b" is expected to be terminated by user code; the same context
    // is re-initialized for the second run.
    string terminating_input = "b";
    p_context_init(&context, terminating_input);
    auto terminated_result = p_parse(&context);
    assert(terminated_result == P_USER_TERMINATED);

    // The code given to $terminate() must be readable from the context.
    auto user_code = p_user_terminate_code(&context);
    assert(user_code == 8675309);
}