Allow user termination from lexer code blocks - close #15
This commit is contained in:
parent
d55c5e0080
commit
fad7f4fb36
@ -422,6 +422,8 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
|
||||
* Input text does not match any lexer pattern.
|
||||
* @retval P_DROP
|
||||
* A drop pattern was matched so the lexer should continue.
|
||||
* @retval P_USER_TERMINATED
|
||||
* User code has requested to terminate the lexer.
|
||||
*/
|
||||
static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
|
||||
{
|
||||
@ -441,6 +443,12 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
|
||||
uint8_t const * match = &context->input[context->input_index];
|
||||
<%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
|
||||
match_info.accepting_state->code_id, match, match_info.length, &token_info);
|
||||
/* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
|
||||
* that the user code is requesting to terminate the lexer. */
|
||||
if (user_code_token == TERMINATE_TOKEN_ID)
|
||||
{
|
||||
return P_USER_TERMINATED;
|
||||
}
|
||||
/* An invalid token returned from lexer_user_code() means that the
|
||||
* user code did not explicitly return a token. So only override
|
||||
* the token to return if the user code does explicitly return a
|
||||
@ -511,6 +519,8 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
|
||||
* The decoder encountered invalid text encoding.
|
||||
* @retval P_UNEXPECTED_INPUT
|
||||
* Input text does not match any lexer pattern.
|
||||
* @retval P_USER_TERMINATED
|
||||
* User code has requested to terminate the lexer.
|
||||
*/
|
||||
size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
|
||||
{
|
||||
|
@ -31,7 +31,7 @@ public enum : size_t
|
||||
}
|
||||
|
||||
/** Token type. */
|
||||
public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.invalid_token_id) %>;
|
||||
public alias <%= @grammar.prefix %>token_t = <%= get_type_for(@grammar.terminate_token_id) %>;
|
||||
|
||||
/** Token IDs. */
|
||||
public enum : <%= @grammar.prefix %>token_t
|
||||
@ -43,6 +43,7 @@ public enum : <%= @grammar.prefix %>token_t
|
||||
<% end %>
|
||||
<% end %>
|
||||
INVALID_TOKEN_ID = <%= @grammar.invalid_token_id %>,
|
||||
TERMINATE_TOKEN_ID = <%= @grammar.terminate_token_id %>,
|
||||
}
|
||||
|
||||
/** Code point type. */
|
||||
@ -538,6 +539,8 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
|
||||
* Input text does not match any lexer pattern.
|
||||
* @retval P_DROP
|
||||
* A drop pattern was matched so the lexer should continue.
|
||||
* @retval P_USER_TERMINATED
|
||||
* User code has requested to terminate the lexer.
|
||||
*/
|
||||
private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
|
||||
{
|
||||
@ -557,6 +560,12 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
|
||||
string match = context.input[context.input_index..(context.input_index + match_info.length)];
|
||||
<%= @grammar.prefix %>token_t user_code_token = lexer_user_code(context,
|
||||
match_info.accepting_state.code_id, match, &token_info);
|
||||
/* A TERMINATE_TOKEN_ID return code from lexer_user_code() means
|
||||
* that the user code is requesting to terminate the lexer. */
|
||||
if (user_code_token == TERMINATE_TOKEN_ID)
|
||||
{
|
||||
return P_USER_TERMINATED;
|
||||
}
|
||||
/* An invalid token returned from lexer_user_code() means that the
|
||||
* user code did not explicitly return a token. So only override
|
||||
* the token to return if the user code does explicitly return a
|
||||
@ -627,6 +636,8 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
|
||||
* The decoder encountered invalid text encoding.
|
||||
* @retval P_UNEXPECTED_INPUT
|
||||
* Input text does not match any lexer pattern.
|
||||
* @retval P_USER_TERMINATED
|
||||
* User code has requested to terminate the lexer.
|
||||
*/
|
||||
public size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%= @grammar.prefix %>token_info_t * out_token_info)
|
||||
{
|
||||
|
@ -23,7 +23,7 @@
|
||||
#define <%= @grammar.prefix.upcase %>USER_TERMINATED 6u
|
||||
|
||||
/** Token type. */
|
||||
typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>token_t;
|
||||
typedef <%= get_type_for(@grammar.terminate_token_id) %> <%= @grammar.prefix %>token_t;
|
||||
|
||||
/** Token IDs. */
|
||||
<% @grammar.tokens.each_with_index do |token, index| %>
|
||||
@ -33,6 +33,7 @@ typedef <%= get_type_for(@grammar.invalid_token_id) %> <%= @grammar.prefix %>tok
|
||||
<% end %>
|
||||
<% end %>
|
||||
#define INVALID_TOKEN_ID <%= @grammar.invalid_token_id %>u
|
||||
#define TERMINATE_TOKEN_ID <%= @grammar.terminate_token_id %>u
|
||||
|
||||
/** Code point type. */
|
||||
typedef uint32_t <%= @grammar.prefix %>code_point_t;
|
||||
|
@ -574,17 +574,18 @@ default.
|
||||
It can also be used when generating multiple lexers/parsers to be used in the
|
||||
same program to avoid symbol collisions.
|
||||
|
||||
##> User termination of the parser
|
||||
##> User termination of the lexer or parser
|
||||
|
||||
Propane supports allowing parser user code blocks to terminate execution of the
|
||||
parser.
|
||||
One example use of this functionality is to detect and report an error before
|
||||
the parser continues parsing the remainder of the input.
|
||||
Another use of this feature is to begin parsing input and determine whether a
|
||||
different parser should be used instead.
|
||||
Propane supports allowing lexer or parser user code blocks to terminate
|
||||
execution of the parser.
|
||||
Some example uses of this functionality could be to:
|
||||
|
||||
To terminate parsing from a parser user code block, use the `$terminate(code)`
|
||||
function, passing an integer expression argument.
|
||||
* Detect integer overflow when lexing an integer literal constant.
|
||||
* Detect and report an error as soon as possible during parsing before continuing to parse any more of the input.
|
||||
* Determine whether parsing should stop and instead be performed using a different parser version.
|
||||
|
||||
To terminate parsing from a lexer or parser user code block, use the
|
||||
`$terminate(code)` function, passing an integer expression argument.
|
||||
For example:
|
||||
|
||||
```
|
||||
|
@ -198,6 +198,16 @@ class Propane
|
||||
code = code.gsub(/\$token\(([$\w]+)\)/) do |match|
|
||||
"TOKEN_#{Token.code_name($1)}"
|
||||
end
|
||||
code = code.gsub(/\$terminate\((.*)\);/) do |match|
|
||||
user_terminate_code = $1
|
||||
retval = rule ? "P_USER_TERMINATED" : "TERMINATE_TOKEN_ID"
|
||||
case @language
|
||||
when "c"
|
||||
"context->user_terminate_code = (#{user_terminate_code}); return #{retval};"
|
||||
when "d"
|
||||
"context.user_terminate_code = (#{user_terminate_code}); return #{retval};"
|
||||
end
|
||||
end
|
||||
if parser
|
||||
code = code.gsub(/\$\$/) do |match|
|
||||
case @language
|
||||
@ -216,15 +226,6 @@ class Propane
|
||||
"statevalues[$-1-n_states+#{index}].pvalue.v_#{rule.components[index - 1].ptypename}"
|
||||
end
|
||||
end
|
||||
code = code.gsub(/\$terminate\((.*)\);/) do |match|
|
||||
user_terminate_code = $1
|
||||
case @language
|
||||
when "c"
|
||||
"context->user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
|
||||
when "d"
|
||||
"context.user_terminate_code = (#{user_terminate_code}); return P_USER_TERMINATED;"
|
||||
end
|
||||
end
|
||||
else
|
||||
code = code.gsub(/\$\$/) do |match|
|
||||
case @language
|
||||
|
@ -35,6 +35,10 @@ class Propane
|
||||
@tokens.size
|
||||
end
|
||||
|
||||
# Token ID that the generated lexers use as TERMINATE_TOKEN_ID, i.e. the
# value lexer user code returns to request termination of the lexer.
# Computed as one more than @tokens.size.
def terminate_token_id
|
||||
@tokens.size + 1
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def parse_grammar!
|
||||
|
@ -730,6 +730,25 @@ EOF
|
||||
expect(results.status).to eq 0
|
||||
end
|
||||
|
||||
# Writes a grammar in which the lexer code block for token `b` calls
# $terminate(8675309), builds the parser for the current language, then
# compiles and runs spec/test_user_terminate_lexer.<language>. The run is
# expected to produce no stderr output and exit with status 0.
it "allows the user to terminate the lexer" do
|
||||
write_grammar <<EOF
|
||||
token a;
|
||||
token b <<
|
||||
$terminate(8675309);
|
||||
>>
|
||||
token c;
|
||||
Start -> Any;
|
||||
Any -> a;
|
||||
Any -> b;
|
||||
Any -> c;
|
||||
EOF
|
||||
build_parser(language: language)
|
||||
compile("spec/test_user_terminate_lexer.#{language}", language: language)
|
||||
results = run
|
||||
expect(results.stderr).to eq ""
|
||||
expect(results.status).to eq 0
|
||||
end
|
||||
|
||||
it "allows the user to terminate the parser" do
|
||||
write_grammar <<EOF
|
||||
token a;
|
||||
|
19
spec/test_user_terminate_lexer.c
Normal file
19
spec/test_user_terminate_lexer.c
Normal file
@ -0,0 +1,19 @@
|
||||
#include "testparser.h"
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
char const * input = "a";
|
||||
p_context_t context;
|
||||
p_context_init(&context, (uint8_t const *)input, strlen(input));
|
||||
assert(p_parse(&context) == P_SUCCESS);
|
||||
|
||||
input = "b";
|
||||
p_context_init(&context, (uint8_t const *)input, strlen(input));
|
||||
assert(p_parse(&context) == P_USER_TERMINATED);
|
||||
assert(p_user_terminate_code(&context) == 8675309);
|
||||
|
||||
return 0;
|
||||
}
|
20
spec/test_user_terminate_lexer.d
Normal file
20
spec/test_user_terminate_lexer.d
Normal file
@ -0,0 +1,20 @@
|
||||
import testparser;
|
||||
import std.stdio;
|
||||
|
||||
// Program entry point. It performs no work of its own and returns 0; the
// checks in this file are in the unittest block.
int main()
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Checks user-requested termination from a lexer code block: input "a" must
 * parse successfully, while input "b" must stop the parse with
 * P_USER_TERMINATED and a user terminate code of 8675309. */
unittest
{
    p_context_t ctx;

    /* Input "a": the parse is expected to succeed. */
    string text = "a";
    p_context_init(&ctx, text);
    auto outcome = p_parse(&ctx);
    assert(outcome == P_SUCCESS);

    /* Input "b": the parse is expected to be terminated by user code. */
    text = "b";
    p_context_init(&ctx, text);
    outcome = p_parse(&ctx);
    assert(outcome == P_USER_TERMINATED);
    auto terminate_code = p_user_terminate_code(&ctx);
    assert(terminate_code == 8675309);
}
|
Loading…
x
Reference in New Issue
Block a user