Store token end position - #27

Josh Holtrop 2024-07-15 21:03:18 -04:00
parent 3aced70356
commit 87d6d29d60
8 changed files with 105 additions and 24 deletions

View File

@@ -226,7 +226,10 @@ typedef struct
     /** Number of bytes of input text used to match. */
     size_t length;
-    /** Input text position delta. */
+    /** Input text position delta to end of token. */
+    <%= @grammar.prefix %>position_t end_delta_position;
+    /** Input text position delta to next code point after token end. */
     <%= @grammar.prefix %>position_t delta_position;
     /** Accepting lexer state from the match. */
@@ -358,6 +361,7 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
         if (transition_state != INVALID_LEXER_STATE_ID)
         {
             attempt_match.length += code_point_length;
+            attempt_match.end_delta_position = attempt_match.delta_position;
             if (code_point == '\n')
             {
                 attempt_match.delta_position.row++;
@@ -490,11 +494,22 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
             }
             token_info.token = token_to_accept;
             token_info.length = match_info.length;
+            if (match_info.end_delta_position.row != 0u)
+            {
+                token_info.end_position.row = token_info.position.row + match_info.end_delta_position.row;
+                token_info.end_position.col = match_info.end_delta_position.col;
+            }
+            else
+            {
+                token_info.end_position.row = token_info.position.row;
+                token_info.end_position.col = token_info.position.col + match_info.end_delta_position.col;
+            }
             *out_token_info = token_info;
             return P_SUCCESS;
         case P_EOF:
             token_info.token = TOKEN___EOF;
+            token_info.end_position = token_info.position;
             *out_token_info = token_info;
             return P_SUCCESS;
@@ -952,6 +967,7 @@ size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * context)
             token_ast_node->token = token;
             token_ast_node->pvalue = token_info.pvalue;
             token_ast_node->position = token_info.position;
+            token_ast_node->end_position = token_info.end_position;
             state_values_stack_index(&statevalues, -1)->ast_node = token_ast_node;
 <% else %>
             state_values_stack_index(&statevalues, -1)->pvalue = token_info.pvalue;
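Taken together, the two lexer hunks above cooperate: find_longest_match() snapshots delta_position into end_delta_position before advancing past each code point, so the snapshot always names the last code point of the match, and attempt_lex_token() then folds that delta into the token's start position. Below is a minimal, self-contained C sketch of the same bookkeeping, using simplified stand-in types (position_t, match_t, scan(), and end_position() here are hypothetical illustrations, not the generated API):

    #include <assert.h>
    #include <stddef.h>

    /* Simplified stand-ins for the generated position/match types. */
    typedef struct { size_t row; size_t col; } position_t;
    typedef struct { position_t end_delta; position_t delta; } match_t;

    /* Scan a match, keeping two deltas: `delta` points one code point
     * past the match, while `end_delta` trails it and marks the last
     * code point actually inside the match. */
    static match_t scan(char const * text, size_t length)
    {
        match_t m = { {0u, 0u}, {0u, 0u} };
        for (size_t i = 0u; i < length; i++)
        {
            m.end_delta = m.delta; /* snapshot before advancing */
            if (text[i] == '\n')
            {
                m.delta.row++;
                m.delta.col = 0u;
            }
            else
            {
                m.delta.col++;
            }
        }
        return m;
    }

    /* Combine a token's start position with its end delta, mirroring
     * the branch added to attempt_lex_token(). */
    static position_t end_position(position_t start, position_t end_delta)
    {
        position_t end;
        if (end_delta.row != 0u)
        {
            end.row = start.row + end_delta.row;
            end.col = end_delta.col;
        }
        else
        {
            end.row = start.row;
            end.col = start.col + end_delta.col;
        }
        return end;
    }

    int main(void)
    {
        position_t start = {1u, 6u};
        match_t m = scan("777", 3u);
        position_t end = end_position(start, m.end_delta);
        assert(end.row == 1u && end.col == 8u); /* same span the lexer test checks */
        return 0;
    }

The row != 0u branch exists because a column delta is only relative to the start column while the match stays on its starting row; once a newline has been crossed, the delta's col is already an absolute column on the later row.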

View File

@@ -83,6 +83,7 @@ public struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>
     <%= @grammar.prefix %>token_t token;
     <%= @grammar.prefix %>value_t pvalue;
     <%= @grammar.prefix %>position_t position;
+    <%= @grammar.prefix %>position_t end_position;
 }
 <% @parser.rule_sets.each do |name, rule_set| %>
@@ -107,9 +108,12 @@ public struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %>
 /** Lexed token information. */
 public struct <%= @grammar.prefix %>token_info_t
 {
-    /** Text position where the token was found. */
+    /** Text position of first code point in token. */
     <%= @grammar.prefix %>position_t position;
+    /** Text position of last code point in token. */
+    <%= @grammar.prefix %>position_t end_position;
     /** Number of input bytes used by the token. */
     size_t length;
@@ -373,7 +377,10 @@ private struct lexer_match_info_t
     /** Number of bytes of input text used to match. */
     size_t length;
-    /** Input text position delta. */
+    /** Input text position delta to end of token. */
+    <%= @grammar.prefix %>position_t end_delta_position;
+    /** Input text position delta to next code point after token end. */
     <%= @grammar.prefix %>position_t delta_position;
     /** Accepting lexer state from the match. */
@@ -501,6 +508,7 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
         if (transition_state != INVALID_LEXER_STATE_ID)
        {
             attempt_match.length += code_point_length;
+            attempt_match.end_delta_position = attempt_match.delta_position;
             if (code_point == '\n')
             {
                 attempt_match.delta_position.row++;
@@ -633,11 +641,22 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
             }
             token_info.token = token_to_accept;
             token_info.length = match_info.length;
+            if (match_info.end_delta_position.row != 0u)
+            {
+                token_info.end_position.row = token_info.position.row + match_info.end_delta_position.row;
+                token_info.end_position.col = match_info.end_delta_position.col;
+            }
+            else
+            {
+                token_info.end_position.row = token_info.position.row;
+                token_info.end_position.col = token_info.position.col + match_info.end_delta_position.col;
+            }
             *out_token_info = token_info;
             return P_SUCCESS;
         case P_EOF:
             token_info.token = TOKEN___EOF;
+            token_info.end_position = token_info.position;
             *out_token_info = token_info;
             return P_SUCCESS;
@@ -997,7 +1016,7 @@ public size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * cont
             {
                 /* We shifted a token, mark it consumed. */
 <% if @grammar.ast %>
-                <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> * token_ast_node = new <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>(token, token_info.pvalue, token_info.position);
+                <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> * token_ast_node = new <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>(token, token_info.pvalue, token_info.position, token_info.end_position);
                 statevalues[$-1].ast_node = token_ast_node;
 <% else %>
                 statevalues[$-1].pvalue = token_info.pvalue;

View File

@@ -75,6 +75,7 @@ typedef struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>
     <%= @grammar.prefix %>token_t token;
     <%= @grammar.prefix %>value_t pvalue;
     <%= @grammar.prefix %>position_t position;
+    <%= @grammar.prefix %>position_t end_position;
 } <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>;
 <% @parser.rule_sets.each do |name, rule_set| %>
@@ -105,9 +106,12 @@ typedef struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %>
 /** Lexed token information. */
 typedef struct
 {
-    /** Text position where the token was found. */
+    /** Text position of first code point in token. */
     <%= @grammar.prefix %>position_t position;
+    /** Text position of last code point in token. */
+    <%= @grammar.prefix %>position_t end_position;
     /** Number of input bytes used by the token. */
     size_t length;
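With position and end_position both in the token info struct, a caller can report a token's full inclusive span. A hedged usage sketch, assuming the p_ prefix used by the tests below and a hypothetical generated header name (testparser.h):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Generated parser header; the name is assumed for illustration. */
    #include "testparser.h"

    /* Lex the input and print each token's inclusive span as
     * start-row:start-col .. end-row:end-col using the new field. */
    int main(void)
    {
        char const * input = "1 + 2 * 3\n777 + 4 + 777";
        p_context_t context;
        p_context_init(&context, (uint8_t const *)input, strlen(input));
        p_token_info_t token_info;
        while (p_lex(&context, &token_info) == P_SUCCESS)
        {
            if (token_info.token == TOKEN___EOF)
            {
                break;
            }
            printf("token %u at %lu:%lu..%lu:%lu\n",
                   (unsigned)token_info.token,
                   (unsigned long)token_info.position.row,
                   (unsigned long)token_info.position.col,
                   (unsigned long)token_info.end_position.row,
                   (unsigned long)token_info.end_position.col);
        }
        return 0;
    }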

View File

@@ -1086,13 +1086,13 @@ EOF
 ast;
 token a;
-token b;
-token c;
+token bb;
+token ccc;
 drop /\\s+/;
 Start -> T T T;
 T -> a;
-T -> b;
-T -> c;
+T -> bb;
+T -> ccc;
 EOF
     run_propane(language: language)
     compile("spec/test_ast_token_positions.#{language}", language: language)

View File

@@ -5,28 +5,40 @@
 int main()
 {
-    char const * input = "abc";
+    char const * input = "abbccc";
     p_context_t context;
     p_context_init(&context, (uint8_t const *)input, strlen(input));
     assert(p_parse(&context) == P_SUCCESS);
     Start * start = p_result(&context);
     assert_eq(0, start->pT1->pToken->position.row);
     assert_eq(0, start->pT1->pToken->position.col);
+    assert_eq(0, start->pT1->pToken->end_position.row);
+    assert_eq(0, start->pT1->pToken->end_position.col);
     assert_eq(0, start->pT2->pToken->position.row);
     assert_eq(1, start->pT2->pToken->position.col);
+    assert_eq(0, start->pT2->pToken->end_position.row);
+    assert_eq(2, start->pT2->pToken->end_position.col);
     assert_eq(0, start->pT3->pToken->position.row);
-    assert_eq(2, start->pT3->pToken->position.col);
+    assert_eq(3, start->pT3->pToken->position.col);
+    assert_eq(0, start->pT3->pToken->end_position.row);
+    assert_eq(5, start->pT3->pToken->end_position.col);
-    input = "\n\n a\nc\n\n a";
+    input = "\n\n bb\nccc\n\n a";
     p_context_init(&context, (uint8_t const *)input, strlen(input));
     assert(p_parse(&context) == P_SUCCESS);
     start = p_result(&context);
     assert_eq(2, start->pT1->pToken->position.row);
     assert_eq(2, start->pT1->pToken->position.col);
+    assert_eq(2, start->pT1->pToken->end_position.row);
+    assert_eq(3, start->pT1->pToken->end_position.col);
     assert_eq(3, start->pT2->pToken->position.row);
     assert_eq(0, start->pT2->pToken->position.col);
+    assert_eq(3, start->pT2->pToken->end_position.row);
+    assert_eq(2, start->pT2->pToken->end_position.col);
     assert_eq(5, start->pT3->pToken->position.row);
     assert_eq(5, start->pT3->pToken->position.col);
+    assert_eq(5, start->pT3->pToken->end_position.row);
+    assert_eq(5, start->pT3->pToken->end_position.col);
     return 0;
 }
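As a cross-check of the asserted values: in "abbccc", a occupies column 0, bb columns 1..2, and ccc columns 3..5, so each token's end_position.col names its last column, not one past it. In the multi-line input "\n\n bb\nccc\n\n a", bb sits on row 2 at columns 2..3, ccc on row 3 at columns 0..2, and a on row 5 at column 5, which is exactly what the end_position asserts encode.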

View File

@@ -9,26 +9,38 @@ int main()
 unittest
 {
-    string input = "abc";
+    string input = "abbccc";
     p_context_t context;
     p_context_init(&context, input);
     assert(p_parse(&context) == P_SUCCESS);
     Start * start = p_result(&context);
     assert_eq(0, start.pT1.pToken.position.row);
     assert_eq(0, start.pT1.pToken.position.col);
+    assert_eq(0, start.pT1.pToken.end_position.row);
+    assert_eq(0, start.pT1.pToken.end_position.col);
     assert_eq(0, start.pT2.pToken.position.row);
     assert_eq(1, start.pT2.pToken.position.col);
+    assert_eq(0, start.pT2.pToken.end_position.row);
+    assert_eq(2, start.pT2.pToken.end_position.col);
     assert_eq(0, start.pT3.pToken.position.row);
-    assert_eq(2, start.pT3.pToken.position.col);
+    assert_eq(3, start.pT3.pToken.position.col);
+    assert_eq(0, start.pT3.pToken.end_position.row);
+    assert_eq(5, start.pT3.pToken.end_position.col);
-    input = "\n\n a\nc\n\n a";
+    input = "\n\n bb\nccc\n\n a";
     p_context_init(&context, input);
     assert(p_parse(&context) == P_SUCCESS);
     start = p_result(&context);
     assert_eq(2, start.pT1.pToken.position.row);
     assert_eq(2, start.pT1.pToken.position.col);
+    assert_eq(2, start.pT1.pToken.end_position.row);
+    assert_eq(3, start.pT1.pToken.end_position.col);
     assert_eq(3, start.pT2.pToken.position.row);
     assert_eq(0, start.pT2.pToken.position.col);
+    assert_eq(3, start.pT2.pToken.end_position.row);
+    assert_eq(2, start.pT2.pToken.end_position.col);
     assert_eq(5, start.pT3.pToken.position.row);
     assert_eq(5, start.pT3.pToken.position.col);
+    assert_eq(5, start.pT3.pToken.end_position.row);
+    assert_eq(5, start.pT3.pToken.end_position.col);
 }

View File

@@ -43,41 +43,57 @@ int main()
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 0u);
     assert(token_info.position.col == 0u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 0u);
     assert(token_info.length == 1u);
     assert(token_info.token == TOKEN_int);
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 0u);
     assert(token_info.position.col == 2u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 2u);
     assert(token_info.length == 1u);
     assert(token_info.token == TOKEN_plus);
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 0u);
     assert(token_info.position.col == 4u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 4u);
     assert(token_info.length == 1u);
     assert(token_info.token == TOKEN_int);
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 0u);
     assert(token_info.position.col == 6u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 6u);
     assert(token_info.length == 1u);
     assert(token_info.token == TOKEN_times);
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 1u);
     assert(token_info.position.col == 0u);
+    assert(token_info.end_position.row == 1u);
+    assert(token_info.end_position.col == 2u);
     assert(token_info.length == 3u);
     assert(token_info.token == TOKEN_int);
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 1u);
     assert(token_info.position.col == 4u);
+    assert(token_info.end_position.row == 1u);
+    assert(token_info.end_position.col == 4u);
     assert(token_info.length == 1u);
     assert(token_info.token == TOKEN_plus);
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 1u);
     assert(token_info.position.col == 6u);
+    assert(token_info.end_position.row == 1u);
+    assert(token_info.end_position.col == 8u);
     assert(token_info.length == 3u);
     assert(token_info.token == TOKEN_int);
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 1u);
     assert(token_info.position.col == 9u);
+    assert(token_info.end_position.row == 1u);
+    assert(token_info.end_position.col == 9u);
     assert(token_info.length == 0u);
     assert(token_info.token == TOKEN___EOF);
@@ -85,6 +101,8 @@ int main()
     assert(p_lex(&context, &token_info) == P_SUCCESS);
     assert(token_info.position.row == 0u);
     assert(token_info.position.col == 0u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 0u);
     assert(token_info.length == 0u);
     assert(token_info.token == TOKEN___EOF);
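Note the convention these asserts pin down: for a one-code-point token, end_position equals position; for the three-character int tokens the end column is the start column plus 2; and the zero-length EOF token reports end_position equal to its own position, matching the token_info.end_position = token_info.position assignment added to the P_EOF case above.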

View File

@@ -47,23 +47,23 @@ unittest
     p_context_t context;
     p_context_init(&context, input);
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 0), 1, TOKEN_int));
+    assert(token_info == p_token_info_t(p_position_t(0, 0), p_position_t(0, 0), 1, TOKEN_int));
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 2), 1, TOKEN_plus));
+    assert(token_info == p_token_info_t(p_position_t(0, 2), p_position_t(0, 2), 1, TOKEN_plus));
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 4), 1, TOKEN_int));
+    assert(token_info == p_token_info_t(p_position_t(0, 4), p_position_t(0, 4), 1, TOKEN_int));
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 6), 1, TOKEN_times));
+    assert(token_info == p_token_info_t(p_position_t(0, 6), p_position_t(0, 6), 1, TOKEN_times));
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(1, 0), 3, TOKEN_int));
+    assert(token_info == p_token_info_t(p_position_t(1, 0), p_position_t(1, 2), 3, TOKEN_int));
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(1, 4), 1, TOKEN_plus));
+    assert(token_info == p_token_info_t(p_position_t(1, 4), p_position_t(1, 4), 1, TOKEN_plus));
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(1, 6), 3, TOKEN_int));
+    assert(token_info == p_token_info_t(p_position_t(1, 6), p_position_t(1, 8), 3, TOKEN_int));
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(1, 9), 0, TOKEN___EOF));
+    assert(token_info == p_token_info_t(p_position_t(1, 9), p_position_t(1, 9), 0, TOKEN___EOF));
     p_context_init(&context, "");
     assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 0), 0, TOKEN___EOF));
+    assert(token_info == p_token_info_t(p_position_t(0, 0), p_position_t(0, 0), 0, TOKEN___EOF));
 }