From 87d6d29d60cf11190b7aafc22b2abc0f3da8d21c Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Mon, 15 Jul 2024 21:03:18 -0400 Subject: [PATCH] Store token end position - #27 --- assets/parser.c.erb | 18 +++++++++++++++++- assets/parser.d.erb | 25 ++++++++++++++++++++++--- assets/parser.h.erb | 6 +++++- spec/propane_spec.rb | 8 ++++---- spec/test_ast_token_positions.c | 18 +++++++++++++++--- spec/test_ast_token_positions.d | 18 +++++++++++++++--- spec/test_lexer.c | 18 ++++++++++++++++++ spec/test_lexer.d | 18 +++++++++--------- 8 files changed, 105 insertions(+), 24 deletions(-) diff --git a/assets/parser.c.erb b/assets/parser.c.erb index 6928e54..4915b1a 100644 --- a/assets/parser.c.erb +++ b/assets/parser.c.erb @@ -226,7 +226,10 @@ typedef struct /** Number of bytes of input text used to match. */ size_t length; - /** Input text position delta. */ + /** Input text position delta to end of token. */ + <%= @grammar.prefix %>position_t end_delta_position; + + /** Input text position delta to next code point after token end. */ <%= @grammar.prefix %>position_t delta_position; /** Accepting lexer state from the match. */ @@ -358,6 +361,7 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context, if (transition_state != INVALID_LEXER_STATE_ID) { attempt_match.length += code_point_length; + attempt_match.end_delta_position = attempt_match.delta_position; if (code_point == '\n') { attempt_match.delta_position.row++; @@ -490,11 +494,22 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @ } token_info.token = token_to_accept; token_info.length = match_info.length; + if (match_info.end_delta_position.row != 0u) + { + token_info.end_position.row = token_info.position.row + match_info.end_delta_position.row; + token_info.end_position.col = match_info.end_delta_position.col; + } + else + { + token_info.end_position.row = token_info.position.row; + token_info.end_position.col = token_info.position.col + match_info.end_delta_position.col; + } *out_token_info = token_info; return P_SUCCESS; case P_EOF: token_info.token = TOKEN___EOF; + token_info.end_position = token_info.position; *out_token_info = token_info; return P_SUCCESS; @@ -952,6 +967,7 @@ size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * context) token_ast_node->token = token; token_ast_node->pvalue = token_info.pvalue; token_ast_node->position = token_info.position; + token_ast_node->end_position = token_info.end_position; state_values_stack_index(&statevalues, -1)->ast_node = token_ast_node; <% else %> state_values_stack_index(&statevalues, -1)->pvalue = token_info.pvalue; diff --git a/assets/parser.d.erb b/assets/parser.d.erb index 41e6297..a83d00c 100644 --- a/assets/parser.d.erb +++ b/assets/parser.d.erb @@ -83,6 +83,7 @@ public struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> <%= @grammar.prefix %>token_t token; <%= @grammar.prefix %>value_t pvalue; <%= @grammar.prefix %>position_t position; + <%= @grammar.prefix %>position_t end_position; } <% @parser.rule_sets.each do |name, rule_set| %> @@ -107,9 +108,12 @@ public struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %> /** Lexed token information. */ public struct <%= @grammar.prefix %>token_info_t { - /** Text position where the token was found. */ + /** Text position of first code point in token. */ <%= @grammar.prefix %>position_t position; + /** Text position of last code point in token. 
*/ + <%= @grammar.prefix %>position_t end_position; + /** Number of input bytes used by the token. */ size_t length; @@ -373,7 +377,10 @@ private struct lexer_match_info_t /** Number of bytes of input text used to match. */ size_t length; - /** Input text position delta. */ + /** Input text position delta to end of token. */ + <%= @grammar.prefix %>position_t end_delta_position; + + /** Input text position delta to next code point after token end. */ <%= @grammar.prefix %>position_t delta_position; /** Accepting lexer state from the match. */ @@ -501,6 +508,7 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context, if (transition_state != INVALID_LEXER_STATE_ID) { attempt_match.length += code_point_length; + attempt_match.end_delta_position = attempt_match.delta_position; if (code_point == '\n') { attempt_match.delta_position.row++; @@ -633,11 +641,22 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= } token_info.token = token_to_accept; token_info.length = match_info.length; + if (match_info.end_delta_position.row != 0u) + { + token_info.end_position.row = token_info.position.row + match_info.end_delta_position.row; + token_info.end_position.col = match_info.end_delta_position.col; + } + else + { + token_info.end_position.row = token_info.position.row; + token_info.end_position.col = token_info.position.col + match_info.end_delta_position.col; + } *out_token_info = token_info; return P_SUCCESS; case P_EOF: token_info.token = TOKEN___EOF; + token_info.end_position = token_info.position; *out_token_info = token_info; return P_SUCCESS; @@ -997,7 +1016,7 @@ public size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * cont { /* We shifted a token, mark it consumed. */ <% if @grammar.ast %> - <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> * token_ast_node = new <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>(token, token_info.pvalue, token_info.position); + <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> * token_ast_node = new <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>(token, token_info.pvalue, token_info.position, token_info.end_position); statevalues[$-1].ast_node = token_ast_node; <% else %> statevalues[$-1].pvalue = token_info.pvalue; diff --git a/assets/parser.h.erb b/assets/parser.h.erb index e5d21e8..8657ef6 100644 --- a/assets/parser.h.erb +++ b/assets/parser.h.erb @@ -75,6 +75,7 @@ typedef struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> <%= @grammar.prefix %>token_t token; <%= @grammar.prefix %>value_t pvalue; <%= @grammar.prefix %>position_t position; + <%= @grammar.prefix %>position_t end_position; } <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>; <% @parser.rule_sets.each do |name, rule_set| %> @@ -105,9 +106,12 @@ typedef struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %> /** Lexed token information. */ typedef struct { - /** Text position where the token was found. */ + /** Text position of first code point in token. */ <%= @grammar.prefix %>position_t position; + /** Text position of last code point in token. */ + <%= @grammar.prefix %>position_t end_position; + /** Number of input bytes used by the token. 
*/
     size_t length;
 
diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb
index 93d9335..0da89ff 100644
--- a/spec/propane_spec.rb
+++ b/spec/propane_spec.rb
@@ -1086,13 +1086,13 @@ EOF
 ast;
 
 token a;
-token b;
-token c;
+token bb;
+token ccc;
 drop /\\s+/;
 Start -> T T T;
 T -> a;
-T -> b;
-T -> c;
+T -> bb;
+T -> ccc;
 EOF
     run_propane(language: language)
     compile("spec/test_ast_token_positions.#{language}", language: language)
diff --git a/spec/test_ast_token_positions.c b/spec/test_ast_token_positions.c
index 1547445..7282f06 100644
--- a/spec/test_ast_token_positions.c
+++ b/spec/test_ast_token_positions.c
@@ -5,28 +5,40 @@
 
 int main()
 {
-    char const * input = "abc";
+    char const * input = "abbccc";
     p_context_t context;
     p_context_init(&context, (uint8_t const *)input, strlen(input));
     assert(p_parse(&context) == P_SUCCESS);
     Start * start = p_result(&context);
     assert_eq(0, start->pT1->pToken->position.row);
     assert_eq(0, start->pT1->pToken->position.col);
+    assert_eq(0, start->pT1->pToken->end_position.row);
+    assert_eq(0, start->pT1->pToken->end_position.col);
     assert_eq(0, start->pT2->pToken->position.row);
     assert_eq(1, start->pT2->pToken->position.col);
+    assert_eq(0, start->pT2->pToken->end_position.row);
+    assert_eq(2, start->pT2->pToken->end_position.col);
     assert_eq(0, start->pT3->pToken->position.row);
-    assert_eq(2, start->pT3->pToken->position.col);
+    assert_eq(3, start->pT3->pToken->position.col);
+    assert_eq(0, start->pT3->pToken->end_position.row);
+    assert_eq(5, start->pT3->pToken->end_position.col);
 
-    input = "\n\n  a\nc\n\n     a";
+    input = "\n\n  bb\nccc\n\n     a";
     p_context_init(&context, (uint8_t const *)input, strlen(input));
     assert(p_parse(&context) == P_SUCCESS);
     start = p_result(&context);
     assert_eq(2, start->pT1->pToken->position.row);
     assert_eq(2, start->pT1->pToken->position.col);
+    assert_eq(2, start->pT1->pToken->end_position.row);
+    assert_eq(3, start->pT1->pToken->end_position.col);
     assert_eq(3, start->pT2->pToken->position.row);
     assert_eq(0, start->pT2->pToken->position.col);
+    assert_eq(3, start->pT2->pToken->end_position.row);
+    assert_eq(2, start->pT2->pToken->end_position.col);
     assert_eq(5, start->pT3->pToken->position.row);
     assert_eq(5, start->pT3->pToken->position.col);
+    assert_eq(5, start->pT3->pToken->end_position.row);
+    assert_eq(5, start->pT3->pToken->end_position.col);
 
     return 0;
 }
diff --git a/spec/test_ast_token_positions.d b/spec/test_ast_token_positions.d
index cc0768c..0cbd6d6 100644
--- a/spec/test_ast_token_positions.d
+++ b/spec/test_ast_token_positions.d
@@ -9,26 +9,38 @@ int main()
 
 unittest
 {
-    string input = "abc";
+    string input = "abbccc";
     p_context_t context;
     p_context_init(&context, input);
     assert(p_parse(&context) == P_SUCCESS);
     Start * start = p_result(&context);
     assert_eq(0, start.pT1.pToken.position.row);
     assert_eq(0, start.pT1.pToken.position.col);
+    assert_eq(0, start.pT1.pToken.end_position.row);
+    assert_eq(0, start.pT1.pToken.end_position.col);
     assert_eq(0, start.pT2.pToken.position.row);
     assert_eq(1, start.pT2.pToken.position.col);
+    assert_eq(0, start.pT2.pToken.end_position.row);
+    assert_eq(2, start.pT2.pToken.end_position.col);
     assert_eq(0, start.pT3.pToken.position.row);
-    assert_eq(2, start.pT3.pToken.position.col);
+    assert_eq(3, start.pT3.pToken.position.col);
+    assert_eq(0, start.pT3.pToken.end_position.row);
+    assert_eq(5, start.pT3.pToken.end_position.col);
 
-    input = "\n\n  a\nc\n\n     a";
+    input = "\n\n  bb\nccc\n\n     a";
     p_context_init(&context, input);
    assert(p_parse(&context) == P_SUCCESS);
    start = p_result(&context);
assert_eq(2, start.pT1.pToken.position.row); assert_eq(2, start.pT1.pToken.position.col); + assert_eq(2, start.pT1.pToken.end_position.row); + assert_eq(3, start.pT1.pToken.end_position.col); assert_eq(3, start.pT2.pToken.position.row); assert_eq(0, start.pT2.pToken.position.col); + assert_eq(3, start.pT2.pToken.end_position.row); + assert_eq(2, start.pT2.pToken.end_position.col); assert_eq(5, start.pT3.pToken.position.row); assert_eq(5, start.pT3.pToken.position.col); + assert_eq(5, start.pT3.pToken.end_position.row); + assert_eq(5, start.pT3.pToken.end_position.col); } diff --git a/spec/test_lexer.c b/spec/test_lexer.c index 551ecb4..6ac68b3 100644 --- a/spec/test_lexer.c +++ b/spec/test_lexer.c @@ -43,41 +43,57 @@ int main() assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 0u); assert(token_info.position.col == 0u); + assert(token_info.end_position.row == 0u); + assert(token_info.end_position.col == 0u); assert(token_info.length == 1u); assert(token_info.token == TOKEN_int); assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 0u); assert(token_info.position.col == 2u); + assert(token_info.end_position.row == 0u); + assert(token_info.end_position.col == 2u); assert(token_info.length == 1u); assert(token_info.token == TOKEN_plus); assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 0u); assert(token_info.position.col == 4u); + assert(token_info.end_position.row == 0u); + assert(token_info.end_position.col == 4u); assert(token_info.length == 1u); assert(token_info.token == TOKEN_int); assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 0u); assert(token_info.position.col == 6u); + assert(token_info.end_position.row == 0u); + assert(token_info.end_position.col == 6u); assert(token_info.length == 1u); assert(token_info.token == TOKEN_times); assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 1u); assert(token_info.position.col == 0u); + assert(token_info.end_position.row == 1u); + assert(token_info.end_position.col == 2u); assert(token_info.length == 3u); assert(token_info.token == TOKEN_int); assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 1u); assert(token_info.position.col == 4u); + assert(token_info.end_position.row == 1u); + assert(token_info.end_position.col == 4u); assert(token_info.length == 1u); assert(token_info.token == TOKEN_plus); assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 1u); assert(token_info.position.col == 6u); + assert(token_info.end_position.row == 1u); + assert(token_info.end_position.col == 8u); assert(token_info.length == 3u); assert(token_info.token == TOKEN_int); assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 1u); assert(token_info.position.col == 9u); + assert(token_info.end_position.row == 1u); + assert(token_info.end_position.col == 9u); assert(token_info.length == 0u); assert(token_info.token == TOKEN___EOF); @@ -85,6 +101,8 @@ int main() assert(p_lex(&context, &token_info) == P_SUCCESS); assert(token_info.position.row == 0u); assert(token_info.position.col == 0u); + assert(token_info.end_position.row == 0u); + assert(token_info.end_position.col == 0u); assert(token_info.length == 0u); assert(token_info.token == TOKEN___EOF); diff --git a/spec/test_lexer.d b/spec/test_lexer.d index 3d0f655..9679c3b 100644 --- a/spec/test_lexer.d +++ b/spec/test_lexer.d @@ -47,23 +47,23 @@ unittest 
p_context_t context; p_context_init(&context, input); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(0, 0), 1, TOKEN_int)); + assert(token_info == p_token_info_t(p_position_t(0, 0), p_position_t(0, 0), 1, TOKEN_int)); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(0, 2), 1, TOKEN_plus)); + assert(token_info == p_token_info_t(p_position_t(0, 2), p_position_t(0, 2), 1, TOKEN_plus)); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(0, 4), 1, TOKEN_int)); + assert(token_info == p_token_info_t(p_position_t(0, 4), p_position_t(0, 4), 1, TOKEN_int)); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(0, 6), 1, TOKEN_times)); + assert(token_info == p_token_info_t(p_position_t(0, 6), p_position_t(0, 6), 1, TOKEN_times)); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(1, 0), 3, TOKEN_int)); + assert(token_info == p_token_info_t(p_position_t(1, 0), p_position_t(1, 2), 3, TOKEN_int)); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(1, 4), 1, TOKEN_plus)); + assert(token_info == p_token_info_t(p_position_t(1, 4), p_position_t(1, 4), 1, TOKEN_plus)); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(1, 6), 3, TOKEN_int)); + assert(token_info == p_token_info_t(p_position_t(1, 6), p_position_t(1, 8), 3, TOKEN_int)); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(1, 9), 0, TOKEN___EOF)); + assert(token_info == p_token_info_t(p_position_t(1, 9), p_position_t(1, 9), 0, TOKEN___EOF)); p_context_init(&context, ""); assert(p_lex(&context, &token_info) == P_SUCCESS); - assert(token_info == p_token_info_t(p_position_t(0, 0), 0, TOKEN___EOF)); + assert(token_info == p_token_info_t(p_position_t(0, 0), p_position_t(0, 0), 0, TOKEN___EOF)); }
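
The heart of the patch is the branch in attempt_lex_token() (mirrored in the D template) that turns the match's relative end_delta_position into an absolute end_position. Below is a minimal standalone sketch of that composition; position_t and compose_end are illustrative stand-ins for the generated <prefix>position_t type and the inline generated code, not part of the generated API:

    /* Standalone sketch (not generated code): how an absolute token end
     * position is composed from the token's start position and the
     * relative delta to its last code point. */
    #include <assert.h>
    #include <stddef.h>

    typedef struct
    {
        size_t row;
        size_t col;
    } position_t; /* stand-in for the generated <prefix>position_t */

    static position_t compose_end(position_t start, position_t end_delta)
    {
        position_t end;
        if (end_delta.row != 0u)
        {
            /* The delta crossed a newline: its col is already absolute
             * within the token's final row. */
            end.row = start.row + end_delta.row;
            end.col = end_delta.col;
        }
        else
        {
            /* Same row: columns simply add. */
            end.row = start.row;
            end.col = start.col + end_delta.col;
        }
        return end;
    }

    int main(void)
    {
        /* "123" lexed at (1, 0): last code point is at (1, 2), matching
         * the expectations in spec/test_lexer.c. */
        position_t end = compose_end((position_t){1u, 0u}, (position_t){0u, 2u});
        assert(end.row == 1u && end.col == 2u);

        /* A match spanning two newlines from (2, 2), ending at column 4
         * of its final row: end is (4, 4), not (2, 2 + col). */
        end = compose_end((position_t){2u, 2u}, (position_t){2u, 4u});
        assert(end.row == 4u && end.col == 4u);
        return 0;
    }

The two branches exist because a delta that crosses at least one newline already carries an absolute column within its final row, while a same-row delta is purely relative.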
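The other half is the bookkeeping in find_longest_match(): the current delta is snapshotted into end_delta_position before being advanced past each accepted code point, so it always addresses the last code point of the match rather than the position just past it. A sketch of that loop under the same caveats (match_t and scan are hypothetical names, and single bytes stand in for decoded code points):

    /* Standalone sketch (hypothetical names): the delta bookkeeping from
     * find_longest_match(). end_delta trails delta by one step, so when
     * the match stops it addresses the last code point matched, not the
     * position just past it. */
    #include <assert.h>
    #include <stddef.h>

    typedef struct
    {
        size_t row;
        size_t col;
    } position_t;

    typedef struct
    {
        position_t delta;     /* delta to next code point after the match */
        position_t end_delta; /* delta to last code point in the match */
    } match_t;

    static match_t scan(char const * text, size_t length)
    {
        match_t m = {{0u, 0u}, {0u, 0u}};
        for (size_t i = 0u; i < length; i++)
        {
            m.end_delta = m.delta; /* snapshot before advancing */
            if (text[i] == '\n')
            {
                m.delta.row++;
                m.delta.col = 0u;
            }
            else
            {
                m.delta.col++;
            }
        }
        return m;
    }

    int main(void)
    {
        /* "123": the last code point sits 2 columns past the start. */
        match_t m = scan("123", 3u);
        assert(m.end_delta.row == 0u && m.end_delta.col == 2u);
        assert(m.delta.row == 0u && m.delta.col == 3u);

        /* A match containing a newline: "a\nbb" ends at (1, 1). */
        m = scan("a\nbb", 4u);
        assert(m.end_delta.row == 1u && m.end_delta.col == 1u);
        return 0;
    }

This trailing snapshot is why a one-code-point token such as TOKEN_plus reports an end_position equal to its position, and why TOKEN___EOF, which consumes no input, is assigned end_position = position explicitly in attempt_lex_token().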