Add Position struct to track text positions

Josh Holtrop 2023-07-10 11:18:57 -04:00
parent 80ac6c17f0
commit 1c50d37a3e
2 changed files with 38 additions and 28 deletions
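For orientation: the generated lexer previously carried separate row/column counters, and this commit folds them into a single Position value (0-based row and col) that is advanced per decoded code point, with the column resetting to zero on a newline. Below is a minimal standalone sketch of that tracking rule. The field names mirror the diff; the advance() helper and the main() driver are illustrative only, not part of the generated parser.

    import std.stdio;

    /* Mirrors the Position struct added in this commit (0-based row/column). */
    struct Position
    {
        uint row;
        uint col;
    }

    /* Illustrative helper: advance a Position over one character, resetting
     * the column when a newline is consumed. */
    Position advance(Position pos, dchar c)
    {
        if (c == '\n')
        {
            pos.row++;
            pos.col = 0u;
        }
        else
        {
            pos.col++;
        }
        return pos;
    }

    void main()
    {
        Position pos;
        foreach (dchar c; "5 + 4 * \n")
        {
            pos = advance(pos, c);
        }
        // After consuming the prefix up to and including the newline, the next
        // token ("677" in the unit test below) starts at row 1, column 0.
        writeln(pos); // Position(1, 0)
    }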


@@ -54,6 +54,20 @@ class <%= @classname %>
 <% end %>
     }
+    /**
+     * A structure to keep track of parser position.
+     *
+     * This is useful for reporting errors, etc...
+     */
+    static struct Position
+    {
+        /** Input text row (0-based). */
+        uint row;
+        /** Input text column (0-based). */
+        uint col;
+    }
     static class Decoder
     {
         /**
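The new struct's doc comment says Position exists mainly for error reporting. As a purely hypothetical illustration (not part of this commit or the generated parser), code holding a lexed position could render the 0-based fields in the conventional 1-based line/column form like this:

    import std.format : format;

    struct Position { uint row; uint col; } // same shape as the struct added above (0-based)

    /* Hypothetical helper: format a Position the way error messages are
     * conventionally reported (1-based line and column). */
    string describe(Position pos)
    {
        return format("line %s, column %s", pos.row + 1, pos.col + 1);
    }

    unittest
    {
        assert(describe(Position(1, 4)) == "line 2, column 5");
    }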
@@ -196,17 +210,15 @@ class <%= @classname %>
         public static struct TokenInfo
         {
-            size_t row;
-            size_t col;
+            Position position;
             size_t length;
             Token token;
             ParserValue pvalue;
         }
 
         private string m_input;
-        private size_t m_input_position;
-        private size_t m_input_row;
-        private size_t m_input_col;
+        private size_t m_input_index;
+        private Position m_input_position;
         private size_t m_mode;
 
         this(string input)
@@ -274,8 +286,7 @@
         private size_t attempt_lex_token(TokenInfo * out_token_info)
         {
             TokenInfo token_info;
-            token_info.row = m_input_row;
-            token_info.col = m_input_col;
+            token_info.position = m_input_position;
             token_info.token = INVALID_TOKEN_ID;
             *out_token_info = token_info; // TODO: remove
             MatchInfo match_info;
@@ -287,7 +298,7 @@
                 Token token_to_accept = match_info.accepting_state.token;
                 if (match_info.accepting_state.code_id != INVALID_USER_CODE_ID)
                 {
-                    Token user_code_token = user_code(match_info.accepting_state.code_id, m_input[m_input_position..(m_input_position + match_info.length)], &token_info);
+                    Token user_code_token = user_code(match_info.accepting_state.code_id, m_input[m_input_index..(m_input_index + match_info.length)], &token_info);
                     /* An invalid Token from user_code() means that the user
                      * code did not explicitly return a token. So only override
                      * the token to return if the user code does explicitly
@@ -299,15 +310,15 @@
                 }
 
                 /* Update the input position tracking. */
-                m_input_position += match_info.length;
-                m_input_row += match_info.delta_row;
-                if (match_info.delta_row != 0u)
+                m_input_index += match_info.length;
+                m_input_position.row += match_info.delta_position.row;
+                if (match_info.delta_position.row != 0u)
                 {
-                    m_input_col = match_info.delta_col;
+                    m_input_position.col = match_info.delta_position.col;
                 }
                 else
                 {
-                    m_input_col += match_info.delta_col;
+                    m_input_position.col += match_info.delta_position.col;
                 }
                 if (token_to_accept == INVALID_TOKEN_ID)
                 {
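The branch in the hunk above is the subtle part of the update: match_info.delta_position is a relative offset measured over the matched text, so its column can only be added to the running column when the match stayed on one row. Once the match crossed a newline, delta_position.col is already an absolute column on the final row and must replace the old column rather than extend it. A small self-contained check of that rule; the merge() helper is an illustrative restatement, while the field names match the diff:

    struct Position { uint row; uint col; } // as in the diff (0-based)

    /* Illustrative restatement of the position update in attempt_lex_token(). */
    Position merge(Position pos, Position delta)
    {
        pos.row += delta.row;
        if (delta.row != 0u)
            pos.col = delta.col;   // the match crossed a newline: column restarts
        else
            pos.col += delta.col;  // same row: column simply advances
        return pos;
    }

    unittest
    {
        // A match of "4 * \n677" starting at (0, 4) spans one newline and then
        // three characters, so the position afterwards is (1, 3); naively adding
        // the column delta would give (1, 7) instead.
        assert(merge(Position(0, 4), Position(1, 3)) == Position(1, 3));

        // A match of "677" starting at (1, 0) stays on the same row.
        assert(merge(Position(1, 0), Position(0, 3)) == Position(1, 3));
    }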
@@ -331,8 +342,7 @@
         struct MatchInfo
         {
             size_t length;
-            size_t delta_row;
-            size_t delta_col;
+            Position delta_position;
             const(State) * accepting_state;
         }
@@ -354,7 +364,7 @@
             uint current_state = modes[m_mode].state_table_offset;
             for (;;)
             {
-                string input = m_input[(m_input_position + attempt_match.length)..(m_input.length)];
+                string input = m_input[(m_input_index + attempt_match.length)..(m_input.length)];
                 CodePoint code_point;
                 ubyte code_point_length;
                 size_t result = Decoder.decode_code_point(input, code_point, code_point_length);
@@ -367,12 +377,12 @@
                     attempt_match.length += code_point_length;
                     if (code_point == '\n')
                     {
-                        attempt_match.delta_row++;
-                        attempt_match.delta_col = 0u;
+                        attempt_match.delta_position.row++;
+                        attempt_match.delta_position.col = 0u;
                     }
                     else
                     {
-                        attempt_match.delta_col++;
+                        attempt_match.delta_position.col++;
                     }
                     current_state = transition_state;
                     if (states[current_state].accepts)


@@ -47,23 +47,23 @@ unittest
     string input = "5 + 4 * \n677 + 567";
     Testparser.Lexer lexer = new Testparser.Lexer(input);
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(0, 0, 1, Testparser.TOKEN_int));
+    assert(token_info == TokenInfo(Testparser.Position(0, 0), 1, Testparser.TOKEN_int));
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(0, 2, 1, Testparser.TOKEN_plus));
+    assert(token_info == TokenInfo(Testparser.Position(0, 2), 1, Testparser.TOKEN_plus));
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(0, 4, 1, Testparser.TOKEN_int));
+    assert(token_info == TokenInfo(Testparser.Position(0, 4), 1, Testparser.TOKEN_int));
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(0, 6, 1, Testparser.TOKEN_times));
+    assert(token_info == TokenInfo(Testparser.Position(0, 6), 1, Testparser.TOKEN_times));
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(1, 0, 3, Testparser.TOKEN_int));
+    assert(token_info == TokenInfo(Testparser.Position(1, 0), 3, Testparser.TOKEN_int));
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(1, 4, 1, Testparser.TOKEN_plus));
+    assert(token_info == TokenInfo(Testparser.Position(1, 4), 1, Testparser.TOKEN_plus));
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(1, 6, 3, Testparser.TOKEN_int));
+    assert(token_info == TokenInfo(Testparser.Position(1, 6), 3, Testparser.TOKEN_int));
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(1, 9, 0, Testparser.TOKEN___EOF));
+    assert(token_info == TokenInfo(Testparser.Position(1, 9), 0, Testparser.TOKEN___EOF));
 
     lexer = new Testparser.Lexer("");
     assert(lexer.lex_token(&token_info) == Testparser.P_TOKEN);
-    assert(token_info == TokenInfo(0, 0, 0, Testparser.TOKEN___EOF));
+    assert(token_info == TokenInfo(Testparser.Position(0, 0), 0, Testparser.TOKEN___EOF));
 }