Document position tracking fields in user guide - #27

Move INVALID_POSITION from header to C source - #27
Test position validity for empty matching rules - #27
2024-07-21 14:04:51 -04:00 · 2024-07-21 13:39:34 -04:00 · 2024-07-21 13:39:30 -04:00 · 2024-07-19 15:37:37 -04:00 · 2024-07-19 14:34:50 -04:00 · 2024-07-18 12:09:26 -04:00
12 changed files with 557 additions and 50 deletions
--- a/assets/parser.c.erb
+++ b/assets/parser.c.erb
@ -226,7 +226,10 @@ typedef struct
    /** Number of bytes of input text used to match. */
    size_t length;

-    /** Input text position delta. */
+    /** Input text position delta to end of token. */
+    <%= @grammar.prefix %>position_t end_delta_position;
+
+    /** Input text position delta to next code point after token end. */
    <%= @grammar.prefix %>position_t delta_position;

    /** Accepting lexer state from the match. */
@ -358,6 +361,7 @@ static size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
            if (transition_state != INVALID_LEXER_STATE_ID)
            {
                attempt_match.length += code_point_length;
+                attempt_match.end_delta_position = attempt_match.delta_position;
                if (code_point == '\n')
                {
                    attempt_match.delta_position.row++;
@ -490,11 +494,22 @@ static size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%= @
        }
        token_info.token = token_to_accept;
        token_info.length = match_info.length;
+        if (match_info.end_delta_position.row != 0u)
+        {
+            token_info.end_position.row = token_info.position.row + match_info.end_delta_position.row;
+            token_info.end_position.col = match_info.end_delta_position.col;
+        }
+        else
+        {
+            token_info.end_position.row = token_info.position.row;
+            token_info.end_position.col = token_info.position.col + match_info.end_delta_position.col;
+        }
        *out_token_info = token_info;
        return P_SUCCESS;

    case P_EOF:
        token_info.token = TOKEN___EOF;
+        token_info.end_position = token_info.position;
        *out_token_info = token_info;
        return P_SUCCESS;

@ -551,6 +566,9 @@ size_t <%= @grammar.prefix %>lex(<%= @grammar.prefix %>context_t * context, <%=
 * Parser
 *************************************************************************/

+/** Invalid position value. */
+#define INVALID_POSITION (<%= @grammar.prefix %>position_t){0xFFFFFFFFu, 0xFFFFFFFFu}
+
 /** Reduce ID type. */
 typedef <%= get_type_for(@parser.reduce_table.size) %> reduce_id_t;

@ -666,6 +684,14 @@ typedef struct
 <% end %>
 } state_value_t;

+/** Common AST node structure. */
+typedef struct
+{
+    <%= @grammar.prefix %>position_t position;
+    <%= @grammar.prefix %>position_t end_position;
+    void * fields[];
+} ASTNode;
+
 /** Parser shift table. */
 static const shift_t parser_shift_table[] = {
 <%   @parser.shift_table.each do |shift| %>
@ -949,9 +975,10 @@ size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * context)
                /* We shifted a token, mark it consumed. */
 <% if @grammar.ast %>
                <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> * token_ast_node = malloc(sizeof(<%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>));
+                token_ast_node->position = token_info.position;
+                token_ast_node->end_position = token_info.end_position;
                token_ast_node->token = token;
                token_ast_node->pvalue = token_info.pvalue;
-                token_ast_node->position = token_info.position;
                state_values_stack_index(&statevalues, -1)->ast_node = token_ast_node;
 <% else %>
                state_values_stack_index(&statevalues, -1)->pvalue = token_info.pvalue;
@ -984,22 +1011,43 @@ size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * context)
            }
            else if (parser_reduce_table[reduce_index].n_states > 0)
            {
-                void ** node_fields = calloc(parser_reduce_table[reduce_index].rule_set_node_field_array_size, sizeof(void *));
+                size_t n_fields = parser_reduce_table[reduce_index].rule_set_node_field_array_size;
+                ASTNode * node = (ASTNode *)malloc(sizeof(ASTNode) + n_fields * sizeof(void *));
+                node->position = INVALID_POSITION;
+                node->end_position = INVALID_POSITION;
+                for (size_t i = 0; i < n_fields; i++)
+                {
+                    node->fields[i] = NULL;
+                }
                if (parser_reduce_table[reduce_index].rule_set_node_field_index_map == NULL)
                {
                    for (size_t i = 0; i < parser_reduce_table[reduce_index].n_states; i++)
                    {
-                        node_fields[i] = state_values_stack_index(&statevalues, -(int)parser_reduce_table[reduce_index].n_states + (int)i)->ast_node;
+                        node->fields[i] = state_values_stack_index(&statevalues, -(int)parser_reduce_table[reduce_index].n_states + (int)i)->ast_node;
                    }
                }
                else
                {
                    for (size_t i = 0; i < parser_reduce_table[reduce_index].n_states; i++)
                    {
-                        node_fields[parser_reduce_table[reduce_index].rule_set_node_field_index_map[i]] = state_values_stack_index(&statevalues, -(int)parser_reduce_table[reduce_index].n_states + (int)i)->ast_node;
+                        node->fields[parser_reduce_table[reduce_index].rule_set_node_field_index_map[i]] = state_values_stack_index(&statevalues, -(int)parser_reduce_table[reduce_index].n_states + (int)i)->ast_node;
                    }
                }
-                reduced_parser_node = node_fields;
+                bool position_found = false;
+                for (size_t i = 0; i < n_fields; i++)
+                {
+                    ASTNode * child = (ASTNode *)node->fields[i];
+                    if ((child != NULL) && <%= @grammar.prefix %>position_valid(child->position))
+                    {
+                        if (!position_found)
+                        {
+                            node->position = child->position;
+                            position_found = true;
+                        }
+                        node->end_position = child->end_position;
+                    }
+                }
+                reduced_parser_node = node;
            }
            else
            {
--- a/assets/parser.d.erb
+++ b/assets/parser.d.erb
@ -8,6 +8,8 @@
 module <%= @grammar.modulename %>;
 <% end %>

+import core.stdc.stdlib : malloc;
+
 /**************************************************************************
 * User code blocks
 *************************************************************************/
@ -61,6 +63,15 @@ public struct <%= @grammar.prefix %>position_t

    /** Input text column (0-based). */
    uint col;
+
+    /** Invalid position value. */
+    enum INVALID = <%= @grammar.prefix %>position_t(0xFFFF_FFFF, 0xFFFF_FFFF);
+
+    /** Return whether the position is valid. */
+    public @property bool valid()
+    {
+        return row != 0xFFFF_FFFFu;
+    }
 }

 <% if @grammar.ast %>
@ -77,12 +88,22 @@ public union <%= @grammar.prefix %>value_t
 <% end %>

 <% if @grammar.ast %>
+/** Common AST node structure. */
+private struct ASTNode
+{
+    <%= @grammar.prefix %>position_t position;
+    <%= @grammar.prefix %>position_t end_position;
+    void *[0] fields;
+}
+
 /** AST node types. @{ */
 public struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>
 {
+    /* ASTNode fields must be present in the same order here. */
+    <%= @grammar.prefix %>position_t position;
+    <%= @grammar.prefix %>position_t end_position;
    <%= @grammar.prefix %>token_t token;
    <%= @grammar.prefix %>value_t pvalue;
-    <%= @grammar.prefix %>position_t position;
 }

 <%   @parser.rule_sets.each do |name, rule_set| %>
@ -90,6 +111,8 @@ public struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>
 <%     next if rule_set.optional? %>
 public struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %>
 {
+    <%= @grammar.prefix %>position_t position;
+    <%= @grammar.prefix %>position_t end_position;
 <%     rule_set.ast_fields.each do |fields| %>
    union
    {
@ -107,9 +130,12 @@ public struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %>
 /** Lexed token information. */
 public struct <%= @grammar.prefix %>token_info_t
 {
-    /** Text position where the token was found. */
+    /** Text position of first code point in token. */
    <%= @grammar.prefix %>position_t position;

+    /** Text position of last code point in token. */
+    <%= @grammar.prefix %>position_t end_position;
+
    /** Number of input bytes used by the token. */
    size_t length;

@ -373,7 +399,10 @@ private struct lexer_match_info_t
    /** Number of bytes of input text used to match. */
    size_t length;

-    /** Input text position delta. */
+    /** Input text position delta to end of token. */
+    <%= @grammar.prefix %>position_t end_delta_position;
+
+    /** Input text position delta to next code point after token end. */
    <%= @grammar.prefix %>position_t delta_position;

    /** Accepting lexer state from the match. */
@ -501,6 +530,7 @@ private size_t find_longest_match(<%= @grammar.prefix %>context_t * context,
            if (transition_state != INVALID_LEXER_STATE_ID)
            {
                attempt_match.length += code_point_length;
+                attempt_match.end_delta_position = attempt_match.delta_position;
                if (code_point == '\n')
                {
                    attempt_match.delta_position.row++;
@ -633,11 +663,22 @@ private size_t attempt_lex_token(<%= @grammar.prefix %>context_t * context, <%=
        }
        token_info.token = token_to_accept;
        token_info.length = match_info.length;
+        if (match_info.end_delta_position.row != 0u)
+        {
+            token_info.end_position.row = token_info.position.row + match_info.end_delta_position.row;
+            token_info.end_position.col = match_info.end_delta_position.col;
+        }
+        else
+        {
+            token_info.end_position.row = token_info.position.row;
+            token_info.end_position.col = token_info.position.col + match_info.end_delta_position.col;
+        }
        *out_token_info = token_info;
        return P_SUCCESS;

    case P_EOF:
        token_info.token = TOKEN___EOF;
+        token_info.end_position = token_info.position;
        *out_token_info = token_info;
        return P_SUCCESS;

@ -997,7 +1038,7 @@ public size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * cont
            {
                /* We shifted a token, mark it consumed. */
 <% if @grammar.ast %>
-                <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> * token_ast_node = new <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>(token, token_info.pvalue, token_info.position);
+                <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %> * token_ast_node = new <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>(token_info.position, token_info.end_position, token, token_info.pvalue);
                statevalues[$-1].ast_node = token_ast_node;
 <% else %>
                statevalues[$-1].pvalue = token_info.pvalue;
@ -1030,26 +1071,43 @@ public size_t <%= @grammar.prefix %>parse(<%= @grammar.prefix %>context_t * cont
            }
            else if (parser_reduce_table[reduce_index].n_states > 0)
            {
-                void *[] node_fields = new void *[parser_reduce_table[reduce_index].rule_set_node_field_array_size];
-                foreach (i; 0..parser_reduce_table[reduce_index].rule_set_node_field_array_size)
+                size_t n_fields = parser_reduce_table[reduce_index].rule_set_node_field_array_size;
+                ASTNode * node = cast(ASTNode *)malloc(ASTNode.sizeof + n_fields * (void *).sizeof);
+                node.position = <%= @grammar.prefix %>position_t.INVALID;
+                node.end_position = <%= @grammar.prefix %>position_t.INVALID;
+                foreach (i; 0..n_fields)
                {
-                    node_fields[i] = null;
+                    node.fields[i] = null;
                }
                if (parser_reduce_table[reduce_index].rule_set_node_field_index_map is null)
                {
                    foreach (i; 0..parser_reduce_table[reduce_index].n_states)
                    {
-                        node_fields[i] = statevalues[$ - parser_reduce_table[reduce_index].n_states + i].ast_node;
+                        node.fields[i] = statevalues[$ - parser_reduce_table[reduce_index].n_states + i].ast_node;
                    }
                }
                else
                {
                    foreach (i; 0..parser_reduce_table[reduce_index].n_states)
                    {
-                        node_fields[parser_reduce_table[reduce_index].rule_set_node_field_index_map[i]] = statevalues[$ - parser_reduce_table[reduce_index].n_states + i].ast_node;
+                        node.fields[parser_reduce_table[reduce_index].rule_set_node_field_index_map[i]] = statevalues[$ - parser_reduce_table[reduce_index].n_states + i].ast_node;
                    }
                }
-                reduced_parser_node = node_fields.ptr;
+                bool position_found = false;
+                foreach (i; 0..n_fields)
+                {
+                    ASTNode * child = cast(ASTNode *)node.fields[i];
+                    if (child && child.position.valid)
+                    {
+                        if (!position_found)
+                        {
+                            node.position = child.position;
+                            position_found = true;
+                        }
+                        node.end_position = child.end_position;
+                    }
+                }
+                reduced_parser_node = node;
            }
            else
            {
--- a/assets/parser.h.erb
+++ b/assets/parser.h.erb
@ -52,6 +52,9 @@ typedef struct
    uint32_t col;
 } <%= @grammar.prefix %>position_t;

+/** Return whether the position is valid. */
+#define <%= @grammar.prefix %>position_valid(p) ((p).row != 0xFFFFFFFFu)
+
 /** User header code blocks. */
 <%= @grammar.code_blocks.fetch("header", "") %>

@ -72,9 +75,11 @@ typedef union
 /** AST node types. @{ */
 typedef struct <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>
 {
+    /* ASTNode fields must be present in the same order here. */
+    <%= @grammar.prefix %>position_t position;
+    <%= @grammar.prefix %>position_t end_position;
    <%= @grammar.prefix %>token_t token;
    <%= @grammar.prefix %>value_t pvalue;
-    <%= @grammar.prefix %>position_t position;
 } <%= @grammar.ast_prefix %>Token<%= @grammar.ast_suffix %>;

 <%   @parser.rule_sets.each do |name, rule_set| %>
@ -88,6 +93,8 @@ struct <%= name %>;
 <%     next if rule_set.optional? %>
 typedef struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %>
 {
+    <%= @grammar.prefix %>position_t position;
+    <%= @grammar.prefix %>position_t end_position;
 <%     rule_set.ast_fields.each do |fields| %>
    union
    {
@ -105,9 +112,12 @@ typedef struct <%= @grammar.ast_prefix %><%= name %><%= @grammar.ast_suffix %>
 /** Lexed token information. */
 typedef struct
 {
-    /** Text position where the token was found. */
+    /** Text position of first code point in token. */
    <%= @grammar.prefix %>position_t position;

+    /** Text position of last code point in token. */
+    <%= @grammar.prefix %>position_t end_position;
+
    /** Number of input bytes used by the token. */
    size_t length;

--- a/doc/user_guide.md
+++ b/doc/user_guide.md
@ -15,6 +15,7 @@ Propane is a LALR Parser Generator (LPG) which:
  * generates a table-driven shift/reduce parser to parse input in linear time
  * targets C or D language outputs
  * optionally supports automatic full AST generation
+  * tracks input text start and end positions for all matched tokens/rules
  * is MIT-licensed
  * is distributable as a standalone Ruby script

@ -35,9 +36,14 @@ Propane is typically invoked from the command-line as `./propane`.

    Usage: ./propane [options] <input-file> <output-file>
    Options:
-      --log LOG   Write log file
-      --version   Show program version and exit
-      -h, --help  Show this usage and exit
+      -h, --help  Show this usage and exit.
+      --log LOG   Write log file. This will show all parser states and their
+                  associated shifts and reduces. It can be helpful when
+                  debugging a grammar.
+      --version   Show program version and exit.
+      -w          Treat warnings as errors. This option will treat shift/reduce
+                  conflicts as fatal errors and will print them to stderr in
+                  addition to the log file.

 The user must specify the path to a Propane input grammar file and a path to an
 output file.
@ -502,7 +508,7 @@ tokenid str;
  mystringvalue = "";
  $mode(string);
 >>
-string: /[^"]+/ << mystringvalue += match; >>
+string: /[^"]+/ << mystringvalue ~= match; >>
 string: /"/ <<
  $mode(default);
  return $token(str);
@ -762,6 +768,13 @@ A pointer to this instance is passed to the generated functions.
 The `p_position_t` structure contains two fields `row` and `col`.
 These fields contain the 0-based row and column describing a parser position.

+For D targets, the `p_position_t` structure can be checked for validity by
+querying the `valid` property.
+
+For C targets, the `p_position_t` structure can be checked for validity by
+calling `p_position_valid(pos)` where `pos` is a `p_position_t` structure
+instance.
+
 ### AST Node Types

 If AST generation mode is enabled, a structure type for each rule will be
@ -772,13 +785,26 @@ AST node which refers to a raw parser token rather than a composite rule.

 #### AST Node Fields

-A `Token` node has two fields:
+All AST nodes have a `position` field specifying the text position of the
+beginning of the matched token or rule, and an `end_position` field specifying
+the text position of the end of the matched token or rule.
+Each of these fields is an instance of the `p_position_t` structure.
+
+A `Token` node will always have a valid `position` and `end_position`.
+A rule node may not have valid positions if the rule allows for an empty match.
+In this case the `position` structure should be checked for validity before
+using it.
+For C targets this can be accomplished with
+`if (p_position_valid(node->position))` and for D targets this can be
+accomplished with `if (node.position.valid)`.
+
+A `Token` node has the following additional fields:

  * `token` which specifies which token was parsed (one of `TOKEN_*`)
  * `pvalue` which specifies the parser value for the token. If a lexer user
  code block assigned to `$$`, the assigned value will be stored here.

-The other generated AST node structures have fields generated based on the
+AST node structures for rules contain generated fields based on the
 right hand side components specified for all rules of a given name.

 In this example:
@ -802,7 +828,7 @@ The `Items` structure will have fields:

 If a rule can be empty (for example in the second `Items` rule above), then
 an instance of a pointer to that rule's generated AST node will be null if the
-parser matches the empty rule definition.
+parser matches the empty rule pattern.

 The non-positional AST node field pointer will not be generated if there are
 multiple positions in which an instance of the node it points to could be
@ -859,6 +885,24 @@ p_context_init(&context, input, input_length);
 size_t result = p_parse(&context);
 ```

+### `p_position_valid`
+
+The `p_position_valid()` function is only generated for C targets.
+It is used to determine whether or not a `p_position_t` structure is valid.
+
+Example:
+
+```
+if (p_position_valid(node->position))
+{
+    ....
+}
+```
+
+For D targets, rather than using `p_position_valid()`, the `valid` property
+function of the `p_position_t` structure can be queried
+(e.g. `if (node.position.valid)`).
+
 ### `p_result`

 The `p_result()` function can be used to retrieve the final parse value after
--- a/lib/propane/cli.rb
+++ b/lib/propane/cli.rb
@ -4,11 +4,11 @@ class Propane
    USAGE = <<EOF
 Usage: #{$0} [options] <input-file> <output-file>
 Options:
+  -h, --help  Show this usage and exit.
  --log LOG   Write log file. This will show all parser states and their
              associated shifts and reduces. It can be helpful when
              debugging a grammar.
  --version   Show program version and exit.
-  -h, --help  Show this usage and exit.
  -w          Treat warnings as errors. This option will treat shift/reduce
              conflicts as fatal errors and will print them to stderr in
              addition to the log file.
--- a/spec/propane_spec.rb
+++ b/spec/propane_spec.rb
@ -1081,17 +1081,17 @@ EOF
        expect(results.status).to eq 0
      end

-      it "stores the token position in the AST Token node" do
+      it "stores token and rule positions in AST nodes" do
        write_grammar <<EOF
 ast;

 token a;
-token b;
-token c;
+token bb;
+token c /c(.|\\n)*c/;
 drop /\\s+/;
 Start -> T T T;
 T -> a;
-T -> b;
+T -> bb;
 T -> c;
 EOF
        run_propane(language: language)
@ -1100,6 +1100,26 @@ EOF
        expect(results.stderr).to eq ""
        expect(results.status).to eq 0
      end
+
+      it "stores invalid positions for empty rule matches" do
+        write_grammar <<EOF
+ast;
+
+token a;
+token bb;
+token c /c(.|\\n)*c/;
+drop /\\s+/;
+Start -> T Start;
+Start -> ;
+T -> a A;
+A -> bb? c?;
+EOF
+        run_propane(language: language)
+        compile("spec/test_ast_invalid_positions.#{language}", language: language)
+        results = run_test
+        expect(results.stderr).to eq ""
+        expect(results.status).to eq 0
+      end
    end
  end
 end
--- a/spec/test_ast_invalid_positions.c
+++ b/spec/test_ast_invalid_positions.c
@ -0,0 +1,102 @@
+#include "testparser.h"
+#include <assert.h>
+#include <string.h>
+#include "testutils.h"
+
+int main()
+{
+    char const * input = "\na\n  bb ccc";
+    p_context_t context;
+    p_context_init(&context, (uint8_t const *)input, strlen(input));
+    assert(p_parse(&context) == P_SUCCESS);
+    Start * start = p_result(&context);
+
+    assert_eq(1, start->pT1->pToken->position.row);
+    assert_eq(0, start->pT1->pToken->position.col);
+    assert_eq(1, start->pT1->pToken->end_position.row);
+    assert_eq(0, start->pT1->pToken->end_position.col);
+    assert(p_position_valid(start->pT1->pA->position));
+    assert_eq(2, start->pT1->pA->position.row);
+    assert_eq(2, start->pT1->pA->position.col);
+    assert_eq(2, start->pT1->pA->end_position.row);
+    assert_eq(7, start->pT1->pA->end_position.col);
+    assert_eq(1, start->pT1->position.row);
+    assert_eq(0, start->pT1->position.col);
+    assert_eq(2, start->pT1->end_position.row);
+    assert_eq(7, start->pT1->end_position.col);
+
+    assert_eq(1, start->position.row);
+    assert_eq(0, start->position.col);
+    assert_eq(2, start->end_position.row);
+    assert_eq(7, start->end_position.col);
+
+    input = "a\nbb";
+    p_context_init(&context, (uint8_t const *)input, strlen(input));
+    assert(p_parse(&context) == P_SUCCESS);
+    start = p_result(&context);
+
+    assert_eq(0, start->pT1->pToken->position.row);
+    assert_eq(0, start->pT1->pToken->position.col);
+    assert_eq(0, start->pT1->pToken->end_position.row);
+    assert_eq(0, start->pT1->pToken->end_position.col);
+    assert(p_position_valid(start->pT1->pA->position));
+    assert_eq(1, start->pT1->pA->position.row);
+    assert_eq(0, start->pT1->pA->position.col);
+    assert_eq(1, start->pT1->pA->end_position.row);
+    assert_eq(1, start->pT1->pA->end_position.col);
+    assert_eq(0, start->pT1->position.row);
+    assert_eq(0, start->pT1->position.col);
+    assert_eq(1, start->pT1->end_position.row);
+    assert_eq(1, start->pT1->end_position.col);
+
+    assert_eq(0, start->position.row);
+    assert_eq(0, start->position.col);
+    assert_eq(1, start->end_position.row);
+    assert_eq(1, start->end_position.col);
+
+    input = "a\nc\nc";
+    p_context_init(&context, (uint8_t const *)input, strlen(input));
+    assert(p_parse(&context) == P_SUCCESS);
+    start = p_result(&context);
+
+    assert_eq(0, start->pT1->pToken->position.row);
+    assert_eq(0, start->pT1->pToken->position.col);
+    assert_eq(0, start->pT1->pToken->end_position.row);
+    assert_eq(0, start->pT1->pToken->end_position.col);
+    assert(p_position_valid(start->pT1->pA->position));
+    assert_eq(1, start->pT1->pA->position.row);
+    assert_eq(0, start->pT1->pA->position.col);
+    assert_eq(2, start->pT1->pA->end_position.row);
+    assert_eq(0, start->pT1->pA->end_position.col);
+    assert_eq(0, start->pT1->position.row);
+    assert_eq(0, start->pT1->position.col);
+    assert_eq(2, start->pT1->end_position.row);
+    assert_eq(0, start->pT1->end_position.col);
+
+    assert_eq(0, start->position.row);
+    assert_eq(0, start->position.col);
+    assert_eq(2, start->end_position.row);
+    assert_eq(0, start->end_position.col);
+
+    input = "a";
+    p_context_init(&context, (uint8_t const *)input, strlen(input));
+    assert(p_parse(&context) == P_SUCCESS);
+    start = p_result(&context);
+
+    assert_eq(0, start->pT1->pToken->position.row);
+    assert_eq(0, start->pT1->pToken->position.col);
+    assert_eq(0, start->pT1->pToken->end_position.row);
+    assert_eq(0, start->pT1->pToken->end_position.col);
+    assert(!p_position_valid(start->pT1->pA->position));
+    assert_eq(0, start->pT1->position.row);
+    assert_eq(0, start->pT1->position.col);
+    assert_eq(0, start->pT1->end_position.row);
+    assert_eq(0, start->pT1->end_position.col);
+
+    assert_eq(0, start->position.row);
+    assert_eq(0, start->position.col);
+    assert_eq(0, start->end_position.row);
+    assert_eq(0, start->end_position.col);
+
+    return 0;
+}
--- a/spec/test_ast_invalid_positions.d
+++ b/spec/test_ast_invalid_positions.d
@ -0,0 +1,104 @@
+import testparser;
+import std.stdio;
+import testutils;
+
+int main()
+{
+    return 0;
+}
+
+unittest
+{
+    string input = "\na\n  bb ccc";
+    p_context_t context;
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_SUCCESS);
+    Start * start = p_result(&context);
+
+    assert_eq(1, start.pT1.pToken.position.row);
+    assert_eq(0, start.pT1.pToken.position.col);
+    assert_eq(1, start.pT1.pToken.end_position.row);
+    assert_eq(0, start.pT1.pToken.end_position.col);
+    assert(start.pT1.pA.position.valid);
+    assert_eq(2, start.pT1.pA.position.row);
+    assert_eq(2, start.pT1.pA.position.col);
+    assert_eq(2, start.pT1.pA.end_position.row);
+    assert_eq(7, start.pT1.pA.end_position.col);
+    assert_eq(1, start.pT1.position.row);
+    assert_eq(0, start.pT1.position.col);
+    assert_eq(2, start.pT1.end_position.row);
+    assert_eq(7, start.pT1.end_position.col);
+
+    assert_eq(1, start.position.row);
+    assert_eq(0, start.position.col);
+    assert_eq(2, start.end_position.row);
+    assert_eq(7, start.end_position.col);
+
+    input = "a\nbb";
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_SUCCESS);
+    start = p_result(&context);
+
+    assert_eq(0, start.pT1.pToken.position.row);
+    assert_eq(0, start.pT1.pToken.position.col);
+    assert_eq(0, start.pT1.pToken.end_position.row);
+    assert_eq(0, start.pT1.pToken.end_position.col);
+    assert(start.pT1.pA.position.valid);
+    assert_eq(1, start.pT1.pA.position.row);
+    assert_eq(0, start.pT1.pA.position.col);
+    assert_eq(1, start.pT1.pA.end_position.row);
+    assert_eq(1, start.pT1.pA.end_position.col);
+    assert_eq(0, start.pT1.position.row);
+    assert_eq(0, start.pT1.position.col);
+    assert_eq(1, start.pT1.end_position.row);
+    assert_eq(1, start.pT1.end_position.col);
+
+    assert_eq(0, start.position.row);
+    assert_eq(0, start.position.col);
+    assert_eq(1, start.end_position.row);
+    assert_eq(1, start.end_position.col);
+
+    input = "a\nc\nc";
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_SUCCESS);
+    start = p_result(&context);
+
+    assert_eq(0, start.pT1.pToken.position.row);
+    assert_eq(0, start.pT1.pToken.position.col);
+    assert_eq(0, start.pT1.pToken.end_position.row);
+    assert_eq(0, start.pT1.pToken.end_position.col);
+    assert(start.pT1.pA.position.valid);
+    assert_eq(1, start.pT1.pA.position.row);
+    assert_eq(0, start.pT1.pA.position.col);
+    assert_eq(2, start.pT1.pA.end_position.row);
+    assert_eq(0, start.pT1.pA.end_position.col);
+    assert_eq(0, start.pT1.position.row);
+    assert_eq(0, start.pT1.position.col);
+    assert_eq(2, start.pT1.end_position.row);
+    assert_eq(0, start.pT1.end_position.col);
+
+    assert_eq(0, start.position.row);
+    assert_eq(0, start.position.col);
+    assert_eq(2, start.end_position.row);
+    assert_eq(0, start.end_position.col);
+
+    input = "a";
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_SUCCESS);
+    start = p_result(&context);
+
+    assert_eq(0, start.pT1.pToken.position.row);
+    assert_eq(0, start.pT1.pToken.position.col);
+    assert_eq(0, start.pT1.pToken.end_position.row);
+    assert_eq(0, start.pT1.pToken.end_position.col);
+    assert(!start.pT1.pA.position.valid);
+    assert_eq(0, start.pT1.position.row);
+    assert_eq(0, start.pT1.position.col);
+    assert_eq(0, start.pT1.end_position.row);
+    assert_eq(0, start.pT1.end_position.col);
+
+    assert_eq(0, start.position.row);
+    assert_eq(0, start.position.col);
+    assert_eq(0, start.end_position.row);
+    assert_eq(0, start.end_position.col);
+}
--- a/spec/test_ast_token_positions.c
+++ b/spec/test_ast_token_positions.c
@ -5,29 +5,80 @@

 int main()
 {
-    char const * input = "abc";
+    char const * input = "abbccc";
    p_context_t context;
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    assert(p_parse(&context) == P_SUCCESS);
    Start * start = p_result(&context);
+
    assert_eq(0, start->pT1->pToken->position.row);
    assert_eq(0, start->pT1->pToken->position.col);
+    assert_eq(0, start->pT1->pToken->end_position.row);
+    assert_eq(0, start->pT1->pToken->end_position.col);
+    assert_eq(0, start->pT1->position.row);
+    assert_eq(0, start->pT1->position.col);
+    assert_eq(0, start->pT1->end_position.row);
+    assert_eq(0, start->pT1->end_position.col);
+
    assert_eq(0, start->pT2->pToken->position.row);
    assert_eq(1, start->pT2->pToken->position.col);
-    assert_eq(0, start->pT3->pToken->position.row);
-    assert_eq(2, start->pT3->pToken->position.col);
+    assert_eq(0, start->pT2->pToken->end_position.row);
+    assert_eq(2, start->pT2->pToken->end_position.col);
+    assert_eq(0, start->pT2->position.row);
+    assert_eq(1, start->pT2->position.col);
+    assert_eq(0, start->pT2->end_position.row);
+    assert_eq(2, start->pT2->end_position.col);

-    input = "\n\n  a\nc\n\n     a";
+    assert_eq(0, start->pT3->pToken->position.row);
+    assert_eq(3, start->pT3->pToken->position.col);
+    assert_eq(0, start->pT3->pToken->end_position.row);
+    assert_eq(5, start->pT3->pToken->end_position.col);
+    assert_eq(0, start->pT3->position.row);
+    assert_eq(3, start->pT3->position.col);
+    assert_eq(0, start->pT3->end_position.row);
+    assert_eq(5, start->pT3->end_position.col);
+
+    assert_eq(0, start->position.row);
+    assert_eq(0, start->position.col);
+    assert_eq(0, start->end_position.row);
+    assert_eq(5, start->end_position.col);
+
+    input = "\n\n  bb\nc\ncc\n\n     a";
    p_context_init(&context, (uint8_t const *)input, strlen(input));
    assert(p_parse(&context) == P_SUCCESS);
    start = p_result(&context);
+
    assert_eq(2, start->pT1->pToken->position.row);
    assert_eq(2, start->pT1->pToken->position.col);
+    assert_eq(2, start->pT1->pToken->end_position.row);
+    assert_eq(3, start->pT1->pToken->end_position.col);
+    assert_eq(2, start->pT1->position.row);
+    assert_eq(2, start->pT1->position.col);
+    assert_eq(2, start->pT1->end_position.row);
+    assert_eq(3, start->pT1->end_position.col);
+
    assert_eq(3, start->pT2->pToken->position.row);
    assert_eq(0, start->pT2->pToken->position.col);
-    assert_eq(5, start->pT3->pToken->position.row);
+    assert_eq(4, start->pT2->pToken->end_position.row);
+    assert_eq(1, start->pT2->pToken->end_position.col);
+    assert_eq(3, start->pT2->position.row);
+    assert_eq(0, start->pT2->position.col);
+    assert_eq(4, start->pT2->end_position.row);
+    assert_eq(1, start->pT2->end_position.col);
+
+    assert_eq(6, start->pT3->pToken->position.row);
    assert_eq(5, start->pT3->pToken->position.col);
+    assert_eq(6, start->pT3->pToken->end_position.row);
+    assert_eq(5, start->pT3->pToken->end_position.col);
+    assert_eq(6, start->pT3->position.row);
+    assert_eq(5, start->pT3->position.col);
+    assert_eq(6, start->pT3->end_position.row);
+    assert_eq(5, start->pT3->end_position.col);
+
+    assert_eq(2, start->position.row);
+    assert_eq(2, start->position.col);
+    assert_eq(6, start->end_position.row);
+    assert_eq(5, start->end_position.col);

    return 0;
 }
-
--- a/spec/test_ast_token_positions.d
+++ b/spec/test_ast_token_positions.d
@@ -9,26 +9,78 @@ int main()

 unittest
 {
-    string input = "abc";
+    string input = "abbccc";
    p_context_t context;
    p_context_init(&context, input);
    assert(p_parse(&context) == P_SUCCESS);
    Start * start = p_result(&context);
+
    assert_eq(0, start.pT1.pToken.position.row);
    assert_eq(0, start.pT1.pToken.position.col);
+    assert_eq(0, start.pT1.pToken.end_position.row);
+    assert_eq(0, start.pT1.pToken.end_position.col);
+    assert_eq(0, start.pT1.position.row);
+    assert_eq(0, start.pT1.position.col);
+    assert_eq(0, start.pT1.end_position.row);
+    assert_eq(0, start.pT1.end_position.col);
+
    assert_eq(0, start.pT2.pToken.position.row);
    assert_eq(1, start.pT2.pToken.position.col);
-    assert_eq(0, start.pT3.pToken.position.row);
-    assert_eq(2, start.pT3.pToken.position.col);
+    assert_eq(0, start.pT2.pToken.end_position.row);
+    assert_eq(2, start.pT2.pToken.end_position.col);
+    assert_eq(0, start.pT2.position.row);
+    assert_eq(1, start.pT2.position.col);
+    assert_eq(0, start.pT2.end_position.row);
+    assert_eq(2, start.pT2.end_position.col);

-    input = "\n\n  a\nc\n\n     a";
+    assert_eq(0, start.pT3.pToken.position.row);
+    assert_eq(3, start.pT3.pToken.position.col);
+    assert_eq(0, start.pT3.pToken.end_position.row);
+    assert_eq(5, start.pT3.pToken.end_position.col);
+    assert_eq(0, start.pT3.position.row);
+    assert_eq(3, start.pT3.position.col);
+    assert_eq(0, start.pT3.end_position.row);
+    assert_eq(5, start.pT3.end_position.col);
+
+    assert_eq(0, start.position.row);
+    assert_eq(0, start.position.col);
+    assert_eq(0, start.end_position.row);
+    assert_eq(5, start.end_position.col);
+
+    input = "\n\n  bb\nc\ncc\n\n     a";
    p_context_init(&context, input);
    assert(p_parse(&context) == P_SUCCESS);
    start = p_result(&context);
+
    assert_eq(2, start.pT1.pToken.position.row);
    assert_eq(2, start.pT1.pToken.position.col);
+    assert_eq(2, start.pT1.pToken.end_position.row);
+    assert_eq(3, start.pT1.pToken.end_position.col);
+    assert_eq(2, start.pT1.position.row);
+    assert_eq(2, start.pT1.position.col);
+    assert_eq(2, start.pT1.end_position.row);
+    assert_eq(3, start.pT1.end_position.col);
+
    assert_eq(3, start.pT2.pToken.position.row);
    assert_eq(0, start.pT2.pToken.position.col);
-    assert_eq(5, start.pT3.pToken.position.row);
+    assert_eq(4, start.pT2.pToken.end_position.row);
+    assert_eq(1, start.pT2.pToken.end_position.col);
+    assert_eq(3, start.pT2.position.row);
+    assert_eq(0, start.pT2.position.col);
+    assert_eq(4, start.pT2.end_position.row);
+    assert_eq(1, start.pT2.end_position.col);
+
+    assert_eq(6, start.pT3.pToken.position.row);
    assert_eq(5, start.pT3.pToken.position.col);
+    assert_eq(6, start.pT3.pToken.end_position.row);
+    assert_eq(5, start.pT3.pToken.end_position.col);
+    assert_eq(6, start.pT3.position.row);
+    assert_eq(5, start.pT3.position.col);
+    assert_eq(6, start.pT3.end_position.row);
+    assert_eq(5, start.pT3.end_position.col);
+
+    assert_eq(2, start.position.row);
+    assert_eq(2, start.position.col);
+    assert_eq(6, start.end_position.row);
+    assert_eq(5, start.end_position.col);
 }
--- a/spec/test_lexer.c
+++ b/spec/test_lexer.c
@@ -43,41 +43,57 @@ int main()
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 0u);
    assert(token_info.position.col == 0u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 0u);
    assert(token_info.length == 1u);
    assert(token_info.token == TOKEN_int);
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 0u);
    assert(token_info.position.col == 2u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 2u);
    assert(token_info.length == 1u);
    assert(token_info.token == TOKEN_plus);
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 0u);
    assert(token_info.position.col == 4u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 4u);
    assert(token_info.length == 1u);
    assert(token_info.token == TOKEN_int);
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 0u);
    assert(token_info.position.col == 6u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 6u);
    assert(token_info.length == 1u);
    assert(token_info.token == TOKEN_times);
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 1u);
    assert(token_info.position.col == 0u);
+    assert(token_info.end_position.row == 1u);
+    assert(token_info.end_position.col == 2u);
    assert(token_info.length == 3u);
    assert(token_info.token == TOKEN_int);
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 1u);
    assert(token_info.position.col == 4u);
+    assert(token_info.end_position.row == 1u);
+    assert(token_info.end_position.col == 4u);
    assert(token_info.length == 1u);
    assert(token_info.token == TOKEN_plus);
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 1u);
    assert(token_info.position.col == 6u);
+    assert(token_info.end_position.row == 1u);
+    assert(token_info.end_position.col == 8u);
    assert(token_info.length == 3u);
    assert(token_info.token == TOKEN_int);
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 1u);
    assert(token_info.position.col == 9u);
+    assert(token_info.end_position.row == 1u);
+    assert(token_info.end_position.col == 9u);
    assert(token_info.length == 0u);
    assert(token_info.token == TOKEN___EOF);

@@ -85,6 +101,8 @@ int main()
    assert(p_lex(&context, &token_info) == P_SUCCESS);
    assert(token_info.position.row == 0u);
    assert(token_info.position.col == 0u);
+    assert(token_info.end_position.row == 0u);
+    assert(token_info.end_position.col == 0u);
    assert(token_info.length == 0u);
    assert(token_info.token == TOKEN___EOF);

--- a/spec/test_lexer.d
+++ b/spec/test_lexer.d
@@ -47,23 +47,23 @@ unittest
    p_context_t context;
    p_context_init(&context, input);
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 0), 1, TOKEN_int));
+    assert(token_info == p_token_info_t(p_position_t(0, 0), p_position_t(0, 0), 1, TOKEN_int));
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 2), 1, TOKEN_plus));
+    assert(token_info == p_token_info_t(p_position_t(0, 2), p_position_t(0, 2), 1, TOKEN_plus));
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 4), 1, TOKEN_int));
+    assert(token_info == p_token_info_t(p_position_t(0, 4), p_position_t(0, 4), 1, TOKEN_int));
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 6), 1, TOKEN_times));
+    assert(token_info == p_token_info_t(p_position_t(0, 6), p_position_t(0, 6), 1, TOKEN_times));
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(1, 0), 3, TOKEN_int));
+    assert(token_info == p_token_info_t(p_position_t(1, 0), p_position_t(1, 2), 3, TOKEN_int));
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(1, 4), 1, TOKEN_plus));
+    assert(token_info == p_token_info_t(p_position_t(1, 4), p_position_t(1, 4), 1, TOKEN_plus));
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(1, 6), 3, TOKEN_int));
+    assert(token_info == p_token_info_t(p_position_t(1, 6), p_position_t(1, 8), 3, TOKEN_int));
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(1, 9), 0, TOKEN___EOF));
+    assert(token_info == p_token_info_t(p_position_t(1, 9), p_position_t(1, 9), 0, TOKEN___EOF));

    p_context_init(&context, "");
    assert(p_lex(&context, &token_info) == P_SUCCESS);
-    assert(token_info == p_token_info_t(p_position_t(0, 0), 0, TOKEN___EOF));
+    assert(token_info == p_token_info_t(p_position_t(0, 0), p_position_t(0, 0), 0, TOKEN___EOF));
 }
Author	SHA1	Message	Date
Josh Holtrop	9746b3f2bf	Document position tracking fields in user guide - #27	2024-07-21 14:04:51 -04:00
Josh Holtrop	c5b8fc28bd	Move INVALID_POSITION from header to C source - #27	2024-07-21 13:39:34 -04:00
Josh Holtrop	092fce61eb	Test position validity for empty matching rules - #27	2024-07-21 13:39:30 -04:00
Josh Holtrop	e647248e34	Track start and end position of rules in AST nodes - #27	2024-07-19 15:37:37 -04:00
Josh Holtrop	f4ae1b8601	Add position fields to AST nodes (not populated yet) - #27	2024-07-19 14:34:50 -04:00
Josh Holtrop	eae2e17f41	Test tracking token end positions when the token spans a newline - #27	2024-07-18 12:09:26 -04:00
Josh Holtrop	87d6d29d60	Store token end position - #27	2024-07-18 12:03:44 -04:00