From 424ddfe55ae4e72ec9d1e7f46b41c9dcc59e9c26 Mon Sep 17 00:00:00 2001
From: Josh Holtrop <jholtrop@gmail.com>
Date: Wed, 12 Jul 2023 19:22:44 -0400
Subject: [PATCH] Output position info for various error return codes - close
 #10

---
 assets/parser.d.erb                           | 52 +++++++++++++++++--
 spec/propane_spec.rb                          | 15 ++++++
 .../test_d_parser_rule_from_multiple_states.d |  1 +
 spec/test_error_positions.d                   | 36 +++++++++++++
 4 files changed, 99 insertions(+), 5 deletions(-)
 create mode 100644 spec/test_error_positions.d

diff --git a/assets/parser.d.erb b/assets/parser.d.erb
index eea5961..5fe07dd 100644
--- a/assets/parser.d.erb
+++ b/assets/parser.d.erb
@@ -106,7 +106,7 @@ public struct p_context_t
     size_t input_index;
 
     /** Input text position (row/column). */
-    p_position_t input_position;
+    p_position_t text_position;
 
     /** Current lexer mode. */
     size_t mode;
@@ -443,6 +443,7 @@ private size_t find_longest_match(p_context_t * context,
 {
     lexer_match_info_t longest_match;
     lexer_match_info_t attempt_match;
+    *out_match_info = longest_match;
     uint current_state = lexer_mode_table[context.mode].state_table_offset;
     for (;;)
     {
@@ -506,6 +507,13 @@ private size_t find_longest_match(p_context_t * context,
             }
             break;
 
+        case P_DECODE_ERROR:
+            /* A decode error can occur partway through matching a pattern,
+             * so hand back the attempted match info; the caller uses it to
+             * update the input text position. */
+            *out_match_info = attempt_match;
+            return result;
+
         default:
             return result;
         }
@@ -533,7 +541,7 @@ private size_t find_longest_match(p_context_t * context,
 private size_t attempt_lex_token(p_context_t * context, p_token_info_t * out_token_info)
 {
     p_token_info_t token_info;
-    token_info.position = context.input_position;
+    token_info.position = context.text_position;
     token_info.token = INVALID_TOKEN_ID;
     *out_token_info = token_info; // TODO: remove
     lexer_match_info_t match_info;
@@ -560,15 +568,16 @@ private size_t attempt_lex_token(p_context_t * context, p_token_info_t * out_tok
 
         /* Update the input position tracking. */
         context.input_index += match_info.length;
-        context.input_position.row += match_info.delta_position.row;
+        context.text_position.row += match_info.delta_position.row;
         if (match_info.delta_position.row != 0u)
         {
-            context.input_position.col = match_info.delta_position.col;
+            context.text_position.col = match_info.delta_position.col;
         }
         else
         {
-            context.input_position.col += match_info.delta_position.col;
+            context.text_position.col += match_info.delta_position.col;
         }
+
         if (token_to_accept == INVALID_TOKEN_ID)
         {
             return P_DROP;
@@ -583,6 +592,20 @@ private size_t attempt_lex_token(p_context_t * context, p_token_info_t * out_tok
         *out_token_info = token_info;
         return P_SUCCESS;
 
+    case P_DECODE_ERROR:
+        /* Advance position tracking by the partial match before the error. */
+        context.input_index += match_info.length;
+        context.text_position.row += match_info.delta_position.row;
+        if (match_info.delta_position.row != 0u)
+        {
+            context.text_position.col = match_info.delta_position.col;
+        }
+        else
+        {
+            context.text_position.col += match_info.delta_position.col;
+        }
+        return result;
+
     default:
         return result;
     }
@@ -902,6 +925,12 @@ public size_t p_parse(p_context_t * context)
         {
             writeln("{other}");
         }
+        /* A token was successfully lexed, so the input text position was
+         * advanced. However, this is an unexpected token, so we want to reset
+         * the context text position to point to the token rather than the text
+         * after it, so that if the caller wants to report the error position,
+         * it will point to the correct position of the unexpected token. */
+        context.text_position = token_info.position;
         return P_UNEXPECTED_TOKEN;
     }
 }
@@ -918,3 +947,16 @@ public <%= start_rule_type[1] %> p_result(p_context_t * context)
 {
     return context.parse_result.v_<%= start_rule_type[0] %>;
 }
+
+/**
+ * Get the current text input position (error location after a failed parse).
+ *
+ * @param context
+ *   Lexer/parser context structure.
+ *
+ * @return Current text position.
+ */
+public p_position_t p_position(p_context_t * context)
+{
+    return context.text_position;
+}
diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb
index f31bc14..1a13360 100644
--- a/spec/propane_spec.rb
+++ b/spec/propane_spec.rb
@@ -359,6 +359,21 @@ EOF
     expect(results.status).to eq 0
   end
 
+  it "tracks position of parser errors" do
+    write_grammar <<EOF
+token a;
+token num /\\d+/;
+drop /\\s+/;
+Start -> a num Start;
+Start -> a num;
+EOF
+    build_parser
+    compile("spec/test_error_positions.d")
+    results = run
+    expect(results.stderr).to eq ""
+    expect(results.status).to eq 0
+  end
+
   it "allows creating a JSON parser" do
     write_grammar(File.read("spec/json_parser.propane"))
     build_parser
diff --git a/spec/test_d_parser_rule_from_multiple_states.d b/spec/test_d_parser_rule_from_multiple_states.d
index 4671381..e861610 100644
--- a/spec/test_d_parser_rule_from_multiple_states.d
+++ b/spec/test_d_parser_rule_from_multiple_states.d
@@ -12,6 +12,7 @@ unittest
     p_context_t context;
     p_context_init(&context, input);
     assert(p_parse(&context) == P_UNEXPECTED_TOKEN);
+    assert(p_position(&context) == p_position_t(0, 1));
 
     input = "a b";
     p_context_init(&context, input);
diff --git a/spec/test_error_positions.d b/spec/test_error_positions.d
new file mode 100644
index 0000000..89bfbaa
--- /dev/null
+++ b/spec/test_error_positions.d
@@ -0,0 +1,36 @@
+import testparser;
+import std.stdio;
+
+int main()
+{
+    return 0;
+}
+
+unittest
+{
+    string input = "a 42";
+    p_context_t context;
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_SUCCESS);
+
+    input = "a\n123\na  a";
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_UNEXPECTED_TOKEN);
+    assert(p_position(&context) == p_position_t(2, 3));
+
+    input = "12";
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_UNEXPECTED_TOKEN);
+    assert(p_position(&context) == p_position_t(0, 0));
+
+    input = "a 12\n\nab";
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_UNEXPECTED_INPUT);
+    assert(p_position(&context) == p_position_t(2, 1));
+
+    input = "a 12\n\na\n\n77\na   \xAA";
+    p_context_init(&context, input);
+    assert(p_parse(&context) == P_DECODE_ERROR);
+    writeln(p_position(&context));
+    assert(p_position(&context) == p_position_t(5, 4));
+}