From 04367db0acde661d5457d89747b4fa75d0b8bcba Mon Sep 17 00:00:00 2001
From: Josh Holtrop <jholtrop@gmail.com>
Date: Wed, 28 Sep 2022 23:05:01 -0400
Subject: [PATCH] Add forward slashes around patterns and parse more robustly

---
 lib/propane/grammar.rb         | 90 +++++++++++++++++++++++++---------
 spec/propane/grammar_spec.rb   |  8 +--
 spec/propane/lexer/dfa_spec.rb | 12 ++---
 spec/propane_spec.rb           | 20 ++++----
 4 files changed, 86 insertions(+), 44 deletions(-)

diff --git a/lib/propane/grammar.rb b/lib/propane/grammar.rb
index 8f3164e..8fb499c 100644
--- a/lib/propane/grammar.rb
+++ b/lib/propane/grammar.rb
@@ -14,6 +14,7 @@ class Propane
       @rules = []
       @code_id = 0
       @line_number = 1
+      @next_line_number = @line_number
       @input = input.gsub("\r\n", "\n")
       parse_grammar!
     end
@@ -27,7 +28,6 @@ class Propane
     end
 
     def parse_statement!
-      @next_line_number = @line_number
       if parse_white_space!
       elsif parse_comment_line!
       elsif parse_module_statement!
@@ -42,53 +42,56 @@ class Propane
         end
         raise Error.new("Unexpected grammar input at line #{@line_number}: #{@input.chomp}")
       end
-      @line_number = @next_line_number
     end
 
     def parse_white_space!
-      consume!(/\A\s+/)
+      consume!(/\s+/)
     end
 
     def parse_comment_line!
-      consume!(/\A#.*\n/)
+      consume!(/#.*\n/)
     end
 
     def parse_module_statement!
-      if md = consume!(/\Amodule\s+(\S+)\s*;/)
+      if consume!(/module\s+/)
+        md = consume!(/([\w.]+)\s*/, "expected module name")
         @modulename = md[1]
+        consume!(/;/, "expected `;'")
       end
     end
 
     def parse_class_statement!
-      if md = consume!(/\Aclass\s+(\S+)\s*;/)
+      if consume!(/class\s+/)
+        md = consume!(/([\w.]+)\s*/, "expected class name")
         @classname = md[1]
+        consume!(/;/, "expected `;'")
       end
     end
 
     def parse_token_statement!
-      if md = consume!(/\Atoken\s+(\S+?)(?:\s+([^\n]+?))?\s*(?:;|<<\n(.*?)^>>\n)/m)
-        name, pattern, code = *md[1, 3]
-        if pattern.nil?
-          pattern = name
+      if consume!(/token\s+/)
+        md = consume!(/([a-zA-Z_][a-zA-Z_0-9]*)/, "expected token name")
+        name = md[1]
+        if consume!(/\s+/)
+          pattern = parse_pattern!
         end
-        unless name =~ /^[a-zA-Z_][a-zA-Z_0-9]*$/
-          raise Error.new("Invalid token name #{name.inspect}")
-        end
-        token = Token.new(name: name, id: @tokens.size, line_number: @line_number)
-        @tokens << token
-        if code
+        pattern ||= name
+        consume!(/\s+/)
+        if code = parse_code_block!
           code_id = @code_id
           @code_id += 1
         else
-          code_id = nil
+          consume!(/;/, "expected pattern or `;' or code block")
         end
+        token = Token.new(name: name, id: @tokens.size, line_number: @line_number)
+        @tokens << token
         pattern = Pattern.new(pattern: pattern, token: token, line_number: @line_number, code: code, code_id: code_id)
         @patterns << pattern
       end
     end
 
     def parse_tokenid_statement!
-      if md = consume!(/\Atokenid\s+(\S+?)\s*;/m)
+      if md = consume!(/tokenid\s+(\S+?)\s*;/m)
         name = md[1]
         unless name =~ /^[a-zA-Z_][a-zA-Z_0-9]*$/
           raise Error.new("Invalid token name #{name.inspect}")
@@ -99,14 +102,19 @@ class Propane
     end
 
     def parse_drop_statement!
-      if md = consume!(/\Adrop\s+(\S+)\s*;/)
-        pattern = md[1]
+      if md = consume!(/drop\s+/)
+        pattern = parse_pattern!
+        unless pattern
+          raise Error.new("Line #{@line_number}: expected pattern to follow `drop'")
+        end
+        consume!(/\s+/)
+        consume!(/;/, "expected `;'")
         @patterns << Pattern.new(pattern: pattern, line_number: @line_number, drop: true)
       end
     end
 
     def parse_rule_statement!
-      if md = consume!(/\A(\S+)\s*->\s*([^\n]*?)(?:;|<<\n(.*?)^>>\n)/m)
+      if md = consume!(/(\S+)\s*->\s*([^\n]*?)(?:;|<<\n(.*?)^>>\n)/m)
         rule_name, components, code = *md[1, 3]
         unless rule_name =~ /^[a-zA-Z_][a-zA-Z_0-9]*$/
           raise Error.new("Invalid rule name #{name.inspect}")
@@ -117,22 +125,56 @@ class Propane
       end
     end
 
+    # Parse a `/'-delimited pattern; return its text (escapes kept verbatim) or
+    # nil when the input does not start with `/'. Raises Error if unterminated.
+    def parse_pattern!
+      return unless consume!(%r{/})
+      pattern = ""
+      until consume!(%r{/})
+        escaped = consume!(%r{\\})
+        pattern += "\\" if escaped
+        # `.' matches neither "\n" nor end of input, so a failed match means the
+        # pattern was never closed; raise instead of retrying forever.
+        unless md = consume!(%r{(.)})
+          what = escaped ? "escape sequence" : "pattern"
+          raise Error.new("Line #{@line_number}: unterminated #{what}")
+        end
+        pattern += md[1]
+      end
+      pattern
+    end
+
+    # Consume a `<<' ... `>>' code block; return its body, or nil if absent.
+    def parse_code_block!
+      block_match = consume!(/<<\n(.*?)^>>\n/m)
+      block_match[1] if block_match
+    end
+
     # Check if the input string matches the given regex.
     #
     # If so, remove the match from the input string, and update the line
-    # number.
+    # number. If the regex is not matched and an error message is provided,
+    # the error is raised.
     #
     # @param regex [Regexp]
     #   Regex to attempt to match.
+    # @param error_message [String, nil]
+    #   Error message to display if the regex is not matched. If nil and the
+    #   regex is not matched, an error is not raised.
     #
     # @return [MatchData, nil]
     #   MatchData for the given regex if it was matched and removed from the
     #   input.
-    def consume!(regex)
-      if md = @input.match(regex)
+    def consume!(regex, error_message = nil)
+      @line_number = @next_line_number
+      if md = @input.match(/\A#{regex}/)
         @input.slice!(0, md[0].size)
         @next_line_number += md[0].count("\n")
         md
+      elsif error_message
+        raise Error.new("Line #{@line_number}: Error: #{error_message}")
+      else
+        false
       end
     end
 
diff --git a/spec/propane/grammar_spec.rb b/spec/propane/grammar_spec.rb
index 823d36a..7604e49 100644
--- a/spec/propane/grammar_spec.rb
+++ b/spec/propane/grammar_spec.rb
@@ -10,7 +10,7 @@ class Foobar;
 token while;
 
 token id
-  [a-zA-Z_][a-zA-Z_0-9]*;
+  /[a-zA-Z_][a-zA-Z_0-9]*/;
 
 token token_with_code <<
 Code for the token
@@ -18,7 +18,7 @@ Code for the token
 
 tokenid token_with_no_pattern;
 
-drop \\s+;
+drop /\\s+/;
 
 A -> B <<
   a = 42;
@@ -46,13 +46,13 @@ EOF
 
       o = grammar.tokens.find {|token| token.name == "id"}
       expect(o).to_not be_nil
-      expect(o.line_number).to eq 8
+      expect(o.line_number).to eq 9
       expect(o.id).to eq 1
 
       o = grammar.patterns.find {|pattern| pattern.token == o}
       expect(o).to_not be_nil
       expect(o.pattern).to eq "[a-zA-Z_][a-zA-Z_0-9]*"
-      expect(o.line_number).to eq 8
+      expect(o.line_number).to eq 9
       expect(o.code_id).to be_nil
       expect(o.code).to be_nil
 
diff --git a/spec/propane/lexer/dfa_spec.rb b/spec/propane/lexer/dfa_spec.rb
index 5fab585..8f5789d 100644
--- a/spec/propane/lexer/dfa_spec.rb
+++ b/spec/propane/lexer/dfa_spec.rb
@@ -82,15 +82,15 @@ EOF
     expect(run(<<EOF, "foobar")).to eq expected
 token foo;
 token bar;
-token identifier [a-z]+;
+token identifier /[a-z]+/;
 EOF
     expected = [
       ["plusplus", "++"],
       ["plus", "+"],
     ]
     expect(run(<<EOF, "+++")).to eq expected
-token plus \\+;
-token plusplus \\+\\+;
+token plus /\\+/;
+token plusplus /\\+\\+/;
 EOF
   end
 
@@ -103,7 +103,7 @@ EOF
     expect(run(<<EOF, "foo \tbar")).to eq expected
 token foo;
 token bar;
-token WS \\s+;
+token WS /\\s+/;
 EOF
   end
 
@@ -116,7 +116,7 @@ EOF
     expect(run(<<EOF, "foo \tbar")).to eq expected
 token foo;
 token bar;
-drop \\s+;
+drop /\\s+/;
 EOF
   end
 
@@ -125,7 +125,7 @@ EOF
       ["semicolon", ";"],
     ]
     expect(run(<<EOF, ";")).to eq expected
-token semicolon \;;
+token semicolon /;/;
 EOF
   end
 end
diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb
index f481f23..0d7af0c 100644
--- a/spec/propane_spec.rb
+++ b/spec/propane_spec.rb
@@ -27,10 +27,10 @@ describe Propane do
 
   it "generates a D lexer" do
     write_grammar <<EOF
-token int \\d+;
-token plus \\+;
-token times \\*;
-drop \\s+;
+token int /\\d+/;
+token plus /\\+/;
+token times /\\*/;
+drop /\\s+/;
 Start -> Foo;
 Foo -> int <<
 >>
@@ -44,10 +44,10 @@ EOF
 
   it "generates a parser" do
     write_grammar <<EOF
-token plus \\+;
-token times \\*;
-token zero 0;
-token one 1;
+token plus /\\+/;
+token times /\\*/;
+token zero /0/;
+token one /1/;
 Start -> E;
 E -> E times B;
 E -> E plus B;
@@ -60,7 +60,7 @@ EOF
 
   it "generates an SLR parser" do
     write_grammar <<EOF
-token one 1;
+token one /1/;
 Start -> E;
 E -> one E;
 E -> one;
@@ -86,7 +86,7 @@ EOF
     write_grammar <<EOF
 token a;
 token b;
-drop \\s+;
+drop /\\s+/;
 Start -> a R1;
 Start -> b R1;
 R1 -> b;