From eb9d9026fcd66750c8677f9d2adf9fe0a81fe15d Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Tue, 11 Mar 2025 20:40:10 -0400 Subject: [PATCH] Allow multiple lexer modes to be specified for a lexer pattern - close #35 --- Gemfile | 1 + Gemfile.lock | 2 ++ doc/user_guide.md | 22 +++++++++++++ lib/propane/generator.rb | 4 +-- lib/propane/grammar.rb | 36 ++++++++++++-------- lib/propane/lexer.rb | 10 ++++-- lib/propane/pattern.rb | 12 +++---- spec/propane/grammar_spec.rb | 12 +++---- spec/propane_spec.rb | 56 ++++++++++++++++++++++++++++++++ spec/test_lexer_multiple_modes.c | 20 ++++++++++++ spec/test_lexer_multiple_modes.d | 21 ++++++++++++ 11 files changed, 166 insertions(+), 30 deletions(-) create mode 100644 spec/test_lexer_multiple_modes.c create mode 100644 spec/test_lexer_multiple_modes.d diff --git a/Gemfile b/Gemfile index d3fdb90..05b807a 100644 --- a/Gemfile +++ b/Gemfile @@ -1,5 +1,6 @@ source "https://rubygems.org" +gem "base64" gem "rake" gem "rspec" gem "rdoc" diff --git a/Gemfile.lock b/Gemfile.lock index 2842fcc..b82055d 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,6 +1,7 @@ GEM remote: https://rubygems.org/ specs: + base64 (0.2.0) date (3.4.1) diff-lcs (1.5.1) docile (1.4.1) @@ -37,6 +38,7 @@ PLATFORMS ruby DEPENDENCIES + base64 rake rdoc redcarpet diff --git a/doc/user_guide.md b/doc/user_guide.md index 127ef39..4db7e71 100644 --- a/doc/user_guide.md +++ b/doc/user_guide.md @@ -536,6 +536,28 @@ It also returns the `str` token now that the token is complete. Note that the token name `str` above could have been `string` instead - the namespace for token names is distinct from the namespace for lexer modes. +Multiple modes can be specified for a token or pattern or drop statement. +For example, if the grammar wanted to only recognize an identifier following +a `.` token and not other keywords, it could switch to an `identonly` mode +when matching a `.` +The `ident` token pattern will be matched in either the `default` or +`identonly` mode. 
+ +``` +ptype char; +token abc; +token def; +default, identonly: token ident /[a-z]+/ << + $$ = match[0]; + $mode(default); + return $token(ident); +>> +token dot /\\./ << + $mode(identonly); +>> +default, identonly: drop /\\s+/; +``` + ##> Specifying parser value types - the `ptype` statement The `ptype` statement is used to define parser value type(s). diff --git a/lib/propane/generator.rb b/lib/propane/generator.rb index 0a4bce5..02b072d 100644 --- a/lib/propane/generator.rb +++ b/lib/propane/generator.rb @@ -43,8 +43,8 @@ class Propane # Assign default pattern mode to patterns without a mode assigned. found_default = false @grammar.patterns.each do |pattern| - if pattern.mode.nil? - pattern.mode = "default" + if pattern.modes.empty? + pattern.modes << "default" found_default = true end pattern.ptypename ||= "default" diff --git a/lib/propane/grammar.rb b/lib/propane/grammar.rb index a6e6469..a3fc6ed 100644 --- a/lib/propane/grammar.rb +++ b/lib/propane/grammar.rb @@ -25,7 +25,7 @@ class Propane @code_blocks = {} @line_number = 1 @next_line_number = @line_number - @mode = nil + @modeline = nil @input = input.gsub("\r\n", "\n") @ptypes = {"default" => "void *"} @prefix = "p_" @@ -58,7 +58,7 @@ class Propane def parse_statement! if parse_white_space! elsif parse_comment_line! - elsif @mode.nil? && parse_mode_label! + elsif @modeline.nil? && parse_mode_label! elsif parse_ast_statement! elsif parse_ast_prefix_statement! elsif parse_ast_suffix_statement! @@ -81,8 +81,8 @@ class Propane end def parse_mode_label! 
- if md = consume!(/(#{IDENTIFIER_REGEX})\s*:/) - @mode = md[1] + if md = consume!(/(#{IDENTIFIER_REGEX}(?:\s*,\s*#{IDENTIFIER_REGEX})*)\s*:/) + @modeline = md[1] end end @@ -117,7 +117,7 @@ class Propane md = consume!(/([\w.]+)\s*/, "expected module name") @modulename = md[1] consume!(/;/, "expected `;'") - @mode = nil + @modeline = nil true end end @@ -153,9 +153,9 @@ class Propane end token = Token.new(name, ptypename, @line_number) @tokens << token - pattern = Pattern.new(pattern: pattern, token: token, line_number: @line_number, code: code, mode: @mode, ptypename: ptypename) + pattern = Pattern.new(pattern: pattern, token: token, line_number: @line_number, code: code, modes: get_modes_from_modeline, ptypename: ptypename) @patterns << pattern - @mode = nil + @modeline = nil true end end @@ -173,7 +173,7 @@ class Propane consume!(/;/, "expected `;'"); token = Token.new(name, ptypename, @line_number) @tokens << token - @mode = nil + @modeline = nil true end end @@ -186,8 +186,8 @@ class Propane end consume!(/\s+/) consume!(/;/, "expected `;'") - @patterns << Pattern.new(pattern: pattern, line_number: @line_number, mode: @mode) - @mode = nil + @patterns << Pattern.new(pattern: pattern, line_number: @line_number, modes: get_modes_from_modeline) + @modeline = nil true end end @@ -208,7 +208,7 @@ class Propane end end @rules << Rule.new(rule_name, components, code, ptypename, @line_number) - @mode = nil + @modeline = nil true end end @@ -225,8 +225,8 @@ class Propane unless code = parse_code_block! 
raise Error.new("Line #{@line_number}: expected code block to follow pattern") end - @patterns << Pattern.new(pattern: pattern, line_number: @line_number, code: code, mode: @mode, ptypename: ptypename) - @mode = nil + @patterns << Pattern.new(pattern: pattern, line_number: @line_number, code: code, modes: get_modes_from_modeline, ptypename: ptypename) + @modeline = nil true end end @@ -247,7 +247,7 @@ class Propane else @code_blocks[name] = code end - @mode = nil + @modeline = nil true end end @@ -315,6 +315,14 @@ class Propane end end + def get_modes_from_modeline + if @modeline + Set[*@modeline.split(",").map(&:strip)] + else + Set.new + end + end + end end diff --git a/lib/propane/lexer.rb b/lib/propane/lexer.rb index 3a3da8b..6e6b431 100644 --- a/lib/propane/lexer.rb +++ b/lib/propane/lexer.rb @@ -26,8 +26,14 @@ class Propane private def build_tables! - @modes = @grammar.patterns.group_by do |pattern| - pattern.mode + modenames = @grammar.patterns.reduce(Set.new) do |result, pattern| + result + pattern.modes + end + @modes = modenames.reduce({}) do |result, modename| + result[modename] = @grammar.patterns.select do |pattern| + pattern.modes.include?(modename) + end + result end.transform_values do |patterns| {dfa: DFA.new(patterns)} end diff --git a/lib/propane/pattern.rb b/lib/propane/pattern.rb index c046ddb..71e1330 100644 --- a/lib/propane/pattern.rb +++ b/lib/propane/pattern.rb @@ -26,9 +26,9 @@ class Propane # Regex NFA for matching the pattern. attr_reader :nfa - # @return [String, nil] - # Lexer mode for this pattern. - attr_accessor :mode + # @return [Set] + # Lexer modes for this pattern. + attr_accessor :modes # @return [String, nil] # Parser value type name. @@ -46,14 +46,14 @@ class Propane # Token to be returned by this pattern. # @option options [Integer, nil] :line_number # Line number where the token was defined in the input grammar. - # @option options [String, nil] :mode - # Lexer mode for this pattern. 
+ # @option options [Set] :modes + # Lexer modes for this pattern. def initialize(options) @code = options[:code] @pattern = options[:pattern] @token = options[:token] @line_number = options[:line_number] - @mode = options[:mode] + @modes = options[:modes] @ptypename = options[:ptypename] regex = Regex.new(@pattern) regex.nfa.end_state.accepts = self diff --git a/spec/propane/grammar_spec.rb b/spec/propane/grammar_spec.rb index 27cb992..2f813fe 100644 --- a/spec/propane/grammar_spec.rb +++ b/spec/propane/grammar_spec.rb @@ -151,30 +151,30 @@ EOF o = grammar.patterns.find {|pattern| pattern.token == o} expect(o).to_not be_nil - expect(o.mode).to be_nil + expect(o.modes).to be_empty o = grammar.tokens.find {|token| token.name == "b"} expect(o).to_not be_nil o = grammar.patterns.find {|pattern| pattern.token == o} expect(o).to_not be_nil - expect(o.mode).to eq "m1" + expect(o.modes).to eq Set["m1"] o = grammar.patterns.find {|pattern| pattern.pattern == "foo"} expect(o).to_not be_nil - expect(o.mode).to be_nil + expect(o.modes).to be_empty o = grammar.patterns.find {|pattern| pattern.pattern == "bar"} expect(o).to_not be_nil - expect(o.mode).to eq "m2" + expect(o.modes).to eq Set["m2"] o = grammar.patterns.find {|pattern| pattern.pattern == "q"} expect(o).to_not be_nil - expect(o.mode).to be_nil + expect(o.modes).to be_empty o = grammar.patterns.find {|pattern| pattern.pattern == "r"} expect(o).to_not be_nil - expect(o.mode).to eq "m3" + expect(o.modes).to eq Set["m3"] end it "allows assigning ptypes to tokens and rules" do diff --git a/spec/propane_spec.rb b/spec/propane_spec.rb index cc26092..993b674 100644 --- a/spec/propane_spec.rb +++ b/spec/propane_spec.rb @@ -621,6 +621,62 @@ EOF ]) end + it "multiple lexer modes may apply to a pattern" do + case language + when "c" + write_grammar <<EOF +<< +#include <stdio.h> +>> +ptype char; +token abc; +token def; +default, identonly: token ident /[a-z]+/ << + $$ = match[0]; + $mode(default); + return $token(ident); +>> +token dot /\\./ << + 
 $mode(identonly); +>> +default, identonly: drop /\\s+/; +Start -> abc dot ident << + printf("ident: %c\\n", $3); +>> +EOF + when "d" + write_grammar <<EOF +<< +import std.stdio; +>> +ptype char; +token abc; +token def; +default, identonly: token ident /[a-z]+/ << + $$ = match[0]; + $mode(default); +>> +token dot /\\./ << + $mode(identonly); +>> +default, identonly: drop /\\s+/; +Start -> abc dot ident << + writeln("ident: ", $3); +>> +EOF + end + run_propane(language: language) + compile("spec/test_lexer_multiple_modes.#{language}", language: language) + results = run_test + expect(results.status).to eq 0 + verify_lines(results.stdout, [ + "ident: d", + "pass1", + "ident: a", + "pass2", + ]) + end + it "executes user code associated with a parser rule" do case language when "c" diff --git a/spec/test_lexer_multiple_modes.c b/spec/test_lexer_multiple_modes.c new file mode 100644 index 0000000..962104b --- /dev/null +++ b/spec/test_lexer_multiple_modes.c @@ -0,0 +1,20 @@ +#include "testparser.h" +#include <assert.h> +#include <string.h> +#include <stdio.h> + +int main() +{ + char const * input = "abc.def"; + p_context_t context; + p_context_init(&context, (uint8_t const *)input, strlen(input)); + assert(p_parse(&context) == P_SUCCESS); + printf("pass1\n"); + + input = "abc . abc"; + p_context_init(&context, (uint8_t const *)input, strlen(input)); + assert(p_parse(&context) == P_SUCCESS); + printf("pass2\n"); + + return 0; +} diff --git a/spec/test_lexer_multiple_modes.d b/spec/test_lexer_multiple_modes.d new file mode 100644 index 0000000..36ad01c --- /dev/null +++ b/spec/test_lexer_multiple_modes.d @@ -0,0 +1,21 @@ +import testparser; +import std.stdio; + +int main() +{ + return 0; +} + +unittest +{ + string input = `abc.def`; + p_context_t context; + p_context_init(&context, input); + assert(p_parse(&context) == P_SUCCESS); + writeln("pass1"); + + input = `abc . abc`; + p_context_init(&context, input); + assert(p_parse(&context) == P_SUCCESS); + writeln("pass2"); +}