Allow multiple lexer modes to be specified for a lexer pattern - close #35

This commit is contained in:
Josh Holtrop 2025-03-11 20:40:10 -04:00
parent 54bb3307cd
commit eb9d9026fc
11 changed files with 166 additions and 30 deletions

View File

@ -1,5 +1,6 @@
source "https://rubygems.org" source "https://rubygems.org"
gem "base64"
gem "rake" gem "rake"
gem "rspec" gem "rspec"
gem "rdoc" gem "rdoc"

View File

@ -1,6 +1,7 @@
GEM GEM
remote: https://rubygems.org/ remote: https://rubygems.org/
specs: specs:
base64 (0.2.0)
date (3.4.1) date (3.4.1)
diff-lcs (1.5.1) diff-lcs (1.5.1)
docile (1.4.1) docile (1.4.1)
@ -37,6 +38,7 @@ PLATFORMS
ruby ruby
DEPENDENCIES DEPENDENCIES
base64
rake rake
rdoc rdoc
redcarpet redcarpet

View File

@ -536,6 +536,28 @@ It also returns the `str` token now that the token is complete.
Note that the token name `str` above could have been `string` instead - the Note that the token name `str` above could have been `string` instead - the
namespace for token names is distinct from the namespace for lexer modes. namespace for token names is distinct from the namespace for lexer modes.
Multiple modes can be specified for a token, pattern, or drop statement by
listing the mode names, separated by commas, before the `:`.
For example, if the grammar wanted to only recognize an identifier following
a `.` token and not other keywords, it could switch to an `identonly` mode
when matching a `.`.
The `ident` token pattern will be matched in either the `default` or
`identonly` mode.
```
ptype char;
token abc;
token def;
default, identonly: token ident /[a-z]+/ <<
$$ = match[0];
$mode(default);
return $token(ident);
>>
token dot /\\./ <<
$mode(identonly);
>>
default, identonly: drop /\\s+/;
```
##> Specifying parser value types - the `ptype` statement ##> Specifying parser value types - the `ptype` statement
The `ptype` statement is used to define parser value type(s). The `ptype` statement is used to define parser value type(s).

View File

@ -43,8 +43,8 @@ class Propane
# Assign default pattern mode to patterns without a mode assigned. # Assign default pattern mode to patterns without a mode assigned.
found_default = false found_default = false
@grammar.patterns.each do |pattern| @grammar.patterns.each do |pattern|
if pattern.mode.nil? if pattern.modes.empty?
pattern.mode = "default" pattern.modes << "default"
found_default = true found_default = true
end end
pattern.ptypename ||= "default" pattern.ptypename ||= "default"

View File

@ -25,7 +25,7 @@ class Propane
@code_blocks = {} @code_blocks = {}
@line_number = 1 @line_number = 1
@next_line_number = @line_number @next_line_number = @line_number
@mode = nil @modeline = nil
@input = input.gsub("\r\n", "\n") @input = input.gsub("\r\n", "\n")
@ptypes = {"default" => "void *"} @ptypes = {"default" => "void *"}
@prefix = "p_" @prefix = "p_"
@ -58,7 +58,7 @@ class Propane
def parse_statement! def parse_statement!
if parse_white_space! if parse_white_space!
elsif parse_comment_line! elsif parse_comment_line!
elsif @mode.nil? && parse_mode_label! elsif @modeline.nil? && parse_mode_label!
elsif parse_ast_statement! elsif parse_ast_statement!
elsif parse_ast_prefix_statement! elsif parse_ast_prefix_statement!
elsif parse_ast_suffix_statement! elsif parse_ast_suffix_statement!
@ -81,8 +81,8 @@ class Propane
end end
def parse_mode_label! def parse_mode_label!
if md = consume!(/(#{IDENTIFIER_REGEX})\s*:/) if md = consume!(/(#{IDENTIFIER_REGEX}(?:\s*,\s*#{IDENTIFIER_REGEX})*)\s*:/)
@mode = md[1] @modeline = md[1]
end end
end end
@ -117,7 +117,7 @@ class Propane
md = consume!(/([\w.]+)\s*/, "expected module name") md = consume!(/([\w.]+)\s*/, "expected module name")
@modulename = md[1] @modulename = md[1]
consume!(/;/, "expected `;'") consume!(/;/, "expected `;'")
@mode = nil @modeline = nil
true true
end end
end end
@ -153,9 +153,9 @@ class Propane
end end
token = Token.new(name, ptypename, @line_number) token = Token.new(name, ptypename, @line_number)
@tokens << token @tokens << token
pattern = Pattern.new(pattern: pattern, token: token, line_number: @line_number, code: code, mode: @mode, ptypename: ptypename) pattern = Pattern.new(pattern: pattern, token: token, line_number: @line_number, code: code, modes: get_modes_from_modeline, ptypename: ptypename)
@patterns << pattern @patterns << pattern
@mode = nil @modeline = nil
true true
end end
end end
@ -173,7 +173,7 @@ class Propane
consume!(/;/, "expected `;'"); consume!(/;/, "expected `;'");
token = Token.new(name, ptypename, @line_number) token = Token.new(name, ptypename, @line_number)
@tokens << token @tokens << token
@mode = nil @modeline = nil
true true
end end
end end
@ -186,8 +186,8 @@ class Propane
end end
consume!(/\s+/) consume!(/\s+/)
consume!(/;/, "expected `;'") consume!(/;/, "expected `;'")
@patterns << Pattern.new(pattern: pattern, line_number: @line_number, mode: @mode) @patterns << Pattern.new(pattern: pattern, line_number: @line_number, modes: get_modes_from_modeline)
@mode = nil @modeline = nil
true true
end end
end end
@ -208,7 +208,7 @@ class Propane
end end
end end
@rules << Rule.new(rule_name, components, code, ptypename, @line_number) @rules << Rule.new(rule_name, components, code, ptypename, @line_number)
@mode = nil @modeline = nil
true true
end end
end end
@ -225,8 +225,8 @@ class Propane
unless code = parse_code_block! unless code = parse_code_block!
raise Error.new("Line #{@line_number}: expected code block to follow pattern") raise Error.new("Line #{@line_number}: expected code block to follow pattern")
end end
@patterns << Pattern.new(pattern: pattern, line_number: @line_number, code: code, mode: @mode, ptypename: ptypename) @patterns << Pattern.new(pattern: pattern, line_number: @line_number, code: code, modes: get_modes_from_modeline, ptypename: ptypename)
@mode = nil @modeline = nil
true true
end end
end end
@ -247,7 +247,7 @@ class Propane
else else
@code_blocks[name] = code @code_blocks[name] = code
end end
@mode = nil @modeline = nil
true true
end end
end end
@ -315,6 +315,14 @@ class Propane
end end
end end
# Build the set of lexer mode names from the pending mode label text.
#
# @return [Set]
#   The comma-separated names held in @modeline, each with surrounding
#   white space removed; an empty Set when @modeline is not set.
def get_modes_from_modeline
  return Set.new unless @modeline

  names = @modeline.split(",").map { |name| name.strip }
  Set.new(names)
end
end end
end end

View File

@ -26,8 +26,14 @@ class Propane
private private
def build_tables! def build_tables!
@modes = @grammar.patterns.group_by do |pattern| modenames = @grammar.patterns.reduce(Set.new) do |result, pattern|
pattern.mode result + pattern.modes
end
@modes = modenames.reduce({}) do |result, modename|
result[modename] = @grammar.patterns.select do |pattern|
pattern.modes.include?(modename)
end
result
end.transform_values do |patterns| end.transform_values do |patterns|
{dfa: DFA.new(patterns)} {dfa: DFA.new(patterns)}
end end

View File

@ -26,9 +26,9 @@ class Propane
# Regex NFA for matching the pattern. # Regex NFA for matching the pattern.
attr_reader :nfa attr_reader :nfa
# @return [String, nil] # @return [Set]
# Lexer mode for this pattern. # Lexer modes for this pattern.
attr_accessor :mode attr_accessor :modes
# @return [String, nil] # @return [String, nil]
# Parser value type name. # Parser value type name.
@ -46,14 +46,14 @@ class Propane
# Token to be returned by this pattern. # Token to be returned by this pattern.
# @option options [Integer, nil] :line_number # @option options [Integer, nil] :line_number
# Line number where the token was defined in the input grammar. # Line number where the token was defined in the input grammar.
# @option options [String, nil] :mode # @option options [String, nil] :modes
# Lexer mode for this pattern. # Lexer modes for this pattern.
def initialize(options) def initialize(options)
@code = options[:code] @code = options[:code]
@pattern = options[:pattern] @pattern = options[:pattern]
@token = options[:token] @token = options[:token]
@line_number = options[:line_number] @line_number = options[:line_number]
@mode = options[:mode] @modes = options[:modes]
@ptypename = options[:ptypename] @ptypename = options[:ptypename]
regex = Regex.new(@pattern) regex = Regex.new(@pattern)
regex.nfa.end_state.accepts = self regex.nfa.end_state.accepts = self

View File

@ -151,30 +151,30 @@ EOF
o = grammar.patterns.find {|pattern| pattern.token == o} o = grammar.patterns.find {|pattern| pattern.token == o}
expect(o).to_not be_nil expect(o).to_not be_nil
expect(o.mode).to be_nil expect(o.modes).to be_empty
o = grammar.tokens.find {|token| token.name == "b"} o = grammar.tokens.find {|token| token.name == "b"}
expect(o).to_not be_nil expect(o).to_not be_nil
o = grammar.patterns.find {|pattern| pattern.token == o} o = grammar.patterns.find {|pattern| pattern.token == o}
expect(o).to_not be_nil expect(o).to_not be_nil
expect(o.mode).to eq "m1" expect(o.modes).to eq Set["m1"]
o = grammar.patterns.find {|pattern| pattern.pattern == "foo"} o = grammar.patterns.find {|pattern| pattern.pattern == "foo"}
expect(o).to_not be_nil expect(o).to_not be_nil
expect(o.mode).to be_nil expect(o.modes).to be_empty
o = grammar.patterns.find {|pattern| pattern.pattern == "bar"} o = grammar.patterns.find {|pattern| pattern.pattern == "bar"}
expect(o).to_not be_nil expect(o).to_not be_nil
expect(o.mode).to eq "m2" expect(o.modes).to eq Set["m2"]
o = grammar.patterns.find {|pattern| pattern.pattern == "q"} o = grammar.patterns.find {|pattern| pattern.pattern == "q"}
expect(o).to_not be_nil expect(o).to_not be_nil
expect(o.mode).to be_nil expect(o.modes).to be_empty
o = grammar.patterns.find {|pattern| pattern.pattern == "r"} o = grammar.patterns.find {|pattern| pattern.pattern == "r"}
expect(o).to_not be_nil expect(o).to_not be_nil
expect(o.mode).to eq "m3" expect(o.modes).to eq Set["m3"]
end end
it "allows assigning ptypes to tokens and rules" do it "allows assigning ptypes to tokens and rules" do

View File

@ -621,6 +621,62 @@ EOF
]) ])
end end
# End-to-end check that one pattern can be labeled with several lexer modes.
# Both grammars label the `ident` token pattern and the white space drop
# pattern with `default, identonly`; the `dot` pattern switches the lexer to
# `identonly` and the `ident` pattern switches it back to `default`.
it "multiple lexer modes may apply to a pattern" do
# Write the grammar in the syntax of the target language under test.
case language
when "c"
write_grammar <<EOF
<<
#include <stdio.h>
>>
ptype char;
token abc;
token def;
default, identonly: token ident /[a-z]+/ <<
$$ = match[0];
$mode(default);
return $token(ident);
>>
token dot /\\./ <<
$mode(identonly);
>>
default, identonly: drop /\\s+/;
Start -> abc dot ident <<
printf("ident: %c\\n", $3);
>>
EOF
when "d"
write_grammar <<EOF
<<
import std.stdio;
>>
ptype char;
token abc;
token def;
default, identonly: token ident /[a-z]+/ <<
$$ = match[0];
$mode(default);
>>
token dot /\\./ <<
$mode(identonly);
>>
default, identonly: drop /\\s+/;
Start -> abc dot ident <<
writeln("ident: ", $3);
>>
EOF
end
# Generate the parser, then build and run the matching test driver.
run_propane(language: language)
compile("spec/test_lexer_multiple_modes.#{language}", language: language)
results = run_test
expect(results.status).to eq 0
# The rule action prints the first character of the ident match ($$ = match[0]).
# NOTE(review): presumably verify_lines checks these stdout lines in order - confirm against the helper.
verify_lines(results.stdout, [
"ident: d",
"pass1",
"ident: a",
"pass2",
])
end
it "executes user code associated with a parser rule" do it "executes user code associated with a parser rule" do
case language case language
when "c" when "c"

View File

@ -0,0 +1,20 @@
#include "testparser.h"
#include <assert.h>
#include <string.h>
#include <stdio.h>
/* Test driver: parses two inputs with the generated parser and prints a
 * marker line after each parse that reports P_SUCCESS. Returns 0 when both
 * assertions hold. */
int main()
{
/* First input: the `.` is directly followed by letters, no white space. */
char const * input = "abc.def";
p_context_t context;
p_context_init(&context, (uint8_t const *)input, strlen(input));
/* NOTE(review): p_parse() is called inside assert(), so the parse would be
 * compiled out entirely if NDEBUG were defined - confirm the test build never sets it. */
assert(p_parse(&context) == P_SUCCESS);
printf("pass1\n");
/* Second input: white space surrounds the `.`, and the text after it is `abc`. */
input = "abc . abc";
/* The same context object is initialized again for the second input. */
p_context_init(&context, (uint8_t const *)input, strlen(input));
assert(p_parse(&context) == P_SUCCESS);
printf("pass2\n");
return 0;
}

View File

@ -0,0 +1,21 @@
import testparser;
import std.stdio;
// Program entry point. It performs no work itself and always returns 0;
// the parser checks in this file live in the unittest block below.
int main()
{
return 0;
}
// Parses two inputs with the generated parser and prints a marker line after
// each parse that reports P_SUCCESS.
unittest
{
// First input: the `.` is directly followed by letters, no white space.
string input = `abc.def`;
p_context_t context;
p_context_init(&context, input);
assert(p_parse(&context) == P_SUCCESS);
writeln("pass1");
// Second input: white space surrounds the `.`, and the text after it is `abc`.
input = `abc . abc`;
// The same context object is initialized again for the second input.
p_context_init(&context, input);
assert(p_parse(&context) == P_SUCCESS);
writeln("pass2");
}