Compare commits

...

148 Commits

SHA1 Message Date
164a4854fb Update README 2022-05-30 15:40:31 -04:00
ddadc2008b Rename to propane 2022-05-28 20:20:03 -04:00
fbd215098b Update license years 2022-05-27 21:49:54 -04:00
bfe2916165 Update bundler 2022-05-27 00:15:03 -04:00
c9bc4832f4 bundle update 2022-05-27 00:14:26 -04:00
6dfef8573f Fix ERB constructor call for Ruby 3.2 warnings 2022-05-27 00:12:40 -04:00
f3ed678fe1 Store tokens in Hash by name 2021-09-27 21:40:12 -04:00
280b749e38 Track Rule IDs 2021-09-27 21:29:44 -04:00
d6779aef00 Start on Parser#build_tables 2021-09-22 23:26:36 -04:00
746ec89be8 Add test for a rule that can be arrived at from multiple states 2021-09-21 21:40:11 -04:00
997f34a1e4 Keep track of item set in-links 2021-09-21 21:32:18 -04:00
a2795bb531 Keep track of follow item sets by symbol for each item set 2021-09-21 17:09:53 -04:00
850e639e3a update identical rule spec to use lookahead symbol 2021-09-06 20:18:17 -04:00
5f7e548fe3 Remove Rule::Pattern, Item stores a Rule reference 2021-09-06 19:41:29 -04:00
bdb10e7afc test duplicate rules 2021-09-05 09:50:04 -04:00
7bdaf7cdbc Do not create item set following EOF token 2021-09-05 07:51:59 -04:00
08e3516ad9 Add wikipedia LR(0) parser example test 2021-09-04 22:33:34 -04:00
2c8f3c6e9a Avoid infinite loop with self-referential rules 2021-09-04 22:29:10 -04:00
9dffa3c41a Recursively build item sets 2021-08-29 12:38:44 -04:00
ceb7e9ee32 Add EOF token to Start rule patterns 2021-08-29 11:48:49 -04:00
6026bf1514 Start building following item sets 2021-08-29 09:41:00 -04:00
9cc1890ddc One Rule object stores all alternative patterns 2021-08-28 10:28:50 -04:00
e4f2fffe50 add Item#closed_items 2021-08-28 09:47:01 -04:00
d931bcb513 Do not expand rules 2021-08-28 09:23:08 -04:00
2e16b0bd6e Start on Item and ItemSet 2021-08-28 09:02:19 -04:00
6ce94e15af Expand rules 2021-08-28 08:11:06 -04:00
3f92ae46c4 Map rule components to Token/Rule references 2021-08-22 21:21:41 -04:00
00016f16b3 Combine Grammar and Generator into top-level Imbecile class 2021-08-22 21:04:46 -04:00
9273bfccf6 Move Token/Rule out of Grammar class 2021-08-19 20:00:40 -04:00
f295acb593 Generator builds a Lexer, not a Lexer::DFA 2021-08-19 13:11:12 -04:00
51a31317a6 Move FA#build_tables to Lexer::DFA 2021-08-19 11:55:34 -04:00
9459883e74 Add Lexer class; Move LexerDFA to Lexer::DFA 2021-08-18 17:09:45 -04:00
28591907c1 Move FA class out of Regex class 2021-08-18 17:05:03 -04:00
37d6917b49 Rework Rule constructor 2021-07-27 21:22:46 -04:00
2685c05360 Change rule syntax 2021-07-19 21:55:08 -04:00
c0c3353fd7 Test lexing empty null string returns EOF 2021-07-06 12:06:07 -04:00
3158e51059 Add length field to LexedToken 2021-07-06 11:59:35 -04:00
d9e4f64d2e Fix returning TOKEN_EOF when lexing at EOF 2021-07-06 11:55:44 -04:00
ec2dcf9a72 Fix not progressing through input while lexing a token 2021-07-06 11:47:33 -04:00
578e165e2d Fix off-by-one error in state IDs 2021-07-06 11:44:03 -04:00
e8df4296cc Begin testing lexer 2021-07-06 11:09:39 -04:00
230c324209 Fix iterating through all transitions in a state 2021-07-06 11:09:13 -04:00
1271e19b50 Test multi-byte code point decoding 2021-07-06 11:02:43 -04:00
12e11399af Add decoder tests 2021-07-06 10:57:06 -04:00
24fab8515d Decoder.decode_code_point returns struct with code point and length together 2021-07-06 10:50:32 -04:00
1dcdd87a28 Generate token constants and names to top-level parser class 2021-07-06 10:28:35 -04:00
8aec7ec0de Lexer class can be used standalone 2021-07-06 10:15:07 -04:00
c96d55b031 Fix class name 2021-07-06 10:14:14 -04:00
ca7d4862f9 Run test executable; build with unit tests 2021-07-06 10:03:42 -04:00
3c874ae4c1 Compile generated parser with a test file 2021-07-05 23:05:55 -04:00
748c219625 Do not return dropped tokens from Lexer.lex_token() 2021-07-05 22:53:58 -04:00
71ee7de9f9 Remove obsolete lex() and lex_token() methods 2021-07-05 22:49:50 -04:00
2121acc87e Complete Lexer.lex_token() 2021-07-05 22:41:09 -04:00
f2563cf255 Work on Lexer.lex_token() 2021-07-05 22:02:27 -04:00
24d12be3b9 Add TOKEN enum entries for EOF, decode error, drop, and none 2021-07-05 20:11:55 -04:00
91d6ee25ea Add Lexer class 2021-07-05 19:13:41 -04:00
2f1cb47bea Add Decoder class to decode code points 2021-07-05 18:47:10 -04:00
651461c570 Start on decode_code_point() 2021-06-29 23:17:44 -04:00
3ce54bd303 Start on lex()/lex_token() 2021-06-29 23:10:40 -04:00
15454f926a Add TokenNames array 2021-06-29 22:54:24 -04:00
4beb3d2016 Add some token constants 2021-06-27 23:09:42 -04:00
aae7bc188c Use unsigned literals 2021-06-26 18:11:20 -04:00
a716dedeb6 Start on test framework to compile and run generated parser 2021-06-26 16:17:24 -04:00
93cb25df62 Do not generate token names for drop tokens 2021-06-26 16:16:18 -04:00
61dd5bc5a0 Move imbecile_spec to lexer_dfa_spec 2021-06-26 16:01:49 -04:00
10a8ef5eb4 Update generated lexer state and transition tables 2021-06-26 15:58:36 -04:00
98584ce07a Add FA#build_tables 2021-06-24 15:06:10 -04:00
2122ca02fe Start generating lexer states and transitions 2021-06-23 23:15:02 -04:00
5881f13380 Generate enum of token identifiers 2021-06-23 22:22:45 -04:00
ebc1d8f001 Fix FA#to_s to show correct destination state 2021-06-23 22:21:53 -04:00
5fecd5c6a2 Refactor into FA#enumerate 2021-06-22 22:01:39 -04:00
5b688b090d Add some attr_readers 2021-06-21 22:52:27 -04:00
f77218801f Error if Start rule not found 2021-06-21 22:48:17 -04:00
70118dd019 Check for duplicate token/rule names in Generator 2021-06-21 22:34:43 -04:00
d552f2a540 CLI: accept --log option 2021-06-19 12:06:02 -04:00
d2fac07249 Add Generator class 2021-06-15 16:51:36 -04:00
a34272dfd6 Add Grammar::Rule class 2021-06-14 22:49:43 -04:00
9d05861819 Parse grammar input by multiline regex 2021-06-12 22:57:32 -04:00
03035a25a5 Update spec task to accept an example pattern 2021-06-12 22:46:13 -04:00
db70f8b94d Add "drop" grammar keyword to drop patterns 2021-06-09 22:48:30 -04:00
f67dd62b20 Add \s to expand to whitespace characters 2021-06-09 22:37:00 -04:00
c6bac6d3a1 Rename TokenDFA -> LexerDFA 2021-06-08 13:54:46 -04:00
aa92970c31 Add some lexer tests 2021-06-07 22:21:52 -04:00
b8282e748e Start on a test lexer for lexer specs 2021-06-07 17:17:37 -04:00
930ac56148 Do not accept 0-length tokens 2021-06-06 15:29:30 -04:00
7f54778ba8 Rename Regex::DFA to TokenDFA 2021-06-06 15:18:21 -04:00
701903def2 Token should build its own NFA 2021-06-06 14:09:28 -04:00
afea886ecb Add Grammar::Token class 2021-06-06 14:04:33 -04:00
03b2e87186 Grammar takes in input string instead of file name 2021-06-06 10:09:53 -04:00
e4370cac62 Print accepting token in FA#to_s 2021-06-06 09:59:28 -04:00
ed3f599e25 Create common FA/State/Transition classes across NFA/DFA 2021-06-06 09:41:23 -04:00
1228a76c55 Fix MultiplicityUnit#to_nfa again 2021-05-26 10:17:03 -04:00
538e360cb3 Fix MultiplicityUnit#to_nfa 2021-05-25 16:59:22 -04:00
e7f8c3726c Fix NFA#to_s 2021-05-25 16:14:19 -04:00
b6e3a5c151 Record accepting token in DFA state 2021-05-25 16:00:25 -04:00
35ef94dbd3 Print out DFA to test 2021-05-25 15:52:47 -04:00
37e1252ded Continue building DFA 2021-05-25 15:44:23 -04:00
214ece7d90 Add NFA::Transition, start on DFA construction 2021-05-23 21:41:50 -04:00
8473df421a Add specs for CodePointRange 2021-05-23 20:41:40 -04:00
3987f08cd7 Add CodePointRange class 2021-05-23 17:52:20 -04:00
3a1650906e Show non-printable characters better in NFA#to_s 2021-05-21 14:39:02 -04:00
952bffc33c Move DFA#nil_transition_states to NFA::State 2021-05-21 14:27:42 -04:00
f64f3683c6 Add NFA#to_s 2021-05-21 14:24:16 -04:00
43f5caf449 Fix some NFA creation 2021-05-20 17:34:18 -04:00
f38a7456e9 Add DFA#nil_transition_states 2021-05-20 17:08:34 -04:00
c77c81bf25 Mark regex NFA end state as accepting the token 2021-05-18 16:34:26 -04:00
7196a0605a Add DFA class 2021-05-18 16:31:16 -04:00
24054461a2 Merge Regex::Parser into Regex, move Unit to its own file 2021-05-18 16:14:42 -04:00
89a5976064 Make Regex::Parser build a NFA after parsing 2021-05-18 16:07:39 -04:00
d3df67be1e Update rake 2021-05-18 16:03:14 -04:00
791340b292 Build NFA for each token pattern 2021-05-17 22:57:18 -04:00
cf8718b69c Allow token definition with no pattern 2021-05-17 22:40:23 -04:00
39f164a7db Parse . in a regex 2021-05-17 17:20:56 -04:00
70b3e56de2 Store all characters as ranges; add CharacterClassUnit#to_nfa 2021-05-14 13:52:03 -04:00
2e8e72a1e8 Add CharacterClassUnit and use it instead of AlternatesUnit 2021-05-14 12:32:53 -04:00
ea27baa630 Add #to_nfa for other regex unit types 2021-05-13 15:57:09 -04:00
d8dd64d860 Add NFA class; start converting units to NFAs 2021-05-13 00:01:12 -04:00
54cefda186 Use Parser 2021-05-11 16:52:28 -04:00
201a38fb51 Add Parser specs 2021-05-11 15:29:40 -04:00
33f9d01883 Rename start/end to min/max for CharacterRangeUnit 2021-05-11 15:28:45 -04:00
9b09625c8a Fix parsing - at beginning of negated character class 2021-05-11 14:57:16 -04:00
6119d860bc Fix character class parsing into an AlternatesUnit 2021-05-11 14:57:01 -04:00
611ebeeddd Fix max multiplicity count parsing 2021-05-11 11:37:46 -04:00
449eec4982 Fix multiplicity count parsing 2021-05-11 11:33:10 -04:00
8cd648fc8f Create spec file for Parser 2021-05-07 16:58:38 -04:00
885ef6c151 Rename Regex::Unit -> Regex::Parser 2021-05-07 16:57:05 -04:00
60adffbbab Add rspec 2021-05-07 15:16:01 -04:00
b8c01ca1d1 Move Unit stuff from Imbecile::Regex to Imbecile::Regex::Unit 2021-05-07 15:10:51 -04:00
b04ff56308 Add Regex class 2021-05-02 15:22:45 -04:00
ca1d2d1e5c Fix class name determination from output file name 2021-05-01 17:01:15 -04:00
13403405b0 Add Error class to handle grammar loading errors 2021-05-01 16:54:24 -04:00
07dd68e367 Write output file from ERB template 2021-05-01 16:44:01 -04:00
c1666a1e74 Require output file on command line 2021-05-01 14:52:16 -04:00
768a0ef17f Extract class name from grammar file 2021-05-01 14:34:00 -04:00
9e865d1982 Throw error on unexpected grammar input line 2021-05-01 09:40:22 -04:00
9884047090 Skip blank lines 2021-05-01 09:39:19 -04:00
04393dcc51 Check for duplicate token names; skip comment lines 2021-05-01 09:38:08 -04:00
7f27b3fd6f Exit with CLI exit code 2021-05-01 09:34:38 -04:00
37ad87d602 Rename GrammarParser -> Grammar 2021-05-01 09:33:35 -04:00
23b7782a5d Begin parsing grammar 2021-05-01 09:31:12 -04:00
0cc4516c0e Add GrammarParser class to parse input file 2021-05-01 08:22:14 -04:00
75a1049040 Parse command-line options 2021-05-01 08:16:09 -04:00
a9ff93dda4 Add script to test run 2021-05-01 08:16:01 -04:00
d879a93d09 Add bin/imbecile and Imbecile::CLI module 2021-04-29 23:26:52 -04:00
ee27c5e9b1 Add Gemfile.lock 2021-04-29 23:26:37 -04:00
989e5f47de Edit some gemspec fields 2021-04-29 23:26:29 -04:00
04e17cde30 Add "bundle gem"-generated files 2021-04-29 23:22:23 -04:00
bc217e7ddb Start on ruby branch 2021-04-29 23:18:22 -04:00
51 changed files with 2313 additions and 1317 deletions

19
.gitignore vendored

@@ -1,9 +1,10 @@
-imbecile
-tags
-*.o
-.*.swp
-*.dep
-tmpl.*
-tests/*/itest.cc
-tests/*/itest.h
-tests/*/test
+/.bundle/
+/.yardoc
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+/.rspec_status
+/spec/run/

3
.gitmodules vendored

@@ -1,3 +0,0 @@
[submodule "refptr"]
path = refptr
url = http://github.com/holtrop/refptr.git

3
.rspec Normal file

@@ -0,0 +1,3 @@
--format documentation
--color
--require spec_helper

4
Gemfile Normal file

@@ -0,0 +1,4 @@
source "https://rubygems.org"
gem "rake"
gem "rspec"

28
Gemfile.lock Normal file

@@ -0,0 +1,28 @@
GEM
remote: https://rubygems.org/
specs:
diff-lcs (1.5.0)
rake (13.0.6)
rspec (3.11.0)
rspec-core (~> 3.11.0)
rspec-expectations (~> 3.11.0)
rspec-mocks (~> 3.11.0)
rspec-core (3.11.0)
rspec-support (~> 3.11.0)
rspec-expectations (3.11.0)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.11.0)
rspec-mocks (3.11.1)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.11.0)
rspec-support (3.11.0)
PLATFORMS
ruby
DEPENDENCIES
rake
rspec
BUNDLED WITH
2.4.0.dev

21
LICENSE.txt Normal file

@@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2010-2022 Josh Holtrop
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

61
Makefile

@@ -1,61 +0,0 @@
TARGET := imbecile
CXXOBJS := $(patsubst %.cc,%.o,$(wildcard *.cc)) tmpl.o
CXXDEPS := $(patsubst %.o,.%.dep,$(CXXOBJS))
CXXFLAGS := -O2
DEPS := $(CXXDEPS)
OBJS := $(CXXOBJS)
LDFLAGS := -lpcre
CPPFLAGS := -I$(shell pwd)/refptr
all: submodule_check tmpl.h $(TARGET)
.PHONY: submodule_check
submodule_check:
@if [ ! -e refptr/refptr.h ]; then \
echo Error: \"refptr\" folder is not populated.; \
echo Perhaps you forgot to do \"git checkout --recursive\"?; \
echo You can remedy the situation with \"git submodule update --init\".; \
exit 1; \
fi
$(TARGET): $(OBJS)
$(CXX) -o $@ $^ $(LDFLAGS)
# Object file rules
%.o: %.cc
$(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $<
# Make dependency files
.%.dep: %.c
@set -e; rm -f $@; \
$(CC) -MM $(CPPFLAGS) $< | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' > $@
.%.dep: %.cc tmpl.h
@set -e; rm -f $@; \
$(CXX) -MM $(CPPFLAGS) $< | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' > $@
tmpl.cc: $(wildcard tmpl/*)
echo -n > $@
for f in $*/*; \
do xxd -i $$f >> $@; \
done
tmpl.h: tmpl.cc
echo '#ifndef $*_h' > $@
echo '#define $*_h' >> $@
grep '$*_' $^ | sed -e 's/^/extern /' -e 's/ =.*/;/' >> $@
echo '#endif' >> $@
.PHONY: tests
tests: PATH := $(shell pwd):$(PATH)
tests: all
$(MAKE) -C $@
tests-clean:
$(MAKE) -C tests clean
clean: tests-clean
-rm -f $(TARGET) *.o .*.dep tmpl.cc tmpl.h
-include $(CXXDEPS)

423
Parser.cc

@@ -1,423 +0,0 @@
#include <stdio.h>
#include <string.h>
#include <pcre.h>
#include <ctype.h> /* toupper() */
#include <iostream>
#include <fstream>
#include <string>
#include <map>
#include "Parser.h"
#include "TokenDefinition.h"
#include "RuleDefinition.h"
#include "tmpl.h"
using namespace std;
#define DEBUG
Parser::Parser()
: m_classname("Parser"), m_namespace(""), m_extension("cc"),
m_token_data(new string()), m_token_code(new string()),
m_defines(new string())
{
}
void Parser::makeDefine(const string & defname, const string & definition)
{
*m_defines += string("#define ") + defname + " " + definition + "\n";
}
bool Parser::write(const string & fname)
{
if (m_tokens.size() < 1 || m_rules.size() < 1)
return false;
string header_fname = fname + ".h";
string body_fname = fname + "." + m_extension;
ofstream header(header_fname.c_str());
ofstream body(body_fname.c_str());
/* process data */
refptr<string> token_classes = new string();
refptr<string> token_classes_code = new string();
int i = 0;
for (list<TokenDefinitionRef>::const_iterator it = m_tokens.begin();
it != m_tokens.end();
it++)
{
char buff[20];
sprintf(buff, "%d", i++);
makeDefine((*it)->getIdentifier(), buff);
*token_classes += (*it)->getClassDefinition();
*token_classes_code += (*it)->getProcessMethod();
}
if (m_namespace != "")
{
makeDefine("I_NAMESPACE", m_namespace);
}
makeDefine("I_CLASSNAME", m_classname);
/* set up replacements */
setReplacement("token_list", buildTokenList());
setReplacement("buildToken", buildBuildToken());
setReplacement("header_name",
new string(string("\"") + header_fname + "\""));
setReplacement("token_code", m_token_code);
setReplacement("token_data", m_token_data);
setReplacement("defines", m_defines);
setReplacement("token_classes", token_classes);
setReplacement("token_classes_code", token_classes_code);
/* write the header */
writeTmpl(header, (char *) tmpl_parser_h, tmpl_parser_h_len);
/* write the body */
writeTmpl(body, (char *) tmpl_parser_cc, tmpl_parser_cc_len);
header.close();
body.close();
return true;
}
bool Parser::writeTmpl(std::ostream & out, char * dat, int len)
{
char * newline;
char * data = dat;
const char * errptr;
int erroffset;
data[len-1] = '\n';
const int ovec_size = 6;
int ovector[ovec_size];
pcre * replace = pcre_compile("{%(\\w+)%}", 0, &errptr, &erroffset, NULL);
while (data < (dat + len) && (newline = strstr(data, "\n")) != NULL)
{
if (pcre_exec(replace, NULL, data, newline - data,
0, 0, ovector, ovec_size) >= 0)
{
if (ovector[0] > 0)
{
out.write(data, ovector[0]);
}
out << *getReplacement(string(data, ovector[2],
ovector[3] - ovector[2]));
if (ovector[1] < newline - data)
{
out.write(data + ovector[1], newline - data - ovector[1]);
}
}
else
{
out.write(data, newline - data);
}
out << '\n';
data = newline + 1;
}
}
refptr<std::string> Parser::getReplacement(const std::string & name)
{
if (m_replacements.find(name) != m_replacements.end())
{
return m_replacements[name];
}
#ifdef DEBUG
cerr << "No replacement found for \"" << name << "\"" << endl;
#endif
return new string("");
}
refptr<string> Parser::buildTokenList()
{
refptr<string> tokenlist = new string();
for (list<TokenDefinitionRef>::const_iterator t = m_tokens.begin();
t != m_tokens.end();
t++)
{
if (t != m_tokens.begin())
*tokenlist += " ";
*tokenlist += "{ \"" + (*t)->getName() + "\", \""
+ (*t)->getCString() + "\", "
+ ((*t)->getProcessFlag() ? "true" : "false") + " }";
if (({typeof(t) tmp = t; ++tmp;}) != m_tokens.end())
*tokenlist += ",\n";
}
return tokenlist;
}
refptr<string> Parser::buildBuildToken()
{
refptr<string> buildToken = new string();
for (list<TokenDefinitionRef>::const_iterator t = m_tokens.begin();
t != m_tokens.end();
t++)
{
*buildToken += "case " + (*t)->getIdentifier() + ":\n";
*buildToken += " token = new " + (*t)->getClassName() + "();\n";
*buildToken += " break;\n";
}
return buildToken;
}
bool Parser::parseInputFile(char * buff, int size)
{
typedef pcre * pcre_ptr;
enum { none, tokens, rules };
pcre_ptr empty, comment, section_name, token, rule,
data_begin, data_end, code_begin, code_end;
struct { pcre_ptr * re; const char * pattern; } exprs[] = {
{&empty, "^\\s*$"},
{&comment, "^\\s*#"},
{&section_name, "^\\s*\\[([^\\]]+?)\\]\\s*$"},
{&token, "^\\s*" /* possible leading ws */
"([a-zA-Z_][a-zA-Z_0-9]*)" /* 1: token name */
"\\s+" /* required whitespace */
"((?:[^\\\\\\s]|\\\\.)+)"}, /* 2: token RE */
{&rule, "^\\s*(\\S+)\\s*:=(.*)$"},
{&data_begin, "^\\s*\\${"},
{&data_end, "\\$}"},
{&code_begin, "^\\s*%{"},
{&code_end, "%}"}
};
const int ovec_size = 3 * 10;
int ovector[ovec_size];
int lineno = 0;
char * newline;
char * input = buff;
string current_section_name;
map<string, int> sections;
sections["none"] = none;
sections["tokens"] = tokens;
sections["rules"] = rules;
int section = none;
string line;
bool append_line = false;
bool gathering_data = false;
bool gathering_code = false;
string gather;
bool continue_line = false;
TokenDefinitionRef current_token;
for (int i = 0; i < sizeof(exprs)/sizeof(exprs[0]); i++)
{
const char * errptr;
int erroffset;
*exprs[i].re = pcre_compile(exprs[i].pattern, 0,
&errptr, &erroffset, NULL);
if (*exprs[i].re == NULL)
{
cerr << "Error compiling regex '" << exprs[i].pattern <<
"': " << errptr << " at position " << erroffset << endl;
return false;
}
}
for (;;)
{
if (continue_line)
{
continue_line = false;
}
else
{
if ((newline = strstr(input, "\n")) == NULL)
break;
int line_length = newline - input;
if (line_length >= 1 && newline[-1] == '\r')
{
newline[-1] = '\n';
line_length--;
}
lineno++;
if (append_line)
{
line += string(input, line_length);
}
else
{
line = string(input, line_length);
}
input = newline + 1; /* set up for next loop iteration */
}
if ( (pcre_exec(empty, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
|| (pcre_exec(comment, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
)
{
/* skip empty or comment lines */;
continue;
}
if (! (gathering_code || gathering_data) )
{
if (line.size() > 0 && line[line.size()-1] == '\\')
{
line[line.size()-1] = ' ';
append_line = true;
continue;
}
else
{
append_line = false;
}
if (pcre_exec(section_name, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
{
current_section_name
= string(line, ovector[2], ovector[3] - ovector[2]);
if (sections.find(current_section_name) != sections.end())
{
section = sections[current_section_name];
}
else
{
cerr << "Unknown section name '" << current_section_name
<< "'!" << endl;
return false;
}
continue;
}
}
switch (section)
{
case none:
cerr << "Unrecognized input on line " << lineno << endl;
return false;
case tokens:
if (gathering_data)
{
if (pcre_exec(data_end, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
{
gather += string(line, 0, ovector[0]) + "\n";
gathering_data = false;
line = string(line, ovector[1]);
continue_line = true;
if (current_token.isNull())
{
*m_token_data += gather;
}
else
{
current_token->addData(gather);
}
}
else
{
gather += line + "\n";
}
continue;
}
else if (gathering_code)
{
if (pcre_exec(code_end, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
{
gather += string(line, 0, ovector[0]) + "\n";
gathering_code = false;
line = string(line, ovector[1]);
continue_line = true;
if (current_token.isNull())
{
*m_token_code += gather;
}
else
{
current_token->addCode(gather);
}
}
else
{
gather += line + "\n";
}
continue;
}
else if (pcre_exec(data_begin, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
{
gathering_data = true;
gather = "";
line = string(line, ovector[1]);
continue_line = true;
continue;
}
else if (pcre_exec(code_begin, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
{
gathering_code = true;
gather = "";
line = string(line, ovector[1]);
continue_line = true;
continue;
}
else if (pcre_exec(token, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
{
string name(line, ovector[2], ovector[3] - ovector[2]);
string definition(line,
ovector[4], ovector[5] - ovector[4]);
current_token = new TokenDefinition();
if (current_token->create(name, definition))
{
addTokenDefinition(current_token);
}
else
{
cerr << "Error in token definition ending on line "
<< lineno << endl;
return false;
}
line = string(line, ovector[1]);
continue_line = true;
continue;
}
else
{
cerr << "Unrecognized input on line " << lineno << endl;
return false;
}
break;
case rules:
if (pcre_exec(rule, NULL, line.c_str(), line.size(),
0, 0, ovector, ovec_size) >= 0)
{
string name(line, ovector[2], ovector[3] - ovector[2]);
string definition(line,
ovector[4], ovector[5] - ovector[4]);
refptr<RuleDefinition> rd = new RuleDefinition();
if (rd->create(name, definition))
{
addRuleDefinition(rd);
}
else
{
cerr << "Error in rule definition ending on line "
<< lineno << endl;
return false;
}
}
else
{
cerr << "Unrecognized input on line " << lineno << endl;
return false;
}
break;
}
}
for (int i = 0; i < sizeof(exprs)/sizeof(exprs[0]); i++)
{
pcre_free(*exprs[i].re);
}
return true;
}

61
Parser.h

@@ -1,61 +0,0 @@
#ifndef PARSER_H
#define PARSER_H
#include <vector>
#include <string>
#include <list>
#include <map>
#include "refptr.h"
#include "TokenDefinition.h"
#include "RuleDefinition.h"
class Parser
{
public:
Parser();
void addTokenDefinition(refptr<TokenDefinition> td)
{
m_tokens.push_back(td);
}
void addRuleDefinition(refptr<RuleDefinition> rd)
{
m_rules.push_back(rd);
}
bool write(const std::string & fname);
bool parseInputFile(char * buff, int size);
void setClassName(const std::string & cn) { m_classname = cn; }
std::string getClassName() { return m_classname; }
void setNamespace(const std::string & ns) { m_namespace = ns; }
std::string getNamespace() { return m_namespace; }
void setExtension(const std::string & e) { m_extension = e; }
std::string getExtension() { return m_extension; }
protected:
refptr<std::string> buildTokenList();
refptr<std::string> buildBuildToken();
bool writeTmpl(std::ostream & out, char * dat, int len);
refptr<std::string> getReplacement(const std::string & name);
void setReplacement(const std::string & name, refptr<std::string> val)
{
m_replacements[name] = val;
}
void makeDefine(const std::string & defname,
const std::string & definition);
std::list<TokenDefinitionRef> m_tokens;
std::vector< refptr< RuleDefinition > > m_rules;
std::string m_classname;
std::string m_namespace;
std::string m_extension;
std::map< std::string, refptr<std::string> > m_replacements;
refptr<std::string> m_token_data;
refptr<std::string> m_token_code;
refptr<std::string> m_defines;
};
#endif

5
README

@@ -1,5 +0,0 @@
Imbecile is a bottom-up parser generator. It targets C++ and automatically
generates a class hierarchy for interacting with the parser.
Imbecile generates both a lexer and a parser based on the rules given to
it in the input file.

31
README.md Normal file

@@ -0,0 +1,31 @@
# The Propane Parser Generator
Propane is an LR Parser Generator (LPG) which:
* accepts LR(0), SLR, and LALR grammars
* generates a built-in lexer to tokenize input
* supports UTF-8 lexer inputs
* generates a table-driven parser to parse input in linear time
* is MIT-licensed
* is distributable as a standalone Ruby script
## Installation
TODO
## Usage
TODO: Write usage instructions here
## Development
After checking out the repository, run `bundle install` to install dependencies.
Run `rake spec` to execute tests.
## Contributing
Bug reports and pull requests are welcome on GitHub at https://github.com/holtrop/propane.
## License
Propane is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).

9
Rakefile Normal file

@@ -0,0 +1,9 @@
require "rspec/core/rake_task"
RSpec::Core::RakeTask.new(:spec, :example_pattern) do |task, args|
if args.example_pattern
task.rspec_opts = %W[-e "#{args.example_pattern}" -f documentation]
end
end
task :default => :spec

9
RuleDefinition.cc

@@ -1,9 +0,0 @@
#include "RuleDefinition.h"
using namespace std;
bool RuleDefinition::create(const string & name, const string & definition)
{
m_name = name;
}

16
RuleDefinition.h

@@ -1,16 +0,0 @@
#ifndef RULEDEFINITION_H
#define RULEDEFINITION_H
#include <string>
class RuleDefinition
{
public:
bool create(const std::string & name, const std::string & definition);
protected:
std::string m_name;
};
#endif

125
TokenDefinition.cc

@@ -1,125 +0,0 @@
#include <pcre.h>
#include <iostream>
#include <string>
#include <vector>
#include "TokenDefinition.h"
#include "refptr.h"
using namespace std;
#define WHITESPACE " \n\r\t\v"
static string trim(string s)
{
size_t lastpos = s.find_last_not_of(WHITESPACE);
if (lastpos == string::npos)
return "";
s.erase(lastpos + 1);
s.erase(0, s.find_first_not_of(WHITESPACE));
return s;
}
static refptr< vector<string> > split(const string & delim, string str)
{
refptr< vector<string> > ret = new vector<string>();
size_t pos;
while ( (pos = str.find(delim)) != string::npos )
{
string t = str.substr(0, pos);
ret->push_back(t);
str.erase(0, pos + 1);
}
if (str != "")
ret->push_back(str);
return ret;
}
static string c_escape(const string & orig)
{
string result;
for (string::const_iterator it = orig.begin(); it != orig.end(); it++)
{
if (*it == '\\' || *it == '"')
result += '\\';
result += *it;
}
return result;
}
TokenDefinition::TokenDefinition()
: m_process(false)
{
}
bool TokenDefinition::create(const string & name,
const string & definition)
{
const char * errptr;
int erroffset;
pcre * re = pcre_compile(definition.c_str(), 0, &errptr, &erroffset, NULL);
if (re == NULL)
{
cerr << "Error compiling regular expression '" << definition
<< "' at position " << erroffset << ": " << errptr << endl;
return false;
}
m_name = name;
m_definition = definition;
pcre_free(re);
#if 0
refptr< vector< string > > parts = split(",", flags);
for (int i = 0, sz = parts->size(); i < sz; i++)
{
(*parts)[i] = trim((*parts)[i]);
string & s = (*parts)[i];
if (s == "p")
{
m_process = true;
}
else
{
cerr << "Unknown token flag \"" << s << "\"" << endl;
return false;
}
}
#endif
return true;
}
string TokenDefinition::getCString() const
{
return c_escape(m_definition);
}
string TokenDefinition::getClassDefinition() const
{
string ret = "class "+ getClassName() + " : public Token {\n";
ret += "public:\n";
if (m_process)
{
ret += " virtual void process(const Matches & matches);\n";
}
ret += "\n";
ret += "protected:\n";
ret += m_data + "\n";
ret += "};\n";
return ret;
}
string TokenDefinition::getProcessMethod() const
{
string ret;
if (m_code != "")
{
ret += "void " + getClassName() + "::process(const Matches & matches) {\n";
ret += m_code + "\n";
ret += "}\n";
}
return ret;
}

37
TokenDefinition.h

@@ -1,37 +0,0 @@
#ifndef TOKENDEFINITION_H
#define TOKENDEFINITION_H
#include <string>
#include "refptr.h"
class TokenDefinition
{
public:
TokenDefinition();
bool create(const std::string & name,
const std::string & definition);
std::string getCString() const;
std::string getName() const { return m_name; }
bool getProcessFlag() const { return m_process; }
void setProcessFlag(bool p) { m_process = p; }
void addData(const std::string & d) { m_data += d; }
std::string getData() const { return m_data; }
void addCode(const std::string & c) { m_code += c; m_process = true; }
std::string getCode() const { return m_code; }
std::string getClassDefinition() const;
std::string getProcessMethod() const;
std::string getIdentifier() const { return "TK_" + m_name; }
std::string getClassName() const { return "Tk" + m_name; }
protected:
std::string m_name;
std::string m_definition;
bool m_process;
std::string m_data;
std::string m_code;
};
typedef refptr<TokenDefinition> TokenDefinitionRef;
#endif

252
assets/parser.d.erb Normal file

@@ -0,0 +1,252 @@
<% if @modulename %>
module <%= @modulename %>;
<% end %>
class <%= classname %>
{
enum
{
<% @tokens.each_with_index do |(name, token), index| %>
<% if token.name %>
TOKEN_<%= token.c_name %> = <%= index %>,
<% end %>
<% end %>
TOKEN_EOF = <%= TOKEN_EOF %>,
TOKEN_DECODE_ERROR = <%= TOKEN_DECODE_ERROR %>,
TOKEN_DROP = <%= TOKEN_DROP %>,
TOKEN_NONE = <%= TOKEN_NONE %>,
}
static immutable string TokenNames[] = [
<% @tokens.each_with_index do |(name, token), index| %>
<% if token.name %>
"<%= token.name %>",
<% else %>
null,
<% end %>
<% end %>
];
static class Decoder
{
enum
{
CODE_POINT_INVALID = 0xFFFFFFFE,
CODE_POINT_EOF = 0xFFFFFFFF,
}
struct DecodedCodePoint
{
uint code_point;
uint code_point_length;
}
static DecodedCodePoint decode_code_point(const(ubyte) * input, size_t input_length)
{
if (input_length == 0u)
{
return DecodedCodePoint(CODE_POINT_EOF, 0u);
}
ubyte c = *input;
uint code_point;
uint code_point_length;
if ((c & 0x80u) == 0u)
{
code_point = c;
code_point_length = 1u;
}
else
{
ubyte following_bytes;
if ((c & 0xE0u) == 0xC0u)
{
code_point = c & 0x1Fu;
following_bytes = 1u;
}
else if ((c & 0xF0u) == 0xE0u)
{
code_point = c & 0x0Fu;
following_bytes = 2u;
}
else if ((c & 0xF8u) == 0xF0u)
{
code_point = c & 0x07u;
following_bytes = 3u;
}
else if ((c & 0xFCu) == 0xF8u)
{
code_point = c & 0x03u;
following_bytes = 4u;
}
else if ((c & 0xFEu) == 0xFCu)
{
code_point = c & 0x01u;
following_bytes = 5u;
}
if (input_length <= following_bytes)
{
return DecodedCodePoint(CODE_POINT_INVALID, 0u);
}
code_point_length = following_bytes + 1u;
while (following_bytes-- > 0u)
{
input++;
code_point <<= 6u;
code_point |= *input & 0x3Fu;
}
}
return DecodedCodePoint(code_point, code_point_length);
}
}
static class Lexer
{
private struct Transition
{
uint first;
uint last;
uint destination;
}
private struct State
{
uint transition_table_index;
uint n_transitions;
uint accepts;
}
<% transition_table, state_table = lexer.dfa.build_tables %>
private static const Transition transitions[] = [
<% transition_table.each do |transition_table_entry| %>
Transition(<%= transition_table_entry[:first] %>u, <%= transition_table_entry[:last] %>u, <%= transition_table_entry[:destination] %>u),
<% end %>
];
private static const State states[] = [
<% state_table.each do |state_table_entry| %>
State(<%= state_table_entry[:transition_table_index] %>u, <%= state_table_entry[:n_transitions] %>u, <%= state_table_entry[:accepts] %>u),
<% end %>
];
struct LexedToken
{
size_t row;
size_t col;
size_t length;
uint token;
}
private const(ubyte) * m_input;
private size_t m_input_length;
private size_t m_input_position;
private size_t m_input_row;
private size_t m_input_col;
this(const(ubyte) * input, size_t input_length)
{
m_input = input;
m_input_length = input_length;
}
LexedToken lex_token()
{
for (;;)
{
LexedToken lt = attempt_lex_token();
if (lt.token != TOKEN_DROP)
{
return lt;
}
}
}
private LexedToken attempt_lex_token()
{
LexedToken lt = LexedToken(m_input_row, m_input_col, 0, TOKEN_NONE);
struct LexedTokenState
{
size_t length;
size_t delta_row;
size_t delta_col;
uint token;
}
LexedTokenState last_accepts_info;
last_accepts_info.token = TOKEN_NONE;
LexedTokenState attempt_info;
uint current_state;
for (;;)
{
auto decoded = Decoder.decode_code_point(&m_input[m_input_position + attempt_info.length], m_input_length - m_input_position - attempt_info.length);
if (decoded.code_point == Decoder.CODE_POINT_INVALID)
{
lt.token = TOKEN_DECODE_ERROR;
return lt;
}
bool lex_continue = false;
if (decoded.code_point != Decoder.CODE_POINT_EOF)
{
uint dest = transition(current_state, decoded.code_point);
if (dest != cast(uint)-1)
{
lex_continue = true;
attempt_info.length += decoded.code_point_length;
if (decoded.code_point == '\n')
{
attempt_info.delta_row++;
attempt_info.delta_col = 0u;
}
else
{
attempt_info.delta_col++;
}
current_state = dest;
if (states[current_state].accepts != TOKEN_NONE)
{
attempt_info.token = states[current_state].accepts;
last_accepts_info = attempt_info;
}
}
}
else if (attempt_info.length == 0u)
{
lt.token = TOKEN_EOF;
break;
}
if (!lex_continue)
{
if (last_accepts_info.token != TOKEN_NONE)
{
lt.token = last_accepts_info.token;
lt.length = last_accepts_info.length;
m_input_position += last_accepts_info.length;
m_input_row += last_accepts_info.delta_row;
if (last_accepts_info.delta_row != 0u)
{
m_input_col = last_accepts_info.delta_col;
}
else
{
m_input_col += last_accepts_info.delta_col;
}
}
break;
}
}
return lt;
}
private uint transition(uint current_state, uint code_point)
{
uint transition_table_index = states[current_state].transition_table_index;
for (uint i = 0u; i < states[current_state].n_transitions; i++)
{
if ((transitions[transition_table_index + i].first <= code_point) &&
(code_point <= transitions[transition_table_index + i].last))
{
return transitions[transition_table_index + i].destination;
}
}
return cast(uint)-1;
}
}
}
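
The Decoder class in the template above implements standard UTF-8 decoding: the leading byte's high bits select the sequence length, and each continuation byte contributes six low-order bits. A minimal Ruby re-expression of the same logic (an illustration only, not part of this diff; it omits the legacy 5- and 6-byte forms the template also accepts):

def decode_code_point(bytes)
  # Mirrors Decoder.decode_code_point in the D template above.
  return [:eof, 0] if bytes.empty?
  c = bytes[0]
  return [c, 1] if c & 0x80 == 0             # ASCII fast path
  code_point, following =
    case
    when c & 0xE0 == 0xC0 then [c & 0x1F, 1] # 110xxxxx: 2-byte sequence
    when c & 0xF0 == 0xE0 then [c & 0x0F, 2] # 1110xxxx: 3-byte sequence
    when c & 0xF8 == 0xF0 then [c & 0x07, 3] # 11110xxx: 4-byte sequence
    else return [:invalid, 0]
    end
  return [:invalid, 0] if bytes.length <= following
  following.times { |i| code_point = (code_point << 6) | (bytes[1 + i] & 0x3F) }
  [code_point, following + 1]
end

decode_code_point("€".bytes)  # => [0x20AC, 3], decoded from the bytes 0xE2 0x82 0xAC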

5
bin/propane Executable file

@@ -0,0 +1,5 @@
#!/usr/bin/env ruby
require "propane"
exit Propane::CLI.run(ARGV.dup)

101
main.cc

@@ -1,101 +0,0 @@
#include <getopt.h>
#include <iostream>
#include <fstream>
#include "refptr.h"
#include "Parser.h"
using namespace std;
string buildOutputFilename(string & input_fname);
int main(int argc, char * argv[])
{
int longind = 1;
int opt;
Parser p;
string outfile;
static struct option longopts[] = {
/* name, has_arg, flag, val */
{ "classname", required_argument, NULL, 'c' },
{ "extension", required_argument, NULL, 'e' },
{ "namespace", required_argument, NULL, 'n' },
{ "outfile", required_argument, NULL, 'o' },
{ NULL, 0, NULL, 0 }
};
while ((opt = getopt_long(argc, argv, "", longopts, &longind)) != -1)
{
switch (opt)
{
case 'c': /* classname */
p.setClassName(optarg);
break;
case 'e': /* extension */
p.setExtension(optarg);
break;
case 'n': /* namespace */
p.setNamespace(optarg);
break;
case 'o': /* outfile */
outfile = optarg;
break;
}
}
if (optind >= argc)
{
cerr << "Usage: imbecile [options] <input-file>" << endl;
return 1;
}
string input_fname = argv[optind];
ifstream ifs;
ifs.open(input_fname.c_str(), ios::binary);
if (!ifs.is_open())
{
cerr << "Error opening input file: '" << input_fname << "'";
return 2;
}
ifs.seekg(0, ios_base::end);
int size = ifs.tellg();
ifs.seekg(0, ios_base::beg);
char * buff = new char[size];
ifs.read(buff, size);
ifs.close();
if (outfile == "")
outfile = buildOutputFilename(input_fname);
if (!p.parseInputFile(buff, size))
{
cerr << "Error parsing " << input_fname << endl;
return 3;
}
if (!p.write(outfile))
{
cerr << "Error processing " << input_fname << endl;
return 4;
}
delete[] buff;
return 0;
}
string buildOutputFilename(string & input_fname)
{
string outfile;
size_t len = input_fname.length();
if (len > 2 && input_fname.substr(len - 2) == ".I")
{
outfile = input_fname.substr(0, len - 2);
}
else
{
outfile = input_fname;
}
return outfile;
}

137
lib/propane.rb Normal file

@@ -0,0 +1,137 @@
require "erb"
require "set"
require_relative "propane/cli"
require_relative "propane/code_point_range"
require_relative "propane/fa"
require_relative "propane/fa/state"
require_relative "propane/fa/state/transition"
require_relative "propane/lexer"
require_relative "propane/lexer/dfa"
require_relative "propane/parser"
require_relative "propane/parser/item"
require_relative "propane/parser/item_set"
require_relative "propane/regex"
require_relative "propane/regex/nfa"
require_relative "propane/regex/unit"
require_relative "propane/rule"
require_relative "propane/token"
require_relative "propane/version"
class Propane
# EOF.
TOKEN_EOF = 0xFFFFFFFC
# Decoding error.
TOKEN_DECODE_ERROR = 0xFFFFFFFD
# Token ID for a "dropped" token.
TOKEN_DROP = 0xFFFFFFFE
# Invalid token ID.
TOKEN_NONE = 0xFFFFFFFF
class Error < RuntimeError
end
def initialize(input)
@tokens = {}
@rules = {}
input = input.gsub("\r\n", "\n")
while !input.empty?
parse_grammar(input)
end
end
def generate(output_file, log_file)
expand_rules
lexer = Lexer.new(@tokens)
parser = Parser.new(@tokens, @rules)
classname = @classname || File.basename(output_file).sub(%r{[^a-zA-Z0-9].*}, "").capitalize
erb = ERB.new(File.read(File.join(File.dirname(File.expand_path(__FILE__)), "../assets/parser.d.erb")), trim_mode: "<>")
result = erb.result(binding.clone)
File.open(output_file, "wb") do |fh|
fh.write(result)
end
end
private
def parse_grammar(input)
if input.slice!(/\A\s+/)
# Skip white space.
elsif input.slice!(/\A#.*\n/)
# Skip comment lines.
elsif input.slice!(/\Amodule\s+(\S+)\n/)
@modulename = $1
elsif input.slice!(/\Aclass\s+(\S+)\n/)
@classname = $1
elsif input.slice!(/\Atoken\s+(\S+)(?:\s+(\S+))?\n/)
name, pattern = $1, $2
if pattern.nil?
pattern = name
end
unless name =~ /^[a-zA-Z_][a-zA-Z_0-9]*$/
raise Error.new("Invalid token name #{name}")
end
if @tokens[name]
raise Error.new("Duplicate token name #{name}")
else
@tokens[name] = Token.new(name, pattern, @tokens.size)
end
elsif input.slice!(/\Adrop\s+(\S+)\n/)
pattern = $1
@tokens[name] = Token.new(nil, pattern, @tokens.size)
elsif input.slice!(/\A(\S+)\s*:\s*\[(.*?)\] <<\n(.*?)^>>\n/m)
rule_name, components, code = $1, $2, $3
components = components.strip.split(/\s+/)
@rules[rule_name] ||= Rule.new(rule_name, @rules.size)
@rules[rule_name].add_pattern(components, code)
else
if input.size > 25
input = input.slice(0..20) + "..."
end
raise Error.new("Unexpected grammar input: #{input}")
end
end
def expand_rules
@rules.each do |rule_name, rule|
if @tokens.include?(rule_name)
raise Error.new("Rule name collides with token name #{rule_name}")
end
end
unless @rules["Start"]
raise Error.new("Start rule not found")
end
@rules.each do |rule_name, rule|
rule.patterns.each do |rule|
rule.components.map! do |component|
if @tokens[component]
@tokens[component]
elsif @rules[component]
@rules[component]
else
raise Error.new("Symbol #{component} not found")
end
end
end
end
end
class << self
def run(input_file, output_file, log_file)
begin
propane = Propane.new(File.read(input_file))
propane.generate(output_file, log_file)
rescue Error => e
$stderr.puts e.message
return 2
end
return 0
end
end
end
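
parse_grammar above accepts module, class, token, and drop declarations, plus rules of the form Name: [components] << code >>. A minimal end-to-end sketch of driving this class; the grammar and file names are hypothetical, and lib/ is assumed to be on the load path (as propane.sh arranges):

require "propane"

# A hypothetical grammar exercising each declaration form matched above.
grammar = <<'GRAMMAR'
module calc
token plus \+
token number [0-9]+
drop \s+
Start: [number plus number] <<
>>
GRAMMAR

propane = Propane.new(grammar)
propane.generate("calc_parser.d", nil)  # renders assets/parser.d.erb to a D source file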

54
lib/propane/cli.rb Normal file

@@ -0,0 +1,54 @@
class Propane
module CLI
USAGE = <<EOF
Usage: #{$0} [options] <input-file> <output-file>
Options:
--log LOG Write log file
--version Show program version and exit
-h, --help Show this usage and exit
EOF
class << self
def run(args)
params = []
log_file = nil
i = 0
while i < args.size
arg = args[i]
case arg
when "--log"
if i + 1 < args.size
i += 1
log_file = args[i]
end
when "--version"
puts "propane v#{VERSION}"
return 0
when "-h", "--help"
puts USAGE
return 0
when /^-/
$stderr.puts "Error: unknown option #{arg}"
return 1
else
params << arg
end
i += 1
end
if params.size != 2
$stderr.puts "Error: specify input and output files"
return 1
end
unless File.readable?(params[0])
$stderr.puts "Error: cannot read #{params[0]}"
return 2
end
Propane.run(*params, log_file)
end
end
end
end
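
A sketch of calling this entry point directly rather than through bin/propane; the file names are hypothetical:

require "propane"

# Same effect as: propane --log build.log calc.propane calc_parser.d
status = Propane::CLI.run(["--log", "build.log", "calc.propane", "calc_parser.d"])
exit status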

84
lib/propane/code_point_range.rb Normal file

@@ -0,0 +1,84 @@
class Propane
class CodePointRange
MAX_CODE_POINT = 0xFFFFFFFF
attr_reader :first
attr_reader :last
include Comparable
# Build a CodePointRange
def initialize(first, last = nil)
@first = first.ord
if last
@last = last.ord
if @last < @first
raise "Invalid CodePointRange: last code point must be >= first code point"
end
else
@last = @first
end
end
def <=>(other)
if self.first != other.first
@first <=> other.first
else
@last <=> other.last
end
end
def include?(v)
if v.is_a?(CodePointRange)
@first <= v.first && v.last <= @last
else
@first <= v && v <= @last
end
end
def size
@last - @first + 1
end
class << self
def invert_ranges(code_point_ranges)
new_ranges = []
last_cp = -1
code_point_ranges.sort.each do |code_point_range|
if code_point_range.first > (last_cp + 1)
new_ranges << CodePointRange.new(last_cp + 1, code_point_range.first - 1)
last_cp = code_point_range.last
else
last_cp = [last_cp, code_point_range.last].max
end
end
if last_cp < MAX_CODE_POINT
new_ranges << CodePointRange.new(last_cp + 1, MAX_CODE_POINT)
end
new_ranges
end
def first_subrange(code_point_ranges)
code_point_ranges.sort.reduce do |result, code_point_range|
if code_point_range.include?(result.first)
if code_point_range.last < result.last
code_point_range
else
result
end
else
if code_point_range.first <= result.last
CodePointRange.new(result.first, code_point_range.first - 1)
else
result
end
end
end
end
end
end
end
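
Two worked examples of the class-level helpers, with results traced from the code above (illustration only):

# invert_ranges complements a set of ranges over [0, MAX_CODE_POINT]:
Propane::CodePointRange.invert_ranges([Propane::CodePointRange.new("A", "Z")])
# => ranges covering 0..64 and 91..0xFFFFFFFF

# first_subrange returns the leading subrange up to where the next range starts:
ranges = [Propane::CodePointRange.new("a", "z"),   # 97..122
          Propane::CodePointRange.new("d", "i")]   # 100..105
Propane::CodePointRange.first_subrange(ranges)     # => the range 97..99, i.e. "a".."c"

first_subrange is what Lexer::DFA#process_nfa_state_set uses to carve overlapping NFA transitions into disjoint code point ranges.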

61
lib/propane/fa.rb Normal file

@@ -0,0 +1,61 @@
class Propane
class FA
attr_reader :start_state
def initialize
@start_state = State.new
end
def to_s
chr = lambda do |value|
if value < 32 || value > 127
"{#{value}}"
else
value.chr
end
end
rv = ""
states = enumerate
states.each do |state, id|
accepts_s = state.accepts ? " #{state.accepts}" : ""
rv += "#{id}#{accepts_s}:\n"
state.transitions.each do |transition|
if transition.nil?
range_s = "nil"
else
range_s = chr[transition.code_point_range.first]
if transition.code_point_range.size > 1
range_s += "-" + chr[transition.code_point_range.last]
end
end
accepts_s = transition.destination.accepts ? " #{transition.destination.accepts}" : ""
rv += " #{range_s} => #{states[transition.destination]}#{accepts_s}\n"
end
end
rv
end
def enumerate
@_enumerated ||=
begin
id = 0
states = {}
visit = lambda do |state|
unless states.include?(state)
states[state] = id
id += 1
state.transitions.each do |transition|
visit[transition.destination]
end
end
end
visit[@start_state]
states
end
end
end
end

51
lib/propane/fa/state.rb Normal file

@@ -0,0 +1,51 @@
class Propane
class FA
class State
attr_accessor :accepts
attr_reader :transitions
def initialize
@transitions = []
end
def add_transition(code_point_range, destination)
@transitions << Transition.new(code_point_range, destination)
end
# Determine the set of states (including this state) that can be reached
# via nil transitions from this state.
#
# @return [Set<NFA::State>]
# Set of states.
def nil_transition_states
states = Set[self]
analyze_state = lambda do |state|
state.nil_transitions.each do |transition|
unless states.include?(transition.destination)
states << transition.destination
analyze_state[transition.destination]
end
end
end
analyze_state[self]
states
end
def nil_transitions
@transitions.select do |transition|
transition.nil?
end
end
def cp_transitions
@transitions.reject do |transition|
transition.nil?
end
end
end
end
end

23
lib/propane/fa/state/transition.rb Normal file

@@ -0,0 +1,23 @@
class Propane
class FA
class State
class Transition
attr_reader :code_point_range
attr_reader :destination
def initialize(code_point_range, destination)
@code_point_range = code_point_range
@destination = destination
end
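# A transition whose code point range is nil is an epsilon transition;
# overriding nil? below lets State#nil_transitions and FA#to_s test for
# that directly (note this shadows Object#nil?).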
def nil?
@code_point_range.nil?
end
end
end
end
end

13
lib/propane/lexer.rb Normal file

@@ -0,0 +1,13 @@
class Propane
class Lexer
# @return [DFA]
# Lexer DFA.
attr_accessor :dfa
def initialize(tokens)
@dfa = DFA.new(tokens)
end
end
end

118
lib/propane/lexer/dfa.rb Normal file

@@ -0,0 +1,118 @@
class Propane
class Lexer
class DFA < FA
def initialize(tokens)
super()
start_nfa = Regex::NFA.new
tokens.each do |name, token|
start_nfa.start_state.add_transition(nil, token.nfa.start_state)
end
@nfa_state_sets = {}
@states = []
@to_process = Set.new
nil_transition_states = start_nfa.start_state.nil_transition_states
register_nfa_state_set(nil_transition_states)
while @to_process.size > 0
state_set = @to_process.first
@to_process.delete(state_set)
process_nfa_state_set(state_set)
end
@start_state = @states[0]
end
def build_tables
transition_table = []
state_table = []
states = enumerate
states.each do |state, id|
accepts =
if state.accepts.nil?
TOKEN_NONE
elsif state.accepts.name
state.accepts.id
else
TOKEN_DROP
end
state_table << {
transition_table_index: transition_table.size,
n_transitions: state.transitions.size,
accepts: accepts,
}
state.transitions.each do |transition|
transition_table << {
first: transition.code_point_range.first,
last: transition.code_point_range.last,
destination: states[transition.destination],
}
end
end
[transition_table, state_table]
end
private
def register_nfa_state_set(nfa_state_set)
unless @nfa_state_sets.include?(nfa_state_set)
state_id = @states.size
@nfa_state_sets[nfa_state_set] = state_id
@states << State.new
@to_process << nfa_state_set
end
end
def process_nfa_state_set(nfa_state_set)
state_id = @nfa_state_sets[nfa_state_set]
state = @states[state_id]
if state_id > 0
nfa_state_set.each do |nfa_state|
if nfa_state.accepts
if state.accepts
if nfa_state.accepts.id < state.accepts.id
state.accepts = nfa_state.accepts
end
else
state.accepts = nfa_state.accepts
end
end
end
end
transitions = transitions_for(nfa_state_set)
while transitions.size > 0
subrange = CodePointRange.first_subrange(transitions.map(&:code_point_range))
dest_nfa_states = transitions.reduce(Set.new) do |result, transition|
if transition.code_point_range.include?(subrange)
result << transition.destination
end
result
end
dest_nfa_states = dest_nfa_states.reduce(Set.new) do |result, dest_nfa_state|
result + dest_nfa_state.nil_transition_states
end
register_nfa_state_set(dest_nfa_states)
dest_state = @states[@nfa_state_sets[dest_nfa_states]]
state.add_transition(subrange, dest_state)
transitions.delete_if do |transition|
transition.code_point_range.last <= subrange.last
end
transitions.map! do |transition|
if transition.code_point_range.first <= subrange.last
Regex::NFA::State::Transition.new(CodePointRange.new(subrange.last + 1, transition.code_point_range.last), transition.destination)
else
transition
end
end
end
end
def transitions_for(nfa_state_set)
nfa_state_set.reduce([]) do |result, state|
result + state.cp_transitions
end
end
end
end
end
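
The two tables returned by build_tables are what assets/parser.d.erb interpolates into the generated D transitions and states arrays. A sketch of their shapes, with hypothetical values (tokens stands for a token hash as built by Propane#initialize):

transition_table, state_table = Propane::Lexer.new(tokens).dfa.build_tables
state_table[0]       # => { transition_table_index: 0, n_transitions: 2, accepts: Propane::TOKEN_NONE }
transition_table[0]  # => { first: 48, last: 57, destination: 1 }   # "0".."9" leads to state 1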

84
lib/propane/parser.rb Normal file

@@ -0,0 +1,84 @@
class Propane
class Parser
def initialize(tokens, rules)
@token_eof = Token.new("$", nil, TOKEN_EOF)
@item_sets = []
@item_sets_set = {}
start_items = rules["Start"].patterns.map do |pattern|
pattern.components << @token_eof
Item.new(pattern, 0)
end
eval_item_sets = Set.new
eval_item_sets << ItemSet.new(start_items)
while eval_item_sets.size > 0
this_eval_item_sets = eval_item_sets
eval_item_sets = Set.new
this_eval_item_sets.each do |item_set|
unless @item_sets_set.include?(item_set)
item_set.id = @item_sets.size
@item_sets << item_set
@item_sets_set[item_set] = item_set
item_set.follow_symbols.each do |follow_symbol|
unless follow_symbol == @token_eof
follow_set = item_set.build_follow_set(follow_symbol)
eval_item_sets << follow_set
end
end
end
end
end
@item_sets.each do |item_set|
process_item_set(item_set)
puts "Item set #{item_set.id}:"
ids = item_set.in_sets.map(&:id)
if ids.size > 0
puts " (in from #{ids.join(", ")})"
end
puts item_set
item_set.follow_item_set.each do |follow_symbol, follow_item_set|
puts " #{follow_symbol.name} => #{follow_item_set.id}"
end
puts
end
end
def build_tables
shift_table = []
state_table = []
@item_sets.each do |item_set|
shift_entries = item_set.follow_symbols.select do |follow_symbol|
follow_symbol.is_a?(Token)
end.map do |follow_symbol|
{
token_id: follow_symbol.id,
state_id: item_set.follow_item_set[follow_symbol].id,
}
end
state_table << {
shift_index: shift_table.size,
n_shifts: shift_entries.size,
}
shift_table += shift_entries
end
[state_table, shift_table]
end
private
def process_item_set(item_set)
item_set.follow_symbols.each do |follow_symbol|
unless follow_symbol == @token_eof
follow_set = @item_sets_set[item_set.build_follow_set(follow_symbol)]
item_set.follow_item_set[follow_symbol] = follow_set
follow_set.in_sets << item_set
end
end
end
end
end
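
The item sets built here form the LR(0) canonical collection: each closed item set becomes one parser state, and build_tables records, per state, which tokens can be shifted and the destination state. A sketch of the output shape, with hypothetical values:

state_table, shift_table = Propane::Parser.new(tokens, rules).build_tables
state_table[0]  # => { shift_index: 0, n_shifts: 1 }
shift_table[0]  # => { token_id: 1, state_id: 2 }   # token 1 in state 0 shifts to state 2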

69
lib/propane/parser/item.rb Normal file

@@ -0,0 +1,69 @@
class Propane
class Parser
class Item
attr_reader :pattern
attr_reader :position
def initialize(pattern, position)
@pattern = pattern
@position = position
end
def next_component
@pattern.components[@position]
end
def hash
[@pattern, @position].hash
end
def ==(other)
@pattern == other.pattern && @position == other.position
end
def eql?(other)
self == other
end
def closed_items
if @pattern.components[@position].is_a?(Rule)
@pattern.components[@position].patterns.map do |pattern|
Item.new(pattern, 0)
end
else
[]
end
end
def follow_symbol
@pattern.components[@position]
end
def followed_by?(symbol)
follow_symbol == symbol
end
def next_position
Item.new(@pattern, @position + 1)
end
def to_s
parts = []
@pattern.components.each_with_index do |symbol, index|
if @position == index
parts << "."
end
parts << symbol.name
end
if @position == @pattern.components.size
parts << "."
end
"#{@pattern.rule.name} -> #{parts.join(" ")}"
end
end
end
end

76
lib/propane/parser/item_set.rb Normal file

@@ -0,0 +1,76 @@
class Propane
class Parser
class ItemSet
attr_reader :items
attr_accessor :id
# @return [Hash]
# Maps a follow symbol to its item set.
attr_reader :follow_item_set
# @return [Set]
# Item sets leading to this item set.
attr_reader :in_sets
def initialize(items)
@items = Set.new(items)
@follow_item_set = {}
@in_sets = Set.new
close!
end
def follow_symbols
Set.new(@items.map(&:follow_symbol).compact)
end
def build_follow_set(symbol)
ItemSet.new(items_followed_by(symbol).map(&:next_position))
end
def hash
@items.hash
end
def ==(other)
@items.eql?(other.items)
end
def eql?(other)
self == other
end
def to_s
@items.map(&:to_s).join("\n")
end
private
def close!
eval_items = @items
while eval_items.size > 0
this_eval_items = eval_items
eval_items = Set.new
this_eval_items.each do |item|
item.closed_items.each do |new_item|
unless @items.include?(new_item)
eval_items << new_item
end
end
end
@items += eval_items
end
end
def items_followed_by(symbol)
@items.select do |item|
item.followed_by?(symbol)
end
end
end
end
end
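
close! computes the standard LR closure: while an item's dot sits in front of a Rule, that rule's patterns are added as fresh position-0 items, repeating until a fixed point. A worked example as comments, for a hypothetical grammar with Start -> E $ and E -> n:

# seed = Item.new(start_pattern, 0)   # Start -> . E $
# set  = ItemSet.new([seed])          # close! also adds E -> . n
# set.follow_symbols                  # => Set{E, n}
# set.build_follow_set(E)             # => the item set containing Start -> E . $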

162
lib/propane/regex.rb Normal file

@@ -0,0 +1,162 @@
class Propane
class Regex
attr_reader :unit
attr_reader :nfa
def initialize(pattern)
@pattern = pattern.dup
@unit = parse_alternates
@nfa = @unit.to_nfa
if @pattern != ""
raise Error.new(%[Unexpected "#{@pattern}" in pattern])
end
end
private
def parse_alternates
au = AlternatesUnit.new
while @pattern != ""
c = @pattern[0]
return au if c == ")"
@pattern.slice!(0)
case c
when "["
au << parse_character_class
when "("
au << parse_group
when "*", "+", "?", "{"
if last_unit = au.last_unit
case c
when "*"
min_count, max_count = 0, nil
when "+"
min_count, max_count = 1, nil
when "?"
min_count, max_count = 0, 1
when "{"
min_count, max_count = parse_curly_count
end
mu = MultiplicityUnit.new(last_unit, min_count, max_count)
au.replace_last!(mu)
else
raise Error.new("#{c} follows nothing")
end
when "|"
au.new_alternate!
when "\\"
au << parse_backslash
when "."
au << period_character_class
else
au << CharacterRangeUnit.new(c)
end
end
au
end
def parse_group
au = parse_alternates
if @pattern[0] != ")"
raise Error.new("Unterminated group in pattern")
end
@pattern.slice!(0)
au
end
def parse_character_class
ccu = CharacterClassUnit.new
index = 0
loop do
if @pattern == ""
raise Error.new("Unterminated character class")
end
c = @pattern.slice!(0)
if c == "]"
break
elsif c == "^" && index == 0
ccu.negate = true
elsif c == "-" && (ccu.size == 0 || @pattern[0] == "]")
ccu << CharacterRangeUnit.new(c)
elsif c == "\\"
ccu << parse_backslash
elsif c == "-" && @pattern[0] != "]"
begin_cu = ccu.last_unit
unless begin_cu.is_a?(CharacterRangeUnit) && begin_cu.code_point_range.size == 1
raise Error.new("Character range must be between single characters")
end
if @pattern[0] == "\\"
@pattern.slice!(0)
end_cu = parse_backslash
unless end_cu.is_a?(CharacterRangeUnit) && end_cu.code_point_range.size == 1
raise Error.new("Character range must be between single characters")
end
max_code_point = end_cu.code_point
else
max_code_point = @pattern[0].ord
@pattern.slice!(0)
end
cru = CharacterRangeUnit.new(begin_cu.first, max_code_point)
ccu.replace_last!(cru)
else
ccu << CharacterRangeUnit.new(c)
end
index += 1
end
ccu
end
def parse_curly_count
if @pattern =~ /^(\d+)(?:(,)(\d*))?\}(.*)$/
min_count, comma, max_count, pattern = $1, $2, $3, $4
min_count = min_count.to_i
if comma.to_s == ""
max_count = min_count
elsif max_count.to_s != ""
max_count = max_count.to_i
if max_count < min_count
raise Error.new("Maximum repetition count cannot be less than minimum repetition count")
end
else
max_count = nil
end
@pattern = pattern
[min_count, max_count]
else
raise Error.new("Unexpected match count at #{@pattern}")
end
end
def parse_backslash
if @pattern == ""
raise Error.new("Error: unfollowed \\")
else
c = @pattern.slice!(0)
case c
when "d"
CharacterRangeUnit.new("0", "9")
when "s"
ccu = CharacterClassUnit.new
ccu << CharacterRangeUnit.new(" ")
ccu << CharacterRangeUnit.new("\t")
ccu << CharacterRangeUnit.new("\r")
ccu << CharacterRangeUnit.new("\n")
ccu << CharacterRangeUnit.new("\f")
ccu << CharacterRangeUnit.new("\v")
ccu
else
CharacterRangeUnit.new(c)
end
end
end
def period_character_class
ccu = CharacterClassUnit.new
ccu << CharacterRangeUnit.new(0, "\n".ord - 1)
ccu << CharacterRangeUnit.new("\n".ord + 1, 0xFFFFFFFF)
ccu
end
end
end
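
A sketch of what the recursive-descent parser above produces for one pattern; the structure is traced from parse_alternates and is illustrative only:

r = Propane::Regex.new("[a-z_][a-z0-9_]*")
r.unit   # AlternatesUnit holding one SequenceUnit: a CharacterClassUnit for [a-z_]
         # followed by a MultiplicityUnit (min_count 0, max_count nil) wrapping [a-z0-9_]
r.nfa    # the Regex::NFA whose end state Token#initialize later marks as accepting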

26
lib/propane/regex/nfa.rb Normal file

@@ -0,0 +1,26 @@
class Propane
class Regex
class NFA < FA
attr_reader :end_state
def initialize
super()
@end_state = State.new
end
class << self
def empty
nfa = NFA.new
nfa.start_state.add_transition(nil, nfa.end_state)
nfa
end
end
end
end
end

172
lib/propane/regex/unit.rb Normal file

@@ -0,0 +1,172 @@
class Propane
class Regex
class Unit
end
class SequenceUnit < Unit
attr_accessor :units
def initialize
@units = []
end
def method_missing(*args)
@units.__send__(*args)
end
def to_nfa
if @units.empty?
NFA.empty
else
nfa = NFA.new
unit_nfas = @units.map do |unit|
unit.to_nfa
end
nfa.start_state.add_transition(nil, unit_nfas[0].start_state)
unit_nfas.reduce do |prev_nfa, next_nfa|
prev_nfa.end_state.add_transition(nil, next_nfa.start_state)
next_nfa
end.end_state.add_transition(nil, nfa.end_state)
nfa
end
end
end
class AlternatesUnit < Unit
attr_accessor :alternates
def initialize
@alternates = []
new_alternate!
end
def new_alternate!
@alternates << SequenceUnit.new
end
def <<(unit)
@alternates[-1] << unit
end
def last_unit
@alternates[-1][-1]
end
def replace_last!(new_unit)
@alternates[-1][-1] = new_unit
end
def to_nfa
if @alternates.size == 0
NFA.empty
elsif @alternates.size == 1
@alternates[0].to_nfa
else
nfa = NFA.new
alternate_nfas = @alternates.map do |alternate|
alternate.to_nfa
end
alternate_nfas.each do |alternate_nfa|
nfa.start_state.add_transition(nil, alternate_nfa.start_state)
alternate_nfa.end_state.add_transition(nil, nfa.end_state)
end
nfa
end
end
end
class CharacterRangeUnit < Unit
attr_reader :code_point_range
def initialize(c1, c2 = nil)
@code_point_range = CodePointRange.new(c1, c2)
end
def first
@code_point_range.first
end
def last
@code_point_range.last
end
def to_nfa
nfa = NFA.new
nfa.start_state.add_transition(@code_point_range, nfa.end_state)
nfa
end
end
class CharacterClassUnit < Unit
attr_accessor :units
attr_accessor :negate
def initialize
@units = []
@negate = false
end
def method_missing(*args)
@units.__send__(*args)
end
def <<(thing)
if thing.is_a?(CharacterClassUnit)
thing.each do |ccu_unit|
@units << ccu_unit
end
else
@units << thing
end
end
def last_unit
@units[-1]
end
def replace_last!(new_unit)
@units[-1] = new_unit
end
def to_nfa
nfa = NFA.new
if @units.empty?
nfa.start_state.add_transition(nil, nfa.end_state)
else
code_point_ranges = @units.map(&:code_point_range)
if @negate
code_point_ranges = CodePointRange.invert_ranges(code_point_ranges)
end
code_point_ranges.each do |code_point_range|
nfa.start_state.add_transition(code_point_range, nfa.end_state)
end
end
nfa
end
end
class MultiplicityUnit < Unit
attr_accessor :unit
attr_accessor :min_count
attr_accessor :max_count
def initialize(unit, min_count, max_count)
@unit = unit
@min_count = min_count
@max_count = max_count
end
def to_nfa
nfa = NFA.new
last_state = nfa.start_state
unit_nfa = nil
# Chain the required repetitions together with epsilon transitions.
@min_count.times do
unit_nfa = @unit.to_nfa
last_state.add_transition(nil, unit_nfa.start_state)
last_state = unit_nfa.end_state
end
last_state.add_transition(nil, nfa.end_state)
if @max_count.nil?
# Unbounded maximum: loop the final unit NFA back on itself.
if @min_count == 0
unit_nfa = @unit.to_nfa
last_state.add_transition(nil, unit_nfa.start_state)
end
unit_nfa.end_state.add_transition(nil, unit_nfa.start_state)
unit_nfa.end_state.add_transition(nil, nfa.end_state)
else
# Bounded maximum: append the optional repetitions, each of which can
# epsilon-exit directly to the end state.
(@max_count - @min_count).times do
unit_nfa = @unit.to_nfa
last_state.add_transition(nil, unit_nfa.start_state)
unit_nfa.end_state.add_transition(nil, nfa.end_state)
last_state = unit_nfa.end_state
end
end
nfa
end
end
end
end
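The units implement a Thompson-style construction: SequenceUnit chains sub-NFAs with epsilon transitions, AlternatesUnit fans out from a shared start state, and MultiplicityUnit unrolls the required copies and loops or appends the optional ones. A hedged sketch building the unit tree for /ab|c/ by hand, mirroring what the parser produces:

    alt = Propane::Regex::AlternatesUnit.new   # starts with one empty SequenceUnit
    alt << Propane::Regex::CharacterRangeUnit.new("a")
    alt << Propane::Regex::CharacterRangeUnit.new("b")
    alt.new_alternate!                         # begin the "c" alternative
    alt << Propane::Regex::CharacterRangeUnit.new("c")
    nfa = alt.to_nfa  # epsilon-fans out to the "ab" chain and the "c" branch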

39
lib/propane/rule.rb Normal file
View File

@ -0,0 +1,39 @@
class Propane
class Rule
class Pattern
attr_reader :rule
attr_reader :components
attr_reader :code
def initialize(rule, components, code)
@rule = rule
@components = components
@code = code
end
end
attr_reader :id
attr_reader :name
attr_reader :patterns
def initialize(name, id)
@name = name
@id = id
@patterns = []
end
def add_pattern(components, code)
@patterns << Pattern.new(self, components, code)
end
end
end
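A hedged sketch of the Rule API; components are shown here as raw strings, though the generator may resolve them to Token/Rule references:

    rule = Propane::Rule.new("E", 3)               # name, rule ID
    rule.add_pattern(%w[E plus B], "<user code>")  # one alternative: E -> E plus B
    rule.patterns.size            # => 1
    rule.patterns[0].components   # => ["E", "plus", "B"]
    rule.patterns[0].rule == rule # => true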

42
lib/propane/token.rb Normal file
View File

@ -0,0 +1,42 @@
class Propane
class Token
# @return [String]
# Token name.
attr_reader :name
# @return [String]
# Token pattern.
attr_reader :pattern
# @return [Integer]
# Token ID.
attr_reader :id
# @return [Regex::NFA]
# Regex NFA for matching the token.
attr_reader :nfa
def initialize(name, pattern, id)
@name = name
@pattern = pattern
@id = id
unless pattern.nil?
regex = Regex.new(pattern)
regex.nfa.end_state.accepts = self
@nfa = regex.nfa
end
end
def c_name
@name.upcase
end
def to_s
@name
end
end
end
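A hedged usage sketch of Token: the NFA's end state records which token it accepts, which the lexer DFA later consults; a pattern-less token is an assumption here, shown only to illustrate that no NFA is built without a pattern:

    token = Propane::Token.new("int", "\\d+", 0)
    token.c_name                # => "INT"
    token.to_s                  # => "int"
    token.nfa.end_state.accepts # => token
    Propane::Token.new("eof", nil, 1).nfa  # => nil (no pattern, no NFA)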

3
lib/propane/version.rb Normal file
View File

@ -0,0 +1,3 @@
class Propane
VERSION = "0.1.0"
end

2
propane.sh Executable file
View File

@ -0,0 +1,2 @@
#!/bin/sh
exec bundle exec ruby -Ilib bin/propane "$@"

1
refptr

@ -1 +0,0 @@
Subproject commit e2c7e88824c18eb3b218f6308db0194edb422eef

View File

@ -0,0 +1,87 @@
class Propane
describe CodePointRange do
describe "#<=>" do
it "sorts ranges" do
arr = [
CodePointRange.new(100,102),
CodePointRange.new(65, 68),
CodePointRange.new(65, 65),
CodePointRange.new(100, 100),
CodePointRange.new(68, 70),
]
arr.sort!
expect(arr[0]).to eq CodePointRange.new(65, 65)
expect(arr[1]).to eq CodePointRange.new(65, 68)
expect(arr[2]).to eq CodePointRange.new(68, 70)
expect(arr[3]).to eq CodePointRange.new(100, 100)
expect(arr[4]).to eq CodePointRange.new(100, 102)
end
end
describe "#include?" do
it "returns whether the code point is included in the range" do
expect(CodePointRange.new(100).include?(100)).to be_truthy
expect(CodePointRange.new(100, 100).include?(99)).to be_falsey
expect(CodePointRange.new(100, 100).include?(101)).to be_falsey
expect(CodePointRange.new(100, 120).include?(99)).to be_falsey
expect(CodePointRange.new(100, 120).include?(100)).to be_truthy
expect(CodePointRange.new(100, 120).include?(110)).to be_truthy
expect(CodePointRange.new(100, 120).include?(120)).to be_truthy
expect(CodePointRange.new(100, 120).include?(121)).to be_falsey
end
it "returns whether the range is included in the range" do
expect(CodePointRange.new(100).include?(CodePointRange.new(100))).to be_truthy
expect(CodePointRange.new(100, 100).include?(CodePointRange.new(99))).to be_falsey
expect(CodePointRange.new(100, 100).include?(CodePointRange.new(99, 100))).to be_falsey
expect(CodePointRange.new(100, 120).include?(CodePointRange.new(90, 110))).to be_falsey
expect(CodePointRange.new(100, 120).include?(CodePointRange.new(110, 130))).to be_falsey
expect(CodePointRange.new(100, 120).include?(CodePointRange.new(100, 120))).to be_truthy
expect(CodePointRange.new(100, 120).include?(CodePointRange.new(100, 110))).to be_truthy
expect(CodePointRange.new(100, 120).include?(CodePointRange.new(110, 120))).to be_truthy
expect(CodePointRange.new(100, 120).include?(CodePointRange.new(102, 118))).to be_truthy
end
end
describe ".invert_ranges" do
it "inverts ranges" do
expect(CodePointRange.invert_ranges(
[CodePointRange.new(60, 90),
CodePointRange.new(80, 85),
CodePointRange.new(80, 100),
CodePointRange.new(101),
CodePointRange.new(200, 300)])).to eq [
CodePointRange.new(0, 59),
CodePointRange.new(102, 199),
CodePointRange.new(301, 0xFFFFFFFF)]
expect(CodePointRange.invert_ranges(
[CodePointRange.new(0, 500),
CodePointRange.new(7000, 0xFFFFFFFF)])).to eq [
CodePointRange.new(501, 6999)]
end
end
describe ".first_subrange" do
it "returns the first subrange to split" do
expect(CodePointRange.first_subrange(
[CodePointRange.new(65, 90),
CodePointRange.new(66, 66),
CodePointRange.new(80, 90)])).to eq CodePointRange.new(65)
expect(CodePointRange.first_subrange(
[CodePointRange.new(65, 90)])).to eq CodePointRange.new(65, 90)
expect(CodePointRange.first_subrange(
[CodePointRange.new(65, 90),
CodePointRange.new(80, 90)])).to eq CodePointRange.new(65, 79)
expect(CodePointRange.first_subrange(
[CodePointRange.new(65, 90),
CodePointRange.new(65, 100),
CodePointRange.new(65, 95)])).to eq CodePointRange.new(65, 90)
expect(CodePointRange.first_subrange(
[CodePointRange.new(100, 120),
CodePointRange.new(70, 90)])).to eq CodePointRange.new(70, 90)
end
end
end
end
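A hedged reference sketch of the gap computation that .invert_ranges exhibits above (illustration only, not the library's implementation): walk the sorted ranges and emit the uncovered gaps across the full code point space, with results simplified to [first, last] pairs:

    def invert(ranges, max = 0xFFFFFFFF)
      gaps = []
      nxt = 0
      ranges.sort.each do |r|
        gaps << [nxt, r.first - 1] if r.first > nxt
        nxt = [nxt, r.last + 1].max
      end
      gaps << [nxt, max] if nxt <= max
      gaps
    end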

View File

@ -0,0 +1,121 @@
class TestLexer
def initialize(token_dfa)
@token_dfa = token_dfa
end
def lex(input)
input_chars = input.chars
output = []
while lexed_token = lex_token(input_chars)
output << lexed_token
input_chars.slice!(0, lexed_token[1].size)
end
unless input_chars.empty?
raise "Unmatched input #{input_chars.join(" ")}"
end
output
end
def lex_token(input_chars)
return nil if input_chars.empty?
s = ""
current_state = @token_dfa.start_state
last_accepts = nil
last_s = nil
input_chars.each do |input_char|
if next_state = transition(current_state, input_char)
s += input_char
current_state = next_state
if current_state.accepts
last_accepts = current_state.accepts
last_s = s
end
else
break
end
end
if last_accepts
[last_accepts.name, last_s]
end
end
def transition(state, input_char)
state.transitions.each do |transition|
if transition.code_point_range.include?(input_char.ord)
return transition.destination
end
end
nil
end
end
def run(grammar, input)
propane = Propane.new(grammar)
token_dfa = Propane::Lexer::DFA.new(propane.instance_variable_get(:@tokens))
test_lexer = TestLexer.new(token_dfa)
test_lexer.lex(input)
end
describe Propane::Lexer::DFA do
it "lexes a simple token" do
expect(run(<<EOF, "foo")).to eq [["foo", "foo"]]
token foo
EOF
end
it "lexes two tokens" do
expected = [
["foo", "foo"],
["bar", "bar"],
]
expect(run(<<EOF, "foobar")).to eq expected
token foo
token bar
EOF
end
it "lexes the longer of multiple options" do
expected = [
["identifier", "foobar"],
]
expect(run(<<EOF, "foobar")).to eq expected
token foo
token bar
token identifier [a-z]+
EOF
expected = [
["plusplus", "++"],
["plus", "+"],
]
expect(run(<<EOF, "+++")).to eq expected
token plus \\+
token plusplus \\+\\+
EOF
end
it "lexes whitespace" do
expected = [
["foo", "foo"],
["WS", " \t"],
["bar", "bar"],
]
expect(run(<<EOF, "foo \tbar")).to eq expected
token foo
token bar
token WS \\s+
EOF
end
it "allows dropping a matched pattern" do
expected = [
["foo", "foo"],
[nil, " \t"],
["bar", "bar"],
]
expect(run(<<EOF, "foo \tbar")).to eq expected
token foo
token bar
drop \\s+
EOF
end
end

View File

@ -0,0 +1,19 @@
class Propane
class Parser
describe Item do
it "operates properly with a set" do
rule = Object.new
item1 = Item.new(rule, 2)
item2 = Item.new(rule, 2)
expect(item1).to eq item2
expect(item1.eql?(item2)).to be_truthy
set = Set.new([item1, item2])
expect(set.size).to eq 1
end
end
end
end
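For the Set to collapse the two Items above, Item must define value equality and hashing over its rule and position. A hypothetical sketch of that contract, assuming the second constructor argument is the dot position (the real class presumably carries more state):

    class Item
      attr_reader :rule, :position
      def initialize(rule, position)
        @rule = rule
        @position = position
      end
      def ==(other)
        rule.equal?(other.rule) && position == other.position
      end
      alias eql? ==
      def hash
        [rule.object_id, position].hash
      end
    end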

333
spec/propane/regex_spec.rb Normal file
View File

@ -0,0 +1,333 @@
class Propane
RSpec.describe Regex do
it "parses an empty expression" do
regex = Regex.new("")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0].size).to eq 0
end
it "parses a single character unit expression" do
regex = Regex.new("a")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
end
it "parses a group with a single character unit expression" do
regex = Regex.new("(a)")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::AlternatesUnit
alt_unit = seq_unit[0]
expect(alt_unit.alternates.size).to eq 1
expect(alt_unit.alternates[0]).to be_a Regex::SequenceUnit
expect(alt_unit.alternates[0][0]).to be_a Regex::CharacterRangeUnit
end
it "parses a *" do
regex = Regex.new("a*")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::MultiplicityUnit
m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 0
expect(m_unit.max_count).to be_nil
expect(m_unit.unit).to be_a Regex::CharacterRangeUnit
end
it "parses a +" do
regex = Regex.new("a+")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::MultiplicityUnit
m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 1
expect(m_unit.max_count).to be_nil
expect(m_unit.unit).to be_a Regex::CharacterRangeUnit
end
it "parses a ?" do
regex = Regex.new("a?")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::MultiplicityUnit
m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 0
expect(m_unit.max_count).to eq 1
expect(m_unit.unit).to be_a Regex::CharacterRangeUnit
end
it "parses a multiplicity count" do
regex = Regex.new("a{5}")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::MultiplicityUnit
m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 5
expect(m_unit.max_count).to eq 5
expect(m_unit.unit).to be_a Regex::CharacterRangeUnit
end
it "parses a minimum-only multiplicity count" do
regex = Regex.new("a{5,}")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::MultiplicityUnit
m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 5
expect(m_unit.max_count).to be_nil
expect(m_unit.unit).to be_a Regex::CharacterRangeUnit
end
it "parses a minimum and maximum multiplicity count" do
regex = Regex.new("a{5,8}")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::MultiplicityUnit
m_unit = seq_unit[0]
expect(m_unit.min_count).to eq 5
expect(m_unit.max_count).to eq 8
expect(m_unit.unit).to be_a Regex::CharacterRangeUnit
expect(m_unit.unit.first).to eq "a".ord
end
it "parses an escaped *" do
regex = Regex.new("a\\*")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[0].first).to eq "a".ord
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[1].first).to eq "*".ord
end
it "parses an escaped +" do
regex = Regex.new("a\\+")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[0].first).to eq "a".ord
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[1].first).to eq "+".ord
end
it "parses an escaped \\" do
regex = Regex.new("\\\\d")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 2
expect(seq_unit[0]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[0].first).to eq "\\".ord
expect(seq_unit[1]).to be_a Regex::CharacterRangeUnit
expect(seq_unit[1].first).to eq "d".ord
end
it "parses a character class" do
regex = Regex.new("[a-z_]")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::CharacterClassUnit
ccu = seq_unit[0]
expect(ccu.negate).to be_falsey
expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].first).to eq "a".ord
expect(ccu[0].last).to eq "z".ord
expect(ccu[1]).to be_a Regex::CharacterRangeUnit
expect(ccu[1].first).to eq "_".ord
end
it "parses a negated character class" do
regex = Regex.new("[^xyz]")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::CharacterClassUnit
ccu = seq_unit[0]
expect(ccu.negate).to be_truthy
expect(ccu.size).to eq 3
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].first).to eq "x".ord
end
it "parses - as a plain character at beginning of a character class" do
regex = Regex.new("[-9]")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::CharacterClassUnit
ccu = seq_unit[0]
expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].first).to eq "-".ord
end
it "parses - as a plain character at end of a character class" do
regex = Regex.new("[0-]")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::CharacterClassUnit
ccu = seq_unit[0]
expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].first).to eq "0".ord
expect(ccu[1]).to be_a Regex::CharacterRangeUnit
expect(ccu[1].first).to eq "-".ord
end
it "parses - as a plain character at beginning of a negated character class" do
regex = Regex.new("[^-9]")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::CharacterClassUnit
ccu = seq_unit[0]
expect(ccu.negate).to be_truthy
expect(ccu.size).to eq 2
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].first).to eq "-".ord
end
it "parses . as a plain character in a character class" do
regex = Regex.new("[.]")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::CharacterClassUnit
ccu = seq_unit[0]
expect(ccu.negate).to be_falsey
expect(ccu.size).to eq 1
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].first).to eq ".".ord
end
it "parses - as a plain character when escaped in middle of character class" do
regex = Regex.new("[0\\-9]")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
seq_unit = regex.unit.alternates[0]
expect(seq_unit.size).to eq 1
expect(seq_unit[0]).to be_a Regex::CharacterClassUnit
ccu = seq_unit[0]
expect(ccu.negate).to be_falsey
expect(ccu.size).to eq 3
expect(ccu[0]).to be_a Regex::CharacterRangeUnit
expect(ccu[0].first).to eq "0".ord
expect(ccu[1]).to be_a Regex::CharacterRangeUnit
expect(ccu[1].first).to eq "-".ord
expect(ccu[2]).to be_a Regex::CharacterRangeUnit
expect(ccu[2].first).to eq "9".ord
end
it "parses alternates" do
regex = Regex.new("ab|c")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 2
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[1]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[0].size).to eq 2
expect(regex.unit.alternates[1].size).to eq 1
end
it "parses a ." do
regex = Regex.new("a.b")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 1
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[0][0]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[0][1]).to be_a Regex::CharacterClassUnit
expect(regex.unit.alternates[0][1].units.size).to eq 2
expect(regex.unit.alternates[0][2]).to be_a Regex::CharacterRangeUnit
end
it "parses something complex" do
regex = Regex.new("(a|)*|[^^]|\\|v|[x-y]+")
expect(regex.unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates.size).to eq 4
expect(regex.unit.alternates[0]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[0].size).to eq 1
expect(regex.unit.alternates[0][0]).to be_a Regex::MultiplicityUnit
expect(regex.unit.alternates[0][0].min_count).to eq 0
expect(regex.unit.alternates[0][0].max_count).to be_nil
expect(regex.unit.alternates[0][0].unit).to be_a Regex::AlternatesUnit
expect(regex.unit.alternates[0][0].unit.alternates.size).to eq 2
expect(regex.unit.alternates[0][0].unit.alternates[0]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[0][0].unit.alternates[0].size).to eq 1
expect(regex.unit.alternates[0][0].unit.alternates[0][0]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[0][0].unit.alternates[1]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[0][0].unit.alternates[1].size).to eq 0
expect(regex.unit.alternates[1]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[1].size).to eq 1
expect(regex.unit.alternates[1][0]).to be_a Regex::CharacterClassUnit
expect(regex.unit.alternates[1][0].negate).to be_truthy
expect(regex.unit.alternates[1][0].size).to eq 1
expect(regex.unit.alternates[1][0][0]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[2]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[2].size).to eq 2
expect(regex.unit.alternates[2][0]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[2][0].first).to eq "|".ord
expect(regex.unit.alternates[2][1]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[2][1].first).to eq "v".ord
expect(regex.unit.alternates[3]).to be_a Regex::SequenceUnit
expect(regex.unit.alternates[3].size).to eq 1
expect(regex.unit.alternates[3][0]).to be_a Regex::MultiplicityUnit
expect(regex.unit.alternates[3][0].min_count).to eq 1
expect(regex.unit.alternates[3][0].max_count).to be_nil
expect(regex.unit.alternates[3][0].unit).to be_a Regex::CharacterClassUnit
expect(regex.unit.alternates[3][0].unit.size).to eq 1
expect(regex.unit.alternates[3][0].unit[0]).to be_a Regex::CharacterRangeUnit
expect(regex.unit.alternates[3][0].unit[0].first).to eq "x".ord
expect(regex.unit.alternates[3][0].unit[0].last).to eq "y".ord
end
end
end

97
spec/propane_spec.rb Normal file
View File

@ -0,0 +1,97 @@
require "fileutils"
describe Propane do
def write_grammar(grammar)
File.write("spec/run/testparser.i", grammar)
end
def build_parser
result = system(*%w[./propane.sh spec/run/testparser.i spec/run/testparser.d])
expect(result).to be_truthy
end
def compile(test_file)
result = system(*%w[gdc -funittest -o spec/run/testparser spec/run/testparser.d], test_file)
expect(result).to be_truthy
end
def run
result = system("spec/run/testparser")
expect(result).to be_truthy
end
before(:each) do
FileUtils.rm_rf("spec/run")
FileUtils.mkdir_p("spec/run")
end
it "generates a D lexer" do
write_grammar <<EOF
token int \\d+
token plus \\+
token times \\*
drop \\s+
Start: [Foo] <<
>>
Foo: [int] <<
>>
Foo: [plus] <<
>>
EOF
build_parser
compile("spec/test_d_lexer.d")
run
end
it "generates a parser" do
write_grammar <<EOF
token plus \\+
token times \\*
token zero 0
token one 1
Start: [E] <<
>>
E: [E times B] <<
>>
E: [E plus B] <<
>>
E: [B] <<
>>
B: [zero] <<
>>
B: [one] <<
>>
EOF
build_parser
end
it "distinguishes between multiple identical rules with lookahead symbol" do
write_grammar <<EOF
token a
token b
Start: [R1 a] <<
>>
Start: [R2 b] <<
>>
R1: [a b] <<
>>
R2: [a b] <<
>>
EOF
build_parser
end
it "handles reducing a rule that could be arrived at from multiple states" do
write_grammar <<EOF
token a
token b
Start: [a R1] <<
>>
Start: [b R1] <<
>>
R1: [b] <<
>>
EOF
build_parser
end
end

11
spec/spec_helper.rb Normal file
View File

@ -0,0 +1,11 @@
require "bundler/setup"
require "propane"
RSpec.configure do |config|
# Enable flags like --only-failures and --next-failure
config.example_status_persistence_file_path = ".rspec_status"
config.expect_with :rspec do |c|
c.syntax = :expect
end
end

66
spec/test_d_lexer.d Normal file
View File

@ -0,0 +1,66 @@
import testparser;
import std.stdio;
int main()
{
return 0;
}
unittest
{
alias DCP = Testparser.Decoder.DecodedCodePoint;
string inputstring = "5+\n 66";
const(ubyte) * input = cast(const(ubyte) *)inputstring.ptr;
size_t input_length = inputstring.length;
DCP dcp;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('5', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('+', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('\n', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(' ', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('6', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('6', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_EOF, 0u));
inputstring = "\xf0\x9f\xa7\xa1";
input = cast(const(ubyte) *)inputstring.ptr;
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(0x1F9E1, 4u));
}
unittest
{
alias LT = Testparser.Lexer.LexedToken;
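// LexedToken fields below are presumably (row, col, length, token), judging from the assertions.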
string input = "5 + 4 * \n677 + 567";
Testparser.Lexer lexer = new Testparser.Lexer(cast(const(ubyte) *)input.ptr, input.length);
assert(lexer.lex_token() == LT(0, 0, 1, Testparser.TOKEN_INT));
assert(lexer.lex_token() == LT(0, 2, 1, Testparser.TOKEN_PLUS));
assert(lexer.lex_token() == LT(0, 4, 1, Testparser.TOKEN_INT));
assert(lexer.lex_token() == LT(0, 6, 1, Testparser.TOKEN_TIMES));
assert(lexer.lex_token() == LT(1, 0, 3, Testparser.TOKEN_INT));
assert(lexer.lex_token() == LT(1, 4, 1, Testparser.TOKEN_PLUS));
assert(lexer.lex_token() == LT(1, 6, 3, Testparser.TOKEN_INT));
assert(lexer.lex_token() == LT(1, 9, 0, Testparser.TOKEN_EOF));
lexer = new Testparser.Lexer(null, 0u);
assert(lexer.lex_token() == LT(0, 0, 0, Testparser.TOKEN_EOF));
}

View File

@ -1,14 +0,0 @@
all:
for d in *; do \
if [ -d $$d ]; then \
make -C $$d; \
fi; \
done
clean:
for d in *; do \
if [ -d $$d ]; then \
make -C $$d clean; \
fi; \
done

View File

@ -1,15 +0,0 @@
TARGET := test
I_SOURCE := itest
CXXFLAGS := -O2
LDFLAGS := -lpcre
all: $(TARGET)
./$(TARGET)
$(TARGET): $(shell which imbecile) $(I_SOURCE).I $(wildcard *.cc)
imbecile $(I_SOURCE).I
$(CXX) -o $@ *.cc $(LDFLAGS)
clean:
-rm -f $(TARGET) *.o $(I_SOURCE).cc $(I_SOURCE).h

View File

@ -1,37 +0,0 @@
[tokens]
AND and
OR or
NOT not
LPAREN \(
RPAREN \)
WS \s+
EQUALS = %{ cout << "Saw '='" << endl; %}
IDENTIFIER [a-zA-Z_][a-zA-Z_0-9]* %{
cout << "Identify: '" << matches[0] << "'" << endl;
%}
DEC_INT [1-9]\d*\b
${
uint64_t value;
$}
%{
sscanf(matches[0].c_str(), "%lld", &value);
cout << "value: " << value << endl;
%}
HEX_INT 0x([0-9a-fA-F]+)\b ${ uint64_t value; $} %{
sscanf(matches[1].c_str(), "%llx", &value);
cout << "value: " << value << endl;
%}
OCT_INT 0([0-7]*)\b
BIN_INT 0b([01]+)\b
[rules]
Assignment := IDENTIFIER ASSIGN Expression
Expression := IDENTIFIER \
| Assignment

View File

@ -1,17 +0,0 @@
#include <sstream>
#include <string>
#include "itest.h"
using namespace std;
int main(int argc, char * argv[])
{
Parser p;
stringstream t(string(
"hi there (one and two and three and four) or (two = nine)\n"
"0x42 12345 0 011 0b0011\n"
));
p.parse(t);
}

View File

@ -1,202 +0,0 @@
#include <string.h> /* memcpy() */
#include <pcre.h>
#include <iostream>
#include <vector>
#include {%header_name%}
using namespace std;
#ifdef I_NAMESPACE
namespace I_NAMESPACE {
#endif
I_CLASSNAME::I_CLASSNAME()
: m_errstr(NULL)
{
}
static TokenRef buildToken(int typeindex)
{
TokenRef token;
switch (typeindex)
{
{%buildToken%}
}
if (!token.isNull())
{
token->setType(typeindex);
}
return token;
}
static void read_istream(istream & i, vector<char> & buff, int & size)
{
size = 0;
int bytes_read;
char read_buff[1000];
while (!i.eof())
{
i.read(&read_buff[0], sizeof(read_buff));
bytes_read = i.gcount();
size += bytes_read;
for (int j = 0; j < bytes_read; j++)
buff.push_back(read_buff[j]);
}
}
bool I_CLASSNAME::parse(istream & i)
{
struct {
const char * name;
const char * definition;
bool process;
pcre * re;
pcre_extra * re_extra;
} tokens[] = {
{%token_list%}
};
if (sizeof(tokens)/sizeof(tokens[0]) == 0)
{
m_errstr = "No tokens defined";
return false;
}
vector<char> buff;
int buff_size;
read_istream(i, buff, buff_size);
if (buff_size <= 0)
{
m_errstr = "0-length input string";
return false;
}
/* append trailing NUL byte for pcre functions */
buff.push_back('\0');
/* compile all token regular expressions */
for (int i = 0; i < sizeof(tokens)/sizeof(tokens[0]); i++)
{
const char * errptr;
int erroffset;
tokens[i].re = pcre_compile(tokens[i].definition, 0,
&errptr, &erroffset, NULL);
if (tokens[i].re == NULL)
{
cerr << "Error compiling token '" << tokens[i].name
<< "' regular expression at position " << erroffset
<< ": " << errptr << endl;
m_errstr = "Error in token regular expression";
return false;
}
tokens[i].re_extra = pcre_study(tokens[i].re, 0, &errptr);
}
int buff_pos = 0;
const int ovector_num_matches = 16;
const int ovector_size = 3 * (ovector_num_matches + 1);
int ovector[ovector_size];
while (buff_pos < buff_size)
{
int longest_match_length = 0;
int longest_match_index = -1;
int longest_match_ovector[ovector_size];
for (int i = 0; i < sizeof(tokens)/sizeof(tokens[0]); i++)
{
int rc = pcre_exec(tokens[i].re, tokens[i].re_extra,
&buff[0], buff_size, buff_pos,
PCRE_ANCHORED | PCRE_NOTEMPTY,
ovector, ovector_size);
if (rc > 0)
{
/* this pattern matched some of the input */
int len = ovector[1] - ovector[0];
if (len > longest_match_length)
{
longest_match_length = len;
longest_match_index = i;
memcpy(longest_match_ovector, ovector, sizeof(ovector));
}
}
}
if (longest_match_index < 0)
{
/* no pattern matched the input at the current position */
cerr << "Parse error" << endl;
return false;
}
Matches matches(tokens[longest_match_index].re,
&buff[0], longest_match_ovector, ovector_size);
TokenRef token = buildToken(longest_match_index);
if (token.isNull())
{
cerr << "Internal Error: null token" << endl;
return false;
}
token->process(matches);
m_tokens.push_back(token);
buff_pos += longest_match_length;
}
}
refptr<Node> Node::operator[](int index)
{
return (0 <= index && index < m_indexed_children.size())
? m_indexed_children[index]
: NULL;
}
refptr<Node> Node::operator[](const std::string & index)
{
return (m_named_children.find(index) != m_named_children.end())
? m_named_children[index]
: NULL;
}
void Token::process(const Matches & matches)
{
{%token_code%}
}
Matches::Matches(pcre * re, const char * data, int * ovector, int ovec_size)
: m_re(re), m_data(data), m_ovector(ovector), m_ovec_size(ovec_size)
{
}
std::string Matches::operator[](int index) const
{
if (0 <= index && index < (m_ovec_size / 3))
{
int idx = 2 * index;
if (m_ovector[idx] >= 0 && m_ovector[idx + 1] >= 0)
{
return string(m_data, m_ovector[idx],
m_ovector[idx + 1] - m_ovector[idx]);
}
}
return "";
}
std::string Matches::operator[](const std::string & index) const
{
int idx = pcre_get_stringnumber(m_re, index.c_str());
if (idx > 0 && idx < (m_ovec_size / 3))
{
if (m_ovector[idx] >= 0 && m_ovector[idx + 1] >= 0)
{
return string(m_data, m_ovector[idx],
m_ovector[idx + 1] - m_ovector[idx]);
}
}
return "";
}
{%token_classes_code%}
#ifdef I_NAMESPACE
};
#endif

View File

@ -1,181 +0,0 @@
#ifndef IMBECILE_PARSER_HEADER
#define IMBECILE_PARSER_HEADER
#include <pcre.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <iostream>
#include <map>
#include <vector>
#include <list>
{%user_includes%}
{%defines%}
#ifdef I_NAMESPACE
namespace I_NAMESPACE {
#endif
#ifndef REFPTR_H
#define REFPTR_H REFPTR_H
/* Author: Josh Holtrop
* Purpose: Provide a reference-counting pointer-like first order
* C++ object that will free the object it is pointing to when
* all references to it have been destroyed.
* This implementation does not solve the circular reference problem.
* I was not concerned with that when developing this class.
*/
#include <stdlib.h> /* NULL */
template <typename T>
class refptr
{
public:
refptr<T>();
refptr<T>(T * ptr);
refptr<T>(const refptr<T> & orig);
refptr<T> & operator=(const refptr<T> & orig);
refptr<T> & operator=(T * ptr);
~refptr<T>();
T & operator*() const { return *m_ptr; }
T * operator->() const { return m_ptr; }
bool isNull() const { return m_ptr == NULL; }
private:
void cloneFrom(const refptr<T> & orig);
void destroy();
T * m_ptr;
int * m_refCount;
};
template <typename T> refptr<T>::refptr()
{
m_ptr = NULL;
m_refCount = NULL;
}
template <typename T> refptr<T>::refptr(T * ptr)
{
m_ptr = ptr;
m_refCount = new int;
*m_refCount = 1;
}
template <typename T> refptr<T>::refptr(const refptr<T> & orig)
{
cloneFrom(orig);
}
template <typename T> refptr<T> & refptr<T>::operator=(const refptr<T> & orig)
{
destroy();
cloneFrom(orig);
return *this;
}
template <typename T> refptr<T> & refptr<T>::operator=(T * ptr)
{
destroy();
m_ptr = ptr;
m_refCount = new int;
*m_refCount = 1;
return *this;
}
template <typename T> void refptr<T>::cloneFrom(const refptr<T> & orig)
{
this->m_ptr = orig.m_ptr;
this->m_refCount = orig.m_refCount;
if (m_refCount != NULL)
(*m_refCount)++;
}
template <typename T> refptr<T>::~refptr()
{
destroy();
}
template <typename T> void refptr<T>::destroy()
{
if (m_refCount != NULL)
{
if (*m_refCount <= 1)
{
delete m_ptr;
delete m_refCount;
}
else
{
(*m_refCount)--;
}
}
}
#endif
class Matches
{
public:
Matches(pcre * re, const char * data, int * ovector, int ovec_size);
std::string operator[](int index) const;
std::string operator[](const std::string & index) const;
protected:
pcre * m_re;
const char * m_data;
int * m_ovector;
int m_ovec_size;
};
class Node
{
public:
refptr<Node> operator[](int index);
refptr<Node> operator[](const std::string & index);
protected:
std::map< std::string, refptr<Node> > m_named_children;
std::vector< refptr<Node> > m_indexed_children;
};
typedef refptr<Node> NodeRef;
class Token : public Node
{
public:
virtual void process(const Matches & matches);
void setType(int type) { m_type = type; }
int getType() const { return m_type; }
protected:
int m_type;
{%token_data%}
};
typedef refptr<Token> TokenRef;
{%token_classes%}
class I_CLASSNAME
{
public:
I_CLASSNAME();
bool parse(std::istream & in);
const char * getError() { return m_errstr; }
protected:
const char * m_errstr;
std::list<TokenRef> m_tokens;
};
#ifdef I_NAMESPACE
};
#endif
#endif /* IMBECILE_PARSER_HEADER */