Work on real D strings instead of ubyte pointer and length

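Also fix a couple of UTF-8 decoder bugs: continuation bytes are now checked against the 10xxxxxx pattern ((b & 0xC0) == 0x80), and only their low 6 bits (b & 0x3F) are merged into the code point.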
This commit is contained in:
Josh Holtrop 2022-10-15 13:32:01 -04:00
parent de93d23585
commit 623c644e74
11 changed files with 43 additions and 77 deletions

View File

@ -41,13 +41,13 @@ class <%= @classname %>
uint code_point_length; uint code_point_length;
} }
static DecodedCodePoint decode_code_point(const(ubyte) * input, size_t input_length) static DecodedCodePoint decode_code_point(string input)
{ {
if (input_length == 0u) if (input.length == 0u)
{ {
return DecodedCodePoint(CODE_POINT_EOF, 0u); return DecodedCodePoint(CODE_POINT_EOF, 0u);
} }
ubyte c = *input; char c = input[0];
uint code_point; uint code_point;
uint code_point_length; uint code_point_length;
if ((c & 0x80u) == 0u) if ((c & 0x80u) == 0u)
@ -87,20 +87,19 @@ class <%= @classname %>
{ {
return DecodedCodePoint(CODE_POINT_INVALID, 0u); return DecodedCodePoint(CODE_POINT_INVALID, 0u);
} }
if (input_length <= following_bytes) if (input.length <= following_bytes)
{ {
return DecodedCodePoint(CODE_POINT_INVALID, 0u); return DecodedCodePoint(CODE_POINT_INVALID, 0u);
} }
code_point_length = following_bytes + 1u; code_point_length = following_bytes + 1u;
while (following_bytes-- > 0u) for (size_t i = 0u; i < following_bytes; i++)
{ {
input++; char b = input[i + 1u];
ubyte b = *input; if ((b & 0xC0u) != 0x80u)
if ((b & 0xC0u) != 0u)
{ {
return DecodedCodePoint(CODE_POINT_INVALID, 0u); return DecodedCodePoint(CODE_POINT_INVALID, 0u);
} }
code_point = (code_point << 6u) | b; code_point = (code_point << 6u) | (b & 0x3Fu);
} }
} }
return DecodedCodePoint(code_point, code_point_length); return DecodedCodePoint(code_point, code_point_length);
@ -159,17 +158,15 @@ class <%= @classname %>
uint token; uint token;
} }
private const(ubyte) * m_input; private string m_input;
private size_t m_input_length;
private size_t m_input_position; private size_t m_input_position;
private size_t m_input_row; private size_t m_input_row;
private size_t m_input_col; private size_t m_input_col;
private size_t m_mode; private size_t m_mode;
this(const(ubyte) * input, size_t input_length) this(string input)
{ {
m_input = input; m_input = input;
m_input_length = input_length;
m_mode = <%= @lexer.mode_id("default") %>; m_mode = <%= @lexer.mode_id("default") %>;
} }
@ -227,7 +224,7 @@ class <%= @classname %>
uint current_state = modes[m_mode].state_table_offset; uint current_state = modes[m_mode].state_table_offset;
for (;;) for (;;)
{ {
auto decoded = Decoder.decode_code_point(&m_input[m_input_position + attempt_match_info.length], m_input_length - m_input_position - attempt_match_info.length); auto decoded = Decoder.decode_code_point(m_input[(m_input_position + attempt_match_info.length)..(m_input.length)]);
if (decoded.code_point == Decoder.CODE_POINT_INVALID) if (decoded.code_point == Decoder.CODE_POINT_INVALID)
{ {
lt.token = _TOKEN_DECODE_ERROR; lt.token = _TOKEN_DECODE_ERROR;
@ -377,9 +374,9 @@ class <%= @classname %>
private <%= @grammar.result_type %> parse_result; private <%= @grammar.result_type %> parse_result;
this(const(ubyte) * input, size_t input_length) this(string input)
{ {
m_lexer = new Lexer(input, input_length); m_lexer = new Lexer(input);
} }
bool parse() bool parse()

View File

@ -76,6 +76,7 @@ EOF
build_parser build_parser
compile("spec/test_d_lexer.d") compile("spec/test_d_lexer.d")
results = run results = run
expect(results.stderr).to eq ""
expect(results.status).to eq 0 expect(results.status).to eq 0
end end

View File

@ -9,59 +9,27 @@ int main()
unittest unittest
{ {
alias DCP = Testparser.Decoder.DecodedCodePoint; alias DCP = Testparser.Decoder.DecodedCodePoint;
string inputstring = "5+\n 66";
const(ubyte) * input = cast(const(ubyte) *)inputstring.ptr;
size_t input_length = inputstring.length;
DCP dcp; DCP dcp;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
dcp = Testparser.Decoder.decode_code_point("5");
assert(dcp == DCP('5', 1u)); assert(dcp == DCP('5', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length; dcp = Testparser.Decoder.decode_code_point("");
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('+', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('\n', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(' ', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('6', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP('6', 1u));
input += dcp.code_point_length;
input_length -= dcp.code_point_length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_EOF, 0u)); assert(dcp == DCP(Testparser.Decoder.CODE_POINT_EOF, 0u));
inputstring = "\xf0\x1f\x27\x21"; dcp = Testparser.Decoder.decode_code_point("\xC2\xA9");
input = cast(const(ubyte) *)inputstring.ptr; assert(dcp == DCP(0xA9u, 2u));
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length); dcp = Testparser.Decoder.decode_code_point("\xf0\x9f\xa7\xa1");
assert(dcp == DCP(0x1F9E1, 4u)); assert(dcp == DCP(0x1F9E1, 4u));
inputstring = "\xf0\x1f\x27"; dcp = Testparser.Decoder.decode_code_point("\xf0\x9f\x27");
input = cast(const(ubyte) *)inputstring.ptr;
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u)); assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
inputstring = "\xf0\x1f\x27\xFF"; dcp = Testparser.Decoder.decode_code_point("\xf0\x9f\xa7\xFF");
input = cast(const(ubyte) *)inputstring.ptr;
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u)); assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
inputstring = "\xfe"; dcp = Testparser.Decoder.decode_code_point("\xfe");
input = cast(const(ubyte) *)inputstring.ptr;
input_length = inputstring.length;
dcp = Testparser.Decoder.decode_code_point(input, input_length);
assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u)); assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
} }
@ -69,7 +37,7 @@ unittest
{ {
alias LT = Testparser.Lexer.LexedToken; alias LT = Testparser.Lexer.LexedToken;
string input = "5 + 4 * \n677 + 567"; string input = "5 + 4 * \n677 + 567";
Testparser.Lexer lexer = new Testparser.Lexer(cast(const(ubyte) *)input.ptr, input.length); Testparser.Lexer lexer = new Testparser.Lexer(input);
assert(lexer.lex_token() == LT(0, 0, 1, Testparser.TOKEN_int)); assert(lexer.lex_token() == LT(0, 0, 1, Testparser.TOKEN_int));
assert(lexer.lex_token() == LT(0, 2, 1, Testparser.TOKEN_plus)); assert(lexer.lex_token() == LT(0, 2, 1, Testparser.TOKEN_plus));
assert(lexer.lex_token() == LT(0, 4, 1, Testparser.TOKEN_int)); assert(lexer.lex_token() == LT(0, 4, 1, Testparser.TOKEN_int));
@ -79,6 +47,6 @@ unittest
assert(lexer.lex_token() == LT(1, 6, 3, Testparser.TOKEN_int)); assert(lexer.lex_token() == LT(1, 6, 3, Testparser.TOKEN_int));
assert(lexer.lex_token() == LT(1, 9, 0, Testparser.TOKEN_0EOF)); assert(lexer.lex_token() == LT(1, 9, 0, Testparser.TOKEN_0EOF));
lexer = new Testparser.Lexer(null, 0u); lexer = new Testparser.Lexer("");
assert(lexer.lex_token() == LT(0, 0, 0, Testparser.TOKEN_0EOF)); assert(lexer.lex_token() == LT(0, 0, 0, Testparser.TOKEN_0EOF));
} }

View File

@ -9,10 +9,10 @@ int main()
unittest unittest
{ {
string input = "aba"; string input = "aba";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); auto parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
input = "abb"; input = "abb";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
} }

View File

@ -9,14 +9,14 @@ int main()
unittest unittest
{ {
string input = "a"; string input = "a";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); auto parser = new Testparser.Parser(input);
assert(parser.parse() == false); assert(parser.parse() == false);
input = "a b"; input = "a b";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
input = "bb"; input = "bb";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
} }

View File

@ -9,12 +9,12 @@ int main()
unittest unittest
{ {
string input = `abc "a string" def`; string input = `abc "a string" def`;
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); auto parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
writeln("pass1"); writeln("pass1");
input = `abc "abc def" def`; input = `abc "abc def" def`;
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
writeln("pass2"); writeln("pass2");
} }

View File

@ -9,6 +9,6 @@ int main()
unittest unittest
{ {
string input = "ab"; string input = "ab";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); auto parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
} }

View File

@ -9,17 +9,17 @@ int main()
unittest unittest
{ {
string input = "a"; string input = "a";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); auto parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
assert(parser.result == 1u); assert(parser.result == 1u);
input = ""; input = "";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
assert(parser.result == 0u); assert(parser.result == 0u);
input = "aaaaaaaaaaaaaaaa"; input = "aaaaaaaaaaaaaaaa";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
assert(parser.result == 16u); assert(parser.result == 16u);
} }

View File

@ -9,12 +9,12 @@ int main()
unittest unittest
{ {
string input = "abcdef"; string input = "abcdef";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); auto parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
writeln("pass1"); writeln("pass1");
input = "defabcdef"; input = "defabcdef";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
writeln("pass2"); writeln("pass2");
} }

View File

@ -9,6 +9,6 @@ int main()
unittest unittest
{ {
string input = "defghidef"; string input = "defghidef";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); auto parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
} }

View File

@ -9,12 +9,12 @@ int main()
unittest unittest
{ {
string input = "abcdef"; string input = "abcdef";
auto parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); auto parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
writeln("pass1"); writeln("pass1");
input = "abcabcdef"; input = "abcabcdef";
parser = new Testparser.Parser(cast(const(ubyte) *)input.ptr, input.length); parser = new Testparser.Parser(input);
assert(parser.parse() == true); assert(parser.parse() == true);
writeln("pass2"); writeln("pass2");
} }