From 7598c589fe8f88f55427543b3b8195ac55dd7f15 Mon Sep 17 00:00:00 2001
From: Josh Holtrop
Date: Tue, 31 May 2022 22:26:09 -0400
Subject: [PATCH] Detect other invalid UTF-8 encodings

Reject a lead byte that matches none of the multi-byte length patterns
instead of falling through to the length check.

Require every following byte to be a UTF-8 continuation byte
(10xxxxxx, i.e. (b & 0xC0) == 0x80) and merge only its low six payload
bits into the code point.

Add lexer tests for a truncated sequence, for following bytes that are
not continuation bytes, and for an invalid lead byte.
---
Notes (ignored by git am/apply):
  * The continuation-byte test compares against 0x80u and the payload is
    masked with 0x3Fu; the well-formed encoding of U+1F9E1
    (F0 9F A7 A1) is therefore kept as the positive test input.
  * Leading whitespace of the context lines was reconstructed; if it
    does not match the tree, apply with --ignore-whitespace.
  * The post-image blob ids on the "index" lines predate the
    continuation-byte correction and are informational only.

 assets/parser.d.erb | 12 ++++++++++--
 spec/test_d_lexer.d | 24 ++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/assets/parser.d.erb b/assets/parser.d.erb
index 43a9839..3c44afb 100644
--- a/assets/parser.d.erb
+++ b/assets/parser.d.erb
@@ -83,6 +83,10 @@ class <%= classname %>
                     code_point = c & 0x01u;
                     following_bytes = 5u;
                 }
+                else
+                {
+                    return DecodedCodePoint(CODE_POINT_INVALID, 0u);
+                }
                 if (input_length <= following_bytes)
                 {
                     return DecodedCodePoint(CODE_POINT_INVALID, 0u);
@@ -91,8 +95,12 @@ class <%= classname %>
                 while (following_bytes-- > 0u)
                 {
                     input++;
-                    code_point <<= 6u;
-                    code_point |= *input & 0x3Fu;
+                    ubyte b = *input;
+                    if ((b & 0xC0u) != 0x80u)
+                    {
+                        return DecodedCodePoint(CODE_POINT_INVALID, 0u);
+                    }
+                    code_point = (code_point << 6u) | (b & 0x3Fu);
                 }
             }
             return DecodedCodePoint(code_point, code_point_length);
diff --git a/spec/test_d_lexer.d b/spec/test_d_lexer.d
index d03f9ae..041f119 100644
--- a/spec/test_d_lexer.d
+++ b/spec/test_d_lexer.d
@@ -40,11 +40,35 @@ unittest
     dcp = Testparser.Decoder.decode_code_point(input, input_length);
     assert(dcp == DCP(Testparser.Decoder.CODE_POINT_EOF, 0u));
 
     inputstring = "\xf0\x9f\xa7\xa1";
     input = cast(const(ubyte) *)inputstring.ptr;
     input_length = inputstring.length;
     dcp = Testparser.Decoder.decode_code_point(input, input_length);
     assert(dcp == DCP(0x1F9E1, 4u));
+
+    inputstring = "\xf0\x9f\xa7";
+    input = cast(const(ubyte) *)inputstring.ptr;
+    input_length = inputstring.length;
+    dcp = Testparser.Decoder.decode_code_point(input, input_length);
+    assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
+
+    inputstring = "\xf0\x9f\xa7\xFF";
+    input = cast(const(ubyte) *)inputstring.ptr;
+    input_length = inputstring.length;
+    dcp = Testparser.Decoder.decode_code_point(input, input_length);
+    assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
+
+    inputstring = "\xf0\x9f\xa7\x21";
+    input = cast(const(ubyte) *)inputstring.ptr;
+    input_length = inputstring.length;
+    dcp = Testparser.Decoder.decode_code_point(input, input_length);
+    assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
+
+    inputstring = "\xfe";
+    input = cast(const(ubyte) *)inputstring.ptr;
+    input_length = inputstring.length;
+    dcp = Testparser.Decoder.decode_code_point(input, input_length);
+    assert(dcp == DCP(Testparser.Decoder.CODE_POINT_INVALID, 0u));
 }
 
 unittest