From 926ad3ec0da46dc68f2af6b5867f387fa8c35f73 Mon Sep 17 00:00:00 2001
From: Josh Holtrop
Date: Fri, 27 Nov 2020 20:51:40 -0500
Subject: [PATCH] Add Encoding.encode() and Encoding.decode().

---
 src/jes/core/encoding.d | 271 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 271 insertions(+)

diff --git a/src/jes/core/encoding.d b/src/jes/core/encoding.d
index b8ee7d5..8fe64c4 100644
--- a/src/jes/core/encoding.d
+++ b/src/jes/core/encoding.d
@@ -18,6 +18,277 @@ struct Encoding
         this.type = type;
     }
 
+    /**
+     * Read one 16-bit code unit in the byte order selected by type.
+     *
+     * Any type other than UTF16_LE is read as big-endian.
+     */
+    private uint utf16_unit(const(ubyte) * data)
+    {
+        if (type == UTF16_LE)
+        {
+            return data[0] | (data[1] << 8u);
+        }
+        return (data[0] << 8u) | data[1];
+    }
+
+    /**
+     * Write the low 16 bits of unit to buffer[0..2] in the byte order
+     * selected by type.
+     *
+     * Any type other than UTF16_LE is written as big-endian.
+     */
+    private void put_utf16_unit(uint unit, ubyte * buffer)
+    {
+        const ubyte low_byte = cast(ubyte)unit;
+        const ubyte high_byte = cast(ubyte)(unit >> 8u);
+        if (type == UTF16_LE)
+        {
+            buffer[0] = low_byte;
+            buffer[1] = high_byte;
+        }
+        else
+        {
+            buffer[0] = high_byte;
+            buffer[1] = low_byte;
+        }
+    }
+
+    /**
+     * Decode one code point from the encoded bytes at data.
+     *
+     * No bounds checking is done; the caller must make sure the whole encoded
+     * sequence is readable. For UTF-16 that is 4 bytes when the first code
+     * unit is a high surrogate (0xD800-0xDBFF).
+     *
+     * UTF-8: a byte that is not a valid lead byte decodes to 0 with a size
+     * of 1; continuation bytes are not validated.
+     * UTF-16: a high surrogate followed by a low surrogate is combined into
+     * one code point with a size of 4; any other code unit, including an
+     * unpaired surrogate, is returned as is with a size of 2.
+     * CP1252 and unknown types: not implemented; yields 0 with a size of 0.
+     *
+     * Params:
+     *   data = Pointer to the first byte of the encoded sequence.
+     *   encoded_size = If not null, receives the number of bytes consumed.
+     *
+     * Returns: The decoded code point.
+     */
+    uint decode(const(ubyte) * data, size_t * encoded_size)
+    {
+        uint result;
+        size_t size;
+        ubyte following_bytes;
+        ubyte c;
+
+        switch (type)
+        {
+        case UTF8:
+            c = *data;
+            if ((c & 0x80u) == 0u)
+            {
+                /* Single-byte (ASCII) sequence. */
+                result = c;
+                size = 1u;
+            }
+            else
+            {
+                /* The run of leading 1 bits in the lead byte selects the
+                 * sequence length; the bits below it are payload. */
+                if ((c & 0xE0u) == 0xC0u)
+                {
+                    result = c & 0x1Fu;
+                    following_bytes = 1u;
+                }
+                else if ((c & 0xF0u) == 0xE0u)
+                {
+                    result = c & 0x0Fu;
+                    following_bytes = 2u;
+                }
+                else if ((c & 0xF8u) == 0xF0u)
+                {
+                    result = c & 0x07u;
+                    following_bytes = 3u;
+                }
+                else if ((c & 0xFCu) == 0xF8u)
+                {
+                    result = c & 0x03u;
+                    following_bytes = 4u;
+                }
+                else if ((c & 0xFEu) == 0xFCu)
+                {
+                    result = c & 0x01u;
+                    following_bytes = 5u;
+                }
+                size = following_bytes + 1u;
+                /* Each continuation byte contributes its low 6 bits. */
+                while (following_bytes-- > 0u)
+                {
+                    data++;
+                    result <<= 6u;
+                    result |= *data & 0x3Fu;
+                }
+            }
+            break;
+        case CP1252:
+            /* TODO */
+            break;
+        case UTF16_LE:
+        case UTF16_BE:
+            result = utf16_unit(data);
+            size = 2u;
+            if ((result & 0xFC00u) == 0xD800u)
+            {
+                /* High surrogate: combine with a following low surrogate. */
+                const uint low = utf16_unit(data + 2);
+                if ((low & 0xFC00u) == 0xDC00u)
+                {
+                    result = 0x10000u + (((result & 0x3FFu) << 10u) | (low & 0x3FFu));
+                    size = 4u;
+                }
+            }
+            break;
+        default:
+            break;
+        }
+
+        if (encoded_size != null)
+        {
+            *encoded_size = size;
+        }
+
+        return result;
+    }
+
+    /**
+     * Get the number of bytes encode() writes for a code point.
+     *
+     * UTF-8 uses the original 1 to 6 byte scheme, so every 32-bit value has a
+     * length. UTF-16 needs 2 bytes up to 0xFFFF and 4 bytes (a surrogate
+     * pair) up to 0x10FFFF.
+     *
+     * Params:
+     *   code_point = The code point to be encoded.
+     *
+     * Returns: The encoded size in bytes, or 0 if the code point cannot be
+     * encoded (CP1252 is not implemented yet).
+     */
+    ubyte num_bytes_to_encode_code_point(uint code_point)
+    {
+        switch (type)
+        {
+        case UTF8:
+            if (code_point <= 0x7Fu)
+            {
+                return 1u;
+            }
+            else if (code_point <= 0x7FFu)
+            {
+                return 2u;
+            }
+            else if (code_point <= 0xFFFFu)
+            {
+                return 3u;
+            }
+            else if (code_point <= 0x1FFFFFu)
+            {
+                return 4u;
+            }
+            else if (code_point <= 0x3FFFFFFu)
+            {
+                return 5u;
+            }
+            else
+            {
+                return 6u;
+            }
+        case CP1252:
+            /* TODO */
+            break;
+        case UTF16_LE:
+        case UTF16_BE:
+            if (code_point <= 0xFFFFu)
+            {
+                return 2u;
+            }
+            else if (code_point <= 0x10FFFFu)
+            {
+                /* Needs a surrogate pair. */
+                return 4u;
+            }
+            /* Beyond the range UTF-16 can represent. */
+            break;
+        default:
+            break;
+        }
+
+        return 0u;
+    }
+
+    /**
+     * Encode a code point into buffer.
+     *
+     * The caller must provide room for the number of bytes reported by
+     * num_bytes_to_encode_code_point(): up to 6 for UTF-8 and up to 4 for
+     * UTF-16. Code points above 0xFFFF are written to UTF-16 as a surrogate
+     * pair.
+     *
+     * Params:
+     *   code_point = The code point to encode.
+     *   buffer = Destination for the encoded bytes.
+     *
+     * Returns: The number of bytes written, or 0 if nothing was written
+     * because the code point cannot be encoded (CP1252 is not implemented
+     * yet).
+     */
+    ubyte encode(uint code_point, ubyte * buffer)
+    {
+        ubyte size;
+        switch (type)
+        {
+        case UTF8:
+            size = num_bytes_to_encode_code_point(code_point);
+            if (size == 1u)
+            {
+                *buffer = cast(ubyte)code_point;
+            }
+            else if (size > 1u)
+            {
+                /* Lead byte: size leading 1 bits, then the top payload bits. */
+                *buffer = cast(ubyte)((0xFFu << (8u - size)) | (code_point >> (6u * (size - 1u))));
+                const ubyte end = cast(ubyte)(size - 1u);
+                /* Fill continuation bytes from the last one backwards, 6 bits each. */
+                for (ubyte i = 0u; i < end; i++)
+                {
+                    buffer[end - i] = cast(ubyte)(0x80u | ((code_point >> (6u * i)) & 0x3Fu));
+                }
+            }
+            return size;
+        case CP1252:
+            /* TODO */
+            break;
+        case UTF16_LE:
+        case UTF16_BE:
+            size = num_bytes_to_encode_code_point(code_point);
+            if (size == 2u)
+            {
+                put_utf16_unit(code_point, buffer);
+            }
+            else if (size == 4u)
+            {
+                /* Split the 20-bit offset above 0x10000 into two 10-bit halves. */
+                const uint offset = code_point - 0x10000u;
+                put_utf16_unit(0xD800u | (offset >> 10u), buffer);
+                put_utf16_unit(0xDC00u | (offset & 0x3FFu), buffer + 2);
+            }
+            return size;
+        default:
+            break;
+        }
+
+        return 0u;
+    }
+
     static Encoding detect_encoding(const ubyte * data, size_t n, size_t * bom_size)
     {
         *bom_size = 0u;