/*
 * Provenance (header of the git patch this source was carried in):
 *   Commit:  179690f17752fbdffeb70b7d3a3722847a3c0bde
 *   From:    Josh Holtrop
 *   Date:    Wed, 11 Nov 2020 22:26:16 -0500
 *   Subject: [PATCH] Start on Encoding module.
 *   Path:    src/jes/core/encoding.d (new file)
 */
module jes.core.encoding;

/**
 * Text-encoding tag plus byte-level helpers for guessing the encoding of a
 * raw buffer.
 *
 * The struct wraps a single `ubyte` holding one of the enum values below and
 * converts implicitly to that `ubyte` through `alias type this`.
 */
struct Encoding
{
    enum : ubyte
    {
        UTF8,
        CP1252,
        UTF16_LE,
        UTF16_BE,
    }

    /// One of the enum values above.
    ubyte type;
    alias type this;

    /* Byte-order marks recognized by detect_encoding(). */
    private static immutable ubyte[3] BOM_UTF8 = [0xEF, 0xBB, 0xBF];
    private static immutable ubyte[2] BOM_UTF16_BE = [0xFE, 0xFF];
    private static immutable ubyte[2] BOM_UTF16_LE = [0xFF, 0xFE];

    /// Wrap a raw encoding value.
    this(ubyte type)
    {
        this.type = type;
    }

    /**
     * Guess the encoding of a buffer.
     *
     * A byte-order mark is honored first; without one the content is
     * inspected heuristically.
     *
     * Params:
     *   data     = Pointer to the first byte of the buffer.
     *   n        = Number of bytes in the buffer.
     *   bom_size = Receives the length in bytes of the byte-order mark that
     *              was recognized (0 when none was). May be null if the
     *              caller does not need it.
     *
     * Returns: The detected encoding; CP1252 when nothing else matches.
     */
    static Encoding detect_encoding(const ubyte * data, size_t n, size_t * bom_size)
    {
        size_t bom_length;
        const Encoding result = detect(data, n, bom_length);
        if (bom_size !is null)
        {
            *bom_size = bom_length;
        }
        return result;
    }

    /**
     * Implementation of detect_encoding(); `bom_length` is always written
     * (`out` resets it to 0 on entry).
     */
    private static Encoding detect(const ubyte * data, size_t n, out size_t bom_length)
    {
        /* First check for BOM presence. */
        if ((n >= 3u) &&
            (data[0 .. 3] == BOM_UTF8[]) &&
            validate_utf8(&data[3], n - 3u))
        {
            bom_length = 3u;
            return Encoding(UTF8);
        }

        /* A UTF-16 BOM is only trusted when the buffer holds whole 16-bit units. */
        const bool even_length = (n & 1u) == 0u;
        if ((n >= 2u) && even_length)
        {
            if (data[0 .. 2] == BOM_UTF16_BE[])
            {
                bom_length = 2u;
                return Encoding(UTF16_BE);
            }
            if (data[0 .. 2] == BOM_UTF16_LE[])
            {
                bom_length = 2u;
                return Encoding(UTF16_LE);
            }
        }

        /*
         * No BOM found. Check heuristically.
         *
         * The UTF-16 checks must run before the UTF-8 check: NUL bytes are
         * valid UTF-8, so UTF-16 text made of ASCII characters (e.g.
         * "a\0b\0") would otherwise always be reported as UTF-8. Buffers
         * without NUL bytes can never satisfy check_utf16(), so ordinary
         * UTF-8/ASCII text is unaffected by this ordering.
         */
        if (check_utf16(data, n, true))
        {
            return Encoding(UTF16_LE);
        }

        if (check_utf16(data, n, false))
        {
            return Encoding(UTF16_BE);
        }

        if (validate_utf8(data, n))
        {
            return Encoding(UTF8);
        }

        return Encoding(CP1252);
    }

    /**
     * Check that each of the `n` bytes at `data` has the UTF-8 continuation
     * form 10xxxxxx.
     *
     * Returns: true when all bytes match (also when `n` is 0).
     */
    static bool check_utf8_continuation_bytes(const ubyte * data, size_t n)
    {
        for (size_t i = 0u; i < n; i++)
        {
            if ((data[i] & 0xC0u) != 0x80u)
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Check whether a buffer is structurally well-formed UTF-8.
     *
     * Only the lead-byte/continuation-byte structure is verified. The check
     * is lenient: the legacy 5- and 6-byte forms are accepted, and overlong
     * encodings, surrogates and out-of-range code points are not rejected.
     *
     * Params:
     *   data = Pointer to the first byte of the buffer.
     *   n    = Number of bytes in the buffer.
     *
     * Returns: true when every sequence is complete and well-formed (an
     *          empty buffer is valid).
     */
    static bool validate_utf8(const ubyte * data, size_t n)
    {
        for (size_t i = 0u; i < n; i++)
        {
            const ubyte c = data[i];
            if ((c & 0x80u) == 0u)
            {
                /* Single-byte (ASCII) character. */
                continue;
            }

            /* Derive the sequence length from the lead byte's prefix bits. */
            size_t n_continuation_bytes;
            if ((c & 0xE0u) == 0xC0u)
            {
                n_continuation_bytes = 1u;
            }
            else if ((c & 0xF0u) == 0xE0u)
            {
                n_continuation_bytes = 2u;
            }
            else if ((c & 0xF8u) == 0xF0u)
            {
                n_continuation_bytes = 3u;
            }
            else if ((c & 0xFCu) == 0xF8u)
            {
                n_continuation_bytes = 4u;
            }
            else if ((c & 0xFEu) == 0xFCu)
            {
                n_continuation_bytes = 5u;
            }
            else
            {
                /* Stray continuation byte, or 0xFE/0xFF. */
                return false;
            }

            /* The whole sequence must lie inside the buffer. */
            if ((i + n_continuation_bytes) >= n)
            {
                return false;
            }
            if (!check_utf8_continuation_bytes(&data[i + 1], n_continuation_bytes))
            {
                return false;
            }
            i += n_continuation_bytes;
        }

        return true;
    }

    /**
     * Heuristically check whether a buffer looks like UTF-16 text.
     *
     * Counts the 16-bit units whose high byte is zero and accepts the buffer
     * when they make up at least one tenth of all units (equivalently,
     * 20 * zero_high_bytes >= n).
     *
     * Params:
     *   data = Pointer to the first byte of the buffer.
     *   n    = Number of bytes in the buffer.
     *   le   = true to test for little-endian, false for big-endian.
     *
     * Returns: false for an empty or odd-length buffer, otherwise whether
     *          the threshold is met.
     */
    static bool check_utf16(const ubyte * data, size_t n, bool le)
    {
        if ((n == 0u) || ((n & 1u) != 0u))
        {
            return false;
        }

        /* The high byte is the second byte of a little-endian unit. */
        const size_t off = le ? 1u : 0u;
        size_t n_high_zeros = 0u;
        for (size_t i = 0u; i < n; i += 2u)
        {
            if (data[i + off] == 0u)
            {
                n_high_zeros++;
            }
        }

        /*
         * Same test as (20 * n_high_zeros / n) != 0, written as a comparison
         * against ceil(n / 20) so the multiplication cannot wrap size_t.
         */
        const size_t threshold = (n / 20u) + (((n % 20u) != 0u) ? 1u : 0u);
        return n_high_zeros >= threshold;
    }
}