Start on Encoding module.

This commit is contained in:
Josh Holtrop 2020-11-11 22:26:16 -05:00
parent 2db942f47d
commit 179690f177

141
src/jes/core/encoding.d Normal file
View File

@ -0,0 +1,141 @@
module jes.core.encoding;
/**
 * Identifies a text encoding and provides helpers to detect the encoding of
 * a raw byte buffer.
 *
 * The struct wraps a single ubyte tag (one of the enum members below) and
 * aliases itself to that tag, so an Encoding can be compared directly
 * against the enum constants.
 */
struct Encoding
{
    /** Supported encoding tags. */
    enum : ubyte
    {
        UTF8,
        CP1252,
        UTF16_LE,
        UTF16_BE,
    }

    /** The encoding tag held by this value. */
    ubyte type;

    alias type this;

    /** Construct an Encoding from one of the enum tags. */
    this(ubyte type)
    {
        this.type = type;
    }

    /**
     * Detect the encoding of a byte buffer.
     *
     * A byte order mark is honored first; if none is found the buffer is
     * classified heuristically, falling back to CP1252.
     *
     * Params:
     *   data     = Pointer to the bytes to examine.
     *   n        = Number of bytes available at data.
     *   bom_size = Receives the length in bytes of the byte order mark that
     *              was recognized (0 when none was). May be null if the
     *              caller does not need this value.
     *
     * Returns: The detected Encoding.
     */
    static Encoding detect_encoding(const ubyte * data, size_t n, size_t * bom_size)
    {
        /* Tolerate a null bom_size by redirecting writes to a local. */
        size_t unused_bom_size;
        size_t * bom_out = (bom_size !is null) ? bom_size : &unused_bom_size;
        *bom_out = 0u;

        /* First check for BOM presence. */
        if ((n >= 3u) &&
            (data[0] == 0xEFu) && (data[1] == 0xBBu) && (data[2] == 0xBFu) &&
            validate_utf8(data + 3u, n - 3u))
        {
            *bom_out = 3u;
            return Encoding(UTF8);
        }
        /* A UTF-16 BOM is only trusted when the byte count is even. */
        if ((n >= 2u) && ((n & 1u) == 0u))
        {
            if ((data[0] == 0xFEu) && (data[1] == 0xFFu))
            {
                *bom_out = 2u;
                return Encoding(UTF16_BE);
            }
            if ((data[0] == 0xFFu) && (data[1] == 0xFEu))
            {
                *bom_out = 2u;
                return Encoding(UTF16_LE);
            }
        }

        /* No BOM found. Check heuristically. */
        /* NOTE(review): BOM-less UTF-16 text made up only of code units
         * below 0x80 is also well-formed UTF-8 (NUL bytes are ASCII), so it
         * is reported as UTF8 here and the UTF-16 checks are never reached
         * for it - confirm whether that is intended. */
        if (validate_utf8(data, n))
        {
            return Encoding(UTF8);
        }
        if (check_utf16(data, n, true))
        {
            return Encoding(UTF16_LE);
        }
        if (check_utf16(data, n, false))
        {
            return Encoding(UTF16_BE);
        }
        return Encoding(CP1252);
    }

    /**
     * Check that every one of the n bytes at data is a UTF-8 continuation
     * byte (bit pattern 10xxxxxx).
     *
     * Returns: true if all n bytes are continuation bytes (trivially true
     *          when n is 0), false otherwise.
     */
    static bool check_utf8_continuation_bytes(const ubyte * data, size_t n)
    {
        for (size_t i = 0u; i < n; i++)
        {
            if ((data[i] & 0xC0u) != 0x80u)
                return false;
        }
        return true;
    }

    /**
     * Check whether the n bytes at data form well-formed UTF-8.
     *
     * Validation follows RFC 3629 / the Unicode Standard's table of
     * well-formed UTF-8 byte sequences: sequences are at most 4 bytes long,
     * and overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and code
     * points above U+10FFFF are rejected. An empty buffer is valid.
     *
     * Returns: true if the whole buffer is well-formed UTF-8.
     */
    static bool validate_utf8(const ubyte * data, size_t n)
    {
        size_t i = 0u;
        while (i < n)
        {
            const ubyte c = data[i];
            if (c < 0x80u)
            {
                /* Plain ASCII byte. */
                i++;
                continue;
            }

            size_t n_continuation_bytes;
            /* Allowed range of the first continuation byte; narrowed for
             * the lead bytes that could otherwise encode overlong forms,
             * surrogates, or values above U+10FFFF. */
            uint second_min = 0x80u;
            uint second_max = 0xBFu;
            if ((c >= 0xC2u) && (c <= 0xDFu))
            {
                /* 0xC0 and 0xC1 are excluded: they only encode overlongs. */
                n_continuation_bytes = 1u;
            }
            else if ((c & 0xF0u) == 0xE0u)
            {
                n_continuation_bytes = 2u;
                if (c == 0xE0u)
                    second_min = 0xA0u; /* Reject overlong 3-byte forms. */
                else if (c == 0xEDu)
                    second_max = 0x9Fu; /* Reject surrogates. */
            }
            else if ((c >= 0xF0u) && (c <= 0xF4u))
            {
                n_continuation_bytes = 3u;
                if (c == 0xF0u)
                    second_min = 0x90u; /* Reject overlong 4-byte forms. */
                else if (c == 0xF4u)
                    second_max = 0x8Fu; /* Reject values above U+10FFFF. */
            }
            else
            {
                /* Stray continuation byte, or a lead byte that is never
                 * valid (0xC0, 0xC1, 0xF5..0xFF). */
                return false;
            }

            /* The whole sequence must fit in the remaining bytes. */
            if ((n - i) <= n_continuation_bytes)
                return false;

            const ubyte second = data[i + 1u];
            if ((second < second_min) || (second > second_max))
                return false;
            if (!check_utf8_continuation_bytes(data + i + 2u, n_continuation_bytes - 1u))
                return false;

            i += n_continuation_bytes + 1u;
        }
        return true;
    }

    /**
     * Heuristically check whether a buffer looks like UTF-16 text.
     *
     * Counts the 16-bit units whose high-order byte is zero and accepts the
     * buffer when that count is at least n / 20.
     *
     * Params:
     *   data = Pointer to the bytes to examine.
     *   n    = Number of bytes at data; must be non-zero and even for the
     *          check to succeed.
     *   le   = true to treat the units as little-endian (high byte second),
     *          false for big-endian (high byte first).
     *
     * Returns: true if the buffer passes the heuristic.
     */
    static bool check_utf16(const ubyte * data, size_t n, bool le)
    {
        if ((n == 0u) || ((n & 1u) != 0u))
        {
            return false;
        }
        /* Offset of the high-order byte within each 2-byte unit. */
        const size_t high_offset = le ? 1u : 0u;
        size_t n_high_zeros = 0u;
        for (size_t i = 0u; i < n; i += 2u)
        {
            if (data[i + high_offset] == 0u)
            {
                n_high_zeros++;
            }
        }
        /* Same as (20 * n_high_zeros / n) != 0 for n > 0. */
        return (20u * n_high_zeros) >= n;
    }
}