Add Encoding.encode() and Encoding.decode().

This commit is contained in:
Josh Holtrop 2020-11-27 20:51:40 -05:00
parent 575589e5a3
commit 926ad3ec0d

View File

@ -18,6 +18,162 @@ struct Encoding
this.type = type; this.type = type;
} }
/**
 * Decode a single value from the start of `data` according to this
 * encoding's `type`.
 *
 * Params:
 *   data         = Pointer to the encoded bytes. UTF8 reads the lead byte plus
 *                  up to five following bytes; UTF16_LE and UTF16_BE read
 *                  exactly two bytes.
 *   encoded_size = If not null, receives the number of bytes consumed. It is
 *                  set to 0 for CP1252 (not implemented yet) and for any
 *                  other type.
 *
 * Returns: The decoded value, or 0 when nothing was decoded.
 *
 * Notes:
 *   - UTF8 following bytes are not validated; only their low six bits are used.
 *   - A UTF8 lead byte that is >= 0x80 but matches none of the multi-byte
 *     patterns yields 0 with a consumed size of 1.
 *   - UTF16 input is read as one 16-bit unit; nothing here combines two units.
 */
uint decode(const(ubyte) * data, size_t * encoded_size)
{
    uint code_point = 0u;
    size_t consumed = 0u;

    switch (type)
    {
    case UTF8:
    {
        const ubyte lead = data[0];
        /* Number of bytes that follow the lead byte. */
        size_t trailing = 0u;

        if (lead < 0x80u)
        {
            /* Single-byte form: the byte is the value. */
            code_point = lead;
        }
        else if ((lead & 0xE0u) == 0xC0u)
        {
            code_point = lead & 0x1Fu;
            trailing = 1u;
        }
        else if ((lead & 0xF0u) == 0xE0u)
        {
            code_point = lead & 0x0Fu;
            trailing = 2u;
        }
        else if ((lead & 0xF8u) == 0xF0u)
        {
            code_point = lead & 0x07u;
            trailing = 3u;
        }
        else if ((lead & 0xFCu) == 0xF8u)
        {
            code_point = lead & 0x03u;
            trailing = 4u;
        }
        else if ((lead & 0xFEu) == 0xFCu)
        {
            code_point = lead & 0x01u;
            trailing = 5u;
        }

        consumed = trailing + 1u;

        /* Each following byte contributes its low six bits. */
        for (size_t offset = 1u; offset <= trailing; offset++)
        {
            code_point = (code_point << 6u) | (data[offset] & 0x3Fu);
        }
        break;
    }

    case CP1252:
        /* TODO */
        break;

    case UTF16_LE:
        /* Low byte first. */
        code_point = (cast(uint)data[1] << 8u) | data[0];
        consumed = 2u;
        break;

    case UTF16_BE:
        /* High byte first. */
        code_point = (cast(uint)data[0] << 8u) | data[1];
        consumed = 2u;
        break;

    default:
        break;
    }

    if (encoded_size !is null)
    {
        *encoded_size = consumed;
    }
    return code_point;
}
/**
 * Report how many bytes are needed to encode `code_point` in this encoding.
 *
 * Params:
 *   code_point = Value to be encoded.
 *
 * Returns: 1 to 6 for UTF8 depending on the magnitude of `code_point`,
 *   always 2 for UTF16_LE and UTF16_BE, and 0 for CP1252 (not implemented
 *   yet) or any other type.
 */
ubyte num_bytes_to_encode_code_point(uint code_point)
{
    switch (type)
    {
    case UTF16_LE:
    case UTF16_BE:
        return 2u;

    case UTF8:
        /* Handled below the switch. */
        break;

    case CP1252:
        /* TODO */
        return 0u;

    default:
        return 0u;
    }

    /* Largest value that fits in 1, 2, 3, 4 and 5 UTF8 bytes respectively. */
    static immutable uint[5] largest_for_count =
        [0x7Fu, 0x7FFu, 0xFFFFu, 0x1FFFFFu, 0x3FFFFFFu];

    ubyte count = 1u;
    foreach (limit; largest_for_count)
    {
        if (code_point <= limit)
        {
            return count;
        }
        count++;
    }

    /* Anything above the 5-byte limit is reported as 6 bytes. */
    return count;
}
/**
 * Encode `code_point` into `buffer` according to this encoding's `type`.
 *
 * Params:
 *   code_point = Value to encode.
 *   buffer     = Destination for the encoded bytes. It must have room for
 *                the number of bytes reported by
 *                num_bytes_to_encode_code_point() for the same value.
 *
 * Returns: The number of bytes written: 1 to 6 for UTF8, 2 for UTF16_LE and
 *   UTF16_BE, and 0 for CP1252 (not implemented yet) or any other type.
 *
 * Notes: For UTF16 only the low 16 bits of `code_point` are stored.
 */
ubyte encode(uint code_point, ubyte * buffer)
{
    switch (type)
    {
    case UTF8:
    {
        const ubyte total = num_bytes_to_encode_code_point(code_point);
        if (total == 1u)
        {
            buffer[0] = cast(ubyte)code_point;
        }
        else if (total > 1u)
        {
            const uint trailing = total - 1u;

            /* Lead byte: `total` one bits on top, then the highest payload bits. */
            const uint marker = 0xFFu << (8u - total);
            buffer[0] = cast(ubyte)(marker | (code_point >> (6u * trailing)));

            /* Fill the following bytes from last to first, six bits at a time. */
            uint remaining = code_point;
            for (uint pos = trailing; pos >= 1u; pos--)
            {
                buffer[pos] = cast(ubyte)(0x80u | (remaining & 0x3Fu));
                remaining >>= 6u;
            }
        }
        return total;
    }

    case CP1252:
        /* TODO */
        break;

    case UTF16_LE:
        /* Low byte first. */
        buffer[0] = cast(ubyte)(code_point & 0xFFu);
        buffer[1] = cast(ubyte)((code_point >> 8u) & 0xFFu);
        return 2u;

    case UTF16_BE:
        /* High byte first. */
        buffer[0] = cast(ubyte)((code_point >> 8u) & 0xFFu);
        buffer[1] = cast(ubyte)(code_point & 0xFFu);
        return 2u;

    default:
        break;
    }
    return 0u;
}
static Encoding detect_encoding(const ubyte * data, size_t n, size_t * bom_size) static Encoding detect_encoding(const ubyte * data, size_t n, size_t * bom_size)
{ {
*bom_size = 0u; *bom_size = 0u;