// Reconstructed from git patch c4253078a9bbbbe745d70bea70a3834ba565b590
//   From:    Josh Holtrop
//   Date:    Thu, 11 Aug 2016 20:03:53 -0400
//   Subject: [PATCH] add Encoding class
// The patch creates src/core/Encoding.h and src/core/Encoding.cc. Both are
// reproduced below as a single translation unit, so the class declaration
// stands in for the patch's `#include "Encoding.h"`.

// ---- src/core/Encoding.h ----
#ifndef ENCODING_H
#define ENCODING_H

#include <stddef.h>
#include <stdint.h>

/**
 * Text-encoding detection helpers.
 */
class Encoding
{
public:
    /** Encodings that detect_encoding() can report. */
    enum Type
    {
        UTF_8,
        CP_1252,
    };

    /**
     * Classify a byte buffer as UTF-8 or CP-1252.
     *
     * @param buffer Bytes to inspect. Only the first @p length bytes are read.
     * @param length Number of bytes available in @p buffer.
     *
     * @return UTF_8 when every byte belongs to a well-formed UTF-8 sequence
     *         (this includes pure ASCII and the empty buffer); CP_1252 as
     *         soon as a malformed or truncated sequence is found.
     */
    static Type detect_encoding(const uint8_t * buffer, size_t length);
};

#endif

// ---- src/core/Encoding.cc ----

/**
 * Verify that the next @p continuation_bytes_to_check bytes all have the
 * UTF-8 continuation form 10xxxxxx.
 *
 * On success *buffer has been advanced past the checked bytes. On failure
 * *buffer is left pointing at the first offending byte.
 *
 * The caller must guarantee that this many bytes are readable.
 */
static inline bool
check_continuation_bytes(const uint8_t ** buffer,
                         uint8_t continuation_bytes_to_check)
{
    for (uint8_t i = 0; i < continuation_bytes_to_check; i++)
    {
        if ((**buffer & 0xC0u) != 0x80u)
            return false;
        (*buffer)++;
    }
    return true;
}

Encoding::Type Encoding::detect_encoding(const uint8_t * buffer, size_t length)
{
    const uint8_t * const end = buffer + length;

    while (buffer < end)
    {
        const uint8_t lead = *buffer++;

        /* Plain ASCII: nothing more to validate for this byte. */
        if (lead < 0x80u)
            continue;

        /*
         * Classify the lead byte. Besides the number of continuation bytes,
         * some lead bytes narrow the legal range of the FIRST continuation
         * byte; that is how overlong forms, UTF-16 surrogates and code
         * points above U+10FFFF are excluded.
         */
        uint8_t continuation_bytes_to_check;
        uint8_t first_min = 0x80u;
        uint8_t first_max = 0xBFu;

        if ((lead >= 0xC2u) && (lead <= 0xDFu))
        {
            /* 0xC0 and 0xC1 could only start overlong encodings. */
            continuation_bytes_to_check = 1u;
        }
        else if ((lead & 0xF0u) == 0xE0u)
        {
            continuation_bytes_to_check = 2u;
            if (lead == 0xE0u)
                first_min = 0xA0u;      /* below this is overlong */
            else if (lead == 0xEDu)
                first_max = 0x9Fu;      /* above this is U+D800..U+DFFF */
        }
        else if ((lead >= 0xF0u) && (lead <= 0xF4u))
        {
            continuation_bytes_to_check = 3u;
            if (lead == 0xF0u)
                first_min = 0x90u;      /* below this is overlong */
            else if (lead == 0xF4u)
                first_max = 0x8Fu;      /* above this exceeds U+10FFFF */
        }
        else
        {
            /*
             * Stray continuation byte, overlong lead (0xC0/0xC1), or a lead
             * byte that UTF-8 no longer defines (0xF5..0xFF, which covers
             * the obsolete 5- and 6-byte forms).
             */
            return CP_1252;
        }

        /* Sequence is cut off by the end of the buffer. */
        if (static_cast<size_t>(end - buffer) < continuation_bytes_to_check)
            return CP_1252;

        if ((*buffer < first_min) || (*buffer > first_max))
            return CP_1252;

        if (!check_continuation_bytes(&buffer, continuation_bytes_to_check))
            return CP_1252;
    }

    return UTF_8;
}