From f659c142427b1343b4c512c9ef1d8d3f2d045ff5 Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Mon, 15 Aug 2016 22:14:47 -0400 Subject: [PATCH] add Encoding::decode() --- src/core/Buffer.cc | 2 ++ src/core/Encoding.cc | 60 +++++++++++++++++++++++++++++++++++++++++++ src/core/Encoding.h | 1 + src/core/PieceTable.h | 5 ++-- 4 files changed, 66 insertions(+), 2 deletions(-) diff --git a/src/core/Buffer.cc b/src/core/Buffer.cc index 46053a7..143ee0c 100644 --- a/src/core/Buffer.cc +++ b/src/core/Buffer.cc @@ -13,6 +13,7 @@ Buffer::Buffer() m_eol_at_eof = true; m_line_endings = LineEndings::LF; m_encoding = Encoding::UTF_8; + piece_table->encoding = m_encoding; } Buffer::~Buffer() @@ -65,6 +66,7 @@ bool Buffer::load_from_file(const char * filename) m_eol_at_eof = text_loader.get_eol_at_eof(); m_line_endings = text_loader.get_line_endings(); m_encoding = text_loader.get_encoding(); + piece_table->encoding = m_encoding; return true; } diff --git a/src/core/Encoding.cc b/src/core/Encoding.cc index 774d2b4..b026861 100644 --- a/src/core/Encoding.cc +++ b/src/core/Encoding.cc @@ -96,3 +96,63 @@ const uint8_t * Encoding::beginning_of_code_point(Type type, const uint8_t * enc return nullptr; } + /* Decode the code point starting at encoded. UTF_8: a lead byte below 0x80 is returned as is; otherwise the lead-byte pattern selects 1 to 5 following bytes, each adding its low 6 bits. CP_1252: the byte is returned unmapped. Returns 0 for an unmatched UTF_8 lead byte or any other type. No length is passed, so following bytes are read without a bounds or 10xxxxxx check. */ +uint32_t Encoding::decode(Type type, const uint8_t * encoded) +{ + switch (type) + { + case UTF_8: + { + const uint8_t c = *encoded; + uint8_t following_bytes = 0u; + uint32_t v; + if ((c & 0x80u) == 0u) + { + return c; + } + else if ((c & 0xE0u) == 0xC0u) + { + v = c & 0x1Fu; + following_bytes = 1u; + } + else if ((c & 0xF0u) == 0xE0u) + { + v = c & 0x0Fu; + following_bytes = 2u; + } + else if ((c & 0xF8u) == 0xF0u) + { + v = c & 0x07u; + following_bytes = 3u; + } + else if ((c & 0xFCu) == 0xF8u) /* NOTE(review): 5- and 6-byte lead bytes are accepted here although RFC 3629 caps UTF-8 at 4 bytes - confirm this is intended */ + { + v = c & 0x03u; + following_bytes = 4u; + } + else if ((c & 0xFEu) == 0xFCu) + { + v = c & 0x01u; + following_bytes = 5u; + } + else /* lead byte matches no pattern above: 0x80..0xBF, 0xFE or 0xFF */ + { + return 0u; + } + while (following_bytes-- > 0u) /* shift in the low 6 bits of each following byte */ + { + encoded++; + v <<= 6u; + v |= *encoded & 0x3Fu; + } + return v; + } + break; /* unreachable: every path in the UTF_8 case returns */ + case CP_1252: 
+ /* TODO: map byte to code point; for now the byte value is returned unchanged */ + return *encoded; + break; + } + + return 0u; /* reached only when type is neither UTF_8 nor CP_1252 */ +} diff --git a/src/core/Encoding.h b/src/core/Encoding.h index 5be2ca4..3b5761a 100644 --- a/src/core/Encoding.h +++ b/src/core/Encoding.h @@ -16,6 +16,7 @@ public: static Type detect_encoding(const uint8_t * buffer, size_t length); static uint8_t num_bytes_in_code_point(Type type, const uint8_t * encoded); static const uint8_t * beginning_of_code_point(Type type, const uint8_t * encoded); + static uint32_t decode(Type type, const uint8_t * encoded); }; #endif diff --git a/src/core/PieceTable.h b/src/core/PieceTable.h index be78923..f3c6247 100644 --- a/src/core/PieceTable.h +++ b/src/core/PieceTable.h @@ -6,6 +6,7 @@ #include #include #include +#include "Encoding.h" class PieceTable { @@ -64,8 +65,7 @@ public: /** Get the character pointed to by the cursor. */ uint32_t operator*() const { - /* TODO: Use Encoding */ - return piece->start[offset]; + return Encoding::decode(piece_table->encoding, &piece->start[offset]); } /** @@ -92,6 +92,7 @@ public: Piece * start_piece; Piece * end_piece; uint8_t tabstop; + Encoding::Type encoding; PieceTable(const uint8_t * file_buffer, unsigned long file_buffer_size);