From 9b838e93a4c8897bd366944f9dbe4224ea48ffcf Mon Sep 17 00:00:00 2001 From: Josh Holtrop Date: Mon, 29 Mar 2010 15:05:57 -0400 Subject: [PATCH] initial content added with Unicode deserializer --- Makefile | 28 ++++++++++++++ imbecile.cc | 42 +++++++++++++++++++++ refptr/refptr.h | 99 +++++++++++++++++++++++++++++++++++++++++++++++++ serialize.cc | 52 ++++++++++++++++++++++++++ serialize.h | 13 +++++++ unicode.h | 8 ++++ 6 files changed, 242 insertions(+) create mode 100644 Makefile create mode 100644 imbecile.cc create mode 100644 refptr/refptr.h create mode 100644 serialize.cc create mode 100644 serialize.h create mode 100644 unicode.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..48dd1d7 --- /dev/null +++ b/Makefile @@ -0,0 +1,28 @@ + +TARGET := imbecile +CXXOBJS := $(patsubst %.cc,%.o,$(wildcard *.cc)) +CXXDEPS := $(CXXOBJS:.o=.dep) +CXXFLAGS := -O2 +DEPS := $(CXXDEPS) +OBJS := $(CXXOBJS) + +all: $(TARGET) + +$(TARGET): $(OBJS) + $(CXX) -o $@ $^ $(LDFLAGS) + +# Object file rules +%.o: %.cc + $(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $< + +# Make dependency files +%.dep: %.c + @set -e; rm -f $@; \ + $(CC) -MM $(CPPFLAGS) $< | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' > $@ + +%.dep: %.cc + @set -e; rm -f $@; \ + $(CXX) -MM $(CPPFLAGS) $< | sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' > $@ + +clean: + -rm -f $(TARGET) *.o *.dep diff --git a/imbecile.cc b/imbecile.cc new file mode 100644 index 0000000..720f2a6 --- /dev/null +++ b/imbecile.cc @@ -0,0 +1,42 @@ + +#include +#include +#include +#include +#include "refptr/refptr.h" +#include "serialize.h" +#include "unicode.h" +using namespace std; + +int main(int argc, char * argv[]) +{ /* Entry point: parses the --encoding long option (default UTF-8), opens an ifstream, calls deserialize() on it, and returns 1 after printing an error to cerr when the result is null; otherwise returns 0. */ + int longind = 1; + int opt; + const char * encoding = "UTF-8"; + + static struct option longopts[] = { + /* name, has_arg, flag, val */ + { "encoding", required_argument, NULL, 'e' }, + { NULL, 0, NULL, 0 } + }; + + while ((opt = getopt_long(argc, argv, "", longopts, &longind)) != -1) + { + switch (opt) + { + case 'e': /* 
encoding */ + encoding = optarg; + break; + } + } + + ifstream ifs(optarg); /* NOTE(review): optarg is read here after the getopt_long() loop has finished; the input file name is more likely argv[optind] - confirm. */ + refptr< vector > ucs_str = deserialize(encoding, ifs); + if (ucs_str.isNull()) + { + cerr << "Error deserializing input file." << endl; + return 1; + } + + return 0; +} diff --git a/refptr/refptr.h b/refptr/refptr.h new file mode 100644 index 0000000..7335af5 --- /dev/null +++ b/refptr/refptr.h @@ -0,0 +1,99 @@ + +#ifndef REFPTR_H +#define REFPTR_H REFPTR_H + +/* Author: Josh Holtrop + * Purpose: Provide a reference-counting pointer-like first order + * C++ object that will free the object it is pointing to when + * all references to it have been destroyed. + * This implementation does not solve the circular reference problem. + * I was not concerned with that when developing this class. + */ +#include /* NULL */ + +template +class refptr +{ + public: + refptr(); + refptr(T * ptr); + refptr(const refptr & orig); + refptr & operator=(const refptr & orig); + refptr & operator=(T * ptr); + ~refptr(); + T & operator*() const { return *m_ptr; } + T * operator->() const { return m_ptr; } + bool isNull() const { return m_ptr == NULL; } + + private: + void cloneFrom(const refptr & orig); + void destroy(); + + T * m_ptr; + int * m_refCount; +}; + +template refptr::refptr() +{ + m_ptr = NULL; + m_refCount = NULL; +} + +template refptr::refptr(T * ptr) +{ /* Wraps ptr and allocates a fresh count set to 1; the count is allocated even when ptr is NULL. */ + m_ptr = ptr; + m_refCount = new int; + *m_refCount = 1; +} + +template refptr::refptr(const refptr & orig) +{ + cloneFrom(orig); +} + +template refptr & refptr::operator=(const refptr & orig) +{ + destroy(); /* NOTE(review): destroy() runs before cloneFrom(orig) and there is no self-assignment check; if orig is *this and the count is 1, m_ptr and m_refCount are deleted and then copied back - confirm callers never self-assign. */ + cloneFrom(orig); + return *this; +} + +template refptr & refptr::operator=(T * ptr) +{ + destroy(); + m_ptr = ptr; + m_refCount = new int; + *m_refCount = 1; + return *this; +} + +template void refptr::cloneFrom(const refptr & orig) +{ + this->m_ptr = orig.m_ptr; + this->m_refCount = orig.m_refCount; + if (m_refCount != NULL) + (*m_refCount)++; +} + +template refptr::~refptr() +{ + destroy(); +} + +template void refptr::destroy() +{ /* Releases this reference: when the count is <= 1 both the pointee and the count are deleted, otherwise the count is decremented. m_ptr and m_refCount themselves are left unchanged afterwards. */ + if 
(m_refCount != NULL) + { + if (*m_refCount <= 1) + { + delete m_ptr; + delete m_refCount; + } + else + { + (*m_refCount)--; + } + } +} + +#endif diff --git a/serialize.cc b/serialize.cc new file mode 100644 index 0000000..25e24b5 --- /dev/null +++ b/serialize.cc @@ -0,0 +1,52 @@ + +#include "serialize.h" +#include +using namespace std; + +refptr< vector > deserialize(const char * encoding, istream & in) +{ /* Reads in in chunks of at most buf_size (200) bytes, passes them to iconv(), and appends the resulting unichar_t values to a vector allocated with new and returned through a refptr. If iconv_open() fails, prints a message to cerr and returns NULL, so isNull() is true on the result. */ + const int buf_size = 200; + int num_read; + char inbuf[buf_size]; + char * inbuf_ptr = (char *) &inbuf[0]; + unichar_t outbuf[buf_size]; + char * outbuf_ptr; + size_t bytes_converted, inbytesleft = 0, outbytesleft; + refptr< vector > ucs = new vector(); + + iconv_t cd = iconv_open(encoding, "UTF-32"); /* NOTE(review): iconv_open() is declared as iconv_open(tocode, fromcode); as written, encoding is the target and UTF-32 is the source, which looks reversed for decoding input into unichar_t - confirm. */ + if (cd == (iconv_t) -1) + { + cerr << "iconv_open() error" << endl; + return NULL; + } + + for (;;) + { + in.read(&inbuf[0], sizeof(inbuf) - inbytesleft); /* NOTE(review): the read always starts at inbuf[0], and inbytesleft (initialised to 0) is never increased by num_read, so iconv() is only told about leftover bytes - confirm whether the read should append after the leftovers and add num_read to inbytesleft. */ + num_read = in.gcount(); + if (num_read <= 0) + break; + outbuf_ptr = (char *) &outbuf[0]; + outbytesleft = sizeof(outbuf); + bytes_converted = iconv(cd, &inbuf_ptr, &inbytesleft, + &outbuf_ptr, &outbytesleft); /* NOTE(review): iconv() returns the number of non-reversible conversions, or (size_t) -1 on error, not a byte count; the bytes written would be sizeof(outbuf) - outbytesleft - confirm. */ + if (inbytesleft > 0) + { + memmove(&inbuf[0], inbuf_ptr, inbytesleft); + inbuf_ptr = (char *) &inbuf[0]; + } + for (int i = 0; i < (bytes_converted / sizeof(outbuf[0])); i++) + { + ucs->push_back(outbuf[i]); + } + if (bytes_converted & 0x3) + cerr << "Warning: bytes_converted = " << bytes_converted << endl; + if (in.eof()) + break; + } + + iconv_close(cd); + return ucs; +} + diff --git a/serialize.h b/serialize.h new file mode 100644 index 0000000..d1c6d73 --- /dev/null +++ b/serialize.h @@ -0,0 +1,13 @@ + +#ifndef SERIALIZE_H /* NOTE(review): no #define SERIALIZE_H follows in this header, so the guard never takes effect - confirm. */ + +#include +#include +#include +#include "refptr/refptr.h" +#include "unicode.h" +using namespace std; + +refptr< vector > deserialize(const char * encoding, istream & in); + +#endif diff --git a/unicode.h b/unicode.h new file mode 100644 index 0000000..ba93a1c --- /dev/null +++ b/unicode.h @@ -0,0 +1,8 @@ + +#ifndef UNICODE_H /* NOTE(review): no #define UNICODE_H follows in this header, so the guard never takes effect - confirm. */ + +#include + +typedef uint32_t unichar_t; + 
+#endif