diff --git a/src/main/fart.cc b/src/main/fart.cc index 4b1853b..3ceb2f2 100644 --- a/src/main/fart.cc +++ b/src/main/fart.cc @@ -9,6 +9,10 @@ #include #include #include +#include + +#include +#include #include "Scene.h" #include "distrib/distrib.h" @@ -16,6 +20,106 @@ using namespace std; +/* + * Number of contiguous pixels each worker thread claims at a time. Pulling + * work in small chunks (rather than whole rows) keeps the load balanced even + * when individual rows vary wildly in render cost, while keeping the claimed + * pixels contiguous so cache locality and dispatch overhead stay close to the + * row-based baseline. See benchmarks: 64 was the sweet spot. + */ +#define RENDER_CHUNK_SIZE 64 + +/* + * State shared by the local multithreaded renderer. Worker threads pull + * fixed-size chunks of pixels off a lock-free atomic cursor until the image + * is complete. + */ +typedef struct +{ + Scene * scene; + unsigned char * data; + int width; + int total_pixels; + std::atomic next_pixel; /* dispatch cursor: next unclaimed pixel */ + std::atomic pixels_done; /* pixels finished, for progress */ + std::atomic last_permille; /* last tenth-of-a-percent printed */ +} render_thread_state_t; + +static void * render_thread(void * varg) +{ + render_thread_state_t * state = (render_thread_state_t *) varg; + const int width = state->width; + const int total_pixels = state->total_pixels; + + for (;;) + { + /* atomically claim the next chunk of contiguous pixels */ + int start = state->next_pixel.fetch_add(RENDER_CHUNK_SIZE, + std::memory_order_relaxed); + if (start >= total_pixels) + break; + int end = start + RENDER_CHUNK_SIZE; + if (end > total_pixels) + end = total_pixels; + + for (int pixel = start; pixel < end; pixel++) + { + int y = pixel / width; + int x = pixel % width; + state->scene->renderPixel(x, y, &state->data[3 * pixel]); + } + + /* report progress without locking: of the threads that advance the + * tenth-of-a-percent counter, only the one that wins the update prints, + * so the display stays monotonic and free of interleaved output */ + int done = state->pixels_done.fetch_add(end - start, + std::memory_order_relaxed) + + (end - start); + int permille = (int) (1000L * done / total_pixels); + int prev = state->last_permille.load(std::memory_order_relaxed); + while (permille > prev) + { + if (state->last_permille.compare_exchange_weak( + prev, permille, std::memory_order_relaxed)) + { + printf("\e[8D%2.1f%%", permille / 10.0); + fflush(stdout); + break; + } + /* prev reloaded by compare_exchange_weak; re-test permille > prev */ + } + } + + return NULL; +} + +static void renderThreaded(Scene & scene, + unsigned char * data, + int width, + int height) +{ + int num_threads = sysconf(_SC_NPROCESSORS_ONLN); + if (num_threads < 1) + num_threads = 1; + + render_thread_state_t state; + state.scene = &scene; + state.data = data; + state.width = width; + state.total_pixels = width * height; + state.next_pixel.store(0, std::memory_order_relaxed); + state.pixels_done.store(0, std::memory_order_relaxed); + state.last_permille.store(0, std::memory_order_relaxed); + + vector threads(num_threads); + for (int i = 0; i < num_threads; i++) + pthread_create(&threads[i], NULL, render_thread, &state); + for (int i = 0; i < num_threads; i++) + pthread_join(threads[i], NULL); + + printf("\e[8D"); +} + void usage(const char * progname) { cout << "Usage: " << progname << " [options] " << endl; @@ -274,28 +378,8 @@ int main(int argc, char * argv[]) } else { - int total_pixels = height * width; - int total_pixels_1000 = total_pixels / 1000; - if (total_pixels_1000 < 1) - total_pixels_1000 = 1; - int pixel_num = 0; - /* "sequential" version */ - for (int i = 0; i < height; i++) - { - for (int j = 0; j < width; j++) - { - int pixel = i * width + j; - scene.renderPixel(j, i, &data[3 * pixel]); - pixel_num++; - if (pixel_num % total_pixels_1000 == 0) - { - double pct = 100.0 * pixel_num / (double) total_pixels; - printf("\e[8D%2.1f%%", pct); - fflush(stdout); - } - } - } - printf("\e[8D"); + /* local multithreaded render using all available cores */ + renderThreaded(scene, data, width, height); } gettimeofday(&after, NULL); /* stop timing */ diff --git a/src/util/refptr.h b/src/util/refptr.h index 7455a9e..35bf369 100644 --- a/src/util/refptr.h +++ b/src/util/refptr.h @@ -3,6 +3,7 @@ #define REFPTR_H REFPTR_H #include /* NULL */ +#include /* std::atomic */ template class refptr @@ -25,7 +26,10 @@ class refptr void destroy(); T * m_ptr; - int * m_refCount; + /* reference count is atomic so that refptr copies may be made + * concurrently from multiple threads (e.g. the multithreaded + * renderer) without corrupting the count */ + std::atomic * m_refCount; }; template refptr::refptr() @@ -37,8 +41,7 @@ template refptr::refptr() template refptr::refptr(T * ptr) { m_ptr = ptr; - m_refCount = new int; - *m_refCount = 1; + m_refCount = new std::atomic(1); } template refptr::refptr(const refptr & orig) @@ -57,8 +60,7 @@ template refptr & refptr::operator=(T * ptr) { destroy(); m_ptr = ptr; - m_refCount = new int; - *m_refCount = 1; + m_refCount = new std::atomic(1); return *this; } @@ -67,7 +69,7 @@ template void refptr::cloneFrom(const refptr & orig) this->m_ptr = orig.m_ptr; this->m_refCount = orig.m_refCount; if (m_refCount != NULL) - (*m_refCount)++; + m_refCount->fetch_add(1, std::memory_order_relaxed); } template refptr::~refptr() @@ -79,15 +81,13 @@ template void refptr::destroy() { if (m_refCount != NULL) { - if (*m_refCount <= 1) + /* fetch_sub returns the value prior to the decrement; if it was 1 + * then this was the last reference and we own the cleanup */ + if (m_refCount->fetch_sub(1, std::memory_order_acq_rel) == 1) { delete m_ptr; delete m_refCount; } - else - { - (*m_refCount)--; - } } }