Render multithreaded locally by default

2026-06-18 08:30:20 -04:00 · 2026-06-18 08:30:20 -04:00 · 36e12d1e90
commit 36e12d1e90
parent aad02e522d
2 changed files with 117 additions and 33 deletions
--- a/src/main/fart.cc
+++ b/src/main/fart.cc
@ -9,6 +9,10 @@
 #include <iostream>
 #include <string>
 #include <map>
 #include <vector>
 #include <pthread.h>
 #include <atomic>
 #include "Scene.h"
 #include "distrib/distrib.h"
@ -16,6 +20,106 @@
 using namespace std;
 /*
 * Number of consecutive pixel indices a worker thread claims per dispatch.
 * render_thread() advances the shared cursor by this amount on every
 * fetch_add and clamps the final chunk to the end of the image.
 *
 * Author's rationale: pulling work in small chunks (rather than whole rows)
 * keeps the load balanced even when individual rows vary wildly in render
 * cost, while keeping the claimed pixels contiguous so cache locality and
 * dispatch overhead stay close to the row-based baseline.  64 is reported as
 * the benchmarked sweet spot.
 * NOTE(review): the benchmark data is not part of this file -- re-measure
 * before changing the value.
 */
 #define RENDER_CHUNK_SIZE 64
 /*
 * Bookkeeping shared by every worker of the local multithreaded render.
 * A single instance is handed (by pointer) to each worker thread.  The plain
 * members are filled in before the workers start; the three atomic counters
 * are the only members the workers modify.
 */
 struct render_thread_state_t
 {
    /* scene whose renderPixel() is invoked for every pixel */
    Scene * scene;

    /* output buffer; pixel p is written starting at data[3 * p] */
    unsigned char * data;

    /* image width, used to turn a linear pixel index into (x, y) */
    int width;

    /* number of pixels to render; workers stop once the cursor reaches it */
    int total_pixels;

    /* dispatch cursor: index of the next pixel not yet claimed by a worker */
    std::atomic<int> next_pixel;

    /* running count of pixels already rendered, feeds the progress display */
    std::atomic<int> pixels_done;

    /* most recent progress value (in tenths of a percent) that was printed */
    std::atomic<int> last_permille;
 };
 static void * render_thread(void * varg)
 {
    render_thread_state_t * state = (render_thread_state_t *) varg;
    const int width = state->width;
    const int total_pixels = state->total_pixels;
    for (;;)
    {
        /* atomically claim the next chunk of contiguous pixels */
        int start = state->next_pixel.fetch_add(RENDER_CHUNK_SIZE,
                                                std::memory_order_relaxed);
        if (start >= total_pixels)
            break;
        int end = start + RENDER_CHUNK_SIZE;
        if (end > total_pixels)
            end = total_pixels;
        for (int pixel = start; pixel < end; pixel++)
        {
            int y = pixel / width;
            int x = pixel % width;
            state->scene->renderPixel(x, y, &state->data[3 * pixel]);
        }
        /* report progress without locking: of the threads that advance the
         * tenth-of-a-percent counter, only the one that wins the update prints,
         * so the display stays monotonic and free of interleaved output */
        int done = state->pixels_done.fetch_add(end - start,
                                                std::memory_order_relaxed)
                   + (end - start);
        int permille = (int) (1000L * done / total_pixels);
        int prev = state->last_permille.load(std::memory_order_relaxed);
        while (permille > prev)
        {
            if (state->last_permille.compare_exchange_weak(
                    prev, permille, std::memory_order_relaxed))
            {
                printf("\e[8D%2.1f%%", permille / 10.0);
                fflush(stdout);
                break;
            }
            /* prev reloaded by compare_exchange_weak; re-test permille > prev */
        }
    }
    return NULL;
 }
 /*
 * Render the whole image on the local machine using worker threads.
 *
 * scene   scene to render; every worker calls its renderPixel()
 * data    output buffer, written at data[3 * pixel] for each pixel index
 * width   image width in pixels
 * height  image height in pixels
 *
 * Tries to start one worker per online processor as reported by sysconf()
 * (at least one).  All workers share a single render_thread_state_t and pull
 * work from it until the image is complete.  Returns after every started
 * worker has been joined, then moves the cursor back over the progress
 * display.
 */
 static void renderThreaded(Scene & scene,
                            unsigned char * data,
                            int width,
                            int height)
 {
    int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
    if (num_threads < 1)
        num_threads = 1;

    render_thread_state_t state;
    state.scene = &scene;
    state.data = data;
    state.width = width;
    state.total_pixels = width * height;
    state.next_pixel.store(0, std::memory_order_relaxed);
    state.pixels_done.store(0, std::memory_order_relaxed);
    state.last_permille.store(0, std::memory_order_relaxed);

    /* Only thread IDs that pthread_create() actually produced are recorded:
     * on failure the ID is undefined and must not be passed to
     * pthread_join(). */
    vector<pthread_t> threads;
    threads.reserve(num_threads);
    for (int i = 0; i < num_threads; i++)
    {
        pthread_t tid;
        if (pthread_create(&tid, NULL, render_thread, &state) != 0)
            break;  /* out of resources: the workers already running share
                     * the same cursor and will finish the image */
        threads.push_back(tid);
    }

    /* not a single worker could be started: render on the calling thread
     * rather than silently returning an unrendered image */
    if (threads.empty())
        render_thread(&state);

    for (size_t i = 0; i < threads.size(); i++)
        pthread_join(threads[i], NULL);

    printf("\e[8D");
 }
 void usage(const char * progname)
 {
    cout << "Usage: " << progname << " [options] <scene-file>" << endl;
@ -274,28 +378,8 @@ int main(int argc, char * argv[])
    }
    else
    {
-        int total_pixels = height * width;
+        /* local multithreaded render using all available cores */
-        int total_pixels_1000 = total_pixels / 1000;
+        renderThreaded(scene, data, width, height);
        if (total_pixels_1000 < 1)
            total_pixels_1000 = 1;
        int pixel_num = 0;
        /* "sequential" version */
        for (int i = 0; i < height; i++)
        {
            for (int j = 0; j < width; j++)
            {
                int pixel = i * width + j;
                scene.renderPixel(j, i, &data[3 * pixel]);
                pixel_num++;
                if (pixel_num % total_pixels_1000 == 0)
                {
                    double pct = 100.0 * pixel_num / (double) total_pixels;
                    printf("\e[8D%2.1f%%", pct);
                    fflush(stdout);
                }
            }
        }
        printf("\e[8D");
    }
    gettimeofday(&after, NULL);         /* stop timing */
--- a/src/util/refptr.h
+++ b/src/util/refptr.h
@ -3,6 +3,7 @@
 #define REFPTR_H REFPTR_H
 #include <stdlib.h>             /* NULL */
 #include <atomic>               /* std::atomic */
 template <typename T>
 class refptr
@ -25,7 +26,10 @@ class refptr
        void destroy();
        T * m_ptr;
-        int * m_refCount;
+        /* reference count is atomic so that refptr copies may be made
         * concurrently from multiple threads (e.g. the multithreaded
         * renderer) without corrupting the count */
        std::atomic<int> * m_refCount;
 };
 template <typename T> refptr<T>::refptr()
@ -37,8 +41,7 @@ template <typename T> refptr<T>::refptr()
 template <typename T> refptr<T>::refptr(T * ptr)
 {
    m_ptr = ptr;
-    m_refCount = new int;
+    m_refCount = new std::atomic<int>(1);
    *m_refCount = 1;
 }
 template <typename T> refptr<T>::refptr(const refptr<T> & orig)
@ -57,8 +60,7 @@ template <typename T> refptr<T> & refptr<T>::operator=(T * ptr)
 {
    destroy();
    m_ptr = ptr;
-    m_refCount = new int;
+    m_refCount = new std::atomic<int>(1);
    *m_refCount = 1;
    return *this;
 }
@ -67,7 +69,7 @@ template <typename T> void refptr<T>::cloneFrom(const refptr<T> & orig)
    this->m_ptr = orig.m_ptr;
    this->m_refCount = orig.m_refCount;
    if (m_refCount != NULL)
-        (*m_refCount)++;
+        m_refCount->fetch_add(1, std::memory_order_relaxed);
 }
 template <typename T> refptr<T>::~refptr()
@ -79,15 +81,13 @@ template <typename T> void refptr<T>::destroy()
 {
    if (m_refCount != NULL)
    {
-        if (*m_refCount <= 1)
+        /* fetch_sub returns the value prior to the decrement; if it was 1
         * then this was the last reference and we own the cleanup */
        if (m_refCount->fetch_sub(1, std::memory_order_acq_rel) == 1)
        {
            delete m_ptr;
            delete m_refCount;
        }
        else
        {
            (*m_refCount)--;
        }
    }
 }