Render multithreaded locally by default

This commit is contained in:
Josh Holtrop 2026-06-18 08:30:20 -04:00
parent aad02e522d
commit 36e12d1e90
2 changed files with 117 additions and 33 deletions

View File

@ -9,6 +9,10 @@
#include <iostream>
#include <string>
#include <map>
#include <vector>
#include <pthread.h>
#include <atomic>
#include "Scene.h"
#include "distrib/distrib.h"
@ -16,6 +20,106 @@
using namespace std;
/*
 * Number of contiguous pixel indices each worker thread claims per dispatch.
 *
 * Workers claim ranges of the flattened, row-major pixel index (a pixel's
 * coordinates are recovered as index / width and index % width), so a chunk
 * may straddle a row boundary and the last chunk of an image may be shorter
 * than RENDER_CHUNK_SIZE.
 *
 * Author's rationale: pulling work in small chunks (rather than whole rows)
 * keeps the load balanced even when individual rows vary wildly in render
 * cost, while keeping the claimed pixels contiguous so cache locality and
 * dispatch overhead stay close to the row-based baseline. The value 64 was
 * reported as the sweet spot in the author's benchmarks (benchmark data is
 * not shown here).
 */
#define RENDER_CHUNK_SIZE 64
/*
 * Bookkeeping shared by every worker of one local multithreaded render.
 * A single instance is passed to all worker threads. Work is handed out
 * through the atomic next_pixel cursor rather than a lock, and the two
 * remaining atomics exist only to drive the progress display.
 */
struct render_thread_state_t
{
    /* --- inputs, set once before the workers start --- */
    Scene * scene;          /* scene whose renderPixel() the workers call */
    unsigned char * data;   /* output buffer; pixel i's slot starts at data[3 * i] */
    int width;              /* row length, used to split a pixel index into (x, y) */
    int total_pixels;       /* one past the last pixel index to render */

    /* --- shared counters, updated concurrently by the workers --- */
    std::atomic<int> next_pixel;    /* dispatch cursor: first pixel index not yet claimed */
    std::atomic<int> pixels_done;   /* number of pixels finished so far */
    std::atomic<int> last_permille; /* highest progress value (units of 0.1%) claimed for printing */
};
static void * render_thread(void * varg)
{
render_thread_state_t * state = (render_thread_state_t *) varg;
const int width = state->width;
const int total_pixels = state->total_pixels;
for (;;)
{
/* atomically claim the next chunk of contiguous pixels */
int start = state->next_pixel.fetch_add(RENDER_CHUNK_SIZE,
std::memory_order_relaxed);
if (start >= total_pixels)
break;
int end = start + RENDER_CHUNK_SIZE;
if (end > total_pixels)
end = total_pixels;
for (int pixel = start; pixel < end; pixel++)
{
int y = pixel / width;
int x = pixel % width;
state->scene->renderPixel(x, y, &state->data[3 * pixel]);
}
/* report progress without locking: of the threads that advance the
* tenth-of-a-percent counter, only the one that wins the update prints,
* so the display stays monotonic and free of interleaved output */
int done = state->pixels_done.fetch_add(end - start,
std::memory_order_relaxed)
+ (end - start);
int permille = (int) (1000L * done / total_pixels);
int prev = state->last_permille.load(std::memory_order_relaxed);
while (permille > prev)
{
if (state->last_permille.compare_exchange_weak(
prev, permille, std::memory_order_relaxed))
{
printf("\e[8D%2.1f%%", permille / 10.0);
fflush(stdout);
break;
}
/* prev reloaded by compare_exchange_weak; re-test permille > prev */
}
}
return NULL;
}
/*
 * Render the whole image on the local machine using worker threads.
 *
 * scene   scene to render; its renderPixel() is called from several threads
 *         concurrently
 * data    output buffer, indexed by the workers at 3 * (y * width + x)
 * width   image width in pixels
 * height  image height in pixels
 *
 * One worker is started per online processor as reported by sysconf(), with
 * a minimum of one. The function returns only after every started worker has
 * been joined. If the system refuses to create some threads, the render
 * proceeds with the ones that did start; if none could be started, the
 * calling thread performs the render itself.
 */
static void renderThreaded(Scene & scene,
                           unsigned char * data,
                           int width,
                           int height)
{
    /* sysconf() returns -1 on error; fall back to a single worker */
    int num_threads = static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));
    if (num_threads < 1)
        num_threads = 1;

    render_thread_state_t state;
    state.scene = &scene;
    state.data = data;
    state.width = width;
    state.total_pixels = width * height;
    state.next_pixel.store(0, std::memory_order_relaxed);
    state.pixels_done.store(0, std::memory_order_relaxed);
    state.last_permille.store(0, std::memory_order_relaxed);

    /* Keep only handles of threads that were actually created: after a
     * failed pthread_create() the pthread_t is not valid and must not be
     * passed to pthread_join(). */
    std::vector<pthread_t> threads;
    threads.reserve(num_threads);
    for (int i = 0; i < num_threads; i++)
    {
        pthread_t tid;
        if (pthread_create(&tid, NULL, render_thread, &state) != 0)
            break; /* resource limit hit: continue with the workers we have */
        threads.push_back(tid);
    }

    /* no worker could be started: do the work on the calling thread */
    if (threads.empty())
        render_thread(&state);

    for (pthread_t tid : threads)
        pthread_join(tid, NULL);

    /* move the cursor back over the progress text */
    printf("\e[8D");
}
void usage(const char * progname)
{
cout << "Usage: " << progname << " [options] <scene-file>" << endl;
@ -274,28 +378,8 @@ int main(int argc, char * argv[])
}
else
{
int total_pixels = height * width;
int total_pixels_1000 = total_pixels / 1000;
if (total_pixels_1000 < 1)
total_pixels_1000 = 1;
int pixel_num = 0;
/* "sequential" version */
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
int pixel = i * width + j;
scene.renderPixel(j, i, &data[3 * pixel]);
pixel_num++;
if (pixel_num % total_pixels_1000 == 0)
{
double pct = 100.0 * pixel_num / (double) total_pixels;
printf("\e[8D%2.1f%%", pct);
fflush(stdout);
}
}
}
printf("\e[8D");
/* local multithreaded render using all available cores */
renderThreaded(scene, data, width, height);
}
gettimeofday(&after, NULL); /* stop timing */

View File

@ -3,6 +3,7 @@
#define REFPTR_H REFPTR_H
#include <stdlib.h> /* NULL */
#include <atomic> /* std::atomic */
template <typename T>
class refptr
@ -25,7 +26,10 @@ class refptr
void destroy();
T * m_ptr;
int * m_refCount;
/* reference count is atomic so that refptr copies may be made
* concurrently from multiple threads (e.g. the multithreaded
* renderer) without corrupting the count */
std::atomic<int> * m_refCount;
};
template <typename T> refptr<T>::refptr()
@ -37,8 +41,7 @@ template <typename T> refptr<T>::refptr()
template <typename T> refptr<T>::refptr(T * ptr)
{
m_ptr = ptr;
m_refCount = new int;
*m_refCount = 1;
m_refCount = new std::atomic<int>(1);
}
template <typename T> refptr<T>::refptr(const refptr<T> & orig)
@ -57,8 +60,7 @@ template <typename T> refptr<T> & refptr<T>::operator=(T * ptr)
{
destroy();
m_ptr = ptr;
m_refCount = new int;
*m_refCount = 1;
m_refCount = new std::atomic<int>(1);
return *this;
}
@ -67,7 +69,7 @@ template <typename T> void refptr<T>::cloneFrom(const refptr<T> & orig)
this->m_ptr = orig.m_ptr;
this->m_refCount = orig.m_refCount;
if (m_refCount != NULL)
(*m_refCount)++;
m_refCount->fetch_add(1, std::memory_order_relaxed);
}
template <typename T> refptr<T>::~refptr()
@ -79,15 +81,13 @@ template <typename T> void refptr<T>::destroy()
{
if (m_refCount != NULL)
{
if (*m_refCount <= 1)
/* fetch_sub returns the value prior to the decrement; if it was 1
* then this was the last reference and we own the cleanup */
if (m_refCount->fetch_sub(1, std::memory_order_acq_rel) == 1)
{
delete m_ptr;
delete m_refCount;
}
else
{
(*m_refCount)--;
}
}
}