diff --git a/cs677/pa4/Makefile b/cs677/pa4/Makefile
new file mode 100644
index 0000000..d5eb71d
--- /dev/null
+++ b/cs677/pa4/Makefile
@@ -0,0 +1,11 @@
+
+FILE := maximum-parsimony
+TARGET := $(FILE)
+
+all: $(TARGET)
+
+$(TARGET): $(FILE).cc
+	mpiCC -o $@ $<
+
+clean:
+	-rm -f *.o *~ $(TARGET)
diff --git a/cs677/pa4/maximum-parsimony.cc b/cs677/pa4/maximum-parsimony.cc
new file mode 100644
index 0000000..cf86d4e
--- /dev/null
+++ b/cs677/pa4/maximum-parsimony.cc
@@ -0,0 +1,147 @@
+
+#include <mpi.h>
+#include <iostream>
+#include <cstdlib>      /* assumed; the fourth header name was lost */
+#include <unistd.h>     /* usleep() */
+using namespace std;
+
+/*
+ * taskAllocate() divides a set of total_tasks tasks among
+ * total_workers workers, as evenly as possible.
+ * Parameters:
+ *   total_tasks   : IN  : the total number of tasks to divide up
+ *   total_workers : IN  : the total number of workers to allocate tasks to (>0)
+ *   this_id       : IN  : the id (0-based) of the worker calling us for work
+ *   first_task_id : OUT : the id (0-based) of the first task for this worker
+ *   num           : OUT : the number of tasks assigned to this worker
+ */
+void taskAllocate(int total_tasks, int total_workers, int this_id,
+                  int * first_task_id, int * num)
+{
+  int l_num;
+  int leftovers = total_tasks % total_workers;  /* num of "leftover" tasks */
+  if (this_id < leftovers)
+  {
+    l_num = total_tasks / total_workers + 1;    /* do one of the leftovers */
+    *first_task_id = l_num * this_id;
+  }
+  else
+  {
+    l_num = total_tasks / total_workers;
+    *first_task_id = l_num * this_id + leftovers;
+  }
+  *num = l_num;
+}
+
+void printMatrix(int * matrix, int width, int height)
+{
+  for (int i = 0; i < height; i++)
+  {
+    for (int j = 0; j < width; j++)
+    {
+      cout << *matrix++ << " ";
+    }
+    cout << endl;
+  }
+}
+
+int main(int argc, char * argv[])
+{
+  int my_rank;
+  int p;                /* the number of processes */
+  int n = 10;           /* the size of the matrix */
+
+  MPI_Init(&argc, &argv);
+  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &p);
+
+  /* Initialize the matrices */
+  int matrix[n][n];
+  int recvmatrix[n][n];
+  for (int i = 0; i < n; i++)
+  {
+    for (int j = 0; j < n; j++)
+    {
+      matrix[i][j] = 0;         /* zero the matrices for prettier */
+      recvmatrix[i][j] = 0;     /* printing */
+    }
+  }
+
+  /* Determine which rows I am responsible for and initialize them */
+  int my_first_row;
+  int my_num_rows;
+  taskAllocate(n, p, my_rank, &my_first_row, &my_num_rows);
+  for (int row = my_first_row; row < my_first_row + my_num_rows; row++)
+  {
+    for (int j = 0; j < n; j++)
+    {
+      matrix[row][j] = 100 * (row + 1) + (j + 1);
+    }
+  }
+
+  /* Print the initial matrices */
+  if (my_rank == 0)
+    cout << " *** Initial Matrices ***" << endl;
+  for (int i = 0; i < p; i++)
+  {
+    if (my_rank == i)
+    {
+      cout << "Process " << i << " matrix:" << endl;
+      printMatrix(&matrix[0][0], n, n);
+      cout << endl;
+    }
+    MPI_Barrier(MPI_COMM_WORLD);  /* just for printing coherently */
+    usleep(100000);
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+
+  /* Populate the counts and displacements arrays */
+  int displs[p];
+  int counts[p];
+  for (int i = 0, total = 0; i < p; i++)
+  {
+    int first;
+    int count;
+    taskAllocate(n, p, i, &first, &count);
+    displs[i] = total;
+    counts[i] = count;
+    total += count;
+  }
+
+  /* Transpose the matrix with n gather operations: column i of the
+   * original is gathered as row i of the transpose at the process
+   * (toproc) that owns row i under the same block distribution */
+  for (int i = 0, toproc = 0, proccount = counts[0]; i < n; i++)
+  {
+    int my_col_i_vals[my_num_rows];
+    for (int row_offset = 0; row_offset < my_num_rows; row_offset++)
+      my_col_i_vals[row_offset] = matrix[my_first_row + row_offset][i];
+    MPI_Gatherv(&my_col_i_vals[0], my_num_rows, MPI_INT,
+                &recvmatrix[i][0], &counts[0], &displs[0],
+                MPI_INT, toproc, MPI_COMM_WORLD);
+    proccount--;
+    if (proccount <= 0 && toproc + 1 < p)  /* guard so the last iteration
+                                              never reads past counts[p-1] */
+    {
+      toproc++;
+      proccount = counts[toproc];
+    }
+  }
+
+  /* Print the transposed matrices */
+  if (my_rank == 0)
+    cout << " *** Final Transposed Matrices ***" << endl;
+  for (int i = 0; i < p; i++)
+  {
+    if (my_rank == i)
+    {
+      cout << "Process " << i << " matrix:" << endl;
+      printMatrix(&recvmatrix[0][0], n, n);
+      cout << endl;
+    }
+    MPI_Barrier(MPI_COMM_WORLD);  /* just for printing coherently */
+    usleep(100000);
+    MPI_Barrier(MPI_COMM_WORLD);
+  }
+
+  MPI_Finalize();
+
+  return 0;
+}
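
The block distribution that drives the MPI_Gatherv calls can be checked without running MPI at all. The sketch below is not part of the patch; it simply repeats taskAllocate's arithmetic for an assumed n = 10 and p = 4 and prints the per-rank counts and displacements that the root rank would use to lay each gathered column out as one contiguous row of the transpose.

#include <iostream>
using namespace std;

/* Same splitting rule as taskAllocate() in the patch: the first
 * (total_tasks % total_workers) workers each take one extra task. */
void taskAllocate(int total_tasks, int total_workers, int this_id,
                  int * first_task_id, int * num)
{
  int leftovers = total_tasks % total_workers;
  int l_num = total_tasks / total_workers + (this_id < leftovers ? 1 : 0);
  *first_task_id = (this_id < leftovers) ? l_num * this_id
                                         : l_num * this_id + leftovers;
  *num = l_num;
}

int main()
{
  const int n = 10;   /* matrix size, as in the patch          */
  const int p = 4;    /* assumed number of MPI processes       */

  /* counts[i] and displs[i] are exactly what the patch passes to
   * MPI_Gatherv: rank i contributes counts[i] values, placed at
   * offset displs[i] in the gathered row of the transposed matrix. */
  int counts[p], displs[p];
  for (int i = 0, total = 0; i < p; i++)
  {
    int first;
    taskAllocate(n, p, i, &first, &counts[i]);
    displs[i] = total;
    total += counts[i];
  }

  for (int i = 0; i < p; i++)
    cout << "rank " << i << ": first row " << displs[i]
         << ", rows " << counts[i] << endl;
  /* Expected for n=10, p=4: counts 3,3,2,2 and displs 0,3,6,8 */
  return 0;
}

Because the displacements are just the running sum of the counts, every rank computes the same counts/displs arrays independently, which is what lets each process call MPI_Gatherv with consistent arguments without any extra communication.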