diff --git a/src/Morton_shuffler.cpp b/src/Morton_shuffler.cpp
index 11f40c595c222fe87eb84e32275e92b099669423..7de49c5029d46735e798a7a83c75fe0d9f3f951a 100644
--- a/src/Morton_shuffler.cpp
+++ b/src/Morton_shuffler.cpp
@@ -8,7 +8,8 @@ Morton_shuffler::Morton_shuffler(
         int nfiles)
 {
     this->d = d;
-    if (nprocs % nfiles != 0)
+    if ((nprocs % nfiles != 0) &&
+        (nfiles % nprocs != 0))
     {
         std::cerr <<
             "Number of output files incompatible with number of processes.\n"
@@ -44,6 +45,13 @@ Morton_shuffler::Morton_shuffler(
     //set up output file descriptor
     int out_rank, out_nprocs;
     out_nprocs = nprocs/nfiles;
+    if (out_nprocs == 0)
+    {
+        out_nprocs = 1;
+        this->files_per_proc = nfiles / nprocs;
+    }
+    else
+        this->files_per_proc = 1;
     this->out_group = myrank / out_nprocs;
     out_rank = myrank % out_nprocs;
     n[0] = ((N0/8) * (N1/8) * (N2/8)) / nfiles;
@@ -128,11 +136,16 @@ int Morton_shuffler::shuffle(
     fftwf_free(rz);
 
     char temp_char[200];
-    sprintf(temp_char,
-            "%s_z%.7x",
-            base_fname,
-            this->out_group*this->doutput->sizes[0]);
-    this->doutput->write(temp_char, rtmp);
+    for (int fcounter = 0; fcounter < this->files_per_proc; fcounter++)
+    {
+        sprintf(temp_char,
+                "%s_z%.7x",
+                base_fname,
+                (this->files_per_proc*this->out_group + fcounter)*this->doutput->sizes[0]);
+        this->doutput->write(
+                temp_char,
+                rtmp + fcounter*this->doutput->local_size);
+    }
     fftwf_free(rtmp);
     return EXIT_SUCCESS;
 }
diff --git a/src/Morton_shuffler.hpp b/src/Morton_shuffler.hpp
index 913f0d847c26bd910d3d13a9656807b7eef00ae8..08825cbd3f3267dae910556d75b7251fc219e5f8 100644
--- a/src/Morton_shuffler.hpp
+++ b/src/Morton_shuffler.hpp
@@ -61,7 +61,7 @@ class Morton_shuffler
 
         // communicator to use for output
         MPI_Comm out_communicator;
-        int out_group;
+        int out_group, files_per_proc;
 
         /* methods */
         Morton_shuffler(
diff --git a/test3.ipynb b/test3.ipynb
index 5fa0f637dcd6778535c67ea6cdf3de16f3877d7a..23ff5b95bbfb2db00076617ccefd8cdc65aa4857 100644
--- a/test3.ipynb
+++ b/test3.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:e744ea7dc72564d6f526a0a73b97b6f43c1ccd7d5c6c2767d8b6c5a5ec7f4487"
+  "signature": "sha256:bb9dddd64d9a5ac46de7f7d5ba2abecb7dd4bb75c76a6845ace376759bd57f78"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -63,8 +63,16 @@
      ],
      "language": "python",
      "metadata": {},
-     "outputs": [],
-     "prompt_number": 14
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stderr",
+       "text": [
+        "-c:15: RuntimeWarning: divide by zero encountered in true_divide\n"
+       ]
+      }
+     ],
+     "prompt_number": 2
     },
     {
      "cell_type": "code",
@@ -148,13 +156,15 @@
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 15
+     "prompt_number": 3
     },
     {
      "cell_type": "code",
      "collapsed": false,
      "input": [
-      "def compute_cpp_data(branch = None):\n",
+      "def compute_cpp_data(\n",
+      "        branch = None,\n",
+      "        nfiles = 16):\n",
       "    if not (type(branch) == type(None)):\n",
       "        subprocess.call(['git', 'checkout', branch])\n",
       "    if subprocess.call(['make', 'full.elf']) == 0:\n",
@@ -164,39 +174,41 @@
       "                         'time',\n",
       "                         'mpirun.mpich',\n",
       "                         '-np',\n",
-      "                         '32',\n",
+      "                         '8',\n",
       "                         './full.elf',\n",
       "                         '{0}'.format(n),\n",
       "                         '{0}'.format(N),\n",
-      "                         '2',\n",
+      "                         '{0}'.format(nfiles),\n",
       "                         '3'])\n",
       "    else:\n",
       "        print ('compilation error')\n",
       "        return None\n",
       "    \n",
-      "def get_cpp_data(branch = None, run = True):\n",
+      "def get_cpp_data(\n",
+      "        branch = None,\n",
+      "        run = True,\n",
+      "        nfiles = 16):\n",
       "    if run:\n",
       "        subprocess.call(['rm',\n",
       "                         'Rdata_z{0:0>7x}'.format(0),\n",
       "                         'Rdata_z{0:0>7x}'.format(Rdata_py.shape[0]//2)])\n",
-      "        compute_cpp_data(branch)\n",
-      "    Rdata0 = np.fromfile(\n",
-      "        'Rdata_z{0:0>7x}'.format(0),\n",
-      "        dtype = np.float32).reshape(-1, 8, 8, 8, 3)\n",
-      "    Rdata1 = np.fromfile(\n",
-      "        'Rdata_z{0:0>7x}'.format(Rdata_py.shape[0]//2),\n",
-      "        dtype = np.float32).reshape(-1, 8, 8, 8, 3)\n",
-      "    return np.concatenate([Rdata0, Rdata1])\n",
+      "        compute_cpp_data(branch, nfiles = nfiles)\n",
+      "    Rdata = []\n",
+      "    for nf in range(nfiles):\n",
+      "        Rdata.append(np.fromfile(\n",
+      "        'Rdata_z{0:0>7x}'.format(nf*Rdata_py.shape[0]//nfiles),\n",
+      "        dtype = np.float32).reshape(-1, 8, 8, 8, 3))\n",
+      "    return np.concatenate(Rdata)\n",
       "\n",
       "#Rdata = get_cpp_data(branch = 'develop')\n",
       "# develop says 30 secs, inplace fft is 28 secs\n",
       "#Rdata = get_cpp_data(branch = 'feature-inplace_fft')\n",
-      "Rdata = get_cpp_data(run = True)"
+      "Rdata = get_cpp_data(run = True, nfiles = 8)"
      ],
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 16
+     "prompt_number": 8
     },
     {
      "cell_type": "code",
@@ -237,7 +249,7 @@
        ]
       }
      ],
-     "prompt_number": 17
+     "prompt_number": 9
     },
     {
      "cell_type": "code",