diff --git a/pypocketfft.cc b/pypocketfft.cc
index 61904dcd77b55cc9618df43d11506d67f2ae039c..4b254bdeaf974f1d0fa4f490707d3026a3030076 100644
--- a/pypocketfft.cc
+++ b/pypocketfft.cc
@@ -372,10 +372,44 @@ py::array genuine_hartley(const py::array &in, const py::object &axes_,
     out_, nthreads))
   }
 
-size_t good_size(size_t n, bool real)
+// Export good_size in raw C-API to reduce overhead (~4x faster).
+//
+// Python signature: good_size(n, real=False) -> int
+//   n    : target length, parsed as Py_ssize_t; negative values are rejected
+//   real : truthy selects util::good_size_real, else util::good_size_cmplx
+// Both arguments may be passed positionally or by keyword, matching the
+// pybind11 binding ("n"_a, "real"_a=false) that this function replaces.
+// Returns a new Python int, or nullptr with a Python exception set
+// (ValueError for a negative or too-large n, otherwise whatever argument
+// parsing raised).
+PyObject * good_size(PyObject * /*self*/, PyObject * args, PyObject * kwargs)
   {
+  Py_ssize_t n_ = -1;
+  int real = false;
+  // Keyword names are kept identical to the former pybind11 binding.
+  static const char * keywords[] = {"n", "real", nullptr};
+  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "n|p:good_size",
+                                   const_cast<char **>(keywords), &n_, &real))
+    return nullptr;
+
+  // Note: zero passes this check; only negative lengths are refused.
+  if (n_<0)
+    {
+    PyErr_SetString(PyExc_ValueError, "Target length must be positive");
+    return nullptr;
+    }
+  // Upper bound: n-1 may not exceed max(size_t)/11. Presumably this keeps
+  // pocketfft's size search from overflowing size_t -- TODO confirm.
+  if ((n_-1) > static_cast<Py_ssize_t>(std::numeric_limits<size_t>::max() / 11))
+    {
+    PyErr_Format(PyExc_ValueError,
+                 "Target length is too large to perform an FFT: %zi", n_);
+    return nullptr;
+    }
+  const auto n = static_cast<size_t>(n_);
   using namespace pocketfft::detail;
-  return real ? util::good_size_real(n) : util::good_size_cmplx(n);
+  return PyLong_FromSize_t(
+    real ? util::good_size_real(n) : util::good_size_cmplx(n));
   }
 
 const char *pypocketfft_DS = R"""(Fast Fourier and Hartley transforms.
@@ -702,5 +736,17 @@ PYBIND11_MODULE(pypocketfft, m)
     "out"_a=None, "nthreads"_a=1);
   m.def("dst", dst, dst_DS, "a"_a, "type"_a, "axes"_a=None, "inorm"_a=0,
     "out"_a=None, "nthreads"_a=1);
-  m.def("good_size", good_size, good_size_DS, "n"_a, "real"_a=false);
+
+  // good_size bypasses pybind11 and is added through the raw C-API. The
+  // method table is static because CPython keeps a pointer to it in the
+  // created function object. METH_KEYWORDS preserves keyword-argument calls.
+  static PyMethodDef good_size_meth[] =
+    {{"good_size",
+      reinterpret_cast<PyCFunction>(
+        reinterpret_cast<void (*)(void)>(good_size)),
+      METH_VARARGS | METH_KEYWORDS, good_size_DS},
+     {nullptr, nullptr, 0, nullptr}};
+  // Surface a registration failure as a Python exception at import time.
+  if (PyModule_AddFunctions(m.ptr(), good_size_meth) != 0)
+    throw py::error_already_set();
   }