diff --git a/pypocketfft.cc b/pypocketfft.cc
index c6d72bf7a3cd9c345dd25803ec1c8251c030fa14..2bdb3afb4ddb4b3997e92844f9693ff8c114653a 100644
--- a/pypocketfft.cc
+++ b/pypocketfft.cc
@@ -227,7 +227,7 @@ py::array r2r_fftpack(const py::array &in, const py::object &axes_,
   }
 
 template<typename T> py::array dct_internal(const py::array &in,
-  const py::object &axes_, int type, int inorm, bool ortho, py::object &out_,
+  const py::object &axes_, int type, int inorm, py::object &out_,
   size_t nthreads)
   {
   auto axes = makeaxes(in, axes_);
@@ -239,10 +239,9 @@ template<typename T> py::array dct_internal(const py::array &in,
   auto d_out=reinterpret_cast<T *>(res.mutable_data());
   {
   py::gil_scoped_release release;
-  // override
-  if (ortho) inorm=1;
   T fct = (type==1) ? norm_fct<T>(inorm, dims, axes, 2, -1)
                     : norm_fct<T>(inorm, dims, axes, 2);
+  bool ortho = inorm == 1;
   pocketfft::dct(dims, s_in, s_out, axes, type, d_in, d_out, fct, ortho,
     nthreads);
   }
@@ -250,15 +249,15 @@ template<typename T> py::array dct_internal(const py::array &in,
   }
 
 py::array dct(const py::array &in, int type, const py::object &axes_,
-  int inorm, bool ortho, py::object &out_, size_t nthreads)
+  int inorm, py::object &out_, size_t nthreads)
   {
   if ((type<1) || (type>4)) throw invalid_argument("invalid DCT type");
-  DISPATCH(in, f64, f32, flong, dct_internal, (in, axes_, type, inorm, ortho,
-    out_, nthreads))
+  DISPATCH(in, f64, f32, flong, dct_internal, (in, axes_, type, inorm, out_,
+    nthreads))
   }
 
 template<typename T> py::array dst_internal(const py::array &in,
-  const py::object &axes_, int type, int inorm, bool ortho, py::object &out_,
+  const py::object &axes_, int type, int inorm, py::object &out_,
   size_t nthreads)
   {
   auto axes = makeaxes(in, axes_);
@@ -270,10 +269,9 @@ template<typename T> py::array dst_internal(const py::array &in,
   auto d_out=reinterpret_cast<T *>(res.mutable_data());
   {
   py::gil_scoped_release release;
-  // override
-  if (ortho) inorm=1;
   T fct = (type==1) ? norm_fct<T>(inorm, dims, axes, 2, 1)
                     : norm_fct<T>(inorm, dims, axes, 2);
+  bool ortho = inorm == 1;
   pocketfft::dst(dims, s_in, s_out, axes, type, d_in, d_out, fct, ortho,
     nthreads);
   }
@@ -281,10 +279,10 @@ template<typename T> py::array dst_internal(const py::array &in,
   }
 
 py::array dst(const py::array &in, int type, const py::object &axes_,
-  int inorm, bool ortho, py::object &out_, size_t nthreads)
+  int inorm, py::object &out_, size_t nthreads)
   {
   if ((type<1) || (type>4)) throw invalid_argument("invalid DST type");
-  DISPATCH(in, f64, f32, flong, dst_internal, (in, axes_, type, inorm, ortho,
+  DISPATCH(in, f64, f32, flong, dst_internal, (in, axes_, type, inorm,
     out_, nthreads))
   }
 
@@ -599,11 +597,18 @@ axes : list of integers
 inorm : int
     Normalization type
       0 : no normalization
-      1 : divide by sqrt(N)
+      1 : make transform orthogonal and divide by sqrt(N)
       2 : divide by N
     where N is the product of n_i for every transformed axis i.
     n_i is 2*(<axis_length>-1 for type 1 and 2*<axis length>
     for types 2, 3, 4.
+    Making the transform orthogonal involves the following additional steps
+    for every 1D sub-transform:
+      Type 1 : multiply first and last input value by sqrt(2)
+               divide first and last output value by sqrt(2)
+      Type 2 : divide first output value by sqrt(2)
+      Type 3 : multiply first input value by sqrt(2)
+      Type 4 : nothing
 out : numpy.ndarray (same shape and data type as `a`)
     May be identical to `a`, but if it isn't, it must not overlap with `a`.
     If None, a new array is allocated to store the output.
@@ -631,11 +636,17 @@ axes : list of integers
 inorm : int
     Normalization type
       0 : no normalization
-      1 : divide by sqrt(N)
+      1 : make transform orthogonal and divide by sqrt(N)
       2 : divide by N
     where N is the product of n_i for every transformed axis i.
     n_i is 2*(<axis_length>+1 for type 1 and 2*<axis length>
     for types 2, 3, 4.
+    Making the transform orthogonal involves the following additional steps
+    for every 1D sub-transform:
+      Type 1 : nothing
+      Type 2 : divide first output value by sqrt(2)
+      Type 3 : multiply first input value by sqrt(2)
+      Type 4 : nothing
 out : numpy.ndarray (same shape and data type as `a`)
     May be identical to `a`, but if it isn't, it must not overlap with `a`.
     If None, a new array is allocated to store the output.
@@ -669,7 +680,7 @@ PYBIND11_MODULE(pypocketfft, m)
   m.def("genuine_hartley", genuine_hartley, genuine_hartley_DS, "a"_a,
     "axes"_a=None, "inorm"_a=0, "out"_a=None, "nthreads"_a=1);
   m.def("dct", dct, dct_DS, "a"_a, "type"_a, "axes"_a=None, "inorm"_a=0,
-    "ortho"_a=false, "out"_a=None, "nthreads"_a=1);
+    "out"_a=None, "nthreads"_a=1);
   m.def("dst", dst, dst_DS, "a"_a, "type"_a, "axes"_a=None, "inorm"_a=0,
-    "ortho"_a=false, "out"_a=None, "nthreads"_a=1);
+    "out"_a=None, "nthreads"_a=1);
   }
diff --git a/test.py b/test.py
index 3d22f1ebb4c744b0ab2b6edfa82ae03e2242331e..7ee4505c0f86bb52b597bdacce1fc5333be1737d 100644
--- a/test.py
+++ b/test.py
@@ -55,36 +55,27 @@ def irfft_scipy(a, axis, inorm=0, out=None, nthreads=1):
                                    forward=False, inorm=inorm, out=out,
                                    nthreads=nthreads)
 
+tol = {np.float32: 6e-7, np.float64: 1.5e-15, np.longdouble: 1e-18}  # max round-trip error per dtype
+ctype = {np.float32: np.complex64, np.float64: np.complex128, np.longdouble: np.clongdouble}
 
 @pmp("len", len1D)
 @pmp("inorm", [0, 1, 2])
-def test1D(len, inorm):
+@pmp("dtype", [np.float32, np.float64, np.longdouble])
+def test1D(len, inorm, dtype):
     a = np.random.rand(len)-0.5 + 1j*np.random.rand(len)-0.5j
-    b = a.astype(np.complex64)
-    c = a.astype(np.complex256)
-    _assert_close(a, ifftn(fftn(c, inorm=inorm), inorm=2-inorm), 1e-18)
-    assert_(_l2error(a, ifftn(fftn(a, inorm=inorm), inorm=2-inorm)) < 1.5e-15)
+    a = a.astype(ctype[dtype])  # complex type paired with the real dtype
+    eps = tol[dtype]
+    assert_(_l2error(a, ifftn(fftn(a, inorm=inorm), inorm=2-inorm)) < eps)
     assert_(_l2error(a.real, ifftn(fftn(a.real, inorm=inorm), inorm=2-inorm))
-            < 1.5e-15)
+            < eps)
     assert_(_l2error(a.real, fftn(ifftn(a.real, inorm=inorm), inorm=2-inorm))
-            < 1.5e-15)
+            < eps)
     assert_(_l2error(a.real, irfftn(rfftn(a.real, inorm=inorm),
-                                    inorm=2-inorm, lastsize=len)) < 1.5e-15)
+                                    inorm=2-inorm, lastsize=len)) < eps)
     tmp = a.copy()
     assert_(ifftn(fftn(tmp, out=tmp, inorm=inorm), out=tmp, inorm=2-inorm)
             is tmp)
-    assert_(_l2error(tmp, a) < 1.5e-15)
-    assert_(_l2error(b, ifftn(fftn(b, inorm=inorm), inorm=2-inorm)) < 6e-7)
-    assert_(_l2error(b.real, ifftn(fftn(b.real, inorm=inorm), inorm=2-inorm))
-            < 6e-7)
-    assert_(_l2error(b.real, fftn(ifftn(b.real, inorm=inorm), inorm=2-inorm))
-            < 6e-7)
-    assert_(_l2error(b.real, irfftn(rfftn(b.real, inorm=inorm), lastsize=len,
-                                    inorm=2-inorm)) < 6e-7)
-    tmp = b.copy()
-    assert_(ifftn(fftn(tmp, out=tmp, inorm=inorm), out=tmp, inorm=2-inorm)
-            is tmp)
-    assert_(_l2error(tmp, b) < 6e-7)
+    assert_(_l2error(tmp, a) < eps)
 
 
 @pmp("shp", shapes)
@@ -206,54 +197,17 @@ def test_genuine_hartley_2D(shp, axes):
 
 
 @pmp("len", len1D)
-@pmp("inorm", [0, 1, 2])
-@pmp("type", [1, 2, 3])
-def testdcst1D(len, inorm, type):
-    a = np.random.rand(len)-0.5
-    b = a.astype(np.float32)
-    c = a.astype(np.float128)
-    itp = (0, 1, 3, 2, 4)
-    itype = itp[type]
-    if type != 1 or len > 1:
-        _assert_close(a, pypocketfft.dct(pypocketfft.dct(c, inorm=inorm, type=type), inorm=2-inorm, type=itype), 2e-18)
-        _assert_close(a, pypocketfft.dct(pypocketfft.dct(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), 1.5e-15)
-        _assert_close(b, pypocketfft.dct(pypocketfft.dct(b, inorm=inorm, type=type), inorm=2-inorm, type=itype), 6e-7)
-    _assert_close(a, pypocketfft.dst(pypocketfft.dst(c, inorm=inorm, type=type), inorm=2-inorm, type=itype), 2e-18)
-    _assert_close(a, pypocketfft.dst(pypocketfft.dst(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), 1.5e-15)
-    _assert_close(b, pypocketfft.dst(pypocketfft.dst(b, inorm=inorm, type=type), inorm=2-inorm, type=itype), 6e-7)
-
-@pmp("len", len1D)
-@pmp("type", [1, 2, 3])
-def testdcst1Dortho(len, type):
-    a = np.random.rand(len)-0.5
-    b = a.astype(np.float32)
-    c = a.astype(np.float128)
-    itp = (0, 1, 3, 2, 4)
-    itype = itp[type]
-    if type != 1 or len > 1:
-        _assert_close(a, pypocketfft.dct(pypocketfft.dct(c, ortho=True, type=type), ortho=True, type=itype), 2e-18)
-        _assert_close(a, pypocketfft.dct(pypocketfft.dct(a, ortho=True, type=type), ortho=True, type=itype), 1.5e-15)
-        _assert_close(b, pypocketfft.dct(pypocketfft.dct(b, ortho=True, type=type), ortho=True, type=itype), 6e-7)
-    if type != 1:
-        _assert_close(a, pypocketfft.dst(pypocketfft.dst(c, ortho=True, type=type), ortho=True, type=itype), 2e-18)
-        _assert_close(a, pypocketfft.dst(pypocketfft.dst(a, ortho=True, type=type), ortho=True, type=itype), 1.5e-15)
-        _assert_close(b, pypocketfft.dst(pypocketfft.dst(b, ortho=True, type=type), ortho=True, type=itype), 6e-7)
-
-
-# TEMPORARY: separate test for DCT/DST IV, since they are less accurate
-@pmp("len", len1D)
-@pmp("inorm", [0, 1, 2])
-@pmp("type", [4])
-def testdcst1D4(len, inorm, type):
-    a = np.random.rand(len)-0.5
-    b = a.astype(np.float32)
-    c = a.astype(np.float128)
+@pmp("inorm", [0, 1])  # inorm==2 not needed, tested via inverse
+@pmp("type", [1, 2, 3, 4])
+@pmp("dtype", [np.float32, np.float64, np.longdouble])
+def testdcst1D(len, inorm, type, dtype):
+    a = (np.random.rand(len)-0.5).astype(dtype)
+    eps = tol[dtype]
     itp = (0, 1, 3, 2, 4)
+    if type==4 and len%2 == 1:  # relaxed accuracies for odd-length type 4 transforms
+        special_tol = {np.float32: 4e-5, np.float64: 6e-14, np.longdouble: 4e-17}
+        eps = special_tol[dtype]
     itype = itp[type]
-    if type != 1 or len > 1:
-        _assert_close(a, pypocketfft.dct(pypocketfft.dct(c, inorm=inorm, type=type), inorm=2-inorm, type=itype), 2e-16)
-        _assert_close(a, pypocketfft.dct(pypocketfft.dct(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), 1.5e-13)
-        _assert_close(b, pypocketfft.dct(pypocketfft.dct(b, inorm=inorm, type=type), inorm=2-inorm, type=itype), 6e-5)
-    _assert_close(a, pypocketfft.dst(pypocketfft.dst(c, inorm=inorm, type=type), inorm=2-inorm, type=itype), 2e-16)
-    _assert_close(a, pypocketfft.dst(pypocketfft.dst(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), 1.5e-13)
-    _assert_close(b, pypocketfft.dst(pypocketfft.dst(b, inorm=inorm, type=type), inorm=2-inorm, type=itype), 6e-5)
+    if type != 1 or len > 1:  # there are no length-1 type 1 DCTs
+        _assert_close(a, pypocketfft.dct(pypocketfft.dct(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), eps)
+    _assert_close(a, pypocketfft.dst(pypocketfft.dst(a, inorm=inorm, type=type), inorm=2-inorm, type=itype), eps)